aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp6
-rw-r--r--llvm/lib/Target/AArch64/AArch64FrameLowering.cpp6
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp28
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.h4
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.cpp20
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.h3
-rw-r--r--llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp229
-rw-r--r--llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp35
-rw-r--r--llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td22
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp17
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.td1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGISel.td24
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp330
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h10
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp213
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h13
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp79
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp16
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp16
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td16
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp142
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp34
-rw-r--r--llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp19
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp47
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h4
-rw-r--r--llvm/lib/Target/AMDGPU/SIDefines.h3
-rw-r--r--llvm/lib/Target/AMDGPU/SIFoldOperands.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp111
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.h1
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrFormats.td5
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.h8
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.td18
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h32
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.td5
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp11
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h3
-rw-r--r--llvm/lib/Target/AMDGPU/VOP1Instructions.td93
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3Instructions.td53
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3PInstructions.td500
-rw-r--r--llvm/lib/Target/AMDGPU/VOPInstructions.td32
-rw-r--r--llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp13
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp11
-rw-r--r--llvm/lib/Target/Mips/MipsAsmPrinter.cpp39
-rw-r--r--llvm/lib/Target/Mips/MipsAsmPrinter.h2
-rw-r--r--llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp14
-rw-r--r--llvm/lib/Target/PowerPC/PPCFrameLowering.cpp13
-rw-r--r--llvm/lib/Target/RISCV/RISCV.td6
-rw-r--r--llvm/lib/Target/RISCV/RISCVFeatures.td26
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp2
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZicond.td2
-rw-r--r--llvm/lib/Target/RISCV/RISCVMacroFusion.cpp210
-rw-r--r--llvm/lib/Target/RISCV/RISCVMacroFusion.h28
-rw-r--r--llvm/lib/Target/RISCV/RISCVMacroFusion.td93
-rw-r--r--llvm/lib/Target/RISCV/RISCVSubtarget.cpp8
-rw-r--r--llvm/lib/Target/RISCV/RISCVSubtarget.h8
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetMachine.cpp13
-rw-r--r--llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp12
-rw-r--r--llvm/lib/Target/X86/X86AsmPrinter.cpp1
-rw-r--r--llvm/lib/Target/X86/X86FrameLowering.cpp15
-rw-r--r--llvm/lib/Target/X86/X86InstrAVX512.td2
-rw-r--r--llvm/lib/Target/X86/X86InstrFoldTables.cpp11
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.cpp35
64 files changed, 2056 insertions, 720 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 352c61d48e2f..1af064b6de3c 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1544,6 +1544,12 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated.
return true;
}
+ case AArch64::COALESCER_BARRIER_FPR16:
+ case AArch64::COALESCER_BARRIER_FPR32:
+ case AArch64::COALESCER_BARRIER_FPR64:
+ case AArch64::COALESCER_BARRIER_FPR128:
+ MI.eraseFromParent();
+ return true;
case AArch64::LD1B_2Z_IMM_PSEUDO:
return expandMultiVecPseudo(
MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index d55deec97600..732e787d2a32 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -4339,8 +4339,10 @@ AArch64FrameLowering::inlineStackProbeLoopExactMultiple(
ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
MBB.addSuccessor(LoopMBB);
// Update liveins.
- recomputeLiveIns(*LoopMBB);
- recomputeLiveIns(*ExitMBB);
+ bool anyChange = false; // Set by each pass below; true means at least one call reported a change.
+ do { // Keep re-running until one whole pass reports no change (presumably because each block's live-ins depend on the other's).
+ anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*LoopMBB); // `||` short-circuits: LoopMBB is skipped in any pass where the ExitMBB call returned true.
+ } while (anyChange); // Only exits after a pass in which both calls ran and both returned false.
return ExitMBB->begin();
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 332fb3765528..e97f5e322014 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2375,6 +2375,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((AArch64ISD::NodeType)Opcode) {
case AArch64ISD::FIRST_NUMBER:
break;
+ MAKE_CASE(AArch64ISD::COALESCER_BARRIER)
MAKE_CASE(AArch64ISD::SMSTART)
MAKE_CASE(AArch64ISD::SMSTOP)
MAKE_CASE(AArch64ISD::RESTORE_ZA)
@@ -7154,13 +7155,18 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
}
}
+/// Returns true if \p VT is a fixed-length vector, or a floating-point type
+/// that is not a scalable vector.
+///
+/// Callers in this file use the result to decide whether a call argument or
+/// call result is wrapped in an AArch64ISD::COALESCER_BARRIER node when the
+/// call requires a streaming-mode change. Going by the name, these are the
+/// types carried in FP/SIMD registers (NOTE(review): the calling-convention
+/// code is not visible here -- confirm).
+static bool isPassedInFPR(EVT VT) {
+  if (VT.isFixedLengthVector())
+    return true;
+  // Scalable vectors never qualify, floating-point element type or not.
+  if (VT.isScalableVector())
+    return false;
+  return VT.isFloatingPoint();
+}
+
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
SDValue AArch64TargetLowering::LowerCallResult(
SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
- SDValue ThisVal) const {
+ SDValue ThisVal, bool RequiresSMChange) const {
DenseMap<unsigned, SDValue> CopiedRegs;
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
@@ -7205,6 +7211,10 @@ SDValue AArch64TargetLowering::LowerCallResult(
break;
}
+ if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
+ Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, Val.getValueType(),
+ Val);
+
InVals.push_back(Val);
}
@@ -7915,6 +7925,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
return ArgReg.Reg == VA.getLocReg();
});
} else {
+ // Add an extra level of indirection for streaming mode changes by
+ // using a pseudo copy node that cannot be rematerialised between a
+ // smstart/smstop and the call by the simple register coalescer.
+ if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
+ Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
+ Arg.getValueType(), Arg);
RegsToPass.emplace_back(VA.getLocReg(), Arg);
RegsUsed.insert(VA.getLocReg());
const TargetOptions &Options = DAG.getTarget().Options;
@@ -8151,9 +8167,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Handle result values, copying them out of physregs into vregs that we
// return.
- SDValue Result = LowerCallResult(Chain, InGlue, CallConv, IsVarArg, RVLocs,
- DL, DAG, InVals, IsThisReturn,
- IsThisReturn ? OutVals[0] : SDValue());
+ SDValue Result = LowerCallResult(
+ Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
+ IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
if (!Ins.empty())
InGlue = Result.getValue(Result->getNumValues() - 1);
@@ -26899,7 +26915,7 @@ bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
return false;
// If the vector is scalable, SVE is enabled, implying support for complex
- // numbers. Otherwirse, we need to ensure complex number support is avaialble
+ // numbers. Otherwise, we need to ensure complex number support is available
if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
return false;
@@ -26915,7 +26931,7 @@ bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
!llvm::isPowerOf2_32(VTyWidth))
return false;
- if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2()) {
+ if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
return 8 <= ScalarWidth && ScalarWidth <= 64;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 6505931e17e1..541a810fb5cb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -58,6 +58,8 @@ enum NodeType : unsigned {
CALL_BTI, // Function call followed by a BTI instruction.
+ COALESCER_BARRIER,
+
SMSTART,
SMSTOP,
RESTORE_ZA,
@@ -1026,7 +1028,7 @@ private:
const SmallVectorImpl<CCValAssign> &RVLocs,
const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
- SDValue ThisVal) const;
+ SDValue ThisVal, bool RequiresSMChange) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 2e8d8c63d6be..9b4bb7c88bc8 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4098,16 +4098,6 @@ AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
return MI.getOperand(Idx);
}
-const MachineOperand &
-AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- default:
- llvm_unreachable("Unexpected opcode");
- case AArch64::LDRBBroX:
- return MI.getOperand(4);
- }
-}
-
static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
Register Reg) {
if (MI.getParent() == nullptr)
@@ -9597,9 +9587,13 @@ AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
// Update liveins.
if (MF.getRegInfo().reservedRegsFrozen()) {
- recomputeLiveIns(*LoopTestMBB);
- recomputeLiveIns(*LoopBodyMBB);
- recomputeLiveIns(*ExitMBB);
+    // Recompute live-ins until a whole pass leaves all three blocks
+    // unchanged (recomputeLiveIns is assumed to return true when it altered
+    // the block's live-in list). `||` short-circuits, so a pass stops at the
+    // first block that changed; the loop only terminates after a pass in
+    // which every call ran and returned false.
+    bool anyChange = false;
+    do {
+      anyChange = recomputeLiveIns(*ExitMBB) ||
+                  recomputeLiveIns(*LoopBodyMBB) ||
+                  recomputeLiveIns(*LoopTestMBB);
+    } while (anyChange);
}
return ExitMBB->begin();
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index db24a19fe5f8..6526f6740747 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -111,9 +111,6 @@ public:
/// Returns the immediate offset operator of a load/store.
static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI);
- /// Returns the shift amount operator of a load/store.
- static const MachineOperand &getLdStAmountOp(const MachineInstr &MI);
-
/// Returns whether the instruction is FP or NEON.
static bool isFpOrNEON(const MachineInstr &MI);
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index e90b8a8ca7ac..926a89466255 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -62,8 +62,6 @@ STATISTIC(NumUnscaledPairCreated,
"Number of load/store from unscaled generated");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
-STATISTIC(NumConstOffsetFolded,
- "Number of const offset of index address folded");
DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
"Controls which pairs are considered for renaming");
@@ -77,11 +75,6 @@ static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
cl::Hidden);
-// The LdStConstLimit limits how far we search for const offset instructions
-// when we form index address load/store instructions.
-static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit",
- cl::init(10), cl::Hidden);
-
// Enable register renaming to find additional store pairing opportunities.
static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
cl::init(true), cl::Hidden);
@@ -178,13 +171,6 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
int UnscaledOffset, unsigned Limit);
- // Scan the instruction list to find a register assigned with a const
- // value that can be combined with the current instruction (a load or store)
- // using base addressing with writeback. Scan forwards.
- MachineBasicBlock::iterator
- findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
- unsigned &Offset);
-
// Scan the instruction list to find a base register update that can
// be combined with the current instruction (a load or store) using
// pre or post indexed addressing with writeback. Scan backwards.
@@ -196,19 +182,11 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
unsigned BaseReg, int Offset);
- bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
- unsigned IndexReg, unsigned &Offset);
-
// Merge a pre- or post-index base register update into a ld/st instruction.
MachineBasicBlock::iterator
mergeUpdateInsn(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Update, bool IsPreIdx);
- MachineBasicBlock::iterator
- mergeConstOffsetInsn(MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator Update, unsigned Offset,
- int Scale);
-
// Find and merge zero store instructions.
bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
@@ -221,9 +199,6 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Find and merge a base register updates before or after a ld/st instruction.
bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
- // Find and merge a index ldr/st instructions into a base ld/st instruction.
- bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);
-
bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -506,16 +481,6 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
}
}
-static unsigned getBaseAddressOpcode(unsigned Opc) {
- // TODO: Add more index address loads/stores.
- switch (Opc) {
- default:
- llvm_unreachable("Opcode has no base address equivalent!");
- case AArch64::LDRBBroX:
- return AArch64::LDRBBui;
- }
-}
-
static unsigned getPostIndexedOpcode(unsigned Opc) {
switch (Opc) {
default:
@@ -757,20 +722,6 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) {
}
}
-// Make sure this is a reg+reg Ld/St
-static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
- unsigned Opc = MI.getOpcode();
- switch (Opc) {
- default:
- return false;
- // Scaled instructions.
- // TODO: Add more index address loads/stores.
- case AArch64::LDRBBroX:
- Scale = 1;
- return true;
- }
-}
-
static bool isRewritableImplicitDef(unsigned Opc) {
switch (Opc) {
default:
@@ -2097,63 +2048,6 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
return NextI;
}
-MachineBasicBlock::iterator
-AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator Update,
- unsigned Offset, int Scale) {
- assert((Update->getOpcode() == AArch64::MOVKWi) &&
- "Unexpected const mov instruction to merge!");
- MachineBasicBlock::iterator E = I->getParent()->end();
- MachineBasicBlock::iterator NextI = next_nodbg(I, E);
- MachineBasicBlock::iterator PrevI = prev_nodbg(Update, E);
- MachineInstr &MemMI = *I;
- unsigned Mask = (1 << 12) * Scale - 1;
- unsigned Low = Offset & Mask;
- unsigned High = Offset - Low;
- Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
- Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
- MachineInstrBuilder AddMIB, MemMIB;
-
- // Add IndexReg, BaseReg, High (the BaseReg may be SP)
- AddMIB =
- BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(AArch64::ADDXri))
- .addDef(IndexReg)
- .addUse(BaseReg)
- .addImm(High >> 12) // shifted value
- .addImm(12); // shift 12
- (void)AddMIB;
- // Ld/St DestReg, IndexReg, Imm12
- unsigned NewOpc = getBaseAddressOpcode(I->getOpcode());
- MemMIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
- .add(getLdStRegOp(MemMI))
- .add(AArch64InstrInfo::getLdStOffsetOp(MemMI))
- .addImm(Low / Scale)
- .setMemRefs(I->memoperands())
- .setMIFlags(I->mergeFlagsWith(*Update));
- (void)MemMIB;
-
- ++NumConstOffsetFolded;
- LLVM_DEBUG(dbgs() << "Creating base address load/store.\n");
- LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
- LLVM_DEBUG(PrevI->print(dbgs()));
- LLVM_DEBUG(dbgs() << " ");
- LLVM_DEBUG(Update->print(dbgs()));
- LLVM_DEBUG(dbgs() << " ");
- LLVM_DEBUG(I->print(dbgs()));
- LLVM_DEBUG(dbgs() << " with instruction:\n ");
- LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs()));
- LLVM_DEBUG(dbgs() << " ");
- LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
-
- // Erase the old instructions for the block.
- I->eraseFromParent();
- PrevI->eraseFromParent();
- Update->eraseFromParent();
-
- return NextI;
-}
-
bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
MachineInstr &MI,
unsigned BaseReg, int Offset) {
@@ -2201,31 +2095,6 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
return false;
}
-bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI,
- MachineInstr &MI,
- unsigned IndexReg,
- unsigned &Offset) {
- // The update instruction source and destination register must be the
- // same as the load/store index register.
- if (MI.getOpcode() == AArch64::MOVKWi &&
- TRI->isSuperOrSubRegisterEq(IndexReg, MI.getOperand(1).getReg())) {
-
- // movz + movk hold a large offset of a Ld/St instruction.
- MachineBasicBlock::iterator B = MI.getParent()->begin();
- MachineBasicBlock::iterator MBBI = &MI;
- MBBI = prev_nodbg(MBBI, B);
- MachineInstr &MovzMI = *MBBI;
- if (MovzMI.getOpcode() == AArch64::MOVZWi) {
- unsigned Low = MovzMI.getOperand(1).getImm();
- unsigned High = MI.getOperand(2).getImm() << MI.getOperand(3).getImm();
- Offset = High + Low;
- // 12-bit optionally shifted immediates are legal for adds.
- return Offset >> 24 == 0;
- }
- }
- return false;
-}
-
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
MachineBasicBlock::iterator E = I->getParent()->end();
@@ -2381,60 +2250,6 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
return E;
}
-MachineBasicBlock::iterator
-AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
- MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
- MachineBasicBlock::iterator B = I->getParent()->begin();
- MachineBasicBlock::iterator E = I->getParent()->end();
- MachineInstr &MemMI = *I;
- MachineBasicBlock::iterator MBBI = I;
-
- // If the load is the first instruction in the block, there's obviously
- // not any matching load or store.
- if (MBBI == B)
- return E;
-
- // Make sure the IndexReg is killed and the shift amount is zero.
- // TODO: Relex this restriction to extend, simplify processing now.
- if (!AArch64InstrInfo::getLdStOffsetOp(MemMI).isKill() ||
- !AArch64InstrInfo::getLdStAmountOp(MemMI).isImm() ||
- (AArch64InstrInfo::getLdStAmountOp(MemMI).getImm() != 0))
- return E;
-
- Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
-
- // Track which register units have been modified and used between the first
- // insn (inclusive) and the second insn.
- ModifiedRegUnits.clear();
- UsedRegUnits.clear();
- unsigned Count = 0;
- do {
- MBBI = prev_nodbg(MBBI, B);
- MachineInstr &MI = *MBBI;
-
- // Don't count transient instructions towards the search limit since there
- // may be different numbers of them if e.g. debug information is present.
- if (!MI.isTransient())
- ++Count;
-
- // If we found a match, return it.
- if (isMatchingMovConstInsn(*I, MI, IndexReg, Offset)) {
- return MBBI;
- }
-
- // Update the status of what the instruction clobbered and used.
- LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
-
- // Otherwise, if the index register is used or modified, we have no match,
- // so return early.
- if (!ModifiedRegUnits.available(IndexReg) ||
- !UsedRegUnits.available(IndexReg))
- return E;
-
- } while (MBBI != B && Count < Limit);
- return E;
-}
-
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
MachineBasicBlock::iterator &MBBI) {
MachineInstr &MI = *MBBI;
@@ -2619,34 +2434,6 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
return false;
}
-bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI,
- int Scale) {
- MachineInstr &MI = *MBBI;
- MachineBasicBlock::iterator E = MI.getParent()->end();
- MachineBasicBlock::iterator Update;
-
- // Don't know how to handle unscaled pre/post-index versions below, so bail.
- if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
- return false;
-
- // Look back to try to find a const offset for index LdSt instruction. For
- // example,
- // mov x8, #LargeImm ; = a * (1<<12) + imm12
- // ldr x1, [x0, x8]
- // merged into:
- // add x8, x0, a * (1<<12)
- // ldr x1, [x8, imm12]
- unsigned Offset;
- Update = findMatchingConstOffsetBackward(MBBI, LdStConstLimit, Offset);
- if (Update != E && (Offset & (Scale - 1)) == 0) {
- // Merge the imm12 into the ld/st.
- MBBI = mergeConstOffsetInsn(MBBI, Update, Offset, Scale);
- return true;
- }
-
- return false;
-}
-
bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
bool EnableNarrowZeroStOpt) {
@@ -2725,22 +2512,6 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
++MBBI;
}
- // 5) Find a register assigned with a const value that can be combined with
- // into the load or store. e.g.,
- // mov x8, #LargeImm ; = a * (1<<12) + imm12
- // ldr x1, [x0, x8]
- // ; becomes
- // add x8, x0, a * (1<<12)
- // ldr x1, [x8, imm12]
- for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
- MBBI != E;) {
- int Scale;
- if (isMergeableIndexLdSt(*MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale))
- Modified = true;
- else
- ++MBBI;
- }
-
return Modified;
}
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index ea9882160d6f..f86e6947c9cd 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1015,6 +1015,8 @@ bool AArch64RegisterInfo::shouldCoalesce(
MachineInstr *MI, const TargetRegisterClass *SrcRC, unsigned SubReg,
const TargetRegisterClass *DstRC, unsigned DstSubReg,
const TargetRegisterClass *NewRC, LiveIntervals &LIS) const {
+ MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
+
if (MI->isCopy() &&
((DstRC->getID() == AArch64::GPR64RegClassID) ||
(DstRC->getID() == AArch64::GPR64commonRegClassID)) &&
@@ -1023,5 +1025,38 @@ bool AArch64RegisterInfo::shouldCoalesce(
// which implements a 32 to 64 bit zero extension
// which relies on the upper 32 bits being zeroed.
return false;
+
+  // Predicate: is this instruction one of the COALESCER_BARRIER_FPR* pseudos?
+  // The parameter is deliberately not called MI so that it does not shadow
+  // the MachineInstr pointer of the enclosing function.
+  auto IsCoalescerBarrier = [](const MachineInstr &Candidate) {
+    const unsigned Opc = Candidate.getOpcode();
+    return Opc == AArch64::COALESCER_BARRIER_FPR16 ||
+           Opc == AArch64::COALESCER_BARRIER_FPR32 ||
+           Opc == AArch64::COALESCER_BARRIER_FPR64 ||
+           Opc == AArch64::COALESCER_BARRIER_FPR128;
+  };
+
+ // For calls that temporarily have to toggle streaming mode as part of the
+ // call-sequence, we need to be more careful when coalescing copy instructions
+ // so that we don't end up coalescing the NEON/FP result or argument register
+ // with a whole Z-register, such that after coalescing the register allocator
+ // will try to spill/reload the entire Z register.
+ //
+ // We do this by checking if the node has any defs/uses that are
+ // COALESCER_BARRIER pseudos. These are 'nops' in practice, but they exist to
+ // instruct the coalescer to avoid coalescing the copy.
+  if (MI->isCopy() && SubReg != DstSubReg &&
+      (AArch64::ZPRRegClass.hasSubClassEq(DstRC) ||
+       AArch64::ZPRRegClass.hasSubClassEq(SrcRC))) {
+    // Refuse to coalesce when the copy's source is defined by a barrier
+    // pseudo ...
+    Register SrcReg = MI->getOperand(1).getReg();
+    if (any_of(MRI.def_instructions(SrcReg), IsCoalescerBarrier))
+      return false;
+    // ... or when its destination is read by one (debug uses are ignored).
+    Register DstReg = MI->getOperand(0).getReg();
+    if (any_of(MRI.use_nodbg_instructions(DstReg), IsCoalescerBarrier))
+      return false;
+  }
+
return true;
}
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index eeae5303a3f8..acf067f2cc5a 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -28,6 +28,8 @@ def AArch64_restore_zt : SDNode<"AArch64ISD::RESTORE_ZT", SDTypeProfile<0, 2,
def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2,
[SDTCisInt<0>, SDTCisPtrTy<1>]>,
[SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
+def AArch64CoalescerBarrier
+ : SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, []>;
//===----------------------------------------------------------------------===//
// Instruction naming conventions.
@@ -189,6 +191,26 @@ def : Pat<(int_aarch64_sme_set_tpidr2 i64:$val),
(MSR 0xde85, GPR64:$val)>;
def : Pat<(i64 (int_aarch64_sme_get_tpidr2)),
(MRS 0xde85)>;
+
+multiclass CoalescerBarrierPseudo<RegisterClass rc, list<ValueType> vts> { // One barrier pseudo over register class rc, plus one selection pattern per value type in vts.
+ def NAME : Pseudo<(outs rc:$dst), (ins rc:$src), []>, Sched<[]> { // The pseudo itself: one rc input, one rc output, no pattern attached directly.
+ let Constraints = "$dst = $src"; // Ties the output operand to the input operand.
+ }
+ foreach vt = vts in {
+ def : Pat<(vt (AArch64CoalescerBarrier (vt rc:$src))), // Match an AArch64CoalescerBarrier node whose operand and result are both of type vt ...
+ (!cast<Instruction>(NAME) rc:$src)>; // ... and select it to the pseudo defined under NAME above.
+ }
+}
+
+multiclass CoalescerBarriers { // Instantiates CoalescerBarrierPseudo once per register class (FPR16/32/64/128) with the value types listed for it.
+ defm _FPR16 : CoalescerBarrierPseudo<FPR16, [bf16, f16]>;
+ defm _FPR32 : CoalescerBarrierPseudo<FPR32, [f32]>;
+ defm _FPR64 : CoalescerBarrierPseudo<FPR64, [f64, v8i8, v4i16, v2i32, v1i64, v4f16, v2f32, v1f64, v4bf16]>;
+ defm _FPR128 : CoalescerBarrierPseudo<FPR128, [f128, v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64, v8bf16]>;
+}
+
+defm COALESCER_BARRIER : CoalescerBarriers; // Prefix + suffix yields COALESCER_BARRIER_FPR16/32/64/128, the opcode names the C++ hunks in this patch switch on.
+
} // End let Predicates = [HasSME]
// Pseudo to match to smstart/smstop. This expands:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index d611338fc268..992b11da7eee 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -233,15 +233,20 @@ static bool hasPossibleIncompatibleOps(const Function *F) {
bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
- SMEAttrs CallerAttrs(*Caller);
- SMEAttrs CalleeAttrs(*Callee);
+ SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);
+
+ // When inlining, we should consider the body of the function, not the
+ // interface.
+ if (CalleeAttrs.hasStreamingBody()) {
+ CalleeAttrs.set(SMEAttrs::SM_Compatible, false);
+ CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
+ }
+
if (CalleeAttrs.hasNewZABody())
return false;
if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
- (CallerAttrs.requiresSMChange(CalleeAttrs) &&
- (!CallerAttrs.hasStreamingInterfaceOrBody() ||
- !CalleeAttrs.hasStreamingBody()))) {
+ CallerAttrs.requiresSMChange(CalleeAttrs)) {
if (hasPossibleIncompatibleOps(Callee))
return false;
}
@@ -4062,4 +4067,4 @@ bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
cast<BranchInst>(I->getNextNode())->isUnconditional())
return true;
return BaseT::shouldTreatInstructionLikeSelect(I);
-} \ No newline at end of file
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index cb29d5d94759..250e3e350c02 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1506,6 +1506,7 @@ def FeatureISAVersion12 : FeatureSet<
FeatureFlatAtomicFaddF32Inst,
FeatureImageInsts,
FeatureExtendedImageInsts,
+ FeatureFP8ConversionInsts,
FeaturePackedTID,
FeatureVcmpxPermlaneHazard,
FeatureSALUFloatInsts,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index a19b03b92923..152f495a452b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -59,6 +59,30 @@ def gi_wmmaopselvop3pmods :
GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">,
GIComplexPatternEquiv<WMMAOpSelVOP3PMods>;
+def gi_wmmavisrc : // Each def below names a GlobalISel matcher function (the string) and the complex pattern it stands in for (the GIComplexPatternEquiv argument).
+ GIComplexOperandMatcher<s32, "selectWMMAVISrc">,
+ GIComplexPatternEquiv<WMMAVISrc>;
+
+def gi_wmmamods : // NOTE(review): unlike the two f16 siblings below, this name omits the F32NegAbs suffix of the pattern it maps.
+ GIComplexOperandMatcher<s32, "selectWMMAModsF32NegAbs">,
+ GIComplexPatternEquiv<WMMAModsF32NegAbs>;
+
+def gi_wmmamodsf16Neg :
+ GIComplexOperandMatcher<s32, "selectWMMAModsF16Neg">,
+ GIComplexPatternEquiv<WMMAModsF16Neg>;
+
+def gi_wmmamodsf16NegAbs :
+ GIComplexOperandMatcher<s32, "selectWMMAModsF16NegAbs">,
+ GIComplexPatternEquiv<WMMAModsF16NegAbs>;
+
+def gi_swmmacindex8 :
+ GIComplexOperandMatcher<s32, "selectSWMMACIndex8">,
+ GIComplexPatternEquiv<SWMMACIndex8>;
+
+def gi_swmmacindex16 :
+ GIComplexOperandMatcher<s32, "selectSWMMACIndex16">,
+ GIComplexPatternEquiv<SWMMACIndex16>;
+
def gi_vop3opselmods :
GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
GIComplexPatternEquiv<VOP3OpSelMods>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 4c35649cec6c..4f7bf3f7d35e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3048,6 +3048,336 @@ bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
return true;
}
+/// Build a REG_SEQUENCE machine node from 2, 4 or 8 values.
+///
+/// The result uses the VReg_64 / VReg_128 / VReg_256 register class and the
+/// v2i32 / v4i32 / v8i32 type respectively; Elts[N] is paired with the
+/// sub-register index returned by SIRegisterInfo::getSubRegFromChannel(N).
+/// Any other element count is unreachable.
+static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
+                                         llvm::SelectionDAG *CurDAG,
+                                         const SDLoc &DL) {
+  const unsigned NumElts = Elts.size();
+  unsigned RegClassID;
+  EVT ResultVT;
+  if (NumElts == 8) {
+    RegClassID = AMDGPU::VReg_256RegClassID;
+    ResultVT = MVT::v8i32;
+  } else if (NumElts == 4) {
+    RegClassID = AMDGPU::VReg_128RegClassID;
+    ResultVT = MVT::v4i32;
+  } else if (NumElts == 2) {
+    RegClassID = AMDGPU::VReg_64RegClassID;
+    ResultVT = MVT::v2i32;
+  } else {
+    llvm_unreachable("unhandled Reg sequence size");
+  }
+
+  // REG_SEQUENCE operands: the register class id first, then
+  // (value, sub-register index) pairs in channel order.
+  SmallVector<SDValue, 17> SeqOps;
+  SeqOps.push_back(CurDAG->getTargetConstant(RegClassID, DL, MVT::i32));
+  for (unsigned Chan = 0; Chan != NumElts; ++Chan) {
+    SeqOps.push_back(Elts[Chan]);
+    SeqOps.push_back(CurDAG->getTargetConstant(
+        SIRegisterInfo::getSubRegFromChannel(Chan), DL, MVT::i32));
+  }
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, ResultVT,
+                                SeqOps);
+}
+
+/// Build a REG_SEQUENCE from 8 or 16 16-bit values by first combining them
+/// pairwise into 32-bit values and then delegating to buildRegSequence32.
+///
+/// For each (even, odd) pair: when the odd element is the high-half extract
+/// of the same 32-bit value that the even element (looking through a bitcast
+/// and a low-half extract) comes from, that 32-bit value is reused as is.
+/// Otherwise the pair is combined with V_PERM_B32_e64 and the selector
+/// constant 0x05040100.
+static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
+                                         llvm::SelectionDAG *CurDAG,
+                                         const SDLoc &DL) {
+  assert((Elts.size() == 8 || Elts.size() == 16) &&
+         "unhandled Reg sequence size");
+
+  SmallVector<SDValue, 8> Packed32;
+  for (unsigned Idx = 0; Idx < Elts.size(); Idx += 2) {
+    SDValue LoOrigin = stripExtractLoElt(stripBitcast(Elts[Idx]));
+    SDValue HiOrigin;
+    if (isExtractHiElt(Elts[Idx + 1], HiOrigin) && LoOrigin == HiOrigin) {
+      // Both halves were unpacked from one 32-bit source: use it directly.
+      Packed32.push_back(HiOrigin);
+      continue;
+    }
+
+    // Unrelated halves: pack them with v_perm.
+    SDValue Selector = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
+    MachineSDNode *Perm =
+        CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
+                               {Elts[Idx + 1], Elts[Idx], Selector});
+    Packed32.push_back(SDValue(Perm, 0));
+  }
+
+  return buildRegSequence32(Packed32, CurDAG, DL);
+}
+
+/// Dispatch to buildRegSequence16 or buildRegSequence32 according to
+/// \p ElementSize (in bits). Any size other than 16 or 32 is unreachable.
+static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
+                                       llvm::SelectionDAG *CurDAG,
+                                       const SDLoc &DL, unsigned ElementSize) {
+  switch (ElementSize) {
+  case 16:
+    return buildRegSequence16(Elts, CurDAG, DL);
+  case 32:
+    return buildRegSequence32(Elts, CurDAG, DL);
+  default:
+    llvm_unreachable("Unhandled element size");
+  }
+}
+
+/// Fold a modifier that was stripped from every element of a vector operand
+/// into \p Mods and rebuild the operand in \p Src as a REG_SEQUENCE.
+///
+/// \p Elts holds the elements with \p ModOpcode (ISD::FNEG or ISD::FABS)
+/// already removed.
+///  - FABS:               sets NEG_HI, sequence built from \p Elts.
+///  - FNEG:               sets NEG, sequence built from \p Elts.
+///  - FNEG of FABS (all): sets NEG and NEG_HI, sequence built from the
+///                        operands of the FABS nodes.
+static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
+                                 SmallVectorImpl<SDValue> &Elts, SDValue &Src,
+                                 llvm::SelectionDAG *CurDAG, const SDLoc &DL,
+                                 unsigned ElementSize) {
+  if (ModOpcode != ISD::FNEG) {
+    assert(ModOpcode == ISD::FABS);
+    // Abs only.
+    Mods |= SISrcMods::NEG_HI;
+    Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
+    return;
+  }
+
+  Mods |= SISrcMods::NEG;
+
+  // Collect the FABS operands up to the first element that is not an FABS.
+  SmallVector<SDValue, 8> AbsOperands;
+  for (const SDValue &Elt : Elts) {
+    if (Elt.getOpcode() != ISD::FABS)
+      break;
+    AbsOperands.push_back(Elt.getOperand(0));
+  }
+
+  if (AbsOperands.size() == Elts.size()) {
+    // Every element was fneg(fabs(x)): neg and abs.
+    Mods |= SISrcMods::NEG_HI;
+    Src = SDValue(buildRegSequence(AbsOperands, CurDAG, DL, ElementSize), 0);
+  } else {
+    // Neg only.
+    Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
+  }
+}
+
+// Check all f16 elements for modifiers while looking through b32 and v2b16
+// build vector. \p ModifierCheck is called on every f16 element (bitcasts
+// stripped) in order; the whole traversal stops at the first operand that is
+// not a build vector or the first element the callback rejects, so the
+// callback is never invoked again after a failure. Callers detect a full
+// match by counting how many elements the callback accepted.
+static void
+checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
+                              std::function<bool(SDValue)> ModifierCheck) {
+  for (unsigned PairIdx = 0; PairIdx < BV->getNumOperands(); ++PairIdx) {
+    auto *F16Pair =
+        dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(PairIdx)));
+    // An operand that is not a build vector has no f16 elements to inspect,
+    // so a full match is impossible.
+    if (!F16Pair)
+      return;
+
+    for (unsigned EltIdx = 0; EltIdx < F16Pair->getNumOperands(); ++EltIdx) {
+      SDValue ElF16 = stripBitcast(F16Pair->getOperand(EltIdx));
+      // Stop everything, not just this pair: continuing would let later
+      // elements be recorded after a mismatch.
+      if (!ModifierCheck(ElF16))
+        return;
+    }
+  }
+}
+
+// Match a WMMA f16 vector source whose elements are all negated.
+// \p Src defaults to \p In and the modifier word always has OP_SEL_1 set.
+// If every element is an FNEG (either each f16 element, or each v2f16 pair),
+// \p Src is rebuilt as a REG_SEQUENCE of the un-negated values and NEG and
+// NEG_HI are added to the modifiers. Always returns true.
+bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ Src = In;
+ unsigned Mods = SISrcMods::OP_SEL_1;
+
+ // mods are on f16 elements
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
+ SmallVector<SDValue, 8> EltsF16;
+
+ // Record the FNEG operand of each f16 element; reject anything else.
+ checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
+ if (Element.getOpcode() != ISD::FNEG)
+ return false;
+ EltsF16.push_back(Element.getOperand(0));
+ return true;
+ });
+
+ // All elements have neg modifier
+ // (the count check assumes two f16 elements per build-vector operand)
+ if (BV->getNumOperands() * 2 == EltsF16.size()) {
+ Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
+ Mods |= SISrcMods::NEG;
+ Mods |= SISrcMods::NEG_HI;
+ }
+ }
+
+ // mods are on v2f16 elements
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
+ SmallVector<SDValue, 8> EltsV2F16;
+ for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
+ SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
+ // Only neg is matched here; stop at the first operand that is not an FNEG
+ if (ElV2f16.getOpcode() != ISD::FNEG)
+ break;
+ EltsV2F16.push_back(ElV2f16.getOperand(0));
+ }
+
+ // All pairs of elements have neg modifier
+ if (BV->getNumOperands() == EltsV2F16.size()) {
+ Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
+ Mods |= SISrcMods::NEG;
+ Mods |= SISrcMods::NEG_HI;
+ }
+ }
+
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
+// Match a WMMA f16 vector source whose elements all carry the same modifier,
+// FNEG or FABS. \p Src defaults to \p In and the modifier word always has
+// OP_SEL_1 set. The first element inspected decides which opcode is matched
+// (FNEG if it is an FNEG, otherwise FABS). When every f16 element, or every
+// v2f16 pair, has that opcode, selectWMMAModsNegAbs rewrites \p Src and adds
+// the NEG / NEG_HI bits. Always returns true.
+bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ Src = In;
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ // Only assigned once a first element has been inspected.
+ unsigned ModOpcode;
+
+ // mods are on f16 elements
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
+ SmallVector<SDValue, 8> EltsF16;
+ checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
+ // Based on first element decide which mod we match, neg or abs
+ if (EltsF16.empty())
+ ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
+ if (ElF16.getOpcode() != ModOpcode)
+ return false;
+ EltsF16.push_back(ElF16.getOperand(0));
+ return true;
+ });
+
+ // All elements have ModOpcode modifier
+ // (the count check assumes two f16 elements per build-vector operand)
+ if (BV->getNumOperands() * 2 == EltsF16.size())
+ selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
+ 16);
+ }
+
+ // mods are on v2f16 elements
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
+ SmallVector<SDValue, 8> EltsV2F16;
+
+ for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
+ SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
+ // Based on first element decide which mod we match, neg or abs
+ if (EltsV2F16.empty())
+ ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
+ if (ElV2f16->getOpcode() != ModOpcode)
+ break;
+ EltsV2F16.push_back(ElV2f16->getOperand(0));
+ }
+
+ // All elements have ModOpcode modifier
+ if (BV->getNumOperands() == EltsV2F16.size())
+ selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
+ 32);
+ }
+
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
+/// Match a WMMA f32 vector source whose elements all carry the same modifier.
+///
+/// \p Src defaults to \p In and the emitted modifier word always contains
+/// OP_SEL_1. When \p In (looking through a bitcast) is a build_vector and
+/// every operand has the opcode chosen by the first operand (FNEG if it is
+/// an FNEG, FABS otherwise), selectWMMAModsNegAbs rewrites \p Src and adds
+/// the matching NEG / NEG_HI bits. Always returns true.
+bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
+                                                 SDValue &SrcMods) const {
+  const SDLoc DL(In);
+  unsigned Mods = SISrcMods::OP_SEL_1;
+  unsigned ModOpcode;
+  SmallVector<SDValue, 8> Stripped;
+  Src = In;
+
+  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
+    const unsigned NumOps = BV->getNumOperands();
+    for (unsigned Idx = 0; Idx != NumOps; ++Idx) {
+      SDValue Elt = stripBitcast(BV->getOperand(Idx));
+      const unsigned EltOpc = Elt.getOpcode();
+      // The first element picks the modifier to look for.
+      if (Stripped.empty())
+        ModOpcode = (EltOpc == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
+      if (EltOpc != ModOpcode)
+        break;
+      Stripped.push_back(Elt.getOperand(0));
+    }
+
+    // Fold only when no element was left out.
+    if (Stripped.size() == NumOps)
+      selectWMMAModsNegAbs(ModOpcode, Mods, Stripped, Src, CurDAG, DL, 32);
+  }
+
+  SrcMods = CurDAG->getTargetConstant(Mods, DL, MVT::i32);
+  return true;
+}
+
+/// Try to select \p In as an inline-constant WMMA source.
+///
+/// Two shapes are recognised:
+///  - \p In is a build_vector splat of a constant accepted by
+///    isInlineImmediate(); its bits are emitted as an i32 target constant.
+///  - "16 bit splat": \p In (through a bitcast) is a build_vector splat whose
+///    splat value (through a bitcast) is again a build_vector splat of a
+///    constant. A floating-point constant accepted by isInlineImmediate() is
+///    emitted as an i16 target constant. An integer constant is treated as
+///    bf16 bits: it is widened to the f32 bit pattern (bits << 16) and
+///    emitted as i32 when that pattern is an inline constant.
+///
+/// \returns true with \p Src set on a match; false (with \p Src untouched)
+/// otherwise.
+bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
+  const SDLoc DL(In);
+
+  if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
+    BitVector UndefElements;
+    if (SDValue Splat = BV->getSplatValue(&UndefElements))
+      if (isInlineImmediate(Splat.getNode())) {
+        if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
+          unsigned Imm = C->getAPIntValue().getSExtValue();
+          Src = CurDAG->getTargetConstant(Imm, DL, MVT::i32);
+          return true;
+        }
+        if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
+          unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
+          Src = CurDAG->getTargetConstant(Imm, DL, MVT::i32);
+          return true;
+        }
+        llvm_unreachable("unhandled Constant node");
+      }
+  }
+
+  // 16 bit splat
+  auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In));
+  if (!SplatSrc32BV)
+    return false;
+  SDValue Splat32 = SplatSrc32BV->getSplatValue();
+  if (!Splat32)
+    return false;
+  auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(stripBitcast(Splat32));
+  if (!SplatSrc16BV)
+    return false;
+  SDValue Splat = SplatSrc16BV->getSplatValue();
+  if (!Splat)
+    return false;
+
+  // f16. Only a floating-point constant may take this path: the previous
+  // code dereferenced the result of dyn_cast<ConstantFPSDNode> unchecked,
+  // which is a null dereference for an integer splat that passes
+  // isInlineImmediate().
+  if (const auto *FPC = dyn_cast<ConstantFPSDNode>(Splat)) {
+    if (!isInlineImmediate(Splat.getNode()))
+      return false;
+    int64_t Imm = FPC->getValueAPF().bitcastToAPInt().getSExtValue();
+    Src = CurDAG->getTargetConstant(Imm, DL, MVT::i16);
+    return true;
+  }
+
+  // bf16
+  if (const auto *IntC = dyn_cast<ConstantSDNode>(Splat)) {
+    const SIInstrInfo *TII = Subtarget->getInstrInfo();
+    APInt BF16Value = IntC->getAPIntValue();
+    APInt F32Value = BF16Value.zext(32).shl(16);
+    if (TII->isInlineConstant(F32Value)) {
+      int64_t Imm = F32Value.getSExtValue();
+      Src = CurDAG->getTargetConstant(Imm, DL, MVT::i32);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// Select the index operand of an SWMMAC instruction with an 8-bit index key.
+///
+/// By default \p Src is \p In and the key is 0. When \p In is an SRL of a
+/// 32-bit value by a constant multiple of 8, \p Src becomes the shifted
+/// value and the key becomes the shift amount divided by 8. Always returns
+/// true.
+bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
+                                            SDValue &IndexKey) const {
+  const SDLoc DL(In);
+  unsigned Key = 0;
+  Src = In;
+
+  if (In.getOpcode() == ISD::SRL) {
+    const llvm::SDValue &Shifted = In.getOperand(0);
+    const auto *Amount = dyn_cast<ConstantSDNode>(In.getOperand(1));
+    const bool Is32Bit = Shifted.getValueType().getSizeInBits() == 32;
+    if (Is32Bit && Amount && Amount->getZExtValue() % 8 == 0) {
+      Key = Amount->getZExtValue() / 8;
+      Src = Shifted;
+    }
+  }
+
+  IndexKey = CurDAG->getTargetConstant(Key, DL, MVT::i32);
+  return true;
+}
+
+/// Select the index operand of an SWMMAC instruction with a 16-bit index key.
+///
+/// By default \p Src is \p In and the key is 0. When \p In is an SRL of a
+/// 32-bit value by the constant 16, \p Src becomes the shifted value and the
+/// key becomes 1. Always returns true.
+bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
+                                             SDValue &IndexKey) const {
+  const SDLoc DL(In);
+  unsigned Key = 0;
+  Src = In;
+
+  if (In.getOpcode() == ISD::SRL) {
+    const llvm::SDValue &Shifted = In.getOperand(0);
+    const auto *Amount = dyn_cast<ConstantSDNode>(In.getOperand(1));
+    const bool Is32Bit = Shifted.getValueType().getSizeInBits() == 32;
+    if (Is32Bit && Amount && Amount->getZExtValue() == 16) {
+      Key = 1;
+      Src = Shifted;
+    }
+  }
+
+  IndexKey = CurDAG->getTargetConstant(Key, DL, MVT::i32);
+  return true;
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
Src = In;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 8645490f0b16..3b42d88df0c2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -240,6 +240,16 @@ private:
bool SelectVOP3PModsNeg(SDValue In, SDValue &Src) const;
bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const;
+ bool SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const;
+ bool SelectWMMAModsF16Neg(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const;
+ bool SelectWMMAVISrc(SDValue In, SDValue &Src) const;
+
+ bool SelectSWMMACIndex8(SDValue In, SDValue &Src, SDValue &IndexKey) const;
+ bool SelectSWMMACIndex16(SDValue In, SDValue &Src, SDValue &IndexKey) const;
+
bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 55d95154c758..2af53a664ff1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -577,6 +577,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});
setMaxAtomicSizeInBitsSupported(64);
+ setMaxDivRemBitWidthSupported(64);
}
bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index fdee74d58d26..f255d098b631 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3956,6 +3956,219 @@ AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
}};
}
+/// Emit a REG_SEQUENCE combining 2, 4 or 8 registers and return its result.
+///
+/// The instruction is created through a MachineIRBuilder constructed on
+/// \p InsertPt. The destination is a new virtual register of class
+/// VReg_64 / VReg_128 / VReg_256; Elts[N] is paired with the sub-register
+/// index returned by SIRegisterInfo::getSubRegFromChannel(N). Any other
+/// element count is unreachable.
+static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
+                                 MachineInstr *InsertPt,
+                                 MachineRegisterInfo &MRI) {
+  const unsigned NumElts = Elts.size();
+  const TargetRegisterClass *TupleRC;
+  if (NumElts == 8)
+    TupleRC = &AMDGPU::VReg_256RegClass;
+  else if (NumElts == 4)
+    TupleRC = &AMDGPU::VReg_128RegClass;
+  else if (NumElts == 2)
+    TupleRC = &AMDGPU::VReg_64RegClass;
+  else
+    llvm_unreachable("unhandled Reg sequence size");
+
+  MachineIRBuilder Builder(*InsertPt);
+  auto Seq = Builder.buildInstr(AMDGPU::REG_SEQUENCE)
+                 .addDef(MRI.createVirtualRegister(TupleRC));
+  for (unsigned Chan = 0; Chan != NumElts; ++Chan) {
+    Seq.addReg(Elts[Chan]);
+    Seq.addImm(SIRegisterInfo::getSubRegFromChannel(Chan));
+  }
+  return Seq->getOperand(0).getReg();
+}
+
+/// Fold a modifier that was stripped from every element of a vector operand
+/// into \p Mods and rebuild the operand in \p Src as a REG_SEQUENCE.
+///
+/// \p Elts holds the element registers with \p ModOpcode (G_FNEG or G_FABS)
+/// already removed.
+///  - G_FABS:                 sets NEG_HI, sequence built from \p Elts.
+///  - G_FNEG:                 sets NEG, sequence built from \p Elts.
+///  - G_FNEG of G_FABS (all): sets NEG and NEG_HI, sequence built from the
+///                            sources of the G_FABS instructions.
+static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
+                                 SmallVectorImpl<Register> &Elts, Register &Src,
+                                 MachineInstr *InsertPt,
+                                 MachineRegisterInfo &MRI) {
+  if (ModOpcode != TargetOpcode::G_FNEG) {
+    assert(ModOpcode == TargetOpcode::G_FABS);
+    // Abs only.
+    Mods |= SISrcMods::NEG_HI;
+    Src = buildRegSequence(Elts, InsertPt, MRI);
+    return;
+  }
+
+  Mods |= SISrcMods::NEG;
+
+  // Collect the G_FABS sources up to the first element that is not a G_FABS.
+  SmallVector<Register, 8> AbsSources;
+  for (Register Elt : Elts) {
+    Register AbsSrc;
+    if (!mi_match(Elt, MRI, m_GFabs(m_Reg(AbsSrc))))
+      break;
+    AbsSources.push_back(AbsSrc);
+  }
+
+  if (AbsSources.size() == Elts.size()) {
+    // Every element was fneg(fabs(x)): neg and abs.
+    Mods |= SISrcMods::NEG_HI;
+    Src = buildRegSequence(AbsSources, InsertPt, MRI);
+  } else {
+    // Neg only.
+    Src = buildRegSequence(Elts, InsertPt, MRI);
+  }
+}
+
+/// GlobalISel matcher for a WMMA f32 vector source with a uniform modifier.
+///
+/// The rendered source defaults to the root register and the rendered
+/// modifier word always contains OP_SEL_1. When the root is defined by a
+/// G_BUILD_VECTOR and every source is defined by the opcode chosen by the
+/// first source (G_FNEG if it is one, G_FABS otherwise),
+/// selectWMMAModsNegAbs rewrites the source register and adds the matching
+/// NEG / NEG_HI bits.
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
+  Register Src = Root.getReg();
+  unsigned Mods = SISrcMods::OP_SEL_1;
+  unsigned ModOpcode;
+  SmallVector<Register, 8> Stripped;
+
+  if (auto *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
+    const unsigned NumSrcs = BV->getNumSources();
+    for (unsigned Idx = 0; Idx != NumSrcs; ++Idx) {
+      MachineInstr *EltDef = MRI->getVRegDef(BV->getSourceReg(Idx));
+      const unsigned EltOpc = EltDef->getOpcode();
+      // The first element picks the modifier to look for.
+      if (Stripped.empty())
+        ModOpcode =
+            (EltOpc == AMDGPU::G_FNEG) ? AMDGPU::G_FNEG : AMDGPU::G_FABS;
+      if (EltOpc != ModOpcode)
+        break;
+      Stripped.push_back(EltDef->getOperand(1).getReg());
+    }
+
+    // Fold only when no element was left out.
+    if (Stripped.size() == NumSrcs)
+      selectWMMAModsNegAbs(ModOpcode, Mods, Stripped, Src, Root.getParent(),
+                           *MRI);
+  }
+
+  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+           [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
+}
+
+/// GlobalISel matcher for a WMMA f16 vector source that is fully negated.
+///
+/// The rendered source defaults to the root register and the rendered
+/// modifier word always contains OP_SEL_1. When the root is defined by a
+/// G_CONCAT_VECTORS whose sources are all G_FNEG results, the source is
+/// replaced by a REG_SEQUENCE of the un-negated registers and NEG and NEG_HI
+/// are added.
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
+  Register Src = Root.getReg();
+  unsigned Mods = SISrcMods::OP_SEL_1;
+  SmallVector<Register, 8> Unnegated;
+
+  if (auto *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
+    const unsigned NumSrcs = CV->getNumSources();
+    for (unsigned Idx = 0; Idx != NumSrcs; ++Idx) {
+      Register NegOperand;
+      if (!mi_match(CV->getSourceReg(Idx), *MRI, m_GFNeg(m_Reg(NegOperand))))
+        break;
+      Unnegated.push_back(NegOperand);
+    }
+
+    // Fold only when every source was a G_FNEG.
+    if (Unnegated.size() == NumSrcs) {
+      Mods |= SISrcMods::NEG;
+      Mods |= SISrcMods::NEG_HI;
+      Src = buildRegSequence(Unnegated, Root.getParent(), *MRI);
+    }
+  }
+
+  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+           [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
+}
+
+/// GlobalISel matcher for a WMMA f16 vector source with a uniform modifier.
+///
+/// The rendered source defaults to the root register and the rendered
+/// modifier word always contains OP_SEL_1. When the root is defined by a
+/// G_CONCAT_VECTORS and every source is defined by the opcode chosen by the
+/// first source (G_FNEG if it is one, G_FABS otherwise),
+/// selectWMMAModsNegAbs rewrites the source register and adds the matching
+/// NEG / NEG_HI bits.
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
+  Register Src = Root.getReg();
+  unsigned Mods = SISrcMods::OP_SEL_1;
+  unsigned ModOpcode;
+  SmallVector<Register, 8> EltsV2F16;
+
+  if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
+    for (unsigned i = 0; i < CV->getNumSources(); ++i) {
+      MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
+      // Based on first element decide which mod we match, neg or abs
+      if (EltsV2F16.empty())
+        ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG) ? AMDGPU::G_FNEG
+                                                             : AMDGPU::G_FABS;
+      if (ElV2F16->getOpcode() != ModOpcode)
+        break;
+      EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
+    }
+
+    // All elements had ModOpcode modifier. The helper creates its own
+    // builder from the insertion instruction, so none is needed here.
+    if (CV->getNumSources() == EltsV2F16.size()) {
+      selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
+                           *MRI);
+    }
+  }
+
+  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+           [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
+}
+
+// GlobalISel matcher that renders a constant (or constant splat) WMMA source
+// as an immediate. A floating-point constant/splat is rendered from its bit
+// pattern when TII.isInlineConstant accepts it and otherwise fails the match
+// outright. An integer constant/splat is rendered when TII.isInlineConstant
+// accepts it. Any other root yields an empty result (no match).
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
+ std::optional<FPValueAndVReg> FPValReg;
+ if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
+ if (TII.isInlineConstant(FPValReg->Value.bitcastToAPInt())) {
+ // FPValReg is captured by copy, so the renderer stays valid after return.
+ return {{[=](MachineInstrBuilder &MIB) {
+ MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
+ }}};
+ }
+ // Non-inlineable splat floats should not fall-through for integer immediate
+ // checks.
+ return {};
+ }
+
+ APInt ICst;
+ if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
+ if (TII.isInlineConstant(ICst)) {
+ return {
+ {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
+ }
+ }
+
+ // Not a constant, or not an inline constant.
+ return {};
+}
+
+/// GlobalISel matcher for the SWMMAC index operand with an 8-bit index key.
+///
+/// The source starts as the register defined by the root's defining
+/// instruction (copies ignored) and the key as 0. When that register is a
+/// G_LSHR of a 32-bit value by a constant multiple of 8, the source becomes
+/// the shifted value and the key becomes the shift amount divided by 8.
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
+  Register Src =
+      getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
+  unsigned Key = 0;
+
+  Register Shifted;
+  std::optional<ValueAndVReg> Amount;
+  const bool IsConstShift =
+      mi_match(Src, *MRI, m_GLShr(m_Reg(Shifted), m_GCst(Amount)));
+  if (IsConstShift && MRI->getType(Shifted).getSizeInBits() == 32 &&
+      Amount->Value.getZExtValue() % 8 == 0) {
+    Key = Amount->Value.getZExtValue() / 8;
+    Src = Shifted;
+  }
+
+  return {{
+      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
+  }};
+}
+
+/// GlobalISel matcher for the SWMMAC index operand with a 16-bit index key.
+///
+/// The source starts as the register defined by the root's defining
+/// instruction (copies ignored) and the key as 0. When that register is a
+/// G_LSHR of a 32-bit value by the constant 16, the source becomes the
+/// shifted value and the key becomes 1.
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
+  Register Src =
+      getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
+  unsigned Key = 0;
+
+  Register Shifted;
+  std::optional<ValueAndVReg> Amount;
+  const bool IsConstShift =
+      mi_match(Src, *MRI, m_GLShr(m_Reg(Shifted), m_GCst(Amount)));
+  if (IsConstShift && MRI->getType(Shifted).getSizeInBits() == 32 &&
+      Amount->Value.getZExtValue() == 16) {
+    Key = 1;
+    Src = Shifted;
+  }
+
+  return {{
+      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
+  }};
+}
+
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
Register Src;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 12ea46c2895b..ef7630f137ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -200,6 +200,19 @@ private:
selectWMMAOpSelVOP3PMods(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
+ selectWMMAModsF32NegAbs(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectWMMAModsF16Neg(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectWMMAModsF16NegAbs(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectWMMAVISrc(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectSWMMACIndex8(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectSWMMACIndex16(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
selectVOP3OpSelMods(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 8e74d4c0e945..17ffb7ec988f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4178,10 +4178,45 @@ bool AMDGPULegalizerInfo::loadInputValue(
Register DstReg, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
- const ArgDescriptor *Arg;
+ const ArgDescriptor *Arg = nullptr;
const TargetRegisterClass *ArgRC;
LLT ArgTy;
- std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
+
+ CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
+ const ArgDescriptor WorkGroupIDX =
+ ArgDescriptor::createRegister(AMDGPU::TTMP9);
+ // If GridZ is not programmed in an entry function then the hardware will set
+ // it to all zeros, so there is no need to mask the GridY value in the low
+ // order bits.
+ const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
+ AMDGPU::TTMP7,
+ AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
+ const ArgDescriptor WorkGroupIDZ =
+ ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
+ if (ST.hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) {
+ switch (ArgType) {
+ case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
+ Arg = &WorkGroupIDX;
+ ArgRC = &AMDGPU::SReg_32RegClass;
+ ArgTy = LLT::scalar(32);
+ break;
+ case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
+ Arg = &WorkGroupIDY;
+ ArgRC = &AMDGPU::SReg_32RegClass;
+ ArgTy = LLT::scalar(32);
+ break;
+ case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
+ Arg = &WorkGroupIDZ;
+ ArgRC = &AMDGPU::SReg_32RegClass;
+ ArgTy = LLT::scalar(32);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!Arg)
+ std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
if (!Arg) {
if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
@@ -6848,6 +6883,21 @@ bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
return true;
}
+/// Legalize the wave-id intrinsic by extracting bits [29:25] of TTMP8.
+///
+/// Fails (returns false) when the subtarget has no architected SGPRs.
+/// Otherwise emits a copy of TTMP8 and an unsigned bitfield extract
+/// (offset 25, width 5) into the intrinsic's result register, erases \p MI
+/// and returns true.
+bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
+                                         MachineIRBuilder &B) const {
+  // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
+  if (!ST.hasArchitectedSGPRs())
+    return false;
+
+  const LLT S32 = LLT::scalar(32);
+  const Register Result = MI.getOperand(0).getReg();
+  auto Ttmp8Copy = B.buildCopy(S32, Register(AMDGPU::TTMP8));
+  auto FieldOffset = B.buildConstant(S32, 25);
+  auto FieldWidth = B.buildConstant(S32, 5);
+  B.buildUbfx(Result, Ttmp8Copy, FieldOffset, FieldWidth);
+  MI.eraseFromParent();
+  return true;
+}
+
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI) const {
MachineIRBuilder &B = Helper.MIRBuilder;
@@ -6970,6 +7020,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_workgroup_id_z:
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+ case Intrinsic::amdgcn_wave_id:
+ return legalizeWaveID(MI, B);
case Intrinsic::amdgcn_lds_kernel_id:
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
@@ -7134,6 +7186,29 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
case Intrinsic::amdgcn_image_bvh_intersect_ray:
return legalizeBVHIntrinsic(MI, B);
+ case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
+ case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
+ Register Index = MI.getOperand(5).getReg();
+ LLT S32 = LLT::scalar(32);
+ if (MRI.getType(Index) != S32)
+ MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
+ return true;
+ }
+ case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
+ case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
+ case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
+ Register Index = MI.getOperand(7).getReg();
+ LLT S32 = LLT::scalar(32);
+ if (MRI.getType(Index) != S32)
+ MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
+ return true;
+ }
case Intrinsic::amdgcn_fmed3: {
GISelChangeObserver &Observer = Helper.Observer;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 56aabd4f6ab7..ecbe42681c66 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -212,6 +212,7 @@ public:
bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const;
+ bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeImageIntrinsic(
MachineInstr &MI, MachineIRBuilder &B,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 5e73411cae9b..c1b244f50d93 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -521,10 +521,18 @@ static Value *promoteAllocaUserToVector(
// For memset, we don't need to know the previous value because we
// currently only allow memsets that cover the whole alloca.
Value *Elt = MSI->getOperand(1);
- if (DL.getTypeStoreSize(VecEltTy) > 1) {
- Value *EltBytes =
- Builder.CreateVectorSplat(DL.getTypeStoreSize(VecEltTy), Elt);
- Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
+ const unsigned BytesPerElt = DL.getTypeStoreSize(VecEltTy);
+ if (BytesPerElt > 1) {
+ Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt);
+
+ // If the element type of the vector is a pointer, we need to first cast
+ // to an integer, then use a PtrCast.
+ if (VecEltTy->isPointerTy()) {
+ Type *PtrInt = Builder.getIntNTy(BytesPerElt * 8);
+ Elt = Builder.CreateBitCast(EltBytes, PtrInt);
+ Elt = Builder.CreateIntToPtr(Elt, VecEltTy);
+ } else
+ Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
}
return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index bdd4e891f158..09fac963d222 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4505,6 +4505,22 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8:
+ case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
+ case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
+ case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
+ case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
+ case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 67263f23b983..bb1c6b733729 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -414,6 +414,22 @@ def : SourceOfDivergence<int_amdgcn_wmma_f16_16x16x16_f16>;
def : SourceOfDivergence<int_amdgcn_wmma_bf16_16x16x16_bf16>;
def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu8>;
def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu4>;
+def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_fp8_fp8>;
+def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_fp8_bf8>;
+def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf8_fp8>;
+def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf8_bf8>;
+def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x32_iu4>;
+def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_f16>;
+def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf16>;
+def : SourceOfDivergence<int_amdgcn_swmmac_f16_16x16x32_f16>;
+def : SourceOfDivergence<int_amdgcn_swmmac_bf16_16x16x32_bf16>;
+def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x32_iu8>;
+def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x32_iu4>;
+def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x64_iu4>;
+def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_fp8_fp8>;
+def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_fp8_bf8>;
+def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf8_fp8>;
+def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf8_bf8>;
def : SourceOfDivergence<int_amdgcn_global_load_tr>;
// The dummy boolean output is divergent from the IR's perspective,
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 489cf85693ed..9ab657f4e7bb 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -151,6 +151,8 @@ public:
ImmTyOpSelHi,
ImmTyNegLo,
ImmTyNegHi,
+ ImmTyIndexKey8bit,
+ ImmTyIndexKey16bit,
ImmTyDPP8,
ImmTyDppCtrl,
ImmTyDppRowMask,
@@ -383,6 +385,8 @@ public:
bool isGDS() const { return isImmTy(ImmTyGDS); }
bool isLDS() const { return isImmTy(ImmTyLDS); }
bool isCPol() const { return isImmTy(ImmTyCPol); }
+ // True when this operand is an immediate of type ImmTyIndexKey8bit.
+ bool isIndexKey8bit() const { return isImmTy(ImmTyIndexKey8bit); }
+ // True when this operand is an immediate of type ImmTyIndexKey16bit.
+ bool isIndexKey16bit() const { return isImmTy(ImmTyIndexKey16bit); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<7>(getImm()); }
bool isDppBankMask() const { return isImmTy(ImmTyDppBankMask); }
@@ -656,6 +660,14 @@ public:
return isVISrcF16() || isVISrcB32();
}
+ // Operand predicate: forwards to isRegOrInlineNoMods with the VReg_64
+ // register class and the f16 type.
+ // NOTE(review): presumably accepts a VReg_64 register or an f16 inline
+ // constant without source modifiers (inferred from the helper's name) - confirm.
+ bool isVISrc_64F16() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::f16);
+ }
+
+ // Operand predicate: forwards to isRegOrInlineNoMods with the VReg_64
+ // register class and the i32 type.
+ // NOTE(review): presumably accepts a VReg_64 register or an i32 inline
+ // constant without source modifiers (inferred from the helper's name) - confirm.
+ bool isVISrc_64B32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i32);
+ }
+
bool isVISrc_64B64() const {
return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i64);
}
@@ -672,6 +684,14 @@ public:
return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i32);
}
+ // Operand predicate: forwards to isRegOrInlineNoMods with the VReg_256
+ // register class and the i32 type.
+ // NOTE(review): presumably accepts a VReg_256 register or an i32 inline
+ // constant without source modifiers (inferred from the helper's name) - confirm.
+ bool isVISrc_256B32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i32);
+ }
+
+ // Operand predicate: forwards to isRegOrInlineNoMods with the VReg_256
+ // register class and the f32 type.
+ // NOTE(review): presumably accepts a VReg_256 register or an f32 inline
+ // constant without source modifiers (inferred from the helper's name) - confirm.
+ bool isVISrc_256F32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f32);
+ }
+
bool isVISrc_256B64() const {
return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i64);
}
@@ -1047,6 +1067,8 @@ public:
case ImmTyOffset1: OS << "Offset1"; break;
case ImmTySMEMOffsetMod: OS << "SMEMOffsetMod"; break;
case ImmTyCPol: OS << "CPol"; break;
+ case ImmTyIndexKey8bit: OS << "index_key"; break;
+ case ImmTyIndexKey16bit: OS << "index_key"; break;
case ImmTyTFE: OS << "TFE"; break;
case ImmTyD16: OS << "D16"; break;
case ImmTyFORMAT: OS << "FORMAT"; break;
@@ -1604,6 +1626,11 @@ public:
ParseStatus parseRegWithFPInputMods(OperandVector &Operands);
ParseStatus parseRegWithIntInputMods(OperandVector &Operands);
ParseStatus parseVReg32OrOff(OperandVector &Operands);
+ ParseStatus tryParseIndexKey(OperandVector &Operands,
+ AMDGPUOperand::ImmTy ImmTy);
+ ParseStatus parseIndexKey8bit(OperandVector &Operands);
+ ParseStatus parseIndexKey16bit(OperandVector &Operands);
+
ParseStatus parseDfmtNfmt(int64_t &Format);
ParseStatus parseUfmt(int64_t &Format);
ParseStatus parseSymbolicSplitFormat(StringRef FormatStr, SMLoc Loc,
@@ -1784,6 +1811,8 @@ public:
void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3P(MCInst &Inst, const OperandVector &Operands);
+ void cvtSWMMAC(MCInst &Inst, const OperandVector &Operands);
+
void cvtVOPD(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands,
OptionalImmIndexMap &OptionalIdx);
@@ -3500,6 +3529,9 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) {
return !isInlineConstant(Inst, OpIdx);
} else if (MO.isReg()) {
auto Reg = MO.getReg();
+ if (!Reg) {
+ return false;
+ }
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
auto PReg = mc2PseudoReg(Reg);
return isSGPR(PReg, TRI) && PReg != SGPR_NULL;
@@ -4364,7 +4396,11 @@ bool AMDGPUAsmParser::validateNeg(const MCInst &Inst, int OpName) {
uint64_t TSFlags = MII.get(Opc).TSFlags;
// v_dot4 fp8/bf8 neg_lo/neg_hi not allowed on src0 and src1 (allowed on src2)
- if (!(TSFlags & SIInstrFlags::IsDOT))
+ // v_wmma iu4/iu8 neg_lo not allowed on src2 (allowed on src0, src1)
+ // v_swmmac f16/bf16 neg_lo/neg_hi not allowed on src2 (allowed on src0, src1)
+ // other wmma/swmmac instructions don't have neg_lo/neg_hi operand.
+ if (!(TSFlags & SIInstrFlags::IsDOT) && !(TSFlags & SIInstrFlags::IsWMMA) &&
+ !(TSFlags & SIInstrFlags::IsSWMMAC))
return true;
int NegIdx = AMDGPU::getNamedOperandIdx(Opc, OpName);
@@ -6465,6 +6501,33 @@ bool AMDGPUAsmParser::tryParseFmt(const char *Pref,
return true;
}
+// Parses an "index_key:<n>" operand via parseIntWithPrefix and, on success,
+// appends it to Operands as an immediate of kind ImmTy. Any non-success
+// status from parseIntWithPrefix is returned unchanged. The value must lie in
+// 0..1 for ImmTyIndexKey16bit and in 0..3 for ImmTyIndexKey8bit; otherwise an
+// "out of range" error is reported at the location captured before parsing.
+ParseStatus AMDGPUAsmParser::tryParseIndexKey(OperandVector &Operands,
+                                              AMDGPUOperand::ImmTy ImmTy) {
+  const char *Prefix = "index_key";
+  int64_t Value = 0;
+  SMLoc StartLoc = getLoc();
+  auto Status = parseIntWithPrefix(Prefix, Value);
+  if (!Status.isSuccess())
+    return Status;
+
+  const bool Is16bit = ImmTy == AMDGPUOperand::ImmTyIndexKey16bit;
+  const bool Is8bit = ImmTy == AMDGPUOperand::ImmTyIndexKey8bit;
+  // Only the two index-key operand kinds are range-checked; any other ImmTy
+  // is passed through as parsed.
+  if (Is16bit || Is8bit) {
+    const int64_t MaxValue = Is16bit ? 1 : 3;
+    if (Value < 0 || Value > MaxValue)
+      return Error(StartLoc, Twine("out of range ", StringRef(Prefix)));
+  }
+
+  Operands.push_back(AMDGPUOperand::CreateImm(this, Value, StartLoc, ImmTy));
+  return ParseStatus::Success;
+}
+
+// Parses an index_key operand as an ImmTyIndexKey8bit immediate.
+ParseStatus AMDGPUAsmParser::parseIndexKey8bit(OperandVector &Operands) {
+  const auto Kind = AMDGPUOperand::ImmTyIndexKey8bit;
+  return tryParseIndexKey(Operands, Kind);
+}
+
+// Parses an index_key operand as an ImmTyIndexKey16bit immediate.
+ParseStatus AMDGPUAsmParser::parseIndexKey16bit(OperandVector &Operands) {
+  const auto Kind = AMDGPUOperand::ImmTyIndexKey16bit;
+  return tryParseIndexKey(Operands, Kind);
+}
+
// dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their
// values to live in a joint format operand in the MCInst encoding.
ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) {
@@ -8303,12 +8366,20 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0;
if (Opc == AMDGPU::V_CVT_SR_BF8_F32_vi ||
- Opc == AMDGPU::V_CVT_SR_FP8_F32_vi) {
+ Opc == AMDGPU::V_CVT_SR_FP8_F32_vi ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_gfx12) {
Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods
Inst.addOperand(Inst.getOperand(0));
}
- if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in)) {
+ // Adding vdst_in operand is already covered for these DPP instructions in
+ // cvtVOP3DPP.
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in) &&
+ !(Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp_gfx12 ||
+ Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp_gfx12 ||
+ Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp8_gfx12 ||
+ Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12)) {
assert(!IsPacked);
Inst.addOperand(Inst.getOperand(0));
}
@@ -8329,10 +8400,12 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
}
int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo);
- if (NegLoIdx != -1) {
+ if (NegLoIdx != -1)
addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo);
+
+ int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi);
+ if (NegHiIdx != -1)
addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegHi);
- }
const int Ops[] = { AMDGPU::OpName::src0,
AMDGPU::OpName::src1,
@@ -8352,11 +8425,11 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
if (OpSelHiIdx != -1)
OpSelHi = Inst.getOperand(OpSelHiIdx).getImm();
- if (NegLoIdx != -1) {
- int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi);
+ if (NegLoIdx != -1)
NegLo = Inst.getOperand(NegLoIdx).getImm();
+
+ if (NegHiIdx != -1)
NegHi = Inst.getOperand(NegHiIdx).getImm();
- }
for (int J = 0; J < 3; ++J) {
int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]);
@@ -8392,6 +8465,43 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) {
cvtVOP3P(Inst, Operands, OptIdx);
}
+// Appends parsed operand `i` to Inst. If opcode Opc has the named operand
+// OpName (a srcN_modifiers slot), the operand is added as a modifiers+source
+// pair; otherwise it is added as a plain register operand.
+static void addSrcModifiersAndSrc(MCInst &Inst, const OperandVector &Operands,
+                                  unsigned i, unsigned Opc, unsigned OpName) {
+  const bool HasModifiers = AMDGPU::getNamedOperandIdx(Opc, OpName) != -1;
+  auto &Src = static_cast<AMDGPUOperand &>(*Operands[i]);
+  if (!HasModifiers) {
+    Src.addRegOperands(Inst, 1);
+    return;
+  }
+  Src.addRegOrImmWithFPInputModsOperands(Inst, 2);
+}
+
+// Converts parsed SWMMAC operands into MCInst operands. Parsed operands 1-4
+// are added in a fixed order (operand 1 is added twice, the second time as
+// the tied source), operands from index 5 on are recorded as optional
+// immediates, and the remaining work is delegated to cvtVOP3P.
+void AMDGPUAsmParser::cvtSWMMAC(MCInst &Inst, const OperandVector &Operands) {
+  const unsigned Opc = Inst.getOpcode();
+  const auto Parsed = [&Operands](unsigned Idx) -> AMDGPUOperand & {
+    return static_cast<AMDGPUOperand &>(*Operands[Idx]);
+  };
+
+  Parsed(1).addRegOperands(Inst, 1); // presumably vdst -- confirm
+  addSrcModifiersAndSrc(Inst, Operands, 2, Opc, AMDGPU::OpName::src0_modifiers);
+  addSrcModifiersAndSrc(Inst, Operands, 3, Opc, AMDGPU::OpName::src1_modifiers);
+  Parsed(1).addRegOperands(Inst, 1); // srcTiedDef
+  Parsed(4).addRegOperands(Inst, 1); // src2
+
+  // Remember where each trailing optional immediate was parsed.
+  OptionalImmIndexMap OptIdx;
+  for (unsigned Idx = 5; Idx < Operands.size(); ++Idx)
+    OptIdx[Parsed(Idx).getImmTy()] = Idx;
+
+  // Add an optional immediate only if the opcode actually has that operand.
+  const auto AddIfPresent = [&](unsigned OpName, AMDGPUOperand::ImmTy Ty) {
+    if (AMDGPU::hasNamedOperand(Opc, OpName))
+      addOptionalImmOperand(Inst, Operands, OptIdx, Ty);
+  };
+  AddIfPresent(AMDGPU::OpName::index_key_8bit,
+               AMDGPUOperand::ImmTyIndexKey8bit);
+  AddIfPresent(AMDGPU::OpName::index_key_16bit,
+               AMDGPUOperand::ImmTyIndexKey16bit);
+  AddIfPresent(AMDGPU::OpName::clamp, AMDGPUOperand::ImmTyClampSI);
+
+  cvtVOP3P(Inst, Operands, OptIdx);
+}
+
//===----------------------------------------------------------------------===//
// VOPD
//===----------------------------------------------------------------------===//
@@ -8770,6 +8880,22 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
}
}
+ int VdstInIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
+ if (VdstInIdx == static_cast<int>(Inst.getNumOperands())) {
+ Inst.addOperand(Inst.getOperand(0));
+ }
+
+ bool IsVOP3CvtSrDpp = Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12;
+ if (IsVOP3CvtSrDpp) {
+ if (Src2ModIdx == static_cast<int>(Inst.getNumOperands())) {
+ Inst.addOperand(MCOperand::createImm(0));
+ Inst.addOperand(MCOperand::createReg(0));
+ }
+ }
+
auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(),
MCOI::TIED_TO);
if (TiedTo != -1) {
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 86096b0d80b4..a9968cfe25b4 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -260,8 +260,12 @@ DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32, OPW32, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 64)
+DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 32)
+DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 32)
+DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64)
+DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32)
@@ -704,6 +708,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
break;
Res = tryDecodeInst(DecoderTableWMMAGFX1164, MI, QW, Address, CS);
+ if (Res)
+ break;
+
+ Res = tryDecodeInst(DecoderTableWMMAGFX1264, MI, QW, Address, CS);
} while (false);
if (Res && AMDGPU::isMAC(MI.getOpcode())) {
@@ -712,6 +720,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
AMDGPU::OpName::src2_modifiers);
}
+ if (Res && (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp ||
+ MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp)) {
+ // Insert dummy unused src2_modifiers.
+ insertNamedMCOperand(MI, MCOperand::createImm(0),
+ AMDGPU::OpName::src2_modifiers);
+ }
+
if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) &&
!AMDGPU::hasGDS(STI)) {
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds);
@@ -942,6 +957,7 @@ void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const {
// first add optional MI operands to check FI
DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
unsigned Opc = MI.getOpcode();
+
if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) {
convertVOP3PDPPInst(MI);
} else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) ||
@@ -951,6 +967,15 @@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
if (isMacDPP(MI))
convertMacDPPInst(MI);
+ int VDstInIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
+ if (VDstInIdx != -1)
+ insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
+
+ if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
+ MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12)
+ insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
+
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
if (MI.getNumOperands() < DescNumOps &&
AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
@@ -977,6 +1002,15 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
if (isMacDPP(MI))
convertMacDPPInst(MI);
+ int VDstInIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
+ if (VDstInIdx != -1)
+ insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
+
+ if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
+ MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12)
+ insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
+
unsigned Opc = MI.getOpcode();
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
if (MI.getNumOperands() < DescNumOps &&
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index b6e4e65ff5b0..08bef7ad3002 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1716,14 +1716,14 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
}
bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
- if (!SIInstrInfo::isWMMA(*MI))
+ if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
return false;
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
- auto IsHazardFn = [MI, TII, TRI](const MachineInstr &I) {
- if (!SIInstrInfo::isWMMA(I))
+ auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
+ if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
return false;
// Src0 or Src1 of the current wmma instruction overlaps with the dest of
@@ -1753,6 +1753,7 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
const MachineOperand *Src2Mods =
TII->getNamedOperand(*MI, AMDGPU::OpName::src2_modifiers);
const bool NoSrc2Mods =
+ !Src2Mods ||
(Src2Mods->getImm() & (SISrcMods::NEG | SISrcMods::NEG_HI)) == 0;
// Exception: there is no hazard if the wmma instructions are of the same
// type and there is no input modifier on src2 of the current instruction.
@@ -1760,6 +1761,18 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
TII->pseudoToMCOpcode(MI->getOpcode())));
}
+ // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
+ // but Index can't overlap with PrevDstReg.
+ if (AMDGPU::isGFX12Plus(ST)) {
+ if (SIInstrInfo::isSWMMAC(*MI)) {
+ const Register CurIndex =
+ TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
+ if (TRI->regsOverlap(PrevDstReg, CurIndex))
+ return true;
+ }
+ return false;
+ }
+
return false;
};
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index e73e53aa270f..abfa4a3531e8 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1275,6 +1275,23 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI,
(ModIdx != -1) ? MI->getOperand(ModIdx).getImm() : DefaultValue;
}
+ // Print three values of neg/opsel for wmma instructions (prints 0 when there
+ // is no src_modifier operand instead of not printing anything).
+ if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsSWMMAC ||
+ MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsWMMA) {
+ NumOps = 0;
+ int DefaultValue = Mod == SISrcMods::OP_SEL_1;
+ for (int OpName :
+ {AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
+ AMDGPU::OpName::src2_modifiers}) {
+ int Idx = AMDGPU::getNamedOperandIdx(Opc, OpName);
+ if (Idx != -1)
+ Ops[NumOps++] = MI->getOperand(Idx).getImm();
+ else
+ Ops[NumOps++] = DefaultValue;
+ }
+ }
+
const bool HasDstSel =
NumOps > 0 &&
Mod == SISrcMods::OP_SEL_0 &&
@@ -1305,6 +1322,16 @@ void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned,
const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned Opc = MI->getOpcode();
+ if (isCvt_F32_Fp8_Bf8_e64(Opc)) {
+ auto SrcMod =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+ unsigned Mod = MI->getOperand(SrcMod).getImm();
+ unsigned Index0 = !!(Mod & SISrcMods::OP_SEL_0);
+ unsigned Index1 = !!(Mod & SISrcMods::OP_SEL_1);
+ if (Index0 || Index1)
+ O << " op_sel:[" << Index0 << ',' << Index1 << ']';
+ return;
+ }
if (isPermlane16(Opc)) {
auto FIN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
auto BCN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
@@ -1336,6 +1363,26 @@ void AMDGPUInstPrinter::printNegHi(const MCInst *MI, unsigned OpNo,
printPackedModifier(MI, " neg_hi:[", SISrcMods::NEG_HI, O);
}
+// Prints " index_key:<n>" for operand OpNo, using the low three bits of its
+// immediate; nothing is printed when that value is zero.
+void AMDGPUInstPrinter::printIndexKey8bit(const MCInst *MI, unsigned OpNo,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  const auto Key = MI->getOperand(OpNo).getImm() & 0x7;
+  if (Key != 0)
+    O << " index_key:" << Key;
+}
+
+// Prints " index_key:<n>" for operand OpNo, using the low three bits of its
+// immediate; nothing is printed when that value is zero.
+void AMDGPUInstPrinter::printIndexKey16bit(const MCInst *MI, unsigned OpNo,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const auto Key = MI->getOperand(OpNo).getImm() & 0x7;
+  if (Key != 0)
+    O << " index_key:" << Key;
+}
+
void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index e3958f88277d..e91ff86b219a 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -139,6 +139,10 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printNegHi(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printIndexKey8bit(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printIndexKey16bit(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printInterpSlot(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printInterpAttr(const MCInst *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 8ab66d4fd5b8..19596d53b453 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -167,6 +167,9 @@ enum : uint64_t {
// ds_gws_* instructions.
GWS = UINT64_C(1) << 62,
+
+ // Is a SWMMAC instruction.
+ IsSWMMAC = UINT64_C(1) << 63,
};
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 2862a7787e75..a812cdc61500 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -208,6 +208,7 @@ bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
assert(Old.isReg() && Fold.isImm());
if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
+ (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
(ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index cf947dccafac..d6bf0d8cb2ef 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2072,11 +2072,45 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
const SIMachineFunctionInfo &MFI,
EVT VT,
AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
- const ArgDescriptor *Reg;
+ const ArgDescriptor *Reg = nullptr;
const TargetRegisterClass *RC;
LLT Ty;
- std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
+ CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
+ const ArgDescriptor WorkGroupIDX =
+ ArgDescriptor::createRegister(AMDGPU::TTMP9);
+ // If GridZ is not programmed in an entry function then the hardware will set
+ // it to all zeros, so there is no need to mask the GridY value in the low
+ // order bits.
+ const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
+ AMDGPU::TTMP7,
+ AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
+ const ArgDescriptor WorkGroupIDZ =
+ ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
+ if (Subtarget->hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) {
+ switch (PVID) {
+ case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
+ Reg = &WorkGroupIDX;
+ RC = &AMDGPU::SReg_32RegClass;
+ Ty = LLT::scalar(32);
+ break;
+ case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
+ Reg = &WorkGroupIDY;
+ RC = &AMDGPU::SReg_32RegClass;
+ Ty = LLT::scalar(32);
+ break;
+ case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
+ Reg = &WorkGroupIDZ;
+ RC = &AMDGPU::SReg_32RegClass;
+ Ty = LLT::scalar(32);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!Reg)
+ std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
if (!Reg) {
if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
// It's possible for a kernarg intrinsic call to appear in a kernel with
@@ -2505,28 +2539,24 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
}
}
- if (Info.hasWorkGroupIDX()) {
- Register Reg = Info.addWorkGroupIDX(HasArchitectedSGPRs);
- if (!HasArchitectedSGPRs)
+ if (!HasArchitectedSGPRs) {
+ if (Info.hasWorkGroupIDX()) {
+ Register Reg = Info.addWorkGroupIDX();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
- CCInfo.AllocateReg(Reg);
- }
-
- if (Info.hasWorkGroupIDY()) {
- Register Reg = Info.addWorkGroupIDY(HasArchitectedSGPRs);
- if (!HasArchitectedSGPRs)
+ if (Info.hasWorkGroupIDY()) {
+ Register Reg = Info.addWorkGroupIDY();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
- CCInfo.AllocateReg(Reg);
- }
-
- if (Info.hasWorkGroupIDZ()) {
- Register Reg = Info.addWorkGroupIDZ(HasArchitectedSGPRs);
- if (!HasArchitectedSGPRs)
+ if (Info.hasWorkGroupIDZ()) {
+ Register Reg = Info.addWorkGroupIDZ();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
-
- CCInfo.AllocateReg(Reg);
+ CCInfo.AllocateReg(Reg);
+ }
}
if (Info.hasWorkGroupInfo()) {
@@ -7890,6 +7920,17 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
return Loads[0];
}
+// Lowers the wave-id intrinsic to an unsigned bitfield extract from TTMP8.
+// Returns an empty SDValue when the subtarget has no architected SGPRs.
+SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
+  if (!Subtarget->hasArchitectedSGPRs())
+    return {};
+
+  // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
+  SDLoc SL(Op);
+  MVT VT = MVT::i32;
+  SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
+  SDValue Offset = DAG.getConstant(25, SL, VT);
+  SDValue Width = DAG.getConstant(5, SL, VT);
+  return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8, Offset, Width);
+}
+
SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
unsigned Dim,
const ArgDescriptor &Arg) const {
@@ -8060,6 +8101,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_workgroup_id_z:
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+ case Intrinsic::amdgcn_wave_id:
+ return lowerWaveID(DAG, Op);
case Intrinsic::amdgcn_lds_kernel_id: {
if (MFI->isEntryFunction())
return getLDSKernelId(DAG, DL);
@@ -8242,6 +8285,36 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SIInstrInfo::MO_ABS32_LO);
return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
}
+ case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
+ case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
+ if (Op.getOperand(4).getValueType() == MVT::i32)
+ return SDValue();
+
+ SDLoc SL(Op);
+ auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
+ Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
+ Op.getOperand(3), IndexKeyi32);
+ }
+ case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
+ case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
+ case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
+ if (Op.getOperand(6).getValueType() == MVT::i32)
+ return SDValue();
+
+ SDLoc SL(Op);
+ auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
+ {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
+ Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
+ IndexKeyi32, Op.getOperand(7)});
+ }
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index d66ba0b59ba9..e436c23af5bc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -80,6 +80,7 @@ private:
SDValue lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
unsigned NewOpcode) const;
+ SDValue lowerWaveID(SelectionDAG &DAG, SDValue Op) const;
SDValue lowerWorkitemID(SelectionDAG &DAG, SDValue Op, unsigned Dim,
const ArgDescriptor &ArgDesc) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 1b66d163714f..ab536f8f49d5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -161,6 +161,9 @@ class InstSI <dag outs, dag ins, string asm = "",
// ds_gws_* instructions.
field bit GWS = 0;
+ // This bit indicates that this is one of SWMMAC instructions.
+ field bit IsSWMMAC = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;
let TSFlags{1} = VALU;
@@ -248,6 +251,8 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{62} = GWS;
+ let TSFlags{63} = IsSWMMAC;
+
let SchedRW = [Write32Bit];
let AsmVariantName = AMDGPUAsmVariants.Default;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index fc85b089aa47..1c9dacc09f81 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -802,6 +802,14 @@ public:
return isMFMA(MI) || isWMMA(MI);
}
+  // True when MI's instruction descriptor has the IsSWMMAC TSFlags bit set.
+  static bool isSWMMAC(const MachineInstr &MI) {
+    const uint64_t Flags = MI.getDesc().TSFlags;
+    return (Flags & SIInstrFlags::IsSWMMAC) != 0;
+  }
+
+  // True when the descriptor for Opcode has the IsSWMMAC TSFlags bit set.
+  bool isSWMMAC(uint16_t Opcode) const {
+    const uint64_t Flags = get(Opcode).TSFlags;
+    return (Flags & SIInstrFlags::IsSWMMAC) != 0;
+  }
+
bool isDOT(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::IsDOT;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index a6820544f4b4..45be81950aa3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1088,6 +1088,9 @@ def op_sel_hi0 : ArrayOperand0<"op_sel_hi", "OpSelHi">;
def neg_lo0 : ArrayOperand0<"neg_lo", "NegLo">;
def neg_hi0 : ArrayOperand0<"neg_hi", "NegHi">;
+def IndexKey16bit : CustomOperand<i32, 1>;
+def IndexKey8bit : CustomOperand<i32, 1>;
+
def dpp8 : CustomOperand<i32, 0, "DPP8">;
def dpp_ctrl : CustomOperand<i32, 0, "DPPCtrl">;
@@ -1344,6 +1347,13 @@ def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">;
def VOP3PModsNeg : ComplexPattern<untyped, 1, "SelectVOP3PModsNeg">;
def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">;
+def WMMAModsF32NegAbs : ComplexPattern<untyped, 2, "SelectWMMAModsF32NegAbs">;
+def WMMAModsF16Neg : ComplexPattern<untyped, 2, "SelectWMMAModsF16Neg">;
+def WMMAModsF16NegAbs : ComplexPattern<untyped, 2, "SelectWMMAModsF16NegAbs">;
+def WMMAVISrc : ComplexPattern<untyped, 1, "SelectWMMAVISrc">;
+def SWMMACIndex8 : ComplexPattern<untyped, 2, "SelectSWMMACIndex8">;
+def SWMMACIndex16 : ComplexPattern<untyped, 2, "SelectSWMMACIndex16">;
+
def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">;
def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">;
@@ -1684,8 +1694,9 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
!if(HasOMod,
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
clampmod0:$clamp, omod0:$omod),
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- clampmod0:$clamp))
+ !if (HasClamp,
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0, clampmod0:$clamp),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0)))
/* else */,
// VOP1 without modifiers
!if (HasClamp,
@@ -2278,6 +2289,9 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field bit IsDOT = 0;
field bit IsSingle = 0;
field bit IsWMMA = 0;
+ field bit IsSWMMAC = 0;
+
+ field bit IsFP8 = 0;
field bit HasDst = !ne(DstVT.Value, untyped.Value);
field bit HasDst32 = HasDst;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 9ff66a094f99..0336ec4985ea 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -751,35 +751,21 @@ public:
}
// Add system SGPRs.
- Register addWorkGroupIDX(bool HasArchitectedSGPRs) {
- Register Reg =
- HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP9 : getNextSystemSGPR();
- ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(Reg);
- if (!HasArchitectedSGPRs)
- NumSystemSGPRs += 1;
-
+ Register addWorkGroupIDX() {
+ ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR());
+ NumSystemSGPRs += 1;
return ArgInfo.WorkGroupIDX.getRegister();
}
- Register addWorkGroupIDY(bool HasArchitectedSGPRs) {
- Register Reg =
- HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR();
- unsigned Mask = HasArchitectedSGPRs && hasWorkGroupIDZ() ? 0xffff : ~0u;
- ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(Reg, Mask);
- if (!HasArchitectedSGPRs)
- NumSystemSGPRs += 1;
-
+ Register addWorkGroupIDY() {
+ ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR());
+ NumSystemSGPRs += 1;
return ArgInfo.WorkGroupIDY.getRegister();
}
- Register addWorkGroupIDZ(bool HasArchitectedSGPRs) {
- Register Reg =
- HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR();
- unsigned Mask = HasArchitectedSGPRs ? 0xffff << 16 : ~0u;
- ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(Reg, Mask);
- if (!HasArchitectedSGPRs)
- NumSystemSGPRs += 1;
-
+ Register addWorkGroupIDZ() {
+ ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR());
+ NumSystemSGPRs += 1;
return ArgInfo.WorkGroupIDZ.getRegister();
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index f42af89cf5e6..b3265b73fa7e 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1341,9 +1341,14 @@ def VCSrc_v2f16 : RegOrV2F16 <"VS_32", "OPERAND_REG_INLINE_C">;
// VISrc_* Operands with a VGPR or an inline constant
//===----------------------------------------------------------------------===//
+def VISrc_64_f16 : RegOrF16 <"VReg_64", "OPERAND_REG_INLINE_C">;
+def VISrc_64_b32 : RegOrB32 <"VReg_64", "OPERAND_REG_INLINE_C">;
def VISrc_64_f64 : RegOrF64 <"VReg_64", "OPERAND_REG_INLINE_C">;
+def VISrc_128_f16 : RegOrF16 <"VReg_128", "OPERAND_REG_INLINE_C">;
def VISrc_128_b32 : RegOrB32 <"VReg_128", "OPERAND_REG_INLINE_C">;
def VISrc_128_f32 : RegOrF32 <"VReg_128", "OPERAND_REG_INLINE_C">;
+def VISrc_256_b32 : RegOrB32 <"VReg_256", "OPERAND_REG_INLINE_C">;
+def VISrc_256_f32 : RegOrF32 <"VReg_256", "OPERAND_REG_INLINE_C">;
def VISrc_256_f64 : RegOrF64 <"VReg_256", "OPERAND_REG_INLINE_C">;
def VISrc_512_b32 : RegOrB32 <"VReg_512", "OPERAND_REG_INLINE_C">;
def VISrc_512_f32 : RegOrF32 <"VReg_512", "OPERAND_REG_INLINE_C">;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 0bf9452d822e..106fdb19f278 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -529,6 +529,17 @@ bool isPermlane16(unsigned Opc) {
Opc == AMDGPU::V_PERMLANEX16_VAR_B32_e64_gfx12;
}
+// Returns true if Opc is one of the GFX12 e64 encodings (plain, dpp or dpp8)
+// of V_CVT_F32_{FP8,BF8}, or the e64 encoding of V_CVT_PK_F32_{FP8,BF8}.
+bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc) {
+  switch (Opc) {
+  case AMDGPU::V_CVT_F32_BF8_e64_gfx12:
+  case AMDGPU::V_CVT_F32_FP8_e64_gfx12:
+  case AMDGPU::V_CVT_F32_BF8_e64_dpp_gfx12:
+  case AMDGPU::V_CVT_F32_FP8_e64_dpp_gfx12:
+  case AMDGPU::V_CVT_F32_BF8_e64_dpp8_gfx12:
+  case AMDGPU::V_CVT_F32_FP8_e64_dpp8_gfx12:
+  case AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12:
+  case AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12:
+    return true;
+  default:
+    return false;
+  }
+}
+
bool isGenericAtomic(unsigned Opc) {
return Opc == AMDGPU::G_AMDGPU_ATOMIC_FMIN ||
Opc == AMDGPU::G_AMDGPU_ATOMIC_FMAX ||
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index d3f55c792017..11b0bc5c8171 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -535,6 +535,9 @@ bool isPermlane16(unsigned Opc);
LLVM_READNONE
bool isGenericAtomic(unsigned Opc);
+LLVM_READNONE
+bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc);
+
namespace VOPD {
enum Component : unsigned {
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 95a1d8696347..ef652fce6548 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -571,6 +571,7 @@ let SubtargetPredicate = isGFX9Only in {
} // End SubtargetPredicate = isGFX9Only
class VOPProfile_Base_CVT_F32_F8<ValueType vt> : VOPProfileI2F <vt, i32> {
+ let HasExtDPP = 1;
let HasExtSDWA = 1;
let HasExtSDWA9 = 1;
let HasExt = 1;
@@ -599,6 +600,7 @@ class Cvt_F32_F8_Pat<SDPatternOperator node, int index,
(inst_sdwa 0, $src, 0, 0, index)
>;
+let SubtargetPredicate = isGFX9Only in {
let OtherPredicates = [HasCvtFP8VOP1Bug] in {
def : GCNPat<(f32 (int_amdgcn_cvt_f32_fp8 i32:$src, 0)),
(V_CVT_F32_FP8_sdwa 0, $src, 0, 0, 0)>;
@@ -617,6 +619,7 @@ foreach Index = [1, 2, 3] in {
def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_fp8, Index, V_CVT_F32_FP8_sdwa>;
def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_bf8, Index, V_CVT_F32_BF8_sdwa>;
}
+} // End SubtargetPredicate = isGFX9Only
class Cvt_PK_F32_F8_Pat<SDPatternOperator node, int index,
VOP1_Pseudo inst_e32, VOP1_SDWA_Pseudo inst_sdwa> : GCNPat<
@@ -626,11 +629,77 @@ class Cvt_PK_F32_F8_Pat<SDPatternOperator node, int index,
(inst_e32 $src))
>;
-foreach Index = [0, -1] in {
- def : Cvt_PK_F32_F8_Pat<int_amdgcn_cvt_pk_f32_fp8, Index,
- V_CVT_PK_F32_FP8_e32, V_CVT_PK_F32_FP8_sdwa>;
- def : Cvt_PK_F32_F8_Pat<int_amdgcn_cvt_pk_f32_bf8, Index,
- V_CVT_PK_F32_BF8_e32, V_CVT_PK_F32_BF8_sdwa>;
+// Packed FP8/BF8 -> v2f32 conversion patterns that select the e32 or SDWA
+// pseudos; only enabled for subtargets matching isGFX9Only.
+let SubtargetPredicate = isGFX9Only in {
+ foreach Index = [0, -1] in {
+ def : Cvt_PK_F32_F8_Pat<int_amdgcn_cvt_pk_f32_fp8, Index,
+ V_CVT_PK_F32_FP8_e32, V_CVT_PK_F32_FP8_sdwa>;
+ def : Cvt_PK_F32_F8_Pat<int_amdgcn_cvt_pk_f32_bf8, Index,
+ V_CVT_PK_F32_BF8_e32, V_CVT_PK_F32_BF8_sdwa>;
+ }
+} // End SubtargetPredicate = isGFX9Only
+
+
+// Similar to VOPProfile_Base_CVT_F32_F8, but for VOP3 instructions.
+// Packed variant: v2f32 result from an i32 source, with op_sel enabled and
+// the VOP3 DPP extension disabled.
+def VOPProfile_Base_CVT_PK_F32_F8_OpSel : VOPProfileI2F <v2f32, i32> {
+ let HasOpSel = 1;
+ let HasExtVOP3DPP = 0;
+}
+
+// Profile for the non-packed FP8/BF8 -> f32 conversions with op_sel: f32
+// result and a single i32 source. Enables op_sel, source modifiers and the
+// DPP / VOP3 DPP extensions; clamp and omod are disabled.
+def VOPProfile_Base_CVT_F32_F8_OpSel : VOPProfile<[f32, i32, untyped, untyped]> {
+ let HasOpSel = 1;
+ let HasExtDPP = 1;
+ let HasExtVOP3DPP = 1;
+ let IsFP8 = 1;
+ let HasClamp = 0;
+ let HasOMod = 0;
+ let HasModifiers = 1;
+ let Src1VOP3DPP = Src1RC64;
+}
+
+// OP_SEL pseudos for the FP8/BF8 -> f32 (and packed f32) conversions on
+// isGFX12Plus. They are marked as not raising FP exceptions and are
+// scheduled as WriteFloatCvt.
+let SubtargetPredicate = isGFX12Plus, mayRaiseFPException = 0,
+ SchedRW = [WriteFloatCvt] in {
+ defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>;
+ defm V_CVT_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_f32_bf8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>;
+ defm V_CVT_PK_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_fp8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>;
+ defm V_CVT_PK_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_bf8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>;
+}
+
+// Selection pattern for (f32 (node i32:$src, index)) with a 2-bit immediate
+// `index`. Index 0 selects the e32 form. Any other index selects the e64
+// form and encodes the index in the src0 modifiers operand:
+// index bit 0 -> OP_SEL_0, index bit 1 -> OP_SEL_1.
+class Cvt_F32_F8_Pat_OpSel<SDPatternOperator node, bits<2> index,
+ VOP1_Pseudo inst_e32, VOP3_Pseudo inst_e64> : GCNPat<
+ (f32 (node i32:$src, index)),
+ !if (index,
+ (inst_e64 !if(index{0},
+ !if(index{1}, !or(SRCMODS.OP_SEL_0, SRCMODS.OP_SEL_1),
+ SRCMODS.OP_SEL_0),
+ !if(index{1}, SRCMODS.OP_SEL_1, 0)),
+ $src, 0),
+ (inst_e32 $src))
+>;
+
+// Instantiate the op_sel based fp8/bf8 -> f32 patterns for every index value
+// (0-3) on isGFX12Plus.
+let SubtargetPredicate = isGFX12Plus in {
+ foreach Index = [0, 1, 2, 3] in {
+ def : Cvt_F32_F8_Pat_OpSel<int_amdgcn_cvt_f32_fp8, Index,
+ V_CVT_F32_FP8_e32, V_CVT_F32_FP8_OP_SEL_e64>;
+ def : Cvt_F32_F8_Pat_OpSel<int_amdgcn_cvt_f32_bf8, Index,
+ V_CVT_F32_BF8_e32, V_CVT_F32_BF8_OP_SEL_e64>;
+ }
+}
+
+// Selection pattern for (v2f32 (node i32:$src, index)). Index 0 selects the
+// e32 form; a non-zero index selects the e64 form with OP_SEL_0 set in the
+// src0 modifiers operand.
+class Cvt_PK_F32_F8_Pat_OpSel<SDPatternOperator node, int index,
+ VOP1_Pseudo inst_e32, VOP3_Pseudo inst_e64> : GCNPat<
+ (v2f32 (node i32:$src, index)),
+ !if (index,
+ (inst_e64 SRCMODS.OP_SEL_0, $src, 0, 0, SRCMODS.NONE),
+ (inst_e32 $src))
+>;
+
+// Instantiate the op_sel based packed fp8/bf8 -> v2f32 patterns for index 0
+// and -1 on isGFX12Plus.
+let SubtargetPredicate = isGFX12Plus in {
+ foreach Index = [0, -1] in {
+ def : Cvt_PK_F32_F8_Pat_OpSel<int_amdgcn_cvt_pk_f32_fp8, Index,
+ V_CVT_PK_F32_FP8_e32, V_CVT_PK_F32_FP8_OP_SEL_e64>;
+ def : Cvt_PK_F32_F8_Pat_OpSel<int_amdgcn_cvt_pk_f32_bf8, Index,
+ V_CVT_PK_F32_BF8_e32, V_CVT_PK_F32_BF8_OP_SEL_e64>;
+ }
}
let SubtargetPredicate = isGFX10Plus in {
@@ -853,6 +922,20 @@ multiclass VOP1_Real_NO_DPP_OP_SEL_with_name<GFXGen Gen, bits<9> op,
VOP3_Real_with_name<Gen, {0, 1, 1, op{6-0}}, opName, asmName>;
+// Define VOP1 instructions using the pseudo instruction with its old profile and
+// VOP3 using the OpSel profile for the pseudo instruction.
+// Each mnemonic therefore gets two defm lines with the same record prefix:
+// the first is backed by the original pseudo, the second by the *_OP_SEL
+// pseudo while keeping the plain v_cvt_* assembly name.
+defm V_CVT_F32_FP8 : VOP1_Real_NO_VOP3_with_name_gfx12<0x06c, "V_CVT_F32_FP8", "v_cvt_f32_fp8">;
+defm V_CVT_F32_FP8 : VOP1_Realtriple_e64_with_name<GFX12Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;
+
+defm V_CVT_F32_BF8 : VOP1_Real_NO_VOP3_with_name_gfx12<0x06d, "V_CVT_F32_BF8", "v_cvt_f32_bf8">;
+defm V_CVT_F32_BF8 : VOP1_Realtriple_e64_with_name<GFX12Gen, 0x06d, "V_CVT_F32_BF8_OP_SEL", "v_cvt_f32_bf8">;
+
+// The packed variants pass the VOP3 opcode directly (0x1ee / 0x1ef, i.e. the
+// VOP1 opcodes 0x06e / 0x06f with 0x180 added).
+defm V_CVT_PK_F32_FP8 : VOP1_Real_e32_with_name<GFX12Gen, 0x06e, "V_CVT_PK_F32_FP8", "v_cvt_pk_f32_fp8">;
+defm V_CVT_PK_F32_FP8 : VOP3_Real_with_name<GFX12Gen, 0x1ee, "V_CVT_PK_F32_FP8_OP_SEL", "v_cvt_pk_f32_fp8">;
+
+defm V_CVT_PK_F32_BF8 : VOP1_Real_e32_with_name<GFX12Gen, 0x06f, "V_CVT_PK_F32_BF8", "v_cvt_pk_f32_bf8">;
+defm V_CVT_PK_F32_BF8 : VOP3_Real_with_name<GFX12Gen, 0x1ef, "V_CVT_PK_F32_BF8_OP_SEL", "v_cvt_pk_f32_bf8">;
+
defm V_CVT_NEAREST_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00c,
"V_CVT_RPI_I32_F32", "v_cvt_nearest_i32_f32">;
defm V_CVT_FLOOR_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00d,
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 713b4712d563..14db52210214 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -520,8 +520,26 @@ def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> {
let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
FP32InputMods:$src1_modifiers, Src1RC64:$src1,
VGPR_32:$vdst_in, op_sel0:$op_sel);
+ let InsVOP3DPP = (ins VGPR_32:$old,
+ FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0,
+ FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1,
+ VGPR_32:$vdst_in, op_sel0:$op_sel,
+ dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+
+ let InsVOP3DPP16 = (ins VGPR_32:$old,
+ FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0,
+ FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1,
+ VGPR_32:$vdst_in, op_sel0:$op_sel,
+ dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl, FI:$fi);
+ let InsVOP3DPP8 = (ins VGPR_32:$old,
+ FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0,
+ FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1,
+ VGPR_32:$vdst_in, op_sel0:$op_sel, dpp8:$dpp8, FI:$fi);
+
let HasClamp = 0;
- let HasExtVOP3DPP = 0;
+ let HasExtVOP3DPP = 1;
}
def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
@@ -530,14 +548,36 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
FP32InputMods:$src1_modifiers, Src1RC64:$src1,
FP32InputMods:$src2_modifiers, VGPR_32:$src2,
op_sel0:$op_sel);
+ let InsVOP3DPP16 = (ins VGPR_32:$old,
+ FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0,
+ FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1,
+ FP32InputMods:$src2_modifiers, VGPR_32:$src2,
+ op_sel0:$op_sel, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl, FI:$fi);
+ let InsVOP3DPP8 = (ins VGPR_32:$old,
+ FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0,
+ FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1,
+ FP32InputMods:$src2_modifiers, VGPR_32:$src2,
+ op_sel0:$op_sel, dpp8:$dpp8, FI:$fi);
let HasClamp = 0;
let HasSrc2 = 0;
let HasSrc2Mods = 1;
+ let HasExtVOP3DPP = 1;
+ let HasOpSel = 1;
let AsmVOP3OpSel = !subst(", $src2_modifiers", "",
getAsmVOP3OpSel<3, HasClamp, HasOMod,
HasSrc0FloatMods, HasSrc1FloatMods,
HasSrc2FloatMods>.ret);
- let HasExtVOP3DPP = 0;
+ let AsmVOP3DPP16 = !subst(", $src2_modifiers", "",
+ getAsmVOP3DPP16<getAsmVOP3Base<3, 1, HasClamp, 1,
+ HasOMod, 0, 1, HasSrc0FloatMods,
+ HasSrc1FloatMods,
+ HasSrc2FloatMods>.ret>.ret);
+ let AsmVOP3DPP8 = !subst(", $src2_modifiers", "",
+ getAsmVOP3DPP8<getAsmVOP3Base<3, 1, HasClamp, 1,
+ HasOMod, 0, 1, HasSrc0FloatMods,
+ HasSrc1FloatMods,
+ HasSrc2FloatMods>.ret>.ret);
}
def IsPow2Plus1: PatLeaf<(i32 imm), [{
@@ -618,13 +658,13 @@ let SubtargetPredicate = HasFP8ConversionInsts, mayRaiseFPException = 0,
+// Pattern for (i32 (node f32:$src0, f32:$src1, i32:$old, index)). A non-zero
+// index sets DST_OP_SEL in the src0 modifiers operand; the trailing op_sel
+// operand is always 0.
class Cvt_PK_F8_F32_Pat<SDPatternOperator node, int index, VOP3_Pseudo inst> : GCNPat<
(i32 (node f32:$src0, f32:$src1, i32:$old, index)),
- (inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, !if(index, SRCMODS.OP_SEL_0, 0))
+ (inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, 0)
>;
+// Pattern for (i32 (node f32:$src0, i32:$src1, i32:$old, index)) with a
+// 2-bit index. Index bit 1 sets DST_OP_SEL in the src0 modifiers, index
+// bit 0 sets OP_SEL_0 in the modifiers operand that precedes $old; the
+// trailing op_sel operand is always 0.
class Cvt_SR_F8_F32_Pat<SDPatternOperator node, bits<2> index, VOP3_Pseudo inst> : GCNPat<
(i32 (node f32:$src0, i32:$src1, i32:$old, index)),
(inst !if(index{1}, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1,
- !if(index{0}, SRCMODS.OP_SEL_0, 0), $old, !if(index{1}, SRCMODS.OP_SEL_0, 0))
+ !if(index{0}, SRCMODS.OP_SEL_0, 0), $old, 0)
>;
foreach Index = [0, -1] in {
@@ -998,6 +1038,11 @@ defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x368>;
defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>;
defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>;
+// GFX12 VOP3-only real encodings for the f32 -> fp8/bf8 conversions
+// (PK and SR variants), opcodes 0x369-0x36c.
+defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_gfx12<0x369>;
+defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36a>;
+defm V_CVT_SR_FP8_F32 : VOP3Only_Realtriple_gfx12<0x36b>;
+defm V_CVT_SR_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36c>;
+
//===----------------------------------------------------------------------===//
// GFX11, GFX12
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 0c7a08cd4bc9..107b95a9ca8e 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -936,16 +936,19 @@ multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator
!cast<Instruction>(NAME # _threeaddr # Suffix)>;
}
- if !eq(Type, WMMAOpSel) then {
- def : WMMAOpSelPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
- } else if !eq(Type, WMMAUIClamp) then {
- def : WMMAUIClampPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
- } else {
- def : WMMARegularPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
+ let SubtargetPredicate = isGFX11Only in {
+ if !eq(Type, WMMAOpSel) then {
+ def : WMMAOpSelPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
+ } else if !eq(Type, WMMAUIClamp) then {
+ def : WMMAUIClampPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
+ } else {
+ def : WMMARegularPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
+ }
}
}
+
let WaveSizePredicate = isWave32 in {
defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular, 1>;
defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>;
@@ -969,6 +972,398 @@ let WaveSizePredicate = isWave64 in {
}
+// Operand profile shared by the GFX12 WMMA and SWMMAC pseudos. It derives
+// register classes, the VOP3P operand list, the asm string and the isel
+// input/output pattern fragments from the template arguments:
+//   ArgTy      - value types of [dst, src0, src1, src2].
+//   _IsSWMMAC  - 1 for swmmac, 0 for wmma.
+//   _IndexType - 0 (no index_key), 8 (index_key_8bit) or 16 (index_key_16bit).
+//   _IsIU      - iu4/iu8 variant; adds the clamp operand, no src2 modifiers.
+//   _IsFP8BF8  - fp8/bf8 variant; src0/src1 take no modifiers.
+class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
+ bit _IsIU, bit _IsFP8BF8>
+ : VOP3P_Profile<VOPProfile<ArgTy>> {
+ bit IsIU = _IsIU;
+ bit IsFP8BF8 = _IsFP8BF8;
+ bit IsF16BF16 = !not(!or(IsIU, IsFP8BF8));
+
+ int IndexType = _IndexType;
+
+ let IsPacked = 1;
+ let IsWMMA = !not(_IsSWMMAC);
+ let IsSWMMAC = _IsSWMMAC;
+
+ bit IsAB_F16 = !and(IsF16BF16, ArgTy[1].isFP);
+ bit IsAB_BF16 = !and(IsF16BF16, isIntType<ArgTy[1]>.ret);
+ bit IsC_F32 = !or(!eq(ArgTy[3], v8f32), !eq(ArgTy[3], v4f32));
+ bit IsC_BF16 = !or(!eq(ArgTy[3], v8i16), !eq(ArgTy[3], v4i16));
+ bit IsC_F16 = !or(!eq(ArgTy[3], v8f16), !eq(ArgTy[3], v4f16));
+
+ bit NegLo01 = !or(IsF16BF16, IsIU);
+ bit NegLo2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA);
+ bit NegHi01 = IsF16BF16;
+ bit NegHi2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA);
+ bit NegLoAny = !or(NegLo01, NegLo2);
+ bit NegHiAny = !or(NegHi01, NegHi2);
+
+ let DstRC = !cond(!eq(ArgTy[0], v8f32): VDst_256,
+ !eq(ArgTy[0], v8i32): VDst_256,
+ !eq(ArgTy[0], v8f16): VDst_128,
+ !eq(ArgTy[0], v8i16): VDst_128,
+ !eq(ArgTy[0], v4f32): VDst_128,
+ !eq(ArgTy[0], v4i32): VDst_128,
+ !eq(ArgTy[0], v4f16): VDst_64,
+ !eq(ArgTy[0], v4i16): VDst_64);
+ let Src0RC64 = !cond(!eq(ArgTy[1], v8f16): VRegSrc_128,
+ !eq(ArgTy[1], v4f16): VRegSrc_64,
+ !eq(ArgTy[1], v4i16): VRegSrc_64,
+ !eq(ArgTy[1], v8i16): VRegSrc_128,
+ !eq(ArgTy[1], v4i32): VRegSrc_128,
+ !eq(ArgTy[1], v2i32): VRegSrc_64,
+ !eq(ArgTy[1], i32) : VRegSrc_32);
+ // The src1 register class is chosen from the src1 type (ArgTy[2]) only.
+ let Src1RC64 = !cond(!eq(ArgTy[2], v16f16): VRegSrc_256,
+ !eq(ArgTy[2], v16i16): VRegSrc_256,
+ !eq(ArgTy[2], v8f16): VRegSrc_128,
+ !eq(ArgTy[2], v8i16): VRegSrc_128,
+ !eq(ArgTy[2], v4i32): VRegSrc_128,
+ !eq(ArgTy[2], v4i16): VRegSrc_64,
+ !eq(ArgTy[2], v4f16): VRegSrc_64,
+ !eq(ArgTy[2], v2i32): VRegSrc_64,
+ !eq(ArgTy[2], i32) : VRegSrc_32);
+ let Src2RC64 = !if(IsSWMMAC, DstRC,
+ !cond(!eq(ArgTy[3], v8f32): VISrc_256_f32,
+ !eq(ArgTy[3], v8i32): VISrc_256_b32,
+ !eq(ArgTy[3], v8f16): VISrc_128_f16,
+ !eq(ArgTy[3], v8i16): VISrc_128_f32, // bf16
+ !eq(ArgTy[3], v4f16): VISrc_64_f16,
+ !eq(ArgTy[3], v4i16): VISrc_64_b32,
+ !eq(ArgTy[3], v4i32): VISrc_128_b32,
+ !eq(ArgTy[3], v4f32): VISrc_128_f32));
+
+ // For f16 and bf16 matrices A and B, each element can be modified by
+ // fneg(neg_lo,neg_hi = 1). For iu4 and iu8 matrices A and B neg_lo is
+ // overloaded to mean unsigned/signed: neg_lo = 0 (u4 and u8) unsigned(zext)
+ // neg_lo = 1 (i4 and i8) signed(sext). For f16, bf16 and f32 matrix C each
+ // element can be modified by fneg(neg_lo = 1) or fabs(neg_hi = 1).
+
+ // Opcode | src0/src1 - matrix A/B | src2 - matrix C or Index
+ // ---------------------------------------------------------------------------
+ // wmma f32_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f32)
+ // wmma f32_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f32)
+ // ---------------------------------------------------------------------------
+ // wmma f16_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f16 or bf16)
+ // wmma bf16_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f16 or bf16)
+ // ---------------------------------------------------------------------------
+ // wmma i32_iu8/iu4 | neg_lo = 0 u4/u8(zext) | not allowed for
+ // | neg_lo = 1 i4/i8(sext) | i32 matrices
+ // ---------------------------------------------------------------------------
+ // wmma f32_fp8/bf8 | not allowed for | neg_lo = 1 neg C(f32)
+ // (4 instructions) | f8 and bf8 matrices | neg_hi = 1 abs C(f32)
+ // ---------------------------------------------------------------------------
+ // swmmac f32_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix
+ // swmmac f32_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst
+ // ---------------------------------------------------------------------------
+ // swmmac f16_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix
+ // swmmac bf16_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst
+ // ---------------------------------------------------------------------------
+ // swmmac i32_iu8/iu4 | neg_lo = 0 u4/u8(zext) | not allowed for sparse matrix
+ // | neg_lo = 1 i4/i8(sext) | A Index - matrix C is in dst
+ // ---------------------------------------------------------------------------
+ // swmmac f32_fp8/bf8 | not allowed for | not allowed for sparse matrix
+ // (4 instructions) | f8 and bf8 matrices | A Index - matrix C is in dst
+
+ // pseudo
+
+ // fp8bf8 wmmas don't use src (0 and 1) modifiers, iu use neg_lo, f16 and bf16
+ // use neg_lo and neg_hi. iu wmmas (C is i32) don't use src 2 modifiers,
+ // remaining wmmas(f16, bf16 and f8bf8) use neg_lo and neg_hi for C (C is f32
+ // f16 or bf16). swmmac use index_key and don't use src 2 modifiers.
+
+ dag Src0Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src0_modifiers));
+ dag Src1Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src1_modifiers));
+ dag Src2Mods = !if(IsIU, (ins), (ins PackedF16InputMods:$src2_modifiers));
+ dag IndexKey = !cond(!eq(IndexType, 0) : (ins),
+ !eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit),
+ !eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit));
+ dag Clamp = !if(IsIU, (ins clampmod0:$clamp), (ins));
+ dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi),
+ !and(NegLoAny, !not(NegHiAny)) : (ins neg_lo0:$neg_lo),
+ !and(!not(NegLoAny), !not(NegHiAny)) : (ins));
+
+ let InsVOP3P = !con(Src0Mods, (ins Src0RC64:$src0), Src1Mods, (ins Src1RC64:$src1),
+ !cond(IsWMMA : !con(Src2Mods, (ins Src2RC64:$src2)),
+ IsSWMMAC : !con((ins DstRC:$srcTiedDef), (ins VRegSrc_32:$src2), IndexKey)),
+ Clamp, Neg);
+
+ // asm
+
+ string IndexKeyAsm = !cond(!eq(IndexType, 0) : "",
+ !eq(IndexType, 8) : "$index_key_8bit",
+ !eq(IndexType, 16) : "$index_key_16bit");
+ string ClampAsm = !if(IsIU, "$clamp", "");
+ string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi",
+ !and(NegLoAny, !not(NegHiAny)) : "$neg_lo",
+ !and(!not(NegLoAny), !not(NegHiAny)) : "");
+
+ let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#NegAsm#ClampAsm;
+
+ // isel patterns
+
+ dag Src0InPat = !cond(IsAB_F16 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))),
+ IsAB_BF16 : (ins Src0VT:$src0),
+ IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0),
+ IsFP8BF8 : (ins Src0VT:$src0));
+ dag Src0OutPat = !cond(IsAB_F16 : (ins i32:$src0_modifiers, Src0VT:$src0),
+ IsAB_BF16 : (ins (i32 8), Src0VT:$src0),
+ IsIU : (ins i32:$src0_modifiers, Src0VT:$src0),
+ IsFP8BF8 : (ins Src0VT:$src0));
+ dag Src1InPat = !cond(IsAB_F16 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))),
+ IsAB_BF16 : (ins Src1VT:$src1),
+ IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1),
+ IsFP8BF8 : (ins Src1VT:$src1));
+ dag Src1OutPat = !cond(IsAB_F16 : (ins i32:$src1_modifiers, Src1VT:$src1),
+ IsAB_BF16 : (ins (i32 8), Src1VT:$src1),
+ IsIU : (ins i32:$src1_modifiers, Src1VT:$src1),
+ IsFP8BF8 : (ins Src1VT:$src1));
+ dag Src2InPatWmma = !cond(IsC_F32 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))),
+ IsC_F16 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))),
+ IsC_BF16 : (ins Src2VT:$src2),
+ IsIU : (ins Src2VT:$src2),
+ IsSWMMAC : (ins));
+ dag Src2OutPatWmma = !cond(IsC_F32 : (ins i32:$src2_modifiers, Src2VT:$src2),
+ IsC_F16 : (ins i32:$src2_modifiers, Src2VT:$src2),
+ IsC_BF16 : (ins (i32 8), Src2VT:$src2),
+ IsIU : (ins Src2VT:$src2),
+ IsSWMMAC : (ins));
+ dag ClampPat = !if(IsIU, (ins i1:$clamp), (ins));
+ dag IndexInPat = !cond(!eq(IndexType, 0) : (ins i32:$src2),
+ !eq(IndexType, 8) : (ins (i32 (SWMMACIndex8 i32:$src2, i32:$index_key_8bit))),
+ !eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit))));
+ dag IndexOutPat = !cond(!eq(IndexType, 0) : (ins i32:$src2),
+ !eq(IndexType, 8) : (ins i32:$src2, i32:$index_key_8bit),
+ !eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit));
+ dag Src2InlineInPat = (ins (Src2VT (WMMAVISrc Src2VT:$src2)));
+ dag Src2InlineOutPat = !con(!if(IsIU, (ins), (ins (i32 8))), (ins Src2VT:$src2));
+
+
+ dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, ClampPat);
+ dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, ClampPat);
+
+ dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, ClampPat);
+ dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, ClampPat);
+
+ // wmma pattern where src2 is inline imm uses _threeaddr pseudo,
+ // can't use _twoaddr since it would violate src2 tied to vdst constraint.
+ dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, ClampPat);
+ dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, ClampPat);
+}
+
+// Defines the two pseudos for one GFX12 WMMA instruction and registers them
+// as a pair in WMMAOpcodeMapping:
+//   _twoaddr   - $vdst is tied to $src2 and the pseudo is marked
+//                isConvertibleToThreeAddress.
+//   _threeaddr - only carries the earlyclobber constraint on $vdst.
+// Both share the mnemonic Instr and PseudoInstr = Instr # PseudoInstrSuffix.
+multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string PseudoInstrSuffix> {
+ let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
+ let Constraints = "@earlyclobber $vdst,$vdst = $src2", isConvertibleToThreeAddress = 1 in
+ def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>{
+ let PseudoInstr = Instr#PseudoInstrSuffix;
+ }
+
+ let Constraints = "@earlyclobber $vdst", SchedRW = [Write32Bit, Write32Bit] in
+ def _threeaddr : VOP3P_Pseudo<Instr, WMMAProfile>{
+ let PseudoInstr = Instr#PseudoInstrSuffix;
+ }
+
+ }
+ def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr),
+ !cast<Instruction>(NAME # _threeaddr)>;
+}
+
+// Defines the single _twoaddr pseudo for one GFX12 SWMMAC instruction.
+// $vdst is earlyclobber and tied to $srcTiedDef, and the assembler uses the
+// cvtSWMMAC match converter.
+multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string PseudoInstrSuffix> {
+ def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>{
+ let Mnemonic = Instr;
+ let PseudoInstr = Instr#PseudoInstrSuffix;
+ let mayRaiseFPException = 0;
+ let ReadsModeReg = 0;
+ let AsmMatchConverter = "cvtSWMMAC";
+
+ let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef";
+ }
+}
+
+// First argument in Profile is types for matrices D, A, B and C (D = A * B + C)
+// as used by llvm ir, types are vectors(with matrix elements)
+// wave32:
+// For 16x16 matrices, each of lanes 0 to 31 will have 8 matrix elts;
+// for 16x32 each lane has 16 elts and for 16x64 each lane has 32 elts.
+// wave64:
+// each lane holds half as many elements as in wave32, with the
+// exception of 16x16_iu4: lanes 0-31 will have 8xi4, remaining lanes are ignored
+
+// general idea on element distribution differences:
+// wave32: lane n has 8 matrix elements
+// wave64: lane n has first 4, lane n+32 has other 4 elements
+
+// Index size: for every 2 elements in a lane, 4 bits are needed in the index.
+
+// Non-standard types (iu8, iu4, fp8, bf8) will be packed in vectors of i32s.
+// Original type for them is in comment on the right and refers to A and B.
+
+// Profile instances. Template arguments are, in order:
+//   [D, A, B, C] types, IsSWMMAC, IndexType, IsIU, IsFP8BF8.
+def F32_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v8f16, v8f32], 0, 0, 0, 0>;
+def F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v8i16, v8f32], 0, 0, 0, 0>;
+def F16_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v8f16, v8f16], 0, 0, 0, 0>;
+def BF16_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v8i16, v8i16], 0, 0, 0, 0>;
+def I32_IU8_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 8xi8
+def I32_IU4X16_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, i32, i32, v8i32], 0, 0, 1, 0>; // 8xi4
+def F32_FP8BF8_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v2i32, v8f32], 0, 0, 0, 1>; // 8xf8
+def I32_IU4X32_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 16xi4
+
+def F32_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v4f16, v4f32], 0, 0, 0, 0>;
+def F32_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v4i16, v4f32], 0, 0, 0, 0>;
+def F16_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v4f16, v4f16], 0, 0, 0, 0>;
+def BF16_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v4i16, v4i16], 0, 0, 0, 0>;
+def I32_IU8_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 4xi8
+def I32_IU4X16_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4 *
+def F32_FP8BF8_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, i32, i32, v4f32], 0, 0, 0, 1>; // 4xf8
+def I32_IU4X32_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4
+
+def F32_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v16f16, v8f32], 1, 16, 0, 0>;
+def F32_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v16i16, v8f32], 1, 16, 0, 0>;
+def F16_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v16f16, v8f16], 1, 16, 0, 0>;
+def BF16_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v16i16, v8i16], 1, 16, 0, 0>;
+def I32_IU8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 16, 1, 0>; // 8xi8, 16xi8
+def I32_IU4X32_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, i32, v2i32, v8i32], 1, 16, 1, 0>; // 8xi4, 16xi4
+def I32_IU4X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 0, 1, 0>; // 16xi4, 32xi4 **
+def F32_FP8BF8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v4i32, v8f32], 1, 16, 0, 1>; // 8xf8, 16xf8
+
+def F32_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v8f16, v4f32], 1, 8, 0, 0>;
+def F32_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v8i16, v4f32], 1, 8, 0, 0>;
+def F16_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v8f16, v4f16], 1, 8, 0, 0>;
+def BF16_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v8i16, v4i16], 1, 8, 0, 0>;
+def I32_IU8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 8, 1, 0>; // 4xi8, 8xi8
+def I32_IU4X32_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 1, 16, 1, 0>; // 8xi4, 8xi4 ***
+def I32_IU4X64_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 16, 1, 0>; // 8xi4, 16xi4
+def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1, 8, 0, 1>; // 4xf8, 8xf8
+
+// * IU4X16_WMMA_w64 lanes 0-31 will have 8xi4, remaining lanes are ignored
+// ** IU4X64_SWMMAC_w32 index is i32, index_key is not used
+// *** IU4X32_SWMMAC_w64 lanes 0-31 will have 8xi4 remaining lanes are ignored
+// for matrix A, index is i16; Matrix B uses all lanes
+
+// Wave32 WMMA and SWMMAC pseudos. Each defm pairs a mnemonic with its _w32
+// profile; "_w32" is the PseudoInstr suffix.
+let WaveSizePredicate = isWave32 in {
+defm V_WMMA_F32_16X16X16_F16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16", F32_F16_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X16_BF16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16", F32_BF16_WMMA_w32, "_w32">;
+defm V_WMMA_F16_16X16X16_F16_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x16_f16", F16_F16_WMMA_w32, "_w32">;
+defm V_WMMA_BF16_16X16X16_BF16_w32 : WMMAInstGFX12<"v_wmma_bf16_16x16x16_bf16", BF16_BF16_WMMA_w32, "_w32">;
+defm V_WMMA_I32_16X16X16_IU8_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu8", I32_IU8_WMMA_w32, "_w32">;
+defm V_WMMA_I32_16X16X16_IU4_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu4", I32_IU4X16_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_fp8", F32_FP8BF8_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_bf8", F32_FP8BF8_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_fp8", F32_FP8BF8_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_bf8", F32_FP8BF8_WMMA_w32, "_w32">;
+defm V_WMMA_I32_16X16X32_IU4_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x32_iu4", I32_IU4X32_WMMA_w32, "_w32">;
+
+defm V_SWMMAC_F32_16X16X32_F16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_f16", F32_F16_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F32_16X16X32_BF16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf16", F32_BF16_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F16_16X16X32_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x32_f16", F16_F16_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_BF16_16X16X32_BF16_w32 : SWMMACInstGFX12<"v_swmmac_bf16_16x16x32_bf16", BF16_BF16_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_I32_16X16X32_IU8_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu8", I32_IU8_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_I32_16X16X32_IU4_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu4", I32_IU4X32_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_I32_16X16X64_IU4_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x64_iu4", I32_IU4X64_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_fp8", F32_FP8BF8_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_bf8", F32_FP8BF8_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_fp8", F32_FP8BF8_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_bf8", F32_FP8BF8_SWMMAC_w32, "_w32">;
+}
+
+// Wave64 WMMA and SWMMAC pseudos. Same mnemonics as the wave32 set, paired
+// with the _w64 profiles; "_w64" is the PseudoInstr suffix.
+let WaveSizePredicate = isWave64 in {
+defm V_WMMA_F32_16X16X16_F16_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16", F32_F16_WMMA_w64, "_w64">;
+defm V_WMMA_F32_16X16X16_BF16_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16", F32_BF16_WMMA_w64, "_w64">;
+defm V_WMMA_F16_16X16X16_F16_w64 : WMMAInstGFX12<"v_wmma_f16_16x16x16_f16", F16_F16_WMMA_w64, "_w64">;
+defm V_WMMA_BF16_16X16X16_BF16_w64 : WMMAInstGFX12<"v_wmma_bf16_16x16x16_bf16", BF16_BF16_WMMA_w64, "_w64">;
+defm V_WMMA_I32_16X16X16_IU8_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu8", I32_IU8_WMMA_w64, "_w64">;
+defm V_WMMA_I32_16X16X16_IU4_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu4", I32_IU4X16_WMMA_w64, "_w64">;
+defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_fp8", F32_FP8BF8_WMMA_w64, "_w64">;
+defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_bf8", F32_FP8BF8_WMMA_w64, "_w64">;
+defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_fp8", F32_FP8BF8_WMMA_w64, "_w64">;
+defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_bf8", F32_FP8BF8_WMMA_w64, "_w64">;
+defm V_WMMA_I32_16X16X32_IU4_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x32_iu4", I32_IU4X32_WMMA_w64, "_w64">;
+
+defm V_SWMMAC_F32_16X16X32_F16_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_f16", F32_F16_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_F32_16X16X32_BF16_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf16", F32_BF16_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_F16_16X16X32_F16_w64 : SWMMACInstGFX12<"v_swmmac_f16_16x16x32_f16", F16_F16_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_BF16_16X16X32_BF16_w64 : SWMMACInstGFX12<"v_swmmac_bf16_16x16x32_bf16", BF16_BF16_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_I32_16X16X32_IU8_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu8", I32_IU8_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_I32_16X16X32_IU4_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu4", I32_IU4X32_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_I32_16X16X64_IU4_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x64_iu4", I32_IU4X64_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_fp8", F32_FP8BF8_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_bf8", F32_FP8BF8_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_fp8", F32_FP8BF8_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_bf8", F32_FP8BF8_SWMMAC_w64, "_w64">;
+}
+
+// Emits two selection patterns for one WMMA intrinsic:
+//  1. P.WmmaInPat -> Inst#"_twoaddr".
+//  2. P.WmmaInlineInPat (src2 matched through WMMAVISrc) -> Inst#"_threeaddr",
+//     with AddedComplexity = 4.
+// IsGFX11OpselIntrinsic: f16_f16 and bf16_bf16 Intrinsics have imm operand that
+// controls opsel. Used by gfx11, removed in gfx12 (operand must be 0).
+// When set, a literal 0 is appended to the input pattern of both patterns.
+multiclass WMMAPat<string Inst, SDPatternOperator node, VOP3PWMMA_Profile P, bit IsGFX11OpselIntrinsic = 0> {
+ def : GCNPat <(P.DstVT !setdagop(!con(P.WmmaInPat, !if(IsGFX11OpselIntrinsic, (ins 0), (ins))), node)),
+ (P.DstVT !setdagop(P.WmmaOutPat, !cast<Instruction>(Inst#"_twoaddr")))>;
+ let AddedComplexity = 4 in
+ def : GCNPat <(P.DstVT !setdagop(!con(P.WmmaInlineInPat, !if(IsGFX11OpselIntrinsic, (ins 0), (ins))), node)),
+ (P.DstVT !setdagop(P.WmmaInlineOutPat, !cast<Instruction>(Inst#"_threeaddr")))>;
+}
+
+// Selection pattern for one SWMMAC intrinsic: matches `node` applied to
+// P.SwmmacInPat and emits `Inst` with the operands of P.SwmmacOutPat.
+class SWMMACPat<Instruction Inst, SDPatternOperator node, VOP3PWMMA_Profile P> :
+ GCNPat <(P.DstVT !setdagop(P.SwmmacInPat, node)),
+ (P.DstVT !setdagop(P.SwmmacOutPat, Inst))>;
+
+// Same source and result patterns as SWMMACPat, with WaveSizePredicate fixed
+// to isWave64 inside the class.
+// NOTE(review): not instantiated in the part of the file visible here; the
+// wave64 patterns use SWMMACPat under an outer `let` - confirm it is needed.
+class SWMMACPat_w64<Instruction Inst, SDPatternOperator node, VOP3PWMMA_Profile P> :
+ GCNPat <(P.DstVT !setdagop(P.SwmmacInPat, node)),
+ (P.DstVT !setdagop(P.SwmmacOutPat, Inst))>{
+ let WaveSizePredicate = isWave64;
+ }
+
+// Wave32 selection patterns for the GFX12 WMMA and SWMMAC intrinsics.
+// The f16_f16 and bf16_bf16 WMMA patterns pass IsGFX11OpselIntrinsic = 1
+// because those intrinsics carry an extra immediate operand.
+let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12Plus in {
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w32", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w32", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w32", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w32,1>;
+ defm : WMMAPat<"V_WMMA_BF16_16X16X16_BF16_w32", int_amdgcn_wmma_bf16_16x16x16_bf16, BF16_BF16_WMMA_w32,1>;
+ defm : WMMAPat<"V_WMMA_I32_16X16X16_IU8_w32", int_amdgcn_wmma_i32_16x16x16_iu8, I32_IU8_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_I32_16X16X16_IU4_w32", int_amdgcn_wmma_i32_16x16x16_iu4, I32_IU4X16_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_FP8_w32", int_amdgcn_wmma_f32_16x16x16_fp8_fp8, F32_FP8BF8_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_BF8_w32", int_amdgcn_wmma_f32_16x16x16_fp8_bf8, F32_FP8BF8_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x16_bf8_fp8, F32_FP8BF8_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x16_bf8_bf8, F32_FP8BF8_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_I32_16X16X32_IU4_w32", int_amdgcn_wmma_i32_16x16x32_iu4, I32_IU4X32_WMMA_w32>;
+
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_F16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_f16, F32_F16_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf16, F32_BF16_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F16_16X16X32_F16_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x32_f16, F16_F16_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16_16x16x32_bf16, BF16_BF16_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu8, I32_IU8_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu4, I32_IU4X32_SWMMAC_w32>;
+ // Uses SWMMACPat like its siblings; the class expands to the same GCNPat
+ // that was previously spelled out by hand for this instruction.
+ def : SWMMACPat<V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr, int_amdgcn_swmmac_i32_16x16x64_iu4, I32_IU4X64_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_fp8_fp8, F32_FP8BF8_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_fp8_bf8, F32_FP8BF8_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_fp8, F32_FP8BF8_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_bf8, F32_FP8BF8_SWMMAC_w32>;
+}
+
+// Wave64 selection patterns for the GFX12 WMMA and SWMMAC intrinsics.
+// The f16_f16 and bf16_bf16 WMMA patterns pass IsGFX11OpselIntrinsic = 1
+// because those intrinsics carry an extra immediate operand.
+let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12Plus in {
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w64", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w64", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w64", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w64,1>;
+ defm : WMMAPat<"V_WMMA_BF16_16X16X16_BF16_w64", int_amdgcn_wmma_bf16_16x16x16_bf16, BF16_BF16_WMMA_w64,1>;
+ defm : WMMAPat<"V_WMMA_I32_16X16X16_IU8_w64", int_amdgcn_wmma_i32_16x16x16_iu8, I32_IU8_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_I32_16X16X16_IU4_w64", int_amdgcn_wmma_i32_16x16x16_iu4, I32_IU4X16_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_FP8_w64", int_amdgcn_wmma_f32_16x16x16_fp8_fp8, F32_FP8BF8_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_BF8_w64", int_amdgcn_wmma_f32_16x16x16_fp8_bf8, F32_FP8BF8_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_FP8_w64", int_amdgcn_wmma_f32_16x16x16_bf8_fp8, F32_FP8BF8_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_BF8_w64", int_amdgcn_wmma_f32_16x16x16_bf8_bf8, F32_FP8BF8_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_I32_16X16X32_IU4_w64", int_amdgcn_wmma_i32_16x16x32_iu4, I32_IU4X32_WMMA_w64>;
+
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_F16_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_f16, F32_F16_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf16, F32_BF16_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_F16_16X16X32_F16_w64_twoaddr, int_amdgcn_swmmac_f16_16x16x32_f16, F16_F16_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr, int_amdgcn_swmmac_bf16_16x16x32_bf16, BF16_BF16_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu8, I32_IU8_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu4, I32_IU4X32_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr, int_amdgcn_swmmac_i32_16x16x64_iu4, I32_IU4X64_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_fp8_fp8, F32_FP8BF8_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_fp8_bf8, F32_FP8BF8_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_fp8, F32_FP8BF8_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_bf8, F32_FP8BF8_SWMMAC_w64>;
+}
+
+
//===----------------------------------------------------------------------===//
// Begin Real Encodings
//===----------------------------------------------------------------------===//
@@ -1005,6 +1400,99 @@ multiclass VOP3P_Real_Base<GFXGen Gen, bits<7> op, string backing_ps_name = NAME
VOP3Pe_gfx11_gfx12<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl>;
}
+class VOP3PeWmma<bits<7> op, VOPProfile P, VOP3PWMMA_Profile WMMAP>
+ : VOP3Pe_gfx11_gfx12<op, P>{ // Overrides selected VOP3P encoding bits according to the WMMA/SWMMAC profile WMMAP.
+ // opsel: bits 11-12 carry the index key selected by WMMAP.IndexType (0: both bits 0; 8: index_key_8bit{1:0}; 16: index_key_16bit{0} in bit 11 only); bit 13 is always 0.
+ let Inst{11} = !cond(!eq(WMMAP.IndexType, 0) : 0,
+ !eq(WMMAP.IndexType, 8) : index_key_8bit{0},
+ !eq(WMMAP.IndexType, 16) : index_key_16bit{0});
+ let Inst{12} = !if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0);
+ let Inst{13} = 0;
+ // opsel_hi: all three bits (59, 60, 14) are hard-wired to 1.
+ let Inst{59} = 1;
+ let Inst{60} = 1;
+ let Inst{14} = 1;
+ // neg_lo: bit 0 of each source's modifiers, encoded only when the profile enables it (NegLo01 gates src0 and src1, NegLo2 gates src2); otherwise 0.
+ let Inst{61} = !if(WMMAP.NegLo01, src0_modifiers{0}, 0);
+ let Inst{62} = !if(WMMAP.NegLo01, src1_modifiers{0}, 0);
+ let Inst{63} = !if(WMMAP.NegLo2, src2_modifiers{0}, 0);
+ // neg_hi: bit 1 of each source's modifiers, gated the same way by NegHi01 (src0, src1) and NegHi2 (src2).
+ let Inst{8} = !if(WMMAP.NegHi01, src0_modifiers{1}, 0);
+ let Inst{9} = !if(WMMAP.NegHi01, src1_modifiers{1}, 0);
+ let Inst{10} = !if(WMMAP.NegHi2, src2_modifiers{1}, 0);
+ // clamp: encoded only when WMMAP.IsIU is set; otherwise bit 15 is 0.
+ let Inst{15} = !if(WMMAP.IsIU, clamp{0}, 0);
+}
+// Defines the record named Gen.Suffix for the pseudo backing_ps_name by combining VOP3P_Real_Gen (with asmName) and the VOP3PeWmma encoding built from that pseudo's profile and WMMAP.
+multiclass VOP3P_WMMA_Real_Base<GFXGen Gen, bits<7> op, VOP3PWMMA_Profile WMMAP,
+ string backing_ps_name = NAME,
+ string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
+ def Gen.Suffix :
+ VOP3P_Real_Gen<!cast<VOP3P_Pseudo>(backing_ps_name), Gen, asmName>,
+ VOP3PeWmma<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl, WMMAP>;
+}
+// Wave32 GFX12 variant: instantiates the "_twoaddr" definition via VOP3P_WMMA_Real_Base under the isWave32 predicate, in the "GFX12" decoder namespace.
+multiclass VOP3P_Real_WMMA_gfx12 <bits<7> op, VOP3PWMMA_Profile WMMAP> {
+ let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in {
+ defm _twoaddr : VOP3P_WMMA_Real_Base <GFX12Gen, op, WMMAP>;
+ }
+}
+// Wave64 GFX12 variant: same "_twoaddr" expansion, but under the isWave64 predicate and in the separate "WMMAGFX12" decoder namespace.
+multiclass VOP3P_Real_WMMA_gfx12w64 <bits<7> op, VOP3PWMMA_Profile WMMAP> {
+ let WaveSizePredicate = isWave64, DecoderNamespace = "WMMAGFX12" in {
+ defm _twoaddr : VOP3P_WMMA_Real_Base <GFX12Gen, op, WMMAP>;
+ }
+}
+// Wave32 WMMA real encodings (opcodes 0x040-0x04a), each paired with its *_w32 profile.
+defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>;
+defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>;
+defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>;
+defm V_WMMA_BF16_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x043, BF16_BF16_WMMA_w32>;
+defm V_WMMA_I32_16X16X16_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x044, I32_IU8_WMMA_w32>;
+defm V_WMMA_I32_16X16X16_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x045, I32_IU4X16_WMMA_w32>;
+defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x046, F32_FP8BF8_WMMA_w32>;
+defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x047, F32_FP8BF8_WMMA_w32>;
+defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x048, F32_FP8BF8_WMMA_w32>;
+defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x049, F32_FP8BF8_WMMA_w32>;
+defm V_WMMA_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x04a, I32_IU4X32_WMMA_w32>;
+// Wave64 WMMA real encodings; the opcodes (0x040-0x04a) repeat the wave32 values and are paired with the *_w64 profiles.
+defm V_WMMA_F32_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x040, F32_F16_WMMA_w64>;
+defm V_WMMA_F32_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x041, F32_BF16_WMMA_w64>;
+defm V_WMMA_F16_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x042, F16_F16_WMMA_w64>;
+defm V_WMMA_BF16_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x043, BF16_BF16_WMMA_w64>;
+defm V_WMMA_I32_16X16X16_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x044, I32_IU8_WMMA_w64>;
+defm V_WMMA_I32_16X16X16_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x045, I32_IU4X16_WMMA_w64>;
+defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x046, F32_FP8BF8_WMMA_w64>;
+defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x047, F32_FP8BF8_WMMA_w64>;
+defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x048, F32_FP8BF8_WMMA_w64>;
+defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x049, F32_FP8BF8_WMMA_w64>;
+defm V_WMMA_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x04a, I32_IU4X32_WMMA_w64>;
+
+// Wave32 SWMMAC real encodings (opcodes 0x050-0x05a); they reuse the WMMA real multiclass with *_SWMMAC_w32 profiles.
+defm V_SWMMAC_F32_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x050, F32_F16_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x051, F32_BF16_SWMMAC_w32>;
+defm V_SWMMAC_F16_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x052, F16_F16_SWMMAC_w32>;
+defm V_SWMMAC_BF16_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x053, BF16_BF16_SWMMAC_w32>;
+defm V_SWMMAC_I32_16X16X32_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x054, I32_IU8_SWMMAC_w32>;
+defm V_SWMMAC_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x055, I32_IU4X32_SWMMAC_w32>;
+defm V_SWMMAC_I32_16X16X64_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x056, I32_IU4X64_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x057, F32_FP8BF8_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x058, F32_FP8BF8_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x059, F32_FP8BF8_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x05a, F32_FP8BF8_SWMMAC_w32>;
+// Wave64 SWMMAC real encodings; the opcodes (0x050-0x05a) repeat the wave32 values and are paired with the *_SWMMAC_w64 profiles.
+defm V_SWMMAC_F32_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x050, F32_F16_SWMMAC_w64>;
+defm V_SWMMAC_F32_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x051, F32_BF16_SWMMAC_w64>;
+defm V_SWMMAC_F16_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x052, F16_F16_SWMMAC_w64>;
+defm V_SWMMAC_BF16_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x053, BF16_BF16_SWMMAC_w64>;
+defm V_SWMMAC_I32_16X16X32_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x054, I32_IU8_SWMMAC_w64>;
+defm V_SWMMAC_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x055, I32_IU4X32_SWMMAC_w64>;
+defm V_SWMMAC_I32_16X16X64_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x056, I32_IU4X64_SWMMAC_w64>;
+defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x057, F32_FP8BF8_SWMMAC_w64>;
+defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x058, F32_FP8BF8_SWMMAC_w64>;
+defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x059, F32_FP8BF8_SWMMAC_w64>;
+defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x05a, F32_FP8BF8_SWMMAC_w64>;
+
multiclass VOP3P_Real_with_name<GFXGen Gen, bits<7> op,
string backing_ps_name = NAME,
string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index df505c3365cb..20d7c88fb7e5 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -124,6 +124,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
let IsPacked = P.IsPacked;
let IsMAI = P.IsMAI;
let IsWMMA = P.IsWMMA;
+ let IsSWMMAC = P.IsSWMMAC;
let AsmOperands = !if(isVop3OpSel,
P.AsmVOP3OpSel,
@@ -305,6 +306,11 @@ class VOP3OpSel_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
class VOP3OpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3OpSel_gfx10<op, p>;
+class VOP3FP8OpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> { // VOP3 encoding variant; in this patch VOP3_Real_with_name selects it for profiles with IsFP8 set.
+ let Inst{11} = !if(p.HasSrc0, src0_modifiers{2}, 0); // Bits 11 and 12 are both taken from src0's modifiers (bits 2 and 3); 0 when the profile has no src0.
+ let Inst{12} = !if(p.HasSrc0, src0_modifiers{3}, 0);
+}
+
class VOP3DotOpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3OpSel_gfx11_gfx12<op, p>{
let Inst{11} = ?;
let Inst{12} = ?;
@@ -378,6 +384,8 @@ class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 {
bits<4> src2_modifiers;
bits<9> src2;
bits<1> clamp;
+ bits<2> index_key_8bit;
+ bits<1> index_key_16bit;
let Inst{7-0} = vdst;
let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0
@@ -738,7 +746,7 @@ class VOP3_DPPe_Common_Base<bits<10> op, VOPProfile P> : Enc96 {
let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0);
// OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs.
let Inst{11} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{2}, 0),?);
- let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, 0),?);
+ let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, !if((P.IsFP8), src0_modifiers{3}, 0)), ?);
let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),?);
let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),?);
let Inst{15} = !if(P.HasClamp, clamp, 0);
@@ -1406,14 +1414,20 @@ multiclass VOP3_Real_with_name<GFXGen Gen, bits<10> op, string opName,
defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
let AsmString = asmName # ps.AsmOperands,
IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
- if ps.Pfl.HasOpSel then
- def _e64#Gen.Suffix :
- VOP3_Real_Gen<ps, Gen>,
- VOP3OpSel_gfx11_gfx12<op, ps.Pfl>;
- if !not(ps.Pfl.HasOpSel) then
- def _e64#Gen.Suffix :
- VOP3_Real_Gen<ps, Gen>,
- VOP3e_gfx11_gfx12<op, ps.Pfl>;
+ if ps.Pfl.IsFP8 then {
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<ps, Gen>,
+ VOP3FP8OpSel_gfx11_gfx12<op, ps.Pfl>;
+ } else {
+ if ps.Pfl.HasOpSel then
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<ps, Gen>,
+ VOP3OpSel_gfx11_gfx12<op, ps.Pfl>;
+ if !not(ps.Pfl.HasOpSel) then
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<ps, Gen>,
+ VOP3e_gfx11_gfx12<op, ps.Pfl>;
+ }
}
def Gen.Suffix#"_VOP3_alias" : MnemonicAlias<ps.Mnemonic, asmName>, Requires<[Gen.AssemblerPredicate]>, LetDummies;
}
diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index 5c1c7046fdbf..8629551152cb 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -1806,12 +1806,13 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
PostOrderLoopTraversal DFS(LoLoop.ML, *MLI);
DFS.ProcessLoop();
const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder();
- for (auto *MBB : PostOrder) {
- recomputeLiveIns(*MBB);
- // FIXME: For some reason, the live-in print order is non-deterministic for
- // our tests and I can't out why... So just sort them.
- MBB->sortUniqueLiveIns();
- }
+ bool anyChange = false;
+ do {
+ anyChange = false;
+ for (auto *MBB : PostOrder) {
+ anyChange = recomputeLiveIns(*MBB) || anyChange;
+ }
+ } while (anyChange);
for (auto *MBB : reverse(PostOrder))
recomputeLivenessFlags(*MBB);
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
index 04349aa52b54..d47dded9ea6e 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
@@ -21,17 +21,20 @@ using namespace llvm;
TypeSize LoongArchTTIImpl::getRegisterBitWidth(
TargetTransformInfo::RegisterKind K) const {
+ TypeSize DefSize = TargetTransformInfoImplBase::getRegisterBitWidth(K);
switch (K) {
case TargetTransformInfo::RGK_Scalar:
return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
case TargetTransformInfo::RGK_FixedWidthVector:
- if (ST->hasExtLASX() && ST->hasExpAutoVec())
+ if (!ST->hasExpAutoVec())
+ return DefSize;
+ if (ST->hasExtLASX())
return TypeSize::getFixed(256);
- if (ST->hasExtLSX() && ST->hasExpAutoVec())
+ if (ST->hasExtLSX())
return TypeSize::getFixed(128);
- return TypeSize::getFixed(0);
+ [[fallthrough]];
case TargetTransformInfo::RGK_ScalableVector:
- return TypeSize::getScalable(0);
+ return DefSize;
}
llvm_unreachable("Unsupported register kind");
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index 718844bc36ff..66b2b0de8d52 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -471,45 +471,6 @@ void MipsAsmPrinter::emitBasicBlockEnd(const MachineBasicBlock &MBB) {
TS.emitDirectiveInsn();
}
-/// isBlockOnlyReachableByFallthough - Return true if the basic block has
-/// exactly one predecessor and the control transfer mechanism between
-/// the predecessor and this block is a fall-through.
-bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock*
- MBB) const {
- // The predecessor has to be immediately before this block.
- const MachineBasicBlock *Pred = *MBB->pred_begin();
-
- // If the predecessor is a switch statement, assume a jump table
- // implementation, so it is not a fall through.
- if (const BasicBlock *bb = Pred->getBasicBlock())
- if (isa<SwitchInst>(bb->getTerminator()))
- return false;
-
- // If this is a landing pad, it isn't a fall through. If it has no preds,
- // then nothing falls through to it.
- if (MBB->isEHPad() || MBB->pred_empty())
- return false;
-
- // If there isn't exactly one predecessor, it can't be a fall through.
- if (MBB->pred_size() != 1)
- return false;
-
- // The predecessor has to be immediately before this block.
- if (!Pred->isLayoutSuccessor(MBB))
- return false;
-
- // If the block is completely empty, then it definitely does fall through.
- if (Pred->empty())
- return true;
-
- // Otherwise, check the last instruction.
- // Check if the last terminator is an unconditional branch.
- MachineBasicBlock::const_iterator I = Pred->end();
- while (I != Pred->begin() && !(--I)->isTerminator()) ;
-
- return !I->isBarrier();
-}
-
// Print out an operand for an inline asm expression.
bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
const char *ExtraCode, raw_ostream &O) {
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.h b/llvm/lib/Target/Mips/MipsAsmPrinter.h
index 64424b181504..0b55089385d7 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.h
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.h
@@ -142,8 +142,6 @@ public:
void emitFunctionBodyStart() override;
void emitFunctionBodyEnd() override;
void emitBasicBlockEnd(const MachineBasicBlock &MBB) override;
- bool isBlockOnlyReachableByFallthrough(
- const MachineBasicBlock* MBB) const override;
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
const char *ExtraCode, raw_ostream &O) override;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
diff --git a/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp
index aee57a5075ff..b43eee8fdd8c 100644
--- a/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp
+++ b/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp
@@ -208,8 +208,10 @@ bool PPCExpandAtomicPseudo::expandAtomicRMW128(
.addMBB(LoopMBB);
CurrentMBB->addSuccessor(LoopMBB);
CurrentMBB->addSuccessor(ExitMBB);
- recomputeLiveIns(*LoopMBB);
- recomputeLiveIns(*ExitMBB);
+ bool anyChange = false;
+ do {
+ anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*LoopMBB);
+ } while (anyChange);
NMBBI = MBB.end();
MI.eraseFromParent();
return true;
@@ -286,9 +288,11 @@ bool PPCExpandAtomicPseudo::expandAtomicCmpSwap128(
CurrentMBB->addSuccessor(LoopCmpMBB);
CurrentMBB->addSuccessor(ExitMBB);
- recomputeLiveIns(*LoopCmpMBB);
- recomputeLiveIns(*CmpSuccMBB);
- recomputeLiveIns(*ExitMBB);
+ bool anyChange = false;
+ do {
+ anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*CmpSuccMBB) ||
+ recomputeLiveIns(*LoopCmpMBB);
+ } while (anyChange);
NMBBI = MBB.end();
MI.eraseFromParent();
return true;
diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index 245e78641ed6..6792842f8550 100644
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -1441,8 +1441,11 @@ void PPCFrameLowering::inlineStackProbe(MachineFunction &MF,
ProbeLoopBodyMBB->addSuccessor(ProbeLoopBodyMBB);
}
// Update liveins.
- recomputeLiveIns(*ProbeLoopBodyMBB);
- recomputeLiveIns(*ProbeExitMBB);
+ bool anyChange = false;
+ do {
+ anyChange = recomputeLiveIns(*ProbeExitMBB) ||
+ recomputeLiveIns(*ProbeLoopBodyMBB);
+ } while (anyChange);
return ProbeExitMBB;
};
// For case HasBP && MaxAlign > 1, we have to realign the SP by performing
@@ -1534,8 +1537,10 @@ void PPCFrameLowering::inlineStackProbe(MachineFunction &MF,
buildDefCFAReg(*ExitMBB, ExitMBB->begin(), SPReg);
}
// Update liveins.
- recomputeLiveIns(*LoopMBB);
- recomputeLiveIns(*ExitMBB);
+ bool anyChange = false;
+ do {
+ anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*LoopMBB);
+ } while (anyChange);
}
}
++NumPrologProbed;
diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td
index e6e879282241..27d52c16a4f3 100644
--- a/llvm/lib/Target/RISCV/RISCV.td
+++ b/llvm/lib/Target/RISCV/RISCV.td
@@ -31,6 +31,12 @@ include "RISCVInstrInfo.td"
include "GISel/RISCVRegisterBanks.td"
//===----------------------------------------------------------------------===//
+// RISC-V macro fusions.
+//===----------------------------------------------------------------------===//
+
+include "RISCVMacroFusion.td"
+
+//===----------------------------------------------------------------------===//
// RISC-V Scheduling Models
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 3878be680c04..26451c80f57b 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -72,7 +72,7 @@ def FeatureStdExtZicntr
[FeatureStdExtZicsr]>;
def FeatureStdExtZicond
- : SubtargetFeature<"experimental-zicond", "HasStdExtZicond", "true",
+ : SubtargetFeature<"zicond", "HasStdExtZicond", "true",
"'Zicond' (Integer Conditional Operations)">;
def HasStdExtZicond : Predicate<"Subtarget->hasStdExtZicond()">,
AssemblerPredicate<(all_of FeatureStdExtZicond),
@@ -1044,30 +1044,6 @@ def TuneDLenFactor2
: SubtargetFeature<"dlen-factor-2", "DLenFactor2", "true",
"Vector unit DLEN(data path width) is half of VLEN">;
-def TuneLUIADDIFusion
- : SubtargetFeature<"lui-addi-fusion", "HasLUIADDIFusion",
- "true", "Enable LUI+ADDI macrofusion">;
-
-def TuneAUIPCADDIFusion
- : SubtargetFeature<"auipc-addi-fusion", "HasAUIPCADDIFusion",
- "true", "Enable AUIPC+ADDI macrofusion">;
-
-def TuneZExtHFusion
- : SubtargetFeature<"zexth-fusion", "HasZExtHFusion",
- "true", "Enable SLLI+SRLI to be fused to zero extension of halfword">;
-
-def TuneZExtWFusion
- : SubtargetFeature<"zextw-fusion", "HasZExtWFusion",
- "true", "Enable SLLI+SRLI to be fused to zero extension of word">;
-
-def TuneShiftedZExtWFusion
- : SubtargetFeature<"shifted-zextw-fusion", "HasShiftedZExtWFusion",
- "true", "Enable SLLI+SRLI to be fused when computing (shifted) zero extension of word">;
-
-def TuneLDADDFusion
- : SubtargetFeature<"ld-add-fusion", "HasLDADDFusion",
- "true", "Enable LD+ADD macrofusion.">;
-
def TuneNoDefaultUnroll
: SubtargetFeature<"no-default-unroll", "EnableDefaultUnroll", "false",
"Disable default unroll preference.">;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 47c6cd6e5487..7895d74f06d1 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4718,7 +4718,7 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
if (SrcVecIdx == -1)
continue;
unsigned ExtractIdx = (SrcVecIdx % VRegsPerSrc) * NumOpElts;
- SDValue SrcVec = (unsigned)SrcVecIdx > VRegsPerSrc ? V2 : V1;
+ SDValue SrcVec = (unsigned)SrcVecIdx >= VRegsPerSrc ? V2 : V1;
SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
DAG.getVectorIdxConstant(ExtractIdx, DL));
SubVec = convertFromScalableVector(OneRegVT, SubVec, DAG, Subtarget);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZicond.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZicond.td
index 0790a941823b..35d3fdae0bd7 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZicond.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZicond.td
@@ -8,8 +8,6 @@
//
// This file describes the RISC-V instructions from the standard Integer
// Conditional operations extension (Zicond).
-// This version is still experimental as the 'Zicond' extension hasn't been
-// ratified yet. It is based on v1.0-rc1 of the specification.
//
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp b/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp
deleted file mode 100644
index f948f05b22f7..000000000000
--- a/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp
+++ /dev/null
@@ -1,210 +0,0 @@
-//===- RISCVMacroFusion.cpp - RISC-V Macro Fusion -------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file This file contains the RISC-V implementation of the DAG scheduling
-/// mutation to pair instructions back to back.
-//
-//===----------------------------------------------------------------------===//
-//
-#include "RISCVMacroFusion.h"
-#include "RISCVSubtarget.h"
-#include "llvm/CodeGen/MacroFusion.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
-
-using namespace llvm;
-
-static bool checkRegisters(Register FirstDest, const MachineInstr &SecondMI) {
- if (!SecondMI.getOperand(1).isReg())
- return false;
-
- if (SecondMI.getOperand(1).getReg() != FirstDest)
- return false;
-
- // If the input is virtual make sure this is the only user.
- if (FirstDest.isVirtual()) {
- auto &MRI = SecondMI.getMF()->getRegInfo();
- return MRI.hasOneNonDBGUse(FirstDest);
- }
-
- return SecondMI.getOperand(0).getReg() == FirstDest;
-}
-
-// Fuse load with add:
-// add rd, rs1, rs2
-// ld rd, 0(rd)
-static bool isLDADD(const MachineInstr *FirstMI, const MachineInstr &SecondMI) {
- if (SecondMI.getOpcode() != RISCV::LD)
- return false;
-
- if (!SecondMI.getOperand(2).isImm())
- return false;
-
- if (SecondMI.getOperand(2).getImm() != 0)
- return false;
-
- // Given SecondMI, when FirstMI is unspecified, we must return
- // if SecondMI may be part of a fused pair at all.
- if (!FirstMI)
- return true;
-
- if (FirstMI->getOpcode() != RISCV::ADD)
- return true;
-
- return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI);
-}
-
-// Fuse zero extension of halfword:
-// slli rd, rs1, 48
-// srli rd, rd, 48
-static bool isZExtH(const MachineInstr *FirstMI, const MachineInstr &SecondMI) {
- if (SecondMI.getOpcode() != RISCV::SRLI)
- return false;
-
- if (!SecondMI.getOperand(2).isImm())
- return false;
-
- if (SecondMI.getOperand(2).getImm() != 48)
- return false;
-
- // Given SecondMI, when FirstMI is unspecified, we must return
- // if SecondMI may be part of a fused pair at all.
- if (!FirstMI)
- return true;
-
- if (FirstMI->getOpcode() != RISCV::SLLI)
- return false;
-
- if (FirstMI->getOperand(2).getImm() != 48)
- return false;
-
- return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI);
-}
-
-// Fuse zero extension of word:
-// slli rd, rs1, 32
-// srli rd, rd, 32
-static bool isZExtW(const MachineInstr *FirstMI, const MachineInstr &SecondMI) {
- if (SecondMI.getOpcode() != RISCV::SRLI)
- return false;
-
- if (!SecondMI.getOperand(2).isImm())
- return false;
-
- if (SecondMI.getOperand(2).getImm() != 32)
- return false;
-
- // Given SecondMI, when FirstMI is unspecified, we must return
- // if SecondMI may be part of a fused pair at all.
- if (!FirstMI)
- return true;
-
- if (FirstMI->getOpcode() != RISCV::SLLI)
- return false;
-
- if (FirstMI->getOperand(2).getImm() != 32)
- return false;
-
- return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI);
-}
-
-// Fuse shifted zero extension of word:
-// slli rd, rs1, 32
-// srli rd, rd, x
-// where 0 <= x < 32
-static bool isShiftedZExtW(const MachineInstr *FirstMI,
- const MachineInstr &SecondMI) {
- if (SecondMI.getOpcode() != RISCV::SRLI)
- return false;
-
- if (!SecondMI.getOperand(2).isImm())
- return false;
-
- unsigned SRLIImm = SecondMI.getOperand(2).getImm();
- if (SRLIImm >= 32)
- return false;
-
- // Given SecondMI, when FirstMI is unspecified, we must return
- // if SecondMI may be part of a fused pair at all.
- if (!FirstMI)
- return true;
-
- if (FirstMI->getOpcode() != RISCV::SLLI)
- return false;
-
- if (FirstMI->getOperand(2).getImm() != 32)
- return false;
-
- return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI);
-}
-
-// Fuse AUIPC followed by ADDI
-// auipc rd, imm20
-// addi rd, rd, imm12
-static bool isAUIPCADDI(const MachineInstr *FirstMI,
- const MachineInstr &SecondMI) {
- if (SecondMI.getOpcode() != RISCV::ADDI)
- return false;
- // Assume the 1st instr to be a wildcard if it is unspecified.
- if (!FirstMI)
- return true;
-
- if (FirstMI->getOpcode() != RISCV::AUIPC)
- return false;
-
- return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI);
-}
-
-// Fuse LUI followed by ADDI or ADDIW.
-// rd = imm[31:0] which decomposes to
-// lui rd, imm[31:12]
-// addi(w) rd, rd, imm[11:0]
-static bool isLUIADDI(const MachineInstr *FirstMI,
- const MachineInstr &SecondMI) {
- if (SecondMI.getOpcode() != RISCV::ADDI &&
- SecondMI.getOpcode() != RISCV::ADDIW)
- return false;
- // Assume the 1st instr to be a wildcard if it is unspecified.
- if (!FirstMI)
- return true;
-
- if (FirstMI->getOpcode() != RISCV::LUI)
- return false;
-
- return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI);
-}
-
-static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
- const TargetSubtargetInfo &TSI,
- const MachineInstr *FirstMI,
- const MachineInstr &SecondMI) {
- const RISCVSubtarget &ST = static_cast<const RISCVSubtarget &>(TSI);
-
- if (ST.hasLUIADDIFusion() && isLUIADDI(FirstMI, SecondMI))
- return true;
-
- if (ST.hasAUIPCADDIFusion() && isAUIPCADDI(FirstMI, SecondMI))
- return true;
-
- if (ST.hasZExtHFusion() && isZExtH(FirstMI, SecondMI))
- return true;
-
- if (ST.hasZExtWFusion() && isZExtW(FirstMI, SecondMI))
- return true;
-
- if (ST.hasShiftedZExtWFusion() && isShiftedZExtW(FirstMI, SecondMI))
- return true;
-
- if (ST.hasLDADDFusion() && isLDADD(FirstMI, SecondMI))
- return true;
-
- return false;
-}
-
-std::unique_ptr<ScheduleDAGMutation> llvm::createRISCVMacroFusionDAGMutation() {
- return createMacroFusionDAGMutation(shouldScheduleAdjacent);
-}
diff --git a/llvm/lib/Target/RISCV/RISCVMacroFusion.h b/llvm/lib/Target/RISCV/RISCVMacroFusion.h
deleted file mode 100644
index 7598db3f8fe1..000000000000
--- a/llvm/lib/Target/RISCV/RISCVMacroFusion.h
+++ /dev/null
@@ -1,28 +0,0 @@
-//===- RISCVMacroFusion.h - RISC-V Macro Fusion -----------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file This file contains the RISC-V definition of the DAG scheduling
-/// mutation to pair instructions back to back.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_RISCV_RISCVMACROFUSION_H
-#define LLVM_LIB_TARGET_RISCV_RISCVMACROFUSION_H
-
-#include "llvm/CodeGen/MachineScheduler.h"
-
-namespace llvm {
-
-/// Note that you have to add:
-/// DAG.addMutation(createRISCVMacroFusionDAGMutation());
-/// to RISCVPassConfig::createMachineScheduler() to have an effect.
-std::unique_ptr<ScheduleDAGMutation> createRISCVMacroFusionDAGMutation();
-
-} // namespace llvm
-
-#endif
diff --git a/llvm/lib/Target/RISCV/RISCVMacroFusion.td b/llvm/lib/Target/RISCV/RISCVMacroFusion.td
new file mode 100644
index 000000000000..875a93d09a2c
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVMacroFusion.td
@@ -0,0 +1,93 @@
+//==----- RISCVMacroFusion.td - Macro Fusion Definitions -----*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the macro fusion predicates.
+
+// Fuse LUI followed by ADDI or ADDIW:
+// rd = imm[31:0] which decomposes to
+// lui rd, imm[31:12]
+// addi(w) rd, rd, imm[11:0]
+def TuneLUIADDIFusion
+ : SimpleFusion<"lui-addi-fusion", "HasLUIADDIFusion",
+ "Enable LUI+ADDI macro fusion", // NOTE(review): the AUIPC+ADDI and LD+ADD descriptions in this file spell it "macrofusion".
+ CheckOpcode<[LUI]>, // predicate for the leading instruction of the sequence sketched above
+ CheckOpcode<[ADDI, ADDIW]>>; // predicate for the trailing instruction
+
+// Fuse AUIPC followed by ADDI:
+// auipc rd, imm20
+// addi rd, rd, imm12
+def TuneAUIPCADDIFusion
+ : SimpleFusion<"auipc-addi-fusion", "HasAUIPCADDIFusion",
+ "Enable AUIPC+ADDI macrofusion",
+ CheckOpcode<[AUIPC]>, // predicate for the leading instruction of the sequence sketched above
+ CheckOpcode<[ADDI]>>; // predicate for the trailing instruction
+
+// Fuse zero extension of halfword:
+// slli rd, rs1, 48
+// srli rd, rd, 48
+def TuneZExtHFusion
+ : SimpleFusion<"zexth-fusion", "HasZExtHFusion",
+ "Enable SLLI+SRLI to be fused to zero extension of halfword",
+ CheckAll<[ // leading instruction: SLLI whose operand 2 is the immediate 48
+ CheckOpcode<[SLLI]>,
+ CheckIsImmOperand<2>,
+ CheckImmOperand<2, 48>
+ ]>,
+ CheckAll<[ // trailing instruction: SRLI whose operand 2 is the immediate 48
+ CheckOpcode<[SRLI]>,
+ CheckIsImmOperand<2>,
+ CheckImmOperand<2, 48>
+ ]>>;
+
+// Fuse zero extension of word:
+// slli rd, rs1, 32
+// srli rd, rd, 32
+def TuneZExtWFusion
+ : SimpleFusion<"zextw-fusion", "HasZExtWFusion",
+ "Enable SLLI+SRLI to be fused to zero extension of word",
+ CheckAll<[ // leading instruction: SLLI whose operand 2 is the immediate 32
+ CheckOpcode<[SLLI]>,
+ CheckIsImmOperand<2>,
+ CheckImmOperand<2, 32>
+ ]>,
+ CheckAll<[ // trailing instruction: SRLI whose operand 2 is the immediate 32
+ CheckOpcode<[SRLI]>,
+ CheckIsImmOperand<2>,
+ CheckImmOperand<2, 32>
+ ]>>;
+
+// Fuse shifted zero extension of word:
+// slli rd, rs1, 32
+// srli rd, rd, x
+// where 0 <= x < 32
+def TuneShiftedZExtWFusion
+ : SimpleFusion<"shifted-zextw-fusion", "HasShiftedZExtWFusion",
+ "Enable SLLI+SRLI to be fused when computing (shifted) word zero extension",
+ CheckAll<[ // leading instruction: SLLI whose operand 2 is the immediate 32
+ CheckOpcode<[SLLI]>,
+ CheckIsImmOperand<2>,
+ CheckImmOperand<2, 32>
+ ]>,
+ CheckAll<[ // trailing instruction: SRLI with an immediate shift amount in the range below
+ CheckOpcode<[SRLI]>,
+ CheckIsImmOperand<2>,
+ CheckImmOperandRange<2, 0, 31> // matches "0 <= x < 32" above, assuming both bounds are inclusive - TODO confirm
+ ]>>;
+
+// Fuse load with add:
+// add rd, rs1, rs2
+// ld rd, 0(rd)
+def TuneLDADDFusion
+ : SimpleFusion<"ld-add-fusion", "HasLDADDFusion", "Enable LD+ADD macrofusion",
+ CheckOpcode<[ADD]>, // predicate for the leading instruction of the sequence sketched above
+ CheckAll<[ // trailing instruction: LD whose operand 2 is the immediate 0 (the zero offset in "0(rd)")
+ CheckOpcode<[LD]>,
+ CheckIsImmOperand<2>,
+ CheckImmOperand<2, 0>
+ ]>>;
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
index 7b64d3cee9c8..d3236bb07d56 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -16,8 +16,9 @@
#include "GISel/RISCVRegisterBankInfo.h"
#include "RISCV.h"
#include "RISCVFrameLowering.h"
-#include "RISCVMacroFusion.h"
#include "RISCVTargetMachine.h"
+#include "llvm/CodeGen/MacroFusion.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
@@ -29,6 +30,9 @@ using namespace llvm;
#define GET_SUBTARGETINFO_CTOR
#include "RISCVGenSubtargetInfo.inc"
+#define GET_RISCV_MACRO_FUSION_PRED_IMPL
+#include "RISCVGenMacroFusion.inc"
+
namespace llvm::RISCVTuneInfoTable {
#define GET_RISCVTuneInfoTable_IMPL
@@ -187,7 +191,7 @@ bool RISCVSubtarget::enableSubRegLiveness() const {
void RISCVSubtarget::getPostRAMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
- Mutations.push_back(createRISCVMacroFusionDAGMutation());
+ Mutations.push_back(createMacroFusionDAGMutation(getMacroFusions()));
}
/// Enable use of alias analysis during code generation (during MI
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 2ba93764facd..8c55efa69a6a 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -27,6 +27,9 @@
#include "llvm/Target/TargetMachine.h"
#include <bitset>
+#define GET_RISCV_MACRO_FUSION_PRED_DECL
+#include "RISCVGenMacroFusion.inc"
+
#define GET_SUBTARGETINFO_HEADER
#include "RISCVGenSubtargetInfo.inc"
@@ -196,11 +199,6 @@ public:
return UserReservedRegister[i];
}
- bool hasMacroFusion() const {
- return hasLUIADDIFusion() || hasAUIPCADDIFusion() || hasZExtHFusion() ||
- hasZExtWFusion() || hasShiftedZExtWFusion() || hasLDADDFusion();
- }
-
// Vector codegen related methods.
bool hasVInstructions() const { return HasStdExtZve32x; }
bool hasVInstructionsI64() const { return HasStdExtZve64x; }
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index b4b81b545a54..2285c99d7901 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -14,7 +14,6 @@
#include "MCTargetDesc/RISCVBaseInfo.h"
#include "RISCV.h"
#include "RISCVMachineFunctionInfo.h"
-#include "RISCVMacroFusion.h"
#include "RISCVTargetObjectFile.h"
#include "RISCVTargetTransformInfo.h"
#include "TargetInfo/RISCVTargetInfo.h"
@@ -26,6 +25,8 @@
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/MIRYamlMapping.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/MacroFusion.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
@@ -361,9 +362,10 @@ public:
DAG->addMutation(createLoadClusterDAGMutation(
DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true));
}
- if (ST.hasMacroFusion()) {
+ const auto &MacroFusions = ST.getMacroFusions();
+ if (!MacroFusions.empty()) {
DAG = DAG ? DAG : createGenericSchedLive(C);
- DAG->addMutation(createRISCVMacroFusionDAGMutation());
+ DAG->addMutation(createMacroFusionDAGMutation(MacroFusions));
}
return DAG;
}
@@ -371,9 +373,10 @@ public:
ScheduleDAGInstrs *
createPostMachineScheduler(MachineSchedContext *C) const override {
const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>();
- if (ST.hasMacroFusion()) {
+ const auto &MacroFusions = ST.getMacroFusions();
+ if (!MacroFusions.empty()) {
ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
- DAG->addMutation(createRISCVMacroFusionDAGMutation());
+ DAG->addMutation(createMacroFusionDAGMutation(MacroFusions));
return DAG;
}
return nullptr;
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
index db19c8881c68..80c994a32ea9 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -840,8 +840,10 @@ void SystemZELFFrameLowering::inlineStackProbe(
StackAllocMI->eraseFromParent();
if (DoneMBB != nullptr) {
// Compute the live-in lists for the new blocks.
- recomputeLiveIns(*DoneMBB);
- recomputeLiveIns(*LoopMBB);
+ bool anyChange = false;
+ do {
+ anyChange = recomputeLiveIns(*DoneMBB) || recomputeLiveIns(*LoopMBB);
+ } while (anyChange);
}
}
@@ -1439,8 +1441,10 @@ void SystemZXPLINKFrameLowering::inlineStackProbe(
StackAllocMI->eraseFromParent();
// Compute the live-in lists for the new blocks.
- recomputeLiveIns(*NextMBB);
- recomputeLiveIns(*StackExtMBB);
+ bool anyChange = false;
+ do {
+ anyChange = recomputeLiveIns(*StackExtMBB) || recomputeLiveIns(*NextMBB);
+ } while (anyChange);
}
bool SystemZXPLINKFrameLowering::hasFP(const MachineFunction &MF) const {
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp
index 9f0fd4d0938e..87ec8aa23080 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -877,7 +877,6 @@ void X86AsmPrinter::emitStartOfAsmFile(Module &M) {
OutStreamer->emitInt32(FeatureFlagsAnd); // data
emitAlignment(WordSize == 4 ? Align(4) : Align(8)); // padding
- OutStreamer->endSection(Nt);
OutStreamer->switchSection(Cur);
}
}
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index c0d358ead278..c2f76a3b8abb 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -885,8 +885,10 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop(
}
// Update Live In information
- recomputeLiveIns(*testMBB);
- recomputeLiveIns(*tailMBB);
+ bool anyChange = false;
+ do {
+ anyChange = recomputeLiveIns(*tailMBB) || recomputeLiveIns(*testMBB);
+ } while (anyChange);
}
void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64(
@@ -1378,10 +1380,11 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
footMBB->addSuccessor(&MBB);
}
- recomputeLiveIns(*headMBB);
- recomputeLiveIns(*bodyMBB);
- recomputeLiveIns(*footMBB);
- recomputeLiveIns(MBB);
+ bool anyChange = false;
+ do {
+ anyChange = recomputeLiveIns(*footMBB) || recomputeLiveIns(*bodyMBB) ||
+ recomputeLiveIns(*headMBB) || recomputeLiveIns(MBB);
+ } while (anyChange);
}
} else {
MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg)
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index fe7d90fbcdf7..bb5e22c71427 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -12422,7 +12422,7 @@ multiclass GF2P8AFFINE_avx512_rmb_imm<bits<8> Op, string OpStr, SDNode OpNode,
: avx512_3Op_rm_imm8<Op, OpStr, OpNode, sched, VTI, VTI> {
let ExeDomain = VTI.ExeDomain in
defm rmbi : AVX512_maskable<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
- (ins VTI.RC:$src1, VTI.ScalarMemOp:$src2, u8imm:$src3),
+ (ins VTI.RC:$src1, BcstVTI.ScalarMemOp:$src2, u8imm:$src3),
OpStr, "$src3, ${src2}"#BcstVTI.BroadcastStr#", $src1",
"$src1, ${src2}"#BcstVTI.BroadcastStr#", $src3",
(OpNode (VTI.VT VTI.RC:$src1),
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
index c9d0f66c6e46..63136af2295f 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -291,12 +291,15 @@ struct X86BroadcastFoldTable {
static bool matchBroadcastSize(const X86FoldTableEntry &Entry,
unsigned BroadcastBits) {
switch (Entry.Flags & TB_BCAST_MASK) {
- case TB_BCAST_SD:
- case TB_BCAST_Q:
- return BroadcastBits == 64;
- case TB_BCAST_SS:
+ case TB_BCAST_W:
+ case TB_BCAST_SH:
+ return BroadcastBits == 16;
case TB_BCAST_D:
+ case TB_BCAST_SS:
return BroadcastBits == 32;
+ case TB_BCAST_Q:
+ case TB_BCAST_SD:
+ return BroadcastBits == 64;
}
return false;
}
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index d6f9aa6d6ace..9ac1f783b7f0 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -2354,33 +2354,26 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
case X86::VBLENDPSrri:
// If we're optimizing for size, try to use MOVSD/MOVSS.
if (MI.getParent()->getParent()->getFunction().hasOptSize()) {
- unsigned Mask;
- switch (Opc) {
- default:
- llvm_unreachable("Unreachable!");
- case X86::BLENDPDrri:
- Opc = X86::MOVSDrr;
- Mask = 0x03;
- break;
- case X86::BLENDPSrri:
- Opc = X86::MOVSSrr;
- Mask = 0x0F;
- break;
- case X86::VBLENDPDrri:
- Opc = X86::VMOVSDrr;
- Mask = 0x03;
- break;
- case X86::VBLENDPSrri:
- Opc = X86::VMOVSSrr;
- Mask = 0x0F;
- break;
- }
+ unsigned Mask = (Opc == X86::BLENDPDrri || Opc == X86::VBLENDPDrri) ? 0x03: 0x0F;
if ((MI.getOperand(3).getImm() ^ Mask) == 1) {
+#define FROM_TO(FROM, TO) \
+ case X86::FROM: \
+ Opc = X86::TO; \
+ break;
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unreachable!");
+ FROM_TO(BLENDPDrri, MOVSDrr)
+ FROM_TO(BLENDPSrri, MOVSSrr)
+ FROM_TO(VBLENDPDrri, VMOVSDrr)
+ FROM_TO(VBLENDPSrri, VMOVSSrr)
+ }
WorkingMI = CloneIfNew(MI);
WorkingMI->setDesc(get(Opc));
WorkingMI->removeOperand(3);
break;
}
+#undef FROM_TO
}
[[fallthrough]];
case X86::PBLENDWrri: