author    Dimitry Andric <dim@FreeBSD.org>  2024-01-09 19:58:18 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2024-01-09 19:58:18 +0000
commit    aca2e42c67292825f835f094eb0c4df5ce6013db (patch)
tree      9cfb7eeef35545100c4f7219e794e6a0306ea6a6 /llvm/lib/Target
parent    77dbea07356e1ab2f37a777d4d1ddc5dd3e301c2 (diff)
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64.h | 1
-rw-r--r--  llvm/lib/Target/AArch64/AArch64.td | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64FastISel.cpp | 19
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 6
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 15
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrFormats.td | 1
-rw-r--r--  llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.cpp | 828
-rw-r--r--  llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.h | 25
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td | 5
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetMachine.cpp | 11
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetMachine.h | 4
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h | 2
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 59
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h | 4
-rw-r--r--  llvm/lib/Target/AArch64/SVEInstrFormats.td | 12
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.td | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 26
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 23
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 58
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 29
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 32
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 39
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 34
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 168
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 37
-rw-r--r--  llvm/lib/Target/AMDGPU/BUFInstructions.td | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/DSDIRInstructions.td | 192
-rw-r--r--  llvm/lib/Target/AMDGPU/DSInstructions.td | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/EXPInstructions.td | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/FLATInstructions.td | 46
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSubtarget.h | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/LDSDIRInstructions.td | 116
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 139
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 19
-rw-r--r--  llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 148
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 287
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrFormats.td | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 210
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td | 19
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td | 174
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 70
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/SMInstructions.td | 44
-rw-r--r--  llvm/lib/Target/AMDGPU/SOPInstructions.td | 52
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 106
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 11
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP2Instructions.td | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 18
-rw-r--r--  llvm/lib/Target/ARC/ARCISelDAGToDAG.cpp | 2
-rw-r--r--  llvm/lib/Target/ARC/ARCISelLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/ARC/ARCTargetMachine.cpp | 7
-rw-r--r--  llvm/lib/Target/ARM/ARMFastISel.cpp | 2
-rw-r--r--  llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 24
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp | 19
-rw-r--r--  llvm/lib/Target/ARM/ARMLegalizerInfo.cpp | 10
-rw-r--r--  llvm/lib/Target/ARM/ARMLegalizerInfo.h | 4
-rw-r--r--  llvm/lib/Target/AVR/AVRISelLowering.cpp | 4
-rw-r--r--  llvm/lib/Target/BPF/BPFISelLowering.cpp | 1
-rw-r--r--  llvm/lib/Target/BPF/BPFTargetMachine.cpp | 5
-rw-r--r--  llvm/lib/Target/BPF/BPFTargetMachine.h | 3
-rw-r--r--  llvm/lib/Target/DirectX/DXILResourceAnalysis.h | 1
-rw-r--r--  llvm/lib/Target/DirectX/DirectXTargetMachine.cpp | 3
-rw-r--r--  llvm/lib/Target/DirectX/DirectXTargetMachine.h | 3
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp | 2
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp | 6
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp | 3
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonTargetMachine.h | 3
-rw-r--r--  llvm/lib/Target/Lanai/LanaiISelLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/Lanai/LanaiTargetMachine.cpp | 7
-rw-r--r--  llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp | 61
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp | 521
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp | 65
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchISelLowering.h | 7
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchInstrInfo.td | 106
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp | 3
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp | 23
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h | 1
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp | 4
-rw-r--r--  llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp | 69
-rw-r--r--  llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h | 3
-rw-r--r--  llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h | 1
-rw-r--r--  llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp | 2
-rw-r--r--  llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h | 5
-rw-r--r--  llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp | 3
-rw-r--r--  llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp | 3
-rw-r--r--  llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h | 1
-rw-r--r--  llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h | 1
-rw-r--r--  llvm/lib/Target/M68k/M68kISelLowering.cpp | 4
-rw-r--r--  llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp | 4
-rw-r--r--  llvm/lib/Target/MSP430/MSP430ISelLowering.cpp | 7
-rw-r--r--  llvm/lib/Target/MSP430/MSP430TargetMachine.cpp | 7
-rw-r--r--  llvm/lib/Target/Mips/MipsFastISel.cpp | 2
-rw-r--r--  llvm/lib/Target/Mips/MipsISelLowering.cpp | 3
-rw-r--r--  llvm/lib/Target/Mips/MipsLegalizerInfo.cpp | 5
-rw-r--r--  llvm/lib/Target/Mips/MipsLegalizerInfo.h | 3
-rw-r--r--  llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp | 2
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 8
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 3
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 3
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXTargetMachine.h | 3
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp | 34
-rw-r--r--  llvm/lib/Target/PowerPC/PPCFastISel.cpp | 2
-rw-r--r--  llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 13
-rw-r--r--  llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 16
-rw-r--r--  llvm/lib/Target/PowerPC/PPCRegisterInfo.td | 9
-rw-r--r--  llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 9
-rw-r--r--  llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp | 7
-rw-r--r--  llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h | 4
-rw-r--r--  llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp | 33
-rw-r--r--  llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 9
-rw-r--r--  llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h | 4
-rw-r--r--  llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h | 1
-rw-r--r--  llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp | 2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp | 3
-rw-r--r--  llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp | 10
-rw-r--r--  llvm/lib/Target/RISCV/RISCVFeatures.td | 6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 10
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 375
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.h | 13
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 5
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 7
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfo.td | 54
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoA.td | 12
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoD.td | 2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td | 2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td | 16
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td | 122
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoZa.td | 44
-rw-r--r--  llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp | 6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVProcessors.td | 3
-rw-r--r--  llvm/lib/Target/RISCV/RISCVRegisterInfo.td | 2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSubtarget.h | 7
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 114
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h | 3
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp | 7
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 125
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp | 5
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h | 3
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp | 59
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp | 14
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVUtils.cpp | 4
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVUtils.h | 2
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp | 8
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 27
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp | 2
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 7
-rw-r--r--  llvm/lib/Target/X86/GISel/X86LegalizerInfo.h | 1
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h | 16
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp | 3
-rw-r--r--  llvm/lib/Target/X86/X86.h | 8
-rw-r--r--  llvm/lib/Target/X86/X86CompressEVEX.cpp (renamed from llvm/lib/Target/X86/X86EvexToVex.cpp) | 168
-rw-r--r--  llvm/lib/Target/X86/X86DomainReassignment.cpp | 67
-rw-r--r--  llvm/lib/Target/X86/X86FastISel.cpp | 12
-rw-r--r--  llvm/lib/Target/X86/X86FlagsCopyLowering.cpp | 1
-rw-r--r--  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 122
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.h | 10
-rw-r--r--  llvm/lib/Target/X86/X86ISelLoweringCall.cpp | 37
-rw-r--r--  llvm/lib/Target/X86/X86InstrAVX512.td | 248
-rw-r--r--  llvm/lib/Target/X86/X86InstrArithmetic.td | 1030
-rw-r--r--  llvm/lib/Target/X86/X86InstrFormats.td | 6
-rw-r--r--  llvm/lib/Target/X86/X86InstrInfo.h | 4
-rw-r--r--  llvm/lib/Target/X86/X86InstrMisc.td | 120
-rw-r--r--  llvm/lib/Target/X86/X86InstrPredicates.td | 33
-rw-r--r--  llvm/lib/Target/X86/X86InstrShiftRotate.td | 2
-rw-r--r--  llvm/lib/Target/X86/X86InstrSystem.td | 13
-rw-r--r--  llvm/lib/Target/X86/X86InstrUtils.td | 10
-rw-r--r--  llvm/lib/Target/X86/X86MCInstLower.cpp | 7
-rw-r--r--  llvm/lib/Target/X86/X86TargetMachine.cpp | 4
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 1
-rw-r--r--  llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp | 2
191 files changed, 5189 insertions, 2257 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index 901769c54b6e..d20ef63a72e8 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -88,6 +88,7 @@ void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&);
void initializeAArch64ExpandPseudoPass(PassRegistry &);
void initializeAArch64GlobalsTaggingPass(PassRegistry &);
void initializeAArch64LoadStoreOptPass(PassRegistry&);
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &);
void initializeAArch64MIPeepholeOptPass(PassRegistry &);
void initializeAArch64O0PreLegalizerCombinerPass(PassRegistry &);
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 68f452039c9b..d5e8ed101d1c 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -1405,7 +1405,7 @@ def ProcessorFeatures {
FeatureSSBS];
list<SubtargetFeature> A78C = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
FeatureNEON, FeatureFullFP16, FeatureDotProd,
- FeatureFlagM, FeatureFP16FML, FeaturePAuth,
+ FeatureFlagM, FeaturePAuth,
FeaturePerfMon, FeatureRCPC, FeatureSPE,
FeatureSSBS];
list<SubtargetFeature> A710 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon,
diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index 9b8162ce8dd4..e98f6c4984a7 100644
--- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -645,7 +645,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
TmpOffset += SL->getElementOffset(Idx);
} else {
- uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+ uint64_t S = GTI.getSequentialElementStride(DL);
while (true) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
// Constant-offset addressing.
@@ -1231,15 +1231,6 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,
// Only extend the RHS within the instruction if there is a valid extend type.
if (ExtendType != AArch64_AM::InvalidShiftExtend && RHS->hasOneUse() &&
isValueAvailable(RHS)) {
- if (const auto *SI = dyn_cast<BinaryOperator>(RHS))
- if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1)))
- if ((SI->getOpcode() == Instruction::Shl) && (C->getZExtValue() < 4)) {
- Register RHSReg = getRegForValue(SI->getOperand(0));
- if (!RHSReg)
- return 0;
- return emitAddSub_rx(UseAdd, RetVT, LHSReg, RHSReg, ExtendType,
- C->getZExtValue(), SetFlags, WantResult);
- }
Register RHSReg = getRegForValue(RHS);
if (!RHSReg)
return 0;
@@ -4987,15 +4978,13 @@ bool AArch64FastISel::selectGetElementPtr(const Instruction *I) {
if (Field)
TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field);
} else {
- Type *Ty = GTI.getIndexedType();
-
// If this is a constant subscript, handle it quickly.
if (const auto *CI = dyn_cast<ConstantInt>(Idx)) {
if (CI->isZero())
continue;
// N = N + Offset
- TotalOffs +=
- DL.getTypeAllocSize(Ty) * cast<ConstantInt>(CI)->getSExtValue();
+ TotalOffs += GTI.getSequentialElementStride(DL) *
+ cast<ConstantInt>(CI)->getSExtValue();
continue;
}
if (TotalOffs) {
@@ -5006,7 +4995,7 @@ bool AArch64FastISel::selectGetElementPtr(const Instruction *I) {
}
// N = N + Idx * ElementSize;
- uint64_t ElementSize = DL.getTypeAllocSize(Ty);
+ uint64_t ElementSize = GTI.getSequentialElementStride(DL);
unsigned IdxN = getRegForGEPIndex(Idx);
if (!IdxN)
return false;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 476d99c2a7e0..edc8cc7d4d1e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -580,7 +580,7 @@ bool AArch64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val,
if (!isa<ConstantSDNode>(N.getNode()))
return false;
- uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
+ uint64_t Immed = N.getNode()->getAsZExtVal();
unsigned ShiftAmt;
if (Immed >> 12 == 0) {
@@ -611,7 +611,7 @@ bool AArch64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val,
return false;
// The immediate operand must be a 24-bit zero-extended immediate.
- uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
+ uint64_t Immed = N.getNode()->getAsZExtVal();
// This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
// have the opposite effect on the C flag, so this pattern mustn't match under
@@ -1326,7 +1326,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
// MOV X0, WideImmediate
// LDR X2, [BaseReg, X0]
if (isa<ConstantSDNode>(RHS)) {
- int64_t ImmOff = (int64_t)cast<ConstantSDNode>(RHS)->getZExtValue();
+ int64_t ImmOff = (int64_t)RHS->getAsZExtVal();
// Skip the immediate can be selected by load/store addressing mode.
// Also skip the immediate can be encoded by a single ADD (SUB is also
// checked by using -ImmOff).
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 102fd0c3dae2..47e665176e8b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3588,8 +3588,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
// cmp w13, w12
// can be turned into:
// cmp w12, w11, lsl #1
- if (!isa<ConstantSDNode>(RHS) ||
- !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
+ if (!isa<ConstantSDNode>(RHS) || !isLegalArithImmed(RHS->getAsZExtVal())) {
SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
@@ -3623,7 +3622,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
LHS.getNode()->hasNUsesOfValue(1, 0)) {
- int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
+ int16_t ValueofRHS = RHS->getAsZExtVal();
if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
SDValue SExt =
DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
@@ -5619,7 +5618,7 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
// SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
// must be calculated before hand.
- uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue();
+ uint64_t ScaleVal = Scale->getAsZExtVal();
if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
EVT IndexVT = Index.getValueType();
@@ -5707,7 +5706,7 @@ SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
// SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
// must be calculated before hand.
- uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue();
+ uint64_t ScaleVal = Scale->getAsZExtVal();
if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
EVT IndexVT = Index.getValueType();
@@ -16516,9 +16515,9 @@ static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
VT.getVectorNumElements() * 2)
return SDValue();
- if ((Ext0.getConstantOperandVal(1) != 0 &&
+ if ((Ext0.getConstantOperandVal(1) != 0 ||
Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
- (Ext1.getConstantOperandVal(1) != 0 &&
+ (Ext1.getConstantOperandVal(1) != 0 ||
Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
return SDValue();
unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
@@ -22011,7 +22010,7 @@ static SDValue performBRCONDCombine(SDNode *N,
SDValue Cmp = N->getOperand(3);
assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
- unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
+ unsigned CC = CCVal->getAsZExtVal();
if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
return SDValue();
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index cb63d8726744..10ad5b1f8f25 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -12586,6 +12586,7 @@ def : TokenAlias<".4S", ".4s">;
def : TokenAlias<".2D", ".2d">;
def : TokenAlias<".1Q", ".1q">;
def : TokenAlias<".2H", ".2h">;
+def : TokenAlias<".2B", ".2b">;
def : TokenAlias<".B", ".b">;
def : TokenAlias<".H", ".h">;
def : TokenAlias<".S", ".s">;
diff --git a/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.cpp b/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.cpp
new file mode 100644
index 000000000000..6fcd9c290e9c
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.cpp
@@ -0,0 +1,828 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass that recognizes certain loop idioms and
+// transforms them into more optimized versions of the same loop. In cases
+// where this happens, it can be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+// while (++i != n) {
+// if (a[i] != b[i])
+// break;
+// }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO List:
+//
+// * Add support for the inverse case where we scan for a matching element.
+// * Permit 64-bit induction variable types.
+// * Recognize loops that increment the IV *after* comparing bytes.
+// * Allow 32-bit sign-extends of the IV used by the GEP.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt<bool>
+ DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(true),
+ cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt<bool> DisableByteCmp(
+ "disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+ cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+static cl::opt<bool> VerifyLoops(
+ "aarch64-lit-verify", cl::Hidden, cl::init(false),
+    cl::desc("Verify loops generated by AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+ Loop *CurLoop = nullptr;
+ DominatorTree *DT;
+ LoopInfo *LI;
+ const TargetTransformInfo *TTI;
+ const DataLayout *DL;
+
+public:
+ explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+ : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+ bool run(Loop *L);
+
+private:
+ /// \name Countable Loop Idiom Handling
+ /// @{
+
+ bool runOnCountableLoop();
+ bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+ bool recognizeByteCompare();
+ Value *expandFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
+ GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+ Instruction *Index, Value *Start, Value *MaxLen);
+ void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+ PHINode *IndPhi, Value *MaxLen, Instruction *Index,
+ Value *Start, bool IncIdx, BasicBlock *FoundBB,
+ BasicBlock *EndBB);
+ /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+ static char ID;
+
+ explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+ initializeAArch64LoopIdiomTransformLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override {
+ return "Transform AArch64-specific loop idioms";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+ LPPassManager &LPM) {
+
+ if (skipLoop(L))
+ return false;
+
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *L->getHeader()->getParent());
+ return AArch64LoopIdiomTransform(
+ DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout())
+ .run(L);
+}
+
+} // end anonymous namespace
+
+char AArch64LoopIdiomTransformLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(
+ AArch64LoopIdiomTransformLegacyPass, "aarch64-lit",
+ "Transform specific loop idioms into optimized vector forms", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(
+ AArch64LoopIdiomTransformLegacyPass, "aarch64-lit",
+ "Transform specific loop idioms into optimized vector forms", false, false)
+
+Pass *llvm::createAArch64LoopIdiomTransformPass() {
+ return new AArch64LoopIdiomTransformLegacyPass();
+}
+
+PreservedAnalyses
+AArch64LoopIdiomTransformPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ if (DisableAll)
+ return PreservedAnalyses::all();
+
+ const auto *DL = &L.getHeader()->getModule()->getDataLayout();
+
+ AArch64LoopIdiomTransform LIT(&AR.DT, &AR.LI, &AR.TTI, DL);
+ if (!LIT.run(&L))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Implementation of AArch64LoopIdiomTransform
+//
+//===----------------------------------------------------------------------===//
+
+bool AArch64LoopIdiomTransform::run(Loop *L) {
+ CurLoop = L;
+
+ if (DisableAll || L->getHeader()->getParent()->hasOptSize())
+ return false;
+
+ // If the loop could not be converted to canonical form, it must have an
+ // indirectbr in it, just give up.
+ if (!L->getLoopPreheader())
+ return false;
+
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
+ << CurLoop->getHeader()->getParent()->getName()
+ << "] Loop %" << CurLoop->getHeader()->getName() << "\n");
+
+ return recognizeByteCompare();
+}
+
+bool AArch64LoopIdiomTransform::recognizeByteCompare() {
+ // Currently the transformation only works on scalable vector types, although
+ // there is no fundamental reason why it cannot be made to work for fixed
+ // width too.
+
+ // We also need to know the minimum page size for the target in order to
+ // generate runtime memory checks to ensure the vector version won't fault.
+ if (!TTI->supportsScalableVectors() || !TTI->getMinPageSize().has_value() ||
+ DisableByteCmp)
+ return false;
+
+ BasicBlock *Header = CurLoop->getHeader();
+
+ // In AArch64LoopIdiomTransform::run we have already checked that the loop
+ // has a preheader so we can assume it's in a canonical form.
+ if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 2)
+ return false;
+
+ PHINode *PN = dyn_cast<PHINode>(&Header->front());
+ if (!PN || PN->getNumIncomingValues() != 2)
+ return false;
+
+ auto LoopBlocks = CurLoop->getBlocks();
+ // The first block in the loop should contain only 4 instructions, e.g.
+ //
+ // while.cond:
+ // %res.phi = phi i32 [ %start, %ph ], [ %inc, %while.body ]
+ // %inc = add i32 %res.phi, 1
+ // %cmp.not = icmp eq i32 %inc, %n
+ // br i1 %cmp.not, label %while.end, label %while.body
+ //
+ auto CondBBInsts = LoopBlocks[0]->instructionsWithoutDebug();
+ if (std::distance(CondBBInsts.begin(), CondBBInsts.end()) > 4)
+ return false;
+
+ // The second block should contain 7 instructions, e.g.
+ //
+ // while.body:
+ // %idx = zext i32 %inc to i64
+ // %idx.a = getelementptr inbounds i8, ptr %a, i64 %idx
+ // %load.a = load i8, ptr %idx.a
+ // %idx.b = getelementptr inbounds i8, ptr %b, i64 %idx
+ // %load.b = load i8, ptr %idx.b
+ // %cmp.not.ld = icmp eq i8 %load.a, %load.b
+ // br i1 %cmp.not.ld, label %while.cond, label %while.end
+ //
+ auto LoopBBInsts = LoopBlocks[1]->instructionsWithoutDebug();
+ if (std::distance(LoopBBInsts.begin(), LoopBBInsts.end()) > 7)
+ return false;
+
+ // The incoming value to the PHI node from the loop should be an add of 1.
+ Value *StartIdx = nullptr;
+ Instruction *Index = nullptr;
+ if (!CurLoop->contains(PN->getIncomingBlock(0))) {
+ StartIdx = PN->getIncomingValue(0);
+ Index = dyn_cast<Instruction>(PN->getIncomingValue(1));
+ } else {
+ StartIdx = PN->getIncomingValue(1);
+ Index = dyn_cast<Instruction>(PN->getIncomingValue(0));
+ }
+
+ // Limit to 32-bit types for now
+ if (!Index || !Index->getType()->isIntegerTy(32) ||
+ !match(Index, m_c_Add(m_Specific(PN), m_One())))
+ return false;
+
+ // If we match the pattern, PN and Index will be replaced with the result of
+ // the cttz.elts intrinsic. If any other instructions are used outside of
+ // the loop, we cannot replace it.
+ for (BasicBlock *BB : LoopBlocks)
+ for (Instruction &I : *BB)
+ if (&I != PN && &I != Index)
+ for (User *U : I.users())
+ if (!CurLoop->contains(cast<Instruction>(U)))
+ return false;
+
+ // Match the branch instruction for the header
+ ICmpInst::Predicate Pred;
+ Value *MaxLen;
+ BasicBlock *EndBB, *WhileBB;
+ if (!match(Header->getTerminator(),
+ m_Br(m_ICmp(Pred, m_Specific(Index), m_Value(MaxLen)),
+ m_BasicBlock(EndBB), m_BasicBlock(WhileBB))) ||
+ Pred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(WhileBB))
+ return false;
+
+ // WhileBB should contain the pattern of load & compare instructions. Match
+ // the pattern and find the GEP instructions used by the loads.
+ ICmpInst::Predicate WhilePred;
+ BasicBlock *FoundBB;
+ BasicBlock *TrueBB;
+ Value *LoadA, *LoadB;
+ if (!match(WhileBB->getTerminator(),
+ m_Br(m_ICmp(WhilePred, m_Value(LoadA), m_Value(LoadB)),
+ m_BasicBlock(TrueBB), m_BasicBlock(FoundBB))) ||
+ WhilePred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(TrueBB))
+ return false;
+
+ Value *A, *B;
+ if (!match(LoadA, m_Load(m_Value(A))) || !match(LoadB, m_Load(m_Value(B))))
+ return false;
+
+ LoadInst *LoadAI = cast<LoadInst>(LoadA);
+ LoadInst *LoadBI = cast<LoadInst>(LoadB);
+ if (!LoadAI->isSimple() || !LoadBI->isSimple())
+ return false;
+
+ GetElementPtrInst *GEPA = dyn_cast<GetElementPtrInst>(A);
+ GetElementPtrInst *GEPB = dyn_cast<GetElementPtrInst>(B);
+
+ if (!GEPA || !GEPB)
+ return false;
+
+ Value *PtrA = GEPA->getPointerOperand();
+ Value *PtrB = GEPB->getPointerOperand();
+
+ // Check we are loading i8 values from two loop invariant pointers
+ if (!CurLoop->isLoopInvariant(PtrA) || !CurLoop->isLoopInvariant(PtrB) ||
+ !GEPA->getResultElementType()->isIntegerTy(8) ||
+ !GEPB->getResultElementType()->isIntegerTy(8) ||
+ !LoadAI->getType()->isIntegerTy(8) ||
+ !LoadBI->getType()->isIntegerTy(8) || PtrA == PtrB)
+ return false;
+
+ // Check that the index to the GEPs is the index we found earlier
+ if (GEPA->getNumIndices() > 1 || GEPB->getNumIndices() > 1)
+ return false;
+
+ Value *IdxA = GEPA->getOperand(GEPA->getNumIndices());
+ Value *IdxB = GEPB->getOperand(GEPB->getNumIndices());
+ if (IdxA != IdxB || !match(IdxA, m_ZExt(m_Specific(Index))))
+ return false;
+
+ // We only ever expect the pre-incremented index value to be used inside the
+ // loop.
+ if (!PN->hasOneUse())
+ return false;
+
+ // Ensure that when the Found and End blocks are identical the PHIs have the
+ // supported format. We don't currently allow cases like this:
+ // while.cond:
+ // ...
+ // br i1 %cmp.not, label %while.end, label %while.body
+ //
+ // while.body:
+ // ...
+ // br i1 %cmp.not2, label %while.cond, label %while.end
+ //
+ // while.end:
+ // %final_ptr = phi ptr [ %c, %while.body ], [ %d, %while.cond ]
+ //
+ // Where the incoming values for %final_ptr are unique and from each of the
+ // loop blocks, but not actually defined in the loop. This requires extra
+ // work setting up the byte.compare block, i.e. by introducing a select to
+ // choose the correct value.
+ // TODO: We could add support for this in future.
+ if (FoundBB == EndBB) {
+ for (PHINode &EndPN : EndBB->phis()) {
+ Value *WhileCondVal = EndPN.getIncomingValueForBlock(Header);
+ Value *WhileBodyVal = EndPN.getIncomingValueForBlock(WhileBB);
+
+ // The value of the index when leaving the while.cond block is always the
+ // same as the end value (MaxLen) so we permit either. Otherwise for any
+ // other value defined outside the loop we only allow values that are the
+ // same as the exit value for while.body.
+ if (WhileCondVal != Index && WhileCondVal != MaxLen &&
+ WhileCondVal != WhileBodyVal)
+ return false;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "FOUND IDIOM IN LOOP: \n"
+ << *(EndBB->getParent()) << "\n\n");
+
+ // The index is incremented before the GEP/Load pair so we need to
+ // add 1 to the start value.
+ transformByteCompare(GEPA, GEPB, PN, MaxLen, Index, StartIdx, /*IncIdx=*/true,
+ FoundBB, EndBB);
+ return true;
+}
+
+Value *AArch64LoopIdiomTransform::expandFindMismatch(
+ IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
+ GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen) {
+ Value *PtrA = GEPA->getPointerOperand();
+ Value *PtrB = GEPB->getPointerOperand();
+
+ // Get the arguments and types for the intrinsic.
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
+ BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
+ LLVMContext &Ctx = PHBranch->getContext();
+ Type *LoadType = Type::getInt8Ty(Ctx);
+ Type *ResType = Builder.getInt32Ty();
+
+ // Split block in the original loop preheader.
+ BasicBlock *EndBlock =
+ SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "mismatch_end");
+
+ // Create the blocks that we're going to need:
+ // 1. A block for checking the zero-extended length exceeds 0
+ // 2. A block to check that the start and end addresses of a given array
+ // lie on the same page.
+ // 3. The SVE loop preheader.
+ // 4. The first SVE loop block.
+ // 5. The SVE loop increment block.
+ // 6. A block we can jump to from the SVE loop when a mismatch is found.
+  //  7. The first block of the scalar loop itself, containing PHIs, loads
+ // and cmp.
+ // 8. A scalar loop increment block to increment the PHIs and go back
+ // around the loop.
+
+ BasicBlock *MinItCheckBlock = BasicBlock::Create(
+ Ctx, "mismatch_min_it_check", EndBlock->getParent(), EndBlock);
+
+ // Update the terminator added by SplitBlock to branch to the first block
+ Preheader->getTerminator()->setSuccessor(0, MinItCheckBlock);
+
+ BasicBlock *MemCheckBlock = BasicBlock::Create(
+ Ctx, "mismatch_mem_check", EndBlock->getParent(), EndBlock);
+
+ BasicBlock *SVELoopPreheaderBlock = BasicBlock::Create(
+ Ctx, "mismatch_sve_loop_preheader", EndBlock->getParent(), EndBlock);
+
+ BasicBlock *SVELoopStartBlock = BasicBlock::Create(
+ Ctx, "mismatch_sve_loop", EndBlock->getParent(), EndBlock);
+
+ BasicBlock *SVELoopIncBlock = BasicBlock::Create(
+ Ctx, "mismatch_sve_loop_inc", EndBlock->getParent(), EndBlock);
+
+ BasicBlock *SVELoopMismatchBlock = BasicBlock::Create(
+ Ctx, "mismatch_sve_loop_found", EndBlock->getParent(), EndBlock);
+
+ BasicBlock *LoopPreHeaderBlock = BasicBlock::Create(
+ Ctx, "mismatch_loop_pre", EndBlock->getParent(), EndBlock);
+
+ BasicBlock *LoopStartBlock =
+ BasicBlock::Create(Ctx, "mismatch_loop", EndBlock->getParent(), EndBlock);
+
+ BasicBlock *LoopIncBlock = BasicBlock::Create(
+ Ctx, "mismatch_loop_inc", EndBlock->getParent(), EndBlock);
+
+ DTU.applyUpdates({{DominatorTree::Insert, Preheader, MinItCheckBlock},
+ {DominatorTree::Delete, Preheader, EndBlock}});
+
+ // Update LoopInfo with the new SVE & scalar loops.
+ auto SVELoop = LI->AllocateLoop();
+ auto ScalarLoop = LI->AllocateLoop();
+
+ if (CurLoop->getParentLoop()) {
+ CurLoop->getParentLoop()->addBasicBlockToLoop(MinItCheckBlock, *LI);
+ CurLoop->getParentLoop()->addBasicBlockToLoop(MemCheckBlock, *LI);
+ CurLoop->getParentLoop()->addBasicBlockToLoop(SVELoopPreheaderBlock, *LI);
+ CurLoop->getParentLoop()->addChildLoop(SVELoop);
+ CurLoop->getParentLoop()->addBasicBlockToLoop(SVELoopMismatchBlock, *LI);
+ CurLoop->getParentLoop()->addBasicBlockToLoop(LoopPreHeaderBlock, *LI);
+ CurLoop->getParentLoop()->addChildLoop(ScalarLoop);
+ } else {
+ LI->addTopLevelLoop(SVELoop);
+ LI->addTopLevelLoop(ScalarLoop);
+ }
+
+ // Add the new basic blocks to their associated loops.
+ SVELoop->addBasicBlockToLoop(SVELoopStartBlock, *LI);
+ SVELoop->addBasicBlockToLoop(SVELoopIncBlock, *LI);
+
+ ScalarLoop->addBasicBlockToLoop(LoopStartBlock, *LI);
+ ScalarLoop->addBasicBlockToLoop(LoopIncBlock, *LI);
+
+ // Set up some types and constants that we intend to reuse.
+ Type *I64Type = Builder.getInt64Ty();
+
+ // Check the zero-extended iteration count > 0
+ Builder.SetInsertPoint(MinItCheckBlock);
+ Value *ExtStart = Builder.CreateZExt(Start, I64Type);
+ Value *ExtEnd = Builder.CreateZExt(MaxLen, I64Type);
+ // This check doesn't really cost us very much.
+
+ Value *LimitCheck = Builder.CreateICmpULE(Start, MaxLen);
+ BranchInst *MinItCheckBr =
+ BranchInst::Create(MemCheckBlock, LoopPreHeaderBlock, LimitCheck);
+ MinItCheckBr->setMetadata(
+ LLVMContext::MD_prof,
+ MDBuilder(MinItCheckBr->getContext()).createBranchWeights(99, 1));
+ Builder.Insert(MinItCheckBr);
+
+ DTU.applyUpdates(
+ {{DominatorTree::Insert, MinItCheckBlock, MemCheckBlock},
+ {DominatorTree::Insert, MinItCheckBlock, LoopPreHeaderBlock}});
+
+ // For each of the arrays, check the start/end addresses are on the same
+ // page.
+ Builder.SetInsertPoint(MemCheckBlock);
+
+ // The early exit in the original loop means that when performing vector
+ // loads we are potentially reading ahead of the early exit. So we could
+ // fault if crossing a page boundary. Therefore, we create runtime memory
+ // checks based on the minimum page size as follows:
+ // 1. Calculate the addresses of the first memory accesses in the loop,
+ // i.e. LhsStart and RhsStart.
+ // 2. Get the last accessed addresses in the loop, i.e. LhsEnd and RhsEnd.
+ // 3. Determine which pages correspond to all the memory accesses, i.e
+ // LhsStartPage, LhsEndPage, RhsStartPage, RhsEndPage.
+ // 4. If LhsStartPage == LhsEndPage and RhsStartPage == RhsEndPage, then
+ // we know we won't cross any page boundaries in the loop so we can
+ // enter the vector loop! Otherwise we fall back on the scalar loop.
+ Value *LhsStartGEP = Builder.CreateGEP(LoadType, PtrA, ExtStart);
+ Value *RhsStartGEP = Builder.CreateGEP(LoadType, PtrB, ExtStart);
+ Value *RhsStart = Builder.CreatePtrToInt(RhsStartGEP, I64Type);
+ Value *LhsStart = Builder.CreatePtrToInt(LhsStartGEP, I64Type);
+ Value *LhsEndGEP = Builder.CreateGEP(LoadType, PtrA, ExtEnd);
+ Value *RhsEndGEP = Builder.CreateGEP(LoadType, PtrB, ExtEnd);
+ Value *LhsEnd = Builder.CreatePtrToInt(LhsEndGEP, I64Type);
+ Value *RhsEnd = Builder.CreatePtrToInt(RhsEndGEP, I64Type);
+
+ const uint64_t MinPageSize = TTI->getMinPageSize().value();
+ const uint64_t AddrShiftAmt = llvm::Log2_64(MinPageSize);
+ Value *LhsStartPage = Builder.CreateLShr(LhsStart, AddrShiftAmt);
+ Value *LhsEndPage = Builder.CreateLShr(LhsEnd, AddrShiftAmt);
+ Value *RhsStartPage = Builder.CreateLShr(RhsStart, AddrShiftAmt);
+ Value *RhsEndPage = Builder.CreateLShr(RhsEnd, AddrShiftAmt);
+ Value *LhsPageCmp = Builder.CreateICmpNE(LhsStartPage, LhsEndPage);
+ Value *RhsPageCmp = Builder.CreateICmpNE(RhsStartPage, RhsEndPage);
+
+ Value *CombinedPageCmp = Builder.CreateOr(LhsPageCmp, RhsPageCmp);
+ BranchInst *CombinedPageCmpCmpBr = BranchInst::Create(
+ LoopPreHeaderBlock, SVELoopPreheaderBlock, CombinedPageCmp);
+ CombinedPageCmpCmpBr->setMetadata(
+ LLVMContext::MD_prof, MDBuilder(CombinedPageCmpCmpBr->getContext())
+ .createBranchWeights(10, 90));
+ Builder.Insert(CombinedPageCmpCmpBr);
+
+ DTU.applyUpdates(
+ {{DominatorTree::Insert, MemCheckBlock, LoopPreHeaderBlock},
+ {DominatorTree::Insert, MemCheckBlock, SVELoopPreheaderBlock}});
+
+ // Set up the SVE loop preheader, i.e. calculate initial loop predicate,
+ // zero-extend MaxLen to 64-bits, determine the number of vector elements
+ // processed in each iteration, etc.
+ Builder.SetInsertPoint(SVELoopPreheaderBlock);
+
+ // At this point we know two things must be true:
+ // 1. Start <= End
+ // 2. ExtMaxLen <= MinPageSize due to the page checks.
+ // Therefore, we know that we can use a 64-bit induction variable that
+ // starts from 0 -> ExtMaxLen and it will not overflow.
+ ScalableVectorType *PredVTy =
+ ScalableVectorType::get(Builder.getInt1Ty(), 16);
+
+ Value *InitialPred = Builder.CreateIntrinsic(
+ Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd});
+
+ Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {});
+ VecLen = Builder.CreateMul(VecLen, ConstantInt::get(I64Type, 16), "",
+ /*HasNUW=*/true, /*HasNSW=*/true);
+
+ Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(),
+ Builder.getInt1(false));
+
+ BranchInst *JumpToSVELoop = BranchInst::Create(SVELoopStartBlock);
+ Builder.Insert(JumpToSVELoop);
+
+ DTU.applyUpdates(
+ {{DominatorTree::Insert, SVELoopPreheaderBlock, SVELoopStartBlock}});
+
+ // Set up the first SVE loop block by creating the PHIs, doing the vector
+ // loads and comparing the vectors.
+ Builder.SetInsertPoint(SVELoopStartBlock);
+ PHINode *LoopPred = Builder.CreatePHI(PredVTy, 2, "mismatch_sve_loop_pred");
+ LoopPred->addIncoming(InitialPred, SVELoopPreheaderBlock);
+ PHINode *SVEIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_sve_index");
+ SVEIndexPhi->addIncoming(ExtStart, SVELoopPreheaderBlock);
+ Type *SVELoadType = ScalableVectorType::get(Builder.getInt8Ty(), 16);
+ Value *Passthru = ConstantInt::getNullValue(SVELoadType);
+
+ Value *SVELhsGep = Builder.CreateGEP(LoadType, PtrA, SVEIndexPhi);
+ if (GEPA->isInBounds())
+ cast<GetElementPtrInst>(SVELhsGep)->setIsInBounds(true);
+ Value *SVELhsLoad = Builder.CreateMaskedLoad(SVELoadType, SVELhsGep, Align(1),
+ LoopPred, Passthru);
+
+ Value *SVERhsGep = Builder.CreateGEP(LoadType, PtrB, SVEIndexPhi);
+ if (GEPB->isInBounds())
+ cast<GetElementPtrInst>(SVERhsGep)->setIsInBounds(true);
+ Value *SVERhsLoad = Builder.CreateMaskedLoad(SVELoadType, SVERhsGep, Align(1),
+ LoopPred, Passthru);
+
+ Value *SVEMatchCmp = Builder.CreateICmpNE(SVELhsLoad, SVERhsLoad);
+ SVEMatchCmp = Builder.CreateSelect(LoopPred, SVEMatchCmp, PFalse);
+ Value *SVEMatchHasActiveLanes = Builder.CreateOrReduce(SVEMatchCmp);
+ BranchInst *SVEEarlyExit = BranchInst::Create(
+ SVELoopMismatchBlock, SVELoopIncBlock, SVEMatchHasActiveLanes);
+ Builder.Insert(SVEEarlyExit);
+
+ DTU.applyUpdates(
+ {{DominatorTree::Insert, SVELoopStartBlock, SVELoopMismatchBlock},
+ {DominatorTree::Insert, SVELoopStartBlock, SVELoopIncBlock}});
+
+ // Increment the index counter and calculate the predicate for the next
+ // iteration of the loop. We branch back to the start of the loop if there
+ // is at least one active lane.
+ Builder.SetInsertPoint(SVELoopIncBlock);
+ Value *NewSVEIndexPhi = Builder.CreateAdd(SVEIndexPhi, VecLen, "",
+ /*HasNUW=*/true, /*HasNSW=*/true);
+ SVEIndexPhi->addIncoming(NewSVEIndexPhi, SVELoopIncBlock);
+ Value *NewPred =
+ Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
+ {PredVTy, I64Type}, {NewSVEIndexPhi, ExtEnd});
+ LoopPred->addIncoming(NewPred, SVELoopIncBlock);
+
+ Value *PredHasActiveLanes =
+ Builder.CreateExtractElement(NewPred, uint64_t(0));
+ BranchInst *SVELoopBranchBack =
+ BranchInst::Create(SVELoopStartBlock, EndBlock, PredHasActiveLanes);
+ Builder.Insert(SVELoopBranchBack);
+
+ DTU.applyUpdates({{DominatorTree::Insert, SVELoopIncBlock, SVELoopStartBlock},
+ {DominatorTree::Insert, SVELoopIncBlock, EndBlock}});
+
+ // If we found a mismatch then we need to calculate which lane in the vector
+ // had a mismatch and add that on to the current loop index.
+ Builder.SetInsertPoint(SVELoopMismatchBlock);
+ PHINode *FoundPred = Builder.CreatePHI(PredVTy, 1, "mismatch_sve_found_pred");
+ FoundPred->addIncoming(SVEMatchCmp, SVELoopStartBlock);
+ PHINode *LastLoopPred =
+ Builder.CreatePHI(PredVTy, 1, "mismatch_sve_last_loop_pred");
+ LastLoopPred->addIncoming(LoopPred, SVELoopStartBlock);
+ PHINode *SVEFoundIndex =
+ Builder.CreatePHI(I64Type, 1, "mismatch_sve_found_index");
+ SVEFoundIndex->addIncoming(SVEIndexPhi, SVELoopStartBlock);
+
+ Value *PredMatchCmp = Builder.CreateAnd(LastLoopPred, FoundPred);
+ Value *Ctz = Builder.CreateIntrinsic(
+ Intrinsic::experimental_cttz_elts, {ResType, PredMatchCmp->getType()},
+ {PredMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true)});
+ Ctz = Builder.CreateZExt(Ctz, I64Type);
+ Value *SVELoopRes64 = Builder.CreateAdd(SVEFoundIndex, Ctz, "",
+ /*HasNUW=*/true, /*HasNSW=*/true);
+ Value *SVELoopRes = Builder.CreateTrunc(SVELoopRes64, ResType);
+
+ Builder.Insert(BranchInst::Create(EndBlock));
+
+ DTU.applyUpdates({{DominatorTree::Insert, SVELoopMismatchBlock, EndBlock}});
+
+ // Generate code for scalar loop.
+ Builder.SetInsertPoint(LoopPreHeaderBlock);
+ Builder.Insert(BranchInst::Create(LoopStartBlock));
+
+ DTU.applyUpdates(
+ {{DominatorTree::Insert, LoopPreHeaderBlock, LoopStartBlock}});
+
+ Builder.SetInsertPoint(LoopStartBlock);
+ PHINode *IndexPhi = Builder.CreatePHI(ResType, 2, "mismatch_index");
+ IndexPhi->addIncoming(Start, LoopPreHeaderBlock);
+
+ // Otherwise compare the values
+ // Load bytes from each array and compare them.
+ Value *GepOffset = Builder.CreateZExt(IndexPhi, I64Type);
+
+ Value *LhsGep = Builder.CreateGEP(LoadType, PtrA, GepOffset);
+ if (GEPA->isInBounds())
+ cast<GetElementPtrInst>(LhsGep)->setIsInBounds(true);
+ Value *LhsLoad = Builder.CreateLoad(LoadType, LhsGep);
+
+ Value *RhsGep = Builder.CreateGEP(LoadType, PtrB, GepOffset);
+ if (GEPB->isInBounds())
+ cast<GetElementPtrInst>(RhsGep)->setIsInBounds(true);
+ Value *RhsLoad = Builder.CreateLoad(LoadType, RhsGep);
+
+ Value *MatchCmp = Builder.CreateICmpEQ(LhsLoad, RhsLoad);
+ // If we have a mismatch then exit the loop ...
+ BranchInst *MatchCmpBr = BranchInst::Create(LoopIncBlock, EndBlock, MatchCmp);
+ Builder.Insert(MatchCmpBr);
+
+ DTU.applyUpdates({{DominatorTree::Insert, LoopStartBlock, LoopIncBlock},
+ {DominatorTree::Insert, LoopStartBlock, EndBlock}});
+
+ // Have we reached the maximum permitted length for the loop?
+ Builder.SetInsertPoint(LoopIncBlock);
+ Value *PhiInc = Builder.CreateAdd(IndexPhi, ConstantInt::get(ResType, 1), "",
+ /*HasNUW=*/Index->hasNoUnsignedWrap(),
+ /*HasNSW=*/Index->hasNoSignedWrap());
+ IndexPhi->addIncoming(PhiInc, LoopIncBlock);
+ Value *IVCmp = Builder.CreateICmpEQ(PhiInc, MaxLen);
+ BranchInst *IVCmpBr = BranchInst::Create(EndBlock, LoopStartBlock, IVCmp);
+ Builder.Insert(IVCmpBr);
+
+ DTU.applyUpdates({{DominatorTree::Insert, LoopIncBlock, EndBlock},
+ {DominatorTree::Insert, LoopIncBlock, LoopStartBlock}});
+
+  // In the end block we need to insert a PHI node to deal with four cases:
+  //  1. We didn't find a mismatch in the scalar loop, so we return MaxLen.
+  //  2. We exited the scalar loop early due to a mismatch and need to return
+  //  the index that we found.
+  //  3. We didn't find a mismatch in the SVE loop, so we return MaxLen.
+  //  4. We exited the SVE loop early due to a mismatch and need to return
+  //  the index that we found.
+ Builder.SetInsertPoint(EndBlock, EndBlock->getFirstInsertionPt());
+ PHINode *ResPhi = Builder.CreatePHI(ResType, 4, "mismatch_result");
+ ResPhi->addIncoming(MaxLen, LoopIncBlock);
+ ResPhi->addIncoming(IndexPhi, LoopStartBlock);
+ ResPhi->addIncoming(MaxLen, SVELoopIncBlock);
+ ResPhi->addIncoming(SVELoopRes, SVELoopMismatchBlock);
+
+ Value *FinalRes = Builder.CreateTrunc(ResPhi, ResType);
+
+ if (VerifyLoops) {
+ ScalarLoop->verifyLoop();
+ SVELoop->verifyLoop();
+ if (!SVELoop->isRecursivelyLCSSAForm(*DT, *LI))
+ report_fatal_error("Loops must remain in LCSSA form!");
+ if (!ScalarLoop->isRecursivelyLCSSAForm(*DT, *LI))
+ report_fatal_error("Loops must remain in LCSSA form!");
+ }
+
+ return FinalRes;
+}
+
+void AArch64LoopIdiomTransform::transformByteCompare(
+ GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, PHINode *IndPhi,
+ Value *MaxLen, Instruction *Index, Value *Start, bool IncIdx,
+ BasicBlock *FoundBB, BasicBlock *EndBB) {
+
+ // Insert the byte compare code at the end of the preheader block
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
+ BasicBlock *Header = CurLoop->getHeader();
+ BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
+ IRBuilder<> Builder(PHBranch);
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+ Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
+
+ // Increment the pointer if this was done before the loads in the loop.
+ if (IncIdx)
+ Start = Builder.CreateAdd(Start, ConstantInt::get(Start->getType(), 1));
+
+ Value *ByteCmpRes =
+ expandFindMismatch(Builder, DTU, GEPA, GEPB, Index, Start, MaxLen);
+
+  // Replace uses of the index & induction Phi with the intrinsic (we already
+  // checked that the first instruction of Header is the Phi above).
+ assert(IndPhi->hasOneUse() && "Index phi node has more than one use!");
+ Index->replaceAllUsesWith(ByteCmpRes);
+
+ assert(PHBranch->isUnconditional() &&
+ "Expected preheader to terminate with an unconditional branch.");
+
+ // If no mismatch was found, we can jump to the end block. Create a
+ // new basic block for the compare instruction.
+ auto *CmpBB = BasicBlock::Create(Preheader->getContext(), "byte.compare",
+ Preheader->getParent());
+ CmpBB->moveBefore(EndBB);
+
+ // Replace the branch in the preheader with an always-true conditional branch.
+ // This ensures there is still a reference to the original loop.
+ Builder.CreateCondBr(Builder.getTrue(), CmpBB, Header);
+ PHBranch->eraseFromParent();
+
+ BasicBlock *MismatchEnd = cast<Instruction>(ByteCmpRes)->getParent();
+ DTU.applyUpdates({{DominatorTree::Insert, MismatchEnd, CmpBB}});
+
+ // Create the branch to either the end or found block depending on the value
+ // returned by the intrinsic.
+ Builder.SetInsertPoint(CmpBB);
+ if (FoundBB != EndBB) {
+ Value *FoundCmp = Builder.CreateICmpEQ(ByteCmpRes, MaxLen);
+ Builder.CreateCondBr(FoundCmp, EndBB, FoundBB);
+ DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB},
+ {DominatorTree::Insert, CmpBB, EndBB}});
+
+ } else {
+ Builder.CreateBr(FoundBB);
+ DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}});
+ }
+
+ auto fixSuccessorPhis = [&](BasicBlock *SuccBB) {
+ for (PHINode &PN : SuccBB->phis()) {
+ // At this point we've already replaced all uses of the result from the
+ // loop with ByteCmp. Look through the incoming values to find ByteCmp,
+ // meaning this is a Phi collecting the results of the byte compare.
+ bool ResPhi = false;
+ for (Value *Op : PN.incoming_values())
+ if (Op == ByteCmpRes) {
+ ResPhi = true;
+ break;
+ }
+
+ // Any PHI that depended upon the result of the byte compare needs a new
+ // incoming value from CmpBB. This is because the original loop will get
+ // deleted.
+ if (ResPhi)
+ PN.addIncoming(ByteCmpRes, CmpBB);
+ else {
+ // There should be no other outside uses of other values in the
+ // original loop. Any incoming values should either:
+ // 1. Be for blocks outside the loop, which aren't interesting. Or ..
+ // 2. These are from blocks in the loop with values defined outside
+      //    the loop. We should add a similar incoming value from CmpBB.
+ for (BasicBlock *BB : PN.blocks())
+ if (CurLoop->contains(BB)) {
+ PN.addIncoming(PN.getIncomingValueForBlock(BB), CmpBB);
+ break;
+ }
+ }
+ }
+ };
+
+ // Ensure all Phis in the successors of CmpBB have an incoming value from it.
+ fixSuccessorPhis(EndBB);
+ if (EndBB != FoundBB)
+ fixSuccessorPhis(FoundBB);
+
+ // The new CmpBB block isn't part of the loop, but will need to be added to
+ // the outer loop if there is one.
+ if (!CurLoop->isOutermost())
+ CurLoop->getParentLoop()->addBasicBlockToLoop(CmpBB, *LI);
+
+ if (VerifyLoops && CurLoop->getParentLoop()) {
+ CurLoop->getParentLoop()->verifyLoop();
+ if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI))
+ report_fatal_error("Loops must remain in LCSSA form!");
+ }
+}
diff --git a/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.h b/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.h
new file mode 100644
index 000000000000..cc68425bb68b
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.h
@@ -0,0 +1,25 @@
+//===- AArch64LoopIdiomTransform.h --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64LOOPIDIOMTRANSFORM_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64LOOPIDIOMTRANSFORM_H
+
+#include "llvm/IR/PassManager.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+
+namespace llvm {
+
+struct AArch64LoopIdiomTransformPass
+ : PassInfoMixin<AArch64LoopIdiomTransformPass> {
+ PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &U);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AARCH64_AARCH64LOOPIDIOMTRANSFORM_H
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 738a52eebad2..380f6e1fcfda 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -810,7 +810,7 @@ defm FMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmopa", 0b0, 0b0, 0b11, ZPR16>;
defm FMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmops", 0b0, 0b1, 0b11, ZPR16>;
}
-let Predicates = [HasSME2p1, HasB16B16] in {
+let Predicates = [HasSME2, HasB16B16] in {
defm BFADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfadd", 0b1100, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>;
defm BFADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfadd", 0b1100, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>;
defm BFSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfsub", 0b1101, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 344a15389063..ee10a7d1c706 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -453,6 +453,9 @@ def AArch64msb_m1 : PatFrags<(ops node:$pred, node:$op1, node:$op2, node:$op3),
def AArch64eor3 : PatFrags<(ops node:$op1, node:$op2, node:$op3),
[(int_aarch64_sve_eor3 node:$op1, node:$op2, node:$op3),
(xor node:$op1, (xor node:$op2, node:$op3))]>;
+def AArch64bcax : PatFrags<(ops node:$op1, node:$op2, node:$op3),
+ [(int_aarch64_sve_bcax node:$op1, node:$op2, node:$op3),
+ (xor node:$op1, (and node:$op2, (vnot node:$op3)))]>;
def AArch64fmla_m1 : PatFrags<(ops node:$pg, node:$za, node:$zn, node:$zm),
[(int_aarch64_sve_fmla node:$pg, node:$za, node:$zn, node:$zm),
@@ -3714,7 +3717,7 @@ let Predicates = [HasSVE2orSME] in {
// SVE2 bitwise ternary operations
defm EOR3_ZZZZ : sve2_int_bitwise_ternary_op<0b000, "eor3", AArch64eor3>;
- defm BCAX_ZZZZ : sve2_int_bitwise_ternary_op<0b010, "bcax", int_aarch64_sve_bcax>;
+ defm BCAX_ZZZZ : sve2_int_bitwise_ternary_op<0b010, "bcax", AArch64bcax>;
defm BSL_ZZZZ : sve2_int_bitwise_ternary_op<0b001, "bsl", int_aarch64_sve_bsl, AArch64bsp>;
defm BSL1N_ZZZZ : sve2_int_bitwise_ternary_op<0b011, "bsl1n", int_aarch64_sve_bsl1n>;
defm BSL2N_ZZZZ : sve2_int_bitwise_ternary_op<0b101, "bsl2n", int_aarch64_sve_bsl2n>;
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 1a76f354589e..9e43f206efcf 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -172,7 +172,7 @@ static SDValue EmitUnrolledSetTag(SelectionDAG &DAG, const SDLoc &dl,
SDValue AArch64SelectionDAGInfo::EmitTargetCodeForSetTag(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Addr,
SDValue Size, MachinePointerInfo DstPtrInfo, bool ZeroData) const {
- uint64_t ObjSize = cast<ConstantSDNode>(Size)->getZExtValue();
+ uint64_t ObjSize = Size->getAsZExtVal();
assert(ObjSize % 16 == 0);
MachineFunction &MF = DAG.getMachineFunction();
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 036719be06d8..144610e021c5 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -11,6 +11,7 @@
#include "AArch64TargetMachine.h"
#include "AArch64.h"
+#include "AArch64LoopIdiomTransform.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64MachineScheduler.h"
#include "AArch64MacroFusion.h"
@@ -43,6 +44,7 @@
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Pass.h"
+#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
@@ -222,6 +224,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
initializeAArch64DeadRegisterDefinitionsPass(*PR);
initializeAArch64ExpandPseudoPass(*PR);
initializeAArch64LoadStoreOptPass(*PR);
+ initializeAArch64LoopIdiomTransformLegacyPassPass(*PR);
initializeAArch64MIPeepholeOptPass(*PR);
initializeAArch64SIMDInstrOptPass(*PR);
initializeAArch64O0PreLegalizerCombinerPass(*PR);
@@ -537,6 +540,14 @@ public:
} // end anonymous namespace
+void AArch64TargetMachine::registerPassBuilderCallbacks(
+ PassBuilder &PB, bool PopulateClassToPassNames) {
+ PB.registerLateLoopOptimizationsEPCallback(
+ [=](LoopPassManager &LPM, OptimizationLevel Level) {
+ LPM.addPass(AArch64LoopIdiomTransformPass());
+ });
+}
+
TargetTransformInfo
AArch64TargetMachine::getTargetTransformInfo(const Function &F) const {
return TargetTransformInfo(AArch64TTIImpl(this, F));
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/llvm/lib/Target/AArch64/AArch64TargetMachine.h
index 12b971853f84..8fb68b06f137 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.h
@@ -14,6 +14,7 @@
#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETMACHINE_H
#include "AArch64InstrInfo.h"
+#include "AArch64LoopIdiomTransform.h"
#include "AArch64Subtarget.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
@@ -43,6 +44,9 @@ public:
// Pass Pipeline Configuration
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+ void registerPassBuilderCallbacks(PassBuilder &PB,
+ bool PopulateClassToPassNames) override;
+
TargetTransformInfo getTargetTransformInfo(const Function &F) const override;
TargetLoweringObjectFile* getObjFileLowering() const override {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 0b220069a388..f471294ffc25 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -420,6 +420,8 @@ public:
return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
}
+
+ std::optional<unsigned> getMinPageSize() const { return 4096; }
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 1d0e8be80d07..b657a0954d78 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -282,6 +282,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
// Regardless of FP16 support, widen 16-bit elements to 32-bits.
.minScalar(0, s32)
.libcallFor({s32, s64});
+ getActionDefinitionsBuilder(G_FPOWI)
+ .scalarize(0)
+ .minScalar(0, s32)
+ .libcallFor({{s32, s32}, {s64, s32}});
getActionDefinitionsBuilder(G_INSERT)
.legalIf(all(typeInSet(0, {s32, s64, p0}),
@@ -362,7 +366,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
{v4s32, p0, s128, 8},
{v2s64, p0, s128, 8}})
// These extends are also legal
- .legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 8}})
+ .legalForTypesWithMemDesc(
+ {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}})
.widenScalarToNextPow2(0, /* MinSize = */ 8)
.lowerIfMemSizeNotByteSizePow2()
.clampScalar(0, s8, s64)
@@ -761,17 +766,35 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.lowerIf(
all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0)));
+ LegalityPredicate UseOutlineAtomics = [&ST](const LegalityQuery &Query) {
+ return ST.outlineAtomics() && !ST.hasLSE();
+ };
+
getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
- .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)))
- .customIf([](const LegalityQuery &Query) {
- return Query.Types[0].getSizeInBits() == 128;
+ .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0),
+ predNot(UseOutlineAtomics)))
+ .customIf(all(typeIs(0, s128), predNot(UseOutlineAtomics)))
+ .customIf([UseOutlineAtomics](const LegalityQuery &Query) {
+ return Query.Types[0].getSizeInBits() == 128 &&
+ !UseOutlineAtomics(Query);
})
+ .libcallIf(all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, p0),
+ UseOutlineAtomics))
+ .clampScalar(0, s32, s64);
+
+ getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD,
+ G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR,
+ G_ATOMICRMW_XOR})
+ .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0),
+ predNot(UseOutlineAtomics)))
+ .libcallIf(all(typeInSet(0, {s8, s16, s32, s64}), typeIs(1, p0),
+ UseOutlineAtomics))
.clampScalar(0, s32, s64);
+  // Do not outline these atomic operations, as per the comment in
+ // AArch64ISelLowering.cpp's shouldExpandAtomicRMWInIR().
getActionDefinitionsBuilder(
- {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND,
- G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX,
- G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
+ {G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
.legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)))
.clampScalar(0, s32, s64);
@@ -989,6 +1012,23 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.clampMaxNumElements(1, s16, 8)
.lower();
+ // For fmul reductions we need to split up into individual operations. We
+  // clamp to 128-bit vectors then to 64-bit vectors to produce a cascade of
+ // smaller types, followed by scalarizing what remains.
+ getActionDefinitionsBuilder(G_VECREDUCE_FMUL)
+ .minScalarOrElt(0, MinFPScalar)
+ .clampMaxNumElements(1, s64, 2)
+ .clampMaxNumElements(1, s32, 4)
+ .clampMaxNumElements(1, s16, 8)
+ .clampMaxNumElements(1, s32, 2)
+ .clampMaxNumElements(1, s16, 4)
+ .scalarize(1)
+ .lower();
+
+ getActionDefinitionsBuilder({G_VECREDUCE_SEQ_FADD, G_VECREDUCE_SEQ_FMUL})
+ .scalarize(2)
+ .lower();
+
getActionDefinitionsBuilder(G_VECREDUCE_ADD)
.legalFor({{s8, v16s8},
{s8, v8s8},
@@ -1137,8 +1177,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
verify(*ST.getInstrInfo());
}
-bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
- MachineInstr &MI) const {
+bool AArch64LegalizerInfo::legalizeCustom(
+ LegalizerHelper &Helper, MachineInstr &MI,
+ LostDebugLocObserver &LocObserver) const {
MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
GISelChangeObserver &Observer = Helper.Observer;
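
With the UseOutlineAtomics predicate above, GlobalISel now legalizes G_ATOMIC_CMPXCHG and the listed G_ATOMICRMW_* operations as libcalls when the subtarget enables outline atomics but lacks LSE, in line with the SelectionDAG handling referenced by the comment about shouldExpandAtomicRMWInIR(). A hedged usage sketch (the __aarch64_cas* names are the standard outline-atomics runtime helpers, not something introduced by this patch):

#include <atomic>

// Built for AArch64 with outline atomics enabled and no LSE, this
// compare-exchange is expected to lower to a call into the __aarch64_cas*
// helpers rather than an inline LL/SC loop, now under GlobalISel as well.
bool try_update(std::atomic<int> &v, int expected, int desired) {
  return v.compare_exchange_strong(expected, desired);
}
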
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
index 19f77baa77f8..c62a9d847c52 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
@@ -23,12 +23,12 @@ namespace llvm {
class AArch64Subtarget;
-/// This class provides the information for the target register banks.
class AArch64LegalizerInfo : public LegalizerInfo {
public:
AArch64LegalizerInfo(const AArch64Subtarget &ST);
- bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override;
+ bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI,
+ LostDebugLocObserver &LocObserver) const override;
bool legalizeIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI) const override;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index b7552541e950..789ec817d3d8 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -10082,6 +10082,12 @@ multiclass sve2p1_vector_to_pred<string mnemonic, SDPatternOperator Op_lane, SDP
def : InstAlias<mnemonic # "\t$Pd, $Zn",
(!cast<Instruction>(NAME # _B) PPR8:$Pd, ZPRAny:$Zn, 0), 1>;
+ def : InstAlias<mnemonic # "\t$Pd, $Zn",
+ (!cast<Instruction>(NAME # _H) PPR16:$Pd, ZPRAny:$Zn, 0), 0>;
+ def : InstAlias<mnemonic # "\t$Pd, $Zn",
+ (!cast<Instruction>(NAME # _S) PPR32:$Pd, ZPRAny:$Zn, 0), 0>;
+ def : InstAlias<mnemonic # "\t$Pd, $Zn",
+ (!cast<Instruction>(NAME # _D) PPR64:$Pd, ZPRAny:$Zn, 0), 0>;
// any_lane
def : Pat<(nxv16i1 (Op_lane (nxv16i8 ZPRAny:$Zn), (i32 timm32_0_0:$Idx))),
@@ -10143,6 +10149,12 @@ multiclass sve2p1_pred_to_vector<string mnemonic, SDPatternOperator MergeOp,
def : InstAlias<mnemonic # "\t$Zd, $Pn",
(!cast<Instruction>(NAME # _B) ZPRAny:$Zd, 0, PPR8:$Pn), 1>;
+ def : InstAlias<mnemonic # "\t$Zd, $Pn",
+ (!cast<Instruction>(NAME # _H) ZPRAny:$Zd, 0, PPR16:$Pn), 0>;
+ def : InstAlias<mnemonic # "\t$Zd, $Pn",
+ (!cast<Instruction>(NAME # _S) ZPRAny:$Zd, 0, PPR32:$Pn), 0>;
+ def : InstAlias<mnemonic # "\t$Zd, $Pn",
+ (!cast<Instruction>(NAME # _D) ZPRAny:$Zd, 0, PPR64:$Pn), 0>;
// Merge
def : Pat<(nxv8i16 (MergeOp (nxv8i16 ZPRAny:$Zd), (nxv8i1 PPR16:$Pn), (i32 timm32_1_1:$Idx))),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index d2a325d5ad89..df8c35ffd457 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -761,6 +761,12 @@ def FeatureShaderCyclesRegister : SubtargetFeature<"shader-cycles-register",
"Has SHADER_CYCLES hardware register"
>;
+def FeatureShaderCyclesHiLoRegisters : SubtargetFeature<"shader-cycles-hi-lo-registers",
+ "HasShaderCyclesHiLoRegisters",
+ "true",
+ "Has SHADER_CYCLES_HI/LO hardware registers"
+>;
+
def FeatureMadMacF32Insts : SubtargetFeature<"mad-mac-f32-insts",
"HasMadMacF32Insts",
"true",
@@ -1469,7 +1475,7 @@ def FeatureISAVersion12 : FeatureSet<
FeatureNSAEncoding,
FeaturePartialNSAEncoding,
FeatureWavefrontSize32,
- FeatureShaderCyclesRegister,
+ FeatureShaderCyclesHiLoRegisters,
FeatureArchitectedFlatScratch,
FeatureAtomicFaddRtnInsts,
FeatureAtomicFaddNoRtnInsts,
@@ -1970,6 +1976,8 @@ def HasSMemTimeInst : Predicate<"Subtarget->hasSMemTimeInst()">,
def HasShaderCyclesRegister : Predicate<"Subtarget->hasShaderCyclesRegister()">,
AssemblerPredicate<(all_of FeatureShaderCyclesRegister)>;
+def HasShaderCyclesHiLoRegisters : Predicate<"Subtarget->hasShaderCyclesHiLoRegisters()">;
+
def HasFP8Insts : Predicate<"Subtarget->hasFP8Insts()">,
AssemblerPredicate<(all_of FeatureFP8Insts)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 9036b26a6f6b..c5207228dc91 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -22,28 +22,28 @@ def CC_SI_Gfx : CallingConv<[
// 32 is reserved for the stack pointer
// 33 is reserved for the frame pointer
// 34 is reserved for the base pointer
- CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+ CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[
SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29
]>>>,
- CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+ CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
]>>>,
- CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>
+ CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1, bf16, v2bf16], CCAssignToStack<4, 4>>
]>;
def RetCC_SI_Gfx : CallingConv<[
CCIfType<[i1], CCPromoteToType<i32>>,
CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
- CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+ CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
@@ -66,7 +66,7 @@ def RetCC_SI_Gfx : CallingConv<[
def CC_SI_SHADER : CallingConv<[
- CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+ CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
@@ -76,7 +76,7 @@ def CC_SI_SHADER : CallingConv<[
]>>>,
// 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs.
- CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+ CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
@@ -109,7 +109,7 @@ def RetCC_SI_Shader : CallingConv<[
]>>,
// 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
- CCIfType<[f32, f16, v2f16] , CCAssignToReg<[
+ CCIfType<[f32, f16, v2f16, bf16, v2bf16] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
@@ -188,23 +188,23 @@ def CC_AMDGPU_Func : CallingConv<[
CCIfType<[i1], CCPromoteToType<i32>>,
CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
- CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<
+ CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
!foreach(i, !range(0, 30), !cast<Register>("SGPR"#i)) // SGPR0-29
>>>,
- CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[
+ CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1, bf16, v2bf16], CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
- CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>
+ CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1, bf16, v2bf16], CCAssignToStack<4, 4>>
]>;
// Calling convention for leaf functions
def RetCC_AMDGPU_Func : CallingConv<[
CCIfType<[i1], CCPromoteToType<i32>>,
CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
- CCIfType<[i32, f32, i16, f16, v2i16, v2f16], CCAssignToReg<[
+ CCIfType<[i32, f32, i16, f16, v2i16, v2f16, bf16, v2bf16], CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
@@ -223,11 +223,11 @@ def CC_AMDGPU : CallingConv<[
]>;
def CC_AMDGPU_CS_CHAIN : CallingConv<[
- CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<
+ CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
!foreach(i, !range(105), !cast<Register>("SGPR"#i))
>>>,
- CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<
+ CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
!foreach(i, !range(8, 255), !cast<Register>("VGPR"#i))
>>>
]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 8d4cad4c07bc..0c77fe725958 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -104,6 +104,13 @@ def foldable_fneg : GICombineRule<
[{ return Helper.matchFoldableFneg(*${ffn}, ${matchinfo}); }]),
(apply [{ Helper.applyFoldableFneg(*${ffn}, ${matchinfo}); }])>;
+// Detects s_mul_u64 instructions whose higher bits are zero/sign extended.
+def smulu64 : GICombineRule<
+ (defs root:$smul, unsigned_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_MUL):$smul,
+ [{ return matchCombine_s_mul_u64(*${smul}, ${matchinfo}); }]),
+ (apply [{ applyCombine_s_mul_u64(*${smul}, ${matchinfo}); }])>;
+
def sign_exension_in_reg_matchdata : GIDefMatchData<"MachineInstr *">;
def sign_extension_in_reg : GICombineRule<
@@ -149,7 +156,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner<
"AMDGPUPostLegalizerCombinerImpl",
[all_combines, gfx6gfx7_combines, gfx8_combines,
uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
- rcp_sqrt_to_rsq, sign_extension_in_reg]> {
+ rcp_sqrt_to_rsq, sign_extension_in_reg, smulu64]> {
let CombineAllMethodName = "tryCombineAllImpl";
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index b51a876750b5..74e9cd7d0965 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -646,7 +646,15 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs(
Offset += 8; // Skipped.
}
- Offset += 72; // Reserved.
+  // Emit argument for the hidden dynamic LDS size.
+ if (MFI.isDynamicLDSUsed()) {
+ emitKernelArg(DL, Int32Ty, Align(4), "hidden_dynamic_lds_size", Offset,
+ Args);
+ } else {
+    Offset += 4; // Skipped.
+ }
+
+ Offset += 68; // Reserved.
// hidden_private_base and hidden_shared_base are only when the subtarget has
// ApertureRegs.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index bffea82ab8f4..719ae2e8750c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -303,6 +303,7 @@ void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
switch (N->getOpcode()) {
case ISD::BUILD_VECTOR:
+ // TODO: Match load d16 from shl (extload:i16), 16
MadeChange |= matchLoadD16FromBuildVector(N);
break;
default:
@@ -317,26 +318,16 @@ void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
}
}
-bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
- bool Negated) const {
+bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
if (N->isUndef())
return true;
const SIInstrInfo *TII = Subtarget->getInstrInfo();
- if (Negated) {
- if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
- return TII->isInlineConstant(-C->getAPIntValue());
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
+ return TII->isInlineConstant(C->getAPIntValue());
- if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
- return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());
-
- } else {
- if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
- return TII->isInlineConstant(C->getAPIntValue());
-
- if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
- return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
- }
+ if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
+ return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
return false;
}
@@ -382,7 +373,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
Subtarget->getRegisterInfo()->getRegClass(RCID);
SDValue SubRegOp = N->getOperand(OpNo + 1);
- unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
+ unsigned SubRegIdx = SubRegOp->getAsZExtVal();
return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
SubRegIdx);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 374108af08cd..df4a211d42a0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -50,15 +50,13 @@ static inline bool getConstantValue(SDValue N, uint32_t &Out) {
}
// TODO: Handle undef as zero
-static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG,
- bool Negate = false) {
+static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
uint32_t LHSVal, RHSVal;
if (getConstantValue(N->getOperand(0), LHSVal) &&
getConstantValue(N->getOperand(1), RHSVal)) {
SDLoc SL(N);
- uint32_t K = Negate ? (-LHSVal & 0xffff) | (-RHSVal << 16)
- : (LHSVal & 0xffff) | (RHSVal << 16);
+ uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16);
return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0),
DAG.getTargetConstant(K, SL, MVT::i32));
}
@@ -66,9 +64,6 @@ static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG,
return nullptr;
}
-static inline SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
- return packConstantV2I16(N, DAG, true);
-}
} // namespace
/// AMDGPU specific code to select AMDGPU machine instructions for
@@ -110,10 +105,7 @@ protected:
private:
std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
- bool isInlineImmediate(const SDNode *N, bool Negated = false) const;
- bool isNegInlineImmediate(const SDNode *N) const {
- return isInlineImmediate(N, true);
- }
+ bool isInlineImmediate(const SDNode *N) const;
bool isInlineImmediate16(int64_t Imm) const {
return AMDGPU::isInlinableLiteral16(Imm, Subtarget->hasInv2PiInlineImm());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 8fbc90a6db9f..0dbcaf5a1b13 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -387,17 +387,20 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
Custom);
+
+ // FIXME: Why is v8f16/v8bf16 missing?
setOperationAction(
ISD::EXTRACT_SUBVECTOR,
- {MVT::v2f16, MVT::v2i16, MVT::v4f16, MVT::v4i16, MVT::v2f32,
- MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32, MVT::v4i32,
- MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32, MVT::v7f32,
- MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32, MVT::v9i32,
- MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32,
- MVT::v12f32, MVT::v16f16, MVT::v16i16, MVT::v16f32, MVT::v16i32,
- MVT::v32f32, MVT::v32i32, MVT::v2f64, MVT::v2i64, MVT::v3f64,
- MVT::v3i64, MVT::v4f64, MVT::v4i64, MVT::v8f64, MVT::v8i64,
- MVT::v16f64, MVT::v16i64, MVT::v32i16, MVT::v32f16},
+ {MVT::v2f16, MVT::v2bf16, MVT::v2i16, MVT::v4f16, MVT::v4bf16,
+ MVT::v4i16, MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32,
+ MVT::v4f32, MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32,
+ MVT::v6i32, MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32,
+ MVT::v9f32, MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32,
+ MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16f16, MVT::v16bf16,
+ MVT::v16i16, MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
+ MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
+ MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64,
+ MVT::v32i16, MVT::v32f16, MVT::v32bf16},
Custom);
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
@@ -3281,7 +3284,15 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
}
- assert(SrcVT == MVT::i64 && "operation should be legal");
+ if (DestVT == MVT::bf16) {
+ SDLoc SL(Op);
+ SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
+ SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
+ return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
+ }
+
+ if (SrcVT != MVT::i64)
+ return Op;
if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
SDLoc DL(Op);
@@ -3319,7 +3330,15 @@ SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
}
- assert(SrcVT == MVT::i64 && "operation should be legal");
+ if (DestVT == MVT::bf16) {
+ SDLoc SL(Op);
+ SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
+ SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
+ return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
+ }
+
+ if (SrcVT != MVT::i64)
+ return Op;
// TODO: Factor out code common with LowerUINT_TO_FP.
@@ -3517,7 +3536,7 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) con
return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
}
-SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
+SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(0);
unsigned OpOpcode = Op.getOpcode();
@@ -3528,6 +3547,12 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
if (SrcVT == MVT::f16 && DestVT == MVT::i16)
return Op;
+ if (SrcVT == MVT::bf16) {
+ SDLoc DL(Op);
+ SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
+ return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
+ }
+
// Promote i16 to i32
if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
SDLoc DL(Op);
@@ -3536,6 +3561,9 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
}
+ if (DestVT != MVT::i64)
+ return Op;
+
if (SrcVT == MVT::f16 ||
(SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
SDLoc DL(Op);
@@ -3546,7 +3574,7 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
}
- if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
+ if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
return SDValue();
@@ -4947,7 +4975,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
// vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
if (DestVT.isVector()) {
SDValue Src = N->getOperand(0);
- if (Src.getOpcode() == ISD::BUILD_VECTOR) {
+ if (Src.getOpcode() == ISD::BUILD_VECTOR &&
+ (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
+ isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
EVT SrcVT = Src.getValueType();
unsigned NElts = DestVT.getVectorNumElements();
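
The new bf16 paths in LowerUINT_TO_FP, LowerSINT_TO_FP and LowerFP_TO_INT above route through f32: integer sources are converted to f32 and then FP_ROUND to bf16, and bf16 sources are FP_EXTENDed to f32 first. A rough scalar model of the int-to-bf16 direction (assumes round-to-nearest-even and ignores NaN handling; illustration only, not the target lowering itself):

#include <cstdint>
#include <cstring>

// Step 1: integer -> f32. Step 2: f32 -> bf16 by keeping the top 16 bits of
// the f32 encoding with round-to-nearest-even. NaNs are not handled here.
static uint16_t f32_to_bf16(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  bits += 0x7fff + ((bits >> 16) & 1); // round to nearest, ties to even
  return static_cast<uint16_t>(bits >> 16);
}

static uint16_t uint_to_bf16(uint32_t v) {
  return f32_to_bf16(static_cast<float>(v));
}
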
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 88ef4b577424..ad8dcda93c36 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2764,7 +2764,9 @@ static bool isConstant(const MachineInstr &MI) {
void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
- const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
+ unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
+ const MachineInstr *PtrMI =
+ MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
assert(PtrMI);
@@ -2817,6 +2819,10 @@ bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
return true;
+ if (MI.getOpcode() == AMDGPU::G_PREFETCH)
+ return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
+ AMDGPU::SGPRRegBankID;
+
const Instruction *I = dyn_cast<Instruction>(Ptr);
return I && I->getMetadata("amdgpu.uniform");
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index fbee28889451..aa235c07e995 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -701,13 +701,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.maxScalar(0, S32);
}
- getActionDefinitionsBuilder(G_MUL)
- .legalFor({S32, S16, V2S16})
- .clampMaxNumElementsStrict(0, S16, 2)
- .scalarize(0)
- .minScalar(0, S16)
- .widenScalarToNextMultipleOf(0, 32)
- .custom();
+ if (ST.hasScalarSMulU64()) {
+ getActionDefinitionsBuilder(G_MUL)
+ .legalFor({S64, S32, S16, V2S16})
+ .clampMaxNumElementsStrict(0, S16, 2)
+ .scalarize(0)
+ .minScalar(0, S16)
+ .widenScalarToNextMultipleOf(0, 32)
+ .custom();
+ } else {
+ getActionDefinitionsBuilder(G_MUL)
+ .legalFor({S32, S16, V2S16})
+ .clampMaxNumElementsStrict(0, S16, 2)
+ .scalarize(0)
+ .minScalar(0, S16)
+ .widenScalarToNextMultipleOf(0, 32)
+ .custom();
+ }
assert(ST.hasMad64_32());
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
@@ -1996,8 +2006,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
verify(*ST.getInstrInfo());
}
-bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
- MachineInstr &MI) const {
+bool AMDGPULegalizerInfo::legalizeCustom(
+ LegalizerHelper &Helper, MachineInstr &MI,
+ LostDebugLocObserver &LocObserver) const {
MachineIRBuilder &B = Helper.MIRBuilder;
MachineRegisterInfo &MRI = *B.getMRI();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 855fa0ddc214..56aabd4f6ab7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -27,7 +27,6 @@ class MachineIRBuilder;
namespace AMDGPU {
struct ImageDimIntrinsicInfo;
}
-/// This class provides the information for the target register banks.
class AMDGPULegalizerInfo final : public LegalizerInfo {
const GCNSubtarget &ST;
@@ -35,7 +34,8 @@ public:
AMDGPULegalizerInfo(const GCNSubtarget &ST,
const GCNTargetMachine &TM);
- bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override;
+ bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI,
+ LostDebugLocObserver &LocObserver) const override;
Register getSegmentAperture(unsigned AddrSpace,
MachineRegisterInfo &MRI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index f03e6b8915b1..1b2f74cf153b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -87,9 +87,6 @@ private:
Constant *copr0, Constant *copr1);
bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo);
- // sqrt
- bool fold_sqrt(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
-
/// Insert a value to sincos function \p Fsincos. Returns (value of sin, value
/// of cos, sincos call).
std::tuple<Value *, Value *, Value *> insertSinCos(Value *Arg,
@@ -672,8 +669,6 @@ bool AMDGPULibCalls::fold(CallInst *CI) {
// Specialized optimizations for each function call.
//
- // TODO: Handle other simple intrinsic wrappers. Sqrt.
- //
// TODO: Handle native functions
switch (FInfo.getId()) {
case AMDGPULibFunc::EI_EXP:
@@ -794,7 +789,9 @@ bool AMDGPULibCalls::fold(CallInst *CI) {
case AMDGPULibFunc::EI_ROOTN:
return fold_rootn(FPOp, B, FInfo);
case AMDGPULibFunc::EI_SQRT:
- return fold_sqrt(FPOp, B, FInfo);
+ // TODO: Allow with strictfp + constrained intrinsic
+ return tryReplaceLibcallWithSimpleIntrinsic(
+ B, CI, Intrinsic::sqrt, true, true, /*AllowStrictFP=*/false);
case AMDGPULibFunc::EI_COS:
case AMDGPULibFunc::EI_SIN:
return fold_sincos(FPOp, B, FInfo);
@@ -1273,29 +1270,6 @@ bool AMDGPULibCalls::tryReplaceLibcallWithSimpleIntrinsic(
return true;
}
-// fold sqrt -> native_sqrt (x)
-bool AMDGPULibCalls::fold_sqrt(FPMathOperator *FPOp, IRBuilder<> &B,
- const FuncInfo &FInfo) {
- if (!isUnsafeMath(FPOp))
- return false;
-
- if (getArgType(FInfo) == AMDGPULibFunc::F32 && (getVecSize(FInfo) == 1) &&
- (FInfo.getPrefix() != AMDGPULibFunc::NATIVE)) {
- Module *M = B.GetInsertBlock()->getModule();
-
- if (FunctionCallee FPExpr = getNativeFunction(
- M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) {
- Value *opr0 = FPOp->getOperand(0);
- LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
- << "sqrt(" << *opr0 << ")\n");
- Value *nval = CreateCallEx(B,FPExpr, opr0, "__sqrt");
- replaceCall(FPOp, nval);
- return true;
- }
- }
- return false;
-}
-
std::tuple<Value *, Value *, Value *>
AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B,
FunctionCallee Fsincos) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 323462e60a29..31777295b4f8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -19,6 +19,26 @@
using namespace llvm;
+static const GlobalVariable *
+getKernelDynLDSGlobalFromFunction(const Function &F) {
+ const Module *M = F.getParent();
+ SmallString<64> KernelDynLDSName("llvm.amdgcn.");
+ KernelDynLDSName += F.getName();
+ KernelDynLDSName += ".dynlds";
+ return M->getNamedGlobal(KernelDynLDSName);
+}
+
+static bool hasLDSKernelArgument(const Function &F) {
+ for (const Argument &Arg : F.args()) {
+ Type *ArgTy = Arg.getType();
+ if (auto PtrTy = dyn_cast<PointerType>(ArgTy)) {
+ if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
+ return true;
+ }
+ }
+ return false;
+}
+
AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F,
const AMDGPUSubtarget &ST)
: IsEntryFunction(AMDGPU::isEntryFunctionCC(F.getCallingConv())),
@@ -65,6 +85,10 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F,
Attribute NSZAttr = F.getFnAttribute("no-signed-zeros-fp-math");
NoSignedZerosFPMath =
NSZAttr.isStringAttribute() && NSZAttr.getValueAsString() == "true";
+
+ const GlobalVariable *DynLdsGlobal = getKernelDynLDSGlobalFromFunction(F);
+ if (DynLdsGlobal || hasLDSKernelArgument(F))
+ UsesDynamicLDS = true;
}
unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
@@ -139,15 +163,6 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
return Offset;
}
-static const GlobalVariable *
-getKernelDynLDSGlobalFromFunction(const Function &F) {
- const Module *M = F.getParent();
- std::string KernelDynLDSName = "llvm.amdgcn.";
- KernelDynLDSName += F.getName();
- KernelDynLDSName += ".dynlds";
- return M->getNamedGlobal(KernelDynLDSName);
-}
-
std::optional<uint32_t>
AMDGPUMachineFunction::getLDSKernelIdMetadata(const Function &F) {
// TODO: Would be more consistent with the abs symbols to use a range
@@ -210,3 +225,9 @@ void AMDGPUMachineFunction::setDynLDSAlign(const Function &F,
}
}
}
+
+void AMDGPUMachineFunction::setUsesDynamicLDS(bool DynLDS) {
+ UsesDynamicLDS = DynLDS;
+}
+
+bool AMDGPUMachineFunction::isDynamicLDSUsed() const { return UsesDynamicLDS; }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 248ee26a47eb..7efb7f825348 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -46,6 +46,9 @@ protected:
/// stages.
Align DynLDSAlign;
+ // Flag to check dynamic LDS usage by kernel.
+ bool UsesDynamicLDS = false;
+
// Kernels + shaders. i.e. functions called by the hardware and not called
// by other functions.
bool IsEntryFunction = false;
@@ -119,6 +122,10 @@ public:
Align getDynLDSAlign() const { return DynLDSAlign; }
void setDynLDSAlign(const Function &F, const GlobalVariable &GV);
+
+ void setUsesDynamicLDS(bool DynLDS);
+
+ bool isDynamicLDSUsed() const;
};
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 7b18e1f805d8..21bfab52c6c4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -104,6 +104,14 @@ public:
void applyCombineSignExtendInReg(MachineInstr &MI,
MachineInstr *&MatchInfo) const;
+ // Find the s_mul_u64 instructions where the higher bits are either
+ // zero-extended or sign-extended.
+ bool matchCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const;
+ // Replace the s_mul_u64 instructions with S_MUL_I64_I32_PSEUDO if the higher
+ // 33 bits are sign extended and with S_MUL_U64_U32_PSEUDO if the higher 32
+ // bits are zero extended.
+ void applyCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const;
+
private:
#define GET_GICOMBINER_CLASS_MEMBERS
#define AMDGPUSubtarget GCNSubtarget
@@ -419,6 +427,32 @@ void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(
MI.eraseFromParent();
}
+bool AMDGPUPostLegalizerCombinerImpl::matchCombine_s_mul_u64(
+ MachineInstr &MI, unsigned &NewOpcode) const {
+ Register Src0 = MI.getOperand(1).getReg();
+ Register Src1 = MI.getOperand(2).getReg();
+ if (MRI.getType(Src0) != LLT::scalar(64))
+ return false;
+
+ if (KB->getKnownBits(Src1).countMinLeadingZeros() >= 32 &&
+ KB->getKnownBits(Src0).countMinLeadingZeros() >= 32) {
+ NewOpcode = AMDGPU::G_AMDGPU_S_MUL_U64_U32;
+ return true;
+ }
+
+ if (KB->computeNumSignBits(Src1) >= 33 &&
+ KB->computeNumSignBits(Src0) >= 33) {
+ NewOpcode = AMDGPU::G_AMDGPU_S_MUL_I64_I32;
+ return true;
+ }
+ return false;
+}
+
+void AMDGPUPostLegalizerCombinerImpl::applyCombine_s_mul_u64(
+ MachineInstr &MI, unsigned &NewOpcode) const {
+ Helper.replaceOpcodeWith(MI, NewOpcode);
+}
+
// Pass boilerplate
// ================
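
The smulu64 combine above narrows a 64-bit G_MUL when known bits prove both operands are really 32-bit values: at least 32 known leading zeros in both operands selects G_AMDGPU_S_MUL_U64_U32, and at least 33 sign bits selects G_AMDGPU_S_MUL_I64_I32. The arithmetic fact it relies on, as a small self-checking sketch (illustration only):

#include <cassert>
#include <cstdint>

int main() {
  // Both operands zero-extended from 32 bits: the 64-bit product equals an
  // unsigned 32x32->64 multiply.
  uint64_t ua = 0x9abcdef0u, ub = 0x12345678u;
  assert(ua * ub == static_cast<uint64_t>(static_cast<uint32_t>(ua)) *
                        static_cast<uint32_t>(ub));

  // Both operands sign-extended from 32 bits (>= 33 sign bits): the product
  // equals a signed 32x32->64 multiply.
  int64_t sa = -1234, sb = 56789;
  assert(sa * sb == static_cast<int64_t>(static_cast<int32_t>(sa)) *
                        static_cast<int32_t>(sb));
  return 0;
}
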
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index fba060464a6e..391c2b9ec256 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -441,7 +441,7 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.
-static bool isScalarLoadLegal(const MachineInstr &MI) {
+bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const {
if (!MI.hasOneMemOperand())
return false;
@@ -2094,6 +2094,74 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
return true;
}
+// Break s_mul_u64 into 32-bit vector operations.
+void AMDGPURegisterBankInfo::applyMappingSMULU64(
+ MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
+ SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
+ SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
+ SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
+
+ // All inputs are SGPRs, nothing special to do.
+ if (DefRegs.empty()) {
+ assert(Src0Regs.empty() && Src1Regs.empty());
+ applyDefaultMapping(OpdMapper);
+ return;
+ }
+
+ assert(DefRegs.size() == 2);
+ assert(Src0Regs.size() == Src1Regs.size() &&
+ (Src0Regs.empty() || Src0Regs.size() == 2));
+
+ MachineRegisterInfo &MRI = OpdMapper.getMRI();
+ MachineInstr &MI = OpdMapper.getMI();
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT HalfTy = LLT::scalar(32);
+
+ // Depending on where the source registers came from, the generic code may
+ // have decided to split the inputs already or not. If not, we still need to
+ // extract the values.
+
+ if (Src0Regs.empty())
+ split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
+ else
+ setRegsToType(MRI, Src0Regs, HalfTy);
+
+ if (Src1Regs.empty())
+ split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
+ else
+ setRegsToType(MRI, Src1Regs, HalfTy);
+
+ setRegsToType(MRI, DefRegs, HalfTy);
+
+ // The multiplication is done as follows:
+ //
+ // Op1H Op1L
+ // * Op0H Op0L
+ // --------------------
+ // Op1H*Op0L Op1L*Op0L
+ // + Op1H*Op0H Op1L*Op0H
+ // -----------------------------------------
+ // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
+ //
+ // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
+ // value and that would overflow.
+ // The low 32-bit value is Op1L*Op0L.
+ // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from
+ // Op1L*Op0L).
+
+ ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
+
+ Register Hi = B.buildUMulH(HalfTy, Src0Regs[0], Src1Regs[0]).getReg(0);
+ Register MulLoHi = B.buildMul(HalfTy, Src0Regs[0], Src1Regs[1]).getReg(0);
+ Register Add = B.buildAdd(HalfTy, Hi, MulLoHi).getReg(0);
+ Register MulHiLo = B.buildMul(HalfTy, Src0Regs[1], Src1Regs[0]).getReg(0);
+ B.buildAdd(DefRegs[1], Add, MulHiLo);
+ B.buildMul(DefRegs[0], Src0Regs[0], Src1Regs[0]);
+
+ MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
+ MI.eraseFromParent();
+}
+
void AMDGPURegisterBankInfo::applyMappingImpl(
MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
MachineInstr &MI = OpdMapper.getMI();
@@ -2394,13 +2462,21 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
+ // Special case for s_mul_u64. There is not a vector equivalent of
+ // s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector
+ // multiplications.
+ if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) {
+ applyMappingSMULU64(B, OpdMapper);
+ return;
+ }
+
// 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
// Packed 16-bit operations need to be scalarized and promoted.
if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
break;
const RegisterBank *DstBank =
- OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+ OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
if (DstBank == &AMDGPU::VGPRRegBank)
break;
@@ -2451,6 +2527,72 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
+ case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
+ case AMDGPU::G_AMDGPU_S_MUL_U64_U32: {
+ // This is a special case for s_mul_u64. We use
+ // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation
+ // where the 33 higher bits are sign-extended and
+ // G_AMDGPU_S_MUL_U64_U32 opcode to represent an s_mul_u64 operation
+ // where the 32 higher bits are zero-extended. In case scalar registers are
+ // selected, both opcodes are lowered as s_mul_u64. If the vector registers
+ // are selected, then G_AMDGPU_S_MUL_I64_I32 and
+ // G_AMDGPU_S_MUL_U64_U32 are lowered with a vector mad instruction.
+
+ // Insert basic copies.
+ applyDefaultMapping(OpdMapper);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg0 = MI.getOperand(1).getReg();
+ Register SrcReg1 = MI.getOperand(2).getReg();
+ const LLT S32 = LLT::scalar(32);
+ const LLT S64 = LLT::scalar(64);
+ assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 "
+ "that handles only 64-bit operands.");
+ const RegisterBank *DstBank =
+ OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+
+ // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
+ // with s_mul_u64 operation.
+ if (DstBank == &AMDGPU::SGPRRegBank) {
+ MI.setDesc(TII->get(AMDGPU::S_MUL_U64));
+ MRI.setRegClass(DstReg, &AMDGPU::SGPR_64RegClass);
+ MRI.setRegClass(SrcReg0, &AMDGPU::SGPR_64RegClass);
+ MRI.setRegClass(SrcReg1, &AMDGPU::SGPR_64RegClass);
+ return;
+ }
+
+ // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
+ // with a vector mad.
+ assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank &&
+ "The destination operand should be in vector registers.");
+
+ DebugLoc DL = MI.getDebugLoc();
+
+ // Extract the lower subregister from the first operand.
+ Register Op0L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MRI.setRegClass(Op0L, &AMDGPU::VGPR_32RegClass);
+ MRI.setType(Op0L, S32);
+ B.buildTrunc(Op0L, SrcReg0);
+
+ // Extract the lower subregister from the second operand.
+ Register Op1L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MRI.setRegClass(Op1L, &AMDGPU::VGPR_32RegClass);
+ MRI.setType(Op1L, S32);
+ B.buildTrunc(Op1L, SrcReg1);
+
+ unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32
+ ? AMDGPU::G_AMDGPU_MAD_U64_U32
+ : AMDGPU::G_AMDGPU_MAD_I64_I32;
+
+ MachineIRBuilder B(MI);
+ Register Zero64 = B.buildConstant(S64, 0).getReg(0);
+ MRI.setRegClass(Zero64, &AMDGPU::VReg_64RegClass);
+ Register CarryOut = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ MRI.setRegClass(CarryOut, &AMDGPU::VReg_64RegClass);
+ B.buildInstr(NewOpc, {DstReg, CarryOut}, {Op0L, Op1L, Zero64});
+ MI.eraseFromParent();
+ return;
+ }
case AMDGPU::G_SEXT_INREG: {
SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
if (SrcRegs.empty())
@@ -3263,17 +3405,19 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
MI.eraseFromParent();
return;
}
- unsigned PtrBank =
- getRegBankID(MI.getOperand(0).getReg(), MRI, AMDGPU::SGPRRegBankID);
+ Register PtrReg = MI.getOperand(0).getReg();
+ unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID);
if (PtrBank == AMDGPU::VGPRRegBankID) {
MI.eraseFromParent();
return;
}
- // FIXME: There is currently no support for prefetch in global isel.
- // There is no node equivalence and what's worse there is no MMO produced
- // for a prefetch on global isel path.
- // Prefetch does not affect execution so erase it for now.
- MI.eraseFromParent();
+ unsigned AS = MRI.getType(PtrReg).getAddressSpace();
+ if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
+ AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+ MI.eraseFromParent();
+ return;
+ }
+ applyDefaultMapping(OpdMapper);
return;
}
default:
@@ -3667,7 +3811,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AND:
case AMDGPU::G_OR:
- case AMDGPU::G_XOR: {
+ case AMDGPU::G_XOR:
+ case AMDGPU::G_MUL: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
if (Size == 1) {
const RegisterBank *DstBank
@@ -3735,7 +3880,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_PTRMASK:
case AMDGPU::G_ADD:
case AMDGPU::G_SUB:
- case AMDGPU::G_MUL:
case AMDGPU::G_SHL:
case AMDGPU::G_LSHR:
case AMDGPU::G_ASHR:
@@ -3753,6 +3897,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_SHUFFLE_VECTOR:
case AMDGPU::G_SBFX:
case AMDGPU::G_UBFX:
+ case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
+ case AMDGPU::G_AMDGPU_S_MUL_U64_U32:
if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
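
applyMappingSMULU64 above expands a 64-bit multiply on VGPRs into 32-bit pieces, dropping Op1H*Op0H because only the low 64 bits of the product are kept. The same decomposition written as plain, self-checking C++ (a sketch of the arithmetic, not target code):

#include <cassert>
#include <cstdint>

// result.lo = low 32 bits of aL*bL;
// result.hi = high 32 bits of aL*bL (the carry) + aL*bH + aH*bL, each taken
// modulo 2^32, exactly as the comment in applyMappingSMULU64 describes.
static uint64_t mul64_via_32bit(uint64_t a, uint64_t b) {
  uint32_t aL = static_cast<uint32_t>(a), aH = static_cast<uint32_t>(a >> 32);
  uint32_t bL = static_cast<uint32_t>(b), bH = static_cast<uint32_t>(b >> 32);

  uint64_t lowProd = static_cast<uint64_t>(aL) * bL;
  uint32_t lo = static_cast<uint32_t>(lowProd);
  uint32_t hi = static_cast<uint32_t>(lowProd >> 32) + aL * bH + aH * bL;
  return (static_cast<uint64_t>(hi) << 32) | lo;
}

int main() {
  uint64_t a = 0x0123456789abcdefULL, b = 0xfedcba9876543210ULL;
  assert(mul64_via_32bit(a, b) == a * b);
  return 0;
}
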
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index b5d16e70ab23..5f550b426ec0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -84,6 +84,9 @@ public:
bool applyMappingMAD_64_32(MachineIRBuilder &B,
const OperandsMapper &OpdMapper) const;
+ void applyMappingSMULU64(MachineIRBuilder &B,
+ const OperandsMapper &OpdMapper) const;
+
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
Register Reg) const;
@@ -173,6 +176,8 @@ public:
const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC,
LLT) const override;
+ bool isScalarLoadLegal(const MachineInstr &MI) const;
+
InstructionMappings
getInstrAlternativeMappings(const MachineInstr &MI) const override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index fdc2077868cf..0f3bb3e7b0d8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -620,7 +620,8 @@ void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
AAM.registerFunctionAnalysis<AMDGPUAA>();
}
-void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
+void AMDGPUTargetMachine::registerPassBuilderCallbacks(
+ PassBuilder &PB, bool PopulateClassToPassNames) {
PB.registerPipelineParsingCallback(
[this](StringRef PassName, ModulePassManager &PM,
ArrayRef<PassBuilder::PipelineElement>) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 9051a61e6557..99c9db3e654a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -51,7 +51,8 @@ public:
return TLOF.get();
}
- void registerPassBuilderCallbacks(PassBuilder &PB) override;
+ void registerPassBuilderCallbacks(PassBuilder &PB,
+ bool PopulateClassToPassNames) override;
void registerDefaultAliasAnalyses(AAManager &) override;
/// Get the integer value of a null pointer in the given address space.
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index abd7e911beef..b7f043860115 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -166,6 +166,8 @@ public:
ImmTyEndpgm,
ImmTyWaitVDST,
ImmTyWaitEXP,
+ ImmTyWaitVAVDst,
+ ImmTyWaitVMVSrc,
};
// Immediate operand kind.
@@ -909,6 +911,8 @@ public:
bool isEndpgm() const;
bool isWaitVDST() const;
bool isWaitEXP() const;
+ bool isWaitVAVDst() const;
+ bool isWaitVMVSrc() const;
auto getPredicate(std::function<bool(const AMDGPUOperand &Op)> P) const {
return std::bind(P, *this);
@@ -1029,6 +1033,7 @@ public:
}
static void printImmTy(raw_ostream& OS, ImmTy Type) {
+ // clang-format off
switch (Type) {
case ImmTyNone: OS << "None"; break;
case ImmTyGDS: OS << "GDS"; break;
@@ -1086,7 +1091,10 @@ public:
case ImmTyEndpgm: OS << "Endpgm"; break;
case ImmTyWaitVDST: OS << "WaitVDST"; break;
case ImmTyWaitEXP: OS << "WaitEXP"; break;
+ case ImmTyWaitVAVDst: OS << "WaitVAVDst"; break;
+ case ImmTyWaitVMVSrc: OS << "WaitVMVSrc"; break;
}
+ // clang-format on
}
void print(raw_ostream &OS) const override {
@@ -1857,6 +1865,9 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
case AMDGPU::OPERAND_REG_IMM_V2FP32:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
case AMDGPU::OPERAND_REG_IMM_V2INT32:
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
case AMDGPU::OPERAND_KIMM32:
case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
return &APFloat::IEEEsingle();
@@ -1871,13 +1882,10 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
- case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
- case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
case AMDGPU::OPERAND_KIMM16:
return &APFloat::IEEEhalf();
@@ -2025,9 +2033,14 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const {
// We allow fp literals with f16x2 operands assuming that the specified
// literal goes into the lower half and the upper half is zero. We also
// require that the literal may be losslessly converted to f16.
- MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 :
- (type == MVT::v2i16)? MVT::i16 :
- (type == MVT::v2f32)? MVT::f32 : type;
+ //
+ // For i16x2 operands, we assume that the specified literal is encoded as a
+ // single-precision float. This is pretty odd, but it matches SP3 and what
+ // happens in hardware.
+ MVT ExpectedType = (type == MVT::v2f16) ? MVT::f16
+ : (type == MVT::v2i16) ? MVT::f32
+ : (type == MVT::v2f32) ? MVT::f32
+ : type;
APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val));
return canLosslesslyConvertToFPType(FPLiteral, ExpectedType);
@@ -3393,12 +3406,12 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst,
if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 ||
OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16 ||
OperandType == AMDGPU::OPERAND_REG_IMM_V2INT16)
- return AMDGPU::isInlinableIntLiteralV216(Val);
+ return AMDGPU::isInlinableLiteralV2I16(Val);
if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16 ||
OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2FP16 ||
OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16)
- return AMDGPU::isInlinableLiteralV216(Val, hasInv2PiInlineImm());
+ return AMDGPU::isInlinableLiteralV2F16(Val);
return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm());
}
@@ -9192,6 +9205,14 @@ bool AMDGPUOperand::isWaitVDST() const {
return isImmTy(ImmTyWaitVDST) && isUInt<4>(getImm());
}
+bool AMDGPUOperand::isWaitVAVDst() const {
+ return isImmTy(ImmTyWaitVAVDst) && isUInt<4>(getImm());
+}
+
+bool AMDGPUOperand::isWaitVMVSrc() const {
+ return isImmTy(ImmTyWaitVMVSrc) && isUInt<1>(getImm());
+}
+
//===----------------------------------------------------------------------===//
// VINTERP
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 43d35fa5291c..9e99d382ed9b 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -503,7 +503,6 @@ class MUBUF_Load_Pseudo <string opName,
let has_vdata = !not(!or(isLds, isLdsOpc));
let mayLoad = 1;
let mayStore = isLds;
- let maybeAtomic = 1;
let Uses = !if(!or(isLds, isLdsOpc) , [EXEC, M0], [EXEC]);
let tfe = isTFE;
let lds = isLds;
@@ -610,7 +609,6 @@ class MUBUF_Store_Pseudo <string opName,
getAddrName<addrKindCopy>.ret;
let mayLoad = 0;
let mayStore = 1;
- let maybeAtomic = 1;
let elements = getMUBUFElements<store_vt>.ret;
let tfe = isTFE;
}
@@ -671,7 +669,6 @@ class MUBUF_Pseudo_Store_Lds<string opName>
let LGKM_CNT = 1;
let mayLoad = 1;
let mayStore = 1;
- let maybeAtomic = 1;
let has_vdata = 0;
let has_vaddr = 0;
@@ -735,7 +732,6 @@ class MUBUF_Atomic_Pseudo<string opName,
let has_glc = 0;
let has_dlc = 0;
let has_sccb = 1;
- let maybeAtomic = 1;
let AsmMatchConverter = "cvtMubufAtomic";
}
@@ -1222,8 +1218,10 @@ defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores <
} // End HasD16LoadStore
-def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
- int_amdgcn_buffer_wbinvl1>;
+let SubtargetPredicate = isNotGFX12Plus in
+def BUFFER_WBINVL1 : MUBUF_Invalidate <
+ "buffer_wbinvl1", int_amdgcn_buffer_wbinvl1
+>;
let SubtargetPredicate = HasAtomicFaddNoRtnInsts in
defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN<
diff --git a/llvm/lib/Target/AMDGPU/DSDIRInstructions.td b/llvm/lib/Target/AMDGPU/DSDIRInstructions.td
new file mode 100644
index 000000000000..4416da605981
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/DSDIRInstructions.td
@@ -0,0 +1,192 @@
+//===-- DSDIRInstructions.td - LDS/VDS Direct Instruction Definitions -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// LDSDIR/VDSDIR encoding (LDSDIR is gfx11, VDSDIR is gfx12+)
+//===----------------------------------------------------------------------===//
+
+class LDSDIRe<bits<2> op, bit is_direct> : Enc32 {
+ // encoding fields
+ bits<2> attrchan;
+ bits<6> attr;
+ bits<4> waitvdst;
+ bits<8> vdst;
+
+ // encoding
+ let Inst{31-24} = 0xce; // encoding
+ let Inst{23-22} = 0x0; // reserved
+ let Inst{21-20} = op;
+ let Inst{19-16} = waitvdst;
+ let Inst{15-10} = !if(is_direct, ?, attr);
+ let Inst{9-8} = !if(is_direct, ?, attrchan);
+ let Inst{7-0} = vdst;
+}
+
+class VDSDIRe<bits<2> op, bit is_direct> : Enc32 {
+ // encoding fields
+ bits<2> attrchan;
+ bits<6> attr;
+ bits<4> waitvdst;
+ bits<8> vdst;
+ bits<1> waitvsrc;
+
+ // encoding
+ let Inst{31-24} = 0xce; // encoding
+ let Inst{23} = waitvsrc;
+ let Inst{22} = 0x0; // reserved
+ let Inst{21-20} = op;
+ let Inst{19-16} = waitvdst;
+ let Inst{15-10} = !if(is_direct, ?, attr);
+ let Inst{9-8} = !if(is_direct, ?, attrchan);
+ let Inst{7-0} = vdst;
+}
+
+//===----------------------------------------------------------------------===//
+// LDSDIR/VDSDIR Classes
+//===----------------------------------------------------------------------===//
+
+class LDSDIR_getIns<bit direct> {
+ dag ret = !if(direct,
+ (ins wait_vdst:$waitvdst),
+ (ins InterpAttr:$attr, InterpAttrChan:$attrchan, wait_vdst:$waitvdst)
+ );
+}
+
+class VDSDIR_getIns<bit direct> {
+ dag ret = !if(direct,
+ (ins wait_va_vdst:$waitvdst, wait_va_vsrc:$waitvsrc),
+ (ins InterpAttr:$attr, InterpAttrChan:$attrchan, wait_va_vdst:$waitvdst,
+ wait_va_vsrc:$waitvsrc)
+ );
+}
+
+class DSDIR_Common<string opName, string asm = "", dag ins, bit direct> :
+ InstSI<(outs VGPR_32:$vdst), ins, asm> {
+ let LDSDIR = 1;
+ let EXP_CNT = 1;
+
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+ let mayStore = 0;
+ let maybeAtomic = 0;
+
+ string Mnemonic = opName;
+ let UseNamedOperandTable = 1;
+
+ let Uses = [M0, EXEC];
+ let DisableWQM = 0;
+ let SchedRW = [WriteLDS];
+
+ bit is_direct;
+ let is_direct = direct;
+}
+
+class DSDIR_Pseudo<string opName, dag ins, bit direct> :
+ DSDIR_Common<opName, "", ins, direct>,
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+}
+
+class LDSDIR_getAsm<bit direct> {
+ string ret = !if(direct,
+ " $vdst$waitvdst",
+ " $vdst, $attr$attrchan$waitvdst"
+ );
+}
+
+class VDSDIR_getAsm<bit direct> {
+ string ret = !if(direct,
+ " $vdst$waitvdst$waitvsrc",
+ " $vdst, $attr$attrchan$waitvdst$waitvsrc"
+ );
+}
+
+class DSDIR_Real<DSDIR_Pseudo lds, dag ins, string asm, int subtarget> :
+ DSDIR_Common<lds.Mnemonic,
+ lds.Mnemonic # asm,
+ ins,
+ lds.is_direct>,
+ SIMCInstr <lds.Mnemonic, subtarget> {
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// LDS/VDS Direct Instructions
+//===----------------------------------------------------------------------===//
+
+let SubtargetPredicate = isGFX11Only in {
+
+def LDS_DIRECT_LOAD : DSDIR_Pseudo<"lds_direct_load", LDSDIR_getIns<1>.ret, 1>;
+def LDS_PARAM_LOAD : DSDIR_Pseudo<"lds_param_load", LDSDIR_getIns<0>.ret, 0>;
+
+def : GCNPat <
+ (f32 (int_amdgcn_lds_direct_load M0)),
+ (LDS_DIRECT_LOAD 0)
+>;
+
+def : GCNPat <
+ (f32 (int_amdgcn_lds_param_load timm:$attrchan, timm:$attr, M0)),
+ (LDS_PARAM_LOAD timm:$attr, timm:$attrchan, 0)
+>;
+
+} // End SubtargetPredicate = isGFX11Only
+
+let SubtargetPredicate = isGFX12Plus in {
+
+def DS_DIRECT_LOAD : DSDIR_Pseudo<"ds_direct_load", VDSDIR_getIns<1>.ret, 1>;
+def DS_PARAM_LOAD : DSDIR_Pseudo<"ds_param_load", VDSDIR_getIns<0>.ret, 0>;
+
+def : GCNPat <
+ (f32 (int_amdgcn_lds_direct_load M0)),
+ (DS_DIRECT_LOAD 0, 1)
+>;
+
+def : GCNPat <
+ (f32 (int_amdgcn_lds_param_load timm:$attrchan, timm:$attr, M0)),
+ (DS_PARAM_LOAD timm:$attr, timm:$attrchan, 0, 1)
+>;
+
+} // End SubtargetPredicate = isGFX12Plus
+
+//===----------------------------------------------------------------------===//
+// GFX11
+//===----------------------------------------------------------------------===//
+
+multiclass DSDIR_Real_gfx11<bits<2> op,
+ DSDIR_Pseudo lds = !cast<DSDIR_Pseudo>(NAME)> {
+ def _gfx11 : DSDIR_Real<lds, lds.InOperandList,
+ LDSDIR_getAsm<lds.is_direct>.ret,
+ SIEncodingFamily.GFX11>,
+ LDSDIRe<op, lds.is_direct> {
+ let AssemblerPredicate = isGFX11Only;
+ let DecoderNamespace = "GFX11";
+ }
+}
+
+defm LDS_PARAM_LOAD : DSDIR_Real_gfx11<0x0>;
+defm LDS_DIRECT_LOAD : DSDIR_Real_gfx11<0x1>;
+
+//===----------------------------------------------------------------------===//
+// GFX12+
+//===----------------------------------------------------------------------===//
+
+multiclass DSDIR_Real_gfx12<bits<2> op,
+ DSDIR_Pseudo lds = !cast<DSDIR_Pseudo>(NAME)> {
+ def _gfx12 : DSDIR_Real<lds, lds.InOperandList,
+ VDSDIR_getAsm<lds.is_direct>.ret,
+ SIEncodingFamily.GFX12>,
+ VDSDIRe<op, lds.is_direct> {
+ let AssemblerPredicate = isGFX12Plus;
+ let DecoderNamespace = "GFX12";
+ }
+}
+
+defm DS_PARAM_LOAD : DSDIR_Real_gfx12<0x0>;
+defm DS_DIRECT_LOAD : DSDIR_Real_gfx12<0x1>;
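For reference, the LDSDIRe/VDSDIRe classes above describe how the 32-bit instruction word is assembled from the opcode, wait counts, attribute and destination fields. A minimal standalone sketch of that packing for the LDSDIR layout follows; the helper name and parameters are illustrative only and not part of the patch.

#include <cstdint>

// Sketch only: mirrors the LDSDIRe field layout (Inst{31-24}=0xce,
// Inst{21-20}=op, Inst{19-16}=waitvdst, Inst{15-10}=attr, Inst{9-8}=attrchan,
// Inst{7-0}=vdst). The attr/attrchan fields are left as zero for the
// "direct" form, matching !if(is_direct, ?, ...) in the class.
uint32_t encodeLDSDIR(unsigned op, unsigned waitvdst, unsigned attr,
                      unsigned attrchan, unsigned vdst, bool isDirect) {
  uint32_t Inst = 0xCEu << 24;          // fixed encoding byte
  Inst |= (op & 0x3u) << 20;            // opcode
  Inst |= (waitvdst & 0xFu) << 16;      // wait_vdst count
  if (!isDirect) {
    Inst |= (attr & 0x3Fu) << 10;       // interpolation attribute
    Inst |= (attrchan & 0x3u) << 8;     // attribute channel
  }
  Inst |= vdst & 0xFFu;                 // destination VGPR
  return Inst;
}

The VDSDIR form differs only in bit 23, which carries the extra wait_vm_vsrc count on gfx12+.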
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index bc9049b4ef33..3cccd8c50e66 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -19,7 +19,6 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
// Most instructions load and store data, so set this as the default.
let mayLoad = 1;
let mayStore = 1;
- let maybeAtomic = 1;
let hasSideEffects = 0;
let SchedRW = [WriteLDS];
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 67be7b0fd642..9dff3f6c2efd 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -182,6 +182,9 @@ static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val,
DECODE_SrcOp(decodeOperand_##RegClass##_Imm##ImmWidth, 9, OpWidth, Imm, \
false, ImmWidth)
+#define DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(Name, OpWidth, ImmWidth) \
+ DECODE_SrcOp(decodeOperand_##Name, 9, OpWidth, Imm, false, ImmWidth)
+
// Decoder for Src(9-bit encoding) AGPR or immediate. Set Imm{9} to 1 (set acc)
// and decode using 'enum10' from decodeSrcOp.
#define DECODE_OPERAND_SRC_REG_OR_IMM_A9(RegClass, OpWidth, ImmWidth) \
@@ -262,6 +265,9 @@ DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32)
+DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(VS_32_ImmV2I16, OPW32, 32)
+DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(VS_32_ImmV2F16, OPW32, 16)
+
DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_64, OPW64, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_128, OPW128, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_256, OPW256, 64)
diff --git a/llvm/lib/Target/AMDGPU/EXPInstructions.td b/llvm/lib/Target/AMDGPU/EXPInstructions.td
index ff1d661ef6fe..4cfee7d013ef 100644
--- a/llvm/lib/Target/AMDGPU/EXPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/EXPInstructions.td
@@ -20,6 +20,7 @@ class EXPCommon<bit row, bit done, string asm = ""> : InstSI<
let EXP_CNT = 1;
let mayLoad = done;
let mayStore = 1;
+ let maybeAtomic = 0;
let UseNamedOperandTable = 1;
let Uses = !if(row, [EXEC, M0], [EXEC]);
let SchedRW = [WriteExport];
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 615f8cd54d8f..16a8b770e057 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -60,6 +60,7 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
bits<1> has_sve = 0; // Scratch VGPR Enable
bits<1> lds = 0;
bits<1> sve = 0;
+ bits<1> has_offset = 1;
let SubtargetPredicate = !if(is_flat_global, HasFlatGlobalInsts,
!if(is_flat_scratch, HasFlatScratchInsts, HasFlatAddressSpace));
@@ -182,7 +183,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
let Inst{51-50} = cpol{4-3}; // scope
let Inst{62-55} = !if(ps.has_data, vdata{7-0}, ?);
let Inst{71-64} = !if(ps.has_vaddr, vaddr, ?);
- let Inst{95-72} = offset;
+ let Inst{95-72} = !if(ps.has_offset, offset, ?);
}
class GlobalSaddrTable <bit is_saddr, string Name = ""> {
@@ -214,7 +215,6 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
let has_saddr = HasSaddr;
let enabled_saddr = EnableSaddr;
let PseudoInstr = opName#!if(!and(HasSaddr, EnableSaddr), "_SADDR", "");
- let maybeAtomic = 1;
let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
@@ -236,7 +236,6 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
let has_saddr = HasSaddr;
let enabled_saddr = EnableSaddr;
let PseudoInstr = opName#!if(!and(HasSaddr, EnableSaddr), "_SADDR", "");
- let maybeAtomic = 1;
}
multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
@@ -262,7 +261,6 @@ class FLAT_Global_Load_AddTid_Pseudo <string opName, RegisterClass regClass,
let has_vaddr = 0;
let has_saddr = 1;
let enabled_saddr = EnableSaddr;
- let maybeAtomic = 1;
let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", "");
let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
@@ -329,7 +327,6 @@ class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterClass vdataClass,
let has_vaddr = 0;
let has_saddr = 1;
let enabled_saddr = EnableSaddr;
- let maybeAtomic = 1;
let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", "");
}
@@ -340,6 +337,34 @@ multiclass FLAT_Global_Store_AddTid_Pseudo<string opName, RegisterClass regClass
GlobalSaddrTable<1, opName>;
}
+class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = null_frag> :
+ FLAT_Pseudo<opName, (outs), (ins CPol:$cpol), "$cpol", [(node)]> {
+
+ let AsmMatchConverter = "";
+
+ let hasSideEffects = 1;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let is_flat_global = 1;
+
+ let has_offset = 0;
+ let has_saddr = 0;
+ let enabled_saddr = 0;
+ let saddr_value = 0;
+ let has_vdst = 0;
+ let has_data = 0;
+ let has_vaddr = 0;
+ let has_glc = 0;
+ let has_dlc = 0;
+ let glcValue = 0;
+ let dlcValue = 0;
+ let has_sccb = 0;
+ let sccbValue = 0;
+ let has_sve = 0;
+ let lds = 0;
+ let sve = 0;
+}
+
class FlatScratchInst <string sv_op, string mode> {
string SVOp = sv_op;
string Mode = mode;
@@ -372,7 +397,6 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
let has_sve = EnableSVE;
let sve = EnableVaddr;
let PseudoInstr = opName#!if(EnableSVE, "_SVS", !if(EnableSaddr, "_SADDR", !if(EnableVaddr, "", "_ST")));
- let maybeAtomic = 1;
let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
@@ -401,7 +425,6 @@ class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit En
let has_sve = EnableSVE;
let sve = EnableVaddr;
let PseudoInstr = opName#!if(EnableSVE, "_SVS", !if(EnableSaddr, "_SADDR", !if(EnableVaddr, "", "_ST")));
- let maybeAtomic = 1;
}
multiclass FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedOutput = 0> {
@@ -491,7 +514,6 @@ class FLAT_AtomicNoRet_Pseudo<string opName, dag outs, dag ins,
let has_vdst = 0;
let has_sccb = 1;
let sccbValue = 0;
- let maybeAtomic = 1;
let IsAtomicNoRet = 1;
}
@@ -928,6 +950,10 @@ defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwor
let SubtargetPredicate = isGFX12Plus in {
defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VReg_64, i64>;
+
+ def GLOBAL_INV : FLAT_Global_Invalidate_Writeback<"global_inv">;
+ def GLOBAL_WB : FLAT_Global_Invalidate_Writeback<"global_wb">;
+ def GLOBAL_WBINV : FLAT_Global_Invalidate_Writeback<"global_wbinv">;
} // End SubtargetPredicate = isGFX12Plus
} // End is_flat_global = 1
@@ -2662,6 +2688,10 @@ defm GLOBAL_ATOMIC_MAX_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_A
defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">;
defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073, "GLOBAL_ATOMIC_ORDERED_ADD_B64", "global_atomic_ordered_add_b64">;
+defm GLOBAL_INV : VFLAT_Real_Base_gfx12<0x02b, "GLOBAL_INV", "global_inv">;
+defm GLOBAL_WB : VFLAT_Real_Base_gfx12<0x02c, "GLOBAL_WB", "global_wb">;
+defm GLOBAL_WBINV : VFLAT_Real_Base_gfx12<0x04f, "GLOBAL_WBINV", "global_wbinv">;
+
// ENC_VSCRATCH.
defm SCRATCH_LOAD_U8 : VSCRATCH_Real_AllAddr_gfx12<0x10, "SCRATCH_LOAD_UBYTE", "scratch_load_u8", true>;
defm SCRATCH_LOAD_I8 : VSCRATCH_Real_AllAddr_gfx12<0x11, "SCRATCH_LOAD_SBYTE", "scratch_load_i8", true>;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 91a709303269..f6f37f5170a4 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -176,6 +176,7 @@ protected:
bool HasGetWaveIdInst = false;
bool HasSMemTimeInst = false;
bool HasShaderCyclesRegister = false;
+ bool HasShaderCyclesHiLoRegisters = false;
bool HasVOP3Literal = false;
bool HasNoDataDepHazard = false;
bool FlatAddressSpace = false;
@@ -682,6 +683,8 @@ public:
bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
+ bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
+
bool hasUnpackedD16VMem() const {
return HasUnpackedD16VMem;
}
@@ -819,6 +822,10 @@ public:
return HasShaderCyclesRegister;
}
+ bool hasShaderCyclesHiLoRegisters() const {
+ return HasShaderCyclesHiLoRegisters;
+ }
+
bool hasVOP3Literal() const {
return HasVOP3Literal;
}
@@ -1096,7 +1103,7 @@ public:
bool hasDstSelForwardingHazard() const { return GFX940Insts; }
// Cannot use op_sel with v_dot instructions.
- bool hasDOTOpSelHazard() const { return GFX940Insts; }
+ bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
// Does not have HW interlocks for VALU writing and then reading SGPRs.
bool hasVDecCoExecHazard() const {
diff --git a/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td b/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td
deleted file mode 100644
index 4956a1586774..000000000000
--- a/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td
+++ /dev/null
@@ -1,116 +0,0 @@
-//===-- LDSDIRInstructions.td - LDS Direct Instruction Definitions --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// LDSDIR encoding
-//===----------------------------------------------------------------------===//
-
-class LDSDIRe<bits<2> op, bit is_direct> : Enc32 {
- // encoding fields
- bits<2> attrchan;
- bits<6> attr;
- bits<4> waitvdst;
- bits<8> vdst;
-
- // encoding
- let Inst{31-24} = 0xce; // encoding
- let Inst{23-22} = 0x0; // reserved
- let Inst{21-20} = op;
- let Inst{19-16} = waitvdst;
- let Inst{15-10} = !if(is_direct, ?, attr);
- let Inst{9-8} = !if(is_direct, ?, attrchan);
- let Inst{7-0} = vdst;
-}
-
-//===----------------------------------------------------------------------===//
-// LDSDIR Classes
-//===----------------------------------------------------------------------===//
-
-class LDSDIR_getIns<bit direct> {
- dag ret = !if(direct,
- (ins wait_vdst:$waitvdst),
- (ins InterpAttr:$attr, InterpAttrChan:$attrchan, wait_vdst:$waitvdst)
- );
-}
-
-class LDSDIR_Common<string opName, string asm = "", bit direct> : InstSI<
- (outs VGPR_32:$vdst),
- LDSDIR_getIns<direct>.ret,
- asm> {
- let LDSDIR = 1;
- let EXP_CNT = 1;
-
- let hasSideEffects = 0;
- let mayLoad = 1;
- let mayStore = 0;
-
- string Mnemonic = opName;
- let UseNamedOperandTable = 1;
-
- let Uses = [M0, EXEC];
- let DisableWQM = 0;
- let SchedRW = [WriteLDS];
-
- bit is_direct;
- let is_direct = direct;
-}
-
-class LDSDIR_Pseudo<string opName, bit direct> :
- LDSDIR_Common<opName, "", direct>,
- SIMCInstr<opName, SIEncodingFamily.NONE> {
- let isPseudo = 1;
- let isCodeGenOnly = 1;
-}
-
-class LDSDIR_getAsm<bit direct> {
- string ret = !if(direct,
- " $vdst$waitvdst",
- " $vdst, $attr$attrchan$waitvdst"
- );
-}
-
-class LDSDIR_Real<bits<2> op, LDSDIR_Pseudo lds, int subtarget> :
- LDSDIR_Common<lds.Mnemonic,
- lds.Mnemonic # LDSDIR_getAsm<lds.is_direct>.ret,
- lds.is_direct>,
- SIMCInstr <lds.Mnemonic, subtarget>,
- LDSDIRe<op, lds.is_direct> {
- let isPseudo = 0;
- let isCodeGenOnly = 0;
-}
-
-//===----------------------------------------------------------------------===//
-// LDS Direct Instructions
-//===----------------------------------------------------------------------===//
-
-def LDS_DIRECT_LOAD : LDSDIR_Pseudo<"lds_direct_load", 1>;
-def LDS_PARAM_LOAD : LDSDIR_Pseudo<"lds_param_load", 0>;
-
-def : GCNPat <
- (f32 (int_amdgcn_lds_direct_load M0)),
- (LDS_DIRECT_LOAD 0)
->;
-
-def : GCNPat <
- (f32 (int_amdgcn_lds_param_load timm:$attrchan, timm:$attr, M0)),
- (LDS_PARAM_LOAD timm:$attr, timm:$attrchan, 0)
->;
-
-//===----------------------------------------------------------------------===//
-// GFX11+
-//===----------------------------------------------------------------------===//
-
-multiclass LDSDIR_Real_gfx11<bits<2> op, LDSDIR_Pseudo lds = !cast<LDSDIR_Pseudo>(NAME)> {
- def _gfx11 : LDSDIR_Real<op, lds, SIEncodingFamily.GFX11> {
- let AssemblerPredicate = isGFX11Plus;
- let DecoderNamespace = "GFX11";
- }
-}
-
-defm LDS_PARAM_LOAD : LDSDIR_Real_gfx11<0x0>;
-defm LDS_DIRECT_LOAD : LDSDIR_Real_gfx11<0x1>;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index edc244db613d..6c7977e22599 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -460,56 +460,84 @@ void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm,
}
}
-void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- int16_t SImm = static_cast<int16_t>(Imm);
- if (isInlinableIntLiteral(SImm)) {
- O << SImm;
- return;
- }
-
+// This must accept a 32-bit immediate value to correctly handle packed 16-bit
+// operations.
+static bool printImmediateFloat16(uint32_t Imm, const MCSubtargetInfo &STI,
+ raw_ostream &O) {
if (Imm == 0x3C00)
- O<< "1.0";
+ O << "1.0";
else if (Imm == 0xBC00)
- O<< "-1.0";
+ O << "-1.0";
else if (Imm == 0x3800)
- O<< "0.5";
+ O << "0.5";
else if (Imm == 0xB800)
- O<< "-0.5";
+ O << "-0.5";
else if (Imm == 0x4000)
- O<< "2.0";
+ O << "2.0";
else if (Imm == 0xC000)
- O<< "-2.0";
+ O << "-2.0";
else if (Imm == 0x4400)
- O<< "4.0";
+ O << "4.0";
else if (Imm == 0xC400)
- O<< "-4.0";
- else if (Imm == 0x3118 &&
- STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) {
+ O << "-4.0";
+ else if (Imm == 0x3118 && STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))
O << "0.15915494";
- } else {
- uint64_t Imm16 = static_cast<uint16_t>(Imm);
- O << formatHex(Imm16);
- }
-}
+ else
+ return false;
-void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- uint16_t Lo16 = static_cast<uint16_t>(Imm);
- printImmediate16(Lo16, STI, O);
+ return true;
}
-void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
+void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
const MCSubtargetInfo &STI,
raw_ostream &O) {
+ int16_t SImm = static_cast<int16_t>(Imm);
+ if (isInlinableIntLiteral(SImm)) {
+ O << SImm;
+ return;
+ }
+
+ uint16_t HImm = static_cast<uint16_t>(Imm);
+ if (printImmediateFloat16(HImm, STI, O))
+ return;
+
+ uint64_t Imm16 = static_cast<uint16_t>(Imm);
+ O << formatHex(Imm16);
+}
+
+void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
int32_t SImm = static_cast<int32_t>(Imm);
- if (SImm >= -16 && SImm <= 64) {
+ if (isInlinableIntLiteral(SImm)) {
O << SImm;
return;
}
+ switch (OpType) {
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ if (printImmediateFloat32(Imm, STI, O))
+ return;
+ break;
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
+ if (isUInt<16>(Imm) &&
+ printImmediateFloat16(static_cast<uint16_t>(Imm), STI, O))
+ return;
+ break;
+ default:
+ llvm_unreachable("bad operand type");
+ }
+
+ O << formatHex(static_cast<uint64_t>(Imm));
+}
+
+bool AMDGPUInstPrinter::printImmediateFloat32(uint32_t Imm,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
if (Imm == llvm::bit_cast<uint32_t>(0.0f))
O << "0.0";
else if (Imm == llvm::bit_cast<uint32_t>(1.0f))
@@ -532,7 +560,24 @@ void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))
O << "0.15915494";
else
- O << formatHex(static_cast<uint64_t>(Imm));
+ return false;
+
+ return true;
+}
+
+void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ int32_t SImm = static_cast<int32_t>(Imm);
+ if (isInlinableIntLiteral(SImm)) {
+ O << SImm;
+ return;
+ }
+
+ if (printImmediateFloat32(Imm, STI, O))
+ return;
+
+ O << formatHex(static_cast<uint64_t>(Imm));
}
void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
@@ -639,6 +684,20 @@ void AMDGPUInstPrinter::printWaitVDST(const MCInst *MI, unsigned OpNo,
printU4ImmDecOperand(MI, OpNo, O);
}
+void AMDGPUInstPrinter::printWaitVAVDst(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ O << " wait_va_vdst:";
+ printU4ImmDecOperand(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printWaitVMVSrc(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ O << " wait_vm_vsrc:";
+ printU4ImmDecOperand(MI, OpNo, O);
+}
+
void AMDGPUInstPrinter::printWaitEXP(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -741,25 +800,11 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
break;
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
- if (!isUInt<16>(Op.getImm()) &&
- STI.hasFeature(AMDGPU::FeatureVOP3Literal)) {
- printImmediate32(Op.getImm(), STI, O);
- break;
- }
-
- // Deal with 16-bit FP inline immediates not working.
- if (OpTy == AMDGPU::OPERAND_REG_IMM_V2FP16) {
- printImmediate16(static_cast<uint16_t>(Op.getImm()), STI, O);
- break;
- }
- [[fallthrough]];
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
- printImmediateInt16(static_cast<uint16_t>(Op.getImm()), STI, O);
- break;
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
- printImmediateV216(Op.getImm(), STI, O);
+ printImmediateV216(Op.getImm(), OpTy, STI, O);
break;
case MCOI::OPERAND_UNKNOWN:
case MCOI::OPERAND_PCREL:
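The new printImmediateV216 path above picks a textual form by operand type: inlinable integers print in decimal, recognized inline float bit patterns print symbolically, and anything else falls back to a hex literal. A rough standalone sketch of that fallback chain for the packed-f16 case follows; the stub table covers only a few entries and is an assumption, not the printer's actual code.

#include <cstdint>
#include <cstdio>

// Stand-in for a few entries of the f16 inline-constant table (assumption:
// the real table also has -1.0, 2.0, -2.0, 4.0, -4.0, etc.).
static bool tryPrintInlineF16(uint32_t Imm) {
  switch (Imm) {
  case 0x3C00: std::printf("1.0");  return true;
  case 0xBC00: std::printf("-1.0"); return true;
  case 0x3800: std::printf("0.5");  return true;
  default:     return false;
  }
}

// Rough model of the packed-f16 operand path: decimal for inlinable
// integers, symbolic for known f16 patterns in the low half, hex otherwise.
void printPackedF16Imm(uint32_t Imm) {
  int32_t SImm = static_cast<int32_t>(Imm);
  if (SImm >= -16 && SImm <= 64) {
    std::printf("%d", SImm);                  // e.g. 3 -> "3"
    return;
  }
  if (Imm <= 0xFFFF && tryPrintInlineF16(Imm))
    return;                                   // e.g. 0x3800 -> "0.5"
  std::printf("0x%x", Imm);                   // e.g. 0x3C003C00 -> hex literal
}

The packed-i16 operand types go through the fp32 table instead, which is why the real function switches on the operand type.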
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index 95c26de6299e..e3958f88277d 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -88,8 +88,10 @@ private:
raw_ostream &O);
void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI,
- raw_ostream &O);
+ void printImmediateV216(uint32_t Imm, uint8_t OpType,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ bool printImmediateFloat32(uint32_t Imm, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI,
@@ -161,6 +163,10 @@ private:
raw_ostream &O);
void printWaitEXP(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printWaitVAVDst(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printWaitVMVSrc(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printExpSrcN(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O, unsigned N);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index b403d69d9ff1..de1abaf29c56 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -284,22 +284,15 @@ AMDGPUMCCodeEmitter::getLitEncoding(const MCOperand &MO,
// which does not have f16 support?
return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
case AMDGPU::OPERAND_REG_IMM_V2INT16:
- case AMDGPU::OPERAND_REG_IMM_V2FP16: {
- if (!isUInt<16>(Imm) && STI.hasFeature(AMDGPU::FeatureVOP3Literal))
- return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
- if (OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16)
- return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
- [[fallthrough]];
- }
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
- return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI);
+ return AMDGPU::getInlineEncodingV2I16(static_cast<uint32_t>(Imm))
+ .value_or(255);
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
- uint16_t Lo16 = static_cast<uint16_t>(Imm);
- uint32_t Encoding = getLit16Encoding(Lo16, STI);
- return Encoding;
- }
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
+ return AMDGPU::getInlineEncodingV2F16(static_cast<uint32_t>(Imm))
+ .value_or(255);
case AMDGPU::OPERAND_KIMM32:
case AMDGPU::OPERAND_KIMM16:
return MO.getImm();
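The getLitEncoding change above keys the packed 16-bit cases entirely off the new getInlineEncodingV2I16/V2F16 helpers: when the value has a hardware inline encoding it is used directly, otherwise the operand field gets 255 and a 32-bit literal is emitted. A minimal sketch of that pattern follows; the table stub knows only a single entry and is an assumption.

#include <cstdint>
#include <optional>

// Assumed stand-in for AMDGPU::getInlineEncodingV2F16; the real helper
// covers the whole inline-constant table.
static std::optional<uint32_t> getInlineEncodingV2F16Stub(uint32_t Imm) {
  if (Imm == 0)
    return 128u; // source encoding of the inline constant 0
  return std::nullopt;
}

// 255 is the "literal constant" source encoding, i.e. a 32-bit literal
// follows the instruction word.
uint32_t encodePackedF16Src(uint32_t Imm) {
  return getInlineEncodingV2F16Stub(Imm).value_or(255u);
}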
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 9a2fb0bc37b2..674fd04f2fc1 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1651,7 +1651,7 @@ SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[],
BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
for (unsigned i = 0; i < 4; i++) {
- unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
+ unsigned Idx = Swz[i]->getAsZExtVal();
if (SwizzleRemap.contains(Idx))
Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
}
@@ -1659,7 +1659,7 @@ SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[],
SwizzleRemap.clear();
BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
for (unsigned i = 0; i < 4; i++) {
- unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
+ unsigned Idx = Swz[i]->getAsZExtVal();
if (SwizzleRemap.contains(Idx))
Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
}
@@ -1780,7 +1780,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
// Check that we know which element is being inserted
if (!isa<ConstantSDNode>(EltNo))
return SDValue();
- unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+ unsigned Elt = EltNo->getAsZExtVal();
// Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
// be converted to a BUILD_VECTOR). Fill in the Ops vector with the
@@ -2021,7 +2021,7 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
}
case R600::MOV_IMM_GLOBAL_ADDR:
// Check if the Imm slot is used. Taken from below.
- if (cast<ConstantSDNode>(Imm)->getZExtValue())
+ if (Imm->getAsZExtVal())
return false;
Imm = Src.getOperand(0);
Src = DAG.getRegister(R600::ALU_LITERAL_X, MVT::i32);
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 709de612d81d..aa7639a0f186 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -208,9 +208,7 @@ bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
assert(Old.isReg() && Fold.isImm());
if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
- (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)) ||
- isUInt<16>(Fold.ImmToFold) ||
- !AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, ST->hasInv2PiInlineImm()))
+ (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
return false;
unsigned Opcode = MI->getOpcode();
@@ -234,42 +232,123 @@ bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
unsigned Opcode = MI->getOpcode();
int OpNo = MI->getOperandNo(&Old);
+ uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
+
+ // If the literal can be inlined as-is, apply it and short-circuit the
+ // tests below. The main motivation for this is to avoid unintuitive
+ // uses of opsel.
+ if (AMDGPU::isInlinableLiteralV216(Fold.ImmToFold, OpType)) {
+ Old.ChangeToImmediate(Fold.ImmToFold);
+ return true;
+ }
- // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
- // already set.
+ // Refer to op_sel/op_sel_hi and check if we can change the immediate and
+ // op_sel in a way that allows an inline constant.
int ModIdx = -1;
- if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
+ unsigned SrcIdx = ~0;
+ if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
ModIdx = AMDGPU::OpName::src0_modifiers;
- else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
+ SrcIdx = 0;
+ } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
ModIdx = AMDGPU::OpName::src1_modifiers;
- else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
+ SrcIdx = 1;
+ } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
ModIdx = AMDGPU::OpName::src2_modifiers;
+ SrcIdx = 2;
+ }
assert(ModIdx != -1);
ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
MachineOperand &Mod = MI->getOperand(ModIdx);
- unsigned Val = Mod.getImm();
- if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
+ unsigned ModVal = Mod.getImm();
+
+ uint16_t ImmLo = static_cast<uint16_t>(
+ Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
+ uint16_t ImmHi = static_cast<uint16_t>(
+ Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
+ uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
+ unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
+
+ // Helper function that attempts to inline the given value with a newly
+ // chosen opsel pattern.
+ auto tryFoldToInline = [&](uint32_t Imm) -> bool {
+ if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
+ Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
+ Old.ChangeToImmediate(Imm);
+ return true;
+ }
+
+ // Try to shuffle the halves around and leverage opsel to get an inline
+ // constant.
+ uint16_t Lo = static_cast<uint16_t>(Imm);
+ uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
+ if (Lo == Hi) {
+ if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
+ Mod.setImm(NewModVal);
+ Old.ChangeToImmediate(Lo);
+ return true;
+ }
+
+ if (static_cast<int16_t>(Lo) < 0) {
+ int32_t SExt = static_cast<int16_t>(Lo);
+ if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
+ Mod.setImm(NewModVal);
+ Old.ChangeToImmediate(SExt);
+ return true;
+ }
+ }
+
+ // This check is only useful for integer instructions
+ if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16 ||
+ OpType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16) {
+ if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
+ Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
+ Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
+ return true;
+ }
+ }
+ } else {
+ uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
+ if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
+ Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
+ Old.ChangeToImmediate(Swapped);
+ return true;
+ }
+ }
+
return false;
+ };
- // Only apply the following transformation if that operand requires
- // a packed immediate.
- // If upper part is all zero we do not need op_sel_hi.
- if (!(Fold.ImmToFold & 0xffff)) {
- MachineOperand New =
- MachineOperand::CreateImm((Fold.ImmToFold >> 16) & 0xffff);
- if (!TII->isOperandLegal(*MI, OpNo, &New))
- return false;
- Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
- Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
- Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
+ if (tryFoldToInline(Imm))
return true;
+
+ // Replace integer addition by subtraction and vice versa if it allows
+ // folding the immediate to an inline constant.
+ //
+ // We should only ever get here for SrcIdx == 1 due to canonicalization
+ // earlier in the pipeline, but we double-check here to be safe / fully
+ // general.
+ bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
+ bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
+ if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
+ unsigned ClampIdx =
+ AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
+ bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;
+
+ if (!Clamp) {
+ uint16_t NegLo = -static_cast<uint16_t>(Imm);
+ uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
+ uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;
+
+ if (tryFoldToInline(NegImm)) {
+ unsigned NegOpcode =
+ IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
+ MI->setDesc(TII->get(NegOpcode));
+ return true;
+ }
+ }
}
- MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold & 0xffff);
- if (!TII->isOperandLegal(*MI, OpNo, &New))
- return false;
- Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
- Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
- return true;
+
+ return false;
}
bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
@@ -277,8 +356,19 @@ bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
assert(Old.isReg());
- if (Fold.isImm() && canUseImmWithOpSel(Fold))
- return tryFoldImmWithOpSel(Fold);
+ if (Fold.isImm() && canUseImmWithOpSel(Fold)) {
+ if (tryFoldImmWithOpSel(Fold))
+ return true;
+
+ // We can't represent the candidate as an inline constant. Try as a literal
+ // with the original opsel, checking constant bus limitations.
+ MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold);
+ int OpNo = MI->getOperandNo(&Old);
+ if (!TII->isOperandLegal(*MI, OpNo, &New))
+ return false;
+ Old.ChangeToImmediate(Fold.ImmToFold);
+ return true;
+ }
if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
MachineBasicBlock *MBB = MI->getParent();
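The reworked tryFoldImmWithOpSel above first reconstructs the 32-bit value the instruction actually reads under the current op_sel bits, then searches for an equivalent (immediate, op_sel) pair that happens to be an inline constant, for example by replicating one half or swapping the halves. A simplified standalone sketch of the half-swapping idea follows; isInlinable only models the integer inline range and is an assumption.

#include <cstdint>

// Simplified stand-in for AMDGPU::isInlinableLiteralV216: only the
// -16..64 integer inline range is modeled here.
static bool isInlinable(uint32_t V) {
  int32_t S = static_cast<int32_t>(V);
  return S >= -16 && S <= 64;
}

// Try to express Imm as an inline constant, possibly by letting op_sel
// read the two 16-bit halves swapped. Returns true on success and reports
// whether the swap is needed.
static bool tryPackImm(uint32_t Imm, uint32_t &NewImm, bool &SwapHalves) {
  if (isInlinable(Imm)) {
    NewImm = Imm;
    SwapHalves = false;
    return true;
  }
  uint16_t Lo = static_cast<uint16_t>(Imm);
  uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
  if (Lo != Hi) {
    uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
    if (isInlinable(Swapped)) {
      NewImm = Swapped;     // instruction reads the halves through op_sel
      SwapHalves = true;
      return true;
    }
  }
  return false;
}

For instance, 0x00030000 is not inlinable as-is, but its swapped form 0x3 is, so the fold can keep the immediate 3 and let op_sel select the other half.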
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0e857e6ac71b..6ddc7e864fb2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -151,22 +151,29 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->useRealTrue16Insts()) {
addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
+ addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
} else {
addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
+ addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
}
// Unless there are also VOP3P operations, no operations are really legal.
addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
+ addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
+ addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
+ addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
+ addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
+ addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
}
addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
@@ -196,6 +203,41 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
MVT::i1, MVT::v32i32},
Custom);
+ if (isTypeLegal(MVT::bf16)) {
+ for (unsigned Opc :
+ {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV,
+ ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
+ ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
+ ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
+ ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
+ ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
+ ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
+ ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
+ ISD::SETCC}) {
+ // FIXME: The promoted to type shouldn't need to be explicit
+ setOperationAction(Opc, MVT::bf16, Promote);
+ AddPromotedToType(Opc, MVT::bf16, MVT::f32);
+ }
+
+ setOperationAction(ISD::FP_ROUND, MVT::bf16, Expand);
+
+ setOperationAction(ISD::SELECT, MVT::bf16, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
+
+ // TODO: Could make these legal
+ setOperationAction(ISD::FABS, MVT::bf16, Expand);
+ setOperationAction(ISD::FNEG, MVT::bf16, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand);
+
+ // We only need to custom lower because we can't specify an action for bf16
+ // sources.
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Promote);
+ AddPromotedToType(ISD::BUILD_VECTOR, MVT::v2bf16, MVT::v2i16);
+ }
+
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
@@ -271,13 +313,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
for (MVT VT :
- {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
- MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
- MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
- MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
- MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v8i16,
- MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64,
- MVT::v32i32, MVT::v32f32, MVT::v32i16, MVT::v32f16}) {
+ {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
+ MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
+ MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
+ MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
+ MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
+ MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
+ MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
+ MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -383,13 +426,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
{MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
Expand);
- setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16}, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
+ Custom);
// Avoid stack access for these.
// TODO: Generalize to more vector types.
setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
- {MVT::v2i16, MVT::v2f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
- MVT::v4i16, MVT::v4f16},
+ {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
+ MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
Custom);
// Deal with vec3 vector operations when widened to vec4.
@@ -498,6 +542,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
+ // Custom lower these because we can't specify a rule based on an illegal
+ // source bf16.
+ setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
+ setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
+
if (Subtarget->has16BitInsts()) {
setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
@@ -524,9 +573,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom);
+ setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
+ setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
+
+ setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i32, Custom);
// F16 - Constant Actions.
setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
// F16 - Load/Store Actions.
setOperationAction(ISD::LOAD, MVT::f16, Promote);
@@ -534,16 +588,23 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::f16, Promote);
AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
+ // BF16 - Load/Store Actions.
+ setOperationAction(ISD::LOAD, MVT::bf16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
+ setOperationAction(ISD::STORE, MVT::bf16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
+
// F16 - VOP1 Actions.
setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND},
MVT::f16, Custom);
- setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
+ setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote);
// F16 - VOP2 Actions.
- setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, MVT::f16, Expand);
+ setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
+ Expand);
setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
setOperationAction(ISD::FFREXP, MVT::f16, Custom);
setOperationAction(ISD::FDIV, MVT::f16, Custom);
@@ -554,8 +615,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMAD, MVT::f16, Legal);
for (MVT VT :
- {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16,
- MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v32i16, MVT::v32f16}) {
+ {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
+ MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
+ MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -587,7 +649,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// XXX - Do these do anything? Vector constants turn into build_vector.
setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
- setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16}, Legal);
+ setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
+ Legal);
setOperationAction(ISD::STORE, MVT::v2i16, Promote);
AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
@@ -610,16 +673,22 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
+ setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
setOperationAction(ISD::STORE, MVT::v4i16, Promote);
AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
setOperationAction(ISD::STORE, MVT::v4f16, Promote);
AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
+ setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
+ setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
setOperationAction(ISD::STORE, MVT::v4i16, Promote);
AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
@@ -630,26 +699,36 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
setOperationAction(ISD::STORE, MVT::v8f16, Promote);
AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
+ setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
+ setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
setOperationAction(ISD::STORE, MVT::v16i16, Promote);
AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
setOperationAction(ISD::STORE, MVT::v16f16, Promote);
AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
+ setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
+ setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
setOperationAction(ISD::STORE, MVT::v32i16, Promote);
AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
setOperationAction(ISD::STORE, MVT::v32f16, Promote);
AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
+ setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
MVT::v2i32, Expand);
@@ -662,7 +741,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
MVT::v8i32, Expand);
if (!Subtarget->hasVOP3PInsts())
- setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16}, Custom);
+ setOperationAction(ISD::BUILD_VECTOR,
+ {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
// This isn't really legal, but this avoids the legalizer unrolling it (and
@@ -680,8 +760,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
{MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
Expand);
- for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16,
- MVT::v32i16, MVT::v32f16}) {
+ for (MVT Vec16 :
+ {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
+ MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
setOperationAction(
{ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
Vec16, Custom);
@@ -699,7 +780,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
MVT::v2f16, Legal);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16},
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
Custom);
setOperationAction(ISD::VECTOR_SHUFFLE,
@@ -724,7 +805,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
Custom);
setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
- setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16}, Custom);
+ setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
+ Custom);
if (Subtarget->hasPackedFP32Ops()) {
setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
@@ -750,13 +832,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
setOperationAction(ISD::SELECT,
- {MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
- MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16,
- MVT::v32i16, MVT::v32f16},
+ {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
+ MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
+ MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
+ MVT::v32f16, MVT::v32bf16},
Custom);
setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
+ if (Subtarget->hasScalarSMulU64())
+ setOperationAction(ISD::MUL, MVT::i64, Custom);
+
if (Subtarget->hasMad64_32())
setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
@@ -3902,6 +3988,26 @@ SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
return Op;
}
+// Work around DAG legality rules only based on the result type.
+SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
+ bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
+ SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
+ EVT SrcVT = Src.getValueType();
+
+ if (SrcVT.getScalarType() != MVT::bf16)
+ return Op;
+
+ SDLoc SL(Op);
+ SDValue BitCast =
+ DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
+
+ EVT DstVT = Op.getValueType();
+ if (IsStrict)
+ llvm_unreachable("Need STRICT_BF16_TO_FP");
+
+ return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
+}
+
Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
Register Reg = StringSwitch<Register>(RegName)
@@ -4825,6 +4931,48 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MI.eraseFromParent();
return BB;
}
+ case AMDGPU::GET_SHADERCYCLESHILO: {
+ assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+ // The algorithm is:
+ //
+ // hi1 = getreg(SHADER_CYCLES_HI)
+ // lo1 = getreg(SHADER_CYCLES_LO)
+ // hi2 = getreg(SHADER_CYCLES_HI)
+ //
+ // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
+ // Otherwise there was overflow and the result is hi2:0. In both cases the
+ // result should represent the actual time at some point during the sequence
+ // of three getregs.
+ Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
+ .addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI,
+ 0, 32));
+ Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
+ .addImm(
+ AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES, 0, 32));
+ Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
+ .addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI,
+ 0, 32));
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
+ .addReg(RegHi1)
+ .addReg(RegHi2);
+ Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
+ .addReg(RegLo1)
+ .addImm(0);
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
+ .add(MI.getOperand(0))
+ .addReg(RegLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(RegHi2)
+ .addImm(AMDGPU::sub1);
+ MI.eraseFromParent();
+ return BB;
+ }
case AMDGPU::SI_INDIRECT_SRC_V1:
case AMDGPU::SI_INDIRECT_SRC_V2:
case AMDGPU::SI_INDIRECT_SRC_V4:
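The hi/lo/hi sequence documented in the GET_SHADERCYCLESHILO expansion above is a standard way to read a 64-bit counter through two 32-bit halves without tearing. The same logic in plain C++, as an illustration only; readHi/readLo stand in for the two S_GETREG_B32 reads of SHADER_CYCLES_HI and SHADER_CYCLES.

#include <cstdint>

// ReadHi/ReadLo are callables standing in for the hardware register reads.
template <typename ReadHi, typename ReadLo>
uint64_t readShaderCycles64(ReadHi readHi, ReadLo readLo) {
  uint32_t Hi1 = readHi();
  uint32_t Lo  = readLo();
  uint32_t Hi2 = readHi();
  // If the high half did not change, Hi2:Lo is a consistent sample.
  // Otherwise the low half wrapped between the reads, and Hi2:0 is still a
  // value the counter held at some point during the three reads.
  if (Hi1 != Hi2)
    Lo = 0;
  return (static_cast<uint64_t>(Hi2) << 32) | Lo;
}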
@@ -5305,7 +5453,9 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
- VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16);
+ VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
+ VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
+ VT == MVT::v32bf16);
SDValue Lo0, Hi0;
SDValue Op0 = Op.getOperand(0);
@@ -5424,7 +5574,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRL:
case ISD::ADD:
case ISD::SUB:
- case ISD::MUL:
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
@@ -5438,6 +5587,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SADDSAT:
case ISD::SSUBSAT:
return splitBinaryVectorOp(Op, DAG);
+ case ISD::MUL:
+ return lowerMUL(Op, DAG);
case ISD::SMULO:
case ISD::UMULO:
return lowerXMULO(Op, DAG);
@@ -5452,6 +5603,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerGET_ROUNDING(Op, DAG);
case ISD::PREFETCH:
return lowerPREFETCH(Op, DAG);
+ case ISD::FP_EXTEND:
+ case ISD::STRICT_FP_EXTEND:
+ return lowerFP_EXTEND(Op, DAG);
}
return SDValue();
}
@@ -6090,6 +6244,66 @@ SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
}
+// Custom lowering for vector multiplications and s_mul_u64.
+SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ // Split vector operands.
+ if (VT.isVector())
+ return splitBinaryVectorOp(Op, DAG);
+
+ assert(VT == MVT::i64 && "The following code is a special lowering for s_mul_u64");
+
+ // There are four ways to lower s_mul_u64:
+ //
+ // 1. If all the operands are uniform, then we lower it as it is.
+ //
+ // 2. If the operands are divergent, then we have to split s_mul_u64 into
+ // 32-bit multiplications because there is no vector equivalent of s_mul_u64.
+ //
+ // 3. If the cost model decides that it is more efficient to use vector
+ // registers, then we have to split s_mul_u64 into 32-bit multiplications.
+ // This happens in splitScalarSMULU64() in SIInstrInfo.cpp.
+ //
+ // 4. If the cost model decides to use vector registers and both of the
+ // operands are zero-extended/sign-extended from 32 bits, then we split the
+ // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
+ // possible to check whether the operands are zero-extended or sign-extended
+ // in SIInstrInfo.cpp. For this reason, here we replace s_mul_u64 with
+ // s_mul_u64_u32_pseudo if both operands are zero-extended and with
+ // s_mul_i64_i32_pseudo if both operands are sign-extended.
+ // If the cost model decides that we have to use vector registers, then
+ // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32_pseudo/
+ // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
+ // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
+ // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
+ // SIInstrInfo.cpp.
+
+ if (Op->isDivergent())
+ return SDValue();
+
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
+ // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
+ // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
+ KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
+ unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
+ KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
+ unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
+ SDLoc SL(Op);
+ if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
+ return SDValue(
+ DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
+ unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
+ unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
+ if (Op0SignBits >= 33 && Op1SignBits >= 33)
+ return SDValue(
+ DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
+ // If all the operands are uniform, then we lower s_mul_u64 as it is.
+ return Op;
+}
+
SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc SL(Op);
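lowerMUL above keeps s_mul_u64 only when it stays uniform and otherwise uses known-bits and sign-bit queries to pick one of the narrower pseudos. The selection boils down to a small classification, sketched here as a standalone helper for illustration; the thresholds mirror the checks in the code above.

enum class MulLowering { KeepU64, MulU64_U32, MulI64_I32 };

// With >= 32 known leading zeros on both operands, an unsigned 32x32->64
// multiply of the low halves already produces the full 64-bit product;
// with >= 33 sign bits on both, the signed 32x32->64 variant does.
MulLowering classifyMul(unsigned LeadingZeros0, unsigned LeadingZeros1,
                        unsigned SignBits0, unsigned SignBits1) {
  if (LeadingZeros0 >= 32 && LeadingZeros1 >= 32)
    return MulLowering::MulU64_U32;
  if (SignBits0 >= 33 && SignBits1 >= 33)
    return MulLowering::MulI64_I32;
  return MulLowering::KeepU64;
}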
@@ -6424,7 +6638,7 @@ SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
EVT InsVT = Ins.getValueType();
EVT EltVT = VecVT.getVectorElementType();
unsigned InsNumElts = InsVT.getVectorNumElements();
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ unsigned IdxVal = Idx->getAsZExtVal();
SDLoc SL(Op);
if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
@@ -6639,7 +6853,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
- if (ResultVT == MVT::f16) {
+ if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
}
@@ -6725,8 +6939,8 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
SDLoc SL(Op);
EVT VT = Op.getValueType();
- if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8i16 || VT == MVT::v8f16) {
+ if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
VT.getVectorNumElements() / 2);
MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits());
@@ -6749,7 +6963,7 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
}
- if (VT == MVT::v16i16 || VT == MVT::v16f16) {
+ if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {
EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
VT.getVectorNumElements() / 4);
MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
@@ -6770,7 +6984,7 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
}
- if (VT == MVT::v32i16 || VT == MVT::v32f16) {
+ if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {
EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
VT.getVectorNumElements() / 8);
MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
@@ -6791,7 +7005,7 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
}
- assert(VT == MVT::v2f16 || VT == MVT::v2i16);
+ assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);
assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
SDValue Lo = Op.getOperand(0);
@@ -6890,6 +7104,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
// Adjust alignment for that dynamic shared memory array.
Function &F = DAG.getMachineFunction().getFunction();
MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
+ MFI->setUsesDynamicLDS(true);
return SDValue(
DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
}
@@ -7453,7 +7668,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
Ops.push_back(IsA16 ? True : False);
if (!Subtarget->hasGFX90AInsts()) {
Ops.push_back(TFE); //tfe
- } else if (cast<ConstantSDNode>(TFE)->getZExtValue()) {
+ } else if (TFE->getAsZExtVal()) {
report_fatal_error("TFE is not supported on this GPU");
}
if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
@@ -7590,7 +7805,7 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
setBufferOffsets(Offset, DAG, &Ops[3],
NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
- uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
+ uint64_t InstOffset = Ops[5]->getAsZExtVal();
for (unsigned i = 0; i < NumLoads; ++i) {
Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
@@ -14052,11 +14267,11 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
EVT VT = N->getValueType(0);
// v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
- if (VT == MVT::v2i16 || VT == MVT::v2f16) {
+ if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
SDLoc SL(N);
SDValue Src = N->getOperand(0);
EVT EltVT = Src.getValueType();
- if (EltVT == MVT::f16)
+ if (EltVT != MVT::i16)
Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 5bc091d6e84d..92b38ebade62 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -146,6 +146,7 @@ private:
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
@@ -417,6 +418,7 @@ public:
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 55ddb540c51e..1cb1d32707f2 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1424,6 +1424,12 @@ bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
});
}
+static bool isCacheInvOrWBInst(MachineInstr &Inst) {
+ auto Opc = Inst.getOpcode();
+ return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
+ Opc == AMDGPU::GLOBAL_WBINV;
+}
+
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
WaitcntBrackets *ScoreBrackets) {
// Now look at the instruction opcode. If it is a memory access
@@ -1439,6 +1445,10 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
}
} else if (TII->isFLAT(Inst)) {
+ // TODO: Track this properly.
+ if (isCacheInvOrWBInst(Inst))
+ return;
+
assert(Inst.mayLoadOrStore());
int FlatASCount = 0;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 585a3eb78618..1b66d163714f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -91,7 +91,7 @@ class InstSI <dag outs, dag ins, string asm = "",
field bit VOP3_OPSEL = 0;
// Is it possible for this instruction to be atomic?
- field bit maybeAtomic = 0;
+ field bit maybeAtomic = 1;
// This bit indicates that this is a VI instruction which is renamed
// in GFX9. Required for correct mapping from pseudo to MC.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 396d22c7ec18..fee900b3efb2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -338,8 +338,8 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
return false;
- Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
- Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
+ Offset0 = Off0->getAsZExtVal();
+ Offset1 = Off1->getAsZExtVal();
return true;
}
@@ -2475,6 +2475,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
+
+ case AMDGPU::S_MUL_U64_U32_PSEUDO:
+ case AMDGPU::S_MUL_I64_I32_PSEUDO:
+ MI.setDesc(get(AMDGPU::S_MUL_U64));
+ break;
}
return true;
}
@@ -4153,15 +4158,15 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
- return (isInt<16>(Imm) || isUInt<16>(Imm)) &&
- AMDGPU::isInlinableIntLiteral((int16_t)Imm);
+ return AMDGPU::isInlinableLiteralV2I16(Imm);
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
+ return AMDGPU::isInlinableLiteralV2F16(Imm);
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
- case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
- case AMDGPU::OPERAND_REG_IMM_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
if (isInt<16>(Imm) || isUInt<16>(Imm)) {
// A few special case instructions have 16-bit operands on subtargets
// where 16-bit instructions are not legal.
@@ -6845,6 +6850,21 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
// Default handling
break;
}
+
+ case AMDGPU::S_MUL_U64:
+ // Split s_mul_u64 into 32-bit vector multiplications.
+ splitScalarSMulU64(Worklist, Inst, MDT);
+ Inst.eraseFromParent();
+ return;
+
+ case AMDGPU::S_MUL_U64_U32_PSEUDO:
+ case AMDGPU::S_MUL_I64_I32_PSEUDO:
+ // This is a special case of s_mul_u64 where all the operands are either
+ // zero extended or sign extended.
+ splitScalarSMulPseudo(Worklist, Inst, MDT);
+ Inst.eraseFromParent();
+ return;
+
case AMDGPU::S_AND_B64:
splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
Inst.eraseFromParent();
@@ -7654,6 +7674,180 @@ void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
+// There is no vector equivalent of s_mul_u64. For this reason, we need to
+// split the s_mul_u64 into 32-bit vector multiplications.
+void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
+ MachineInstr &Inst,
+ MachineDominatorTree *MDT) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+ const DebugLoc &DL = Inst.getDebugLoc();
+ MachineBasicBlock::iterator MII = Inst;
+
+ const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
+ const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
+ const TargetRegisterClass *Src0SubRC =
+ RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
+ if (RI.isSGPRClass(Src0SubRC))
+ Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
+ const TargetRegisterClass *Src1SubRC =
+ RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
+ if (RI.isSGPRClass(Src1SubRC))
+ Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
+
+ // First, we extract the low 32-bit and high 32-bit values from each of the
+ // operands.
+ MachineOperand Op0L =
+ buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+ MachineOperand Op1L =
+ buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
+ MachineOperand Op0H =
+ buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
+ MachineOperand Op1H =
+ buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
+
+ // The multiplication is done as follows:
+ //
+ //                            Op1H  Op1L
+ //                          * Op0H  Op0L
+ //                    --------------------
+ //                 Op1H*Op0L  Op1L*Op0L
+ //    + Op1H*Op0H  Op1L*Op0H
+ // -------------------------------------------
+ // (Op1H*Op0L + Op1L*Op0H + carry)  Op1L*Op0L
+ //
+ // We drop Op1H*Op0H because its contribution starts at bit 64 and therefore
+ // cannot affect the low 64 bits of the result.
+ // The low 32-bit value is Op1L*Op0L.
+ // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
+
+ Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MachineInstr *Op1L_Op0H =
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
+ .add(Op1L)
+ .add(Op0H);
+
+ Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MachineInstr *Op1H_Op0L =
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
+ .add(Op1H)
+ .add(Op0L);
+
+ Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MachineInstr *Carry =
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
+ .add(Op1L)
+ .add(Op0L);
+
+ MachineInstr *LoHalf =
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
+ .add(Op1L)
+ .add(Op0L);
+
+ Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
+ .addReg(Op1L_Op0H_Reg)
+ .addReg(Op1H_Op0L_Reg);
+
+ MachineInstr *HiHalf =
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
+ .addReg(AddReg)
+ .addReg(CarryReg);
+
+ BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+
+ MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+ // Try to legalize the operands in case we need to swap the order to keep it
+ // valid.
+ legalizeOperands(*Op1L_Op0H, MDT);
+ legalizeOperands(*Op1H_Op0L, MDT);
+ legalizeOperands(*Carry, MDT);
+ legalizeOperands(*LoHalf, MDT);
+ legalizeOperands(*Add, MDT);
+ legalizeOperands(*HiHalf, MDT);
+
+ // Move all users of this moved value.
+ addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
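A minimal host-side check (not part of the patch) of the decomposition described in the comment above; it rebuilds a 64-bit product from 32-bit partial products the same way the emitted V_MUL_LO_U32 / V_MUL_HI_U32 / V_ADD_U32 sequence does, dropping the high*high term because it only affects bits 64 and above:

#include <cassert>
#include <cstdint>

uint64_t mul64ViaHalves(uint64_t A, uint64_t B) {
  uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
  uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);
  uint64_t LoLo = uint64_t(ALo) * BLo;  // low word: V_MUL_LO_U32; carry: V_MUL_HI_U32
  uint32_t Hi = ALo * BHi               // V_MUL_LO_U32
              + AHi * BLo               // V_MUL_LO_U32
              + uint32_t(LoLo >> 32);   // plus carry, via the two V_ADD_U32_e32
  return (uint64_t(Hi) << 32) | uint32_t(LoLo);
}

int main() {
  uint64_t A = 0x123456789abcdef0ULL, B = 0x0fedcba987654321ULL;
  assert(mul64ViaHalves(A, B) == A * B); // identical modulo 2^64
  return 0;
}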
+// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
+// multiplications.
+void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
+ MachineInstr &Inst,
+ MachineDominatorTree *MDT) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+ const DebugLoc &DL = Inst.getDebugLoc();
+ MachineBasicBlock::iterator MII = Inst;
+
+ const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
+ const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
+ const TargetRegisterClass *Src0SubRC =
+ RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
+ if (RI.isSGPRClass(Src0SubRC))
+ Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
+ const TargetRegisterClass *Src1SubRC =
+ RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
+ if (RI.isSGPRClass(Src1SubRC))
+ Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
+
+ // First, we extract the low 32-bit and high 32-bit values from each of the
+ // operands.
+ MachineOperand Op0L =
+ buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+ MachineOperand Op1L =
+ buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
+
+ unsigned Opc = Inst.getOpcode();
+ unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
+ ? AMDGPU::V_MUL_HI_U32_e64
+ : AMDGPU::V_MUL_HI_I32_e64;
+ MachineInstr *HiHalf =
+ BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
+
+ MachineInstr *LoHalf =
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
+ .add(Op1L)
+ .add(Op0L);
+
+ BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+
+ MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+ // Try to legalize the operands in case we need to swap the order to keep it
+ // valid.
+ legalizeOperands(*HiHalf, MDT);
+ legalizeOperands(*LoHalf, MDT);
+
+ // Move all users of this moved value.
+ addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
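A small sketch (host-side illustration, not part of the patch) of why these pseudos need only two 32-bit multiplies: when both 64-bit operands are just sign-extended (respectively zero-extended) 32-bit values, V_MUL_LO_U32 of the low halves yields the low word of the product and V_MUL_HI_I32 (respectively V_MUL_HI_U32) yields the high word:

#include <cassert>
#include <cstdint>

int main() {
  int32_t A = -123456, B = 789012;          // low halves of sign-extended operands
  uint32_t Lo = uint32_t(A) * uint32_t(B);  // V_MUL_LO_U32 on the low halves
  uint32_t Hi =                             // V_MUL_HI_I32 on the low halves
      uint32_t(uint64_t(int64_t(A) * int64_t(B)) >> 32);
  uint64_t Product = (uint64_t(Hi) << 32) | Lo;
  assert(Product == uint64_t(int64_t(A) * int64_t(B)));
  return 0;
}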
void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
MachineInstr &Inst, unsigned Opcode,
MachineDominatorTree *MDT) const {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 46eee6fae0a5..37ee159362a2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -138,6 +138,12 @@ private:
unsigned Opcode,
MachineDominatorTree *MDT = nullptr) const;
+ void splitScalarSMulU64(SIInstrWorklist &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT) const;
+
+ void splitScalarSMulPseudo(SIInstrWorklist &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT) const;
+
void splitScalar64BitXnor(SIInstrWorklist &Worklist, MachineInstr &Inst,
MachineDominatorTree *MDT = nullptr) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 173c877b8d29..f07b8fa0ea4c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -860,23 +860,6 @@ def ShiftAmt32Imm : ImmLeaf <i32, [{
return Imm < 32;
}]>;
-def getNegV2I16Imm : SDNodeXForm<build_vector, [{
- return SDValue(packNegConstantV2I16(N, *CurDAG), 0);
-}]>;
-
-def NegSubInlineConstV216 : PatLeaf<(build_vector), [{
- assert(N->getNumOperands() == 2);
- assert(N->getOperand(0).getValueType().getSizeInBits() == 16);
- SDValue Src0 = N->getOperand(0);
- SDValue Src1 = N->getOperand(1);
- if (Src0 == Src1)
- return isNegInlineImmediate(Src0.getNode());
-
- return (isNullConstantOrUndef(Src0) && isNegInlineImmediate(Src1.getNode())) ||
- (isNullConstantOrUndef(Src1) && isNegInlineImmediate(Src0.getNode()));
-}], getNegV2I16Imm>;
-
-
def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{
return fp16SrcZerosHighBits(N->getOpcode());
}]>;
@@ -1144,6 +1127,8 @@ def exp_tgt : CustomOperand<i32, 0, "ExpTgt">;
def wait_vdst : NamedIntOperand<i8, "wait_vdst", "WaitVDST">;
def wait_exp : NamedIntOperand<i8, "wait_exp", "WaitEXP">;
+def wait_va_vdst : NamedIntOperand<i8, "wait_va_vdst", "WaitVAVDst">;
+def wait_va_vsrc : NamedIntOperand<i8, "wait_vm_vsrc", "WaitVMVSrc">;
class KImmFPOperand<ValueType vt> : ImmOperand<vt> {
let OperandNamespace = "AMDGPU";
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 8310c6b57dad..b4bd46d33c1f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -30,7 +30,7 @@ include "SMInstructions.td"
include "FLATInstructions.td"
include "BUFInstructions.td"
include "EXPInstructions.td"
-include "LDSDIRInstructions.td"
+include "DSDIRInstructions.td"
include "VINTERPInstructions.td"
//===----------------------------------------------------------------------===//
@@ -111,7 +111,6 @@ def ATOMIC_FENCE : SPseudoInstSI<
[(atomic_fence (i32 timm:$ordering), (i32 timm:$scope))],
"ATOMIC_FENCE $ordering, $scope"> {
let hasSideEffects = 1;
- let maybeAtomic = 1;
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
@@ -316,6 +315,12 @@ def S_USUBO_PSEUDO : SPseudoInstSI <
(outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
>;
+let OtherPredicates = [HasShaderCyclesHiLoRegisters] in
+def GET_SHADERCYCLESHILO : SPseudoInstSI<
+ (outs SReg_64:$sdst), (ins),
+ [(set SReg_64:$sdst, (i64 (readcyclecounter)))]
+>;
+
} // End usesCustomInserter = 1, Defs = [SCC]
let usesCustomInserter = 1 in {
@@ -557,6 +562,7 @@ def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
let hasNoSchedulingInfo = 1;
let FixedSize = 1;
let isMeta = 1;
+ let maybeAtomic = 0;
}
// Used as an isel pseudo to directly emit initialization with an
@@ -1097,7 +1103,7 @@ def : Pat <
multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16_inst_e64> {
// f16_to_fp patterns
def : GCNPat <
- (f32 (f16_to_fp i32:$src0)),
+ (f32 (any_f16_to_fp i32:$src0)),
(cvt_f32_f16_inst_e64 SRCMODS.NONE, $src0)
>;
@@ -1122,7 +1128,7 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16
>;
def : GCNPat <
- (f64 (fpextend f16:$src)),
+ (f64 (any_fpextend f16:$src)),
(V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src))
>;
@@ -1151,6 +1157,13 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16
(f16 (uint_to_fp i32:$src)),
(cvt_f16_f32_inst_e64 SRCMODS.NONE, (V_CVT_F32_U32_e32 VSrc_b32:$src))
>;
+
+ // This is only used on targets without half support
+ // TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering
+ def : GCNPat <
+ (i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
+ >;
}
let SubtargetPredicate = NotHasTrue16BitInsts in
@@ -1515,6 +1528,23 @@ def : BitConvert <v2f16, f32, SReg_32>;
def : BitConvert <f32, v2f16, SReg_32>;
def : BitConvert <v2i16, f32, SReg_32>;
def : BitConvert <f32, v2i16, SReg_32>;
+def : BitConvert <v2bf16, i32, SReg_32>;
+def : BitConvert <i32, v2bf16, SReg_32>;
+def : BitConvert <v2bf16, i32, VGPR_32>;
+def : BitConvert <i32, v2bf16, VGPR_32>;
+def : BitConvert <v2bf16, v2i16, SReg_32>;
+def : BitConvert <v2i16, v2bf16, SReg_32>;
+def : BitConvert <v2bf16, v2i16, VGPR_32>;
+def : BitConvert <v2i16, v2bf16, VGPR_32>;
+def : BitConvert <v2bf16, v2f16, SReg_32>;
+def : BitConvert <v2f16, v2bf16, SReg_32>;
+def : BitConvert <v2bf16, v2f16, VGPR_32>;
+def : BitConvert <v2f16, v2bf16, VGPR_32>;
+def : BitConvert <f32, v2bf16, VGPR_32>;
+def : BitConvert <v2bf16, f32, VGPR_32>;
+def : BitConvert <f32, v2bf16, SReg_32>;
+def : BitConvert <v2bf16, f32, SReg_32>;
+
// 64-bit bitcast
def : BitConvert <i64, f64, VReg_64>;
@@ -1531,6 +1561,19 @@ def : BitConvert <f64, v2i32, VReg_64>;
def : BitConvert <v2i32, f64, VReg_64>;
def : BitConvert <v4i16, v4f16, VReg_64>;
def : BitConvert <v4f16, v4i16, VReg_64>;
+def : BitConvert <v4bf16, v2i32, VReg_64>;
+def : BitConvert <v2i32, v4bf16, VReg_64>;
+def : BitConvert <v4bf16, i64, VReg_64>;
+def : BitConvert <i64, v4bf16, VReg_64>;
+def : BitConvert <v4bf16, v4i16, VReg_64>;
+def : BitConvert <v4i16, v4bf16, VReg_64>;
+def : BitConvert <v4bf16, v4f16, VReg_64>;
+def : BitConvert <v4f16, v4bf16, VReg_64>;
+def : BitConvert <v4bf16, v2f32, VReg_64>;
+def : BitConvert <v2f32, v4bf16, VReg_64>;
+def : BitConvert <v4bf16, f64, VReg_64>;
+def : BitConvert <f64, v4bf16, VReg_64>;
+
// FIXME: Make SGPR
def : BitConvert <v2i32, v4f16, VReg_64>;
@@ -1590,6 +1633,37 @@ def : BitConvert <v2f64, v8i16, SReg_128>;
def : BitConvert <v2i64, v8f16, SReg_128>;
def : BitConvert <v2f64, v8f16, SReg_128>;
+def : BitConvert <v4i32, v8bf16, SReg_128>;
+def : BitConvert <v8bf16, v4i32, SReg_128>;
+def : BitConvert <v4i32, v8bf16, VReg_128>;
+def : BitConvert <v8bf16, v4i32, VReg_128>;
+
+def : BitConvert <v4f32, v8bf16, SReg_128>;
+def : BitConvert <v8bf16, v4f32, SReg_128>;
+def : BitConvert <v4f32, v8bf16, VReg_128>;
+def : BitConvert <v8bf16, v4f32, VReg_128>;
+
+def : BitConvert <v8i16, v8bf16, SReg_128>;
+def : BitConvert <v8bf16, v8i16, SReg_128>;
+def : BitConvert <v8i16, v8bf16, VReg_128>;
+def : BitConvert <v8bf16, v8i16, VReg_128>;
+
+def : BitConvert <v8f16, v8bf16, SReg_128>;
+def : BitConvert <v8bf16, v8f16, SReg_128>;
+def : BitConvert <v8f16, v8bf16, VReg_128>;
+def : BitConvert <v8bf16, v8f16, VReg_128>;
+
+def : BitConvert <v2f64, v8bf16, SReg_128>;
+def : BitConvert <v8bf16, v2f64, SReg_128>;
+def : BitConvert <v2f64, v8bf16, VReg_128>;
+def : BitConvert <v8bf16, v2f64, VReg_128>;
+
+def : BitConvert <v2i64, v8bf16, SReg_128>;
+def : BitConvert <v8bf16, v2i64, SReg_128>;
+def : BitConvert <v2i64, v8bf16, VReg_128>;
+def : BitConvert <v8bf16, v2i64, VReg_128>;
+
+
// 160-bit bitcast
def : BitConvert <v5i32, v5f32, SReg_160>;
def : BitConvert <v5f32, v5i32, SReg_160>;
@@ -1654,6 +1728,31 @@ def : BitConvert <v4i64, v16i16, VReg_256>;
def : BitConvert <v4f64, v16f16, VReg_256>;
def : BitConvert <v4f64, v16i16, VReg_256>;
+
+def : BitConvert <v8i32, v16bf16, VReg_256>;
+def : BitConvert <v16bf16, v8i32, VReg_256>;
+def : BitConvert <v8f32, v16bf16, VReg_256>;
+def : BitConvert <v16bf16, v8f32, VReg_256>;
+def : BitConvert <v4i64, v16bf16, VReg_256>;
+def : BitConvert <v16bf16, v4i64, VReg_256>;
+def : BitConvert <v4f64, v16bf16, VReg_256>;
+def : BitConvert <v16bf16, v4f64, VReg_256>;
+
+
+
+def : BitConvert <v16i16, v16bf16, SReg_256>;
+def : BitConvert <v16bf16, v16i16, SReg_256>;
+def : BitConvert <v16i16, v16bf16, VReg_256>;
+def : BitConvert <v16bf16, v16i16, VReg_256>;
+
+def : BitConvert <v16f16, v16bf16, SReg_256>;
+def : BitConvert <v16bf16, v16f16, SReg_256>;
+def : BitConvert <v16f16, v16bf16, VReg_256>;
+def : BitConvert <v16bf16, v16f16, VReg_256>;
+
+
+
+
// 288-bit bitcast
def : BitConvert <v9i32, v9f32, SReg_288>;
def : BitConvert <v9f32, v9i32, SReg_288>;
@@ -1702,6 +1801,38 @@ def : BitConvert <v8f64, v16f32, VReg_512>;
def : BitConvert <v16f32, v8i64, VReg_512>;
def : BitConvert <v16f32, v8f64, VReg_512>;
+
+
+def : BitConvert <v32bf16, v32i16, VReg_512>;
+def : BitConvert <v32i16, v32bf16, VReg_512>;
+def : BitConvert <v32bf16, v32i16, SReg_512>;
+def : BitConvert <v32i16, v32bf16, SReg_512>;
+
+def : BitConvert <v32bf16, v32f16, VReg_512>;
+def : BitConvert <v32f16, v32bf16, VReg_512>;
+def : BitConvert <v32bf16, v32f16, SReg_512>;
+def : BitConvert <v32f16, v32bf16, SReg_512>;
+
+def : BitConvert <v32bf16, v16i32, VReg_512>;
+def : BitConvert <v16i32, v32bf16, VReg_512>;
+def : BitConvert <v32bf16, v16i32, SReg_512>;
+def : BitConvert <v16i32, v32bf16, SReg_512>;
+
+def : BitConvert <v32bf16, v16f32, VReg_512>;
+def : BitConvert <v16f32, v32bf16, VReg_512>;
+def : BitConvert <v32bf16, v16f32, SReg_512>;
+def : BitConvert <v16f32, v32bf16, SReg_512>;
+
+def : BitConvert <v32bf16, v8f64, VReg_512>;
+def : BitConvert <v8f64, v32bf16, VReg_512>;
+def : BitConvert <v32bf16, v8f64, SReg_512>;
+def : BitConvert <v8f64, v32bf16, SReg_512>;
+
+def : BitConvert <v32bf16, v8i64, VReg_512>;
+def : BitConvert <v8i64, v32bf16, VReg_512>;
+def : BitConvert <v32bf16, v8i64, SReg_512>;
+def : BitConvert <v8i64, v32bf16, SReg_512>;
+
// 1024-bit bitcast
def : BitConvert <v32i32, v32f32, VReg_1024>;
def : BitConvert <v32f32, v32i32, VReg_1024>;
@@ -1958,19 +2089,21 @@ def : GCNPat <
let SubtargetPredicate = HasPackedFP32Ops;
}
+foreach fp16vt = [f16, bf16] in {
+
def : GCNPat <
- (fcopysign f16:$src0, f16:$src1),
+ (fcopysign fp16vt:$src0, fp16vt:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
>;
def : GCNPat <
- (fcopysign f32:$src0, f16:$src1),
+ (fcopysign f32:$src0, fp16vt:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
(V_LSHLREV_B32_e64 (i32 16), $src1))
>;
def : GCNPat <
- (fcopysign f64:$src0, f16:$src1),
+ (fcopysign f64:$src0, fp16vt:$src1),
(REG_SEQUENCE SReg_64,
(i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
@@ -1978,16 +2111,17 @@ def : GCNPat <
>;
def : GCNPat <
- (fcopysign f16:$src0, f32:$src1),
+ (fcopysign fp16vt:$src0, f32:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
(V_LSHRREV_B32_e64 (i32 16), $src1))
>;
def : GCNPat <
- (fcopysign f16:$src0, f64:$src1),
+ (fcopysign fp16vt:$src0, f64:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
(V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
>;
+} // End foreach fp16vt = [f16, bf16]
/********** ================== **********/
/********** Immediate Patterns **********/
@@ -2026,6 +2160,11 @@ def : GCNPat <
(V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
>;
+def : GCNPat <
+ (VGPRImm<(bf16 fpimm)>:$imm),
+ (V_MOV_B32_e32 (bf16 (bitcast_fpimm_to_i32 $imm)))
+>;
+
// V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit
// immediate and will be expanded as needed, but we will only use these patterns
// for values which can be encoded.
@@ -2060,6 +2199,11 @@ def : GCNPat <
>;
def : GCNPat <
+ (bf16 fpimm:$imm),
+ (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
+>;
+
+def : GCNPat <
(p5 frameindex:$fi),
(V_MOV_B32_e32 (p5 (frameindex_to_targetframeindex $fi)))
>;
@@ -3741,6 +3885,18 @@ def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction {
let mayStore = 0;
}
+def G_AMDGPU_S_MUL_U64_U32 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1);
+ let hasSideEffects = 0;
+}
+
+def G_AMDGPU_S_MUL_I64_I32 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1);
+ let hasSideEffects = 0;
+}
+
// This is equivalent to the G_INTRINSIC*, but the operands may have
// been legalized depending on the subtarget requirements.
def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction {
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 10ec54d3317f..6d749ad1ad24 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -578,6 +578,14 @@ public:
bool IsNonTemporal) const override;
};
+class SIGfx12CacheControl : public SIGfx11CacheControl {
+public:
+ SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
+
+ bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace, Position Pos) const override;
+};
+
class SIMemoryLegalizer final : public MachineFunctionPass {
private:
@@ -857,7 +865,9 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
return std::make_unique<SIGfx7CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX11)
return std::make_unique<SIGfx10CacheControl>(ST);
- return std::make_unique<SIGfx11CacheControl>(ST);
+ if (Generation < AMDGPUSubtarget::GFX12)
+ return std::make_unique<SIGfx11CacheControl>(ST);
+ return std::make_unique<SIGfx12CacheControl>(ST);
}
bool SIGfx6CacheControl::enableLoadCacheBypass(
@@ -1423,7 +1433,7 @@ bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
+ const DebugLoc &DL = MI->getDebugLoc();
if (Pos == Position::AFTER)
++MI;
@@ -2132,6 +2142,62 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
return Changed;
}
+bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
+ if (!InsertCacheInv)
+ return false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ /// The scratch address space does not need the global memory cache
+ /// to be flushed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
+ return false;
+
+ AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ ScopeImm = AMDGPU::CPol::SCOPE_SYS;
+ break;
+ case SIAtomicScope::AGENT:
+ ScopeImm = AMDGPU::CPol::SCOPE_DEV;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In WGP mode the waves of a work-group can be executing on either CU of
+ // the WGP. Therefore we need to invalidate the L0 which is per CU.
+ // Otherwise in CU mode all waves of a work-group are on the same CU, and so
+ // the L0 does not need to be invalidated.
+ if (ST.isCuModeEnabled())
+ return false;
+
+ ScopeImm = AMDGPU::CPol::SCOPE_SE;
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to invalidate.
+ return false;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ return true;
+}
+
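An illustrative-only restatement (not part of the patch; the string-keyed helper is an assumption for the sketch) of the scope-to-GLOBAL_INV mapping that the GFX12 acquire above implements for the global address space:

#include <optional>
#include <string>

// Returns the CPol scope used for the GLOBAL_INV, or nullopt when no cache
// invalidation is required (mirrors SIGfx12CacheControl::insertAcquire above).
std::optional<std::string> gfx12AcquireInvScope(const std::string &Scope,
                                                bool CuModeEnabled) {
  if (Scope == "system")
    return "SCOPE_SYS";
  if (Scope == "agent")
    return "SCOPE_DEV";
  if (Scope == "workgroup") {
    if (CuModeEnabled)
      return std::nullopt; // all waves of the work-group share one L0 in CU mode
    return "SCOPE_SE";     // WGP mode: invalidate the per-CU L0
  }
  return std::nullopt;     // wavefront/singlethread: no cache to invalidate
}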
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
if (AtomicPseudoMIs.empty())
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index c94b894c5841..f42af89cf5e6 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -916,7 +916,7 @@ defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>;
defm "" : SRegClass<12, [v12i32, v12f32], SGPR_384Regs, TTMP_384Regs>;
let GlobalPriority = true in {
-defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], SGPR_512Regs, TTMP_512Regs>;
+defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16, v32bf16], SGPR_512Regs, TTMP_512Regs>;
defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;
}
@@ -970,7 +970,7 @@ defm VReg_352 : VRegClass<11, [v11i32, v11f32], (add VGPR_352)>;
defm VReg_384 : VRegClass<12, [v12i32, v12f32], (add VGPR_384)>;
let GlobalPriority = true in {
-defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], (add VGPR_512)>;
+defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16, v32bf16], (add VGPR_512)>;
defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>;
}
@@ -1152,11 +1152,11 @@ class RegOrF32 <string RegisterClass, string OperandTypePrefix>
class RegOrV2B16 <string RegisterClass, string OperandTypePrefix>
: RegOrImmOperand <RegisterClass, OperandTypePrefix # "_V2INT16",
- !subst("_v2b16", "V2B16", NAME), "_Imm16">;
+ !subst("_v2b16", "V2B16", NAME), "_ImmV2I16">;
class RegOrV2F16 <string RegisterClass, string OperandTypePrefix>
: RegOrImmOperand <RegisterClass, OperandTypePrefix # "_V2FP16",
- !subst("_v2f16", "V2F16", NAME), "_Imm16">;
+ !subst("_v2f16", "V2F16", NAME), "_ImmV2F16">;
class RegOrF64 <string RegisterClass, string OperandTypePrefix>
: RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP64",
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 59d6ccf513bb..5e6c34992930 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -553,7 +553,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
}
continue;
} else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
- Opcode == AMDGPU::LDS_DIRECT_LOAD) {
+ Opcode == AMDGPU::DS_PARAM_LOAD ||
+ Opcode == AMDGPU::LDS_DIRECT_LOAD ||
+ Opcode == AMDGPU::DS_DIRECT_LOAD) {
// Mark these STRICTWQM, but only for the instruction, not its operands.
// This avoids unnecessarily marking M0 as requiring WQM.
InstrInfo &II = Instructions[&MI];
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 3297847b0360..fc29ce8d71f2 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -29,6 +29,7 @@ class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
let mayStore = 0;
let mayLoad = 1;
let hasSideEffects = 0;
+ let maybeAtomic = 0;
let UseNamedOperandTable = 1;
let SchedRW = [WriteSMEM];
@@ -305,6 +306,10 @@ let SubtargetPredicate = HasScalarDwordx3Loads in
defm S_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_64, SReg_128>;
defm S_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_64, SReg_256>;
defm S_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_64, SReg_512>;
+defm S_LOAD_I8 : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>;
+defm S_LOAD_U8 : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>;
+defm S_LOAD_I16 : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>;
+defm S_LOAD_U16 : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>;
let is_buffer = 1 in {
defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>;
@@ -316,6 +321,10 @@ let SubtargetPredicate = HasScalarDwordx3Loads in
defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_128, SReg_128>;
defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_128, SReg_256>;
defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_128, SReg_512>;
+defm S_BUFFER_LOAD_I8 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_LOAD_U8 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_LOAD_I16 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_LOAD_U16 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>;
}
let SubtargetPredicate = HasScalarStores in {
@@ -977,20 +986,35 @@ def : GCNPat <
}
} // let OtherPredicates = [HasShaderCyclesRegister]
-multiclass SMPrefetchPat<string type, int cache_type> {
+def i32imm_zero : TImmLeaf <i32, [{
+ return Imm == 0;
+}]>;
+
+def i32imm_one : TImmLeaf <i32, [{
+ return Imm == 1;
+}]>;
+
+multiclass SMPrefetchPat<string type, TImmLeaf cache_type> {
def : GCNPat <
- (smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, (i32 cache_type)),
+ (smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, cache_type),
(!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) $sbase, $offset, (i32 SGPR_NULL), (i8 0))
>;
def : GCNPat <
- (smrd_prefetch (i64 SReg_64:$sbase), timm, timm, (i32 cache_type)),
+ (smrd_prefetch (i64 SReg_64:$sbase), timm, timm, cache_type),
(!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) $sbase, 0, (i32 SGPR_NULL), (i8 0))
>;
+
+ def : GCNPat <
+ (smrd_prefetch (i32 SReg_32:$sbase), timm, timm, cache_type),
+ (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type)
+ (i64 (REG_SEQUENCE SReg_64, $sbase, sub0, (i32 (S_MOV_B32 (i32 0))), sub1)),
+ 0, (i32 SGPR_NULL), (i8 0))
+ >;
}
-defm : SMPrefetchPat<"INST", 0>;
-defm : SMPrefetchPat<"DATA", 1>;
+defm : SMPrefetchPat<"INST", i32imm_zero>;
+defm : SMPrefetchPat<"DATA", i32imm_one>;
//===----------------------------------------------------------------------===//
// GFX10.
@@ -1321,6 +1345,11 @@ defm S_LOAD_B128 : SM_Real_Loads_gfx12<0x02, "S_LOAD_DWORDX4">;
defm S_LOAD_B256 : SM_Real_Loads_gfx12<0x03, "S_LOAD_DWORDX8">;
defm S_LOAD_B512 : SM_Real_Loads_gfx12<0x04, "S_LOAD_DWORDX16">;
+defm S_LOAD_I8 : SM_Real_Loads_gfx12<0x08>;
+defm S_LOAD_U8 : SM_Real_Loads_gfx12<0x09>;
+defm S_LOAD_I16 : SM_Real_Loads_gfx12<0x0a>;
+defm S_LOAD_U16 : SM_Real_Loads_gfx12<0x0b>;
+
defm S_BUFFER_LOAD_B32 : SM_Real_Loads_gfx12<0x10, "S_BUFFER_LOAD_DWORD">;
defm S_BUFFER_LOAD_B64 : SM_Real_Loads_gfx12<0x11, "S_BUFFER_LOAD_DWORDX2">;
defm S_BUFFER_LOAD_B96 : SM_Real_Loads_gfx12<0x15, "S_BUFFER_LOAD_DWORDX3">;
@@ -1328,6 +1357,11 @@ defm S_BUFFER_LOAD_B128 : SM_Real_Loads_gfx12<0x12, "S_BUFFER_LOAD_DWORDX4">;
defm S_BUFFER_LOAD_B256 : SM_Real_Loads_gfx12<0x13, "S_BUFFER_LOAD_DWORDX8">;
defm S_BUFFER_LOAD_B512 : SM_Real_Loads_gfx12<0x14, "S_BUFFER_LOAD_DWORDX16">;
+defm S_BUFFER_LOAD_I8 : SM_Real_Loads_gfx12<0x18>;
+defm S_BUFFER_LOAD_U8 : SM_Real_Loads_gfx12<0x19>;
+defm S_BUFFER_LOAD_I16 : SM_Real_Loads_gfx12<0x1a>;
+defm S_BUFFER_LOAD_U16 : SM_Real_Loads_gfx12<0x1b>;
+
def S_DCACHE_INV_gfx12 : SMEM_Real_gfx12<0x021, S_DCACHE_INV>;
def S_PREFETCH_INST_gfx12 : SMEM_Real_Prefetch_gfx12<0x24, S_PREFETCH_INST>;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index c9687ac368d3..46fa3d57a21c 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -673,6 +673,16 @@ let SubtargetPredicate = isGFX12Plus in {
let isCommutable = 1;
}
+ // The high 32 bits of the inputs contain the sign-extension bits.
+ def S_MUL_I64_I32_PSEUDO : SPseudoInstSI <
+ (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
+ >;
+
+ // The high 32 bits of the inputs are zero.
+ def S_MUL_U64_U32_PSEUDO : SPseudoInstSI <
+ (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
+ >;
+
} // End SubtargetPredicate = isGFX12Plus
let Uses = [SCC] in {
@@ -1186,14 +1196,12 @@ let SubtargetPredicate = isGFX10Plus in {
let SubtargetPredicate = isGFX10GFX11 in {
def S_SUBVECTOR_LOOP_BEGIN : SOPK_32_BR<"s_subvector_loop_begin">;
def S_SUBVECTOR_LOOP_END : SOPK_32_BR<"s_subvector_loop_end">;
-} // End SubtargetPredicate = isGFX10GFX11
-let SubtargetPredicate = isGFX10Plus in {
def S_WAITCNT_VSCNT : SOPK_WAITCNT<"s_waitcnt_vscnt">;
def S_WAITCNT_VMCNT : SOPK_WAITCNT<"s_waitcnt_vmcnt">;
def S_WAITCNT_EXPCNT : SOPK_WAITCNT<"s_waitcnt_expcnt">;
def S_WAITCNT_LGKMCNT : SOPK_WAITCNT<"s_waitcnt_lgkmcnt">;
-} // End SubtargetPredicate = isGFX10Plus
+} // End SubtargetPredicate = isGFX10GFX11
//===----------------------------------------------------------------------===//
// SOPC Instructions
@@ -1702,6 +1710,27 @@ let SubtargetPredicate = HasVGPRSingleUseHintInsts in {
SOPP_Pseudo<"s_singleuse_vdst", (ins s16imm:$simm16), "$simm16">;
} // End SubtargetPredicate = HasVGPRSingleUseHintInsts
+let SubtargetPredicate = isGFX12Plus, hasSideEffects = 1 in {
+ def S_WAIT_LOADCNT :
+ SOPP_Pseudo<"s_wait_loadcnt", (ins s16imm:$simm16), "$simm16">;
+ def S_WAIT_LOADCNT_DSCNT :
+ SOPP_Pseudo<"s_wait_loadcnt_dscnt", (ins s16imm:$simm16), "$simm16">;
+ def S_WAIT_STORECNT :
+ SOPP_Pseudo<"s_wait_storecnt", (ins s16imm:$simm16), "$simm16">;
+ def S_WAIT_STORECNT_DSCNT :
+ SOPP_Pseudo<"s_wait_storecnt_dscnt", (ins s16imm:$simm16), "$simm16">;
+ def S_WAIT_SAMPLECNT :
+ SOPP_Pseudo<"s_wait_samplecnt", (ins s16imm:$simm16), "$simm16">;
+ def S_WAIT_BVHCNT :
+ SOPP_Pseudo<"s_wait_bvhcnt", (ins s16imm:$simm16), "$simm16">;
+ def S_WAIT_EXPCNT :
+ SOPP_Pseudo<"s_wait_expcnt", (ins s16imm:$simm16), "$simm16">;
+ def S_WAIT_DSCNT :
+ SOPP_Pseudo<"s_wait_dscnt", (ins s16imm:$simm16), "$simm16">;
+ def S_WAIT_KMCNT :
+ SOPP_Pseudo<"s_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
+} // End SubtargetPredicate = isGFX12Plus, hasSideEffects = 1
+
//===----------------------------------------------------------------------===//
// SOP1 Patterns
//===----------------------------------------------------------------------===//
@@ -2411,10 +2440,10 @@ defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx11_gfx12<0x013>;
defm S_CALL_B64 : SOPK_Real32_gfx11_gfx12<0x014>;
defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx11<0x016>;
defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx11<0x017>;
-defm S_WAITCNT_VSCNT : SOPK_Real32_gfx11_gfx12<0x018>;
-defm S_WAITCNT_VMCNT : SOPK_Real32_gfx11_gfx12<0x019>;
-defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx11_gfx12<0x01a>;
-defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx11_gfx12<0x01b>;
+defm S_WAITCNT_VSCNT : SOPK_Real32_gfx11<0x018>;
+defm S_WAITCNT_VMCNT : SOPK_Real32_gfx11<0x019>;
+defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx11<0x01a>;
+defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx11<0x01b>;
//===----------------------------------------------------------------------===//
// SOPK - GFX10.
@@ -2516,6 +2545,15 @@ multiclass SOPP_Real_32_Renamed_gfx12<bits<7> op, SOPP_Pseudo backing_pseudo, st
defm S_WAIT_ALU : SOPP_Real_32_Renamed_gfx12<0x008, S_WAITCNT_DEPCTR, "s_wait_alu">;
defm S_BARRIER_WAIT : SOPP_Real_32_gfx12<0x014>;
defm S_BARRIER_LEAVE : SOPP_Real_32_gfx12<0x015>;
+defm S_WAIT_LOADCNT : SOPP_Real_32_gfx12<0x040>;
+defm S_WAIT_STORECNT : SOPP_Real_32_gfx12<0x041>;
+defm S_WAIT_SAMPLECNT : SOPP_Real_32_gfx12<0x042>;
+defm S_WAIT_BVHCNT : SOPP_Real_32_gfx12<0x043>;
+defm S_WAIT_EXPCNT : SOPP_Real_32_gfx12<0x044>;
+defm S_WAIT_DSCNT : SOPP_Real_32_gfx12<0x046>;
+defm S_WAIT_KMCNT : SOPP_Real_32_gfx12<0x047>;
+defm S_WAIT_LOADCNT_DSCNT : SOPP_Real_32_gfx12<0x048>;
+defm S_WAIT_STORECNT_DSCNT : SOPP_Real_32_gfx12<0x049>;
//===----------------------------------------------------------------------===//
// SOPP - GFX11, GFX12.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index a91d77175234..26ba2575ff34 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2506,53 +2506,95 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
Val == 0x3118; // 1/2pi
}
-bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) {
- assert(HasInv2Pi);
-
- if (isInt<16>(Literal) || isUInt<16>(Literal)) {
- int16_t Trunc = static_cast<int16_t>(Literal);
- return AMDGPU::isInlinableLiteral16(Trunc, HasInv2Pi);
+std::optional<unsigned> getInlineEncodingV216(bool IsFloat, uint32_t Literal) {
+ // Unfortunately, the Instruction Set Architecture Reference Guide is
+ // misleading about how the inline operands work for (packed) 16-bit
+ // instructions. In a nutshell, the actual HW behavior is:
+ //
+ // - integer encodings (-16 .. 64) are always produced as sign-extended
+ // 32-bit values
+ // - float encodings are produced as:
+ // - for F16 instructions: corresponding half-precision float values in
+ // the LSBs, 0 in the MSBs
+ // - for UI16 instructions: corresponding single-precision float value
+ int32_t Signed = static_cast<int32_t>(Literal);
+ if (Signed >= 0 && Signed <= 64)
+ return 128 + Signed;
+
+ if (Signed >= -16 && Signed <= -1)
+ return 192 + std::abs(Signed);
+
+ if (IsFloat) {
+ // clang-format off
+ switch (Literal) {
+ case 0x3800: return 240; // 0.5
+ case 0xB800: return 241; // -0.5
+ case 0x3C00: return 242; // 1.0
+ case 0xBC00: return 243; // -1.0
+ case 0x4000: return 244; // 2.0
+ case 0xC000: return 245; // -2.0
+ case 0x4400: return 246; // 4.0
+ case 0xC400: return 247; // -4.0
+ case 0x3118: return 248; // 1.0 / (2.0 * pi)
+ default: break;
+ }
+ // clang-format on
+ } else {
+ // clang-format off
+ switch (Literal) {
+ case 0x3F000000: return 240; // 0.5
+ case 0xBF000000: return 241; // -0.5
+ case 0x3F800000: return 242; // 1.0
+ case 0xBF800000: return 243; // -1.0
+ case 0x40000000: return 244; // 2.0
+ case 0xC0000000: return 245; // -2.0
+ case 0x40800000: return 246; // 4.0
+ case 0xC0800000: return 247; // -4.0
+ case 0x3E22F983: return 248; // 1.0 / (2.0 * pi)
+ default: break;
+ }
+ // clang-format on
}
- if (!(Literal & 0xffff))
- return AMDGPU::isInlinableLiteral16(Literal >> 16, HasInv2Pi);
- int16_t Lo16 = static_cast<int16_t>(Literal);
- int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
- return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi);
+ return {};
}
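An illustrative standalone restatement (not part of the patch) of the integer half of the encoding above, with a couple of worked literals; the last one shows how a per-lane splat such as 0x00010001, which the removed isInlinableIntLiteralV216 accepted, is not an inline constant under the HW-faithful rules:

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <optional>

// Mirrors the integer branch of getInlineEncodingV216 above.
std::optional<unsigned> packedIntInlineEncoding(uint32_t Literal) {
  int32_t Signed = static_cast<int32_t>(Literal);
  if (Signed >= 0 && Signed <= 64)
    return 128 + Signed;
  if (Signed >= -16 && Signed <= -1)
    return 192 + std::abs(Signed);
  return std::nullopt;
}

int main() {
  assert(packedIntInlineEncoding(0x00000040u) == 192u); // low lane 64, high lane 0
  assert(packedIntInlineEncoding(0xFFFFFFFFu) == 193u); // both lanes -1, sign-extended form
  assert(!packedIntInlineEncoding(0x00010001u));        // splat of 1 per lane: not inlinable
  return 0;
}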
-bool isInlinableIntLiteralV216(int32_t Literal) {
- int16_t Lo16 = static_cast<int16_t>(Literal);
- if (isInt<16>(Literal) || isUInt<16>(Literal))
- return isInlinableIntLiteral(Lo16);
+// Encoding of the literal as an inline constant for a V_PK_*_IU16 instruction
+// or nullopt.
+std::optional<unsigned> getInlineEncodingV2I16(uint32_t Literal) {
+ return getInlineEncodingV216(false, Literal);
+}
- int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
- if (!(Literal & 0xffff))
- return isInlinableIntLiteral(Hi16);
- return Lo16 == Hi16 && isInlinableIntLiteral(Lo16);
+// Encoding of the literal as an inline constant for a V_PK_*_F16 instruction
+// or nullopt.
+std::optional<unsigned> getInlineEncodingV2F16(uint32_t Literal) {
+ return getInlineEncodingV216(true, Literal);
}
-bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType) {
+// Whether the given literal can be inlined for a V_PK_* instruction.
+bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType) {
switch (OpType) {
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ return getInlineEncodingV216(false, Literal).has_value();
case AMDGPU::OPERAND_REG_IMM_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
- return isInlinableLiteralV216(Literal, HasInv2Pi);
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
+ return getInlineEncodingV216(true, Literal).has_value();
default:
- return isInlinableIntLiteralV216(Literal);
+ llvm_unreachable("bad packed operand type");
}
}
-bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi) {
- assert(HasInv2Pi);
-
- int16_t Lo16 = static_cast<int16_t>(Literal);
- if (isInt<16>(Literal) || isUInt<16>(Literal))
- return true;
+// Whether the given literal can be inlined for a V_PK_*_IU16 instruction.
+bool isInlinableLiteralV2I16(uint32_t Literal) {
+ return getInlineEncodingV2I16(Literal).has_value();
+}
- int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
- if (!(Literal & 0xffff))
- return true;
- return Lo16 == Hi16;
+// Whether the given literal can be inlined for a V_PK_*_F16 instruction.
+bool isInlinableLiteralV2F16(uint32_t Literal) {
+ return getInlineEncodingV2F16(Literal).has_value();
}
bool isValid32BitLiteral(uint64_t Val, bool IsFP64) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 3c9f330cbcde..50c741760d71 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1291,16 +1291,19 @@ LLVM_READNONE
bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi);
LLVM_READNONE
-bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi);
+std::optional<unsigned> getInlineEncodingV2I16(uint32_t Literal);
LLVM_READNONE
-bool isInlinableIntLiteralV216(int32_t Literal);
+std::optional<unsigned> getInlineEncodingV2F16(uint32_t Literal);
LLVM_READNONE
-bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType);
+bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType);
LLVM_READNONE
-bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi);
+bool isInlinableLiteralV2I16(uint32_t Literal);
+
+LLVM_READNONE
+bool isInlinableLiteralV2F16(uint32_t Literal);
LLVM_READNONE
bool isValid32BitLiteral(uint64_t Val, bool IsFP64);
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 0aa62ea77b11..ecee61daa1c8 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -1300,7 +1300,7 @@ class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps,
let OtherPredicates = ps.OtherPredicates;
}
-
+
class VOP2_DPP8_Gen<bits<6> op, VOP2_Pseudo ps, GFXGen Gen,
VOPProfile p = ps.Pfl> :
VOP2_DPP8<op, ps, p> {
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 7f52501b5d90..e9d6f67aee16 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -125,15 +125,6 @@ defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3P_Profile<VOP_V2I16_V2
let SubtargetPredicate = HasVOP3PInsts in {
-// Undo sub x, c -> add x, -c canonicalization since c is more likely
-// an inline immediate than -c.
-// The constant will be emitted as a mov, and folded later.
-// TODO: We could directly encode the immediate now
-def : GCNPat<
- (add (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)), NegSubInlineConstV216:$src1),
- (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1)
->;
-
// Integer operations with clamp bit set.
class VOP3PSatPat<SDPatternOperator pat, Instruction inst> : GCNPat<
(pat (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)),
@@ -632,12 +623,12 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node,
// FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in {
def _e64 : MAIInst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P),
- !if(NoDstOverlap, null_frag, AgprMAIFrag<node>)>,
+ !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, AgprMAIFrag<node>)>,
MFMATable<0, NAME # "_e64">;
let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in
def _vgprcd_e64 : MAIInst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"),
- !if(NoDstOverlap, null_frag, VgprMAIFrag<node>)>,
+ !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, VgprMAIFrag<node>)>,
MFMATable<0, NAME # "_vgprcd_e64">;
}
@@ -645,12 +636,13 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node,
let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""),
isConvertibleToThreeAddress = NoDstOverlap,
Mnemonic = OpName in {
- def "_mac_e64" : MAIInst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P), AgprMAIFrag<node>>,
+ def "_mac_e64" : MAIInst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P),
+ !if(!eq(node, null_frag), null_frag, AgprMAIFrag<node>)>,
MFMATable<1, NAME # "_e64">;
let SubtargetPredicate = isGFX90APlus in
def _mac_vgprcd_e64 : MAIInst<OpName # "_mac_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"),
- VgprMAIFrag<node>>,
+ !if(!eq(node, null_frag), null_frag, VgprMAIFrag<node>)>,
MFMATable<1, NAME # "_vgprcd_e64">;
}
}
diff --git a/llvm/lib/Target/ARC/ARCISelDAGToDAG.cpp b/llvm/lib/Target/ARC/ARCISelDAGToDAG.cpp
index 28e35f8f2a54..17c2d7bb13b4 100644
--- a/llvm/lib/Target/ARC/ARCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARC/ARCISelDAGToDAG.cpp
@@ -170,7 +170,7 @@ bool ARCDAGToDAGISel::SelectFrameADDR_ri(SDValue Addr, SDValue &Base,
void ARCDAGToDAGISel::Select(SDNode *N) {
switch (N->getOpcode()) {
case ISD::Constant: {
- uint64_t CVal = cast<ConstantSDNode>(N)->getZExtValue();
+ uint64_t CVal = N->getAsZExtVal();
ReplaceNode(N, CurDAG->getMachineNode(
isInt<12>(CVal) ? ARC::MOV_rs12 : ARC::MOV_rlimm,
SDLoc(N), MVT::i32,
diff --git a/llvm/lib/Target/ARC/ARCISelLowering.cpp b/llvm/lib/Target/ARC/ARCISelLowering.cpp
index 2265f5db6737..5dd343d97b80 100644
--- a/llvm/lib/Target/ARC/ARCISelLowering.cpp
+++ b/llvm/lib/Target/ARC/ARCISelLowering.cpp
@@ -174,6 +174,8 @@ ARCTargetLowering::ARCTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::READCYCLECOUNTER, MVT::i32, Legal);
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64,
isTypeLegal(MVT::i64) ? Legal : Custom);
+
+ setMaxAtomicSizeInBitsSupported(0);
}
const char *ARCTargetLowering::getTargetNodeName(unsigned Opcode) const {
diff --git a/llvm/lib/Target/ARC/ARCTargetMachine.cpp b/llvm/lib/Target/ARC/ARCTargetMachine.cpp
index d4ae3255b32a..4f612ae623b9 100644
--- a/llvm/lib/Target/ARC/ARCTargetMachine.cpp
+++ b/llvm/lib/Target/ARC/ARCTargetMachine.cpp
@@ -57,6 +57,7 @@ public:
return getTM<ARCTargetMachine>();
}
+ void addIRPasses() override;
bool addInstSelector() override;
void addPreEmitPass() override;
void addPreRegAlloc() override;
@@ -68,6 +69,12 @@ TargetPassConfig *ARCTargetMachine::createPassConfig(PassManagerBase &PM) {
return new ARCPassConfig(*this, PM);
}
+void ARCPassConfig::addIRPasses() {
+ addPass(createAtomicExpandPass());
+
+ TargetPassConfig::addIRPasses();
+}
+
bool ARCPassConfig::addInstSelector() {
addPass(createARCISelDag(getARCTargetMachine(), getOptLevel()));
return false;
diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp
index 1d6aaeb7433b..cb3a709f7003 100644
--- a/llvm/lib/Target/ARM/ARMFastISel.cpp
+++ b/llvm/lib/Target/ARM/ARMFastISel.cpp
@@ -747,7 +747,7 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) {
unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
TmpOffset += SL->getElementOffset(Idx);
} else {
- uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+ uint64_t S = GTI.getSequentialElementStride(DL);
while (true) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
// Constant-offset addressing.
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index adc429b61bbc..e99ee299412a 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -372,7 +372,7 @@ INITIALIZE_PASS(ARMDAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false)
/// operand. If so Imm will receive the 32-bit value.
static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) {
- Imm = cast<ConstantSDNode>(N)->getZExtValue();
+ Imm = N->getAsZExtVal();
return true;
}
return false;
@@ -1101,8 +1101,7 @@ bool ARMDAGToDAGISel::SelectAddrModePC(SDValue N,
if (N.getOpcode() == ARMISD::PIC_ADD && N.hasOneUse()) {
Offset = N.getOperand(0);
SDValue N1 = N.getOperand(1);
- Label = CurDAG->getTargetConstant(cast<ConstantSDNode>(N1)->getZExtValue(),
- SDLoc(N), MVT::i32);
+ Label = CurDAG->getTargetConstant(N1->getAsZExtVal(), SDLoc(N), MVT::i32);
return true;
}
@@ -1942,7 +1941,7 @@ SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, const SDLoc &dl,
if (!is64BitVector && NumVecs < 3)
NumRegs *= 2;
- unsigned Alignment = cast<ConstantSDNode>(Align)->getZExtValue();
+ unsigned Alignment = Align->getAsZExtVal();
if (Alignment >= 32 && NumRegs == 4)
Alignment = 32;
else if (Alignment >= 16 && (NumRegs == 2 || NumRegs == 4))
@@ -2428,7 +2427,7 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
unsigned Alignment = 0;
if (NumVecs != 3) {
- Alignment = cast<ConstantSDNode>(Align)->getZExtValue();
+ Alignment = Align->getAsZExtVal();
unsigned NumBytes = NumVecs * VT.getScalarSizeInBits() / 8;
if (Alignment > NumBytes)
Alignment = NumBytes;
@@ -2871,7 +2870,7 @@ void ARMDAGToDAGISel::SelectMVE_VxDUP(SDNode *N, const uint16_t *Opcodes,
Ops.push_back(N->getOperand(OpIdx++)); // limit
SDValue ImmOp = N->getOperand(OpIdx++); // step
- int ImmValue = cast<ConstantSDNode>(ImmOp)->getZExtValue();
+ int ImmValue = ImmOp->getAsZExtVal();
Ops.push_back(getI32Imm(ImmValue, Loc));
if (Predicated)
@@ -2892,7 +2891,7 @@ void ARMDAGToDAGISel::SelectCDE_CXxD(SDNode *N, uint16_t Opcode,
// Convert and append the immediate operand designating the coprocessor.
SDValue ImmCorpoc = N->getOperand(OpIdx++);
- uint32_t ImmCoprocVal = cast<ConstantSDNode>(ImmCorpoc)->getZExtValue();
+ uint32_t ImmCoprocVal = ImmCorpoc->getAsZExtVal();
Ops.push_back(getI32Imm(ImmCoprocVal, Loc));
// For accumulating variants copy the low and high order parts of the
@@ -2911,7 +2910,7 @@ void ARMDAGToDAGISel::SelectCDE_CXxD(SDNode *N, uint16_t Opcode,
// Convert and append the immediate operand
SDValue Imm = N->getOperand(OpIdx);
- uint32_t ImmVal = cast<ConstantSDNode>(Imm)->getZExtValue();
+ uint32_t ImmVal = Imm->getAsZExtVal();
Ops.push_back(getI32Imm(ImmVal, Loc));
// Accumulating variants are IT-predicable, add predicate operands.
@@ -2965,7 +2964,7 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,
unsigned Alignment = 0;
if (NumVecs != 3) {
- Alignment = cast<ConstantSDNode>(Align)->getZExtValue();
+ Alignment = Align->getAsZExtVal();
unsigned NumBytes = NumVecs * VT.getScalarSizeInBits() / 8;
if (Alignment > NumBytes)
Alignment = NumBytes;
@@ -3697,7 +3696,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
// Other cases are autogenerated.
break;
case ISD::Constant: {
- unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
+ unsigned Val = N->getAsZExtVal();
// If we can't materialize the constant we need to use a literal pool
if (ConstantMaterializationCost(Val, Subtarget) > 2 &&
!Subtarget->genExecuteOnly()) {
@@ -4132,7 +4131,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
assert(N2.getOpcode() == ISD::Constant);
assert(N3.getOpcode() == ISD::Register);
- unsigned CC = (unsigned) cast<ConstantSDNode>(N2)->getZExtValue();
+ unsigned CC = (unsigned)N2->getAsZExtVal();
if (InGlue.getOpcode() == ARMISD::CMPZ) {
if (InGlue.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN) {
@@ -4243,8 +4242,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
if (SwitchEQNEToPLMI) {
SDValue ARMcc = N->getOperand(2);
- ARMCC::CondCodes CC =
- (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
+ ARMCC::CondCodes CC = (ARMCC::CondCodes)ARMcc->getAsZExtVal();
switch (CC) {
default: llvm_unreachable("CMPZ must be either NE or EQ!");
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 9f3bcffc7a99..568085bd0ab3 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -4820,8 +4820,7 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
// some tweaks to the heuristics for the previous and->shift transform.
// FIXME: Optimize cases where the LHS isn't a shift.
if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
- isa<ConstantSDNode>(RHS) &&
- cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U &&
+ isa<ConstantSDNode>(RHS) && RHS->getAsZExtVal() == 0x80000000U &&
CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
LHS.getConstantOperandVal(1) < 31) {
unsigned ShiftAmt = LHS.getConstantOperandVal(1) + 1;
@@ -5533,7 +5532,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    // Choose GE over PL, which vsel does not support
- if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL)
+ if (ARMcc->getAsZExtVal() == ARMCC::PL)
ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
}
@@ -7749,7 +7748,7 @@ static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
uint64_t Val;
if (!isa<ConstantSDNode>(N))
return SDValue();
- Val = cast<ConstantSDNode>(N)->getZExtValue();
+ Val = N->getAsZExtVal();
if (ST->isThumb1Only()) {
if (Val <= 255 || ~Val <= 255)
@@ -7804,7 +7803,7 @@ static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
SDValue V = Op.getOperand(i);
if (!isa<ConstantSDNode>(V) && !V.isUndef())
continue;
- bool BitSet = V.isUndef() ? false : cast<ConstantSDNode>(V)->getZExtValue();
+ bool BitSet = V.isUndef() ? false : V->getAsZExtVal();
if (BitSet)
Bits32 |= BoolMask << (i * BitsPerBool);
}
@@ -9240,7 +9239,7 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
EVT VT = Op.getValueType();
EVT Op1VT = V1.getValueType();
unsigned NumElts = VT.getVectorNumElements();
- unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue();
+ unsigned Index = V2->getAsZExtVal();
assert(VT.getScalarSizeInBits() == 1 &&
"Unexpected custom EXTRACT_SUBVECTOR lowering");
@@ -14618,7 +14617,7 @@ static SDValue PerformORCombineToBFI(SDNode *N,
// Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
// where lsb(mask) == #shamt and masked bits of B are known zero.
SDValue ShAmt = N00.getOperand(1);
- unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
+ unsigned ShAmtC = ShAmt->getAsZExtVal();
unsigned LSB = llvm::countr_zero(Mask);
if (ShAmtC != LSB)
return SDValue();
@@ -18339,8 +18338,7 @@ ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
SDValue Chain = N->getOperand(0);
SDValue BB = N->getOperand(1);
SDValue ARMcc = N->getOperand(2);
- ARMCC::CondCodes CC =
- (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
+ ARMCC::CondCodes CC = (ARMCC::CondCodes)ARMcc->getAsZExtVal();
// (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
// -> (brcond Chain BB CC CPSR Cmp)
@@ -18373,8 +18371,7 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
SDValue FalseVal = N->getOperand(0);
SDValue TrueVal = N->getOperand(1);
SDValue ARMcc = N->getOperand(2);
- ARMCC::CondCodes CC =
- (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
+ ARMCC::CondCodes CC = (ARMCC::CondCodes)ARMcc->getAsZExtVal();
// BFI is only available on V6T2+.
if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
index 3ffde86ce1bb..abea0fef5cdc 100644
--- a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -362,8 +362,8 @@ ARMLegalizerInfo::getFCmpLibcalls(CmpInst::Predicate Predicate,
llvm_unreachable("Unsupported size for FCmp predicate");
}
-bool ARMLegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
- MachineInstr &MI) const {
+bool ARMLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI,
+ LostDebugLocObserver &LocObserver) const {
using namespace TargetOpcode;
MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
@@ -392,7 +392,8 @@ bool ARMLegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
OriginalResult};
auto Status = createLibcall(MIRBuilder, Libcall, {RetRegs, RetTy, 0},
{{MI.getOperand(1).getReg(), ArgTy, 0},
- {MI.getOperand(2).getReg(), ArgTy, 0}});
+ {MI.getOperand(2).getReg(), ArgTy, 0}},
+ LocObserver, &MI);
if (Status != LegalizerHelper::Legalized)
return false;
break;
@@ -428,7 +429,8 @@ bool ARMLegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
auto Status = createLibcall(MIRBuilder, Libcall.LibcallID,
{LibcallResult, RetTy, 0},
{{MI.getOperand(2).getReg(), ArgTy, 0},
- {MI.getOperand(3).getReg(), ArgTy, 0}});
+ {MI.getOperand(3).getReg(), ArgTy, 0}},
+ LocObserver, &MI);
if (Status != LegalizerHelper::Legalized)
return false;
diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.h b/llvm/lib/Target/ARM/ARMLegalizerInfo.h
index f1c2e9c94336..d6ce4eb1055b 100644
--- a/llvm/lib/Target/ARM/ARMLegalizerInfo.h
+++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.h
@@ -23,12 +23,12 @@ namespace llvm {
class ARMSubtarget;
-/// This class provides the information for the target register banks.
class ARMLegalizerInfo : public LegalizerInfo {
public:
ARMLegalizerInfo(const ARMSubtarget &ST);
- bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override;
+ bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI,
+ LostDebugLocObserver &LocObserver) const override;
private:
void setFCmpLibcallsGNU();
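The ARMLegalizerInfo hunks above track a GlobalISel interface change: legalizeCustom now receives a LostDebugLocObserver, and createLibcall is passed the observer plus the instruction being replaced so dropped debug locations can be accounted for. A rough sketch of the new override shape, using a made-up FooLegalizerInfo rather than any in-tree target:

#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"

namespace {
class FooLegalizerInfo : public llvm::LegalizerInfo {
public:
  bool legalizeCustom(llvm::LegalizerHelper &Helper, llvm::MachineInstr &MI,
                      llvm::LostDebugLocObserver &LocObserver) const override {
    // A custom legalization that lowers to a runtime routine would now thread
    // the observer through, e.g.:
    //   createLibcall(Helper.MIRBuilder, Libcall, Result, Args, LocObserver,
    //                 &MI);
    return true;
  }
};
} // namespace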
diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp
index d36bfb188ed3..f91e77adb8f8 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -660,7 +660,7 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS,
SDValue Cmp;
if (LHS.getSimpleValueType() == MVT::i16 && isa<ConstantSDNode>(RHS)) {
- uint64_t Imm = cast<ConstantSDNode>(RHS)->getZExtValue();
+ uint64_t Imm = RHS->getAsZExtVal();
// Generate a CPI/CPC pair if RHS is a 16-bit constant. Use the zero
// register for the constant RHS if its lower or higher byte is zero.
SDValue LHSlo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i8, LHS,
@@ -680,7 +680,7 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS,
} else if (RHS.getSimpleValueType() == MVT::i16 && isa<ConstantSDNode>(LHS)) {
// Generate a CPI/CPC pair if LHS is a 16-bit constant. Use the zero
// register for the constant LHS if its lower or higher byte is zero.
- uint64_t Imm = cast<ConstantSDNode>(LHS)->getZExtValue();
+ uint64_t Imm = LHS->getAsZExtVal();
SDValue LHSlo = (Imm & 0xff) == 0
? DAG.getRegister(Subtarget.getZeroRegister(), MVT::i8)
: DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i8, LHS,
diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp
index 2fe86e75ddae..4d8ace7c1ece 100644
--- a/llvm/lib/Target/BPF/BPFISelLowering.cpp
+++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -151,6 +151,7 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
}
setBooleanContents(ZeroOrOneBooleanContent);
+ setMaxAtomicSizeInBitsSupported(64);
// Function alignments
setMinFunctionAlignment(Align(8));
diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp
index ab0db576f7f7..8a6e7ae3663e 100644
--- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp
@@ -108,7 +108,8 @@ TargetPassConfig *BPFTargetMachine::createPassConfig(PassManagerBase &PM) {
return new BPFPassConfig(*this, PM);
}
-void BPFTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
+void BPFTargetMachine::registerPassBuilderCallbacks(
+ PassBuilder &PB, bool PopulateClassToPassNames) {
PB.registerPipelineParsingCallback(
[](StringRef PassName, FunctionPassManager &FPM,
ArrayRef<PassBuilder::PipelineElement>) {
@@ -148,7 +149,9 @@ void BPFTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
}
void BPFPassConfig::addIRPasses() {
+ addPass(createAtomicExpandPass());
addPass(createBPFCheckAndAdjustIR());
+
TargetPassConfig::addIRPasses();
}
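Taken together, the two BPF hunks above opt the backend into the generic atomic lowering path: setMaxAtomicSizeInBitsSupported(64) declares the widest atomic operation the target handles natively, and scheduling createAtomicExpandPass() in addIRPasses lets AtomicExpand rewrite anything wider into __atomic_* library calls before instruction selection. The Lanai hunks further below apply the same recipe with a limit of 0, so every atomic operation becomes a libcall there. A generic sketch of the wiring, with Foo standing in for a target:

#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"

namespace {
// Hypothetical pass config for a target named Foo; only the IR-pass hook is
// sketched.
class FooPassConfig : public llvm::TargetPassConfig {
public:
  FooPassConfig(llvm::LLVMTargetMachine &TM, llvm::legacy::PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  void addIRPasses() override {
    // Expand atomics the target cannot handle natively (as declared via
    // setMaxAtomicSizeInBitsSupported) before the rest of the IR pipeline.
    addPass(llvm::createAtomicExpandPass());
    TargetPassConfig::addIRPasses();
  }
};
} // namespace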
diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.h b/llvm/lib/Target/BPF/BPFTargetMachine.h
index 4e6adc722e76..0a28394463b2 100644
--- a/llvm/lib/Target/BPF/BPFTargetMachine.h
+++ b/llvm/lib/Target/BPF/BPFTargetMachine.h
@@ -42,7 +42,8 @@ public:
return TLOF.get();
}
- void registerPassBuilderCallbacks(PassBuilder &PB) override;
+ void registerPassBuilderCallbacks(PassBuilder &PB,
+ bool PopulateClassToPassNames) override;
};
}
diff --git a/llvm/lib/Target/DirectX/DXILResourceAnalysis.h b/llvm/lib/Target/DirectX/DXILResourceAnalysis.h
index 8ffa1d7cd9b3..bce41160b95e 100644
--- a/llvm/lib/Target/DirectX/DXILResourceAnalysis.h
+++ b/llvm/lib/Target/DirectX/DXILResourceAnalysis.h
@@ -36,6 +36,7 @@ class DXILResourcePrinterPass : public PassInfoMixin<DXILResourcePrinterPass> {
public:
explicit DXILResourcePrinterPass(raw_ostream &OS) : OS(OS) {}
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+ static bool isRequired() { return true; }
};
/// The legacy pass manager's analysis pass to compute DXIL resource
diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
index d5cb488f2fde..06938f8c74f1 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
@@ -100,7 +100,8 @@ DirectXTargetMachine::DirectXTargetMachine(const Target &T, const Triple &TT,
DirectXTargetMachine::~DirectXTargetMachine() {}
-void DirectXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
+void DirectXTargetMachine::registerPassBuilderCallbacks(
+ PassBuilder &PB, bool PopulateClassToPassNames) {
PB.registerPipelineParsingCallback(
[](StringRef PassName, ModulePassManager &PM,
ArrayRef<PassBuilder::PipelineElement>) {
diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.h b/llvm/lib/Target/DirectX/DirectXTargetMachine.h
index d04c375b2736..428beaf61cd0 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetMachine.h
+++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.h
@@ -47,7 +47,8 @@ public:
}
TargetTransformInfo getTargetTransformInfo(const Function &F) const override;
- void registerPassBuilderCallbacks(PassBuilder &PB) override;
+ void registerPassBuilderCallbacks(PassBuilder &PB,
+ bool PopulateClassToPassNames) override;
};
} // namespace llvm
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index eb5c59672224..defb1f7324f4 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -743,7 +743,7 @@ void HexagonDAGToDAGISel::SelectConstantFP(SDNode *N) {
//
void HexagonDAGToDAGISel::SelectConstant(SDNode *N) {
if (N->getValueType(0) == MVT::i1) {
- assert(!(cast<ConstantSDNode>(N)->getZExtValue() >> 1));
+ assert(!(N->getAsZExtVal() >> 1));
unsigned Opc = (cast<ConstantSDNode>(N)->getSExtValue() != 0)
? Hexagon::PS_true
: Hexagon::PS_false;
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index 665e2d79c83d..81035849491b 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -1256,7 +1256,7 @@ HexagonTargetLowering::extractHvxSubvectorReg(SDValue OrigOp, SDValue VecV,
SDValue IdxV, const SDLoc &dl, MVT ResTy, SelectionDAG &DAG) const {
MVT VecTy = ty(VecV);
unsigned HwLen = Subtarget.getVectorLength();
- unsigned Idx = cast<ConstantSDNode>(IdxV.getNode())->getZExtValue();
+ unsigned Idx = IdxV.getNode()->getAsZExtVal();
MVT ElemTy = VecTy.getVectorElementType();
unsigned ElemWidth = ElemTy.getSizeInBits();
@@ -1299,7 +1299,7 @@ HexagonTargetLowering::extractHvxSubvectorPred(SDValue VecV, SDValue IdxV,
MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);
SDValue ByteVec = DAG.getNode(HexagonISD::Q2V, dl, ByteTy, VecV);
// IdxV is required to be a constant.
- unsigned Idx = cast<ConstantSDNode>(IdxV.getNode())->getZExtValue();
+ unsigned Idx = IdxV.getNode()->getAsZExtVal();
unsigned ResLen = ResTy.getVectorNumElements();
unsigned BitBytes = HwLen / VecTy.getVectorNumElements();
@@ -1801,7 +1801,7 @@ HexagonTargetLowering::LowerHvxExtractSubvector(SDValue Op, SelectionDAG &DAG)
MVT SrcTy = ty(SrcV);
MVT DstTy = ty(Op);
SDValue IdxV = Op.getOperand(1);
- unsigned Idx = cast<ConstantSDNode>(IdxV.getNode())->getZExtValue();
+ unsigned Idx = IdxV.getNode()->getAsZExtVal();
assert(Idx % DstTy.getVectorNumElements() == 0);
(void)Idx;
const SDLoc &dl(Op);
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 590e464e1653..e7a692d67ba0 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -274,7 +274,8 @@ HexagonTargetMachine::getSubtargetImpl(const Function &F) const {
return I.get();
}
-void HexagonTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
+void HexagonTargetMachine::registerPassBuilderCallbacks(
+ PassBuilder &PB, bool PopulateClassToPassNames) {
PB.registerLateLoopOptimizationsEPCallback(
[=](LoopPassManager &LPM, OptimizationLevel Level) {
LPM.addPass(HexagonLoopIdiomRecognitionPass());
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.h b/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
index dddd79ad1fcf..c5fed0cd65a8 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -34,7 +34,8 @@ public:
~HexagonTargetMachine() override;
const HexagonSubtarget *getSubtargetImpl(const Function &F) const override;
- void registerPassBuilderCallbacks(PassBuilder &PB) override;
+ void registerPassBuilderCallbacks(PassBuilder &PB,
+ bool PopulateClassToPassNames) override;
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
TargetTransformInfo getTargetTransformInfo(const Function &F) const override;
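The BPF, DirectX and Hexagon target-machine hunks above (and their headers) are the same interface update: registerPassBuilderCallbacks now takes a PopulateClassToPassNames flag in addition to the PassBuilder. A standalone sketch of the registration logic the override typically wraps; registerFooCallbacks is a hypothetical free function mirroring the new parameter list:

#include "llvm/Passes/PassBuilder.h"

// Hypothetical helper mirroring the updated hook signature.
static void registerFooCallbacks(llvm::PassBuilder &PB,
                                 bool PopulateClassToPassNames) {
  (void)PopulateClassToPassNames; // carried by the new interface
  PB.registerPipelineParsingCallback(
      [](llvm::StringRef Name, llvm::ModulePassManager &MPM,
         llvm::ArrayRef<llvm::PassBuilder::PipelineElement>) {
        // Recognize target-specific pass names here; return false when the
        // name is not handled so other callbacks get a chance.
        return false;
      });
}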
diff --git a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
index 17d7ffb586f4..06de2ff1ae3e 100644
--- a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
+++ b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
@@ -166,6 +166,8 @@ LanaiTargetLowering::LanaiTargetLowering(const TargetMachine &TM,
// Booleans always contain 0 or 1.
setBooleanContents(ZeroOrOneBooleanContent);
+
+ setMaxAtomicSizeInBitsSupported(0);
}
SDValue LanaiTargetLowering::LowerOperation(SDValue Op,
diff --git a/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp b/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
index 039182b3ffe6..33479720183b 100644
--- a/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
+++ b/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
@@ -93,6 +93,7 @@ public:
return getTM<LanaiTargetMachine>();
}
+ void addIRPasses() override;
bool addInstSelector() override;
void addPreSched2() override;
void addPreEmitPass() override;
@@ -104,6 +105,12 @@ LanaiTargetMachine::createPassConfig(PassManagerBase &PassManager) {
return new LanaiPassConfig(*this, &PassManager);
}
+void LanaiPassConfig::addIRPasses() {
+ addPass(createAtomicExpandPass());
+
+ TargetPassConfig::addIRPasses();
+}
+
// Install an instruction selector pass.
bool LanaiPassConfig::addInstSelector() {
addPass(createLanaiISelDag(getLanaiTargetMachine()));
diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
index 66a37fce5dda..46f63a4103f9 100644
--- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
+++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
@@ -121,6 +121,10 @@ class LoongArchAsmParser : public MCTargetAsmParser {
// Helper to emit pseudo instruction "li.w/d $rd, $imm".
void emitLoadImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out);
+ // Helper to emit pseudo instruction "call36 sym" or "tail36 $rj, sym".
+ void emitFuncCall36(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ bool IsTailCall);
+
public:
enum LoongArchMatchResultTy {
Match_Dummy = FIRST_TARGET_MATCH_RESULT_TY,
@@ -400,6 +404,22 @@ public:
IsValidKind;
}
+ bool isSImm20pcaddu18i() const {
+ if (!isImm())
+ return false;
+
+ int64_t Imm;
+ LoongArchMCExpr::VariantKind VK = LoongArchMCExpr::VK_LoongArch_None;
+ bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
+ bool IsValidKind = VK == LoongArchMCExpr::VK_LoongArch_None ||
+ VK == LoongArchMCExpr::VK_LoongArch_CALL36;
+
+ return IsConstantImm
+ ? isInt<20>(Imm) && IsValidKind
+ : LoongArchAsmParser::classifySymbolRef(getImm(), VK) &&
+ IsValidKind;
+ }
+
bool isSImm21lsl2() const {
if (!isImm())
return false;
@@ -1110,6 +1130,35 @@ void LoongArchAsmParser::emitLoadImm(MCInst &Inst, SMLoc IDLoc,
}
}
+void LoongArchAsmParser::emitFuncCall36(MCInst &Inst, SMLoc IDLoc,
+ MCStreamer &Out, bool IsTailCall) {
+ // call36 sym
+ // expands to:
+ // pcaddu18i $ra, %call36(sym)
+ // jirl $ra, $ra, 0
+ //
+ // tail36 $rj, sym
+ // expands to:
+ // pcaddu18i $rj, %call36(sym)
+ // jirl $r0, $rj, 0
+ unsigned ScratchReg =
+ IsTailCall ? Inst.getOperand(0).getReg() : (unsigned)LoongArch::R1;
+ const MCExpr *Sym =
+ IsTailCall ? Inst.getOperand(1).getExpr() : Inst.getOperand(0).getExpr();
+ const LoongArchMCExpr *LE = LoongArchMCExpr::create(
+ Sym, llvm::LoongArchMCExpr::VK_LoongArch_CALL36, getContext());
+
+ Out.emitInstruction(
+ MCInstBuilder(LoongArch::PCADDU18I).addReg(ScratchReg).addExpr(LE),
+ getSTI());
+ Out.emitInstruction(
+ MCInstBuilder(LoongArch::JIRL)
+ .addReg(IsTailCall ? (unsigned)LoongArch::R0 : ScratchReg)
+ .addReg(ScratchReg)
+ .addImm(0),
+ getSTI());
+}
+
bool LoongArchAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
OperandVector &Operands,
MCStreamer &Out) {
@@ -1158,6 +1207,12 @@ bool LoongArchAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
case LoongArch::PseudoLI_D:
emitLoadImm(Inst, IDLoc, Out);
return false;
+ case LoongArch::PseudoCALL36:
+ emitFuncCall36(Inst, IDLoc, Out, /*IsTailCall=*/false);
+ return false;
+ case LoongArch::PseudoTAIL36:
+ emitFuncCall36(Inst, IDLoc, Out, /*IsTailCall=*/true);
+ return false;
}
Out.emitInstruction(Inst, getSTI());
return false;
@@ -1439,6 +1494,12 @@ bool LoongArchAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
/*Upper=*/(1 << 19) - 1,
"operand must be a symbol with modifier (e.g. %pc_hi20) or an integer "
"in the range");
+ case Match_InvalidSImm20pcaddu18i:
+ return generateImmOutOfRangeError(
+ Operands, ErrorInfo, /*Lower=*/-(1 << 19),
+ /*Upper=*/(1 << 19) - 1,
+ "operand must be a symbol with modifier (e.g. %call36) or an integer "
+ "in the range");
case Match_InvalidSImm21lsl2:
return generateImmOutOfRangeError(
Operands, ErrorInfo, /*Lower=*/-(1 << 22), /*Upper=*/(1 << 22) - 4,
diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
index 72c1f1cec198..ad39658f698e 100644
--- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
@@ -62,43 +62,24 @@ private:
MachineBasicBlock::iterator &NextMBBI,
unsigned FlagsHi, unsigned SecondOpcode,
unsigned FlagsLo);
- bool expandLargeAddressLoad(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI,
- unsigned LastOpcode, unsigned IdentifyingMO);
- bool expandLargeAddressLoad(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI,
- unsigned LastOpcode, unsigned IdentifyingMO,
- const MachineOperand &Symbol, Register DestReg,
- bool EraseFromParent);
bool expandLoadAddressPcrel(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI,
- bool Large = false);
+ MachineBasicBlock::iterator &NextMBBI);
bool expandLoadAddressGot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI,
- bool Large = false);
+ MachineBasicBlock::iterator &NextMBBI);
bool expandLoadAddressTLSLE(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI);
bool expandLoadAddressTLSIE(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI,
- bool Large = false);
+ MachineBasicBlock::iterator &NextMBBI);
bool expandLoadAddressTLSLD(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI,
- bool Large = false);
+ MachineBasicBlock::iterator &NextMBBI);
bool expandLoadAddressTLSGD(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI,
- bool Large = false);
- bool expandFunctionCALL(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI,
- bool IsTailCall);
+ MachineBasicBlock::iterator &NextMBBI);
};
char LoongArchPreRAExpandPseudo::ID = 0;
@@ -131,30 +112,16 @@ bool LoongArchPreRAExpandPseudo::expandMI(
switch (MBBI->getOpcode()) {
case LoongArch::PseudoLA_PCREL:
return expandLoadAddressPcrel(MBB, MBBI, NextMBBI);
- case LoongArch::PseudoLA_PCREL_LARGE:
- return expandLoadAddressPcrel(MBB, MBBI, NextMBBI, /*Large=*/true);
case LoongArch::PseudoLA_GOT:
return expandLoadAddressGot(MBB, MBBI, NextMBBI);
- case LoongArch::PseudoLA_GOT_LARGE:
- return expandLoadAddressGot(MBB, MBBI, NextMBBI, /*Large=*/true);
case LoongArch::PseudoLA_TLS_LE:
return expandLoadAddressTLSLE(MBB, MBBI, NextMBBI);
case LoongArch::PseudoLA_TLS_IE:
return expandLoadAddressTLSIE(MBB, MBBI, NextMBBI);
- case LoongArch::PseudoLA_TLS_IE_LARGE:
- return expandLoadAddressTLSIE(MBB, MBBI, NextMBBI, /*Large=*/true);
case LoongArch::PseudoLA_TLS_LD:
return expandLoadAddressTLSLD(MBB, MBBI, NextMBBI);
- case LoongArch::PseudoLA_TLS_LD_LARGE:
- return expandLoadAddressTLSLD(MBB, MBBI, NextMBBI, /*Large=*/true);
case LoongArch::PseudoLA_TLS_GD:
return expandLoadAddressTLSGD(MBB, MBBI, NextMBBI);
- case LoongArch::PseudoLA_TLS_GD_LARGE:
- return expandLoadAddressTLSGD(MBB, MBBI, NextMBBI, /*Large=*/true);
- case LoongArch::PseudoCALL:
- return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/false);
- case LoongArch::PseudoTAIL:
- return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true);
}
return false;
}
@@ -187,118 +154,9 @@ bool LoongArchPreRAExpandPseudo::expandPcalau12iInstPair(
return true;
}
-bool LoongArchPreRAExpandPseudo::expandLargeAddressLoad(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode,
- unsigned IdentifyingMO) {
- MachineInstr &MI = *MBBI;
- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LastOpcode, IdentifyingMO,
- MI.getOperand(2), MI.getOperand(0).getReg(),
- true);
-}
-
-bool LoongArchPreRAExpandPseudo::expandLargeAddressLoad(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode,
- unsigned IdentifyingMO, const MachineOperand &Symbol, Register DestReg,
- bool EraseFromParent) {
- // Code Sequence:
- //
- // Part1: pcalau12i $scratch, %MO1(sym)
- // Part0: addi.d $dest, $zero, %MO0(sym)
- // Part2: lu32i.d $dest, %MO2(sym)
- // Part3: lu52i.d $dest, $dest, %MO3(sym)
- // Fin: LastOpcode $dest, $dest, $scratch
-
- unsigned MO0, MO1, MO2, MO3;
- switch (IdentifyingMO) {
- default:
- llvm_unreachable("unsupported identifying MO");
- case LoongArchII::MO_PCREL_LO:
- MO0 = IdentifyingMO;
- MO1 = LoongArchII::MO_PCREL_HI;
- MO2 = LoongArchII::MO_PCREL64_LO;
- MO3 = LoongArchII::MO_PCREL64_HI;
- break;
- case LoongArchII::MO_GOT_PC_HI:
- case LoongArchII::MO_LD_PC_HI:
- case LoongArchII::MO_GD_PC_HI:
- // These cases relocate just like the GOT case, except for Part1.
- MO0 = LoongArchII::MO_GOT_PC_LO;
- MO1 = IdentifyingMO;
- MO2 = LoongArchII::MO_GOT_PC64_LO;
- MO3 = LoongArchII::MO_GOT_PC64_HI;
- break;
- case LoongArchII::MO_IE_PC_LO:
- MO0 = IdentifyingMO;
- MO1 = LoongArchII::MO_IE_PC_HI;
- MO2 = LoongArchII::MO_IE_PC64_LO;
- MO3 = LoongArchII::MO_IE_PC64_HI;
- break;
- }
-
- MachineFunction *MF = MBB.getParent();
- MachineInstr &MI = *MBBI;
- DebugLoc DL = MI.getDebugLoc();
-
- assert(MF->getSubtarget<LoongArchSubtarget>().is64Bit() &&
- "Large code model requires LA64");
-
- Register TmpPart1 =
- MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
- Register TmpPart0 =
- DestReg.isVirtual()
- ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
- : DestReg;
- Register TmpParts02 =
- DestReg.isVirtual()
- ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
- : DestReg;
- Register TmpParts023 =
- DestReg.isVirtual()
- ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
- : DestReg;
-
- auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), TmpPart1);
- auto Part0 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), TmpPart0)
- .addReg(LoongArch::R0);
- auto Part2 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), TmpParts02)
- // "rj" is needed due to InstrInfo pattern requirement.
- .addReg(TmpPart0, RegState::Kill);
- auto Part3 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), TmpParts023)
- .addReg(TmpParts02, RegState::Kill);
- BuildMI(MBB, MBBI, DL, TII->get(LastOpcode), DestReg)
- .addReg(TmpParts023)
- .addReg(TmpPart1, RegState::Kill);
-
- if (Symbol.getType() == MachineOperand::MO_ExternalSymbol) {
- const char *SymName = Symbol.getSymbolName();
- Part0.addExternalSymbol(SymName, MO0);
- Part1.addExternalSymbol(SymName, MO1);
- Part2.addExternalSymbol(SymName, MO2);
- Part3.addExternalSymbol(SymName, MO3);
- } else {
- Part0.addDisp(Symbol, 0, MO0);
- Part1.addDisp(Symbol, 0, MO1);
- Part2.addDisp(Symbol, 0, MO2);
- Part3.addDisp(Symbol, 0, MO3);
- }
-
- if (EraseFromParent)
- MI.eraseFromParent();
-
- return true;
-}
-
bool LoongArchPreRAExpandPseudo::expandLoadAddressPcrel(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI, bool Large) {
- if (Large)
- // Emit the 5-insn large address load sequence with the `%pc` family of
- // relocs.
- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
- LoongArchII::MO_PCREL_LO);
-
+ MachineBasicBlock::iterator &NextMBBI) {
// Code Sequence:
// pcalau12i $rd, %pc_hi20(sym)
// addi.w/d $rd, $rd, %pc_lo12(sym)
@@ -311,13 +169,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressPcrel(
bool LoongArchPreRAExpandPseudo::expandLoadAddressGot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI, bool Large) {
- if (Large)
- // Emit the 5-insn large address load sequence with the `%got_pc` family
- // of relocs, loading the result from GOT with `ldx.d` in the end.
- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D,
- LoongArchII::MO_GOT_PC_HI);
-
+ MachineBasicBlock::iterator &NextMBBI) {
// Code Sequence:
// pcalau12i $rd, %got_pc_hi20(sym)
// ld.w/d $rd, $rd, %got_pc_lo12(sym)
@@ -378,13 +230,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLE(
bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSIE(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI, bool Large) {
- if (Large)
- // Emit the 5-insn large address load sequence with the `%ie_pc` family
- // of relocs, loading the result with `ldx.d` in the end.
- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D,
- LoongArchII::MO_IE_PC_LO);
-
+ MachineBasicBlock::iterator &NextMBBI) {
// Code Sequence:
// pcalau12i $rd, %ie_pc_hi20(sym)
// ld.w/d $rd, $rd, %ie_pc_lo12(sym)
@@ -397,13 +243,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSIE(
bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLD(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI, bool Large) {
- if (Large)
- // Emit the 5-insn large address load sequence with the `%got_pc` family
- // of relocs, with the `pcalau12i` insn relocated with `%ld_pc_hi20`.
- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
- LoongArchII::MO_LD_PC_HI);
-
+ MachineBasicBlock::iterator &NextMBBI) {
// Code Sequence:
// pcalau12i $rd, %ld_pc_hi20(sym)
// addi.w/d $rd, $rd, %got_pc_lo12(sym)
@@ -416,13 +256,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLD(
bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSGD(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI, bool Large) {
- if (Large)
- // Emit the 5-insn large address load sequence with the `%got_pc` family
- // of relocs, with the `pcalau12i` insn relocated with `%gd_pc_hi20`.
- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
- LoongArchII::MO_GD_PC_HI);
-
+ MachineBasicBlock::iterator &NextMBBI) {
// Code Sequence:
// pcalau12i $rd, %gd_pc_hi20(sym)
// addi.w/d $rd, $rd, %got_pc_lo12(sym)
@@ -433,88 +267,6 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSGD(
SecondOpcode, LoongArchII::MO_GOT_PC_LO);
}
-bool LoongArchPreRAExpandPseudo::expandFunctionCALL(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI, bool IsTailCall) {
- MachineFunction *MF = MBB.getParent();
- MachineInstr &MI = *MBBI;
- DebugLoc DL = MI.getDebugLoc();
- const MachineOperand &Func = MI.getOperand(0);
- MachineInstrBuilder CALL;
- unsigned Opcode;
-
- switch (MF->getTarget().getCodeModel()) {
- default:
- report_fatal_error("Unsupported code model");
- break;
- case CodeModel::Small: {
- // CALL:
- // bl func
- // TAIL:
- // b func
- Opcode = IsTailCall ? LoongArch::PseudoB_TAIL : LoongArch::BL;
- CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).add(Func);
- break;
- }
- case CodeModel::Medium: {
- // CALL:
- // pcalau12i $ra, %pc_hi20(func)
- // jirl $ra, $ra, %pc_lo12(func)
- // TAIL:
- // pcalau12i $scratch, %pc_hi20(func)
- // jirl $r0, $scratch, %pc_lo12(func)
- Opcode =
- IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL;
- Register ScratchReg =
- IsTailCall
- ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
- : LoongArch::R1;
- MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), ScratchReg);
- CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg);
- if (Func.isSymbol()) {
- const char *FnName = Func.getSymbolName();
- MIB.addExternalSymbol(FnName, LoongArchII::MO_PCREL_HI);
- CALL.addExternalSymbol(FnName, LoongArchII::MO_PCREL_LO);
- break;
- }
- assert(Func.isGlobal() && "Expected a GlobalValue at this time");
- const GlobalValue *GV = Func.getGlobal();
- MIB.addGlobalAddress(GV, 0, LoongArchII::MO_PCREL_HI);
- CALL.addGlobalAddress(GV, 0, LoongArchII::MO_PCREL_LO);
- break;
- }
- case CodeModel::Large: {
- // Emit the 5-insn large address load sequence, either directly or
- // indirectly in case of going through the GOT, then JIRL_TAIL or
- // JIRL_CALL to $addr.
- Opcode =
- IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL;
- Register AddrReg =
- IsTailCall
- ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
- : LoongArch::R1;
-
- bool UseGOT = Func.isGlobal() && !Func.getGlobal()->isDSOLocal();
- unsigned MO = UseGOT ? LoongArchII::MO_GOT_PC_HI : LoongArchII::MO_PCREL_LO;
- unsigned LAOpcode = UseGOT ? LoongArch::LDX_D : LoongArch::ADD_D;
- expandLargeAddressLoad(MBB, MBBI, NextMBBI, LAOpcode, MO, Func, AddrReg,
- false);
- CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(AddrReg).addImm(0);
- break;
- }
- }
-
- // Transfer implicit operands.
- CALL.copyImplicitOps(MI);
-
- // Transfer MI flags.
- CALL.setMIFlags(MI.getFlags());
-
- MI.eraseFromParent();
- return true;
-}
-
class LoongArchExpandPseudo : public MachineFunctionPass {
public:
const LoongArchInstrInfo *TII;
@@ -536,6 +288,35 @@ private:
MachineBasicBlock::iterator &NextMBBI);
bool expandCopyCFR(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI);
+ bool expandLargeAddressLoad(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI,
+ unsigned LastOpcode, unsigned IdentifyingMO);
+ bool expandLargeAddressLoad(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI,
+ unsigned LastOpcode, unsigned IdentifyingMO,
+ const MachineOperand &Symbol, Register DestReg,
+ bool EraseFromParent);
+ bool expandLoadAddressPcrelLarge(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
+ bool expandLoadAddressGotLarge(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
+ bool expandLoadAddressTLSIELarge(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
+ bool expandLoadAddressTLSLDLarge(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
+ bool expandLoadAddressTLSGDLarge(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
+ bool expandFunctionCALL(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI,
+ bool IsTailCall);
};
char LoongArchExpandPseudo::ID = 0;
@@ -570,6 +351,24 @@ bool LoongArchExpandPseudo::expandMI(MachineBasicBlock &MBB,
switch (MBBI->getOpcode()) {
case LoongArch::PseudoCopyCFR:
return expandCopyCFR(MBB, MBBI, NextMBBI);
+ case LoongArch::PseudoLA_PCREL_LARGE:
+ return expandLoadAddressPcrelLarge(MBB, MBBI, NextMBBI);
+ case LoongArch::PseudoLA_GOT_LARGE:
+ return expandLoadAddressGotLarge(MBB, MBBI, NextMBBI);
+ case LoongArch::PseudoLA_TLS_IE_LARGE:
+ return expandLoadAddressTLSIELarge(MBB, MBBI, NextMBBI);
+ case LoongArch::PseudoLA_TLS_LD_LARGE:
+ return expandLoadAddressTLSLDLarge(MBB, MBBI, NextMBBI);
+ case LoongArch::PseudoLA_TLS_GD_LARGE:
+ return expandLoadAddressTLSGDLarge(MBB, MBBI, NextMBBI);
+ case LoongArch::PseudoCALL:
+ case LoongArch::PseudoCALL_MEDIUM:
+ case LoongArch::PseudoCALL_LARGE:
+ return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/false);
+ case LoongArch::PseudoTAIL:
+ case LoongArch::PseudoTAIL_MEDIUM:
+ case LoongArch::PseudoTAIL_LARGE:
+ return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true);
}
return false;
@@ -628,6 +427,212 @@ bool LoongArchExpandPseudo::expandCopyCFR(
return true;
}
+bool LoongArchExpandPseudo::expandLargeAddressLoad(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode,
+ unsigned IdentifyingMO) {
+ MachineInstr &MI = *MBBI;
+ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LastOpcode, IdentifyingMO,
+ MI.getOperand(2), MI.getOperand(0).getReg(),
+ true);
+}
+
+bool LoongArchExpandPseudo::expandLargeAddressLoad(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode,
+ unsigned IdentifyingMO, const MachineOperand &Symbol, Register DestReg,
+ bool EraseFromParent) {
+ // Code Sequence:
+ //
+ // Part1: pcalau12i $dst, %MO1(sym)
+ // Part0: addi.d $t8, $zero, %MO0(sym)
+ // Part2: lu32i.d $t8, %MO2(sym)
+ // Part3: lu52i.d $t8, $t8, %MO3(sym)
+ // Fin: LastOpcode $dst, $t8, $dst
+
+ unsigned MO0, MO1, MO2, MO3;
+ switch (IdentifyingMO) {
+ default:
+ llvm_unreachable("unsupported identifying MO");
+ case LoongArchII::MO_PCREL_LO:
+ MO0 = IdentifyingMO;
+ MO1 = LoongArchII::MO_PCREL_HI;
+ MO2 = LoongArchII::MO_PCREL64_LO;
+ MO3 = LoongArchII::MO_PCREL64_HI;
+ break;
+ case LoongArchII::MO_GOT_PC_HI:
+ case LoongArchII::MO_LD_PC_HI:
+ case LoongArchII::MO_GD_PC_HI:
+ // These cases relocate just like the GOT case, except for Part1.
+ MO0 = LoongArchII::MO_GOT_PC_LO;
+ MO1 = IdentifyingMO;
+ MO2 = LoongArchII::MO_GOT_PC64_LO;
+ MO3 = LoongArchII::MO_GOT_PC64_HI;
+ break;
+ case LoongArchII::MO_IE_PC_LO:
+ MO0 = IdentifyingMO;
+ MO1 = LoongArchII::MO_IE_PC_HI;
+ MO2 = LoongArchII::MO_IE_PC64_LO;
+ MO3 = LoongArchII::MO_IE_PC64_HI;
+ break;
+ }
+
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+ Register ScratchReg = LoongArch::R20; // $t8
+
+ assert(MBB.getParent()->getSubtarget<LoongArchSubtarget>().is64Bit() &&
+ "Large code model requires LA64");
+
+ auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), DestReg);
+ auto Part0 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), ScratchReg)
+ .addReg(LoongArch::R0);
+ auto Part2 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), ScratchReg)
+ // "rj" is needed due to InstrInfo pattern requirement.
+ .addReg(ScratchReg);
+ auto Part3 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), ScratchReg)
+ .addReg(ScratchReg);
+ BuildMI(MBB, MBBI, DL, TII->get(LastOpcode), DestReg)
+ .addReg(ScratchReg)
+ .addReg(DestReg);
+
+ if (Symbol.getType() == MachineOperand::MO_ExternalSymbol) {
+ const char *SymName = Symbol.getSymbolName();
+ Part0.addExternalSymbol(SymName, MO0);
+ Part1.addExternalSymbol(SymName, MO1);
+ Part2.addExternalSymbol(SymName, MO2);
+ Part3.addExternalSymbol(SymName, MO3);
+ } else {
+ Part0.addDisp(Symbol, 0, MO0);
+ Part1.addDisp(Symbol, 0, MO1);
+ Part2.addDisp(Symbol, 0, MO2);
+ Part3.addDisp(Symbol, 0, MO3);
+ }
+
+ if (EraseFromParent)
+ MI.eraseFromParent();
+
+ return true;
+}
+
+bool LoongArchExpandPseudo::expandLoadAddressPcrelLarge(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+ // Emit the 5-insn large address load sequence with the `%pc` family of
+ // relocs.
+ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
+ LoongArchII::MO_PCREL_LO);
+}
+
+bool LoongArchExpandPseudo::expandLoadAddressGotLarge(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+ // Emit the 5-insn large address load sequence with the `%got_pc` family
+ // of relocs, loading the result from GOT with `ldx.d` in the end.
+ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D,
+ LoongArchII::MO_GOT_PC_HI);
+}
+
+bool LoongArchExpandPseudo::expandLoadAddressTLSIELarge(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+ // Emit the 5-insn large address load sequence with the `%ie_pc` family
+ // of relocs, loading the result with `ldx.d` in the end.
+ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D,
+ LoongArchII::MO_IE_PC_LO);
+}
+
+bool LoongArchExpandPseudo::expandLoadAddressTLSLDLarge(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+ // Emit the 5-insn large address load sequence with the `%got_pc` family
+ // of relocs, with the `pcalau12i` insn relocated with `%ld_pc_hi20`.
+ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
+ LoongArchII::MO_LD_PC_HI);
+}
+
+bool LoongArchExpandPseudo::expandLoadAddressTLSGDLarge(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+ // Emit the 5-insn large address load sequence with the `%got_pc` family
+ // of relocs, with the `pcalau12i` insn relocated with `%gd_pc_hi20`.
+ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
+ LoongArchII::MO_GD_PC_HI);
+}
+
+bool LoongArchExpandPseudo::expandFunctionCALL(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI, bool IsTailCall) {
+ MachineFunction *MF = MBB.getParent();
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+ const MachineOperand &Func = MI.getOperand(0);
+ MachineInstrBuilder CALL;
+ unsigned Opcode;
+
+ switch (MF->getTarget().getCodeModel()) {
+ default:
+ report_fatal_error("Unsupported code model");
+ break;
+ case CodeModel::Small: {
+ // CALL:
+ // bl func
+ // TAIL:
+ // b func
+ Opcode = IsTailCall ? LoongArch::PseudoB_TAIL : LoongArch::BL;
+ CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).add(Func);
+ break;
+ }
+ case CodeModel::Medium: {
+ // CALL:
+ // pcaddu18i $ra, %call36(func)
+ // jirl $ra, $ra, 0
+ // TAIL:
+ // pcaddu18i $t8, %call36(func)
+ // jr $t8
+ Opcode =
+ IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL;
+ Register ScratchReg = IsTailCall ? LoongArch::R20 : LoongArch::R1;
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCADDU18I), ScratchReg);
+
+ CALL =
+ BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg).addImm(0);
+
+ if (Func.isSymbol())
+ MIB.addExternalSymbol(Func.getSymbolName(), LoongArchII::MO_CALL36);
+ else
+ MIB.addDisp(Func, 0, LoongArchII::MO_CALL36);
+ break;
+ }
+ case CodeModel::Large: {
+ // Emit the 5-insn large address load sequence, either directly or
+ // indirectly in case of going through the GOT, then JIRL_TAIL or
+ // JIRL_CALL to $addr.
+ Opcode =
+ IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL;
+ Register AddrReg = IsTailCall ? LoongArch::R19 : LoongArch::R1;
+
+ bool UseGOT = Func.isGlobal() && !Func.getGlobal()->isDSOLocal();
+ unsigned MO = UseGOT ? LoongArchII::MO_GOT_PC_HI : LoongArchII::MO_PCREL_LO;
+ unsigned LAOpcode = UseGOT ? LoongArch::LDX_D : LoongArch::ADD_D;
+ expandLargeAddressLoad(MBB, MBBI, NextMBBI, LAOpcode, MO, Func, AddrReg,
+ false);
+ CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(AddrReg).addImm(0);
+ break;
+ }
+ }
+
+ // Transfer implicit operands.
+ CALL.copyImplicitOps(MI);
+
+ // Transfer MI flags.
+ CALL.setMIFlags(MI.getFlags());
+
+ MI.eraseFromParent();
+ return true;
+}
+
} // end namespace
INITIALIZE_PASS(LoongArchPreRAExpandPseudo, "loongarch-prera-expand-pseudo",
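The net effect in this file: the large-code-model address loads and all call/tail expansions move from the pre-register-allocation pass into the post-RA LoongArchExpandPseudo pass. After register allocation the expansion can no longer call createVirtualRegister(), so it writes through fixed physical registers (LoongArch::R20, shown as $t8 in the comments, plus R19 as the address register for large tail calls), which is also why the pseudo definitions later in the patch gain Defs and Size annotations. An illustrative fragment of that constraint; the opcode and registers are parameters here rather than real LoongArch names:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

// Illustrative only. Post-RA, the destination is a fixed physical scratch
// register; the relocated symbol operand is appended afterwards with
// addDisp()/addExternalSymbol(), exactly as the pass above does.
static llvm::MachineInstrBuilder
emitScratchAddi(llvm::MachineBasicBlock &MBB,
                llvm::MachineBasicBlock::iterator MBBI,
                const llvm::DebugLoc &DL, const llvm::TargetInstrInfo *TII,
                unsigned AddiOpc, llvm::Register ScratchReg,
                llvm::Register ZeroReg) {
  return llvm::BuildMI(MBB, MBBI, DL, TII->get(AddiOpc), ScratchReg)
      .addReg(ZeroReg);
}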
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index e14bbadf9ed2..70f782b81270 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -525,8 +525,7 @@ LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
if (isa<ConstantSDNode>(Idx) &&
(EltTy == MVT::i32 || EltTy == MVT::i64 || EltTy == MVT::f32 ||
- EltTy == MVT::f64 ||
- cast<ConstantSDNode>(Idx)->getZExtValue() < NumElts / 2))
+ EltTy == MVT::f64 || Idx->getAsZExtVal() < NumElts / 2))
return Op;
return SDValue();
@@ -762,12 +761,13 @@ static SDValue getTargetNode(JumpTableSDNode *N, SDLoc DL, EVT Ty,
template <class NodeTy>
SDValue LoongArchTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
+ CodeModel::Model M,
bool IsLocal) const {
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0);
- switch (DAG.getTarget().getCodeModel()) {
+ switch (M) {
default:
report_fatal_error("Unsupported code model");
@@ -808,24 +808,35 @@ SDValue LoongArchTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
SDValue LoongArchTargetLowering::lowerBlockAddress(SDValue Op,
SelectionDAG &DAG) const {
- return getAddr(cast<BlockAddressSDNode>(Op), DAG);
+ return getAddr(cast<BlockAddressSDNode>(Op), DAG,
+ DAG.getTarget().getCodeModel());
}
SDValue LoongArchTargetLowering::lowerJumpTable(SDValue Op,
SelectionDAG &DAG) const {
- return getAddr(cast<JumpTableSDNode>(Op), DAG);
+ return getAddr(cast<JumpTableSDNode>(Op), DAG,
+ DAG.getTarget().getCodeModel());
}
SDValue LoongArchTargetLowering::lowerConstantPool(SDValue Op,
SelectionDAG &DAG) const {
- return getAddr(cast<ConstantPoolSDNode>(Op), DAG);
+ return getAddr(cast<ConstantPoolSDNode>(Op), DAG,
+ DAG.getTarget().getCodeModel());
}
SDValue LoongArchTargetLowering::lowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
assert(N->getOffset() == 0 && "unexpected offset in global node");
- return getAddr(N, DAG, N->getGlobal()->isDSOLocal());
+ auto CM = DAG.getTarget().getCodeModel();
+ const GlobalValue *GV = N->getGlobal();
+
+ if (GV->isDSOLocal() && isa<GlobalVariable>(GV)) {
+ if (auto GCM = dyn_cast<GlobalVariable>(GV)->getCodeModel())
+ CM = *GCM;
+ }
+
+ return getAddr(N, DAG, CM, GV->isDSOLocal());
}
SDValue LoongArchTargetLowering::getStaticTLSAddr(GlobalAddressSDNode *N,
@@ -1383,28 +1394,28 @@ SDValue LoongArchTargetLowering::lowerINTRINSIC_VOID(SDValue Op,
if (IntrinsicEnum == Intrinsic::loongarch_cacop_w && Subtarget.is64Bit())
return emitIntrinsicErrorMessage(Op, ErrorMsgReqLA32, DAG);
// call void @llvm.loongarch.cacop.[d/w](uimm5, rj, simm12)
- unsigned Imm1 = cast<ConstantSDNode>(Op2)->getZExtValue();
+ unsigned Imm1 = Op2->getAsZExtVal();
int Imm2 = cast<ConstantSDNode>(Op.getOperand(4))->getSExtValue();
if (!isUInt<5>(Imm1) || !isInt<12>(Imm2))
return emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG);
return Op;
}
case Intrinsic::loongarch_dbar: {
- unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
+ unsigned Imm = Op2->getAsZExtVal();
return !isUInt<15>(Imm)
? emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG)
: DAG.getNode(LoongArchISD::DBAR, DL, MVT::Other, Chain,
DAG.getConstant(Imm, DL, GRLenVT));
}
case Intrinsic::loongarch_ibar: {
- unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
+ unsigned Imm = Op2->getAsZExtVal();
return !isUInt<15>(Imm)
? emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG)
: DAG.getNode(LoongArchISD::IBAR, DL, MVT::Other, Chain,
DAG.getConstant(Imm, DL, GRLenVT));
}
case Intrinsic::loongarch_break: {
- unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
+ unsigned Imm = Op2->getAsZExtVal();
return !isUInt<15>(Imm)
? emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG)
: DAG.getNode(LoongArchISD::BREAK, DL, MVT::Other, Chain,
@@ -1413,7 +1424,7 @@ SDValue LoongArchTargetLowering::lowerINTRINSIC_VOID(SDValue Op,
case Intrinsic::loongarch_movgr2fcsr: {
if (!Subtarget.hasBasicF())
return emitIntrinsicErrorMessage(Op, ErrorMsgReqF, DAG);
- unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
+ unsigned Imm = Op2->getAsZExtVal();
return !isUInt<2>(Imm)
? emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG)
: DAG.getNode(LoongArchISD::MOVGR2FCSR, DL, MVT::Other, Chain,
@@ -1422,7 +1433,7 @@ SDValue LoongArchTargetLowering::lowerINTRINSIC_VOID(SDValue Op,
Op.getOperand(3)));
}
case Intrinsic::loongarch_syscall: {
- unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
+ unsigned Imm = Op2->getAsZExtVal();
return !isUInt<15>(Imm)
? emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG)
: DAG.getNode(LoongArchISD::SYSCALL, DL, MVT::Other, Chain,
@@ -1925,7 +1936,7 @@ void LoongArchTargetLowering::ReplaceNodeResults(
emitErrorAndReplaceIntrinsicResults(N, Results, DAG, ErrorMsgReqF);
return;
}
- unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
+ unsigned Imm = Op2->getAsZExtVal();
if (!isUInt<2>(Imm)) {
emitErrorAndReplaceIntrinsicResults(N, Results, DAG, ErrorMsgOOR);
return;
@@ -1981,7 +1992,7 @@ void LoongArchTargetLowering::ReplaceNodeResults(
CSR_CASE(iocsrrd_d);
#undef CSR_CASE
case Intrinsic::loongarch_csrrd_w: {
- unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
+ unsigned Imm = Op2->getAsZExtVal();
if (!isUInt<14>(Imm)) {
emitErrorAndReplaceIntrinsicResults(N, Results, DAG, ErrorMsgOOR);
return;
@@ -3381,8 +3392,12 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
// TODO: Add more target-dependent nodes later.
NODE_NAME_CASE(CALL)
+ NODE_NAME_CASE(CALL_MEDIUM)
+ NODE_NAME_CASE(CALL_LARGE)
NODE_NAME_CASE(RET)
NODE_NAME_CASE(TAIL)
+ NODE_NAME_CASE(TAIL_MEDIUM)
+ NODE_NAME_CASE(TAIL_LARGE)
NODE_NAME_CASE(SLL_W)
NODE_NAME_CASE(SRA_W)
NODE_NAME_CASE(SRL_W)
@@ -4240,15 +4255,31 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Emit the call.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ unsigned Op;
+ switch (DAG.getTarget().getCodeModel()) {
+ default:
+ report_fatal_error("Unsupported code model");
+ case CodeModel::Small:
+ Op = IsTailCall ? LoongArchISD::TAIL : LoongArchISD::CALL;
+ break;
+ case CodeModel::Medium:
+ assert(Subtarget.is64Bit() && "Medium code model requires LA64");
+ Op = IsTailCall ? LoongArchISD::TAIL_MEDIUM : LoongArchISD::CALL_MEDIUM;
+ break;
+ case CodeModel::Large:
+ assert(Subtarget.is64Bit() && "Large code model requires LA64");
+ Op = IsTailCall ? LoongArchISD::TAIL_LARGE : LoongArchISD::CALL_LARGE;
+ break;
+ }
if (IsTailCall) {
MF.getFrameInfo().setHasTailCall();
- SDValue Ret = DAG.getNode(LoongArchISD::TAIL, DL, NodeTys, Ops);
+ SDValue Ret = DAG.getNode(Op, DL, NodeTys, Ops);
DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
return Ret;
}
- Chain = DAG.getNode(LoongArchISD::CALL, DL, NodeTys, Ops);
+ Chain = DAG.getNode(Op, DL, NodeTys, Ops);
DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
Glue = Chain.getValue(1);
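Two related changes land in the lowering hunks above: LowerCall now selects a code-model-specific call node (CALL/TAIL, the new *_MEDIUM variants, or *_LARGE), and getAddr takes an explicit CodeModel::Model so that lowerGlobalAddress can honour a per-global code model attached to a dso-local GlobalVariable. A minimal sketch of that per-global query; pickCodeModel is a hypothetical helper:

#include "llvm/IR/GlobalVariable.h"
#include "llvm/Support/CodeGen.h"

// Hypothetical helper condensing the lookup used in lowerGlobalAddress above.
static llvm::CodeModel::Model pickCodeModel(const llvm::GlobalValue *GV,
                                            llvm::CodeModel::Model ModuleCM) {
  if (GV->isDSOLocal())
    if (const auto *GVar = llvm::dyn_cast<llvm::GlobalVariable>(GV))
      if (auto PerGlobalCM = GVar->getCodeModel())
        return *PerGlobalCM; // the global's own code model wins
  return ModuleCM;
}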
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 6f8878f9ccd5..72182623b2c3 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -27,8 +27,12 @@ enum NodeType : unsigned {
// TODO: add more LoongArchISDs
CALL,
+ CALL_MEDIUM,
+ CALL_LARGE,
RET,
TAIL,
+ TAIL_MEDIUM,
+ TAIL_LARGE,
// 32-bit shifts, directly matching the semantics of the named LoongArch
// instructions.
@@ -250,7 +254,8 @@ private:
LoongArchCCAssignFn Fn) const;
template <class NodeTy>
- SDValue getAddr(NodeTy *N, SelectionDAG &DAG, bool IsLocal = true) const;
+ SDValue getAddr(NodeTy *N, SelectionDAG &DAG, CodeModel::Model M,
+ bool IsLocal = true) const;
SDValue getStaticTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG,
unsigned Opc, bool Large = false) const;
SDValue getDynamicTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG,
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index 2fea0f33e9eb..78074c012876 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -69,6 +69,18 @@ def loongarch_ret : SDNode<"LoongArchISD::RET", SDTNone,
def loongarch_tail : SDNode<"LoongArchISD::TAIL", SDT_LoongArchCall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
SDNPVariadic]>;
+def loongarch_call_medium : SDNode<"LoongArchISD::CALL_MEDIUM", SDT_LoongArchCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def loongarch_tail_medium : SDNode<"LoongArchISD::TAIL_MEDIUM", SDT_LoongArchCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def loongarch_call_large : SDNode<"LoongArchISD::CALL_LARGE", SDT_LoongArchCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def loongarch_tail_large : SDNode<"LoongArchISD::TAIL_LARGE", SDT_LoongArchCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
def loongarch_sll_w : SDNode<"LoongArchISD::SLL_W", SDT_LoongArchIntBinOpW>;
def loongarch_sra_w : SDNode<"LoongArchISD::SRA_W", SDT_LoongArchIntBinOpW>;
def loongarch_srl_w : SDNode<"LoongArchISD::SRL_W", SDT_LoongArchIntBinOpW>;
@@ -377,6 +389,10 @@ def simm20_lu32id : SImm20Operand {
let ParserMatchClass = SImmAsmOperand<20, "lu32id">;
}
+def simm20_pcaddu18i : SImm20Operand {
+ let ParserMatchClass = SImmAsmOperand<20, "pcaddu18i">;
+}
+
def simm21_lsl2 : Operand<OtherVT> {
let ParserMatchClass = SImmAsmOperand<21, "lsl2">;
let EncoderMethod = "getImmOpValueAsr<2>";
@@ -832,7 +848,7 @@ def LU32I_D : Fmt1RI20<0x16000000, (outs GPR:$dst),
"$rd, $imm20">;
}
def LU52I_D : ALU_2RI12<0x03000000, simm12_lu52id>;
-def PCADDU18I : ALU_1RI20<0x1e000000, simm20>;
+def PCADDU18I : ALU_1RI20<0x1e000000, simm20_pcaddu18i>;
def MUL_D : ALU_3R<0x001d8000>;
def MULH_D : ALU_3R<0x001e0000>;
def MULH_DU : ALU_3R<0x001e8000>;
@@ -1395,16 +1411,43 @@ def : Pat<(brind GPR:$rj), (PseudoBRIND GPR:$rj, 0)>;
def : Pat<(brind (add GPR:$rj, simm16_lsl2:$imm16)),
(PseudoBRIND GPR:$rj, simm16_lsl2:$imm16)>;
+// Function call with 'Small' code model.
let isCall = 1, Defs = [R1] in
-def PseudoCALL : Pseudo<(outs), (ins simm26_symbol:$func)>;
+def PseudoCALL : Pseudo<(outs), (ins bare_symbol:$func)>;
def : Pat<(loongarch_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>;
def : Pat<(loongarch_call texternalsym:$func), (PseudoCALL texternalsym:$func)>;
+// Function call with 'Medium' code model.
+let isCall = 1, Defs = [R1, R20], Size = 8 in
+def PseudoCALL_MEDIUM : Pseudo<(outs), (ins bare_symbol:$func)>;
+
+let Predicates = [IsLA64] in {
+def : Pat<(loongarch_call_medium tglobaladdr:$func),
+ (PseudoCALL_MEDIUM tglobaladdr:$func)>;
+def : Pat<(loongarch_call_medium texternalsym:$func),
+ (PseudoCALL_MEDIUM texternalsym:$func)>;
+} // Predicates = [IsLA64]
+
+// Function call with 'Large' code model.
+let isCall = 1, Defs = [R1, R20], Size = 24 in
+def PseudoCALL_LARGE: Pseudo<(outs), (ins bare_symbol:$func)>;
+
+let Predicates = [IsLA64] in {
+def : Pat<(loongarch_call_large tglobaladdr:$func),
+ (PseudoCALL_LARGE tglobaladdr:$func)>;
+def : Pat<(loongarch_call_large texternalsym:$func),
+ (PseudoCALL_LARGE texternalsym:$func)>;
+} // Predicates = [IsLA64]
+
let isCall = 1, Defs = [R1] in
def PseudoCALLIndirect : Pseudo<(outs), (ins GPR:$rj),
[(loongarch_call GPR:$rj)]>,
PseudoInstExpansion<(JIRL R1, GPR:$rj, 0)>;
+let Predicates = [IsLA64] in {
+def : Pat<(loongarch_call_medium GPR:$rj), (PseudoCALLIndirect GPR:$rj)>;
+def : Pat<(loongarch_call_large GPR:$rj), (PseudoCALLIndirect GPR:$rj)>;
+}
let isCall = 1, hasSideEffects = 0, mayStore = 0, mayLoad = 0, Defs = [R1] in
def PseudoJIRL_CALL : Pseudo<(outs), (ins GPR:$rj, simm16_lsl2:$imm16)>,
@@ -1415,18 +1458,47 @@ let isBarrier = 1, isReturn = 1, isTerminator = 1 in
def PseudoRET : Pseudo<(outs), (ins), [(loongarch_ret)]>,
PseudoInstExpansion<(JIRL R0, R1, 0)>;
+// Tail call with 'Small' code model.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in
-def PseudoTAIL : Pseudo<(outs), (ins simm26_symbol:$dst)>;
+def PseudoTAIL : Pseudo<(outs), (ins bare_symbol:$dst)>;
def : Pat<(loongarch_tail (iPTR tglobaladdr:$dst)),
(PseudoTAIL tglobaladdr:$dst)>;
def : Pat<(loongarch_tail (iPTR texternalsym:$dst)),
(PseudoTAIL texternalsym:$dst)>;
+// Tail call with 'Medium' code model.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
+ Uses = [R3], Defs = [R20], Size = 8 in
+def PseudoTAIL_MEDIUM : Pseudo<(outs), (ins bare_symbol:$dst)>;
+
+let Predicates = [IsLA64] in {
+def : Pat<(loongarch_tail_medium (iPTR tglobaladdr:$dst)),
+ (PseudoTAIL_MEDIUM tglobaladdr:$dst)>;
+def : Pat<(loongarch_tail_medium (iPTR texternalsym:$dst)),
+ (PseudoTAIL_MEDIUM texternalsym:$dst)>;
+} // Predicates = [IsLA64]
+
+// Tail call with 'Large' code model.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
+ Uses = [R3], Defs = [R19, R20], Size = 24 in
+def PseudoTAIL_LARGE : Pseudo<(outs), (ins bare_symbol:$dst)>;
+
+let Predicates = [IsLA64] in {
+def : Pat<(loongarch_tail_large (iPTR tglobaladdr:$dst)),
+ (PseudoTAIL_LARGE tglobaladdr:$dst)>;
+def : Pat<(loongarch_tail_large (iPTR texternalsym:$dst)),
+ (PseudoTAIL_LARGE texternalsym:$dst)>;
+} // Predicates = [IsLA64]
+
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in
def PseudoTAILIndirect : Pseudo<(outs), (ins GPRT:$rj),
[(loongarch_tail GPRT:$rj)]>,
PseudoInstExpansion<(JIRL R0, GPR:$rj, 0)>;
+let Predicates = [IsLA64] in {
+def : Pat<(loongarch_tail_medium GPR:$rj), (PseudoTAILIndirect GPR:$rj)>;
+def : Pat<(loongarch_tail_large GPR:$rj), (PseudoTAILIndirect GPR:$rj)>;
+}
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
hasSideEffects = 0, mayStore = 0, mayLoad = 0, Uses = [R3] in
@@ -1439,6 +1511,19 @@ def PseudoJIRL_TAIL : Pseudo<(outs), (ins GPR:$rj, simm16_lsl2:$imm16)>,
PseudoInstExpansion<(JIRL R0, GPR:$rj,
simm16_lsl2:$imm16)>;
+/// call36/tail36 macro instructions
+let isCall = 1, isBarrier = 1, isCodeGenOnly = 0, isAsmParserOnly = 1,
+ Defs = [R1], Size = 8, hasSideEffects = 0, mayStore = 0, mayLoad = 0 in
+def PseudoCALL36 : Pseudo<(outs), (ins bare_symbol:$dst), [],
+ "call36", "$dst">,
+ Requires<[IsLA64]>;
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3],
+ isCodeGenOnly = 0, isAsmParserOnly = 1, Size = 8, hasSideEffects = 0,
+ mayStore = 0, mayLoad = 0 in
+def PseudoTAIL36 : Pseudo<(outs), (ins GPR:$tmp, bare_symbol:$dst), [],
+ "tail36", "$tmp, $dst">,
+ Requires<[IsLA64]>;
+
/// Load address (la*) macro instructions.
// Define isCodeGenOnly = 0 to expose them to tablegened assembly parser.
@@ -1451,6 +1536,7 @@ def PseudoLA_ABS_LARGE : Pseudo<(outs GPR:$dst),
"la.abs", "$dst, $src">;
def PseudoLA_PCREL : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
"la.pcrel", "$dst, $src">;
+let Defs = [R20], Size = 20 in
def PseudoLA_PCREL_LARGE : Pseudo<(outs GPR:$dst),
(ins GPR:$tmp, bare_symbol:$src), [],
"la.pcrel", "$dst, $tmp, $src">,
@@ -1462,28 +1548,30 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0,
isAsmParserOnly = 1 in {
def PseudoLA_GOT : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
"la.got", "$dst, $src">;
+def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
+ "la.tls.ie", "$dst, $src">;
+def PseudoLA_TLS_LD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
+ "la.tls.ld", "$dst, $src">;
+def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
+ "la.tls.gd", "$dst, $src">;
+let Defs = [R20], Size = 20 in {
def PseudoLA_GOT_LARGE : Pseudo<(outs GPR:$dst),
(ins GPR:$tmp, bare_symbol:$src), [],
"la.got", "$dst, $tmp, $src">,
Requires<[IsLA64]>;
-def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
- "la.tls.ie", "$dst, $src">;
def PseudoLA_TLS_IE_LARGE : Pseudo<(outs GPR:$dst),
(ins GPR:$tmp, bare_symbol:$src), [],
"la.tls.ie", "$dst, $tmp, $src">,
Requires<[IsLA64]>;
-def PseudoLA_TLS_LD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
- "la.tls.ld", "$dst, $src">;
def PseudoLA_TLS_LD_LARGE : Pseudo<(outs GPR:$dst),
(ins GPR:$tmp, bare_symbol:$src), [],
"la.tls.ld", "$dst, $tmp, $src">,
Requires<[IsLA64]>;
-def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
- "la.tls.gd", "$dst, $src">;
def PseudoLA_TLS_GD_LARGE : Pseudo<(outs GPR:$dst),
(ins GPR:$tmp, bare_symbol:$src), [],
"la.tls.gd", "$dst, $tmp, $src">,
Requires<[IsLA64]>;
+} // Defs = [R20], Size = 20
}
// Load address inst alias: "la", "la.global" and "la.local".
diff --git a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
index 5daa9481c907..98ad49f25e3f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
@@ -95,6 +95,9 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
case LoongArchII::MO_GD_PC_HI:
Kind = LoongArchMCExpr::VK_LoongArch_TLS_GD_PC_HI20;
break;
+ case LoongArchII::MO_CALL36:
+ Kind = LoongArchMCExpr::VK_LoongArch_CALL36;
+ break;
// TODO: Handle more target-flags.
}
diff --git a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp
index 257b947a3ce4..092b5f1fb442 100644
--- a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp
@@ -15,6 +15,7 @@
#include "LoongArch.h"
#include "LoongArchInstrInfo.h"
#include "LoongArchSubtarget.h"
+#include "MCTargetDesc/LoongArchBaseInfo.h"
#include "MCTargetDesc/LoongArchMCTargetDesc.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -194,3 +195,25 @@ bool LoongArchRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getFixed());
return false;
}
+
+bool LoongArchRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+ if (!TargetRegisterInfo::canRealignStack(MF))
+ return false;
+
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+ const LoongArchFrameLowering *TFI = getFrameLowering(MF);
+
+ // Stack realignment requires a frame pointer. If we already started
+ // register allocation with frame pointer elimination, it is too late now.
+ if (!MRI->canReserveReg(LoongArch::R22))
+ return false;
+
+ // We may also need a base pointer if there are dynamic allocas or stack
+ // pointer adjustments around calls.
+ if (TFI->hasReservedCallFrame(MF))
+ return true;
+
+ // A base pointer is required and allowed. Check that it isn't too late to
+ // reserve it.
+ return MRI->canReserveReg(LoongArchABI::getBPReg());
+}
diff --git a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h
index 7e8f26b14097..d1e40254c297 100644
--- a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h
@@ -51,6 +51,7 @@ struct LoongArchRegisterInfo : public LoongArchGenRegisterInfo {
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override {
return true;
}
+ bool canRealignStack(const MachineFunction &MF) const override;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
index a5a4d78aceee..62ae1dea00d6 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
@@ -63,11 +63,11 @@ getEffectiveLoongArchCodeModel(const Triple &TT,
switch (*CM) {
case CodeModel::Small:
- case CodeModel::Medium:
return *CM;
+ case CodeModel::Medium:
case CodeModel::Large:
if (!TT.isArch64Bit())
- report_fatal_error("Large code model requires LA64");
+ report_fatal_error("Medium/Large code model requires LA64");
return *CM;
default:
report_fatal_error(
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
index 6d8ef1bf96cb..518f6b10edab 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
@@ -91,6 +91,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
case FK_Data_2:
case FK_Data_4:
case FK_Data_8:
+ case FK_Data_leb128:
return Value;
case LoongArch::fixup_loongarch_b16: {
if (!isInt<18>(Value))
@@ -128,6 +129,15 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
}
}
+static void fixupLeb128(MCContext &Ctx, const MCFixup &Fixup,
+ MutableArrayRef<char> Data, uint64_t Value) {
+ unsigned I;
+ for (I = 0; I != Data.size() && Value; ++I, Value >>= 7)
+ Data[I] |= uint8_t(Value & 0x7f);
+ if (Value)
+ Ctx.reportError(Fixup.getLoc(), "Invalid uleb128 value!");
+}
+
void LoongArchAsmBackend::applyFixup(const MCAssembler &Asm,
const MCFixup &Fixup,
const MCValue &Target,
@@ -143,6 +153,10 @@ void LoongArchAsmBackend::applyFixup(const MCAssembler &Asm,
MCFixupKindInfo Info = getFixupKindInfo(Kind);
MCContext &Ctx = Asm.getContext();
+ // Fixup leb128 separately.
+ if (Fixup.getTargetKind() == FK_Data_leb128)
+ return fixupLeb128(Ctx, Fixup, Data, Value);
+
// Apply any target-specific value adjustments.
Value = adjustFixupValue(Fixup, Value, Ctx);
@@ -173,6 +187,7 @@ bool LoongArchAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
case FK_Data_2:
case FK_Data_4:
case FK_Data_8:
+ case FK_Data_leb128:
return !Target.isAbsolute();
}
}
@@ -202,9 +217,24 @@ getRelocPairForSize(unsigned Size) {
return std::make_pair(
MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_ADD64),
MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_SUB64));
+ case 128:
+ return std::make_pair(
+ MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_ADD_ULEB128),
+ MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_SUB_ULEB128));
}
}
+std::pair<bool, bool> LoongArchAsmBackend::relaxLEB128(MCLEBFragment &LF,
+ MCAsmLayout &Layout,
+ int64_t &Value) const {
+ const MCExpr &Expr = LF.getValue();
+ if (LF.isSigned() || !Expr.evaluateKnownAbsolute(Value, Layout))
+ return std::make_pair(false, false);
+ LF.getFixups().push_back(
+ MCFixup::create(0, &Expr, FK_Data_leb128, Expr.getLoc()));
+ return std::make_pair(true, true);
+}
+
bool LoongArchAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
const MCSubtargetInfo *STI) const {
// We mostly follow binutils' convention here: align to 4-byte boundary with a
@@ -226,21 +256,27 @@ bool LoongArchAsmBackend::handleAddSubRelocations(const MCAsmLayout &Layout,
uint64_t &FixedValue) const {
std::pair<MCFixupKind, MCFixupKind> FK;
uint64_t FixedValueA, FixedValueB;
- const MCSection &SecA = Target.getSymA()->getSymbol().getSection();
- const MCSection &SecB = Target.getSymB()->getSymbol().getSection();
-
- // We need record relocation if SecA != SecB. Usually SecB is same as the
- // section of Fixup, which will be record the relocation as PCRel. If SecB
- // is not same as the section of Fixup, it will report error. Just return
- // false and then this work can be finished by handleFixup.
- if (&SecA != &SecB)
- return false;
-
- // In SecA == SecB case. If the linker relaxation is enabled, we need record
- // the ADD, SUB relocations. Otherwise the FixedValue has already been
- // calculated out in evaluateFixup, return true and avoid record relocations.
- if (!STI.hasFeature(LoongArch::FeatureRelax))
- return true;
+ const MCSymbol &SA = Target.getSymA()->getSymbol();
+ const MCSymbol &SB = Target.getSymB()->getSymbol();
+
+ bool force = !SA.isInSection() || !SB.isInSection();
+ if (!force) {
+ const MCSection &SecA = SA.getSection();
+ const MCSection &SecB = SB.getSection();
+
+ // We need to record a relocation if SecA != SecB. Usually SecB is the same
+ // as the section of the Fixup, in which case the relocation is recorded as
+ // PC-relative. If SecB is not the same as the Fixup's section an error will
+ // be reported, so just return false and let handleFixup finish the work.
+ if (&SecA != &SecB)
+ return false;
+
+ // In the SecA == SecB case, record the ADD/SUB relocations only if linker
+ // relaxation is enabled. Otherwise the FixedValue has already been computed
+ // in evaluateFixup; return true and avoid recording relocations.
+ if (!STI.hasFeature(LoongArch::FeatureRelax))
+ return true;
+ }
switch (Fixup.getKind()) {
case llvm::FK_Data_1:
@@ -255,6 +291,9 @@ bool LoongArchAsmBackend::handleAddSubRelocations(const MCAsmLayout &Layout,
case llvm::FK_Data_8:
FK = getRelocPairForSize(64);
break;
+ case llvm::FK_Data_leb128:
+ FK = getRelocPairForSize(128);
+ break;
default:
llvm_unreachable("unsupported fixup size");
}
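
For reference alongside the fixupLeb128 helper added above: the assembler reserves zero-valued ULEB128 payload bytes and the fixup only ORs the resolved value into them, byte by byte. A minimal standalone sketch of that patching idea, in plain C++ rather than the MC layer (patchULEB128 and the two-byte buffer are hypothetical, not LLVM API):

#include <cstdint>
#include <cstdio>
#include <vector>

// OR the value into pre-zeroed ULEB128 payload bytes; returns false if the
// value needs more bytes than were reserved.
static bool patchULEB128(std::vector<uint8_t> &Data, uint64_t Value) {
  for (size_t I = 0; I != Data.size() && Value; ++I, Value >>= 7)
    Data[I] |= uint8_t(Value & 0x7f);
  return Value == 0;
}

int main() {
  // Two bytes reserved by the assembler: continuation bit set on the first
  // byte, payload bits zero.
  std::vector<uint8_t> Frag{0x80, 0x00};
  if (patchULEB128(Frag, 300)) // uleb128(300) == 0xac 0x02
    std::printf("%02x %02x\n", Frag[0], Frag[1]);
  return 0;
}

If the resolved value needs more bytes than were reserved, the real backend reports an error at the fixup's location rather than silently truncating.
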
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
index fef0e84600a7..71977217f59b 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
@@ -66,6 +66,9 @@ public:
void relaxInstruction(MCInst &Inst,
const MCSubtargetInfo &STI) const override {}
+ std::pair<bool, bool> relaxLEB128(MCLEBFragment &LF, MCAsmLayout &Layout,
+ int64_t &Value) const override;
+
bool writeNopData(raw_ostream &OS, uint64_t Count,
const MCSubtargetInfo *STI) const override;
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
index cee6dad1f095..0692cb92b694 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
@@ -47,6 +47,7 @@ enum {
MO_IE_PC64_HI,
MO_LD_PC_HI,
MO_GD_PC_HI,
+ MO_CALL36
// TODO: Add more flags.
};
} // end namespace LoongArchII
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
index fe19a4f2d3c8..1dec816f3473 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
@@ -90,6 +90,8 @@ unsigned LoongArchELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_LARCH_TLS_LE64_LO20;
case LoongArch::fixup_loongarch_tls_le64_hi12:
return ELF::R_LARCH_TLS_LE64_HI12;
+ case LoongArch::fixup_loongarch_call36:
+ return ELF::R_LARCH_CALL36;
// TODO: Handle more fixup-kinds.
}
}
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
index 178fa6e5262b..e827bae1f3e3 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
@@ -108,7 +108,10 @@ enum Fixups {
// 20-bit fixup corresponding to %gd_hi20(foo) for instruction lu12i.w.
fixup_loongarch_tls_gd_hi20,
// Generate an R_LARCH_RELAX which indicates the linker may relax here.
- fixup_loongarch_relax = FirstLiteralRelocationKind + ELF::R_LARCH_RELAX
+ fixup_loongarch_relax = FirstLiteralRelocationKind + ELF::R_LARCH_RELAX,
+  // 36-bit fixup corresponding to %call36(foo) for a pair of instructions:
+ // pcaddu18i+jirl.
+ fixup_loongarch_call36 = FirstLiteralRelocationKind + ELF::R_LARCH_CALL36,
};
} // end namespace LoongArch
} // end namespace llvm
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
index d2ea062dc09a..9ac0128f2517 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
@@ -241,6 +241,9 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO,
case LoongArchMCExpr::VK_LoongArch_TLS_GD_HI20:
FixupKind = LoongArch::fixup_loongarch_tls_gd_hi20;
break;
+ case LoongArchMCExpr::VK_LoongArch_CALL36:
+ FixupKind = LoongArch::fixup_loongarch_call36;
+ break;
}
} else if (Kind == MCExpr::SymbolRef &&
cast<MCSymbolRefExpr>(Expr)->getKind() ==
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
index 82c992b1cc8c..8ca8876a19b9 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
@@ -138,6 +138,8 @@ StringRef LoongArchMCExpr::getVariantKindName(VariantKind Kind) {
return "gd_pc_hi20";
case VK_LoongArch_TLS_GD_HI20:
return "gd_hi20";
+ case VK_LoongArch_CALL36:
+ return "call36";
}
}
@@ -180,6 +182,7 @@ LoongArchMCExpr::getVariantKindForName(StringRef name) {
.Case("ld_hi20", VK_LoongArch_TLS_LD_HI20)
.Case("gd_pc_hi20", VK_LoongArch_TLS_GD_PC_HI20)
.Case("gd_hi20", VK_LoongArch_TLS_GD_HI20)
+ .Case("call36", VK_LoongArch_CALL36)
.Default(VK_LoongArch_Invalid);
}
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
index 93251f824103..bd828116d7fa 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
@@ -61,6 +61,7 @@ public:
VK_LoongArch_TLS_LD_HI20,
VK_LoongArch_TLS_GD_PC_HI20,
VK_LoongArch_TLS_GD_HI20,
+ VK_LoongArch_CALL36,
VK_LoongArch_Invalid // Must be the last item.
};
diff --git a/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h b/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h
index a10401ed1a9a..cbe30ec494c9 100644
--- a/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h
+++ b/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h
@@ -20,7 +20,6 @@ namespace llvm {
class M68kSubtarget;
-/// This struct provides the information for the target register banks.
struct M68kLegalizerInfo : public LegalizerInfo {
public:
M68kLegalizerInfo(const M68kSubtarget &ST);
diff --git a/llvm/lib/Target/M68k/M68kISelLowering.cpp b/llvm/lib/Target/M68k/M68kISelLowering.cpp
index c4d7a0dec7f3..158393f02a24 100644
--- a/llvm/lib/Target/M68k/M68kISelLowering.cpp
+++ b/llvm/lib/Target/M68k/M68kISelLowering.cpp
@@ -2375,7 +2375,7 @@ SDValue M68kTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// a >= b ? -1 : 0 -> RES = setcc_carry
// a >= b ? 0 : -1 -> RES = ~setcc_carry
if (Cond.getOpcode() == M68kISD::SUB) {
- unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
+ unsigned CondCode = CC->getAsZExtVal();
if ((CondCode == M68k::COND_CC || CondCode == M68k::COND_CS) &&
(isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
@@ -2491,7 +2491,7 @@ SDValue M68kTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
Cond = Cmp;
AddTest = false;
} else {
- switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
+ switch (CC->getAsZExtVal()) {
default:
break;
case M68k::COND_VS:
diff --git a/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index 660861a5d521..efb23b1a4e3f 100644
--- a/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -308,12 +308,12 @@ static bool isValidIndexedLoad(const LoadSDNode *LD) {
switch (VT.getSimpleVT().SimpleTy) {
case MVT::i8:
- if (cast<ConstantSDNode>(LD->getOffset())->getZExtValue() != 1)
+ if (LD->getOffset()->getAsZExtVal() != 1)
return false;
break;
case MVT::i16:
- if (cast<ConstantSDNode>(LD->getOffset())->getZExtValue() != 2)
+ if (LD->getOffset()->getAsZExtVal() != 2)
return false;
break;
diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
index d3b59138a5a9..e68904863cfc 100644
--- a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -333,6 +333,7 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
setMinFunctionAlignment(Align(2));
setPrefFunctionAlignment(Align(2));
+ setMaxAtomicSizeInBitsSupported(0);
}
SDValue MSP430TargetLowering::LowerOperation(SDValue Op,
@@ -1168,8 +1169,8 @@ SDValue MSP430TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
bool Invert = false;
bool Shift = false;
bool Convert = true;
- switch (cast<ConstantSDNode>(TargetCC)->getZExtValue()) {
- default:
+ switch (TargetCC->getAsZExtVal()) {
+ default:
Convert = false;
break;
case MSP430CC::COND_HS:
@@ -1193,7 +1194,7 @@ SDValue MSP430TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// C = ~Z for AND instruction, thus we can put Res = ~(SR & 1), however,
// Res = (SR >> 1) & 1 is 1 word shorter.
break;
- }
+ }
EVT VT = Op.getValueType();
SDValue One = DAG.getConstant(1, dl, VT);
if (Convert) {
diff --git a/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp b/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
index 39e0658eb70d..283de46e57d5 100644
--- a/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -65,6 +65,7 @@ public:
return getTM<MSP430TargetMachine>();
}
+ void addIRPasses() override;
bool addInstSelector() override;
void addPreEmitPass() override;
};
@@ -81,6 +82,12 @@ MachineFunctionInfo *MSP430TargetMachine::createMachineFunctionInfo(
F, STI);
}
+void MSP430PassConfig::addIRPasses() {
+ addPass(createAtomicExpandPass());
+
+ TargetPassConfig::addIRPasses();
+}
+
bool MSP430PassConfig::addInstSelector() {
// Install an instruction selector.
addPass(createMSP430ISelDag(getMSP430TargetMachine(), getOptLevel()));
diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp
index 7fcf375aa10b..192ed1cec79a 100644
--- a/llvm/lib/Target/Mips/MipsFastISel.cpp
+++ b/llvm/lib/Target/Mips/MipsFastISel.cpp
@@ -492,7 +492,7 @@ bool MipsFastISel::computeAddress(const Value *Obj, Address &Addr) {
unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
TmpOffset += SL->getElementOffset(Idx);
} else {
- uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+ uint64_t S = GTI.getSequentialElementStride(DL);
while (true) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
// Constant-offset addressing.
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 483eba4e4f47..d431d3d91494 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -2042,8 +2042,7 @@ SDValue MipsTargetLowering::lowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
return Op;
SDValue CCNode = CondRes.getOperand(2);
- Mips::CondCode CC =
- (Mips::CondCode)cast<ConstantSDNode>(CCNode)->getZExtValue();
+ Mips::CondCode CC = (Mips::CondCode)CCNode->getAsZExtVal();
unsigned Opc = invertFPCondCodeUser(CC) ? Mips::BRANCH_F : Mips::BRANCH_T;
SDValue BrCode = DAG.getConstant(Opc, DL, MVT::i32);
SDValue FCC0 = DAG.getRegister(Mips::FCC0, MVT::i32);
diff --git a/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp b/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp
index 14f26201e6c0..f5e94235859a 100644
--- a/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp
@@ -330,8 +330,9 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
verify(*ST.getInstrInfo());
}
-bool MipsLegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
- MachineInstr &MI) const {
+bool MipsLegalizerInfo::legalizeCustom(
+ LegalizerHelper &Helper, MachineInstr &MI,
+ LostDebugLocObserver &LocObserver) const {
using namespace TargetOpcode;
MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
diff --git a/llvm/lib/Target/Mips/MipsLegalizerInfo.h b/llvm/lib/Target/Mips/MipsLegalizerInfo.h
index 05027b718a85..63daebf26470 100644
--- a/llvm/lib/Target/Mips/MipsLegalizerInfo.h
+++ b/llvm/lib/Target/Mips/MipsLegalizerInfo.h
@@ -25,7 +25,8 @@ class MipsLegalizerInfo : public LegalizerInfo {
public:
MipsLegalizerInfo(const MipsSubtarget &ST);
- bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override;
+ bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI,
+ LostDebugLocObserver &LocObserver) const override;
bool legalizeIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI) const override;
diff --git a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index 0ed87ee0809a..c0e978018919 100644
--- a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -76,7 +76,7 @@ void MipsSEDAGToDAGISel::addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
}
unsigned MipsSEDAGToDAGISel::getMSACtrlReg(const SDValue RegIdx) const {
- uint64_t RegNum = cast<ConstantSDNode>(RegIdx)->getZExtValue();
+ uint64_t RegNum = RegIdx->getAsZExtVal();
return Mips::MSACtrlRegClass.getRegister(RegNum);
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 815c46edb6fa..7abe984b34e1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -2076,7 +2076,7 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
VTs = CurDAG->getVTList(EVTs);
}
- unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
+ unsigned OffsetVal = Offset->getAsZExtVal();
SmallVector<SDValue, 2> Ops;
Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
@@ -2091,7 +2091,7 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
SDLoc DL(N);
SDValue Chain = N->getOperand(0);
SDValue Offset = N->getOperand(1);
- unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
+ unsigned OffsetVal = Offset->getAsZExtVal();
MemSDNode *Mem = cast<MemSDNode>(N);
// How many elements do we have?
@@ -2158,9 +2158,9 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
SDLoc DL(N);
SDValue Chain = N->getOperand(0);
SDValue Param = N->getOperand(1);
- unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue();
+ unsigned ParamVal = Param->getAsZExtVal();
SDValue Offset = N->getOperand(2);
- unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
+ unsigned OffsetVal = Offset->getAsZExtVal();
MemSDNode *Mem = cast<MemSDNode>(N);
SDValue Glue = N->getOperand(N->getNumOperands() - 1);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index e8f36bf50a1b..c65090d915ef 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -854,6 +854,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
computeRegisterProperties(STI.getRegisterInfo());
setMinCmpXchgSizeInBits(32);
+ setMaxAtomicSizeInBitsSupported(64);
}
const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
@@ -5811,7 +5812,7 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
// Get the intrinsic ID
- unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
+ unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
switch (IntrinNo) {
default:
return;
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 8d895762fbe1..fad69f5e80a7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -225,7 +225,8 @@ void NVPTXTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
AAM.registerFunctionAnalysis<NVPTXAA>();
}
-void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
+void NVPTXTargetMachine::registerPassBuilderCallbacks(
+ PassBuilder &PB, bool PopulateClassToPassNames) {
PB.registerPipelineParsingCallback(
[](StringRef PassName, FunctionPassManager &PM,
ArrayRef<PassBuilder::PipelineElement>) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
index cfdd8da9b765..9e6bf929badb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -69,7 +69,8 @@ public:
void registerDefaultAliasAnalyses(AAManager &AAM) override;
- void registerPassBuilderCallbacks(PassBuilder &PB) override;
+ void registerPassBuilderCallbacks(PassBuilder &PB,
+ bool PopulateClassToPassNames) override;
TargetTransformInfo getTargetTransformInfo(const Function &F) const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index c73721da46e3..7aa63f9fc0c9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -180,10 +180,6 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
return {Intrinsic::ceil, FTZ_MustBeOn};
case Intrinsic::nvvm_fabs_d:
return {Intrinsic::fabs, FTZ_Any};
- case Intrinsic::nvvm_fabs_f:
- return {Intrinsic::fabs, FTZ_MustBeOff};
- case Intrinsic::nvvm_fabs_ftz_f:
- return {Intrinsic::fabs, FTZ_MustBeOn};
case Intrinsic::nvvm_floor_d:
return {Intrinsic::floor, FTZ_Any};
case Intrinsic::nvvm_floor_f:
@@ -264,12 +260,6 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
return {Intrinsic::minimum, FTZ_MustBeOff, true};
case Intrinsic::nvvm_fmin_ftz_nan_f16x2:
return {Intrinsic::minimum, FTZ_MustBeOn, true};
- case Intrinsic::nvvm_round_d:
- return {Intrinsic::round, FTZ_Any};
- case Intrinsic::nvvm_round_f:
- return {Intrinsic::round, FTZ_MustBeOff};
- case Intrinsic::nvvm_round_ftz_f:
- return {Intrinsic::round, FTZ_MustBeOn};
case Intrinsic::nvvm_sqrt_rn_d:
return {Intrinsic::sqrt, FTZ_Any};
case Intrinsic::nvvm_sqrt_f:
@@ -278,10 +268,6 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
// the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are
// the versions with explicit ftz-ness.
return {Intrinsic::sqrt, FTZ_Any};
- case Intrinsic::nvvm_sqrt_rn_f:
- return {Intrinsic::sqrt, FTZ_MustBeOff};
- case Intrinsic::nvvm_sqrt_rn_ftz_f:
- return {Intrinsic::sqrt, FTZ_MustBeOn};
case Intrinsic::nvvm_trunc_d:
return {Intrinsic::trunc, FTZ_Any};
case Intrinsic::nvvm_trunc_f:
@@ -316,24 +302,8 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
return {Instruction::UIToFP};
// NVVM intrinsics that map to LLVM binary ops.
- case Intrinsic::nvvm_add_rn_d:
- return {Instruction::FAdd, FTZ_Any};
- case Intrinsic::nvvm_add_rn_f:
- return {Instruction::FAdd, FTZ_MustBeOff};
- case Intrinsic::nvvm_add_rn_ftz_f:
- return {Instruction::FAdd, FTZ_MustBeOn};
- case Intrinsic::nvvm_mul_rn_d:
- return {Instruction::FMul, FTZ_Any};
- case Intrinsic::nvvm_mul_rn_f:
- return {Instruction::FMul, FTZ_MustBeOff};
- case Intrinsic::nvvm_mul_rn_ftz_f:
- return {Instruction::FMul, FTZ_MustBeOn};
case Intrinsic::nvvm_div_rn_d:
return {Instruction::FDiv, FTZ_Any};
- case Intrinsic::nvvm_div_rn_f:
- return {Instruction::FDiv, FTZ_MustBeOff};
- case Intrinsic::nvvm_div_rn_ftz_f:
- return {Instruction::FDiv, FTZ_MustBeOn};
// The remainder of cases are NVVM intrinsics that map to LLVM idioms, but
// need special handling.
@@ -342,10 +312,6 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
// as well.
case Intrinsic::nvvm_rcp_rn_d:
return {SPC_Reciprocal, FTZ_Any};
- case Intrinsic::nvvm_rcp_rn_f:
- return {SPC_Reciprocal, FTZ_MustBeOff};
- case Intrinsic::nvvm_rcp_rn_ftz_f:
- return {SPC_Reciprocal, FTZ_MustBeOn};
// We do not currently simplify intrinsics that give an approximate
// answer. These include:
diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
index 42f5a4e624c4..56af80f9cede 100644
--- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
@@ -350,7 +350,7 @@ bool PPCFastISel::PPCComputeAddress(const Value *Obj, Address &Addr) {
unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
TmpOffset += SL->getElementOffset(Idx);
} else {
- uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+ uint64_t S = GTI.getSequentialElementStride(DL);
for (;;) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
// Constant-offset addressing.
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index ed96339240d9..26ed74108ec3 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -565,7 +565,7 @@ static bool hasTocDataAttr(SDValue Val, unsigned PointerSize) {
/// operand. If so Imm will receive the 32-bit value.
static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) {
- Imm = cast<ConstantSDNode>(N)->getZExtValue();
+ Imm = N->getAsZExtVal();
return true;
}
return false;
@@ -575,7 +575,7 @@ static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
/// operand. If so Imm will receive the 64-bit value.
static bool isInt64Immediate(SDNode *N, uint64_t &Imm) {
if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i64) {
- Imm = cast<ConstantSDNode>(N)->getZExtValue();
+ Imm = N->getAsZExtVal();
return true;
}
return false;
@@ -1500,7 +1500,7 @@ static SDNode *selectI64Imm(SelectionDAG *CurDAG, SDNode *N) {
SDLoc dl(N);
// Get 64 bit value.
- int64_t Imm = cast<ConstantSDNode>(N)->getZExtValue();
+ int64_t Imm = N->getAsZExtVal();
if (unsigned MinSize = allUsesTruncate(CurDAG, N)) {
uint64_t SextImm = SignExtend64(Imm, MinSize);
SDValue SDImm = CurDAG->getTargetConstant(SextImm, dl, MVT::i64);
@@ -4923,7 +4923,7 @@ bool PPCDAGToDAGISel::trySelectLoopCountIntrinsic(SDNode *N) {
SDNode *NewDecrement = CurDAG->getMachineNode(DecrementOpcode, DecrementLoc,
MVT::i1, DecrementOps);
- unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
+ unsigned Val = RHS->getAsZExtVal();
bool IsBranchOnTrue = (CC == ISD::SETEQ && Val) || (CC == ISD::SETNE && !Val);
unsigned Opcode = IsBranchOnTrue ? PPC::BC : PPC::BCn;
@@ -5765,7 +5765,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
break;
// If the multiplier fits int16, we can handle it with mulli.
- int64_t Imm = cast<ConstantSDNode>(Op1)->getZExtValue();
+ int64_t Imm = Op1->getAsZExtVal();
unsigned Shift = llvm::countr_zero<uint64_t>(Imm);
if (isInt<16>(Imm) || !Shift)
break;
@@ -6612,8 +6612,7 @@ void PPCDAGToDAGISel::foldBoolExts(SDValue &Res, SDNode *&N) {
// For us to materialize these using one instruction, we must be able to
// represent them as signed 16-bit integers.
- uint64_t True = cast<ConstantSDNode>(TrueRes)->getZExtValue(),
- False = cast<ConstantSDNode>(FalseRes)->getZExtValue();
+ uint64_t True = TrueRes->getAsZExtVal(), False = FalseRes->getAsZExtVal();
if (!isInt<16>(True) || !isInt<16>(False))
break;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 8f27e6677afa..235df1880b37 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -2566,7 +2566,7 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
if (LeadingZero) {
if (!UniquedVals[Multiple-1].getNode())
return DAG.getTargetConstant(0, SDLoc(N), MVT::i32); // 0,0,0,undef
- int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue();
+ int Val = UniquedVals[Multiple - 1]->getAsZExtVal();
if (Val < 16) // 0,0,0,4 -> vspltisw(4)
return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);
}
@@ -2635,11 +2635,11 @@ bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) {
if (!isa<ConstantSDNode>(N))
return false;
- Imm = (int16_t)cast<ConstantSDNode>(N)->getZExtValue();
+ Imm = (int16_t)N->getAsZExtVal();
if (N->getValueType(0) == MVT::i32)
- return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
+ return Imm == (int32_t)N->getAsZExtVal();
else
- return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
+ return Imm == (int64_t)N->getAsZExtVal();
}
bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) {
return isIntS16Immediate(Op.getNode(), Imm);
@@ -2684,7 +2684,7 @@ bool llvm::isIntS34Immediate(SDNode *N, int64_t &Imm) {
if (!isa<ConstantSDNode>(N))
return false;
- Imm = (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
+ Imm = (int64_t)N->getAsZExtVal();
return isInt<34>(Imm);
}
bool llvm::isIntS34Immediate(SDValue Op, int64_t &Imm) {
@@ -15580,7 +15580,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
NarrowOp.getOpcode() != ISD::ROTL && NarrowOp.getOpcode() != ISD::ROTR)
break;
- uint64_t Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
+ uint64_t Imm = Op2->getAsZExtVal();
// Make sure that the constant is narrow enough to fit in the narrow type.
if (!isUInt<32>(Imm))
break;
@@ -16795,7 +16795,7 @@ void PPCTargetLowering::CollectTargetIntrinsicOperands(const CallInst &I,
return;
if (!isa<ConstantSDNode>(Ops[1].getNode()))
return;
- auto IntrinsicID = cast<ConstantSDNode>(Ops[1].getNode())->getZExtValue();
+ auto IntrinsicID = Ops[1].getNode()->getAsZExtVal();
if (IntrinsicID != Intrinsic::ppc_tdw && IntrinsicID != Intrinsic::ppc_tw &&
IntrinsicID != Intrinsic::ppc_trapd && IntrinsicID != Intrinsic::ppc_trap)
return;
@@ -18430,7 +18430,7 @@ PPC::AddrMode PPCTargetLowering::SelectOptimalAddrMode(const SDNode *Parent,
if (Flags & PPC::MOF_RPlusSImm16) {
SDValue Op0 = N.getOperand(0);
SDValue Op1 = N.getOperand(1);
- int16_t Imm = cast<ConstantSDNode>(Op1)->getZExtValue();
+ int16_t Imm = Op1->getAsZExtVal();
if (!Align || isAligned(*Align, Imm)) {
Disp = DAG.getTargetConstant(Imm, DL, N.getValueType());
Base = Op0;
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
index 375e63654db1..8a37e40414ee 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -270,12 +270,15 @@ def CR7 : CR<7, "cr7", [CR7LT, CR7GT, CR7EQ, CR7UN]>, DwarfRegNum<[75, 75]>;
// Link register
def LR : SPR<8, "lr">, DwarfRegNum<[-2, 65]>;
-//let Aliases = [LR] in
-def LR8 : SPR<8, "lr">, DwarfRegNum<[65, -2]>;
+def LR8 : SPR<8, "lr">, DwarfRegNum<[65, -2]> {
+ let Aliases = [LR];
+}
// Count register
def CTR : SPR<9, "ctr">, DwarfRegNum<[-2, 66]>;
-def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66, -2]>;
+def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66, -2]> {
+ let Aliases = [CTR];
+}
// VRsave register
def VRSAVE: SPR<256, "vrsave">, DwarfRegNum<[109]>;
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 4759aa951664..d616aaeddf41 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -466,10 +466,6 @@ public:
bool isGPRAsFPR() const { return isGPR() && Reg.IsGPRAsFPR; }
- bool isGPRF64AsFPR() const { return isGPR() && Reg.IsGPRAsFPR; }
-
- bool isGPRPF64AsFPR() const { return isGPR() && Reg.IsGPRAsFPR; }
-
static bool evaluateConstantImm(const MCExpr *Expr, int64_t &Imm,
RISCVMCExpr::VariantKind &VK) {
if (auto *RE = dyn_cast<RISCVMCExpr>(Expr)) {
@@ -2039,9 +2035,8 @@ ParseStatus RISCVAsmParser::parseCallSymbol(OperandVector &Operands) {
SMLoc E = SMLoc::getFromPointer(S.getPointer() + Identifier.size());
- RISCVMCExpr::VariantKind Kind = RISCVMCExpr::VK_RISCV_CALL;
- if (Identifier.consume_back("@plt"))
- Kind = RISCVMCExpr::VK_RISCV_CALL_PLT;
+ RISCVMCExpr::VariantKind Kind = RISCVMCExpr::VK_RISCV_CALL_PLT;
+ (void)Identifier.consume_back("@plt");
MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
index 50ed85acdec0..697ad476ff8c 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
@@ -579,7 +579,7 @@ bool RISCVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Select the recommended relocation type R_RISCV_CALL_PLT.
if (!Info.Callee.isReg())
- Info.Callee.setTargetFlags(RISCVII::MO_PLT);
+ Info.Callee.setTargetFlags(RISCVII::MO_CALL);
MachineInstrBuilder Call =
MIRBuilder
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index 079906d1958c..ab8070772fe5 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -113,7 +113,7 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
getActionDefinitionsBuilder(G_BITREVERSE).maxScalar(0, sXLen).lower();
auto &BSWAPActions = getActionDefinitionsBuilder(G_BSWAP);
- if (ST.hasStdExtZbb())
+ if (ST.hasStdExtZbb() || ST.hasStdExtZbkb())
BSWAPActions.legalFor({sXLen}).clampScalar(0, sXLen, sXLen);
else
BSWAPActions.maxScalar(0, sXLen).lower();
@@ -411,8 +411,9 @@ bool RISCVLegalizerInfo::legalizeVAStart(MachineInstr &MI,
return true;
}
-bool RISCVLegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
- MachineInstr &MI) const {
+bool RISCVLegalizerInfo::legalizeCustom(
+ LegalizerHelper &Helper, MachineInstr &MI,
+ LostDebugLocObserver &LocObserver) const {
MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
GISelChangeObserver &Observer = Helper.Observer;
switch (MI.getOpcode()) {
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
index 48c36976501f..f3ec6be16734 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
@@ -21,7 +21,6 @@ class GISelChangeObserver;
class MachineIRBuilder;
class RISCVSubtarget;
-/// This class provides the information for the target register banks.
class RISCVLegalizerInfo : public LegalizerInfo {
const RISCVSubtarget &STI;
const unsigned XLen;
@@ -30,7 +29,8 @@ class RISCVLegalizerInfo : public LegalizerInfo {
public:
RISCVLegalizerInfo(const RISCVSubtarget &ST);
- bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override;
+ bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI,
+ LostDebugLocObserver &LocObserver) const override;
bool legalizeIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI) const override;
diff --git a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp
index aba2511959af..8d97c5ffd20a 100644
--- a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp
+++ b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp
@@ -186,30 +186,37 @@ RISCVInstrumentManager::createInstruments(const MCInst &Inst) {
}
static std::pair<uint8_t, uint8_t>
-getEEWAndEMULForUnitStrideLoadStore(unsigned Opcode, RISCVII::VLMUL LMUL,
- uint8_t SEW) {
+getEEWAndEMUL(unsigned Opcode, RISCVII::VLMUL LMUL, uint8_t SEW) {
uint8_t EEW;
switch (Opcode) {
case RISCV::VLM_V:
case RISCV::VSM_V:
case RISCV::VLE8_V:
case RISCV::VSE8_V:
+ case RISCV::VLSE8_V:
+ case RISCV::VSSE8_V:
EEW = 8;
break;
case RISCV::VLE16_V:
case RISCV::VSE16_V:
+ case RISCV::VLSE16_V:
+ case RISCV::VSSE16_V:
EEW = 16;
break;
case RISCV::VLE32_V:
case RISCV::VSE32_V:
+ case RISCV::VLSE32_V:
+ case RISCV::VSSE32_V:
EEW = 32;
break;
case RISCV::VLE64_V:
case RISCV::VSE64_V:
+ case RISCV::VLSE64_V:
+ case RISCV::VSSE64_V:
EEW = 64;
break;
default:
- llvm_unreachable("Opcode is not a vector unit stride load nor store");
+ llvm_unreachable("Could not determine EEW from Opcode");
}
auto EMUL = RISCVVType::getSameRatioLMUL(SEW, LMUL, EEW);
@@ -218,6 +225,18 @@ getEEWAndEMULForUnitStrideLoadStore(unsigned Opcode, RISCVII::VLMUL LMUL,
return std::make_pair(EEW, *EMUL);
}
+bool opcodeHasEEWAndEMULInfo(unsigned short Opcode) {
+ return Opcode == RISCV::VLM_V || Opcode == RISCV::VSM_V ||
+ Opcode == RISCV::VLE8_V || Opcode == RISCV::VSE8_V ||
+ Opcode == RISCV::VLE16_V || Opcode == RISCV::VSE16_V ||
+ Opcode == RISCV::VLE32_V || Opcode == RISCV::VSE32_V ||
+ Opcode == RISCV::VLE64_V || Opcode == RISCV::VSE64_V ||
+ Opcode == RISCV::VLSE8_V || Opcode == RISCV::VSSE8_V ||
+ Opcode == RISCV::VLSE16_V || Opcode == RISCV::VSSE16_V ||
+ Opcode == RISCV::VLSE32_V || Opcode == RISCV::VSSE32_V ||
+ Opcode == RISCV::VLSE64_V || Opcode == RISCV::VSSE64_V;
+}
+
unsigned RISCVInstrumentManager::getSchedClassID(
const MCInstrInfo &MCII, const MCInst &MCI,
const llvm::SmallVector<Instrument *> &IVec) const {
@@ -249,13 +268,9 @@ unsigned RISCVInstrumentManager::getSchedClassID(
uint8_t SEW = SI ? SI->getSEW() : 0;
const RISCVVInversePseudosTable::PseudoInfo *RVV = nullptr;
- if (Opcode == RISCV::VLM_V || Opcode == RISCV::VSM_V ||
- Opcode == RISCV::VLE8_V || Opcode == RISCV::VSE8_V ||
- Opcode == RISCV::VLE16_V || Opcode == RISCV::VSE16_V ||
- Opcode == RISCV::VLE32_V || Opcode == RISCV::VSE32_V ||
- Opcode == RISCV::VLE64_V || Opcode == RISCV::VSE64_V) {
+ if (opcodeHasEEWAndEMULInfo(Opcode)) {
RISCVII::VLMUL VLMUL = static_cast<RISCVII::VLMUL>(LMUL);
- auto [EEW, EMUL] = getEEWAndEMULForUnitStrideLoadStore(Opcode, VLMUL, SEW);
+ auto [EEW, EMUL] = getEEWAndEMUL(Opcode, VLMUL, SEW);
RVV = RISCVVInversePseudosTable::getBaseInfo(Opcode, EMUL, EEW);
} else {
// Check if it depends on LMUL and SEW
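
For context on extending getEEWAndEMUL to strided accesses above: the effective EMUL is chosen so that the SEW/LMUL ratio is preserved, i.e. EMUL = (EEW * LMUL) / SEW, which is what RISCVVType::getSameRatioLMUL computes. A small standalone sketch of that arithmetic, assuming fractional LMULs are modelled as a Num/Den pair of powers of two (sameRatioEMUL and Frac are illustrative names, not the LLVM API):

#include <cstdio>
#include <optional>
#include <utility>

// LMUL/EMUL modelled as Num/Den, e.g. {2, 1} is m2 and {1, 2} is mf2.
using Frac = std::pair<unsigned, unsigned>;

static std::optional<Frac> sameRatioEMUL(unsigned SEW, Frac LMUL, unsigned EEW) {
  unsigned Num = EEW * LMUL.first;  // EMUL = (EEW * LMUL) / SEW
  unsigned Den = SEW * LMUL.second;
  while (Num % 2 == 0 && Den % 2 == 0) { // inputs are powers of two
    Num /= 2;
    Den /= 2;
  }
  if (Num > 8 || Den > 8)
    return std::nullopt; // ratio not representable as m1..m8 or mf2..mf8
  return Frac{Num, Den};
}

int main() {
  // A strided byte access (EEW=8) under SEW=32, LMUL=m2 must use EMUL=mf2,
  // keeping SEW/LMUL == EEW/EMUL == 16.
  if (auto EMUL = sameRatioEMUL(/*SEW=*/32, /*LMUL=*/{2, 1}, /*EEW=*/8))
    std::printf("EMUL = %u/%u\n", EMUL->first, EMUL->second);
  return 0;
}
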
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index 716fb67c5824..7ce08eabdeb6 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -329,16 +329,17 @@ bool RISCVAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF,
return true;
}
-bool RISCVAsmBackend::relaxLEB128(MCLEBFragment &LF, MCAsmLayout &Layout,
- int64_t &Value) const {
+std::pair<bool, bool> RISCVAsmBackend::relaxLEB128(MCLEBFragment &LF,
+ MCAsmLayout &Layout,
+ int64_t &Value) const {
if (LF.isSigned())
- return false;
+ return std::make_pair(false, false);
const MCExpr &Expr = LF.getValue();
if (ULEB128Reloc) {
LF.getFixups().push_back(
MCFixup::create(0, &Expr, FK_Data_leb128, Expr.getLoc()));
}
- return Expr.evaluateKnownAbsolute(Value, Layout);
+ return std::make_pair(Expr.evaluateKnownAbsolute(Value, Layout), false);
}
// Given a compressed control flow instruction this function returns
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
index 2ad6534ac8bc..902b44bba70f 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
@@ -100,8 +100,8 @@ public:
bool &WasRelaxed) const override;
bool relaxDwarfCFA(MCDwarfCallFrameFragment &DF, MCAsmLayout &Layout,
bool &WasRelaxed) const override;
- bool relaxLEB128(MCLEBFragment &LF, MCAsmLayout &Layout,
- int64_t &Value) const override;
+ std::pair<bool, bool> relaxLEB128(MCLEBFragment &LF, MCAsmLayout &Layout,
+ int64_t &Value) const override;
bool writeNopData(raw_ostream &OS, uint64_t Count,
const MCSubtargetInfo *STI) const override;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index c32210fc1419..433e2e6f80bd 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -254,7 +254,6 @@ static inline bool isFirstDefTiedToFirstUse(const MCInstrDesc &Desc) {
enum {
MO_None = 0,
MO_CALL = 1,
- MO_PLT = 2,
MO_LO = 3,
MO_HI = 4,
MO_PCREL_LO = 5,
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
index d67351102bc1..64ddae61b1bc 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
@@ -41,8 +41,6 @@ void RISCVMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
if (HasVariant)
OS << '%' << getVariantKindName(getKind()) << '(';
Expr->print(OS, MAI);
- if (Kind == VK_RISCV_CALL_PLT)
- OS << "@plt";
if (HasVariant)
OS << ')';
}
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 0fd514fa87cd..f2bd5118fc07 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -747,9 +747,6 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
Kind = RISCVMCExpr::VK_RISCV_None;
break;
case RISCVII::MO_CALL:
- Kind = RISCVMCExpr::VK_RISCV_CALL;
- break;
- case RISCVII::MO_PLT:
Kind = RISCVMCExpr::VK_RISCV_CALL_PLT;
break;
case RISCVII::MO_LO:
diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
index 24a13f93af88..103a2e2da7b9 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
@@ -109,6 +109,7 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB,
return expandRV32ZdinxStore(MBB, MBBI);
case RISCV::PseudoRV32ZdinxLD:
return expandRV32ZdinxLoad(MBB, MBBI);
+ case RISCV::PseudoCCMOVGPRNoX0:
case RISCV::PseudoCCMOVGPR:
case RISCV::PseudoCCADD:
case RISCV::PseudoCCSUB:
@@ -134,6 +135,9 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB,
case RISCV::PseudoCCSLLIW:
case RISCV::PseudoCCSRLIW:
case RISCV::PseudoCCSRAIW:
+ case RISCV::PseudoCCANDN:
+ case RISCV::PseudoCCORN:
+ case RISCV::PseudoCCXNOR:
return expandCCOp(MBB, MBBI, NextMBBI);
case RISCV::PseudoVSETVLI:
case RISCV::PseudoVSETVLIX0:
@@ -191,7 +195,8 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB,
Register DestReg = MI.getOperand(0).getReg();
assert(MI.getOperand(4).getReg() == DestReg);
- if (MI.getOpcode() == RISCV::PseudoCCMOVGPR) {
+ if (MI.getOpcode() == RISCV::PseudoCCMOVGPR ||
+ MI.getOpcode() == RISCV::PseudoCCMOVGPRNoX0) {
// Add MV.
BuildMI(TrueBB, DL, TII->get(RISCV::ADDI), DestReg)
.add(MI.getOperand(5))
@@ -225,6 +230,9 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB,
case RISCV::PseudoCCSLLIW: NewOpc = RISCV::SLLIW; break;
case RISCV::PseudoCCSRLIW: NewOpc = RISCV::SRLIW; break;
case RISCV::PseudoCCSRAIW: NewOpc = RISCV::SRAIW; break;
+ case RISCV::PseudoCCANDN: NewOpc = RISCV::ANDN; break;
+ case RISCV::PseudoCCORN: NewOpc = RISCV::ORN; break;
+ case RISCV::PseudoCCXNOR: NewOpc = RISCV::XNOR; break;
}
BuildMI(TrueBB, DL, TII->get(NewOpc), DestReg)
.add(MI.getOperand(5))
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 59b202606dad..bb7a3291085d 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1021,6 +1021,12 @@ def TuneShortForwardBranchOpt
def HasShortForwardBranchOpt : Predicate<"Subtarget->hasShortForwardBranchOpt()">;
def NoShortForwardBranchOpt : Predicate<"!Subtarget->hasShortForwardBranchOpt()">;
+def TuneConditionalCompressedMoveFusion
+ : SubtargetFeature<"conditional-cmv-fusion", "HasConditionalCompressedMoveFusion",
+ "true", "Enable branch+c.mv fusion">;
+def HasConditionalMoveFusion : Predicate<"Subtarget->hasConditionalMoveFusion()">;
+def NoConditionalMoveFusion : Predicate<"!Subtarget->hasConditionalMoveFusion()">;
+
def TuneSiFive7 : SubtargetFeature<"sifive7", "RISCVProcFamily", "SiFive7",
"SiFive 7-Series processors",
[TuneNoDefaultUnroll,
diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
index 5ad1e082344e..1129206800ad 100644
--- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
@@ -362,7 +362,7 @@ RISCVGatherScatterLowering::determineBaseAndStride(Instruction *Ptr,
VecOperand = i;
- TypeSize TS = DL->getTypeAllocSize(GTI.getIndexedType());
+ TypeSize TS = GTI.getSequentialElementStride(*DL);
if (TS.isScalable())
return std::make_pair(nullptr, nullptr);
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index bfa3bf3cc74e..0d8688ba2eae 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -763,14 +763,12 @@ bool RISCVDAGToDAGISel::tryIndexedLoad(SDNode *Node) {
return false;
EVT LoadVT = Ld->getMemoryVT();
- bool IsPre = (AM == ISD::PRE_INC || AM == ISD::PRE_DEC);
- bool IsPost = (AM == ISD::POST_INC || AM == ISD::POST_DEC);
+ assert((AM == ISD::PRE_INC || AM == ISD::POST_INC) &&
+ "Unexpected addressing mode");
+ bool IsPre = AM == ISD::PRE_INC;
+ bool IsPost = AM == ISD::POST_INC;
int64_t Offset = C->getSExtValue();
- // Convert decrements to increments by a negative quantity.
- if (AM == ISD::PRE_DEC || AM == ISD::POST_DEC)
- Offset = -Offset;
-
// The constants that can be encoded in the THeadMemIdx instructions
// are of the form (sign_extend(imm5) << imm2).
int64_t Shift;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 03a59f8a8b57..0a1a466af591 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -814,8 +814,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, VT,
Custom);
setOperationAction({ISD::LRINT, ISD::LLRINT}, VT, Custom);
- setOperationAction(
- {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT}, VT, Legal);
+ setOperationAction({ISD::AVGFLOORU, ISD::SADDSAT, ISD::UADDSAT,
+ ISD::SSUBSAT, ISD::USUBSAT},
+ VT, Legal);
// Integer VTs are lowered as a series of "RISCVISD::TRUNCATE_VECTOR_VL"
// nodes which truncate by one power of two at a time.
@@ -1184,9 +1185,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (VT.getVectorElementType() != MVT::i64 || Subtarget.hasStdExtV())
setOperationAction({ISD::MULHS, ISD::MULHU}, VT, Custom);
- setOperationAction(
- {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT}, VT,
- Custom);
+ setOperationAction({ISD::AVGFLOORU, ISD::SADDSAT, ISD::UADDSAT,
+ ISD::SSUBSAT, ISD::USUBSAT},
+ VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::SELECT_CC, VT, Expand);
@@ -1350,8 +1351,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
}
if (Subtarget.hasVendorXTHeadMemIdx()) {
- for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::POST_DEC;
- ++im) {
+ for (unsigned im : {ISD::PRE_INC, ISD::POST_INC}) {
setIndexedLoadAction(im, MVT::i8, Legal);
setIndexedStoreAction(im, MVT::i8, Legal);
setIndexedLoadAction(im, MVT::i16, Legal);
@@ -1374,8 +1374,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setPrefLoopAlignment(Subtarget.getPrefLoopAlignment());
setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN,
- ISD::INTRINSIC_WO_CHAIN, ISD::ADD, ISD::SUB, ISD::MUL,
- ISD::AND, ISD::OR, ISD::XOR, ISD::SETCC, ISD::SELECT});
+ ISD::INTRINSIC_WO_CHAIN, ISD::ADD, ISD::SUB, ISD::AND,
+ ISD::OR, ISD::XOR, ISD::SETCC, ISD::SELECT});
if (Subtarget.is64Bit())
setTargetDAGCombine(ISD::SRA);
@@ -2711,11 +2711,19 @@ InstructionCost RISCVTargetLowering::getVRGatherVICost(MVT VT) const {
return getLMULCost(VT);
}
-/// Return the cost of a vslidedown.vi/vx or vslideup.vi/vx instruction
+/// Return the cost of a vslidedown.vx or vslideup.vx instruction
+/// for the type VT. (This does not cover the vslide1up or vslide1down
+/// variants.) Slides may be linear in the number of vregs implied by LMUL,
+/// or may track the vrgather.vv cost. It is implementation-dependent.
+InstructionCost RISCVTargetLowering::getVSlideVXCost(MVT VT) const {
+ return getLMULCost(VT);
+}
+
+/// Return the cost of a vslidedown.vi or vslideup.vi instruction
/// for the type VT. (This does not cover the vslide1up or vslide1down
/// variants.) Slides may be linear in the number of vregs implied by LMUL,
/// or may track the vrgather.vv cost. It is implementation-dependent.
-InstructionCost RISCVTargetLowering::getVSlideCost(MVT VT) const {
+InstructionCost RISCVTargetLowering::getVSlideVICost(MVT VT) const {
return getLMULCost(VT);
}
@@ -2811,8 +2819,8 @@ static SDValue lowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
SDValue SplatZero = DAG.getNode(
RISCVISD::VMV_V_X_VL, DL, DstContainerVT, DAG.getUNDEF(DstContainerVT),
DAG.getConstant(0, DL, Subtarget.getXLenVT()), VL);
- Res = DAG.getNode(RISCVISD::VSELECT_VL, DL, DstContainerVT, IsNan, SplatZero,
- Res, VL);
+ Res = DAG.getNode(RISCVISD::VMERGE_VL, DL, DstContainerVT, IsNan, SplatZero,
+ Res, DAG.getUNDEF(DstContainerVT), VL);
if (DstVT.isFixedLengthVector())
Res = convertFromScalableVector(DstVT, Res, DAG, Subtarget);
@@ -3489,7 +3497,7 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,
for (unsigned I = 0; I < NumElts;) {
SDValue V = Op.getOperand(I);
- bool BitValue = !V.isUndef() && cast<ConstantSDNode>(V)->getZExtValue();
+ bool BitValue = !V.isUndef() && V->getAsZExtVal();
Bits |= ((uint64_t)BitValue << BitPos);
++BitPos;
++I;
@@ -3620,8 +3628,8 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,
for (const auto &OpIdx : enumerate(Op->op_values())) {
const auto &SeqV = OpIdx.value();
if (!SeqV.isUndef())
- SplatValue |= ((cast<ConstantSDNode>(SeqV)->getZExtValue() & EltMask)
- << (OpIdx.index() * EltBitSize));
+ SplatValue |=
+ ((SeqV->getAsZExtVal() & EltMask) << (OpIdx.index() * EltBitSize));
}
// On RV64, sign-extend from 32 to 64 bits where possible in order to
@@ -3650,10 +3658,10 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,
// would require bit-manipulation instructions to construct the splat value.
SmallVector<SDValue> Sequence;
const auto *BV = cast<BuildVectorSDNode>(Op);
- if (VT.isInteger() && EltBitSize < 64 &&
+ if (VT.isInteger() && EltBitSize < Subtarget.getELen() &&
ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
BV->getRepeatedSequence(Sequence) &&
- (Sequence.size() * EltBitSize) <= 64) {
+ (Sequence.size() * EltBitSize) <= Subtarget.getELen()) {
unsigned SeqLen = Sequence.size();
MVT ViaIntVT = MVT::getIntegerVT(EltBitSize * SeqLen);
assert((ViaIntVT == MVT::i16 || ViaIntVT == MVT::i32 ||
@@ -3676,8 +3684,8 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,
// vector type.
for (const auto &SeqV : Sequence) {
if (!SeqV.isUndef())
- SplatValue |= ((cast<ConstantSDNode>(SeqV)->getZExtValue() & EltMask)
- << (EltIdx * EltBitSize));
+ SplatValue |=
+ ((SeqV->getAsZExtVal() & EltMask) << (EltIdx * EltBitSize));
EltIdx++;
}
@@ -3938,8 +3946,7 @@ static SDValue splatPartsI64WithVL(const SDLoc &DL, MVT VT, SDValue Passthru,
(isa<RegisterSDNode>(VL) &&
cast<RegisterSDNode>(VL)->getReg() == RISCV::X0))
NewVL = DAG.getRegister(RISCV::X0, MVT::i32);
- else if (isa<ConstantSDNode>(VL) &&
- isUInt<4>(cast<ConstantSDNode>(VL)->getZExtValue()))
+ else if (isa<ConstantSDNode>(VL) && isUInt<4>(VL->getAsZExtVal()))
NewVL = DAG.getNode(ISD::ADD, DL, VL.getValueType(), VL, VL);
if (NewVL) {
@@ -5401,8 +5408,8 @@ static SDValue lowerFMAXIMUM_FMINIMUM(SDValue Op, SelectionDAG &DAG,
SDValue XIsNonNan = DAG.getNode(RISCVISD::SETCC_VL, DL, Mask.getValueType(),
{X, X, DAG.getCondCode(ISD::SETOEQ),
DAG.getUNDEF(ContainerVT), Mask, VL});
- NewY =
- DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, XIsNonNan, Y, X, VL);
+ NewY = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, XIsNonNan, Y, X,
+ DAG.getUNDEF(ContainerVT), VL);
}
SDValue NewX = X;
@@ -5410,8 +5417,8 @@ static SDValue lowerFMAXIMUM_FMINIMUM(SDValue Op, SelectionDAG &DAG,
SDValue YIsNonNan = DAG.getNode(RISCVISD::SETCC_VL, DL, Mask.getValueType(),
{Y, Y, DAG.getCondCode(ISD::SETOEQ),
DAG.getUNDEF(ContainerVT), Mask, VL});
- NewX =
- DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, YIsNonNan, X, Y, VL);
+ NewX = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, YIsNonNan, X, Y,
+ DAG.getUNDEF(ContainerVT), VL);
}
unsigned Opc =
@@ -5458,6 +5465,7 @@ static unsigned getRISCVVLOp(SDValue Op) {
OP_CASE(UADDSAT)
OP_CASE(SSUBSAT)
OP_CASE(USUBSAT)
+ OP_CASE(AVGFLOORU)
OP_CASE(FADD)
OP_CASE(FSUB)
OP_CASE(FMUL)
@@ -5528,7 +5536,6 @@ static unsigned getRISCVVLOp(SDValue Op) {
return RISCVISD::VMXOR_VL;
return RISCVISD::XOR_VL;
case ISD::VP_SELECT:
- return RISCVISD::VSELECT_VL;
case ISD::VP_MERGE:
return RISCVISD::VMERGE_VL;
case ISD::VP_ASHR:
@@ -6453,6 +6460,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
!Subtarget.hasVInstructionsF16()))
return SplitVectorOp(Op, DAG);
[[fallthrough]];
+ case ISD::AVGFLOORU:
case ISD::SADDSAT:
case ISD::UADDSAT:
case ISD::SSUBSAT:
@@ -6914,7 +6922,7 @@ static SDValue combineSelectToBinOp(SDNode *N, SelectionDAG &DAG,
MVT VT = N->getSimpleValueType(0);
SDLoc DL(N);
- if (!Subtarget.hasShortForwardBranchOpt()) {
+ if (!Subtarget.hasConditionalMoveFusion()) {
// (select c, -1, y) -> -c | y
if (isAllOnesConstant(TrueV)) {
SDValue Neg = DAG.getNegative(CondV, DL, VT);
@@ -7078,7 +7086,7 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// (select c, t, f) -> (or (czero_eqz t, c), (czero_nez f, c))
// Unless we have the short forward branch optimization.
- if (!Subtarget.hasShortForwardBranchOpt())
+ if (!Subtarget.hasConditionalMoveFusion())
return DAG.getNode(
ISD::OR, DL, VT,
DAG.getNode(RISCVISD::CZERO_EQZ, DL, VT, TrueV, CondV),
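
A quick standalone check of the (select c, t, f) -> (or (czero_eqz t, c), (czero_nez f, c)) rewrite referenced in the comment above, assuming the usual Zicond semantics (czero.eqz zeroes its source when the condition is zero, czero.nez when it is non-zero); the helpers below are illustrative, not LLVM code:

#include <cassert>
#include <cstdint>
#include <initializer_list>

static uint64_t czero_eqz(uint64_t rs, uint64_t rc) { return rc == 0 ? 0 : rs; }
static uint64_t czero_nez(uint64_t rs, uint64_t rc) { return rc != 0 ? 0 : rs; }

int main() {
  // For any condition value, exactly one of the two OR operands is zero,
  // so the OR reproduces the select.
  for (uint64_t c : {0u, 1u, 42u})
    for (uint64_t t : {0u, 7u})
      for (uint64_t f : {0u, 9u})
        assert((czero_eqz(t, c) | czero_nez(f, c)) == (c ? t : f));
  return 0;
}

Because the identity always holds, the combine is skipped only when conditional-move fusion (previously the short-forward-branch check) makes the branch form preferable.
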
@@ -7456,8 +7464,9 @@ SDValue RISCVTargetLowering::lowerVectorMaskExt(SDValue Op, SelectionDAG &DAG,
DAG.getUNDEF(ContainerVT), SplatZero, VL);
SplatTrueVal = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT,
DAG.getUNDEF(ContainerVT), SplatTrueVal, VL);
- SDValue Select = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, CC,
- SplatTrueVal, SplatZero, VL);
+ SDValue Select =
+ DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, CC, SplatTrueVal,
+ SplatZero, DAG.getUNDEF(ContainerVT), VL);
return convertFromScalableVector(VecVT, Select, DAG, Subtarget);
}
@@ -7906,8 +7915,7 @@ SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
// Use tail agnostic policy if Idx is the last index of Vec.
unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED;
if (VecVT.isFixedLengthVector() && isa<ConstantSDNode>(Idx) &&
- cast<ConstantSDNode>(Idx)->getZExtValue() + 1 ==
- VecVT.getVectorNumElements())
+ Idx->getAsZExtVal() + 1 == VecVT.getVectorNumElements())
Policy = RISCVII::TAIL_AGNOSTIC;
SDValue Slideup = getVSlideup(DAG, Subtarget, DL, ContainerVT, Vec, ValInVec,
Idx, Mask, InsertVL, Policy);
@@ -8167,7 +8175,7 @@ static SDValue lowerVectorIntrinsicScalars(SDValue Op, SelectionDAG &DAG,
const auto [MinVLMAX, MaxVLMAX] =
RISCVTargetLowering::computeVLMAXBounds(VT, Subtarget);
- uint64_t AVLInt = cast<ConstantSDNode>(AVL)->getZExtValue();
+ uint64_t AVLInt = AVL->getAsZExtVal();
if (AVLInt <= MinVLMAX) {
I32VL = DAG.getConstant(2 * AVLInt, DL, XLenVT);
} else if (AVLInt >= 2 * MaxVLMAX) {
@@ -8233,15 +8241,14 @@ static SDValue lowerVectorIntrinsicScalars(SDValue Op, SelectionDAG &DAG,
SDValue Mask = Operands[NumOps - 3];
SDValue MaskedOff = Operands[1];
// Assume Policy operand is the last operand.
- uint64_t Policy =
- cast<ConstantSDNode>(Operands[NumOps - 1])->getZExtValue();
+ uint64_t Policy = Operands[NumOps - 1]->getAsZExtVal();
// We don't need to select maskedoff if it's undef.
if (MaskedOff.isUndef())
return Vec;
// TAMU
if (Policy == RISCVII::TAIL_AGNOSTIC)
- return DAG.getNode(RISCVISD::VSELECT_VL, DL, VT, Mask, Vec, MaskedOff,
- AVL);
+ return DAG.getNode(RISCVISD::VMERGE_VL, DL, VT, Mask, Vec, MaskedOff,
+ DAG.getUNDEF(VT), AVL);
// TUMA or TUMU: Currently we always emit the tumu policy regardless of tuma.
// It's fine because vmerge does not care about the mask policy.
return DAG.getNode(RISCVISD::VMERGE_VL, DL, VT, Mask, Vec, MaskedOff,
@@ -8489,8 +8496,8 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
DAG.getNode(RISCVISD::SETCC_VL, DL, MaskVT,
{VID, SplattedIdx, DAG.getCondCode(ISD::SETEQ),
DAG.getUNDEF(MaskVT), Mask, VL});
- return DAG.getNode(RISCVISD::VSELECT_VL, DL, VT, SelectCond, SplattedVal,
- Vec, VL);
+ return DAG.getNode(RISCVISD::VMERGE_VL, DL, VT, SelectCond, SplattedVal,
+ Vec, DAG.getUNDEF(VT), VL);
}
// EGS * EEW >= 128 bits
case Intrinsic::riscv_vaesdf_vv:
@@ -10243,8 +10250,8 @@ SDValue RISCVTargetLowering::lowerFixedLengthVectorSelectToRVV(
SDLoc DL(Op);
SDValue VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
- SDValue Select =
- DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, CC, Op1, Op2, VL);
+ SDValue Select = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, CC, Op1,
+ Op2, DAG.getUNDEF(ContainerVT), VL);
return convertFromScalableVector(VT, Select, DAG, Subtarget);
}
@@ -10327,9 +10334,14 @@ SDValue RISCVTargetLowering::lowerVPOp(SDValue Op, SelectionDAG &DAG) const {
Ops.push_back(DAG.getUNDEF(ContainerVT));
} else if (ISD::getVPExplicitVectorLengthIdx(Op.getOpcode()) ==
OpIdx.index()) {
- // For VP_MERGE, copy the false operand instead of an undef value.
- assert(Op.getOpcode() == ISD::VP_MERGE);
- Ops.push_back(Ops.back());
+ if (Op.getOpcode() == ISD::VP_MERGE) {
+ // For VP_MERGE, copy the false operand instead of an undef value.
+ Ops.push_back(Ops.back());
+ } else {
+ assert(Op.getOpcode() == ISD::VP_SELECT);
+ // For VP_SELECT, add an undef value.
+ Ops.push_back(DAG.getUNDEF(ContainerVT));
+ }
}
}
// Pass through operands which aren't fixed-length vectors.
@@ -10379,8 +10391,8 @@ SDValue RISCVTargetLowering::lowerVPExtMaskOp(SDValue Op,
SDValue Splat = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT,
DAG.getUNDEF(ContainerVT), SplatValue, VL);
- SDValue Result = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, Src,
- Splat, ZeroSplat, VL);
+ SDValue Result = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, Src, Splat,
+ ZeroSplat, DAG.getUNDEF(ContainerVT), VL);
if (!VT.isFixedLengthVector())
return Result;
return convertFromScalableVector(VT, Result, DAG, Subtarget);
@@ -10508,8 +10520,8 @@ SDValue RISCVTargetLowering::lowerVPFPIntConvOp(SDValue Op,
RISCVISDExtOpc == RISCVISD::VZEXT_VL ? 1 : -1, DL, XLenVT);
SDValue OneSplat = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IntVT,
DAG.getUNDEF(IntVT), One, VL);
- Src = DAG.getNode(RISCVISD::VSELECT_VL, DL, IntVT, Src, OneSplat,
- ZeroSplat, VL);
+ Src = DAG.getNode(RISCVISD::VMERGE_VL, DL, IntVT, Src, OneSplat,
+ ZeroSplat, DAG.getUNDEF(IntVT), VL);
} else if (DstEltSize > (2 * SrcEltSize)) {
// Widen before converting.
MVT IntVT = MVT::getVectorVT(MVT::getIntegerVT(DstEltSize / 2),
@@ -10633,8 +10645,8 @@ RISCVTargetLowering::lowerVPSpliceExperimental(SDValue Op,
SDValue SplatZeroOp1 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT,
DAG.getUNDEF(ContainerVT),
DAG.getConstant(0, DL, XLenVT), EVL1);
- Op1 = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, Op1, SplatOneOp1,
- SplatZeroOp1, EVL1);
+ Op1 = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, Op1, SplatOneOp1,
+ SplatZeroOp1, DAG.getUNDEF(ContainerVT), EVL1);
SDValue SplatOneOp2 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT,
DAG.getUNDEF(ContainerVT),
@@ -10642,8 +10654,8 @@ RISCVTargetLowering::lowerVPSpliceExperimental(SDValue Op,
SDValue SplatZeroOp2 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT,
DAG.getUNDEF(ContainerVT),
DAG.getConstant(0, DL, XLenVT), EVL2);
- Op2 = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, Op2, SplatOneOp2,
- SplatZeroOp2, EVL2);
+ Op2 = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, Op2, SplatOneOp2,
+ SplatZeroOp2, DAG.getUNDEF(ContainerVT), EVL2);
}
int64_t ImmValue = cast<ConstantSDNode>(Offset)->getSExtValue();
@@ -10713,8 +10725,8 @@ RISCVTargetLowering::lowerVPReverseExperimental(SDValue Op,
SDValue SplatZero = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IndicesVT,
DAG.getUNDEF(IndicesVT),
DAG.getConstant(0, DL, XLenVT), EVL);
- Op1 = DAG.getNode(RISCVISD::VSELECT_VL, DL, IndicesVT, Op1, SplatOne,
- SplatZero, EVL);
+ Op1 = DAG.getNode(RISCVISD::VMERGE_VL, DL, IndicesVT, Op1, SplatOne,
+ SplatZero, DAG.getUNDEF(IndicesVT), EVL);
}
unsigned EltSize = GatherVT.getScalarSizeInBits();
@@ -12197,7 +12209,7 @@ static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
if (VT.isVector())
return SDValue();
- if (!Subtarget.hasShortForwardBranchOpt()) {
+ if (!Subtarget.hasConditionalMoveFusion()) {
// (select cond, x, (and x, c)) has custom lowering with Zicond.
if ((!Subtarget.hasStdExtZicond() &&
!Subtarget.hasVendorXVentanaCondOps()) ||
@@ -12850,9 +12862,9 @@ struct CombineResult;
/// Helper class for folding sign/zero extensions.
/// In particular, this class is used for the following combines:
-/// add | add_vl -> vwadd(u) | vwadd(u)_w
-/// sub | sub_vl -> vwsub(u) | vwsub(u)_w
-/// mul | mul_vl -> vwmul(u) | vwmul_su
+/// add_vl -> vwadd(u) | vwadd(u)_w
+/// sub_vl -> vwsub(u) | vwsub(u)_w
+/// mul_vl -> vwmul(u) | vwmul_su
///
/// An object of this class represents an operand of the operation we want to
/// combine.
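The folds this helper class looks for all turn an extend-then-operate sequence into one RVV widening instruction. A scalar model of the simplest case, given purely as an illustration:

```cpp
#include <cstdint>

// Hedged scalar model of the vwaddu fold: ADD_VL(zext(a), zext(b)) computed at
// 2*SEW is exactly what a single vwaddu.vv produces per element, so the
// explicit extensions can be dropped when every user tolerates it.
static uint16_t widenedAddU(uint8_t A, uint8_t B) {
  return static_cast<uint16_t>(A) + static_cast<uint16_t>(B);
}
```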
@@ -12897,8 +12909,6 @@ struct NodeExtensionHelper {
/// E.g., for zext(a), this would return a.
SDValue getSource() const {
switch (OrigOperand.getOpcode()) {
- case ISD::ZERO_EXTEND:
- case ISD::SIGN_EXTEND:
case RISCVISD::VSEXT_VL:
case RISCVISD::VZEXT_VL:
return OrigOperand.getOperand(0);
@@ -12915,8 +12925,7 @@ struct NodeExtensionHelper {
/// Get or create a value that can feed \p Root with the given extension \p
/// SExt. If \p SExt is std::nullopt, this returns the source of this operand.
/// \see ::getSource().
- SDValue getOrCreateExtendedOp(SDNode *Root, SelectionDAG &DAG,
- const RISCVSubtarget &Subtarget,
+ SDValue getOrCreateExtendedOp(const SDNode *Root, SelectionDAG &DAG,
std::optional<bool> SExt) const {
if (!SExt.has_value())
return OrigOperand;
@@ -12931,10 +12940,8 @@ struct NodeExtensionHelper {
// If we need an extension, we should be changing the type.
SDLoc DL(Root);
- auto [Mask, VL] = getMaskAndVL(Root, DAG, Subtarget);
+ auto [Mask, VL] = getMaskAndVL(Root);
switch (OrigOperand.getOpcode()) {
- case ISD::ZERO_EXTEND:
- case ISD::SIGN_EXTEND:
case RISCVISD::VSEXT_VL:
case RISCVISD::VZEXT_VL:
return DAG.getNode(ExtOpc, DL, NarrowVT, Source, Mask, VL);
@@ -12974,15 +12981,12 @@ struct NodeExtensionHelper {
/// \pre \p Opcode represents a supported root (\see ::isSupportedRoot()).
static unsigned getSameExtensionOpcode(unsigned Opcode, bool IsSExt) {
switch (Opcode) {
- case ISD::ADD:
case RISCVISD::ADD_VL:
case RISCVISD::VWADD_W_VL:
case RISCVISD::VWADDU_W_VL:
return IsSExt ? RISCVISD::VWADD_VL : RISCVISD::VWADDU_VL;
- case ISD::MUL:
case RISCVISD::MUL_VL:
return IsSExt ? RISCVISD::VWMUL_VL : RISCVISD::VWMULU_VL;
- case ISD::SUB:
case RISCVISD::SUB_VL:
case RISCVISD::VWSUB_W_VL:
case RISCVISD::VWSUBU_W_VL:
@@ -12995,8 +12999,7 @@ struct NodeExtensionHelper {
/// Get the opcode to materialize \p Opcode(sext(a), zext(b)) ->
/// newOpcode(a, b).
static unsigned getSUOpcode(unsigned Opcode) {
- assert((Opcode == RISCVISD::MUL_VL || Opcode == ISD::MUL) &&
- "SU is only supported for MUL");
+ assert(Opcode == RISCVISD::MUL_VL && "SU is only supported for MUL");
return RISCVISD::VWMULSU_VL;
}
@@ -13004,10 +13007,8 @@ struct NodeExtensionHelper {
/// newOpcode(a, b).
static unsigned getWOpcode(unsigned Opcode, bool IsSExt) {
switch (Opcode) {
- case ISD::ADD:
case RISCVISD::ADD_VL:
return IsSExt ? RISCVISD::VWADD_W_VL : RISCVISD::VWADDU_W_VL;
- case ISD::SUB:
case RISCVISD::SUB_VL:
return IsSExt ? RISCVISD::VWSUB_W_VL : RISCVISD::VWSUBU_W_VL;
default:
@@ -13017,33 +13018,19 @@ struct NodeExtensionHelper {
using CombineToTry = std::function<std::optional<CombineResult>(
SDNode * /*Root*/, const NodeExtensionHelper & /*LHS*/,
- const NodeExtensionHelper & /*RHS*/, SelectionDAG &,
- const RISCVSubtarget &)>;
+ const NodeExtensionHelper & /*RHS*/)>;
/// Check if this node needs to be fully folded or extended for all users.
bool needToPromoteOtherUsers() const { return EnforceOneUse; }
/// Helper method to set the various fields of this struct based on the
/// type of \p Root.
- void fillUpExtensionSupport(SDNode *Root, SelectionDAG &DAG,
- const RISCVSubtarget &Subtarget) {
+ void fillUpExtensionSupport(SDNode *Root, SelectionDAG &DAG) {
SupportsZExt = false;
SupportsSExt = false;
EnforceOneUse = true;
CheckMask = true;
- unsigned Opc = OrigOperand.getOpcode();
- switch (Opc) {
- case ISD::ZERO_EXTEND:
- case ISD::SIGN_EXTEND: {
- if (OrigOperand.getValueType().isVector()) {
- SupportsZExt = Opc == ISD::ZERO_EXTEND;
- SupportsSExt = Opc == ISD::SIGN_EXTEND;
- SDLoc DL(Root);
- MVT VT = Root->getSimpleValueType(0);
- std::tie(Mask, VL) = getDefaultScalableVLOps(VT, DL, DAG, Subtarget);
- }
- break;
- }
+ switch (OrigOperand.getOpcode()) {
case RISCVISD::VZEXT_VL:
SupportsZExt = true;
Mask = OrigOperand.getOperand(1);
@@ -13099,16 +13086,8 @@ struct NodeExtensionHelper {
}
/// Check if \p Root supports any extension folding combines.
- static bool isSupportedRoot(const SDNode *Root, const SelectionDAG &DAG) {
+ static bool isSupportedRoot(const SDNode *Root) {
switch (Root->getOpcode()) {
- case ISD::ADD:
- case ISD::SUB:
- case ISD::MUL: {
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (!TLI.isTypeLegal(Root->getValueType(0)))
- return false;
- return Root->getValueType(0).isScalableVector();
- }
case RISCVISD::ADD_VL:
case RISCVISD::MUL_VL:
case RISCVISD::VWADD_W_VL:
@@ -13123,10 +13102,9 @@ struct NodeExtensionHelper {
}
/// Build a NodeExtensionHelper for \p Root.getOperand(\p OperandIdx).
- NodeExtensionHelper(SDNode *Root, unsigned OperandIdx, SelectionDAG &DAG,
- const RISCVSubtarget &Subtarget) {
- assert(isSupportedRoot(Root, DAG) && "Trying to build an helper with an "
- "unsupported root");
+ NodeExtensionHelper(SDNode *Root, unsigned OperandIdx, SelectionDAG &DAG) {
+ assert(isSupportedRoot(Root) && "Trying to build a helper with an "
+ "unsupported root");
assert(OperandIdx < 2 && "Requesting something else than LHS or RHS");
OrigOperand = Root->getOperand(OperandIdx);
@@ -13142,7 +13120,7 @@ struct NodeExtensionHelper {
SupportsZExt =
Opc == RISCVISD::VWADDU_W_VL || Opc == RISCVISD::VWSUBU_W_VL;
SupportsSExt = !SupportsZExt;
- std::tie(Mask, VL) = getMaskAndVL(Root, DAG, Subtarget);
+ std::tie(Mask, VL) = getMaskAndVL(Root);
CheckMask = true;
// There's no existing extension here, so we don't have to worry about
// making sure it gets removed.
@@ -13151,7 +13129,7 @@ struct NodeExtensionHelper {
}
[[fallthrough]];
default:
- fillUpExtensionSupport(Root, DAG, Subtarget);
+ fillUpExtensionSupport(Root, DAG);
break;
}
}
@@ -13167,27 +13145,14 @@ struct NodeExtensionHelper {
}
/// Helper function to get the Mask and VL from \p Root.
- static std::pair<SDValue, SDValue>
- getMaskAndVL(const SDNode *Root, SelectionDAG &DAG,
- const RISCVSubtarget &Subtarget) {
- assert(isSupportedRoot(Root, DAG) && "Unexpected root");
- switch (Root->getOpcode()) {
- case ISD::ADD:
- case ISD::SUB:
- case ISD::MUL: {
- SDLoc DL(Root);
- MVT VT = Root->getSimpleValueType(0);
- return getDefaultScalableVLOps(VT, DL, DAG, Subtarget);
- }
- default:
- return std::make_pair(Root->getOperand(3), Root->getOperand(4));
- }
+ static std::pair<SDValue, SDValue> getMaskAndVL(const SDNode *Root) {
+ assert(isSupportedRoot(Root) && "Unexpected root");
+ return std::make_pair(Root->getOperand(3), Root->getOperand(4));
}
/// Check if the Mask and VL of this operand are compatible with \p Root.
- bool areVLAndMaskCompatible(SDNode *Root, SelectionDAG &DAG,
- const RISCVSubtarget &Subtarget) const {
- auto [Mask, VL] = getMaskAndVL(Root, DAG, Subtarget);
+ bool areVLAndMaskCompatible(const SDNode *Root) const {
+ auto [Mask, VL] = getMaskAndVL(Root);
return isMaskCompatible(Mask) && isVLCompatible(VL);
}
@@ -13195,14 +13160,11 @@ struct NodeExtensionHelper {
/// foldings that are supported by this class.
static bool isCommutative(const SDNode *N) {
switch (N->getOpcode()) {
- case ISD::ADD:
- case ISD::MUL:
case RISCVISD::ADD_VL:
case RISCVISD::MUL_VL:
case RISCVISD::VWADD_W_VL:
case RISCVISD::VWADDU_W_VL:
return true;
- case ISD::SUB:
case RISCVISD::SUB_VL:
case RISCVISD::VWSUB_W_VL:
case RISCVISD::VWSUBU_W_VL:
@@ -13247,25 +13209,14 @@ struct CombineResult {
/// Return a value that uses TargetOpcode and that can be used to replace
/// Root.
/// The actual replacement is *not* done in that method.
- SDValue materialize(SelectionDAG &DAG,
- const RISCVSubtarget &Subtarget) const {
+ SDValue materialize(SelectionDAG &DAG) const {
SDValue Mask, VL, Merge;
- std::tie(Mask, VL) =
- NodeExtensionHelper::getMaskAndVL(Root, DAG, Subtarget);
- switch (Root->getOpcode()) {
- default:
- Merge = Root->getOperand(2);
- break;
- case ISD::ADD:
- case ISD::SUB:
- case ISD::MUL:
- Merge = DAG.getUNDEF(Root->getValueType(0));
- break;
- }
+ std::tie(Mask, VL) = NodeExtensionHelper::getMaskAndVL(Root);
+ Merge = Root->getOperand(2);
return DAG.getNode(TargetOpcode, SDLoc(Root), Root->getValueType(0),
- LHS.getOrCreateExtendedOp(Root, DAG, Subtarget, SExtLHS),
- RHS.getOrCreateExtendedOp(Root, DAG, Subtarget, SExtRHS),
- Merge, Mask, VL);
+ LHS.getOrCreateExtendedOp(Root, DAG, SExtLHS),
+ RHS.getOrCreateExtendedOp(Root, DAG, SExtRHS), Merge,
+ Mask, VL);
}
};
@@ -13282,16 +13233,15 @@ struct CombineResult {
static std::optional<CombineResult>
canFoldToVWWithSameExtensionImpl(SDNode *Root, const NodeExtensionHelper &LHS,
const NodeExtensionHelper &RHS, bool AllowSExt,
- bool AllowZExt, SelectionDAG &DAG,
- const RISCVSubtarget &Subtarget) {
+ bool AllowZExt) {
assert((AllowSExt || AllowZExt) && "Forgot to set what you want?");
- if (!LHS.areVLAndMaskCompatible(Root, DAG, Subtarget) ||
- !RHS.areVLAndMaskCompatible(Root, DAG, Subtarget))
+ if (!LHS.areVLAndMaskCompatible(Root) || !RHS.areVLAndMaskCompatible(Root))
return std::nullopt;
if (AllowZExt && LHS.SupportsZExt && RHS.SupportsZExt)
return CombineResult(NodeExtensionHelper::getSameExtensionOpcode(
Root->getOpcode(), /*IsSExt=*/false),
- Root, LHS, /*SExtLHS=*/false, RHS, /*SExtRHS=*/false);
+ Root, LHS, /*SExtLHS=*/false, RHS,
+ /*SExtRHS=*/false);
if (AllowSExt && LHS.SupportsSExt && RHS.SupportsSExt)
return CombineResult(NodeExtensionHelper::getSameExtensionOpcode(
Root->getOpcode(), /*IsSExt=*/true),
@@ -13308,10 +13258,9 @@ canFoldToVWWithSameExtensionImpl(SDNode *Root, const NodeExtensionHelper &LHS,
/// can be used to apply the pattern.
static std::optional<CombineResult>
canFoldToVWWithSameExtension(SDNode *Root, const NodeExtensionHelper &LHS,
- const NodeExtensionHelper &RHS, SelectionDAG &DAG,
- const RISCVSubtarget &Subtarget) {
+ const NodeExtensionHelper &RHS) {
return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/true,
- /*AllowZExt=*/true, DAG, Subtarget);
+ /*AllowZExt=*/true);
}
/// Check if \p Root follows a pattern Root(LHS, ext(RHS))
@@ -13320,9 +13269,8 @@ canFoldToVWWithSameExtension(SDNode *Root, const NodeExtensionHelper &LHS,
/// can be used to apply the pattern.
static std::optional<CombineResult>
canFoldToVW_W(SDNode *Root, const NodeExtensionHelper &LHS,
- const NodeExtensionHelper &RHS, SelectionDAG &DAG,
- const RISCVSubtarget &Subtarget) {
- if (!RHS.areVLAndMaskCompatible(Root, DAG, Subtarget))
+ const NodeExtensionHelper &RHS) {
+ if (!RHS.areVLAndMaskCompatible(Root))
return std::nullopt;
// FIXME: Is it useful to form a vwadd.wx or vwsub.wx if it removes a scalar
@@ -13346,10 +13294,9 @@ canFoldToVW_W(SDNode *Root, const NodeExtensionHelper &LHS,
/// can be used to apply the pattern.
static std::optional<CombineResult>
canFoldToVWWithSEXT(SDNode *Root, const NodeExtensionHelper &LHS,
- const NodeExtensionHelper &RHS, SelectionDAG &DAG,
- const RISCVSubtarget &Subtarget) {
+ const NodeExtensionHelper &RHS) {
return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/true,
- /*AllowZExt=*/false, DAG, Subtarget);
+ /*AllowZExt=*/false);
}
/// Check if \p Root follows a pattern Root(zext(LHS), zext(RHS))
@@ -13358,10 +13305,9 @@ canFoldToVWWithSEXT(SDNode *Root, const NodeExtensionHelper &LHS,
/// can be used to apply the pattern.
static std::optional<CombineResult>
canFoldToVWWithZEXT(SDNode *Root, const NodeExtensionHelper &LHS,
- const NodeExtensionHelper &RHS, SelectionDAG &DAG,
- const RISCVSubtarget &Subtarget) {
+ const NodeExtensionHelper &RHS) {
return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/false,
- /*AllowZExt=*/true, DAG, Subtarget);
+ /*AllowZExt=*/true);
}
/// Check if \p Root follows a pattern Root(sext(LHS), zext(RHS))
@@ -13370,13 +13316,10 @@ canFoldToVWWithZEXT(SDNode *Root, const NodeExtensionHelper &LHS,
/// can be used to apply the pattern.
static std::optional<CombineResult>
canFoldToVW_SU(SDNode *Root, const NodeExtensionHelper &LHS,
- const NodeExtensionHelper &RHS, SelectionDAG &DAG,
- const RISCVSubtarget &Subtarget) {
-
+ const NodeExtensionHelper &RHS) {
if (!LHS.SupportsSExt || !RHS.SupportsZExt)
return std::nullopt;
- if (!LHS.areVLAndMaskCompatible(Root, DAG, Subtarget) ||
- !RHS.areVLAndMaskCompatible(Root, DAG, Subtarget))
+ if (!LHS.areVLAndMaskCompatible(Root) || !RHS.areVLAndMaskCompatible(Root))
return std::nullopt;
return CombineResult(NodeExtensionHelper::getSUOpcode(Root->getOpcode()),
Root, LHS, /*SExtLHS=*/true, RHS, /*SExtRHS=*/false);
@@ -13386,8 +13329,6 @@ SmallVector<NodeExtensionHelper::CombineToTry>
NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {
SmallVector<CombineToTry> Strategies;
switch (Root->getOpcode()) {
- case ISD::ADD:
- case ISD::SUB:
case RISCVISD::ADD_VL:
case RISCVISD::SUB_VL:
// add|sub -> vwadd(u)|vwsub(u)
@@ -13395,7 +13336,6 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {
// add|sub -> vwadd(u)_w|vwsub(u)_w
Strategies.push_back(canFoldToVW_W);
break;
- case ISD::MUL:
case RISCVISD::MUL_VL:
// mul -> vwmul(u)
Strategies.push_back(canFoldToVWWithSameExtension);
@@ -13426,14 +13366,12 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {
/// mul_vl -> vwmul(u) | vwmul_su
/// vwadd_w(u) -> vwadd(u)
/// vwsub_w(u) -> vwsub(u)
-static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- const RISCVSubtarget &Subtarget) {
+static SDValue
+combineBinOp_VLToVWBinOp_VL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
- if (!NodeExtensionHelper::isSupportedRoot(N, DAG))
- return SDValue();
-
+ assert(NodeExtensionHelper::isSupportedRoot(N) &&
+ "Shouldn't have called this method");
SmallVector<SDNode *> Worklist;
SmallSet<SDNode *, 8> Inserted;
Worklist.push_back(N);
@@ -13442,11 +13380,11 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,
while (!Worklist.empty()) {
SDNode *Root = Worklist.pop_back_val();
- if (!NodeExtensionHelper::isSupportedRoot(Root, DAG))
+ if (!NodeExtensionHelper::isSupportedRoot(Root))
return SDValue();
- NodeExtensionHelper LHS(N, 0, DAG, Subtarget);
- NodeExtensionHelper RHS(N, 1, DAG, Subtarget);
+ NodeExtensionHelper LHS(N, 0, DAG);
+ NodeExtensionHelper RHS(N, 1, DAG);
auto AppendUsersIfNeeded = [&Worklist,
&Inserted](const NodeExtensionHelper &Op) {
if (Op.needToPromoteOtherUsers()) {
@@ -13473,8 +13411,7 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,
for (NodeExtensionHelper::CombineToTry FoldingStrategy :
FoldingStrategies) {
- std::optional<CombineResult> Res =
- FoldingStrategy(N, LHS, RHS, DAG, Subtarget);
+ std::optional<CombineResult> Res = FoldingStrategy(N, LHS, RHS);
if (Res) {
Matched = true;
CombinesToApply.push_back(*Res);
@@ -13503,7 +13440,7 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,
SmallVector<std::pair<SDValue, SDValue>> ValuesToReplace;
ValuesToReplace.reserve(CombinesToApply.size());
for (CombineResult Res : CombinesToApply) {
- SDValue NewValue = Res.materialize(DAG, Subtarget);
+ SDValue NewValue = Res.materialize(DAG);
if (!InputRootReplacement) {
assert(Res.Root == N &&
"First element is expected to be the current node");
@@ -14503,7 +14440,7 @@ static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG,
if (SDValue V = useInversedSetcc(N, DAG, Subtarget))
return V;
- if (Subtarget.hasShortForwardBranchOpt())
+ if (Subtarget.hasConditionalMoveFusion())
return SDValue();
SDValue TrueVal = N->getOperand(1);
@@ -14775,20 +14712,13 @@ static SDValue performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG,
static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
-
- assert(N->getOpcode() == RISCVISD::ADD_VL || N->getOpcode() == ISD::ADD);
-
- if (N->getValueType(0).isFixedLengthVector())
- return SDValue();
-
+ assert(N->getOpcode() == RISCVISD::ADD_VL);
SDValue Addend = N->getOperand(0);
SDValue MulOp = N->getOperand(1);
+ SDValue AddMergeOp = N->getOperand(2);
- if (N->getOpcode() == RISCVISD::ADD_VL) {
- SDValue AddMergeOp = N->getOperand(2);
- if (!AddMergeOp.isUndef())
- return SDValue();
- }
+ if (!AddMergeOp.isUndef())
+ return SDValue();
auto IsVWMulOpc = [](unsigned Opc) {
switch (Opc) {
@@ -14812,16 +14742,8 @@ static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG,
if (!MulMergeOp.isUndef())
return SDValue();
- auto [AddMask, AddVL] = [](SDNode *N, SelectionDAG &DAG,
- const RISCVSubtarget &Subtarget) {
- if (N->getOpcode() == ISD::ADD) {
- SDLoc DL(N);
- return getDefaultScalableVLOps(N->getSimpleValueType(0), DL, DAG,
- Subtarget);
- }
- return std::make_pair(N->getOperand(3), N->getOperand(4));
- }(N, DAG, Subtarget);
-
+ SDValue AddMask = N->getOperand(3);
+ SDValue AddVL = N->getOperand(4);
SDValue MulMask = MulOp.getOperand(3);
SDValue MulVL = MulOp.getOperand(4);
@@ -15087,18 +15009,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
return DAG.getNode(ISD::AND, DL, VT, NewFMV,
DAG.getConstant(~SignBit, DL, VT));
}
- case ISD::ADD: {
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
- return V;
- if (SDValue V = combineToVWMACC(N, DAG, Subtarget))
- return V;
+ case ISD::ADD:
return performADDCombine(N, DAG, Subtarget);
- }
- case ISD::SUB: {
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
- return V;
+ case ISD::SUB:
return performSUBCombine(N, DAG, Subtarget);
- }
case ISD::AND:
return performANDCombine(N, DCI, Subtarget);
case ISD::OR:
@@ -15106,8 +15020,6 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::XOR:
return performXORCombine(N, DAG, Subtarget);
case ISD::MUL:
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
- return V;
return performMULCombine(N, DAG);
case ISD::FADD:
case ISD::UMAX:
@@ -15266,7 +15178,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
return DAG.getNode(RISCVISD::SELECT_CC, DL, N->getValueType(0),
{LHS, RHS, CC, TrueV, FalseV});
- if (!Subtarget.hasShortForwardBranchOpt()) {
+ if (!Subtarget.hasConditionalMoveFusion()) {
// (select c, -1, y) -> -c | y
if (isAllOnesConstant(TrueV)) {
SDValue C = DAG.getSetCC(DL, VT, LHS, RHS, CCVal);
@@ -15584,7 +15496,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
case RISCVISD::ADD_VL:
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI))
return V;
return combineToVWMACC(N, DAG, Subtarget);
case RISCVISD::SUB_VL:
@@ -15593,7 +15505,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
case RISCVISD::VWSUB_W_VL:
case RISCVISD::VWSUBU_W_VL:
case RISCVISD::MUL_VL:
- return combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget);
+ return combineBinOp_VLToVWBinOp_VL(N, DCI);
case RISCVISD::VFMADD_VL:
case RISCVISD::VFNMADD_VL:
case RISCVISD::VFMSUB_VL:
@@ -18303,20 +18215,9 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
// split it and then direct call can be matched by PseudoCALL.
if (GlobalAddressSDNode *S = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = S->getGlobal();
-
- unsigned OpFlags = RISCVII::MO_CALL;
- if (!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV))
- OpFlags = RISCVII::MO_PLT;
-
- Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
+ Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, RISCVII::MO_CALL);
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
- unsigned OpFlags = RISCVII::MO_CALL;
-
- if (!getTargetMachine().shouldAssumeDSOLocal(*MF.getFunction().getParent(),
- nullptr))
- OpFlags = RISCVII::MO_PLT;
-
- Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT, OpFlags);
+ Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT, RISCVII::MO_CALL);
}
// The first call operand is the chain and the second is the target address.
@@ -18694,6 +18595,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(UDIV_VL)
NODE_NAME_CASE(UREM_VL)
NODE_NAME_CASE(XOR_VL)
+ NODE_NAME_CASE(AVGFLOORU_VL)
NODE_NAME_CASE(SADDSAT_VL)
NODE_NAME_CASE(UADDSAT_VL)
NODE_NAME_CASE(SSUBSAT_VL)
@@ -18783,7 +18685,6 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(VWMACCSU_VL)
NODE_NAME_CASE(VNSRL_VL)
NODE_NAME_CASE(SETCC_VL)
- NODE_NAME_CASE(VSELECT_VL)
NODE_NAME_CASE(VMERGE_VL)
NODE_NAME_CASE(VMAND_VL)
NODE_NAME_CASE(VMOR_VL)
@@ -19357,7 +19258,6 @@ bool RISCVTargetLowering::isVScaleKnownToBeAPowerOfTwo() const {
bool RISCVTargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
SDValue &Offset,
ISD::MemIndexedMode &AM,
- bool &IsInc,
SelectionDAG &DAG) const {
// Target does not support indexed loads.
if (!Subtarget.hasVendorXTHeadMemIdx())
@@ -19384,7 +19284,6 @@ bool RISCVTargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
if (!isLegalIndexedOffset)
return false;
- IsInc = (Op->getOpcode() == ISD::ADD);
Offset = Op->getOperand(1);
return true;
}
@@ -19407,11 +19306,10 @@ bool RISCVTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
} else
return false;
- bool IsInc;
- if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
+ if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, DAG))
return false;
- AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
+ AM = ISD::PRE_INC;
return true;
}
@@ -19431,15 +19329,14 @@ bool RISCVTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
} else
return false;
- bool IsInc;
- if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
+ if (!getIndexedAddressParts(Op, Base, Offset, AM, DAG))
return false;
// Post-indexing updates the base, so it's not a valid transform
// if that's not the same as the load's pointer.
if (Ptr != Base)
return false;
- AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
+ AM = ISD::POST_INC;
return true;
}
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 58ed611efc83..5d51fe168b04 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -253,6 +253,9 @@ enum NodeType : unsigned {
SSUBSAT_VL,
USUBSAT_VL,
+ // Averaging adds of unsigned integers.
+ AVGFLOORU_VL,
+
MULHS_VL,
MULHU_VL,
FADD_VL,
@@ -330,9 +333,8 @@ enum NodeType : unsigned {
// operand is VL.
SETCC_VL,
- // Vector select with an additional VL operand. This operation is unmasked.
- VSELECT_VL,
// General vmerge node with mask, true, false, passthru, and vl operands.
// A tail-agnostic vselect can be implemented by setting the passthru to undef.
VMERGE_VL,
// Mask binary operators.
@@ -526,7 +528,8 @@ public:
InstructionCost getVRGatherVVCost(MVT VT) const;
InstructionCost getVRGatherVICost(MVT VT) const;
- InstructionCost getVSlideCost(MVT VT) const;
+ InstructionCost getVSlideVXCost(MVT VT) const;
+ InstructionCost getVSlideVICost(MVT VT) const;
// Provide custom lowering hooks for some operations.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
@@ -774,8 +777,7 @@ public:
bool isVScaleKnownToBeAPowerOfTwo() const override;
bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset,
- ISD::MemIndexedMode &AM, bool &IsInc,
- SelectionDAG &DAG) const;
+ ISD::MemIndexedMode &AM, SelectionDAG &DAG) const;
bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,
ISD::MemIndexedMode &AM,
SelectionDAG &DAG) const override;
@@ -903,6 +905,7 @@ private:
SDValue lowerFixedLengthVectorSelectToRVV(SDValue Op,
SelectionDAG &DAG) const;
SDValue lowerToScalableOp(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerUnsignedAvgFloor(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerIS_FPCLASS(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVPOp(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerLogicVPOp(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 3400b24e0abb..e591aa935c0b 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -1381,6 +1381,11 @@ void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) {
if (!UnavailablePred || !AvailableInfo.isValid())
return;
+ // If we don't know the exact VTYPE, we can't copy the vsetvli to the exit of
+ // the unavailable pred.
+ if (AvailableInfo.hasSEWLMULRatioOnly())
+ return;
+
// Critical edge - TODO: consider splitting?
if (UnavailablePred->succ_size() != 1)
return;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index cd98438eed88..351f48c1708e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1346,6 +1346,10 @@ unsigned getPredicatedOpcode(unsigned Opcode) {
case RISCV::SLLIW: return RISCV::PseudoCCSLLIW; break;
case RISCV::SRLIW: return RISCV::PseudoCCSRLIW; break;
case RISCV::SRAIW: return RISCV::PseudoCCSRAIW; break;
+
+ case RISCV::ANDN: return RISCV::PseudoCCANDN; break;
+ case RISCV::ORN: return RISCV::PseudoCCORN; break;
+ case RISCV::XNOR: return RISCV::PseudoCCXNOR; break;
}
return RISCV::INSTRUCTION_LIST_END;
@@ -2365,7 +2369,6 @@ RISCVInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
using namespace RISCVII;
static const std::pair<unsigned, const char *> TargetFlags[] = {
{MO_CALL, "riscv-call"},
- {MO_PLT, "riscv-plt"},
{MO_LO, "riscv-lo"},
{MO_HI, "riscv-hi"},
{MO_PCREL_LO, "riscv-pcrel-lo"},
@@ -2651,6 +2654,7 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
case RISCV::TH_MULSH:
// Operands 2 and 3 are commutable.
return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3);
+ case RISCV::PseudoCCMOVGPRNoX0:
case RISCV::PseudoCCMOVGPR:
// Operands 4 and 5 are commutable.
return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 4, 5);
@@ -2807,6 +2811,7 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI,
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, false, OpIdx1,
OpIdx2);
}
+ case RISCV::PseudoCCMOVGPRNoX0:
case RISCV::PseudoCCMOVGPR: {
// CCMOV can be commuted by inverting the condition.
auto CC = static_cast<RISCVCC::CondCode>(MI.getOperand(3).getImm());
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 35e8edf5d2fa..792e0bbdf581 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -729,22 +729,6 @@ def UNIMP : RVInstI<0b001, OPC_SYSTEM, (outs), (ins), "unimp", "">,
let imm12 = 0b110000000000;
}
-let Predicates = [HasStdExtZawrs] in {
-def WRS_NTO : RVInstI<0b000, OPC_SYSTEM, (outs), (ins), "wrs.nto", "">,
- Sched<[]> {
- let rs1 = 0;
- let rd = 0;
- let imm12 = 0b000000001101;
-}
-
-def WRS_STO : RVInstI<0b000, OPC_SYSTEM, (outs), (ins), "wrs.sto", "">,
- Sched<[]> {
- let rs1 = 0;
- let rd = 0;
- let imm12 = 0b000000011101;
-}
-} // Predicates = [HasStdExtZawrs]
-
} // hasSideEffects = 1, mayLoad = 0, mayStore = 0
def CSRRW : CSR_ir<0b001, "csrrw">;
@@ -1387,6 +1371,24 @@ def PseudoCCMOVGPR : Pseudo<(outs GPR:$dst),
ReadSFBALU, ReadSFBALU]>;
}
+// This should always expand to a branch+c.mv, so the size is 6, or 4 if the
+// branch is compressible.
+let Predicates = [HasConditionalMoveFusion, NoShortForwardBranchOpt],
+ Constraints = "$dst = $falsev", isCommutable = 1, Size = 6 in {
+// This instruction moves $truev to $dst when the condition is true. It will
+// be expanded to control flow in RISCVExpandPseudoInsts.
+// We use GPRNoX0 because c.mv cannot encode X0.
+def PseudoCCMOVGPRNoX0 : Pseudo<(outs GPRNoX0:$dst),
+ (ins GPR:$lhs, GPR:$rhs, ixlenimm:$cc,
+ GPRNoX0:$falsev, GPRNoX0:$truev),
+ [(set GPRNoX0:$dst,
+ (riscv_selectcc_frag:$cc (XLenVT GPR:$lhs),
+ (XLenVT GPR:$rhs),
+ cond, (XLenVT GPRNoX0:$truev),
+ (XLenVT GPRNoX0:$falsev)))]>,
+ Sched<[]>;
+}
+
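As the comment above says, the pseudo is later expanded to a conditional branch over a (possibly compressed) register move. A scalar model of what that expansion computes, not the expansion code itself:

```cpp
#include <cstdint>

// Hedged model: $dst is tied to $falsev, and the c.mv that the branch may skip
// overwrites it with $truev only when the condition holds.
static uint64_t conditionalMove(bool Cond, uint64_t FalseV, uint64_t TrueV) {
  uint64_t Dst = FalseV; // tied operand: start from the false value
  if (Cond)
    Dst = TrueV;         // the move the branch jumps over when Cond is false
  return Dst;
}
```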
// Conditional binops that update $dst to (op rs1, rs2) when condition
// is true. Returns $falsev otherwise. Selected by optimizeSelect.
// TODO: Can we use DefaultOperands on the regular binop to accomplish this more
@@ -1517,6 +1519,23 @@ def PseudoCCSRAIW : Pseudo<(outs GPR:$dst),
GPR:$falsev, GPR:$rs1, simm12:$rs2), []>,
Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
ReadSFBALU]>;
+
+// Zbb/Zbkb instructions
+def PseudoCCANDN : Pseudo<(outs GPR:$dst),
+ (ins GPR:$lhs, GPR:$rhs, ixlenimm:$cc,
+ GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
+ Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp,
+ ReadSFBALU, ReadSFBALU, ReadSFBALU]>;
+def PseudoCCORN : Pseudo<(outs GPR:$dst),
+ (ins GPR:$lhs, GPR:$rhs, ixlenimm:$cc,
+ GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
+ Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp,
+ ReadSFBALU, ReadSFBALU, ReadSFBALU]>;
+def PseudoCCXNOR : Pseudo<(outs GPR:$dst),
+ (ins GPR:$lhs, GPR:$rhs, ixlenimm:$cc,
+ GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
+ Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp,
+ ReadSFBALU, ReadSFBALU, ReadSFBALU]>;
}
multiclass SelectCC_GPR_rrirr<DAGOperand valty, ValueType vt> {
@@ -1535,7 +1554,7 @@ multiclass SelectCC_GPR_rrirr<DAGOperand valty, ValueType vt> {
(IntCCtoRISCVCC $cc), valty:$truev, valty:$falsev)>;
}
-let Predicates = [NoShortForwardBranchOpt] in
+let Predicates = [NoConditionalMoveFusion] in
defm Select_GPR : SelectCC_GPR_rrirr<GPR, XLenVT>;
class SelectCompressOpt<CondCode Cond>
@@ -2095,6 +2114,7 @@ include "RISCVInstrInfoM.td"
// Atomic
include "RISCVInstrInfoA.td"
+include "RISCVInstrInfoZa.td"
// Scalar FP
include "RISCVInstrInfoF.td"
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
index c8301fcc6b93..4d0567e41abc 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
@@ -7,8 +7,7 @@
//===----------------------------------------------------------------------===//
//
// This file describes the RISC-V instructions from the standard 'A', Atomic
-// Instructions extension as well as the experimental 'Zacas' (Atomic
-// Compare-and-Swap) extension.
+// Instructions extension.
//
//===----------------------------------------------------------------------===//
@@ -96,15 +95,6 @@ defm AMOMAXU_D : AMO_rr_aq_rl<0b11100, 0b011, "amomaxu.d">,
Sched<[WriteAtomicD, ReadAtomicDA, ReadAtomicDD]>;
} // Predicates = [HasStdExtA, IsRV64]
-let Predicates = [HasStdExtZacas] in {
-defm AMOCAS_W : AMO_rr_aq_rl<0b00101, 0b010, "amocas.w">;
-defm AMOCAS_D : AMO_rr_aq_rl<0b00101, 0b011, "amocas.d">;
-} // Predicates = [HasStdExtZacas]
-
-let Predicates = [HasStdExtZacas, IsRV64] in {
-defm AMOCAS_Q : AMO_rr_aq_rl<0b00101, 0b100, "amocas.q">;
-} // Predicates = [HasStdExtZacas, IsRV64]
-
//===----------------------------------------------------------------------===//
// Pseudo-instructions and codegen patterns
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
index 6af710049a9d..418421b2a556 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
@@ -36,11 +36,13 @@ def AddrRegImmINX : ComplexPattern<iPTR, 2, "SelectAddrRegImmINX">;
def GPRPF64AsFPR : AsmOperandClass {
let Name = "GPRPF64AsFPR";
let ParserMethod = "parseGPRAsFPR";
+ let PredicateMethod = "isGPRAsFPR";
let RenderMethod = "addRegOperands";
}
def GPRF64AsFPR : AsmOperandClass {
let Name = "GPRF64AsFPR";
+ let PredicateMethod = "isGPRAsFPR";
let ParserMethod = "parseGPRAsFPR";
let RenderMethod = "addRegOperands";
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 30deeaa06448..fcb18b67623e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -6719,12 +6719,14 @@ defm PseudoVMSET : VPseudoNullaryPseudoM<"VMXNOR">;
// 15.2. Vector mask population count vcpop
//===----------------------------------------------------------------------===//
+let IsSignExtendingOpW = 1 in
defm PseudoVCPOP: VPseudoVPOP_M;
//===----------------------------------------------------------------------===//
// 15.3. vfirst find-first-set mask bit
//===----------------------------------------------------------------------===//
+let IsSignExtendingOpW = 1 in
defm PseudoVFIRST: VPseudoV1ST_M;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
index b7c845703794..4f87c36506e5 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -1131,6 +1131,22 @@ defm : VPatBinarySDNode_VV_VX_VI<uaddsat, "PseudoVSADDU">;
defm : VPatBinarySDNode_VV_VX<ssubsat, "PseudoVSSUB">;
defm : VPatBinarySDNode_VV_VX<usubsat, "PseudoVSSUBU">;
+// 12.2. Vector Single-Width Averaging Add and Subtract
+foreach vti = AllIntegerVectors in {
+ let Predicates = GetVTypePredicates<vti>.Predicates in {
+ def : Pat<(avgflooru (vti.Vector vti.RegClass:$rs1),
+ (vti.Vector vti.RegClass:$rs2)),
+ (!cast<Instruction>("PseudoVAADDU_VV_"#vti.LMul.MX)
+ (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, vti.RegClass:$rs2,
+ 0b10, vti.AVL, vti.Log2SEW, TA_MA)>;
+ def : Pat<(avgflooru (vti.Vector vti.RegClass:$rs1),
+ (vti.Vector (SplatPat (XLenVT GPR:$rs2)))),
+ (!cast<Instruction>("PseudoVAADDU_VX_"#vti.LMul.MX)
+ (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, GPR:$rs2,
+ 0b10, vti.AVL, vti.Log2SEW, TA_MA)>;
+ }
+}
+
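These patterns map the generic unsigned averaging add onto vaaddu with the rounding-mode operand fixed to 0b10, taken here to be the round-down (rdn) encoding, so each lane computes floor((a+b)/2) without widening. A scalar reference model:

```cpp
#include <cstdint>

// Hedged reference for ISD::AVGFLOORU / the vaaddu (rdn) patterns above:
// a + b == (a ^ b) + 2*(a & b), so the floored average can be computed without
// overflow as the carry part plus half of the non-carry part.
static uint8_t avgFloorU8(uint8_t A, uint8_t B) {
  return (A & B) + ((A ^ B) >> 1);
}
```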
// 15. Vector Mask Instructions
// 15.1. Vector Mask-Register Logical Instructions
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index 5b50a4a78c01..d60ff4b5fab0 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -111,6 +111,7 @@ def riscv_ctlz_vl : SDNode<"RISCVISD::CTLZ_VL", SDT_RISCVIntUnOp_VL>
def riscv_cttz_vl : SDNode<"RISCVISD::CTTZ_VL", SDT_RISCVIntUnOp_VL>;
def riscv_ctpop_vl : SDNode<"RISCVISD::CTPOP_VL", SDT_RISCVIntUnOp_VL>;
+def riscv_avgflooru_vl : SDNode<"RISCVISD::AVGFLOORU_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>;
def riscv_saddsat_vl : SDNode<"RISCVISD::SADDSAT_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>;
def riscv_uaddsat_vl : SDNode<"RISCVISD::UADDSAT_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>;
def riscv_ssubsat_vl : SDNode<"RISCVISD::SSUBSAT_VL", SDT_RISCVIntBinOp_VL>;
@@ -338,13 +339,6 @@ def riscv_vrgatherei16_vv_vl : SDNode<"RISCVISD::VRGATHEREI16_VV_VL",
SDTCisSameNumEltsAs<0, 4>,
SDTCisVT<5, XLenVT>]>>;
-def SDT_RISCVSelect_VL : SDTypeProfile<1, 4, [
- SDTCisVec<0>, SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>, SDTCVecEltisVT<1, i1>,
- SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>, SDTCisVT<4, XLenVT>
-]>;
-
-def riscv_vselect_vl : SDNode<"RISCVISD::VSELECT_VL", SDT_RISCVSelect_VL>;
-
def SDT_RISCVVMERGE_VL : SDTypeProfile<1, 5, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>, SDTCVecEltisVT<1, i1>,
SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>, SDTCisSameAs<0, 4>,
@@ -1722,21 +1716,21 @@ multiclass VPatMultiplyAccVL_VV_VX<PatFrag op, string instruction_name> {
(!cast<Instruction>(instruction_name#"_VX_"# suffix #"_MASK")
vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>;
- def : Pat<(riscv_vselect_vl (vti.Mask V0),
+ def : Pat<(riscv_vmerge_vl (vti.Mask V0),
(vti.Vector (op vti.RegClass:$rd,
(riscv_mul_vl_oneuse vti.RegClass:$rs1, vti.RegClass:$rs2,
srcvalue, (vti.Mask true_mask), VLOpFrag),
srcvalue, (vti.Mask true_mask), VLOpFrag)),
- vti.RegClass:$rd, VLOpFrag),
+ vti.RegClass:$rd, undef, VLOpFrag),
(!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")
vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
- def : Pat<(riscv_vselect_vl (vti.Mask V0),
+ def : Pat<(riscv_vmerge_vl (vti.Mask V0),
(vti.Vector (op vti.RegClass:$rd,
(riscv_mul_vl_oneuse (SplatPat XLenVT:$rs1), vti.RegClass:$rs2,
srcvalue, (vti.Mask true_mask), VLOpFrag),
srcvalue, (vti.Mask true_mask), VLOpFrag)),
- vti.RegClass:$rd, VLOpFrag),
+ vti.RegClass:$rd, undef, VLOpFrag),
(!cast<Instruction>(instruction_name#"_VX_"# suffix #"_MASK")
vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
@@ -1861,17 +1855,17 @@ multiclass VPatFPMulAccVL_VV_VF<PatFrag vop, string instruction_name> {
(!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK")
vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>;
- def : Pat<(riscv_vselect_vl (vti.Mask V0),
+ def : Pat<(riscv_vmerge_vl (vti.Mask V0),
(vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2,
vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
- vti.RegClass:$rd, VLOpFrag),
+ vti.RegClass:$rd, undef, VLOpFrag),
(!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")
vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
- def : Pat<(riscv_vselect_vl (vti.Mask V0),
+ def : Pat<(riscv_vmerge_vl (vti.Mask V0),
(vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2,
vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
- vti.RegClass:$rd, VLOpFrag),
+ vti.RegClass:$rd, undef, VLOpFrag),
(!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK")
vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
@@ -1905,10 +1899,10 @@ multiclass VPatFPMulAccVL_VV_VF_RM<PatFrag vop, string instruction_name> {
// RISCVInsertReadWriteCSR
FRM_DYN,
GPR:$vl, vti.Log2SEW, TU_MU)>;
- def : Pat<(riscv_vselect_vl (vti.Mask V0),
+ def : Pat<(riscv_vmerge_vl (vti.Mask V0),
(vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2,
vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
- vti.RegClass:$rd, VLOpFrag),
+ vti.RegClass:$rd, undef, VLOpFrag),
(!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")
vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
(vti.Mask V0),
@@ -1916,10 +1910,10 @@ multiclass VPatFPMulAccVL_VV_VF_RM<PatFrag vop, string instruction_name> {
// RISCVInsertReadWriteCSR
FRM_DYN,
GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
- def : Pat<(riscv_vselect_vl (vti.Mask V0),
+ def : Pat<(riscv_vmerge_vl (vti.Mask V0),
(vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2,
vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
- vti.RegClass:$rd, VLOpFrag),
+ vti.RegClass:$rd, undef, VLOpFrag),
(!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK")
vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
(vti.Mask V0),
@@ -2255,31 +2249,6 @@ foreach vtiTowti = AllWidenableIntVectors in {
// 11.15. Vector Integer Merge Instructions
foreach vti = AllIntegerVectors in {
let Predicates = GetVTypePredicates<vti>.Predicates in {
- def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0),
- vti.RegClass:$rs1,
- vti.RegClass:$rs2,
- VLOpFrag)),
- (!cast<Instruction>("PseudoVMERGE_VVM_"#vti.LMul.MX)
- (vti.Vector (IMPLICIT_DEF)),
- vti.RegClass:$rs2, vti.RegClass:$rs1, (vti.Mask V0),
- GPR:$vl, vti.Log2SEW)>;
-
- def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0),
- (SplatPat XLenVT:$rs1),
- vti.RegClass:$rs2,
- VLOpFrag)),
- (!cast<Instruction>("PseudoVMERGE_VXM_"#vti.LMul.MX)
- (vti.Vector (IMPLICIT_DEF)),
- vti.RegClass:$rs2, GPR:$rs1, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
-
- def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0),
- (SplatPat_simm5 simm5:$rs1),
- vti.RegClass:$rs2,
- VLOpFrag)),
- (!cast<Instruction>("PseudoVMERGE_VIM_"#vti.LMul.MX)
- (vti.Vector (IMPLICIT_DEF)),
- vti.RegClass:$rs2, simm5:$rs1, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
-
def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask V0),
vti.RegClass:$rs1,
vti.RegClass:$rs2,
@@ -2338,6 +2307,24 @@ defm : VPatBinaryVL_VV_VX_VI<riscv_uaddsat_vl, "PseudoVSADDU">;
defm : VPatBinaryVL_VV_VX<riscv_ssubsat_vl, "PseudoVSSUB">;
defm : VPatBinaryVL_VV_VX<riscv_usubsat_vl, "PseudoVSSUBU">;
+// 12.2. Vector Single-Width Averaging Add and Subtract
+foreach vti = AllIntegerVectors in {
+ let Predicates = GetVTypePredicates<vti>.Predicates in {
+ def : Pat<(riscv_avgflooru_vl (vti.Vector vti.RegClass:$rs1),
+ (vti.Vector vti.RegClass:$rs2),
+ vti.RegClass:$merge, (vti.Mask V0), VLOpFrag),
+ (!cast<Instruction>("PseudoVAADDU_VV_"#vti.LMul.MX#"_MASK")
+ vti.RegClass:$merge, vti.RegClass:$rs1, vti.RegClass:$rs2,
+ (vti.Mask V0), 0b10, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+ def : Pat<(riscv_avgflooru_vl (vti.Vector vti.RegClass:$rs1),
+ (vti.Vector (SplatPat (XLenVT GPR:$rs2))),
+ vti.RegClass:$merge, (vti.Mask V0), VLOpFrag),
+ (!cast<Instruction>("PseudoVAADDU_VX_"#vti.LMul.MX#"_MASK")
+ vti.RegClass:$merge, vti.RegClass:$rs1, GPR:$rs2,
+ (vti.Mask V0), 0b10, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+ }
+}
+
// 12.5. Vector Narrowing Fixed-Point Clip Instructions
class VPatTruncSatClipMaxMinBase<string inst,
VTypeInfo vti,
@@ -2534,33 +2521,6 @@ foreach fvti = AllFloatVectors in {
// 13.15. Vector Floating-Point Merge Instruction
defvar ivti = GetIntVTypeInfo<fvti>.Vti;
let Predicates = GetVTypePredicates<ivti>.Predicates in {
- def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask V0),
- fvti.RegClass:$rs1,
- fvti.RegClass:$rs2,
- VLOpFrag)),
- (!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX)
- (fvti.Vector (IMPLICIT_DEF)),
- fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask V0),
- GPR:$vl, fvti.Log2SEW)>;
-
- def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask V0),
- (SplatFPOp (SelectFPImm (XLenVT GPR:$imm))),
- fvti.RegClass:$rs2,
- VLOpFrag)),
- (!cast<Instruction>("PseudoVMERGE_VXM_"#fvti.LMul.MX)
- (fvti.Vector (IMPLICIT_DEF)),
- fvti.RegClass:$rs2,
- GPR:$imm,
- (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>;
-
- def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask V0),
- (SplatFPOp (fvti.Scalar fpimm0)),
- fvti.RegClass:$rs2,
- VLOpFrag)),
- (!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX)
- (fvti.Vector (IMPLICIT_DEF)),
- fvti.RegClass:$rs2, 0, (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>;
-
def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0),
fvti.RegClass:$rs1,
fvti.RegClass:$rs2,
@@ -2571,6 +2531,16 @@ foreach fvti = AllFloatVectors in {
GPR:$vl, fvti.Log2SEW)>;
def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0),
+ (SplatFPOp (SelectFPImm (XLenVT GPR:$imm))),
+ fvti.RegClass:$rs2,
+ fvti.RegClass:$merge,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMERGE_VXM_"#fvti.LMul.MX)
+ fvti.RegClass:$merge, fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask V0),
+ GPR:$vl, fvti.Log2SEW)>;
+
+
+ def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0),
(SplatFPOp (fvti.Scalar fpimm0)),
fvti.RegClass:$rs2,
fvti.RegClass:$merge,
@@ -2581,16 +2551,6 @@ foreach fvti = AllFloatVectors in {
}
let Predicates = GetVTypePredicates<fvti>.Predicates in {
- def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask V0),
- (SplatFPOp fvti.ScalarRegClass:$rs1),
- fvti.RegClass:$rs2,
- VLOpFrag)),
- (!cast<Instruction>("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX)
- (fvti.Vector (IMPLICIT_DEF)),
- fvti.RegClass:$rs2,
- (fvti.Scalar fvti.ScalarRegClass:$rs1),
- (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>;
-
def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0),
(SplatFPOp fvti.ScalarRegClass:$rs1),
fvti.RegClass:$rs2,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td
new file mode 100644
index 000000000000..a09f5715b24f
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td
@@ -0,0 +1,44 @@
+//===-- RISCVInstrInfoZa.td - RISC-V Atomic instructions ---*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the RISC-V instructions from the standard atomic 'Za*'
+// extensions:
+// - Zawrs (v1.0) : Wait-on-Reservation-Set.
+// - Zacas (v1.0-rc1) : Atomic Compare-and-Swap.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Zacas (Atomic Compare-and-Swap)
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtZacas] in {
+defm AMOCAS_W : AMO_rr_aq_rl<0b00101, 0b010, "amocas.w">;
+defm AMOCAS_D : AMO_rr_aq_rl<0b00101, 0b011, "amocas.d">;
+} // Predicates = [HasStdExtZacas]
+
+let Predicates = [HasStdExtZacas, IsRV64] in {
+defm AMOCAS_Q : AMO_rr_aq_rl<0b00101, 0b100, "amocas.q">;
+} // Predicates = [HasStdExtZacas, IsRV64]
+
+//===----------------------------------------------------------------------===//
+// Zawrs (Wait-on-Reservation-Set)
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
+class WRSInst<bits<12> funct12, string opcodestr>
+ : RVInstI<0b000, OPC_SYSTEM, (outs), (ins), opcodestr, ""> {
+ let rs1 = 0;
+ let rd = 0;
+ let imm12 = funct12;
+}
+
+let Predicates = [HasStdExtZawrs] in {
+def WRS_NTO : WRSInst<0b000000001101, "wrs.nto">, Sched<[]>;
+def WRS_STO : WRSInst<0b000000011101, "wrs.sto">, Sched<[]>;
+} // Predicates = [HasStdExtZawrs]
diff --git a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
index 2c2b34bb5b77..c16eee67f3c5 100644
--- a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
+++ b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
@@ -126,7 +126,11 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI,
if (MI->getNumExplicitDefs() != 1)
return false;
- for (auto &UserOp : MRI.use_nodbg_operands(MI->getOperand(0).getReg())) {
+ Register DestReg = MI->getOperand(0).getReg();
+ if (!DestReg.isVirtual())
+ return false;
+
+ for (auto &UserOp : MRI.use_nodbg_operands(DestReg)) {
const MachineInstr *UserMI = UserOp.getParent();
unsigned OpIdx = UserOp.getOperandNo();
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index ba8996e710ed..52800f086129 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -232,7 +232,8 @@ def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", NoSchedModel,
FeatureStdExtZba,
FeatureStdExtZbb,
FeatureStdExtZbs,
- FeatureStdExtZfhmin]>;
+ FeatureStdExtZfhmin],
+ [TuneConditionalCompressedMoveFusion]>;
def SYNTACORE_SCR1_BASE : RISCVProcessorModel<"syntacore-scr1-base",
SyntacoreSCR1Model,
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index 840fd149d681..a59d058382fe 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -487,7 +487,7 @@ defvar VMaskVTs = [vbool1_t, vbool2_t, vbool4_t, vbool8_t, vbool16_t,
defvar VM1VTs = [vint8m1_t, vint16m1_t, vint32m1_t, vint64m1_t,
vbfloat16m1_t, vfloat16m1_t, vfloat32m1_t,
vfloat64m1_t, vint8mf2_t, vint8mf4_t, vint8mf8_t,
- vint16mf2_t, vint16mf4_t, vint32mf2_t,
+ vint16mf2_t, vint16mf4_t, vint32mf2_t,
vfloat16mf4_t, vfloat16mf2_t, vbfloat16mf4_t,
vbfloat16mf2_t, vfloat32mf2_t];
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 26320b05d9be..2ba93764facd 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -150,6 +150,13 @@ public:
bool hasHalfFPLoadStoreMove() const {
return HasStdExtZfhmin || HasStdExtZfbfmin;
}
+
+ bool hasConditionalMoveFusion() const {
+ // True if we can fuse a branch+mv or branch+c.mv into a conditional move.
+ return (hasConditionalCompressedMoveFusion() && hasStdExtCOrZca()) ||
+ hasShortForwardBranchOpt();
+ }
+
bool is64Bit() const { return IsRV64; }
MVT getXLenVT() const {
return is64Bit() ? MVT::i64 : MVT::i32;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 4614446b2150..b3916c987005 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -34,6 +34,65 @@ static cl::opt<unsigned> SLPMaxVF(
"exclusively by SLP vectorizer."),
cl::Hidden);
+InstructionCost
+RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
+ TTI::TargetCostKind CostKind) {
+ size_t NumInstr = OpCodes.size();
+ if (CostKind == TTI::TCK_CodeSize)
+ return NumInstr;
+ InstructionCost LMULCost = TLI->getLMULCost(VT);
+ if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
+ return LMULCost * NumInstr;
+ InstructionCost Cost = 0;
+ for (auto Op : OpCodes) {
+ switch (Op) {
+ case RISCV::VRGATHER_VI:
+ Cost += TLI->getVRGatherVICost(VT);
+ break;
+ case RISCV::VRGATHER_VV:
+ Cost += TLI->getVRGatherVVCost(VT);
+ break;
+ case RISCV::VSLIDEUP_VI:
+ case RISCV::VSLIDEDOWN_VI:
+ Cost += TLI->getVSlideVICost(VT);
+ break;
+ case RISCV::VSLIDEUP_VX:
+ case RISCV::VSLIDEDOWN_VX:
+ Cost += TLI->getVSlideVXCost(VT);
+ break;
+ case RISCV::VREDMAX_VS:
+ case RISCV::VREDMIN_VS:
+ case RISCV::VREDMAXU_VS:
+ case RISCV::VREDMINU_VS:
+ case RISCV::VREDSUM_VS:
+ case RISCV::VREDAND_VS:
+ case RISCV::VREDOR_VS:
+ case RISCV::VREDXOR_VS:
+ case RISCV::VFREDMAX_VS:
+ case RISCV::VFREDMIN_VS:
+ case RISCV::VFREDUSUM_VS: {
+ unsigned VL = VT.getVectorMinNumElements();
+ if (!VT.isFixedLengthVector())
+ VL *= *getVScaleForTuning();
+ Cost += Log2_32_Ceil(VL);
+ break;
+ }
+ case RISCV::VFREDOSUM_VS: {
+ unsigned VL = VT.getVectorMinNumElements();
+ if (!VT.isFixedLengthVector())
+ VL *= *getVScaleForTuning();
+ Cost += VL;
+ break;
+ }
+ case RISCV::VMV_S_X:
+ // FIXME: VMV_S_X doesn't use LMUL, the cost should be 1
+ default:
+ Cost += LMULCost;
+ }
+ }
+ return Cost;
+}
+
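The shuffle costing below feeds this helper with the opcode sequence each shuffle kind is expected to lower to. A hedged usage sketch; the vector type and cost kind are illustrative choices and the helper is assumed to be reachable from the caller:

```cpp
// Hedged usage sketch (assumes this file's existing includes): cost an
// extract-subvector modelled as a single vslidedown.vi, mirroring the
// SK_ExtractSubvector case further down.
static InstructionCost sketchSlideCost(RISCVTTIImpl &TTIImpl, MVT VT) {
  return TTIImpl.getRISCVInstructionCost(
      {RISCV::VSLIDEDOWN_VI}, VT, TargetTransformInfo::TCK_RecipThroughput);
}
```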
InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy() &&
@@ -281,7 +340,8 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
// Example sequence:
// vnsrl.wi v10, v8, 0
if (equal(DeinterleaveMask, Mask))
- return LT.first * TLI->getLMULCost(LT.second);
+ return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
+ LT.second, CostKind);
}
}
}
@@ -292,7 +352,8 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
LT.second.getVectorNumElements() <= 256)) {
VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
- return IndexCost + TLI->getVRGatherVVCost(LT.second);
+ return IndexCost +
+ getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
}
[[fallthrough]];
}
@@ -310,7 +371,10 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
- return 2 * IndexCost + 2 * TLI->getVRGatherVVCost(LT.second) + MaskCost;
+ return 2 * IndexCost +
+ getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
+ LT.second, CostKind) +
+ MaskCost;
}
[[fallthrough]];
}
@@ -365,19 +429,24 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
// Example sequence:
// vsetivli zero, 4, e8, mf2, tu, ma (ignored)
// vslidedown.vi v8, v9, 2
- return LT.first * TLI->getVSlideCost(LT.second);
+ return LT.first *
+ getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
case TTI::SK_InsertSubvector:
// Example sequence:
// vsetivli zero, 4, e8, mf2, tu, ma (ignored)
// vslideup.vi v8, v9, 2
- return LT.first * TLI->getVSlideCost(LT.second);
+ return LT.first *
+ getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
case TTI::SK_Select: {
// Example sequence:
// li a0, 90
// vsetivli zero, 8, e8, mf2, ta, ma (ignored)
// vmv.s.x v0, a0
// vmerge.vvm v8, v9, v8, v0
- return LT.first * 3 * TLI->getLMULCost(LT.second);
+ return LT.first *
+ (TLI->getLMULCost(LT.second) + // FIXME: should be 1 for li
+ getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
+ LT.second, CostKind));
}
case TTI::SK_Broadcast: {
bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
@@ -389,7 +458,10 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
// vsetivli zero, 2, e8, mf8, ta, ma (ignored)
// vmv.v.x v8, a0
// vmsne.vi v0, v8, 0
- return LT.first * TLI->getLMULCost(LT.second) * 3;
+ return LT.first *
+ (TLI->getLMULCost(LT.second) + // FIXME: should be 1 for andi
+ getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
+ LT.second, CostKind));
}
// Example sequence:
// vsetivli zero, 2, e8, mf8, ta, mu (ignored)
@@ -400,24 +472,38 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
// vmv.v.x v8, a0
// vmsne.vi v0, v8, 0
- return LT.first * TLI->getLMULCost(LT.second) * 6;
+ return LT.first *
+ (TLI->getLMULCost(LT.second) + // FIXME: this should be 1 for andi
+ TLI->getLMULCost(
+ LT.second) + // FIXME: vmv.x.s is the same as extractelement
+ getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
+ RISCV::VMV_V_X, RISCV::VMSNE_VI},
+ LT.second, CostKind));
}
if (HasScalar) {
// Example sequence:
// vmv.v.x v8, a0
- return LT.first * TLI->getLMULCost(LT.second);
+ return LT.first *
+ getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
}
// Example sequence:
// vrgather.vi v9, v8, 0
- return LT.first * TLI->getVRGatherVICost(LT.second);
+ return LT.first *
+ getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
}
- case TTI::SK_Splice:
+ case TTI::SK_Splice: {
// vslidedown+vslideup.
// TODO: Multiplying by LT.first implies this legalizes into multiple copies
// of similar code, but I think we expand through memory.
- return 2 * LT.first * TLI->getVSlideCost(LT.second);
+ unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
+ if (Index >= 0 && Index < 32)
+ Opcodes[0] = RISCV::VSLIDEDOWN_VI;
+ else if (Index < 0 && Index > -32)
+ Opcodes[1] = RISCV::VSLIDEUP_VI;
+ return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
+ }
case TTI::SK_Reverse: {
// TODO: Cases to improve here:
// * Illegal vector types
@@ -437,7 +523,9 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
if (LT.second.isFixedLengthVector())
// vrsub.vi has a 5 bit immediate field, otherwise an li suffices
LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
- InstructionCost GatherCost = 2 + TLI->getVRGatherVVCost(LT.second);
+ // FIXME: replace the constant `2` below with cost of {VID_V,VRSUB_VX}
+ InstructionCost GatherCost =
+ 2 + getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
// Mask operation additionally required extend and truncate
InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
return LT.first * (LenCost + GatherCost + ExtendCost);
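
The new getRISCVInstructionCost helper (added in the first hunk of this file) turns a list of RVV opcodes into a cost: by default each opcode contributes one LMUL-scaled unit, unordered reductions contribute roughly ceil(log2(VL)) because they reduce in a tree, and the ordered vfredosum contributes VL because it must visit elements sequentially. A simplified standalone sketch of that accumulation, with the opcode set collapsed into three illustrative categories (the enum and constants are placeholders, not LLVM types):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    enum class Op { Generic, TreeReduction, OrderedFPReduction };

    // LMULCost stands in for TLI->getLMULCost(VT); VL is the (vscale-scaled)
    // element count used for the reduction opcodes.
    static double instructionCost(const std::vector<Op> &Ops, double LMULCost,
                                  unsigned VL) {
      double Cost = 0;
      for (Op O : Ops) {
        switch (O) {
        case Op::TreeReduction:      // e.g. vredsum.vs: ~log2(VL) reduction steps
          Cost += std::ceil(std::log2(VL));
          break;
        case Op::OrderedFPReduction: // vfredosum.vs: strictly ordered, VL steps
          Cost += VL;
          break;
        case Op::Generic:            // everything else: one LMUL-scaled unit
          Cost += LMULCost;
          break;
        }
      }
      return Cost;
    }

    int main() {
      // A tree reduction over 16 elements plus one generic op at LMUL cost 2.
      std::printf("%.1f\n",
                  instructionCost({Op::TreeReduction, Op::Generic}, 2.0, 16)); // 6.0
    }
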
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 4c955744b37d..7e5dbddb5b51 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -48,6 +48,9 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
/// actual target hardware.
unsigned getEstimatedVLFor(VectorType *Ty);
+ InstructionCost getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
+ TTI::TargetCostKind CostKind);
+
/// Return the cost of accessing a constant pool entry of the specified
/// type.
InstructionCost getConstantPoolLoadCost(Type *Ty,
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index 629db8e2eb4d..0a8b5499a1fc 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -211,8 +211,8 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx,
MDString *MDKernelArgType =
getKernelArgAttribute(F, ArgIdx, "kernel_arg_type");
- if (!MDKernelArgType || (MDKernelArgType->getString().ends_with("*") &&
- MDKernelArgType->getString().ends_with("_t")))
+ if (!MDKernelArgType || (!MDKernelArgType->getString().ends_with("*") &&
+ !MDKernelArgType->getString().ends_with("_t")))
return GR->getOrCreateSPIRVType(OriginalArgType, MIRBuilder, ArgAccessQual);
if (MDKernelArgType->getString().ends_with("*"))
@@ -438,7 +438,8 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
assert(Arg.Regs.size() == 1 && "Call arg has multiple VRegs");
ArgVRegs.push_back(Arg.Regs[0]);
SPIRVType *SPIRVTy = GR->getOrCreateSPIRVType(Arg.Ty, MIRBuilder);
- GR->assignSPIRVTypeToVReg(SPIRVTy, Arg.Regs[0], MIRBuilder.getMF());
+ if (!GR->getSPIRVTypeForVReg(Arg.Regs[0]))
+ GR->assignSPIRVTypeToVReg(SPIRVTy, Arg.Regs[0], MIRBuilder.getMF());
}
if (auto Res = SPIRV::lowerBuiltin(
DemangledName, SPIRV::InstructionSet::OpenCL_std, MIRBuilder,
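
The getArgSPIRVType change above fixes an inverted condition: falling back to the original IR argument type is only correct when the kernel_arg_type string is neither a pointer ("...*") nor an OpenCL builtin ("..._t"), so both ends_with checks have to be negated. A standalone sketch of the corrected predicate, using std::string_view (C++20) as a stand-in for MDString:

    #include <cassert>
    #include <string_view>

    // True when the kernel_arg_type metadata adds no information, i.e. the string
    // is neither a pointer type ("...*") nor an OpenCL builtin ("..._t").
    static bool useOriginalArgType(std::string_view KernelArgType) {
      return !KernelArgType.ends_with("*") && !KernelArgType.ends_with("_t");
    }

    int main() {
      assert(useOriginalArgType("int"));        // plain scalar: keep the IR type
      assert(!useOriginalArgType("float*"));    // pointer: metadata is useful
      assert(!useOriginalArgType("image2d_t")); // builtin: metadata is useful
      return 0;
    }
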
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index 660c574daf38..fb4e9932dd2d 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -74,6 +74,7 @@ class SPIRVEmitIntrinsics
void processInstrAfterVisit(Instruction *I);
void insertAssignPtrTypeIntrs(Instruction *I);
void insertAssignTypeIntrs(Instruction *I);
+ void insertPtrCastInstr(Instruction *I);
void processGlobalValue(GlobalVariable &GV);
public:
@@ -255,7 +256,19 @@ Instruction *SPIRVEmitIntrinsics::visitGetElementPtrInst(GetElementPtrInst &I) {
}
Instruction *SPIRVEmitIntrinsics::visitBitCastInst(BitCastInst &I) {
- SmallVector<Type *, 2> Types = {I.getType(), I.getOperand(0)->getType()};
+ Value *Source = I.getOperand(0);
+
+ // SPIR-V, contrary to LLVM 17+ IR, supports bitcasts between pointers of
+ // varying element types. For IR coming from older versions of LLVM, such
+ // bitcasts do not provide sufficient information, so they are simply skipped
+ // here and handled in insertPtrCastInstr instead.
+ if (I.getType()->isPointerTy()) {
+ I.replaceAllUsesWith(Source);
+ I.eraseFromParent();
+ return nullptr;
+ }
+
+ SmallVector<Type *, 2> Types = {I.getType(), Source->getType()};
SmallVector<Value *> Args(I.op_begin(), I.op_end());
auto *NewI = IRB->CreateIntrinsic(Intrinsic::spv_bitcast, {Types}, {Args});
std::string InstName = I.hasName() ? I.getName().str() : "";
@@ -265,6 +278,111 @@ Instruction *SPIRVEmitIntrinsics::visitBitCastInst(BitCastInst &I) {
return NewI;
}
+void SPIRVEmitIntrinsics::insertPtrCastInstr(Instruction *I) {
+ Value *Pointer;
+ Type *ExpectedElementType;
+ unsigned OperandToReplace;
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ Pointer = SI->getPointerOperand();
+ ExpectedElementType = SI->getValueOperand()->getType();
+ OperandToReplace = 1;
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ Pointer = LI->getPointerOperand();
+ ExpectedElementType = LI->getType();
+ OperandToReplace = 0;
+ } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
+ Pointer = GEPI->getPointerOperand();
+ ExpectedElementType = GEPI->getSourceElementType();
+ OperandToReplace = 0;
+ } else {
+ return;
+ }
+
+ // If Pointer is the result of a no-op BitCastInst (ptr -> ptr), use the
+ // source pointer instead; the BitCastInst itself is removed later when visited.
+ while (BitCastInst *BC = dyn_cast<BitCastInst>(Pointer))
+ Pointer = BC->getOperand(0);
+
+ // Do not emit spv_ptrcast if Pointer is a GlobalValue of expected type.
+ GlobalValue *GV = dyn_cast<GlobalValue>(Pointer);
+ if (GV && GV->getValueType() == ExpectedElementType)
+ return;
+
+ // Do not emit spv_ptrcast if Pointer is a result of alloca with expected
+ // type.
+ AllocaInst *A = dyn_cast<AllocaInst>(Pointer);
+ if (A && A->getAllocatedType() == ExpectedElementType)
+ return;
+
+ if (dyn_cast<GetElementPtrInst>(Pointer))
+ return;
+
+ setInsertPointSkippingPhis(*IRB, I);
+ Constant *ExpectedElementTypeConst =
+ Constant::getNullValue(ExpectedElementType);
+ ConstantAsMetadata *CM =
+ ValueAsMetadata::getConstant(ExpectedElementTypeConst);
+ MDTuple *TyMD = MDNode::get(F->getContext(), CM);
+ MetadataAsValue *VMD = MetadataAsValue::get(F->getContext(), TyMD);
+ unsigned AddressSpace = Pointer->getType()->getPointerAddressSpace();
+ bool FirstPtrCastOrAssignPtrType = true;
+
+ // Do not emit new spv_ptrcast if equivalent one already exists or when
+ // spv_assign_ptr_type already targets this pointer with the same element
+ // type.
+ for (auto User : Pointer->users()) {
+ auto *II = dyn_cast<IntrinsicInst>(User);
+ if (!II ||
+ (II->getIntrinsicID() != Intrinsic::spv_assign_ptr_type &&
+ II->getIntrinsicID() != Intrinsic::spv_ptrcast) ||
+ II->getOperand(0) != Pointer)
+ continue;
+
+ // There is some spv_ptrcast/spv_assign_ptr_type already targeting this
+ // pointer.
+ FirstPtrCastOrAssignPtrType = false;
+ if (II->getOperand(1) != VMD ||
+ dyn_cast<ConstantInt>(II->getOperand(2))->getSExtValue() !=
+ AddressSpace)
+ continue;
+
+ // The spv_ptrcast/spv_assign_ptr_type targeting this pointer is of the same
+ // element type and address space.
+ if (II->getIntrinsicID() != Intrinsic::spv_ptrcast)
+ return;
+
+ // This must be an spv_ptrcast; do not emit a new one if this cast is in the
+ // same BB as I. Otherwise, keep searching for other spv_ptrcast/spv_assign_ptr_type.
+ if (II->getParent() != I->getParent())
+ continue;
+
+ I->setOperand(OperandToReplace, II);
+ return;
+ }
+
+ // Do not emit spv_ptrcast if it would cast to the default pointer element
+ // type (i8) of the same address space.
+ if (ExpectedElementType->isIntegerTy(8))
+ return;
+
+ // If this would be the first spv_ptrcast and there is no spv_assign_ptr_type
+ // for this pointer before, do not emit spv_ptrcast but emit
+ // spv_assign_ptr_type instead.
+ if (FirstPtrCastOrAssignPtrType && isa<Instruction>(Pointer)) {
+ buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {Pointer->getType()},
+ ExpectedElementTypeConst, Pointer,
+ {IRB->getInt32(AddressSpace)});
+ return;
+ } else {
+ SmallVector<Type *, 2> Types = {Pointer->getType(), Pointer->getType()};
+ SmallVector<Value *, 2> Args = {Pointer, VMD, IRB->getInt32(AddressSpace)};
+ auto *PtrCastI =
+ IRB->CreateIntrinsic(Intrinsic::spv_ptrcast, {Types}, Args);
+ I->setOperand(OperandToReplace, PtrCastI);
+ return;
+ }
+}
+
Instruction *SPIRVEmitIntrinsics::visitInsertElementInst(InsertElementInst &I) {
SmallVector<Type *, 4> Types = {I.getType(), I.getOperand(0)->getType(),
I.getOperand(1)->getType(),
@@ -522,13 +640,18 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) {
for (auto &I : Worklist) {
insertAssignPtrTypeIntrs(I);
insertAssignTypeIntrs(I);
+ insertPtrCastInstr(I);
}
for (auto *I : Worklist) {
TrackConstants = true;
if (!I->getType()->isVoidTy() || isa<StoreInst>(I))
IRB->SetInsertPoint(I->getNextNode());
+ // Visitors return either the original or a newly created instruction for
+ // further processing, or nullptr if no further processing is needed.
I = visit(*I);
+ if (!I)
+ continue;
processInstrAfterVisit(I);
}
return true;
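
Most of insertPtrCastInstr is deduplication: it scans the existing spv_ptrcast/spv_assign_ptr_type users of the pointer, suppresses a new cast when the element type is already pinned, reuses an identical cast from the same basic block, and only emits spv_assign_ptr_type when nothing targeted the pointer before. A heavily simplified standalone sketch of that decision (the structs and enums are invented stand-ins, not SPIR-V backend types, and details such as the i8 default-type skip are omitted):

    #include <cstdio>
    #include <vector>

    enum class Kind { AssignPtrType, PtrCast };

    struct ExistingUse {    // stand-in for an existing intrinsic user of Pointer
      Kind K;
      int ElemTypeId;       // element type the intrinsic records
      int AddrSpace;
      bool SameBlockAsUser; // only meaningful for PtrCast
    };

    enum class Action { Nothing, ReuseCast, EmitAssignPtrType, EmitPtrCast };

    static Action decide(const std::vector<ExistingUse> &Uses, int ElemTypeId,
                         int AS) {
      bool First = true;
      for (const ExistingUse &U : Uses) {
        First = false;                     // something already targets Pointer
        if (U.ElemTypeId != ElemTypeId || U.AddrSpace != AS)
          continue;                        // different type/AS: keep looking
        if (U.K == Kind::AssignPtrType)
          return Action::Nothing;          // type already pinned, no cast needed
        if (U.SameBlockAsUser)
          return Action::ReuseCast;        // identical cast in this block: reuse
      }
      return First ? Action::EmitAssignPtrType : Action::EmitPtrCast;
    }

    int main() {
      std::printf("%d\n",
                  (int)decide({{Kind::PtrCast, 7, 1, true}}, 7, 1)); // 1: ReuseCast
    }
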
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
index faaf7f0e2548..061bc9674237 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -289,8 +289,9 @@ static Register convertPtrToInt(Register Reg, LLT ConvTy, SPIRVType *SpirvType,
return ConvReg;
}
-bool SPIRVLegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
- MachineInstr &MI) const {
+bool SPIRVLegalizerInfo::legalizeCustom(
+ LegalizerHelper &Helper, MachineInstr &MI,
+ LostDebugLocObserver &LocObserver) const {
auto Opc = MI.getOpcode();
MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
if (!isTypeFoldingSupported(Opc)) {
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h
index 2541ff29edb0..f18b15b7f169 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h
@@ -29,7 +29,8 @@ class SPIRVLegalizerInfo : public LegalizerInfo {
SPIRVGlobalRegistry *GR;
public:
- bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override;
+ bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI,
+ LostDebugLocObserver &LocObserver) const override;
SPIRVLegalizerInfo(const SPIRVSubtarget &ST);
};
} // namespace llvm
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index 1bfce70fedc0..cbc16fa98661 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -125,12 +125,32 @@ static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR,
SmallVector<MachineInstr *, 10> ToErase;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
- if (!isSpvIntrinsic(MI, Intrinsic::spv_bitcast))
+ if (!isSpvIntrinsic(MI, Intrinsic::spv_bitcast) &&
+ !isSpvIntrinsic(MI, Intrinsic::spv_ptrcast))
continue;
assert(MI.getOperand(2).isReg());
MIB.setInsertPt(*MI.getParent(), MI);
- MIB.buildBitcast(MI.getOperand(0).getReg(), MI.getOperand(2).getReg());
ToErase.push_back(&MI);
+ if (isSpvIntrinsic(MI, Intrinsic::spv_bitcast)) {
+ MIB.buildBitcast(MI.getOperand(0).getReg(), MI.getOperand(2).getReg());
+ continue;
+ }
+ Register Def = MI.getOperand(0).getReg();
+ Register Source = MI.getOperand(2).getReg();
+ SPIRVType *BaseTy = GR->getOrCreateSPIRVType(
+ getMDOperandAsType(MI.getOperand(3).getMetadata(), 0), MIB);
+ SPIRVType *AssignedPtrType = GR->getOrCreateSPIRVPointerType(
+ BaseTy, MI, *MF.getSubtarget<SPIRVSubtarget>().getInstrInfo(),
+ addressSpaceToStorageClass(MI.getOperand(4).getImm()));
+
+ // If the bitcast would be redundant, replace all uses with the source
+ // register.
+ if (GR->getSPIRVTypeForVReg(Source) == AssignedPtrType) {
+ MIB.getMRI()->replaceRegWith(Def, Source);
+ } else {
+ GR->assignSPIRVTypeToVReg(AssignedPtrType, Def, MF);
+ MIB.buildBitcast(Def, Source);
+ }
}
}
for (MachineInstr *MI : ToErase)
@@ -587,6 +607,40 @@ static void processSwitches(MachineFunction &MF, SPIRVGlobalRegistry *GR,
}
}
+static bool isImplicitFallthrough(MachineBasicBlock &MBB) {
+ if (MBB.empty())
+ return true;
+
+ // Branching SPIR-V intrinsics are not detected by this generic method.
+ // Thus, we can only trust a negative result.
+ if (!MBB.canFallThrough())
+ return false;
+
+ // Otherwise, we must manually check if we have a SPIR-V intrinsic which
+ // prevents an implicit fallthrough.
+ for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend();
+ It != E; ++It) {
+ if (isSpvIntrinsic(*It, Intrinsic::spv_switch))
+ return false;
+ }
+ return true;
+}
+
+static void removeImplicitFallthroughs(MachineFunction &MF,
+ MachineIRBuilder MIB) {
+ // It is valid for MachineBasicBlocks to not finish with a branch instruction.
+ // In such cases, they simply fall through to their immediate successor.
+ for (MachineBasicBlock &MBB : MF) {
+ if (!isImplicitFallthrough(MBB))
+ continue;
+
+ assert(std::distance(MBB.successors().begin(), MBB.successors().end()) ==
+ 1);
+ MIB.setInsertPt(MBB, MBB.end());
+ MIB.buildBr(**MBB.successors().begin());
+ }
+}
+
bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) {
// Initialize the type registry.
const SPIRVSubtarget &ST = MF.getSubtarget<SPIRVSubtarget>();
@@ -599,6 +653,7 @@ bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) {
generateAssignInstrs(MF, GR, MIB);
processSwitches(MF, GR, MIB);
processInstrsWithTypeFolding(MF, GR, MIB);
+ removeImplicitFallthroughs(MF, MIB);
return true;
}
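
removeImplicitFallthroughs terminates every block that currently relies on implicit fallthrough with an explicit unconditional branch to its single successor. A toy standalone version of the same idea over a minimal block representation (MiniBlock and its fields are invented for illustration, not MachineBasicBlock):

    #include <cstdio>
    #include <vector>

    struct MiniBlock {
      int Id;
      bool EndsWithBranch;  // already explicitly terminated
      int SingleSuccessor;  // -1 if none
    };

    // Append an explicit branch to every block that currently relies on implicit
    // fallthrough, mirroring the MIB.buildBr(...) call in the pass above.
    static void removeImplicitFallthroughs(std::vector<MiniBlock> &Blocks) {
      for (MiniBlock &B : Blocks) {
        if (B.EndsWithBranch || B.SingleSuccessor < 0)
          continue;
        std::printf("block %d: add explicit br to block %d\n", B.Id,
                    B.SingleSuccessor);
        B.EndsWithBranch = true;
      }
    }

    int main() {
      std::vector<MiniBlock> Blocks = {{0, false, 1}, {1, true, 2}, {2, false, -1}};
      removeImplicitFallthroughs(Blocks); // only block 0 gets a new branch
    }
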
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
index 1503f263e42c..62d9090d289f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
@@ -29,6 +29,7 @@
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Utils.h"
#include <optional>
using namespace llvm;
@@ -151,6 +152,19 @@ TargetPassConfig *SPIRVTargetMachine::createPassConfig(PassManagerBase &PM) {
}
void SPIRVPassConfig::addIRPasses() {
+ if (TM.getSubtargetImpl()->isVulkanEnv()) {
+ // Once legalized, we need to structurize the CFG to follow the spec.
+ // This is done through the following 8 steps.
+ // TODO(#75801): add the remaining steps.
+
+ // 1. Simplify loops for subsequent transformations. After this step, loops
+ // have the following properties:
+ // - loops have a single entry edge (pre-header to loop header).
+ // - all loop exits are dominated by the loop pre-header.
+ // - loops have a single back-edge.
+ addPass(createLoopSimplifyPass());
+ }
+
TargetPassConfig::addIRPasses();
addPass(createSPIRVRegularizerPass());
addPass(createSPIRVPrepareFunctionsPass(TM));
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
index 1c0e8d84e2fd..d4f7d8e89af5 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
@@ -228,8 +228,8 @@ uint64_t getIConstVal(Register ConstReg, const MachineRegisterInfo *MRI) {
return MI->getOperand(1).getCImm()->getValue().getZExtValue();
}
-bool isSpvIntrinsic(MachineInstr &MI, Intrinsic::ID IntrinsicID) {
- if (auto *GI = dyn_cast<GIntrinsic>(&MI))
+bool isSpvIntrinsic(const MachineInstr &MI, Intrinsic::ID IntrinsicID) {
+ if (const auto *GI = dyn_cast<GIntrinsic>(&MI))
return GI->is(IntrinsicID);
return false;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h
index 30fae6c7de47..60742e2f2728 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.h
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h
@@ -79,7 +79,7 @@ MachineInstr *getDefInstrMaybeConstant(Register &ConstReg,
uint64_t getIConstVal(Register ConstReg, const MachineRegisterInfo *MRI);
// Check if MI is a SPIR-V specific intrinsic call.
-bool isSpvIntrinsic(MachineInstr &MI, Intrinsic::ID IntrinsicID);
+bool isSpvIntrinsic(const MachineInstr &MI, Intrinsic::ID IntrinsicID);
// Get type of i-th operand of the metadata node.
Type *getMDOperandAsType(const MDNode *N, unsigned I);
diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index c7d8591c5bdf..320f91c76057 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -1641,7 +1641,7 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {
// If this is a 64-bit constant that is out of the range of LLILF,
// LLIHF and LGFI, split it into two 32-bit pieces.
if (Node->getValueType(0) == MVT::i64) {
- uint64_t Val = cast<ConstantSDNode>(Node)->getZExtValue();
+ uint64_t Val = Node->getAsZExtVal();
if (!SystemZ::isImmLF(Val) && !SystemZ::isImmHF(Val) && !isInt<32>(Val)) {
splitLargeImmediate(ISD::OR, Node, SDValue(), Val - uint32_t(Val),
uint32_t(Val));
@@ -1677,10 +1677,8 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {
isInt<16>(cast<ConstantSDNode>(Op0)->getSExtValue())))) {
SDValue CCValid = Node->getOperand(2);
SDValue CCMask = Node->getOperand(3);
- uint64_t ConstCCValid =
- cast<ConstantSDNode>(CCValid.getNode())->getZExtValue();
- uint64_t ConstCCMask =
- cast<ConstantSDNode>(CCMask.getNode())->getZExtValue();
+ uint64_t ConstCCValid = CCValid.getNode()->getAsZExtVal();
+ uint64_t ConstCCMask = CCMask.getNode()->getAsZExtVal();
// Invert the condition.
CCMask = CurDAG->getTargetConstant(ConstCCValid ^ ConstCCMask,
SDLoc(Node), CCMask.getValueType());
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 045c4c0aac07..2450c6801a66 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -2662,10 +2662,8 @@ static void adjustForFNeg(Comparison &C) {
// with (sext (trunc X)) into a comparison with (shl X, 32).
static void adjustForLTGFR(Comparison &C) {
// Check for a comparison between (shl X, 32) and 0.
- if (C.Op0.getOpcode() == ISD::SHL &&
- C.Op0.getValueType() == MVT::i64 &&
- C.Op1.getOpcode() == ISD::Constant &&
- cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
+ if (C.Op0.getOpcode() == ISD::SHL && C.Op0.getValueType() == MVT::i64 &&
+ C.Op1.getOpcode() == ISD::Constant && C.Op1->getAsZExtVal() == 0) {
auto *C1 = dyn_cast<ConstantSDNode>(C.Op0.getOperand(1));
if (C1 && C1->getZExtValue() == 32) {
SDValue ShlOp0 = C.Op0.getOperand(0);
@@ -2690,7 +2688,7 @@ static void adjustICmpTruncate(SelectionDAG &DAG, const SDLoc &DL,
C.Op0.getOperand(0).getOpcode() == ISD::LOAD &&
C.Op1.getOpcode() == ISD::Constant &&
cast<ConstantSDNode>(C.Op1)->getValueSizeInBits(0) <= 64 &&
- cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
+ C.Op1->getAsZExtVal() == 0) {
auto *L = cast<LoadSDNode>(C.Op0.getOperand(0));
if (L->getMemoryVT().getStoreSizeInBits().getFixedValue() <=
C.Op0.getValueSizeInBits().getFixedValue()) {
@@ -3035,12 +3033,12 @@ static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
CmpOp0.getResNo() == 0 && CmpOp0->hasNUsesOfValue(1, 0) &&
isIntrinsicWithCCAndChain(CmpOp0, Opcode, CCValid))
return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid,
- cast<ConstantSDNode>(CmpOp1)->getZExtValue(), Cond);
+ CmpOp1->getAsZExtVal(), Cond);
if (CmpOp0.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
CmpOp0.getResNo() == CmpOp0->getNumValues() - 1 &&
isIntrinsicWithCC(CmpOp0, Opcode, CCValid))
return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid,
- cast<ConstantSDNode>(CmpOp1)->getZExtValue(), Cond);
+ CmpOp1->getAsZExtVal(), Cond);
}
Comparison C(CmpOp0, CmpOp1, Chain);
C.CCMask = CCMaskForCondCode(Cond);
@@ -3457,12 +3455,11 @@ SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
// Check for absolute and negative-absolute selections, including those
// where the comparison value is sign-extended (for LPGFR and LNGFR).
// This check supplements the one in DAGCombiner.
- if (C.Opcode == SystemZISD::ICMP &&
- C.CCMask != SystemZ::CCMASK_CMP_EQ &&
+ if (C.Opcode == SystemZISD::ICMP && C.CCMask != SystemZ::CCMASK_CMP_EQ &&
C.CCMask != SystemZ::CCMASK_CMP_NE &&
C.Op1.getOpcode() == ISD::Constant &&
cast<ConstantSDNode>(C.Op1)->getValueSizeInBits(0) <= 64 &&
- cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
+ C.Op1->getAsZExtVal() == 0) {
if (isAbsolute(C.Op0, TrueOp, FalseOp))
return getAbsolute(DAG, DL, TrueOp, C.CCMask & SystemZ::CCMASK_CMP_LT);
if (isAbsolute(C.Op0, FalseOp, TrueOp))
@@ -3947,8 +3944,7 @@ SystemZTargetLowering::lowerDYNAMIC_STACKALLOC_XPLINK(SDValue Op,
// If user has set the no alignment function attribute, ignore
// alloca alignments.
- uint64_t AlignVal =
- (RealignOpt ? cast<ConstantSDNode>(Align)->getZExtValue() : 0);
+ uint64_t AlignVal = (RealignOpt ? Align->getAsZExtVal() : 0);
uint64_t StackAlign = TFI->getStackAlignment();
uint64_t RequiredAlign = std::max(AlignVal, StackAlign);
@@ -4013,8 +4009,7 @@ SystemZTargetLowering::lowerDYNAMIC_STACKALLOC_ELF(SDValue Op,
// If user has set the no alignment function attribute, ignore
// alloca alignments.
- uint64_t AlignVal =
- (RealignOpt ? cast<ConstantSDNode>(Align)->getZExtValue() : 0);
+ uint64_t AlignVal = (RealignOpt ? Align->getAsZExtVal() : 0);
uint64_t StackAlign = TFI->getStackAlignment();
uint64_t RequiredAlign = std::max(AlignVal, StackAlign);
@@ -4213,7 +4208,7 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
// If the low part is a constant that is outside the range of LHI,
// then we're better off using IILF.
if (LowOp.getOpcode() == ISD::Constant) {
- int64_t Value = int32_t(cast<ConstantSDNode>(LowOp)->getZExtValue());
+ int64_t Value = int32_t(LowOp->getAsZExtVal());
if (!isInt<16>(Value))
return Op;
}
@@ -5897,7 +5892,7 @@ SDValue SystemZTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
Op1.getOpcode() != ISD::BITCAST &&
Op1.getOpcode() != ISD::ConstantFP &&
Op2.getOpcode() == ISD::Constant) {
- uint64_t Index = cast<ConstantSDNode>(Op2)->getZExtValue();
+ uint64_t Index = Op2->getAsZExtVal();
unsigned Mask = VT.getVectorNumElements() - 1;
if (Index <= Mask)
return Op;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 37abbb072cdd..15dc44a04395 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -278,7 +278,7 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) {
unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
TmpOffset += SL->getElementOffset(Idx);
} else {
- uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+ uint64_t S = GTI.getSequentialElementStride(DL);
for (;;) {
if (const auto *CI = dyn_cast<ConstantInt>(Op)) {
// Constant-offset addressing.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 4bcf89690505..7c47790d1e35 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -1869,8 +1869,7 @@ SDValue WebAssemblyTargetLowering::LowerIntrinsic(SDValue Op,
Ops[OpIdx++] = Op.getOperand(2);
while (OpIdx < 18) {
const SDValue &MaskIdx = Op.getOperand(OpIdx + 1);
- if (MaskIdx.isUndef() ||
- cast<ConstantSDNode>(MaskIdx.getNode())->getZExtValue() >= 32) {
+ if (MaskIdx.isUndef() || MaskIdx.getNode()->getAsZExtVal() >= 32) {
bool isTarget = MaskIdx.getNode()->getOpcode() == ISD::TargetConstant;
Ops[OpIdx++] = DAG.getConstant(0, DL, MVT::i32, isTarget);
} else {
@@ -1912,7 +1911,7 @@ WebAssemblyTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
const SDNode *Index = Extract.getOperand(1).getNode();
if (!isa<ConstantSDNode>(Index))
return SDValue();
- unsigned IndexVal = cast<ConstantSDNode>(Index)->getZExtValue();
+ unsigned IndexVal = Index->getAsZExtVal();
unsigned Scale =
ExtractedVecT.getVectorNumElements() / VecT.getVectorNumElements();
assert(Scale > 1);
@@ -2335,7 +2334,7 @@ WebAssemblyTargetLowering::LowerAccessVectorElement(SDValue Op,
SDNode *IdxNode = Op.getOperand(Op.getNumOperands() - 1).getNode();
if (isa<ConstantSDNode>(IdxNode)) {
// Ensure the index type is i32 to match the tablegen patterns
- uint64_t Idx = cast<ConstantSDNode>(IdxNode)->getZExtValue();
+ uint64_t Idx = IdxNode->getAsZExtVal();
SmallVector<SDValue, 3> Ops(Op.getNode()->ops());
Ops[Op.getNumOperands() - 1] =
DAG.getConstant(Idx, SDLoc(IdxNode), MVT::i32);
diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h
index 1f69feceae27..12134f7b00f1 100644
--- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h
+++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h
@@ -21,7 +21,6 @@ namespace llvm {
class X86Subtarget;
class X86TargetMachine;
-/// This class provides the information for the target register banks.
class X86LegalizerInfo : public LegalizerInfo {
private:
/// Keep a reference to the X86Subtarget around so that we can
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index e006dd877360..304b998e1f26 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -148,25 +148,21 @@ classifyFirstOpcodeInMacroFusion(unsigned Opcode) {
case X86::AND16ri8:
case X86::AND16rm:
case X86::AND16rr:
- case X86::AND16rr_REV:
case X86::AND32i32:
case X86::AND32ri:
case X86::AND32ri8:
case X86::AND32rm:
case X86::AND32rr:
- case X86::AND32rr_REV:
case X86::AND64i32:
case X86::AND64ri32:
case X86::AND64ri8:
case X86::AND64rm:
case X86::AND64rr:
- case X86::AND64rr_REV:
case X86::AND8i8:
case X86::AND8ri:
case X86::AND8ri8:
case X86::AND8rm:
case X86::AND8rr:
- case X86::AND8rr_REV:
return FirstMacroFusionInstKind::And;
// CMP
case X86::CMP16i16:
@@ -175,28 +171,24 @@ classifyFirstOpcodeInMacroFusion(unsigned Opcode) {
case X86::CMP16ri8:
case X86::CMP16rm:
case X86::CMP16rr:
- case X86::CMP16rr_REV:
case X86::CMP32i32:
case X86::CMP32mr:
case X86::CMP32ri:
case X86::CMP32ri8:
case X86::CMP32rm:
case X86::CMP32rr:
- case X86::CMP32rr_REV:
case X86::CMP64i32:
case X86::CMP64mr:
case X86::CMP64ri32:
case X86::CMP64ri8:
case X86::CMP64rm:
case X86::CMP64rr:
- case X86::CMP64rr_REV:
case X86::CMP8i8:
case X86::CMP8mr:
case X86::CMP8ri:
case X86::CMP8ri8:
case X86::CMP8rm:
case X86::CMP8rr:
- case X86::CMP8rr_REV:
return FirstMacroFusionInstKind::Cmp;
// ADD
case X86::ADD16i16:
@@ -204,50 +196,42 @@ classifyFirstOpcodeInMacroFusion(unsigned Opcode) {
case X86::ADD16ri8:
case X86::ADD16rm:
case X86::ADD16rr:
- case X86::ADD16rr_REV:
case X86::ADD32i32:
case X86::ADD32ri:
case X86::ADD32ri8:
case X86::ADD32rm:
case X86::ADD32rr:
- case X86::ADD32rr_REV:
case X86::ADD64i32:
case X86::ADD64ri32:
case X86::ADD64ri8:
case X86::ADD64rm:
case X86::ADD64rr:
- case X86::ADD64rr_REV:
case X86::ADD8i8:
case X86::ADD8ri:
case X86::ADD8ri8:
case X86::ADD8rm:
case X86::ADD8rr:
- case X86::ADD8rr_REV:
// SUB
case X86::SUB16i16:
case X86::SUB16ri:
case X86::SUB16ri8:
case X86::SUB16rm:
case X86::SUB16rr:
- case X86::SUB16rr_REV:
case X86::SUB32i32:
case X86::SUB32ri:
case X86::SUB32ri8:
case X86::SUB32rm:
case X86::SUB32rr:
- case X86::SUB32rr_REV:
case X86::SUB64i32:
case X86::SUB64ri32:
case X86::SUB64ri8:
case X86::SUB64rm:
case X86::SUB64rr:
- case X86::SUB64rr_REV:
case X86::SUB8i8:
case X86::SUB8ri:
case X86::SUB8ri8:
case X86::SUB8rm:
case X86::SUB8rr:
- case X86::SUB8rr_REV:
return FirstMacroFusionInstKind::AddSub;
// INC
case X86::INC16r:
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 924956295e7c..f7c361393fea 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -1650,6 +1650,9 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI,
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
++SrcRegNum;
+ if (IsND) // Skip new data destination
+ ++CurOp;
+
emitRegModRMByte(MI.getOperand(SrcRegNum),
getX86RegNum(MI.getOperand(CurOp)), CB);
CurOp = SrcRegNum + 1;
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 485afbc1dfbc..21623a805f55 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -131,9 +131,9 @@ FunctionPass *createX86FixupBWInsts();
/// to another, when profitable.
FunctionPass *createX86DomainReassignmentPass();
-/// This pass replaces EVEX encoded of AVX-512 instructiosn by VEX
-/// encoding when possible in order to reduce code size.
-FunctionPass *createX86EvexToVexInsts();
+/// This pass compresses instructions from EVEX space to legacy/VEX/EVEX space
+/// when possible in order to reduce code size or facilitate HW decoding.
+FunctionPass *createX86CompressEVEXPass();
/// This pass creates the thunks for the retpoline feature.
FunctionPass *createX86IndirectThunksPass();
@@ -167,7 +167,7 @@ FunctionPass *createX86SpeculativeLoadHardeningPass();
FunctionPass *createX86SpeculativeExecutionSideEffectSuppression();
FunctionPass *createX86ArgumentStackSlotPass();
-void initializeEvexToVexInstPassPass(PassRegistry &);
+void initializeCompressEVEXPassPass(PassRegistry &);
void initializeFPSPass(PassRegistry &);
void initializeFixupBWInstPassPass(PassRegistry &);
void initializeFixupLEAPassPass(PassRegistry &);
diff --git a/llvm/lib/Target/X86/X86EvexToVex.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index c425c37b4186..b95baddd9dea 100644
--- a/llvm/lib/Target/X86/X86EvexToVex.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -1,5 +1,4 @@
-//===- X86EvexToVex.cpp ---------------------------------------------------===//
-// Compress EVEX instructions to VEX encoding when possible to reduce code size
+//===- X86CompressEVEX.cpp ------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -7,17 +6,30 @@
//
//===----------------------------------------------------------------------===//
//
-/// \file
-/// This file defines the pass that goes over all AVX-512 instructions which
-/// are encoded using the EVEX prefix and if possible replaces them by their
-/// corresponding VEX encoding which is usually shorter by 2 bytes.
-/// EVEX instructions may be encoded via the VEX prefix when the AVX-512
-/// instruction has a corresponding AVX/AVX2 opcode, when vector length
-/// accessed by instruction is less than 512 bits and when it does not use
-// the xmm or the mask registers or xmm/ymm registers with indexes higher
-// than 15.
-/// The pass applies code reduction on the generated code for AVX-512 instrs.
+// This pass compresses instructions from EVEX space to legacy/VEX/EVEX space
+// when possible in order to reduce code size or facilitate HW decoding.
//
+// Possible compression:
+// a. AVX512 instruction (EVEX) -> AVX instruction (VEX)
+// b. Promoted instruction (EVEX) -> pre-promotion instruction (legacy/VEX)
+// c. NDD (EVEX) -> non-NDD (legacy)
+// d. NF_ND (EVEX) -> NF (EVEX)
+//
+// Compression a, b and c can always reduce code size, with some exceptions
+// such as promoted 16-bit CRC32 which is as long as the legacy version.
+//
+// legacy:
+// crc32w %si, %eax ## encoding: [0x66,0xf2,0x0f,0x38,0xf1,0xc6]
+// promoted:
+// crc32w %si, %eax ## encoding: [0x62,0xf4,0x7d,0x08,0xf1,0xc6]
+//
+// From a performance perspective, these should be the same (same uops and same
+// EXE ports). From an FMV perspective, an older legacy encoding is preferred
+// b/c it can execute in more places (broader HW install base). So we will
+// still do the compression.
+//
+// Compression d can help hardware decode (HW may skip reading the NDD
+// register) although the instruction length remains unchanged.
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/X86BaseInfo.h"
@@ -38,37 +50,34 @@
using namespace llvm;
-// Including the generated EVEX2VEX tables.
-struct X86EvexToVexCompressTableEntry {
- uint16_t EvexOpc;
- uint16_t VexOpc;
+// Including the generated EVEX compression tables.
+struct X86CompressEVEXTableEntry {
+ uint16_t OldOpc;
+ uint16_t NewOpc;
- bool operator<(const X86EvexToVexCompressTableEntry &RHS) const {
- return EvexOpc < RHS.EvexOpc;
+ bool operator<(const X86CompressEVEXTableEntry &RHS) const {
+ return OldOpc < RHS.OldOpc;
}
- friend bool operator<(const X86EvexToVexCompressTableEntry &TE,
- unsigned Opc) {
- return TE.EvexOpc < Opc;
+ friend bool operator<(const X86CompressEVEXTableEntry &TE, unsigned Opc) {
+ return TE.OldOpc < Opc;
}
};
-#include "X86GenEVEX2VEXTables.inc"
+#include "X86GenCompressEVEXTables.inc"
-#define EVEX2VEX_DESC "Compressing EVEX instrs to VEX encoding when possible"
-#define EVEX2VEX_NAME "x86-evex-to-vex-compress"
+#define COMP_EVEX_DESC "Compressing EVEX instrs when possible"
+#define COMP_EVEX_NAME "x86-compress-evex"
-#define DEBUG_TYPE EVEX2VEX_NAME
+#define DEBUG_TYPE COMP_EVEX_NAME
namespace {
-class EvexToVexInstPass : public MachineFunctionPass {
+class CompressEVEXPass : public MachineFunctionPass {
public:
static char ID;
- EvexToVexInstPass() : MachineFunctionPass(ID) {}
- StringRef getPassName() const override { return EVEX2VEX_DESC; }
+ CompressEVEXPass() : MachineFunctionPass(ID) {}
+ StringRef getPassName() const override { return COMP_EVEX_DESC; }
- /// Loop over all of the basic blocks, replacing EVEX instructions
- /// by equivalent VEX instructions when possible for reducing code size.
bool runOnMachineFunction(MachineFunction &MF) override;
// This pass runs after regalloc and doesn't support VReg operands.
@@ -80,7 +89,7 @@ public:
} // end anonymous namespace
-char EvexToVexInstPass::ID = 0;
+char CompressEVEXPass::ID = 0;
static bool usesExtendedRegister(const MachineInstr &MI) {
auto isHiRegIdx = [](unsigned Reg) {
@@ -112,8 +121,8 @@ static bool usesExtendedRegister(const MachineInstr &MI) {
return false;
}
-static bool checkVEXInstPredicate(unsigned EvexOpc, const X86Subtarget &ST) {
- switch (EvexOpc) {
+static bool checkVEXInstPredicate(unsigned OldOpc, const X86Subtarget &ST) {
+ switch (OldOpc) {
default:
return true;
case X86::VCVTNEPS2BF16Z128rm:
@@ -151,15 +160,15 @@ static bool checkVEXInstPredicate(unsigned EvexOpc, const X86Subtarget &ST) {
}
// Do any custom cleanup needed to finalize the conversion.
-static bool performCustomAdjustments(MachineInstr &MI, unsigned VexOpc) {
- (void)VexOpc;
+static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
+ (void)NewOpc;
unsigned Opc = MI.getOpcode();
switch (Opc) {
case X86::VALIGNDZ128rri:
case X86::VALIGNDZ128rmi:
case X86::VALIGNQZ128rri:
case X86::VALIGNQZ128rmi: {
- assert((VexOpc == X86::VPALIGNRrri || VexOpc == X86::VPALIGNRrmi) &&
+ assert((NewOpc == X86::VPALIGNRrri || NewOpc == X86::VPALIGNRrmi) &&
"Unexpected new opcode!");
unsigned Scale =
(Opc == X86::VALIGNQZ128rri || Opc == X86::VALIGNQZ128rmi) ? 8 : 4;
@@ -175,8 +184,8 @@ static bool performCustomAdjustments(MachineInstr &MI, unsigned VexOpc) {
case X86::VSHUFI32X4Z256rri:
case X86::VSHUFI64X2Z256rmi:
case X86::VSHUFI64X2Z256rri: {
- assert((VexOpc == X86::VPERM2F128rr || VexOpc == X86::VPERM2I128rr ||
- VexOpc == X86::VPERM2F128rm || VexOpc == X86::VPERM2I128rm) &&
+ assert((NewOpc == X86::VPERM2F128rr || NewOpc == X86::VPERM2I128rr ||
+ NewOpc == X86::VPERM2F128rm || NewOpc == X86::VPERM2I128rm) &&
"Unexpected new opcode!");
MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);
int64_t ImmVal = Imm.getImm();
@@ -200,7 +209,7 @@ static bool performCustomAdjustments(MachineInstr &MI, unsigned VexOpc) {
case X86::VRNDSCALESDZm_Int:
case X86::VRNDSCALESSZr_Int:
case X86::VRNDSCALESSZm_Int:
- const MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands()-1);
+ const MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);
int64_t ImmVal = Imm.getImm();
// Ensure that only bits 3:0 of the immediate are used.
if ((ImmVal & 0xf) != ImmVal)
@@ -211,86 +220,77 @@ static bool performCustomAdjustments(MachineInstr &MI, unsigned VexOpc) {
return true;
}
-// For EVEX instructions that can be encoded using VEX encoding
-// replace them by the VEX encoding in order to reduce size.
-static bool CompressEvexToVexImpl(MachineInstr &MI, const X86Subtarget &ST) {
- // VEX format.
- // # of bytes: 0,2,3 1 1 0,1 0,1,2,4 0,1
- // [Prefixes] [VEX] OPCODE ModR/M [SIB] [DISP] [IMM]
- //
- // EVEX format.
- // # of bytes: 4 1 1 1 4 / 1 1
- // [Prefixes] EVEX Opcode ModR/M [SIB] [Disp32] / [Disp8*N] [Immediate]
- const MCInstrDesc &Desc = MI.getDesc();
+static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) {
+ uint64_t TSFlags = MI.getDesc().TSFlags;
// Check for EVEX instructions only.
- if ((Desc.TSFlags & X86II::EncodingMask) != X86II::EVEX)
+ if ((TSFlags & X86II::EncodingMask) != X86II::EVEX)
return false;
- // Check for EVEX instructions with mask or broadcast as in these cases
- // the EVEX prefix is needed in order to carry this information
- // thus preventing the transformation to VEX encoding.
- if (Desc.TSFlags & (X86II::EVEX_K | X86II::EVEX_B))
+ // Instructions with mask or 512-bit vector can't be converted to VEX.
+ if (TSFlags & (X86II::EVEX_K | X86II::EVEX_L2))
return false;
- // Check for EVEX instructions with L2 set. These instructions are 512-bits
- // and can't be converted to VEX.
- if (Desc.TSFlags & X86II::EVEX_L2)
+ // EVEX_B has several meanings.
+ // AVX512:
+ // register form: rounding control or SAE
+ // memory form: broadcast
+ //
+ // APX:
+ // MAP4: NDD
+ //
+// For the AVX512 cases, the EVEX prefix is needed to carry this information,
+// thus preventing the transformation to VEX encoding.
+ if (TSFlags & X86II::EVEX_B)
return false;
- // Use the VEX.L bit to select the 128 or 256-bit table.
- ArrayRef<X86EvexToVexCompressTableEntry> Table =
- (Desc.TSFlags & X86II::VEX_L) ? ArrayRef(X86EvexToVex256CompressTable)
- : ArrayRef(X86EvexToVex128CompressTable);
+ ArrayRef<X86CompressEVEXTableEntry> Table = ArrayRef(X86CompressEVEXTable);
- unsigned EvexOpc = MI.getOpcode();
- const auto *I = llvm::lower_bound(Table, EvexOpc);
- if (I == Table.end() || I->EvexOpc != EvexOpc)
+ unsigned Opc = MI.getOpcode();
+ const auto *I = llvm::lower_bound(Table, Opc);
+ if (I == Table.end() || I->OldOpc != Opc)
return false;
- if (usesExtendedRegister(MI))
- return false;
- if (!checkVEXInstPredicate(EvexOpc, ST))
- return false;
- if (!performCustomAdjustments(MI, I->VexOpc))
+ if (usesExtendedRegister(MI) || !checkVEXInstPredicate(Opc, ST) ||
+ !performCustomAdjustments(MI, I->NewOpc))
return false;
- MI.setDesc(ST.getInstrInfo()->get(I->VexOpc));
- MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
+ const MCInstrDesc &NewDesc = ST.getInstrInfo()->get(I->NewOpc);
+ MI.setDesc(NewDesc);
+ uint64_t Encoding = NewDesc.TSFlags & X86II::EncodingMask;
+ auto AsmComment =
+ (Encoding == X86II::VEX) ? X86::AC_EVEX_2_VEX : X86::AC_EVEX_2_LEGACY;
+ MI.setAsmPrinterFlag(AsmComment);
return true;
}
-bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) {
+bool CompressEVEXPass::runOnMachineFunction(MachineFunction &MF) {
#ifndef NDEBUG
// Make sure the tables are sorted.
static std::atomic<bool> TableChecked(false);
if (!TableChecked.load(std::memory_order_relaxed)) {
- assert(llvm::is_sorted(X86EvexToVex128CompressTable) &&
- "X86EvexToVex128CompressTable is not sorted!");
- assert(llvm::is_sorted(X86EvexToVex256CompressTable) &&
- "X86EvexToVex256CompressTable is not sorted!");
+ assert(llvm::is_sorted(X86CompressEVEXTable) &&
+ "X86CompressEVEXTable is not sorted!");
TableChecked.store(true, std::memory_order_relaxed);
}
#endif
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
- if (!ST.hasAVX512())
+ if (!ST.hasAVX512() && !ST.hasEGPR() && !ST.hasNDD())
return false;
bool Changed = false;
- /// Go over all basic blocks in function and replace
- /// EVEX encoded instrs by VEX encoding when possible.
for (MachineBasicBlock &MBB : MF) {
// Traverse the basic block.
for (MachineInstr &MI : MBB)
- Changed |= CompressEvexToVexImpl(MI, ST);
+ Changed |= CompressEVEXImpl(MI, ST);
}
return Changed;
}
-INITIALIZE_PASS(EvexToVexInstPass, EVEX2VEX_NAME, EVEX2VEX_DESC, false, false)
+INITIALIZE_PASS(CompressEVEXPass, COMP_EVEX_NAME, COMP_EVEX_DESC, false, false)
-FunctionPass *llvm::createX86EvexToVexInsts() {
- return new EvexToVexInstPass();
+FunctionPass *llvm::createX86CompressEVEXPass() {
+ return new CompressEVEXPass();
}
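
The rewritten pass keeps one table sorted by the old EVEX-space opcode and locates the replacement with a binary search; the member operator< keeps the table sortable, and the friend overload lets lower_bound compare an entry against a bare opcode. A standalone sketch of that lookup using std::lower_bound (the numeric opcode values are made up for illustration):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <iterator>

    struct CompressTableEntry {
      uint16_t OldOpc; // EVEX-space opcode
      uint16_t NewOpc; // legacy/VEX/EVEX replacement
      bool operator<(const CompressTableEntry &RHS) const {
        return OldOpc < RHS.OldOpc;
      }
      friend bool operator<(const CompressTableEntry &TE, unsigned Opc) {
        return TE.OldOpc < Opc;
      }
    };

    // Sorted by OldOpc, mirroring the generated X86GenCompressEVEXTables.inc.
    // The numbers are placeholders, not real X86 opcode values.
    static const CompressTableEntry Table[] = {{100, 10}, {205, 20}, {350, 30}};

    static int lookup(unsigned Opc) {
      const auto *I = std::lower_bound(std::begin(Table), std::end(Table), Opc);
      if (I == std::end(Table) || I->OldOpc != Opc)
        return -1; // no compressed form for this opcode
      return I->NewOpc;
    }

    int main() {
      std::printf("%d %d\n", lookup(205), lookup(206)); // 20 -1
    }
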
diff --git a/llvm/lib/Target/X86/X86DomainReassignment.cpp b/llvm/lib/Target/X86/X86DomainReassignment.cpp
index bdd86e48fa54..20dbaf797e32 100644
--- a/llvm/lib/Target/X86/X86DomainReassignment.cpp
+++ b/llvm/lib/Target/X86/X86DomainReassignment.cpp
@@ -619,40 +619,30 @@ void X86DomainReassignment::initConverters() {
std::make_unique<InstrReplacerDstCOPY>(From, To);
};
- bool HasEGPR = STI->hasEGPR();
- createReplacerDstCOPY(X86::MOVZX32rm16,
- HasEGPR ? X86::KMOVWkm_EVEX : X86::KMOVWkm);
- createReplacerDstCOPY(X86::MOVZX64rm16,
- HasEGPR ? X86::KMOVWkm_EVEX : X86::KMOVWkm);
+#define GET_EGPR_IF_ENABLED(OPC) STI->hasEGPR() ? OPC##_EVEX : OPC
+ createReplacerDstCOPY(X86::MOVZX32rm16, GET_EGPR_IF_ENABLED(X86::KMOVWkm));
+ createReplacerDstCOPY(X86::MOVZX64rm16, GET_EGPR_IF_ENABLED(X86::KMOVWkm));
- createReplacerDstCOPY(X86::MOVZX32rr16,
- HasEGPR ? X86::KMOVWkk_EVEX : X86::KMOVWkk);
- createReplacerDstCOPY(X86::MOVZX64rr16,
- HasEGPR ? X86::KMOVWkk_EVEX : X86::KMOVWkk);
+ createReplacerDstCOPY(X86::MOVZX32rr16, GET_EGPR_IF_ENABLED(X86::KMOVWkk));
+ createReplacerDstCOPY(X86::MOVZX64rr16, GET_EGPR_IF_ENABLED(X86::KMOVWkk));
if (STI->hasDQI()) {
- createReplacerDstCOPY(X86::MOVZX16rm8,
- HasEGPR ? X86::KMOVBkm_EVEX : X86::KMOVBkm);
- createReplacerDstCOPY(X86::MOVZX32rm8,
- HasEGPR ? X86::KMOVBkm_EVEX : X86::KMOVBkm);
- createReplacerDstCOPY(X86::MOVZX64rm8,
- HasEGPR ? X86::KMOVBkm_EVEX : X86::KMOVBkm);
-
- createReplacerDstCOPY(X86::MOVZX16rr8,
- HasEGPR ? X86::KMOVBkk_EVEX : X86::KMOVBkk);
- createReplacerDstCOPY(X86::MOVZX32rr8,
- HasEGPR ? X86::KMOVBkk_EVEX : X86::KMOVBkk);
- createReplacerDstCOPY(X86::MOVZX64rr8,
- HasEGPR ? X86::KMOVBkk_EVEX : X86::KMOVBkk);
+ createReplacerDstCOPY(X86::MOVZX16rm8, GET_EGPR_IF_ENABLED(X86::KMOVBkm));
+ createReplacerDstCOPY(X86::MOVZX32rm8, GET_EGPR_IF_ENABLED(X86::KMOVBkm));
+ createReplacerDstCOPY(X86::MOVZX64rm8, GET_EGPR_IF_ENABLED(X86::KMOVBkm));
+
+ createReplacerDstCOPY(X86::MOVZX16rr8, GET_EGPR_IF_ENABLED(X86::KMOVBkk));
+ createReplacerDstCOPY(X86::MOVZX32rr8, GET_EGPR_IF_ENABLED(X86::KMOVBkk));
+ createReplacerDstCOPY(X86::MOVZX64rr8, GET_EGPR_IF_ENABLED(X86::KMOVBkk));
}
auto createReplacer = [&](unsigned From, unsigned To) {
Converters[{MaskDomain, From}] = std::make_unique<InstrReplacer>(From, To);
};
- createReplacer(X86::MOV16rm, HasEGPR ? X86::KMOVWkm_EVEX : X86::KMOVWkm);
- createReplacer(X86::MOV16mr, HasEGPR ? X86::KMOVWmk_EVEX : X86::KMOVWmk);
- createReplacer(X86::MOV16rr, HasEGPR ? X86::KMOVWkk_EVEX : X86::KMOVWkk);
+ createReplacer(X86::MOV16rm, GET_EGPR_IF_ENABLED(X86::KMOVWkm));
+ createReplacer(X86::MOV16mr, GET_EGPR_IF_ENABLED(X86::KMOVWmk));
+ createReplacer(X86::MOV16rr, GET_EGPR_IF_ENABLED(X86::KMOVWkk));
createReplacer(X86::SHR16ri, X86::KSHIFTRWri);
createReplacer(X86::SHL16ri, X86::KSHIFTLWri);
createReplacer(X86::NOT16r, X86::KNOTWrr);
@@ -661,14 +651,14 @@ void X86DomainReassignment::initConverters() {
createReplacer(X86::XOR16rr, X86::KXORWrr);
if (STI->hasBWI()) {
- createReplacer(X86::MOV32rm, HasEGPR ? X86::KMOVDkm_EVEX : X86::KMOVDkm);
- createReplacer(X86::MOV64rm, HasEGPR ? X86::KMOVQkm_EVEX : X86::KMOVQkm);
+ createReplacer(X86::MOV32rm, GET_EGPR_IF_ENABLED(X86::KMOVDkm));
+ createReplacer(X86::MOV64rm, GET_EGPR_IF_ENABLED(X86::KMOVQkm));
- createReplacer(X86::MOV32mr, HasEGPR ? X86::KMOVDmk_EVEX : X86::KMOVDmk);
- createReplacer(X86::MOV64mr, HasEGPR ? X86::KMOVQmk_EVEX : X86::KMOVQmk);
+ createReplacer(X86::MOV32mr, GET_EGPR_IF_ENABLED(X86::KMOVDmk));
+ createReplacer(X86::MOV64mr, GET_EGPR_IF_ENABLED(X86::KMOVQmk));
- createReplacer(X86::MOV32rr, HasEGPR ? X86::KMOVDkk_EVEX : X86::KMOVDkk);
- createReplacer(X86::MOV64rr, HasEGPR ? X86::KMOVQkk_EVEX : X86::KMOVQkk);
+ createReplacer(X86::MOV32rr, GET_EGPR_IF_ENABLED(X86::KMOVDkk));
+ createReplacer(X86::MOV64rr, GET_EGPR_IF_ENABLED(X86::KMOVQkk));
createReplacer(X86::SHR32ri, X86::KSHIFTRDri);
createReplacer(X86::SHR64ri, X86::KSHIFTRQri);
@@ -696,8 +686,8 @@ void X86DomainReassignment::initConverters() {
// TODO: KTEST is not a replacement for TEST due to flag differences. Need
// to prove only Z flag is used.
- //createReplacer(X86::TEST32rr, X86::KTESTDrr);
- //createReplacer(X86::TEST64rr, X86::KTESTQrr);
+ // createReplacer(X86::TEST32rr, X86::KTESTDrr);
+ // createReplacer(X86::TEST64rr, X86::KTESTQrr);
}
if (STI->hasDQI()) {
@@ -706,9 +696,9 @@ void X86DomainReassignment::initConverters() {
createReplacer(X86::AND8rr, X86::KANDBrr);
- createReplacer(X86::MOV8rm, HasEGPR ? X86::KMOVBkm_EVEX : X86::KMOVBkm);
- createReplacer(X86::MOV8mr, HasEGPR ? X86::KMOVBmk_EVEX : X86::KMOVBmk);
- createReplacer(X86::MOV8rr, HasEGPR ? X86::KMOVBkk_EVEX : X86::KMOVBkk);
+ createReplacer(X86::MOV8rm, GET_EGPR_IF_ENABLED(X86::KMOVBkm));
+ createReplacer(X86::MOV8mr, GET_EGPR_IF_ENABLED(X86::KMOVBmk));
+ createReplacer(X86::MOV8rr, GET_EGPR_IF_ENABLED(X86::KMOVBkk));
createReplacer(X86::NOT8r, X86::KNOTBrr);
@@ -719,11 +709,12 @@ void X86DomainReassignment::initConverters() {
// TODO: KTEST is not a replacement for TEST due to flag differences. Need
// to prove only Z flag is used.
- //createReplacer(X86::TEST8rr, X86::KTESTBrr);
- //createReplacer(X86::TEST16rr, X86::KTESTWrr);
+ // createReplacer(X86::TEST8rr, X86::KTESTBrr);
+ // createReplacer(X86::TEST16rr, X86::KTESTWrr);
createReplacer(X86::XOR8rr, X86::KXORBrr);
}
+#undef GET_EGPR_IF_ENABLED
}
bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
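
GET_EGPR_IF_ENABLED works by token pasting: when extended GPRs are available it appends the _EVEX suffix to the opcode name, otherwise it keeps the legacy name, which collapses every `HasEGPR ? X86::FOO_EVEX : X86::FOO` ternary above into a single expression. A minimal standalone demonstration of the same pasting trick (the enum values and the explicit subtarget parameter are illustrative; the real macro reads the subtarget from the enclosing scope):

    #include <cstdio>

    // Stand-ins for the real X86 opcode enumerators.
    enum Opcode { KMOVWkk = 1, KMOVWkk_EVEX = 2 };

    struct Subtarget {
      bool EGPR;
      bool hasEGPR() const { return EGPR; }
    };

    // Same shape as the macro in the hunk above: paste _EVEX onto the opcode name
    // when extended GPRs are available, otherwise keep the legacy opcode.
    #define GET_EGPR_IF_ENABLED(STI, OPC) ((STI).hasEGPR() ? OPC##_EVEX : OPC)

    int main() {
      Subtarget WithEGPR{true}, WithoutEGPR{false};
      std::printf("%d %d\n", GET_EGPR_IF_ENABLED(WithEGPR, KMOVWkk),
                  GET_EGPR_IF_ENABLED(WithoutEGPR, KMOVWkk)); // 2 1
    }

    #undef GET_EGPR_IF_ENABLED
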
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index 0ba31e173a1a..1ce1e6f6a563 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -916,7 +916,7 @@ redo_gep:
// A array/variable index is always of the form i*S where S is the
// constant scale size. See if we can push the scale into immediates.
- uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+ uint64_t S = GTI.getSequentialElementStride(DL);
for (;;) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
// Constant-offset addressing.
@@ -3046,22 +3046,24 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
switch (II->getIntrinsicID()) {
default:
llvm_unreachable("Unexpected intrinsic.");
+#define GET_EGPR_IF_ENABLED(OPC) Subtarget->hasEGPR() ? OPC##_EVEX : OPC
case Intrinsic::x86_sse42_crc32_32_8:
- Opc = X86::CRC32r32r8;
+ Opc = GET_EGPR_IF_ENABLED(X86::CRC32r32r8);
RC = &X86::GR32RegClass;
break;
case Intrinsic::x86_sse42_crc32_32_16:
- Opc = X86::CRC32r32r16;
+ Opc = GET_EGPR_IF_ENABLED(X86::CRC32r32r16);
RC = &X86::GR32RegClass;
break;
case Intrinsic::x86_sse42_crc32_32_32:
- Opc = X86::CRC32r32r32;
+ Opc = GET_EGPR_IF_ENABLED(X86::CRC32r32r32);
RC = &X86::GR32RegClass;
break;
case Intrinsic::x86_sse42_crc32_64_64:
- Opc = X86::CRC32r64r64;
+ Opc = GET_EGPR_IF_ENABLED(X86::CRC32r64r64);
RC = &X86::GR64RegClass;
break;
+#undef GET_EGPR_IF_ENABLED
}
const Value *LHS = II->getArgOperand(0);
diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
index b13bf361ab79..aad839b83ee1 100644
--- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -173,7 +173,6 @@ static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) {
#define LLVM_EXPAND_ADC_SBB_INSTR(MNEMONIC) \
LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr) \
- LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr_REV) \
LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rm) \
LLVM_EXPAND_INSTR_SIZES(MNEMONIC, mr) \
case X86::MNEMONIC##8ri: \
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 73b10cf3067e..53ce720be2da 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2852,7 +2852,7 @@ bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
SDValue &Index, SDValue &Disp,
SDValue &Segment) {
X86ISelAddressMode AM;
- AM.Scale = cast<ConstantSDNode>(ScaleOp)->getZExtValue();
+ AM.Scale = ScaleOp->getAsZExtVal();
// Attempt to match index patterns, as long as we're not relying on implicit
// sign-extension, which is performed BEFORE scale.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1e4b1361f98a..5a28240ea9e2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7371,7 +7371,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
/// index.
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
SDValue ExtIdx) {
- int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
+ int Idx = ExtIdx->getAsZExtVal();
if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
return Idx;
@@ -7475,10 +7475,12 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
- MVT IVT = VT.changeVectorElementTypeToInteger();
+ MVT IVT =
+ VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
SmallVector<SDValue, 16> NewOps;
for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
- NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I)));
+ NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
+ Op.getOperand(I)));
SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
return DAG.getBitcast(VT, Res);
}
@@ -8793,7 +8795,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
- unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
+ unsigned InsertC = InsIndex->getAsZExtVal();
unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
if (InsertC < NumEltsInLow128Bits)
return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
@@ -14369,6 +14371,13 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
+ if (VT == MVT::v8bf16) {
+ V1 = DAG.getBitcast(MVT::v8i16, V1);
+ V2 = DAG.getBitcast(MVT::v8i16, V2);
+ return DAG.getBitcast(VT,
+ DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
+ }
+
switch (VT.SimpleTy) {
case MVT::v2i64:
return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
@@ -17096,14 +17105,14 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
}
- if (VT == MVT::v32f16) {
+ if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
if (!Subtarget.hasBWI())
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
/*SimpleOnly*/ false);
V1 = DAG.getBitcast(MVT::v32i16, V1);
V2 = DAG.getBitcast(MVT::v32i16, V2);
- return DAG.getBitcast(MVT::v32f16,
+ return DAG.getBitcast(VT,
DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
}
@@ -17747,7 +17756,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec), Idx));
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ unsigned IdxVal = Idx->getAsZExtVal();
SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
DAG.getTargetConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
@@ -21515,9 +21524,8 @@ SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
RTLIB::Libcall LC =
RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16);
SDValue Res =
- makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
- return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16,
- DAG.getBitcast(MVT::i32, Res));
+ makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
+ return DAG.getBitcast(MVT::i16, Res);
}
/// Depending on uarch and/or optimizing for size, we might prefer to use a
@@ -24061,7 +24069,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// a >= b ? -1 : 0 -> RES = setcc_carry
// a >= b ? 0 : -1 -> RES = ~setcc_carry
if (Cond.getOpcode() == X86ISD::SUB) {
- unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
+ unsigned CondCode = CC->getAsZExtVal();
if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
(isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
@@ -25359,8 +25367,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
Src3.getValueType() != MVT::i8) {
- Src3 = DAG.getTargetConstant(
- cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
+ Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
}
// We specify 2 possible opcodes for intrinsics with rounding modes.
@@ -25385,8 +25392,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
SDValue Src4 = Op.getOperand(4);
if (Src4.getValueType() != MVT::i8) {
- Src4 = DAG.getTargetConstant(
- cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
+ Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
@@ -26788,7 +26794,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
{Chain, Op1, Op2, Size}, VT, MMO);
Chain = Res.getValue(1);
Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
- unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
+ unsigned Imm = Op2->getAsZExtVal();
if (Imm)
Res = DAG.getNode(ISD::SHL, DL, VT, Res,
DAG.getShiftAmountConstant(Imm, VT, DL));
@@ -40221,6 +40227,34 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
}
return SDValue();
}
+ case X86ISD::SHUF128: {
+ // If we're permuting the upper 256-bit subvectors of a concatenation, then
+ // see if we can peek through and access the subvector directly.
+ if (VT.is512BitVector()) {
+ // 512-bit mask uses 4 x i2 indices - if the msb is always set then only the
+ // upper subvector is used.
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ uint64_t Mask = N->getConstantOperandVal(2);
+ SmallVector<SDValue> LHSOps, RHSOps;
+ SDValue NewLHS, NewRHS;
+ if ((Mask & 0x0A) == 0x0A &&
+ collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
+ NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
+ Mask &= ~0x0A;
+ }
+ if ((Mask & 0xA0) == 0xA0 &&
+ collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
+ NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
+ Mask &= ~0xA0;
+ }
+ if (NewLHS || NewRHS)
+ return DAG.getNode(X86ISD::SHUF128, DL, VT, NewLHS ? NewLHS : LHS,
+ NewRHS ? NewRHS : RHS,
+ DAG.getTargetConstant(Mask, DL, MVT::i8));
+ }
+ return SDValue();
+ }
case X86ISD::VPERM2X128: {
// Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
SDValue LHS = N->getOperand(0);
@@ -41320,6 +41354,20 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return TLO.CombineTo(Op, Src);
break;
}
+ case X86ISD::VZEXT_LOAD: {
+ // If the upper elements are not demanded then simplify to a
+ // scalar_to_vector(load()).
+ MVT SVT = VT.getSimpleVT().getVectorElementType();
+ if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
+ SDLoc DL(Op);
+ auto *Mem = cast<MemSDNode>(Op);
+ SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
+ Mem->getMemOperand());
+ SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
+ return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
+ }
+ break;
+ }
case X86ISD::VBROADCAST: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
@@ -41795,7 +41843,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
- unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
+ unsigned ShAmt = Op1->getAsZExtVal();
if (ShAmt >= BitWidth)
break;
@@ -42580,7 +42628,7 @@ static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
APInt Imm(SrcVT.getVectorNumElements(), 0);
for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
SDValue In = Op.getOperand(Idx);
- if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
+ if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
Imm.setBit(Idx);
}
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
@@ -49931,18 +49979,17 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
SDValue Ptr = Ld->getBasePtr();
SDValue Chain = Ld->getChain();
for (SDNode *User : Chain->uses()) {
- if (User != N &&
+ auto *UserLd = dyn_cast<MemSDNode>(User);
+ if (User != N && UserLd &&
(User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
ISD::isNormalLoad(User)) &&
- cast<MemSDNode>(User)->getChain() == Chain &&
- !User->hasAnyUseOfValue(1) &&
+ UserLd->getChain() == Chain && !User->hasAnyUseOfValue(1) &&
User->getValueSizeInBits(0).getFixedValue() >
RegVT.getFixedSizeInBits()) {
if (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
- cast<MemSDNode>(User)->getBasePtr() == Ptr &&
- cast<MemSDNode>(User)->getMemoryVT().getSizeInBits() ==
- MemVT.getSizeInBits()) {
+ UserLd->getBasePtr() == Ptr &&
+ UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits()) {
SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
RegVT.getSizeInBits());
Extract = DAG.getBitcast(RegVT, Extract);
@@ -49961,7 +50008,7 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
// See if we are loading a constant that matches in the lower
// bits of a longer constant (but from a different constant pool ptr).
EVT UserVT = User->getValueType(0);
- SDValue UserPtr = cast<MemSDNode>(User)->getBasePtr();
+ SDValue UserPtr = UserLd->getBasePtr();
const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
if (LdC && UserC && UserPtr != Ptr) {
@@ -53258,7 +53305,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
if (Index.getOpcode() == ISD::ADD &&
Index.getValueType().getVectorElementType() == PtrVT &&
isa<ConstantSDNode>(Scale)) {
- uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
+ uint64_t ScaleAmt = Scale->getAsZExtVal();
if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
BitVector UndefElts;
if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
@@ -54572,6 +54619,14 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
return Op0.getOperand(0);
}
+
+ // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
+ if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
+ !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+ Op0.getOperand(0), Op0.getOperand(0)),
+ Op0.getOperand(1));
}
// concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
@@ -54979,6 +55034,19 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
}
break;
+ case X86ISD::BLENDI:
+ if (NumOps == 2 && VT.is512BitVector() && Subtarget.useBWIRegs()) {
+ uint64_t Mask0 = Ops[0].getConstantOperandVal(2);
+ uint64_t Mask1 = Ops[1].getConstantOperandVal(2);
+ uint64_t Mask = (Mask1 << (VT.getVectorNumElements() / 2)) | Mask0;
+ MVT MaskSVT = MVT::getIntegerVT(VT.getVectorNumElements());
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ SDValue Sel =
+ DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
+ return DAG.getSelect(DL, VT, Sel, ConcatSubOperand(VT, Ops, 1),
+ ConcatSubOperand(VT, Ops, 0));
+ }
+ break;
case ISD::VSELECT:
if (!IsSplat && Subtarget.hasAVX512() &&
(VT.is256BitVector() ||
@@ -57602,7 +57670,7 @@ X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
}
Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
- if (ML->isInnermost() &&
+ if (ML && ML->isInnermost() &&
ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
return TargetLowering::getPrefLoopAlignment();
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 9bd1622cb0d3..32745400a38b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1714,16 +1714,6 @@ namespace llvm {
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
- bool splitValueIntoRegisterParts(
- SelectionDAG & DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
- unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC)
- const override;
-
- SDValue joinRegisterPartsIntoValue(
- SelectionDAG & DAG, const SDLoc &DL, const SDValue *Parts,
- unsigned NumParts, MVT PartVT, EVT ValueVT,
- std::optional<CallingConv::ID> CC) const override;
-
bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index b8b5421b9005..d75bd4171fde 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -127,6 +127,9 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
return getRegisterTypeForCallingConv(Context, CC,
VT.changeVectorElementType(MVT::f16));
+ if (VT == MVT::bf16)
+ return MVT::f16;
+
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
@@ -421,40 +424,6 @@ unsigned X86TargetLowering::getJumpTableEncoding() const {
return TargetLowering::getJumpTableEncoding();
}
-bool X86TargetLowering::splitValueIntoRegisterParts(
- SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
- unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
- bool IsABIRegCopy = CC.has_value();
- EVT ValueVT = Val.getValueType();
- if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
- unsigned ValueBits = ValueVT.getSizeInBits();
- unsigned PartBits = PartVT.getSizeInBits();
- Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
- Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
- Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
- Parts[0] = Val;
- return true;
- }
- return false;
-}
-
-SDValue X86TargetLowering::joinRegisterPartsIntoValue(
- SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
- MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
- bool IsABIRegCopy = CC.has_value();
- if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
- unsigned ValueBits = ValueVT.getSizeInBits();
- unsigned PartBits = PartVT.getSizeInBits();
- SDValue Val = Parts[0];
-
- Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
- Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
- Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
- return Val;
- }
- return SDValue();
-}
-
bool X86TargetLowering::useSoftFloat() const {
return Subtarget.useSoftFloat();
}
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index c3a673f97d34..fe7d90fbcdf7 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -448,7 +448,7 @@ multiclass vinsert_for_type<ValueType EltVT32, int Opcode128,
X86VectorVTInfo< 2, EltVT64, VR128X>,
X86VectorVTInfo< 4, EltVT64, VR256X>,
null_frag, vinsert128_insert, sched>,
- VEX_W1X, EVEX_V256;
+ EVEX_V256, REX_W;
// Even with DQI we'd like to only use these instructions for masking.
let Predicates = [HasDQI] in {
@@ -750,7 +750,7 @@ multiclass vextract_for_type<ValueType EltVT32, int Opcode128,
X86VectorVTInfo< 4, EltVT64, VR256X>,
X86VectorVTInfo< 2, EltVT64, VR128X>,
null_frag, vextract128_extract, SchedRR, SchedMR>,
- VEX_W1X, EVEX_V256, EVEX_CD8<64, CD8VT2>;
+ EVEX_V256, EVEX_CD8<64, CD8VT2>, REX_W;
// Even with DQI we'd like to only use these instructions for masking.
let Predicates = [HasDQI] in {
@@ -1161,7 +1161,7 @@ multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr,
defm VBROADCASTSS : avx512_fp_broadcast_ss<0x18, "vbroadcastss",
avx512vl_f32_info>;
defm VBROADCASTSD : avx512_fp_broadcast_sd<0x19, "vbroadcastsd",
- avx512vl_f64_info>, VEX_W1X;
+ avx512vl_f64_info>, REX_W;
multiclass avx512_int_broadcast_reg<bits<8> opc, SchedWrite SchedRR,
X86VectorVTInfo _, SDPatternOperator OpNode,
@@ -1267,7 +1267,7 @@ defm VPBROADCASTW : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw",
defm VPBROADCASTD : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd",
avx512vl_i32_info, HasAVX512, 1>;
defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq",
- avx512vl_i64_info, HasAVX512, 1>, VEX_W1X;
+ avx512vl_i64_info, HasAVX512, 1>, REX_W;
multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode,
@@ -1460,11 +1460,11 @@ let Predicates = [HasBF16, HasVLX] in
let Predicates = [HasVLX, HasDQI] in {
defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2",
- X86SubVBroadcastld128, v4i64x_info, v2i64x_info>, VEX_W1X,
- EVEX_V256, EVEX_CD8<64, CD8VT2>;
+ X86SubVBroadcastld128, v4i64x_info, v2i64x_info>,
+ EVEX_V256, EVEX_CD8<64, CD8VT2>, REX_W;
defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2",
- X86SubVBroadcastld128, v4f64x_info, v2f64x_info>, VEX_W1X,
- EVEX_V256, EVEX_CD8<64, CD8VT2>;
+ X86SubVBroadcastld128, v4f64x_info, v2f64x_info>,
+ EVEX_V256, EVEX_CD8<64, CD8VT2>, REX_W;
// Patterns for selects of bitcasted operations.
def : Pat<(vselect_mask VK4WM:$mask,
@@ -3185,15 +3185,13 @@ defm : operation_subvector_mask_lowering<VK32, v32i1, VK64, v64i1>;
multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
X86VectorVTInfo _, PatFrag ld_frag, PatFrag mload,
- X86SchedWriteMoveLS Sched, string EVEX2VEXOvrd,
- bit NoRMPattern = 0,
+ X86SchedWriteMoveLS Sched, bit NoRMPattern = 0,
SDPatternOperator SelectOprr = vselect> {
let hasSideEffects = 0 in {
let isMoveReg = 1 in
def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [],
- _.ExeDomain>, EVEX, Sched<[Sched.RR]>,
- EVEX2VEXOverride<EVEX2VEXOvrd#"rr">;
+ _.ExeDomain>, EVEX, Sched<[Sched.RR]>;
def rrkz : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
@@ -3209,8 +3207,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
!if(NoRMPattern, [],
[(set _.RC:$dst,
(_.VT (ld_frag addr:$src)))]),
- _.ExeDomain>, EVEX, Sched<[Sched.RM]>,
- EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
+ _.ExeDomain>, EVEX, Sched<[Sched.RM]>;
let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = 1 in {
def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
@@ -3253,53 +3250,48 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _, Predicate prd,
X86SchedWriteMoveLSWidths Sched,
- string EVEX2VEXOvrd, bit NoRMPattern = 0> {
+ bit NoRMPattern = 0> {
let Predicates = [prd] in
defm Z : avx512_load<opc, OpcodeStr, NAME, _.info512,
_.info512.AlignedLdFrag, masked_load_aligned,
- Sched.ZMM, "", NoRMPattern>, EVEX_V512;
+ Sched.ZMM, NoRMPattern>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_load<opc, OpcodeStr, NAME, _.info256,
_.info256.AlignedLdFrag, masked_load_aligned,
- Sched.YMM, EVEX2VEXOvrd#"Y", NoRMPattern>, EVEX_V256;
+ Sched.YMM, NoRMPattern>, EVEX_V256;
defm Z128 : avx512_load<opc, OpcodeStr, NAME, _.info128,
_.info128.AlignedLdFrag, masked_load_aligned,
- Sched.XMM, EVEX2VEXOvrd, NoRMPattern>, EVEX_V128;
+ Sched.XMM, NoRMPattern>, EVEX_V128;
}
}
multiclass avx512_load_vl<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _, Predicate prd,
X86SchedWriteMoveLSWidths Sched,
- string EVEX2VEXOvrd, bit NoRMPattern = 0,
+ bit NoRMPattern = 0,
SDPatternOperator SelectOprr = vselect> {
let Predicates = [prd] in
defm Z : avx512_load<opc, OpcodeStr, NAME, _.info512, _.info512.LdFrag,
- masked_load, Sched.ZMM, "",
- NoRMPattern, SelectOprr>, EVEX_V512;
+ masked_load, Sched.ZMM, NoRMPattern, SelectOprr>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_load<opc, OpcodeStr, NAME, _.info256, _.info256.LdFrag,
- masked_load, Sched.YMM, EVEX2VEXOvrd#"Y",
- NoRMPattern, SelectOprr>, EVEX_V256;
+ masked_load, Sched.YMM, NoRMPattern, SelectOprr>, EVEX_V256;
defm Z128 : avx512_load<opc, OpcodeStr, NAME, _.info128, _.info128.LdFrag,
- masked_load, Sched.XMM, EVEX2VEXOvrd,
- NoRMPattern, SelectOprr>, EVEX_V128;
+ masked_load, Sched.XMM, NoRMPattern, SelectOprr>, EVEX_V128;
}
}
multiclass avx512_store<bits<8> opc, string OpcodeStr, string BaseName,
X86VectorVTInfo _, PatFrag st_frag, PatFrag mstore,
- X86SchedWriteMoveLS Sched, string EVEX2VEXOvrd,
- bit NoMRPattern = 0> {
+ X86SchedWriteMoveLS Sched, bit NoMRPattern = 0> {
let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
let isMoveReg = 1 in
def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src),
OpcodeStr # "\t{$src, $dst|$dst, $src}",
[], _.ExeDomain>, EVEX,
- Sched<[Sched.RR]>,
- EVEX2VEXOverride<EVEX2VEXOvrd#"rr_REV">;
+ Sched<[Sched.RR]>;
def rrk_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src),
OpcodeStr # "\t{$src, ${dst} {${mask}}|"#
@@ -3319,8 +3311,7 @@ multiclass avx512_store<bits<8> opc, string OpcodeStr, string BaseName,
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
!if(NoMRPattern, [],
[(st_frag (_.VT _.RC:$src), addr:$dst)]),
- _.ExeDomain>, EVEX, Sched<[Sched.MR]>,
- EVEX2VEXOverride<EVEX2VEXOvrd#"mr">;
+ _.ExeDomain>, EVEX, Sched<[Sched.MR]>;
def mrk : AVX512PI<opc, MRMDestMem, (outs),
(ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
OpcodeStr # "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}",
@@ -3344,102 +3335,92 @@ multiclass avx512_store<bits<8> opc, string OpcodeStr, string BaseName,
multiclass avx512_store_vl< bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _, Predicate prd,
X86SchedWriteMoveLSWidths Sched,
- string EVEX2VEXOvrd, bit NoMRPattern = 0> {
+ bit NoMRPattern = 0> {
let Predicates = [prd] in
defm Z : avx512_store<opc, OpcodeStr, NAME, _.info512, store,
- masked_store, Sched.ZMM, "",
- NoMRPattern>, EVEX_V512;
+ masked_store, Sched.ZMM, NoMRPattern>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_store<opc, OpcodeStr, NAME, _.info256, store,
- masked_store, Sched.YMM,
- EVEX2VEXOvrd#"Y", NoMRPattern>, EVEX_V256;
+ masked_store, Sched.YMM, NoMRPattern>, EVEX_V256;
defm Z128 : avx512_store<opc, OpcodeStr, NAME, _.info128, store,
- masked_store, Sched.XMM, EVEX2VEXOvrd,
- NoMRPattern>, EVEX_V128;
+ masked_store, Sched.XMM, NoMRPattern>, EVEX_V128;
}
}
multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _, Predicate prd,
X86SchedWriteMoveLSWidths Sched,
- string EVEX2VEXOvrd, bit NoMRPattern = 0> {
+ bit NoMRPattern = 0> {
let Predicates = [prd] in
defm Z : avx512_store<opc, OpcodeStr, NAME, _.info512, alignedstore,
- masked_store_aligned, Sched.ZMM, "",
- NoMRPattern>, EVEX_V512;
+ masked_store_aligned, Sched.ZMM, NoMRPattern>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_store<opc, OpcodeStr, NAME, _.info256, alignedstore,
- masked_store_aligned, Sched.YMM,
- EVEX2VEXOvrd#"Y", NoMRPattern>, EVEX_V256;
+ masked_store_aligned, Sched.YMM, NoMRPattern>, EVEX_V256;
defm Z128 : avx512_store<opc, OpcodeStr, NAME, _.info128, alignedstore,
- masked_store_aligned, Sched.XMM, EVEX2VEXOvrd,
- NoMRPattern>, EVEX_V128;
+ masked_store_aligned, Sched.XMM, NoMRPattern>, EVEX_V128;
}
}
defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info,
- HasAVX512, SchedWriteFMoveLS, "VMOVAPS">,
+ HasAVX512, SchedWriteFMoveLS>,
avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info,
- HasAVX512, SchedWriteFMoveLS, "VMOVAPS">,
+ HasAVX512, SchedWriteFMoveLS>,
TB, EVEX_CD8<32, CD8VF>;
defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info,
- HasAVX512, SchedWriteFMoveLS, "VMOVAPD">,
+ HasAVX512, SchedWriteFMoveLS>,
avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info,
- HasAVX512, SchedWriteFMoveLS, "VMOVAPD">,
+ HasAVX512, SchedWriteFMoveLS>,
TB, PD, REX_W, EVEX_CD8<64, CD8VF>;
defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512,
- SchedWriteFMoveLS, "VMOVUPS", 0, null_frag>,
+ SchedWriteFMoveLS, 0, null_frag>,
avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512,
- SchedWriteFMoveLS, "VMOVUPS">,
+ SchedWriteFMoveLS>,
TB, EVEX_CD8<32, CD8VF>;
defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512,
- SchedWriteFMoveLS, "VMOVUPD", 0, null_frag>,
+ SchedWriteFMoveLS, 0, null_frag>,
avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512,
- SchedWriteFMoveLS, "VMOVUPD">,
+ SchedWriteFMoveLS>,
TB, PD, REX_W, EVEX_CD8<64, CD8VF>;
defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info,
- HasAVX512, SchedWriteVecMoveLS,
- "VMOVDQA", 1>,
+ HasAVX512, SchedWriteVecMoveLS, 1>,
avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info,
- HasAVX512, SchedWriteVecMoveLS,
- "VMOVDQA", 1>,
+ HasAVX512, SchedWriteVecMoveLS, 1>,
TB, PD, EVEX_CD8<32, CD8VF>;
defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info,
- HasAVX512, SchedWriteVecMoveLS,
- "VMOVDQA">,
+ HasAVX512, SchedWriteVecMoveLS>,
avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info,
- HasAVX512, SchedWriteVecMoveLS,
- "VMOVDQA">,
+ HasAVX512, SchedWriteVecMoveLS>,
TB, PD, REX_W, EVEX_CD8<64, CD8VF>;
defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI,
- SchedWriteVecMoveLS, "VMOVDQU", 1>,
+ SchedWriteVecMoveLS, 1>,
avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, HasBWI,
- SchedWriteVecMoveLS, "VMOVDQU", 1>,
+ SchedWriteVecMoveLS, 1>,
TB, XD, EVEX_CD8<8, CD8VF>;
defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI,
- SchedWriteVecMoveLS, "VMOVDQU", 1>,
+ SchedWriteVecMoveLS, 1>,
avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, HasBWI,
- SchedWriteVecMoveLS, "VMOVDQU", 1>,
+ SchedWriteVecMoveLS, 1>,
TB, XD, REX_W, EVEX_CD8<16, CD8VF>;
defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512,
- SchedWriteVecMoveLS, "VMOVDQU", 1, null_frag>,
+ SchedWriteVecMoveLS, 1, null_frag>,
avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, HasAVX512,
- SchedWriteVecMoveLS, "VMOVDQU", 1>,
+ SchedWriteVecMoveLS, 1>,
TB, XS, EVEX_CD8<32, CD8VF>;
defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512,
- SchedWriteVecMoveLS, "VMOVDQU", 0, null_frag>,
+ SchedWriteVecMoveLS, 0, null_frag>,
avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, HasAVX512,
- SchedWriteVecMoveLS, "VMOVDQU">,
+ SchedWriteVecMoveLS>,
TB, XS, REX_W, EVEX_CD8<64, CD8VF>;
// Special instructions to help with spilling when we don't have VLX. We need
@@ -4844,8 +4825,7 @@ defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul,
defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul,
SchedWriteVecIMul, HasBWI, 1>;
defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul,
- SchedWriteVecIMul, HasDQI, 1>, T8,
- NotEVEX2VEXConvertible;
+ SchedWriteVecIMul, HasDQI, 1>, T8;
defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulhw", mulhs, SchedWriteVecIMul,
HasBWI, 1>;
defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SchedWriteVecIMul,
@@ -4989,8 +4969,7 @@ defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxsw", smax,
defm VPMAXSD : avx512_binop_rm_vl_d<0x3D, "vpmaxsd", smax,
SchedWriteVecALU, HasAVX512, 1>, T8;
defm VPMAXSQ : avx512_binop_rm_vl_q<0x3D, "vpmaxsq", smax,
- SchedWriteVecALU, HasAVX512, 1>, T8,
- NotEVEX2VEXConvertible;
+ SchedWriteVecALU, HasAVX512, 1>, T8;
defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxub", umax,
SchedWriteVecALU, HasBWI, 1>;
@@ -4999,8 +4978,7 @@ defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxuw", umax,
defm VPMAXUD : avx512_binop_rm_vl_d<0x3F, "vpmaxud", umax,
SchedWriteVecALU, HasAVX512, 1>, T8;
defm VPMAXUQ : avx512_binop_rm_vl_q<0x3F, "vpmaxuq", umax,
- SchedWriteVecALU, HasAVX512, 1>, T8,
- NotEVEX2VEXConvertible;
+ SchedWriteVecALU, HasAVX512, 1>, T8;
defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpminsb", smin,
SchedWriteVecALU, HasBWI, 1>, T8;
@@ -5009,8 +4987,7 @@ defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpminsw", smin,
defm VPMINSD : avx512_binop_rm_vl_d<0x39, "vpminsd", smin,
SchedWriteVecALU, HasAVX512, 1>, T8;
defm VPMINSQ : avx512_binop_rm_vl_q<0x39, "vpminsq", smin,
- SchedWriteVecALU, HasAVX512, 1>, T8,
- NotEVEX2VEXConvertible;
+ SchedWriteVecALU, HasAVX512, 1>, T8;
defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminub", umin,
SchedWriteVecALU, HasBWI, 1>;
@@ -5019,8 +4996,7 @@ defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminuw", umin,
defm VPMINUD : avx512_binop_rm_vl_d<0x3B, "vpminud", umin,
SchedWriteVecALU, HasAVX512, 1>, T8;
defm VPMINUQ : avx512_binop_rm_vl_q<0x3B, "vpminuq", umin,
- SchedWriteVecALU, HasAVX512, 1>, T8,
- NotEVEX2VEXConvertible;
+ SchedWriteVecALU, HasAVX512, 1>, T8;
// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX.
let Predicates = [HasDQI, NoVLX] in {
@@ -5405,8 +5381,7 @@ multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo
}
multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
SDNode OpNode, SDNode VecNode, SDNode SaeNode,
- X86FoldableSchedWrite sched, bit IsCommutable,
- string EVEX2VexOvrd> {
+ X86FoldableSchedWrite sched, bit IsCommutable> {
let ExeDomain = _.ExeDomain in {
defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
@@ -5427,8 +5402,7 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>,
- Sched<[sched]>,
- EVEX2VEXOverride<EVEX2VexOvrd#"rr"> {
+ Sched<[sched]> {
let isCommutable = IsCommutable;
}
def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
@@ -5436,8 +5410,7 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src2)))]>,
- Sched<[sched.Folded, sched.ReadAfterFold]>,
- EVEX2VEXOverride<EVEX2VexOvrd#"rm">;
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
}
let Uses = [MXCSR] in
@@ -5474,19 +5447,15 @@ multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode VecNode, SDNode SaeNode,
X86SchedWriteSizes sched, bit IsCommutable> {
defm SSZ : avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, OpNode,
- VecNode, SaeNode, sched.PS.Scl, IsCommutable,
- NAME#"SS">,
+ VecNode, SaeNode, sched.PS.Scl, IsCommutable>,
TB, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm SDZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, OpNode,
- VecNode, SaeNode, sched.PD.Scl, IsCommutable,
- NAME#"SD">,
+ VecNode, SaeNode, sched.PD.Scl, IsCommutable>,
TB, XD, REX_W, EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>;
let Predicates = [HasFP16] in {
defm SHZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sh", f16x_info, OpNode,
- VecNode, SaeNode, sched.PH.Scl, IsCommutable,
- NAME#"SH">,
- T_MAP5, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>,
- NotEVEX2VEXConvertible;
+ VecNode, SaeNode, sched.PH.Scl, IsCommutable>,
+ T_MAP5, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>;
}
}
defm VADD : avx512_binop_s_round<0x58, "vadd", any_fadd, X86fadds, X86faddRnds,
@@ -5506,14 +5475,13 @@ defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxSAEs,
// X86fminc and X86fmaxc instead of X86fmin and X86fmax
multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _, SDNode OpNode,
- X86FoldableSchedWrite sched,
- string EVEX2VEXOvrd> {
+ X86FoldableSchedWrite sched> {
let isCodeGenOnly = 1, Predicates = [HasAVX512], ExeDomain = _.ExeDomain in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>,
- Sched<[sched]>, EVEX2VEXOverride<EVEX2VEXOvrd#"rr"> {
+ Sched<[sched]> {
let isCommutable = 1;
}
def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
@@ -5521,36 +5489,34 @@ multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src2)))]>,
- Sched<[sched.Folded, sched.ReadAfterFold]>,
- EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc,
- SchedWriteFCmp.Scl, "VMINCSS">, TB, XS,
+ SchedWriteFCmp.Scl>, TB, XS,
EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC;
defm VMINCSDZ : avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc,
- SchedWriteFCmp.Scl, "VMINCSD">, TB, XD,
+ SchedWriteFCmp.Scl>, TB, XD,
REX_W, EVEX, VVVV, VEX_LIG,
EVEX_CD8<64, CD8VT1>, SIMD_EXC;
defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc,
- SchedWriteFCmp.Scl, "VMAXCSS">, TB, XS,
+ SchedWriteFCmp.Scl>, TB, XS,
EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC;
defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc,
- SchedWriteFCmp.Scl, "VMAXCSD">, TB, XD,
+ SchedWriteFCmp.Scl>, TB, XD,
REX_W, EVEX, VVVV, VEX_LIG,
EVEX_CD8<64, CD8VT1>, SIMD_EXC;
defm VMINCSHZ : avx512_comutable_binop_s<0x5D, "vminsh", f16x_info, X86fminc,
- SchedWriteFCmp.Scl, "VMINCSH">, T_MAP5, XS,
- EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC,
- NotEVEX2VEXConvertible;
+ SchedWriteFCmp.Scl>, T_MAP5, XS,
+ EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC;
+
defm VMAXCSHZ : avx512_comutable_binop_s<0x5F, "vmaxsh", f16x_info, X86fmaxc,
- SchedWriteFCmp.Scl, "VMAXCSH">, T_MAP5, XS,
- EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC,
- NotEVEX2VEXConvertible;
+ SchedWriteFCmp.Scl>, T_MAP5, XS,
+ EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC;
multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDPatternOperator MaskOpNode,
@@ -5820,8 +5786,7 @@ multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr
EVEX_V256, EVEX_CD8<16, CD8VF>, T_MAP6, PD;
}
}
-defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef",
- SchedWriteFAdd>, NotEVEX2VEXConvertible;
+defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", SchedWriteFAdd>;
//===----------------------------------------------------------------------===//
// AVX-512 VPTESTM instructions
@@ -5985,11 +5950,9 @@ multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, bits<8> opcw,
string OpcodeStr, SDNode OpNode,
- X86SchedWriteWidths sched,
- bit NotEVEX2VEXConvertibleQ = 0> {
+ X86SchedWriteWidths sched> {
defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, sched, v4i32,
avx512vl_i32_info, HasAVX512>;
- let notEVEX2VEXConvertible = NotEVEX2VEXConvertibleQ in
defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, sched, v2i64,
avx512vl_i64_info, HasAVX512>, REX_W;
defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, sched, v8i16,
@@ -6034,11 +5997,9 @@ multiclass avx512_shift_rmi_w<bits<8> opcw, Format ImmFormR, Format ImmFormM,
multiclass avx512_shift_rmi_dq<bits<8> opcd, bits<8> opcq,
Format ImmFormR, Format ImmFormM,
string OpcodeStr, SDNode OpNode,
- X86SchedWriteWidths sched,
- bit NotEVEX2VEXConvertibleQ = 0> {
+ X86SchedWriteWidths sched> {
defm D: avx512_shift_rmi_sizes<opcd, ImmFormR, ImmFormM, OpcodeStr#"d", OpNode,
sched, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
- let notEVEX2VEXConvertible = NotEVEX2VEXConvertibleQ in
defm Q: avx512_shift_rmi_sizes<opcq, ImmFormR, ImmFormM, OpcodeStr#"q", OpNode,
sched, avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, REX_W;
}
@@ -6054,7 +6015,7 @@ defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli,
SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX, VVVV;
defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai,
- SchedWriteVecShiftImm, 1>,
+ SchedWriteVecShiftImm>,
avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai,
SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX, VVVV;
@@ -6066,7 +6027,7 @@ defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli,
defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl,
SchedWriteVecShift>;
defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra,
- SchedWriteVecShift, 1>;
+ SchedWriteVecShift>;
defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl,
SchedWriteVecShift>;
@@ -6435,7 +6396,7 @@ defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info,
avx512vl_i32_info>;
let ExeDomain = SSEPackedDouble in
defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info,
- avx512vl_i64_info>, VEX_W1X;
+ avx512vl_i64_info>, REX_W;
//===----------------------------------------------------------------------===//
// AVX-512 - VPSHUFD, VPSHUFLW, VPSHUFHW
@@ -8443,9 +8404,9 @@ multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr, SDPatternOperator OpNo
}
let Predicates = [HasDQI, HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v2i64x_info, OpNode,
- MaskOpNode, sched.XMM>, EVEX_V128, NotEVEX2VEXConvertible;
+ MaskOpNode, sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i64x_info, OpNode,
- MaskOpNode, sched.YMM>, EVEX_V256, NotEVEX2VEXConvertible;
+ MaskOpNode, sched.YMM>, EVEX_V256;
}
}
@@ -8524,11 +8485,10 @@ multiclass avx512_cvtqq2ps_dq2ph<bits<8> opc, string OpcodeStr, SDPatternOperato
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, _dst.info128, _src.info128, null_frag,
null_frag, sched.XMM, _src.info128.BroadcastStr,
"{x}", i128mem, _src.info128.KRCWM>,
- EVEX_V128, NotEVEX2VEXConvertible;
+ EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, _dst.info128, _src.info256, OpNode,
MaskOpNode, sched.YMM, _src.info256.BroadcastStr,
- "{y}">, EVEX_V256,
- NotEVEX2VEXConvertible;
+ "{y}">, EVEX_V256;
// Special patterns to allow use of X86VM[SU]intToFP for masking. Instruction
// patterns have been disabled with null_frag.
@@ -10882,8 +10842,7 @@ defm VGETMANTSH: avx512_common_fp_sae_scalar_imm<"vgetmantsh", f16x_info,
multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched,
X86VectorVTInfo _,
- X86VectorVTInfo CastInfo,
- string EVEX2VEXOvrd> {
+ X86VectorVTInfo CastInfo> {
let ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
@@ -10891,7 +10850,7 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
(_.VT (bitconvert
(CastInfo.VT (X86Shuf128 _.RC:$src1, _.RC:$src2,
(i8 timm:$src3)))))>,
- Sched<[sched]>, EVEX2VEXOverride<EVEX2VEXOvrd#"rr">;
+ Sched<[sched]>;
defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
@@ -10900,8 +10859,7 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
(CastInfo.VT (X86Shuf128 _.RC:$src1,
(CastInfo.LdFrag addr:$src2),
(i8 timm:$src3)))))>,
- Sched<[sched.Folded, sched.ReadAfterFold]>,
- EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
OpcodeStr, "$src3, ${src2}"#_.BroadcastStr#", $src1",
@@ -10918,45 +10876,40 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
multiclass avx512_shuff_packed_128<string OpcodeStr, X86FoldableSchedWrite sched,
AVX512VLVectorVTInfo _,
- AVX512VLVectorVTInfo CastInfo, bits<8> opc,
- string EVEX2VEXOvrd>{
+ AVX512VLVectorVTInfo CastInfo, bits<8> opc>{
let Predicates = [HasAVX512] in
defm Z : avx512_shuff_packed_128_common<opc, OpcodeStr, sched,
- _.info512, CastInfo.info512, "">, EVEX_V512;
+ _.info512, CastInfo.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in
defm Z256 : avx512_shuff_packed_128_common<opc, OpcodeStr, sched,
- _.info256, CastInfo.info256,
- EVEX2VEXOvrd>, EVEX_V256;
+ _.info256, CastInfo.info256>, EVEX_V256;
}
defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4", WriteFShuffle256,
- avx512vl_f32_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>;
+ avx512vl_f32_info, avx512vl_f64_info, 0x23>, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>;
defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2", WriteFShuffle256,
- avx512vl_f64_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W;
+ avx512vl_f64_info, avx512vl_f64_info, 0x23>, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W;
defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4", WriteFShuffle256,
- avx512vl_i32_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>;
+ avx512vl_i32_info, avx512vl_i64_info, 0x43>, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>;
defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2", WriteFShuffle256,
- avx512vl_i64_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W;
+ avx512vl_i64_info, avx512vl_i64_info, 0x43>, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W;
multiclass avx512_valign<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _>{
- // NOTE: EVEX2VEXOverride changed back to Unset for 256-bit at the
- // instantiation of this class.
let ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(_.VT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 timm:$src3)))>,
- Sched<[sched]>, EVEX2VEXOverride<"VPALIGNRrri">;
+ Sched<[sched]>;
defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(_.VT (X86VAlign _.RC:$src1,
(bitconvert (_.LdFrag addr:$src2)),
(i8 timm:$src3)))>,
- Sched<[sched.Folded, sched.ReadAfterFold]>,
- EVEX2VEXOverride<"VPALIGNRrmi">;
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
@@ -10979,7 +10932,6 @@ multiclass avx512_valign_common<string OpcodeStr, X86SchedWriteWidths sched,
defm Z128 : avx512_valign<0x03, OpcodeStr, sched.XMM, _.info128>,
AVX512AIi8Base, EVEX, VVVV, EVEX_V128;
// We can't really override the 256-bit version so change it back to unset.
- let EVEX2VEXOverride = ? in
defm Z256 : avx512_valign<0x03, OpcodeStr, sched.YMM, _.info256>,
AVX512AIi8Base, EVEX, VVVV, EVEX_V256;
}
@@ -11111,7 +11063,7 @@ let Predicates = [HasVLX, HasBWI] in {
defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw",
SchedWritePSADBW, avx512vl_i16_info, avx512vl_i8_info>,
- EVEX_CD8<8, CD8VF>, NotEVEX2VEXConvertible;
+ EVEX_CD8<8, CD8VF>;
multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
@@ -13088,12 +13040,10 @@ multiclass avx512_cvtqq2ph<bits<8> opc, string OpcodeStr, SDPatternOperator OpNo
let Predicates = [HasFP16, HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v8f16x_info, v2i64x_info,
null_frag, null_frag, sched.XMM, "{1to2}", "{x}",
- i128mem, VK2WM>,
- EVEX_V128, NotEVEX2VEXConvertible;
+ i128mem, VK2WM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f16x_info, v4i64x_info,
null_frag, null_frag, sched.YMM, "{1to4}", "{y}",
- i256mem, VK4WM>,
- EVEX_V256, NotEVEX2VEXConvertible;
+ i256mem, VK4WM>, EVEX_V256;
}
def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}",
diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td
index 6b0c1b8c28c9..5cfa95e085e3 100644
--- a/llvm/lib/Target/X86/X86InstrArithmetic.td
+++ b/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -71,24 +71,60 @@ multiclass Mul<bits<8> o, string m, Format RegMRM, Format MemMRM, SDPatternOpera
// FIXME: Used for 8-bit mul, ignore result upper 8 bits.
// This probably ought to be moved to a def : Pat<> if the
// syntax can be accepted.
- let Defs = [AL,EFLAGS,AX], Uses = [AL] in
- def 8r : MulDivOpR<o, RegMRM, m, Xi8, WriteIMul8,
- [(set AL, (node AL, GR8:$src1)), (implicit EFLAGS)]>;
- let Defs = [AX,DX,EFLAGS], Uses = [AX] in
- def 16r : MulDivOpR<o, RegMRM, m, Xi16, WriteIMul16, []>, OpSize16;
- let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
- def 32r : MulDivOpR<o, RegMRM, m, Xi32, WriteIMul32, []>, OpSize32;
- let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
- def 64r : MulDivOpR<o, RegMRM, m, Xi64, WriteIMul64, []>;
- let Defs = [AL,EFLAGS,AX], Uses = [AL] in
- def 8m : MulDivOpM<o, MemMRM, m, Xi8, WriteIMul8,
- [(set AL, (node AL, (loadi8 addr:$src1))), (implicit EFLAGS)]>;
- let Defs = [AX,DX,EFLAGS], Uses = [AX] in
- def 16m : MulDivOpM<o, MemMRM, m, Xi16, WriteIMul16, []>, OpSize16;
- let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
- def 32m : MulDivOpM<o, MemMRM, m, Xi32, WriteIMul32, []>, OpSize32;
- let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
- def 64m : MulDivOpM<o, MemMRM, m, Xi64, WriteIMul64, []>, Requires<[In64BitMode]>;
+ let Defs = [AL, EFLAGS, AX], Uses = [AL] in
+ def 8r : MulDivOpR<o, RegMRM, m, Xi8, WriteIMul8,
+ [(set AL, (node AL, GR8:$src1)), (implicit EFLAGS)]>;
+ let Defs = [AX, DX, EFLAGS], Uses = [AX] in
+ def 16r : MulDivOpR<o, RegMRM, m, Xi16, WriteIMul16, []>, OpSize16;
+ let Defs = [EAX, EDX, EFLAGS], Uses = [EAX] in
+ def 32r : MulDivOpR<o, RegMRM, m, Xi32, WriteIMul32, []>, OpSize32;
+ let Defs = [RAX, RDX, EFLAGS], Uses = [RAX] in
+ def 64r : MulDivOpR<o, RegMRM, m, Xi64, WriteIMul64, []>;
+ let Defs = [AL, EFLAGS, AX], Uses = [AL] in
+ def 8m : MulDivOpM<o, MemMRM, m, Xi8, WriteIMul8,
+ [(set AL, (node AL, (loadi8 addr:$src1))), (implicit EFLAGS)]>;
+ let Defs = [AX, DX, EFLAGS], Uses = [AX] in
+ def 16m : MulDivOpM<o, MemMRM, m, Xi16, WriteIMul16, []>, OpSize16;
+ let Defs = [EAX, EDX, EFLAGS], Uses = [EAX] in
+ def 32m : MulDivOpM<o, MemMRM, m, Xi32, WriteIMul32, []>, OpSize32;
+ let Defs = [RAX, RDX, EFLAGS], Uses = [RAX] in
+ def 64m : MulDivOpM<o, MemMRM, m, Xi64, WriteIMul64, []>, Requires<[In64BitMode]>;
+
+ let Predicates = [In64BitMode] in {
+ let Defs = [AL, AX], Uses = [AL] in
+ def 8r_NF : MulDivOpR<o, RegMRM, m, Xi8, WriteIMul8, []>, NF;
+ let Defs = [AX, DX], Uses = [AX] in
+ def 16r_NF : MulDivOpR<o, RegMRM, m, Xi16, WriteIMul16, []>, NF, PD;
+ let Defs = [EAX, EDX], Uses = [EAX] in
+ def 32r_NF : MulDivOpR<o, RegMRM, m, Xi32, WriteIMul32, []>, NF;
+ let Defs = [RAX, RDX], Uses = [RAX] in
+ def 64r_NF : MulDivOpR<o, RegMRM, m, Xi64, WriteIMul64, []>, NF;
+ let Defs = [AL, AX], Uses = [AL] in
+ def 8m_NF : MulDivOpM<o, MemMRM, m, Xi8, WriteIMul8, []>, NF;
+ let Defs = [AX, DX], Uses = [AX] in
+ def 16m_NF : MulDivOpM<o, MemMRM, m, Xi16, WriteIMul16, []>, NF, PD;
+ let Defs = [EAX, EDX], Uses = [EAX] in
+ def 32m_NF : MulDivOpM<o, MemMRM, m, Xi32, WriteIMul32, []>, NF;
+ let Defs = [RAX, RDX], Uses = [RAX] in
+ def 64m_NF : MulDivOpM<o, MemMRM, m, Xi64, WriteIMul64, []>, NF;
+
+ let Defs = [AL, EFLAGS, AX], Uses = [AL] in
+ def 8r_EVEX : MulDivOpR<o, RegMRM, m, Xi8, WriteIMul8, []>, PL;
+ let Defs = [AX, DX, EFLAGS], Uses = [AX] in
+ def 16r_EVEX : MulDivOpR<o, RegMRM, m, Xi16, WriteIMul16, []>, PL, PD;
+ let Defs = [EAX, EDX, EFLAGS], Uses = [EAX] in
+ def 32r_EVEX : MulDivOpR<o, RegMRM, m, Xi32, WriteIMul32, []>, PL;
+ let Defs = [RAX, RDX, EFLAGS], Uses = [RAX] in
+ def 64r_EVEX : MulDivOpR<o, RegMRM, m, Xi64, WriteIMul64, []>, PL;
+ let Defs = [AL, EFLAGS, AX], Uses = [AL] in
+ def 8m_EVEX : MulDivOpM<o, MemMRM, m, Xi8, WriteIMul8, []>, PL;
+ let Defs = [AX, DX, EFLAGS], Uses = [AX] in
+ def 16m_EVEX : MulDivOpM<o, MemMRM, m, Xi16, WriteIMul16, []>, PL, PD;
+ let Defs = [EAX, EDX, EFLAGS], Uses = [EAX] in
+ def 32m_EVEX : MulDivOpM<o, MemMRM, m, Xi32, WriteIMul32, []>, PL;
+ let Defs = [RAX, RDX, EFLAGS], Uses = [RAX] in
+ def 64m_EVEX : MulDivOpM<o, MemMRM, m, Xi64, WriteIMul64, []>, PL;
+ }
}
defm MUL : Mul<0xF7, "mul", MRM4r, MRM4m, mul>;
@@ -99,137 +135,341 @@ multiclass Div<bits<8> o, string m, Format RegMRM, Format MemMRM> {
defvar sched16 = !if(!eq(m, "div"), WriteDiv16, WriteIDiv16);
defvar sched32 = !if(!eq(m, "div"), WriteDiv32, WriteIDiv32);
defvar sched64 = !if(!eq(m, "div"), WriteDiv64, WriteIDiv64);
- let Defs = [AL,AH,EFLAGS], Uses = [AX] in
- def 8r : MulDivOpR<o, RegMRM, m, Xi8, sched8, []>;
- let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
- def 16r : MulDivOpR<o, RegMRM, m, Xi16, sched16, []>, OpSize16;
- let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
- def 32r : MulDivOpR<o, RegMRM, m, Xi32, sched32, []>, OpSize32;
- let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
- def 64r : MulDivOpR<o, RegMRM, m, Xi64, sched64, []>;
- let Defs = [AL,AH,EFLAGS], Uses = [AX] in
- def 8m : MulDivOpM<o, MemMRM, m, Xi8, sched8, []>;
- let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
- def 16m : MulDivOpM<o, MemMRM, m, Xi16, sched16, []>, OpSize16;
- let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
- def 32m : MulDivOpM<o, MemMRM, m, Xi32, sched32, []>, OpSize32;
- let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
- def 64m : MulDivOpM<o, MemMRM, m, Xi64, sched64, []>, Requires<[In64BitMode]>;
+ let Defs = [AL, AH, EFLAGS], Uses = [AX] in
+ def 8r : MulDivOpR<o, RegMRM, m, Xi8, sched8, []>;
+ let Defs = [AX, DX, EFLAGS], Uses = [AX, DX] in
+ def 16r : MulDivOpR<o, RegMRM, m, Xi16, sched16, []>, OpSize16;
+ let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EDX] in
+ def 32r : MulDivOpR<o, RegMRM, m, Xi32, sched32, []>, OpSize32;
+ let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RDX] in
+ def 64r : MulDivOpR<o, RegMRM, m, Xi64, sched64, []>;
+ let Defs = [AL, AH, EFLAGS], Uses = [AX] in
+ def 8m : MulDivOpM<o, MemMRM, m, Xi8, sched8, []>;
+ let Defs = [AX, DX, EFLAGS], Uses = [AX, DX] in
+ def 16m : MulDivOpM<o, MemMRM, m, Xi16, sched16, []>, OpSize16;
+ let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EDX] in
+ def 32m : MulDivOpM<o, MemMRM, m, Xi32, sched32, []>, OpSize32;
+ let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RDX] in
+ def 64m : MulDivOpM<o, MemMRM, m, Xi64, sched64, []>, Requires<[In64BitMode]>;
+
+ let Predicates = [In64BitMode] in {
+ let Defs = [AL, AH], Uses = [AX] in
+ def 8r_NF : MulDivOpR<o, RegMRM, m, Xi8, sched8, []>, NF;
+ let Defs = [AX, DX], Uses = [AX, DX] in
+ def 16r_NF : MulDivOpR<o, RegMRM, m, Xi16, sched16, []>, NF, PD;
+ let Defs = [EAX, EDX], Uses = [EAX, EDX] in
+ def 32r_NF : MulDivOpR<o, RegMRM, m, Xi32, sched32, []>, NF;
+ let Defs = [RAX, RDX], Uses = [RAX, RDX] in
+ def 64r_NF : MulDivOpR<o, RegMRM, m, Xi64, sched64, []>, NF;
+ let Defs = [AL, AH], Uses = [AX] in
+ def 8m_NF : MulDivOpM<o, MemMRM, m, Xi8, sched8, []>, NF;
+ let Defs = [AX, DX], Uses = [AX, DX] in
+ def 16m_NF : MulDivOpM<o, MemMRM, m, Xi16, sched16, []>, NF, PD;
+ let Defs = [EAX, EDX], Uses = [EAX, EDX] in
+ def 32m_NF : MulDivOpM<o, MemMRM, m, Xi32, sched32, []>, NF;
+ let Defs = [RAX, RDX], Uses = [RAX, RDX] in
+ def 64m_NF : MulDivOpM<o, MemMRM, m, Xi64, sched64, []>, NF;
+
+ let Defs = [AL, AH, EFLAGS], Uses = [AX] in
+ def 8r_EVEX : MulDivOpR<o, RegMRM, m, Xi8, sched8, []>, PL;
+ let Defs = [AX, DX, EFLAGS], Uses = [AX, DX] in
+ def 16r_EVEX : MulDivOpR<o, RegMRM, m, Xi16, sched16, []>, PL, PD;
+ let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EDX] in
+ def 32r_EVEX : MulDivOpR<o, RegMRM, m, Xi32, sched32, []>, PL;
+ let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RDX] in
+ def 64r_EVEX : MulDivOpR<o, RegMRM, m, Xi64, sched64, []>, PL;
+ let Defs = [AL, AH, EFLAGS], Uses = [AX] in
+ def 8m_EVEX : MulDivOpM<o, MemMRM, m, Xi8, sched8, []>, PL;
+ let Defs = [AX, DX, EFLAGS], Uses = [AX, DX] in
+ def 16m_EVEX : MulDivOpM<o, MemMRM, m, Xi16, sched16, []>, PL, PD;
+ let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EDX] in
+ def 32m_EVEX : MulDivOpM<o, MemMRM, m, Xi32, sched32, []>, PL;
+ let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RDX] in
+ def 64m_EVEX : MulDivOpM<o, MemMRM, m, Xi64, sched64, []>, PL;
+ }
}
+
let hasSideEffects = 1 in { // so that we don't speculatively execute
-defm DIV: Div<0xF7, "div", MRM6r, MRM6m>;
-defm IDIV: Div<0xF7, "idiv", MRM7r, MRM7m>;
+ defm DIV: Div<0xF7, "div", MRM6r, MRM6m>;
+ defm IDIV: Div<0xF7, "idiv", MRM7r, MRM7m>;
}
-class IMulOpRR<X86TypeInfo t, X86FoldableSchedWrite sched>
- : BinOpRR_RF<0xAF, "imul", t, X86smul_flag>, TB {
+class IMulOpRR_R<X86TypeInfo t, X86FoldableSchedWrite sched, bit ndd = 0>
+ : BinOpRR_R<0xAF, "imul", t, ndd> {
let Form = MRMSrcReg;
let SchedRW = [sched];
// X = IMUL Y, Z --> X = IMUL Z, Y
let isCommutable = 1;
}
-class IMulOpRM<X86TypeInfo t, X86FoldableSchedWrite sched>
- : BinOpRM_RF<0xAF, "imul", t, X86smul_flag>, TB {
-let Form = MRMSrcMem;
-let SchedRW = [sched.Folded, sched.ReadAfterFold];
+class IMulOpRR_RF<X86TypeInfo t, X86FoldableSchedWrite sched, bit ndd = 0>
+ : BinOpRR_RF<0xAF, "imul", t, X86smul_flag, ndd> {
+ let Form = MRMSrcReg;
+ let SchedRW = [sched];
+ // X = IMUL Y, Z --> X = IMUL Z, Y
+ let isCommutable = 1;
+}
+class IMulOpRM_R<X86TypeInfo t, X86FoldableSchedWrite sched, bit ndd = 0>
+ : BinOpRM_R<0xAF, "imul", t, ndd> {
+ let Form = MRMSrcMem;
+ let SchedRW = [sched.Folded, sched.ReadAfterFold];
+}
+class IMulOpRM_RF<X86TypeInfo t, X86FoldableSchedWrite sched, bit ndd = 0>
+ : BinOpRM_RF<0xAF, "imul", t, X86smul_flag, ndd> {
+ let Form = MRMSrcMem;
+ let SchedRW = [sched.Folded, sched.ReadAfterFold];
+}
+
+let Predicates = [NoNDD] in {
+ def IMUL16rr : IMulOpRR_RF<Xi16, WriteIMul16Reg>, TB, OpSize16;
+ def IMUL32rr : IMulOpRR_RF<Xi32, WriteIMul32Reg>, TB, OpSize32;
+ def IMUL64rr : IMulOpRR_RF<Xi64, WriteIMul64Reg>, TB;
+ def IMUL16rm : IMulOpRM_RF<Xi16, WriteIMul16Reg>, TB, OpSize16;
+ def IMUL32rm : IMulOpRM_RF<Xi32, WriteIMul32Reg>, TB, OpSize32;
+ def IMUL64rm : IMulOpRM_RF<Xi64, WriteIMul64Reg>, TB;
+}
+let Predicates = [HasNDD, In64BitMode] in {
+ def IMUL16rr_ND : IMulOpRR_RF<Xi16, WriteIMul16Reg, 1>, PD;
+ def IMUL32rr_ND : IMulOpRR_RF<Xi32, WriteIMul32Reg, 1>;
+ def IMUL64rr_ND : IMulOpRR_RF<Xi64, WriteIMul64Reg, 1>;
+ def IMUL16rm_ND : IMulOpRM_RF<Xi16, WriteIMul16Reg, 1>, PD;
+ def IMUL32rm_ND : IMulOpRM_RF<Xi32, WriteIMul32Reg, 1>;
+ def IMUL64rm_ND : IMulOpRM_RF<Xi64, WriteIMul64Reg, 1>;
}
-def IMUL16rr : IMulOpRR<Xi16, WriteIMul16Reg>, OpSize16;
-def IMUL32rr : IMulOpRR<Xi32, WriteIMul32Reg>, OpSize32;
-def IMUL64rr : IMulOpRR<Xi64, WriteIMul64Reg>;
-def IMUL16rm : IMulOpRM<Xi16, WriteIMul16Reg>, OpSize16;
-def IMUL32rm : IMulOpRM<Xi32, WriteIMul32Reg>, OpSize32;
-def IMUL64rm : IMulOpRM<Xi64, WriteIMul64Reg>;
+let Predicates = [In64BitMode], Pattern = [(null_frag)] in {
+ def IMUL16rr_NF : IMulOpRR_R<Xi16, WriteIMul16Reg>, NF, PD;
+ def IMUL32rr_NF : IMulOpRR_R<Xi32, WriteIMul32Reg>, NF;
+ def IMUL64rr_NF : IMulOpRR_R<Xi64, WriteIMul64Reg>, NF;
+ def IMUL16rm_NF : IMulOpRM_R<Xi16, WriteIMul16Reg>, NF, PD;
+ def IMUL32rm_NF : IMulOpRM_R<Xi32, WriteIMul32Reg>, NF;
+ def IMUL64rm_NF : IMulOpRM_R<Xi64, WriteIMul64Reg>, NF;
+
+ def IMUL16rr_NF_ND : IMulOpRR_R<Xi16, WriteIMul16Reg, 1>, EVEX_NF, PD;
+ def IMUL32rr_NF_ND : IMulOpRR_R<Xi32, WriteIMul32Reg, 1>, EVEX_NF;
+ def IMUL64rr_NF_ND : IMulOpRR_R<Xi64, WriteIMul64Reg, 1>, EVEX_NF;
+ def IMUL16rm_NF_ND : IMulOpRM_R<Xi16, WriteIMul16Reg, 1>, EVEX_NF, PD;
+ def IMUL32rm_NF_ND : IMulOpRM_R<Xi32, WriteIMul32Reg, 1>, EVEX_NF;
+ def IMUL64rm_NF_ND : IMulOpRM_R<Xi64, WriteIMul64Reg, 1>, EVEX_NF;
+
+ def IMUL16rr_EVEX : IMulOpRR_RF<Xi16, WriteIMul16Reg>, PL, PD;
+ def IMUL32rr_EVEX : IMulOpRR_RF<Xi32, WriteIMul32Reg>, PL;
+ def IMUL64rr_EVEX : IMulOpRR_RF<Xi64, WriteIMul64Reg>, PL;
+ def IMUL16rm_EVEX : IMulOpRM_RF<Xi16, WriteIMul16Reg>, PL, PD;
+ def IMUL32rm_EVEX : IMulOpRM_RF<Xi32, WriteIMul32Reg>, PL;
+ def IMUL64rm_EVEX : IMulOpRM_RF<Xi64, WriteIMul64Reg>, PL;
+}
class IMulOpRI8_R<X86TypeInfo t, X86FoldableSchedWrite sched>
: BinOpRI8<0x6B, "imul", binop_ndd_args, t, MRMSrcReg,
- (outs t.RegClass:$dst)>, DefEFLAGS {
+ (outs t.RegClass:$dst)> {
let SchedRW = [sched];
}
class IMulOpRI_R<X86TypeInfo t, X86FoldableSchedWrite sched>
: BinOpRI<0x69, "imul", binop_ndd_args, t, MRMSrcReg,
+ (outs t.RegClass:$dst), []> {
+ let SchedRW = [sched];
+}
+class IMulOpRI_RF<X86TypeInfo t, X86FoldableSchedWrite sched>
+ : BinOpRI<0x69, "imul", binop_ndd_args, t, MRMSrcReg,
(outs t.RegClass:$dst),
[(set t.RegClass:$dst, EFLAGS, (X86smul_flag t.RegClass:$src1,
t.ImmNoSuOperator:$src2))]>, DefEFLAGS {
let SchedRW = [sched];
}
class IMulOpMI8_R<X86TypeInfo t, X86FoldableSchedWrite sched>
- : BinOpMI8<"imul", binop_ndd_args, t, MRMSrcMem, (outs t.RegClass:$dst)>,
- DefEFLAGS {
+ : BinOpMI8<"imul", binop_ndd_args, t, MRMSrcMem, (outs t.RegClass:$dst)> {
let Opcode = 0x6B;
let SchedRW = [sched.Folded];
}
class IMulOpMI_R<X86TypeInfo t, X86FoldableSchedWrite sched>
: BinOpMI<0x69, "imul", binop_ndd_args, t, MRMSrcMem,
+ (outs t.RegClass:$dst), []> {
+ let SchedRW = [sched.Folded];
+}
+class IMulOpMI_RF<X86TypeInfo t, X86FoldableSchedWrite sched>
+ : BinOpMI<0x69, "imul", binop_ndd_args, t, MRMSrcMem,
(outs t.RegClass:$dst),
[(set t.RegClass:$dst, EFLAGS, (X86smul_flag (t.LoadNode addr:$src1),
t.ImmNoSuOperator:$src2))]>,
DefEFLAGS {
let SchedRW = [sched.Folded];
}
-def IMUL16rri8 : IMulOpRI8_R<Xi16, WriteIMul16Imm>, OpSize16;
-def IMUL32rri8 : IMulOpRI8_R<Xi32, WriteIMul32Imm>, OpSize32;
-def IMUL64rri8 : IMulOpRI8_R<Xi64, WriteIMul64Imm>;
-def IMUL16rri : IMulOpRI_R<Xi16, WriteIMul16Imm>, OpSize16;
-def IMUL32rri : IMulOpRI_R<Xi32, WriteIMul32Imm>, OpSize32;
-def IMUL64rri32 : IMulOpRI_R<Xi64, WriteIMul64Imm>;
-
-def IMUL16rmi8 : IMulOpMI8_R<Xi16, WriteIMul16Imm>, OpSize16;
-def IMUL32rmi8 : IMulOpMI8_R<Xi32, WriteIMul32Imm>, OpSize32;
-def IMUL64rmi8 : IMulOpMI8_R<Xi64, WriteIMul64Imm>;
-def IMUL16rmi : IMulOpMI_R<Xi16, WriteIMul16Imm>, OpSize16;
-def IMUL32rmi : IMulOpMI_R<Xi32, WriteIMul32Imm>, OpSize32;
-def IMUL64rmi32 : IMulOpMI_R<Xi64, WriteIMul64Imm>;
-
+def IMUL16rri8 : IMulOpRI8_R<Xi16, WriteIMul16Imm>, DefEFLAGS, OpSize16;
+def IMUL32rri8 : IMulOpRI8_R<Xi32, WriteIMul32Imm>, DefEFLAGS, OpSize32;
+def IMUL64rri8 : IMulOpRI8_R<Xi64, WriteIMul64Imm>, DefEFLAGS;
+def IMUL16rri : IMulOpRI_RF<Xi16, WriteIMul16Imm>, OpSize16;
+def IMUL32rri : IMulOpRI_RF<Xi32, WriteIMul32Imm>, OpSize32;
+def IMUL64rri32 : IMulOpRI_RF<Xi64, WriteIMul64Imm>;
+def IMUL16rmi8 : IMulOpMI8_R<Xi16, WriteIMul16Imm>, DefEFLAGS, OpSize16;
+def IMUL32rmi8 : IMulOpMI8_R<Xi32, WriteIMul32Imm>, DefEFLAGS, OpSize32;
+def IMUL64rmi8 : IMulOpMI8_R<Xi64, WriteIMul64Imm>, DefEFLAGS;
+def IMUL16rmi : IMulOpMI_RF<Xi16, WriteIMul16Imm>, OpSize16;
+def IMUL32rmi : IMulOpMI_RF<Xi32, WriteIMul32Imm>, OpSize32;
+def IMUL64rmi32 : IMulOpMI_RF<Xi64, WriteIMul64Imm>;
+
+let Predicates = [In64BitMode] in {
+ def IMUL16rri8_NF : IMulOpRI8_R<Xi16, WriteIMul16Imm>, NF, PD;
+ def IMUL32rri8_NF : IMulOpRI8_R<Xi32, WriteIMul32Imm>, NF;
+ def IMUL64rri8_NF : IMulOpRI8_R<Xi64, WriteIMul64Imm>, NF;
+ def IMUL16rri_NF : IMulOpRI_R<Xi16, WriteIMul16Imm>, NF, PD;
+ def IMUL32rri_NF : IMulOpRI_R<Xi32, WriteIMul32Imm>, NF;
+ def IMUL64rri32_NF : IMulOpRI_R<Xi64, WriteIMul64Imm>, NF;
+ def IMUL16rmi8_NF : IMulOpMI8_R<Xi16, WriteIMul16Imm>, NF, PD;
+ def IMUL32rmi8_NF : IMulOpMI8_R<Xi32, WriteIMul32Imm>, NF;
+ def IMUL64rmi8_NF : IMulOpMI8_R<Xi64, WriteIMul64Imm>, NF;
+ def IMUL16rmi_NF : IMulOpMI_R<Xi16, WriteIMul16Imm>, NF, PD;
+ def IMUL32rmi_NF : IMulOpMI_R<Xi32, WriteIMul32Imm>, NF;
+ def IMUL64rmi32_NF : IMulOpMI_R<Xi64, WriteIMul64Imm>, NF;
+
+ def IMUL16rri8_EVEX : IMulOpRI8_R<Xi16, WriteIMul16Imm>, DefEFLAGS, PL, PD;
+ def IMUL32rri8_EVEX : IMulOpRI8_R<Xi32, WriteIMul32Imm>, DefEFLAGS, PL;
+ def IMUL64rri8_EVEX : IMulOpRI8_R<Xi64, WriteIMul64Imm>, DefEFLAGS, PL;
+ def IMUL16rri_EVEX : IMulOpRI_RF<Xi16, WriteIMul16Imm>, PL, PD;
+ def IMUL32rri_EVEX : IMulOpRI_RF<Xi32, WriteIMul32Imm>, PL;
+ def IMUL64rri32_EVEX : IMulOpRI_RF<Xi64, WriteIMul64Imm>, PL;
+ def IMUL16rmi8_EVEX : IMulOpMI8_R<Xi16, WriteIMul16Imm>, DefEFLAGS, PL, PD;
+ def IMUL32rmi8_EVEX : IMulOpMI8_R<Xi32, WriteIMul32Imm>, DefEFLAGS, PL;
+ def IMUL64rmi8_EVEX : IMulOpMI8_R<Xi64, WriteIMul64Imm>, DefEFLAGS, PL;
+ def IMUL16rmi_EVEX : IMulOpMI_RF<Xi16, WriteIMul16Imm>, PL, PD;
+ def IMUL32rmi_EVEX : IMulOpMI_RF<Xi32, WriteIMul32Imm>, PL;
+ def IMUL64rmi32_EVEX : IMulOpMI_RF<Xi64, WriteIMul64Imm>, PL;
+}
//===----------------------------------------------------------------------===//
// INC and DEC Instructions
//
-class IncOpR_RF<X86TypeInfo t> : UnaryOpR_RF<0xFF, MRM0r, "inc", t, null_frag> {
+class IncOpR_RF<X86TypeInfo t, bit ndd = 0> : UnaryOpR_RF<0xFF, MRM0r, "inc", t, null_frag, ndd> {
let Pattern = [(set t.RegClass:$dst, EFLAGS,
(X86add_flag_nocf t.RegClass:$src1, 1))];
}
-class DecOpR_RF<X86TypeInfo t> : UnaryOpR_RF<0xFF, MRM1r, "dec", t, null_frag> {
+class DecOpR_RF<X86TypeInfo t, bit ndd = 0> : UnaryOpR_RF<0xFF, MRM1r, "dec", t, null_frag, ndd> {
let Pattern = [(set t.RegClass:$dst, EFLAGS,
(X86sub_flag_nocf t.RegClass:$src1, 1))];
}
-class IncOpM_M<X86TypeInfo t> : UnaryOpM_MF<0xFF, MRM0m, "inc", t, null_frag> {
+class IncOpR_R<X86TypeInfo t, bit ndd = 0> : UnaryOpR_R<0xFF, MRM0r, "inc", t, null_frag, ndd>;
+class DecOpR_R<X86TypeInfo t, bit ndd = 0> : UnaryOpR_R<0xFF, MRM1r, "dec", t, null_frag, ndd>;
+class IncOpM_MF<X86TypeInfo t> : UnaryOpM_MF<0xFF, MRM0m, "inc", t, null_frag> {
let Pattern = [(store (add (t.LoadNode addr:$src1), 1), addr:$src1),
(implicit EFLAGS)];
}
-class DecOpM_M<X86TypeInfo t> : UnaryOpM_MF<0xFF, MRM1m, "dec", t, null_frag> {
+class DecOpM_MF<X86TypeInfo t> : UnaryOpM_MF<0xFF, MRM1m, "dec", t, null_frag> {
let Pattern = [(store (add (t.LoadNode addr:$src1), -1), addr:$src1),
(implicit EFLAGS)];
}
+class IncOpM_RF<X86TypeInfo t> : UnaryOpM_RF<0xFF, MRM0m, "inc", t, null_frag> {
+ let Pattern = [(set t.RegClass:$dst, EFLAGS, (add (t.LoadNode addr:$src1), 1))];
+}
+class DecOpM_RF<X86TypeInfo t> : UnaryOpM_RF<0xFF, MRM1m, "dec", t, null_frag> {
+ let Pattern = [(set t.RegClass:$dst, EFLAGS, (add (t.LoadNode addr:$src1), -1))];
+}
+class IncOpM_M<X86TypeInfo t> : UnaryOpM_M<0xFF, MRM0m, "inc", t, null_frag>;
+class DecOpM_M<X86TypeInfo t> : UnaryOpM_M<0xFF, MRM1m, "dec", t, null_frag>;
+class IncOpM_R<X86TypeInfo t> : UnaryOpM_R<0xFF, MRM0m, "inc", t, null_frag>;
+class DecOpM_R<X86TypeInfo t> : UnaryOpM_R<0xFF, MRM1m, "dec", t, null_frag>;
+
// IncDec_Alt - Instructions like "inc reg" short forms.
// Short forms only valid in 32-bit mode. Selected during MCInst lowering.
class IncDec_Alt<bits<8> o, string m, X86TypeInfo t>
: UnaryOpR_RF<o, AddRegFrm, m, t, null_frag>, Requires<[Not64BitMode]>;
let isConvertibleToThreeAddress = 1 in {
-def INC16r_alt : IncDec_Alt<0x40, "inc", Xi16>, OpSize16;
-def INC32r_alt : IncDec_Alt<0x40, "inc", Xi32>, OpSize32;
-def DEC16r_alt : IncDec_Alt<0x48, "dec", Xi16>, OpSize16;
-def DEC32r_alt : IncDec_Alt<0x48, "dec", Xi32>, OpSize32;
-def INC8r : IncOpR_RF<Xi8>;
-def INC16r : IncOpR_RF<Xi16>, OpSize16;
-def INC32r : IncOpR_RF<Xi32>, OpSize32;
-def INC64r : IncOpR_RF<Xi64>;
-def DEC8r : DecOpR_RF<Xi8>;
-def DEC16r : DecOpR_RF<Xi16>, OpSize16;
-def DEC32r : DecOpR_RF<Xi32>, OpSize32;
-def DEC64r : DecOpR_RF<Xi64>;
+ def INC16r_alt : IncDec_Alt<0x40, "inc", Xi16>, OpSize16;
+ def INC32r_alt : IncDec_Alt<0x40, "inc", Xi32>, OpSize32;
+ def DEC16r_alt : IncDec_Alt<0x48, "dec", Xi16>, OpSize16;
+ def DEC32r_alt : IncDec_Alt<0x48, "dec", Xi32>, OpSize32;
+ let Predicates = [NoNDD] in {
+ def INC8r : IncOpR_RF<Xi8>;
+ def INC16r : IncOpR_RF<Xi16>, OpSize16;
+ def INC32r : IncOpR_RF<Xi32>, OpSize32;
+ def INC64r : IncOpR_RF<Xi64>;
+ def DEC8r : DecOpR_RF<Xi8>;
+ def DEC16r : DecOpR_RF<Xi16>, OpSize16;
+ def DEC32r : DecOpR_RF<Xi32>, OpSize32;
+ def DEC64r : DecOpR_RF<Xi64>;
+ }
+ let Predicates = [HasNDD, In64BitMode] in {
+ def INC8r_ND : IncOpR_RF<Xi8, 1>;
+ def INC16r_ND : IncOpR_RF<Xi16, 1>, PD;
+ def INC32r_ND : IncOpR_RF<Xi32, 1>;
+ def INC64r_ND : IncOpR_RF<Xi64, 1>;
+ def DEC8r_ND : DecOpR_RF<Xi8, 1>;
+ def DEC16r_ND : DecOpR_RF<Xi16, 1>, PD;
+ def DEC32r_ND : DecOpR_RF<Xi32, 1>;
+ def DEC64r_ND : DecOpR_RF<Xi64, 1>;
+ }
+ let Predicates = [In64BitMode], Pattern = [(null_frag)] in {
+ def INC8r_NF : IncOpR_R<Xi8>, NF;
+ def INC16r_NF : IncOpR_R<Xi16>, NF, PD;
+ def INC32r_NF : IncOpR_R<Xi32>, NF;
+ def INC64r_NF : IncOpR_R<Xi64>, NF;
+ def DEC8r_NF : DecOpR_R<Xi8>, NF;
+ def DEC16r_NF : DecOpR_R<Xi16>, NF, PD;
+ def DEC32r_NF : DecOpR_R<Xi32>, NF;
+ def DEC64r_NF : DecOpR_R<Xi64>, NF;
+ def INC8r_NF_ND : IncOpR_R<Xi8, 1>, NF;
+ def INC16r_NF_ND : IncOpR_R<Xi16, 1>, NF, PD;
+ def INC32r_NF_ND : IncOpR_R<Xi32, 1>, NF;
+ def INC64r_NF_ND : IncOpR_R<Xi64, 1>, NF;
+ def DEC8r_NF_ND : DecOpR_R<Xi8, 1>, NF;
+ def DEC16r_NF_ND : DecOpR_R<Xi16, 1>, NF, PD;
+ def DEC32r_NF_ND : DecOpR_R<Xi32, 1>, NF;
+ def DEC64r_NF_ND : DecOpR_R<Xi64, 1>, NF;
+ def INC8r_EVEX : IncOpR_RF<Xi8>, PL;
+ def INC16r_EVEX : IncOpR_RF<Xi16>, PL, PD;
+ def INC32r_EVEX : IncOpR_RF<Xi32>, PL;
+ def INC64r_EVEX : IncOpR_RF<Xi64>, PL;
+ def DEC8r_EVEX : DecOpR_RF<Xi8>, PL;
+ def DEC16r_EVEX : DecOpR_RF<Xi16>, PL, PD;
+ def DEC32r_EVEX : DecOpR_RF<Xi32>, PL;
+ def DEC64r_EVEX : DecOpR_RF<Xi64>, PL;
+ }
}
let Predicates = [UseIncDec] in {
-def INC8m : IncOpM_M<Xi8>;
-def INC16m : IncOpM_M<Xi16>, OpSize16;
-def INC32m : IncOpM_M<Xi32>, OpSize32;
-def DEC8m : DecOpM_M<Xi8>;
-def DEC16m : DecOpM_M<Xi16>, OpSize16;
-def DEC32m : DecOpM_M<Xi32>, OpSize32;
+ def INC8m : IncOpM_MF<Xi8>;
+ def INC16m : IncOpM_MF<Xi16>, OpSize16;
+ def INC32m : IncOpM_MF<Xi32>, OpSize32;
+ def DEC8m : DecOpM_MF<Xi8>;
+ def DEC16m : DecOpM_MF<Xi16>, OpSize16;
+ def DEC32m : DecOpM_MF<Xi32>, OpSize32;
}
let Predicates = [UseIncDec, In64BitMode] in {
-def INC64m : IncOpM_M<Xi64>;
-def DEC64m : DecOpM_M<Xi64>;
+ def INC64m : IncOpM_MF<Xi64>;
+ def DEC64m : DecOpM_MF<Xi64>;
+}
+let Predicates = [HasNDD, In64BitMode, UseIncDec] in {
+ def INC8m_ND : IncOpM_RF<Xi8>;
+ def INC16m_ND : IncOpM_RF<Xi16>, PD;
+ def INC32m_ND : IncOpM_RF<Xi32>;
+ def DEC8m_ND : DecOpM_RF<Xi8>;
+ def DEC16m_ND : DecOpM_RF<Xi16>, PD;
+ def DEC32m_ND : DecOpM_RF<Xi32>;
+ def INC64m_ND : IncOpM_RF<Xi64>;
+ def DEC64m_ND : DecOpM_RF<Xi64>;
+}
+let Predicates = [In64BitMode], Pattern = [(null_frag)] in {
+ def INC8m_NF : IncOpM_M<Xi8>, NF;
+ def INC16m_NF : IncOpM_M<Xi16>, NF, PD;
+ def INC32m_NF : IncOpM_M<Xi32>, NF;
+ def INC64m_NF : IncOpM_M<Xi64>, NF;
+ def DEC8m_NF : DecOpM_M<Xi8>, NF;
+ def DEC16m_NF : DecOpM_M<Xi16>, NF, PD;
+ def DEC32m_NF : DecOpM_M<Xi32>, NF;
+ def DEC64m_NF : DecOpM_M<Xi64>, NF;
+ def INC8m_NF_ND : IncOpM_R<Xi8>, NF;
+ def INC16m_NF_ND : IncOpM_R<Xi16>, NF, PD;
+ def INC32m_NF_ND : IncOpM_R<Xi32>, NF;
+ def INC64m_NF_ND : IncOpM_R<Xi64>, NF;
+ def DEC8m_NF_ND : DecOpM_R<Xi8>, NF;
+ def DEC16m_NF_ND : DecOpM_R<Xi16>, NF, PD;
+ def DEC32m_NF_ND : DecOpM_R<Xi32>, NF;
+ def DEC64m_NF_ND : DecOpM_R<Xi64>, NF;
+ def INC8m_EVEX : IncOpM_MF<Xi8>, PL;
+ def INC16m_EVEX : IncOpM_MF<Xi16>, PL, PD;
+ def INC32m_EVEX : IncOpM_MF<Xi32>, PL;
+ def INC64m_EVEX : IncOpM_MF<Xi64>, PL;
+ def DEC8m_EVEX : DecOpM_MF<Xi8>, PL;
+ def DEC16m_EVEX : DecOpM_MF<Xi16>, PL, PD;
+ def DEC32m_EVEX : DecOpM_MF<Xi32>, PL;
+ def DEC64m_EVEX : DecOpM_MF<Xi64>, PL;
}
//===----------------------------------------------------------------------===//
@@ -350,212 +590,212 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
let isCommutable = CommutableRR,
isConvertibleToThreeAddress = ConvertibleToThreeAddressRR in {
let Predicates = [NoNDD] in {
- def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>;
- def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>, OpSize16;
- def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>, OpSize32;
- def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>;
+ def 8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>;
+ def 16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>, OpSize16;
+ def 32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>, OpSize32;
+ def 64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>;
}
let Predicates = [HasNDD, In64BitMode] in {
- def NAME#8rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag, 1>;
- def NAME#16rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag, 1>, PD;
- def NAME#32rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag, 1>;
- def NAME#64rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag, 1>;
- def NAME#8rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi8, 1>, EVEX_NF;
- def NAME#16rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi16, 1>, EVEX_NF, PD;
- def NAME#32rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi32, 1>, EVEX_NF;
- def NAME#64rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi64, 1>, EVEX_NF;
+ def 8rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag, 1>;
+ def 16rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag, 1>, PD;
+ def 32rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag, 1>;
+ def 64rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag, 1>;
+ def 8rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi8, 1>, EVEX_NF;
+ def 16rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi16, 1>, EVEX_NF, PD;
+ def 32rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi32, 1>, EVEX_NF;
+ def 64rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi64, 1>, EVEX_NF;
}
let Predicates = [In64BitMode] in {
- def NAME#8rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi8>, NF;
- def NAME#16rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi16>, NF, PD;
- def NAME#32rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi32>, NF;
- def NAME#64rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi64>, NF;
- def NAME#8rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , null_frag>, PL;
- def NAME#16rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD;
- def NAME#32rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi32, null_frag>, PL;
- def NAME#64rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi64, null_frag>, PL;
+ def 8rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi8>, NF;
+ def 16rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi16>, NF, PD;
+ def 32rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi32>, NF;
+ def 64rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi64>, NF;
+ def 8rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , null_frag>, PL;
+ def 16rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD;
+ def 32rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi32, null_frag>, PL;
+ def 64rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi64, null_frag>, PL;
}
}
- def NAME#8rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>;
- def NAME#16rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16;
- def NAME#32rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32;
- def NAME#64rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>;
+ def 8rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>;
+ def 16rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16;
+ def 32rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32;
+ def 64rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>;
let Predicates = [In64BitMode] in {
- def NAME#8rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>, PL;
- def NAME#16rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>, PL, PD;
- def NAME#32rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>, PL;
- def NAME#64rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>, PL;
- def NAME#8rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8, 1>;
- def NAME#16rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16, 1>, PD;
- def NAME#32rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32, 1>;
- def NAME#64rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64, 1>;
- def NAME#8rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi8>, NF;
- def NAME#16rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi16>, NF, PD;
- def NAME#32rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi32>, NF;
- def NAME#64rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi64>, NF;
- def NAME#8rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi8, 1>, EVEX_NF;
- def NAME#16rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi16, 1>, EVEX_NF, PD;
- def NAME#32rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi32, 1>, EVEX_NF;
- def NAME#64rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi64, 1>, EVEX_NF;
+ def 8rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>, PL;
+ def 16rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>, PL, PD;
+ def 32rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>, PL;
+ def 64rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>, PL;
+ def 8rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8, 1>;
+ def 16rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16, 1>, PD;
+ def 32rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32, 1>;
+ def 64rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64, 1>;
+ def 8rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi8>, NF;
+ def 16rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi16>, NF, PD;
+ def 32rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi32>, NF;
+ def 64rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi64>, NF;
+ def 8rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi8, 1>, EVEX_NF;
+ def 16rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi16, 1>, EVEX_NF, PD;
+ def 32rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi32, 1>, EVEX_NF;
+ def 64rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi64, 1>, EVEX_NF;
}
let Predicates = [NoNDD] in {
- def NAME#8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>;
- def NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>, OpSize16;
- def NAME#32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>, OpSize32;
- def NAME#64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>;
+ def 8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>;
+ def 16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>, OpSize16;
+ def 32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>, OpSize32;
+ def 64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>;
}
let Predicates = [HasNDD, In64BitMode] in {
- def NAME#8rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag, 1>;
- def NAME#16rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag, 1>, PD;
- def NAME#32rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag, 1>;
- def NAME#64rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag, 1>;
- def NAME#8rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi8, 1>, EVEX_NF;
- def NAME#16rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi16, 1>, EVEX_NF, PD;
- def NAME#32rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi32, 1>, EVEX_NF;
- def NAME#64rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi64, 1>, EVEX_NF;
+ def 8rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag, 1>;
+ def 16rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag, 1>, PD;
+ def 32rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag, 1>;
+ def 64rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag, 1>;
+ def 8rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi8, 1>, EVEX_NF;
+ def 16rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi16, 1>, EVEX_NF, PD;
+ def 32rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi32, 1>, EVEX_NF;
+ def 64rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi64, 1>, EVEX_NF;
}
let Predicates = [In64BitMode] in {
- def NAME#8rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi8>, NF;
- def NAME#16rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi16>, NF, PD;
- def NAME#32rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi32>, NF;
- def NAME#64rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi64>, NF;
- def NAME#8rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , null_frag>, PL;
- def NAME#16rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, null_frag>, PL, PD;
- def NAME#32rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, null_frag>, PL;
- def NAME#64rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, null_frag>, PL;
+ def 8rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi8>, NF;
+ def 16rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi16>, NF, PD;
+ def 32rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi32>, NF;
+ def 64rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi64>, NF;
+ def 8rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , null_frag>, PL;
+ def 16rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, null_frag>, PL, PD;
+ def 32rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, null_frag>, PL;
+ def 64rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, null_frag>, PL;
}
let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
let Predicates = [NoNDD] in {
// NOTE: These are order specific, we want the ri8 forms to be listed
// first so that they are slightly preferred to the ri forms.
- def NAME#16ri8 : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16;
- def NAME#32ri8 : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32;
- def NAME#64ri8 : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>;
- def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>;
- def NAME#16ri : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM>, OpSize16;
- def NAME#32ri : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM>, OpSize32;
- def NAME#64ri32: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM>;
+ def 16ri8 : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16;
+ def 32ri8 : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32;
+ def 64ri8 : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>;
+ def 8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>;
+ def 16ri : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM>, OpSize16;
+ def 32ri : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM>, OpSize32;
+ def 64ri32: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM>;
}
let Predicates = [HasNDD, In64BitMode] in {
- def NAME#16ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM, 1>, PD;
- def NAME#32ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM, 1>;
- def NAME#64ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM, 1>;
- def NAME#8ri_ND : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM, 1>;
- def NAME#16ri_ND : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM, 1>, PD;
- def NAME#32ri_ND : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM, 1>;
- def NAME#64ri32_ND: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM, 1>;
- def NAME#16ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD;
- def NAME#32ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM, 1>, EVEX_NF;
- def NAME#64ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM, 1>, EVEX_NF;
- def NAME#8ri_NF_ND : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM, 1>, EVEX_NF;
- def NAME#16ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD;
- def NAME#32ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM, 1>, EVEX_NF;
- def NAME#64ri32_NF_ND : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM, 1>, EVEX_NF;
+ def 16ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM, 1>, PD;
+ def 32ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM, 1>;
+ def 64ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM, 1>;
+ def 8ri_ND : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM, 1>;
+ def 16ri_ND : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM, 1>, PD;
+ def 32ri_ND : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM, 1>;
+ def 64ri32_ND: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM, 1>;
+ def 16ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD;
+ def 32ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM, 1>, EVEX_NF;
+ def 64ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM, 1>, EVEX_NF;
+ def 8ri_NF_ND : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM, 1>, EVEX_NF;
+ def 16ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD;
+ def 32ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM, 1>, EVEX_NF;
+ def 64ri32_NF_ND : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM, 1>, EVEX_NF;
}
let Predicates = [In64BitMode] in {
- def NAME#16ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM>, NF, PD;
- def NAME#32ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM>, NF;
- def NAME#64ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM>, NF;
- def NAME#8ri_NF : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM>, NF;
- def NAME#16ri_NF : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM>, NF, PD;
- def NAME#32ri_NF : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM>, NF;
- def NAME#64ri32_NF : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM>, NF;
- def NAME#16ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, PL, PD;
- def NAME#32ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, PL;
- def NAME#64ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>, PL;
- def NAME#8ri_EVEX : BinOpRI_RF<0x80, mnemonic, Xi8 , null_frag, RegMRM>, PL;
- def NAME#16ri_EVEX : BinOpRI_RF<0x81, mnemonic, Xi16, null_frag, RegMRM>, PL, PD;
- def NAME#32ri_EVEX : BinOpRI_RF<0x81, mnemonic, Xi32, null_frag, RegMRM>, PL;
- def NAME#64ri32_EVEX: BinOpRI_RF<0x81, mnemonic, Xi64, null_frag, RegMRM>, PL;
+ def 16ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM>, NF, PD;
+ def 32ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM>, NF;
+ def 64ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM>, NF;
+ def 8ri_NF : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM>, NF;
+ def 16ri_NF : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM>, NF, PD;
+ def 32ri_NF : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM>, NF;
+ def 64ri32_NF : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM>, NF;
+ def 16ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, PL, PD;
+ def 32ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, PL;
+ def 64ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>, PL;
+ def 8ri_EVEX : BinOpRI_RF<0x80, mnemonic, Xi8 , null_frag, RegMRM>, PL;
+ def 16ri_EVEX : BinOpRI_RF<0x81, mnemonic, Xi16, null_frag, RegMRM>, PL, PD;
+ def 32ri_EVEX : BinOpRI_RF<0x81, mnemonic, Xi32, null_frag, RegMRM>, PL;
+ def 64ri32_EVEX: BinOpRI_RF<0x81, mnemonic, Xi64, null_frag, RegMRM>, PL;
}
}
- def NAME#8mr : BinOpMR_MF<BaseOpc, mnemonic, Xi8 , opnode>;
- def NAME#16mr : BinOpMR_MF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16;
- def NAME#32mr : BinOpMR_MF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32;
- def NAME#64mr : BinOpMR_MF<BaseOpc, mnemonic, Xi64, opnode>;
+ def 8mr : BinOpMR_MF<BaseOpc, mnemonic, Xi8 , opnode>;
+ def 16mr : BinOpMR_MF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16;
+ def 32mr : BinOpMR_MF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32;
+ def 64mr : BinOpMR_MF<BaseOpc, mnemonic, Xi64, opnode>;
let Predicates = [HasNDD, In64BitMode] in {
- def NAME#8mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi8 , opnode>;
- def NAME#16mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi16, opnode>, PD;
- def NAME#32mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi32, opnode>;
- def NAME#64mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi64, opnode>;
- def NAME#8mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi8>, EVEX_NF;
- def NAME#16mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi16>, EVEX_NF, PD;
- def NAME#32mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi32>, EVEX_NF;
- def NAME#64mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi64>, EVEX_NF;
+ def 8mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi8 , opnode>;
+ def 16mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi16, opnode>, PD;
+ def 32mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi32, opnode>;
+ def 64mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi64, opnode>;
+ def 8mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi8>, EVEX_NF;
+ def 16mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi16>, EVEX_NF, PD;
+ def 32mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi32>, EVEX_NF;
+ def 64mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi64>, EVEX_NF;
}
let Predicates = [In64BitMode] in {
- def NAME#8mr_NF : BinOpMR_M<BaseOpc, mnemonic, Xi8>, NF;
- def NAME#16mr_NF : BinOpMR_M<BaseOpc, mnemonic, Xi16>, NF, PD;
- def NAME#32mr_NF : BinOpMR_M<BaseOpc, mnemonic, Xi32>, NF;
- def NAME#64mr_NF : BinOpMR_M<BaseOpc, mnemonic, Xi64>, NF;
- def NAME#8mr_EVEX : BinOpMR_MF<BaseOpc, mnemonic, Xi8 , null_frag>, PL;
- def NAME#16mr_EVEX : BinOpMR_MF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD;
- def NAME#32mr_EVEX : BinOpMR_MF<BaseOpc, mnemonic, Xi32, null_frag>, PL;
- def NAME#64mr_EVEX : BinOpMR_MF<BaseOpc, mnemonic, Xi64, null_frag>, PL;
+ def 8mr_NF : BinOpMR_M<BaseOpc, mnemonic, Xi8>, NF;
+ def 16mr_NF : BinOpMR_M<BaseOpc, mnemonic, Xi16>, NF, PD;
+ def 32mr_NF : BinOpMR_M<BaseOpc, mnemonic, Xi32>, NF;
+ def 64mr_NF : BinOpMR_M<BaseOpc, mnemonic, Xi64>, NF;
+ def 8mr_EVEX : BinOpMR_MF<BaseOpc, mnemonic, Xi8 , null_frag>, PL;
+ def 16mr_EVEX : BinOpMR_MF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD;
+ def 32mr_EVEX : BinOpMR_MF<BaseOpc, mnemonic, Xi32, null_frag>, PL;
+ def 64mr_EVEX : BinOpMR_MF<BaseOpc, mnemonic, Xi64, null_frag>, PL;
}
// NOTE: These are order specific, we want the mi8 forms to be listed
// first so that they are slightly preferred to the mi forms.
- def NAME#16mi8 : BinOpMI8_MF<mnemonic, Xi16, MemMRM>, OpSize16;
- def NAME#32mi8 : BinOpMI8_MF<mnemonic, Xi32, MemMRM>, OpSize32;
+ def 16mi8 : BinOpMI8_MF<mnemonic, Xi16, MemMRM>, OpSize16;
+ def 32mi8 : BinOpMI8_MF<mnemonic, Xi32, MemMRM>, OpSize32;
let Predicates = [In64BitMode] in
- def NAME#64mi8 : BinOpMI8_MF<mnemonic, Xi64, MemMRM>;
- def NAME#8mi : BinOpMI_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>;
- def NAME#16mi : BinOpMI_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16;
- def NAME#32mi : BinOpMI_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32;
+ def 64mi8 : BinOpMI8_MF<mnemonic, Xi64, MemMRM>;
+ def 8mi : BinOpMI_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+ def 16mi : BinOpMI_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16;
+ def 32mi : BinOpMI_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32;
let Predicates = [In64BitMode] in
- def NAME#64mi32 : BinOpMI_MF<0x81, mnemonic, Xi64, opnode, MemMRM>;
+ def 64mi32 : BinOpMI_MF<0x81, mnemonic, Xi64, opnode, MemMRM>;
let Predicates = [HasNDD, In64BitMode] in {
- def NAME#16mi8_ND : BinOpMI8_RF<mnemonic, Xi16, MemMRM>, PD;
- def NAME#32mi8_ND : BinOpMI8_RF<mnemonic, Xi32, MemMRM>;
- def NAME#64mi8_ND : BinOpMI8_RF<mnemonic, Xi64, MemMRM>;
- def NAME#8mi_ND : BinOpMI_RF<0x80, mnemonic, Xi8 , opnode, MemMRM>;
- def NAME#16mi_ND : BinOpMI_RF<0x81, mnemonic, Xi16, opnode, MemMRM>, PD;
- def NAME#32mi_ND : BinOpMI_RF<0x81, mnemonic, Xi32, opnode, MemMRM>;
- def NAME#64mi32_ND : BinOpMI_RF<0x81, mnemonic, Xi64, opnode, MemMRM>;
- def NAME#16mi8_NF_ND : BinOpMI8_R<mnemonic, Xi16, MemMRM>, NF, PD;
- def NAME#32mi8_NF_ND : BinOpMI8_R<mnemonic, Xi32, MemMRM>, NF;
- def NAME#64mi8_NF_ND : BinOpMI8_R<mnemonic, Xi64, MemMRM>, NF;
- def NAME#8mi_NF_ND : BinOpMI_R<0x80, mnemonic, Xi8, MemMRM>, NF;
- def NAME#16mi_NF_ND : BinOpMI_R<0x81, mnemonic, Xi16, MemMRM>, NF, PD;
- def NAME#32mi_NF_ND : BinOpMI_R<0x81, mnemonic, Xi32, MemMRM>, NF;
- def NAME#64mi32_NF_ND : BinOpMI_R<0x81, mnemonic, Xi64, MemMRM>, NF;
+ def 16mi8_ND : BinOpMI8_RF<mnemonic, Xi16, MemMRM>, PD;
+ def 32mi8_ND : BinOpMI8_RF<mnemonic, Xi32, MemMRM>;
+ def 64mi8_ND : BinOpMI8_RF<mnemonic, Xi64, MemMRM>;
+ def 8mi_ND : BinOpMI_RF<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+ def 16mi_ND : BinOpMI_RF<0x81, mnemonic, Xi16, opnode, MemMRM>, PD;
+ def 32mi_ND : BinOpMI_RF<0x81, mnemonic, Xi32, opnode, MemMRM>;
+ def 64mi32_ND : BinOpMI_RF<0x81, mnemonic, Xi64, opnode, MemMRM>;
+ def 16mi8_NF_ND : BinOpMI8_R<mnemonic, Xi16, MemMRM>, NF, PD;
+ def 32mi8_NF_ND : BinOpMI8_R<mnemonic, Xi32, MemMRM>, NF;
+ def 64mi8_NF_ND : BinOpMI8_R<mnemonic, Xi64, MemMRM>, NF;
+ def 8mi_NF_ND : BinOpMI_R<0x80, mnemonic, Xi8, MemMRM>, NF;
+ def 16mi_NF_ND : BinOpMI_R<0x81, mnemonic, Xi16, MemMRM>, NF, PD;
+ def 32mi_NF_ND : BinOpMI_R<0x81, mnemonic, Xi32, MemMRM>, NF;
+ def 64mi32_NF_ND : BinOpMI_R<0x81, mnemonic, Xi64, MemMRM>, NF;
}
let Predicates = [In64BitMode] in {
- def NAME#16mi8_NF : BinOpMI8_M<mnemonic, Xi16, MemMRM>, NF, PD;
- def NAME#32mi8_NF : BinOpMI8_M<mnemonic, Xi32, MemMRM>, NF;
- def NAME#64mi8_NF : BinOpMI8_M<mnemonic, Xi64, MemMRM>, NF;
- def NAME#8mi_NF : BinOpMI_M<0x80, mnemonic, Xi8, MemMRM>, NF;
- def NAME#16mi_NF : BinOpMI_M<0x81, mnemonic, Xi16, MemMRM>, NF, PD;
- def NAME#32mi_NF : BinOpMI_M<0x81, mnemonic, Xi32, MemMRM>, NF;
- def NAME#64mi32_NF : BinOpMI_M<0x81, mnemonic, Xi64, MemMRM>, NF;
- def NAME#16mi8_EVEX : BinOpMI8_MF<mnemonic, Xi16, MemMRM>, PL, PD;
- def NAME#32mi8_EVEX : BinOpMI8_MF<mnemonic, Xi32, MemMRM>, PL;
- def NAME#64mi8_EVEX : BinOpMI8_MF<mnemonic, Xi64, MemMRM>, PL;
- def NAME#8mi_EVEX : BinOpMI_MF<0x80, mnemonic, Xi8 , null_frag, MemMRM>, PL;
- def NAME#16mi_EVEX : BinOpMI_MF<0x81, mnemonic, Xi16, null_frag, MemMRM>, PL, PD;
- def NAME#32mi_EVEX : BinOpMI_MF<0x81, mnemonic, Xi32, null_frag, MemMRM>, PL;
- def NAME#64mi32_EVEX : BinOpMI_MF<0x81, mnemonic, Xi64, null_frag, MemMRM>, PL;
+ def 16mi8_NF : BinOpMI8_M<mnemonic, Xi16, MemMRM>, NF, PD;
+ def 32mi8_NF : BinOpMI8_M<mnemonic, Xi32, MemMRM>, NF;
+ def 64mi8_NF : BinOpMI8_M<mnemonic, Xi64, MemMRM>, NF;
+ def 8mi_NF : BinOpMI_M<0x80, mnemonic, Xi8, MemMRM>, NF;
+ def 16mi_NF : BinOpMI_M<0x81, mnemonic, Xi16, MemMRM>, NF, PD;
+ def 32mi_NF : BinOpMI_M<0x81, mnemonic, Xi32, MemMRM>, NF;
+ def 64mi32_NF : BinOpMI_M<0x81, mnemonic, Xi64, MemMRM>, NF;
+ def 16mi8_EVEX : BinOpMI8_MF<mnemonic, Xi16, MemMRM>, PL, PD;
+ def 32mi8_EVEX : BinOpMI8_MF<mnemonic, Xi32, MemMRM>, PL;
+ def 64mi8_EVEX : BinOpMI8_MF<mnemonic, Xi64, MemMRM>, PL;
+ def 8mi_EVEX : BinOpMI_MF<0x80, mnemonic, Xi8 , null_frag, MemMRM>, PL;
+ def 16mi_EVEX : BinOpMI_MF<0x81, mnemonic, Xi16, null_frag, MemMRM>, PL, PD;
+ def 32mi_EVEX : BinOpMI_MF<0x81, mnemonic, Xi32, null_frag, MemMRM>, PL;
+ def 64mi32_EVEX : BinOpMI_MF<0x81, mnemonic, Xi64, null_frag, MemMRM>, PL;
}
// These are for the disassembler since 0x82 opcode behaves like 0x80, but
// not in 64-bit mode.
let Predicates = [Not64BitMode] in {
- def NAME#8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly;
- def NAME#8mi8 : BinOpMI8_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly;
+ def 8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly;
+ def 8mi8 : BinOpMI8_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly;
}
- def NAME#8i8 : BinOpAI_AF<BaseOpc4, mnemonic, Xi8 , AL,
+ def 8i8 : BinOpAI_AF<BaseOpc4, mnemonic, Xi8 , AL,
"{$src, %al|al, $src}">;
- def NAME#16i16 : BinOpAI_AF<BaseOpc4, mnemonic, Xi16, AX,
+ def 16i16 : BinOpAI_AF<BaseOpc4, mnemonic, Xi16, AX,
"{$src, %ax|ax, $src}">, OpSize16;
- def NAME#32i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi32, EAX,
+ def 32i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi32, EAX,
"{$src, %eax|eax, $src}">, OpSize32;
- def NAME#64i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi64, RAX,
+ def 64i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi64, RAX,
"{$src, %rax|rax, $src}">;
}
@@ -571,162 +811,162 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
bit ConvertibleToThreeAddress> {
let isCommutable = CommutableRR in {
let Predicates = [NoNDD] in {
- def NAME#8rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi8 , opnode>;
+ def 8rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi8 , opnode>;
let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
- def NAME#16rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16;
- def NAME#32rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32;
- def NAME#64rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi64, opnode>;
+ def 16rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16;
+ def 32rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32;
+ def 64rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi64, opnode>;
}
}
let Predicates = [HasNDD, In64BitMode] in {
- def NAME#8rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi8 , opnode, 1>;
+ def 8rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi8 , opnode, 1>;
let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
- def NAME#16rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, opnode, 1>, PD;
- def NAME#32rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, opnode, 1>;
- def NAME#64rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi64, opnode, 1>;
+ def 16rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, opnode, 1>, PD;
+ def 32rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, opnode, 1>;
+ def 64rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi64, opnode, 1>;
}
}
} // isCommutable
let Predicates = [In64BitMode] in {
- def NAME#8rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi8 , null_frag>, PL;
- def NAME#16rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD;
- def NAME#32rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, null_frag>, PL;
- def NAME#64rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi64, null_frag>, PL;
+ def 8rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi8 , null_frag>, PL;
+ def 16rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD;
+ def 32rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, null_frag>, PL;
+ def 64rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi64, null_frag>, PL;
}
- def NAME#8rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi8>;
- def NAME#16rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16;
- def NAME#32rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32;
- def NAME#64rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi64>;
+ def 8rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi8>;
+ def 16rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16;
+ def 32rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32;
+ def 64rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi64>;
let Predicates = [In64BitMode] in {
- def NAME#8rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi8, 1>;
- def NAME#16rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16, 1>, PD;
- def NAME#32rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32, 1>;
- def NAME#64rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi64, 1>;
- def NAME#8rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi8>, PL;
- def NAME#16rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16>, PL, PD;
- def NAME#32rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32>, PL;
- def NAME#64rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi64>, PL;
+ def 8rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi8, 1>;
+ def 16rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16, 1>, PD;
+ def 32rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32, 1>;
+ def 64rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi64, 1>;
+ def 8rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi8>, PL;
+ def 16rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16>, PL, PD;
+ def 32rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32>, PL;
+ def 64rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi64>, PL;
}
let Predicates = [NoNDD] in {
- def NAME#8rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi8 , opnode>;
- def NAME#16rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode>, OpSize16;
- def NAME#32rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode>, OpSize32;
- def NAME#64rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi64, opnode>;
+ def 8rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi8 , opnode>;
+ def 16rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode>, OpSize16;
+ def 32rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode>, OpSize32;
+ def 64rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi64, opnode>;
}
let Predicates = [HasNDD, In64BitMode] in {
- def NAME#8rm_ND : BinOpRMF_RF<BaseOpc2, mnemonic, Xi8 , opnode, 1>;
- def NAME#16rm_ND : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode, 1>, PD;
- def NAME#32rm_ND : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode, 1>;
- def NAME#64rm_ND : BinOpRMF_RF<BaseOpc2, mnemonic, Xi64, opnode, 1>;
+ def 8rm_ND : BinOpRMF_RF<BaseOpc2, mnemonic, Xi8 , opnode, 1>;
+ def 16rm_ND : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode, 1>, PD;
+ def 32rm_ND : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode, 1>;
+ def 64rm_ND : BinOpRMF_RF<BaseOpc2, mnemonic, Xi64, opnode, 1>;
}
let Predicates = [In64BitMode] in {
- def NAME#8rm_EVEX : BinOpRMF_RF<BaseOpc2, mnemonic, Xi8 , opnode>, PL;
- def NAME#16rm_EVEX : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode>, PL, PD;
- def NAME#32rm_EVEX : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode>, PL;
- def NAME#64rm_EVEX : BinOpRMF_RF<BaseOpc2, mnemonic, Xi64, opnode>, PL;
+ def 8rm_EVEX : BinOpRMF_RF<BaseOpc2, mnemonic, Xi8 , opnode>, PL;
+ def 16rm_EVEX : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode>, PL, PD;
+ def 32rm_EVEX : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode>, PL;
+ def 64rm_EVEX : BinOpRMF_RF<BaseOpc2, mnemonic, Xi64, opnode>, PL;
}
let Predicates = [NoNDD] in {
- def NAME#8ri : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM>;
+ def 8ri : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM>;
let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
// NOTE: These are order specific, we want the ri8 forms to be listed
// first so that they are slightly preferred to the ri forms.
- def NAME#16ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16;
- def NAME#32ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32;
- def NAME#64ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM>;
+ def 16ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16;
+ def 32ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32;
+ def 64ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM>;
- def NAME#16ri : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16;
- def NAME#32ri : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32;
- def NAME#64ri32: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM>;
+ def 16ri : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16;
+ def 32ri : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32;
+ def 64ri32: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM>;
}
}
let Predicates = [HasNDD, In64BitMode] in {
- def NAME#8ri_ND : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM, 1>;
+ def 8ri_ND : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM, 1>;
let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
- def NAME#16ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM, 1>, PD;
- def NAME#32ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM, 1>;
- def NAME#64ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM, 1>;
- def NAME#16ri_ND : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM, 1>, PD;
- def NAME#32ri_ND : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM, 1>;
- def NAME#64ri32_ND: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM, 1>;
+ def 16ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM, 1>, PD;
+ def 32ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM, 1>;
+ def 64ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM, 1>;
+ def 16ri_ND : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM, 1>, PD;
+ def 32ri_ND : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM, 1>;
+ def 64ri32_ND: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM, 1>;
}
}
let Predicates = [In64BitMode] in {
- def NAME#8ri_EVEX : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM>, PL;
- def NAME#16ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM>, PL, PD;
- def NAME#32ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM>, PL;
- def NAME#64ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM>, PL;
- def NAME#16ri_EVEX : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM>, PL, PD;
- def NAME#32ri_EVEX : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM>, PL;
- def NAME#64ri32_EVEX: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM>, PL;
+ def 8ri_EVEX : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM>, PL;
+ def 16ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM>, PL, PD;
+ def 32ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM>, PL;
+ def 64ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM>, PL;
+ def 16ri_EVEX : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM>, PL, PD;
+ def 32ri_EVEX : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM>, PL;
+ def 64ri32_EVEX: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM>, PL;
}
- def NAME#8mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi8 , opnode>;
- def NAME#16mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16;
- def NAME#32mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32;
- def NAME#64mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi64, opnode>;
+ def 8mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi8 , opnode>;
+ def 16mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16;
+ def 32mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32;
+ def 64mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi64, opnode>;
let Predicates = [HasNDD, In64BitMode] in {
- def NAME#8mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi8 , opnode>;
- def NAME#16mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi16, opnode>, PD;
- def NAME#32mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi32, opnode>;
- def NAME#64mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi64, opnode>;
+ def 8mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi8 , opnode>;
+ def 16mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi16, opnode>, PD;
+ def 32mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi32, opnode>;
+ def 64mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi64, opnode>;
}
let Predicates = [In64BitMode] in {
- def NAME#8mr_EVEX : BinOpMRF_MF<BaseOpc, mnemonic, Xi8 , null_frag>, PL;
- def NAME#16mr_EVEX : BinOpMRF_MF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD;
- def NAME#32mr_EVEX : BinOpMRF_MF<BaseOpc, mnemonic, Xi32, null_frag>, PL;
- def NAME#64mr_EVEX : BinOpMRF_MF<BaseOpc, mnemonic, Xi64, null_frag>, PL;
+ def 8mr_EVEX : BinOpMRF_MF<BaseOpc, mnemonic, Xi8 , null_frag>, PL;
+ def 16mr_EVEX : BinOpMRF_MF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD;
+ def 32mr_EVEX : BinOpMRF_MF<BaseOpc, mnemonic, Xi32, null_frag>, PL;
+ def 64mr_EVEX : BinOpMRF_MF<BaseOpc, mnemonic, Xi64, null_frag>, PL;
}
// NOTE: These are order specific, we want the mi8 forms to be listed
// first so that they are slightly preferred to the mi forms.
- def NAME#8mi : BinOpMIF_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>;
- def NAME#16mi8 : BinOpMI8F_MF<mnemonic, Xi16, MemMRM>, OpSize16;
- def NAME#32mi8 : BinOpMI8F_MF<mnemonic, Xi32, MemMRM>, OpSize32;
+ def 8mi : BinOpMIF_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+ def 16mi8 : BinOpMI8F_MF<mnemonic, Xi16, MemMRM>, OpSize16;
+ def 32mi8 : BinOpMI8F_MF<mnemonic, Xi32, MemMRM>, OpSize32;
let Predicates = [In64BitMode] in
- def NAME#64mi8 : BinOpMI8F_MF<mnemonic, Xi64, MemMRM>;
- def NAME#16mi : BinOpMIF_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16;
- def NAME#32mi : BinOpMIF_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32;
+ def 64mi8 : BinOpMI8F_MF<mnemonic, Xi64, MemMRM>;
+ def 16mi : BinOpMIF_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16;
+ def 32mi : BinOpMIF_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32;
let Predicates = [In64BitMode] in
- def NAME#64mi32 : BinOpMIF_MF<0x81, mnemonic, Xi64, opnode, MemMRM>;
+ def 64mi32 : BinOpMIF_MF<0x81, mnemonic, Xi64, opnode, MemMRM>;
let Predicates = [HasNDD, In64BitMode] in {
- def NAME#8mi_ND : BinOpMIF_RF<0x80, mnemonic, Xi8 , opnode, MemMRM>;
- def NAME#16mi8_ND : BinOpMI8F_RF<mnemonic, Xi16, MemMRM>, PD;
- def NAME#32mi8_ND : BinOpMI8F_RF<mnemonic, Xi32, MemMRM>;
- def NAME#64mi8_ND : BinOpMI8F_RF<mnemonic, Xi64, MemMRM>;
- def NAME#16mi_ND : BinOpMIF_RF<0x81, mnemonic, Xi16, opnode, MemMRM>, PD;
- def NAME#32mi_ND : BinOpMIF_RF<0x81, mnemonic, Xi32, opnode, MemMRM>;
- def NAME#64mi32_ND : BinOpMIF_RF<0x81, mnemonic, Xi64, opnode, MemMRM>;
+ def 8mi_ND : BinOpMIF_RF<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+ def 16mi8_ND : BinOpMI8F_RF<mnemonic, Xi16, MemMRM>, PD;
+ def 32mi8_ND : BinOpMI8F_RF<mnemonic, Xi32, MemMRM>;
+ def 64mi8_ND : BinOpMI8F_RF<mnemonic, Xi64, MemMRM>;
+ def 16mi_ND : BinOpMIF_RF<0x81, mnemonic, Xi16, opnode, MemMRM>, PD;
+ def 32mi_ND : BinOpMIF_RF<0x81, mnemonic, Xi32, opnode, MemMRM>;
+ def 64mi32_ND : BinOpMIF_RF<0x81, mnemonic, Xi64, opnode, MemMRM>;
}
let Predicates = [In64BitMode] in {
- def NAME#8mi_EVEX : BinOpMIF_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>, PL;
- def NAME#16mi8_EVEX : BinOpMI8F_MF<mnemonic, Xi16, MemMRM>, PL, PD;
- def NAME#32mi8_EVEX : BinOpMI8F_MF<mnemonic, Xi32, MemMRM>, PL;
- def NAME#64mi8_EVEX : BinOpMI8F_MF<mnemonic, Xi64, MemMRM>, PL;
- def NAME#16mi_EVEX : BinOpMIF_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, PL, PD;
- def NAME#32mi_EVEX : BinOpMIF_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, PL;
- def NAME#64mi32_EVEX : BinOpMIF_MF<0x81, mnemonic, Xi64, opnode, MemMRM>, PL;
+ def 8mi_EVEX : BinOpMIF_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>, PL;
+ def 16mi8_EVEX : BinOpMI8F_MF<mnemonic, Xi16, MemMRM>, PL, PD;
+ def 32mi8_EVEX : BinOpMI8F_MF<mnemonic, Xi32, MemMRM>, PL;
+ def 64mi8_EVEX : BinOpMI8F_MF<mnemonic, Xi64, MemMRM>, PL;
+ def 16mi_EVEX : BinOpMIF_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, PL, PD;
+ def 32mi_EVEX : BinOpMIF_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, PL;
+ def 64mi32_EVEX : BinOpMIF_MF<0x81, mnemonic, Xi64, opnode, MemMRM>, PL;
}
// These are for the disassembler since 0x82 opcode behaves like 0x80, but
// not in 64-bit mode.
let Predicates = [Not64BitMode] in {
- def NAME#8ri8 : BinOpRI8F_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly;
- def NAME#8mi8 : BinOpMI8F_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly;
+ def 8ri8 : BinOpRI8F_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly;
+ def 8mi8 : BinOpMI8F_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly;
}
- def NAME#8i8 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi8 , AL,
+ def 8i8 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi8 , AL,
"{$src, %al|al, $src}">;
- def NAME#16i16 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi16, AX,
+ def 16i16 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi16, AX,
"{$src, %ax|ax, $src}">, OpSize16;
- def NAME#32i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi32, EAX,
+ def 32i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi32, EAX,
"{$src, %eax|eax, $src}">, OpSize32;
- def NAME#64i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi64, RAX,
+ def 64i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi64, RAX,
"{$src, %rax|rax, $src}">;
}
@@ -739,71 +979,71 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
SDNode opnode, bit CommutableRR,
bit ConvertibleToThreeAddress> {
let isCommutable = CommutableRR in {
- def NAME#8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>;
+ def 8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>;
let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
- def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>, OpSize16;
- def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>, OpSize32;
- def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>;
+ def 16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>, OpSize16;
+ def 32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>, OpSize32;
+ def 64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>;
} // isConvertibleToThreeAddress
} // isCommutable
- def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>;
- def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16;
- def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32;
- def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>;
+ def 8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>;
+ def 16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16;
+ def 32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32;
+ def 64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>;
- def NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>;
- def NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>, OpSize16;
- def NAME#32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>, OpSize32;
- def NAME#64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>;
+ def 8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>;
+ def 16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>, OpSize16;
+ def 32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>, OpSize32;
+ def 64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>;
- def NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>;
+ def 8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>;
let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
// NOTE: These are order specific, we want the ri8 forms to be listed
// first so that they are slightly preferred to the ri forms.
- def NAME#16ri8 : BinOpRI8_F<0x83, mnemonic, Xi16, RegMRM>, OpSize16;
- def NAME#32ri8 : BinOpRI8_F<0x83, mnemonic, Xi32, RegMRM>, OpSize32;
- def NAME#64ri8 : BinOpRI8_F<0x83, mnemonic, Xi64, RegMRM>;
+ def 16ri8 : BinOpRI8_F<0x83, mnemonic, Xi16, RegMRM>, OpSize16;
+ def 32ri8 : BinOpRI8_F<0x83, mnemonic, Xi32, RegMRM>, OpSize32;
+ def 64ri8 : BinOpRI8_F<0x83, mnemonic, Xi64, RegMRM>;
- def NAME#16ri : BinOpRI_F<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16;
- def NAME#32ri : BinOpRI_F<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32;
- def NAME#64ri32: BinOpRI_F<0x81, mnemonic, Xi64, opnode, RegMRM>;
+ def 16ri : BinOpRI_F<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16;
+ def 32ri : BinOpRI_F<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32;
+ def 64ri32: BinOpRI_F<0x81, mnemonic, Xi64, opnode, RegMRM>;
}
- def NAME#8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>;
- def NAME#16mr : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>, OpSize16;
- def NAME#32mr : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>, OpSize32;
- def NAME#64mr : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>;
+ def 8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>;
+ def 16mr : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>, OpSize16;
+ def 32mr : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>, OpSize32;
+ def 64mr : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>;
// NOTE: These are order specific, we want the mi8 forms to be listed
// first so that they are slightly preferred to the mi forms.
- def NAME#16mi8 : BinOpMI8_F<mnemonic, Xi16, MemMRM>, OpSize16;
- def NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, MemMRM>, OpSize32;
+ def 16mi8 : BinOpMI8_F<mnemonic, Xi16, MemMRM>, OpSize16;
+ def 32mi8 : BinOpMI8_F<mnemonic, Xi32, MemMRM>, OpSize32;
let Predicates = [In64BitMode] in
- def NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, MemMRM>;
+ def 64mi8 : BinOpMI8_F<mnemonic, Xi64, MemMRM>;
- def NAME#8mi : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>;
- def NAME#16mi : BinOpMI_F<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16;
- def NAME#32mi : BinOpMI_F<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32;
+ def 8mi : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+ def 16mi : BinOpMI_F<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16;
+ def 32mi : BinOpMI_F<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32;
let Predicates = [In64BitMode] in
- def NAME#64mi32 : BinOpMI_F<0x81, mnemonic, Xi64, opnode, MemMRM>;
+ def 64mi32 : BinOpMI_F<0x81, mnemonic, Xi64, opnode, MemMRM>;
// These are for the disassembler since 0x82 opcode behaves like 0x80, but
// not in 64-bit mode.
let Predicates = [Not64BitMode] in {
- def NAME#8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly;
+ def 8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly;
let mayLoad = 1 in
- def NAME#8mi8 : BinOpMI8_F<mnemonic, Xi8, MemMRM>;
+ def 8mi8 : BinOpMI8_F<mnemonic, Xi8, MemMRM>;
}
- def NAME#8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL,
+ def 8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL,
"{$src, %al|al, $src}">;
- def NAME#16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX,
+ def 16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX,
"{$src, %ax|ax, $src}">, OpSize16;
- def NAME#32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX,
+ def 32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX,
"{$src, %eax|eax, $src}">, OpSize32;
- def NAME#64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX,
+ def 64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX,
"{$src, %rax|rax, $src}">;
}
@@ -1119,14 +1359,34 @@ defm MULX64 : MulX<Xi64, WriteMULX64>, REX_W;
// We don't have patterns for these as there is no advantage over ADC for
// most code.
let Form = MRMSrcReg in {
-def ADCX32rr : BinOpRRF_RF<0xF6, "adcx", Xi32, null_frag>, T8, PD;
-def ADCX64rr : BinOpRRF_RF<0xF6, "adcx", Xi64, null_frag>, T8, PD;
-def ADOX32rr : BinOpRRF_RF<0xF6, "adox", Xi32, null_frag>, T8, XS;
-def ADOX64rr : BinOpRRF_RF<0xF6, "adox", Xi64, null_frag>, T8, XS;
+ def ADCX32rr : BinOpRRF_RF<0xF6, "adcx", Xi32>, T8, PD;
+ def ADCX64rr : BinOpRRF_RF<0xF6, "adcx", Xi64>, T8, PD;
+ def ADOX32rr : BinOpRRF_RF<0xF6, "adox", Xi32>, T8, XS;
+ def ADOX64rr : BinOpRRF_RF<0xF6, "adox", Xi64>, T8, XS;
+ let Predicates =[In64BitMode] in {
+ def ADCX32rr_EVEX : BinOpRRF_RF<0x66, "adcx", Xi32>, EVEX, T_MAP4, PD;
+ def ADCX64rr_EVEX : BinOpRRF_RF<0x66, "adcx", Xi64>, EVEX, T_MAP4, PD;
+ def ADOX32rr_EVEX : BinOpRRF_RF<0x66, "adox", Xi32>, EVEX, T_MAP4, XS;
+ def ADOX64rr_EVEX : BinOpRRF_RF<0x66, "adox", Xi64>, EVEX, T_MAP4, XS;
+ def ADCX32rr_ND : BinOpRRF_RF<0x66, "adcx", Xi32, null_frag, 1>, PD;
+ def ADCX64rr_ND : BinOpRRF_RF<0x66, "adcx", Xi64, null_frag, 1>, PD;
+ def ADOX32rr_ND : BinOpRRF_RF<0x66, "adox", Xi32, null_frag, 1>, XS;
+ def ADOX64rr_ND : BinOpRRF_RF<0x66, "adox", Xi64, null_frag, 1>, XS;
+ }
}
let Form = MRMSrcMem in {
-def ADCX32rm : BinOpRMF_RF<0xF6, "adcx", Xi32, null_frag>, T8, PD;
-def ADCX64rm : BinOpRMF_RF<0xF6, "adcx", Xi64, null_frag>, T8, PD;
-def ADOX32rm : BinOpRMF_RF<0xF6, "adox", Xi32, null_frag>, T8, XS;
-def ADOX64rm : BinOpRMF_RF<0xF6, "adox", Xi64, null_frag>, T8, XS;
+ def ADCX32rm : BinOpRMF_RF<0xF6, "adcx", Xi32>, T8, PD;
+ def ADCX64rm : BinOpRMF_RF<0xF6, "adcx", Xi64>, T8, PD;
+ def ADOX32rm : BinOpRMF_RF<0xF6, "adox", Xi32>, T8, XS;
+ def ADOX64rm : BinOpRMF_RF<0xF6, "adox", Xi64>, T8, XS;
+ let Predicates =[In64BitMode] in {
+ def ADCX32rm_EVEX : BinOpRMF_RF<0x66, "adcx", Xi32>, EVEX, T_MAP4, PD;
+ def ADCX64rm_EVEX : BinOpRMF_RF<0x66, "adcx", Xi64>, EVEX, T_MAP4, PD;
+ def ADOX32rm_EVEX : BinOpRMF_RF<0x66, "adox", Xi32>, EVEX, T_MAP4, XS;
+ def ADOX64rm_EVEX : BinOpRMF_RF<0x66, "adox", Xi64>, EVEX, T_MAP4, XS;
+ def ADCX32rm_ND : BinOpRMF_RF<0x66, "adcx", Xi32, null_frag, 1>, PD;
+ def ADCX64rm_ND : BinOpRMF_RF<0x66, "adcx", Xi64, null_frag, 1>, PD;
+ def ADOX32rm_ND : BinOpRMF_RF<0x66, "adox", Xi32, null_frag, 1>, XS;
+ def ADOX64rm_ND : BinOpRMF_RF<0x66, "adox", Xi64, null_frag, 1>, XS;
+ }
}
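
(Aside, not part of the patch: ADCX propagates a carry through CF only, while ADOX uses OF only, so multi-precision code can interleave two independent carry chains. Below is a minimal C++ sketch of the kind of code these instructions target, using the ADX intrinsic _addcarryx_u64 from <immintrin.h>; whether a compiler actually emits ADCX/ADOX or plain ADC depends on the target and optimizer, and the add_u256 helper is illustrative only.)

// Illustrative only: a 256-bit add written with the ADX carry intrinsic.
// Build with e.g. clang++ -madx. The carry chains through CF from limb to
// limb, which is exactly the dependency ADCX/ADOX were added to express.
#include <immintrin.h>

static void add_u256(unsigned long long dst[4],
                     const unsigned long long a[4],
                     const unsigned long long b[4]) {
  unsigned char c = 0;
  c = _addcarryx_u64(c, a[0], b[0], &dst[0]);
  c = _addcarryx_u64(c, a[1], b[1], &dst[1]);
  c = _addcarryx_u64(c, a[2], b[2], &dst[2]);
  (void)_addcarryx_u64(c, a[3], b[3], &dst[3]);
}

On an APX target, the _ND variants defined above would additionally allow the sum to land in a register distinct from both inputs, which is why those defs pass the extra NDD argument.
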
diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td
index 6e76b44b66a3..8798b13a1761 100644
--- a/llvm/lib/Target/X86/X86InstrFormats.td
+++ b/llvm/lib/Target/X86/X86InstrFormats.td
@@ -247,8 +247,6 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
bit hasREPPrefix = 0; // Does this inst have a REP prefix?
bits<2> OpEncBits = OpEnc.Value;
bit IgnoresW = 0; // Does this inst ignore REX_W field?
- bit EVEX_W1_VEX_W0 = 0; // This EVEX inst with VEX.W==1 can become a VEX
- // instruction with VEX.W == 0.
bit hasVEX_4V = 0; // Does this inst require the VEX.VVVV field?
bit hasVEX_L = 0; // Does this inst use large (256-bit) registers?
bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit
@@ -279,10 +277,6 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
CD8_EltSize,
!srl(VectSize, CD8_Form{1-0}))), 0);
- // Used to prevent an explicit EVEX2VEX override for this instruction.
- string EVEX2VEXOverride = ?;
-
- bit notEVEX2VEXConvertible = 0; // Prevent EVEX->VEX conversion.
ExplicitOpPrefix explicitOpPrefix = NoExplicitOpPrefix;
bits<2> explicitOpPrefixBits = explicitOpPrefix.Value;
// TSFlags layout should be kept in sync with X86BaseInfo.h.
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index eac8d79eb8a3..eb0734f9a618 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -29,8 +29,10 @@ class X86Subtarget;
namespace X86 {
enum AsmComments {
+ // For instr that was compressed from EVEX to LEGACY.
+ AC_EVEX_2_LEGACY = MachineInstr::TAsmComments,
// For instr that was compressed from EVEX to VEX.
- AC_EVEX_2_VEX = MachineInstr::TAsmComments
+ AC_EVEX_2_VEX = AC_EVEX_2_LEGACY << 1
};
/// Return a pair of condition code for the given predicate and whether
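
(Aside: the enum now carries two single-bit flags, with AC_EVEX_2_VEX shifted one position past AC_EVEX_2_LEGACY, so a printer can test each independently. The following is a hedged C++ sketch of a possible consumer, modeled loosely on X86's MCInst lowering; the helper name and comment strings are made up for illustration, while MachineInstr::getAsmPrinterFlags() and MCStreamer::AddComment() are existing LLVM APIs.)

// Sketch only: annotate instructions that were re-encoded during EVEX
// compression, based on which asm-comment bit was set earlier in codegen.
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/MC/MCStreamer.h"
#include "X86InstrInfo.h"

static void addCompressionComment(const llvm::MachineInstr &MI,
                                  llvm::MCStreamer &OS) {
  if (MI.getAsmPrinterFlags() & llvm::X86::AC_EVEX_2_LEGACY)
    OS.AddComment("EVEX TO LEGACY Compression", /*EOL=*/false);
  else if (MI.getAsmPrinterFlags() & llvm::X86::AC_EVEX_2_VEX)
    OS.AddComment("EVEX TO VEX Compression", /*EOL=*/false);
}
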
diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td
index 305bd74f7bd7..97c625a64cfc 100644
--- a/llvm/lib/Target/X86/X86InstrMisc.td
+++ b/llvm/lib/Target/X86/X86InstrMisc.td
@@ -1212,36 +1212,33 @@ let Predicates = [HasBMI], Defs = [EFLAGS] in {
(implicit EFLAGS)]>, TB, XS, Sched<[WriteTZCNTLd]>;
}
-multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM,
- RegisterClass RC, X86MemOperand x86memop,
- X86FoldableSchedWrite sched, string Suffix = ""> {
-let hasSideEffects = 0 in {
- def rr#Suffix : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src),
- !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>,
- T8, VEX, VVVV, Sched<[sched]>;
- let mayLoad = 1 in
- def rm#Suffix : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src),
- !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>,
- T8, VEX, VVVV, Sched<[sched.Folded]>;
-}
+multiclass Bls<string m, Format RegMRM, Format MemMRM, X86TypeInfo t, string Suffix = ""> {
+ let SchedRW = [WriteBLS] in {
+ def rr#Suffix : UnaryOpR<0xF3, RegMRM, m, unaryop_ndd_args, t,
+ (outs t.RegClass:$dst), []>, T8, VVVV;
+ }
+
+ let SchedRW = [WriteBLS.Folded] in
+ def rm#Suffix : UnaryOpM<0xF3, MemMRM, m, unaryop_ndd_args, t,
+ (outs t.RegClass:$dst), []>, T8, VVVV;
}
-let Predicates = [HasBMI, NoEGPR], Defs = [EFLAGS] in {
- defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem, WriteBLS>;
- defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem, WriteBLS>, REX_W;
- defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem, WriteBLS>;
- defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem, WriteBLS>, REX_W;
- defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem, WriteBLS>;
- defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem, WriteBLS>, REX_W;
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+ defm BLSR32 : Bls<"blsr", MRM1r, MRM1m, Xi32>, VEX;
+ defm BLSR64 : Bls<"blsr", MRM1r, MRM1m, Xi64>, VEX;
+ defm BLSMSK32 : Bls<"blsmsk", MRM2r, MRM2m, Xi32>, VEX;
+ defm BLSMSK64 : Bls<"blsmsk", MRM2r, MRM2m, Xi64>, VEX;
+ defm BLSI32 : Bls<"blsi", MRM3r, MRM3m, Xi32>, VEX;
+ defm BLSI64 : Bls<"blsi", MRM3r, MRM3m, Xi64>, VEX;
}
-let Predicates = [HasBMI, HasEGPR], Defs = [EFLAGS] in {
- defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem, WriteBLS, "_EVEX">, EVEX;
- defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem, WriteBLS, "_EVEX">, REX_W, EVEX;
- defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem, WriteBLS, "_EVEX">, EVEX;
- defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem, WriteBLS, "_EVEX">, REX_W, EVEX;
- defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem, WriteBLS, "_EVEX">, EVEX;
- defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem, WriteBLS, "_EVEX">, REX_W, EVEX;
+let Predicates = [HasBMI, In64BitMode], Defs = [EFLAGS] in {
+ defm BLSR32 : Bls<"blsr", MRM1r, MRM1m, Xi32, "_EVEX">, EVEX;
+ defm BLSR64 : Bls<"blsr", MRM1r, MRM1m, Xi64, "_EVEX">, EVEX;
+ defm BLSMSK32 : Bls<"blsmsk", MRM2r, MRM2m, Xi32, "_EVEX">, EVEX;
+ defm BLSMSK64 : Bls<"blsmsk", MRM2r, MRM2m, Xi64, "_EVEX">, EVEX;
+ defm BLSI32 : Bls<"blsi", MRM3r, MRM3m, Xi32, "_EVEX">, EVEX;
+ defm BLSI64 : Bls<"blsi", MRM3r, MRM3m, Xi64, "_EVEX">, EVEX;
}
let Predicates = [HasBMI] in {
@@ -1281,50 +1278,35 @@ let Predicates = [HasBMI] in {
(BLSI64rr GR64:$src)>;
}
-multiclass bmi4VOp3_base<bits<8> opc, string mnemonic, RegisterClass RC,
- X86MemOperand x86memop, SDPatternOperator OpNode,
- PatFrag ld_frag, X86FoldableSchedWrite Sched,
- string Suffix = ""> {
- def rr#Suffix : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
- !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (OpNode RC:$src1, RC:$src2)), (implicit EFLAGS)]>,
- T8, VEX, Sched<[Sched]>;
-let mayLoad = 1 in
- def rm#Suffix : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
- !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (OpNode (ld_frag addr:$src1), RC:$src2)),
- (implicit EFLAGS)]>, T8, VEX,
- Sched<[Sched.Folded,
- // x86memop:$src1
- ReadDefault, ReadDefault, ReadDefault, ReadDefault,
- ReadDefault,
- // RC:$src2
- Sched.ReadAfterFold]>;
+multiclass Bmi4VOp3<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node,
+ X86FoldableSchedWrite sched, string Suffix = ""> {
+ let SchedRW = [sched], Form = MRMSrcReg4VOp3 in
+ def rr#Suffix : BinOpRR<o, m, binop_ndd_args, t, (outs t.RegClass:$dst),
+ [(set t.RegClass:$dst, EFLAGS,
+ (node t.RegClass:$src1, t.RegClass:$src2))]>, T8;
+ let SchedRW = [sched.Folded,
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ sched.ReadAfterFold], Form = MRMSrcMem4VOp3 in
+ def rm#Suffix : BinOpMR<o, m, binop_ndd_args, t, (outs t.RegClass:$dst),
+ [(set t.RegClass:$dst, EFLAGS, (node (t.LoadNode addr:$src1),
+ t.RegClass:$src2))]>, T8;
}
let Predicates = [HasBMI, NoEGPR], Defs = [EFLAGS] in {
- defm BEXTR32 : bmi4VOp3_base<0xF7, "bextr{l}", GR32, i32mem,
- X86bextr, loadi32, WriteBEXTR>;
- defm BEXTR64 : bmi4VOp3_base<0xF7, "bextr{q}", GR64, i64mem,
- X86bextr, loadi64, WriteBEXTR>, REX_W;
+ defm BEXTR32 : Bmi4VOp3<0xF7, "bextr", Xi32, X86bextr, WriteBEXTR>, VEX;
+ defm BEXTR64 : Bmi4VOp3<0xF7, "bextr", Xi64, X86bextr, WriteBEXTR>, VEX;
}
let Predicates = [HasBMI2, NoEGPR], Defs = [EFLAGS] in {
- defm BZHI32 : bmi4VOp3_base<0xF5, "bzhi{l}", GR32, i32mem,
- X86bzhi, loadi32, WriteBZHI>;
- defm BZHI64 : bmi4VOp3_base<0xF5, "bzhi{q}", GR64, i64mem,
- X86bzhi, loadi64, WriteBZHI>, REX_W;
+ defm BZHI32 : Bmi4VOp3<0xF5, "bzhi", Xi32, X86bzhi, WriteBZHI>, VEX;
+ defm BZHI64 : Bmi4VOp3<0xF5, "bzhi", Xi64, X86bzhi, WriteBZHI>, VEX;
}
-let Predicates = [HasBMI, HasEGPR], Defs = [EFLAGS] in {
- defm BEXTR32 : bmi4VOp3_base<0xF7, "bextr{l}", GR32, i32mem,
- X86bextr, loadi32, WriteBEXTR, "_EVEX">, EVEX;
- defm BEXTR64 : bmi4VOp3_base<0xF7, "bextr{q}", GR64, i64mem,
- X86bextr, loadi64, WriteBEXTR, "_EVEX">, EVEX, REX_W;
+let Predicates = [HasBMI, HasEGPR, In64BitMode], Defs = [EFLAGS] in {
+ defm BEXTR32 : Bmi4VOp3<0xF7, "bextr", Xi32, X86bextr, WriteBEXTR, "_EVEX">, EVEX;
+ defm BEXTR64 : Bmi4VOp3<0xF7, "bextr", Xi64, X86bextr, WriteBEXTR, "_EVEX">, EVEX;
}
-let Predicates = [HasBMI2, HasEGPR], Defs = [EFLAGS] in {
- defm BZHI32 : bmi4VOp3_base<0xF5, "bzhi{l}", GR32, i32mem,
- X86bzhi, loadi32, WriteBZHI, "_EVEX">, EVEX;
- defm BZHI64 : bmi4VOp3_base<0xF5, "bzhi{q}", GR64, i64mem,
- X86bzhi, loadi64, WriteBZHI, "_EVEX">, EVEX, REX_W;
+let Predicates = [HasBMI2, HasEGPR, In64BitMode], Defs = [EFLAGS] in {
+ defm BZHI32 : Bmi4VOp3<0xF5, "bzhi", Xi32, X86bzhi, WriteBZHI, "_EVEX">, EVEX;
+ defm BZHI64 : Bmi4VOp3<0xF5, "bzhi", Xi64, X86bzhi, WriteBZHI, "_EVEX">, EVEX;
}
def CountTrailingOnes : SDNodeXForm<imm, [{
@@ -1371,22 +1353,22 @@ multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
def rr#Suffix : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (OpNode RC:$src1, RC:$src2))]>,
- VEX, VVVV, Sched<[WriteALU]>;
+ NoCD8, VVVV, Sched<[WriteALU]>;
def rm#Suffix : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (OpNode RC:$src1, (ld_frag addr:$src2)))]>,
- VEX, VVVV, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>;
+ NoCD8, VVVV, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>;
}
let Predicates = [HasBMI2, NoEGPR] in {
defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem,
- X86pdep, loadi32>, T8, XD;
+ X86pdep, loadi32>, T8, XD, VEX;
defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem,
- X86pdep, loadi64>, T8, XD, REX_W;
+ X86pdep, loadi64>, T8, XD, REX_W, VEX;
defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem,
- X86pext, loadi32>, T8, XS;
+ X86pext, loadi32>, T8, XS, VEX;
defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem,
- X86pext, loadi64>, T8, XS, REX_W;
+ X86pext, loadi64>, T8, XS, REX_W, VEX;
}
let Predicates = [HasBMI2, HasEGPR] in {
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 94fa6e45ded9..cb751639a057 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -8,8 +8,41 @@
def TruePredicate : Predicate<"true">;
+// Intel x86 instructions have three separate encoding spaces: legacy, VEX, and
+// EVEX. Not all X86 instructions are extended for EGPR. The following is an
+// overview of which instructions are extended and how we implement them.
+//
+// * Legacy space
+// All instructions in legacy maps 0 and 1 that have explicit GPR or memory
+// operands can use the REX2 prefix to access the EGPR, except XSAVE*/XRSTOR.
+//
+// * EVEX space
+// All instructions in the EVEX space can access the EGPR in their
+// register/memory operands.
+//
+// For the above instructions, the only difference in encoding is reflected in
+// the REX2/EVEX prefix when EGPR is used, i.e., the opcode and opcode name are
+// unchanged. We do not add new entries in TD; instead, we extend GPR with
+// R16-R31 and make them allocatable only when the EGPR feature is available.
+//
+// In addition, some instructions in legacy maps 2/3 and in the VEX space are
+// promoted into the EVEX space. The encoding space changes after promotion,
+// and sometimes the opcode and opcode map change as well. For these
+// instructions, we add new entries in TD to avoid overcomplicating the
+// assembler and disassembler.
+//
+// HasEGPR is for the new entries and NoEGPR is for the entries before
+// promotion, so that the promoted variant can be selected first to benefit RA.
def HasEGPR : Predicate<"Subtarget->hasEGPR()">;
def NoEGPR : Predicate<"!Subtarget->hasEGPR()">;
+
+// APX extends some instructions with a new form that has an extra register
+// operand called a new data destination (NDD). In such forms, NDD is the new
+// destination register that receives the result of the computation, while all
+// other operands (including the original destination operand) become read-only
+// source operands.
+//
+// HasNDD is for the new NDD entries and NoNDD is for the legacy 2-address
+// entries, so that the NDD variant can be selected first to benefit RA.
def HasNDD : Predicate<"Subtarget->hasNDD()">;
def NoNDD : Predicate<"!Subtarget->hasNDD()">;
def HasCMOV : Predicate<"Subtarget->canUseCMOV()">;
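The HasEGPR/NoEGPR and HasNDD/NoNDD predicate pairs above boil down to X86Subtarget feature queries at instruction-selection time. A stand-alone C++ sketch of that selection split, using a stub in place of the real X86Subtarget (hasEGPR()/hasNDD() are the accessor names quoted in the predicate strings; pickBlsrVariant() and the stub are hypothetical, for illustration only):

    // Sketch only: how the predicate pairs split instruction choices.
    #include <iostream>
    #include <string>

    struct SubtargetStub {               // stand-in for llvm::X86Subtarget
      bool EGPR = false;                 // APX extended GPRs R16-R31
      bool NDD = false;                  // APX new-data-destination forms
      bool hasEGPR() const { return EGPR; }
      bool hasNDD() const { return NDD; }
    };

    // Prefer the EVEX-promoted entry when EGPR is available so the register
    // allocator may use R16-R31; otherwise fall back to the VEX entry.
    static std::string pickBlsrVariant(const SubtargetStub &ST) {
      return ST.hasEGPR() ? "BLSR32rr_EVEX" : "BLSR32rr";
    }

    int main() {
      SubtargetStub APX{true, true};
      std::cout << pickBlsrVariant(APX) << '\n';  // prints BLSR32rr_EVEX
    }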
diff --git a/llvm/lib/Target/X86/X86InstrShiftRotate.td b/llvm/lib/Target/X86/X86InstrShiftRotate.td
index d13e3b7af69a..f951894db189 100644
--- a/llvm/lib/Target/X86/X86InstrShiftRotate.td
+++ b/llvm/lib/Target/X86/X86InstrShiftRotate.td
@@ -868,7 +868,7 @@ let Predicates = [HasBMI2, NoEGPR] in {
defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8, PD, REX_W;
}
-let Predicates = [HasBMI2, HasEGPR] in {
+let Predicates = [HasBMI2, HasEGPR, In64BitMode] in {
defm RORX32 : bmi_rotate<"rorx{l}", GR32, i32mem, "_EVEX">, EVEX;
defm RORX64 : bmi_rotate<"rorx{q}", GR64, i64mem, "_EVEX">, REX_W, EVEX;
defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem, "_EVEX">, T8, XS, EVEX;
diff --git a/llvm/lib/Target/X86/X86InstrSystem.td b/llvm/lib/Target/X86/X86InstrSystem.td
index 699e5847e63f..b1be4739617d 100644
--- a/llvm/lib/Target/X86/X86InstrSystem.td
+++ b/llvm/lib/Target/X86/X86InstrSystem.td
@@ -695,14 +695,14 @@ def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
Requires<[Not64BitMode, HasINVPCID]>;
def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
"invpcid\t{$src2, $src1|$src1, $src2}", []>, T8, PD,
- Requires<[In64BitMode, HasINVPCID]>;
+ Requires<[In64BitMode]>;
def INVPCID64_EVEX : I<0xF2, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
"invpcid\t{$src2, $src1|$src1, $src2}", []>,
- EVEX, NoCD8, T_MAP4, XS, Requires<[In64BitMode, HasINVPCID]>;
+ EVEX, NoCD8, T_MAP4, XS, Requires<[In64BitMode]>;
} // SchedRW
-let Predicates = [In64BitMode, HasINVPCID] in {
+let Predicates = [HasINVPCID, NoEGPR] in {
// The instruction can only use a 64 bit register as the register argument
// in 64 bit mode, while the intrinsic only accepts a 32 bit argument
// corresponding to it.
@@ -714,6 +714,13 @@ let Predicates = [In64BitMode, HasINVPCID] in {
addr:$src2)>;
}
+let Predicates = [HasINVPCID, HasEGPR] in {
+ def : Pat<(int_x86_invpcid GR32:$src1, addr:$src2),
+ (INVPCID64_EVEX
+ (SUBREG_TO_REG (i64 0), (MOV32rr GR32:$src1), sub_32bit),
+ addr:$src2)>;
+}
+
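The comment earlier in this hunk (a 64-bit register argument in 64-bit mode versus a 32-bit intrinsic argument) is what both Pat blocks bridge via SUBREG_TO_REG. A hedged caller-side sketch of the intrinsic, assuming the _invpcid(unsigned int, void *) signature exposed by immintrin.h in Clang/GCC and a build with -minvpcid; invpcid is privileged (CPL 0), so this is illustrative kernel-style code, not something to run from user space:

    // Sketch only: the 32-bit "type" argument is the operand the patterns
    // above widen into the 64-bit register the instruction requires.
    #include <immintrin.h>  // _invpcid
    #include <stdint.h>

    struct InvpcidDescriptor {
      uint64_t pcid;            // low 12 bits hold the PCID
      uint64_t linear_address;  // used only by type 0 (individual address)
    };

    static void invalidate_pcid(uint32_t pcid) {
      InvpcidDescriptor desc = {pcid, 0};
      _invpcid(1u, &desc);      // type 1: single-context invalidation
    }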
//===----------------------------------------------------------------------===//
// SMAP Instruction
diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td
index da85922a018d..f4ae15837fbf 100644
--- a/llvm/lib/Target/X86/X86InstrUtils.td
+++ b/llvm/lib/Target/X86/X86InstrUtils.td
@@ -43,8 +43,6 @@ class XOP { Encoding OpEnc = EncXOP; }
class VEX { Encoding OpEnc = EncVEX; }
class EVEX { Encoding OpEnc = EncEVEX; }
class WIG { bit IgnoresW = 1; }
-// Special version of REX_W that can be changed to VEX.W==0 for EVEX2VEX.
-class VEX_W1X { bit hasREX_W = 1; bit EVEX_W1_VEX_W0 = 1; }
class VEX_L { bit hasVEX_L = 1; }
class VEX_LIG { bit ignoresVEX_L = 1; }
class VVVV { bit hasVEX_4V = 1; }
@@ -66,9 +64,6 @@ class EVEX_CD8<int esize, CD8VForm form> {
}
class NoCD8 { bits<7> CD8_Scale = 0; }
-class EVEX2VEXOverride<string VEXInstrName> {
- string EVEX2VEXOverride = VEXInstrName;
-}
class AVX512BIi8Base : TB, PD {
Domain ExeDomain = SSEPackedInt;
ImmType ImmT = Imm8;
@@ -89,7 +84,6 @@ class AVX512PDIi8Base : TB, PD {
Domain ExeDomain = SSEPackedDouble;
ImmType ImmT = Imm8;
}
-class NotEVEX2VEXConvertible { bit notEVEX2VEXConvertible = 1; }
class ExplicitREX2Prefix { ExplicitOpPrefix explicitOpPrefix = ExplicitREX2; }
class ExplicitVEXPrefix { ExplicitOpPrefix explicitOpPrefix = ExplicitVEX; }
class ExplicitEVEXPrefix { ExplicitOpPrefix explicitOpPrefix = ExplicitEVEX; }
@@ -1005,7 +999,7 @@ class BinOpRR_RF_Rev<bits<8> o, string m, X86TypeInfo t, bit ndd = 0>
}
// BinOpRRF_RF - Instructions that read "reg, reg", write "reg" and read/write
// EFLAGS.
-class BinOpRRF_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, bit ndd = 0>
+class BinOpRRF_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node = null_frag, bit ndd = 0>
: BinOpRR<o, m, !if(!eq(ndd, 0), binop_args, binop_ndd_args), t, (outs t.RegClass:$dst),
[(set t.RegClass:$dst, EFLAGS,
(node t.RegClass:$src1, t.RegClass:$src2,
@@ -1041,7 +1035,7 @@ class BinOpRM_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, bit
(t.LoadNode addr:$src2)))]>, DefEFLAGS, NDD<ndd>;
// BinOpRMF_RF - Instructions that read "reg, [mem]", write "reg" and read/write
// EFLAGS.
-class BinOpRMF_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, bit ndd = 0>
+class BinOpRMF_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node = null_frag, bit ndd = 0>
: BinOpRM<o, m, !if(!eq(ndd, 0), binop_args, binop_ndd_args), t, (outs t.RegClass:$dst),
[(set t.RegClass:$dst, EFLAGS,
(node t.RegClass:$src1, (t.LoadNode addr:$src2), EFLAGS))]>,
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index e1a67f61e766..133ee2041565 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -2055,10 +2055,11 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
}
}
- // Add a comment about EVEX-2-VEX compression for AVX-512 instrs that
- // are compressed from EVEX encoding to VEX encoding.
+ // Add a comment about EVEX compression
if (TM.Options.MCOptions.ShowMCEncoding) {
- if (MI->getAsmPrinterFlags() & X86::AC_EVEX_2_VEX)
+ if (MI->getAsmPrinterFlags() & X86::AC_EVEX_2_LEGACY)
+ OutStreamer->AddComment("EVEX TO LEGACY Compression ", false);
+ else if (MI->getAsmPrinterFlags() & X86::AC_EVEX_2_VEX)
OutStreamer->AddComment("EVEX TO VEX Compression ", false);
}
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 5668b514d6de..b92bffbe6239 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -75,7 +75,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
initializeGlobalISel(PR);
initializeWinEHStatePassPass(PR);
initializeFixupBWInstPassPass(PR);
- initializeEvexToVexInstPassPass(PR);
+ initializeCompressEVEXPassPass(PR);
initializeFixupLEAPassPass(PR);
initializeFPSPass(PR);
initializeX86FixupSetCCPassPass(PR);
@@ -575,7 +575,7 @@ void X86PassConfig::addPreEmitPass() {
addPass(createX86FixupInstTuning());
addPass(createX86FixupVectorConstants());
}
- addPass(createX86EvexToVexInsts());
+ addPass(createX86CompressEVEXPass());
addPass(createX86DiscriminateMemOpsPass());
addPass(createX86InsertPrefetchPass());
addPass(createX86InsertX87waitPass());
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 49631f38017a..cd40b1d3b093 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2232,6 +2232,7 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
+ { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, 4 }, // 2*vcvtps2pd+vextractf64x4
{ ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
diff --git a/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp
index 05003ec304ad..1535eb622da6 100644
--- a/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp
+++ b/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -142,7 +142,7 @@ void XCoreDAGToDAGISel::Select(SDNode *N) {
switch (N->getOpcode()) {
default: break;
case ISD::Constant: {
- uint64_t Val = cast<ConstantSDNode>(N)->getZExtValue();
+ uint64_t Val = N->getAsZExtVal();
if (immMskBitp(N)) {
// Transformation function: get the size of a mask
// Look for the first non-zero bit