path: root/llvm/lib/Target
author     Dimitry Andric <dim@FreeBSD.org>  2022-07-24 15:03:44 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2022-07-24 15:03:44 +0000
commit     4b4fe385e49bd883fd183b5f21c1ea486c722e61 (patch)
tree       c3d8fdb355c9c73e57723718c22103aaf7d15aa6 /llvm/lib/Target
parent     1f917f69ff07f09b6dbb670971f57f8efe718b84 (diff)
download   src-4b4fe385e49bd883fd183b5f21c1ea486c722e61.tar.gz
           src-4b4fe385e49bd883fd183b5f21c1ea486c722e61.zip
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r-- llvm/lib/Target/AArch64/AArch64.td | 2
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 55
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 119
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 9
-rw-r--r-- llvm/lib/Target/AArch64/AArch64InstrFormats.td | 6
-rw-r--r-- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 25
-rw-r--r-- llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp | 4
-rw-r--r-- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td | 36
-rw-r--r-- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp | 24
-rw-r--r-- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 86
-rw-r--r-- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h | 10
-rw-r--r-- llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 65
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPU.td | 17
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp | 5
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 67
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUAttributes.def | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 19
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 14
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 76
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 14
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 166
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 8
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 32
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 7
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 23
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 45
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 16
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h | 5
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp | 47
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp | 5
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 20
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td | 16
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 50
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 102
-rw-r--r-- llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 91
-rw-r--r-- llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp | 17
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNProcessors.td | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 7
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 7
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 5
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 323
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 40
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.h | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrFormats.td | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 23
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 110
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 11
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp | 309
-rw-r--r-- llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp | 82
-rw-r--r-- llvm/lib/Target/AMDGPU/SIProgramInfo.h | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/SMInstructions.td | 119
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 78
-rw-r--r-- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 100
-rw-r--r-- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 38
-rw-r--r-- llvm/lib/Target/AMDGPU/VOPCInstructions.td | 26
-rw-r--r-- llvm/lib/Target/AMDGPU/VOPInstructions.td | 42
-rw-r--r-- llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 4
-rw-r--r-- llvm/lib/Target/ARM/ARMBaseInstrInfo.h | 3
-rw-r--r-- llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp | 15
-rw-r--r-- llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 6
-rw-r--r-- llvm/lib/Target/ARM/ARMISelLowering.cpp | 42
-rw-r--r-- llvm/lib/Target/ARM/ARMISelLowering.h | 2
-rw-r--r-- llvm/lib/Target/ARM/ARMInstrMVE.td | 6
-rw-r--r-- llvm/lib/Target/ARM/ARMInstrThumb.td | 8
-rw-r--r-- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp | 14
-rw-r--r-- llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 8
-rw-r--r-- llvm/lib/Target/AVR/AVRSubtarget.h | 14
-rw-r--r-- llvm/lib/Target/CSKY/CSKYInstrInfo.cpp | 2
-rw-r--r-- llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp | 8
-rw-r--r-- llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp | 2
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp | 1
-rw-r--r-- llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp | 5
-rw-r--r-- llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp | 4
-rw-r--r-- llvm/lib/Target/Mips/MipsISelLowering.cpp | 6
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 128
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h | 100
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 95
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 1
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXUtilities.h | 10
-rw-r--r-- llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 25
-rw-r--r-- llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 4
-rw-r--r-- llvm/lib/Target/PowerPC/PPCInstrInfo.h | 3
-rw-r--r-- llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp | 13
-rw-r--r-- llvm/lib/Target/RISCV/RISCV.h | 3
-rw-r--r-- llvm/lib/Target/RISCV/RISCV.td | 26
-rw-r--r-- llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp | 169
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 27
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 376
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 112
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfo.h | 13
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfo.td | 7
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoM.td | 22
-rw-r--r-- llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp | 3
-rw-r--r-- llvm/lib/Target/RISCV/RISCVSubtarget.cpp | 8
-rw-r--r-- llvm/lib/Target/RISCV/RISCVSubtarget.h | 2
-rw-r--r-- llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 7
-rw-r--r-- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 5
-rw-r--r-- llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp | 10
-rw-r--r-- llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h | 13
-rw-r--r-- llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp | 15
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRV.h | 1
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp | 164
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp | 201
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVCallLowering.h | 6
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp | 2
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h | 4
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 61
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 425
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h | 53
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp | 14
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVInstrInfo.h | 1
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVInstrInfo.td | 24
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 252
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp | 7
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 168
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h | 8
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp | 75
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp | 288
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp | 3
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp | 5
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVUtils.cpp | 31
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVUtils.h | 6
-rw-r--r-- llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp | 8
-rw-r--r-- llvm/lib/Target/SystemZ/SystemZCallingConv.cpp | 4
-rw-r--r-- llvm/lib/Target/SystemZ/SystemZCallingConv.h | 38
-rw-r--r-- llvm/lib/Target/SystemZ/SystemZCallingConv.td | 34
-rw-r--r-- llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp | 101
-rw-r--r-- llvm/lib/Target/SystemZ/SystemZFrameLowering.h | 2
-rw-r--r-- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 8
-rw-r--r-- llvm/lib/Target/VE/VEInstrInfo.cpp | 40
-rw-r--r-- llvm/lib/Target/VE/VEInstrPatternsVec.td | 14
-rw-r--r-- llvm/lib/Target/VE/VEInstrVec.td | 27
-rw-r--r-- llvm/lib/Target/VE/VERegisterInfo.cpp | 201
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp | 2
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h | 3
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp | 23
-rw-r--r-- llvm/lib/Target/X86/X86.td | 2
-rw-r--r-- llvm/lib/Target/X86/X86FixupBWInsts.cpp | 12
-rw-r--r-- llvm/lib/Target/X86/X86ISelLowering.cpp | 561
-rw-r--r-- llvm/lib/Target/X86/X86InstrAVX512.td | 48
-rw-r--r-- llvm/lib/Target/X86/X86InstrInfo.cpp | 12
-rw-r--r-- llvm/lib/Target/X86/X86InstrInfo.h | 3
-rw-r--r-- llvm/lib/Target/X86/X86InstrSSE.td | 70
-rw-r--r-- llvm/lib/Target/X86/X86TargetMachine.cpp | 8
-rw-r--r-- llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 65
-rw-r--r-- llvm/lib/Target/XCore/XCoreFrameLowering.cpp | 2
160 files changed, 5435 insertions, 1674 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index b332e9dcb176..8fb5d49e2121 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -216,7 +216,7 @@ def FeatureSlowPaired128 : SubtargetFeature<"slow-paired-128",
"IsPaired128Slow", "true", "Paired 128 bit loads and stores are slow">;
def FeatureAscendStoreAddress : SubtargetFeature<"ascend-store-address",
- "IsStoreAddressAscend", "false",
+ "IsStoreAddressAscend", "true",
"Schedule vector stores by ascending address">;
def FeatureSlowSTRQro : SubtargetFeature<"slow-strqro-store", "IsSTRQroSlow",
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 82fe5772c99d..00621b84d2f2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -69,6 +69,7 @@ public:
bool tryMLAV64LaneV128(SDNode *N);
bool tryMULLV64LaneV128(unsigned IntNo, SDNode *N);
bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift);
+ bool SelectArithUXTXRegister(SDValue N, SDValue &Reg, SDValue &Shift);
bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
@@ -893,6 +894,30 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
return isWorthFolding(N);
}
+/// SelectArithUXTXRegister - Select a "UXTX register" operand. This
+/// operand is referred to by instructions that have an SP operand.
+bool AArch64DAGToDAGISel::SelectArithUXTXRegister(SDValue N, SDValue &Reg,
+ SDValue &Shift) {
+ unsigned ShiftVal = 0;
+ AArch64_AM::ShiftExtendType Ext;
+
+ if (N.getOpcode() != ISD::SHL)
+ return false;
+
+ ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (!CSD)
+ return false;
+ ShiftVal = CSD->getZExtValue();
+ if (ShiftVal > 4)
+ return false;
+
+ Ext = AArch64_AM::UXTX;
+ Reg = N.getOperand(0);
+ Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), SDLoc(N),
+ MVT::i32);
+ return isWorthFolding(N);
+}
+
/// If there's a use of this ADDlow that's not itself a load/store then we'll
/// need to create a real ADD instruction from it anyway and there's no point in
/// folding it into the mem op. Theoretically, it shouldn't matter, but there's
@@ -4049,6 +4074,24 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
break;
}
+ case Intrinsic::swift_async_context_addr: {
+ SDLoc DL(Node);
+ SDValue Chain = Node->getOperand(0);
+ SDValue CopyFP = CurDAG->getCopyFromReg(Chain, DL, AArch64::FP, MVT::i64);
+ SDValue Res = SDValue(
+ CurDAG->getMachineNode(AArch64::SUBXri, DL, MVT::i64, CopyFP,
+ CurDAG->getTargetConstant(8, DL, MVT::i32),
+ CurDAG->getTargetConstant(0, DL, MVT::i32)),
+ 0);
+ ReplaceUses(SDValue(Node, 0), Res);
+ ReplaceUses(SDValue(Node, 1), CopyFP.getValue(1));
+ CurDAG->RemoveDeadNode(Node);
+
+ auto &MF = CurDAG->getMachineFunction();
+ MF.getFrameInfo().setFrameAddressIsTaken(true);
+ MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
+ return;
+ }
}
} break;
case ISD::INTRINSIC_WO_CHAIN: {
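[Editor's illustration] A minimal scalar sketch of what the relocated swift_async_context_addr selection computes, based on the SUBXri(FP, #8) operands above; the function name is illustrative, not an LLVM API:

    #include <cstdint>

    // Models the address produced by SUBXri(CopyFromReg(FP), #8): the Swift
    // async context slot sits 8 bytes below the frame pointer.
    uint64_t swiftAsyncContextAddr(uint64_t FramePointer) {
      return FramePointer - 8;
    }
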
@@ -4094,18 +4137,6 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
if (tryMULLV64LaneV128(IntNo, Node))
return;
break;
- case Intrinsic::swift_async_context_addr: {
- SDLoc DL(Node);
- CurDAG->SelectNodeTo(Node, AArch64::SUBXri, MVT::i64,
- CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
- AArch64::FP, MVT::i64),
- CurDAG->getTargetConstant(8, DL, MVT::i32),
- CurDAG->getTargetConstant(0, DL, MVT::i32));
- auto &MF = CurDAG->getMachineFunction();
- MF.getFrameInfo().setFrameAddressIsTaken(true);
- MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
- return;
- }
}
break;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 447ad10ddf22..e070ce2efa6b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -521,6 +521,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTPOP, MVT::i64, Custom);
setOperationAction(ISD::CTPOP, MVT::i128, Custom);
+ setOperationAction(ISD::PARITY, MVT::i64, Custom);
+ setOperationAction(ISD::PARITY, MVT::i128, Custom);
+
setOperationAction(ISD::ABS, MVT::i32, Custom);
setOperationAction(ISD::ABS, MVT::i64, Custom);
@@ -5463,7 +5466,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::SRA_PARTS:
return LowerShiftParts(Op, DAG);
case ISD::CTPOP:
- return LowerCTPOP(Op, DAG);
+ case ISD::PARITY:
+ return LowerCTPOP_PARITY(Op, DAG);
case ISD::FCOPYSIGN:
return LowerFCOPYSIGN(Op, DAG);
case ISD::OR:
@@ -7783,7 +7787,8 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
return BitCast(VT, BSP, DAG);
}
-SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
+SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
+ SelectionDAG &DAG) const {
if (DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat))
return SDValue();
@@ -7791,6 +7796,8 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
if (!Subtarget->hasNEON())
return SDValue();
+ bool IsParity = Op.getOpcode() == ISD::PARITY;
+
// While there is no integer popcount instruction, it can
// be more efficiently lowered to the following sequence that uses
// AdvSIMD registers/instructions as long as the copies to/from
@@ -7813,6 +7820,10 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
+ if (IsParity)
+ UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
+ DAG.getConstant(1, DL, MVT::i32));
+
if (VT == MVT::i64)
UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
return UaddLV;
@@ -7824,9 +7835,15 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
+ if (IsParity)
+ UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
+ DAG.getConstant(1, DL, MVT::i32));
+
return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
}
+ assert(!IsParity && "ISD::PARITY of vector types not supported");
+
if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
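[Editor's illustration] A scalar sketch of the identity the new PARITY path relies on: parity is population count reduced modulo two, so the existing CNT/UADDLV popcount sequence only needs the extra AND with 1 that the hunk above inserts. The helper is illustrative, not code from the patch:

    #include <bitset>
    #include <cstdint>

    uint32_t parity64(uint64_t X) {
      uint32_t PopCount = std::bitset<64>(X).count(); // what CNT + UADDLV compute
      return PopCount & 1;                            // the AND added for PARITY
    }
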
@@ -11811,6 +11828,12 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
isConcatMask(M, VT, VT.getSizeInBits() == 128));
}
+bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
+ EVT VT) const {
+ // Just delegate to the generic legality, clear masks aren't special.
+ return isShuffleMaskLegal(M, VT);
+}
+
/// getVShiftImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
@@ -11969,6 +11992,11 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
if (IsZero)
return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
+ case AArch64CC::LE:
+ if (!NoNans)
+ return SDValue();
+ // If we ignore NaNs then we can use the LS implementation.
+ LLVM_FALLTHROUGH;
case AArch64CC::LS:
if (IsZero)
return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
@@ -12073,7 +12101,7 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
bool ShouldInvert;
changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
- bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
+ bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
SDValue Cmp =
EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
if (!Cmp.getNode())
@@ -13587,21 +13615,50 @@ AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
bool
AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const {
- N = N->getOperand(0).getNode();
+ assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
+ N->getOpcode() == ISD::SRL) &&
+ "Expected shift op");
+
+ SDValue ShiftLHS = N->getOperand(0);
EVT VT = N->getValueType(0);
- // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
- // it with shift to let it be lowered to UBFX.
- if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
- isa<ConstantSDNode>(N->getOperand(1))) {
- uint64_t TruncMask = N->getConstantOperandVal(1);
+
+ // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not combine
+ // it with shift 'N' to let it be lowered to UBFX.
+ if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
+ isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
+ uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
if (isMask_64(TruncMask) &&
- N->getOperand(0).getOpcode() == ISD::SRL &&
- isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
+ ShiftLHS.getOperand(0).getOpcode() == ISD::SRL &&
+ isa<ConstantSDNode>(ShiftLHS.getOperand(0).getOperand(1)))
return false;
}
return true;
}
+bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
+ const SDNode *N) const {
+ assert(N->getOpcode() == ISD::XOR &&
+ (N->getOperand(0).getOpcode() == ISD::SHL ||
+ N->getOperand(0).getOpcode() == ISD::SRL) &&
+ "Expected XOR(SHIFT) pattern");
+
+ // Only commute if the entire NOT mask is a hidden shifted mask.
+ auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
+ if (XorC && ShiftC) {
+ unsigned MaskIdx, MaskLen;
+ if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
+ unsigned ShiftAmt = ShiftC->getZExtValue();
+ unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
+ if (N->getOperand(0).getOpcode() == ISD::SHL)
+ return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
+ return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
+ }
+ }
+
+ return false;
+}
+
bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
const SDNode *N, CombineLevel Level) const {
assert(((N->getOpcode() == ISD::SHL &&
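[Editor's illustration] To make the shifted-mask condition in the new isDesirableToCommuteXorWithShift concrete, here is a standalone scalar model of the SHL case (the SRL case checks MaskIdx == 0 instead). It assumes the same semantics as APInt::isShiftedMask and uses GCC/Clang builtins; it is a sketch, not the production code:

    #include <cstdint>

    // Commute xor(shl(x, ShiftAmt), Mask) only when Mask is a single contiguous
    // run of ones that starts exactly at the shift amount and reaches the top
    // bit, i.e. the NOT only touches bits the shift can actually produce.
    bool desirableToCommuteXorWithShl(uint64_t Mask, unsigned ShiftAmt,
                                      unsigned BitWidth) {
      if (Mask == 0 || ShiftAmt >= BitWidth)
        return false;
      unsigned MaskIdx = __builtin_ctzll(Mask);      // index of lowest set bit
      unsigned MaskLen = __builtin_popcountll(Mask); // run length, if contiguous
      uint64_t Run = Mask >> MaskIdx;
      if ((Run & (Run + 1)) != 0)                    // not one contiguous run
        return false;
      return MaskIdx == ShiftAmt && MaskLen == BitWidth - ShiftAmt;
    }
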
@@ -19221,6 +19278,41 @@ static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
}
+static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+
+ SDValue Insert = N->getOperand(0);
+ if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
+ return SDValue();
+
+ if (!Insert.getOperand(0).isUndef())
+ return SDValue();
+
+ uint64_t IdxInsert = Insert.getConstantOperandVal(2);
+ uint64_t IdxDupLane = N->getConstantOperandVal(1);
+ if (IdxInsert != IdxDupLane)
+ return SDValue();
+
+ SDValue Bitcast = Insert.getOperand(1);
+ if (Bitcast.getOpcode() != ISD::BITCAST)
+ return SDValue();
+
+ SDValue Subvec = Bitcast.getOperand(0);
+ EVT SubvecVT = Subvec.getValueType();
+ if (!SubvecVT.is128BitVector())
+ return SDValue();
+ EVT NewSubvecVT =
+ getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType());
+
+ SDLoc DL(N);
+ SDValue NewInsert =
+ DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
+ DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
+ SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
+ NewInsert, N->getOperand(1));
+ return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
+}
+
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -19307,6 +19399,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performCSELCombine(N, DCI, DAG);
case AArch64ISD::DUP:
return performDUPCombine(N, DCI);
+ case AArch64ISD::DUPLANE128:
+ return performDupLane128Combine(N, DAG);
case AArch64ISD::NVCAST:
return performNVCASTCombine(N);
case AArch64ISD::SPLICE:
@@ -19981,7 +20075,8 @@ void AArch64TargetLowering::ReplaceNodeResults(
return;
case ISD::CTPOP:
- if (SDValue Result = LowerCTPOP(SDValue(N, 0), DAG))
+ case ISD::PARITY:
+ if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
Results.push_back(Result);
return;
case AArch64ISD::SADDV:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index e02b5e56fd2e..1ba2e2f315ec 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -549,6 +549,10 @@ public:
/// should be stack expanded.
bool isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
+ /// Similar to isShuffleMaskLegal. Return true if the given 'select with zero'
+ /// shuffle mask can be codegen'd directly.
+ bool isVectorClearMaskLegal(ArrayRef<int> M, EVT VT) const override;
+
/// Return the ISD::SETCC ValueType.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
@@ -653,6 +657,9 @@ public:
bool isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const override;
+ /// Returns false if N is a bit extraction pattern of (X >> C) & Mask.
+ bool isDesirableToCommuteXorWithShift(const SDNode *N) const override;
+
/// Return true if it is profitable to fold a pair of shifts into a mask.
bool shouldFoldConstantShiftPairToMask(const SDNode *N,
CombineLevel Level) const override;
@@ -995,7 +1002,7 @@ private:
SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCTPOP_PARITY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBitreverse(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMinMax(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 02fa36a1df4b..e70d304f37b9 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1168,6 +1168,8 @@ def gi_arith_extended_reg32to64_i64 :
GIComplexOperandMatcher<s64, "selectArithExtendedRegister">,
GIComplexPatternEquiv<arith_extended_reg32to64_i64>;
+def arith_uxtx : ComplexPattern<i64, 2, "SelectArithUXTXRegister", []>;
+
// Floating-point immediate.
def fpimm16XForm : SDNodeXForm<fpimm, [{
@@ -1234,6 +1236,10 @@ def fpimm0 : FPImmLeaf<fAny, [{
return Imm.isExactlyValue(+0.0);
}]>;
+def fpimm_minus0 : FPImmLeaf<fAny, [{
+ return Imm.isExactlyValue(-0.0);
+}]>;
+
def fpimm_half : FPImmLeaf<fAny, [{
return Imm.isExactlyValue(+0.5);
}]>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index d444223e4494..a7b7e5270888 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1691,6 +1691,11 @@ def : InstAlias<"mov $dst, $src",
defm ADDS : AddSubS<0, "adds", AArch64add_flag, "cmn", "subs", "cmp">;
defm SUBS : AddSubS<1, "subs", AArch64sub_flag, "cmp", "adds", "cmn">;
+def copyFromSP: PatLeaf<(i64 GPR64:$src), [{
+ return N->getOpcode() == ISD::CopyFromReg &&
+ cast<RegisterSDNode>(N->getOperand(1))->getReg() == AArch64::SP;
+}]>;
+
// Use SUBS instead of SUB to enable CSE between SUBS and SUB.
def : Pat<(sub GPR32sp:$Rn, addsub_shifted_imm32:$imm),
(SUBSWri GPR32sp:$Rn, addsub_shifted_imm32:$imm)>;
@@ -1709,6 +1714,8 @@ def : Pat<(sub GPR32sp:$R2, arith_extended_reg32_i32:$R3),
(SUBSWrx GPR32sp:$R2, arith_extended_reg32_i32:$R3)>;
def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64_i64:$R3),
(SUBSXrx GPR64sp:$R2, arith_extended_reg32to64_i64:$R3)>;
+def : Pat<(sub copyFromSP:$R2, (arith_uxtx GPR64:$R3, arith_extendlsl64:$imm)),
+ (SUBXrx64 GPR64sp:$R2, GPR64:$R3, arith_extendlsl64:$imm)>;
}
// Because of the immediate format for add/sub-imm instructions, the
@@ -5293,6 +5300,9 @@ def : Pat<(int_aarch64_neon_pmull64 (extractelt (v2i64 V128:$Rn), (i64 1)),
// CodeGen patterns for addhn and subhn instructions, which can actually be
// written in LLVM IR without too much difficulty.
+// Prioritize ADDHN and SUBHN over UZP2.
+let AddedComplexity = 10 in {
+
// ADDHN
def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))),
(ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
@@ -5343,6 +5353,8 @@ def : Pat<(concat_vectors (v2i32 V64:$Rd),
(SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
V128:$Rn, V128:$Rm)>;
+} // AddedComplexity = 10
+
//----------------------------------------------------------------------------
// AdvSIMD bitwise extract from vector instruction.
//----------------------------------------------------------------------------
@@ -5409,6 +5421,19 @@ def : Pat<(v4i32 (concat_vectors (v2i32 (trunc (v2i64 V128:$Vn))),
(v2i32 (trunc (v2i64 V128:$Vm))))),
(UZP1v4i32 V128:$Vn, V128:$Vm)>;
+def : Pat<(v16i8 (concat_vectors
+ (v8i8 (trunc (AArch64vlshr (v8i16 V128:$Vn), (i32 8)))),
+ (v8i8 (trunc (AArch64vlshr (v8i16 V128:$Vm), (i32 8)))))),
+ (UZP2v16i8 V128:$Vn, V128:$Vm)>;
+def : Pat<(v8i16 (concat_vectors
+ (v4i16 (trunc (AArch64vlshr (v4i32 V128:$Vn), (i32 16)))),
+ (v4i16 (trunc (AArch64vlshr (v4i32 V128:$Vm), (i32 16)))))),
+ (UZP2v8i16 V128:$Vn, V128:$Vm)>;
+def : Pat<(v4i32 (concat_vectors
+ (v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vn), (i32 32)))),
+ (v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vm), (i32 32)))))),
+ (UZP2v4i32 V128:$Vn, V128:$Vm)>;
+
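[Editor's illustration] A scalar model of what the new UZP2 patterns capture, shown for the v8i16 -> v16i8 case (the 32- and 64-bit element cases are analogous); this restates the lane arithmetic and is not code from the patch:

    #include <cstdint>

    // Truncating (lane >> 8) keeps the high byte of every 16-bit lane, i.e. the
    // odd-numbered bytes, and concatenating those bytes from both inputs is
    // exactly what UZP2 on byte elements produces.
    void uzp2_16b_model(const uint16_t Vn[8], const uint16_t Vm[8],
                        uint8_t Out[16]) {
      for (int I = 0; I < 8; ++I)
        Out[I] = uint8_t(Vn[I] >> 8);      // trunc(AArch64vlshr(Vn, 8))
      for (int I = 0; I < 8; ++I)
        Out[8 + I] = uint8_t(Vm[I] >> 8);  // trunc(AArch64vlshr(Vm, 8))
    }
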
//----------------------------------------------------------------------------
// AdvSIMD TBL/TBX instructions
//----------------------------------------------------------------------------
diff --git a/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp b/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
index 6c8845ee8598..79866c9b0a05 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
@@ -22,8 +22,8 @@ static bool needReorderStoreMI(const MachineInstr *MI) {
return false;
case AArch64::STURQi:
case AArch64::STRQui:
- if (MI->getMF()->getSubtarget<AArch64Subtarget>().isStoreAddressAscend())
- return false;
+ if (!MI->getMF()->getSubtarget<AArch64Subtarget>().isStoreAddressAscend())
+ return false;
LLVM_FALLTHROUGH;
case AArch64::STPQi:
return AArch64InstrInfo::getLdStOffsetOp(*MI).isImm();
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index c66f9cfd9c22..4032c4667bc7 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -278,10 +278,18 @@ def AArch64scvtf_mt : SDNode<"AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU", SDT_AArch
def AArch64fcvtzu_mt : SDNode<"AArch64ISD::FCVTZU_MERGE_PASSTHRU", SDT_AArch64FCVT>;
def AArch64fcvtzs_mt : SDNode<"AArch64ISD::FCVTZS_MERGE_PASSTHRU", SDT_AArch64FCVT>;
-def SDT_AArch64ReduceWithInit : SDTypeProfile<1, 3, [SDTCisVec<1>, SDTCisVec<3>]>;
-def AArch64clasta_n : SDNode<"AArch64ISD::CLASTA_N", SDT_AArch64ReduceWithInit>;
-def AArch64clastb_n : SDNode<"AArch64ISD::CLASTB_N", SDT_AArch64ReduceWithInit>;
-def AArch64fadda_p : SDNode<"AArch64ISD::FADDA_PRED", SDT_AArch64ReduceWithInit>;
+def SDT_AArch64ReduceWithInit : SDTypeProfile<1, 3,
+ [SDTCisVec<1>, SDTCVecEltisVT<1,i1>, SDTCisVec<3>, SDTCisSameNumEltsAs<1,3>]>;
+def AArch64clasta_n : SDNode<"AArch64ISD::CLASTA_N", SDT_AArch64ReduceWithInit>;
+def AArch64clastb_n : SDNode<"AArch64ISD::CLASTB_N", SDT_AArch64ReduceWithInit>;
+def AArch64fadda_p_node : SDNode<"AArch64ISD::FADDA_PRED", SDT_AArch64ReduceWithInit>;
+
+def AArch64fadda_p : PatFrags<(ops node:$op1, node:$op2, node:$op3),
+ [(AArch64fadda_p_node node:$op1, node:$op2, node:$op3),
+ (AArch64fadda_p_node (SVEAllActive), node:$op2,
+ (vselect node:$op1, node:$op3, (splat_vector (f32 fpimm_minus0)))),
+ (AArch64fadda_p_node (SVEAllActive), node:$op2,
+ (vselect node:$op1, node:$op3, (splat_vector (f64 fpimm_minus0))))]>;
def SDT_AArch64PTest : SDTypeProfile<0, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
def AArch64ptest : SDNode<"AArch64ISD::PTEST", SDT_AArch64PTest>;
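[Editor's illustration] The new AArch64fadda_p fragments pad inactive lanes with -0.0 (fpimm_minus0) rather than +0.0 because -0.0 is the identity of IEEE-754 addition; a tiny standalone check, purely illustrative:

    #include <cstdio>

    int main() {
      // x + (-0.0) == x for every x, including both zeros, so selecting -0.0
      // into the inactive lanes leaves the FADDA running sum unchanged.
      printf("%g %g\n", 0.0 + -0.0, -0.0 + -0.0); // "0 -0": both values preserved
      printf("%g\n", -0.0 + 0.0);                 // "0": +0.0 is not an identity
      return 0;
    }
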
@@ -447,6 +455,16 @@ let Predicates = [HasSVEorSME] in {
defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", AArch64fabs_mt>;
defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", AArch64fneg_mt>;
+ // zext(cmpeq(x, splat(0))) -> cnot(x)
+ def : Pat<(nxv16i8 (zext (nxv16i1 (AArch64setcc_z (nxv16i1 (SVEAllActive):$Pg), nxv16i8:$Op2, (SVEDup0), SETEQ)))),
+ (CNOT_ZPmZ_B $Op2, $Pg, $Op2)>;
+ def : Pat<(nxv8i16 (zext (nxv8i1 (AArch64setcc_z (nxv8i1 (SVEAllActive):$Pg), nxv8i16:$Op2, (SVEDup0), SETEQ)))),
+ (CNOT_ZPmZ_H $Op2, $Pg, $Op2)>;
+ def : Pat<(nxv4i32 (zext (nxv4i1 (AArch64setcc_z (nxv4i1 (SVEAllActive):$Pg), nxv4i32:$Op2, (SVEDup0), SETEQ)))),
+ (CNOT_ZPmZ_S $Op2, $Pg, $Op2)>;
+ def : Pat<(nxv2i64 (zext (nxv2i1 (AArch64setcc_z (nxv2i1 (SVEAllActive):$Pg), nxv2i64:$Op2, (SVEDup0), SETEQ)))),
+ (CNOT_ZPmZ_D $Op2, $Pg, $Op2)>;
+
defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", "SMAX_ZPZZ", int_aarch64_sve_smax, DestructiveBinaryComm>;
defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", "UMAX_ZPZZ", int_aarch64_sve_umax, DestructiveBinaryComm>;
defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin", "SMIN_ZPZZ", int_aarch64_sve_smin, DestructiveBinaryComm>;
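[Editor's illustration] Element-wise, the zext(cmpeq(x, splat(0))) patterns above are exactly the CNOT operation; a scalar sketch for clarity (illustrative only):

    #include <cstdint>

    // Per active element: 1 if the input is zero, 0 otherwise, which is what
    // zero-extending the i1 result of (x == 0) produces.
    uint32_t cnotElement(uint32_t X) {
      return X == 0 ? 1 : 0;
    }
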
@@ -857,6 +875,16 @@ let Predicates = [HasSVEorSME] in {
defm LD1RQ_W : sve_mem_ldqr_ss<0b10, "ld1rqw", Z_s, ZPR32, GPR64NoXZRshifted32>;
defm LD1RQ_D : sve_mem_ldqr_ss<0b11, "ld1rqd", Z_d, ZPR64, GPR64NoXZRshifted64>;
+ let AddedComplexity = 1 in {
+ class LD1RQPat<ValueType vt1, ValueType vt2, SDPatternOperator op, Instruction load_instr, Instruction ptrue> :
+ Pat<(vt1 (op (vt1 (vector_insert_subvec (vt1 undef), (vt2 (load GPR64sp:$Xn)), (i64 0))), (i64 0))),
+ (load_instr (ptrue 31), GPR64sp:$Xn, 0)>;
+ }
+ def : LD1RQPat<nxv16i8, v16i8, AArch64duplane128, LD1RQ_B_IMM, PTRUE_B>;
+ def : LD1RQPat<nxv8i16, v8i16, AArch64duplane128, LD1RQ_H_IMM, PTRUE_H>;
+ def : LD1RQPat<nxv4i32, v4i32, AArch64duplane128, LD1RQ_W_IMM, PTRUE_S>;
+ def : LD1RQPat<nxv2i64, v2i64, AArch64duplane128, LD1RQ_D_IMM, PTRUE_D>;
+
// continuous load with reg+reg addressing.
defm LD1B : sve_mem_cld_ss<0b0000, "ld1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
defm LD1B_H : sve_mem_cld_ss<0b0001, "ld1b", Z_h, ZPR16, GPR64NoXZRshifted8>;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 3f9795f5198b..47e4c6589c26 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -128,7 +128,7 @@ static cl::opt<bool>
static cl::opt<bool>
EnableGEPOpt("aarch64-enable-gep-opt", cl::Hidden,
cl::desc("Enable optimizations on complex GEPs"),
- cl::init(false));
+ cl::init(true));
static cl::opt<bool>
BranchRelaxation("aarch64-enable-branch-relax", cl::Hidden, cl::init(true),
@@ -563,17 +563,6 @@ void AArch64PassConfig::addIRPasses() {
addPass(createFalkorMarkStridedAccessesPass());
}
- TargetPassConfig::addIRPasses();
-
- addPass(createAArch64StackTaggingPass(
- /*IsOptNone=*/TM->getOptLevel() == CodeGenOpt::None));
-
- // Match interleaved memory accesses to ldN/stN intrinsics.
- if (TM->getOptLevel() != CodeGenOpt::None) {
- addPass(createInterleavedLoadCombinePass());
- addPass(createInterleavedAccessPass());
- }
-
if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
// Call SeparateConstOffsetFromGEP pass to extract constants within indices
// and lower a GEP with multiple indices to either arithmetic operations or
@@ -587,6 +576,17 @@ void AArch64PassConfig::addIRPasses() {
addPass(createLICMPass());
}
+ TargetPassConfig::addIRPasses();
+
+ addPass(createAArch64StackTaggingPass(
+ /*IsOptNone=*/TM->getOptLevel() == CodeGenOpt::None));
+
+ // Match interleaved memory accesses to ldN/stN intrinsics.
+ if (TM->getOptLevel() != CodeGenOpt::None) {
+ addPass(createInterleavedLoadCombinePass());
+ addPass(createInterleavedAccessPass());
+ }
+
// Add Control Flow Guard checks.
if (TM->getTargetTriple().isOSWindows())
addPass(createCFGuardCheckPass());
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 274a025e82a0..66617393c9ae 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -22,6 +22,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -37,6 +38,74 @@ static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
cl::init(10), cl::Hidden);
+class TailFoldingKind {
+private:
+ uint8_t Bits = 0; // Currently defaults to disabled.
+
+public:
+ enum TailFoldingOpts {
+ TFDisabled = 0x0,
+ TFReductions = 0x01,
+ TFRecurrences = 0x02,
+ TFSimple = 0x80,
+ TFAll = TFReductions | TFRecurrences | TFSimple
+ };
+
+ void operator=(const std::string &Val) {
+ if (Val.empty())
+ return;
+ SmallVector<StringRef, 6> TailFoldTypes;
+ StringRef(Val).split(TailFoldTypes, '+', -1, false);
+ for (auto TailFoldType : TailFoldTypes) {
+ if (TailFoldType == "disabled")
+ Bits = 0;
+ else if (TailFoldType == "all")
+ Bits = TFAll;
+ else if (TailFoldType == "default")
+ Bits = 0; // Currently defaults to never tail-folding.
+ else if (TailFoldType == "simple")
+ add(TFSimple);
+ else if (TailFoldType == "reductions")
+ add(TFReductions);
+ else if (TailFoldType == "recurrences")
+ add(TFRecurrences);
+ else if (TailFoldType == "noreductions")
+ remove(TFReductions);
+ else if (TailFoldType == "norecurrences")
+ remove(TFRecurrences);
+ else {
+ errs()
+ << "invalid argument " << TailFoldType.str()
+ << " to -sve-tail-folding=; each element must be one of: disabled, "
+ "all, default, simple, reductions, noreductions, recurrences, "
+ "norecurrences\n";
+ }
+ }
+ }
+
+ operator uint8_t() const { return Bits; }
+
+ void add(uint8_t Flag) { Bits |= Flag; }
+ void remove(uint8_t Flag) { Bits &= ~Flag; }
+};
+
+TailFoldingKind TailFoldingKindLoc;
+
+cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding(
+ "sve-tail-folding",
+ cl::desc(
+ "Control the use of vectorisation using tail-folding for SVE:"
+ "\ndisabled No loop types will vectorize using tail-folding"
+ "\ndefault Uses the default tail-folding settings for the target "
+ "CPU"
+ "\nall All legal loop types will vectorize using tail-folding"
+ "\nsimple Use tail-folding for simple loops (not reductions or "
+ "recurrences)"
+ "\nreductions Use tail-folding for loops containing reductions"
+ "\nrecurrences Use tail-folding for loops containing first order "
+ "recurrences"),
+ cl::location(TailFoldingKindLoc));
+
bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -2955,3 +3024,20 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}
+
+bool AArch64TTIImpl::preferPredicateOverEpilogue(
+ Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
+ TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL) {
+ if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled)
+ return false;
+
+ TailFoldingKind Required; // Defaults to 0.
+ if (LVL->getReductionVars().size())
+ Required.add(TailFoldingKind::TFReductions);
+ if (LVL->getFirstOrderRecurrences().size())
+ Required.add(TailFoldingKind::TFRecurrences);
+ if (!Required)
+ Required.add(TailFoldingKind::TFSimple);
+
+ return (TailFoldingKindLoc & Required) == Required;
+}
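[Editor's illustration] Putting the TailFoldingKind bits and preferPredicateOverEpilogue together, the gating reduces to a small mask check; the sketch below restates it outside LLVM's class structure, with illustrative names:

    #include <cstdint>

    enum : uint8_t { TFReductions = 0x01, TFRecurrences = 0x02, TFSimple = 0x80 };

    // Tail-fold a loop only if every feature it needs was enabled via
    // -sve-tail-folding; loops with neither reductions nor first-order
    // recurrences fall into the "simple" bucket.
    bool allowTailFolding(uint8_t Enabled, bool HasReductions,
                          bool HasRecurrences) {
      uint8_t Required = 0;
      if (HasReductions)
        Required |= TFReductions;
      if (HasRecurrences)
        Required |= TFRecurrences;
      if (!Required)
        Required = TFSimple;
      return (Enabled & Required) == Required;
    }
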
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 59ec91843266..2231f8705998 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -340,6 +340,11 @@ public:
return PredicationStyle::None;
}
+ bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+ AssumptionCache &AC, TargetLibraryInfo *TLI,
+ DominatorTree *DT,
+ LoopVectorizationLegality *LVL);
+
bool supportsScalableVectors() const { return ST->hasSVE(); }
bool enableScalableVectorization() const { return ST->hasSVE(); }
@@ -347,6 +352,11 @@ public:
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc,
ElementCount VF) const;
+ bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const {
+ return ST->hasSVE();
+ }
+
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
Optional<FastMathFlags> FMF,
TTI::TargetCostKind CostKind);
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index f129bfe11e4d..3fe3b2a69855 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -231,7 +231,70 @@ void AArch64_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) {
{codeview::RegisterId::ARM64_Q29, AArch64::Q29},
{codeview::RegisterId::ARM64_Q30, AArch64::Q30},
{codeview::RegisterId::ARM64_Q31, AArch64::Q31},
-
+ {codeview::RegisterId::ARM64_B0, AArch64::B0},
+ {codeview::RegisterId::ARM64_B1, AArch64::B1},
+ {codeview::RegisterId::ARM64_B2, AArch64::B2},
+ {codeview::RegisterId::ARM64_B3, AArch64::B3},
+ {codeview::RegisterId::ARM64_B4, AArch64::B4},
+ {codeview::RegisterId::ARM64_B5, AArch64::B5},
+ {codeview::RegisterId::ARM64_B6, AArch64::B6},
+ {codeview::RegisterId::ARM64_B7, AArch64::B7},
+ {codeview::RegisterId::ARM64_B8, AArch64::B8},
+ {codeview::RegisterId::ARM64_B9, AArch64::B9},
+ {codeview::RegisterId::ARM64_B10, AArch64::B10},
+ {codeview::RegisterId::ARM64_B11, AArch64::B11},
+ {codeview::RegisterId::ARM64_B12, AArch64::B12},
+ {codeview::RegisterId::ARM64_B13, AArch64::B13},
+ {codeview::RegisterId::ARM64_B14, AArch64::B14},
+ {codeview::RegisterId::ARM64_B15, AArch64::B15},
+ {codeview::RegisterId::ARM64_B16, AArch64::B16},
+ {codeview::RegisterId::ARM64_B17, AArch64::B17},
+ {codeview::RegisterId::ARM64_B18, AArch64::B18},
+ {codeview::RegisterId::ARM64_B19, AArch64::B19},
+ {codeview::RegisterId::ARM64_B20, AArch64::B20},
+ {codeview::RegisterId::ARM64_B21, AArch64::B21},
+ {codeview::RegisterId::ARM64_B22, AArch64::B22},
+ {codeview::RegisterId::ARM64_B23, AArch64::B23},
+ {codeview::RegisterId::ARM64_B24, AArch64::B24},
+ {codeview::RegisterId::ARM64_B25, AArch64::B25},
+ {codeview::RegisterId::ARM64_B26, AArch64::B26},
+ {codeview::RegisterId::ARM64_B27, AArch64::B27},
+ {codeview::RegisterId::ARM64_B28, AArch64::B28},
+ {codeview::RegisterId::ARM64_B29, AArch64::B29},
+ {codeview::RegisterId::ARM64_B30, AArch64::B30},
+ {codeview::RegisterId::ARM64_B31, AArch64::B31},
+ {codeview::RegisterId::ARM64_H0, AArch64::H0},
+ {codeview::RegisterId::ARM64_H1, AArch64::H1},
+ {codeview::RegisterId::ARM64_H2, AArch64::H2},
+ {codeview::RegisterId::ARM64_H3, AArch64::H3},
+ {codeview::RegisterId::ARM64_H4, AArch64::H4},
+ {codeview::RegisterId::ARM64_H5, AArch64::H5},
+ {codeview::RegisterId::ARM64_H6, AArch64::H6},
+ {codeview::RegisterId::ARM64_H7, AArch64::H7},
+ {codeview::RegisterId::ARM64_H8, AArch64::H8},
+ {codeview::RegisterId::ARM64_H9, AArch64::H9},
+ {codeview::RegisterId::ARM64_H10, AArch64::H10},
+ {codeview::RegisterId::ARM64_H11, AArch64::H11},
+ {codeview::RegisterId::ARM64_H12, AArch64::H12},
+ {codeview::RegisterId::ARM64_H13, AArch64::H13},
+ {codeview::RegisterId::ARM64_H14, AArch64::H14},
+ {codeview::RegisterId::ARM64_H15, AArch64::H15},
+ {codeview::RegisterId::ARM64_H16, AArch64::H16},
+ {codeview::RegisterId::ARM64_H17, AArch64::H17},
+ {codeview::RegisterId::ARM64_H18, AArch64::H18},
+ {codeview::RegisterId::ARM64_H19, AArch64::H19},
+ {codeview::RegisterId::ARM64_H20, AArch64::H20},
+ {codeview::RegisterId::ARM64_H21, AArch64::H21},
+ {codeview::RegisterId::ARM64_H22, AArch64::H22},
+ {codeview::RegisterId::ARM64_H23, AArch64::H23},
+ {codeview::RegisterId::ARM64_H24, AArch64::H24},
+ {codeview::RegisterId::ARM64_H25, AArch64::H25},
+ {codeview::RegisterId::ARM64_H26, AArch64::H26},
+ {codeview::RegisterId::ARM64_H27, AArch64::H27},
+ {codeview::RegisterId::ARM64_H28, AArch64::H28},
+ {codeview::RegisterId::ARM64_H29, AArch64::H29},
+ {codeview::RegisterId::ARM64_H30, AArch64::H30},
+ {codeview::RegisterId::ARM64_H31, AArch64::H31},
};
for (const auto &I : RegMap)
MRI->mapLLVMRegToCVReg(I.Reg, static_cast<int>(I.CVReg));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 48b5814cd482..2d6f1438e315 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -585,6 +585,12 @@ def FeatureMAIInsts : SubtargetFeature<"mai-insts",
"Has mAI instructions"
>;
+def FeatureFP8Insts : SubtargetFeature<"fp8-insts",
+ "HasFP8Insts",
+ "true",
+ "Has fp8 and bf8 instructions"
+>;
+
def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst",
"HasPkFmacF16Inst",
"true",
@@ -1124,6 +1130,7 @@ def FeatureISAVersion9_4_0 : FeatureSet<
Feature64BitDPP,
FeaturePackedFP32Ops,
FeatureMAIInsts,
+ FeatureFP8Insts,
FeaturePkFmacF16Inst,
FeatureAtomicFaddRtnInsts,
FeatureAtomicFaddNoRtnInsts,
@@ -1265,11 +1272,14 @@ def FeatureISAVersion11_Common : FeatureSet<
FeaturePackedTID,
FeatureVcmpxPermlaneHazard]>;
-// Features for GFX 11.0.0 and 11.0.1
-def FeatureISAVersion11_0 : FeatureSet<
+def FeatureISAVersion11_0_0 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
[FeatureUserSGPRInit16Bug])>;
+def FeatureISAVersion11_0_1 : FeatureSet<
+ !listconcat(FeatureISAVersion11_Common.Features,
+ [])>;
+
def FeatureISAVersion11_0_2 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
[FeatureUserSGPRInit16Bug])>;
@@ -1704,6 +1714,9 @@ def HasSMemTimeInst : Predicate<"Subtarget->hasSMemTimeInst()">,
def HasShaderCyclesRegister : Predicate<"Subtarget->hasShaderCyclesRegister()">,
AssemblerPredicate<(all_of FeatureShaderCyclesRegister)>;
+def HasFP8Insts : Predicate<"Subtarget->hasFP8Insts()">,
+ AssemblerPredicate<(all_of FeatureFP8Insts)>;
+
def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">,
AssemblerPredicate<(all_of FeaturePkFmacF16Inst)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index d28f38e42430..d361e33995cf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -74,6 +74,7 @@ void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const {
<< " WorkGroupIDY: " << FI.second.WorkGroupIDY
<< " WorkGroupIDZ: " << FI.second.WorkGroupIDZ
<< " WorkGroupInfo: " << FI.second.WorkGroupInfo
+ << " LDSKernelId: " << FI.second.LDSKernelId
<< " PrivateSegmentWaveByteOffset: "
<< FI.second.PrivateSegmentWaveByteOffset
<< " ImplicitBufferPtr: " << FI.second.ImplicitBufferPtr
@@ -107,6 +108,9 @@ AMDGPUFunctionArgInfo::getPreloadedValue(
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
return std::make_tuple(WorkGroupIDZ ? &WorkGroupIDZ : nullptr,
&AMDGPU::SGPR_32RegClass, LLT::scalar(32));
+ case AMDGPUFunctionArgInfo::LDS_KERNEL_ID:
+ return std::make_tuple(LDSKernelId ? &LDSKernelId : nullptr,
+ &AMDGPU::SGPR_32RegClass, LLT::scalar(32));
case AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
return std::make_tuple(
PrivateSegmentWaveByteOffset ? &PrivateSegmentWaveByteOffset : nullptr,
@@ -162,6 +166,7 @@ constexpr AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() {
AI.WorkGroupIDX = ArgDescriptor::createRegister(AMDGPU::SGPR12);
AI.WorkGroupIDY = ArgDescriptor::createRegister(AMDGPU::SGPR13);
AI.WorkGroupIDZ = ArgDescriptor::createRegister(AMDGPU::SGPR14);
+ AI.LDSKernelId = ArgDescriptor::createRegister(AMDGPU::SGPR15);
const unsigned Mask = 0x3ff;
AI.WorkItemIDX = ArgDescriptor::createRegister(AMDGPU::VGPR31, Mask);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index e9ed45d8cd14..f595e469f998 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -103,6 +103,7 @@ struct AMDGPUFunctionArgInfo {
KERNARG_SEGMENT_PTR = 3,
DISPATCH_ID = 4,
FLAT_SCRATCH_INIT = 5,
+ LDS_KERNEL_ID = 6, // LLVM internal, not part of the ABI
WORKGROUP_ID_X = 10,
WORKGROUP_ID_Y = 11,
WORKGROUP_ID_Z = 12,
@@ -128,6 +129,7 @@ struct AMDGPUFunctionArgInfo {
ArgDescriptor DispatchID;
ArgDescriptor FlatScratchInit;
ArgDescriptor PrivateSegmentSize;
+ ArgDescriptor LDSKernelId;
// System SGPRs in kernels.
ArgDescriptor WorkGroupIDX;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 57a4660bc1eb..13a65f1ad601 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -27,8 +27,10 @@
#include "SIMachineFunctionInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
@@ -415,6 +417,10 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
}
+ if (CurrentProgramInfo.DynamicCallStack) {
+ KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK;
+ }
+
return KernelCodeProperties;
}
@@ -506,6 +512,9 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
emitFunctionBody();
+ emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
+ STM.hasMAIInsts());
+
if (isVerbose()) {
MCSectionELF *CommentSection =
Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
@@ -875,6 +884,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
LDSAlignShift = 9;
}
+ ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
+ ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
+
ProgInfo.LDSSize = MFI->getLDSSize();
ProgInfo.LDSBlocks =
alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
@@ -1180,3 +1192,58 @@ void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<AMDGPUResourceUsageAnalysis>();
AsmPrinter::getAnalysisUsage(AU);
}
+
+void AMDGPUAsmPrinter::emitResourceUsageRemarks(
+ const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
+ bool isModuleEntryFunction, bool hasMAIInsts) {
+ if (!ORE)
+ return;
+
+ const char *Name = "kernel-resource-usage";
+ const char *Indent = " ";
+
+ // If the remark is not specifically enabled, do not output to yaml
+ LLVMContext &Ctx = MF.getFunction().getContext();
+ if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name))
+ return;
+
+ auto EmitResourceUsageRemark = [&](StringRef RemarkName,
+ StringRef RemarkLabel, auto Argument) {
+ // Add an indent for every line besides the line with the kernel name. This
+ // makes it easier to tell which resource usage goes with which kernel since
+ // the kernel name will always be displayed first.
+ std::string LabelStr = RemarkLabel.str() + ": ";
+ if (!RemarkName.equals("FunctionName"))
+ LabelStr = Indent + LabelStr;
+
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(Name, RemarkName,
+ MF.getFunction().getSubprogram(),
+ &MF.front())
+ << LabelStr << ore::NV(RemarkName, Argument);
+ });
+ };
+
+ // FIXME: Formatting here is pretty nasty because clang does not accept
+ // newlines from diagnostics. This forces us to emit multiple diagnostic
+ // remarks to simulate newlines. If and when clang does accept newlines, this
+ // formatting should be aggregated into one remark with newlines to avoid
+ // printing multiple diagnostic location and diag opts.
+ EmitResourceUsageRemark("FunctionName", "Function Name",
+ MF.getFunction().getName());
+ EmitResourceUsageRemark("NumSGPR", "SGPRs", CurrentProgramInfo.NumSGPR);
+ EmitResourceUsageRemark("NumVGPR", "VGPRs", CurrentProgramInfo.NumArchVGPR);
+ if (hasMAIInsts)
+ EmitResourceUsageRemark("NumAGPR", "AGPRs", CurrentProgramInfo.NumAccVGPR);
+ EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
+ CurrentProgramInfo.ScratchSize);
+ EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
+ CurrentProgramInfo.Occupancy);
+ EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
+ CurrentProgramInfo.SGPRSpill);
+ EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
+ CurrentProgramInfo.VGPRSpill);
+ if (isModuleEntryFunction)
+ EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
+ CurrentProgramInfo.LDSSize);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index ddda2cf107b1..2881b8d7bcca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -69,6 +69,9 @@ private:
uint64_t ScratchSize,
uint64_t CodeSize,
const AMDGPUMachineFunction* MFI);
+ void emitResourceUsageRemarks(const MachineFunction &MF,
+ const SIProgramInfo &CurrentProgramInfo,
+ bool isModuleEntryFunction, bool hasMAIInsts);
uint16_t getAmdhsaKernelCodeProperties(
const MachineFunction &MF) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def
index 0a2cf3874245..c7a060c5db5b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def
@@ -27,5 +27,6 @@ AMDGPU_ATTRIBUTE(WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z")
AMDGPU_ATTRIBUTE(WORKITEM_ID_X, "amdgpu-no-workitem-id-x")
AMDGPU_ATTRIBUTE(WORKITEM_ID_Y, "amdgpu-no-workitem-id-y")
AMDGPU_ATTRIBUTE(WORKITEM_ID_Z, "amdgpu-no-workitem-id-z")
+AMDGPU_ATTRIBUTE(LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id")
#undef AMDGPU_ATTRIBUTE
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 8de0d7e6bff1..a3634d2440c3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -72,6 +72,8 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
case Intrinsic::amdgcn_workgroup_id_z:
case Intrinsic::r600_read_tgid_z:
return WORKGROUP_ID_Z;
+ case Intrinsic::amdgcn_lds_kernel_id:
+ return LDS_KERNEL_ID;
case Intrinsic::amdgcn_dispatch_ptr:
return DISPATCH_PTR;
case Intrinsic::amdgcn_dispatch_id:
@@ -457,6 +459,10 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
removeAssumedBits(QUEUE_PTR);
}
+ if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
+ removeAssumedBits(LDS_KERNEL_ID);
+ }
+
return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
: ChangeStatus::UNCHANGED;
}
@@ -591,6 +597,16 @@ private:
return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
UsedAssumedInformation);
}
+
+ bool funcRetrievesLDSKernelId(Attributor &A) {
+ auto DoesNotRetrieve = [&](Instruction &I) {
+ auto &Call = cast<CallBase>(I);
+ return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
+ };
+ bool UsedAssumedInformation = false;
+ return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
+ UsedAssumedInformation);
+ }
};
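[Editor's illustration] The double negation in the funcRetrievesLDSKernelId helper above can be easy to misread; a sketch of its shape, assuming checkForAllCallLikeInstructions returns true only when every call satisfies the callback:

    #include <vector>

    // Answers "does any call-like instruction retrieve the LDS kernel id?" by
    // negating "every call does NOT retrieve it" (the DoesNotRetrieve callback).
    bool funcRetrievesLDSKernelId(const std::vector<bool> &CallIsLDSKernelId) {
      bool AllCallsAreNot = true;
      for (bool IsIt : CallIsLDSKernelId)
        AllCallsAreNot = AllCallsAreNot && !IsIt;
      return !AllCallsAreNot;
    }
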
AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
@@ -743,7 +759,8 @@ public:
AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
DenseSet<const char *> Allowed(
{&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
- &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID, &AAPointerInfo::ID});
+ &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID,
+ &AAPointerInfo::ID});
AttributorConfig AC(CGUpdater);
AC.Allowed = &Allowed;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index fd812eb676ef..4550cfdcf883 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -764,7 +764,8 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
AMDGPUFunctionArgInfo::DISPATCH_ID,
AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
- AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
+ AMDGPUFunctionArgInfo::LDS_KERNEL_ID,
};
static constexpr StringLiteral ImplicitAttrNames[] = {
@@ -774,7 +775,8 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
"amdgpu-no-dispatch-id",
"amdgpu-no-workgroup-id-x",
"amdgpu-no-workgroup-id-y",
- "amdgpu-no-workgroup-id-z"
+ "amdgpu-no-workgroup-id-z",
+ "amdgpu-no-lds-kernel-id",
};
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -810,6 +812,14 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy);
} else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
+ } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
+ Optional<uint32_t> Id =
+ AMDGPUMachineFunction::getLDSKernelIdMetadata(MF.getFunction());
+ if (Id.has_value()) {
+ MIRBuilder.buildConstant(InputReg, Id.value());
+ } else {
+ MIRBuilder.buildUndef(InputReg);
+ }
} else {
// We may have proven the input wasn't needed, although the ABI is
// requiring it. We just need to allocate the register appropriately.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 5747fc0ca8e6..229dfb62ef6e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -88,6 +88,10 @@ def gi_smrd_sgpr :
GIComplexOperandMatcher<s64, "selectSmrdSgpr">,
GIComplexPatternEquiv<SMRDSgpr>;
+def gi_smrd_sgpr_imm :
+ GIComplexOperandMatcher<s64, "selectSmrdSgprImm">,
+ GIComplexPatternEquiv<SMRDSgprImm>;
+
def gi_flat_offset :
GIComplexOperandMatcher<s64, "selectFlatOffset">,
GIComplexPatternEquiv<FlatOffset>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 6fa44ffcbfaa..632a76b32009 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -875,6 +875,8 @@ MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF,
Kern.getDocument()->getNode(ProgramInfo.LDSSize);
Kern[".private_segment_fixed_size"] =
Kern.getDocument()->getNode(ProgramInfo.ScratchSize);
+ Kern[".uses_dynamic_stack"] =
+ Kern.getDocument()->getNode(ProgramInfo.DynamicCallStack);
// FIXME: The metadata treats the minimum as 16?
Kern[".kernarg_segment_align"] =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 589992c7a7ec..147c8850587e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -33,7 +33,7 @@
#include "llvm/IR/Dominators.h"
#endif
-#define DEBUG_TYPE "isel"
+#define DEBUG_TYPE "amdgpu-isel"
using namespace llvm;
@@ -1886,21 +1886,21 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
// Match an immediate (if Imm is true) or an SGPR (if Imm is false)
// offset. If Imm32Only is true, match only 32-bit immediate offsets
// available on CI.
-bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
- SDValue &Offset, bool Imm,
+bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue Addr, SDValue ByteOffsetNode,
+ SDValue *SOffset, SDValue *Offset,
bool Imm32Only) const {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
if (!C) {
- if (Imm)
+ if (!SOffset)
return false;
if (ByteOffsetNode.getValueType().isScalarInteger() &&
ByteOffsetNode.getValueType().getSizeInBits() == 32) {
- Offset = ByteOffsetNode;
+ *SOffset = ByteOffsetNode;
return true;
}
if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
- Offset = ByteOffsetNode.getOperand(0);
+ *SOffset = ByteOffsetNode.getOperand(0);
return true;
}
}
@@ -1912,8 +1912,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
int64_t ByteOffset = C->getSExtValue();
Optional<int64_t> EncodedOffset =
AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, false);
- if (EncodedOffset && Imm && !Imm32Only) {
- Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
+ if (EncodedOffset && Offset && !Imm32Only) {
+ *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
return true;
}
@@ -1922,17 +1922,17 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
return false;
EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
- if (EncodedOffset && Imm32Only) {
- Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
+ if (EncodedOffset && Offset && Imm32Only) {
+ *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
return true;
}
if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
return false;
- if (!Imm) {
+ if (SOffset) {
SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
- Offset = SDValue(
+ *SOffset = SDValue(
CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
return true;
}
@@ -1968,11 +1968,18 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
// Match a base and an immediate (if Imm is true) or an SGPR
// (if Imm is false) offset. If Imm32Only is true, match only 32-bit
// immediate offsets available on CI.
-bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
- SDValue &Offset, bool Imm,
- bool Imm32Only) const {
+bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
+ SDValue *SOffset, SDValue *Offset,
+ bool Imm32Only) const {
SDLoc SL(Addr);
+ if (SOffset && Offset) {
+ assert(!Imm32Only);
+ SDValue B;
+ return SelectSMRDBaseOffset(Addr, B, nullptr, Offset) &&
+ SelectSMRDBaseOffset(B, SBase, SOffset, nullptr);
+ }
+
// A 32-bit (address + offset) should not cause unsigned 32-bit integer
// wraparound, because s_load instructions perform the addition in 64 bits.
if ((Addr.getValueType() != MVT::i32 ||
@@ -1987,34 +1994,55 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
assert(N0 && N1 && isa<ConstantSDNode>(N1));
}
if (N0 && N1) {
- if (SelectSMRDOffset(N1, Offset, Imm, Imm32Only)) {
- SBase = Expand32BitAddress(N0);
+ if (SelectSMRDOffset(N0, N1, SOffset, Offset, Imm32Only)) {
+ SBase = N0;
+ return true;
+ }
+ if (SelectSMRDOffset(N1, N0, SOffset, Offset, Imm32Only)) {
+ SBase = N1;
return true;
}
}
return false;
}
- if (!Imm)
+ if (Offset && !SOffset) {
+ SBase = Addr;
+ *Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
+ return true;
+ }
+ return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
+ SDValue *SOffset, SDValue *Offset,
+ bool Imm32Only) const {
+ if (!SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only))
return false;
- SBase = Expand32BitAddress(Addr);
- Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
+ SBase = Expand32BitAddress(SBase);
return true;
}
bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
- return SelectSMRD(Addr, SBase, Offset, /* Imm */ true);
+ return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
}
bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
- return SelectSMRD(Addr, SBase, Offset, /* Imm */ true, /* Imm32Only */ true);
+ return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
+ /* Imm32Only */ true);
}
bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
- SDValue &Offset) const {
- return SelectSMRD(Addr, SBase, Offset, /* Imm */ false);
+ SDValue &SOffset) const {
+ return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
+ SDValue &SOffset,
+ SDValue &Offset) const {
+ return SelectSMRD(Addr, SBase, &SOffset, &Offset);
}
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
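For illustration, a minimal standalone sketch of the address shape the new SGPR+IMM selection decomposes; the struct and function names here are invented for the example and are not part of the patch. SelectSMRDBaseOffset peels off the immediate first, then splits the remaining sum into the 64-bit base and a zero-extended 32-bit SGPR offset, which is safe because s_load performs the additions in 64 bits.

#include <cstdint>

// Invented names; models addr = SBase + zext(SOffset) + Imm, the form the
// new SelectSMRDSgprImm() path matches.
struct SMRDSgprImmAddr {
  uint64_t SBase;   // 64-bit scalar base
  uint32_t SOffset; // 32-bit SGPR offset, zero-extended by the hardware
  int64_t Imm;      // immediate offset, assumed already encodable
};

// The additions happen in 64 bits, so the 32-bit components cannot wrap.
static uint64_t effectiveAddress(const SMRDSgprImmAddr &A) {
  return A.SBase + static_cast<uint64_t>(A.SOffset) +
         static_cast<uint64_t>(A.Imm);
}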
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 7894b8eb5b67..fda2bfac71fc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -193,14 +193,18 @@ private:
bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &SAddr, SDValue &Offset) const;
- bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool Imm,
- bool Imm32Only) const;
+ bool SelectSMRDOffset(SDValue Base, SDValue ByteOffsetNode, SDValue *SOffset,
+ SDValue *Offset, bool Imm32Only = false) const;
SDValue Expand32BitAddress(SDValue Addr) const;
- bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, bool Imm,
- bool Imm32Only = false) const;
+ bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset,
+ SDValue *Offset, bool Imm32Only = false) const;
+ bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset,
+ SDValue *Offset, bool Imm32Only = false) const;
bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
- bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
+ bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &SOffset) const;
+ bool SelectSMRDSgprImm(SDValue Addr, SDValue &SBase, SDValue &SOffset,
+ SDValue &Offset) const;
bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 70fae9d784a2..f2e5c2fe00e8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1006,6 +1006,14 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
return selectSMFMACIntrin(I);
default:
return selectImpl(I, *CoverageInfo);
@@ -2361,7 +2369,7 @@ void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
return;
- GEPInfo GEPInfo(*PtrMI);
+ GEPInfo GEPInfo;
for (unsigned i = 1; i != 3; ++i) {
const MachineOperand &GEPOp = PtrMI->getOperand(i);
@@ -3237,6 +3245,8 @@ static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
return Register();
+ assert(Def->getNumOperands() == 3 &&
+ MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
return Def->getOperand(1).getReg();
}
@@ -3354,6 +3364,30 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
break;
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
+ Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
+ Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
+ Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
+ Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
+ Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
+ Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
+ Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
+ Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
+ break;
default:
llvm_unreachable("unhandled smfmac intrinsic");
}
@@ -3800,25 +3834,82 @@ AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
}};
}
-InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
+bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
+ Register &Base,
+ Register *SOffset,
+ int64_t *Offset) const {
+ MachineInstr *MI = Root.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+
+ // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
+ // then we can select all ptr + 32-bit offsets.
SmallVector<GEPInfo, 4> AddrInfo;
- getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
+ getAddrModeInfo(*MI, *MRI, AddrInfo);
- if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
- return None;
+ if (AddrInfo.empty())
+ return false;
- const GEPInfo &GEPInfo = AddrInfo[0];
+ const GEPInfo &GEPI = AddrInfo[0];
Optional<int64_t> EncodedImm =
- AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
- if (!EncodedImm)
+ AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, false);
+
+ if (SOffset && Offset) {
+ if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
+ AddrInfo.size() > 1) {
+ const GEPInfo &GEPI2 = AddrInfo[1];
+ if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
+ if (Register OffsetReg =
+ matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
+ Base = GEPI2.SgprParts[0];
+ *SOffset = OffsetReg;
+ *Offset = *EncodedImm;
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
+ Base = GEPI.SgprParts[0];
+ *Offset = *EncodedImm;
+ return true;
+ }
+
+ // SGPR offset is unsigned.
+ if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
+ GEPI.Imm != 0) {
+ // If we make it this far we have a load with a 32-bit immediate offset.
+ // It is OK to select this using an SGPR offset, because we have already
+ // failed trying to select this load into one of the _IMM variants since
+ // the _IMM patterns are considered before the _SGPR patterns.
+ Base = GEPI.SgprParts[0];
+ *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
+ .addImm(GEPI.Imm);
+ return true;
+ }
+
+ if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
+ if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
+ Base = GEPI.SgprParts[0];
+ *SOffset = OffsetReg;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
+ Register Base;
+ int64_t Offset;
+ if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
return None;
- unsigned PtrReg = GEPInfo.SgprParts[0];
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
- }};
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
}
InstructionSelector::ComplexRendererFns
@@ -3844,43 +3935,24 @@ AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
- MachineInstr *MI = Root.getParent();
- MachineBasicBlock *MBB = MI->getParent();
-
- SmallVector<GEPInfo, 4> AddrInfo;
- getAddrModeInfo(*MI, *MRI, AddrInfo);
-
- // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
- // then we can select all ptr + 32-bit offsets.
- if (AddrInfo.empty())
+ Register Base, SOffset;
+ if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
return None;
- const GEPInfo &GEPInfo = AddrInfo[0];
- Register PtrReg = GEPInfo.SgprParts[0];
-
- // SGPR offset is unsigned.
- if (AddrInfo[0].SgprParts.size() == 1 && isUInt<32>(GEPInfo.Imm) &&
- GEPInfo.Imm != 0) {
- // If we make it this far we have a load with an 32-bit immediate offset.
- // It is OK to select this using a sgpr offset, because we have already
- // failed trying to select this load into one of the _IMM variants since
- // the _IMM Patterns are considered before the _SGPR patterns.
- Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
- .addImm(GEPInfo.Imm);
- return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
- [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }}};
- }
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
+}
- if (AddrInfo[0].SgprParts.size() == 2 && GEPInfo.Imm == 0) {
- if (Register OffsetReg =
- matchZeroExtendFromS32(*MRI, GEPInfo.SgprParts[1])) {
- return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
- [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }}};
- }
- }
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
+ Register Base, SOffset;
+ int64_t Offset;
+ if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
+ return None;
- return None;
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
}
std::pair<Register, int>
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 22672ba59e76..5baf55d23480 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -63,11 +63,9 @@ public:
private:
struct GEPInfo {
- const MachineInstr &GEP;
SmallVector<unsigned, 2> SgprParts;
SmallVector<unsigned, 2> VgprParts;
- int64_t Imm;
- GEPInfo(const MachineInstr &GEP) : GEP(GEP), Imm(0) { }
+ int64_t Imm = 0;
};
bool isSGPR(Register Reg) const;
@@ -200,12 +198,16 @@ private:
InstructionSelector::ComplexRendererFns
selectVINTERPModsHi(MachineOperand &Root) const;
+ bool selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset,
+ int64_t *Offset) const;
InstructionSelector::ComplexRendererFns
selectSmrdImm(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectSmrdImm32(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectSmrdSgpr(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectSmrdSgprImm(MachineOperand &Root) const;
std::pair<Register, int> selectFlatOffsetImpl(MachineOperand &Root,
uint64_t FlatVariant) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 01a3e78ea48c..0979debe9777 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4197,6 +4197,35 @@ bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
return true;
}
+bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ Function &F = B.getMF().getFunction();
+ Optional<uint32_t> KnownSize =
+ AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
+ if (KnownSize.has_value())
+ B.buildConstant(DstReg, KnownSize.value());
+ return false;
+}
+
+bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+
+ const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+ if (!MFI->isEntryFunction()) {
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
+ }
+
+ Register DstReg = MI.getOperand(0).getReg();
+ if (!getLDSKernelId(DstReg, MRI, B))
+ return false;
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B,
@@ -5636,6 +5665,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_workgroup_id_z:
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+ case Intrinsic::amdgcn_lds_kernel_id:
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
case Intrinsic::amdgcn_dispatch_ptr:
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::DISPATCH_PTR);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index cee533aa34ec..5e8111e22aad 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -155,6 +155,13 @@ public:
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
+
+ bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+
+ bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, unsigned AddrSpace) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 78e092b2e872..7e49a6117ebd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -376,15 +376,7 @@ static bool HasNative(AMDGPULibFunc::EFuncId id) {
return false;
}
-struct TableRef {
- size_t size;
- const TableEntry *table; // variable size: from 0 to (size - 1)
-
- TableRef() : size(0), table(nullptr) {}
-
- template <size_t N>
- TableRef(const TableEntry (&tbl)[N]) : size(N), table(&tbl[0]) {}
-};
+using TableRef = ArrayRef<TableEntry>;
static TableRef getOptTable(AMDGPULibFunc::EFuncId id) {
switch(id) {
@@ -698,11 +690,10 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
// Table-Driven optimization
const TableRef tr = getOptTable(FInfo.getId());
- if (tr.size==0)
+ if (tr.empty())
return false;
- int const sz = (int)tr.size;
- const TableEntry * const ftbl = tr.table;
+ int const sz = (int)tr.size();
Value *opr0 = CI->getArgOperand(0);
if (getVecSize(FInfo) > 1) {
@@ -714,8 +705,8 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
assert(eltval && "Non-FP arguments in math function!");
bool found = false;
for (int i=0; i < sz; ++i) {
- if (eltval->isExactlyValue(ftbl[i].input)) {
- DVal.push_back(ftbl[i].result);
+ if (eltval->isExactlyValue(tr[i].input)) {
+ DVal.push_back(tr[i].result);
found = true;
break;
}
@@ -746,8 +737,8 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
// Scalar version
if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) {
for (int i = 0; i < sz; ++i) {
- if (CF->isExactlyValue(ftbl[i].input)) {
- Value *nval = ConstantFP::get(CF->getType(), ftbl[i].result);
+ if (CF->isExactlyValue(tr[i].input)) {
+ Value *nval = ConstantFP::get(CF->getType(), tr[i].result);
LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
replaceCall(nval);
return true;
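As a side note on the TableRef change above, a minimal sketch of why ArrayRef<TableEntry> can replace the removed hand-rolled struct; the TableEntry fields and values below are illustrative, not taken from this file. ArrayRef binds implicitly to a fixed-size array and exposes empty(), size() and operator[], which is all TDOFold() needs.

#include "llvm/ADT/ArrayRef.h"

namespace {
struct TableEntry { // illustrative layout
  double result;
  double input;
};

const TableEntry ExampleTable[] = {{0.0, 1.0}, {1.0, 2.0}};

// Returns the mapped result if X is present, otherwise X itself.
double lookupOrIdentity(llvm::ArrayRef<TableEntry> Table, double X) {
  for (const TableEntry &E : Table) // Table binds directly to ExampleTable
    if (E.input == X)
      return E.result;
  return X;
}
} // namespace
// lookupOrIdentity(ExampleTable, 2.0) == 1.0; an empty ArrayRef makes the
// loop a no-op, matching the tr.empty() early return above.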
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 35922341de26..b4a8766d682e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -55,21 +55,6 @@ static cl::opt<bool> SuperAlignLDSGlobals(
cl::init(true), cl::Hidden);
namespace {
-
-SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {
- SmallPtrSet<GlobalValue *, 32> UsedList;
-
- SmallVector<GlobalValue *, 32> TmpVec;
- collectUsedGlobalVariables(M, TmpVec, true);
- UsedList.insert(TmpVec.begin(), TmpVec.end());
-
- TmpVec.clear();
- collectUsedGlobalVariables(M, TmpVec, false);
- UsedList.insert(TmpVec.begin(), TmpVec.end());
-
- return UsedList;
-}
-
class AMDGPULowerModuleLDS : public ModulePass {
static void removeFromUsedList(Module &M, StringRef Name,
@@ -153,9 +138,6 @@ class AMDGPULowerModuleLDS : public ModulePass {
"");
}
-private:
- SmallPtrSet<GlobalValue *, 32> UsedList;
-
public:
static char ID;
@@ -165,9 +147,10 @@ public:
bool runOnModule(Module &M) override {
CallGraph CG = CallGraph(M);
- UsedList = getUsedList(M);
bool Changed = superAlignLDSGlobals(M);
- Changed |= processUsedLDS(CG, M);
+ std::vector<GlobalVariable *> ModuleScopeVariables =
+ AMDGPU::findVariablesToLower(M, nullptr);
+ Changed |= processUsedLDS(CG, M, ModuleScopeVariables);
for (Function &F : M.functions()) {
if (F.isDeclaration())
@@ -176,10 +159,11 @@ public:
// Only lower compute kernels' LDS.
if (!AMDGPU::isKernel(F.getCallingConv()))
continue;
- Changed |= processUsedLDS(CG, M, &F);
+ std::vector<GlobalVariable *> KernelUsedVariables =
+ AMDGPU::findVariablesToLower(M, &F);
+ Changed |= processUsedLDS(CG, M, KernelUsedVariables, &F);
}
- UsedList.clear();
return Changed;
}
@@ -228,22 +212,20 @@ private:
return Changed;
}
- bool processUsedLDS(CallGraph const &CG, Module &M, Function *F = nullptr) {
+ bool processUsedLDS(CallGraph const &CG, Module &M,
+ std::vector<GlobalVariable *> const &LDSVarsToTransform,
+ Function *F = nullptr) {
LLVMContext &Ctx = M.getContext();
const DataLayout &DL = M.getDataLayout();
- // Find variables to move into new struct instance
- std::vector<GlobalVariable *> FoundLocalVars =
- AMDGPU::findVariablesToLower(M, F);
-
- if (FoundLocalVars.empty()) {
+ if (LDSVarsToTransform.empty()) {
// No variables to rewrite, no changes made.
return false;
}
SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
- LayoutFields.reserve(FoundLocalVars.size());
- for (GlobalVariable *GV : FoundLocalVars) {
+ LayoutFields.reserve(LDSVarsToTransform.size());
+ for (GlobalVariable *GV : LDSVarsToTransform) {
OptimizedStructLayoutField F(GV, DL.getTypeAllocSize(GV->getValueType()),
AMDGPU::getAlign(DL, GV));
LayoutFields.emplace_back(F);
@@ -252,7 +234,7 @@ private:
performOptimizedStructLayout(LayoutFields);
std::vector<GlobalVariable *> LocalVars;
- LocalVars.reserve(FoundLocalVars.size()); // will be at least this large
+ LocalVars.reserve(LDSVarsToTransform.size()); // will be at least this large
{
// This usually won't need to insert any padding, perhaps avoid the alloc
uint64_t CurrentOffset = 0;
@@ -352,7 +334,6 @@ private:
GV->replaceAllUsesWith(GEP);
}
if (GV->use_empty()) {
- UsedList.erase(GV);
GV->eraseFromParent();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index b461c3c4bfdc..f5e12fd960d0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -11,6 +11,7 @@
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Constants.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -101,6 +102,21 @@ void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Function &F) {
}
}
+Optional<uint32_t>
+AMDGPUMachineFunction::getLDSKernelIdMetadata(const Function &F) {
+ auto MD = F.getMetadata("llvm.amdgcn.lds.kernel.id");
+ if (MD && MD->getNumOperands() == 1) {
+ ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(0));
+ if (KnownSize) {
+ uint64_t V = KnownSize->getZExtValue();
+ if (V <= UINT32_MAX) {
+ return V;
+ }
+ }
+ }
+ return {};
+}
+
void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL,
const GlobalVariable &GV) {
assert(DL.getTypeAllocSize(GV.getValueType()).isZero());
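A minimal sketch, assuming an llvm::Function F, of producing the metadata that the new getLDSKernelIdMetadata() reads; the helper name is invented for the example. The node must have exactly one operand, a ConstantInt whose value fits in 32 bits, or the reader returns an empty Optional.

#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Metadata.h"

// Attaches !llvm.amdgcn.lds.kernel.id !{i32 Id} to F, the exact shape the
// reader above accepts.
static void setLDSKernelIdForExample(llvm::Function &F, uint32_t Id) {
  llvm::LLVMContext &Ctx = F.getContext();
  llvm::Metadata *Ops[] = {llvm::ConstantAsMetadata::get(
      llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Id))};
  F.setMetadata("llvm.amdgcn.lds.kernel.id", llvm::MDNode::get(Ctx, Ops));
}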
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index df62c2314617..97db8b7eb8d6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -11,11 +11,12 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Function.h"
namespace llvm {
@@ -104,6 +105,8 @@ public:
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
void allocateModuleLDSGlobal(const Function &F);
+ static Optional<uint32_t> getLDSKernelIdMetadata(const Function &F);
+
Align getDynLDSAlign() const { return DynLDSAlign; }
void setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
index 09dbd2150db6..a9f1e9bd0996 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -74,10 +74,10 @@ public:
private:
struct MemAccessInfo {
- const Value *V;
- const Value *Base;
- int64_t Offset;
- MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {}
+ const Value *V = nullptr;
+ const Value *Base = nullptr;
+ int64_t Offset = 0;
+ MemAccessInfo() = default;
bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
Printable print() const {
@@ -116,6 +116,7 @@ private:
bool isGlobalAddr(const Value *V) const;
bool isLocalAddr(const Value *V) const;
+ bool isGlobalLoadUsedInBB(const Instruction &) const;
};
static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType(
@@ -196,6 +197,24 @@ bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
return false;
}
+// Returns true if the global load `I` is used in its own basic block.
+bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const {
+ const auto *Ld = dyn_cast<LoadInst>(&I);
+ if (!Ld)
+ return false;
+ if (!isGlobalAddr(Ld->getPointerOperand()))
+ return false;
+
+ for (const User *Usr : Ld->users()) {
+ if (const Instruction *UsrInst = dyn_cast<Instruction>(Usr)) {
+ if (UsrInst->getParent() == I.getParent())
+ return true;
+ }
+ }
+
+ return false;
+}
+
AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];
@@ -203,9 +222,14 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
for (auto &B : F) {
LastAccess = MemAccessInfo();
+ unsigned UsedGlobalLoadsInBB = 0;
for (auto &I : B) {
if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
+ // TODO: Check if the global load and its user are close to each other
+ // instead (Or do this analysis in GCNSchedStrategy?).
+ if (isGlobalLoadUsedInBB(I))
+ UsedGlobalLoadsInBB += Size;
if (isIndirectAccess(&I))
FI.IAMInstCost += Size;
if (isLargeStride(&I))
@@ -245,6 +269,16 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
++FI.InstCost;
}
}
+
+ if (!FI.HasDenseGlobalMemAcc) {
+ unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size();
+ if (GlobalMemAccPercentage > 50) {
+ LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since "
+ << B.getName() << " has " << GlobalMemAccPercentage
+ << "% global memory access\n");
+ FI.HasDenseGlobalMemAcc = true;
+ }
+ }
}
return &FI;
@@ -286,6 +320,11 @@ bool AMDGPUPerfHint::runOnFunction(Function &F) {
}
bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
+ // Reverting optimal scheduling in favour of occupancy with basic block(s)
+ // having dense global memory access can potentially hurt performance.
+ if (FI.HasDenseGlobalMemAcc)
+ return true;
+
return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh;
}
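A minimal standalone sketch of the density threshold introduced above; the numbers are invented. A block is treated as densely global-memory bound once the dword-weighted count of global loads used within the same block exceeds half of the block's instruction count, which in turn makes isMemBound() report the whole function as memory bound.

// Mirrors UsedGlobalLoadsInBB * 100 / B.size() > 50 from the code above.
static bool isDenseGlobalMemBlock(unsigned UsedGlobalLoadsInBB,
                                  unsigned NumInstsInBlock) {
  return UsedGlobalLoadsInBB * 100 / NumInstsInBlock > 50;
}
// Example: isDenseGlobalMemBlock(12, 20) is true, since 12 * 100 / 20 = 60.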
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
index 31ff80f5f431..2db8db6957ce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
@@ -41,7 +41,11 @@ public:
unsigned InstCost;
unsigned IAMInstCost; // Indirect access memory instruction count
unsigned LSMInstCost; // Large stride memory instruction count
- FuncInfo() : MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0) {}
+ bool HasDenseGlobalMemAcc; // Set if at least 1 basic block has relatively
+ // high global memory access
+ FuncInfo()
+ : MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0),
+ HasDenseGlobalMemAcc(false) {}
};
typedef ValueMap<const Function*, FuncInfo> FuncInfoMap;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 0df6f4d45b06..bd8e568213b7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -153,7 +153,10 @@ bool AMDGPURegBankCombinerHelper::matchIntMinMaxToMed3(
if (!isVgprRegBank(Dst))
return false;
- if (MRI.getType(Dst).isVector())
+ // med3 for i16 is only available on gfx9+, and not available for v2i16.
+ LLT Ty = MRI.getType(Dst);
+ if ((Ty != LLT::scalar(16) || !Subtarget.hasMed3_16()) &&
+ Ty != LLT::scalar(32))
return false;
MinMaxMedOpc OpcodeTriple = getMinMaxPair(MI.getOpcode());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 0830cbd919a0..887341e67454 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4426,7 +4426,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
- case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32: {
+ case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
+ case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
+ case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
+ case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
+ case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
+ case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
+ case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
+ case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
+ case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: {
// Default for MAI intrinsics.
// srcC can also be an immediate which can be folded later.
// FIXME: Should we eventually add an alternative mapping with AGPR src
@@ -4451,7 +4459,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
- case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: {
+ case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: {
// vdst, srcA, srcB, srcC, idx
OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
index 4d7a3f4028e8..aa51c5d20bdc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
@@ -141,7 +141,7 @@ class ReplaceLDSUseImpl {
std::vector<GlobalVariable *> collectLDSRequiringPointerReplace() {
// Collect LDS which requires module lowering.
std::vector<GlobalVariable *> LDSGlobals =
- llvm::AMDGPU::findVariablesToLower(M);
+ llvm::AMDGPU::findVariablesToLower(M, nullptr);
// Remove LDS which don't qualify for replacement.
llvm::erase_if(LDSGlobals, [&](GlobalVariable *GV) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 8297635d7bb2..5d7bade00a3e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -340,12 +340,28 @@ def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x32_i8>;
def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x16_i8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8_xf32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4_xf32>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_bf8_bf8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_bf8_fp8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_fp8_bf8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_fp8_fp8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_bf8_bf8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_bf8_fp8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_fp8_bf8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_fp8_fp8>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_f16>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_f16>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_bf16>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_bf16>;
def : SourceOfDivergence<int_amdgcn_smfmac_i32_16x16x64_i8>;
def : SourceOfDivergence<int_amdgcn_smfmac_i32_32x32x32_i8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_bf8_bf8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_bf8_fp8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_fp8_bf8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_fp8_fp8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_bf8_bf8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_bf8_fp8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_fp8_bf8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_fp8_fp8>;
// The dummy boolean output is divergent from the IR's perspective,
// but the mask results are uniform. These produce a divergent and
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 6bd906439ee8..cf4826d81b4b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -736,13 +736,18 @@ static unsigned getMaxNumPreloadedSGPRs() {
2 + // dispatch ID
2 + // flat scratch init
2; // Implicit buffer ptr
+
// Max number of system SGPRs
unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
1 + // WorkGroupIDY
1 + // WorkGroupIDZ
1 + // WorkGroupInfo
1; // private segment wave byte offset
- return MaxUserSGPRs + MaxSystemSGPRs;
+
+ // Max number of synthetic SGPRs
+ unsigned SyntheticSGPRs = 1; // LDSKernelId
+
+ return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}
unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
@@ -852,34 +857,6 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {
return MI && TII->isVALU(*MI);
}
- bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
- if (Pred->NodeNum < Succ->NodeNum)
- return true;
-
- SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
-
- for (unsigned I = 0; I < Succs.size(); ++I) {
- for (const SDep &SI : Succs[I]->Succs) {
- const SUnit *SU = SI.getSUnit();
- if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
- Succs.push_back(SU);
- }
- }
-
- SmallPtrSet<const SUnit*, 32> Visited;
- while (!Preds.empty()) {
- const SUnit *SU = Preds.pop_back_val();
- if (llvm::is_contained(Succs, SU))
- return false;
- Visited.insert(SU);
- for (const SDep &SI : SU->Preds)
- if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
- Preds.push_back(SI.getSUnit());
- }
-
- return true;
- }
-
// Link as many SALU instructions in chain as possible. Return the size
// of the chain. Links up to MaxChain instructions.
unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
@@ -895,18 +872,20 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {
LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
- if (SU->addPred(SDep(From, SDep::Artificial), false))
- ++Linked;
+ if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From))
+ if (DAG->addEdge(SU, SDep(From, SDep::Artificial)))
+ ++Linked;
for (SDep &SI : From->Succs) {
SUnit *SUv = SI.getSUnit();
- if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
- SUv->addPred(SDep(SU, SDep::Artificial), false);
+ if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) &&
+ DAG->canAddEdge(SUv, SU))
+ DAG->addEdge(SUv, SDep(SU, SDep::Artificial));
}
for (SDep &SI : SU->Succs) {
SUnit *Succ = SI.getSUnit();
- if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
+ if (Succ != SU && isSALU(Succ))
Worklist.push_back(Succ);
}
}
@@ -949,7 +928,8 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {
if (Visited.count(&*LastSALU))
continue;
- if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
+ if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) ||
+ !DAG->canAddEdge(&*LastSALU, &SU))
continue;
Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 971e44723758..dca926867300 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1584,6 +1584,9 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
AMDGPU::SGPR_32RegClass,
MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
+ AMDGPU::SGPR_32RegClass,
+ MFI->ArgInfo.LDSKernelId, 0, 1) ||
parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
0, 1) ||
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index e12d0ffef35c..2a9393fc1595 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1739,6 +1739,8 @@ public:
void cvtVOP3(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3P(MCInst &Inst, const OperandVector &Operands);
void cvtVOPD(MCInst &Inst, const OperandVector &Operands);
+ void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands,
+ OptionalImmIndexMap &OptionalIdx);
void cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
OptionalImmIndexMap &OptionalIdx);
@@ -1767,21 +1769,11 @@ public:
void cvtDPP8(MCInst &Inst, const OperandVector &Operands) {
cvtDPP(Inst, Operands, true);
}
- void cvtVOPCNoDstDPP(MCInst &Inst, const OperandVector &Operands,
- bool IsDPP8 = false);
- void cvtVOPCNoDstDPP8(MCInst &Inst, const OperandVector &Operands) {
- cvtVOPCNoDstDPP(Inst, Operands, true);
- }
void cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
bool IsDPP8 = false);
void cvtVOP3DPP8(MCInst &Inst, const OperandVector &Operands) {
cvtVOP3DPP(Inst, Operands, true);
}
- void cvtVOPC64NoDstDPP(MCInst &Inst, const OperandVector &Operands,
- bool IsDPP8 = false);
- void cvtVOPC64NoDstDPP8(MCInst &Inst, const OperandVector &Operands) {
- cvtVOPC64NoDstDPP(Inst, Operands, true);
- }
OperandMatchResultTy parseSDWASel(OperandVector &Operands, StringRef Prefix,
AMDGPUOperand::ImmTy Type);
@@ -4177,7 +4169,9 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) {
return false;
}
- if (isGFX940() && (MII.get(Opc).TSFlags & SIInstrFlags::IsDOT)) {
+ uint64_t TSFlags = MII.get(Opc).TSFlags;
+
+ if (isGFX940() && (TSFlags & SIInstrFlags::IsDOT)) {
int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
if (OpSelIdx != -1) {
if (Inst.getOperand(OpSelIdx).getImm() != 0)
@@ -4190,6 +4184,15 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) {
}
}
+ // op_sel[0:1] must be 0 for v_dot2_bf16_bf16 and v_dot2_f16_f16 (VOP3 Dot).
+ if ((TSFlags & SIInstrFlags::IsDOT) && (TSFlags & SIInstrFlags::VOP3) &&
+ !(TSFlags & SIInstrFlags::VOP3P)) {
+ int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
+ unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
+ if (OpSel & 3)
+ return false;
+ }
+
return true;
}
@@ -4636,9 +4639,6 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
Error(IDLoc, "ABS not allowed in VOP3B instructions");
return false;
}
- if (!validateCoherencyBits(Inst, Operands, IDLoc)) {
- return false;
- }
if (!validateExeczVcczOperands(Operands)) {
return false;
}
@@ -5004,6 +5004,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32,
Val, ValRange);
+ } else if (ID == ".amdhsa_uses_dynamic_stack") {
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK, Val, ValRange);
} else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") {
if (hasArchitectedFlatScratch())
return Error(IDRange.Start,
@@ -8024,10 +8027,13 @@ OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands)
return MatchOperand_NoMatch;
}
-void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands) {
- cvtVOP3P(Inst, Operands);
-
+// Determines which bit DST_OP_SEL occupies in the op_sel operand according to
+// the number of src operands present, then copies that bit into src0_modifiers.
+void cvtVOP3DstOpSelOnly(MCInst &Inst) {
int Opc = Inst.getOpcode();
+ int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
+ if (OpSelIdx == -1)
+ return;
int SrcNum;
const int Ops[] = { AMDGPU::OpName::src0,
@@ -8038,7 +8044,6 @@ void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands)
++SrcNum);
assert(SrcNum > 0);
- int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
if ((OpSel & (1 << SrcNum)) != 0) {
@@ -8048,6 +8053,18 @@ void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands)
}
}
+void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst,
+ const OperandVector &Operands) {
+ cvtVOP3P(Inst, Operands);
+ cvtVOP3DstOpSelOnly(Inst);
+}
+
+void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands,
+ OptionalImmIndexMap &OptionalIdx) {
+ cvtVOP3P(Inst, Operands, OptionalIdx);
+ cvtVOP3DstOpSelOnly(Inst);
+}
+
static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) {
// 1. This operand is input modifiers
return Desc.OpInfo[OpNum].OperandType == AMDGPU::OPERAND_INPUT_MODS
@@ -8241,6 +8258,12 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0;
+ if (Opc == AMDGPU::V_CVT_SR_BF8_F32_vi ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F32_vi) {
+ Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods
+ Inst.addOperand(Inst.getOperand(0));
+ }
+
if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in) != -1) {
assert(!IsPacked);
Inst.addOperand(Inst.getOperand(0));
@@ -8747,14 +8770,6 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultFI() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppFi);
}
-// Add dummy $old operand
-void AMDGPUAsmParser::cvtVOPC64NoDstDPP(MCInst &Inst,
- const OperandVector &Operands,
- bool IsDPP8) {
- Inst.addOperand(MCOperand::createReg(0));
- cvtVOP3DPP(Inst, Operands, IsDPP8);
-}
-
void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8) {
OptionalImmIndexMap OptionalIdx;
unsigned Opc = Inst.getOpcode();
@@ -8802,6 +8817,8 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, bo
}
if (Desc.TSFlags & SIInstrFlags::VOP3P)
cvtVOP3P(Inst, Operands, OptionalIdx);
+ else if (Desc.TSFlags & SIInstrFlags::VOP3)
+ cvtVOP3OpSel(Inst, Operands, OptionalIdx);
else if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1) {
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOpSel);
}
@@ -8821,14 +8838,6 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, bo
}
}
-// Add dummy $old operand
-void AMDGPUAsmParser::cvtVOPCNoDstDPP(MCInst &Inst,
- const OperandVector &Operands,
- bool IsDPP8) {
- Inst.addOperand(MCOperand::createReg(0));
- cvtDPP(Inst, Operands, IsDPP8);
-}
-
void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8) {
OptionalImmIndexMap OptionalIdx;
@@ -9043,12 +9052,27 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
// v_nop_sdwa_sdwa_vi/gfx9 has no optional sdwa arguments
switch (BasicInstType) {
case SIInstrFlags::VOP1:
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
- if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0);
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+ AMDGPU::OpName::clamp) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTyClampSI, 0);
+ }
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+ AMDGPU::OpName::omod) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTyOModSI, 0);
+ }
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+ AMDGPU::OpName::dst_sel) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
+ }
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+ AMDGPU::OpName::dst_unused) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTySdwaDstUnused,
+ DstUnused::UNUSED_PRESERVE);
}
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
break;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index ccaf646008b1..98ee720200b4 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -451,7 +451,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P)
convertVOP3PDPPInst(MI);
else if (AMDGPU::isVOPC64DPP(MI.getOpcode()))
- convertVOPCDPPInst(MI);
+ convertVOPCDPPInst(MI); // Special VOP3 case
+ else {
+ assert(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3);
+ convertVOP3DPPInst(MI); // Regular VOP3 case
+ }
break;
}
Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address);
@@ -745,6 +749,43 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
return MCDisassembler::Success;
}
+struct VOPModifiers {
+ unsigned OpSel = 0;
+ unsigned OpSelHi = 0;
+ unsigned NegLo = 0;
+ unsigned NegHi = 0;
+};
+
+// Reconstruct values of VOP3/VOP3P operands such as op_sel.
+// Note that these values do not affect disassembler output,
+// so this is only necessary for consistency with src_modifiers.
+static VOPModifiers collectVOPModifiers(const MCInst &MI,
+ bool IsVOP3P = false) {
+ VOPModifiers Modifiers;
+ unsigned Opc = MI.getOpcode();
+ const int ModOps[] = {AMDGPU::OpName::src0_modifiers,
+ AMDGPU::OpName::src1_modifiers,
+ AMDGPU::OpName::src2_modifiers};
+ for (int J = 0; J < 3; ++J) {
+ int OpIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
+ if (OpIdx == -1)
+ continue;
+
+ unsigned Val = MI.getOperand(OpIdx).getImm();
+
+ Modifiers.OpSel |= !!(Val & SISrcMods::OP_SEL_0) << J;
+ if (IsVOP3P) {
+ Modifiers.OpSelHi |= !!(Val & SISrcMods::OP_SEL_1) << J;
+ Modifiers.NegLo |= !!(Val & SISrcMods::NEG) << J;
+ Modifiers.NegHi |= !!(Val & SISrcMods::NEG_HI) << J;
+ } else if (J == 0) {
+ Modifiers.OpSel |= !!(Val & SISrcMods::DST_OP_SEL) << 3;
+ }
+ }
+
+ return Modifiers;
+}
+
// We must check FI == literal to reject not genuine dpp8 insts, and we must
// first add optional MI operands to check FI
DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
@@ -755,6 +796,11 @@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
} else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) ||
AMDGPU::isVOPC64DPP(Opc)) {
convertVOPCDPPInst(MI);
+ } else if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1) {
+ auto Mods = collectVOPModifiers(MI);
+ insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
+ AMDGPU::OpName::op_sel);
} else {
// Insert dummy unused src modifiers.
if (MI.getNumOperands() < DescNumOps &&
@@ -770,6 +816,18 @@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail;
}
+DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
+ unsigned Opc = MI.getOpcode();
+ unsigned DescNumOps = MCII->get(Opc).getNumOperands();
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1) {
+ auto Mods = collectVOPModifiers(MI);
+ insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
+ AMDGPU::OpName::op_sel);
+ }
+ return MCDisassembler::Success;
+}
+
// Note that before gfx10, the MIMG encoding provided no information about
// VADDR size. Consequently, decoded instructions always show address as if it
// has 1 dword, which could be not really so.
@@ -914,45 +972,27 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const {
unsigned Opc = MI.getOpcode();
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
+ auto Mods = collectVOPModifiers(MI, true);
if (MI.getNumOperands() < DescNumOps &&
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in) != -1)
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vdst_in);
- const int ModOps[] = {AMDGPU::OpName::src0_modifiers,
- AMDGPU::OpName::src1_modifiers,
- AMDGPU::OpName::src2_modifiers};
- unsigned OpSel = 0;
- unsigned OpSelHi = 0;
- unsigned NegLo = 0;
- unsigned NegHi = 0;
- for (int J = 0; J < 3; ++J) {
- int OpIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
- if (OpIdx == -1)
- break;
- unsigned Val = MI.getOperand(OpIdx).getImm();
-
- OpSel |= !!(Val & SISrcMods::OP_SEL_0) << J;
- OpSelHi |= !!(Val & SISrcMods::OP_SEL_1) << J;
- NegLo |= !!(Val & SISrcMods::NEG) << J;
- NegHi |= !!(Val & SISrcMods::NEG_HI) << J;
- }
-
if (MI.getNumOperands() < DescNumOps &&
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1)
- insertNamedMCOperand(MI, MCOperand::createImm(OpSel),
+ insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
AMDGPU::OpName::op_sel);
if (MI.getNumOperands() < DescNumOps &&
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi) != -1)
- insertNamedMCOperand(MI, MCOperand::createImm(OpSelHi),
+ insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSelHi),
AMDGPU::OpName::op_sel_hi);
if (MI.getNumOperands() < DescNumOps &&
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo) != -1)
- insertNamedMCOperand(MI, MCOperand::createImm(NegLo),
+ insertNamedMCOperand(MI, MCOperand::createImm(Mods.NegLo),
AMDGPU::OpName::neg_lo);
if (MI.getNumOperands() < DescNumOps &&
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi) != -1)
- insertNamedMCOperand(MI, MCOperand::createImm(NegHi),
+ insertNamedMCOperand(MI, MCOperand::createImm(Mods.NegHi),
AMDGPU::OpName::neg_hi);
return MCDisassembler::Success;
@@ -2000,6 +2040,9 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective(
KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
}
+ PRINT_DIRECTIVE(".amdhsa_uses_dynamic_stack",
+ KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK);
+
if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED1)
return MCDisassembler::Fail;
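For illustration, a small invented helper showing the bit layout that collectVOPModifiers() reconstructs for a plain VOP3 instruction: the OP_SEL_0 bit of the src0/src1/src2 modifiers maps to op_sel bits 0 to 2, and the DST_OP_SEL bit of src0_modifiers maps to op_sel bit 3.

// Invented for the example; packs the recovered per-operand bits the way the
// disassembler inserts the op_sel immediate.
static unsigned packVOP3OpSel(bool Src0Hi, bool Src1Hi, bool Src2Hi,
                              bool DstHi) {
  unsigned OpSel = 0;
  OpSel |= unsigned(Src0Hi) << 0; // src0_modifiers & OP_SEL_0
  OpSel |= unsigned(Src1Hi) << 1; // src1_modifiers & OP_SEL_0
  OpSel |= unsigned(Src2Hi) << 2; // src2_modifiers & OP_SEL_0
  OpSel |= unsigned(DstHi) << 3;  // src0_modifiers & DST_OP_SEL
  return OpSel;
}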
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 31869f0917ae..d17e2d8d5082 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -162,6 +162,7 @@ public:
DecodeStatus convertSDWAInst(MCInst &MI) const;
DecodeStatus convertDPP8Inst(MCInst &MI) const;
DecodeStatus convertMIMGInst(MCInst &MI) const;
+ DecodeStatus convertVOP3DPPInst(MCInst &MI) const;
DecodeStatus convertVOP3PDPPInst(MCInst &MI) const;
DecodeStatus convertVOPCDPPInst(MCInst &MI) const;
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 5d254518c67a..4558ddf6dbfe 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -202,6 +202,19 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n");
return nullptr;
}
+ int OrigOpE32 = AMDGPU::getVOPe32(OrigOp);
+ // Prior checks already cover the mask for the VOPC case, though only incidentally, so assert it explicitly here.
+ auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
+ assert(RowMaskOpnd && RowMaskOpnd->isImm());
+ auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
+ assert(BankMaskOpnd && BankMaskOpnd->isImm());
+ const bool MaskAllLanes =
+ RowMaskOpnd->getImm() == 0xF && BankMaskOpnd->getImm() == 0xF;
+ (void)MaskAllLanes;
+ assert(MaskAllLanes ||
+ !(TII->isVOPC(DPPOp) ||
+ (TII->isVOP3(DPPOp) && OrigOpE32 != -1 && TII->isVOPC(OrigOpE32))) &&
+ "VOPC cannot form DPP unless mask is full");
auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
OrigMI.getDebugLoc(), TII->get(DPPOp))
@@ -234,6 +247,10 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
DPPInst.addReg(CombOldVGPR.Reg, Def ? 0 : RegState::Undef,
CombOldVGPR.SubReg);
++NumOperands;
+ } else if (TII->isVOPC(DPPOp) || (TII->isVOP3(DPPOp) && OrigOpE32 != -1 &&
+ TII->isVOPC(OrigOpE32))) {
+ // VOPC DPP and VOPC promoted to VOP3 DPP do not have an old operand
+ // because they write to SGPRs, not VGPRs.
} else {
// TODO: this discards MAC/FMA instructions for now, let's add it later
LLVM_DEBUG(dbgs() << " failed: no old operand in DPP instruction,"
diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
index 281474994bca..6ff349e31f22 100644
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -249,11 +249,11 @@ def : ProcessorModel<"gfx1036", GFX10SpeedModel,
//===----------------------------------------------------------------------===//
def : ProcessorModel<"gfx1100", GFX11SpeedModel,
- FeatureISAVersion11_0.Features
+ FeatureISAVersion11_0_0.Features
>;
def : ProcessorModel<"gfx1101", GFX11SpeedModel,
- FeatureISAVersion11_0.Features
+ FeatureISAVersion11_0_1.Features
>;
def : ProcessorModel<"gfx1102", GFX11SpeedModel,
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 100410bb7644..04da14cc4916 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -733,7 +733,7 @@ void GCNScheduleDAGMILive::collectRematerializableInstructions() {
MachineOperand *Op = MRI.getOneDef(Reg);
MachineInstr *Def = Op->getParent();
- if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def, AA))
+ if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
continue;
MachineInstr *UseI = &*MRI.use_instr_nodbg_begin(Reg);
@@ -943,9 +943,8 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
}
// Copied from MachineLICM
-bool GCNScheduleDAGMILive::isTriviallyReMaterializable(const MachineInstr &MI,
- AAResults *AA) {
- if (!TII->isTriviallyReMaterializable(MI, AA))
+bool GCNScheduleDAGMILive::isTriviallyReMaterializable(const MachineInstr &MI) {
+ if (!TII->isTriviallyReMaterializable(MI))
return false;
for (const MachineOperand &MO : MI.operands())
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 97f94f69b70e..c3db849cf81a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -142,7 +142,7 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
// and single use outside the defining block into RematerializableInsts.
void collectRematerializableInstructions();
- bool isTriviallyReMaterializable(const MachineInstr &MI, AAResults *AA);
+ bool isTriviallyReMaterializable(const MachineInstr &MI);
// TODO: Should also attempt to reduce RP of SGPRs and AGPRs
// Attempt to reduce RP of VGPR by sinking trivially rematerializable
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index d269d0945f3b..d71f80c5f458 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -145,6 +145,7 @@ protected:
bool HasDot7Insts = false;
bool HasDot8Insts = false;
bool HasMAIInsts = false;
+ bool HasFP8Insts = false;
bool HasPkFmacF16Inst = false;
bool HasAtomicFaddRtnInsts = false;
bool HasAtomicFaddNoRtnInsts = false;
@@ -721,6 +722,10 @@ public:
return HasMAIInsts;
}
+ bool hasFP8Insts() const {
+ return HasFP8Insts;
+ }
+
bool hasPkFmacF16Inst() const {
return HasPkFmacF16Inst;
}
@@ -930,7 +935,7 @@ public:
}
bool hasUserSGPRInit16Bug() const {
- return UserSGPRInit16Bug;
+ return UserSGPRInit16Bug && isWave32();
}
bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index bd938d829953..21ff2744e5b4 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -627,7 +627,7 @@ void AMDGPUInstPrinter::printWaitEXP(const MCInst *MI, unsigned OpNo,
bool AMDGPUInstPrinter::needsImpliedVcc(const MCInstrDesc &Desc,
unsigned OpNo) const {
- return OpNo == 1 && (Desc.TSFlags & SIInstrFlags::DPP) &&
+ return OpNo == 0 && (Desc.TSFlags & SIInstrFlags::DPP) &&
(Desc.TSFlags & SIInstrFlags::VOPC) &&
(Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC) ||
Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC_LO));
@@ -644,8 +644,7 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
// If there are printed modifiers, printOperandAndFPInputMods or
// printOperandAndIntInputMods will be called instead
if ((OpNo == 0 ||
- (OpNo == 1 && (Desc.TSFlags & SIInstrFlags::DPP)) ||
- (OpNo == 2 && (Desc.TSFlags & SIInstrFlags::DPP) && ModIdx != -1)) &&
+ (OpNo == 1 && (Desc.TSFlags & SIInstrFlags::DPP) && ModIdx != -1)) &&
(Desc.TSFlags & SIInstrFlags::VOPC) &&
(Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC) ||
Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC_LO)))
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 078133469549..0e71509cf2bd 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -367,6 +367,8 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PRINT_FIELD(OS, ".amdhsa_wavefront_size32", KD,
kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
+ PRINT_FIELD(OS, ".amdhsa_uses_dynamic_stack", KD, kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK);
PRINT_FIELD(OS,
(hasArchitectedFlatScratch(STI)
? ".amdhsa_enable_private_segment"
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index f54778535b7c..3e95c55df57e 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -67,6 +67,7 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SetOperations.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
@@ -81,9 +82,9 @@ static cl::opt<bool> EnableM0Merge(
cl::init(true));
namespace {
-
class SIFixSGPRCopies : public MachineFunctionPass {
MachineDominatorTree *MDT;
+ unsigned NextVGPRToSGPRCopyID;
public:
static char ID;
@@ -92,9 +93,16 @@ public:
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;
- SIFixSGPRCopies() : MachineFunctionPass(ID) {}
+ SIFixSGPRCopies() : MachineFunctionPass(ID), NextVGPRToSGPRCopyID(0) {}
bool runOnMachineFunction(MachineFunction &MF) override;
+ unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; }
+ void lowerVGPR2SGPRCopies(MachineFunction &MF);
+ // Handles copies whose source register is:
+ // 1. Physical register
+ // 2. AGPR
+ // 3. Defined by an instruction that merely moves an immediate
+ bool lowerSpecialCase(MachineInstr &MI);
MachineBasicBlock *processPHINode(MachineInstr &MI);
@@ -569,6 +577,14 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
TII = ST.getInstrInfo();
MDT = &getAnalysis<MachineDominatorTree>();
+ // We have to lower VGPR to SGPR copies before the main loop
+ // because the REG_SEQUENCE and PHI lowering in main loop
+ // convert the def-use chains to VALU and close the opportunities
+ // for keeping them scalar.
+ // TODO: REG_SEQUENCE and PHIs are semantically copies. The next patch
+ // addresses their lowering and unifies the processing in one main loop.
+ lowerVGPR2SGPRCopies(MF);
+
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
MachineBasicBlock *MBB = &*BI;
@@ -640,42 +656,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
continue;
}
- if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
- Register SrcReg = MI.getOperand(1).getReg();
- if (!SrcReg.isVirtual()) {
- MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
- if (NewBB && NewBB != MBB) {
- MBB = NewBB;
- E = MBB->end();
- BI = MachineFunction::iterator(MBB);
- BE = MF.end();
- }
- assert((!NewBB || NewBB == I->getParent()) &&
- "moveToVALU did not return the right basic block");
- break;
- }
-
- MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
- unsigned SMovOp;
- int64_t Imm;
- // If we are just copying an immediate, we can replace the copy with
- // s_mov_b32.
- if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) {
- MI.getOperand(1).ChangeToImmediate(Imm);
- MI.addImplicitDefUseOperands(MF);
- MI.setDesc(TII->get(SMovOp));
- break;
- }
- MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
- if (NewBB && NewBB != MBB) {
- MBB = NewBB;
- E = MBB->end();
- BI = MachineFunction::iterator(MBB);
- BE = MF.end();
- }
- assert((!NewBB || NewBB == I->getParent()) &&
- "moveToVALU did not return the right basic block");
- } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
+ if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
}
@@ -916,3 +897,269 @@ MachineBasicBlock *SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
}
return CreatedBB;
}
+
+bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI) {
+ MachineBasicBlock *MBB = MI.getParent();
+ const TargetRegisterClass *SrcRC, *DstRC;
+ std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);
+
+ // We return true to indicate that no further processing is needed
+ if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI))
+ return true;
+
+ Register SrcReg = MI.getOperand(1).getReg();
+ if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) {
+ TII->moveToVALU(MI, MDT);
+ return true;
+ }
+
+ unsigned SMovOp;
+ int64_t Imm;
+ // If we are just copying an immediate, we can replace the copy with
+ // s_mov_b32.
+ if (isSafeToFoldImmIntoCopy(&MI, MRI->getVRegDef(SrcReg), TII, SMovOp, Imm)) {
+ MI.getOperand(1).ChangeToImmediate(Imm);
+ MI.addImplicitDefUseOperands(*MBB->getParent());
+ MI.setDesc(TII->get(SMovOp));
+ return true;
+ }
+ return false;
+}
+
+class V2SCopyInfo {
+public:
+ // VGPR to SGPR copy being processed
+ MachineInstr *Copy;
+ // All SALU instructions reachable from this copy in the SSA graph
+ DenseSet<MachineInstr *> SChain;
+ // Number of SGPR to VGPR copies that are used to put the SALU computation
+ // results back to VALU.
+ unsigned NumSVCopies;
+
+ unsigned Score;
+ // Actual count of v_readfirstlane_b32 instructions that
+ // need to be inserted to keep the SChain in SALU
+ unsigned NumReadfirstlanes;
+ // Current score state, used to speed up selecting V2SCopyInfos for processing
+ bool NeedToBeConvertedToVALU = false;
+ // Unique ID. Used as a map key to keep a stable order.
+ unsigned ID;
+
+ // Count of other VGPR to SGPR copies that contribute to the
+ // current copy's SChain
+ unsigned SiblingPenalty = 0;
+ SetVector<unsigned> Siblings;
+ V2SCopyInfo() : Copy(nullptr), ID(0){};
+ V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width)
+ : Copy(C), NumSVCopies(0), NumReadfirstlanes(Width / 32), ID(Id){};
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dump() {
+ dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size()
+ << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty
+ << "\nScore: " << Score << "\n";
+ }
+#endif
+};
+
+void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
+
+ DenseMap<unsigned, V2SCopyInfo> Copies;
+ DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty;
+
+ // The main function that computes the VGPR to SGPR copy score
+ // and determines how the copy is lowered further: v_readfirstlane_b32 or moveToVALU
+ auto needToBeConvertedToVALU = [&](V2SCopyInfo *I) -> bool {
+ if (I->SChain.empty())
+ return true;
+ I->Siblings = SiblingPenalty[*std::max_element(
+ I->SChain.begin(), I->SChain.end(),
+ [&](MachineInstr *A, MachineInstr *B) -> bool {
+ return SiblingPenalty[A].size() < SiblingPenalty[B].size();
+ })];
+ I->Siblings.remove_if([&](unsigned ID) { return ID == I->ID; });
+ // The loop below computes the number of other VGPR to SGPR copies
+ // that contribute to the current copy's SALU chain. We assume that all the
+ // copies with the same source virtual register will be squashed into one by
+ // regalloc. We also take care of copies of the different subregs of the
+ // same register.
+ SmallSet<std::pair<Register, unsigned>, 4> SrcRegs;
+ for (auto J : I->Siblings) {
+ auto InfoIt = Copies.find(J);
+ if (InfoIt != Copies.end()) {
+ MachineInstr *SiblingCopy = InfoIt->getSecond().Copy;
+ if (SiblingCopy->isImplicitDef())
+ // The COPY has already been processed by moveToVALU
+ continue;
+
+ SrcRegs.insert(std::make_pair(SiblingCopy->getOperand(1).getReg(),
+ SiblingCopy->getOperand(1).getSubReg()));
+ }
+ }
+ I->SiblingPenalty = SrcRegs.size();
+
+ unsigned Penalty =
+ I->NumSVCopies + I->SiblingPenalty + I->NumReadfirstlanes;
+ unsigned Profit = I->SChain.size();
+ I->Score = Penalty > Profit ? 0 : Profit - Penalty;
+ I->NeedToBeConvertedToVALU = I->Score < 3;
+ return I->NeedToBeConvertedToVALU;
+ };
+
+ auto needProcessing = [](MachineInstr &MI) -> bool {
+ switch (MI.getOpcode()) {
+ case AMDGPU::COPY:
+ case AMDGPU::WQM:
+ case AMDGPU::STRICT_WQM:
+ case AMDGPU::SOFT_WQM:
+ case AMDGPU::STRICT_WWM:
+ return true;
+ default:
+ return false;
+ }
+ };
+
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
+ ++BI) {
+ MachineBasicBlock *MBB = &*BI;
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+ ++I) {
+ MachineInstr &MI = *I;
+ if (!needProcessing(MI))
+ continue;
+ if (lowerSpecialCase(MI))
+ continue;
+
+ // Compute the COPY width to pass to the V2SCopyInfo constructor
+ Register DstReg = MI.getOperand(0).getReg();
+
+ const TargetRegisterClass *DstRC = TRI->getRegClassForReg(*MRI, DstReg);
+
+ V2SCopyInfo In(getNextVGPRToSGPRCopyId(), &MI,
+ TRI->getRegSizeInBits(*DstRC));
+
+ SmallVector<MachineInstr *, 8> AnalysisWorklist;
+ // Needed because the SSA form is a graph rather than a tree and may have
+ // forks and joins; we should not walk the same path twice.
+ DenseSet<MachineInstr *> Visited;
+ AnalysisWorklist.push_back(&MI);
+ while (!AnalysisWorklist.empty()) {
+
+ MachineInstr *Inst = AnalysisWorklist.pop_back_val();
+
+ if (!Visited.insert(Inst).second)
+ continue;
+
+ // Copies and REG_SEQUENCE do not contribute to the final assembly,
+ // so skip them, but keep the SGPR to VGPR copy bookkeeping up to date.
+ if (Inst->isCopy() || Inst->isRegSequence()) {
+ if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
+ if (!Inst->isCopy() ||
+ !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
+ In.NumSVCopies++;
+ continue;
+ }
+ }
+ }
+
+ SiblingPenalty[Inst].insert(In.ID);
+
+ SmallVector<MachineInstr *, 4> Users;
+ if ((TII->isSALU(*Inst) && Inst->isCompare()) ||
+ (Inst->isCopy() && Inst->getOperand(0).getReg() == AMDGPU::SCC)) {
+ auto I = Inst->getIterator();
+ auto E = Inst->getParent()->end();
+ while (++I != E && !I->findRegisterDefOperand(AMDGPU::SCC)) {
+ if (I->readsRegister(AMDGPU::SCC))
+ Users.push_back(&*I);
+ }
+ } else if (Inst->getNumExplicitDefs() != 0) {
+ Register Reg = Inst->getOperand(0).getReg();
+ if (TRI->isSGPRReg(*MRI, Reg))
+ for (auto &U : MRI->use_instructions(Reg))
+ Users.push_back(&U);
+ }
+ for (auto U : Users) {
+ if (TII->isSALU(*U))
+ In.SChain.insert(U);
+ AnalysisWorklist.push_back(U);
+ }
+ }
+ Copies[In.ID] = In;
+ }
+ }
+
+ SmallVector<unsigned, 8> LoweringWorklist;
+ for (auto &C : Copies) {
+ if (needToBeConvertedToVALU(&C.second))
+ LoweringWorklist.push_back(C.second.ID);
+ }
+
+ while (!LoweringWorklist.empty()) {
+ unsigned CurID = LoweringWorklist.pop_back_val();
+ auto CurInfoIt = Copies.find(CurID);
+ if (CurInfoIt != Copies.end()) {
+ V2SCopyInfo C = CurInfoIt->getSecond();
+ LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump());
+ for (auto S : C.Siblings) {
+ auto SibInfoIt = Copies.find(S);
+ if (SibInfoIt != Copies.end()) {
+ V2SCopyInfo &SI = SibInfoIt->getSecond();
+ LLVM_DEBUG(dbgs() << "Sibling:\n"; SI.dump());
+ if (!SI.NeedToBeConvertedToVALU) {
+ set_subtract(SI.SChain, C.SChain);
+ if (needToBeConvertedToVALU(&SI))
+ LoweringWorklist.push_back(SI.ID);
+ }
+ SI.Siblings.remove_if([&](unsigned ID) { return ID == C.ID; });
+ }
+ }
+ LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy
+ << " is being turned to VALU\n");
+ Copies.erase(C.ID);
+ TII->moveToVALU(*C.Copy, MDT);
+ }
+ }
+
+ // Now do actual lowering
+ for (auto C : Copies) {
+ MachineInstr *MI = C.second.Copy;
+ MachineBasicBlock *MBB = MI->getParent();
+ // We decided to turn this V2S copy into v_readfirstlane_b32;
+ // remove it from the V2SCopies map and from all of its siblings
+ LLVM_DEBUG(dbgs() << "V2S copy " << *MI
+ << " is being turned to v_readfirstlane_b32"
+ << " Score: " << C.second.Score << "\n");
+ Register DstReg = MI->getOperand(0).getReg();
+ Register SrcReg = MI->getOperand(1).getReg();
+ unsigned SubReg = MI->getOperand(1).getSubReg();
+ const TargetRegisterClass *SrcRC = TRI->getRegClassForReg(*MRI, SrcReg);
+ SrcRC = TRI->getSubRegClass(SrcRC, SubReg);
+ size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
+ if (SrcSize == 16) {
+ // HACK to handle a possible 16-bit VGPR source
+ auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
+ MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister);
+ } else if (SrcSize == 32) {
+ auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
+ MIB.addReg(SrcReg, 0, SubReg);
+ } else {
+ auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::REG_SEQUENCE), DstReg);
+ int N = TRI->getRegSizeInBits(*SrcRC) / 32;
+ for (int i = 0; i < N; i++) {
+ Register PartialSrc = TII->buildExtractSubReg(
+ Result, *MRI, MI->getOperand(1), SrcRC,
+ TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
+ Register PartialDst =
+ MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, *Result, Result->getDebugLoc(),
+ TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst)
+ .addReg(PartialSrc);
+ Result.addReg(PartialDst).addImm(TRI->getSubRegFromChannel(i));
+ }
+ }
+ MI->eraseFromParent();
+ }
+}
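A standalone sketch (plain C++, independent of LLVM; the struct and helper names are invented for the example) of the scoring heuristic used by needToBeConvertedToVALU above and of the per-dword splitting in the final lowering loop: the profit is the number of SALU instructions kept scalar, the penalty is the feedback copies plus the sibling penalty plus the v_readfirstlane_b32 count, and copies scoring below 3 are handed to moveToVALU.

    #include <cstdio>

    struct V2SCopyScore {
      unsigned SChainSize;        // SALU instructions kept scalar (the profit)
      unsigned NumSVCopies;       // SGPR to VGPR copies feeding results back to VALU
      unsigned SiblingPenalty;    // other V2S copies feeding the same SALU chain
      unsigned NumReadfirstlanes; // v_readfirstlane_b32 needed to stay scalar
    };

    // Mirrors the Score computation above: no profit, or a score below 3,
    // means the copy (and its chain) is moved to VALU instead.
    static bool needsVALUConversion(const V2SCopyScore &C) {
      if (C.SChainSize == 0)
        return true;
      unsigned Penalty = C.NumSVCopies + C.SiblingPenalty + C.NumReadfirstlanes;
      unsigned Profit = C.SChainSize;
      unsigned Score = Penalty > Profit ? 0 : Profit - Penalty;
      return Score < 3;
    }

    // Copies wider than 32 bits are split into one v_readfirstlane_b32 per
    // 32-bit subregister, tied together by a REG_SEQUENCE.
    static unsigned readfirstlaneCount(unsigned CopyWidthInBits) {
      return CopyWidthInBits / 32;
    }

    int main() {
      std::printf("%d\n", needsVALUConversion({6, 0, 0, 1})); // 0: keep scalar
      std::printf("%d\n", needsVALUConversion({3, 2, 0, 1})); // 1: move to VALU
      std::printf("%u\n", readfirstlaneCount(128));           // 4 readfirstlanes
      return 0;
    }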
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d16da2a8b86b..438e8b200ecc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1664,6 +1664,17 @@ SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
}
+SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
+ const SDLoc &SL) const {
+
+ Function &F = DAG.getMachineFunction().getFunction();
+ Optional<uint32_t> KnownSize =
+ AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
+ if (KnownSize.has_value())
+ return DAG.getConstant(KnownSize.value(), SL, MVT::i32);
+ return SDValue();
+}
+
SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Val,
bool Signed,
@@ -2049,6 +2060,9 @@ void SITargetLowering::allocateSpecialInputSGPRs(
if (Info.hasWorkGroupIDZ())
allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
+
+ if (Info.hasLDSKernelId())
+ allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
}
// Allocate special inputs passed in user SGPRs.
@@ -2102,6 +2116,12 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
CCInfo.AllocateReg(FlatScratchInitReg);
}
+ if (Info.hasLDSKernelId()) {
+ Register Reg = Info.addLDSKernelId();
+ MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
// TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
// these from the dispatch pointer.
}
@@ -2347,8 +2367,8 @@ SDValue SITargetLowering::LowerFormalArguments(
(!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) &&
!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
!Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
- !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
- !Info->hasWorkItemIDZ());
+ !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
+ !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
}
if (CallConv == CallingConv::AMDGPU_PS) {
@@ -2762,7 +2782,8 @@ void SITargetLowering::passSpecialInputs(
{AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
{AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
{AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
- {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"}
+ {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
+ {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
};
for (auto Attr : ImplicitAttrs) {
@@ -2798,6 +2819,13 @@ void SITargetLowering::passSpecialInputs(
// The implicit arg ptr is special because it doesn't have a corresponding
// input for kernels, and is computed from the kernarg segment pointer.
InputReg = getImplicitArgPtr(DAG, DL);
+ } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
+ Optional<uint32_t> Id = AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
+ if (Id.has_value()) {
+ InputReg = DAG.getConstant(Id.value(), DL, ArgVT);
+ } else {
+ InputReg = DAG.getUNDEF(ArgVT);
+ }
} else {
// We may have proven the input wasn't needed, although the ABI is
// requiring it. We just need to allocate the register appropriately.
@@ -6887,6 +6915,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_workgroup_id_z:
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+ case Intrinsic::amdgcn_lds_kernel_id: {
+ if (MFI->isEntryFunction())
+ return getLDSKernelId(DAG, DL);
+ return getPreloadedValue(DAG, *MFI, VT,
+ AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
+ }
case Intrinsic::amdgcn_workitem_id_x:
return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
case Intrinsic::amdgcn_workitem_id_y:
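A standalone sketch (plain C++ with std::optional standing in for llvm::Optional; the helpers are invented for the example) of how the amdgcn_lds_kernel_id value is resolved in the hunks above: an entry function folds the value straight from function metadata, a non-entry function reads the preloaded LDS_KERNEL_ID SGPR, and a caller that cannot determine the value passes undef.

    #include <cstdint>
    #include <cstdio>
    #include <optional>
    #include <string>

    // Caller side: what is passed for the implicit LDS_KERNEL_ID argument.
    static std::string ldsKernelIdInput(std::optional<uint32_t> MetadataId) {
      if (MetadataId)
        return "constant " + std::to_string(*MetadataId);
      return "undef"; // value not known when lowering the call
    }

    // Callee side: entry functions materialize the metadata value directly,
    // other functions read the preloaded SGPR argument.
    static std::string ldsKernelIdSource(bool IsEntryFunction) {
      return IsEntryFunction ? "constant from metadata" : "preloaded SGPR";
    }

    int main() {
      std::printf("%s\n", ldsKernelIdInput(7u).c_str());           // constant 7
      std::printf("%s\n", ldsKernelIdInput(std::nullopt).c_str()); // undef
      std::printf("%s\n", ldsKernelIdSource(false).c_str());       // preloaded SGPR
      return 0;
    }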
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 4fbccf0c5850..d1fecc1afc7f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -48,6 +48,7 @@ private:
SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL,
SDValue Chain, uint64_t Offset) const;
SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const;
+ SDValue getLDSKernelId(SelectionDAG &DAG, const SDLoc &SL) const;
SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Chain,
uint64_t Offset, Align Alignment,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index b398e108bf62..7c1d8d32b624 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -85,7 +85,7 @@ class InstSI <dag outs, dag ins, string asm = "",
field bit VOPAsmPrefer32Bit = 0;
// This bit indicates that this is a VOP3 opcode which supports op_sel
- // modifier (gfx9 only).
+ // modifier.
field bit VOP3_OPSEL = 0;
// Is it possible for this instruction to be atomic?
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 799d34e32d27..8916f06598c6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -108,8 +108,8 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}
-bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AAResults *AA) const {
+bool SIInstrInfo::isReallyTriviallyReMaterializable(
+ const MachineInstr &MI) const {
if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI) || isSALU(MI)) {
// Normally VALU use of exec would block the rematerialization, but that
// is OK in this case to have an implicit exec read as all VALU do.
@@ -220,16 +220,23 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
return false;
- assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
+ unsigned NumOps = getNumOperandsNoGlue(Load0);
+ if (NumOps != getNumOperandsNoGlue(Load1))
+ return false;
// Check base reg.
if (Load0->getOperand(0) != Load1->getOperand(0))
return false;
+ // Match register offsets, if both register and immediate offsets are present.
+ assert(NumOps == 4 || NumOps == 5);
+ if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
+ return false;
+
const ConstantSDNode *Load0Offset =
- dyn_cast<ConstantSDNode>(Load0->getOperand(1));
+ dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
const ConstantSDNode *Load1Offset =
- dyn_cast<ConstantSDNode>(Load1->getOperand(1));
+ dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
if (!Load0Offset || !Load1Offset)
return false;
@@ -5011,10 +5018,8 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
}
if (MO->isReg()) {
- if (!DefinedRC) {
- // This operand allows any register.
- return true;
- }
+ if (!DefinedRC)
+ return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
if (!isLegalRegOperand(MRI, OpInfo, *MO))
return false;
bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
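A standalone sketch (plain C++; LoadOperands and its fields are invented for the example and are not SDNode API) of the operand bookkeeping added to areLoadsFromSameBasePtr above: with glue stripped a load node carries either 4 operands (immediate offset only) or 5 (a register offset at index 1 as well), mismatched counts bail out, register offsets must match when present, and the immediate offset is always found at index NumOps - 3.

    #include <cassert>
    #include <cstdio>

    struct LoadOperands {
      unsigned NumOps; // operand count with glue stripped: 4 or 5
      int Base;        // operand 0
      int RegOffset;   // operand 1, only meaningful when NumOps == 5
      int ImmOffset;   // operand NumOps - 3
    };

    // Mirrors the added checks: equal operand counts, equal bases, equal
    // register offsets when present; the immediate offsets are compared next.
    static bool sameBaseCompatibleOffsets(const LoadOperands &A,
                                          const LoadOperands &B) {
      if (A.NumOps != B.NumOps || A.Base != B.Base)
        return false;
      assert(A.NumOps == 4 || A.NumOps == 5);
      return A.NumOps != 5 || A.RegOffset == B.RegOffset;
    }

    int main() {
      LoadOperands L0{5, 1, 2, 16}, L1{5, 1, 2, 32};
      std::printf("%d\n", sameBaseCompatibleOffsets(L0, L1)); // 1
      return 0;
    }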
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 1b411eb83eb3..5840f45bdc5a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -184,8 +184,7 @@ public:
return ST;
}
- bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AAResults *AA) const override;
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override;
bool isIgnorableUse(const MachineOperand &MO) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 23afd6556bc9..81f8dcc482da 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -324,7 +324,8 @@ class isFloatType<ValueType SrcVT> {
// XXX - do v2i16 instructions?
class isIntType<ValueType SrcVT> {
- bit ret = !or(!eq(SrcVT.Value, i16.Value),
+ bit ret = !or(!eq(SrcVT.Value, i8.Value),
+ !eq(SrcVT.Value, i16.Value),
!eq(SrcVT.Value, i32.Value),
!eq(SrcVT.Value, i64.Value),
!eq(SrcVT.Value, v4i16.Value),
@@ -1411,6 +1412,10 @@ class IntSDWAInputModsMatchClass <int opSize> : AsmOperandClass {
def Int16SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<16>;
def Int32SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<32>;
+def Bin32SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<32> {
+ let Name = "SDWAWithBin32InputMods";
+ let ParserMethod = "parseRegOrImm";
+}
class IntSDWAInputMods <IntSDWAInputModsMatchClass matchClass> :
InputMods <matchClass> {
@@ -1419,6 +1424,7 @@ class IntSDWAInputMods <IntSDWAInputModsMatchClass matchClass> :
def Int16SDWAInputMods : IntSDWAInputMods<Int16SDWAInputModsMatchClass>;
def Int32SDWAInputMods : IntSDWAInputMods<Int32SDWAInputModsMatchClass>;
+def Bin32SDWAInputMods : IntSDWAInputMods<Bin32SDWAInputModsMatchClass>;
def IntVRegInputModsMatchClass : AsmOperandClass {
let Name = "VRegWithIntInputMods";
@@ -1897,94 +1903,94 @@ class getInsVOP3OpSel <RegisterOperand Src0RC, RegisterOperand Src1RC,
class getInsDPPBase <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers,
- Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
+ Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld> {
- dag ret = !if (!eq(NumSrcArgs, 0),
+ dag ret = !if(!eq(NumSrcArgs, 0),
// VOP1 without input operands (V_NOP)
(ins ),
- !if (!eq(NumSrcArgs, 1),
- !if (HasModifiers,
- // VOP1_DPP with modifiers
- (ins OldRC:$old, Src0Mod:$src0_modifiers,
- Src0RC:$src0)
- /* else */,
- // VOP1_DPP without modifiers
- (ins OldRC:$old, Src0RC:$src0)
- /* endif */),
- !if (!eq(NumSrcArgs, 2),
- !if (HasModifiers,
- // VOP2_DPP with modifiers
- (ins OldRC:$old,
- Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1)
- /* else */,
- // VOP2_DPP without modifiers
- (ins OldRC:$old,
- Src0RC:$src0, Src1RC:$src1)
- )
- /* NumSrcArgs == 3, VOP3 */,
- !if (HasModifiers,
- // VOP3_DPP with modifiers
- (ins OldRC:$old,
- Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- Src2Mod:$src2_modifiers, Src2RC:$src2)
- /* else */,
- // VOP3_DPP without modifiers
- (ins OldRC:$old,
- Src0RC:$src0, Src1RC:$src1,
- Src2RC:$src2)
+ !con(
+ !if(HasOld ,(ins OldRC:$old), (ins)),
+ !if (!eq(NumSrcArgs, 1),
+ !if (HasModifiers,
+ // VOP1_DPP with modifiers
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0)
+ /* else */,
+ // VOP1_DPP without modifiers
+ (ins Src0RC:$src0)
+ /* endif */),
+ !if (!eq(NumSrcArgs, 2),
+ !if (HasModifiers,
+ // VOP2_DPP with modifiers
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1)
+ /* else */,
+ // VOP2_DPP without modifiers
+ (ins Src0RC:$src0, Src1RC:$src1)
+ )
+ /* NumSrcArgs == 3, VOP3 */,
+ !if (HasModifiers,
+ // VOP3_DPP with modifiers
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2Mod:$src2_modifiers, Src2RC:$src2)
+ /* else */,
+ // VOP3_DPP without modifiers
+ (ins Src0RC:$src0, Src1RC:$src1,
+ Src2RC:$src2)
+ )
+ )
+ )
)
- /* endif */)));
+ );
}
class getInsDPP <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers,
- Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
+ Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> {
dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs,
- HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret,
+ HasModifiers, Src0Mod, Src1Mod, Src2Mod, HasOld>.ret,
(ins dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl));
}
class getInsDPP16 <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers,
- Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
+ Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> {
dag ret = !con(getInsDPP<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs,
- HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret,
+ HasModifiers, Src0Mod, Src1Mod, Src2Mod, HasOld>.ret,
(ins FI:$fi));
}
class getInsDPP8 <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers,
- Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
+ Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> {
dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs,
- HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret,
+ HasModifiers, Src0Mod, Src1Mod, Src2Mod, HasOld>.ret,
(ins dpp8:$dpp8, FI:$fi));
}
-class getInsVOP3DPPBase<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs> {
+class getInsVOP3DPPBase<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld> {
dag old = ( ins OldRC:$old );
dag base = VOP3Base;
dag ret = !con(
- !if(!ne(NumSrcArgs, 0), old, (ins)),
+ !if(!and(HasOld,!ne(NumSrcArgs, 0)), old, (ins)),
base
);
}
-class getInsVOP3DPP<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs> {
- dag ret = !con(getInsVOP3DPPBase<VOP3Base,OldRC,NumSrcArgs>.ret,
+class getInsVOP3DPP<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld = 1> {
+ dag ret = !con(getInsVOP3DPPBase<VOP3Base,OldRC,NumSrcArgs,HasOld>.ret,
(ins dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl));
}
-class getInsVOP3DPP16<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs> {
- dag ret = !con(getInsVOP3DPP<VOP3Base,OldRC,NumSrcArgs>.ret,
+class getInsVOP3DPP16<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld = 1> {
+ dag ret = !con(getInsVOP3DPP<VOP3Base,OldRC,NumSrcArgs,HasOld>.ret,
(ins FI:$fi));
}
-class getInsVOP3DPP8<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs> {
- dag ret = !con(getInsVOP3DPPBase<VOP3Base,OldRC,NumSrcArgs>.ret,
+class getInsVOP3DPP8<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld = 1> {
+ dag ret = !con(getInsVOP3DPPBase<VOP3Base,OldRC,NumSrcArgs,HasOld>.ret,
(ins dpp8:$dpp8, FI:$fi));
}
@@ -2665,6 +2671,8 @@ def VOP_V4I32_I64_I64_V4I32 : VOPProfile <[v4i32, i64, i64, v4i32]>;
def VOP_V16I32_I64_I64_V16I32 : VOPProfile <[v16i32, i64, i64, v16i32]>;
def VOP_V4F32_V2F32_V2F32_V4F32 : VOPProfile <[v4f32, v2f32, v2f32, v4f32]>;
def VOP_V16F32_V2F32_V2F32_V16F32 : VOPProfile <[v16f32, v2f32, v2f32, v16f32]>;
+def VOP_V4F32_I64_I64_V4F32 : VOPProfile <[v4f32, i64, i64, v4f32]>;
+def VOP_V16F32_I64_I64_V16F32 : VOPProfile <[v16f32, i64, i64, v16f32]>;
def VOP_V4F32_V4F16_V8F16_I32 : VOPProfile <[v4f32, v4f16, v8f16, i32]>;
def VOP_V16F32_V4F16_V8F16_I32 : VOPProfile <[v16f32, v4f16, v8f16, i32]>;
@@ -2672,6 +2680,8 @@ def VOP_V4F32_V4I16_V8I16_I32 : VOPProfile <[v4f32, v4i16, v8i16, i32]>;
def VOP_V16F32_V4I16_V8I16_I32 : VOPProfile <[v16f32, v4i16, v8i16, i32]>;
def VOP_V4I32_V2I32_V4I32_I32 : VOPProfile <[v4i32, v2i32, v4i32, i32]>;
def VOP_V16I32_V2I32_V4I32_I32 : VOPProfile <[v16i32, v2i32, v4i32, i32]>;
+def VOP_V4F32_V2I32_V4I32_I32 : VOPProfile <[v4f32, v2i32, v4i32, i32]>;
+def VOP_V16F32_V2I32_V4I32_I32 : VOPProfile <[v16f32, v2i32, v4i32, i32]>;
class Commutable_REV <string revOp, bit isOrig> {
string RevOp = revOp;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 0504c59ebd9e..9176e85568ee 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -44,6 +44,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
WorkGroupIDY(false),
WorkGroupIDZ(false),
WorkGroupInfo(false),
+ LDSKernelId(false),
PrivateSegmentWaveByteOffset(false),
WorkItemIDX(false),
WorkItemIDY(false),
@@ -143,6 +144,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
DispatchID = true;
+
+ if (!IsKernel && !F.hasFnAttribute("amdgpu-no-lds-kernel-id"))
+ LDSKernelId = true;
}
// FIXME: This attribute is a hack, we just need an analysis on the function
@@ -261,6 +265,12 @@ Register SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI)
return ArgInfo.ImplicitBufferPtr.getRegister();
}
+Register SIMachineFunctionInfo::addLDSKernelId() {
+ ArgInfo.LDSKernelId = ArgDescriptor::createRegister(getNextUserSGPR());
+ NumUserSGPRs += 1;
+ return ArgInfo.LDSKernelId.getRegister();
+}
+
bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
MCPhysReg Reg) {
for (unsigned I = 0; CSRegs[I]; ++I) {
@@ -561,6 +571,7 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
Any |= convertArg(AI.KernargSegmentPtr, ArgInfo.KernargSegmentPtr);
Any |= convertArg(AI.DispatchID, ArgInfo.DispatchID);
Any |= convertArg(AI.FlatScratchInit, ArgInfo.FlatScratchInit);
+ Any |= convertArg(AI.LDSKernelId, ArgInfo.LDSKernelId);
Any |= convertArg(AI.PrivateSegmentSize, ArgInfo.PrivateSegmentSize);
Any |= convertArg(AI.WorkGroupIDX, ArgInfo.WorkGroupIDX);
Any |= convertArg(AI.WorkGroupIDY, ArgInfo.WorkGroupIDY);
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index bebb13cbf09f..5105587617fd 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -191,6 +191,7 @@ struct SIArgumentInfo {
Optional<SIArgument> WorkGroupIDY;
Optional<SIArgument> WorkGroupIDZ;
Optional<SIArgument> WorkGroupInfo;
+ Optional<SIArgument> LDSKernelId;
Optional<SIArgument> PrivateSegmentWaveByteOffset;
Optional<SIArgument> ImplicitArgPtr;
@@ -215,6 +216,7 @@ template <> struct MappingTraits<SIArgumentInfo> {
YamlIO.mapOptional("workGroupIDY", AI.WorkGroupIDY);
YamlIO.mapOptional("workGroupIDZ", AI.WorkGroupIDZ);
YamlIO.mapOptional("workGroupInfo", AI.WorkGroupInfo);
+ YamlIO.mapOptional("LDSKernelId", AI.LDSKernelId);
YamlIO.mapOptional("privateSegmentWaveByteOffset",
AI.PrivateSegmentWaveByteOffset);
@@ -418,6 +420,7 @@ private:
bool WorkGroupIDY : 1;
bool WorkGroupIDZ : 1;
bool WorkGroupInfo : 1;
+ bool LDSKernelId : 1;
bool PrivateSegmentWaveByteOffset : 1;
bool WorkItemIDX : 1; // Always initialized.
@@ -608,6 +611,7 @@ public:
Register addDispatchID(const SIRegisterInfo &TRI);
Register addFlatScratchInit(const SIRegisterInfo &TRI);
Register addImplicitBufferPtr(const SIRegisterInfo &TRI);
+ Register addLDSKernelId();
/// Increment user SGPRs used for padding the argument list only.
Register addReservedUserSGPR() {
@@ -705,6 +709,8 @@ public:
return WorkGroupInfo;
}
+ bool hasLDSKernelId() const { return LDSKernelId; }
+
bool hasPrivateSegmentWaveByteOffset() const {
return PrivateSegmentWaveByteOffset;
}
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 66bc46aaefea..19a83ad53e2e 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -12,6 +12,8 @@
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/InitializePasses.h"
using namespace llvm;
@@ -26,6 +28,10 @@ class SIOptimizeExecMasking : public MachineFunctionPass {
const SIRegisterInfo *TRI = nullptr;
const SIInstrInfo *TII = nullptr;
const MachineRegisterInfo *MRI = nullptr;
+ MCRegister Exec;
+
+ DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
+ SmallVector<std::pair<MachineInstr *, MachineInstr *>, 1> OrXors;
Register isCopyFromExec(const MachineInstr &MI) const;
Register isCopyToExec(const MachineInstr &MI) const;
@@ -44,13 +50,13 @@ class SIOptimizeExecMasking : public MachineFunctionPass {
std::function<bool(MachineInstr *)> Pred,
ArrayRef<MCRegister> NonModifiableRegs,
unsigned MaxInstructions = 20) const;
- MachineInstr *findPossibleVCMPVCMPXOptimization(MachineInstr &SaveExec,
- MCRegister Exec) const;
- bool optimizeExecSequence() const;
- bool optimizeVCmpxAndSaveexecSequence() const;
- bool optimizeSingleVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
- MachineInstr &VCmp,
- MCRegister Exec) const;
+ bool optimizeExecSequence();
+ void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI);
+ bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
+ MachineInstr &VCmp, MCRegister Exec) const;
+
+ void tryRecordOrSaveexecXorSequence(MachineInstr &MI);
+ bool optimizeOrSaveexecXorSequences();
public:
static char ID;
@@ -92,7 +98,7 @@ Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const {
case AMDGPU::S_MOV_B32:
case AMDGPU::S_MOV_B32_term: {
const MachineOperand &Src = MI.getOperand(1);
- if (Src.isReg() && Src.getReg() == TRI->getExec())
+ if (Src.isReg() && Src.getReg() == Exec)
return MI.getOperand(0).getReg();
}
}
@@ -107,8 +113,7 @@ Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
case AMDGPU::S_MOV_B64:
case AMDGPU::S_MOV_B32: {
const MachineOperand &Dst = MI.getOperand(0);
- if (Dst.isReg() && Dst.getReg() == TRI->getExec() &&
- MI.getOperand(1).isReg())
+ if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg())
return MI.getOperand(1).getReg();
break;
}
@@ -394,9 +399,7 @@ bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop,
// =>
// x = s_<op>_saveexec_b64 y
//
-bool SIOptimizeExecMasking::optimizeExecSequence() const {
- MCRegister Exec = TRI->getExec();
-
+bool SIOptimizeExecMasking::optimizeExecSequence() {
bool Changed = false;
for (MachineBasicBlock &MBB : *MF) {
MachineBasicBlock::reverse_iterator I = fixTerminators(MBB);
@@ -551,88 +554,9 @@ bool SIOptimizeExecMasking::optimizeExecSequence() const {
return Changed;
}
-// Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence
-// by looking at an instance of a s_and_saveexec instruction. Returns a pointer
-// to the v_cmp instruction if it is safe to replace the sequence (see the
-// conditions in the function body). This is after register allocation, so some
-// checks on operand dependencies need to be considered.
-MachineInstr *SIOptimizeExecMasking::findPossibleVCMPVCMPXOptimization(
- MachineInstr &SaveExec, MCRegister Exec) const {
-
- MachineInstr *VCmp = nullptr;
-
- Register SaveExecDest = SaveExec.getOperand(0).getReg();
- if (!TRI->isSGPRReg(*MRI, SaveExecDest))
- return nullptr;
-
- MachineOperand *SaveExecSrc0 =
- TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
- if (!SaveExecSrc0->isReg())
- return nullptr;
-
- // Try to find the last v_cmp instruction that defs the saveexec input
- // operand without any write to Exec or the saveexec input operand inbetween.
- VCmp = findInstrBackwards(
- SaveExec,
- [&](MachineInstr *Check) {
- return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
- Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
- },
- {Exec, SaveExecSrc0->getReg()});
-
- if (!VCmp)
- return nullptr;
-
- MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
- assert(VCmpDest && "Should have an sdst operand!");
-
- // Check if any of the v_cmp source operands is written by the saveexec.
- MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
- if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) &&
- SaveExec.modifiesRegister(Src0->getReg(), TRI))
- return nullptr;
-
- MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
- if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) &&
- SaveExec.modifiesRegister(Src1->getReg(), TRI))
- return nullptr;
-
- // Don't do the transformation if the destination operand is included in
- // it's MBB Live-outs, meaning it's used in any of it's successors, leading
- // to incorrect code if the v_cmp and therefore the def of
- // the dest operand is removed.
- if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
- return nullptr;
-
- // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the
- // s_and_saveexec, skip the optimization.
- if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), false,
- true) ||
- isRegisterInUseAfter(SaveExec, VCmpDest->getReg()))
- return nullptr;
-
- // Try to determine if there is a write to any of the VCmp
- // operands between the saveexec and the vcmp.
- // If yes, additional VGPR spilling might need to be inserted. In this case,
- // it's not worth replacing the instruction sequence.
- SmallVector<MCRegister, 2> NonDefRegs;
- if (Src0->isReg())
- NonDefRegs.push_back(Src0->getReg());
-
- if (Src1->isReg())
- NonDefRegs.push_back(Src1->getReg());
-
- if (!findInstrBackwards(
- SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
- NonDefRegs))
- return nullptr;
-
- return VCmp;
-}
-
// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
// operands extracted from a v_cmp ..., s_and_saveexec pattern.
-bool SIOptimizeExecMasking::optimizeSingleVCMPSaveExecSequence(
+bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const {
const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
@@ -678,50 +602,164 @@ bool SIOptimizeExecMasking::optimizeSingleVCMPSaveExecSequence(
if (Src1->isReg())
MRI->clearKillFlags(Src1->getReg());
+ SaveExecInstr.eraseFromParent();
+ VCmp.eraseFromParent();
+
return true;
}
-// After all s_op_saveexec instructions are inserted,
-// replace (on GFX10.3 and later)
// Record (on GFX10.3 and later) occurrences of
// v_cmp_* SGPR, IMM, VGPR
// s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
-// with
+// to be replaced with
// s_mov_b32 EXEC_SGPR_DEST, exec_lo
// v_cmpx_* IMM, VGPR
// to reduce pipeline stalls.
-bool SIOptimizeExecMasking::optimizeVCmpxAndSaveexecSequence() const {
+void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
+ MachineInstr &MI) {
if (!ST->hasGFX10_3Insts())
- return false;
+ return;
- bool Changed = false;
-
- DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
- MCRegister Exec = TRI->getExec();
const unsigned AndSaveExecOpcode =
ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
- for (MachineBasicBlock &MBB : *MF) {
- for (MachineInstr &MI : MBB) {
- // Record relevant v_cmp / s_and_saveexec instruction pairs for
- // replacement.
- if (MI.getOpcode() != AndSaveExecOpcode)
- continue;
+ if (MI.getOpcode() != AndSaveExecOpcode)
+ return;
+
+ Register SaveExecDest = MI.getOperand(0).getReg();
+ if (!TRI->isSGPRReg(*MRI, SaveExecDest))
+ return;
- if (MachineInstr *VCmp = findPossibleVCMPVCMPXOptimization(MI, Exec))
- SaveExecVCmpMapping[&MI] = VCmp;
+ MachineOperand *SaveExecSrc0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ if (!SaveExecSrc0->isReg())
+ return;
+
+ // Tries to find an opportunity to optimize a v_cmp ..., s_and_saveexec
+ // sequence by looking at an instance of an s_and_saveexec instruction. The
+ // v_cmp instruction is recorded if it is safe to replace the sequence
+ // (see the conditions in the function body). This is after register
+ // allocation, so some checks on operand dependencies need to be considered.
+ MachineInstr *VCmp = nullptr;
+
+ // Try to find the last v_cmp instruction that defs the saveexec input
+ // operand without any write to Exec or the saveexec input operand in between.
+ VCmp = findInstrBackwards(
+ MI,
+ [&](MachineInstr *Check) {
+ return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
+ Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
+ },
+ {Exec, SaveExecSrc0->getReg()});
+
+ if (!VCmp)
+ return;
+
+ MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
+ assert(VCmpDest && "Should have an sdst operand!");
+
+ // Check if any of the v_cmp source operands is written by the saveexec.
+ MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
+ if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) &&
+ MI.modifiesRegister(Src0->getReg(), TRI))
+ return;
+
+ MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
+ if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) &&
+ MI.modifiesRegister(Src1->getReg(), TRI))
+ return;
+
+ // Don't do the transformation if the destination operand is included in
+ // its MBB live-outs, meaning it is used in any of its successors, leading
+ // to incorrect code if the v_cmp and therefore the def of
+ // the dest operand is removed.
+ if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
+ return;
+
+ // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the
+ // s_and_saveexec, skip the optimization.
+ if (isRegisterInUseBetween(*VCmp, MI, VCmpDest->getReg(), false, true) ||
+ isRegisterInUseAfter(MI, VCmpDest->getReg()))
+ return;
+
+ // Try to determine if there is a write to any of the VCmp
+ // operands between the saveexec and the vcmp.
+ // If yes, additional VGPR spilling might need to be inserted. In this case,
+ // it's not worth replacing the instruction sequence.
+ SmallVector<MCRegister, 2> NonDefRegs;
+ if (Src0->isReg())
+ NonDefRegs.push_back(Src0->getReg());
+
+ if (Src1->isReg())
+ NonDefRegs.push_back(Src1->getReg());
+
+ if (!findInstrBackwards(
+ MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs))
+ return;
+
+ if (VCmp)
+ SaveExecVCmpMapping[&MI] = VCmp;
+}
+
+// Record occurrences of
+// s_or_saveexec s_o, s_i
+// s_xor exec, exec, s_o
+// to be replaced with
+// s_andn2_saveexec s_o, s_i.
+void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(MachineInstr &MI) {
+ const unsigned XorOpcode =
+ ST->isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;
+
+ if (MI.getOpcode() == XorOpcode && &MI != &MI.getParent()->front()) {
+ const MachineOperand &XorDst = MI.getOperand(0);
+ const MachineOperand &XorSrc0 = MI.getOperand(1);
+ const MachineOperand &XorSrc1 = MI.getOperand(2);
+
+ if (XorDst.isReg() && XorDst.getReg() == Exec && XorSrc0.isReg() &&
+ XorSrc1.isReg() &&
+ (XorSrc0.getReg() == Exec || XorSrc1.getReg() == Exec)) {
+ const unsigned OrSaveexecOpcode = ST->isWave32()
+ ? AMDGPU::S_OR_SAVEEXEC_B32
+ : AMDGPU::S_OR_SAVEEXEC_B64;
+
+ // Peek at the previous instruction and check if this is a relevant
+ // s_or_saveexec instruction.
+ MachineInstr &PossibleOrSaveexec = *MI.getPrevNode();
+ if (PossibleOrSaveexec.getOpcode() != OrSaveexecOpcode)
+ return;
+
+ const MachineOperand &OrDst = PossibleOrSaveexec.getOperand(0);
+ const MachineOperand &OrSrc0 = PossibleOrSaveexec.getOperand(1);
+ if (OrDst.isReg() && OrSrc0.isReg()) {
+ if ((XorSrc0.getReg() == Exec && XorSrc1.getReg() == OrDst.getReg()) ||
+ (XorSrc0.getReg() == OrDst.getReg() && XorSrc1.getReg() == Exec)) {
+ OrXors.emplace_back(&PossibleOrSaveexec, &MI);
+ }
+ }
}
}
+}
- for (const auto &Entry : SaveExecVCmpMapping) {
- MachineInstr *SaveExecInstr = Entry.getFirst();
- MachineInstr *VCmpInstr = Entry.getSecond();
+bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() {
+ if (OrXors.empty()) {
+ return false;
+ }
- if (optimizeSingleVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec)) {
- SaveExecInstr->eraseFromParent();
- VCmpInstr->eraseFromParent();
+ bool Changed = false;
+ const unsigned Andn2Opcode = ST->isWave32() ? AMDGPU::S_ANDN2_SAVEEXEC_B32
+ : AMDGPU::S_ANDN2_SAVEEXEC_B64;
- Changed = true;
- }
+ for (const auto &Pair : OrXors) {
+ MachineInstr *Or = nullptr;
+ MachineInstr *Xor = nullptr;
+ std::tie(Or, Xor) = Pair;
+ BuildMI(*Or->getParent(), Or->getIterator(), Or->getDebugLoc(),
+ TII->get(Andn2Opcode), Or->getOperand(0).getReg())
+ .addReg(Or->getOperand(1).getReg());
+
+ Or->eraseFromParent();
+ Xor->eraseFromParent();
+
+ Changed = true;
}
return Changed;
@@ -736,9 +774,42 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
TRI = ST->getRegisterInfo();
TII = ST->getInstrInfo();
MRI = &MF.getRegInfo();
+ Exec = TRI->getExec();
bool Changed = optimizeExecSequence();
- Changed |= optimizeVCmpxAndSaveexecSequence();
+
+ OrXors.clear();
+ SaveExecVCmpMapping.clear();
+ static unsigned SearchWindow = 10;
+ for (MachineBasicBlock &MBB : MF) {
+ unsigned SearchCount = 0;
+
+ for (auto &MI : llvm::reverse(MBB)) {
+ if (MI.isDebugInstr())
+ continue;
+
+ if (SearchCount >= SearchWindow) {
+ break;
+ }
+
+ tryRecordOrSaveexecXorSequence(MI);
+ tryRecordVCmpxAndSaveexecSequence(MI);
+
+ if (MI.modifiesRegister(Exec, TRI)) {
+ break;
+ }
+
+ ++SearchCount;
+ }
+ }
+
+ Changed |= optimizeOrSaveexecXorSequences();
+ for (const auto &Entry : SaveExecVCmpMapping) {
+ MachineInstr *SaveExecInstr = Entry.getFirst();
+ MachineInstr *VCmpInstr = Entry.getSecond();
+
+ Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec);
+ }
return Changed;
}
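A standalone sketch (plain C++ over strings rather than MachineInstr; every name is invented for the illustration) of the operand check behind tryRecordOrSaveexecXorSequence above: an s_xor that writes exec and whose two sources are exec and the destination of the immediately preceding s_or_saveexec is recorded, so the pair can later be rewritten as a single s_andn2_saveexec.

    #include <cstdio>
    #include <string>

    struct Op3 { std::string Dst, Src0, Src1; };

    static bool matchesOrXorPair(const Op3 &OrSaveexec, const Op3 &Xor,
                                 const std::string &Exec) {
      if (Xor.Dst != Exec)
        return false;
      // One xor source must be exec, the other the s_or_saveexec destination.
      return (Xor.Src0 == Exec && Xor.Src1 == OrSaveexec.Dst) ||
             (Xor.Src0 == OrSaveexec.Dst && Xor.Src1 == Exec);
    }

    int main() {
      Op3 Or{"s0", "s1", ""};              // s_or_saveexec_b32 s0, s1
      Op3 Xor{"exec_lo", "exec_lo", "s0"}; // s_xor_b32 exec_lo, exec_lo, s0
      // 1: foldable into s_andn2_saveexec_b32 s0, s1
      std::printf("%d\n", matchesOrXorPair(Or, Xor, "exec_lo"));
      return 0;
    }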
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index 57dbad468de8..aed84437b890 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -184,6 +184,16 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
if (isDefBetween(*TRI, LIS, CCReg, *Sel, *And))
return false;
+ // Cannot safely mirror live intervals with PHI nodes, so check for these
+ // before optimization.
+ SlotIndex SelIdx = LIS->getInstructionIndex(*Sel);
+ LiveInterval *SelLI = &LIS->getInterval(SelReg);
+ if (llvm::any_of(SelLI->vnis(),
+ [](const VNInfo *VNI) {
+ return VNI->isPHIDef();
+ }))
+ return false;
+
// TODO: Guard against implicit def operands?
LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' << *Cmp << '\t'
<< *And);
@@ -204,31 +214,34 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
LLVM_DEBUG(dbgs() << "=>\n\t" << *Andn2 << '\n');
- SlotIndex CmpIdx = LIS->getInstructionIndex(*Cmp);
- SlotIndex SelIdx = LIS->getInstructionIndex(*Sel);
-
- LiveInterval *CmpLI =
- CmpReg.isVirtual() ? &LIS->getInterval(CmpReg) : nullptr;
- LiveInterval *SelLI =
- SelReg.isVirtual() ? &LIS->getInterval(SelReg) : nullptr;
-
// Update live intervals for CCReg before potentially removing CmpReg/SelReg,
// and their associated liveness information.
+ SlotIndex CmpIdx = LIS->getInstructionIndex(*Cmp);
if (CCReg.isVirtual()) {
- // Note: this ignores that SelLI might have multiple internal values
- // or splits and simply extends the live range to cover all cases
- // where the result of the v_cndmask_b32 was live (e.g. loops).
- // This could yield worse register allocation in rare edge cases.
- SlotIndex EndIdx = AndIdx.getRegSlot();
- if (SelLI && SelLI->endIndex() > EndIdx && SelLI->endIndex().isBlock())
- EndIdx = SelLI->endIndex();
+ // Apply live ranges from SelLI to CCReg potentially matching splits
+ // and extending to loop boundaries.
+
+ auto applyLiveRanges = [&](LiveRange &Dst, VNInfo *VNI) {
+ // Copy live ranges from SelLI, adjusting start and end as required
+ auto DefSegment = SelLI->FindSegmentContaining(SelIdx.getRegSlot());
+ assert(DefSegment != SelLI->end() &&
+ "No live interval segment covering definition?");
+ for (auto I = DefSegment; I != SelLI->end(); ++I) {
+ SlotIndex Start = I->start < SelIdx.getRegSlot() ?
+ SelIdx.getRegSlot() : I->start;
+ SlotIndex End = I->end < AndIdx.getRegSlot() || I->end.isBlock() ?
+ I->end : AndIdx.getRegSlot();
+ Dst.addSegment(LiveRange::Segment(Start, End, VNI));
+ }
+ // If SelLI does not cover AndIdx (because Cmp killed Sel) then extend.
+ if (!SelLI->getSegmentContaining(AndIdx.getRegSlot()))
+ Dst.addSegment(LiveRange::Segment(CmpIdx.getRegSlot(), AndIdx.getRegSlot(), VNI));
+ };
LiveInterval &CCLI = LIS->getInterval(CCReg);
auto CCQ = CCLI.Query(SelIdx.getRegSlot());
- if (CCQ.valueIn()) {
- CCLI.addSegment(LiveRange::Segment(SelIdx.getRegSlot(),
- EndIdx, CCQ.valueIn()));
- }
+ if (CCQ.valueIn())
+ applyLiveRanges(CCLI, CCQ.valueIn());
if (CC->getSubReg()) {
LaneBitmask Mask = TRI->getSubRegIndexLaneMask(CC->getSubReg());
@@ -237,10 +250,8 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
Allocator, Mask,
[=](LiveInterval::SubRange &SR) {
auto CCQS = SR.Query(SelIdx.getRegSlot());
- if (CCQS.valueIn()) {
- SR.addSegment(LiveRange::Segment(
- SelIdx.getRegSlot(), EndIdx, CCQS.valueIn()));
- }
+ if (CCQS.valueIn())
+ applyLiveRanges(SR, CCQS.valueIn());
},
*LIS->getSlotIndexes(), *TRI);
CCLI.removeEmptySubRanges();
@@ -253,7 +264,8 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
// Try to remove compare. Cmp value should not used in between of cmp
// and s_and_b64 if VCC or just unused if any other register.
- if ((CmpReg.isVirtual() && CmpLI && CmpLI->Query(AndIdx.getRegSlot()).isKill()) ||
+ LiveInterval *CmpLI = CmpReg.isVirtual() ? &LIS->getInterval(CmpReg) : nullptr;
+ if ((CmpLI && CmpLI->Query(AndIdx.getRegSlot()).isKill()) ||
(CmpReg == Register(CondReg) &&
std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(),
[&](const MachineInstr &MI) {
@@ -266,18 +278,16 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
Cmp->eraseFromParent();
// Try to remove v_cndmask_b32.
- if (SelLI) {
- // Kill status must be checked before shrinking the live range.
- bool IsKill = SelLI->Query(CmpIdx.getRegSlot()).isKill();
- LIS->shrinkToUses(SelLI);
- bool IsDead = SelLI->Query(SelIdx.getRegSlot()).isDeadDef();
- if (MRI->use_nodbg_empty(SelReg) && (IsKill || IsDead)) {
- LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
-
- LIS->removeVRegDefAt(*SelLI, SelIdx.getRegSlot());
- LIS->RemoveMachineInstrFromMaps(*Sel);
- Sel->eraseFromParent();
- }
+ // Kill status must be checked before shrinking the live range.
+ bool IsKill = SelLI->Query(CmpIdx.getRegSlot()).isKill();
+ LIS->shrinkToUses(SelLI);
+ bool IsDead = SelLI->Query(SelIdx.getRegSlot()).isDeadDef();
+ if (MRI->use_nodbg_empty(SelReg) && (IsKill || IsDead)) {
+ LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
+
+ LIS->removeVRegDefAt(*SelLI, SelIdx.getRegSlot());
+ LIS->RemoveMachineInstrFromMaps(*Sel);
+ Sel->eraseFromParent();
}
}
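A standalone sketch (plain C++ with integer indices standing in for SlotIndex and a plain struct for LiveRange segments; all names are invented) of the clamping done by the applyLiveRanges lambda above: segments copied from the v_cndmask_b32 result onto the condition register start no earlier than the select's definition and end no later than the s_andn2 use, except that segment ends on block boundaries (e.g. loop back-edges) are kept.

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct Segment {
      int Start, End;
      bool EndIsBlockBoundary; // block-boundary ends are preserved when clamping
    };

    static std::vector<Segment> clampSegments(const std::vector<Segment> &Src,
                                              int SelIdx, int AndIdx) {
      std::vector<Segment> Dst;
      for (const Segment &S : Src) {
        int Start = std::max(S.Start, SelIdx);
        int End = (S.End < AndIdx || S.EndIsBlockBoundary) ? S.End : AndIdx;
        Dst.push_back({Start, End, S.EndIsBlockBoundary});
      }
      return Dst;
    }

    int main() {
      std::vector<Segment> Sel = {{10, 40, false}, {40, 60, true}};
      for (const Segment &S : clampSegments(Sel, /*SelIdx=*/20, /*AndIdx=*/30))
        std::printf("[%d, %d)\n", S.Start, S.End); // [20, 30) then [40, 60)
      return 0;
    }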
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index b13afceba20e..553fb4cf496c 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -49,6 +49,8 @@ struct SIProgramInfo {
uint32_t AccumOffset = 0;
uint32_t TgSplit = 0;
uint32_t NumSGPR = 0;
+ unsigned SGPRSpill = 0;
+ unsigned VGPRSpill = 0;
uint32_t LDSSize = 0;
bool FlatUsed = false;
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 882d13402a19..b7e8eadfe71d 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -119,13 +119,19 @@ class SM_Probe_Pseudo <string opName, string variant, RegisterClass baseClass,
let PseudoInstr = opName # variant;
}
-class SM_Load_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]>
- : SM_Pseudo<opName, outs, ins, asmOps, pattern> {
- RegisterClass BaseClass;
+class SM_Load_Pseudo <string opName, RegisterClass baseClass,
+ RegisterClass dstClass, OffsetMode offsets>
+ : SM_Pseudo<opName, (outs dstClass:$sdst),
+ !con((ins baseClass:$sbase), offsets.Ins, (ins CPol:$cpol)),
+ " $sdst, $sbase, " # offsets.Asm # "$cpol", []> {
+ RegisterClass BaseClass = baseClass;
let mayLoad = 1;
let mayStore = 0;
let has_glc = 1;
let has_dlc = 1;
+ let has_offset = offsets.HasOffset;
+ let has_soffset = offsets.HasSOffset;
+ let PseudoInstr = opName # offsets.Variant;
}
class SM_Store_Pseudo <string opName, RegisterClass baseClass,
@@ -158,40 +164,9 @@ class SM_Discard_Pseudo <string opName, string variant, dag offsets,
multiclass SM_Pseudo_Loads<string opName,
RegisterClass baseClass,
RegisterClass dstClass> {
- def _IMM : SM_Load_Pseudo <opName,
- (outs dstClass:$sdst),
- (ins baseClass:$sbase, i32imm:$offset, CPol:$cpol),
- " $sdst, $sbase, $offset$cpol", []> {
- let has_offset = 1;
- let BaseClass = baseClass;
- let PseudoInstr = opName # "_IMM";
- let has_glc = 1;
- let has_dlc = 1;
- }
-
- def _SGPR : SM_Load_Pseudo <opName,
- (outs dstClass:$sdst),
- (ins baseClass:$sbase, SReg_32:$soffset, CPol:$cpol),
- " $sdst, $sbase, $soffset$cpol", []> {
- let has_soffset = 1;
- let BaseClass = baseClass;
- let PseudoInstr = opName # "_SGPR";
- let has_glc = 1;
- let has_dlc = 1;
- }
-
- def _SGPR_IMM : SM_Load_Pseudo <opName,
- (outs dstClass:$sdst),
- (ins baseClass:$sbase, SReg_32:$soffset,
- i32imm:$offset, CPol:$cpol),
- " $sdst, $sbase, $soffset$offset$cpol", []> {
- let has_offset = 1;
- let has_soffset = 1;
- let BaseClass = baseClass;
- let PseudoInstr = opName # "_SGPR_IMM";
- let has_glc = 1;
- let has_dlc = 1;
- }
+ def _IMM : SM_Load_Pseudo <opName, baseClass, dstClass, IMM_Offset>;
+ def _SGPR : SM_Load_Pseudo <opName, baseClass, dstClass, SGPR_Offset>;
+ def _SGPR_IMM : SM_Load_Pseudo <opName, baseClass, dstClass, SGPR_IMM_Offset>;
}
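Editor's sketch of the refactor just above: the three hand-written _IMM/_SGPR/_SGPR_IMM definitions collapse into one class keyed by a small descriptor. A rough standalone C++ analogue of that data-driven pattern follows; OffsetMode here is a stand-in struct, not the TableGen record.

#include <iostream>
#include <string>

// Stand-in for the TableGen OffsetMode record: a variant suffix plus flags
// saying which offset operands the variant carries.
struct OffsetMode {
  const char *Variant;
  bool HasOffset;  // immediate offset operand
  bool HasSOffset; // SGPR offset operand
};

// One parametrized "definition" replacing three near-identical ones.
void defineLoadPseudo(const std::string &OpName, const OffsetMode &M) {
  std::cout << OpName << M.Variant << "  offset=" << M.HasOffset
            << "  soffset=" << M.HasSOffset << '\n';
}

int main() {
  const OffsetMode Modes[] = {{"_IMM", true, false},
                              {"_SGPR", false, true},
                              {"_SGPR_IMM", true, true}};
  for (const OffsetMode &M : Modes)
    defineLoadPseudo("S_LOAD_DWORD", M); // mirrors SM_Pseudo_Loads above
}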
multiclass SM_Pseudo_Stores<string opName,
@@ -596,10 +571,10 @@ class SMEM_Real_vi <bits<8> op, SM_Pseudo ps>
soffset{6-0}, ?);
}
-class SMEM_Real_Load_vi<bits<8> op, string ps, dag offsets>
- : SMEM_Real_vi<op, !cast<SM_Pseudo>(ps)> {
- RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps).BaseClass;
- let InOperandList = !con((ins BaseClass:$sbase), offsets, (ins CPol:$cpol));
+class SMEM_Real_Load_vi<bits<8> op, string ps, OffsetMode offsets>
+ : SMEM_Real_vi<op, !cast<SM_Pseudo>(ps # offsets.Variant)> {
+ RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass;
+ let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol));
}
// The alternative GFX9 SGPR encoding using soffset to encode the
@@ -614,14 +589,12 @@ class SMEM_Real_SGPR_alt_gfx9 {
}
multiclass SM_Real_Loads_vi<bits<8> op, string ps> {
- def _IMM_vi : SMEM_Real_Load_vi <op, ps#"_IMM", (ins smem_offset:$offset)>;
- def _SGPR_vi : SMEM_Real_Load_vi <op, ps#"_SGPR", (ins SReg_32:$soffset)>;
- def _SGPR_alt_gfx9 : SMEM_Real_Load_vi <op, ps#"_SGPR",
- (ins SReg_32:$soffset)>,
+ def _IMM_vi : SMEM_Real_Load_vi <op, ps, IMM_Offset>;
+ def _SGPR_vi : SMEM_Real_Load_vi <op, ps, SGPR_Offset>;
+ def _SGPR_alt_gfx9 : SMEM_Real_Load_vi <op, ps, SGPR_Offset>,
SMEM_Real_SGPR_alt_gfx9;
let IsGFX9SpecificEncoding = true in
- def _SGPR_IMM_gfx9 : SMEM_Real_Load_vi <
- op, ps#"_SGPR_IMM", (ins SReg_32:$soffset, smem_offset_mod:$offset)>;
+ def _SGPR_IMM_gfx9 : SMEM_Real_Load_vi <op, ps, SGPR_IMM_Offset>;
}
class SMEM_Real_Store_Base_vi <bits<8> op, SM_Pseudo ps> : SMEM_Real_vi <op, ps> {
@@ -883,6 +856,7 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformL
def SMRDImm : ComplexPattern<iPTR, 2, "SelectSMRDImm">;
def SMRDImm32 : ComplexPattern<iPTR, 2, "SelectSMRDImm32">;
def SMRDSgpr : ComplexPattern<iPTR, 2, "SelectSMRDSgpr">;
+def SMRDSgprImm : ComplexPattern<iPTR, 3, "SelectSMRDSgprImm">;
def SMRDBufferImm : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">;
def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">;
@@ -903,11 +877,18 @@ multiclass SMRD_Pattern <string Instr, ValueType vt> {
// 3. SGPR offset
def : GCNPat <
- (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0))
+ (smrd_load (SMRDSgpr i64:$sbase, i32:$soffset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $soffset, 0))
>;
- // 4. No offset
+ // 4. SGPR+IMM offset
+ def : GCNPat <
+ (smrd_load (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))> {
+ let OtherPredicates = [isGFX9Plus];
+ }
+
+ // 5. No offset
def : GCNPat <
(vt (smrd_load (i64 SReg_64:$sbase))),
(vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))
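The new case 4 pattern selects loads whose address decomposes into a 64-bit base, an SGPR offset, and an immediate offset handled by a single instruction (GFX9+). A minimal sketch of the address arithmetic the three addressing forms express, assuming plain byte offsets:

#include <cassert>
#include <cstdint>

uint64_t smemAddrImm(uint64_t SBase, uint32_t Imm) { return SBase + Imm; }
uint64_t smemAddrSgpr(uint64_t SBase, uint32_t SOff) { return SBase + SOff; }
// New SGPR+IMM form: both offsets applied by one instruction.
uint64_t smemAddrSgprImm(uint64_t SBase, uint32_t SOff, uint32_t Imm) {
  return SBase + SOff + Imm;
}

int main() {
  assert(smemAddrSgprImm(0x1000, 0x40, 0x8) ==
         smemAddrSgpr(0x1000, 0x40) + 0x8);
  assert(smemAddrSgprImm(0x1000, 0x40, 0x8) == smemAddrImm(0x1040, 0x8));
}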
@@ -1021,19 +1002,16 @@ class SMEM_Real_gfx10<bits<8> op, SM_Pseudo ps>
let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ?);
}
-multiclass SM_Real_Loads_gfx10<bits<8> op, string ps,
- SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM),
- SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
- def _IMM_gfx10 : SMEM_Real_gfx10<op, immPs> {
- let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol);
- }
- def _SGPR_gfx10 : SMEM_Real_gfx10<op, sgprPs> {
- let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$soffset, CPol:$cpol);
- }
- def _SGPR_IMM_gfx10 : SMEM_Real_gfx10<op, !cast<SM_Load_Pseudo>(ps#_SGPR_IMM)> {
- let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$soffset,
- smem_offset_mod:$offset, CPol:$cpol);
- }
+class SMEM_Real_Load_gfx10<bits<8> op, string ps, OffsetMode offsets>
+ : SMEM_Real_gfx10<op, !cast<SM_Pseudo>(ps # offsets.Variant)> {
+ RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass;
+ let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol));
+}
+
+multiclass SM_Real_Loads_gfx10<bits<8> op, string ps> {
+ def _IMM_gfx10 : SMEM_Real_Load_gfx10<op, ps, IMM_Offset>;
+ def _SGPR_gfx10 : SMEM_Real_Load_gfx10<op, ps, SGPR_Offset>;
+ def _SGPR_IMM_gfx10 : SMEM_Real_Load_gfx10<op, ps, SGPR_IMM_Offset>;
}
class SMEM_Real_Store_gfx10<bits<8> op, SM_Pseudo ps> : SMEM_Real_gfx10<op, ps> {
@@ -1227,17 +1205,16 @@ class SMEM_Real_gfx11<bits<8> op, SM_Pseudo ps, string opName = ps.Mnemonic> :
let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, 0);
}
-class SMEM_Real_Load_gfx11<bits<8> op, string ps, string opName, dag offsets> :
- SMEM_Real_gfx11<op, !cast<SM_Pseudo>(ps), opName> {
- RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps).BaseClass;
- let InOperandList = !con((ins BaseClass:$sbase), offsets, (ins CPol:$cpol));
+class SMEM_Real_Load_gfx11<bits<8> op, string ps, string opName, OffsetMode offsets> :
+ SMEM_Real_gfx11<op, !cast<SM_Pseudo>(ps # offsets.Variant), opName> {
+ RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass;
+ let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol));
}
multiclass SM_Real_Loads_gfx11<bits<8> op, string ps, string opName> {
- def _IMM_gfx11 : SMEM_Real_Load_gfx11<op, ps#"_IMM", opName, (ins smem_offset:$offset)>;
- def _SGPR_gfx11 : SMEM_Real_Load_gfx11<op, ps#"_SGPR", opName, (ins SReg_32:$soffset)>;
- def _SGPR_IMM_gfx11 : SMEM_Real_Load_gfx11<
- op, ps#"_SGPR_IMM", opName, (ins SReg_32:$soffset, smem_offset_mod:$offset)>;
+ def _IMM_gfx11 : SMEM_Real_Load_gfx11<op, ps, opName, IMM_Offset>;
+ def _SGPR_gfx11 : SMEM_Real_Load_gfx11<op, ps, opName, SGPR_Offset>;
+ def _SGPR_IMM_gfx11 : SMEM_Real_Load_gfx11<op, ps, opName, SGPR_IMM_Offset>;
def : MnemonicAlias<!cast<SM_Pseudo>(ps#"_IMM").Mnemonic, opName>,
Requires<[isGFX11Plus]>;
}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 2f334e211181..b5fb390c08e1 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -417,9 +417,9 @@ bool getMAIIsGFX940XDL(unsigned Opc) {
CanBeVOPD getCanBeVOPD(unsigned Opc) {
const VOPDComponentInfo *Info = getVOPDComponentHelper(Opc);
if (Info)
- return {Info->CanBeVOPDX, 1};
+ return {Info->CanBeVOPDX, true};
else
- return {0, 0};
+ return {false, false};
}
unsigned getVOPDOpcode(unsigned Opc) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
index 65ed02ca62de..a2d59abd3abb 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
@@ -30,7 +30,7 @@ namespace AMDGPU {
Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
std::vector<GlobalVariable *> findVariablesToLower(Module &M,
- const Function *F = nullptr);
+ const Function *F);
/// Replace all uses of constant \p C with instructions in \p F.
void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F);
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 1d374a9f90ba..73e4eb8cdc24 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -499,6 +499,59 @@ let SubtargetPredicate = isGFX9Only in {
defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>;
} // End SubtargetPredicate = isGFX9Only
+class VOPProfile_Base_CVT_F32_F8<ValueType vt> : VOPProfileI2F <vt, i32> {
+ let HasExtSDWA = 1;
+ let HasExtSDWA9 = 1;
+ let HasExt = 1;
+ let DstRCSDWA = getVALUDstForVT<vt>.ret;
+ let InsSDWA = (ins Bin32SDWAInputMods:$src0_modifiers, Src0SDWA:$src0,
+ clampmod:$clamp, omod:$omod, src0_sel:$src0_sel);
+ let AsmSDWA = "$vdst, $src0_modifiers$clamp$omod $src0_sel"; // No dst_sel
+ let AsmSDWA9 = AsmSDWA;
+ let EmitDstSel = 0;
+}
+
+def VOPProfileCVT_F32_F8 : VOPProfile_Base_CVT_F32_F8 <f32>;
+def VOPProfileCVT_PK_F32_F8 : VOPProfile_Base_CVT_F32_F8 <v2f32>;
+
+let SubtargetPredicate = HasFP8Insts, mayRaiseFPException = 0,
+ SchedRW = [WriteFloatCvt] in {
+ defm V_CVT_F32_FP8 : VOP1Inst<"v_cvt_f32_fp8", VOPProfileCVT_F32_F8>;
+ defm V_CVT_F32_BF8 : VOP1Inst<"v_cvt_f32_bf8", VOPProfileCVT_F32_F8>;
+ defm V_CVT_PK_F32_FP8 : VOP1Inst<"v_cvt_pk_f32_fp8", VOPProfileCVT_PK_F32_F8>;
+ defm V_CVT_PK_F32_BF8 : VOP1Inst<"v_cvt_pk_f32_bf8", VOPProfileCVT_PK_F32_F8>;
+}
+
+class Cvt_F32_F8_Pat<SDPatternOperator node, int index,
+ VOP1_Pseudo inst_e32, VOP1_SDWA_Pseudo inst_sdwa> : GCNPat<
+ (f32 (node i32:$src, index)),
+ !if (index,
+ (inst_sdwa 0, $src, 0, 0, index),
+ (inst_e32 $src))
+>;
+
+foreach Index = [0, 1, 2, 3] in {
+ def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_fp8, Index,
+ V_CVT_F32_FP8_e32, V_CVT_F32_FP8_sdwa>;
+ def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_bf8, Index,
+ V_CVT_F32_BF8_e32, V_CVT_F32_BF8_sdwa>;
+}
+
+class Cvt_PK_F32_F8_Pat<SDPatternOperator node, int index,
+ VOP1_Pseudo inst_e32, VOP1_SDWA_Pseudo inst_sdwa> : GCNPat<
+ (v2f32 (node i32:$src, index)),
+ !if (index,
+ (inst_sdwa 0, $src, 0, 0, SDWA.WORD_1),
+ (inst_e32 $src))
+>;
+
+foreach Index = [0, -1] in {
+ def : Cvt_PK_F32_F8_Pat<int_amdgcn_cvt_pk_f32_fp8, Index,
+ V_CVT_PK_F32_FP8_e32, V_CVT_PK_F32_FP8_sdwa>;
+ def : Cvt_PK_F32_F8_Pat<int_amdgcn_cvt_pk_f32_bf8, Index,
+ V_CVT_PK_F32_BF8_e32, V_CVT_PK_F32_BF8_sdwa>;
+}
+
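The Cvt_F32_F8_Pat patterns above pick one of four FP8 values packed in a 32-bit source: index 0 uses the plain e32 encoding, while a non-zero index goes through the SDWA form's byte select. A hedged sketch of the byte selection the index expresses (the FP8-to-F32 numeric conversion itself is left to the hardware and not modelled here):

#include <cassert>
#include <cstdint>

// Byte `Index` of a 32-bit register, as chosen by src0_sel BYTE_0..BYTE_3.
uint8_t selectFp8Byte(uint32_t Src, unsigned Index) {
  assert(Index < 4 && "byte select is two bits");
  return static_cast<uint8_t>(Src >> (8 * Index));
}

int main() {
  uint32_t Packed = 0xDDCCBBAAu; // four packed FP8 values
  assert(selectFp8Byte(Packed, 0) == 0xAA);
  assert(selectFp8Byte(Packed, 3) == 0xDD);
}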
let SubtargetPredicate = isGFX10Plus in {
defm V_PIPEFLUSH : VOP1Inst<"v_pipeflush", VOP_NO_EXT<VOP_NONE>>;
@@ -1106,11 +1159,36 @@ multiclass VOP1_Real_gfx9 <bits<10> op> {
}
+multiclass VOP1_Real_NoDstSel_SDWA_gfx9 <bits<10> op> {
+ let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in {
+ defm NAME : VOP1_Real_e32e64_vi <op>;
+ }
+
+ foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
+ def _sdwa_gfx9 :
+ VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
+ let Inst{42-40} = 6;
+ }
+
+ foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_gfx9 :
+ VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
+ VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
+}
+
defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
let AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9" in
defm V_MOV_B64 : VOP1_Real_gfx9 <0x38>;
+let OtherPredicates = [HasFP8Insts] in {
+defm V_CVT_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x54>;
+defm V_CVT_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x55>;
+defm V_CVT_PK_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>;
+defm V_CVT_PK_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x57>;
+}
+
//===----------------------------------------------------------------------===//
// GFX10
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index dddd0aacc140..a911483cade5 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -481,6 +481,30 @@ def shl_0_to_4 : PatFrag<
}];
}
+def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> {
+ let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
+ FP32InputMods:$src1_modifiers, Src1RC64:$src1,
+ VGPR_32:$vdst_in, op_sel0:$op_sel);
+ let HasClamp = 0;
+ let HasExtVOP3DPP = 0;
+}
+
+def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
+ VOP3_OPSEL> {
+ let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
+ FP32InputMods:$src1_modifiers, Src1RC64:$src1,
+ FP32InputMods:$src2_modifiers, VGPR_32:$src2,
+ op_sel0:$op_sel);
+ let HasClamp = 0;
+ let HasSrc2 = 0;
+ let HasSrc2Mods = 1;
+ let AsmVOP3OpSel = !subst(", $src2_modifiers", "",
+ getAsmVOP3OpSel<3, HasClamp,
+ HasSrc0FloatMods, HasSrc1FloatMods,
+ HasSrc2FloatMods>.ret);
+ let HasExtVOP3DPP = 0;
+}
+
let SubtargetPredicate = isGFX9Plus in {
let isCommutable = 1, isReMaterializable = 1 in {
defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -526,6 +550,43 @@ defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32
let SubtargetPredicate = isGFX940Plus in
defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", VOP3_Profile<VOP_I64_I64_I32_I64>>;
+let SubtargetPredicate = HasFP8Insts, mayRaiseFPException = 0,
+ SchedRW = [WriteFloatCvt] in {
+ let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
+ defm V_CVT_PK_FP8_F32 : VOP3Inst<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile>;
+ defm V_CVT_PK_BF8_F32 : VOP3Inst<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile>;
+ }
+
+ // These instructions have non-standard use of op_sel. In particular they are
+ // using op_sel bits 2 and 3 while only having two sources. Therefore dummy
+ // src2 is used to hold the op_sel value.
+ let Constraints = "$vdst = $src2", DisableEncoding = "$src2" in {
+ defm V_CVT_SR_FP8_F32 : VOP3Inst<"v_cvt_sr_fp8_f32", VOP3_CVT_SR_F8_F32_Profile>;
+ defm V_CVT_SR_BF8_F32 : VOP3Inst<"v_cvt_sr_bf8_f32", VOP3_CVT_SR_F8_F32_Profile>;
+ }
+}
+
+class Cvt_PK_F8_F32_Pat<SDPatternOperator node, int index, VOP3_Pseudo inst> : GCNPat<
+ (i32 (node f32:$src0, f32:$src1, i32:$old, index)),
+ (inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, !if(index, SRCMODS.OP_SEL_0, 0))
+>;
+
+class Cvt_SR_F8_F32_Pat<SDPatternOperator node, bits<2> index, VOP3_Pseudo inst> : GCNPat<
+ (i32 (node f32:$src0, i32:$src1, i32:$old, index)),
+ (inst !if(index{1}, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1,
+ !if(index{0}, SRCMODS.OP_SEL_0, 0), $old, !if(index{1}, SRCMODS.OP_SEL_0, 0))
+>;
+
+foreach Index = [0, -1] in {
+ def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_e64>;
+ def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_e64>;
+}
+
+foreach Index = [0, 1, 2, 3] in {
+ def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_fp8_f32, Index, V_CVT_SR_FP8_F32_e64>;
+ def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_bf8_f32, Index, V_CVT_SR_BF8_F32_e64>;
+}
+
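As the comment above notes, v_cvt_sr_fp8_f32 and v_cvt_sr_bf8_f32 route a 2-bit destination byte index through op_sel bits 2 and 3, which is why the profile keeps a dummy tied src2. A conceptual sketch of the byte insertion that index ultimately selects (not the encoding itself):

#include <cassert>
#include <cstdint>

// Write an 8-bit result into byte `Index` of the old 32-bit destination,
// leaving the other three bytes untouched.
uint32_t insertFp8Byte(uint32_t Old, uint8_t Fp8, unsigned Index) {
  assert(Index < 4 && "destination byte select is two bits");
  const uint32_t Mask = 0xFFu << (8 * Index);
  return (Old & ~Mask) | (uint32_t(Fp8) << (8 * Index));
}

int main() {
  assert(insertFp8Byte(0x11223344u, 0xAB, 2) == 0x11AB3344u);
}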
class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat <
// This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions.
(ThreeOpFrag<op1, op2> i32:$src0, i32:$src1, i32:$src2),
@@ -699,15 +760,19 @@ def : DivFmasPat<f64, V_DIV_FMAS_F64_e64, VCC_LO>;
}
class VOP3_DOT_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3_Profile<P, Features> {
- // FIXME VOP3 DPP versions are unsupported
- let HasExtVOP3DPP = 0;
let HasClamp = 0;
let HasOMod = 0;
- let InsVOP3OpSel = getInsVOP3OpSel<Src0RC64, Src1RC64, Src2RC64,
- NumSrcArgs, HasClamp, HasOMod,
- !if(isFloatType<Src0VT>.ret, FPVRegInputMods, IntOpSelMods),
- !if(isFloatType<Src1VT>.ret, FPVRegInputMods, IntOpSelMods),
- !if(isFloatType<Src2VT>.ret, FPVRegInputMods, IntOpSelMods)>.ret;
+ // Override modifiers for bf16(i16) (same as float modifiers).
+ let HasSrc0Mods = 1;
+ let HasSrc1Mods = 1;
+ let HasSrc2Mods = 1;
+ let Src0ModDPP = FPVRegInputMods;
+ let Src1ModDPP = FPVRegInputMods;
+ let Src2ModVOP3DPP = FPVRegInputMods;
+ let InsVOP3OpSel = getInsVOP3OpSel<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
+ HasClamp, HasOMod, FPVRegInputMods,
+ FPVRegInputMods, FPVRegInputMods>.ret;
+ let AsmVOP3OpSel = getAsmVOP3OpSel<NumSrcArgs, HasClamp, 1, 1, 1>.ret;
}
let SubtargetPredicate = isGFX11Plus in {
@@ -723,7 +788,7 @@ let SubtargetPredicate = isGFX11Plus in {
defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>;
} // End SubtargetPredicate = isGFX11Plus
-let SubtargetPredicate = HasDot8Insts in {
+let SubtargetPredicate = HasDot8Insts, IsDOT=1 in {
defm V_DOT2_F16_F16 : VOP3Inst<"v_dot2_f16_f16", VOP3_DOT_Profile<VOP_F16_V2F16_V2F16_F16>, int_amdgcn_fdot2_f16_f16>;
defm V_DOT2_BF16_BF16 : VOP3Inst<"v_dot2_bf16_bf16", VOP3_DOT_Profile<VOP_I16_V2I16_V2I16_I16>, int_amdgcn_fdot2_bf16_bf16>;
}
@@ -848,9 +913,8 @@ defm V_MAXMIN_U32 : VOP3_Realtriple_gfx11<0x262>;
defm V_MINMAX_U32 : VOP3_Realtriple_gfx11<0x263>;
defm V_MAXMIN_I32 : VOP3_Realtriple_gfx11<0x264>;
defm V_MINMAX_I32 : VOP3_Realtriple_gfx11<0x265>;
-// FIXME VOP3 DPP Dot instructions are unsupported
-defm V_DOT2_F16_F16 : VOP3_Real_Base_gfx11<0x266>;
-defm V_DOT2_BF16_BF16 : VOP3_Real_Base_gfx11<0x267>;
+defm V_DOT2_F16_F16 : VOP3Dot_Realtriple_gfx11<0x266>;
+defm V_DOT2_BF16_BF16 : VOP3Dot_Realtriple_gfx11<0x267>;
defm V_DIV_SCALE_F32 : VOP3be_Real_gfx11<0x2fc, "V_DIV_SCALE_F32", "v_div_scale_f32">;
defm V_DIV_SCALE_F64 : VOP3be_Real_gfx11<0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">;
defm V_MAD_U64_U32_gfx11 : VOP3be_Real_gfx11<0x2fe, "V_MAD_U64_U32_gfx11", "v_mad_u64_u32">;
@@ -1161,6 +1225,13 @@ multiclass VOP3OpSel_Real_gfx9<bits<10> op> {
VOP3OpSel_gfx9 <op, !cast<VOP_Pseudo>(NAME#"_e64").Pfl>;
}
+multiclass VOP3OpSel_Real_gfx9_forced_opsel2<bits<10> op> {
+ def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+ VOP3OpSel_gfx9 <op, !cast<VOP_Pseudo>(NAME#"_e64").Pfl> {
+ let Inst{13} = src2_modifiers{2}; // op_sel(2)
+ }
+}
+
multiclass VOP3Interp_Real_vi<bits<10> op> {
def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.VI>,
VOP3Interp_vi <op, !cast<VOP_Pseudo>(NAME).Pfl>;
@@ -1352,3 +1423,10 @@ defm V_CVT_PKNORM_I16_F16 : VOP3OpSel_Real_gfx9 <0x299>;
defm V_CVT_PKNORM_U16_F16 : VOP3OpSel_Real_gfx9 <0x29a>;
defm V_LSHL_ADD_U64 : VOP3_Real_vi <0x208>;
+
+let OtherPredicates = [HasFP8Insts] in {
+defm V_CVT_PK_FP8_F32 : VOP3OpSel_Real_gfx9 <0x2a2>;
+defm V_CVT_PK_BF8_F32 : VOP3OpSel_Real_gfx9 <0x2a3>;
+defm V_CVT_SR_FP8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a4>;
+defm V_CVT_SR_BF8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a5>;
+}
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 59ce532af59b..f1ce613d613b 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -493,6 +493,8 @@ def VOPProfileMAI_I32_I64_X16 : VOPProfileMAI<VOP_V4I32_I64_I64_V4I32, A
def VOPProfileMAI_I32_I64_X32 : VOPProfileMAI<VOP_V16I32_I64_I64_V16I32, AISrc_512_b32, ADst_512, AVSrc_64>;
def VOPProfileMAI_F32_V2F32_X16 : VOPProfileMAI<VOP_V4F32_V2F32_V2F32_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
def VOPProfileMAI_F32_V2F32_X32 : VOPProfileMAI<VOP_V16F32_V2F32_V2F32_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;
+def VOPProfileMAI_F32_I64_X32 : VOPProfileMAI<VOP_V4F32_I64_I64_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
+def VOPProfileMAI_F32_I64_X16 : VOPProfileMAI<VOP_V16F32_I64_I64_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;
def VOPProfileMAI_F32_F32_X4_VCD : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, VISrc_128_f32, VDst_128>;
def VOPProfileMAI_F32_F32_X16_VCD : VOPProfileMAI<VOP_V16F32_F32_F32_V16F32, VISrc_512_f32, VDst_512>;
@@ -515,6 +517,8 @@ def VOPProfileMAI_I32_I64_X16_VCD : VOPProfileMAI<VOP_V4I32_I64_I64_V4I32,
def VOPProfileMAI_I32_I64_X32_VCD : VOPProfileMAI<VOP_V16I32_I64_I64_V16I32, VISrc_512_b32, VDst_512, AVSrc_64>;
def VOPProfileMAI_F32_V2F32_X16_VCD : VOPProfileMAI<VOP_V4F32_V2F32_V2F32_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>;
def VOPProfileMAI_F32_V2F32_X32_VCD : VOPProfileMAI<VOP_V16F32_V2F32_V2F32_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>;
+def VOPProfileMAI_F32_I64_X32_VCD : VOPProfileMAI<VOP_V4F32_I64_I64_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>;
+def VOPProfileMAI_F32_I64_X16_VCD : VOPProfileMAI<VOP_V16F32_I64_I64_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>;
def VOPProfileSMFMAC_F32_16X16X32_F16 : VOPProfileSMFMAC<VOP_V4F32_V4F16_V8F16_I32, AVDst_128, AVSrc_64, AVSrc_128>;
def VOPProfileSMFMAC_F32_32X32X16_F16 : VOPProfileSMFMAC<VOP_V16F32_V4F16_V8F16_I32, AVDst_512, AVSrc_64, AVSrc_128>;
@@ -522,6 +526,8 @@ def VOPProfileSMFMAC_F32_16X16X32_I16 : VOPProfileSMFMAC<VOP_V4F32_V4I16_V8I16_I
def VOPProfileSMFMAC_F32_32X32X16_I16 : VOPProfileSMFMAC<VOP_V16F32_V4I16_V8I16_I32, AVDst_512, AVSrc_64, AVSrc_128>;
def VOPProfileSMFMAC_I32_16X16X64_I8 : VOPProfileSMFMAC<VOP_V4I32_V2I32_V4I32_I32, AVDst_128, AVSrc_64, AVSrc_128>;
def VOPProfileSMFMAC_I32_32X32X32_I8 : VOPProfileSMFMAC<VOP_V16I32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>;
+def VOPProfileSMFMAC_F32_16X16X64_F8 : VOPProfileSMFMAC<VOP_V4F32_V2I32_V4I32_I32, AVDst_128, AVSrc_64, AVSrc_128>;
+def VOPProfileSMFMAC_F32_32X32X32_F8 : VOPProfileSMFMAC<VOP_V16F32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>;
class MFMATable <bit is_mac, string Name> {
bit IsMac = is_mac;
@@ -638,6 +644,14 @@ let Predicates = [isGFX940Plus], is_gfx940_xdl = 1 in {
defm V_MFMA_I32_16X16X32I8 : MAIInst<"v_mfma_i32_16x16x32i8", "I32_I64_X16", int_amdgcn_mfma_i32_16x16x32_i8>;
defm V_MFMA_F32_16X16X8XF32 : MAIInst<"v_mfma_f32_16x16x8xf32", "F32_V2F32_X16", int_amdgcn_mfma_f32_16x16x8_xf32>;
defm V_MFMA_F32_32X32X4XF32 : MAIInst<"v_mfma_f32_32x32x4xf32", "F32_V2F32_X32", int_amdgcn_mfma_f32_32x32x4_xf32>;
+ defm V_MFMA_F32_16X16X32_BF8_BF8 : MAIInst<"v_mfma_f32_16x16x32_bf8_bf8", "F32_I64_X32", int_amdgcn_mfma_f32_16x16x32_bf8_bf8>;
+ defm V_MFMA_F32_16X16X32_BF8_FP8 : MAIInst<"v_mfma_f32_16x16x32_bf8_fp8", "F32_I64_X32", int_amdgcn_mfma_f32_16x16x32_bf8_fp8>;
+ defm V_MFMA_F32_16X16X32_FP8_BF8 : MAIInst<"v_mfma_f32_16x16x32_fp8_bf8", "F32_I64_X32", int_amdgcn_mfma_f32_16x16x32_fp8_bf8>;
+ defm V_MFMA_F32_16X16X32_FP8_FP8 : MAIInst<"v_mfma_f32_16x16x32_fp8_fp8", "F32_I64_X32", int_amdgcn_mfma_f32_16x16x32_fp8_fp8>;
+ defm V_MFMA_F32_32X32X16_BF8_BF8 : MAIInst<"v_mfma_f32_32x32x16_bf8_bf8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_bf8_bf8>;
+ defm V_MFMA_F32_32X32X16_BF8_FP8 : MAIInst<"v_mfma_f32_32x32x16_bf8_fp8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_bf8_fp8>;
+ defm V_MFMA_F32_32X32X16_FP8_BF8 : MAIInst<"v_mfma_f32_32x32x16_fp8_bf8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_fp8_bf8>;
+ defm V_MFMA_F32_32X32X16_FP8_FP8 : MAIInst<"v_mfma_f32_32x32x16_fp8_fp8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_fp8_fp8>;
} // End Predicates = [isGFX940Plus], is_gfx940_xdl = 1
multiclass SMFMACInst<string OpName, string P, SDPatternOperator node> {
@@ -654,6 +668,14 @@ defm V_SMFMAC_F32_16X16X32_BF16 : SMFMACInst<"v_smfmac_f32_16x16x32_bf16",
defm V_SMFMAC_F32_32X32X16_BF16 : SMFMACInst<"v_smfmac_f32_32x32x16_bf16", "F32_32X32X16_I16", int_amdgcn_smfmac_f32_32x32x16_bf16>;
defm V_SMFMAC_I32_16X16X64_I8 : SMFMACInst<"v_smfmac_i32_16x16x64_i8", "I32_16X16X64_I8", int_amdgcn_smfmac_i32_16x16x64_i8>;
defm V_SMFMAC_I32_32X32X32_I8 : SMFMACInst<"v_smfmac_i32_32x32x32_i8", "I32_32X32X32_I8", int_amdgcn_smfmac_i32_32x32x32_i8>;
+defm V_SMFMAC_F32_16X16X64_BF8_BF8 : SMFMACInst<"v_smfmac_f32_16x16x64_bf8_bf8", "F32_16X16X64_F8", int_amdgcn_smfmac_f32_16x16x64_bf8_bf8>;
+defm V_SMFMAC_F32_16X16X64_BF8_FP8 : SMFMACInst<"v_smfmac_f32_16x16x64_bf8_fp8", "F32_16X16X64_F8", int_amdgcn_smfmac_f32_16x16x64_bf8_fp8>;
+defm V_SMFMAC_F32_16X16X64_FP8_BF8 : SMFMACInst<"v_smfmac_f32_16x16x64_fp8_bf8", "F32_16X16X64_F8", int_amdgcn_smfmac_f32_16x16x64_fp8_bf8>;
+defm V_SMFMAC_F32_16X16X64_FP8_FP8 : SMFMACInst<"v_smfmac_f32_16x16x64_fp8_fp8", "F32_16X16X64_F8", int_amdgcn_smfmac_f32_16x16x64_fp8_fp8>;
+defm V_SMFMAC_F32_32X32X32_BF8_BF8 : SMFMACInst<"v_smfmac_f32_32x32x32_bf8_bf8", "F32_32X32X32_F8", int_amdgcn_smfmac_f32_32x32x32_bf8_bf8>;
+defm V_SMFMAC_F32_32X32X32_BF8_FP8 : SMFMACInst<"v_smfmac_f32_32x32x32_bf8_fp8", "F32_32X32X32_F8", int_amdgcn_smfmac_f32_32x32x32_bf8_fp8>;
+defm V_SMFMAC_F32_32X32X32_FP8_BF8 : SMFMACInst<"v_smfmac_f32_32x32x32_fp8_bf8", "F32_32X32X32_F8", int_amdgcn_smfmac_f32_32x32x32_fp8_bf8>;
+defm V_SMFMAC_F32_32X32X32_FP8_FP8 : SMFMACInst<"v_smfmac_f32_32x32x32_fp8_fp8", "F32_32X32X32_F8", int_amdgcn_smfmac_f32_32x32x32_fp8_fp8>;
}
def MAIInstInfoTable : GenericTable {
@@ -1121,6 +1143,14 @@ defm V_MFMA_I32_32X32X16I8 : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x
defm V_MFMA_I32_16X16X32I8 : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">;
defm V_MFMA_F32_16X16X8XF32 : VOP3P_Real_MFMA_gfx940 <0x3e, "v_mfma_f32_16x16x8_xf32">;
defm V_MFMA_F32_32X32X4XF32 : VOP3P_Real_MFMA_gfx940 <0x3f, "v_mfma_f32_32x32x4_xf32">;
+defm V_MFMA_F32_16X16X32_BF8_BF8 : VOP3P_Real_MFMA_gfx940 <0x70>;
+defm V_MFMA_F32_16X16X32_BF8_FP8 : VOP3P_Real_MFMA_gfx940 <0x71>;
+defm V_MFMA_F32_16X16X32_FP8_BF8 : VOP3P_Real_MFMA_gfx940 <0x72>;
+defm V_MFMA_F32_16X16X32_FP8_FP8 : VOP3P_Real_MFMA_gfx940 <0x73>;
+defm V_MFMA_F32_32X32X16_BF8_BF8 : VOP3P_Real_MFMA_gfx940 <0x74>;
+defm V_MFMA_F32_32X32X16_BF8_FP8 : VOP3P_Real_MFMA_gfx940 <0x75>;
+defm V_MFMA_F32_32X32X16_FP8_BF8 : VOP3P_Real_MFMA_gfx940 <0x76>;
+defm V_MFMA_F32_32X32X16_FP8_FP8 : VOP3P_Real_MFMA_gfx940 <0x77>;
defm V_MFMA_F32_32X32X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5d, "v_mfma_f32_32x32x4_2b_bf16">;
defm V_MFMA_F32_16X16X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5e, "v_mfma_f32_16x16x4_4b_bf16">;
@@ -1137,6 +1167,14 @@ defm V_SMFMAC_F32_16X16X32_BF16 : VOP3P_Real_SMFMAC <0x66, "v_smfmac_f32_16x1
defm V_SMFMAC_F32_32X32X16_BF16 : VOP3P_Real_SMFMAC <0x68, "v_smfmac_f32_32x32x16bf16">;
defm V_SMFMAC_I32_16X16X64_I8 : VOP3P_Real_SMFMAC <0x6a, "v_smfmac_i32_16x16x64i8">;
defm V_SMFMAC_I32_32X32X32_I8 : VOP3P_Real_SMFMAC <0x6c, "v_smfmac_i32_32x32x32i8">;
+defm V_SMFMAC_F32_16X16X64_BF8_BF8 : VOP3P_Real_SMFMAC <0x78, "v_smfmac_f32_16x16x64bf8bf8">;
+defm V_SMFMAC_F32_16X16X64_BF8_FP8 : VOP3P_Real_SMFMAC <0x79, "v_smfmac_f32_16x16x64bf8fp8">;
+defm V_SMFMAC_F32_16X16X64_FP8_BF8 : VOP3P_Real_SMFMAC <0x7a, "v_smfmac_f32_16x16x64fp8bf8">;
+defm V_SMFMAC_F32_16X16X64_FP8_FP8 : VOP3P_Real_SMFMAC <0x7b, "v_smfmac_f32_16x16x64fp8fp8">;
+defm V_SMFMAC_F32_32X32X32_BF8_BF8 : VOP3P_Real_SMFMAC <0x7c, "v_smfmac_f32_32x32x32bf8bf8">;
+defm V_SMFMAC_F32_32X32X32_BF8_FP8 : VOP3P_Real_SMFMAC <0x7d, "v_smfmac_f32_32x32x32bf8fp8">;
+defm V_SMFMAC_F32_32X32X32_FP8_BF8 : VOP3P_Real_SMFMAC <0x7e, "v_smfmac_f32_32x32x32fp8bf8">;
+defm V_SMFMAC_F32_32X32X32_FP8_FP8 : VOP3P_Real_SMFMAC <0x7f, "v_smfmac_f32_32x32x32fp8fp8">;
let SubtargetPredicate = HasPackedFP32Ops in {
defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>;
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 33d3441e94c2..d489a089ac78 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -59,15 +59,17 @@ class VOPC_Profile<list<SchedReadWrite> sched, ValueType vt0, ValueType vt1 = vt
"$src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl");
let AsmDPP8 = "$src0, $src1 $dpp8$fi";
let AsmDPP16 = AsmDPP#"$fi";
+ // VOPC DPP Instructions do not need an old operand
+ let TieRegDPP = "";
let InsDPP = getInsDPP<VOPDstOperand<Src0DPP>, Src0DPP, Src1DPP, Src2DPP,
NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP,
- Src2ModDPP>.ret;
+ Src2ModDPP, 0/*HasOld*/>.ret;
let InsDPP16 = getInsDPP16<VOPDstOperand<Src0DPP>, Src0DPP, Src1DPP, Src2DPP,
NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP,
- Src2ModDPP>.ret;
+ Src2ModDPP, 0/*HasOld*/>.ret;
let InsDPP8 = getInsDPP8<VOPDstOperand<Src0DPP>, Src0DPP, Src1DPP, Src2DPP,
NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP,
- Src2ModDPP>.ret;
+ Src2ModDPP, 0/*HasOld*/>.ret;
// The destination for 32-bit encoding is implicit.
let HasDst32 = 0;
@@ -76,9 +78,9 @@ class VOPC_Profile<list<SchedReadWrite> sched, ValueType vt0, ValueType vt1 = vt
let Outs64 = (outs VOPDstS64orS32:$sdst);
let OutsVOP3DPP = Outs64;
let OutsVOP3DPP8 = Outs64;
- let InsVOP3DPP = getInsVOP3DPP<InsVOP3Base, Src0VOP3DPP, NumSrcArgs>.ret;
- let InsVOP3DPP16 = getInsVOP3DPP16<InsVOP3Base, Src0VOP3DPP, NumSrcArgs>.ret;
- let InsVOP3DPP8 = getInsVOP3DPP8<InsVOP3Base, Src0VOP3DPP, NumSrcArgs>.ret;
+ let InsVOP3DPP = getInsVOP3DPP<InsVOP3Base, Src0VOP3DPP, NumSrcArgs, 0/*HasOld*/>.ret;
+ let InsVOP3DPP16 = getInsVOP3DPP16<InsVOP3Base, Src0VOP3DPP, NumSrcArgs, 0/*HasOld*/>.ret;
+ let InsVOP3DPP8 = getInsVOP3DPP8<InsVOP3Base, Src0VOP3DPP, NumSrcArgs, 0/*HasOld*/>.ret;
list<SchedReadWrite> Schedule = sched;
}
@@ -293,7 +295,7 @@ multiclass VOPC_Pseudos <string opName,
let Defs = !if(DefExec, [EXEC], []);
let SchedRW = P.Schedule;
let isCompare = 1;
- let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $sdst", "");
+ let Constraints = "";
}
} // end SubtargetPredicate = isGFX11Plus
@@ -711,7 +713,7 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> :
VOPC_Profile<sched, vt, i32> {
let AsmDPP = "$src0_modifiers, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
let AsmDPP16 = AsmDPP#"$fi";
- let InsDPP = (ins VGPR_32:$old, FPVRegInputMods:$src0_modifiers, VGPR_32:$src0, VGPR_32:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+ let InsDPP = (ins FPVRegInputMods:$src0_modifiers, VGPR_32:$src0, VGPR_32:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
let InsDPP16 = !con(InsDPP, (ins FI:$fi));
// DPP8 forbids modifiers and can inherit from VOPC_Profile
@@ -793,7 +795,7 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec,
def _e64_dpp : VOP3_DPP_Pseudo<opName, p> {
let Defs = !if(DefExec, [EXEC], []);
let SchedRW = p.Schedule;
- let Constraints = !if(p.NumSrcArgs, p.TieRegDPP # " = $sdst", "");
+ let Constraints = "";
}
} // end SubtargetPredicate = isGFX11Plus
}
@@ -1068,7 +1070,6 @@ class VOPC_DPP16<bits<8> op, VOP_DPP_Pseudo ps, string opName = ps.OpName>
let Uses = ps.Uses;
let OtherPredicates = ps.OtherPredicates;
let Constraints = ps.Constraints;
- let AsmMatchConverter = "cvtVOPCNoDstDPP";
}
class VOPC_DPP16_SIMC<bits<8> op, VOP_DPP_Pseudo ps, int subtarget,
@@ -1084,7 +1085,6 @@ class VOPC_DPP8<bits<8> op, VOPC_Pseudo ps, string opName = ps.OpName>
let Uses = ps.Uses;
let OtherPredicates = ps.OtherPredicates;
let Constraints = "";
- let AsmMatchConverter = "cvtVOPCNoDstDPP8";
}
// VOPC64
@@ -1133,7 +1133,6 @@ class VOPC64_DPP16_NoDst<bits<10> op, VOP_DPP_Pseudo ps,
string opName = ps.OpName>
: VOPC64_DPP16<op, ps, opName> {
let Inst{7-0} = ? ;
- let AsmMatchConverter = "cvtVOPC64NoDstDPP";
}
class VOPC64_DPP8_Base<bits<10> op, string OpName, VOPProfile P>
@@ -1163,13 +1162,12 @@ class VOPC64_DPP8_Dst<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
: VOPC64_DPP8<op, ps, opName> {
bits<8> sdst;
let Inst{7-0} = sdst;
- let Constraints = "$old = $sdst";
+ let Constraints = "";
}
class VOPC64_DPP8_NoDst<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
: VOPC64_DPP8<op, ps, opName> {
let Inst{7-0} = ? ;
- let AsmMatchConverter = "cvtVOPC64NoDstDPP8";
let Constraints = "";
}
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 187485ffa3ae..b65ca2d6b1b3 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -269,6 +269,10 @@ class VOP3OpSel_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
class VOP3OpSel_gfx11<bits<10> op, VOPProfile p> : VOP3OpSel_gfx10<op, p>;
+class VOP3DotOpSel_gfx11<bits<10> op, VOPProfile p> : VOP3OpSel_gfx11<op, p>{
+ let Inst{11} = ?;
+ let Inst{12} = ?;
+}
// NB: For V_INTERP* opcodes, src0 is encoded as src1 and vice versa
class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
@@ -1270,6 +1274,8 @@ multiclass VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_f
class Base_VOP3_DPP16<bits<10> op, VOP_DPP_Pseudo ps, string opName = ps.OpName>
: VOP3_DPP<op, opName, ps.Pfl, 1> {
+ let VOP3_OPSEL = ps.Pfl.HasOpSel;
+ let IsDOT = ps.IsDOT;
let hasSideEffects = ps.hasSideEffects;
let Defs = ps.Defs;
let SchedRW = ps.SchedRW;
@@ -1285,6 +1291,8 @@ class VOP3_DPP16<bits<10> op, VOP_DPP_Pseudo ps, int subtarget,
class Base_VOP3_DPP8<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
: VOP3_DPP8<op, opName, ps.Pfl> {
+ let VOP3_OPSEL = ps.Pfl.HasOpSel;
+ let IsDOT = ps.IsDOT;
let hasSideEffects = ps.hasSideEffects;
let Defs = ps.Defs;
let SchedRW = ps.SchedRW;
@@ -1326,6 +1334,15 @@ let AssemblerPredicate = isGFX11Only,
VOP3e_gfx11<op, ps.Pfl>;
}
}
+ multiclass VOP3Dot_Real_Base_gfx11<bits<10> op, string opName = NAME,
+ bit isSingle = 0> {
+ defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
+ let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
+ def _e64_gfx11 :
+ VOP3_Real<ps, SIEncodingFamily.GFX11>,
+ VOP3DotOpSel_gfx11<op, ps.Pfl>;
+ }
+ }
multiclass VOP3_Real_with_name_gfx11<bits<10> op, string opName,
string asmName, bit isSingle = 0> {
defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
@@ -1355,6 +1372,15 @@ let AssemblerPredicate = isGFX11Only,
let DecoderNamespace = "DPPGFX11";
}
}
+
+ multiclass VOP3Dot_Real_dpp_Base_gfx11<bits<10> op, string opName = NAME> {
+ def _e64_dpp_gfx11 : VOP3_DPP16<op, !cast<VOP_DPP_Pseudo>(opName#"_e64"#"_dpp"), SIEncodingFamily.GFX11> {
+ let Inst{11} = ?;
+ let Inst{12} = ?;
+ let DecoderNamespace = "DPPGFX11";
+ }
+ }
+
multiclass VOP3_Real_dpp_with_name_gfx11<bits<10> op, string opName,
string asmName> {
defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
@@ -1368,6 +1394,16 @@ let AssemblerPredicate = isGFX11Only,
let DecoderNamespace = "DPP8GFX11";
}
}
+
+ multiclass VOP3Dot_Real_dpp8_Base_gfx11<bits<10> op, string opName = NAME> {
+ defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
+ def _e64_dpp8_gfx11 : Base_VOP3_DPP8<op, ps> {
+ let Inst{11} = ?;
+ let Inst{12} = ?;
+ let DecoderNamespace = "DPP8GFX11";
+ }
+ }
+
multiclass VOP3_Real_dpp8_with_name_gfx11<bits<10> op, string opName,
string asmName> {
defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
@@ -1406,6 +1442,12 @@ multiclass VOP3_Realtriple_gfx11<bits<10> op,
VOP3_Real_dpp_Base_gfx11<op, opName>,
VOP3_Real_dpp8_Base_gfx11<op, opName>;
+multiclass VOP3Dot_Realtriple_gfx11<bits<10> op,
+ bit isSingle = 0, string opName = NAME> :
+ VOP3Dot_Real_Base_gfx11<op, opName, isSingle>,
+ VOP3Dot_Real_dpp_Base_gfx11<op, opName>,
+ VOP3Dot_Real_dpp8_Base_gfx11<op, opName>;
+
multiclass VOP3Only_Realtriple_gfx11<bits<10> op> :
VOP3_Realtriple_gfx11<op, 1>;
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 80ba7b5f0d2e..183febe756c1 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -6726,8 +6726,8 @@ bool ARMBaseInstrInfo::shouldOutlineFromFunctionByDefault(
return Subtarget.isMClass() && MF.getFunction().hasMinSize();
}
-bool ARMBaseInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AAResults *AA) const {
+bool ARMBaseInstrInfo::isReallyTriviallyReMaterializable(
+ const MachineInstr &MI) const {
// Try hard to rematerialize any VCTPs because if we spill P0, it will block
// the tail predication conversion. This means that the element count
// register has to be live for longer, but that has to be better than
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index 3b8f3403e3c3..453e3fa1b99b 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -480,8 +480,7 @@ private:
MachineInstr *canFoldIntoMOVCC(Register Reg, const MachineRegisterInfo &MRI,
const TargetInstrInfo *TII) const;
- bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AAResults *AA) const override;
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override;
private:
/// Modeling special VFP / NEON fp MLA / MLS hazards.
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 613904f702f0..e5347ed8e53a 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -1720,6 +1720,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
unsigned UxtOp,
MachineBasicBlock::iterator &NextMBBI) {
bool IsThumb = STI->isThumb();
+ bool IsThumb1Only = STI->isThumb1Only();
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
const MachineOperand &Dest = MI.getOperand(0);
@@ -1794,7 +1795,8 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
MIB.addImm(0); // a 32-bit Thumb strex (only) allows an offset.
MIB.add(predOps(ARMCC::AL));
- unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
+ unsigned CMPri =
+ IsThumb ? (IsThumb1Only ? ARM::tCMPi8 : ARM::t2CMPri) : ARM::CMPri;
BuildMI(StoreBB, DL, TII->get(CMPri))
.addReg(TempReg, RegState::Kill)
.addImm(0)
@@ -1848,6 +1850,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI) {
bool IsThumb = STI->isThumb();
+ assert(!STI->isThumb1Only() && "CMP_SWAP_64 unsupported under Thumb1!");
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
MachineOperand &Dest = MI.getOperand(0);
@@ -3044,6 +3047,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
assert(STI->isThumb());
return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXH, ARM::t2STREXH, ARM::tUXTH,
NextMBBI);
+ case ARM::tCMP_SWAP_32:
+ assert(STI->isThumb());
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREX, ARM::t2STREX, 0, NextMBBI);
case ARM::CMP_SWAP_8:
assert(!STI->isThumb());
@@ -3054,11 +3060,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXH, ARM::STREXH, ARM::UXTH,
NextMBBI);
case ARM::CMP_SWAP_32:
- if (STI->isThumb())
- return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREX, ARM::t2STREX, 0,
- NextMBBI);
- else
- return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREX, ARM::STREX, 0, NextMBBI);
+ assert(!STI->isThumb());
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREX, ARM::STREX, 0, NextMBBI);
case ARM::CMP_SWAP_64:
return ExpandCMP_SWAP_64(MBB, MBBI, NextMBBI);
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index e0e4ffd90e0e..afe16a3cd55c 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -3131,7 +3131,7 @@ bool ARMDAGToDAGISel::tryInsertVectorElt(SDNode *N) {
// Else v8i16 pattern of an extract and an insert, with an optional vmovx for
// extracting odd lanes.
- if (VT == MVT::v8i16) {
+ if (VT == MVT::v8i16 && Subtarget->hasFullFP16()) {
SDValue Inp1 = CurDAG->getTargetExtractSubreg(
ARM::ssub_0 + ExtractLane1 / 2, dl, MVT::f32, Val1.getOperand(0));
SDValue Inp2 = CurDAG->getTargetExtractSubreg(
@@ -3151,7 +3151,7 @@ bool ARMDAGToDAGISel::tryInsertVectorElt(SDNode *N) {
// The inserted values are not extracted - if they are f16 then insert them
// directly using a VINS.
- if (VT == MVT::v8f16) {
+ if (VT == MVT::v8f16 && Subtarget->hasFullFP16()) {
SDNode *VINS = CurDAG->getMachineNode(ARM::VINSH, dl, MVT::f32, Val2, Val1);
SDValue NewIns =
CurDAG->getTargetInsertSubreg(ARM::ssub_0 + Lane2 / 2, dl, MVT::v4f32,
@@ -3512,7 +3512,7 @@ void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
else if (MemTy == MVT::i16)
Opcode = Subtarget->isThumb() ? ARM::tCMP_SWAP_16 : ARM::CMP_SWAP_16;
else if (MemTy == MVT::i32)
- Opcode = ARM::CMP_SWAP_32;
+ Opcode = Subtarget->isThumb() ? ARM::tCMP_SWAP_32 : ARM::CMP_SWAP_32;
else
llvm_unreachable("Unknown AtomicCmpSwap type");
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index e6be93e6480a..743cca9ff71f 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -13572,6 +13572,10 @@ static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
bool
ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const {
+ assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
+ N->getOpcode() == ISD::SRL) &&
+ "Expected shift op");
+
if (Level == BeforeLegalizeTypes)
return true;
@@ -13605,8 +13609,38 @@ ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
return false;
}
+bool ARMTargetLowering::isDesirableToCommuteXorWithShift(
+ const SDNode *N) const {
+ assert(N->getOpcode() == ISD::XOR &&
+ (N->getOperand(0).getOpcode() == ISD::SHL ||
+ N->getOperand(0).getOpcode() == ISD::SRL) &&
+ "Expected XOR(SHIFT) pattern");
+
+ // Only commute if the entire NOT mask is a hidden shifted mask.
+ auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
+ if (XorC && ShiftC) {
+ unsigned MaskIdx, MaskLen;
+ if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
+ unsigned ShiftAmt = ShiftC->getZExtValue();
+ unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
+ if (N->getOperand(0).getOpcode() == ISD::SHL)
+ return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
+ return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
+ }
+ }
+
+ return false;
+}
+
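A standalone C++ sketch of the rule isDesirableToCommuteXorWithShift implements above, assuming 32-bit scalars; the helper names here are illustrative, not LLVM API. The xor constant must be a single contiguous run of ones that lines up with the shift: starting at the shift amount for shl, starting at bit 0 for srl.

#include <bit>
#include <cassert>
#include <cstdint>

// True if V is a single contiguous run of ones; reports its position/length.
bool isShiftedMask(uint32_t V, unsigned &MaskIdx, unsigned &MaskLen) {
  if (V == 0)
    return false;
  MaskIdx = std::countr_zero(V);
  MaskLen = std::popcount(V);
  uint32_t Run = V >> MaskIdx;
  return (Run & (Run + 1)) == 0; // no holes in the run
}

bool desirableToCommuteXorWithShift(bool IsShl, unsigned ShiftAmt,
                                    uint32_t XorC, unsigned BitWidth = 32) {
  unsigned MaskIdx, MaskLen;
  if (!isShiftedMask(XorC, MaskIdx, MaskLen))
    return false;
  return IsShl ? (MaskIdx == ShiftAmt && MaskLen == BitWidth - ShiftAmt)
               : (MaskIdx == 0 && MaskLen == BitWidth - ShiftAmt);
}

int main() {
  // (x << 4) ^ 0xFFFFFFF0 : the NOT mask covers exactly the shifted-in range.
  assert(desirableToCommuteXorWithShift(true, 4, 0xFFFFFFF0u));
  // (x >> 8) ^ 0x00FF0000 : mask does not start at bit 0, so do not commute.
  assert(!desirableToCommuteXorWithShift(false, 8, 0x00FF0000u));
}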
bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
const SDNode *N, CombineLevel Level) const {
+ assert(((N->getOpcode() == ISD::SHL &&
+ N->getOperand(0).getOpcode() == ISD::SRL) ||
+ (N->getOpcode() == ISD::SRL &&
+ N->getOperand(0).getOpcode() == ISD::SHL)) &&
+ "Expected shift-shift mask");
+
if (!Subtarget->isThumb1Only())
return true;
@@ -19962,6 +19996,14 @@ bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
}
break;
}
+ case ARMISD::VBICIMM: {
+ SDValue Op0 = Op.getOperand(0);
+ unsigned ModImm = Op.getConstantOperandVal(1);
+ unsigned EltBits = 0;
+ uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits);
+ if ((OriginalDemandedBits & Mask) == 0)
+ return TLO.CombineTo(Op, Op0);
+ }
}
return TargetLowering::SimplifyDemandedBitsForTargetNode(
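The new VBICIMM case follows the usual demanded-bits argument: VBICIMM computes Op0 & ~Mask per lane, so if no demanded bit overlaps the decoded immediate mask the node can be replaced by Op0. A minimal sketch of that check, with the VMOV modified-immediate decoding elided:

#include <cassert>
#include <cstdint>

// If the user only demands bits that the BIC mask never clears, the BIC is a
// no-op on the demanded bits and Op0 can be used directly.
bool vbicimmIsRedundant(uint64_t DemandedBits, uint64_t Mask) {
  return (DemandedBits & Mask) == 0;
}

int main() {
  // Only the low 16 bits are demanded; the BIC clears bits 16..31 -> drop it.
  assert(vbicimmIsRedundant(0x0000FFFFu, 0xFFFF0000u));
  assert(!vbicimmIsRedundant(0x00FF0000u, 0xFFFF0000u));
}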
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 10f60ab93ae3..fae279ea7569 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -733,6 +733,8 @@ class VectorType;
bool isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const override;
+ bool isDesirableToCommuteXorWithShift(const SDNode *N) const override;
+
bool shouldFoldConstantShiftPairToMask(const SDNode *N,
CombineLevel Level) const override;
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 15c33014e988..9c03f72fe6ae 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -1882,6 +1882,7 @@ let Predicates = [HasMVEInt] in {
def : Pat<(ARMvgetlaneu (v8f16 MQPR:$src), imm:$lane),
(MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>;
// For i16 inserts whose value is extracted from a low (even) lane, VINS may be used.
+ let Predicates = [HasFullFP16] in {
def : Pat<(ARMinsertelt (v8i16 MQPR:$src1),
(ARMvgetlaneu (v8i16 MQPR:$src2), imm_even:$extlane),
imm_odd:$inslane),
@@ -1889,6 +1890,7 @@ let Predicates = [HasMVEInt] in {
(VINSH (EXTRACT_SUBREG MQPR:$src1, (SSubReg_f16_reg imm_odd:$inslane)),
(EXTRACT_SUBREG MQPR:$src2, (SSubReg_f16_reg imm_even:$extlane))),
(SSubReg_f16_reg imm_odd:$inslane)), MQPR)>;
+ }
def : Pat<(v16i8 (scalar_to_vector GPR:$src)),
(MVE_VMOV_to_lane_8 (v16i8 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
@@ -1905,17 +1907,21 @@ let Predicates = [HasMVEInt] in {
def : Pat<(insertelt (v8f16 MQPR:$src1), (f16 HPR:$src2), imm_even:$lane),
(MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS (f16 HPR:$src2), rGPR), imm:$lane)>;
+ let Predicates = [HasFullFP16] in {
def : Pat<(insertelt (v8f16 MQPR:$src1), (f16 HPR:$src2), imm_odd:$lane),
(COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$src1, MQPR)),
(VINSH (EXTRACT_SUBREG MQPR:$src1, (SSubReg_f16_reg imm_odd:$lane)),
(COPY_TO_REGCLASS HPR:$src2, SPR)),
(SSubReg_f16_reg imm_odd:$lane)), MQPR)>;
+ }
def : Pat<(extractelt (v8f16 MQPR:$src), imm_even:$lane),
(EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_even:$lane))>;
+ let Predicates = [HasFullFP16] in {
def : Pat<(extractelt (v8f16 MQPR:$src), imm_odd:$lane),
(COPY_TO_REGCLASS
(VMOVH (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_odd:$lane))),
HPR)>;
+ }
def : Pat<(v2f64 (scalar_to_vector (f64 DPR:$src))),
(INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), DPR:$src, dsub_0)>;
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb.td b/llvm/lib/Target/ARM/ARMInstrThumb.td
index 71527ae1ab11..8f7039a327b3 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb.td
@@ -1782,11 +1782,15 @@ def tLDRConstPool
let Constraints = "@earlyclobber $Rd,@earlyclobber $temp",
mayLoad = 1, mayStore = 1 in {
-def tCMP_SWAP_8 : PseudoInst<(outs GPR:$Rd, GPR:$temp),
+def tCMP_SWAP_8 : PseudoInst<(outs GPR:$Rd, tGPR:$temp),
(ins GPR:$addr, tGPR:$desired, GPR:$new),
NoItinerary, []>, Sched<[]>;
-def tCMP_SWAP_16 : PseudoInst<(outs GPR:$Rd, GPR:$temp),
+def tCMP_SWAP_16 : PseudoInst<(outs GPR:$Rd, tGPR:$temp),
(ins GPR:$addr, tGPR:$desired, GPR:$new),
NoItinerary, []>, Sched<[]>;
+
+def tCMP_SWAP_32 : PseudoInst<(outs GPR:$Rd, tGPR:$temp),
+ (ins GPR:$addr, GPR:$desired, GPR:$new),
+ NoItinerary, []>, Sched<[]>;
}
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index ba1d806c8d81..3c102463ba08 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -20,8 +20,8 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
@@ -33,6 +33,7 @@
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -2197,12 +2198,9 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
return true;
}
-bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
- ScalarEvolution &SE,
- AssumptionCache &AC,
- TargetLibraryInfo *TLI,
- DominatorTree *DT,
- const LoopAccessInfo *LAI) {
+bool ARMTTIImpl::preferPredicateOverEpilogue(
+ Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
+ TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL) {
if (!EnableTailPredication) {
LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
return false;
@@ -2244,7 +2242,7 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
return false;
}
- return canTailPredicateLoop(L, LI, SE, DL, LAI);
+ return canTailPredicateLoop(L, LI, SE, DL, LVL->getLAI());
}
PredicationStyle ARMTTIImpl::emitGetActiveLaneMask() const {
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index dcf82e703a7f..9c3980d79e60 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -288,12 +288,10 @@ public:
AssumptionCache &AC,
TargetLibraryInfo *LibInfo,
HardwareLoopInfo &HWLoopInfo);
- bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
- ScalarEvolution &SE,
- AssumptionCache &AC,
- TargetLibraryInfo *TLI,
+ bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+ AssumptionCache &AC, TargetLibraryInfo *TLI,
DominatorTree *DT,
- const LoopAccessInfo *LAI);
+ LoopVectorizationLegality *LVL);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP,
OptimizationRemarkEmitter *ORE);
diff --git a/llvm/lib/Target/AVR/AVRSubtarget.h b/llvm/lib/Target/AVR/AVRSubtarget.h
index 2325193bac0a..3dd71243387b 100644
--- a/llvm/lib/Target/AVR/AVRSubtarget.h
+++ b/llvm/lib/Target/AVR/AVRSubtarget.h
@@ -92,15 +92,15 @@ public:
}
/// Get I/O register addresses.
- int getIORegRAMPZ(void) const { return hasELPM() ? 0x3b : -1; }
- int getIORegEIND(void) const { return hasEIJMPCALL() ? 0x3c : -1; }
- int getIORegSPL(void) const { return 0x3d; }
- int getIORegSPH(void) const { return hasSmallStack() ? -1 : 0x3e; }
- int getIORegSREG(void) const { return 0x3f; }
+ int getIORegRAMPZ() const { return hasELPM() ? 0x3b : -1; }
+ int getIORegEIND() const { return hasEIJMPCALL() ? 0x3c : -1; }
+ int getIORegSPL() const { return 0x3d; }
+ int getIORegSPH() const { return hasSmallStack() ? -1 : 0x3e; }
+ int getIORegSREG() const { return 0x3f; }
/// Get GPR aliases.
- int getRegTmpIndex(void) const { return hasTinyEncoding() ? 16 : 0; }
- int getRegZeroIndex(void) const { return hasTinyEncoding() ? 17 : 1; }
+ int getRegTmpIndex() const { return hasTinyEncoding() ? 16 : 0; }
+ int getRegZeroIndex() const { return hasTinyEncoding() ? 17 : 1; }
private:
/// The ELF e_flags architecture.
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
index d490b385ac16..0bf739452fd2 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
@@ -518,7 +518,7 @@ void CSKYInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned Opcode = 0;
if (CSKY::GPRRegClass.contains(DestReg, SrcReg))
- Opcode = CSKY::MOV32;
+ Opcode = STI.hasE2() ? CSKY::MOV32 : CSKY::MOV16;
else if (v2sf && CSKY::sFPR32RegClass.contains(DestReg, SrcReg))
Opcode = CSKY::FMOV_S;
else if (v3sf && CSKY::FPR32RegClass.contains(DestReg, SrcReg))
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
index 3e09270a66d0..869433613620 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
@@ -13,6 +13,7 @@
#include "DXILBitcodeWriter.h"
#include "DXILValueEnumerator.h"
#include "PointerTypeAnalysis.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Bitcode/BitcodeCommon.h"
#include "llvm/Bitcode/BitcodeReader.h"
@@ -2580,10 +2581,9 @@ void DXILBitcodeWriter::writeFunctionLevelValueSymbolTable(
SortedTable.push_back(VI.second->getValueName());
}
// The keys are unique, so there shouldn't be stability issues.
- std::sort(SortedTable.begin(), SortedTable.end(),
- [](const ValueName *A, const ValueName *B) {
- return A->first() < B->first();
- });
+ llvm::sort(SortedTable, [](const ValueName *A, const ValueName *B) {
+ return A->first() < B->first();
+ });
for (const ValueName *SI : SortedTable) {
auto &Name = *SI;
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp
index 08944ee3f1fe..e2a41515de38 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp
@@ -809,7 +809,7 @@ void ValueEnumerator::organizeMetadata() {
// - by function, then
// - by isa<MDString>
// and then sort by the original/current ID. Since the IDs are guaranteed to
- // be unique, the result of std::sort will be deterministic. There's no need
+ // be unique, the result of llvm::sort will be deterministic. There's no need
// for std::stable_sort.
llvm::sort(Order, [this](MDIndex LHS, MDIndex RHS) {
return std::make_tuple(LHS.F, getMetadataTypeOrder(LHS.get(MDs)), LHS.ID) <
diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
index abd84a188cfa..bd0232c71d48 100644
--- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
@@ -85,7 +85,6 @@ public:
int getAllocSizeOf(const Type *Ty) const;
int getTypeAlignment(Type *Ty) const;
- VectorType *getByteVectorTy(int ScLen) const;
Constant *getNullValue(Type *Ty) const;
Constant *getFullValue(Type *Ty) const;
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 4acf90bd9788..93c8864347bb 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -217,9 +217,8 @@ SDValue LoongArchTargetLowering::lowerGlobalAddress(SDValue Op,
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
unsigned ADDIOp = Subtarget.is64Bit() ? LoongArch::ADDI_D : LoongArch::ADDI_W;
- // FIXME: Only support PC-relative addressing to access the symbol.
- // TODO: Add target flags.
- if (!isPositionIndependent()) {
+ // TODO: Support dso_preemptable and target flags.
+ if (GV->isDSOLocal()) {
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, Ty);
SDValue AddrHi(DAG.getMachineNode(LoongArch::PCALAU12I, DL, Ty, GA), 0);
SDValue Addr(DAG.getMachineNode(ADDIOp, DL, Ty, AddrHi, GA), 0);
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
index 468c4f43cb90..2d08d5c674bc 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
@@ -38,9 +38,7 @@ static std::string computeDataLayout(const Triple &TT) {
static Reloc::Model getEffectiveRelocModel(const Triple &TT,
Optional<Reloc::Model> RM) {
- if (!RM.hasValue())
- return Reloc::Static;
- return *RM;
+ return RM.value_or(Reloc::Static);
}
LoongArchTargetMachine::LoongArchTargetMachine(
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index b98be4ae4b75..4dfc16526a00 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -1192,6 +1192,12 @@ bool MipsTargetLowering::hasBitTest(SDValue X, SDValue Y) const {
bool MipsTargetLowering::shouldFoldConstantShiftPairToMask(
const SDNode *N, CombineLevel Level) const {
+ assert(((N->getOpcode() == ISD::SHL &&
+ N->getOperand(0).getOpcode() == ISD::SRL) ||
+ (N->getOpcode() == ISD::SRL &&
+ N->getOperand(0).getOpcode() == ISD::SHL)) &&
+ "Expected shift-shift mask");
+
if (N->getOperand(0).getValueType().isVector())
return false;
return true;
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 9977d8ba0300..45e82e935772 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -73,8 +73,10 @@
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Endian.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
+#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
@@ -354,8 +356,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
// PTX ABI requires all scalar return values to be at least 32
// bits in size. fp16 normally uses .b16 as its storage type in
// PTX, so its size must be adjusted here, too.
- if (size < 32)
- size = 32;
+ size = promoteScalarArgumentSize(size);
O << ".param .b" << size << " func_retval0";
} else if (isa<PointerType>(Ty)) {
@@ -384,8 +385,8 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
for (unsigned j = 0, je = elems; j != je; ++j) {
unsigned sz = elemtype.getSizeInBits();
- if (elemtype.isInteger() && (sz < 32))
- sz = 32;
+ if (elemtype.isInteger())
+ sz = promoteScalarArgumentSize(sz);
O << ".reg .b" << sz << " func_retval" << idx;
if (j < je - 1)
O << ", ";
@@ -1168,31 +1169,37 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
GVar->hasInitializer()) {
const Constant *Initializer = GVar->getInitializer();
if (!isa<UndefValue>(Initializer) && !Initializer->isNullValue()) {
- AggBuffer aggBuffer(ElementSize, O, *this);
+ AggBuffer aggBuffer(ElementSize, *this);
bufferAggregateConstant(Initializer, &aggBuffer);
- if (aggBuffer.numSymbols) {
- if (static_cast<const NVPTXTargetMachine &>(TM).is64Bit()) {
- O << " .u64 ";
+ if (aggBuffer.numSymbols()) {
+ unsigned int ptrSize = MAI->getCodePointerSize();
+ if (ElementSize % ptrSize ||
+ !aggBuffer.allSymbolsAligned(ptrSize)) {
+ // Print in bytes and use the mask() operator for pointers.
+ if (!STI.hasMaskOperator())
+ report_fatal_error(
+ "initialized packed aggregate with pointers '" +
+ GVar->getName() +
+ "' requires at least PTX ISA version 7.1");
+ O << " .u8 ";
getSymbol(GVar)->print(O, MAI);
- O << "[";
- O << ElementSize / 8;
+ O << "[" << ElementSize << "] = {";
+ aggBuffer.printBytes(O);
+ O << "}";
} else {
- O << " .u32 ";
+ O << " .u" << ptrSize * 8 << " ";
getSymbol(GVar)->print(O, MAI);
- O << "[";
- O << ElementSize / 4;
+ O << "[" << ElementSize / ptrSize << "] = {";
+ aggBuffer.printWords(O);
+ O << "}";
}
- O << "]";
} else {
O << " .b8 ";
getSymbol(GVar)->print(O, MAI);
- O << "[";
- O << ElementSize;
- O << "]";
+ O << "[" << ElementSize << "] = {";
+ aggBuffer.printBytes(O);
+ O << "}";
}
- O << " = {";
- aggBuffer.print();
- O << "}";
} else {
O << " .b8 ";
getSymbol(GVar)->print(O, MAI);
@@ -1219,6 +1226,80 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
O << ";\n";
}
+void NVPTXAsmPrinter::AggBuffer::printSymbol(unsigned nSym, raw_ostream &os) {
+ const Value *v = Symbols[nSym];
+ const Value *v0 = SymbolsBeforeStripping[nSym];
+ if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
+ MCSymbol *Name = AP.getSymbol(GVar);
+ PointerType *PTy = dyn_cast<PointerType>(v0->getType());
+ // Is v0 a generic pointer?
+ bool isGenericPointer = PTy && PTy->getAddressSpace() == 0;
+ if (EmitGeneric && isGenericPointer && !isa<Function>(v)) {
+ os << "generic(";
+ Name->print(os, AP.MAI);
+ os << ")";
+ } else {
+ Name->print(os, AP.MAI);
+ }
+ } else if (const ConstantExpr *CExpr = dyn_cast<ConstantExpr>(v0)) {
+ const MCExpr *Expr = AP.lowerConstantForGV(cast<Constant>(CExpr), false);
+ AP.printMCExpr(*Expr, os);
+ } else
+ llvm_unreachable("symbol type unknown");
+}
+
+void NVPTXAsmPrinter::AggBuffer::printBytes(raw_ostream &os) {
+ unsigned int ptrSize = AP.MAI->getCodePointerSize();
+ symbolPosInBuffer.push_back(size);
+ unsigned int nSym = 0;
+ unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
+ for (unsigned int pos = 0; pos < size;) {
+ if (pos)
+ os << ", ";
+ if (pos != nextSymbolPos) {
+ os << (unsigned int)buffer[pos];
+ ++pos;
+ continue;
+ }
+ // Generate a per-byte mask() operator for the symbol, which looks like:
+ // .global .u8 addr[] = {0xFF(foo), 0xFF00(foo), 0xFF0000(foo), ...};
+ // See https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#initializers
+ std::string symText;
+ llvm::raw_string_ostream oss(symText);
+ printSymbol(nSym, oss);
+ for (unsigned i = 0; i < ptrSize; ++i) {
+ if (i)
+ os << ", ";
+ llvm::write_hex(os, 0xFFULL << i * 8, HexPrintStyle::PrefixUpper);
+ os << "(" << symText << ")";
+ }
+ pos += ptrSize;
+ nextSymbolPos = symbolPosInBuffer[++nSym];
+ assert(nextSymbolPos >= pos);
+ }
+}
+
+void NVPTXAsmPrinter::AggBuffer::printWords(raw_ostream &os) {
+ unsigned int ptrSize = AP.MAI->getCodePointerSize();
+ symbolPosInBuffer.push_back(size);
+ unsigned int nSym = 0;
+ unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
+ assert(nextSymbolPos % ptrSize == 0);
+ for (unsigned int pos = 0; pos < size; pos += ptrSize) {
+ if (pos)
+ os << ", ";
+ if (pos == nextSymbolPos) {
+ printSymbol(nSym, os);
+ nextSymbolPos = symbolPosInBuffer[++nSym];
+ assert(nextSymbolPos % ptrSize == 0);
+ assert(nextSymbolPos >= pos + ptrSize);
+ } else if (ptrSize == 4)
+ os << support::endian::read32le(&buffer[pos]);
+ else
+ os << support::endian::read64le(&buffer[pos]);
+ }
+}
+
void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) {
if (localDecls.find(f) == localDecls.end())
return;
@@ -1494,8 +1575,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
unsigned sz = 0;
if (isa<IntegerType>(Ty)) {
sz = cast<IntegerType>(Ty)->getBitWidth();
- if (sz < 32)
- sz = 32;
+ sz = promoteScalarArgumentSize(sz);
} else if (isa<PointerType>(Ty))
sz = thePointerTy.getSizeInBits();
else if (Ty->isHalfTy())
@@ -1559,8 +1639,8 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
for (unsigned j = 0, je = elems; j != je; ++j) {
unsigned sz = elemtype.getSizeInBits();
- if (elemtype.isInteger() && (sz < 32))
- sz = 32;
+ if (elemtype.isInteger())
+ sz = promoteScalarArgumentSize(sz);
O << "\t.reg .b" << sz << " ";
printParamName(I, paramIndex, O);
if (j < je - 1)
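
As an aside to the printBytes() hunk above, here is a minimal stand-alone C++ model of the per-byte emission (the function, buffer, and symbol names are hypothetical, not part of the patch): literal bytes are printed as-is, and each pointer-sized hole that holds a symbol address is expanded into ptrSize mask() entries, one per byte of the address.

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Stand-alone sketch of the printBytes() logic from the hunk above.
void printBytesSketch(const std::vector<uint8_t> &Buffer,
                      std::vector<unsigned> SymbolPos, // sorted byte offsets
                      const std::vector<std::string> &SymbolNames,
                      unsigned PtrSize) {
  SymbolPos.push_back(Buffer.size()); // sentinel, as in the real code
  unsigned NextSym = 0;
  for (unsigned Pos = 0; Pos < Buffer.size();) {
    if (Pos)
      std::printf(", ");
    if (Pos != SymbolPos[NextSym]) {
      std::printf("%u", (unsigned)Buffer[Pos]); // plain initializer byte
      ++Pos;
      continue;
    }
    // A symbol address: emit PtrSize mask() entries selecting one byte each.
    for (unsigned I = 0; I < PtrSize; ++I) {
      if (I)
        std::printf(", ");
      std::printf("0x%llX(%s)", 0xFFULL << (I * 8), SymbolNames[NextSym].c_str());
    }
    Pos += PtrSize;
    ++NextSym;
  }
  std::printf("\n");
}

int main() {
  // {i32 7, ptr &foo} packed into 12 bytes on a 64-bit target prints:
  // 7, 0, 0, 0, 0xFF(foo), 0xFF00(foo), ..., 0xFF00000000000000(foo)
  printBytesSketch({7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, {4}, {"foo"}, 8);
}
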
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
index cd61e99a103a..710c089e3325 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -61,24 +61,30 @@ class MCOperand;
class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
class AggBuffer {
- // Used to buffer the emitted string for initializing global
- // aggregates.
+ // Used to buffer the emitted string for initializing global aggregates.
//
- // Normally an aggregate (array, vector or structure) is emitted
- // as a u8[]. However, if one element/field of the aggregate
- // is a non-NULL address, then the aggregate is emitted as u32[]
- // or u64[].
+ // Normally an aggregate (array, vector, or structure) is emitted as a u8[].
+ // However, if any element/field of the aggregate is a non-NULL address,
+ // and all such addresses are properly aligned, then the aggregate is
+ // emitted as u32[] or u64[]. In the case of unaligned addresses, the
+ // aggregate is emitted as u8[], and the mask() operator is used for all
+ // pointers.
//
- // We first layout the aggregate in 'buffer' in bytes, except for
- // those symbol addresses. For the i-th symbol address in the
- //aggregate, its corresponding 4-byte or 8-byte elements in 'buffer'
- // are filled with 0s. symbolPosInBuffer[i-1] records its position
- // in 'buffer', and Symbols[i-1] records the Value*.
+ // We first lay out the aggregate in 'buffer' in bytes, except for those
+ // symbol addresses. For the i-th symbol address in the aggregate, its
+ // corresponding 4-byte or 8-byte elements in 'buffer' are filled with 0s.
+ // symbolPosInBuffer[i-1] records its position in 'buffer', and Symbols[i-1]
+ // records the Value*.
//
- // Once we have this AggBuffer setup, we can choose how to print
- // it out.
+ // Once we have this AggBuffer setup, we can choose how to print it out.
public:
- unsigned numSymbols; // number of symbol addresses
+ // number of symbol addresses
+ unsigned numSymbols() const { return Symbols.size(); }
+
+ bool allSymbolsAligned(unsigned ptrSize) const {
+ return llvm::all_of(symbolPosInBuffer,
+ [=](unsigned pos) { return pos % ptrSize == 0; });
+ }
private:
const unsigned size; // size of the buffer in bytes
@@ -94,15 +100,13 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
// SymbolsBeforeStripping[i].
SmallVector<const Value *, 4> SymbolsBeforeStripping;
unsigned curpos;
- raw_ostream &O;
NVPTXAsmPrinter &AP;
bool EmitGeneric;
public:
- AggBuffer(unsigned size, raw_ostream &O, NVPTXAsmPrinter &AP)
- : size(size), buffer(size), O(O), AP(AP) {
+ AggBuffer(unsigned size, NVPTXAsmPrinter &AP)
+ : size(size), buffer(size), AP(AP) {
curpos = 0;
- numSymbols = 0;
EmitGeneric = AP.EmitGeneric;
}
@@ -135,63 +139,13 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
symbolPosInBuffer.push_back(curpos);
Symbols.push_back(GVar);
SymbolsBeforeStripping.push_back(GVarBeforeStripping);
- numSymbols++;
}
- void print() {
- if (numSymbols == 0) {
- // print out in bytes
- for (unsigned i = 0; i < size; i++) {
- if (i)
- O << ", ";
- O << (unsigned int) buffer[i];
- }
- } else {
- // print out in 4-bytes or 8-bytes
- unsigned int pos = 0;
- unsigned int nSym = 0;
- unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
- unsigned int nBytes = 4;
- if (static_cast<const NVPTXTargetMachine &>(AP.TM).is64Bit())
- nBytes = 8;
- for (pos = 0; pos < size; pos += nBytes) {
- if (pos)
- O << ", ";
- if (pos == nextSymbolPos) {
- const Value *v = Symbols[nSym];
- const Value *v0 = SymbolsBeforeStripping[nSym];
- if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
- MCSymbol *Name = AP.getSymbol(GVar);
- PointerType *PTy = dyn_cast<PointerType>(v0->getType());
- bool IsNonGenericPointer = false; // Is v0 a non-generic pointer?
- if (PTy && PTy->getAddressSpace() != 0) {
- IsNonGenericPointer = true;
- }
- if (EmitGeneric && !isa<Function>(v) && !IsNonGenericPointer) {
- O << "generic(";
- Name->print(O, AP.MAI);
- O << ")";
- } else {
- Name->print(O, AP.MAI);
- }
- } else if (const ConstantExpr *CExpr = dyn_cast<ConstantExpr>(v0)) {
- const MCExpr *Expr =
- AP.lowerConstantForGV(cast<Constant>(CExpr), false);
- AP.printMCExpr(*Expr, O);
- } else
- llvm_unreachable("symbol type unknown");
- nSym++;
- if (nSym >= numSymbols)
- nextSymbolPos = size + 1;
- else
- nextSymbolPos = symbolPosInBuffer[nSym];
- } else if (nBytes == 4)
- O << *(unsigned int *)(&buffer[pos]);
- else
- O << *(unsigned long long *)(&buffer[pos]);
- }
- }
- }
+ void printBytes(raw_ostream &os);
+ void printWords(raw_ostream &os);
+
+ private:
+ void printSymbol(unsigned nSym, raw_ostream &os);
};
friend class AggBuffer;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 6ad016dfa0a7..8264032b765a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -206,6 +206,40 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
}
}
+/// PromoteScalarIntegerPTX
+/// Used to make sure the arguments/returns are suitable for passing,
+/// promoting them to a larger size if they are not.
+///
+/// The promoted type is placed in \p PromotedVT if the function returns true.
+static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
+ if (VT.isScalarInteger()) {
+ switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
+ default:
+ llvm_unreachable(
+ "Promotion is not suitable for scalars of size larger than 64-bits");
+ case 1:
+ *PromotedVT = MVT::i1;
+ break;
+ case 2:
+ case 4:
+ case 8:
+ *PromotedVT = MVT::i8;
+ break;
+ case 16:
+ *PromotedVT = MVT::i16;
+ break;
+ case 32:
+ *PromotedVT = MVT::i32;
+ break;
+ case 64:
+ *PromotedVT = MVT::i64;
+ break;
+ }
+ return EVT(*PromotedVT) != VT;
+ }
+ return false;
+}
+
// Check whether we can merge loads/stores of some of the pieces of a
// flattened function parameter or return value into a single vector
// load/store.
@@ -1291,8 +1325,7 @@ std::string NVPTXTargetLowering::getPrototype(
// PTX ABI requires all scalar return values to be at least 32
// bits in size. fp16 normally uses .b16 as its storage type in
// PTX, so its size must be adjusted here, too.
- if (size < 32)
- size = 32;
+ size = promoteScalarArgumentSize(size);
O << ".param .b" << size << " _";
} else if (isa<PointerType>(retTy)) {
@@ -1343,8 +1376,7 @@ std::string NVPTXTargetLowering::getPrototype(
unsigned sz = 0;
if (isa<IntegerType>(Ty)) {
sz = cast<IntegerType>(Ty)->getBitWidth();
- if (sz < 32)
- sz = 32;
+ sz = promoteScalarArgumentSize(sz);
} else if (isa<PointerType>(Ty)) {
sz = PtrVT.getSizeInBits();
} else if (Ty->isHalfTy())
@@ -1515,11 +1547,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
NeedAlign = true;
} else {
// declare .param .b<size> .param<n>;
- if ((VT.isInteger() || VT.isFloatingPoint()) && TypeSize < 4) {
+ if (VT.isInteger() || VT.isFloatingPoint()) {
// PTX ABI requires integral types to be at least 32 bits in
// size. FP16 is loaded/stored using i16, so it's handled
// here as well.
- TypeSize = 4;
+ TypeSize = promoteScalarArgumentSize(TypeSize * 8) / 8;
}
SDValue DeclareScalarParamOps[] = {
Chain, DAG.getConstant(ParamCount, dl, MVT::i32),
@@ -1556,6 +1588,17 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
SDValue StVal = OutVals[OIdx];
+
+ MVT PromotedVT;
+ if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
+ EltVT = EVT(PromotedVT);
+ }
+ if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) {
+ llvm::ISD::NodeType Ext =
+ Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ StVal = DAG.getNode(Ext, dl, PromotedVT, StVal);
+ }
+
if (IsByVal) {
auto PtrVT = getPointerTy(DL);
SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,
@@ -1638,9 +1681,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Plus, this behavior is consistent with nvcc's.
if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() ||
(RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) {
- // Scalar needs to be at least 32bit wide
- if (resultsz < 32)
- resultsz = 32;
+ resultsz = promoteScalarArgumentSize(resultsz);
SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
DAG.getConstant(resultsz, dl, MVT::i32),
@@ -1778,6 +1819,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
EVT TheLoadType = VTs[i];
EVT EltType = Ins[i].VT;
Align EltAlign = commonAlignment(RetAlign, Offsets[i]);
+ MVT PromotedVT;
+
+ if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) {
+ TheLoadType = EVT(PromotedVT);
+ EltType = EVT(PromotedVT);
+ needTruncate = true;
+ }
+
if (ExtendIntegerRetVal) {
TheLoadType = MVT::i32;
EltType = MVT::i32;
@@ -2558,6 +2607,13 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
// v2f16 was loaded as an i32. Now we must bitcast it back.
else if (EltVT == MVT::v2f16)
Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
+
+ // If a promoted integer type is used, truncate down to the original
+ MVT PromotedVT;
+ if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
+ Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
+ }
+
// Extend the element if necessary (e.g. an i8 is loaded
// into an i16 register)
if (Ins[InsIdx].VT.isInteger() &&
@@ -2627,11 +2683,26 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
return Chain;
const DataLayout &DL = DAG.getDataLayout();
+ SmallVector<SDValue, 16> PromotedOutVals;
SmallVector<EVT, 16> VTs;
SmallVector<uint64_t, 16> Offsets;
ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
+ for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
+ SDValue PromotedOutVal = OutVals[i];
+ MVT PromotedVT;
+ if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) {
+ VTs[i] = EVT(PromotedVT);
+ }
+ if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) {
+ llvm::ISD::NodeType Ext =
+ Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal);
+ }
+ PromotedOutVals.push_back(PromotedOutVal);
+ }
+
auto VectorInfo = VectorizePTXValueVTs(
VTs, Offsets,
RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL)
@@ -2652,12 +2723,14 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
}
- SDValue RetVal = OutVals[i];
+ SDValue OutVal = OutVals[i];
+ SDValue RetVal = PromotedOutVals[i];
+
if (ExtendIntegerRetVal) {
RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
: ISD::ZERO_EXTEND,
dl, MVT::i32, RetVal);
- } else if (RetVal.getValueSizeInBits() < 16) {
+ } else if (OutVal.getValueSizeInBits() < 16) {
// Use 16-bit registers for small load-stores as it's the
// smallest general purpose register size supported by NVPTX.
RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
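
The width mapping that PromoteScalarIntegerPTX applies in the hunks above can be modelled on plain integers; the sketch below is a hypothetical helper (not in the patch) that mirrors the switch over PowerOf2Ceil, under the assumption that only scalars up to 64 bits reach it.

#include <cassert>

// Scalar-width model of PromoteScalarIntegerPTX: round the width up to a
// power of two, then treat every width between 2 and 8 bits as i8; i1 and
// the larger power-of-two widths keep their size.
unsigned promotedScalarWidth(unsigned Bits) {
  assert(Bits >= 1 && Bits <= 64 && "only scalars up to 64 bits are handled");
  unsigned P = 1;
  while (P < Bits)
    P <<= 1; // PowerOf2Ceil
  return P == 1 ? 1 : (P <= 8 ? 8 : P);
}

int main() {
  assert(promotedScalarWidth(1) == 1);   // i1  -> i1
  assert(promotedScalarWidth(3) == 8);   // i3  -> i8
  assert(promotedScalarWidth(7) == 8);   // i7  -> i8
  assert(promotedScalarWidth(16) == 16); // i16 -> i16
  assert(promotedScalarWidth(33) == 64); // i33 -> i64
}
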
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 9a249d3da3d5..cea3dce3f1c5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -77,6 +77,7 @@ public:
bool hasImageHandles() const;
bool hasFP16Math() const { return SmVersion >= 53; }
bool allowFP16Math() const;
+ bool hasMaskOperator() const { return PTXVersion >= 71; }
unsigned int getSmVersion() const { return SmVersion; }
std::string getTargetName() const { return TargetName; }
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index bf1524194cfb..6fee57b4664e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -59,6 +59,16 @@ bool isKernelFunction(const Function &);
bool getAlign(const Function &, unsigned index, unsigned &);
bool getAlign(const CallInst &, unsigned index, unsigned &);
+// PTX ABI requires all scalar argument/return values to have a bit-size
+// that is a power of two and at least 32 bits.
+inline unsigned promoteScalarArgumentSize(unsigned size) {
+ if (size <= 32)
+ return 32;
+ else if (size <= 64)
+ return 64;
+ else
+ return size;
+}
}
#endif
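
A quick sanity check of the new helper, with its body copied verbatim from the hunk above so it can be exercised outside the build:

#include <cassert>

inline unsigned promoteScalarArgumentSize(unsigned size) {
  if (size <= 32)
    return 32;
  else if (size <= 64)
    return 64;
  else
    return size;
}

int main() {
  assert(promoteScalarArgumentSize(8) == 32);    // i8/i16 widen to .b32
  assert(promoteScalarArgumentSize(32) == 32);
  assert(promoteScalarArgumentSize(33) == 64);   // up to 64 bits uses .b64
  assert(promoteScalarArgumentSize(128) == 128); // larger sizes pass through
}
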
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 4247cf557c2a..14c4fd3a9ffa 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -5473,7 +5473,8 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
}
case ISD::MUL: {
SDValue Op1 = N->getOperand(1);
- if (Op1.getOpcode() != ISD::Constant || Op1.getValueType() != MVT::i64)
+ if (Op1.getOpcode() != ISD::Constant ||
+ (Op1.getValueType() != MVT::i64 && Op1.getValueType() != MVT::i32))
break;
// If the multiplier fits int16, we can handle it with mulli.
@@ -5486,13 +5487,27 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
// (mul X, c1 << c2) -> (rldicr (mulli X, c1) c2). We do this in ISEL due to
// DAGCombiner prefers (shl (mul X, c1), c2) -> (mul X, c1 << c2).
uint64_t ImmSh = Imm >> Shift;
- if (isInt<16>(ImmSh)) {
- uint64_t SextImm = SignExtend64(ImmSh & 0xFFFF, 16);
+ if (!isInt<16>(ImmSh))
+ break;
+
+ uint64_t SextImm = SignExtend64(ImmSh & 0xFFFF, 16);
+ if (Op1.getValueType() == MVT::i64) {
SDValue SDImm = CurDAG->getTargetConstant(SextImm, dl, MVT::i64);
SDNode *MulNode = CurDAG->getMachineNode(PPC::MULLI8, dl, MVT::i64,
N->getOperand(0), SDImm);
- CurDAG->SelectNodeTo(N, PPC::RLDICR, MVT::i64, SDValue(MulNode, 0),
- getI32Imm(Shift, dl), getI32Imm(63 - Shift, dl));
+
+ SDValue Ops[] = {SDValue(MulNode, 0), getI32Imm(Shift, dl),
+ getI32Imm(63 - Shift, dl)};
+ CurDAG->SelectNodeTo(N, PPC::RLDICR, MVT::i64, Ops);
+ return;
+ } else {
+ SDValue SDImm = CurDAG->getTargetConstant(SextImm, dl, MVT::i32);
+ SDNode *MulNode = CurDAG->getMachineNode(PPC::MULLI, dl, MVT::i32,
+ N->getOperand(0), SDImm);
+
+ SDValue Ops[] = {SDValue(MulNode, 0), getI32Imm(Shift, dl),
+ getI32Imm(0, dl), getI32Imm(31 - Shift, dl)};
+ CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
return;
}
break;
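
The MUL selection above relies on splitting the multiplier C into (C >> Shift) * 2^Shift, where Shift is C's trailing zero count and the shifted value must fit mulli's signed 16-bit immediate; the final shift is then done with rldicr (64-bit) or rlwinm (32-bit). A hedged stand-alone sketch of just the split (names hypothetical):

#include <cassert>
#include <cstdint>

bool splitMulImm(uint64_t Imm, int64_t &Imm16, unsigned &Shift) {
  if (Imm == 0)
    return false;
  Shift = __builtin_ctzll(Imm);          // trailing zeros of the multiplier
  int64_t ImmSh = (int64_t)(Imm >> Shift);
  if (ImmSh < INT16_MIN || ImmSh > INT16_MAX)
    return false;                         // does not fit mulli's simm16
  Imm16 = ImmSh;
  return true;
}

int main() {
  int64_t Imm16;
  unsigned Shift;
  assert(splitMulImm(0x5000, Imm16, Shift) && Imm16 == 5 && Shift == 12);
  assert(!splitMulImm(74565, Imm16, Shift)); // 0x12345 does not fit simm16
}
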
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 59486c323567..c85f57f04c7d 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -1086,8 +1086,8 @@ unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
// For opcodes with the ReMaterializable flag set, this function is called to
// verify the instruction is really rematable.
-bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AliasAnalysis *AA) const {
+bool PPCInstrInfo::isReallyTriviallyReMaterializable(
+ const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default:
// This function should only be called for opcodes with the ReMaterializable
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
index e22b0086bde8..980bb3107a8b 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -495,8 +495,7 @@ public:
unsigned &SubIdx) const override;
unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
- bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AAResults *AA) const override;
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override;
unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
index 4689c0638ca6..23703ac54d0e 100644
--- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
+++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
@@ -568,7 +568,7 @@ bool PPCLoopInstrFormPrep::rewriteLoadStoresForCommoningChains(
const SCEVAddRecExpr *BasePtrSCEV = cast<SCEVAddRecExpr>(BaseSCEV);
// Make sure the base is able to expand.
- if (!isSafeToExpand(BasePtrSCEV->getStart(), *SE))
+ if (!SCEVE.isSafeToExpand(BasePtrSCEV->getStart()))
return MadeChange;
assert(BasePtrSCEV->isAffine() &&
@@ -602,7 +602,7 @@ bool PPCLoopInstrFormPrep::rewriteLoadStoresForCommoningChains(
// Make sure offset is able to expand. Only need to check one time as the
// offsets are reused between different chains.
if (!BaseElemIdx)
- if (!isSafeToExpand(OffsetSCEV, *SE))
+ if (!SCEVE.isSafeToExpand(OffsetSCEV))
return false;
Value *OffsetValue = SCEVE.expandCodeFor(
@@ -1018,14 +1018,13 @@ bool PPCLoopInstrFormPrep::rewriteLoadStores(
if (!BasePtrSCEV->isAffine())
return MadeChange;
- if (!isSafeToExpand(BasePtrSCEV->getStart(), *SE))
- return MadeChange;
-
- SmallPtrSet<Value *, 16> DeletedPtrs;
-
BasicBlock *Header = L->getHeader();
SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(),
"loopprepare-formrewrite");
+ if (!SCEVE.isSafeToExpand(BasePtrSCEV->getStart()))
+ return MadeChange;
+
+ SmallPtrSet<Value *, 16> DeletedPtrs;
// For some DS form load/store instructions, it can also be an update form,
// if the stride is constant and is a multiple of 4. Use update form if
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
index 917837a307ad..e6140edc8403 100644
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -30,6 +30,9 @@ class MachineInstr;
class MachineOperand;
class PassRegistry;
+FunctionPass *createRISCVCodeGenPreparePass();
+void initializeRISCVCodeGenPreparePass(PassRegistry &);
+
bool lowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
AsmPrinter &AP);
bool lowerRISCVMachineOperandToMCOperand(const MachineOperand &MO,
diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td
index e783ef38b448..8a6f69c7f7ca 100644
--- a/llvm/lib/Target/RISCV/RISCV.td
+++ b/llvm/lib/Target/RISCV/RISCV.td
@@ -19,6 +19,19 @@ def HasStdExtM : Predicate<"Subtarget->hasStdExtM()">,
AssemblerPredicate<(all_of FeatureStdExtM),
"'M' (Integer Multiplication and Division)">;
+def FeatureStdExtZmmul
+ : SubtargetFeature<"zmmul", "HasStdExtZmmul", "true",
+ "'Zmmul' (Integer Multiplication)">;
+def HasStdExtZmmul : Predicate<"Subtarget->hasStdExtZmmul()">,
+ AssemblerPredicate<(all_of FeatureStdExtZmmul),
+ "'Zmmul' (Integer Multiplication)">;
+
+def HasStdExtMOrZmmul
+ : Predicate<"Subtarget->hasStdExtM() || Subtarget->hasStdExtZmmul()">,
+ AssemblerPredicate<(any_of FeatureStdExtM, FeatureStdExtZmmul),
+ "'M' (Integer Multiplication and Division) or "
+ "'Zmmul' (Integer Multiplication)">;
+
def FeatureStdExtA
: SubtargetFeature<"a", "HasStdExtA", "true",
"'A' (Atomic Instructions)">;
@@ -465,7 +478,8 @@ def TuneNoDefaultUnroll
"Disable default unroll preference.">;
def TuneSiFive7 : SubtargetFeature<"sifive7", "RISCVProcFamily", "SiFive7",
- "SiFive 7-Series processors">;
+ "SiFive 7-Series processors",
+ [TuneNoDefaultUnroll]>;
//===----------------------------------------------------------------------===//
// Named operands for CSR instructions.
@@ -499,9 +513,9 @@ def : ProcessorModel<"rocket-rv32", RocketModel, []>;
def : ProcessorModel<"rocket-rv64", RocketModel, [Feature64Bit]>;
def : ProcessorModel<"sifive-7-rv32", SiFive7Model, [],
- [TuneSiFive7, TuneNoDefaultUnroll]>;
+ [TuneSiFive7]>;
def : ProcessorModel<"sifive-7-rv64", SiFive7Model, [Feature64Bit],
- [TuneSiFive7, TuneNoDefaultUnroll]>;
+ [TuneSiFive7]>;
def : ProcessorModel<"sifive-e20", RocketModel, [FeatureStdExtM,
FeatureStdExtC]>;
@@ -528,7 +542,7 @@ def : ProcessorModel<"sifive-e76", SiFive7Model, [FeatureStdExtM,
FeatureStdExtA,
FeatureStdExtF,
FeatureStdExtC],
- [TuneSiFive7, TuneNoDefaultUnroll]>;
+ [TuneSiFive7]>;
def : ProcessorModel<"sifive-s21", RocketModel, [Feature64Bit,
FeatureStdExtM,
@@ -553,7 +567,7 @@ def : ProcessorModel<"sifive-s76", SiFive7Model, [Feature64Bit,
FeatureStdExtF,
FeatureStdExtD,
FeatureStdExtC],
- [TuneSiFive7, TuneNoDefaultUnroll]>;
+ [TuneSiFive7]>;
def : ProcessorModel<"sifive-u54", RocketModel, [Feature64Bit,
FeatureStdExtM,
@@ -568,7 +582,7 @@ def : ProcessorModel<"sifive-u74", SiFive7Model, [Feature64Bit,
FeatureStdExtF,
FeatureStdExtD,
FeatureStdExtC],
- [TuneSiFive7, TuneNoDefaultUnroll]>;
+ [TuneSiFive7]>;
//===----------------------------------------------------------------------===//
// Define the RISC-V target.
diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
new file mode 100644
index 000000000000..b700a9ede39b
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
@@ -0,0 +1,169 @@
+//===----- RISCVCodeGenPrepare.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a RISCV specific version of CodeGenPrepare.
+// It munges the code in the input function to better prepare it for
+// SelectionDAG-based code generation. This works around limitations in its
+// basic-block-at-a-time approach.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-codegenprepare"
+#define PASS_NAME "RISCV CodeGenPrepare"
+
+STATISTIC(NumZExtToSExt, "Number of ZExt instructions converted to SExt");
+
+namespace {
+
+class RISCVCodeGenPrepare : public FunctionPass {
+ const DataLayout *DL;
+ const RISCVSubtarget *ST;
+
+public:
+ static char ID;
+
+ RISCVCodeGenPrepare() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override { return PASS_NAME; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetPassConfig>();
+ }
+
+private:
+ bool optimizeZExt(ZExtInst *I);
+ bool optimizeAndExt(BinaryOperator *BO);
+};
+
+} // end anonymous namespace
+
+bool RISCVCodeGenPrepare::optimizeZExt(ZExtInst *ZExt) {
+ if (!ST->is64Bit())
+ return false;
+
+ Value *Src = ZExt->getOperand(0);
+
+ // We only care about ZExt from i32 to i64.
+ if (!ZExt->getType()->isIntegerTy(64) || !Src->getType()->isIntegerTy(32))
+ return false;
+
+ // Look for an opportunity to replace (i64 (zext (i32 X))) with a sext if we
+ // can determine that the sign bit of X is zero via a dominating condition.
+ // This often occurs with widened induction variables.
+ if (isImpliedByDomCondition(ICmpInst::ICMP_SGE, Src,
+ Constant::getNullValue(Src->getType()), ZExt,
+ *DL)) {
+ auto *SExt = new SExtInst(Src, ZExt->getType(), "", ZExt);
+ SExt->takeName(ZExt);
+ SExt->setDebugLoc(ZExt->getDebugLoc());
+
+ ZExt->replaceAllUsesWith(SExt);
+ ZExt->eraseFromParent();
+ ++NumZExtToSExt;
+ return true;
+ }
+
+ return false;
+}
+
+// Try to optimize (i64 (and (zext/sext (i32 X), C1))) if C1 has bit 31 set,
+// but bits 63:32 are zero. If we can prove that bit 31 of X is 0, we can fill
+// the upper 32 bits with ones. A separate transform will turn (zext X) into
+// (sext X) for the same condition.
+bool RISCVCodeGenPrepare::optimizeAndExt(BinaryOperator *BO) {
+ if (!ST->is64Bit())
+ return false;
+
+ if (BO->getOpcode() != Instruction::And)
+ return false;
+
+ if (!BO->getType()->isIntegerTy(64))
+ return false;
+
+ // Left hand side should be sext or zext.
+ Instruction *LHS = dyn_cast<Instruction>(BO->getOperand(0));
+ if (!LHS || (!isa<SExtInst>(LHS) && !isa<ZExtInst>(LHS)))
+ return false;
+
+ Value *LHSSrc = LHS->getOperand(0);
+ if (!LHSSrc->getType()->isIntegerTy(32))
+ return false;
+
+ // Right hand side should be a constant.
+ Value *RHS = BO->getOperand(1);
+
+ auto *CI = dyn_cast<ConstantInt>(RHS);
+ if (!CI)
+ return false;
+ uint64_t C = CI->getZExtValue();
+
+ // Look for constants that fit in 32 bits but not simm12, and can be made
+ // into simm12 by sign extending bit 31. This will allow use of ANDI.
+ // TODO: Is it worth making simm32?
+ if (!isUInt<32>(C) || isInt<12>(C) || !isInt<12>(SignExtend64<32>(C)))
+ return false;
+
+ // If we can determine the sign bit of the input is 0, we can replace the
+ // And mask constant.
+ if (!isImpliedByDomCondition(ICmpInst::ICMP_SGE, LHSSrc,
+ Constant::getNullValue(LHSSrc->getType()),
+ LHS, *DL))
+ return false;
+
+ // Sign extend the constant and replace the And operand.
+ C = SignExtend64<32>(C);
+ BO->setOperand(1, ConstantInt::get(LHS->getType(), C));
+
+ return true;
+}
+
+bool RISCVCodeGenPrepare::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto &TPC = getAnalysis<TargetPassConfig>();
+ auto &TM = TPC.getTM<RISCVTargetMachine>();
+ ST = &TM.getSubtarget<RISCVSubtarget>(F);
+
+ DL = &F.getParent()->getDataLayout();
+
+ bool MadeChange = false;
+ for (auto &BB : F) {
+ for (Instruction &I : llvm::make_early_inc_range(BB)) {
+ if (auto *ZExt = dyn_cast<ZExtInst>(&I))
+ MadeChange |= optimizeZExt(ZExt);
+ else if (I.getOpcode() == Instruction::And)
+ MadeChange |= optimizeAndExt(cast<BinaryOperator>(&I));
+ }
+ }
+
+ return MadeChange;
+}
+
+INITIALIZE_PASS_BEGIN(RISCVCodeGenPrepare, DEBUG_TYPE, PASS_NAME, false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(RISCVCodeGenPrepare, DEBUG_TYPE, PASS_NAME, false, false)
+
+char RISCVCodeGenPrepare::ID = 0;
+
+FunctionPass *llvm::createRISCVCodeGenPreparePass() {
+ return new RISCVCodeGenPrepare();
+}
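
For context, a source-level shape the new pass targets (illustrative only; whether this exact IR pattern survives earlier optimizations is not guaranteed). The early return dominates the load and proves I >= 0, so the i32-to-i64 zero-extension of the index may be rewritten as a sign-extension, which RV64 gets for free from its 32-bit operations:

#include <cstdint>

int64_t loadAt(const int64_t *A, int32_t I) {
  if (I < 0)
    return 0;
  // Dominating condition proves I >= 0 here.
  return A[static_cast<uint32_t>(I)]; // zext i32 -> i64 in the address math
}

int main() {
  int64_t Data[3] = {10, 20, 30};
  return loadAt(Data, 2) == 30 ? 0 : 1;
}
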
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 5b823af1e9b8..d5826b46d738 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -690,6 +690,14 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
// 32 trailing ones should use srliw via tablegen pattern.
if (TrailingOnes == 32 || ShAmt >= TrailingOnes)
break;
+ // If C2 is (1 << ShAmt) use bexti if possible.
+ if (Subtarget->hasStdExtZbs() && ShAmt + 1 == TrailingOnes) {
+ SDNode *BEXTI =
+ CurDAG->getMachineNode(RISCV::BEXTI, DL, VT, N0->getOperand(0),
+ CurDAG->getTargetConstant(ShAmt, DL, VT));
+ ReplaceNode(Node, BEXTI);
+ return;
+ }
unsigned LShAmt = Subtarget->getXLen() - TrailingOnes;
SDNode *SLLI =
CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0->getOperand(0),
@@ -939,18 +947,17 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
if (!isMask_64(C2))
break;
- // This should be the only use of the AND unless we will use
- // (SRLI (SLLI X, 32), 32). We don't use a shift pair for other AND
- // constants.
- if (!N0.hasOneUse() && C2 != UINT64_C(0xFFFFFFFF))
- break;
-
- // If this can be an ANDI, ZEXT.H or ZEXT.W we don't need to do this
- // optimization.
- if (isInt<12>(C2) ||
+ // If this can be an ANDI, ZEXT.H or ZEXT.W, don't do this if the ANDI/ZEXT
+ // has multiple users or the constant is a simm12. This avoids inserting
+ // a shift while the AND/ZEXT still has other uses. Shifting a simm12 will
+ // likely make it more costly to materialize. Otherwise, using a SLLI
+ // might allow it to be compressed.
+ bool IsANDIOrZExt =
+ isInt<12>(C2) ||
(C2 == UINT64_C(0xFFFF) &&
(Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbp())) ||
- (C2 == UINT64_C(0xFFFFFFFF) && Subtarget->hasStdExtZba()))
+ (C2 == UINT64_C(0xFFFFFFFF) && Subtarget->hasStdExtZba());
+ if (IsANDIOrZExt && (isInt<12>(N1C->getSExtValue()) || !N0.hasOneUse()))
break;
// We need to shift left the AND input and C1 by a total of XLen bits.
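
One shape the new BEXTI hunk above can match (hedged example): an AND with eighteen trailing ones followed by a shift right by 17 leaves exactly bit 17, so with Zbs the whole expression collapses to a single bexti.

#include <cstdint>

uint64_t bit17(uint64_t X) {
  return (X & 0x3FFFF) >> 17; // ShAmt + 1 == TrailingOnes -> bexti X, 17
}

int main() {
  return (bit17(1ULL << 17) == 1 && bit17(1ULL << 16) == 0) ? 0 : 1;
}
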
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 658865703079..1702546b58a6 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -215,21 +215,26 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setLibcallName(RTLIB::MULO_I64, nullptr);
}
- if (!Subtarget.hasStdExtM()) {
- setOperationAction({ISD::MUL, ISD::MULHS, ISD::MULHU, ISD::SDIV, ISD::UDIV,
- ISD::SREM, ISD::UREM},
- XLenVT, Expand);
+ if (!Subtarget.hasStdExtM() && !Subtarget.hasStdExtZmmul()) {
+ setOperationAction({ISD::MUL, ISD::MULHS, ISD::MULHU}, XLenVT, Expand);
} else {
if (Subtarget.is64Bit()) {
setOperationAction(ISD::MUL, {MVT::i32, MVT::i128}, Custom);
-
- setOperationAction({ISD::SDIV, ISD::UDIV, ISD::UREM},
- {MVT::i8, MVT::i16, MVT::i32}, Custom);
} else {
setOperationAction(ISD::MUL, MVT::i64, Custom);
}
}
+ if (!Subtarget.hasStdExtM()) {
+ setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM},
+ XLenVT, Expand);
+ } else {
+ if (Subtarget.is64Bit()) {
+ setOperationAction({ISD::SDIV, ISD::UDIV, ISD::UREM},
+ {MVT::i8, MVT::i16, MVT::i32}, Custom);
+ }
+ }
+
setOperationAction(
{ISD::SDIVREM, ISD::UDIVREM, ISD::SMUL_LOHI, ISD::UMUL_LOHI}, XLenVT,
Expand);
@@ -294,7 +299,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, XLenVT, Custom);
}
- static constexpr ISD::NodeType FPLegalNodeTypes[] = {
+ static const unsigned FPLegalNodeTypes[] = {
ISD::FMINNUM, ISD::FMAXNUM, ISD::LRINT,
ISD::LLRINT, ISD::LROUND, ISD::LLROUND,
ISD::STRICT_LRINT, ISD::STRICT_LLRINT, ISD::STRICT_LROUND,
@@ -307,7 +312,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUNE, ISD::SETGT,
ISD::SETGE, ISD::SETNE, ISD::SETO, ISD::SETUO};
- static const ISD::NodeType FPOpToExpand[] = {
+ static const unsigned FPOpToExpand[] = {
ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOW,
ISD::FREM, ISD::FP16_TO_FP, ISD::FP_TO_FP16};
@@ -315,8 +320,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::i16, Custom);
if (Subtarget.hasStdExtZfh()) {
- for (auto NT : FPLegalNodeTypes)
- setOperationAction(NT, MVT::f16, Legal);
+ setOperationAction(FPLegalNodeTypes, MVT::f16, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
setCondCodeAction(FPCCToExpand, MVT::f16, Expand);
@@ -340,14 +344,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
}
if (Subtarget.hasStdExtF()) {
- for (auto NT : FPLegalNodeTypes)
- setOperationAction(NT, MVT::f32, Legal);
+ setOperationAction(FPLegalNodeTypes, MVT::f32, Legal);
setCondCodeAction(FPCCToExpand, MVT::f32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
setOperationAction(ISD::SELECT, MVT::f32, Custom);
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
- for (auto Op : FPOpToExpand)
- setOperationAction(Op, MVT::f32, Expand);
+ setOperationAction(FPOpToExpand, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
}
@@ -356,8 +358,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::i32, Custom);
if (Subtarget.hasStdExtD()) {
- for (auto NT : FPLegalNodeTypes)
- setOperationAction(NT, MVT::f64, Legal);
+ setOperationAction(FPLegalNodeTypes, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
setCondCodeAction(FPCCToExpand, MVT::f64, Expand);
@@ -366,8 +367,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
- for (auto Op : FPOpToExpand)
- setOperationAction(Op, MVT::f64, Expand);
+ setOperationAction(FPOpToExpand, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
}
@@ -458,17 +458,22 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::VP_SETCC, ISD::VP_FP_ROUND,
ISD::VP_FP_EXTEND};
+ static const unsigned IntegerVecReduceOps[] = {
+ ISD::VECREDUCE_ADD, ISD::VECREDUCE_AND, ISD::VECREDUCE_OR,
+ ISD::VECREDUCE_XOR, ISD::VECREDUCE_SMAX, ISD::VECREDUCE_SMIN,
+ ISD::VECREDUCE_UMAX, ISD::VECREDUCE_UMIN};
+
+ static const unsigned FloatingPointVecReduceOps[] = {
+ ISD::VECREDUCE_FADD, ISD::VECREDUCE_SEQ_FADD, ISD::VECREDUCE_FMIN,
+ ISD::VECREDUCE_FMAX};
+
if (!Subtarget.is64Bit()) {
// We must custom-lower certain vXi64 operations on RV32 due to the vector
// element type being illegal.
setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT},
MVT::i64, Custom);
- setOperationAction({ISD::VECREDUCE_ADD, ISD::VECREDUCE_AND,
- ISD::VECREDUCE_OR, ISD::VECREDUCE_XOR,
- ISD::VECREDUCE_SMAX, ISD::VECREDUCE_SMIN,
- ISD::VECREDUCE_UMAX, ISD::VECREDUCE_UMIN},
- MVT::i64, Custom);
+ setOperationAction(IntegerVecReduceOps, MVT::i64, Custom);
setOperationAction({ISD::VP_REDUCE_ADD, ISD::VP_REDUCE_AND,
ISD::VP_REDUCE_OR, ISD::VP_REDUCE_XOR,
@@ -581,11 +586,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
// Custom-lower reduction operations to set up the corresponding custom
// nodes' operands.
- setOperationAction({ISD::VECREDUCE_ADD, ISD::VECREDUCE_AND,
- ISD::VECREDUCE_OR, ISD::VECREDUCE_XOR,
- ISD::VECREDUCE_SMAX, ISD::VECREDUCE_SMIN,
- ISD::VECREDUCE_UMAX, ISD::VECREDUCE_UMIN},
- VT, Custom);
+ setOperationAction(IntegerVecReduceOps, VT, Custom);
setOperationAction(IntegerVPOps, VT, Custom);
@@ -661,9 +662,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FFLOOR, ISD::FROUND},
VT, Custom);
- setOperationAction({ISD::VECREDUCE_FADD, ISD::VECREDUCE_SEQ_FADD,
- ISD::VECREDUCE_FMIN, ISD::VECREDUCE_FMAX},
- VT, Custom);
+ setOperationAction(FloatingPointVecReduceOps, VT, Custom);
// Expand FP operations that need libcalls.
setOperationAction(ISD::FREM, VT, Expand);
@@ -905,17 +904,14 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FFLOOR, ISD::FROUND},
VT, Custom);
- for (auto CC : VFPCCToExpand)
- setCondCodeAction(CC, VT, Expand);
+ setCondCodeAction(VFPCCToExpand, VT, Expand);
setOperationAction({ISD::VSELECT, ISD::SELECT}, VT, Custom);
setOperationAction(ISD::SELECT_CC, VT, Expand);
setOperationAction(ISD::BITCAST, VT, Custom);
- setOperationAction({ISD::VECREDUCE_FADD, ISD::VECREDUCE_SEQ_FADD,
- ISD::VECREDUCE_FMIN, ISD::VECREDUCE_FMAX},
- VT, Custom);
+ setOperationAction(FloatingPointVecReduceOps, VT, Custom);
setOperationAction(FloatingPointVPOps, VT, Custom);
}
@@ -943,7 +939,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setJumpIsExpensive();
setTargetDAGCombine({ISD::INTRINSIC_WO_CHAIN, ISD::ADD, ISD::SUB, ISD::AND,
- ISD::OR, ISD::XOR});
+ ISD::OR, ISD::XOR, ISD::SETCC});
if (Subtarget.is64Bit())
setTargetDAGCombine(ISD::SRA);
@@ -1374,6 +1370,23 @@ unsigned RISCVTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context
// with 1/-1.
static void translateSetCCForBranch(const SDLoc &DL, SDValue &LHS, SDValue &RHS,
ISD::CondCode &CC, SelectionDAG &DAG) {
+ // If this is a single bit test that can't be handled by ANDI, shift the
+ // bit to be tested to the MSB and perform a signed compare with 0.
+ if (isIntEqualitySetCC(CC) && isNullConstant(RHS) &&
+ LHS.getOpcode() == ISD::AND && LHS.hasOneUse() &&
+ isa<ConstantSDNode>(LHS.getOperand(1))) {
+ uint64_t Mask = LHS.getConstantOperandVal(1);
+ if (isPowerOf2_64(Mask) && !isInt<12>(Mask)) {
+ CC = CC == ISD::SETEQ ? ISD::SETGE : ISD::SETLT;
+ unsigned ShAmt = LHS.getValueSizeInBits() - 1 - Log2_64(Mask);
+ LHS = LHS.getOperand(0);
+ if (ShAmt != 0)
+ LHS = DAG.getNode(ISD::SHL, DL, LHS.getValueType(), LHS,
+ DAG.getConstant(ShAmt, DL, LHS.getValueType()));
+ return;
+ }
+ }
+
// Convert X > -1 to X >= 0.
if (CC == ISD::SETGT && isAllOnesConstant(RHS)) {
RHS = DAG.getConstant(0, DL, RHS.getValueType());
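
The single-bit lowering added to translateSetCCForBranch above rests on a simple equivalence: when the power-of-two mask does not fit ANDI's simm12, shifting the tested bit into the sign bit turns the equality-with-zero test into a signed compare against zero. A small self-checking sketch:

#include <cassert>
#include <cstdint>

bool bitClearViaAnd(uint64_t X, unsigned Bit) {
  return (X & (1ULL << Bit)) == 0;
}
bool bitClearViaShift(uint64_t X, unsigned Bit) {
  return (int64_t)(X << (63 - Bit)) >= 0; // tested bit moved to the MSB
}

int main() {
  for (unsigned Bit = 12; Bit < 64; ++Bit)
    for (uint64_t X : {0ULL, 1ULL << Bit, ~0ULL, 0x123456789ABCDEFULL})
      assert(bitClearViaAnd(X, Bit) == bitClearViaShift(X, Bit));
}
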
@@ -3707,10 +3720,7 @@ SDValue RISCVTargetLowering::lowerGlobalAddress(SDValue Op,
SDLoc DL(Op);
GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
assert(N->getOffset() == 0 && "unexpected offset in global node");
-
- const GlobalValue *GV = N->getGlobal();
- bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
- return getAddr(N, DAG, IsLocal);
+ return getAddr(N, DAG, N->getGlobal()->isDSOLocal());
}
SDValue RISCVTargetLowering::lowerBlockAddress(SDValue Op,
@@ -8130,6 +8140,50 @@ static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG) {
return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false);
}
+// Replace (seteq (i64 (and X, 0xffffffff)), C1) with
+// (seteq (i64 (sext_inreg (X, i32)), C1')) where C1' is C1 sign extended from
+// bit 31. Same for setne. C1' may be cheaper to materialize and the sext_inreg
+// can become a sext.w instead of a shift pair.
+static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ EVT OpVT = N0.getValueType();
+
+ if (OpVT != MVT::i64 || !Subtarget.is64Bit())
+ return SDValue();
+
+ // RHS needs to be a constant.
+ auto *N1C = dyn_cast<ConstantSDNode>(N1);
+ if (!N1C)
+ return SDValue();
+
+ // LHS needs to be (and X, 0xffffffff).
+ if (N0.getOpcode() != ISD::AND || !N0.hasOneUse() ||
+ !isa<ConstantSDNode>(N0.getOperand(1)) ||
+ N0.getConstantOperandVal(1) != UINT64_C(0xffffffff))
+ return SDValue();
+
+ // Looking for an equality compare.
+ ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ if (!isIntEqualitySetCC(Cond))
+ return SDValue();
+
+ const APInt &C1 = cast<ConstantSDNode>(N1)->getAPIntValue();
+
+ SDLoc dl(N);
+ // If the constant is larger than 2^32 - 1 it is impossible for both sides
+ // to be equal.
+ if (C1.getActiveBits() > 32)
+ return DAG.getBoolConstant(Cond == ISD::SETNE, dl, VT, OpVT);
+
+ SDValue SExtOp = DAG.getNode(ISD::SIGN_EXTEND_INREG, N, OpVT,
+ N0.getOperand(0), DAG.getValueType(MVT::i32));
+ return DAG.getSetCC(dl, VT, SExtOp, DAG.getConstant(C1.trunc(32).sext(64),
+ dl, OpVT), Cond);
+}
+
static SDValue
performSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
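
The equivalence used by performSETCCCombine above can be checked in isolation: for a constant C that fits in 32 bits, comparing the zero-extended low word of X against C gives the same answer as comparing the sign-extended low word against C sign-extended from bit 31, and the latter only needs sext.w plus a usually cheaper immediate. A sketch (relies on the usual two's-complement narrowing):

#include <cassert>
#include <cstdint>

int64_t sext32(uint64_t V) { return (int32_t)(uint32_t)V; }

bool viaZext(uint64_t X, uint64_t C) { return (X & 0xffffffffULL) == C; }
bool viaSext(uint64_t X, uint64_t C) { return sext32(X) == sext32(C); }

int main() {
  for (uint64_t X : {0ULL, 0xdeadbeefULL, 0xffffffff00000001ULL})
    for (uint64_t C : {0ULL, 0x7fffffffULL, 0x80000000ULL, 0xffffffffULL})
      assert(viaZext(X, C) == viaSext(X, C));
}
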
@@ -8658,6 +8712,75 @@ static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG,
DAG.getConstant(32 - ShAmt, DL, MVT::i64));
}
+// Perform common combines for BR_CC and SELECT_CC conditions.
+static bool combine_CC(SDValue &LHS, SDValue &RHS, SDValue &CC, const SDLoc &DL,
+ SelectionDAG &DAG, const RISCVSubtarget &Subtarget) {
+ ISD::CondCode CCVal = cast<CondCodeSDNode>(CC)->get();
+ if (!ISD::isIntEqualitySetCC(CCVal))
+ return false;
+
+ // Fold ((setlt X, Y), 0, ne) -> (X, Y, lt)
+ // Sometimes the setcc is introduced after br_cc/select_cc has been formed.
+ if (LHS.getOpcode() == ISD::SETCC && isNullConstant(RHS) &&
+ LHS.getOperand(0).getValueType() == Subtarget.getXLenVT()) {
+ // If we're looking for eq 0 instead of ne 0, we need to invert the
+ // condition.
+ bool Invert = CCVal == ISD::SETEQ;
+ CCVal = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
+ if (Invert)
+ CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType());
+
+ RHS = LHS.getOperand(1);
+ LHS = LHS.getOperand(0);
+ translateSetCCForBranch(DL, LHS, RHS, CCVal, DAG);
+
+ CC = DAG.getCondCode(CCVal);
+ return true;
+ }
+
+ // Fold ((xor X, Y), 0, eq/ne) -> (X, Y, eq/ne)
+ if (LHS.getOpcode() == ISD::XOR && isNullConstant(RHS)) {
+ RHS = LHS.getOperand(1);
+ LHS = LHS.getOperand(0);
+ return true;
+ }
+
+ // Fold ((srl (and X, 1<<C), C), 0, eq/ne) -> ((shl X, XLen-1-C), 0, ge/lt)
+ if (isNullConstant(RHS) && LHS.getOpcode() == ISD::SRL && LHS.hasOneUse() &&
+ LHS.getOperand(1).getOpcode() == ISD::Constant) {
+ SDValue LHS0 = LHS.getOperand(0);
+ if (LHS0.getOpcode() == ISD::AND &&
+ LHS0.getOperand(1).getOpcode() == ISD::Constant) {
+ uint64_t Mask = LHS0.getConstantOperandVal(1);
+ uint64_t ShAmt = LHS.getConstantOperandVal(1);
+ if (isPowerOf2_64(Mask) && Log2_64(Mask) == ShAmt) {
+ CCVal = CCVal == ISD::SETEQ ? ISD::SETGE : ISD::SETLT;
+ CC = DAG.getCondCode(CCVal);
+
+ ShAmt = LHS.getValueSizeInBits() - 1 - ShAmt;
+ LHS = LHS0.getOperand(0);
+ if (ShAmt != 0)
+ LHS =
+ DAG.getNode(ISD::SHL, DL, LHS.getValueType(), LHS0.getOperand(0),
+ DAG.getConstant(ShAmt, DL, LHS.getValueType()));
+ return true;
+ }
+ }
+ }
+
+ // (X, 1, setne) -> (X, 0, seteq) if we can prove X is 0/1.
+ // This can occur when legalizing some floating point comparisons.
+ APInt Mask = APInt::getBitsSetFrom(LHS.getValueSizeInBits(), 1);
+ if (isOneConstant(RHS) && DAG.MaskedValueIsZero(LHS, Mask)) {
+ CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType());
+ CC = DAG.getCondCode(CCVal);
+ RHS = DAG.getConstant(0, DL, LHS.getValueType());
+ return true;
+ }
+
+ return false;
+}
+
SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -8872,6 +8995,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FMAXNUM:
case ISD::FMINNUM:
return combineBinOpToReduce(N, DAG);
+ case ISD::SETCC:
+ return performSETCCCombine(N, DAG, Subtarget);
case ISD::SIGN_EXTEND_INREG:
return performSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
case ISD::ZERO_EXTEND:
@@ -8900,110 +9025,32 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
// Transform
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
+ SDValue CC = N->getOperand(2);
SDValue TrueV = N->getOperand(3);
SDValue FalseV = N->getOperand(4);
+ SDLoc DL(N);
// If the True and False values are the same, we don't need a select_cc.
if (TrueV == FalseV)
return TrueV;
- ISD::CondCode CCVal = cast<CondCodeSDNode>(N->getOperand(2))->get();
- if (!ISD::isIntEqualitySetCC(CCVal))
- break;
-
- // Fold (select_cc (setlt X, Y), 0, ne, trueV, falseV) ->
- // (select_cc X, Y, lt, trueV, falseV)
- // Sometimes the setcc is introduced after select_cc has been formed.
- if (LHS.getOpcode() == ISD::SETCC && isNullConstant(RHS) &&
- LHS.getOperand(0).getValueType() == Subtarget.getXLenVT()) {
- // If we're looking for eq 0 instead of ne 0, we need to invert the
- // condition.
- bool Invert = CCVal == ISD::SETEQ;
- CCVal = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
- if (Invert)
- CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType());
-
- SDLoc DL(N);
- RHS = LHS.getOperand(1);
- LHS = LHS.getOperand(0);
- translateSetCCForBranch(DL, LHS, RHS, CCVal, DAG);
-
- SDValue TargetCC = DAG.getCondCode(CCVal);
- return DAG.getNode(RISCVISD::SELECT_CC, DL, N->getValueType(0),
- {LHS, RHS, TargetCC, TrueV, FalseV});
- }
-
- // Fold (select_cc (xor X, Y), 0, eq/ne, trueV, falseV) ->
- // (select_cc X, Y, eq/ne, trueV, falseV)
- if (LHS.getOpcode() == ISD::XOR && isNullConstant(RHS))
- return DAG.getNode(RISCVISD::SELECT_CC, SDLoc(N), N->getValueType(0),
- {LHS.getOperand(0), LHS.getOperand(1),
- N->getOperand(2), TrueV, FalseV});
- // (select_cc X, 1, setne, trueV, falseV) ->
- // (select_cc X, 0, seteq, trueV, falseV) if we can prove X is 0/1.
- // This can occur when legalizing some floating point comparisons.
- APInt Mask = APInt::getBitsSetFrom(LHS.getValueSizeInBits(), 1);
- if (isOneConstant(RHS) && DAG.MaskedValueIsZero(LHS, Mask)) {
- SDLoc DL(N);
- CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType());
- SDValue TargetCC = DAG.getCondCode(CCVal);
- RHS = DAG.getConstant(0, DL, LHS.getValueType());
+ if (combine_CC(LHS, RHS, CC, DL, DAG, Subtarget))
return DAG.getNode(RISCVISD::SELECT_CC, DL, N->getValueType(0),
- {LHS, RHS, TargetCC, TrueV, FalseV});
- }
+ {LHS, RHS, CC, TrueV, FalseV});
- break;
+ return SDValue();
}
case RISCVISD::BR_CC: {
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
- ISD::CondCode CCVal = cast<CondCodeSDNode>(N->getOperand(3))->get();
- if (!ISD::isIntEqualitySetCC(CCVal))
- break;
-
- // Fold (br_cc (setlt X, Y), 0, ne, dest) ->
- // (br_cc X, Y, lt, dest)
- // Sometimes the setcc is introduced after br_cc has been formed.
- if (LHS.getOpcode() == ISD::SETCC && isNullConstant(RHS) &&
- LHS.getOperand(0).getValueType() == Subtarget.getXLenVT()) {
- // If we're looking for eq 0 instead of ne 0, we need to invert the
- // condition.
- bool Invert = CCVal == ISD::SETEQ;
- CCVal = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
- if (Invert)
- CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType());
-
- SDLoc DL(N);
- RHS = LHS.getOperand(1);
- LHS = LHS.getOperand(0);
- translateSetCCForBranch(DL, LHS, RHS, CCVal, DAG);
+ SDValue CC = N->getOperand(3);
+ SDLoc DL(N);
+ if (combine_CC(LHS, RHS, CC, DL, DAG, Subtarget))
return DAG.getNode(RISCVISD::BR_CC, DL, N->getValueType(0),
- N->getOperand(0), LHS, RHS, DAG.getCondCode(CCVal),
- N->getOperand(4));
- }
-
- // Fold (br_cc (xor X, Y), 0, eq/ne, dest) ->
- // (br_cc X, Y, eq/ne, trueV, falseV)
- if (LHS.getOpcode() == ISD::XOR && isNullConstant(RHS))
- return DAG.getNode(RISCVISD::BR_CC, SDLoc(N), N->getValueType(0),
- N->getOperand(0), LHS.getOperand(0), LHS.getOperand(1),
- N->getOperand(3), N->getOperand(4));
-
- // (br_cc X, 1, setne, br_cc) ->
- // (br_cc X, 0, seteq, br_cc) if we can prove X is 0/1.
- // This can occur when legalizing some floating point comparisons.
- APInt Mask = APInt::getBitsSetFrom(LHS.getValueSizeInBits(), 1);
- if (isOneConstant(RHS) && DAG.MaskedValueIsZero(LHS, Mask)) {
- SDLoc DL(N);
- CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType());
- SDValue TargetCC = DAG.getCondCode(CCVal);
- RHS = DAG.getConstant(0, DL, LHS.getValueType());
- return DAG.getNode(RISCVISD::BR_CC, DL, N->getValueType(0),
- N->getOperand(0), LHS, RHS, TargetCC,
- N->getOperand(4));
- }
- break;
+ N->getOperand(0), LHS, RHS, CC, N->getOperand(4));
+
+ return SDValue();
}
case ISD::BITREVERSE:
return performBITREVERSECombine(N, DAG, Subtarget);
@@ -9299,6 +9346,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
bool RISCVTargetLowering::isDesirableToCommuteWithShift(
const SDNode *N, CombineLevel Level) const {
+ assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
+ N->getOpcode() == ISD::SRL) &&
+ "Expected shift op");
+
// The following folds are only desirable if `(OP _, c1 << c2)` can be
// materialised in fewer instructions than `(OP _, c1)`:
//
@@ -9357,7 +9408,8 @@ bool RISCVTargetLowering::targetShrinkDemandedConstant(
return false;
// Only handle AND for now.
- if (Op.getOpcode() != ISD::AND)
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode != ISD::AND && Opcode != ISD::OR && Opcode != ISD::XOR)
return false;
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
@@ -9376,12 +9428,13 @@ bool RISCVTargetLowering::targetShrinkDemandedConstant(
auto IsLegalMask = [ShrunkMask, ExpandedMask](const APInt &Mask) -> bool {
return ShrunkMask.isSubsetOf(Mask) && Mask.isSubsetOf(ExpandedMask);
};
- auto UseMask = [Mask, Op, VT, &TLO](const APInt &NewMask) -> bool {
+ auto UseMask = [Mask, Op, &TLO](const APInt &NewMask) -> bool {
if (NewMask == Mask)
return true;
SDLoc DL(Op);
- SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
- SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
+ SDValue NewC = TLO.DAG.getConstant(NewMask, DL, Op.getValueType());
+ SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), DL, Op.getValueType(),
+ Op.getOperand(0), NewC);
return TLO.CombineTo(Op, NewOp);
};
@@ -9390,18 +9443,21 @@ bool RISCVTargetLowering::targetShrinkDemandedConstant(
if (ShrunkMask.isSignedIntN(12))
return false;
- // Preserve (and X, 0xffff) when zext.h is supported.
- if (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbp()) {
- APInt NewMask = APInt(Mask.getBitWidth(), 0xffff);
- if (IsLegalMask(NewMask))
- return UseMask(NewMask);
- }
+ // And has a few special cases for zext.
+ if (Opcode == ISD::AND) {
+ // Preserve (and X, 0xffff) when zext.h is supported.
+ if (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbp()) {
+ APInt NewMask = APInt(Mask.getBitWidth(), 0xffff);
+ if (IsLegalMask(NewMask))
+ return UseMask(NewMask);
+ }
- // Try to preserve (and X, 0xffffffff), the (zext_inreg X, i32) pattern.
- if (VT == MVT::i64) {
- APInt NewMask = APInt(64, 0xffffffff);
- if (IsLegalMask(NewMask))
- return UseMask(NewMask);
+ // Try to preserve (and X, 0xffffffff), the (zext_inreg X, i32) pattern.
+ if (VT == MVT::i64) {
+ APInt NewMask = APInt(64, 0xffffffff);
+ if (IsLegalMask(NewMask))
+ return UseMask(NewMask);
+ }
}
// For the remaining optimizations, we need to be able to make a negative
@@ -9414,10 +9470,11 @@ bool RISCVTargetLowering::targetShrinkDemandedConstant(
// Try to make a 12 bit negative immediate. If that fails try to make a 32
// bit negative immediate unless the shrunk immediate already fits in 32 bits.
+ // If we can't create a simm12, we shouldn't change opaque constants.
APInt NewMask = ShrunkMask;
if (MinSignedBits <= 12)
NewMask.setBitsFrom(11);
- else if (MinSignedBits <= 32 && !ShrunkMask.isSignedIntN(32))
+ else if (!C->isOpaque() && MinSignedBits <= 32 && !ShrunkMask.isSignedIntN(32))
NewMask.setBitsFrom(31);
else
return false;
@@ -10015,15 +10072,15 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
LastSelectPseudo = &*SequenceMBBI;
SequenceMBBI->collectDebugValues(SelectDebugValues);
SelectDests.insert(SequenceMBBI->getOperand(0).getReg());
- } else {
- if (SequenceMBBI->hasUnmodeledSideEffects() ||
- SequenceMBBI->mayLoadOrStore())
- break;
- if (llvm::any_of(SequenceMBBI->operands(), [&](MachineOperand &MO) {
- return MO.isReg() && MO.isUse() && SelectDests.count(MO.getReg());
- }))
- break;
+ continue;
}
+ if (SequenceMBBI->hasUnmodeledSideEffects() ||
+ SequenceMBBI->mayLoadOrStore())
+ break;
+ if (llvm::any_of(SequenceMBBI->operands(), [&](MachineOperand &MO) {
+ return MO.isReg() && MO.isUse() && SelectDests.count(MO.getReg());
+ }))
+ break;
}
const RISCVInstrInfo &TII = *Subtarget.getInstrInfo();
@@ -12159,7 +12216,8 @@ bool RISCVTargetLowering::isVScaleKnownToBeAPowerOfTwo() const {
// FIXME: This doesn't work for zve32, but that's already broken
// elsewhere for the same reason.
assert(Subtarget.getRealMinVLen() >= 64 && "zve32* unsupported");
- assert(RISCV::RVVBitsPerBlock == 64 && "RVVBitsPerBlock changed, audit needed");
+ static_assert(RISCV::RVVBitsPerBlock == 64,
+ "RVVBitsPerBlock changed, audit needed");
return true;
}
@@ -12214,10 +12272,12 @@ bool RISCVTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned)
bool RISCVTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
SDValue C) const {
// Check integral scalar types.
+ const bool HasExtMOrZmmul =
+ Subtarget.hasStdExtM() || Subtarget.hasStdExtZmmul();
if (VT.isScalarInteger()) {
// Omit the optimization if the sub target has the M extension and the data
// size exceeds XLen.
- if (Subtarget.hasStdExtM() && VT.getSizeInBits() > Subtarget.getXLen())
+ if (HasExtMOrZmmul && VT.getSizeInBits() > Subtarget.getXLen())
return false;
if (auto *ConstNode = dyn_cast<ConstantSDNode>(C.getNode())) {
// Break the MUL to a SLLI and an ADD/SUB.
@@ -12232,7 +12292,7 @@ bool RISCVTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
return true;
// Omit the following optimization if the sub target has the M extension
// and the data size >= XLen.
- if (Subtarget.hasStdExtM() && VT.getSizeInBits() >= Subtarget.getXLen())
+ if (HasExtMOrZmmul && VT.getSizeInBits() >= Subtarget.getXLen())
return false;
// Break the MUL to two SLLI instructions and an ADD/SUB, if Imm needs
// a pair of LUI/ADDI.
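
The decomposeMulByConstant hunks above only widen the M-extension checks to also accept Zmmul; the decomposition they gate has the familiar shift-plus-add/sub shape, sketched here for the 2^K +/- 1 case:

#include <cassert>
#include <cstdint>

uint64_t mulPow2Plus1(uint64_t X, unsigned K)  { return (X << K) + X; } // X * (2^K + 1)
uint64_t mulPow2Minus1(uint64_t X, unsigned K) { return (X << K) - X; } // X * (2^K - 1)

int main() {
  assert(mulPow2Plus1(7, 4) == 7 * 17);
  assert(mulPow2Minus1(7, 4) == 7 * 15);
}
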
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 685604ad9a59..75a79895330f 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -637,6 +637,64 @@ void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
}
}
+MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl(
+ MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
+ VirtRegMap *VRM) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // The below optimizations narrow the load so they are only valid for little
+ // endian.
+ // TODO: Support big endian by adding an offset into the frame object?
+ if (MF.getDataLayout().isBigEndian())
+ return nullptr;
+
+ // Fold load from stack followed by sext.w into lw.
+ // TODO: Fold with sext.b, sext.h, zext.b, zext.h, zext.w?
+ if (Ops.size() != 1 || Ops[0] != 1)
+ return nullptr;
+
+ unsigned LoadOpc;
+ switch (MI.getOpcode()) {
+ default:
+ if (RISCV::isSEXT_W(MI)) {
+ LoadOpc = RISCV::LW;
+ break;
+ }
+ if (RISCV::isZEXT_W(MI)) {
+ LoadOpc = RISCV::LWU;
+ break;
+ }
+ if (RISCV::isZEXT_B(MI)) {
+ LoadOpc = RISCV::LBU;
+ break;
+ }
+ return nullptr;
+ case RISCV::SEXT_H:
+ LoadOpc = RISCV::LH;
+ break;
+ case RISCV::SEXT_B:
+ LoadOpc = RISCV::LB;
+ break;
+ case RISCV::ZEXT_H_RV32:
+ case RISCV::ZEXT_H_RV64:
+ LoadOpc = RISCV::LHU;
+ break;
+ }
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIndex),
+ MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex),
+ MFI.getObjectAlign(FrameIndex));
+
+ Register DstReg = MI.getOperand(0).getReg();
+ return BuildMI(*MI.getParent(), InsertPt, MI.getDebugLoc(), get(LoadOpc),
+ DstReg)
+ .addFrameIndex(FrameIndex)
+ .addImm(0)
+ .addMemOperand(MMO);
+}
+
void RISCVInstrInfo::movImm(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, Register DstReg, uint64_t Val,
@@ -1799,17 +1857,30 @@ Register RISCVInstrInfo::getVLENFactoredAmount(MachineFunction &MF,
.addReg(VL, RegState::Kill)
.addImm(ShiftAmount)
.setMIFlag(Flag);
- } else if ((NumOfVReg == 3 || NumOfVReg == 5 || NumOfVReg == 9) &&
- STI.hasStdExtZba()) {
- // We can use Zba SHXADD instructions for multiply in some cases.
- // TODO: Generalize to SHXADD+SLLI.
+ } else if (STI.hasStdExtZba() &&
+ ((NumOfVReg % 3 == 0 && isPowerOf2_64(NumOfVReg / 3)) ||
+ (NumOfVReg % 5 == 0 && isPowerOf2_64(NumOfVReg / 5)) ||
+ (NumOfVReg % 9 == 0 && isPowerOf2_64(NumOfVReg / 9)))) {
+ // We can use Zba SHXADD+SLLI instructions for multiply in some cases.
unsigned Opc;
- switch (NumOfVReg) {
- default: llvm_unreachable("Unexpected number of vregs");
- case 3: Opc = RISCV::SH1ADD; break;
- case 5: Opc = RISCV::SH2ADD; break;
- case 9: Opc = RISCV::SH3ADD; break;
+ uint32_t ShiftAmount;
+ if (NumOfVReg % 9 == 0) {
+ Opc = RISCV::SH3ADD;
+ ShiftAmount = Log2_64(NumOfVReg / 9);
+ } else if (NumOfVReg % 5 == 0) {
+ Opc = RISCV::SH2ADD;
+ ShiftAmount = Log2_64(NumOfVReg / 5);
+ } else if (NumOfVReg % 3 == 0) {
+ Opc = RISCV::SH1ADD;
+ ShiftAmount = Log2_64(NumOfVReg / 3);
+ } else {
+ llvm_unreachable("Unexpected number of vregs");
}
+ if (ShiftAmount)
+ BuildMI(MBB, II, DL, get(RISCV::SLLI), VL)
+ .addReg(VL, RegState::Kill)
+ .addImm(ShiftAmount)
+ .setMIFlag(Flag);
BuildMI(MBB, II, DL, get(Opc), VL)
.addReg(VL, RegState::Kill)
.addReg(VL)
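As a worked example of the SHXADD+SLLI path, NumOfVReg == 24 == 3 * 2^3 becomes an SLLI by 3 followed by SH1ADD; in plain C++ arithmetic (illustrative only):

#include <cstdint>
uint64_t scaleBy24(uint64_t VL) {
  VL <<= 3;              // SLLI VL, VL, 3    -> VL * 8
  return (VL << 1) + VL; // SH1ADD VL, VL, VL -> VL*16 + VL*8 = original VL * 24
}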
@@ -1839,10 +1910,11 @@ Register RISCVInstrInfo::getVLENFactoredAmount(MachineFunction &MF,
} else {
Register N = MRI.createVirtualRegister(&RISCV::GPRRegClass);
movImm(MBB, II, DL, N, NumOfVReg, Flag);
- if (!STI.hasStdExtM())
+ if (!STI.hasStdExtM() && !STI.hasStdExtZmmul())
MF.getFunction().getContext().diagnose(DiagnosticInfoUnsupported{
MF.getFunction(),
- "M-extension must be enabled to calculate the vscaled size/offset."});
+ "M- or Zmmul-extension must be enabled to calculate the vscaled size/"
+ "offset."});
BuildMI(MBB, II, DL, get(RISCV::MUL), VL)
.addReg(VL, RegState::Kill)
.addReg(N, RegState::Kill)
@@ -1852,6 +1924,24 @@ Register RISCVInstrInfo::getVLENFactoredAmount(MachineFunction &MF,
return VL;
}
+// Returns true if this is the sext.w pattern, addiw rd, rs1, 0.
+bool RISCV::isSEXT_W(const MachineInstr &MI) {
+ return MI.getOpcode() == RISCV::ADDIW && MI.getOperand(1).isReg() &&
+ MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0;
+}
+
+// Returns true if this is the zext.w pattern, add.uw rd, rs1, x0.
+bool RISCV::isZEXT_W(const MachineInstr &MI) {
+ return MI.getOpcode() == RISCV::ADD_UW && MI.getOperand(1).isReg() &&
+ MI.getOperand(2).isReg() && MI.getOperand(2).getReg() == RISCV::X0;
+}
+
+// Returns true if this is the zext.b pattern, andi rd, rs1, 255.
+bool RISCV::isZEXT_B(const MachineInstr &MI) {
+ return MI.getOpcode() == RISCV::ANDI && MI.getOperand(1).isReg() &&
+ MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 255;
+}
+
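For reference, the three recognized patterns expressed as plain integer operations (illustrative helpers mirroring RV64 register semantics):

#include <cstdint>
int64_t sext_w(int64_t X) { return static_cast<int32_t>(X); }    // addiw rd, rs1, 0
uint64_t zext_w(uint64_t X) { return static_cast<uint32_t>(X); } // add.uw rd, rs1, x0
uint64_t zext_b(uint64_t X) { return X & 0xff; }                 // andi rd, rs1, 255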
static bool isRVVWholeLoadStore(unsigned Opcode) {
switch (Opcode) {
default:
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 5368437618bd..4aa9ded5b3a2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -69,6 +69,14 @@ public:
int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
+ using TargetInstrInfo::foldMemoryOperandImpl;
+ MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+ ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt,
+ int FrameIndex,
+ LiveIntervals *LIS = nullptr,
+ VirtRegMap *VRM = nullptr) const override;
+
// Materializes the given integer Val into DstReg.
void movImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, Register DstReg, uint64_t Val,
@@ -183,6 +191,11 @@ protected:
namespace RISCV {
+// Returns true if MI is the sext.w (addiw rd, rs1, 0), zext.w
+// (add.uw rd, rs1, x0), or zext.b (andi rd, rs1, 255) pattern, respectively.
+bool isSEXT_W(const MachineInstr &MI);
+bool isZEXT_W(const MachineInstr &MI);
+bool isZEXT_B(const MachineInstr &MI);
+
// Returns true if the given MI is an RVV instruction opcode for which we may
// expect to see a FrameIndex operand.
bool isRVVSpill(const MachineInstr &MI);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 06a90438838e..78fd09fbf387 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1278,6 +1278,13 @@ def : Pat<(setgt GPR:$rs1, simm12_minus1_nonzero:$imm),
def : Pat<(setugt GPR:$rs1, simm12_minus1_nonzero:$imm),
(XORI (SLTIU GPR:$rs1, (ImmPlus1 simm12_minus1_nonzero:$imm)), 1)>;
+// If negating a pattern that requires an XORI above, we can fold the XORI with
+// the NEG. The XORI is equivalent to 1-X and negating gives X-1.
+def : Pat<(ineg (setuge GPR:$rs1, GPR:$rs2)), (ADDI (SLTU GPR:$rs1, GPR:$rs2), -1)>;
+def : Pat<(ineg (setule GPR:$rs1, GPR:$rs2)), (ADDI (SLTU GPR:$rs2, GPR:$rs1), -1)>;
+def : Pat<(ineg (setge GPR:$rs1, GPR:$rs2)), (ADDI (SLT GPR:$rs1, GPR:$rs2), -1)>;
+def : Pat<(ineg (setle GPR:$rs1, GPR:$rs2)), (ADDI (SLT GPR:$rs2, GPR:$rs1), -1)>;
+
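A small sketch of the identity behind the patterns added above: the compare produces X in {0, 1}, and -(X ^ 1) == X - 1, so the XORI+NEG pair collapses into a single ADDI of -1 (plain C++, illustrative only):

#include <cstdint>
int64_t negSetUGE(uint64_t A, uint64_t B) {
  uint64_t X = (A < B) ? 1 : 0;       // SLTU rd, A, B
  return static_cast<int64_t>(X) - 1; // ADDI rd, rd, -1  ==  -(X ^ 1)
}
// negSetUGE(A, B) == -(A >= B ? 1 : 0) for all A, B.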
def IntCCtoRISCVCC : SDNodeXForm<riscv_selectcc, [{
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
RISCVCC::CondCode BrCC = getRISCVCCForIntCC(CC);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
index 72ba8460116f..662604b138d2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
@@ -24,7 +24,7 @@ def riscv_remuw : SDNode<"RISCVISD::REMUW", SDT_RISCVIntBinOpW>;
// Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtM] in {
+let Predicates = [HasStdExtMOrZmmul] in {
def MUL : ALU_rr<0b0000001, 0b000, "mul", /*Commutable*/1>,
Sched<[WriteIMul, ReadIMul, ReadIMul]>;
def MULH : ALU_rr<0b0000001, 0b001, "mulh", /*Commutable*/1>,
@@ -33,6 +33,9 @@ def MULHSU : ALU_rr<0b0000001, 0b010, "mulhsu">,
Sched<[WriteIMul, ReadIMul, ReadIMul]>;
def MULHU : ALU_rr<0b0000001, 0b011, "mulhu", /*Commutable*/1>,
Sched<[WriteIMul, ReadIMul, ReadIMul]>;
+} // Predicates = [HasStdExtMOrZmmul]
+
+let Predicates = [HasStdExtM] in {
def DIV : ALU_rr<0b0000001, 0b100, "div">,
Sched<[WriteIDiv, ReadIDiv, ReadIDiv]>;
def DIVU : ALU_rr<0b0000001, 0b101, "divu">,
@@ -43,9 +46,12 @@ def REMU : ALU_rr<0b0000001, 0b111, "remu">,
Sched<[WriteIDiv, ReadIDiv, ReadIDiv]>;
} // Predicates = [HasStdExtM]
-let Predicates = [HasStdExtM, IsRV64] in {
+let Predicates = [HasStdExtMOrZmmul, IsRV64] in {
def MULW : ALUW_rr<0b0000001, 0b000, "mulw", /*Commutable*/1>,
Sched<[WriteIMul32, ReadIMul32, ReadIMul32]>;
+} // Predicates = [HasStdExtMOrZmmul, IsRV64]
+
+let Predicates = [HasStdExtM, IsRV64] in {
def DIVW : ALUW_rr<0b0000001, 0b100, "divw">,
Sched<[WriteIDiv32, ReadIDiv32, ReadIDiv32]>;
def DIVUW : ALUW_rr<0b0000001, 0b101, "divuw">,
@@ -60,21 +66,25 @@ def REMUW : ALUW_rr<0b0000001, 0b111, "remuw">,
// Pseudo-instructions and codegen patterns
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtM] in {
+let Predicates = [HasStdExtMOrZmmul] in {
def : PatGprGpr<mul, MUL>;
def : PatGprGpr<mulhs, MULH>;
def : PatGprGpr<mulhu, MULHU>;
def : PatGprGpr<riscv_mulhsu, MULHSU>;
+} // Predicates = [HasStdExtMOrZmmul]
+
+let Predicates = [HasStdExtM] in {
def : PatGprGpr<sdiv, DIV>;
def : PatGprGpr<udiv, DIVU>;
def : PatGprGpr<srem, REM>;
def : PatGprGpr<urem, REMU>;
} // Predicates = [HasStdExtM]
-let Predicates = [HasStdExtM, IsRV64] in {
// Select W instructions if only the lower 32-bits of the result are used.
+let Predicates = [HasStdExtMOrZmmul, IsRV64] in
def : PatGprGpr<binop_allwusers<mul>, MULW>;
+let Predicates = [HasStdExtM, IsRV64] in {
def : PatGprGpr<riscv_divw, DIVW>;
def : PatGprGpr<riscv_divuw, DIVUW>;
def : PatGprGpr<riscv_remuw, REMUW>;
@@ -96,11 +106,11 @@ def : Pat<(srem (sexti32 (i64 GPR:$rs1)), (sexti32 (i64 GPR:$rs2))),
(REMW GPR:$rs1, GPR:$rs2)>;
} // Predicates = [HasStdExtM, IsRV64]
-let Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba] in {
+let Predicates = [HasStdExtMOrZmmul, IsRV64, NotHasStdExtZba] in {
// Special case for calculating the full 64-bit product of a 32x32 unsigned
// multiply where the inputs aren't known to be zero extended. We can shift the
// inputs left by 32 and use a MULHU. This saves two SRLIs needed to finish
// zeroing the upper 32 bits.
def : Pat<(i64 (mul (and GPR:$rs1, 0xffffffff), (and GPR:$rs2, 0xffffffff))),
(MULHU (SLLI GPR:$rs1, 32), (SLLI GPR:$rs2, 32))>;
-} // Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba]
+} // Predicates = [HasStdExtMOrZmmul, IsRV64, NotHasStdExtZba]
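A sketch of the arithmetic identity this pattern uses, written with the compiler's 128-bit integer extension purely for illustration:

#include <cstdint>
uint64_t fullMul32(uint64_t A, uint64_t B) {
  // Shifting left by 32 discards the upper halves, so the 128-bit product
  // (A << 32) * (B << 32) equals (A & 0xffffffff) * (B & 0xffffffff) * 2^64;
  // taking the high 64 bits (what MULHU returns) is the exact 32x32->64 product.
  return static_cast<uint64_t>(
      static_cast<unsigned __int128>(A << 32) * (B << 32) >> 64);
}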
diff --git a/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp b/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp
index dadf8f81a2c0..920729e9ebbf 100644
--- a/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp
@@ -443,8 +443,7 @@ bool RISCVSExtWRemoval::runOnMachineFunction(MachineFunction &MF) {
MachineInstr *MI = &*I++;
// We're looking for the sext.w pattern ADDIW rd, rs1, 0.
- if (MI->getOpcode() != RISCV::ADDIW || !MI->getOperand(2).isImm() ||
- MI->getOperand(2).getImm() != 0 || !MI->getOperand(1).isReg())
+ if (!RISCV::isSEXT_W(*MI))
continue;
// Input should be a virtual register.
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
index 7589b44b81d3..0446edefa979 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -202,11 +202,9 @@ bool RISCVSubtarget::useRVVForFixedLengthVectors() const {
}
bool RISCVSubtarget::enableSubRegLiveness() const {
- if (EnableSubRegLiveness.getNumOccurrences())
- return EnableSubRegLiveness;
- // Enable subregister liveness for RVV to better handle LMUL>1 and segment
- // load/store.
- return hasVInstructions();
+ // FIXME: Enable subregister liveness by default for RVV to better handle
+ // LMUL>1 and segment load/store.
+ return EnableSubRegLiveness;
}
void RISCVSubtarget::getPostRAMutations(
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 831f7fadaa62..6eb949fa551c 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -89,6 +89,7 @@ private:
bool HasStdExtZicbom = false;
bool HasStdExtZicboz = false;
bool HasStdExtZicbop = false;
+ bool HasStdExtZmmul = false;
bool HasRV64 = false;
bool IsRV32E = false;
bool EnableLinkerRelax = false;
@@ -184,6 +185,7 @@ public:
bool hasStdExtZicbom() const { return HasStdExtZicbom; }
bool hasStdExtZicboz() const { return HasStdExtZicboz; }
bool hasStdExtZicbop() const { return HasStdExtZicbop; }
+ bool hasStdExtZmmul() const { return HasStdExtZmmul; }
bool is64Bit() const { return HasRV64; }
bool isRV32E() const { return IsRV32E; }
bool enableLinkerRelax() const { return EnableLinkerRelax; }
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index b2707b753e87..50fcb00e6c63 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -49,6 +49,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
initializeGlobalISel(*PR);
initializeRISCVMakeCompressibleOptPass(*PR);
initializeRISCVGatherScatterLoweringPass(*PR);
+ initializeRISCVCodeGenPreparePass(*PR);
initializeRISCVMergeBaseOffsetOptPass(*PR);
initializeRISCVSExtWRemovalPass(*PR);
initializeRISCVExpandPseudoPass(*PR);
@@ -187,7 +188,11 @@ TargetPassConfig *RISCVTargetMachine::createPassConfig(PassManagerBase &PM) {
void RISCVPassConfig::addIRPasses() {
addPass(createAtomicExpandPass());
- addPass(createRISCVGatherScatterLoweringPass());
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createRISCVGatherScatterLoweringPass());
+
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createRISCVCodeGenPreparePass());
TargetPassConfig::addIRPasses();
}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 29d3c5e491de..f9cd5ffb512b 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -65,7 +65,7 @@ InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
return TTI::TCC_Free;
// zext.w
- if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZbb())
+ if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
return TTI::TCC_Free;
LLVM_FALLTHROUGH;
case Instruction::Add:
@@ -198,6 +198,9 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
// vid.v v9
// vrsub.vx v10, v9, a0
// vrgather.vv v9, v8, v10
+ if (Tp->getElementType()->isIntegerTy(1))
+ // Mask operations additionally require an extend and a truncate.
+ return LT.first * 9;
return LT.first * 6;
}
}
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp
index 1a3e35a5f901..220fd76305aa 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp
@@ -1068,5 +1068,15 @@ StringRef getKernelProfilingInfoName(KernelProfilingInfo e) {
}
llvm_unreachable("Unexpected operand");
}
+
+std::string getExtInstSetName(InstructionSet e) {
+ switch (e) {
+ CASE(InstructionSet, OpenCL_std)
+ CASE(InstructionSet, GLSL_std_450)
+ CASE(InstructionSet, SPV_AMD_shader_trinary_minmax)
+ break;
+ }
+ llvm_unreachable("Unexpected operand");
+}
} // namespace SPIRV
} // namespace llvm
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h
index 2aa9f076c78e..9482723993a2 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h
@@ -706,6 +706,19 @@ enum class KernelProfilingInfo : uint32_t {
CmdExecTime = 0x1,
};
StringRef getKernelProfilingInfoName(KernelProfilingInfo e);
+
+enum class InstructionSet : uint32_t {
+ OpenCL_std = 0,
+ GLSL_std_450 = 1,
+ SPV_AMD_shader_trinary_minmax = 2,
+};
+std::string getExtInstSetName(InstructionSet e);
+
+// TODO: implement other mnemonics.
+enum class Opcode : uint32_t {
+ InBoundsPtrAccessChain = 70,
+ PtrCastToGeneric = 121,
+};
} // namespace SPIRV
} // namespace llvm
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
index 3105baa02c90..d60e61f36270 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
@@ -59,7 +59,7 @@ void SPIRVInstPrinter::printOpConstantVarOps(const MCInst *MI,
}
void SPIRVInstPrinter::recordOpExtInstImport(const MCInst *MI) {
- llvm_unreachable("Unimplemented recordOpExtInstImport");
+ // TODO: insert {Reg, Set} into ExtInstSetIDs map.
}
void SPIRVInstPrinter::printInst(const MCInst *MI, uint64_t Address,
@@ -176,7 +176,18 @@ void SPIRVInstPrinter::printInst(const MCInst *MI, uint64_t Address,
}
void SPIRVInstPrinter::printOpExtInst(const MCInst *MI, raw_ostream &O) {
- llvm_unreachable("Unimplemented printOpExtInst");
+ // The fixed operands have already been printed, so we just need to decide
+ // what kind of ExtInst operands to print based on the instruction set and
+ // the number of operands.
+ MCInstrDesc MCDesc = MII.get(MI->getOpcode());
+ unsigned NumFixedOps = MCDesc.getNumOperands();
+ const auto NumOps = MI->getNumOperands();
+ if (NumOps == NumFixedOps)
+ return;
+
+ O << ' ';
+
+ // TODO: implement special printing for OpenCLExtInst::vstor*.
+ printRemainingVariableOps(MI, NumFixedOps, O, true);
}
void SPIRVInstPrinter::printOpDecorate(const MCInst *MI, raw_ostream &O) {
diff --git a/llvm/lib/Target/SPIRV/SPIRV.h b/llvm/lib/Target/SPIRV/SPIRV.h
index 8da54a5d6e61..5a7f2e51afb8 100644
--- a/llvm/lib/Target/SPIRV/SPIRV.h
+++ b/llvm/lib/Target/SPIRV/SPIRV.h
@@ -19,6 +19,7 @@ class SPIRVSubtarget;
class InstructionSelector;
class RegisterBankInfo;
+ModulePass *createSPIRVPrepareFunctionsPass();
FunctionPass *createSPIRVPreLegalizerPass();
FunctionPass *createSPIRVEmitIntrinsicsPass(SPIRVTargetMachine *TM);
InstructionSelector *
diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
index 605bf949187f..6d60bd5e3c97 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
@@ -21,6 +21,7 @@
#include "SPIRVUtils.h"
#include "TargetInfo/SPIRVTargetInfo.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -58,9 +59,14 @@ public:
void outputModuleSection(SPIRV::ModuleSectionType MSType);
void outputEntryPoints();
void outputDebugSourceAndStrings(const Module &M);
+ void outputOpExtInstImports(const Module &M);
void outputOpMemoryModel();
void outputOpFunctionEnd();
void outputExtFuncDecls();
+ void outputExecutionModeFromMDNode(Register Reg, MDNode *Node,
+ SPIRV::ExecutionMode EM);
+ void outputExecutionMode(const Module &M);
+ void outputAnnotations(const Module &M);
void outputModuleSections();
void emitInstruction(const MachineInstr *MI) override;
@@ -127,6 +133,8 @@ void SPIRVAsmPrinter::emitFunctionBodyEnd() {
}
void SPIRVAsmPrinter::emitOpLabel(const MachineBasicBlock &MBB) {
+ if (MAI->MBBsToSkip.contains(&MBB))
+ return;
MCInst LabelInst;
LabelInst.setOpcode(SPIRV::OpLabel);
LabelInst.addOperand(MCOperand::createReg(MAI->getOrCreateMBBRegister(MBB)));
@@ -237,6 +245,13 @@ void SPIRVAsmPrinter::outputModuleSection(SPIRV::ModuleSectionType MSType) {
}
void SPIRVAsmPrinter::outputDebugSourceAndStrings(const Module &M) {
+ // Output OpSourceExtensions.
+ for (auto &Str : MAI->SrcExt) {
+ MCInst Inst;
+ Inst.setOpcode(SPIRV::OpSourceExtension);
+ addStringImm(Str.first(), Inst);
+ outputMCInst(Inst);
+ }
// Output OpSource.
MCInst Inst;
Inst.setOpcode(SPIRV::OpSource);
@@ -246,6 +261,19 @@ void SPIRVAsmPrinter::outputDebugSourceAndStrings(const Module &M) {
outputMCInst(Inst);
}
+void SPIRVAsmPrinter::outputOpExtInstImports(const Module &M) {
+ for (auto &CU : MAI->ExtInstSetMap) {
+ unsigned Set = CU.first;
+ Register Reg = CU.second;
+ MCInst Inst;
+ Inst.setOpcode(SPIRV::OpExtInstImport);
+ Inst.addOperand(MCOperand::createReg(Reg));
+ addStringImm(getExtInstSetName(static_cast<SPIRV::InstructionSet>(Set)),
+ Inst);
+ outputMCInst(Inst);
+ }
+}
+
void SPIRVAsmPrinter::outputOpMemoryModel() {
MCInst Inst;
Inst.setOpcode(SPIRV::OpMemoryModel);
@@ -301,6 +329,135 @@ void SPIRVAsmPrinter::outputExtFuncDecls() {
}
}
+// Encode LLVM type by SPIR-V execution mode VecTypeHint.
+static unsigned encodeVecTypeHint(Type *Ty) {
+ if (Ty->isHalfTy())
+ return 4;
+ if (Ty->isFloatTy())
+ return 5;
+ if (Ty->isDoubleTy())
+ return 6;
+ if (IntegerType *IntTy = dyn_cast<IntegerType>(Ty)) {
+ switch (IntTy->getIntegerBitWidth()) {
+ case 8:
+ return 0;
+ case 16:
+ return 1;
+ case 32:
+ return 2;
+ case 64:
+ return 3;
+ default:
+ llvm_unreachable("invalid integer type");
+ }
+ }
+ if (FixedVectorType *VecTy = dyn_cast<FixedVectorType>(Ty)) {
+ Type *EleTy = VecTy->getElementType();
+ unsigned Size = VecTy->getNumElements();
+ return Size << 16 | encodeVecTypeHint(EleTy);
+ }
+ llvm_unreachable("invalid type");
+}
+
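A worked example of the packing above, assuming the usual vec_type_hint operand layout (vector width in the high 16 bits, scalar type code in the low 16):

// <4 x float>: element code 5 (float) in the low bits, width 4 above it.
static_assert((4u << 16 | 5u) == 0x40005u,
              "VecTypeHint encoding for <4 x float>");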
+static void addOpsFromMDNode(MDNode *MDN, MCInst &Inst,
+ SPIRV::ModuleAnalysisInfo *MAI) {
+ for (const MDOperand &MDOp : MDN->operands()) {
+ if (auto *CMeta = dyn_cast<ConstantAsMetadata>(MDOp)) {
+ Constant *C = CMeta->getValue();
+ if (ConstantInt *Const = dyn_cast<ConstantInt>(C)) {
+ Inst.addOperand(MCOperand::createImm(Const->getZExtValue()));
+ } else if (auto *CE = dyn_cast<Function>(C)) {
+ Register FuncReg = MAI->getFuncReg(CE->getName().str());
+ assert(FuncReg.isValid());
+ Inst.addOperand(MCOperand::createReg(FuncReg));
+ }
+ }
+ }
+}
+
+void SPIRVAsmPrinter::outputExecutionModeFromMDNode(Register Reg, MDNode *Node,
+ SPIRV::ExecutionMode EM) {
+ MCInst Inst;
+ Inst.setOpcode(SPIRV::OpExecutionMode);
+ Inst.addOperand(MCOperand::createReg(Reg));
+ Inst.addOperand(MCOperand::createImm(static_cast<unsigned>(EM)));
+ addOpsFromMDNode(Node, Inst, MAI);
+ outputMCInst(Inst);
+}
+
+void SPIRVAsmPrinter::outputExecutionMode(const Module &M) {
+ NamedMDNode *Node = M.getNamedMetadata("spirv.ExecutionMode");
+ if (Node) {
+ for (unsigned i = 0; i < Node->getNumOperands(); i++) {
+ MCInst Inst;
+ Inst.setOpcode(SPIRV::OpExecutionMode);
+ addOpsFromMDNode(cast<MDNode>(Node->getOperand(i)), Inst, MAI);
+ outputMCInst(Inst);
+ }
+ }
+ for (auto FI = M.begin(), E = M.end(); FI != E; ++FI) {
+ const Function &F = *FI;
+ if (F.isDeclaration())
+ continue;
+ Register FReg = MAI->getFuncReg(F.getGlobalIdentifier());
+ assert(FReg.isValid());
+ if (MDNode *Node = F.getMetadata("reqd_work_group_size"))
+ outputExecutionModeFromMDNode(FReg, Node,
+ SPIRV::ExecutionMode::LocalSize);
+ if (MDNode *Node = F.getMetadata("work_group_size_hint"))
+ outputExecutionModeFromMDNode(FReg, Node,
+ SPIRV::ExecutionMode::LocalSizeHint);
+ if (MDNode *Node = F.getMetadata("intel_reqd_sub_group_size"))
+ outputExecutionModeFromMDNode(FReg, Node,
+ SPIRV::ExecutionMode::SubgroupSize);
+ if (MDNode *Node = F.getMetadata("vec_type_hint")) {
+ MCInst Inst;
+ Inst.setOpcode(SPIRV::OpExecutionMode);
+ Inst.addOperand(MCOperand::createReg(FReg));
+ unsigned EM = static_cast<unsigned>(SPIRV::ExecutionMode::VecTypeHint);
+ Inst.addOperand(MCOperand::createImm(EM));
+ unsigned TypeCode = encodeVecTypeHint(getMDOperandAsType(Node, 0));
+ Inst.addOperand(MCOperand::createImm(TypeCode));
+ outputMCInst(Inst);
+ }
+ }
+}
+
+void SPIRVAsmPrinter::outputAnnotations(const Module &M) {
+ outputModuleSection(SPIRV::MB_Annotations);
+ // Process llvm.global.annotations special global variable.
+ for (auto F = M.global_begin(), E = M.global_end(); F != E; ++F) {
+ if ((*F).getName() != "llvm.global.annotations")
+ continue;
+ const GlobalVariable *V = &(*F);
+ const ConstantArray *CA = cast<ConstantArray>(V->getOperand(0));
+ for (Value *Op : CA->operands()) {
+ ConstantStruct *CS = cast<ConstantStruct>(Op);
+ // The first field of the struct contains a pointer to
+ // the annotated variable.
+ Value *AnnotatedVar = CS->getOperand(0)->stripPointerCasts();
+ if (!isa<Function>(AnnotatedVar))
+ llvm_unreachable("Unsupported value in llvm.global.annotations");
+ Function *Func = cast<Function>(AnnotatedVar);
+ Register Reg = MAI->getFuncReg(Func->getGlobalIdentifier());
+
+ // The second field contains a pointer to a global annotation string.
+ GlobalVariable *GV =
+ cast<GlobalVariable>(CS->getOperand(1)->stripPointerCasts());
+
+ StringRef AnnotationString;
+ getConstantStringInfo(GV, AnnotationString);
+ MCInst Inst;
+ Inst.setOpcode(SPIRV::OpDecorate);
+ Inst.addOperand(MCOperand::createReg(Reg));
+ unsigned Dec = static_cast<unsigned>(SPIRV::Decoration::UserSemantic);
+ Inst.addOperand(MCOperand::createImm(Dec));
+ addStringImm(AnnotationString, Inst);
+ outputMCInst(Inst);
+ }
+ }
+}
+
void SPIRVAsmPrinter::outputModuleSections() {
const Module *M = MMI->getModule();
// Get the global subtarget to output module-level info.
@@ -311,13 +468,14 @@ void SPIRVAsmPrinter::outputModuleSections() {
// Output instructions according to the Logical Layout of a Module:
// TODO: 1,2. All OpCapability instructions, then optional OpExtension
// instructions.
- // TODO: 3. Optional OpExtInstImport instructions.
+ // 3. Optional OpExtInstImport instructions.
+ outputOpExtInstImports(*M);
// 4. The single required OpMemoryModel instruction.
outputOpMemoryModel();
// 5. All entry point declarations, using OpEntryPoint.
outputEntryPoints();
// 6. Execution-mode declarations, using OpExecutionMode or OpExecutionModeId.
- // TODO:
+ outputExecutionMode(*M);
// 7a. Debug: all OpString, OpSourceExtension, OpSource, and
// OpSourceContinued, without forward references.
outputDebugSourceAndStrings(*M);
@@ -326,7 +484,7 @@ void SPIRVAsmPrinter::outputModuleSections() {
// 7c. Debug: all OpModuleProcessed instructions.
outputModuleSection(SPIRV::MB_DebugModuleProcessed);
// 8. All annotation instructions (all decorations).
- outputModuleSection(SPIRV::MB_Annotations);
+ outputAnnotations(*M);
// 9. All type declarations (OpTypeXXX instructions), all constant
// instructions, and all global variable declarations. This section is
// the first section to allow use of: OpLine and OpNoLine debug information;
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index 5b6b82aebf30..e8fedfeffde7 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -24,9 +24,8 @@
using namespace llvm;
SPIRVCallLowering::SPIRVCallLowering(const SPIRVTargetLowering &TLI,
- const SPIRVSubtarget &ST,
SPIRVGlobalRegistry *GR)
- : CallLowering(&TLI), ST(ST), GR(GR) {}
+ : CallLowering(&TLI), GR(GR) {}
bool SPIRVCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
const Value *Val, ArrayRef<Register> VRegs,
@@ -36,11 +35,13 @@ bool SPIRVCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
// TODO: handle the case of multiple registers.
if (VRegs.size() > 1)
return false;
- if (Val)
+ if (Val) {
+ const auto &STI = MIRBuilder.getMF().getSubtarget();
return MIRBuilder.buildInstr(SPIRV::OpReturnValue)
.addUse(VRegs[0])
- .constrainAllUses(MIRBuilder.getTII(), *ST.getRegisterInfo(),
- *ST.getRegBankInfo());
+ .constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(),
+ *STI.getRegBankInfo());
+ }
MIRBuilder.buildInstr(SPIRV::OpReturn);
return true;
}
@@ -63,6 +64,56 @@ static uint32_t getFunctionControl(const Function &F) {
return FuncControl;
}
+static ConstantInt *getConstInt(MDNode *MD, unsigned NumOp) {
+ if (MD->getNumOperands() > NumOp) {
+ auto *CMeta = dyn_cast<ConstantAsMetadata>(MD->getOperand(NumOp));
+ if (CMeta)
+ return dyn_cast<ConstantInt>(CMeta->getValue());
+ }
+ return nullptr;
+}
+
+// This code restores the function argument/return value types for composite
+// cases, because the final types should still be aggregates even though they
+// are lowered to i32 during translation to cope with aggregate flattening etc.
+static FunctionType *getOriginalFunctionType(const Function &F) {
+ auto *NamedMD = F.getParent()->getNamedMetadata("spv.cloned_funcs");
+ if (NamedMD == nullptr)
+ return F.getFunctionType();
+
+ Type *RetTy = F.getFunctionType()->getReturnType();
+ SmallVector<Type *, 4> ArgTypes;
+ for (auto &Arg : F.args())
+ ArgTypes.push_back(Arg.getType());
+
+ auto ThisFuncMDIt =
+ std::find_if(NamedMD->op_begin(), NamedMD->op_end(), [&F](MDNode *N) {
+ return isa<MDString>(N->getOperand(0)) &&
+ cast<MDString>(N->getOperand(0))->getString() == F.getName();
+ });
+ // TODO: probably one function can have numerous type mutations,
+ // so we should support this.
+ if (ThisFuncMDIt != NamedMD->op_end()) {
+ auto *ThisFuncMD = *ThisFuncMDIt;
+ MDNode *MD = dyn_cast<MDNode>(ThisFuncMD->getOperand(1));
+ assert(MD && "MDNode operand is expected");
+ ConstantInt *Const = getConstInt(MD, 0);
+ if (Const) {
+ auto *CMeta = dyn_cast<ConstantAsMetadata>(MD->getOperand(1));
+ assert(CMeta && "ConstantAsMetadata operand is expected");
+ assert(Const->getSExtValue() >= -1);
+ // Currently -1 indicates return value, greater values mean
+ // argument numbers.
+ if (Const->getSExtValue() == -1)
+ RetTy = CMeta->getType();
+ else
+ ArgTypes[Const->getSExtValue()] = CMeta->getType();
+ }
+ }
+
+ return FunctionType::get(RetTy, ArgTypes, F.isVarArg());
+}
+
bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
const Function &F,
ArrayRef<ArrayRef<Register>> VRegs,
@@ -71,7 +122,8 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
GR->setCurrentFunc(MIRBuilder.getMF());
// Assign types and names to all args, and store their types for later.
- SmallVector<Register, 4> ArgTypeVRegs;
+ FunctionType *FTy = getOriginalFunctionType(F);
+ SmallVector<SPIRVType *, 4> ArgTypeVRegs;
if (VRegs.size() > 0) {
unsigned i = 0;
for (const auto &Arg : F.args()) {
@@ -79,9 +131,18 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
// TODO: handle the case of multiple registers.
if (VRegs[i].size() > 1)
return false;
- auto *SpirvTy =
- GR->assignTypeToVReg(Arg.getType(), VRegs[i][0], MIRBuilder);
- ArgTypeVRegs.push_back(GR->getSPIRVTypeID(SpirvTy));
+ Type *ArgTy = FTy->getParamType(i);
+ SPIRV::AccessQualifier AQ = SPIRV::AccessQualifier::ReadWrite;
+ MDNode *Node = F.getMetadata("kernel_arg_access_qual");
+ if (Node && i < Node->getNumOperands()) {
+ StringRef AQString = cast<MDString>(Node->getOperand(i))->getString();
+ if (AQString.compare("read_only") == 0)
+ AQ = SPIRV::AccessQualifier::ReadOnly;
+ else if (AQString.compare("write_only") == 0)
+ AQ = SPIRV::AccessQualifier::WriteOnly;
+ }
+ auto *SpirvTy = GR->assignTypeToVReg(ArgTy, VRegs[i][0], MIRBuilder, AQ);
+ ArgTypeVRegs.push_back(SpirvTy);
if (Arg.hasName())
buildOpName(VRegs[i][0], Arg.getName(), MIRBuilder);
@@ -92,8 +153,10 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
SPIRV::Decoration::MaxByteOffset, {DerefBytes});
}
if (Arg.hasAttribute(Attribute::Alignment)) {
+ auto Alignment = static_cast<unsigned>(
+ Arg.getAttribute(Attribute::Alignment).getValueAsInt());
buildOpDecorate(VRegs[i][0], MIRBuilder, SPIRV::Decoration::Alignment,
- {static_cast<unsigned>(Arg.getParamAlignment())});
+ {Alignment});
}
if (Arg.hasAttribute(Attribute::ReadOnly)) {
auto Attr =
@@ -107,6 +170,38 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
buildOpDecorate(VRegs[i][0], MIRBuilder,
SPIRV::Decoration::FuncParamAttr, {Attr});
}
+ if (Arg.hasAttribute(Attribute::NoAlias)) {
+ auto Attr =
+ static_cast<unsigned>(SPIRV::FunctionParameterAttribute::NoAlias);
+ buildOpDecorate(VRegs[i][0], MIRBuilder,
+ SPIRV::Decoration::FuncParamAttr, {Attr});
+ }
+ Node = F.getMetadata("kernel_arg_type_qual");
+ if (Node && i < Node->getNumOperands()) {
+ StringRef TypeQual = cast<MDString>(Node->getOperand(i))->getString();
+ if (TypeQual.compare("volatile") == 0)
+ buildOpDecorate(VRegs[i][0], MIRBuilder, SPIRV::Decoration::Volatile,
+ {});
+ }
+ Node = F.getMetadata("spirv.ParameterDecorations");
+ if (Node && i < Node->getNumOperands() &&
+ isa<MDNode>(Node->getOperand(i))) {
+ MDNode *MD = cast<MDNode>(Node->getOperand(i));
+ for (const MDOperand &MDOp : MD->operands()) {
+ MDNode *MD2 = dyn_cast<MDNode>(MDOp);
+ assert(MD2 && "Metadata operand is expected");
+ ConstantInt *Const = getConstInt(MD2, 0);
+ assert(Const && "MDOperand should be ConstantInt");
+ auto Dec = static_cast<SPIRV::Decoration>(Const->getZExtValue());
+ std::vector<uint32_t> DecVec;
+ for (unsigned j = 1; j < MD2->getNumOperands(); j++) {
+ ConstantInt *Const = getConstInt(MD2, j);
+ assert(Const && "MDOperand should be ConstantInt");
+ DecVec.push_back(static_cast<uint32_t>(Const->getZExtValue()));
+ }
+ buildOpDecorate(VRegs[i][0], MIRBuilder, Dec, DecVec);
+ }
+ }
++i;
}
}
@@ -117,30 +212,30 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
MRI->setRegClass(FuncVReg, &SPIRV::IDRegClass);
if (F.isDeclaration())
GR->add(&F, &MIRBuilder.getMF(), FuncVReg);
-
- auto *FTy = F.getFunctionType();
- auto FuncTy = GR->assignTypeToVReg(FTy, FuncVReg, MIRBuilder);
+ SPIRVType *RetTy = GR->getOrCreateSPIRVType(FTy->getReturnType(), MIRBuilder);
+ SPIRVType *FuncTy = GR->getOrCreateOpTypeFunctionWithArgs(
+ FTy, RetTy, ArgTypeVRegs, MIRBuilder);
// Build the OpTypeFunction declaring it.
- Register ReturnTypeID = FuncTy->getOperand(1).getReg();
uint32_t FuncControl = getFunctionControl(F);
MIRBuilder.buildInstr(SPIRV::OpFunction)
.addDef(FuncVReg)
- .addUse(ReturnTypeID)
+ .addUse(GR->getSPIRVTypeID(RetTy))
.addImm(FuncControl)
.addUse(GR->getSPIRVTypeID(FuncTy));
// Add OpFunctionParameters.
- const unsigned NumArgs = ArgTypeVRegs.size();
- for (unsigned i = 0; i < NumArgs; ++i) {
+ int i = 0;
+ for (const auto &Arg : F.args()) {
assert(VRegs[i].size() == 1 && "Formal arg has multiple vregs");
MRI->setRegClass(VRegs[i][0], &SPIRV::IDRegClass);
MIRBuilder.buildInstr(SPIRV::OpFunctionParameter)
.addDef(VRegs[i][0])
- .addUse(ArgTypeVRegs[i]);
+ .addUse(GR->getSPIRVTypeID(ArgTypeVRegs[i]));
if (F.isDeclaration())
- GR->add(F.getArg(i), &MIRBuilder.getMF(), VRegs[i][0]);
+ GR->add(&Arg, &MIRBuilder.getMF(), VRegs[i][0]);
+ i++;
}
// Name the function.
if (F.hasName())
@@ -169,48 +264,51 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// TODO: handle the case of multiple registers.
if (Info.OrigRet.Regs.size() > 1)
return false;
+ MachineFunction &MF = MIRBuilder.getMF();
+ GR->setCurrentFunc(MF);
+ FunctionType *FTy = nullptr;
+ const Function *CF = nullptr;
- GR->setCurrentFunc(MIRBuilder.getMF());
- Register ResVReg =
- Info.OrigRet.Regs.empty() ? Register(0) : Info.OrigRet.Regs[0];
// Emit a regular OpFunctionCall. If it's an externally declared function,
- // be sure to emit its type and function declaration here. It will be
- // hoisted globally later.
+ // be sure to emit its type and function declaration here. It will be hoisted
+ // globally later.
if (Info.Callee.isGlobal()) {
- auto *CF = dyn_cast_or_null<const Function>(Info.Callee.getGlobal());
+ CF = dyn_cast_or_null<const Function>(Info.Callee.getGlobal());
// TODO: support constexpr casts and indirect calls.
if (CF == nullptr)
return false;
- if (CF->isDeclaration()) {
- // Emit the type info and forward function declaration to the first MBB
- // to ensure VReg definition dependencies are valid across all MBBs.
- MachineBasicBlock::iterator OldII = MIRBuilder.getInsertPt();
- MachineBasicBlock &OldBB = MIRBuilder.getMBB();
- MachineBasicBlock &FirstBB = *MIRBuilder.getMF().getBlockNumbered(0);
- MIRBuilder.setInsertPt(FirstBB, FirstBB.instr_end());
-
- SmallVector<ArrayRef<Register>, 8> VRegArgs;
- SmallVector<SmallVector<Register, 1>, 8> ToInsert;
- for (const Argument &Arg : CF->args()) {
- if (MIRBuilder.getDataLayout().getTypeStoreSize(Arg.getType()).isZero())
- continue; // Don't handle zero sized types.
- ToInsert.push_back({MIRBuilder.getMRI()->createGenericVirtualRegister(
- LLT::scalar(32))});
- VRegArgs.push_back(ToInsert.back());
- }
- // TODO: Reuse FunctionLoweringInfo.
- FunctionLoweringInfo FuncInfo;
- lowerFormalArguments(MIRBuilder, *CF, VRegArgs, FuncInfo);
- MIRBuilder.setInsertPt(OldBB, OldII);
+ FTy = getOriginalFunctionType(*CF);
+ }
+
+ Register ResVReg =
+ Info.OrigRet.Regs.empty() ? Register(0) : Info.OrigRet.Regs[0];
+ if (CF && CF->isDeclaration() &&
+ !GR->find(CF, &MIRBuilder.getMF()).isValid()) {
+ // Emit the type info and forward function declaration to the first MBB
+ // to ensure VReg definition dependencies are valid across all MBBs.
+ MachineIRBuilder FirstBlockBuilder;
+ FirstBlockBuilder.setMF(MF);
+ FirstBlockBuilder.setMBB(*MF.getBlockNumbered(0));
+
+ SmallVector<ArrayRef<Register>, 8> VRegArgs;
+ SmallVector<SmallVector<Register, 1>, 8> ToInsert;
+ for (const Argument &Arg : CF->args()) {
+ if (MIRBuilder.getDataLayout().getTypeStoreSize(Arg.getType()).isZero())
+ continue; // Don't handle zero sized types.
+ ToInsert.push_back(
+ {MIRBuilder.getMRI()->createGenericVirtualRegister(LLT::scalar(32))});
+ VRegArgs.push_back(ToInsert.back());
}
+ // TODO: Reuse FunctionLoweringInfo
+ FunctionLoweringInfo FuncInfo;
+ lowerFormalArguments(FirstBlockBuilder, *CF, VRegArgs, FuncInfo);
}
// Make sure there's a valid return reg, even for functions returning void.
- if (!ResVReg.isValid()) {
+ if (!ResVReg.isValid())
ResVReg = MIRBuilder.getMRI()->createVirtualRegister(&SPIRV::IDRegClass);
- }
SPIRVType *RetType =
- GR->assignTypeToVReg(Info.OrigRet.Ty, ResVReg, MIRBuilder);
+ GR->assignTypeToVReg(FTy->getReturnType(), ResVReg, MIRBuilder);
// Emit the OpFunctionCall and its args.
auto MIB = MIRBuilder.buildInstr(SPIRV::OpFunctionCall)
@@ -224,6 +322,7 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
return false;
MIB.addUse(Arg.Regs[0]);
}
- return MIB.constrainAllUses(MIRBuilder.getTII(), *ST.getRegisterInfo(),
- *ST.getRegBankInfo());
+ const auto &STI = MF.getSubtarget();
+ return MIB.constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(),
+ *STI.getRegBankInfo());
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.h b/llvm/lib/Target/SPIRV/SPIRVCallLowering.h
index c179bb35154b..c2d6ad82d507 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.h
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.h
@@ -13,23 +13,21 @@
#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVCALLLOWERING_H
#define LLVM_LIB_TARGET_SPIRV_SPIRVCALLLOWERING_H
+#include "SPIRVGlobalRegistry.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
namespace llvm {
class SPIRVGlobalRegistry;
-class SPIRVSubtarget;
class SPIRVTargetLowering;
class SPIRVCallLowering : public CallLowering {
private:
- const SPIRVSubtarget &ST;
// Used to create and assign function, argument, and return type information.
SPIRVGlobalRegistry *GR;
public:
- SPIRVCallLowering(const SPIRVTargetLowering &TLI, const SPIRVSubtarget &ST,
- SPIRVGlobalRegistry *GR);
+ SPIRVCallLowering(const SPIRVTargetLowering &TLI, SPIRVGlobalRegistry *GR);
// Built OpReturn or OpReturnValue.
bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val,
diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp
index 57cd4bafd351..1926977ea66e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp
@@ -92,4 +92,4 @@ void SPIRVGeneralDuplicatesTracker::buildDepsGraph(
}
}
}
-} \ No newline at end of file
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h
index 58ae1f86ce42..ab22c3d2a647 100644
--- a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h
+++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h
@@ -169,6 +169,8 @@ public:
Register find(const Argument *Arg, const MachineFunction *MF) {
return AT.find(const_cast<Argument *>(Arg), MF);
}
+
+ const SPIRVDuplicatesTracker<Type> *getTypes() { return &TT; }
};
} // namespace llvm
-#endif \ No newline at end of file
+#endif // LLVM_LIB_TARGET_SPIRV_SPIRVDUPLICATESTRACKER_H
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index 9624482e3622..0075f547b6d6 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -87,6 +87,7 @@ public:
Instruction *visitLoadInst(LoadInst &I);
Instruction *visitStoreInst(StoreInst &I);
Instruction *visitAllocaInst(AllocaInst &I);
+ Instruction *visitAtomicCmpXchgInst(AtomicCmpXchgInst &I);
bool runOnFunction(Function &F) override;
};
} // namespace
@@ -103,7 +104,7 @@ static inline bool isAssignTypeInstr(const Instruction *I) {
static bool isMemInstrToReplace(Instruction *I) {
return isa<StoreInst>(I) || isa<LoadInst>(I) || isa<InsertValueInst>(I) ||
- isa<ExtractValueInst>(I);
+ isa<ExtractValueInst>(I) || isa<AtomicCmpXchgInst>(I);
}
static bool isAggrToReplace(const Value *V) {
@@ -134,13 +135,14 @@ void SPIRVEmitIntrinsics::replaceMemInstrUses(Instruction *Old,
Instruction *New) {
while (!Old->user_empty()) {
auto *U = Old->user_back();
- if (isMemInstrToReplace(U) || isa<ReturnInst>(U)) {
- U->replaceUsesOfWith(Old, New);
- } else if (isAssignTypeInstr(U)) {
+ if (isAssignTypeInstr(U)) {
IRB->SetInsertPoint(U);
SmallVector<Value *, 2> Args = {New, U->getOperand(1)};
IRB->CreateIntrinsic(Intrinsic::spv_assign_type, {New->getType()}, Args);
U->eraseFromParent();
+ } else if (isMemInstrToReplace(U) || isa<ReturnInst>(U) ||
+ isa<CallInst>(U)) {
+ U->replaceUsesOfWith(Old, New);
} else {
llvm_unreachable("illegal aggregate intrinsic user");
}
@@ -301,10 +303,10 @@ Instruction *SPIRVEmitIntrinsics::visitStoreInst(StoreInst &I) {
MachineMemOperand::Flags Flags =
TLI->getStoreMemOperandFlags(I, F->getParent()->getDataLayout());
auto *PtrOp = I.getPointerOperand();
- auto *NewI =
- IRB->CreateIntrinsic(Intrinsic::spv_store, {PtrOp->getType()},
- {I.getValueOperand(), PtrOp, IRB->getInt16(Flags),
- IRB->getInt8(I.getAlign().value())});
+ auto *NewI = IRB->CreateIntrinsic(
+ Intrinsic::spv_store, {I.getValueOperand()->getType(), PtrOp->getType()},
+ {I.getValueOperand(), PtrOp, IRB->getInt16(Flags),
+ IRB->getInt8(I.getAlign().value())});
I.eraseFromParent();
return NewI;
}
@@ -314,6 +316,22 @@ Instruction *SPIRVEmitIntrinsics::visitAllocaInst(AllocaInst &I) {
return &I;
}
+Instruction *SPIRVEmitIntrinsics::visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
+ assert(I.getType()->isAggregateType() && "Aggregate result is expected");
+ SmallVector<Value *> Args;
+ for (auto &Op : I.operands())
+ Args.push_back(Op);
+ Args.push_back(IRB->getInt32(I.getSyncScopeID()));
+ Args.push_back(IRB->getInt32(
+ static_cast<uint32_t>(getMemSemantics(I.getSuccessOrdering()))));
+ Args.push_back(IRB->getInt32(
+ static_cast<uint32_t>(getMemSemantics(I.getFailureOrdering()))));
+ auto *NewI = IRB->CreateIntrinsic(Intrinsic::spv_cmpxchg,
+ {I.getPointerOperand()->getType()}, {Args});
+ replaceMemInstrUses(&I, NewI);
+ return NewI;
+}
+
void SPIRVEmitIntrinsics::processGlobalValue(GlobalVariable &GV) {
// Skip the special artificial variable llvm.global.annotations.
if (GV.getName() == "llvm.global.annotations")
@@ -351,14 +369,13 @@ void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I) {
// Check GetElementPtrConstantExpr case.
(isa<ConstantExpr>(Op) && isa<GEPOperator>(Op))) {
IRB->SetInsertPoint(I);
- buildIntrWithMD(Intrinsic::spv_assign_type, {Op->getType()}, Op, Op);
+ if (isa<UndefValue>(Op) && Op->getType()->isAggregateType())
+ buildIntrWithMD(Intrinsic::spv_assign_type, {IRB->getInt32Ty()}, Op,
+ UndefValue::get(IRB->getInt32Ty()));
+ else
+ buildIntrWithMD(Intrinsic::spv_assign_type, {Op->getType()}, Op, Op);
}
}
- // StoreInst's operand type can be changed in the next stage so we need to
- // store it in the set.
- if (isa<StoreInst>(I) &&
- cast<StoreInst>(I)->getValueOperand()->getType()->isAggregateType())
- AggrStores.insert(I);
}
void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I) {
@@ -378,7 +395,7 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I) {
if ((isa<ConstantAggregateZero>(Op) && Op->getType()->isVectorTy()) ||
isa<PHINode>(I) || isa<SwitchInst>(I))
TrackConstants = false;
- if (isa<ConstantData>(Op) && TrackConstants) {
+ if ((isa<ConstantData>(Op) || isa<ConstantExpr>(Op)) && TrackConstants) {
unsigned OpNo = Op.getOperandNo();
if (II && ((II->getIntrinsicID() == Intrinsic::spv_gep && OpNo == 0) ||
(II->paramHasAttr(OpNo, Attribute::ImmArg))))
@@ -405,8 +422,20 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) {
AggrConsts.clear();
AggrStores.clear();
- IRB->SetInsertPoint(&Func.getEntryBlock().front());
+ // StoreInst's operand type can be changed during the next transformations,
+ // so we need to store it in the set. Also store already transformed types.
+ for (auto &I : instructions(Func)) {
+ StoreInst *SI = dyn_cast<StoreInst>(&I);
+ if (!SI)
+ continue;
+ Type *ElTy = SI->getValueOperand()->getType();
+ PointerType *PTy = cast<PointerType>(SI->getOperand(1)->getType());
+ if (ElTy->isAggregateType() || ElTy->isVectorTy() ||
+ !PTy->isOpaqueOrPointeeTypeMatches(ElTy))
+ AggrStores.insert(&I);
+ }
+ IRB->SetInsertPoint(&Func.getEntryBlock().front());
for (auto &GV : Func.getParent()->globals())
processGlobalValue(GV);
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index 5f890c003cbc..5c8fa7adfbdf 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -24,6 +24,24 @@ using namespace llvm;
SPIRVGlobalRegistry::SPIRVGlobalRegistry(unsigned PointerSize)
: PointerSize(PointerSize) {}
+SPIRVType *SPIRVGlobalRegistry::assignIntTypeToVReg(unsigned BitWidth,
+ Register VReg,
+ MachineInstr &I,
+ const SPIRVInstrInfo &TII) {
+ SPIRVType *SpirvType = getOrCreateSPIRVIntegerType(BitWidth, I, TII);
+ assignSPIRVTypeToVReg(SpirvType, VReg, *CurMF);
+ return SpirvType;
+}
+
+SPIRVType *SPIRVGlobalRegistry::assignVectTypeToVReg(
+ SPIRVType *BaseType, unsigned NumElements, Register VReg, MachineInstr &I,
+ const SPIRVInstrInfo &TII) {
+ SPIRVType *SpirvType =
+ getOrCreateSPIRVVectorType(BaseType, NumElements, I, TII);
+ assignSPIRVTypeToVReg(SpirvType, VReg, *CurMF);
+ return SpirvType;
+}
+
SPIRVType *SPIRVGlobalRegistry::assignTypeToVReg(
const Type *Type, Register VReg, MachineIRBuilder &MIRBuilder,
SPIRV::AccessQualifier AccessQual, bool EmitIR) {
@@ -96,6 +114,65 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeVector(uint32_t NumElems,
return MIB;
}
+std::tuple<Register, ConstantInt *, bool>
+SPIRVGlobalRegistry::getOrCreateConstIntReg(uint64_t Val, SPIRVType *SpvType,
+ MachineIRBuilder *MIRBuilder,
+ MachineInstr *I,
+ const SPIRVInstrInfo *TII) {
+ const IntegerType *LLVMIntTy;
+ if (SpvType)
+ LLVMIntTy = cast<IntegerType>(getTypeForSPIRVType(SpvType));
+ else
+ LLVMIntTy = IntegerType::getInt32Ty(CurMF->getFunction().getContext());
+ bool NewInstr = false;
+ // Find a constant in DT or build a new one.
+ ConstantInt *CI = ConstantInt::get(const_cast<IntegerType *>(LLVMIntTy), Val);
+ Register Res = DT.find(CI, CurMF);
+ if (!Res.isValid()) {
+ unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32;
+ LLT LLTy = LLT::scalar(32);
+ Res = CurMF->getRegInfo().createGenericVirtualRegister(LLTy);
+ if (MIRBuilder)
+ assignTypeToVReg(LLVMIntTy, Res, *MIRBuilder);
+ else
+ assignIntTypeToVReg(BitWidth, Res, *I, *TII);
+ DT.add(CI, CurMF, Res);
+ NewInstr = true;
+ }
+ return std::make_tuple(Res, CI, NewInstr);
+}
+
+Register SPIRVGlobalRegistry::getOrCreateConstInt(uint64_t Val, MachineInstr &I,
+ SPIRVType *SpvType,
+ const SPIRVInstrInfo &TII) {
+ assert(SpvType);
+ ConstantInt *CI;
+ Register Res;
+ bool New;
+ std::tie(Res, CI, New) =
+ getOrCreateConstIntReg(Val, SpvType, nullptr, &I, &TII);
+ // If we have found Res register which is defined by the passed G_CONSTANT
+ // machine instruction, a new constant instruction should be created.
+ if (!New && (!I.getOperand(0).isReg() || Res != I.getOperand(0).getReg()))
+ return Res;
+ MachineInstrBuilder MIB;
+ MachineBasicBlock &BB = *I.getParent();
+ if (Val) {
+ MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantI))
+ .addDef(Res)
+ .addUse(getSPIRVTypeID(SpvType));
+ addNumImm(APInt(getScalarOrVectorBitWidth(SpvType), Val), MIB);
+ } else {
+ MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull))
+ .addDef(Res)
+ .addUse(getSPIRVTypeID(SpvType));
+ }
+ const auto &ST = CurMF->getSubtarget();
+ constrainSelectedInstRegOperands(*MIB, *ST.getInstrInfo(),
+ *ST.getRegisterInfo(), *ST.getRegBankInfo());
+ return Res;
+}
+
Register SPIRVGlobalRegistry::buildConstantInt(uint64_t Val,
MachineIRBuilder &MIRBuilder,
SPIRVType *SpvType,
@@ -112,14 +189,32 @@ Register SPIRVGlobalRegistry::buildConstantInt(uint64_t Val,
Register Res = DT.find(ConstInt, &MF);
if (!Res.isValid()) {
unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32;
- Res = MF.getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth));
- assignTypeToVReg(LLVMIntTy, Res, MIRBuilder);
- if (EmitIR)
+ LLT LLTy = LLT::scalar(EmitIR ? BitWidth : 32);
+ Res = MF.getRegInfo().createGenericVirtualRegister(LLTy);
+ assignTypeToVReg(LLVMIntTy, Res, MIRBuilder,
+ SPIRV::AccessQualifier::ReadWrite, EmitIR);
+ DT.add(ConstInt, &MIRBuilder.getMF(), Res);
+ if (EmitIR) {
MIRBuilder.buildConstant(Res, *ConstInt);
- else
- MIRBuilder.buildInstr(SPIRV::OpConstantI)
- .addDef(Res)
- .addImm(ConstInt->getSExtValue());
+ } else {
+ MachineInstrBuilder MIB;
+ if (Val) {
+ assert(SpvType);
+ MIB = MIRBuilder.buildInstr(SPIRV::OpConstantI)
+ .addDef(Res)
+ .addUse(getSPIRVTypeID(SpvType));
+ addNumImm(APInt(BitWidth, Val), MIB);
+ } else {
+ assert(SpvType);
+ MIB = MIRBuilder.buildInstr(SPIRV::OpConstantNull)
+ .addDef(Res)
+ .addUse(getSPIRVTypeID(SpvType));
+ }
+ const auto &Subtarget = CurMF->getSubtarget();
+ constrainSelectedInstRegOperands(*MIB, *Subtarget.getInstrInfo(),
+ *Subtarget.getRegisterInfo(),
+ *Subtarget.getRegBankInfo());
+ }
}
return Res;
}
@@ -142,11 +237,63 @@ Register SPIRVGlobalRegistry::buildConstantFP(APFloat Val,
unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32;
Res = MF.getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth));
assignTypeToVReg(LLVMFPTy, Res, MIRBuilder);
+ DT.add(ConstFP, &MF, Res);
MIRBuilder.buildFConstant(Res, *ConstFP);
}
return Res;
}
+Register
+SPIRVGlobalRegistry::getOrCreateConsIntVector(uint64_t Val, MachineInstr &I,
+ SPIRVType *SpvType,
+ const SPIRVInstrInfo &TII) {
+ const Type *LLVMTy = getTypeForSPIRVType(SpvType);
+ assert(LLVMTy->isVectorTy());
+ const FixedVectorType *LLVMVecTy = cast<FixedVectorType>(LLVMTy);
+ Type *LLVMBaseTy = LLVMVecTy->getElementType();
+ // Find a constant vector in DT or build a new one.
+ const auto ConstInt = ConstantInt::get(LLVMBaseTy, Val);
+ auto ConstVec =
+ ConstantVector::getSplat(LLVMVecTy->getElementCount(), ConstInt);
+ Register Res = DT.find(ConstVec, CurMF);
+ if (!Res.isValid()) {
+ unsigned BitWidth = getScalarOrVectorBitWidth(SpvType);
+ SPIRVType *SpvBaseType = getOrCreateSPIRVIntegerType(BitWidth, I, TII);
+ // SpvScalConst should be created before SpvVecConst to avoid undefined ID
+ // error on validation.
+ // TODO: can be moved below once sorting of types/consts/defs is implemented.
+ Register SpvScalConst;
+ if (Val)
+ SpvScalConst = getOrCreateConstInt(Val, I, SpvBaseType, TII);
+ // TODO: maybe use bitwidth of base type.
+ LLT LLTy = LLT::scalar(32);
+ Register SpvVecConst =
+ CurMF->getRegInfo().createGenericVirtualRegister(LLTy);
+ const unsigned ElemCnt = SpvType->getOperand(2).getImm();
+ assignVectTypeToVReg(SpvBaseType, ElemCnt, SpvVecConst, I, TII);
+ DT.add(ConstVec, CurMF, SpvVecConst);
+ MachineInstrBuilder MIB;
+ MachineBasicBlock &BB = *I.getParent();
+ if (Val) {
+ MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantComposite))
+ .addDef(SpvVecConst)
+ .addUse(getSPIRVTypeID(SpvType));
+ for (unsigned i = 0; i < ElemCnt; ++i)
+ MIB.addUse(SpvScalConst);
+ } else {
+ MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull))
+ .addDef(SpvVecConst)
+ .addUse(getSPIRVTypeID(SpvType));
+ }
+ const auto &Subtarget = CurMF->getSubtarget();
+ constrainSelectedInstRegOperands(*MIB, *Subtarget.getInstrInfo(),
+ *Subtarget.getRegisterInfo(),
+ *Subtarget.getRegBankInfo());
+ return SpvVecConst;
+ }
+ return Res;
+}
+
Register SPIRVGlobalRegistry::buildGlobalVariable(
Register ResVReg, SPIRVType *BaseType, StringRef Name,
const GlobalValue *GV, SPIRV::StorageClass Storage,
@@ -169,7 +316,13 @@ Register SPIRVGlobalRegistry::buildGlobalVariable(
}
GV = GVar;
}
- Register Reg;
+ Register Reg = DT.find(GVar, &MIRBuilder.getMF());
+ if (Reg.isValid()) {
+ if (Reg != ResVReg)
+ MIRBuilder.buildCopy(ResVReg, Reg);
+ return ResVReg;
+ }
+
auto MIB = MIRBuilder.buildInstr(SPIRV::OpVariable)
.addDef(ResVReg)
.addUse(getSPIRVTypeID(BaseType))
@@ -234,14 +387,76 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeArray(uint32_t NumElems,
return MIB;
}
+SPIRVType *SPIRVGlobalRegistry::getOpTypeOpaque(const StructType *Ty,
+ MachineIRBuilder &MIRBuilder) {
+ assert(Ty->hasName());
+ const StringRef Name = Ty->hasName() ? Ty->getName() : "";
+ Register ResVReg = createTypeVReg(MIRBuilder);
+ auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeOpaque).addDef(ResVReg);
+ addStringImm(Name, MIB);
+ buildOpName(ResVReg, Name, MIRBuilder);
+ return MIB;
+}
+
+SPIRVType *SPIRVGlobalRegistry::getOpTypeStruct(const StructType *Ty,
+ MachineIRBuilder &MIRBuilder,
+ bool EmitIR) {
+ SmallVector<Register, 4> FieldTypes;
+ for (const auto &Elem : Ty->elements()) {
+ SPIRVType *ElemTy = findSPIRVType(Elem, MIRBuilder);
+ assert(ElemTy && ElemTy->getOpcode() != SPIRV::OpTypeVoid &&
+ "Invalid struct element type");
+ FieldTypes.push_back(getSPIRVTypeID(ElemTy));
+ }
+ Register ResVReg = createTypeVReg(MIRBuilder);
+ auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeStruct).addDef(ResVReg);
+ for (const auto &Ty : FieldTypes)
+ MIB.addUse(Ty);
+ if (Ty->hasName())
+ buildOpName(ResVReg, Ty->getName(), MIRBuilder);
+ if (Ty->isPacked())
+ buildOpDecorate(ResVReg, MIRBuilder, SPIRV::Decoration::CPacked, {});
+ return MIB;
+}
+
+static bool isOpenCLBuiltinType(const StructType *SType) {
+ return SType->isOpaque() && SType->hasName() &&
+ SType->getName().startswith("opencl.");
+}
+
+static bool isSPIRVBuiltinType(const StructType *SType) {
+ return SType->isOpaque() && SType->hasName() &&
+ SType->getName().startswith("spirv.");
+}
+
+static bool isSpecialType(const Type *Ty) {
+ if (auto PType = dyn_cast<PointerType>(Ty)) {
+ if (!PType->isOpaque())
+ Ty = PType->getNonOpaquePointerElementType();
+ }
+ if (auto SType = dyn_cast<StructType>(Ty))
+ return isOpenCLBuiltinType(SType) || isSPIRVBuiltinType(SType);
+ return false;
+}
+
SPIRVType *SPIRVGlobalRegistry::getOpTypePointer(SPIRV::StorageClass SC,
SPIRVType *ElemType,
- MachineIRBuilder &MIRBuilder) {
- auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypePointer)
- .addDef(createTypeVReg(MIRBuilder))
- .addImm(static_cast<uint32_t>(SC))
- .addUse(getSPIRVTypeID(ElemType));
- return MIB;
+ MachineIRBuilder &MIRBuilder,
+ Register Reg) {
+ if (!Reg.isValid())
+ Reg = createTypeVReg(MIRBuilder);
+ return MIRBuilder.buildInstr(SPIRV::OpTypePointer)
+ .addDef(Reg)
+ .addImm(static_cast<uint32_t>(SC))
+ .addUse(getSPIRVTypeID(ElemType));
+}
+
+SPIRVType *
+SPIRVGlobalRegistry::getOpTypeForwardPointer(SPIRV::StorageClass SC,
+ MachineIRBuilder &MIRBuilder) {
+ return MIRBuilder.buildInstr(SPIRV::OpTypeForwardPointer)
+ .addUse(createTypeVReg(MIRBuilder))
+ .addImm(static_cast<uint32_t>(SC));
}
SPIRVType *SPIRVGlobalRegistry::getOpTypeFunction(
@@ -255,10 +470,49 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeFunction(
return MIB;
}
+SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeFunctionWithArgs(
+ const Type *Ty, SPIRVType *RetType,
+ const SmallVectorImpl<SPIRVType *> &ArgTypes,
+ MachineIRBuilder &MIRBuilder) {
+ Register Reg = DT.find(Ty, &MIRBuilder.getMF());
+ if (Reg.isValid())
+ return getSPIRVTypeForVReg(Reg);
+ SPIRVType *SpirvType = getOpTypeFunction(RetType, ArgTypes, MIRBuilder);
+ return finishCreatingSPIRVType(Ty, SpirvType);
+}
+
+SPIRVType *SPIRVGlobalRegistry::findSPIRVType(const Type *Ty,
+ MachineIRBuilder &MIRBuilder,
+ SPIRV::AccessQualifier AccQual,
+ bool EmitIR) {
+ Register Reg = DT.find(Ty, &MIRBuilder.getMF());
+ if (Reg.isValid())
+ return getSPIRVTypeForVReg(Reg);
+ if (ForwardPointerTypes.find(Ty) != ForwardPointerTypes.end())
+ return ForwardPointerTypes[Ty];
+ return restOfCreateSPIRVType(Ty, MIRBuilder, AccQual, EmitIR);
+}
+
+Register SPIRVGlobalRegistry::getSPIRVTypeID(const SPIRVType *SpirvType) const {
+ assert(SpirvType && "Attempting to get type id for nullptr type.");
+ if (SpirvType->getOpcode() == SPIRV::OpTypeForwardPointer)
+ return SpirvType->uses().begin()->getReg();
+ return SpirvType->defs().begin()->getReg();
+}
+
SPIRVType *SPIRVGlobalRegistry::createSPIRVType(const Type *Ty,
MachineIRBuilder &MIRBuilder,
SPIRV::AccessQualifier AccQual,
bool EmitIR) {
+ assert(!isSpecialType(Ty));
+ auto &TypeToSPIRVTypeMap = DT.getTypes()->getAllUses();
+ auto t = TypeToSPIRVTypeMap.find(Ty);
+ if (t != TypeToSPIRVTypeMap.end()) {
+ auto tt = t->second.find(&MIRBuilder.getMF());
+ if (tt != t->second.end())
+ return getSPIRVTypeForVReg(tt->second);
+ }
+
if (auto IType = dyn_cast<IntegerType>(Ty)) {
const unsigned Width = IType->getBitWidth();
return Width == 1 ? getOpTypeBool(MIRBuilder)
@@ -269,21 +523,25 @@ SPIRVType *SPIRVGlobalRegistry::createSPIRVType(const Type *Ty,
if (Ty->isVoidTy())
return getOpTypeVoid(MIRBuilder);
if (Ty->isVectorTy()) {
- auto El = getOrCreateSPIRVType(cast<FixedVectorType>(Ty)->getElementType(),
- MIRBuilder);
+ SPIRVType *El =
+ findSPIRVType(cast<FixedVectorType>(Ty)->getElementType(), MIRBuilder);
return getOpTypeVector(cast<FixedVectorType>(Ty)->getNumElements(), El,
MIRBuilder);
}
if (Ty->isArrayTy()) {
- auto *El = getOrCreateSPIRVType(Ty->getArrayElementType(), MIRBuilder);
+ SPIRVType *El = findSPIRVType(Ty->getArrayElementType(), MIRBuilder);
return getOpTypeArray(Ty->getArrayNumElements(), El, MIRBuilder, EmitIR);
}
- assert(!isa<StructType>(Ty) && "Unsupported StructType");
+ if (auto SType = dyn_cast<StructType>(Ty)) {
+ if (SType->isOpaque())
+ return getOpTypeOpaque(SType, MIRBuilder);
+ return getOpTypeStruct(SType, MIRBuilder, EmitIR);
+ }
if (auto FType = dyn_cast<FunctionType>(Ty)) {
- SPIRVType *RetTy = getOrCreateSPIRVType(FType->getReturnType(), MIRBuilder);
+ SPIRVType *RetTy = findSPIRVType(FType->getReturnType(), MIRBuilder);
SmallVector<SPIRVType *, 4> ParamTypes;
for (const auto &t : FType->params()) {
- ParamTypes.push_back(getOrCreateSPIRVType(t, MIRBuilder));
+ ParamTypes.push_back(findSPIRVType(t, MIRBuilder));
}
return getOpTypeFunction(RetTy, ParamTypes, MIRBuilder);
}
@@ -292,24 +550,51 @@ SPIRVType *SPIRVGlobalRegistry::createSPIRVType(const Type *Ty,
// At the moment, all opaque pointers correspond to i8 element type.
// TODO: change the implementation once opaque pointers are supported
// in the SPIR-V specification.
- if (PType->isOpaque()) {
+ if (PType->isOpaque())
SpvElementType = getOrCreateSPIRVIntegerType(8, MIRBuilder);
- } else {
- Type *ElemType = PType->getNonOpaquePointerElementType();
- // TODO: support OpenCL and SPIRV builtins like image2d_t that are passed
- // as pointers, but should be treated as custom types like OpTypeImage.
- assert(!isa<StructType>(ElemType) && "Unsupported StructType pointer");
-
- // Otherwise, treat it as a regular pointer type.
- SpvElementType = getOrCreateSPIRVType(
- ElemType, MIRBuilder, SPIRV::AccessQualifier::ReadWrite, EmitIR);
- }
+ else
+ SpvElementType =
+ findSPIRVType(PType->getNonOpaquePointerElementType(), MIRBuilder,
+ SPIRV::AccessQualifier::ReadWrite, EmitIR);
auto SC = addressSpaceToStorageClass(PType->getAddressSpace());
- return getOpTypePointer(SC, SpvElementType, MIRBuilder);
+  // A null element type means there is a loop in the type definitions;
+  // make and return the corresponding OpTypeForwardPointer.
+ if (SpvElementType == nullptr) {
+ if (ForwardPointerTypes.find(Ty) == ForwardPointerTypes.end())
+ ForwardPointerTypes[PType] = getOpTypeForwardPointer(SC, MIRBuilder);
+ return ForwardPointerTypes[PType];
+ }
+ Register Reg(0);
+  // If a forward pointer is associated with this type, use its register
+  // operand to create the OpTypePointer.
+ if (ForwardPointerTypes.find(PType) != ForwardPointerTypes.end())
+ Reg = getSPIRVTypeID(ForwardPointerTypes[PType]);
+
+ return getOpTypePointer(SC, SpvElementType, MIRBuilder, Reg);
}
llvm_unreachable("Unable to convert LLVM type to SPIRVType");
}
+SPIRVType *SPIRVGlobalRegistry::restOfCreateSPIRVType(
+ const Type *Ty, MachineIRBuilder &MIRBuilder,
+ SPIRV::AccessQualifier AccessQual, bool EmitIR) {
+ if (TypesInProcessing.count(Ty) && !Ty->isPointerTy())
+ return nullptr;
+ TypesInProcessing.insert(Ty);
+ SPIRVType *SpirvType = createSPIRVType(Ty, MIRBuilder, AccessQual, EmitIR);
+ TypesInProcessing.erase(Ty);
+ VRegToTypeMap[&MIRBuilder.getMF()][getSPIRVTypeID(SpirvType)] = SpirvType;
+ SPIRVToLLVMType[SpirvType] = Ty;
+ Register Reg = DT.find(Ty, &MIRBuilder.getMF());
+  // Do not add OpTypeForwardPointer to DT; a corresponding normal pointer
+  // type will be added later. For special types it is already added to DT.
+ if (SpirvType->getOpcode() != SPIRV::OpTypeForwardPointer && !Reg.isValid() &&
+ !isSpecialType(Ty))
+ DT.add(Ty, &MIRBuilder.getMF(), getSPIRVTypeID(SpirvType));
+
+ return SpirvType;
+}
+
SPIRVType *SPIRVGlobalRegistry::getSPIRVTypeForVReg(Register VReg) const {
auto t = VRegToTypeMap.find(CurMF);
if (t != VRegToTypeMap.end()) {
@@ -321,13 +606,26 @@ SPIRVType *SPIRVGlobalRegistry::getSPIRVTypeForVReg(Register VReg) const {
}
SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVType(
- const Type *Type, MachineIRBuilder &MIRBuilder,
+ const Type *Ty, MachineIRBuilder &MIRBuilder,
SPIRV::AccessQualifier AccessQual, bool EmitIR) {
- Register Reg = DT.find(Type, &MIRBuilder.getMF());
+ Register Reg = DT.find(Ty, &MIRBuilder.getMF());
if (Reg.isValid())
return getSPIRVTypeForVReg(Reg);
- SPIRVType *SpirvType = createSPIRVType(Type, MIRBuilder, AccessQual, EmitIR);
- return restOfCreateSPIRVType(Type, SpirvType);
+ TypesInProcessing.clear();
+ SPIRVType *STy = restOfCreateSPIRVType(Ty, MIRBuilder, AccessQual, EmitIR);
+ // Create normal pointer types for the corresponding OpTypeForwardPointers.
+ for (auto &CU : ForwardPointerTypes) {
+ const Type *Ty2 = CU.first;
+ SPIRVType *STy2 = CU.second;
+ if ((Reg = DT.find(Ty2, &MIRBuilder.getMF())).isValid())
+ STy2 = getSPIRVTypeForVReg(Reg);
+ else
+ STy2 = restOfCreateSPIRVType(Ty2, MIRBuilder, AccessQual, EmitIR);
+ if (Ty == Ty2)
+ STy = STy2;
+ }
+ ForwardPointerTypes.clear();
+ return STy;
}
bool SPIRVGlobalRegistry::isScalarOfType(Register VReg,
@@ -393,8 +691,8 @@ SPIRVGlobalRegistry::getOrCreateSPIRVIntegerType(unsigned BitWidth,
MIRBuilder);
}
-SPIRVType *SPIRVGlobalRegistry::restOfCreateSPIRVType(const Type *LLVMTy,
- SPIRVType *SpirvType) {
+SPIRVType *SPIRVGlobalRegistry::finishCreatingSPIRVType(const Type *LLVMTy,
+ SPIRVType *SpirvType) {
assert(CurMF == SpirvType->getMF());
VRegToTypeMap[CurMF][getSPIRVTypeID(SpirvType)] = SpirvType;
SPIRVToLLVMType[SpirvType] = LLVMTy;
@@ -413,7 +711,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVIntegerType(
.addDef(createTypeVReg(CurMF->getRegInfo()))
.addImm(BitWidth)
.addImm(0);
- return restOfCreateSPIRVType(LLVMTy, MIB);
+ return finishCreatingSPIRVType(LLVMTy, MIB);
}
SPIRVType *
@@ -423,6 +721,19 @@ SPIRVGlobalRegistry::getOrCreateSPIRVBoolType(MachineIRBuilder &MIRBuilder) {
MIRBuilder);
}
+SPIRVType *
+SPIRVGlobalRegistry::getOrCreateSPIRVBoolType(MachineInstr &I,
+ const SPIRVInstrInfo &TII) {
+ Type *LLVMTy = IntegerType::get(CurMF->getFunction().getContext(), 1);
+ Register Reg = DT.find(LLVMTy, CurMF);
+ if (Reg.isValid())
+ return getSPIRVTypeForVReg(Reg);
+ MachineBasicBlock &BB = *I.getParent();
+ auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpTypeBool))
+ .addDef(createTypeVReg(CurMF->getRegInfo()));
+ return finishCreatingSPIRVType(LLVMTy, MIB);
+}
+
SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVVectorType(
SPIRVType *BaseType, unsigned NumElements, MachineIRBuilder &MIRBuilder) {
return getOrCreateSPIRVType(
@@ -436,12 +747,15 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVVectorType(
const SPIRVInstrInfo &TII) {
Type *LLVMTy = FixedVectorType::get(
const_cast<Type *>(getTypeForSPIRVType(BaseType)), NumElements);
+ Register Reg = DT.find(LLVMTy, CurMF);
+ if (Reg.isValid())
+ return getSPIRVTypeForVReg(Reg);
MachineBasicBlock &BB = *I.getParent();
auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpTypeVector))
.addDef(createTypeVReg(CurMF->getRegInfo()))
.addUse(getSPIRVTypeID(BaseType))
.addImm(NumElements);
- return restOfCreateSPIRVType(LLVMTy, MIB);
+ return finishCreatingSPIRVType(LLVMTy, MIB);
}
SPIRVType *
@@ -460,10 +774,39 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVPointerType(
Type *LLVMTy =
PointerType::get(const_cast<Type *>(getTypeForSPIRVType(BaseType)),
storageClassToAddressSpace(SC));
+ Register Reg = DT.find(LLVMTy, CurMF);
+ if (Reg.isValid())
+ return getSPIRVTypeForVReg(Reg);
MachineBasicBlock &BB = *I.getParent();
auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpTypePointer))
.addDef(createTypeVReg(CurMF->getRegInfo()))
.addImm(static_cast<uint32_t>(SC))
.addUse(getSPIRVTypeID(BaseType));
- return restOfCreateSPIRVType(LLVMTy, MIB);
+ return finishCreatingSPIRVType(LLVMTy, MIB);
+}
+
+Register SPIRVGlobalRegistry::getOrCreateUndef(MachineInstr &I,
+ SPIRVType *SpvType,
+ const SPIRVInstrInfo &TII) {
+ assert(SpvType);
+ const Type *LLVMTy = getTypeForSPIRVType(SpvType);
+ assert(LLVMTy);
+ // Find a constant in DT or build a new one.
+ UndefValue *UV = UndefValue::get(const_cast<Type *>(LLVMTy));
+ Register Res = DT.find(UV, CurMF);
+ if (Res.isValid())
+ return Res;
+ LLT LLTy = LLT::scalar(32);
+ Res = CurMF->getRegInfo().createGenericVirtualRegister(LLTy);
+ assignSPIRVTypeToVReg(SpvType, Res, *CurMF);
+ DT.add(UV, CurMF, Res);
+
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpUndef))
+ .addDef(Res)
+ .addUse(getSPIRVTypeID(SpvType));
+ const auto &ST = CurMF->getSubtarget();
+ constrainSelectedInstRegOperands(*MIB, *ST.getInstrInfo(),
+ *ST.getRegisterInfo(), *ST.getRegBankInfo());
+ return Res;
}
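The ForwardPointerTypes machinery added above breaks cycles in pointer element types (for example, a struct that contains a pointer to itself): while a type is still listed in TypesInProcessing, requests for it hand back an OpTypeForwardPointer placeholder, and getOrCreateSPIRVType later emits the real OpTypePointer reusing the placeholder's ID register. The following is a minimal, self-contained sketch of that cycle-breaking idea; TypeBuilder, request and build are illustrative names, not part of the patch.

#include <map>
#include <set>
#include <string>

// Illustrative sketch only: placeholder-based cycle breaking for recursive
// type definitions, mirroring TypesInProcessing / ForwardPointerTypes above.
struct TypeBuilder {
  std::set<std::string> InProcess;     // types currently being built
  std::map<std::string, bool> Emitted; // name -> true if only a forward decl

  void request(const std::string &Name) {
    if (Emitted.count(Name))
      return;                          // cache hit (duplicates-tracker analogue)
    if (InProcess.count(Name)) {       // cycle: emit a forward declaration
      Emitted[Name] = true;
      return;
    }
    build(Name);
  }

  void build(const std::string &Name) {
    InProcess.insert(Name);
    // ... request element/field types here; a request that loops back to
    // Name receives the forward declaration instead of recursing forever ...
    InProcess.erase(Name);
    Emitted[Name] = false;             // real definition now exists; the patch
                                       // resolves forward decls against it in
                                       // a second pass.
  }
};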
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
index 13dcc20a3e0a..59ac2712a02f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
@@ -30,7 +30,7 @@ class SPIRVGlobalRegistry {
// Do not confuse this with DuplicatesTracker as DT maps Type* to <MF, Reg>
// where Reg = OpType...
// while VRegToTypeMap tracks SPIR-V type assigned to other regs (i.e. not
- // type-declaring ones)
+ // type-declaring ones).
DenseMap<const MachineFunction *, DenseMap<Register, SPIRVType *>>
VRegToTypeMap;
@@ -38,6 +38,9 @@ class SPIRVGlobalRegistry {
DenseMap<SPIRVType *, const Type *> SPIRVToLLVMType;
+ SmallPtrSet<const Type *, 4> TypesInProcessing;
+ DenseMap<const Type *, SPIRVType *> ForwardPointerTypes;
+
// Number of bits pointers and size_t integers require.
const unsigned PointerSize;
@@ -46,6 +49,14 @@ class SPIRVGlobalRegistry {
createSPIRVType(const Type *Type, MachineIRBuilder &MIRBuilder,
SPIRV::AccessQualifier AQ = SPIRV::AccessQualifier::ReadWrite,
bool EmitIR = true);
+ SPIRVType *findSPIRVType(
+ const Type *Ty, MachineIRBuilder &MIRBuilder,
+ SPIRV::AccessQualifier accessQual = SPIRV::AccessQualifier::ReadWrite,
+ bool EmitIR = true);
+ SPIRVType *restOfCreateSPIRVType(const Type *Type,
+ MachineIRBuilder &MIRBuilder,
+ SPIRV::AccessQualifier AccessQual,
+ bool EmitIR);
public:
SPIRVGlobalRegistry(unsigned PointerSize);
@@ -91,6 +102,11 @@ public:
const Type *Type, Register VReg, MachineIRBuilder &MIRBuilder,
SPIRV::AccessQualifier AQ = SPIRV::AccessQualifier::ReadWrite,
bool EmitIR = true);
+ SPIRVType *assignIntTypeToVReg(unsigned BitWidth, Register VReg,
+ MachineInstr &I, const SPIRVInstrInfo &TII);
+ SPIRVType *assignVectTypeToVReg(SPIRVType *BaseType, unsigned NumElements,
+ Register VReg, MachineInstr &I,
+ const SPIRVInstrInfo &TII);
// In cases where the SPIR-V type is already known, this function can be
// used to map it to the given VReg via an ASSIGN_TYPE instruction.
@@ -123,10 +139,7 @@ public:
}
// Return the VReg holding the result of the given OpTypeXXX instruction.
- Register getSPIRVTypeID(const SPIRVType *SpirvType) const {
- assert(SpirvType && "Attempting to get type id for nullptr type.");
- return SpirvType->defs().begin()->getReg();
- }
+ Register getSPIRVTypeID(const SPIRVType *SpirvType) const;
void setCurrentFunc(MachineFunction &MF) { CurMF = &MF; }
@@ -167,19 +180,38 @@ private:
SPIRVType *getOpTypeArray(uint32_t NumElems, SPIRVType *ElemType,
MachineIRBuilder &MIRBuilder, bool EmitIR = true);
+ SPIRVType *getOpTypeOpaque(const StructType *Ty,
+ MachineIRBuilder &MIRBuilder);
+
+ SPIRVType *getOpTypeStruct(const StructType *Ty, MachineIRBuilder &MIRBuilder,
+ bool EmitIR = true);
+
SPIRVType *getOpTypePointer(SPIRV::StorageClass SC, SPIRVType *ElemType,
- MachineIRBuilder &MIRBuilder);
+ MachineIRBuilder &MIRBuilder, Register Reg);
+
+ SPIRVType *getOpTypeForwardPointer(SPIRV::StorageClass SC,
+ MachineIRBuilder &MIRBuilder);
SPIRVType *getOpTypeFunction(SPIRVType *RetType,
const SmallVectorImpl<SPIRVType *> &ArgTypes,
MachineIRBuilder &MIRBuilder);
- SPIRVType *restOfCreateSPIRVType(const Type *LLVMTy, SPIRVType *SpirvType);
+ std::tuple<Register, ConstantInt *, bool> getOrCreateConstIntReg(
+ uint64_t Val, SPIRVType *SpvType, MachineIRBuilder *MIRBuilder,
+ MachineInstr *I = nullptr, const SPIRVInstrInfo *TII = nullptr);
+ SPIRVType *finishCreatingSPIRVType(const Type *LLVMTy, SPIRVType *SpirvType);
public:
Register buildConstantInt(uint64_t Val, MachineIRBuilder &MIRBuilder,
SPIRVType *SpvType = nullptr, bool EmitIR = true);
+ Register getOrCreateConstInt(uint64_t Val, MachineInstr &I,
+ SPIRVType *SpvType, const SPIRVInstrInfo &TII);
Register buildConstantFP(APFloat Val, MachineIRBuilder &MIRBuilder,
SPIRVType *SpvType = nullptr);
+ Register getOrCreateConsIntVector(uint64_t Val, MachineInstr &I,
+ SPIRVType *SpvType,
+ const SPIRVInstrInfo &TII);
+ Register getOrCreateUndef(MachineInstr &I, SPIRVType *SpvType,
+ const SPIRVInstrInfo &TII);
Register
buildGlobalVariable(Register Reg, SPIRVType *BaseType, StringRef Name,
const GlobalValue *GV, SPIRV::StorageClass Storage,
@@ -193,19 +225,24 @@ public:
SPIRVType *getOrCreateSPIRVIntegerType(unsigned BitWidth, MachineInstr &I,
const SPIRVInstrInfo &TII);
SPIRVType *getOrCreateSPIRVBoolType(MachineIRBuilder &MIRBuilder);
+ SPIRVType *getOrCreateSPIRVBoolType(MachineInstr &I,
+ const SPIRVInstrInfo &TII);
SPIRVType *getOrCreateSPIRVVectorType(SPIRVType *BaseType,
unsigned NumElements,
MachineIRBuilder &MIRBuilder);
SPIRVType *getOrCreateSPIRVVectorType(SPIRVType *BaseType,
unsigned NumElements, MachineInstr &I,
const SPIRVInstrInfo &TII);
-
SPIRVType *getOrCreateSPIRVPointerType(
SPIRVType *BaseType, MachineIRBuilder &MIRBuilder,
SPIRV::StorageClass SClass = SPIRV::StorageClass::Function);
SPIRVType *getOrCreateSPIRVPointerType(
SPIRVType *BaseType, MachineInstr &I, const SPIRVInstrInfo &TII,
SPIRV::StorageClass SClass = SPIRV::StorageClass::Function);
+ SPIRVType *getOrCreateOpTypeFunctionWithArgs(
+ const Type *Ty, SPIRVType *RetType,
+ const SmallVectorImpl<SPIRVType *> &ArgTypes,
+ MachineIRBuilder &MIRBuilder);
};
} // end namespace llvm
#endif // LLLVM_LIB_TARGET_SPIRV_SPIRVTYPEMANAGER_H
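For readers of the header above: the duplicates tracker and VRegToTypeMap answer two different questions, one keyed by LLVM type and one keyed by virtual register. A rough sketch of their shapes, simplified from the members shown in this diff; the variable names here are illustrative and this is not a drop-in replacement for the class.

#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/IR/Type.h"

// DT-like view: which register defines the OpType... instruction for a given
// LLVM type in a given machine function.
llvm::DenseMap<const llvm::Type *,
               llvm::DenseMap<const llvm::MachineFunction *, llvm::Register>>
    TypeToTypeIdReg;

// VRegToTypeMap: which SPIR-V type (its defining MachineInstr) is assigned to
// an ordinary, non-type-declaring virtual register.
llvm::DenseMap<const llvm::MachineFunction *,
               llvm::DenseMap<llvm::Register, llvm::MachineInstr *>>
    VRegToType;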
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
index 754906308114..66d8b17b4296 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
@@ -52,7 +52,7 @@ bool SPIRVInstrInfo::isTypeDeclInstr(const MachineInstr &MI) const {
auto DefRegClass = MRI.getRegClassOrNull(MI.getOperand(0).getReg());
return DefRegClass && DefRegClass->getID() == SPIRV::TYPERegClass.getID();
} else {
- return false;
+ return MI.getOpcode() == SPIRV::OpTypeForwardPointer;
}
}
@@ -193,3 +193,15 @@ void SPIRVInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
auto &MRI = I->getMF()->getRegInfo();
MRI.replaceRegWith(DstOp.getReg(), SrcOp.getReg());
}
+
+bool SPIRVInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ if (MI.getOpcode() == SPIRV::GET_ID || MI.getOpcode() == SPIRV::GET_fID ||
+ MI.getOpcode() == SPIRV::GET_pID || MI.getOpcode() == SPIRV::GET_vfID ||
+ MI.getOpcode() == SPIRV::GET_vID) {
+ auto &MRI = MI.getMF()->getRegInfo();
+ MRI.replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(1).getReg());
+ MI.eraseFromParent();
+ return true;
+ }
+ return false;
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
index 2600d9cfca2e..334351c8eeae 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
@@ -48,6 +48,7 @@ public:
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg,
bool KillSrc) const override;
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
};
} // namespace llvm
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
index d6fec5fd0785..d1c20795f804 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
@@ -449,6 +449,7 @@ def OpCopyLogical: UnOp<"OpCopyLogical", 400>;
def OpSNegate: UnOp<"OpSNegate", 126>;
def OpFNegate: UnOpTyped<"OpFNegate", 127, fID, fneg>;
+def OpFNegateV: UnOpTyped<"OpFNegate", 127, vfID, fneg>;
defm OpIAdd: BinOpTypedGen<"OpIAdd", 128, add, 0, 1>;
defm OpFAdd: BinOpTypedGen<"OpFAdd", 129, fadd, 1, 1>;
@@ -618,8 +619,10 @@ def OpAtomicCompareExchange: Op<230, (outs ID:$res),
(ins TYPE:$ty, ID:$ptr, ID:$sc, ID:$eq,
ID:$neq, ID:$val, ID:$cmp),
"$res = OpAtomicCompareExchange $ty $ptr $sc $eq $neq $val $cmp">;
-// TODO Currently the following deprecated opcode is missing:
-// OpAtomicCompareExchangeWeak
+def OpAtomicCompareExchangeWeak: Op<231, (outs ID:$res),
+ (ins TYPE:$ty, ID:$ptr, ID:$sc, ID:$eq,
+ ID:$neq, ID:$val, ID:$cmp),
+ "$res = OpAtomicCompareExchangeWeak $ty $ptr $sc $eq $neq $val $cmp">;
def OpAtomicIIncrement: AtomicOp<"OpAtomicIIncrement", 232>;
def OpAtomicIDecrement: AtomicOp<"OpAtomicIDecrement", 233>;
@@ -660,6 +663,11 @@ def OpMemoryNamedBarrier: Op<329, (outs), (ins ID:$barr, ID:$mem, ID:$sem),
// 3.42.21. Group and Subgroup Instructions
+def OpGroupAsyncCopy: Op<259, (outs ID:$res), (ins TYPE:$ty, ID:$scope,
+ ID:$dst, ID:$src, ID:$nelts, ID:$stride, ID:$event),
+ "$res = OpGroupAsyncCopy $ty $scope $dst $src $nelts $stride $event">;
+def OpGroupWaitEvents: Op<260, (outs), (ins ID:$scope, ID:$nelts, ID:$elist),
+ "OpGroupWaitEvents $scope $nelts $elist">;
def OpGroupAll: Op<261, (outs ID:$res), (ins TYPE:$ty, ID:$scope, ID:$pr),
"$res = OpGroupAll $ty $scope $pr">;
def OpGroupAny: Op<262, (outs ID:$res), (ins TYPE:$ty, ID:$scope, ID:$pr),
@@ -680,6 +688,18 @@ def OpGroupUMax: OpGroup<"UMax", 270>;
def OpGroupSMax: OpGroup<"SMax", 271>;
// TODO: 3.42.22. Device-Side Enqueue Instructions
+def OpRetainEvent: Op<297, (outs), (ins ID:$event), "OpRetainEvent $event">;
+def OpReleaseEvent: Op<298, (outs), (ins ID:$event), "OpReleaseEvent $event">;
+def OpCreateUserEvent: Op<299, (outs ID:$res), (ins TYPE:$type),
+ "$res = OpCreateUserEvent $type">;
+def OpIsValidEvent: Op<300, (outs ID:$res), (ins TYPE:$type, ID:$event),
+ "$res = OpIsValidEvent $type $event ">;
+def OpSetUserEventStatus: Op<301, (outs), (ins ID:$event, ID:$status),
+ "OpSetUserEventStatus $event $status">;
+def OpCaptureEventProfilingInfo: Op<302, (outs),
+ (ins ID:$event, ID:$info, ID:$value),
+ "OpCaptureEventProfilingInfo $event $info $value">;
+
// TODO: 3.42.23. Pipe Instructions
// 3.42.24. Non-Uniform Instructions
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 90b921a06f21..9365fd22e4e7 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -197,6 +197,8 @@ void SPIRVInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
}
+static bool isImm(const MachineOperand &MO, MachineRegisterInfo *MRI);
+
// Defined in SPIRVLegalizerInfo.cpp.
extern bool isTypeFoldingSupported(unsigned Opcode);
@@ -335,6 +337,30 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg,
return selectUnOp(ResVReg, ResType, I, SPIRV::OpBitcast);
case TargetOpcode::G_ADDRSPACE_CAST:
return selectAddrSpaceCast(ResVReg, ResType, I);
+ case TargetOpcode::G_PTR_ADD: {
+    // Currently, G_PTR_ADD occurs only when translating global variables
+    // initialized with constant expressions such as GV + Const
+    // (see test opencl/basic/progvar_prog_scope_init.ll).
+    // TODO: extend the handler once other cases appear.
+ assert(I.getOperand(1).isReg() && I.getOperand(2).isReg());
+ Register GV = I.getOperand(1).getReg();
+ MachineRegisterInfo::def_instr_iterator II = MRI->def_instr_begin(GV);
+ assert(((*II).getOpcode() == TargetOpcode::G_GLOBAL_VALUE ||
+ (*II).getOpcode() == TargetOpcode::COPY ||
+ (*II).getOpcode() == SPIRV::OpVariable) &&
+ isImm(I.getOperand(2), MRI));
+ Register Idx = buildZerosVal(GR.getOrCreateSPIRVIntegerType(32, I, TII), I);
+ MachineBasicBlock &BB = *I.getParent();
+ auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpSpecConstantOp))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addImm(static_cast<uint32_t>(
+ SPIRV::Opcode::InBoundsPtrAccessChain))
+ .addUse(GV)
+ .addUse(Idx)
+ .addUse(I.getOperand(2).getReg());
+ return MIB.constrainAllUses(TII, TRI, RBI);
+ }
case TargetOpcode::G_ATOMICRMW_OR:
return selectAtomicRMW(ResVReg, ResType, I, SPIRV::OpAtomicOr);
@@ -387,23 +413,6 @@ bool SPIRVInstructionSelector::selectUnOp(Register ResVReg,
Opcode);
}
-static SPIRV::MemorySemantics getMemSemantics(AtomicOrdering Ord) {
- switch (Ord) {
- case AtomicOrdering::Acquire:
- return SPIRV::MemorySemantics::Acquire;
- case AtomicOrdering::Release:
- return SPIRV::MemorySemantics::Release;
- case AtomicOrdering::AcquireRelease:
- return SPIRV::MemorySemantics::AcquireRelease;
- case AtomicOrdering::SequentiallyConsistent:
- return SPIRV::MemorySemantics::SequentiallyConsistent;
- case AtomicOrdering::Unordered:
- case AtomicOrdering::Monotonic:
- case AtomicOrdering::NotAtomic:
- return SPIRV::MemorySemantics::None;
- }
-}
-
static SPIRV::Scope getScope(SyncScope::ID Ord) {
switch (Ord) {
case SyncScope::SingleThread:
@@ -484,16 +493,15 @@ bool SPIRVInstructionSelector::selectMemOperation(Register ResVReg,
MachineInstr &I) const {
MachineBasicBlock &BB = *I.getParent();
auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCopyMemorySized))
- .addDef(I.getOperand(0).getReg())
+ .addUse(I.getOperand(0).getReg())
.addUse(I.getOperand(1).getReg())
.addUse(I.getOperand(2).getReg());
if (I.getNumMemOperands())
addMemoryOperands(*I.memoperands_begin(), MIB);
bool Result = MIB.constrainAllUses(TII, TRI, RBI);
- if (ResVReg.isValid() && ResVReg != MIB->getOperand(0).getReg()) {
+ if (ResVReg.isValid() && ResVReg != MIB->getOperand(0).getReg())
BuildMI(BB, I, I.getDebugLoc(), TII.get(TargetOpcode::COPY), ResVReg)
.addUse(MIB->getOperand(0).getReg());
- }
return Result;
}
@@ -541,36 +549,71 @@ bool SPIRVInstructionSelector::selectFence(MachineInstr &I) const {
bool SPIRVInstructionSelector::selectAtomicCmpXchg(Register ResVReg,
const SPIRVType *ResType,
MachineInstr &I) const {
- assert(I.hasOneMemOperand());
- const MachineMemOperand *MemOp = *I.memoperands_begin();
- uint32_t Scope = static_cast<uint32_t>(getScope(MemOp->getSyncScopeID()));
- Register ScopeReg = buildI32Constant(Scope, I);
-
+ Register ScopeReg;
+ Register MemSemEqReg;
+ Register MemSemNeqReg;
Register Ptr = I.getOperand(2).getReg();
+ if (I.getOpcode() != TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS) {
+ assert(I.hasOneMemOperand());
+ const MachineMemOperand *MemOp = *I.memoperands_begin();
+ unsigned Scope = static_cast<uint32_t>(getScope(MemOp->getSyncScopeID()));
+ ScopeReg = buildI32Constant(Scope, I);
+
+ unsigned ScSem = static_cast<uint32_t>(
+ getMemSemanticsForStorageClass(GR.getPointerStorageClass(Ptr)));
+ AtomicOrdering AO = MemOp->getSuccessOrdering();
+ unsigned MemSemEq = static_cast<uint32_t>(getMemSemantics(AO)) | ScSem;
+ MemSemEqReg = buildI32Constant(MemSemEq, I);
+ AtomicOrdering FO = MemOp->getFailureOrdering();
+ unsigned MemSemNeq = static_cast<uint32_t>(getMemSemantics(FO)) | ScSem;
+ MemSemNeqReg =
+ MemSemEq == MemSemNeq ? MemSemEqReg : buildI32Constant(MemSemNeq, I);
+ } else {
+ ScopeReg = I.getOperand(5).getReg();
+ MemSemEqReg = I.getOperand(6).getReg();
+ MemSemNeqReg = I.getOperand(7).getReg();
+ }
+
Register Cmp = I.getOperand(3).getReg();
Register Val = I.getOperand(4).getReg();
-
SPIRVType *SpvValTy = GR.getSPIRVTypeForVReg(Val);
- SPIRV::StorageClass SC = GR.getPointerStorageClass(Ptr);
- uint32_t ScSem = static_cast<uint32_t>(getMemSemanticsForStorageClass(SC));
- AtomicOrdering AO = MemOp->getSuccessOrdering();
- uint32_t MemSemEq = static_cast<uint32_t>(getMemSemantics(AO)) | ScSem;
- Register MemSemEqReg = buildI32Constant(MemSemEq, I);
- AtomicOrdering FO = MemOp->getFailureOrdering();
- uint32_t MemSemNeq = static_cast<uint32_t>(getMemSemantics(FO)) | ScSem;
- Register MemSemNeqReg =
- MemSemEq == MemSemNeq ? MemSemEqReg : buildI32Constant(MemSemNeq, I);
+ Register ACmpRes = MRI->createVirtualRegister(&SPIRV::IDRegClass);
const DebugLoc &DL = I.getDebugLoc();
- return BuildMI(*I.getParent(), I, DL, TII.get(SPIRV::OpAtomicCompareExchange))
- .addDef(ResVReg)
- .addUse(GR.getSPIRVTypeID(SpvValTy))
- .addUse(Ptr)
- .addUse(ScopeReg)
- .addUse(MemSemEqReg)
- .addUse(MemSemNeqReg)
- .addUse(Val)
- .addUse(Cmp)
- .constrainAllUses(TII, TRI, RBI);
+ bool Result =
+ BuildMI(*I.getParent(), I, DL, TII.get(SPIRV::OpAtomicCompareExchange))
+ .addDef(ACmpRes)
+ .addUse(GR.getSPIRVTypeID(SpvValTy))
+ .addUse(Ptr)
+ .addUse(ScopeReg)
+ .addUse(MemSemEqReg)
+ .addUse(MemSemNeqReg)
+ .addUse(Val)
+ .addUse(Cmp)
+ .constrainAllUses(TII, TRI, RBI);
+ Register CmpSuccReg = MRI->createVirtualRegister(&SPIRV::IDRegClass);
+ SPIRVType *BoolTy = GR.getOrCreateSPIRVBoolType(I, TII);
+ Result |= BuildMI(*I.getParent(), I, DL, TII.get(SPIRV::OpIEqual))
+ .addDef(CmpSuccReg)
+ .addUse(GR.getSPIRVTypeID(BoolTy))
+ .addUse(ACmpRes)
+ .addUse(Cmp)
+ .constrainAllUses(TII, TRI, RBI);
+ Register TmpReg = MRI->createVirtualRegister(&SPIRV::IDRegClass);
+ Result |= BuildMI(*I.getParent(), I, DL, TII.get(SPIRV::OpCompositeInsert))
+ .addDef(TmpReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(ACmpRes)
+ .addUse(GR.getOrCreateUndef(I, ResType, TII))
+ .addImm(0)
+ .constrainAllUses(TII, TRI, RBI);
+ Result |= BuildMI(*I.getParent(), I, DL, TII.get(SPIRV::OpCompositeInsert))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(CmpSuccReg)
+ .addUse(TmpReg)
+ .addImm(1)
+ .constrainAllUses(TII, TRI, RBI);
+ return Result;
}
static bool isGenericCastablePtr(SPIRV::StorageClass SC) {
@@ -592,6 +635,27 @@ static bool isGenericCastablePtr(SPIRV::StorageClass SC) {
bool SPIRVInstructionSelector::selectAddrSpaceCast(Register ResVReg,
const SPIRVType *ResType,
MachineInstr &I) const {
+  // If the AddrSpaceCast has a single user and that user is an
+  // OpConstantComposite, an OpVariable, or an spv_init_global intrinsic,
+  // select OpSpecConstantOp instead.
+ auto UIs = MRI->use_instructions(ResVReg);
+ if (!UIs.empty() && ++UIs.begin() == UIs.end() &&
+ (UIs.begin()->getOpcode() == SPIRV::OpConstantComposite ||
+ UIs.begin()->getOpcode() == SPIRV::OpVariable ||
+ isSpvIntrinsic(*UIs.begin(), Intrinsic::spv_init_global))) {
+ Register NewReg = I.getOperand(1).getReg();
+ MachineBasicBlock &BB = *I.getParent();
+ SPIRVType *SpvBaseTy = GR.getOrCreateSPIRVIntegerType(8, I, TII);
+ ResType = GR.getOrCreateSPIRVPointerType(SpvBaseTy, I, TII,
+ SPIRV::StorageClass::Generic);
+ bool Result =
+ BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpSpecConstantOp))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addImm(static_cast<uint32_t>(SPIRV::Opcode::PtrCastToGeneric))
+ .addUse(NewReg)
+ .constrainAllUses(TII, TRI, RBI);
+ return Result;
+ }
Register SrcPtr = I.getOperand(1).getReg();
SPIRVType *SrcPtrTy = GR.getSPIRVTypeForVReg(SrcPtr);
SPIRV::StorageClass SrcSC = GR.getPointerStorageClass(SrcPtr);
@@ -842,7 +906,9 @@ bool SPIRVInstructionSelector::selectFCmp(Register ResVReg,
Register SPIRVInstructionSelector::buildZerosVal(const SPIRVType *ResType,
MachineInstr &I) const {
- return buildI32Constant(0, I, ResType);
+ if (ResType->getOpcode() == SPIRV::OpTypeVector)
+ return GR.getOrCreateConsIntVector(0, I, ResType, TII);
+ return GR.getOrCreateConstInt(0, I, ResType, TII);
}
Register SPIRVInstructionSelector::buildOnesVal(bool AllOnes,
@@ -851,20 +917,9 @@ Register SPIRVInstructionSelector::buildOnesVal(bool AllOnes,
unsigned BitWidth = GR.getScalarOrVectorBitWidth(ResType);
APInt One = AllOnes ? APInt::getAllOnesValue(BitWidth)
: APInt::getOneBitSet(BitWidth, 0);
- Register OneReg = buildI32Constant(One.getZExtValue(), I, ResType);
- if (ResType->getOpcode() == SPIRV::OpTypeVector) {
- const unsigned NumEles = ResType->getOperand(2).getImm();
- Register OneVec = MRI->createVirtualRegister(&SPIRV::IDRegClass);
- unsigned Opcode = SPIRV::OpConstantComposite;
- auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode))
- .addDef(OneVec)
- .addUse(GR.getSPIRVTypeID(ResType));
- for (unsigned i = 0; i < NumEles; ++i)
- MIB.addUse(OneReg);
- constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
- return OneVec;
- }
- return OneReg;
+ if (ResType->getOpcode() == SPIRV::OpTypeVector)
+ return GR.getOrCreateConsIntVector(One.getZExtValue(), I, ResType, TII);
+ return GR.getOrCreateConstInt(One.getZExtValue(), I, ResType, TII);
}
bool SPIRVInstructionSelector::selectSelect(Register ResVReg,
@@ -959,13 +1014,23 @@ bool SPIRVInstructionSelector::selectConst(Register ResVReg,
const SPIRVType *ResType,
const APInt &Imm,
MachineInstr &I) const {
- assert(ResType->getOpcode() != SPIRV::OpTypePointer || Imm.isNullValue());
+ unsigned TyOpcode = ResType->getOpcode();
+ assert(TyOpcode != SPIRV::OpTypePointer || Imm.isNullValue());
MachineBasicBlock &BB = *I.getParent();
- if (ResType->getOpcode() == SPIRV::OpTypePointer && Imm.isNullValue()) {
+ if ((TyOpcode == SPIRV::OpTypePointer || TyOpcode == SPIRV::OpTypeEvent) &&
+ Imm.isNullValue())
return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull))
.addDef(ResVReg)
.addUse(GR.getSPIRVTypeID(ResType))
.constrainAllUses(TII, TRI, RBI);
+ if (TyOpcode == SPIRV::OpTypeInt) {
+ Register Reg = GR.getOrCreateConstInt(Imm.getZExtValue(), I, ResType, TII);
+ if (Reg == ResVReg)
+ return true;
+ return BuildMI(BB, I, I.getDebugLoc(), TII.get(TargetOpcode::COPY))
+ .addDef(ResVReg)
+ .addUse(Reg)
+ .constrainAllUses(TII, TRI, RBI);
}
auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantI))
.addDef(ResVReg)
@@ -1006,29 +1071,29 @@ bool SPIRVInstructionSelector::selectInsertVal(Register ResVReg,
const SPIRVType *ResType,
MachineInstr &I) const {
MachineBasicBlock &BB = *I.getParent();
- return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCompositeInsert))
- .addDef(ResVReg)
- .addUse(GR.getSPIRVTypeID(ResType))
- // object to insert
- .addUse(I.getOperand(3).getReg())
- // composite to insert into
- .addUse(I.getOperand(2).getReg())
- // TODO: support arbitrary number of indices
- .addImm(foldImm(I.getOperand(4), MRI))
- .constrainAllUses(TII, TRI, RBI);
+ auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCompositeInsert))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ // object to insert
+ .addUse(I.getOperand(3).getReg())
+ // composite to insert into
+ .addUse(I.getOperand(2).getReg());
+ for (unsigned i = 4; i < I.getNumOperands(); i++)
+ MIB.addImm(foldImm(I.getOperand(i), MRI));
+ return MIB.constrainAllUses(TII, TRI, RBI);
}
bool SPIRVInstructionSelector::selectExtractVal(Register ResVReg,
const SPIRVType *ResType,
MachineInstr &I) const {
MachineBasicBlock &BB = *I.getParent();
- return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCompositeExtract))
- .addDef(ResVReg)
- .addUse(GR.getSPIRVTypeID(ResType))
- .addUse(I.getOperand(2).getReg())
- // TODO: support arbitrary number of indices
- .addImm(foldImm(I.getOperand(3), MRI))
- .constrainAllUses(TII, TRI, RBI);
+ auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCompositeExtract))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(I.getOperand(2).getReg());
+ for (unsigned i = 3; i < I.getNumOperands(); i++)
+ MIB.addImm(foldImm(I.getOperand(i), MRI));
+ return MIB.constrainAllUses(TII, TRI, RBI);
}
bool SPIRVInstructionSelector::selectInsertElt(Register ResVReg,
@@ -1154,6 +1219,9 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
}
return MIB.constrainAllUses(TII, TRI, RBI);
} break;
+ case Intrinsic::spv_cmpxchg:
+ return selectAtomicCmpXchg(ResVReg, ResType, I);
+ break;
default:
llvm_unreachable("Intrinsic selection not implemented");
}
@@ -1239,8 +1307,32 @@ bool SPIRVInstructionSelector::selectGlobalValue(
GV->getType(), MIRBuilder, SPIRV::AccessQualifier::ReadWrite, false);
std::string GlobalIdent = GV->getGlobalIdentifier();
- // TODO: suport @llvm.global.annotations.
+  // Functions may appear as operands, e.g. in tests with blocks of
+  // instructions (see transcoding/global_block.ll). Such operands are unused
+  // and are substituted by zero constants; their type is always expected to
+  // be OpTypePointer Function %uchar.
+ if (isa<Function>(GV)) {
+ const Constant *ConstVal = GV;
+ MachineBasicBlock &BB = *I.getParent();
+ Register NewReg = GR.find(ConstVal, GR.CurMF);
+ if (!NewReg.isValid()) {
+ SPIRVType *SpvBaseTy = GR.getOrCreateSPIRVIntegerType(8, I, TII);
+ ResType = GR.getOrCreateSPIRVPointerType(SpvBaseTy, I, TII);
+ Register NewReg = ResVReg;
+ GR.add(ConstVal, GR.CurMF, NewReg);
+ return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull))
+ .addDef(NewReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .constrainAllUses(TII, TRI, RBI);
+ }
+ assert(NewReg != ResVReg);
+ return BuildMI(BB, I, I.getDebugLoc(), TII.get(TargetOpcode::COPY))
+ .addDef(ResVReg)
+ .addUse(NewReg)
+ .constrainAllUses(TII, TRI, RBI);
+ }
auto GlobalVar = cast<GlobalVariable>(GV);
+ assert(GlobalVar->getName() != "llvm.global.annotations");
bool HasInit = GlobalVar->hasInitializer() &&
!isa<UndefValue>(GlobalVar->getInitializer());
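The reworked selectAtomicCmpXchg above rebuilds the aggregate that LLVM's cmpxchg produces: the old value comes straight from OpAtomicCompareExchange, the success flag is recomputed with OpIEqual against the comparator, and both are packed with two OpCompositeInsert instructions starting from OpUndef. A host-side analogue of that sequence, assuming a 32-bit payload; CmpXchgResult and cmpxchg32 are illustrative names, not part of the patch.

#include <atomic>
#include <cstdint>

struct CmpXchgResult {
  uint32_t Old;  // OpAtomicCompareExchange result (ACmpRes)
  bool Success;  // OpIEqual of ACmpRes and the expected value (CmpSuccReg)
};

CmpXchgResult cmpxchg32(std::atomic<uint32_t> &Ptr, uint32_t Cmp, uint32_t Val) {
  uint32_t Old = Cmp;
  Ptr.compare_exchange_strong(Old, Val); // Old now holds the previous value
  // The selector recomputes the flag by comparison rather than using a native
  // boolean result, which is what the OpIEqual above models.
  return {Old, Old == Cmp};              // the two OpCompositeInsert steps
}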
diff --git a/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp b/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp
index 8e4ab973bf07..8aaac50c94d7 100644
--- a/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp
@@ -45,7 +45,12 @@ void SPIRVMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI,
break;
}
case MachineOperand::MO_Immediate:
- MCOp = MCOperand::createImm(MO.getImm());
+ if (MI->getOpcode() == SPIRV::OpExtInst && i == 2) {
+ Register Reg = MAI->getExtInstSetReg(MO.getImm());
+ MCOp = MCOperand::createReg(Reg);
+ } else {
+ MCOp = MCOperand::createImm(MO.getImm());
+ }
break;
case MachineOperand::MO_FPImmediate:
MCOp = MCOperand::createDFPImm(
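This lowering assumes that every extended-instruction-set number appearing as operand 2 of an OpExtInst has already been mapped to a module-wide ID register (ExtInstSetMap is filled in SPIRVModuleAnalysis below). A minimal sketch of that get-or-assign step; getOrAssignSetReg and NextID are illustrative names, not part of the patch.

#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/Register.h"

llvm::Register getOrAssignSetReg(llvm::DenseMap<unsigned, llvm::Register> &Map,
                                 unsigned Set, unsigned &NextID) {
  auto It = Map.find(Set);
  if (It != Map.end())
    return It->second;                            // already numbered
  llvm::Register R = llvm::Register::index2VirtReg(NextID++);
  Map[Set] = R;                                   // one ID register per set
  return R;
}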
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index a39df5234935..143ddf7297dc 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -60,62 +60,50 @@ void SPIRVModuleAnalysis::setBaseInfo(const Module &M) {
MAI.InstrsToDelete.clear();
MAI.FuncNameMap.clear();
MAI.GlobalVarList.clear();
+ MAI.ExtInstSetMap.clear();
// TODO: determine memory model and source language from the configuration.
- MAI.Mem = SPIRV::MemoryModel::OpenCL;
- MAI.SrcLang = SPIRV::SourceLanguage::OpenCL_C;
- unsigned PtrSize = ST->getPointerSize();
- MAI.Addr = PtrSize == 32 ? SPIRV::AddressingModel::Physical32
- : PtrSize == 64 ? SPIRV::AddressingModel::Physical64
- : SPIRV::AddressingModel::Logical;
+ if (auto MemModel = M.getNamedMetadata("spirv.MemoryModel")) {
+ auto MemMD = MemModel->getOperand(0);
+ MAI.Addr = static_cast<SPIRV::AddressingModel>(getMetadataUInt(MemMD, 0));
+ MAI.Mem = static_cast<SPIRV::MemoryModel>(getMetadataUInt(MemMD, 1));
+ } else {
+ MAI.Mem = SPIRV::MemoryModel::OpenCL;
+ unsigned PtrSize = ST->getPointerSize();
+ MAI.Addr = PtrSize == 32 ? SPIRV::AddressingModel::Physical32
+ : PtrSize == 64 ? SPIRV::AddressingModel::Physical64
+ : SPIRV::AddressingModel::Logical;
+ }
// Get the OpenCL version number from metadata.
// TODO: support other source languages.
- MAI.SrcLangVersion = 0;
if (auto VerNode = M.getNamedMetadata("opencl.ocl.version")) {
- // Construct version literal according to OpenCL 2.2 environment spec.
+ MAI.SrcLang = SPIRV::SourceLanguage::OpenCL_C;
+ // Construct version literal in accordance with SPIRV-LLVM-Translator.
+ // TODO: support multiple OCL version metadata.
+ assert(VerNode->getNumOperands() > 0 && "Invalid SPIR");
auto VersionMD = VerNode->getOperand(0);
unsigned MajorNum = getMetadataUInt(VersionMD, 0, 2);
unsigned MinorNum = getMetadataUInt(VersionMD, 1);
unsigned RevNum = getMetadataUInt(VersionMD, 2);
- MAI.SrcLangVersion = 0 | (MajorNum << 16) | (MinorNum << 8) | RevNum;
+ MAI.SrcLangVersion = (MajorNum * 100 + MinorNum) * 1000 + RevNum;
+ } else {
+ MAI.SrcLang = SPIRV::SourceLanguage::Unknown;
+ MAI.SrcLangVersion = 0;
}
-}
-// True if there is an instruction in the MS list with all the same operands as
-// the given instruction has (after the given starting index).
-// TODO: maybe it needs to check Opcodes too.
-static bool findSameInstrInMS(const MachineInstr &A,
- SPIRV::ModuleSectionType MSType,
- SPIRV::ModuleAnalysisInfo &MAI,
- bool UpdateRegAliases,
- unsigned StartOpIndex = 0) {
- for (const auto *B : MAI.MS[MSType]) {
- const unsigned NumAOps = A.getNumOperands();
- if (NumAOps == B->getNumOperands() && A.getNumDefs() == B->getNumDefs()) {
- bool AllOpsMatch = true;
- for (unsigned i = StartOpIndex; i < NumAOps && AllOpsMatch; ++i) {
- if (A.getOperand(i).isReg() && B->getOperand(i).isReg()) {
- Register RegA = A.getOperand(i).getReg();
- Register RegB = B->getOperand(i).getReg();
- AllOpsMatch = MAI.getRegisterAlias(A.getMF(), RegA) ==
- MAI.getRegisterAlias(B->getMF(), RegB);
- } else {
- AllOpsMatch = A.getOperand(i).isIdenticalTo(B->getOperand(i));
- }
- }
- if (AllOpsMatch) {
- if (UpdateRegAliases) {
- assert(A.getOperand(0).isReg() && B->getOperand(0).isReg());
- Register LocalReg = A.getOperand(0).getReg();
- Register GlobalReg =
- MAI.getRegisterAlias(B->getMF(), B->getOperand(0).getReg());
- MAI.setRegisterAlias(A.getMF(), LocalReg, GlobalReg);
- }
- return true;
- }
+ if (auto ExtNode = M.getNamedMetadata("opencl.used.extensions")) {
+ for (unsigned I = 0, E = ExtNode->getNumOperands(); I != E; ++I) {
+ MDNode *MD = ExtNode->getOperand(I);
+ if (!MD || MD->getNumOperands() == 0)
+ continue;
+ for (unsigned J = 0, N = MD->getNumOperands(); J != N; ++J)
+ MAI.SrcExt.insert(cast<MDString>(MD->getOperand(J))->getString());
}
}
- return false;
+
+ // TODO: check if it's required by default.
+ MAI.ExtInstSetMap[static_cast<unsigned>(SPIRV::InstructionSet::OpenCL_std)] =
+ Register::index2VirtReg(MAI.getNextID());
}
// Collect MI which defines the register in the given machine function.
@@ -135,7 +123,7 @@ void SPIRVModuleAnalysis::collectGlobalEntities(
const std::vector<SPIRV::DTSortableEntry *> &DepsGraph,
SPIRV::ModuleSectionType MSType,
std::function<bool(const SPIRV::DTSortableEntry *)> Pred,
- bool UsePreOrder) {
+ bool UsePreOrder = false) {
DenseSet<const SPIRV::DTSortableEntry *> Visited;
for (const auto *E : DepsGraph) {
std::function<void(const SPIRV::DTSortableEntry *)> RecHoistUtil;
@@ -188,13 +176,41 @@ void SPIRVModuleAnalysis::processDefInstrs(const Module &M) {
collectGlobalEntities(
DepsGraph, SPIRV::MB_TypeConstVars,
- [](const SPIRV::DTSortableEntry *E) { return !E->getIsFunc(); }, false);
+ [](const SPIRV::DTSortableEntry *E) { return !E->getIsFunc(); });
collectGlobalEntities(
DepsGraph, SPIRV::MB_ExtFuncDecls,
[](const SPIRV::DTSortableEntry *E) { return E->getIsFunc(); }, true);
}
+// True if there is an instruction in the MS list with all the same operands as
+// the given instruction has (after the given starting index).
+// TODO: maybe it needs to check Opcodes too.
+static bool findSameInstrInMS(const MachineInstr &A,
+ SPIRV::ModuleSectionType MSType,
+ SPIRV::ModuleAnalysisInfo &MAI,
+ unsigned StartOpIndex = 0) {
+ for (const auto *B : MAI.MS[MSType]) {
+ const unsigned NumAOps = A.getNumOperands();
+ if (NumAOps != B->getNumOperands() || A.getNumDefs() != B->getNumDefs())
+ continue;
+ bool AllOpsMatch = true;
+ for (unsigned i = StartOpIndex; i < NumAOps && AllOpsMatch; ++i) {
+ if (A.getOperand(i).isReg() && B->getOperand(i).isReg()) {
+ Register RegA = A.getOperand(i).getReg();
+ Register RegB = B->getOperand(i).getReg();
+ AllOpsMatch = MAI.getRegisterAlias(A.getMF(), RegA) ==
+ MAI.getRegisterAlias(B->getMF(), RegB);
+ } else {
+ AllOpsMatch = A.getOperand(i).isIdenticalTo(B->getOperand(i));
+ }
+ }
+ if (AllOpsMatch)
+ return true;
+ }
+ return false;
+}
+
// Look for IDs declared with Import linkage, and map the imported name string
// to the register defining that variable (which will usually be the result of
// an OpFunction). This lets us call externally imported functions using
@@ -228,12 +244,16 @@ void SPIRVModuleAnalysis::collectFuncNames(MachineInstr &MI,
// numbering has already occurred by this point. We can directly compare reg
// arguments when detecting duplicates.
static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI,
- SPIRV::ModuleSectionType MSType) {
+ SPIRV::ModuleSectionType MSType,
+ bool Append = true) {
MAI.setSkipEmission(&MI);
- if (findSameInstrInMS(MI, MSType, MAI, false))
+ if (findSameInstrInMS(MI, MSType, MAI))
return; // Found a duplicate, so don't add it.
// No duplicates, so add it.
- MAI.MS[MSType].push_back(&MI);
+ if (Append)
+ MAI.MS[MSType].push_back(&MI);
+ else
+ MAI.MS[MSType].insert(MAI.MS[MSType].begin(), &MI);
}
// Some global instructions make reference to function-local ID regs, so cannot
@@ -256,15 +276,22 @@ void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) {
} else if (TII->isDecorationInstr(MI)) {
collectOtherInstr(MI, MAI, SPIRV::MB_Annotations);
collectFuncNames(MI, *F);
+ } else if (TII->isConstantInstr(MI)) {
+        // OpSpecConstant*s are currently not in DT,
+        // but they still need to be collected.
+ collectOtherInstr(MI, MAI, SPIRV::MB_TypeConstVars);
} else if (OpCode == SPIRV::OpFunction) {
collectFuncNames(MI, *F);
+ } else if (OpCode == SPIRV::OpTypeForwardPointer) {
+ collectOtherInstr(MI, MAI, SPIRV::MB_TypeConstVars, false);
}
}
}
}
// Number registers in all functions globally from 0 onwards and store
-// the result in global register alias table.
+// the result in global register alias table. Some registers are already
+// numbered in collectGlobalEntities.
void SPIRVModuleAnalysis::numberRegistersGlobally(const Module &M) {
for (auto F = M.begin(), E = M.end(); F != E; ++F) {
if ((*F).isDeclaration())
@@ -282,11 +309,50 @@ void SPIRVModuleAnalysis::numberRegistersGlobally(const Module &M) {
Register NewReg = Register::index2VirtReg(MAI.getNextID());
MAI.setRegisterAlias(MF, Reg, NewReg);
}
+ if (MI.getOpcode() != SPIRV::OpExtInst)
+ continue;
+ auto Set = MI.getOperand(2).getImm();
+ if (MAI.ExtInstSetMap.find(Set) == MAI.ExtInstSetMap.end())
+ MAI.ExtInstSetMap[Set] = Register::index2VirtReg(MAI.getNextID());
}
}
}
}
+// Find OpIEqual and OpBranchConditional instructions originating from
+// OpSwitches and mark them as skipped for emission. Also mark an MBB as
+// skipped if it contains only these instructions.
+static void processSwitches(const Module &M, SPIRV::ModuleAnalysisInfo &MAI,
+ MachineModuleInfo *MMI) {
+ DenseSet<Register> SwitchRegs;
+ for (auto F = M.begin(), E = M.end(); F != E; ++F) {
+ MachineFunction *MF = MMI->getMachineFunction(*F);
+ if (!MF)
+ continue;
+ for (MachineBasicBlock &MBB : *MF)
+ for (MachineInstr &MI : MBB) {
+ if (MAI.getSkipEmission(&MI))
+ continue;
+ if (MI.getOpcode() == SPIRV::OpSwitch) {
+ assert(MI.getOperand(0).isReg());
+ SwitchRegs.insert(MI.getOperand(0).getReg());
+ }
+ if (MI.getOpcode() != SPIRV::OpIEqual || !MI.getOperand(2).isReg() ||
+ !SwitchRegs.contains(MI.getOperand(2).getReg()))
+ continue;
+ Register CmpReg = MI.getOperand(0).getReg();
+ MachineInstr *CBr = MI.getNextNode();
+ assert(CBr && CBr->getOpcode() == SPIRV::OpBranchConditional &&
+ CBr->getOperand(0).isReg() &&
+ CBr->getOperand(0).getReg() == CmpReg);
+ MAI.setSkipEmission(&MI);
+ MAI.setSkipEmission(CBr);
+ if (&MBB.front() == &MI && &MBB.back() == CBr)
+ MAI.MBBsToSkip.insert(&MBB);
+ }
+ }
+}
+
struct SPIRV::ModuleAnalysisInfo SPIRVModuleAnalysis::MAI;
void SPIRVModuleAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
@@ -305,7 +371,9 @@ bool SPIRVModuleAnalysis::runOnModule(Module &M) {
setBaseInfo(M);
- // TODO: Process type/const/global var/func decl instructions, number their
+ processSwitches(M, MAI, MMI);
+
+ // Process type/const/global var/func decl instructions, number their
// destination registers from 0 to N, collect Extensions and Capabilities.
processDefInstrs(M);
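The source-language version literal built above now follows the SPIRV-LLVM-Translator convention instead of the previously used packed 16/8-bit encoding. As a worked example, OpenCL C 2.0 with revision 0 becomes (2 * 100 + 0) * 1000 + 0 = 200000; the helper below restates the same formula (makeSrcLangVersion is an illustrative name).

// Same formula as MAI.SrcLangVersion above; e.g. makeSrcLangVersion(2, 0, 0)
// yields 200000 for OpenCL C 2.0.
unsigned makeSrcLangVersion(unsigned Major, unsigned Minor, unsigned Rev) {
  return (Major * 100 + Minor) * 1000 + Rev;
}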
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
index 585868909d28..9bcdf6e9ae2a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
@@ -52,6 +52,9 @@ struct ModuleAnalysisInfo {
SPIRV::AddressingModel Addr;
SPIRV::SourceLanguage SrcLang;
unsigned SrcLangVersion;
+ StringSet<> SrcExt;
+ // Maps ExtInstSet to corresponding ID register.
+ DenseMap<unsigned, Register> ExtInstSetMap;
// Contains the list of all global OpVariables in the module.
SmallVector<MachineInstr *, 4> GlobalVarList;
// Maps function names to corresponding function ID registers.
@@ -59,6 +62,9 @@ struct ModuleAnalysisInfo {
// The set contains machine instructions which are necessary
// for correct MIR but will not be emitted in function bodies.
DenseSet<MachineInstr *> InstrsToDelete;
+ // The set contains machine basic blocks which are necessary
+ // for correct MIR but will not be emitted.
+ DenseSet<MachineBasicBlock *> MBBsToSkip;
// The table contains global aliases of local registers for each machine
// function. The aliases are used to substitute local registers during
// code emission.
@@ -75,6 +81,7 @@ struct ModuleAnalysisInfo {
assert(FuncReg != FuncNameMap.end() && "Cannot find function Id");
return FuncReg->second;
}
+ Register getExtInstSetReg(unsigned SetNum) { return ExtInstSetMap[SetNum]; }
InstrList &getMSInstrs(unsigned MSType) { return MS[MSType]; }
void setSkipEmission(MachineInstr *MI) { InstrsToDelete.insert(MI); }
bool getSkipEmission(const MachineInstr *MI) {
@@ -123,7 +130,6 @@ public:
private:
void setBaseInfo(const Module &M);
- template <typename T> void collectTypesConstsVars();
void collectGlobalEntities(
const std::vector<SPIRV::DTSortableEntry *> &DepsGraph,
SPIRV::ModuleSectionType MSType,
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index 687f84046650..e620226dcc7a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -39,11 +39,58 @@ public:
};
} // namespace
-static bool isSpvIntrinsic(MachineInstr &MI, Intrinsic::ID IntrinsicID) {
- if (MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS &&
- MI.getIntrinsicID() == IntrinsicID)
- return true;
- return false;
+static void addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ DenseMap<MachineInstr *, Register> RegsAlreadyAddedToDT;
+ SmallVector<MachineInstr *, 10> ToErase, ToEraseComposites;
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (!isSpvIntrinsic(MI, Intrinsic::spv_track_constant))
+ continue;
+ ToErase.push_back(&MI);
+ auto *Const =
+ cast<Constant>(cast<ConstantAsMetadata>(
+ MI.getOperand(3).getMetadata()->getOperand(0))
+ ->getValue());
+ if (auto *GV = dyn_cast<GlobalValue>(Const)) {
+ Register Reg = GR->find(GV, &MF);
+ if (!Reg.isValid())
+ GR->add(GV, &MF, MI.getOperand(2).getReg());
+ else
+ RegsAlreadyAddedToDT[&MI] = Reg;
+ } else {
+ Register Reg = GR->find(Const, &MF);
+ if (!Reg.isValid()) {
+ if (auto *ConstVec = dyn_cast<ConstantDataVector>(Const)) {
+ auto *BuildVec = MRI.getVRegDef(MI.getOperand(2).getReg());
+ assert(BuildVec &&
+ BuildVec->getOpcode() == TargetOpcode::G_BUILD_VECTOR);
+ for (unsigned i = 0; i < ConstVec->getNumElements(); ++i)
+ GR->add(ConstVec->getElementAsConstant(i), &MF,
+ BuildVec->getOperand(1 + i).getReg());
+ }
+ GR->add(Const, &MF, MI.getOperand(2).getReg());
+ } else {
+ RegsAlreadyAddedToDT[&MI] = Reg;
+ // This MI is unused and will be removed. If the MI uses
+ // const_composite, it will be unused and should be removed too.
+ assert(MI.getOperand(2).isReg() && "Reg operand is expected");
+ MachineInstr *SrcMI = MRI.getVRegDef(MI.getOperand(2).getReg());
+ if (SrcMI && isSpvIntrinsic(*SrcMI, Intrinsic::spv_const_composite))
+ ToEraseComposites.push_back(SrcMI);
+ }
+ }
+ }
+ }
+ for (MachineInstr *MI : ToErase) {
+ Register Reg = MI->getOperand(2).getReg();
+ if (RegsAlreadyAddedToDT.find(MI) != RegsAlreadyAddedToDT.end())
+ Reg = RegsAlreadyAddedToDT[MI];
+ MRI.replaceRegWith(MI->getOperand(0).getReg(), Reg);
+ MI->eraseFromParent();
+ }
+ for (MachineInstr *MI : ToEraseComposites)
+ MI->eraseFromParent();
}
static void foldConstantsIntoIntrinsics(MachineFunction &MF) {
@@ -120,6 +167,7 @@ static SPIRVType *propagateSPIRVType(MachineInstr *MI, SPIRVGlobalRegistry *GR,
}
case TargetOpcode::G_TRUNC:
case TargetOpcode::G_ADDRSPACE_CAST:
+ case TargetOpcode::G_PTR_ADD:
case TargetOpcode::COPY: {
MachineOperand &Op = MI->getOperand(1);
MachineInstr *Def = Op.isReg() ? MRI.getVRegDef(Op.getReg()) : nullptr;
@@ -308,6 +356,22 @@ static void processInstrsWithTypeFolding(MachineFunction &MF,
processInstr(MI, MIB, MRI, GR);
}
}
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+      // Rewrite dst types for ASSIGN_TYPE instrs so that tblgen'erated
+      // selection can be performed. This cannot be done in the Legalizer,
+      // which operates on gMIR only.
+ if (MI.getOpcode() != SPIRV::ASSIGN_TYPE)
+ continue;
+ Register SrcReg = MI.getOperand(1).getReg();
+ if (!isTypeFoldingSupported(MRI.getVRegDef(SrcReg)->getOpcode()))
+ continue;
+ Register DstReg = MI.getOperand(0).getReg();
+ if (MRI.getType(DstReg).isVector())
+ MRI.setRegClass(DstReg, &SPIRV::IDRegClass);
+ MRI.setType(DstReg, LLT::scalar(32));
+ }
+ }
}
static void processSwitches(MachineFunction &MF, SPIRVGlobalRegistry *GR,
@@ -421,6 +485,7 @@ bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) {
SPIRVGlobalRegistry *GR = ST.getSPIRVGlobalRegistry();
GR->setCurrentFunc(MF);
MachineIRBuilder MIB(MF);
+ addConstantsToTrack(MF, GR);
foldConstantsIntoIntrinsics(MF);
insertBitcasts(MF, GR, MIB);
generateAssignInstrs(MF, GR, MIB);
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
new file mode 100644
index 000000000000..13c3c12c1b41
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@@ -0,0 +1,288 @@
+//===-- SPIRVPrepareFunctions.cpp - modify function signatures --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass modifies function signatures containing aggregate arguments
+// and/or return values. It also substitutes some LLVM intrinsic calls with
+// calls to functions it generates, mirroring what the translator does.
+//
+// NOTE: this is a module pass because it needs to modify GVs/functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPIRV.h"
+#include "SPIRVTargetMachine.h"
+#include "SPIRVUtils.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeSPIRVPrepareFunctionsPass(PassRegistry &);
+}
+
+namespace {
+
+class SPIRVPrepareFunctions : public ModulePass {
+ Function *processFunctionSignature(Function *F);
+
+public:
+ static char ID;
+ SPIRVPrepareFunctions() : ModulePass(ID) {
+ initializeSPIRVPrepareFunctionsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+
+ StringRef getPassName() const override { return "SPIRV prepare functions"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ ModulePass::getAnalysisUsage(AU);
+ }
+};
+
+} // namespace
+
+char SPIRVPrepareFunctions::ID = 0;
+
+INITIALIZE_PASS(SPIRVPrepareFunctions, "prepare-functions",
+ "SPIRV prepare functions", false, false)
+
+Function *SPIRVPrepareFunctions::processFunctionSignature(Function *F) {
+ IRBuilder<> B(F->getContext());
+
+ bool IsRetAggr = F->getReturnType()->isAggregateType();
+ bool HasAggrArg =
+ std::any_of(F->arg_begin(), F->arg_end(), [](Argument &Arg) {
+ return Arg.getType()->isAggregateType();
+ });
+ bool DoClone = IsRetAggr || HasAggrArg;
+ if (!DoClone)
+ return F;
+ SmallVector<std::pair<int, Type *>, 4> ChangedTypes;
+ Type *RetType = IsRetAggr ? B.getInt32Ty() : F->getReturnType();
+ if (IsRetAggr)
+ ChangedTypes.push_back(std::pair<int, Type *>(-1, F->getReturnType()));
+ SmallVector<Type *, 4> ArgTypes;
+ for (const auto &Arg : F->args()) {
+ if (Arg.getType()->isAggregateType()) {
+ ArgTypes.push_back(B.getInt32Ty());
+ ChangedTypes.push_back(
+ std::pair<int, Type *>(Arg.getArgNo(), Arg.getType()));
+ } else
+ ArgTypes.push_back(Arg.getType());
+ }
+ FunctionType *NewFTy =
+ FunctionType::get(RetType, ArgTypes, F->getFunctionType()->isVarArg());
+ Function *NewF =
+ Function::Create(NewFTy, F->getLinkage(), F->getName(), *F->getParent());
+
+ ValueToValueMapTy VMap;
+ auto NewFArgIt = NewF->arg_begin();
+ for (auto &Arg : F->args()) {
+ StringRef ArgName = Arg.getName();
+ NewFArgIt->setName(ArgName);
+ VMap[&Arg] = &(*NewFArgIt++);
+ }
+ SmallVector<ReturnInst *, 8> Returns;
+
+ CloneFunctionInto(NewF, F, VMap, CloneFunctionChangeType::LocalChangesOnly,
+ Returns);
+ NewF->takeName(F);
+
+ NamedMDNode *FuncMD =
+ F->getParent()->getOrInsertNamedMetadata("spv.cloned_funcs");
+ SmallVector<Metadata *, 2> MDArgs;
+ MDArgs.push_back(MDString::get(B.getContext(), NewF->getName()));
+ for (auto &ChangedTyP : ChangedTypes)
+ MDArgs.push_back(MDNode::get(
+ B.getContext(),
+ {ConstantAsMetadata::get(B.getInt32(ChangedTyP.first)),
+ ValueAsMetadata::get(Constant::getNullValue(ChangedTyP.second))}));
+ MDNode *ThisFuncMD = MDNode::get(B.getContext(), MDArgs);
+ FuncMD->addOperand(ThisFuncMD);
+
+ for (auto *U : make_early_inc_range(F->users())) {
+ if (auto *CI = dyn_cast<CallInst>(U))
+ CI->mutateFunctionType(NewF->getFunctionType());
+ U->replaceUsesOfWith(F, NewF);
+ }
+ return NewF;
+}
+
+std::string lowerLLVMIntrinsicName(IntrinsicInst *II) {
+ Function *IntrinsicFunc = II->getCalledFunction();
+ assert(IntrinsicFunc && "Missing function");
+ std::string FuncName = IntrinsicFunc->getName().str();
+ std::replace(FuncName.begin(), FuncName.end(), '.', '_');
+ FuncName = "spirv." + FuncName;
+ return FuncName;
+}
+
+static Function *getOrCreateFunction(Module *M, Type *RetTy,
+ ArrayRef<Type *> ArgTypes,
+ StringRef Name) {
+ FunctionType *FT = FunctionType::get(RetTy, ArgTypes, false);
+ Function *F = M->getFunction(Name);
+ if (F && F->getFunctionType() == FT)
+ return F;
+ Function *NewF = Function::Create(FT, GlobalValue::ExternalLinkage, Name, M);
+ if (F)
+ NewF->setDSOLocal(F->isDSOLocal());
+ NewF->setCallingConv(CallingConv::SPIR_FUNC);
+ return NewF;
+}
+
+static void lowerFunnelShifts(Module *M, IntrinsicInst *FSHIntrinsic) {
+ // Get a separate function - otherwise, we'd have to rework the CFG of the
+ // current one. Then simply replace the intrinsic uses with a call to the new
+ // function.
+ // Generate LLVM IR for i* @spirv.llvm_fsh?_i* (i* %a, i* %b, i* %c)
+ FunctionType *FSHFuncTy = FSHIntrinsic->getFunctionType();
+ Type *FSHRetTy = FSHFuncTy->getReturnType();
+ const std::string FuncName = lowerLLVMIntrinsicName(FSHIntrinsic);
+ Function *FSHFunc =
+ getOrCreateFunction(M, FSHRetTy, FSHFuncTy->params(), FuncName);
+
+ if (!FSHFunc->empty()) {
+ FSHIntrinsic->setCalledFunction(FSHFunc);
+ return;
+ }
+ BasicBlock *RotateBB = BasicBlock::Create(M->getContext(), "rotate", FSHFunc);
+ IRBuilder<> IRB(RotateBB);
+ Type *Ty = FSHFunc->getReturnType();
+ // Build the actual funnel shift rotate logic.
+ // In the comments, "int" is used interchangeably with "vector of int
+ // elements".
+ FixedVectorType *VectorTy = dyn_cast<FixedVectorType>(Ty);
+ Type *IntTy = VectorTy ? VectorTy->getElementType() : Ty;
+ unsigned BitWidth = IntTy->getIntegerBitWidth();
+ ConstantInt *BitWidthConstant = IRB.getInt({BitWidth, BitWidth});
+ Value *BitWidthForInsts =
+ VectorTy
+ ? IRB.CreateVectorSplat(VectorTy->getNumElements(), BitWidthConstant)
+ : BitWidthConstant;
+ Value *RotateModVal =
+ IRB.CreateURem(/*Rotate*/ FSHFunc->getArg(2), BitWidthForInsts);
+ Value *FirstShift = nullptr, *SecShift = nullptr;
+ if (FSHIntrinsic->getIntrinsicID() == Intrinsic::fshr) {
+ // Shift the less significant number right, the "rotate" number of bits
+ // will be 0-filled on the left as a result of this regular shift.
+ FirstShift = IRB.CreateLShr(FSHFunc->getArg(1), RotateModVal);
+ } else {
+ // Shift the more significant number left, the "rotate" number of bits
+ // will be 0-filled on the right as a result of this regular shift.
+ FirstShift = IRB.CreateShl(FSHFunc->getArg(0), RotateModVal);
+ }
+ // We want the "rotate" number of the more significant int's LSBs (MSBs) to
+ // occupy the leftmost (rightmost) "0 space" left by the previous operation.
+ // Therefore, subtract the "rotate" number from the integer bitsize...
+ Value *SubRotateVal = IRB.CreateSub(BitWidthForInsts, RotateModVal);
+ if (FSHIntrinsic->getIntrinsicID() == Intrinsic::fshr) {
+ // ...and left-shift the more significant int by this number, zero-filling
+ // the LSBs.
+ SecShift = IRB.CreateShl(FSHFunc->getArg(0), SubRotateVal);
+ } else {
+ // ...and right-shift the less significant int by this number, zero-filling
+ // the MSBs.
+ SecShift = IRB.CreateLShr(FSHFunc->getArg(1), SubRotateVal);
+ }
+ // A simple binary addition of the shifted ints yields the final result.
+ IRB.CreateRet(IRB.CreateOr(FirstShift, SecShift));
+
+ FSHIntrinsic->setCalledFunction(FSHFunc);
+}
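For reference, a host-side model of the rotate logic generated above, specialized to 32-bit fshl (fshl32 is an illustrative name, not part of the patch). The zero-rotate case is guarded explicitly here, since shifting by the full bit width is undefined in C++.

#include <cstdint>

uint32_t fshl32(uint32_t A, uint32_t B, uint32_t C) {
  uint32_t R = C % 32;           // RotateModVal
  if (R == 0)
    return A;                    // no bits are taken from B
  uint32_t Hi = A << R;          // FirstShift for the fshl case
  uint32_t Lo = B >> (32 - R);   // SecShift
  return Hi | Lo;                // the final OR
}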
+
+static void buildUMulWithOverflowFunc(Module *M, Function *UMulFunc) {
+ // The function body is already created.
+ if (!UMulFunc->empty())
+ return;
+
+ BasicBlock *EntryBB = BasicBlock::Create(M->getContext(), "entry", UMulFunc);
+ IRBuilder<> IRB(EntryBB);
+ // Build the actual unsigned multiplication logic with the overflow
+ // indication. Do unsigned multiplication Mul = A * B. Then check
+ // if unsigned division Div = Mul / A is not equal to B. If so,
+ // then overflow has happened.
+ Value *Mul = IRB.CreateNUWMul(UMulFunc->getArg(0), UMulFunc->getArg(1));
+ Value *Div = IRB.CreateUDiv(Mul, UMulFunc->getArg(0));
+ Value *Overflow = IRB.CreateICmpNE(UMulFunc->getArg(0), Div);
+
+ // The umul.with.overflow intrinsic returns a structure whose first element
+ // is the multiplication result and whose second element is an overflow bit.
+ Type *StructTy = UMulFunc->getReturnType();
+ Value *Agg = IRB.CreateInsertValue(UndefValue::get(StructTy), Mul, {0});
+ Value *Res = IRB.CreateInsertValue(Agg, Overflow, {1});
+ IRB.CreateRet(Res);
+}
+
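
The overflow detection that buildUMulWithOverflowFunc encodes can be modeled in plain C++. This sketch follows the description in the comment above (check whether Mul / A still equals B); the A != 0 guard is added only to keep the standalone model well defined:

```cpp
#include <cassert>
#include <cstdint>
#include <utility>

// Reference model of the {result, overflow-bit} pair the helper returns.
static std::pair<uint32_t, bool> umulo32(uint32_t A, uint32_t B) {
  uint32_t Mul = A * B;                     // wraps modulo 2^32
  bool Overflow = A != 0 && Mul / A != B;   // quotient changed => wrapped
  return {Mul, Overflow};
}

int main() {
  assert(umulo32(6, 7).first == 42 && !umulo32(6, 7).second);
  assert(umulo32(0x10000u, 0x10000u).second); // 2^32 wraps to 0
  return 0;
}
```
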
+static void lowerUMulWithOverflow(Module *M, IntrinsicInst *UMulIntrinsic) {
+ // Get a separate function - otherwise, we'd have to rework the CFG of the
+ // current one. Then simply replace the intrinsic uses with a call to the new
+ // function.
+ FunctionType *UMulFuncTy = UMulIntrinsic->getFunctionType();
+ Type *FSHLRetTy = UMulFuncTy->getReturnType();
+ const std::string FuncName = lowerLLVMIntrinsicName(UMulIntrinsic);
+ Function *UMulFunc =
+ getOrCreateFunction(M, FSHLRetTy, UMulFuncTy->params(), FuncName);
+ buildUMulWithOverflowFunc(M, UMulFunc);
+ UMulIntrinsic->setCalledFunction(UMulFunc);
+}
+
+static void substituteIntrinsicCalls(Module *M, Function *F) {
+ for (BasicBlock &BB : *F) {
+ for (Instruction &I : BB) {
+ auto Call = dyn_cast<CallInst>(&I);
+ if (!Call)
+ continue;
+ Call->setTailCall(false);
+ Function *CF = Call->getCalledFunction();
+ if (!CF || !CF->isIntrinsic())
+ continue;
+ auto *II = cast<IntrinsicInst>(Call);
+ if (II->getIntrinsicID() == Intrinsic::fshl ||
+ II->getIntrinsicID() == Intrinsic::fshr)
+ lowerFunnelShifts(M, II);
+ else if (II->getIntrinsicID() == Intrinsic::umul_with_overflow)
+ lowerUMulWithOverflow(M, II);
+ }
+ }
+}
+
+bool SPIRVPrepareFunctions::runOnModule(Module &M) {
+ for (Function &F : M)
+ substituteIntrinsicCalls(&M, &F);
+
+ std::vector<Function *> FuncsWorklist;
+ bool Changed = false;
+ for (auto &F : M)
+ FuncsWorklist.push_back(&F);
+
+ for (auto *Func : FuncsWorklist) {
+ Function *F = processFunctionSignature(Func);
+
+ bool CreatedNewF = F != Func;
+
+ if (Func->isDeclaration()) {
+ Changed |= CreatedNewF;
+ continue;
+ }
+
+ if (CreatedNewF)
+ Func->eraseFromParent();
+ }
+
+ return Changed;
+}
+
+ModulePass *llvm::createSPIRVPrepareFunctionsPass() {
+ return new SPIRVPrepareFunctions();
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
index cdf3a160f373..00549c7b5768 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
@@ -46,8 +46,7 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU,
PointerSize(computePointerSize(TT)), SPIRVVersion(0), InstrInfo(),
FrameLowering(initSubtargetDependencies(CPU, FS)), TLInfo(TM, *this) {
GR = std::make_unique<SPIRVGlobalRegistry>(PointerSize);
- CallLoweringInfo =
- std::make_unique<SPIRVCallLowering>(TLInfo, *this, GR.get());
+ CallLoweringInfo = std::make_unique<SPIRVCallLowering>(TLInfo, GR.get());
Legalizer = std::make_unique<SPIRVLegalizerInfo>(*this);
RegBankInfo = std::make_unique<SPIRVRegisterBankInfo>();
InstSelector.reset(
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
index f7c88a5c6d4a..7f5f14dc3ce8 100644
--- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
@@ -140,7 +140,10 @@ TargetPassConfig *SPIRVTargetMachine::createPassConfig(PassManagerBase &PM) {
return new SPIRVPassConfig(*this, PM);
}
-void SPIRVPassConfig::addIRPasses() { TargetPassConfig::addIRPasses(); }
+void SPIRVPassConfig::addIRPasses() {
+ TargetPassConfig::addIRPasses();
+ addPass(createSPIRVPrepareFunctionsPass());
+}
void SPIRVPassConfig::addISelPrepare() {
addPass(createSPIRVEmitIntrinsicsPass(&getTM<SPIRVTargetMachine>()));
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
index b92dc12735f8..15671ef3e512 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
@@ -45,6 +45,14 @@ static size_t getPaddedLen(const StringRef &Str) {
return (Len % 4 == 0) ? Len : Len + (4 - (Len % 4));
}
+void addStringImm(const StringRef &Str, MCInst &Inst) {
+ const size_t PaddedLen = getPaddedLen(Str);
+ for (unsigned i = 0; i < PaddedLen; i += 4) {
+ // Add an operand for the 32 bits of chars or padding.
+ Inst.addOperand(MCOperand::createImm(convertCharsToWord(Str, i)));
+ }
+}
+
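
Both addStringImm overloads rely on the same packing scheme: the string plus its NUL terminator is split into 32-bit little-endian words, zero-padded to a multiple of four bytes. A self-contained C++ sketch of that packing (my reading of getPaddedLen/convertCharsToWord, not a copy of them):

```cpp
#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

// Pack Str (including its NUL terminator) into 32-bit little-endian words,
// zero-padded to a multiple of four bytes, as SPIR-V string literals require.
static std::vector<uint32_t> packString(const std::string &Str) {
  size_t Len = Str.size() + 1;                          // count the terminator
  size_t PaddedLen = (Len % 4 == 0) ? Len : Len + (4 - Len % 4);
  std::vector<uint32_t> Words;
  for (size_t I = 0; I < PaddedLen; I += 4) {
    uint32_t Word = 0;
    for (size_t J = 0; J < 4 && I + J < Str.size(); ++J)
      Word |= uint32_t(uint8_t(Str[I + J])) << (8 * J); // little-endian bytes
    Words.push_back(Word);
  }
  return Words;
}

int main() {
  assert(packString("abc") == std::vector<uint32_t>{0x00636261u});
  assert(packString("abcd").size() == 2); // second word is the terminator/pad
  return 0;
}
```
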
void addStringImm(const StringRef &Str, MachineInstrBuilder &MIB) {
const size_t PaddedLen = getPaddedLen(Str);
for (unsigned i = 0; i < PaddedLen; i += 4) {
@@ -182,6 +190,24 @@ SPIRV::MemorySemantics getMemSemanticsForStorageClass(SPIRV::StorageClass SC) {
}
}
+SPIRV::MemorySemantics getMemSemantics(AtomicOrdering Ord) {
+ switch (Ord) {
+ case AtomicOrdering::Acquire:
+ return SPIRV::MemorySemantics::Acquire;
+ case AtomicOrdering::Release:
+ return SPIRV::MemorySemantics::Release;
+ case AtomicOrdering::AcquireRelease:
+ return SPIRV::MemorySemantics::AcquireRelease;
+ case AtomicOrdering::SequentiallyConsistent:
+ return SPIRV::MemorySemantics::SequentiallyConsistent;
+ case AtomicOrdering::Unordered:
+ case AtomicOrdering::Monotonic:
+ case AtomicOrdering::NotAtomic:
+ default:
+ return SPIRV::MemorySemantics::None;
+ }
+}
+
MachineInstr *getDefInstrMaybeConstant(Register &ConstReg,
const MachineRegisterInfo *MRI) {
MachineInstr *ConstInstr = MRI->getVRegDef(ConstReg);
@@ -202,6 +228,11 @@ uint64_t getIConstVal(Register ConstReg, const MachineRegisterInfo *MRI) {
return MI->getOperand(1).getCImm()->getValue().getZExtValue();
}
+bool isSpvIntrinsic(MachineInstr &MI, Intrinsic::ID IntrinsicID) {
+ return MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS &&
+ MI.getIntrinsicID() == IntrinsicID;
+}
+
Type *getMDOperandAsType(const MDNode *N, unsigned I) {
return cast<ValueAsMetadata>(N->getOperand(I))->getType();
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h
index ffa82c9c1fe4..35e24b076570 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.h
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h
@@ -32,6 +32,7 @@ class SPIRVInstrInfo;
// Add the given string as a series of integer operands, inserting null
// terminators and padding to make sure the operands are all 32-bit
// little-endian words.
+void addStringImm(const llvm::StringRef &Str, llvm::MCInst &Inst);
void addStringImm(const llvm::StringRef &Str, llvm::MachineInstrBuilder &MIB);
void addStringImm(const llvm::StringRef &Str, llvm::IRBuilder<> &B,
std::vector<llvm::Value *> &Args);
@@ -67,6 +68,8 @@ llvm::SPIRV::StorageClass addressSpaceToStorageClass(unsigned AddrSpace);
llvm::SPIRV::MemorySemantics
getMemSemanticsForStorageClass(llvm::SPIRV::StorageClass SC);
+llvm::SPIRV::MemorySemantics getMemSemantics(llvm::AtomicOrdering Ord);
+
// Find def instruction for the given ConstReg, walking through
// spv_track_constant and ASSIGN_TYPE instructions. Updates ConstReg by def
// of OpConstant instruction.
@@ -78,6 +81,9 @@ getDefInstrMaybeConstant(llvm::Register &ConstReg,
uint64_t getIConstVal(llvm::Register ConstReg,
const llvm::MachineRegisterInfo *MRI);
+// Check if MI is a SPIR-V specific intrinsic call.
+bool isSpvIntrinsic(llvm::MachineInstr &MI, llvm::Intrinsic::ID IntrinsicID);
+
// Get type of i-th operand of the metadata node.
llvm::Type *getMDOperandAsType(const llvm::MDNode *N, unsigned I);
#endif // LLVM_LIB_TARGET_SPIRV_SPIRVUTILS_H
diff --git a/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp b/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
index 1138788ac7fa..1f8837eb0194 100644
--- a/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
+++ b/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
@@ -24,10 +24,10 @@ Target &llvm::getTheSparcelTarget() {
}
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcTargetInfo() {
- RegisterTarget<Triple::sparc, /*HasJIT=*/true> X(getTheSparcTarget(), "sparc",
- "Sparc", "Sparc");
- RegisterTarget<Triple::sparcv9, /*HasJIT=*/true> Y(
+ RegisterTarget<Triple::sparc, /*HasJIT=*/false> X(getTheSparcTarget(),
+ "sparc", "Sparc", "Sparc");
+ RegisterTarget<Triple::sparcv9, /*HasJIT=*/false> Y(
getTheSparcV9Target(), "sparcv9", "Sparc V9", "Sparc");
- RegisterTarget<Triple::sparcel, /*HasJIT=*/true> Z(
+ RegisterTarget<Triple::sparcel, /*HasJIT=*/false> Z(
getTheSparcelTarget(), "sparcel", "Sparc LE", "Sparc");
}
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp b/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp
index 9c73757d7f5c..86eb8365d527 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp
@@ -28,7 +28,3 @@ const MCPhysReg SystemZ::XPLINK64ArgGPRs[SystemZ::XPLINK64NumArgGPRs] = {
const MCPhysReg SystemZ::XPLINK64ArgFPRs[SystemZ::XPLINK64NumArgFPRs] = {
SystemZ::F0D, SystemZ::F2D, SystemZ::F4D, SystemZ::F6D
};
-
-const MCPhysReg SystemZ::XPLINK64ArgVRs[SystemZ::XPLINK64NumArgVRs] = {
- SystemZ::V24, SystemZ::V25, SystemZ::V26, SystemZ::V27,
- SystemZ::V28, SystemZ::V29, SystemZ::V30, SystemZ::V31};
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.h b/llvm/lib/Target/SystemZ/SystemZCallingConv.h
index f82c61c0f344..387411942aba 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.h
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.h
@@ -27,9 +27,6 @@ namespace SystemZ {
const unsigned XPLINK64NumArgFPRs = 4;
extern const MCPhysReg XPLINK64ArgFPRs[XPLINK64NumArgFPRs];
-
- const unsigned XPLINK64NumArgVRs = 8;
- extern const MCPhysReg XPLINK64ArgVRs[XPLINK64NumArgVRs];
} // end namespace SystemZ
class SystemZCCState : public CCState {
@@ -205,41 +202,6 @@ inline bool CC_XPLINK64_Allocate128BitVararg(unsigned &ValNo, MVT &ValVT,
return false;
}
-inline bool CC_XPLINK64_Shadow_Stack(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
- ArrayRef<MCPhysReg> RegList;
-
- switch (LocVT.SimpleTy) {
- case MVT::i64:
- RegList = SystemZ::XPLINK64ArgGPRs;
- break;
- case MVT::v16i8:
- case MVT::v8i16:
- case MVT::v4i32:
- case MVT::v2i64:
- case MVT::v4f32:
- case MVT::v2f64:
- RegList = SystemZ::XPLINK64ArgVRs;
- break;
- case MVT::f32:
- case MVT::f64:
- case MVT::f128:
- RegList = SystemZ::XPLINK64ArgFPRs;
- break;
- default:
- return false;
- }
-
- unsigned UnallocatedRegisterIndex = State.getFirstUnallocated(RegList);
- // Every time we can allocate a register, allocate on the stack.
- if (UnallocatedRegisterIndex < RegList.size())
- State.AllocateStack(LocVT.getSizeInBits() / 8, Align(8));
-
- return false;
-}
-
inline bool RetCC_SystemZ_Error(unsigned &, MVT &, MVT &,
CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
CCState &) {
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
index fdd82a01f211..29b4a26736b2 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -221,9 +221,10 @@ def CC_SystemZ_XPLINK64 : CallingConv<[
// XPLINK64 ABI compliant code widens integral types smaller than i64
// to i64 before placing the parameters either on the stack or in registers.
CCIfType<[i32], CCIfExtend<CCPromoteToType<i64>>>,
- // Promote f32 to f64 and bitcast to i64, if it needs to be passed in GPRS.
- CCIfType<[f32], CCIfNotFixed<CCPromoteToType<f64>>>,
- CCIfType<[f64], CCIfNotFixed<CCBitConvertToType<i64>>>,
+ // Promote f32 to f64 and bitcast to i64, if it needs to be passed in GPRs.
+ // Although we assign the f32 vararg to be bitcast, it will first be promoted
+ // to an f64 within convertValVTToLocVT().
+ CCIfType<[f32, f64], CCIfNotFixed<CCBitConvertToType<i64>>>,
// long double, can only be passed in GPR2 and GPR3, if available,
// hence R2Q
CCIfType<[f128], CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>,
@@ -246,34 +247,29 @@ def CC_SystemZ_XPLINK64 : CallingConv<[
// The first 3 integer arguments are passed in registers R1D-R3D.
// The rest will be passed in the user area. The address offset of the user
// area can be found in register R4D.
- CCIfType<[i64], CCCustom<"CC_XPLINK64_Shadow_Stack">>,
- CCIfType<[i64], CCAssignToReg<[R1D, R2D, R3D]>>,
+ CCIfType<[i64], CCAssignToRegAndStack<[R1D, R2D, R3D], 8, 8>>,
- // The first 8 named vector arguments are passed in V24-V31. Sub-128 vectors
+ // The first 8 named vector arguments are passed in V24-V31. Sub-128 vectors
// are passed in the same way, but they're widened to one of these types
// during type legalization.
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>>,
- CCIfSubtarget<"hasVector()",
- CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Stack">>>>,
+ CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>>,
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCIfFixed<CCAssignToReg<[V24, V25, V26, V27,
- V28, V29, V30, V31]>>>>,
+ CCIfFixed<CCAssignToRegAndStack<[V24, V25, V26, V27,
+ V28, V29, V30, V31], 16, 8>>>>,
- // The first 4 named float and double arguments are passed in registers FPR0-FPR6.
- // The rest will be passed in the user area.
+ // The first 4 named float and double arguments are passed in registers
+ // FPR0-FPR6. The rest will be passed in the user area.
CCIfType<[f32, f64], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>,
- CCIfType<[f32, f64], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Stack">>>,
- CCIfType<[f32], CCIfFixed<CCAssignToReg<[F0S, F2S, F4S, F6S]>>>,
- CCIfType<[f64], CCIfFixed<CCAssignToReg<[F0D, F2D, F4D, F6D]>>>,
+ CCIfType<[f32], CCIfFixed<CCAssignToRegAndStack<[F0S, F2S, F4S, F6S], 4, 8>>>,
+ CCIfType<[f64], CCIfFixed<CCAssignToRegAndStack<[F0D, F2D, F4D, F6D], 8, 8>>>,
+
// The first 2 long double arguments are passed in register FPR0/FPR2
// and FPR4/FPR6. The rest will be passed in the user area.
CCIfType<[f128], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>,
- CCIfType<[f128], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Stack">>>,
- CCIfType<[f128], CCIfFixed<CCAssignToReg<[F0Q, F4Q]>>>,
+ CCIfType<[f128], CCIfFixed<CCAssignToRegAndStack<[F0Q, F4Q], 16, 8>>>,
// Other arguments are passed in 8-byte-aligned 8-byte stack slots.
CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
index 975eb8862e82..d943507b4112 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -911,6 +911,54 @@ SystemZXPLINKFrameLowering::SystemZXPLINKFrameLowering()
XPLINKSpillOffsetTable[I].Offset;
}
+// Checks if the function is a potential candidate for being an XPLeaf routine.
+static bool isXPLeafCandidate(const MachineFunction &MF) {
+ const MachineFrameInfo &MFFrame = MF.getFrameInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+ auto *Regs =
+ static_cast<SystemZXPLINK64Registers *>(Subtarget.getSpecialRegisters());
+
+ // If the function calls other functions, including alloca, then it is not an
+ // XPLeaf routine.
+ if (MFFrame.hasCalls())
+ return false;
+
+ // If the function has variable-sized objects, then it is not an XPLeaf routine.
+ if (MFFrame.hasVarSizedObjects())
+ return false;
+
+ // If the function adjusts the stack, then it is not an XPLeaf routine.
+ if (MFFrame.adjustsStack())
+ return false;
+
+ // If the function modifies the stack pointer register, then it is not an
+ // XPLeaf routine.
+ if (MRI.isPhysRegModified(Regs->getStackPointerRegister()))
+ return false;
+
+ // If the function modifies the ADA register, then it is not an XPLeaf routine.
+ if (MRI.isPhysRegModified(Regs->getAddressOfCalleeRegister()))
+ return false;
+
+ // If the function modifies the return address register, then it is not an
+ // XPLeaf routine.
+ if (MRI.isPhysRegModified(Regs->getReturnFunctionAddressRegister()))
+ return false;
+
+ // If the backchain pointer should be stored, then it is not an XPLeaf routine.
+ if (MF.getFunction().hasFnAttribute("backchain"))
+ return false;
+
+ // If the function acquires its own stack frame, then it is not an XPLeaf routine.
+ // At the time this function is called, only slots for local variables are
+ // allocated, so this is a very rough estimate.
+ if (MFFrame.estimateStackSize(MF) > 0)
+ return false;
+
+ return true;
+}
+
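
In source terms, a routine survives all of the checks above only if it calls nothing, needs no stack of its own, and never writes the stack pointer, the ADA register, the return-address register, or the backchain. A deliberately trivial, hypothetical example of such a candidate (whether it is finally treated as a leaf still depends on the callee-saved-register check added below):

```cpp
// This function makes no calls, allocates nothing on the stack, and only
// reads its arguments, so none of the checks in isXPLeafCandidate() reject it.
// (Illustrative example; not taken from the patch or its tests.)
extern "C" long sum3(long A, long B, long C) {
  return A + B + C;
}
```
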
bool SystemZXPLINKFrameLowering::assignCalleeSavedSpillSlots(
MachineFunction &MF, const TargetRegisterInfo *TRI,
std::vector<CalleeSavedInfo> &CSI) const {
@@ -920,6 +968,18 @@ bool SystemZXPLINKFrameLowering::assignCalleeSavedSpillSlots(
auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>();
auto &GRRegClass = SystemZ::GR64BitRegClass;
+ // At this point, the result of isXPLeafCandidate() is not accurate because
+ // the size of the save area has not yet been determined. If
+ // isXPLeafCandidate() indicates a potential leaf function and there are no
+ // callee-saved registers, then it is indeed a leaf function, and we can exit
+ // early.
+ // TODO: It is possible for leaf functions to use callee-saved registers.
+ // They can use the 0-2k range between R4 and the caller's stack frame
+ // without acquiring their own stack frame.
+ bool IsLeaf = CSI.empty() && isXPLeafCandidate(MF);
+ if (IsLeaf)
+ return true;
+
// For non-leaf functions:
// - the address of callee (entry point) register R6 must be saved
CSI.push_back(CalleeSavedInfo(Regs.getAddressOfCalleeRegister()));
@@ -1137,16 +1197,16 @@ void SystemZXPLINKFrameLowering::emitPrologue(MachineFunction &MF,
auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>();
MachineFrameInfo &MFFrame = MF.getFrameInfo();
MachineInstr *StoreInstr = nullptr;
+
+ determineFrameLayout(MF);
+
bool HasFP = hasFP(MF);
// Debug location must be unknown since the first debug location is used
// to determine the end of the prologue.
DebugLoc DL;
uint64_t Offset = 0;
- // TODO: Support leaf functions; only add size of save+reserved area when
- // function is non-leaf.
- MFFrame.setStackSize(MFFrame.getStackSize() + Regs.getCallFrameSize());
- uint64_t StackSize = MFFrame.getStackSize();
+ const uint64_t StackSize = MFFrame.getStackSize();
if (ZFI->getSpillGPRRegs().LowGPR) {
// Skip over the GPR saves.
@@ -1213,8 +1273,8 @@ void SystemZXPLINKFrameLowering::emitPrologue(MachineFunction &MF,
// Mark the FramePtr as live at the beginning of every block except
// the entry block. (We'll have marked R8 as live on entry when
// saving the GPRs.)
- for (auto I = std::next(MF.begin()), E = MF.end(); I != E; ++I)
- I->addLiveIn(Regs.getFramePointerRegister());
+ for (MachineBasicBlock &B : llvm::drop_begin(MF))
+ B.addLiveIn(Regs.getFramePointerRegister());
}
}
@@ -1321,3 +1381,32 @@ void SystemZXPLINKFrameLowering::processFunctionBeforeFrameFinalized(
// Setup stack frame offset
MFFrame.setOffsetAdjustment(Regs.getStackPointerBias());
}
+
+// Determines the size of the frame, and creates the deferred spill objects.
+void SystemZXPLINKFrameLowering::determineFrameLayout(
+ MachineFunction &MF) const {
+ MachineFrameInfo &MFFrame = MF.getFrameInfo();
+ const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+ auto *Regs =
+ static_cast<SystemZXPLINK64Registers *>(Subtarget.getSpecialRegisters());
+
+ uint64_t StackSize = MFFrame.getStackSize();
+ if (StackSize == 0)
+ return;
+
+ // Add the size of the register save area and the reserved area to the size.
+ StackSize += Regs->getCallFrameSize();
+ MFFrame.setStackSize(StackSize);
+
+ // We now know the stack size. Create the fixed spill stack objects for the
+ // register save area now. This has no impact on the stack frame layout, as
+ // this is already computed. However, it makes sure that all callee saved
+ // registers have a valid frame index assigned.
+ const unsigned RegSize = MF.getDataLayout().getPointerSize();
+ for (auto &CS : MFFrame.getCalleeSavedInfo()) {
+ int Offset = RegSpillOffsets[CS.getReg()];
+ if (Offset >= 0)
+ CS.setFrameIdx(
+ MFFrame.CreateFixedSpillStackObject(RegSize, Offset - StackSize));
+ }
+}
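
The arithmetic at the end of determineFrameLayout is simple but worth spelling out: each callee-saved register's save-area offset is rebased by the now-final stack size before the fixed spill object is created. A toy calculation with made-up numbers; the real values come from getCallFrameSize() and RegSpillOffsets, which are not reproduced here:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical values, chosen only to illustrate the rebasing.
  uint64_t LocalsSize = 64;      // MFFrame.getStackSize() before the adjustment
  uint64_t CallFrameSize = 128;  // stand-in for Regs->getCallFrameSize()
  uint64_t StackSize = LocalsSize + CallFrameSize;

  int64_t SaveAreaOffset = 48;   // stand-in for RegSpillOffsets[CS.getReg()]
  // Offset handed to CreateFixedSpillStackObject(RegSize, Offset - StackSize).
  int64_t FixedObjOffset = SaveAreaOffset - (int64_t)StackSize;

  std::printf("StackSize=%llu FixedObjOffset=%lld\n",
              (unsigned long long)StackSize, (long long)FixedObjOffset);
  return 0;
}
```
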
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
index bec83a9457e0..95f30e3c0d99 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -134,6 +134,8 @@ public:
void processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS) const override;
+
+ void determineFrameLayout(MachineFunction &MF) const;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 42c1c77f14e4..ac4531262187 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1404,8 +1404,12 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, const SDLoc &DL,
return DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Value);
case CCValAssign::BCvt: {
assert(VA.getLocVT() == MVT::i64 || VA.getLocVT() == MVT::i128);
- assert(VA.getValVT().isVector() || VA.getValVT() == MVT::f64 ||
- VA.getValVT() == MVT::f128);
+ assert(VA.getValVT().isVector() || VA.getValVT() == MVT::f32 ||
+ VA.getValVT() == MVT::f64 || VA.getValVT() == MVT::f128);
+ // For an f32 vararg we need to first promote it to an f64 and then
+ // bitcast it to an i64.
+ if (VA.getValVT() == MVT::f32 && VA.getLocVT() == MVT::i64)
+ Value = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f64, Value);
MVT BitCastToType = VA.getValVT().isVector() && VA.getLocVT() == MVT::i64
? MVT::v2i64
: VA.getLocVT();
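
The effect of the added FP_EXTEND is that an f32 vararg reaches the callee as the IEEE-754 double bit pattern in a 64-bit GPR slot. The data movement, modeled in plain C++ (a sketch of the value transformation only, not of the lowering code itself):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

// "Promote f32 to f64, then bitcast to i64" as seen by an XPLINK64 vararg.
static uint64_t passF32AsVararg(float F) {
  double D = F;                          // ISD::FP_EXTEND f32 -> f64
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits));  // CCBitConvertToType<i64>
  return Bits;
}

int main() {
  assert(passF32AsVararg(1.0f) == 0x3FF0000000000000ull); // 1.0 as a double
  return 0;
}
```
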
diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp
index 94ebb59c4c77..46bb85606a62 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.cpp
+++ b/llvm/lib/Target/VE/VEInstrInfo.cpp
@@ -418,7 +418,9 @@ unsigned VEInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
if (MI.getOpcode() == VE::LDrii || // I64
MI.getOpcode() == VE::LDLSXrii || // I32
MI.getOpcode() == VE::LDUrii || // F32
- MI.getOpcode() == VE::LDQrii // F128 (pseudo)
+ MI.getOpcode() == VE::LDQrii || // F128 (pseudo)
+ MI.getOpcode() == VE::LDVMrii || // VM (pseudo)
+ MI.getOpcode() == VE::LDVM512rii // VM512 (pseudo)
) {
if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
MI.getOperand(2).getImm() == 0 && MI.getOperand(3).isImm() &&
@@ -437,10 +439,12 @@ unsigned VEInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
/// any side effects other than storing to the stack slot.
unsigned VEInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
- if (MI.getOpcode() == VE::STrii || // I64
- MI.getOpcode() == VE::STLrii || // I32
- MI.getOpcode() == VE::STUrii || // F32
- MI.getOpcode() == VE::STQrii // F128 (pseudo)
+ if (MI.getOpcode() == VE::STrii || // I64
+ MI.getOpcode() == VE::STLrii || // I32
+ MI.getOpcode() == VE::STUrii || // F32
+ MI.getOpcode() == VE::STQrii || // F128 (pseudo)
+ MI.getOpcode() == VE::STVMrii || // VM (pseudo)
+ MI.getOpcode() == VE::STVM512rii // VM512 (pseudo)
) {
if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() &&
MI.getOperand(1).getImm() == 0 && MI.getOperand(2).isImm() &&
@@ -496,6 +500,20 @@ void VEInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addImm(0)
.addReg(SrcReg, getKillRegState(isKill))
.addMemOperand(MMO);
+ } else if (RC == &VE::VMRegClass) {
+ BuildMI(MBB, I, DL, get(VE::STVMrii))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addMemOperand(MMO);
+ } else if (VE::VM512RegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(VE::STVM512rii))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addMemOperand(MMO);
} else
report_fatal_error("Can't store this register to stack slot");
}
@@ -539,6 +557,18 @@ void VEInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
.addImm(0)
.addImm(0)
.addMemOperand(MMO);
+ } else if (RC == &VE::VMRegClass) {
+ BuildMI(MBB, I, DL, get(VE::LDVMrii), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addImm(0)
+ .addMemOperand(MMO);
+ } else if (VE::VM512RegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(VE::LDVM512rii), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addImm(0)
+ .addMemOperand(MMO);
} else
report_fatal_error("Can't load this register from stack slot");
}
diff --git a/llvm/lib/Target/VE/VEInstrPatternsVec.td b/llvm/lib/Target/VE/VEInstrPatternsVec.td
index 71199717a3a2..0b2f5039e3f3 100644
--- a/llvm/lib/Target/VE/VEInstrPatternsVec.td
+++ b/llvm/lib/Target/VE/VEInstrPatternsVec.td
@@ -25,6 +25,20 @@ def: Pat<(i64 (repl_i32 i32:$val)),
(zero_f32 (i2l $val)),
(SLLri (i2l $val), 32))>;
+///// Mask Load & Store /////
+
+// Loads and stores for v256i1 and v512i1 are implemented in two ways. The
+// LDVM/LDVM512 and STVM/STVM512 pseudo instructions are used for
+// frame-index-related load/store instructions; custom lowering is used for
+// other load/store instructions.
+
+def : Pat<(v256i1 (load ADDRrii:$addr)),
+ (LDVMrii ADDRrii:$addr)>;
+def : Pat<(v512i1 (load ADDRrii:$addr)),
+ (LDVM512rii ADDRrii:$addr)>;
+def : Pat<(store v256i1:$vx, ADDRrii:$addr),
+ (STVMrii ADDRrii:$addr, $vx)>;
+def : Pat<(store v512i1:$vx, ADDRrii:$addr),
+ (STVM512rii ADDRrii:$addr, $vx)>;
multiclass vbrd_elem32<ValueType v32, ValueType s32, SDPatternOperator ImmOp,
SDNodeXForm ImmCast, OutPatFrag SuperRegCast> {
diff --git a/llvm/lib/Target/VE/VEInstrVec.td b/llvm/lib/Target/VE/VEInstrVec.td
index 4a8476f7288a..327ad9ceacc5 100644
--- a/llvm/lib/Target/VE/VEInstrVec.td
+++ b/llvm/lib/Target/VE/VEInstrVec.td
@@ -2,6 +2,33 @@
// Vector Instructions
//===----------------------------------------------------------------------===//
+// Pseudo instructions for VM/VM512 spill/restore
+//
+// These pseudo instructions are used only for spill/restore, since
+// InlineSpiller assumes that the storeRegToStackSlot/loadRegFromStackSlot
+// functions emit only a single instruction. Those functions emit either a
+// single store/load instruction or one of these pseudo store/load
+// instructions.
+//
+// Specifies hasSideEffects = 0 to disable UnmodeledSideEffects.
+
+let mayLoad = 1, hasSideEffects = 0 in {
+def LDVMrii : Pseudo<
+ (outs VM:$vmx), (ins MEMrii:$addr),
+ "# pseudo ldvm $vmx, $addr", []>;
+def LDVM512rii : Pseudo<
+ (outs VM512:$vmx), (ins MEMrii:$addr),
+ "# pseudo ldvm512 $vmx, $addr", []>;
+}
+let mayStore = 1, hasSideEffects = 0 in {
+def STVMrii : Pseudo<
+ (outs), (ins MEMrii:$addr, VM:$vmx),
+ "# pseudo stvm $addr, $vmx", []>;
+def STVM512rii : Pseudo<
+ (outs), (ins MEMrii:$addr, VM512:$vmx),
+ "# pseudo stvm512 $addr, $vmx", []>;
+}
+
//===----------------------------------------------------------------------===//
// Pseudo instructions for VM512 modifications
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/VE/VERegisterInfo.cpp b/llvm/lib/Target/VE/VERegisterInfo.cpp
index f334af128162..397ea09c9a02 100644
--- a/llvm/lib/Target/VE/VERegisterInfo.cpp
+++ b/llvm/lib/Target/VE/VERegisterInfo.cpp
@@ -180,6 +180,16 @@ class EliminateFrameIndex {
int FIOperandNum);
void processLDQ(MachineInstr &MI, Register FrameReg, int64_t Offset,
int FIOperandNum);
+ // Expand and eliminate Frame Index of pseudo STVMrii and LDVMrii.
+ void processSTVM(MachineInstr &MI, Register FrameReg, int64_t Offset,
+ int FIOperandNum);
+ void processLDVM(MachineInstr &MI, Register FrameReg, int64_t Offset,
+ int FIOperandNum);
+ // Expand and eliminate Frame Index of pseudo STVM512rii and LDVM512rii.
+ void processSTVM512(MachineInstr &MI, Register FrameReg, int64_t Offset,
+ int FIOperandNum);
+ void processLDVM512(MachineInstr &MI, Register FrameReg, int64_t Offset,
+ int FIOperandNum);
public:
EliminateFrameIndex(const TargetInstrInfo &TII, const TargetRegisterInfo &TRI,
@@ -271,6 +281,185 @@ void EliminateFrameIndex::processLDQ(MachineInstr &MI, Register FrameReg,
replaceFI(MI, FrameReg, Offset, FIOperandNum);
}
+void EliminateFrameIndex::processSTVM(MachineInstr &MI, Register FrameReg,
+ int64_t Offset, int FIOperandNum) {
+ assert(MI.getOpcode() == VE::STVMrii);
+ LLVM_DEBUG(dbgs() << "processSTVM: "; MI.dump());
+
+ // Original MI is:
+ // STVMrii frame-index, 0, offset, reg (, memory operand)
+ // Convert it to:
+ // SVMi tmp-reg, reg, 0
+ // STrii frame-reg, 0, offset, tmp-reg
+ // SVMi tmp-reg, reg, 1
+ // STrii frame-reg, 0, offset+8, tmp-reg
+ // SVMi tmp-reg, reg, 2
+ // STrii frame-reg, 0, offset+16, tmp-reg
+ // SVMi tmp-reg, reg, 3
+ // STrii frame-reg, 0, offset+24, tmp-reg
+
+ prepareReplaceFI(MI, FrameReg, Offset, 24);
+
+ Register SrcReg = MI.getOperand(3).getReg();
+ bool isKill = MI.getOperand(3).isKill();
+ // FIXME: it would be better to scavenge a register here instead of
+ // reserving SX16 all of the time.
+ Register TmpReg = VE::SX16;
+ for (int i = 0; i < 3; ++i) {
+ build(VE::SVMmr, TmpReg).addReg(SrcReg).addImm(i);
+ MachineInstr *StMI =
+ build(VE::STrii).addReg(FrameReg).addImm(0).addImm(0).addReg(
+ TmpReg, getKillRegState(true));
+ replaceFI(*StMI, FrameReg, Offset, 0);
+ Offset += 8;
+ }
+ build(VE::SVMmr, TmpReg).addReg(SrcReg, getKillRegState(isKill)).addImm(3);
+ MI.setDesc(get(VE::STrii));
+ MI.getOperand(3).ChangeToRegister(TmpReg, false, false, true);
+ replaceFI(MI, FrameReg, Offset, FIOperandNum);
+}
+
+void EliminateFrameIndex::processLDVM(MachineInstr &MI, Register FrameReg,
+ int64_t Offset, int FIOperandNum) {
+ assert(MI.getOpcode() == VE::LDVMrii);
+ LLVM_DEBUG(dbgs() << "processLDVM: "; MI.dump());
+
+ // Original MI is:
+ // LDVMrii reg, frame-index, 0, offset (, memory operand)
+ // Convert it to:
+ // LDrii tmp-reg, frame-reg, 0, offset
+ // LVMir vm, 0, tmp-reg
+ // LDrii tmp-reg, frame-reg, 0, offset+8
+ // LVMir_m vm, 1, tmp-reg, vm
+ // LDrii tmp-reg, frame-reg, 0, offset+16
+ // LVMir_m vm, 2, tmp-reg, vm
+ // LDrii tmp-reg, frame-reg, 0, offset+24
+ // LVMir_m vm, 3, tmp-reg, vm
+
+ prepareReplaceFI(MI, FrameReg, Offset, 24);
+
+ Register DestReg = MI.getOperand(0).getReg();
+ // FIXME: it would be better to scavenge a register here instead of
+ // reserving SX16 all of the time.
+ unsigned TmpReg = VE::SX16;
+ for (int i = 0; i < 4; ++i) {
+ if (i != 3) {
+ MachineInstr *StMI =
+ build(VE::LDrii, TmpReg).addReg(FrameReg).addImm(0).addImm(0);
+ replaceFI(*StMI, FrameReg, Offset, 1);
+ Offset += 8;
+ } else {
+ // The last LDrii replaces the target instruction.
+ MI.setDesc(get(VE::LDrii));
+ MI.getOperand(0).ChangeToRegister(TmpReg, true);
+ }
+ // The first LVM is LVMir; the others are LVMir_m. The last LVM is placed
+ // immediately after the target instruction.
+ if (i == 0)
+ build(VE::LVMir, DestReg).addImm(i).addReg(TmpReg, getKillRegState(true));
+ else if (i != 3)
+ build(VE::LVMir_m, DestReg)
+ .addImm(i)
+ .addReg(TmpReg, getKillRegState(true))
+ .addReg(DestReg);
+ else
+ BuildMI(*MI.getParent(), std::next(II), DL, get(VE::LVMir_m), DestReg)
+ .addImm(3)
+ .addReg(TmpReg, getKillRegState(true))
+ .addReg(DestReg);
+ }
+ replaceFI(MI, FrameReg, Offset, FIOperandNum);
+}
+
+void EliminateFrameIndex::processSTVM512(MachineInstr &MI, Register FrameReg,
+ int64_t Offset, int FIOperandNum) {
+ assert(MI.getOpcode() == VE::STVM512rii);
+ LLVM_DEBUG(dbgs() << "processSTVM512: "; MI.dump());
+
+ prepareReplaceFI(MI, FrameReg, Offset, 56);
+
+ Register SrcReg = MI.getOperand(3).getReg();
+ Register SrcLoReg = getSubReg(SrcReg, VE::sub_vm_odd);
+ Register SrcHiReg = getSubReg(SrcReg, VE::sub_vm_even);
+ bool isKill = MI.getOperand(3).isKill();
+ // FIXME: it would be better to scavenge a register here instead of
+ // reserving SX16 all of the time.
+ Register TmpReg = VE::SX16;
+ // store low part of VMP
+ MachineInstr *LastMI = nullptr;
+ for (int i = 0; i < 4; ++i) {
+ LastMI = build(VE::SVMmr, TmpReg).addReg(SrcLoReg).addImm(i);
+ MachineInstr *StMI =
+ build(VE::STrii).addReg(FrameReg).addImm(0).addImm(0).addReg(
+ TmpReg, getKillRegState(true));
+ replaceFI(*StMI, FrameReg, Offset, 0);
+ Offset += 8;
+ }
+ if (isKill)
+ LastMI->addRegisterKilled(SrcLoReg, &TRI, true);
+ // store high part of VMP
+ for (int i = 0; i < 3; ++i) {
+ build(VE::SVMmr, TmpReg).addReg(SrcHiReg).addImm(i);
+ MachineInstr *StMI =
+ build(VE::STrii).addReg(FrameReg).addImm(0).addImm(0).addReg(
+ TmpReg, getKillRegState(true));
+ replaceFI(*StMI, FrameReg, Offset, 0);
+ Offset += 8;
+ }
+ LastMI = build(VE::SVMmr, TmpReg).addReg(SrcHiReg).addImm(3);
+ if (isKill) {
+ LastMI->addRegisterKilled(SrcHiReg, &TRI, true);
+ // Add implicit super-register kills to the particular MI.
+ LastMI->addRegisterKilled(SrcReg, &TRI, true);
+ }
+ MI.setDesc(get(VE::STrii));
+ MI.getOperand(3).ChangeToRegister(TmpReg, false, false, true);
+ replaceFI(MI, FrameReg, Offset, FIOperandNum);
+}
+
+void EliminateFrameIndex::processLDVM512(MachineInstr &MI, Register FrameReg,
+ int64_t Offset, int FIOperandNum) {
+ assert(MI.getOpcode() == VE::LDVM512rii);
+ LLVM_DEBUG(dbgs() << "processLDVM512: "; MI.dump());
+
+ prepareReplaceFI(MI, FrameReg, Offset, 56);
+
+ Register DestReg = MI.getOperand(0).getReg();
+ Register DestLoReg = getSubReg(DestReg, VE::sub_vm_odd);
+ Register DestHiReg = getSubReg(DestReg, VE::sub_vm_even);
+ // FIXME: it would be better to scavenge a register here instead of
+ // reserving SX16 all of the time.
+ Register TmpReg = VE::SX16;
+ build(VE::IMPLICIT_DEF, DestReg);
+ for (int i = 0; i < 4; ++i) {
+ MachineInstr *LdMI =
+ build(VE::LDrii, TmpReg).addReg(FrameReg).addImm(0).addImm(0);
+ replaceFI(*LdMI, FrameReg, Offset, 1);
+ build(VE::LVMir_m, DestLoReg)
+ .addImm(i)
+ .addReg(TmpReg, getKillRegState(true))
+ .addReg(DestLoReg);
+ Offset += 8;
+ }
+ for (int i = 0; i < 3; ++i) {
+ MachineInstr *LdMI =
+ build(VE::LDrii, TmpReg).addReg(FrameReg).addImm(0).addImm(0);
+ replaceFI(*LdMI, FrameReg, Offset, 1);
+ build(VE::LVMir_m, DestHiReg)
+ .addImm(i)
+ .addReg(TmpReg, getKillRegState(true))
+ .addReg(DestHiReg);
+ Offset += 8;
+ }
+ MI.setDesc(get(VE::LDrii));
+ MI.getOperand(0).ChangeToRegister(TmpReg, true);
+ BuildMI(*MI.getParent(), std::next(II), DL, get(VE::LVMir_m), DestHiReg)
+ .addImm(3)
+ .addReg(TmpReg, getKillRegState(true))
+ .addReg(DestHiReg);
+ replaceFI(MI, FrameReg, Offset, FIOperandNum);
+}
+
void EliminateFrameIndex::processMI(MachineInstr &MI, Register FrameReg,
int64_t Offset, int FIOperandNum) {
switch (MI.getOpcode()) {
@@ -280,6 +469,18 @@ void EliminateFrameIndex::processMI(MachineInstr &MI, Register FrameReg,
case VE::LDQrii:
processLDQ(MI, FrameReg, Offset, FIOperandNum);
return;
+ case VE::STVMrii:
+ processSTVM(MI, FrameReg, Offset, FIOperandNum);
+ return;
+ case VE::LDVMrii:
+ processLDVM(MI, FrameReg, Offset, FIOperandNum);
+ return;
+ case VE::STVM512rii:
+ processSTVM512(MI, FrameReg, Offset, FIOperandNum);
+ return;
+ case VE::LDVM512rii:
+ processLDVM512(MI, FrameReg, Offset, FIOperandNum);
+ return;
}
prepareReplaceFI(MI, FrameReg, Offset);
replaceFI(MI, FrameReg, Offset, FIOperandNum);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
index 9316826e3d92..d7720604d6dc 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -40,7 +40,7 @@ WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI)
RI(STI.getTargetTriple()) {}
bool WebAssemblyInstrInfo::isReallyTriviallyReMaterializable(
- const MachineInstr &MI, AAResults *AA) const {
+ const MachineInstr &MI) const {
switch (MI.getOpcode()) {
case WebAssembly::CONST_I32:
case WebAssembly::CONST_I64:
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
index f45a3792467a..29d700bdf83f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
@@ -43,8 +43,7 @@ public:
const WebAssemblyRegisterInfo &getRegisterInfo() const { return RI; }
- bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AAResults *AA) const override;
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index d3ad47147ac8..f9ef45bfb41c 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -49,7 +49,6 @@ class WebAssemblyRegStackify final : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<MachineDominatorTree>();
AU.addRequired<LiveIntervals>();
AU.addPreserved<MachineBlockFrequencyInfo>();
@@ -164,15 +163,15 @@ static void queryCallee(const MachineInstr &MI, bool &Read, bool &Write,
// Determine whether MI reads memory, writes memory, has side effects,
// and/or uses the stack pointer value.
-static void query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read,
- bool &Write, bool &Effects, bool &StackPointer) {
+static void query(const MachineInstr &MI, bool &Read, bool &Write,
+ bool &Effects, bool &StackPointer) {
assert(!MI.isTerminator());
if (MI.isDebugInstr() || MI.isPosition())
return;
// Check for loads.
- if (MI.mayLoad() && !MI.isDereferenceableInvariantLoad(&AA))
+ if (MI.mayLoad() && !MI.isDereferenceableInvariantLoad())
Read = true;
// Check for stores.
@@ -255,9 +254,9 @@ static void query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read,
}
// Test whether Def is safe and profitable to rematerialize.
-static bool shouldRematerialize(const MachineInstr &Def, AliasAnalysis &AA,
+static bool shouldRematerialize(const MachineInstr &Def,
const WebAssemblyInstrInfo *TII) {
- return Def.isAsCheapAsAMove() && TII->isTriviallyReMaterializable(Def, &AA);
+ return Def.isAsCheapAsAMove() && TII->isTriviallyReMaterializable(Def);
}
// Identify the definition for this register at this point. This is a
@@ -311,7 +310,7 @@ static bool hasOneUse(unsigned Reg, MachineInstr *Def, MachineRegisterInfo &MRI,
// TODO: Compute memory dependencies in a way that uses AliasAnalysis to be
// more precise.
static bool isSafeToMove(const MachineOperand *Def, const MachineOperand *Use,
- const MachineInstr *Insert, AliasAnalysis &AA,
+ const MachineInstr *Insert,
const WebAssemblyFunctionInfo &MFI,
const MachineRegisterInfo &MRI) {
const MachineInstr *DefI = Def->getParent();
@@ -391,7 +390,7 @@ static bool isSafeToMove(const MachineOperand *Def, const MachineOperand *Use,
}
bool Read = false, Write = false, Effects = false, StackPointer = false;
- query(*DefI, AA, Read, Write, Effects, StackPointer);
+ query(*DefI, Read, Write, Effects, StackPointer);
// If the instruction does not access memory and has no side effects, it has
// no additional dependencies.
@@ -406,7 +405,7 @@ static bool isSafeToMove(const MachineOperand *Def, const MachineOperand *Use,
bool InterveningWrite = false;
bool InterveningEffects = false;
bool InterveningStackPointer = false;
- query(*I, AA, InterveningRead, InterveningWrite, InterveningEffects,
+ query(*I, InterveningRead, InterveningWrite, InterveningEffects,
InterveningStackPointer);
if (Effects && InterveningEffects)
return false;
@@ -808,7 +807,6 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
const auto *TRI = MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
- AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
auto &MDT = getAnalysis<MachineDominatorTree>();
auto &LIS = getAnalysis<LiveIntervals>();
@@ -872,8 +870,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
// supports intra-block moves) and it's MachineSink's job to catch all
// the sinking opportunities anyway.
bool SameBlock = DefI->getParent() == &MBB;
- bool CanMove = SameBlock &&
- isSafeToMove(Def, &Use, Insert, AA, MFI, MRI) &&
+ bool CanMove = SameBlock && isSafeToMove(Def, &Use, Insert, MFI, MRI) &&
!TreeWalker.isOnStack(Reg);
if (CanMove && hasOneUse(Reg, DefI, MRI, MDT, LIS)) {
Insert = moveForSingleUse(Reg, Use, DefI, MBB, Insert, LIS, MFI, MRI);
@@ -883,7 +880,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
// TODO: Encode this properly as a stackified value.
if (MFI.isFrameBaseVirtual() && MFI.getFrameBaseVreg() == Reg)
MFI.clearFrameBaseVreg();
- } else if (shouldRematerialize(*DefI, AA, TII)) {
+ } else if (shouldRematerialize(*DefI, TII)) {
Insert =
rematerializeCheapDef(Reg, Use, *DefI, MBB, Insert->getIterator(),
LIS, MFI, MRI, TII, TRI);
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index a859176220c7..fa0a6bd415dc 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1277,7 +1277,7 @@ class ProcModel<string Name, SchedMachineModel Model,
// enabled. It has no effect on code generation.
// NOTE: As a default tuning, "generic" aims to produce code optimized for the
// most common X86 processors. The tunings might be changed over time. It is
-// recommended to use "x86-64" in lit tests for consistency.
+// recommended to use "tune-cpu"="x86-64" as a function attribute for consistency.
def : ProcModel<"generic", SandyBridgeModel,
[FeatureX87, FeatureCX8, FeatureX86_64],
[TuningSlow3OpsLEA,
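
For anyone generating IR programmatically rather than writing lit tests, the recommended tuning is attached as a plain string function attribute. A hypothetical snippet against the LLVM C++ API (the module and function names are made up; build and link details are omitted):

```cpp
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  FunctionType *FT = FunctionType::get(Type::getVoidTy(Ctx), /*isVarArg=*/false);
  Function *F = Function::Create(FT, Function::ExternalLinkage, "f", &M);
  // Equivalent of writing "tune-cpu"="x86-64" in a test's attribute group.
  F->addFnAttr("tune-cpu", "x86-64");
  M.print(outs(), nullptr);
  return 0;
}
```
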
diff --git a/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
index 16bff201dd03..db6923416177 100644
--- a/llvm/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
@@ -393,12 +393,12 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr(MachineInstr *MI,
switch (MI->getOpcode()) {
case X86::MOV8rm:
- // Only replace 8 bit loads with the zero extending versions if
- // in an inner most loop and not optimizing for size. This takes
- // an extra byte to encode, and provides limited performance upside.
- if (MachineLoop *ML = MLI->getLoopFor(&MBB))
- if (ML->begin() == ML->end() && !OptForSize)
- return tryReplaceLoad(X86::MOVZX32rm8, MI);
+ // Replace 8-bit loads with the zero-extending version if not optimizing
+ // for size. The extending op is cheaper across a wide range of
+ // microarchitectures and avoids a potentially expensive partial register
+ // stall. It takes an extra byte to encode, however, so don't do this when
+ // optimizing for size.
+ if (!OptForSize)
+ return tryReplaceLoad(X86::MOVZX32rm8, MI);
break;
case X86::MOV16rm:
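
The zero-extending form is preferred because a byte-sized mov writes only the low 8 bits of the destination and therefore depends on the register's previous contents, while movzx defines the whole register. A tiny illustrative function whose byte load is the kind of candidate the pass rewrites (the generated assembly naturally depends on compiler and flags):

```cpp
#include <cstdint>

// The 8-bit load here can be selected as `movb (%rdi), %al`, which merges
// into the stale upper bits of %eax; the fixup pass prefers
// `movzbl (%rdi), %eax`, which breaks that dependency at the cost of one
// extra encoding byte (hence the !OptForSize guard above).
uint32_t loadByteAndAdd(const uint8_t *P) {
  uint8_t B = *P;   // MOV8rm candidate
  return B + 1u;    // consumed as a 32-bit value
}
```
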
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 12af6087cb47..5a4533c4bac4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -555,6 +555,39 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
+ auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
+ setOperationAction(ISD::FABS, VT, Action);
+ setOperationAction(ISD::FNEG, VT, Action);
+ setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Action);
+ setOperationAction(ISD::FMA, VT, Action);
+ setOperationAction(ISD::FMINNUM, VT, Action);
+ setOperationAction(ISD::FMAXNUM, VT, Action);
+ setOperationAction(ISD::FMINIMUM, VT, Action);
+ setOperationAction(ISD::FMAXIMUM, VT, Action);
+ setOperationAction(ISD::FSIN, VT, Action);
+ setOperationAction(ISD::FCOS, VT, Action);
+ setOperationAction(ISD::FSINCOS, VT, Action);
+ setOperationAction(ISD::FSQRT, VT, Action);
+ setOperationAction(ISD::FPOW, VT, Action);
+ setOperationAction(ISD::FLOG, VT, Action);
+ setOperationAction(ISD::FLOG2, VT, Action);
+ setOperationAction(ISD::FLOG10, VT, Action);
+ setOperationAction(ISD::FEXP, VT, Action);
+ setOperationAction(ISD::FEXP2, VT, Action);
+ setOperationAction(ISD::FCEIL, VT, Action);
+ setOperationAction(ISD::FFLOOR, VT, Action);
+ setOperationAction(ISD::FNEARBYINT, VT, Action);
+ setOperationAction(ISD::FRINT, VT, Action);
+ setOperationAction(ISD::BR_CC, VT, Action);
+ setOperationAction(ISD::SETCC, VT, Action);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::SELECT_CC, VT, Action);
+ setOperationAction(ISD::FROUND, VT, Action);
+ setOperationAction(ISD::FROUNDEVEN, VT, Action);
+ setOperationAction(ISD::FTRUNC, VT, Action);
+ };
+
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
// f16, f32 and f64 use SSE.
// Set up the FP register classes.
@@ -592,40 +625,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
// Half type will be promoted by default.
- setOperationAction(ISD::FABS, MVT::f16, Promote);
- setOperationAction(ISD::FNEG, MVT::f16, Promote);
- setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
+ setF16Action(MVT::f16, Promote);
setOperationAction(ISD::FADD, MVT::f16, Promote);
setOperationAction(ISD::FSUB, MVT::f16, Promote);
setOperationAction(ISD::FMUL, MVT::f16, Promote);
setOperationAction(ISD::FDIV, MVT::f16, Promote);
- setOperationAction(ISD::FREM, MVT::f16, Promote);
- setOperationAction(ISD::FMA, MVT::f16, Promote);
- setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
- setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
- setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
- setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);
- setOperationAction(ISD::FSIN, MVT::f16, Promote);
- setOperationAction(ISD::FCOS, MVT::f16, Promote);
- setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
- setOperationAction(ISD::FSQRT, MVT::f16, Promote);
- setOperationAction(ISD::FPOW, MVT::f16, Promote);
- setOperationAction(ISD::FLOG, MVT::f16, Promote);
- setOperationAction(ISD::FLOG2, MVT::f16, Promote);
- setOperationAction(ISD::FLOG10, MVT::f16, Promote);
- setOperationAction(ISD::FEXP, MVT::f16, Promote);
- setOperationAction(ISD::FEXP2, MVT::f16, Promote);
- setOperationAction(ISD::FCEIL, MVT::f16, Promote);
- setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
- setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
- setOperationAction(ISD::FRINT, MVT::f16, Promote);
- setOperationAction(ISD::BR_CC, MVT::f16, Promote);
- setOperationAction(ISD::SETCC, MVT::f16, Promote);
- setOperationAction(ISD::SELECT, MVT::f16, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
- setOperationAction(ISD::FROUND, MVT::f16, Promote);
- setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote);
- setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
setOperationAction(ISD::FP_ROUND, MVT::f16, LibCall);
setOperationAction(ISD::FP_EXTEND, MVT::f32, LibCall);
setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
@@ -1003,6 +1007,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
: &X86::VR128RegClass);
addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
+ addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
@@ -1084,7 +1090,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
- for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
+ for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
@@ -1095,19 +1101,25 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
+ setF16Action(MVT::v8f16, Expand);
+ setOperationAction(ISD::FADD, MVT::v8f16, Expand);
+ setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
+ setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
+ setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
// Custom lower v2i64 and v2f64 selects.
setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8f16, Custom);
setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
// Custom legalize these to avoid over promotion or custom promotion.
@@ -1118,8 +1130,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
}
- setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
- setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
@@ -1304,6 +1316,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
: &X86::VR256RegClass);
addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
+ addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
@@ -1340,12 +1354,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
- setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
@@ -1356,7 +1372,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
@@ -1386,6 +1401,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
+ setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
@@ -1507,7 +1523,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Custom lower several nodes for 256-bit types.
for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
- MVT::v8f32, MVT::v4f64 }) {
+ MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
@@ -1518,6 +1534,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::STORE, VT, Custom);
}
+ setF16Action(MVT::v16f16, Expand);
+ setOperationAction(ISD::FADD, MVT::v16f16, Expand);
+ setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
+ setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
+ setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
if (HasInt256) {
setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
@@ -1532,11 +1553,23 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
- if (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) {
- setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
- setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
+ if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
+ Subtarget.hasF16C()) {
+ for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
+ setOperationAction(ISD::FP_ROUND, VT, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
+ }
+ for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32 }) {
+ setOperationAction(ISD::FP_EXTEND, VT, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
+ }
+ for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
+ setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
+ setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
+ }
+
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
}
// This block controls legalization of the mask vector sizes that are
@@ -1619,6 +1652,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
+ addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
@@ -1645,14 +1679,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
}
- setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
- setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
- setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
- setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom);
setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
@@ -1664,7 +1700,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
@@ -1799,15 +1834,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
if (Subtarget.hasDQI()) {
- setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
- setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
- setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
-
+ for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
+ ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
+ ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
+ setOperationAction(Opc, MVT::v8i64, Custom);
setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}
@@ -1831,7 +1861,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
- MVT::v16f32, MVT::v8f64 }) {
+ MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::SELECT, VT, Custom);
@@ -1842,6 +1872,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
}
+ setF16Action(MVT::v32f16, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
+ for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
+ setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
+ setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
+ }
for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::MLOAD, VT, Legal);
@@ -1881,23 +1920,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// These operations are handled on non-VLX by artificially widening in
// isel patterns.
- setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
- Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
- Subtarget.hasVLX() ? Legal : Custom);
if (Subtarget.hasDQI()) {
// Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
@@ -1934,25 +1959,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MSCATTER, VT, Custom);
if (Subtarget.hasDQI()) {
- for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
- setOperationAction(ISD::SINT_TO_FP, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::UINT_TO_FP, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::FP_TO_SINT, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::FP_TO_UINT, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::MUL, VT, Legal);
+ for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
+ ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
+ ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) {
+ setOperationAction(Opc, MVT::v2i64, Custom);
+ setOperationAction(Opc, MVT::v4i64, Custom);
}
+ setOperationAction(ISD::MUL, MVT::v2i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v4i64, Legal);
}
if (Subtarget.hasCDI()) {
@@ -2052,7 +2066,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// AVX512_FP16 scalar operations
setGroup(MVT::f16);
- addRegisterClass(MVT::f16, &X86::FR16XRegClass);
setOperationAction(ISD::FREM, MVT::f16, Promote);
setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote);
setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
@@ -2066,6 +2079,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
@@ -2073,14 +2087,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.useAVX512Regs()) {
setGroup(MVT::v32f16);
- addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
+ setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
@@ -2112,8 +2129,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
if (Subtarget.hasVLX()) {
- addRegisterClass(MVT::v8f16, &X86::VR128XRegClass);
- addRegisterClass(MVT::v16f16, &X86::VR256XRegClass);
setGroup(MVT::v8f16);
setGroup(MVT::v16f16);
@@ -2132,8 +2147,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
// INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
@@ -2347,7 +2366,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
ISD::FP16_TO_FP,
ISD::FP_EXTEND,
ISD::STRICT_FP_EXTEND,
- ISD::FP_ROUND});
+ ISD::FP_ROUND,
+ ISD::STRICT_FP_ROUND});
computeRegisterProperties(Subtarget.getRegisterInfo());
@@ -2404,6 +2424,10 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const {
return TypeSplitVector;
if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
+ !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
+ return TypeSplitVector;
+
+ if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
VT.getVectorElementType() != MVT::i1)
return TypeWidenVector;
@@ -2447,22 +2471,21 @@ handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
- if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
- Subtarget.hasAVX512()) {
- unsigned NumElts = VT.getVectorNumElements();
+ if (VT.isVector()) {
+ if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
+ unsigned NumElts = VT.getVectorNumElements();
- MVT RegisterVT;
- unsigned NumRegisters;
- std::tie(RegisterVT, NumRegisters) =
- handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
- if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
- return RegisterVT;
- }
+ MVT RegisterVT;
+ unsigned NumRegisters;
+ std::tie(RegisterVT, NumRegisters) =
+ handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
+ if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
+ return RegisterVT;
+ }
- // v3f16 will be widen to v4f16. But we don't assign register class for v4f16.
- // So its default register type is f16. We override the type to v8f16 here.
- if (VT == MVT::v3f16 && Subtarget.hasFP16())
- return MVT::v8f16;
+ if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
+ return MVT::v8f16;
+ }
// We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
@@ -2475,22 +2498,21 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
- if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
- Subtarget.hasAVX512()) {
- unsigned NumElts = VT.getVectorNumElements();
+ if (VT.isVector()) {
+ if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
+ unsigned NumElts = VT.getVectorNumElements();
- MVT RegisterVT;
- unsigned NumRegisters;
- std::tie(RegisterVT, NumRegisters) =
- handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
- if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
- return NumRegisters;
- }
+ MVT RegisterVT;
+ unsigned NumRegisters;
+ std::tie(RegisterVT, NumRegisters) =
+ handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
+ if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
+ return NumRegisters;
+ }
- // v3f16 will be widen to v4f16. But we don't assign register class for v4f16.
- // So its default register number is 3. We override the number to 1 here.
- if (VT == MVT::v3f16 && Subtarget.hasFP16())
- return 1;
+ if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
+ return 1;
+ }
// We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
// x87 is disabled.
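Illustrative aside (not from this patch): with the two overrides above, a half vector shorter than 8 elements is passed in a single register as v8f16. A minimal sketch, assuming Clang's ext_vector_type extension and _Float16 support on the target:

    // A 4 x _Float16 argument/result now occupies one XMM register as v8f16;
    // the upper lanes are simply unused.
    typedef _Float16 half4 __attribute__((ext_vector_type(4)));
    half4 add_half4(half4 a, half4 b) { return a + b; }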
@@ -9646,13 +9668,13 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
EVT CVT = Ld.getValueType();
assert(!CVT.isVector() && "Must not broadcast a vector type");
- // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
+ // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
// For size optimization, also splat v2f64 and v2i64, and for size opt
// with AVX2, also splat i8 and i16.
// With pattern matching, the VBROADCAST node may become a VMOVDDUP.
if (ScalarSize == 32 ||
(ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
- (ScalarSize == 16 && Subtarget.hasFP16() && CVT.isFloatingPoint()) ||
+ CVT == MVT::f16 ||
(OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
const Constant *C = nullptr;
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
@@ -14129,6 +14151,16 @@ static bool isShuffleFoldableLoad(SDValue V) {
ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
}
+template<typename T>
+static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
+ return VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16();
+}
+
+template<typename T>
+bool X86TargetLowering::isSoftFP16(T VT) const {
+ return ::isSoftFP16(VT, Subtarget);
+}
+
/// Try to lower insertion of a single element into a zero vector.
///
/// This is a common pattern that we have especially efficient patterns to lower
@@ -14140,6 +14172,9 @@ static SDValue lowerShuffleAsElementInsertion(
MVT ExtVT = VT;
MVT EltVT = VT.getVectorElementType();
+ if (isSoftFP16(EltVT, Subtarget))
+ return SDValue();
+
int V2Index =
find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
Mask.begin();
@@ -19444,6 +19479,15 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+ if (isSoftFP16(VT)) {
+ MVT NVT = VT.changeVectorElementTypeToInteger();
+ return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
+ DAG.getBitcast(NVT, LHS),
+ DAG.getBitcast(NVT, RHS)));
+ }
+
// A vselect where all conditions and data are constants can be optimized into
// a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
@@ -19467,8 +19511,6 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
if (!Subtarget.hasSSE41())
return SDValue();
- SDLoc dl(Op);
- MVT VT = Op.getSimpleValueType();
unsigned EltSize = VT.getScalarSizeInBits();
unsigned NumElts = VT.getVectorNumElements();
@@ -20856,16 +20898,6 @@ static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
return Cvt;
}
-template<typename T>
-static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
- return VT == MVT::f16 && !Subtarget.hasFP16();
-}
-
-template<typename T>
-bool X86TargetLowering::isSoftFP16(T VT) const {
- return ::isSoftFP16(VT, Subtarget);
-}
-
static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
bool IsStrict = Op->isStrictFPOpcode();
SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
@@ -20885,6 +20917,26 @@ static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
}
+static bool isLegalConversion(MVT VT, bool IsSigned,
+ const X86Subtarget &Subtarget) {
+ if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
+ return true;
+ if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
+ return true;
+ if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
+ return true;
+ if (Subtarget.useAVX512Regs()) {
+ if (VT == MVT::v16i32)
+ return true;
+ if (VT == MVT::v8i64 && Subtarget.hasDQI())
+ return true;
+ }
+ if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
+ (VT == MVT::v2i64 || VT == MVT::v4i64))
+ return true;
+ return false;
+}
+
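Illustrative aside (not from this patch): the first case accepted by isLegalConversion is the plain SSE2 signed v4i32 -> v4f32 conversion, which is already a single instruction, so the lowering can return the node unchanged. A minimal sketch, assuming <immintrin.h>:

    #include <immintrin.h>
    // Signed v4i32 -> v4f32 is natively legal from SSE2 on (cvtdq2ps).
    __m128 to_f32(__m128i v) { return _mm_cvtepi32_ps(v); }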
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
@@ -20897,6 +20949,8 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
if (isSoftFP16(VT))
return promoteXINT_TO_FP(Op, DAG);
+ else if (isLegalConversion(SrcVT, true, Subtarget))
+ return Op;
if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
return LowerWin64_INT128_TO_FP(Op, DAG);
@@ -21400,6 +21454,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
if (isSoftFP16(DstVT))
return promoteXINT_TO_FP(Op, DAG);
+ else if (isLegalConversion(SrcVT, false, Subtarget))
+ return Op;
if (DstVT.isVector())
return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
@@ -22229,6 +22285,8 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
{NVT, MVT::Other}, {Chain, Src})});
return DAG.getNode(Op.getOpcode(), dl, VT,
DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
+ } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
+ return Op;
}
if (VT.isVector()) {
@@ -22826,7 +22884,7 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
return Op;
if (SVT.getVectorElementType() == MVT::f16) {
- assert(Subtarget.hasFP16() && Subtarget.hasVLX() && "Unexpected features!");
+ assert(Subtarget.hasF16C() && "Unexpected features!");
if (SVT == MVT::v2f16)
In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
DAG.getUNDEF(MVT::v2f16));
@@ -22836,6 +22894,8 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
{Op->getOperand(0), Res});
return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
+ } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
+ return Op;
}
assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
@@ -22854,34 +22914,19 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
SDValue In = Op.getOperand(IsStrict ? 1 : 0);
- SDValue Op2 = Op.getOperand(IsStrict ? 2 : 1);
MVT VT = Op.getSimpleValueType();
MVT SVT = In.getSimpleValueType();
if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
return SDValue();
- if (VT == MVT::f16) {
- if (Subtarget.hasFP16())
- return Op;
-
- if (SVT != MVT::f32) {
- if (IsStrict)
- return DAG.getNode(
- ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other},
- {Chain,
- DAG.getNode(ISD::STRICT_FP_ROUND, DL, {MVT::f32, MVT::Other},
- {Chain, In, Op2}),
- Op2});
-
- return DAG.getNode(ISD::FP_ROUND, DL, VT,
- DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, In, Op2),
- Op2);
- }
-
- if (!Subtarget.hasF16C())
+ if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
+ if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
return SDValue();
+ if (VT.isVector())
+ return Op;
+
SDValue Res;
SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
MVT::i32);
@@ -24176,10 +24221,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
SDLoc dl(Op);
if (isFP) {
-#ifndef NDEBUG
MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
-#endif
+ if (isSoftFP16(EltVT, Subtarget))
+ return SDValue();
bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
@@ -24741,6 +24786,9 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
ISD::CondCode CC =
cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
+ if (isSoftFP16(Op0.getValueType()))
+ return SDValue();
+
// Handle f128 first, since one possible outcome is a normal integer
// comparison which gets handled by emitFlagsForSetcc.
if (Op0.getValueType() == MVT::f128) {
@@ -24931,10 +24979,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op1.getSimpleValueType();
SDValue CC;
- if (isSoftFP16(VT))
- return DAG.getBitcast(MVT::f16, DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond,
- DAG.getBitcast(MVT::i16, Op1),
- DAG.getBitcast(MVT::i16, Op2)));
+ if (isSoftFP16(VT)) {
+ MVT NVT = VT.changeTypeToInteger();
+ return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
+ DAG.getBitcast(NVT, Op1),
+ DAG.getBitcast(NVT, Op2)));
+ }
// Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
// are available or VBLENDV if AVX is available.
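Illustrative aside (not from this patch): the soft-f16 lowering above selects on the raw integer bit pattern and bitcasts back, so no half-precision operations are required. A scalar analogue, assuming a C++20 compiler with std::bit_cast and _Float16 support:

    #include <bit>
    #include <cstdint>
    // Select between two f16 values via their 16-bit patterns.
    _Float16 select_f16(bool c, _Float16 a, _Float16 b) {
      uint16_t ai = std::bit_cast<uint16_t>(a);
      uint16_t bi = std::bit_cast<uint16_t>(b);
      return std::bit_cast<_Float16>(c ? ai : bi);
    }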
@@ -27268,27 +27318,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
}
- case Intrinsic::swift_async_context_addr: {
- auto &MF = DAG.getMachineFunction();
- auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
- if (Subtarget.is64Bit()) {
- MF.getFrameInfo().setFrameAddressIsTaken(true);
- X86FI->setHasSwiftAsyncContext(true);
- return SDValue(
- DAG.getMachineNode(
- X86::SUB64ri8, dl, MVT::i64,
- DAG.getCopyFromReg(DAG.getEntryNode(), dl, X86::RBP, MVT::i64),
- DAG.getTargetConstant(8, dl, MVT::i32)),
- 0);
- } else {
- // 32-bit so no special extended frame, create or reuse an existing stack
- // slot.
- if (!X86FI->getSwiftAsyncContextFrameIdx())
- X86FI->setSwiftAsyncContextFrameIdx(
- MF.getFrameInfo().CreateStackObject(4, Align(4), false));
- return DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
- }
- }
case Intrinsic::x86_avx512_vp2intersect_q_512:
case Intrinsic::x86_avx512_vp2intersect_q_256:
case Intrinsic::x86_avx512_vp2intersect_q_128:
@@ -27668,6 +27697,37 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
if (!IntrData) {
switch (IntNo) {
+
+ case Intrinsic::swift_async_context_addr: {
+ SDLoc dl(Op);
+ auto &MF = DAG.getMachineFunction();
+ auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ if (Subtarget.is64Bit()) {
+ MF.getFrameInfo().setFrameAddressIsTaken(true);
+ X86FI->setHasSwiftAsyncContext(true);
+ SDValue Chain = Op->getOperand(0);
+ SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
+ SDValue Result =
+ SDValue(DAG.getMachineNode(X86::SUB64ri8, dl, MVT::i64, CopyRBP,
+ DAG.getTargetConstant(8, dl, MVT::i32)),
+ 0);
+ // Return { result, chain }.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
+ CopyRBP.getValue(1));
+ } else {
+ // 32-bit so no special extended frame, create or reuse an existing
+ // stack slot.
+ if (!X86FI->getSwiftAsyncContextFrameIdx())
+ X86FI->setSwiftAsyncContextFrameIdx(
+ MF.getFrameInfo().CreateStackObject(4, Align(4), false));
+ SDValue Result =
+ DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
+ // Return { result, chain }.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
+ Op->getOperand(0));
+ }
+ }
+
case llvm::Intrinsic::x86_seh_ehregnode:
return MarkEHRegistrationNode(Op, DAG);
case llvm::Intrinsic::x86_seh_ehguard:
@@ -32901,20 +32961,39 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::STRICT_FP_ROUND:
case ISD::FP_ROUND: {
bool IsStrict = N->isStrictFPOpcode();
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
SDValue Src = N->getOperand(IsStrict ? 1 : 0);
+ SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
+ EVT SrcVT = Src.getValueType();
EVT VT = N->getValueType(0);
- EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
+ SDValue V;
if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
: DAG.getUNDEF(MVT::v2f32);
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
}
+ if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
+ assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
+ if (SrcVT.getVectorElementType() != MVT::f32)
+ return;
+
+ if (IsStrict)
+ V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
+ {Chain, Src, Rnd});
+ else
+ V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
+
+ Results.push_back(DAG.getBitcast(MVT::v8f16, V));
+ if (IsStrict)
+ Results.push_back(V.getValue(1));
+ return;
+ }
if (!isTypeLegal(Src.getValueType()))
return;
- SDValue V;
+ EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
if (IsStrict)
V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
- {N->getOperand(0), Src});
+ {Chain, Src});
else
V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
Results.push_back(V);
@@ -37342,6 +37421,7 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
bool IsUnary) {
unsigned NumMaskElts = Mask.size();
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
+ unsigned SizeInBits = MaskVT.getSizeInBits();
if (MaskVT.is128BitVector()) {
if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
@@ -37409,7 +37489,10 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
// Attempt to match against a OR if we're performing a blend shuffle and the
// non-blended source element is zero in each case.
- if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
+ // TODO: Handle cases where V1/V2 sizes don't match SizeInBits.
+ if (SizeInBits == V1.getValueSizeInBits() &&
+ SizeInBits == V2.getValueSizeInBits() &&
+ (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
(EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
bool IsBlend = true;
unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
@@ -39652,11 +39735,6 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
SmallVector<int, 4> Mask;
unsigned Opcode = N.getOpcode();
- // FIXME: Remove this after we support vector FP16
- if (isSoftFP16(peekThroughBitcasts(N.getOperand(0)).getSimpleValueType(),
- Subtarget))
- return SDValue();
-
if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
return R;
@@ -40947,12 +41025,20 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
EltBits)) {
OpBits.clearAllBits();
OpElts.clearAllBits();
- for (int I = 0; I != NumElts; ++I)
- if (DemandedElts[I] && ((Invert && !EltBits[I].isAllOnes()) ||
- (!Invert && !EltBits[I].isZero()))) {
+ for (int I = 0; I != NumElts; ++I) {
+ if (!DemandedElts[I])
+ continue;
+ if (UndefElts[I]) {
+ // We can't assume an undef src element gives an undef dst - the
+ // other src might be zero.
+ OpBits.setAllBits();
+ OpElts.setBit(I);
+ } else if ((Invert && !EltBits[I].isAllOnes()) ||
+ (!Invert && !EltBits[I].isZero())) {
OpBits |= Invert ? ~EltBits[I] : EltBits[I];
OpElts.setBit(I);
}
+ }
}
return std::make_pair(OpBits, OpElts);
};
@@ -44715,7 +44801,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
}
// Early exit check
- if (!TLI.isTypeLegal(VT))
+ if (!TLI.isTypeLegal(VT) || isSoftFP16(VT, Subtarget))
return SDValue();
if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
@@ -47798,11 +47884,17 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
EltBits)) {
DemandedBits.clearAllBits();
DemandedElts.clearAllBits();
- for (int I = 0; I != NumElts; ++I)
- if (!EltBits[I].isZero()) {
+ for (int I = 0; I != NumElts; ++I) {
+ if (UndefElts[I]) {
+ // We can't assume an undef src element gives an undef dst - the
+ // other src might be zero.
+ DemandedBits.setAllBits();
+ DemandedElts.setBit(I);
+ } else if (!EltBits[I].isZero()) {
DemandedBits |= EltBits[I];
DemandedElts.setBit(I);
}
+ }
}
return std::make_pair(DemandedBits, DemandedElts);
};
@@ -51042,6 +51134,8 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
MVT VT = N->getSimpleValueType(0);
+ int NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
// ANDNP(undef, x) -> 0
// ANDNP(x, undef) -> 0
@@ -51060,6 +51154,19 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
if (SDValue Not = IsNOT(N0, DAG))
return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N1);
+ // Constant Folding
+ APInt Undefs0, Undefs1;
+ SmallVector<APInt> EltBits0, EltBits1;
+ if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0) &&
+ getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1)) {
+ SDLoc DL(N);
+ SmallVector<APInt> ResultBits;
+ for (int I = 0; I != NumElts; ++I)
+ ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
+ APInt ResultUndefs = APInt::getZero(NumElts);
+ return getConstVector(ResultBits, ResultUndefs, VT, DAG, DL);
+ }
+
// TODO: Constant fold NOT(N0) to allow us to use AND.
// TODO: Do this in IsNOT with suitable oneuse checks?
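For reference (not part of this diff), ANDNP computes ~a & b lane by lane; a minimal standalone sketch of the new constant folding, using plain 32-bit lanes in place of APInt:

    #include <array>
    #include <cstdint>
    // Constant-fold andnp(a, b): each result lane is ~a & b.
    std::array<uint32_t, 4> fold_andnp(const std::array<uint32_t, 4> &a,
                                       const std::array<uint32_t, 4> &b) {
      std::array<uint32_t, 4> r{};
      for (int i = 0; i != 4; ++i)
        r[i] = ~a[i] & b[i];
      return r;
    }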
@@ -51074,20 +51181,24 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
APInt UndefElts;
SmallVector<APInt> EltBits;
- int NumElts = VT.getVectorNumElements();
- int EltSizeInBits = VT.getScalarSizeInBits();
APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
APInt DemandedElts = APInt::getAllOnes(NumElts);
if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
EltBits)) {
DemandedBits.clearAllBits();
DemandedElts.clearAllBits();
- for (int I = 0; I != NumElts; ++I)
- if ((Invert && !EltBits[I].isAllOnes()) ||
- (!Invert && !EltBits[I].isZero())) {
+ for (int I = 0; I != NumElts; ++I) {
+ if (UndefElts[I]) {
+ // We can't assume an undef src element gives an undef dst - the
+ // other src might be zero.
+ DemandedBits.setAllBits();
+ DemandedElts.setBit(I);
+ } else if ((Invert && !EltBits[I].isAllOnes()) ||
+ (!Invert && !EltBits[I].isZero())) {
DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
DemandedElts.setBit(I);
}
+ }
}
return std::make_pair(DemandedBits, DemandedElts);
};
@@ -54714,8 +54825,9 @@ static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
if (Subtarget.hasFP16())
return SDValue();
+ bool IsStrict = N->isStrictFPOpcode();
EVT VT = N->getValueType(0);
- SDValue Src = N->getOperand(0);
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
EVT SrcVT = Src.getValueType();
if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
@@ -54736,8 +54848,15 @@ static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
// Destination is v8i16 with at least 8 elements.
EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
std::max(8U, NumElts));
- SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
- DAG.getTargetConstant(4, dl, MVT::i32));
+ SDValue Cvt, Chain;
+ SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
+ if (IsStrict) {
+ Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
+ {N->getOperand(0), Src, Rnd});
+ Chain = Cvt.getValue(1);
+ } else {
+ Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
+ }
// Extract down to real number of elements.
if (NumElts < 8) {
@@ -54746,7 +54865,12 @@ static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
DAG.getIntPtrConstant(0, dl));
}
- return DAG.getBitcast(VT, Cvt);
+ Cvt = DAG.getBitcast(VT, Cvt);
+
+ if (IsStrict)
+ return DAG.getMergeValues({Cvt, Chain}, dl);
+
+ return Cvt;
}
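Illustrative aside (not from this patch): the rounding-control immediate 4 emitted above is _MM_FROUND_CUR_DIRECTION, the same value the F16C conversion intrinsics take. A minimal sketch, assuming <immintrin.h> and F16C:

    #include <immintrin.h>
    // Round an f32 to f16 storage and extend it back, honouring the current
    // MXCSR rounding mode (immediate 4).
    float round_trip_f16(float x) {
      unsigned short h = _cvtss_sh(x, _MM_FROUND_CUR_DIRECTION);
      return _cvtsh_ss(h);
    }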
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
@@ -54954,6 +55078,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
case ISD::STRICT_FP_EXTEND:
case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
+ case ISD::STRICT_FP_ROUND:
case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
case X86ISD::VBROADCAST_LOAD:
case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 48da7b3ac882..c105bde78ad1 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3769,12 +3769,16 @@ let Predicates = [HasAVX512] in {
(VMOVDQA64Zrm addr:$src)>;
def : Pat<(alignedloadv32i16 addr:$src),
(VMOVDQA64Zrm addr:$src)>;
+ def : Pat<(alignedloadv32f16 addr:$src),
+ (VMOVAPSZrm addr:$src)>;
def : Pat<(alignedloadv64i8 addr:$src),
(VMOVDQA64Zrm addr:$src)>;
def : Pat<(loadv16i32 addr:$src),
(VMOVDQU64Zrm addr:$src)>;
def : Pat<(loadv32i16 addr:$src),
(VMOVDQU64Zrm addr:$src)>;
+ def : Pat<(loadv32f16 addr:$src),
+ (VMOVUPSZrm addr:$src)>;
def : Pat<(loadv64i8 addr:$src),
(VMOVDQU64Zrm addr:$src)>;
@@ -3783,12 +3787,16 @@ let Predicates = [HasAVX512] in {
(VMOVDQA64Zmr addr:$dst, VR512:$src)>;
def : Pat<(alignedstore (v32i16 VR512:$src), addr:$dst),
(VMOVDQA64Zmr addr:$dst, VR512:$src)>;
+ def : Pat<(alignedstore (v32f16 VR512:$src), addr:$dst),
+ (VMOVAPSZmr addr:$dst, VR512:$src)>;
def : Pat<(alignedstore (v64i8 VR512:$src), addr:$dst),
(VMOVDQA64Zmr addr:$dst, VR512:$src)>;
def : Pat<(store (v16i32 VR512:$src), addr:$dst),
(VMOVDQU64Zmr addr:$dst, VR512:$src)>;
def : Pat<(store (v32i16 VR512:$src), addr:$dst),
(VMOVDQU64Zmr addr:$dst, VR512:$src)>;
+ def : Pat<(store (v32f16 VR512:$src), addr:$dst),
+ (VMOVUPSZmr addr:$dst, VR512:$src)>;
def : Pat<(store (v64i8 VR512:$src), addr:$dst),
(VMOVDQU64Zmr addr:$dst, VR512:$src)>;
}
@@ -3799,12 +3807,16 @@ let Predicates = [HasVLX] in {
(VMOVDQA64Z128rm addr:$src)>;
def : Pat<(alignedloadv8i16 addr:$src),
(VMOVDQA64Z128rm addr:$src)>;
+ def : Pat<(alignedloadv8f16 addr:$src),
+ (VMOVAPSZ128rm addr:$src)>;
def : Pat<(alignedloadv16i8 addr:$src),
(VMOVDQA64Z128rm addr:$src)>;
def : Pat<(loadv4i32 addr:$src),
(VMOVDQU64Z128rm addr:$src)>;
def : Pat<(loadv8i16 addr:$src),
(VMOVDQU64Z128rm addr:$src)>;
+ def : Pat<(loadv8f16 addr:$src),
+ (VMOVUPSZ128rm addr:$src)>;
def : Pat<(loadv16i8 addr:$src),
(VMOVDQU64Z128rm addr:$src)>;
@@ -3813,12 +3825,16 @@ let Predicates = [HasVLX] in {
(VMOVDQA64Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst),
(VMOVDQA64Z128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(alignedstore (v8f16 VR128X:$src), addr:$dst),
+ (VMOVAPSZ128mr addr:$dst, VR128X:$src)>;
def : Pat<(alignedstore (v16i8 VR128X:$src), addr:$dst),
(VMOVDQA64Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(store (v4i32 VR128X:$src), addr:$dst),
(VMOVDQU64Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(store (v8i16 VR128X:$src), addr:$dst),
(VMOVDQU64Z128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(store (v8f16 VR128X:$src), addr:$dst),
+ (VMOVUPSZ128mr addr:$dst, VR128X:$src)>;
def : Pat<(store (v16i8 VR128X:$src), addr:$dst),
(VMOVDQU64Z128mr addr:$dst, VR128X:$src)>;
@@ -3827,12 +3843,16 @@ let Predicates = [HasVLX] in {
(VMOVDQA64Z256rm addr:$src)>;
def : Pat<(alignedloadv16i16 addr:$src),
(VMOVDQA64Z256rm addr:$src)>;
+ def : Pat<(alignedloadv16f16 addr:$src),
+ (VMOVAPSZ256rm addr:$src)>;
def : Pat<(alignedloadv32i8 addr:$src),
(VMOVDQA64Z256rm addr:$src)>;
def : Pat<(loadv8i32 addr:$src),
(VMOVDQU64Z256rm addr:$src)>;
def : Pat<(loadv16i16 addr:$src),
(VMOVDQU64Z256rm addr:$src)>;
+ def : Pat<(loadv16f16 addr:$src),
+ (VMOVUPSZ256rm addr:$src)>;
def : Pat<(loadv32i8 addr:$src),
(VMOVDQU64Z256rm addr:$src)>;
@@ -3841,12 +3861,16 @@ let Predicates = [HasVLX] in {
(VMOVDQA64Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(alignedstore (v16i16 VR256X:$src), addr:$dst),
(VMOVDQA64Z256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(alignedstore (v16f16 VR256X:$src), addr:$dst),
+ (VMOVAPSZ256mr addr:$dst, VR256X:$src)>;
def : Pat<(alignedstore (v32i8 VR256X:$src), addr:$dst),
(VMOVDQA64Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(store (v8i32 VR256X:$src), addr:$dst),
(VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(store (v16i16 VR256X:$src), addr:$dst),
(VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(store (v16f16 VR256X:$src), addr:$dst),
+ (VMOVUPSZ256mr addr:$dst, VR256X:$src)>;
def : Pat<(store (v32i8 VR256X:$src), addr:$dst),
(VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
}
@@ -3855,16 +3879,12 @@ let Predicates = [HasBWI] in {
(VMOVDQU16Zrrk VR512:$src0, VK32WM:$mask, VR512:$src1)>;
def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), v32f16_info.ImmAllZerosV)),
(VMOVDQU16Zrrkz VK32WM:$mask, VR512:$src1)>;
- def : Pat<(v32f16 (alignedloadv32f16 addr:$src)),
- (VMOVAPSZrm addr:$src)>;
def : Pat<(v32f16 (vselect VK32WM:$mask,
(v32f16 (alignedloadv32f16 addr:$src)), (v32f16 VR512:$src0))),
(VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>;
def : Pat<(v32f16 (vselect VK32WM:$mask,
(v32f16 (alignedloadv32f16 addr:$src)), v32f16_info.ImmAllZerosV)),
(VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>;
- def : Pat<(v32f16 (loadv32f16 addr:$src)),
- (VMOVUPSZrm addr:$src)>;
def : Pat<(v32f16 (vselect VK32WM:$mask,
(v32f16 (loadv32f16 addr:$src)), (v32f16 VR512:$src0))),
(VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>;
@@ -3878,10 +3898,6 @@ let Predicates = [HasBWI] in {
def : Pat<(v32f16 (masked_load addr:$src, VK32WM:$mask, v32f16_info.ImmAllZerosV)),
(VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>;
- def : Pat<(alignedstore (v32f16 VR512:$src), addr:$dst),
- (VMOVAPSZmr addr:$dst, VR512:$src)>;
- def : Pat<(store (v32f16 VR512:$src), addr:$dst),
- (VMOVUPSZmr addr:$dst, VR512:$src)>;
def : Pat<(masked_store (v32f16 VR512:$src), addr:$dst, VK32WM:$mask),
(VMOVDQU16Zmrk addr:$dst, VK32WM:$mask, VR512:$src)>;
}
@@ -3890,16 +3906,12 @@ let Predicates = [HasBWI, HasVLX] in {
(VMOVDQU16Z256rrk VR256X:$src0, VK16WM:$mask, VR256X:$src1)>;
def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), v16f16x_info.ImmAllZerosV)),
(VMOVDQU16Z256rrkz VK16WM:$mask, VR256X:$src1)>;
- def : Pat<(v16f16 (alignedloadv16f16 addr:$src)),
- (VMOVAPSZ256rm addr:$src)>;
def : Pat<(v16f16 (vselect VK16WM:$mask,
(v16f16 (alignedloadv16f16 addr:$src)), (v16f16 VR256X:$src0))),
(VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>;
def : Pat<(v16f16 (vselect VK16WM:$mask,
(v16f16 (alignedloadv16f16 addr:$src)), v16f16x_info.ImmAllZerosV)),
(VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>;
- def : Pat<(v16f16 (loadv16f16 addr:$src)),
- (VMOVUPSZ256rm addr:$src)>;
def : Pat<(v16f16 (vselect VK16WM:$mask,
(v16f16 (loadv16f16 addr:$src)), (v16f16 VR256X:$src0))),
(VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>;
@@ -3913,10 +3925,6 @@ let Predicates = [HasBWI, HasVLX] in {
def : Pat<(v16f16 (masked_load addr:$src, VK16WM:$mask, v16f16x_info.ImmAllZerosV)),
(VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>;
- def : Pat<(alignedstore (v16f16 VR256X:$src), addr:$dst),
- (VMOVAPSZ256mr addr:$dst, VR256X:$src)>;
- def : Pat<(store (v16f16 VR256X:$src), addr:$dst),
- (VMOVUPSZ256mr addr:$dst, VR256X:$src)>;
def : Pat<(masked_store (v16f16 VR256X:$src), addr:$dst, VK16WM:$mask),
(VMOVDQU16Z256mrk addr:$dst, VK16WM:$mask, VR256X:$src)>;
@@ -3924,16 +3932,12 @@ let Predicates = [HasBWI, HasVLX] in {
(VMOVDQU16Z128rrk VR128X:$src0, VK8WM:$mask, VR128X:$src1)>;
def : Pat<(v8f16 (vselect VK8WM:$mask, (v8f16 VR128X:$src1), v8f16x_info.ImmAllZerosV)),
(VMOVDQU16Z128rrkz VK8WM:$mask, VR128X:$src1)>;
- def : Pat<(v8f16 (alignedloadv8f16 addr:$src)),
- (VMOVAPSZ128rm addr:$src)>;
def : Pat<(v8f16 (vselect VK8WM:$mask,
(v8f16 (alignedloadv8f16 addr:$src)), (v8f16 VR128X:$src0))),
(VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>;
def : Pat<(v8f16 (vselect VK8WM:$mask,
(v8f16 (alignedloadv8f16 addr:$src)), v8f16x_info.ImmAllZerosV)),
(VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>;
- def : Pat<(v8f16 (loadv8f16 addr:$src)),
- (VMOVUPSZ128rm addr:$src)>;
def : Pat<(v8f16 (vselect VK8WM:$mask,
(v8f16 (loadv8f16 addr:$src)), (v8f16 VR128X:$src0))),
(VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>;
@@ -3947,10 +3951,6 @@ let Predicates = [HasBWI, HasVLX] in {
def : Pat<(v8f16 (masked_load addr:$src, VK8WM:$mask, v8f16x_info.ImmAllZerosV)),
(VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>;
- def : Pat<(alignedstore (v8f16 VR128X:$src), addr:$dst),
- (VMOVAPSZ128mr addr:$dst, VR128X:$src)>;
- def : Pat<(store (v8f16 VR128X:$src), addr:$dst),
- (VMOVUPSZ128mr addr:$dst, VR128X:$src)>;
def : Pat<(masked_store (v8f16 VR128X:$src), addr:$dst, VK8WM:$mask),
(VMOVDQU16Z128mrk addr:$dst, VK8WM:$mask, VR128X:$src)>;
}
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index ec32ac2acad1..74ef831e1658 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -742,8 +742,8 @@ static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
return isPICBase;
}
-bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AAResults *AA) const {
+bool X86InstrInfo::isReallyTriviallyReMaterializable(
+ const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default:
// This function should only be called for opcodes with the ReMaterializable
@@ -869,7 +869,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
- MI.isDereferenceableInvariantLoad(AA)) {
+ MI.isDereferenceableInvariantLoad()) {
Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
if (BaseReg == 0 || BaseReg == X86::RIP)
return true;
@@ -3892,6 +3892,10 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
Register DestReg, int FrameIdx,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
+ const MachineFunction &MF = *MBB.getParent();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
+ "Load size exceeds stack slot");
if (RC->getID() == X86::TILERegClassID) {
unsigned Opc = X86::TILELOADD;
// tileloadd (%sp, %idx), %tmm
@@ -3913,8 +3917,6 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
FrameIdx);
} else {
- const MachineFunction &MF = *MBB.getParent();
- const MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
bool isAligned =
(Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 4943d2152fd2..98da00c39bdb 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -240,8 +240,7 @@ public:
unsigned isStoreToStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const override;
- bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AAResults *AA) const override;
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override;
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
Register DestReg, unsigned SubIdx,
const MachineInstr &Orig,
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 06cb280e860a..c5557bd5df4e 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -140,6 +140,7 @@ def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
let Predicates = [NoAVX512] in {
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
+def : Pat<(v8f16 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
@@ -159,6 +160,7 @@ def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
let Predicates = [NoAVX512] in {
def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
+def : Pat<(v16f16 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
@@ -572,6 +574,23 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVUPSYmr addr:$dst, VR256:$src)>;
def : Pat<(store (v32i8 VR256:$src), addr:$dst),
(VMOVUPSYmr addr:$dst, VR256:$src)>;
+
+ def : Pat<(alignedloadv8f16 addr:$src),
+ (VMOVAPSrm addr:$src)>;
+ def : Pat<(loadv8f16 addr:$src),
+ (VMOVUPSrm addr:$src)>;
+ def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
+ (VMOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8f16 VR128:$src), addr:$dst),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedloadv16f16 addr:$src),
+ (VMOVAPSYrm addr:$src)>;
+ def : Pat<(loadv16f16 addr:$src),
+ (VMOVUPSYrm addr:$src)>;
+ def : Pat<(alignedstore (v16f16 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v16f16 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
}
// Use movaps / movups for SSE integer load / store (one byte shorter).
@@ -613,6 +632,17 @@ let Predicates = [UseSSE1] in {
(MOVUPSmr addr:$dst, VR128:$src)>;
}
+let Predicates = [UseSSE2] in {
+ def : Pat<(alignedloadv8f16 addr:$src),
+ (MOVAPSrm addr:$src)>;
+ def : Pat<(loadv8f16 addr:$src),
+ (MOVUPSrm addr:$src)>;
+ def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8f16 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+}
+
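Illustrative aside (not from this patch): these patterns mean 8 x f16 vector loads and stores need no new instructions; the existing 128-bit float moves already cover them. A minimal sketch, assuming <immintrin.h> (the 16 bytes are simply viewed as 4 floats):

    #include <immintrin.h>
    // Copy 16 bytes holding 8 packed f16 values with ordinary movups.
    void copy_8xf16(const float *src, float *dst) {
      _mm_storeu_ps(dst, _mm_loadu_ps(src));
    }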
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//
@@ -3136,6 +3166,8 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVNTDQYmr addr:$dst, VR256:$src)>;
def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
(VMOVNTDQYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignednontemporalstore (v16f16 VR256:$src), addr:$dst),
+ (VMOVNTDQYmr addr:$dst, VR256:$src)>;
def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
(VMOVNTDQYmr addr:$dst, VR256:$src)>;
@@ -3143,6 +3175,8 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVNTDQmr addr:$dst, VR128:$src)>;
def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
(VMOVNTDQmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst),
+ (VMOVNTDQmr addr:$dst, VR128:$src)>;
def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
(VMOVNTDQmr addr:$dst, VR128:$src)>;
}
@@ -3152,6 +3186,8 @@ let Predicates = [UseSSE2] in {
(MOVNTDQmr addr:$dst, VR128:$src)>;
def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
(MOVNTDQmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst),
+ (MOVNTDQmr addr:$dst, VR128:$src)>;
def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
(MOVNTDQmr addr:$dst, VR128:$src)>;
}
@@ -3374,12 +3410,16 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVDQArm addr:$src)>;
def : Pat<(alignedloadv8i16 addr:$src),
(VMOVDQArm addr:$src)>;
+ def : Pat<(alignedloadv8f16 addr:$src),
+ (VMOVDQArm addr:$src)>;
def : Pat<(alignedloadv16i8 addr:$src),
(VMOVDQArm addr:$src)>;
def : Pat<(loadv4i32 addr:$src),
(VMOVDQUrm addr:$src)>;
def : Pat<(loadv8i16 addr:$src),
(VMOVDQUrm addr:$src)>;
+ def : Pat<(loadv8f16 addr:$src),
+ (VMOVDQUrm addr:$src)>;
def : Pat<(loadv16i8 addr:$src),
(VMOVDQUrm addr:$src)>;
@@ -3387,12 +3427,16 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVDQAmr addr:$dst, VR128:$src)>;
def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
(VMOVDQAmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
+ (VMOVDQAmr addr:$dst, VR128:$src)>;
def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
(VMOVDQAmr addr:$dst, VR128:$src)>;
def : Pat<(store (v4i32 VR128:$src), addr:$dst),
(VMOVDQUmr addr:$dst, VR128:$src)>;
def : Pat<(store (v8i16 VR128:$src), addr:$dst),
(VMOVDQUmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8f16 VR128:$src), addr:$dst),
+ (VMOVDQUmr addr:$dst, VR128:$src)>;
def : Pat<(store (v16i8 VR128:$src), addr:$dst),
(VMOVDQUmr addr:$dst, VR128:$src)>;
}
@@ -6431,6 +6475,8 @@ let Predicates = [HasAVX2, NoVLX] in {
(VMOVNTDQAYrm addr:$src)>;
def : Pat<(v16i16 (alignednontemporalload addr:$src)),
(VMOVNTDQAYrm addr:$src)>;
+ def : Pat<(v16f16 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAYrm addr:$src)>;
def : Pat<(v32i8 (alignednontemporalload addr:$src)),
(VMOVNTDQAYrm addr:$src)>;
}
@@ -6446,6 +6492,8 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVNTDQArm addr:$src)>;
def : Pat<(v8i16 (alignednontemporalload addr:$src)),
(VMOVNTDQArm addr:$src)>;
+ def : Pat<(v8f16 (alignednontemporalload addr:$src)),
+ (VMOVNTDQArm addr:$src)>;
def : Pat<(v16i8 (alignednontemporalload addr:$src)),
(VMOVNTDQArm addr:$src)>;
}
@@ -6461,6 +6509,8 @@ let Predicates = [UseSSE41] in {
(MOVNTDQArm addr:$src)>;
def : Pat<(v8i16 (alignednontemporalload addr:$src)),
(MOVNTDQArm addr:$src)>;
+ def : Pat<(v8f16 (alignednontemporalload addr:$src)),
+ (MOVNTDQArm addr:$src)>;
def : Pat<(v16i8 (alignednontemporalload addr:$src)),
(MOVNTDQArm addr:$src)>;
}
@@ -7050,6 +7100,8 @@ def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF128 addr:$src)>;
def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF128 addr:$src)>;
+def : Pat<(v16f16 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF128 addr:$src)>;
def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF128 addr:$src)>;
}
@@ -7095,6 +7147,7 @@ let Predicates = [HasAVX1Only] in {
defm : vperm2x128_lowering<"VPERM2F128", v4i64, loadv4i64>;
defm : vperm2x128_lowering<"VPERM2F128", v8i32, loadv8i32>;
defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>;
+ defm : vperm2x128_lowering<"VPERM2F128", v16f16, loadv16f16>;
defm : vperm2x128_lowering<"VPERM2F128", v32i8, loadv32i8>;
}
@@ -7150,6 +7203,8 @@ let Predicates = [HasAVX1Only] in {
defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2i64, v4i64, loadv2i64, loadv4i64>;
defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4i32, v8i32, loadv4i32, loadv8i32>;
defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8i16, v16i16, loadv8i16, loadv16i16>;
+ defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8f16, v16f16, loadv8f16, loadv16f16>;
+ defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8, loadv16i8, loadv32i8>;
defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8, loadv16i8, loadv32i8>;
}
@@ -7189,6 +7244,8 @@ let Predicates = [HasAVX1Only] in {
defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>;
defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>;
defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
+ defm : vextract_lowering<"VEXTRACTF128", v16f16, v8f16>;
+ defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
}
@@ -7503,6 +7560,10 @@ def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0))
(VBLENDPSYrri VR256:$src1,
(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v16f16 VR256:$src1), (v8f16 VR128:$src2), (iPTR 0)),
+ (VBLENDPSYrri VR256:$src1,
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
(VBLENDPSYrri VR256:$src1,
(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
@@ -7517,6 +7578,9 @@ def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0
def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
(VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
+def : Pat<(insert_subvector (loadv16f16 addr:$src2), (v8f16 VR128:$src1), (iPTR 0)),
+ (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
(VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
@@ -7759,6 +7823,8 @@ let Predicates = [HasAVX2] in {
defm : vperm2x128_lowering<"VPERM2I128", v4i64, loadv4i64>;
defm : vperm2x128_lowering<"VPERM2I128", v8i32, loadv8i32>;
defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>;
+ defm : vperm2x128_lowering<"VPERM2I128", v16f16, loadv16f16>;
+ defm : vperm2x128_lowering<"VPERM2I128", v32i8, loadv32i8>;
defm : vperm2x128_lowering<"VPERM2I128", v32i8, loadv32i8>;
}
@@ -7781,6 +7847,8 @@ let Predicates = [HasAVX2, NoVLX] in {
defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v2i64, v4i64, loadv2i64, loadv4i64>;
defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v4i32, v8i32, loadv4i32, loadv8i32>;
defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8i16, v16i16, loadv8i16, loadv16i16>;
+ defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8f16, v16f16, loadv8f16, loadv16f16>;
+ defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8, loadv16i8, loadv32i8>;
defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8, loadv16i8, loadv32i8>;
}
@@ -7801,6 +7869,8 @@ let Predicates = [HasAVX2, NoVLX] in {
defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>;
defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
+ defm : vextract_lowering<"VEXTRACTI128", v16f16, v8f16>;
+ defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
}
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index f4e25e4194db..1de2a1725954 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -254,8 +254,12 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
StringRef CPU =
CPUAttr.isValid() ? CPUAttr.getValueAsString() : (StringRef)TargetCPU;
- StringRef TuneCPU =
- TuneAttr.isValid() ? TuneAttr.getValueAsString() : (StringRef)CPU;
+ // "x86-64" is a default target setting for many front ends. In these cases,
+ // they actually request for "generic" tuning unless the "tune-cpu" was
+ // specified.
+ StringRef TuneCPU = TuneAttr.isValid() ? TuneAttr.getValueAsString()
+ : CPU == "x86-64" ? "generic"
+ : (StringRef)CPU;
StringRef FS =
FSAttr.isValid() ? FSAttr.getValueAsString() : (StringRef)TargetFS;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index b36f8a3d06d0..b27aac9c4e93 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1297,29 +1297,6 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
LT.first = NumOfDests * NumOfShufflesPerDest;
}
- static const CostTblEntry AVX512FP16ShuffleTbl[] = {
- {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
- {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
- {TTI::SK_Broadcast, MVT::v8f16, 1}, // vpbroadcastw
-
- {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
- {TTI::SK_Reverse, MVT::v16f16, 2}, // vpermw
- {TTI::SK_Reverse, MVT::v8f16, 1}, // vpshufb
-
- {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
- {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
- {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // vpshufb
-
- {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
- {TTI::SK_PermuteTwoSrc, MVT::v16f16, 2}, // vpermt2w
- {TTI::SK_PermuteTwoSrc, MVT::v8f16, 2} // vpermt2w
- };
-
- if (!ST->useSoftFloat() && ST->hasFP16())
- if (const auto *Entry =
- CostTableLookup(AVX512FP16ShuffleTbl, Kind, LT.second))
- return LT.first * Entry->Cost;
-
static const CostTblEntry AVX512VBMIShuffleTbl[] = {
{TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
{TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
@@ -1339,17 +1316,22 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
static const CostTblEntry AVX512BWShuffleTbl[] = {
{TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
{TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
{TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
+ {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
{TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
{TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
{TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
+ {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
{TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
+ {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
{TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
{TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
+ {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
@@ -1369,6 +1351,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
{TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
{TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
{TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
{TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
@@ -1376,6 +1359,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
{TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
{TTI::SK_Reverse, MVT::v32i16, 7}, // per mca
+ {TTI::SK_Reverse, MVT::v32f16, 7}, // per mca
{TTI::SK_Reverse, MVT::v64i8, 7}, // per mca
{TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
@@ -1408,11 +1392,14 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
// FIXME: This just applies the type legalization cost rules above
// assuming these completely split.
{TTI::SK_PermuteSingleSrc, MVT::v32i16, 14},
+ {TTI::SK_PermuteSingleSrc, MVT::v32f16, 14},
{TTI::SK_PermuteSingleSrc, MVT::v64i8, 14},
{TTI::SK_PermuteTwoSrc, MVT::v32i16, 42},
+ {TTI::SK_PermuteTwoSrc, MVT::v32f16, 42},
{TTI::SK_PermuteTwoSrc, MVT::v64i8, 42},
{TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq
+ {TTI::SK_Select, MVT::v32f16, 1}, // vpternlogq
{TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq
{TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd
{TTI::SK_Select, MVT::v16f32, 1}, // vblendmps
@@ -1430,6 +1417,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
{TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
{TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
{TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
{TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
@@ -1437,9 +1425,11 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
{TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
{TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
+ {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
{TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
{TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
+ {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
{TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
{TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
@@ -1448,6 +1438,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
{TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
// + vpblendvb
+ {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
+ // + vpblendvb
{TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
// + vpblendvb
@@ -1457,6 +1449,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
{TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
// + vpblendvb
+ {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
+ // + vpblendvb
{TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
// + vpblendvb
};
@@ -1493,6 +1487,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
{TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
{TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
+ {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
{TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
{TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
@@ -1501,6 +1496,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
{TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
// + vinsertf128
+ {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
+ // + vinsertf128
{TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
// + vinsertf128
@@ -1509,6 +1506,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Select, MVT::v8i32, 1}, // vblendps
{TTI::SK_Select, MVT::v8f32, 1}, // vblendps
{TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
+ {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
{TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
{TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
@@ -1517,6 +1515,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
{TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
// + 2*por + vinsertf128
+ {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
+ // + 2*por + vinsertf128
{TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
// + 2*por + vinsertf128
@@ -1526,6 +1526,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
{TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
// + 4*por + vinsertf128
+ {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
+ // + 4*por + vinsertf128
{TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
// + 4*por + vinsertf128
};
@@ -1540,6 +1542,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Select, MVT::v4i32, 1}, // pblendw
{TTI::SK_Select, MVT::v4f32, 1}, // blendps
{TTI::SK_Select, MVT::v8i16, 1}, // pblendw
+ {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
{TTI::SK_Select, MVT::v16i8, 1} // pblendvb
};
@@ -1549,18 +1552,23 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
static const CostTblEntry SSSE3ShuffleTbl[] = {
{TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
+ {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
{TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
{TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
+ {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
{TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
{TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
+ {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
{TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
{TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
+ {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
{TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
{TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
+ {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
{TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
};
@@ -1573,12 +1581,14 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
{TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
{TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
+ {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
{TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
{TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
{TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
{TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
{TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
+ {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
{TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
// + 2*pshufd + 2*unpck + packus
@@ -1586,6 +1596,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Select, MVT::v2f64, 1}, // movsd
{TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
{TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
+ {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
{TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
{TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
@@ -1593,6 +1604,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
{TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
// + pshufd/unpck
+ {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
+ // + pshufd/unpck
{ TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
// + 2*pshufd + 2*unpck + 2*packus
@@ -1600,6 +1613,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{ TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
{ TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
{ TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
+ { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute
{ TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
};
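
For context, each CostTblEntry table above is keyed by (shuffle kind, legalized MVT), and getShuffleCost walks the tables from the most to the least capable feature level, scaling the matching entry by the legalization split factor. The following is a minimal sketch of that lookup pattern only; the in-tree code interleaves many more feature checks, and LT here stands for the legalized {split count, MVT} pair of the shuffled vector type.

    // Sketch (illustrative, not the full in-tree sequence):
    // LT.first  = number of legal registers the type splits into
    // LT.second = the legalized MVT used to index the tables
    if (ST->hasBWI())
      if (const auto *Entry =
              CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
        return LT.first * Entry->Cost;
    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
        return LT.first * Entry->Cost;
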
@@ -5219,7 +5233,7 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
return true;
- if (ScalarTy->isHalfTy() && ST->hasBWI() && ST->hasFP16())
+ if (ScalarTy->isHalfTy() && ST->hasBWI())
return true;
if (!ScalarTy->isIntegerTy())
@@ -5674,8 +5688,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
EltTy->isIntegerTy(32) || EltTy->isPointerTy())
return true;
- if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) ||
- (!ST->useSoftFloat() && ST->hasFP16() && EltTy->isHalfTy()))
+ if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
return HasBW;
return false;
};
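
The last two hunks relax the FP16 requirement for half-precision elements: masked loads/stores and interleaved memory ops of half elements now only need AVX512BW, not AVX512FP16, since they are handled as 16-bit lanes. A standalone sketch of the new predicate shape is below; the function name and the HasBW parameter are illustrative stand-ins (the real code queries the X86Subtarget directly), not the in-tree API.

    #include "llvm/IR/Type.h"

    // Illustrative only: mirrors the relaxed element-type check after this
    // change. HasBW stands for the subtarget's hasBWI() query.
    static bool isSupportedInterleavedEltTy(llvm::Type *EltTy, bool HasBW) {
      if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
          EltTy->isIntegerTy(32) || EltTy->isPointerTy())
        return true;
      // Half now only needs BWI; the extra hasFP16() requirement was dropped.
      if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
        return HasBW;
      return false;
    }
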
diff --git a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
index 19ebcb3ea3e8..2fb06e29bf3b 100644
--- a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -27,7 +27,7 @@
#include "llvm/IR/Function.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetOptions.h"
-#include <algorithm> // std::sort
+#include <algorithm>
using namespace llvm;