Diffstat (limited to 'lib/Target')
-rw-r--r--  lib/Target/AArch64/AArch64.td | 1
-rw-r--r--  lib/Target/AArch64/AArch64FastISel.cpp | 8
-rw-r--r--  lib/Target/AArch64/AArch64ISelLowering.cpp | 2
-rw-r--r--  lib/Target/AArch64/AArch64SchedFalkorDetails.td | 61
-rw-r--r--  lib/Target/AArch64/AArch64Subtarget.cpp | 4
-rw-r--r--  lib/Target/AArch64/AArch64TargetTransformInfo.h | 2
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 34
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 10
-rw-r--r--  lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 15
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 3
-rw-r--r--  lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 21
-rw-r--r--  lib/Target/AMDGPU/FLATInstructions.td | 77
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.cpp | 14
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.cpp | 19
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.td | 12
-rw-r--r--  lib/Target/ARM/ARMCallLowering.cpp | 55
-rw-r--r--  lib/Target/ARM/ARMInstrVFP.td | 18
-rw-r--r--  lib/Target/ARM/ARMLegalizerInfo.cpp | 45
-rw-r--r--  lib/Target/ARM/ARMTargetTransformInfo.h | 2
-rw-r--r--  lib/Target/BPF/BPFAsmPrinter.cpp | 5
-rw-r--r--  lib/Target/BPF/BPFISelDAGToDAG.cpp | 240
-rw-r--r--  lib/Target/BPF/BPFInstrInfo.td | 14
-rw-r--r--  lib/Target/Hexagon/HexagonGenMux.cpp | 52
-rw-r--r--  lib/Target/Hexagon/HexagonISelDAGToDAG.cpp | 15
-rw-r--r--  lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp | 3
-rw-r--r--  lib/Target/Hexagon/HexagonPatterns.td | 19
-rw-r--r--  lib/Target/Hexagon/HexagonTargetMachine.cpp | 8
-rw-r--r--  lib/Target/Mips/MipsISelLowering.cpp | 135
-rw-r--r--  lib/Target/Mips/MipsInstrInfo.cpp | 9
-rw-r--r--  lib/Target/Mips/MipsLongBranch.cpp | 12
-rw-r--r--  lib/Target/Mips/MipsSEISelDAGToDAG.cpp | 99
-rw-r--r--  lib/Target/Mips/MipsSEISelDAGToDAG.h | 3
-rw-r--r--  lib/Target/Mips/MipsSEISelLowering.cpp | 176
-rw-r--r--  lib/Target/Mips/MipsSubtarget.h | 2
-rw-r--r--  lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 26
-rw-r--r--  lib/Target/PowerPC/PPCISelLowering.cpp | 118
-rw-r--r--  lib/Target/PowerPC/PPCISelLowering.h | 22
-rw-r--r--  lib/Target/PowerPC/PPCInstr64Bit.td | 10
-rw-r--r--  lib/Target/PowerPC/PPCInstrInfo.cpp | 2
-rw-r--r--  lib/Target/PowerPC/PPCInstrInfo.td | 13
-rw-r--r--  lib/Target/PowerPC/PPCInstrVSX.td | 10
-rw-r--r--  lib/Target/PowerPC/PPCRegisterInfo.cpp | 14
-rw-r--r--  lib/Target/PowerPC/PPCRegisterInfo.h | 1
-rw-r--r--  lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 2
-rw-r--r--  lib/Target/PowerPC/PPCTargetTransformInfo.h | 2
-rw-r--r--  lib/Target/SystemZ/SystemZTargetTransformInfo.cpp | 2
-rw-r--r--  lib/Target/SystemZ/SystemZTargetTransformInfo.h | 2
-rw-r--r--  lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h | 4
-rw-r--r--  lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp | 25
-rw-r--r--  lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp | 10
-rw-r--r--  lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp | 2
-rw-r--r--  lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h | 2
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp | 218
-rw-r--r--  lib/Target/X86/X86InstrAVX512.td | 2
-rw-r--r--  lib/Target/X86/X86InstrFragmentsSIMD.td | 12
-rw-r--r--  lib/Target/X86/X86InstrInfo.cpp | 134
59 files changed, 1284 insertions(+), 550 deletions(-)
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index abe28460c83a..53eef79c4df3 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -362,6 +362,7 @@ def ProcThunderXT83 : SubtargetFeature<"thunderxt83", "ARMProcFamily",
def : ProcessorModel<"generic", NoSchedModel, [
FeatureFPARMv8,
+ FeatureFuseAES,
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index e8fcf1a0e9b7..7bf2097c17ce 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -1282,6 +1282,10 @@ unsigned AArch64FastISel::emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg,
bool WantResult) {
assert(LHSReg && RHSReg && "Invalid register number.");
+ if (LHSReg == AArch64::SP || LHSReg == AArch64::WSP ||
+ RHSReg == AArch64::SP || RHSReg == AArch64::WSP)
+ return 0;
+
if (RetVT != MVT::i32 && RetVT != MVT::i64)
return 0;
@@ -1362,6 +1366,8 @@ unsigned AArch64FastISel::emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg,
uint64_t ShiftImm, bool SetFlags,
bool WantResult) {
assert(LHSReg && RHSReg && "Invalid register number.");
+ assert(LHSReg != AArch64::SP && LHSReg != AArch64::WSP &&
+ RHSReg != AArch64::SP && RHSReg != AArch64::WSP);
if (RetVT != MVT::i32 && RetVT != MVT::i64)
return 0;
@@ -1403,6 +1409,8 @@ unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg,
uint64_t ShiftImm, bool SetFlags,
bool WantResult) {
assert(LHSReg && RHSReg && "Invalid register number.");
+ assert(LHSReg != AArch64::XZR && LHSReg != AArch64::WZR &&
+ RHSReg != AArch64::XZR && RHSReg != AArch64::WZR);
if (RetVT != MVT::i32 && RetVT != MVT::i64)
return 0;
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 059556a560c0..083ca2156598 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9366,7 +9366,7 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return SDValue();
StoreSDNode *S = cast<StoreSDNode>(N);
- if (S->isVolatile())
+ if (S->isVolatile() || S->isIndexed())
return SDValue();
SDValue StVal = S->getValue();
diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td
index 7402bcf1346c..3d737402022d 100644
--- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td
+++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td
@@ -160,6 +160,21 @@ def FalkorWr_1VX_1VY_10cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
let NumMicroOps = 2;
}
+def FalkorWr_1VX_1VY_12cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_14cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 14;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_21cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 21;
+ let NumMicroOps = 2;
+}
+
def FalkorWr_1GTOV_1VXVY_2cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitVXVY]> {
let Latency = 2;
let NumMicroOps = 2;
@@ -195,10 +210,10 @@ def FalkorWr_1X_1Z_8cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> {
let ResourceCycles = [2, 8];
}
-def FalkorWr_1X_1Z_16cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> {
- let Latency = 16;
+def FalkorWr_1X_1Z_11cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> {
+ let Latency = 11;
let NumMicroOps = 2;
- let ResourceCycles = [2, 16];
+ let ResourceCycles = [2, 11];
}
def FalkorWr_1LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitZ]> {
@@ -289,9 +304,27 @@ def FalkorWr_1XYZ_1VSD_1ST_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitVSD, Fa
//===----------------------------------------------------------------------===//
// Define 4 micro-op types
-def FalkorWr_2VX_2VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY,
- FalkorUnitVX, FalkorUnitVY]> {
- let Latency = 2;
+def FalkorWr_2VX_2VY_14cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY,
+ FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 14;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_2VX_2VY_20cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY,
+ FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 20;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_2VX_2VY_21cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY,
+ FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 21;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_2VX_2VY_24cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY,
+ FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 24;
let NumMicroOps = 4;
}
@@ -575,7 +608,8 @@ def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FCM(EQ|LE|GE|GT|LT)(v2i64|v4i
def : InstRW<[FalkorWr_2VXVY_2cyc], (instrs FCVTLv4i16, FCVTLv2i32)>;
def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32)$")>;
-def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^(FDIV|FSQRT)v2f32$")>;
+def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instrs FDIVv2f32)>;
+def : InstRW<[FalkorWr_1VX_1VY_12cyc],(instrs FSQRTv2f32)>;
def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(FABD|FADD(P)?|FSUB)(v2f64|v4f32)$")>;
@@ -592,7 +626,10 @@ def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc],
def : InstRW<[FalkorWr_3VXVY_4cyc], (instrs FCVTNv4i16, FCVTNv2i32, FCVTXNv2f32)>;
def : InstRW<[FalkorWr_3VXVY_5cyc], (instrs FCVTNv8i16, FCVTNv4i32, FCVTXNv4f32)>;
-def : InstRW<[FalkorWr_2VX_2VY_2cyc], (instregex "^(FDIV|FSQRT)(v2f64|v4f32)$")>;
+def : InstRW<[FalkorWr_2VX_2VY_14cyc],(instrs FDIVv2f64)>;
+def : InstRW<[FalkorWr_2VX_2VY_20cyc],(instrs FDIVv4f32)>;
+def : InstRW<[FalkorWr_2VX_2VY_21cyc],(instrs FSQRTv2f64)>;
+def : InstRW<[FalkorWr_2VX_2VY_24cyc],(instrs FSQRTv4f32)>;
def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA],
(instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>;
@@ -1039,8 +1076,10 @@ def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc],
def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc],
(instregex "^F(N)?MULDrr$")>;
-def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^FDIV(S|D)rr$")>;
-def : InstRW<[FalkorWr_1VX_1VY_2cyc], (instregex "^FSQRT(S|D)r$")>;
+def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instrs FDIVSrr)>;
+def : InstRW<[FalkorWr_1VX_1VY_14cyc],(instrs FDIVDrr)>;
+def : InstRW<[FalkorWr_1VX_1VY_12cyc],(instrs FSQRTSr)>;
+def : InstRW<[FalkorWr_1VX_1VY_21cyc],(instrs FSQRTDr)>;
def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, ReadDefault, ReadDefault, FalkorReadFMA32],
(instregex "^F(N)?M(ADD|SUB)Srrr$")>;
@@ -1112,7 +1151,7 @@ def : InstRW<[FalkorWr_IMUL64_1X_5cyc, ReadDefault, ReadDefault, FalkorReadIMA64
(instregex "^M(ADD|SUB)Xrrr$")>;
def : InstRW<[FalkorWr_1X_1Z_8cyc], (instregex "^(S|U)DIVWr$")>;
-def : InstRW<[FalkorWr_1X_1Z_16cyc], (instregex "^(S|U)DIVXr$")>;
+def : InstRW<[FalkorWr_1X_1Z_11cyc], (instregex "^(S|U)DIVXr$")>;
def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc],
(instregex "^(S|U)MULLv.*$")>;
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index d3cab1ad3397..a9a9d5ce8429 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -92,6 +92,10 @@ void AArch64Subtarget::initializeProperties() {
MaxInterleaveFactor = 4;
// FIXME: remove this to enable 64-bit SLP if performance looks good.
MinVectorRegisterBitWidth = 128;
+ CacheLineSize = 128;
+ PrefetchDistance = 820;
+ MinPrefetchStride = 2048;
+ MaxPrefetchIterationsAhead = 8;
break;
case Kryo:
MaxInterleaveFactor = 4;
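The four tuning values added above feed LLVM's generic loop-data-prefetch heuristics. A minimal standalone sketch of how such numbers are typically consumed (the struct and function are illustrative, not the actual TTI interface):

    #include <algorithm>

    // Illustrative model of the Falkor tuning above (not the LLVM API).
    struct PrefetchTuning {
      unsigned CacheLineSize = 128;            // bytes
      unsigned PrefetchDistance = 820;         // bytes to stay ahead of the stream
      unsigned MinPrefetchStride = 2048;       // ignore smaller strides
      unsigned MaxPrefetchIterationsAhead = 8; // never run too far ahead
    };

    // Simplified: a prefetcher issues roughly PrefetchDistance/StrideBytes
    // iterations ahead, capped by MaxPrefetchIterationsAhead.
    unsigned iterationsAhead(const PrefetchTuning &T, unsigned StrideBytes) {
      if (StrideBytes < T.MinPrefetchStride)
        return 0; // stride too small to be worth prefetching
      return std::min(T.MaxPrefetchIterationsAhead,
                      std::max(1u, T.PrefetchDistance / StrideBytes));
    }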
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index d0299149c38c..290a1ca1f24b 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -78,7 +78,7 @@ public:
return 31;
}
- unsigned getRegisterBitWidth(bool Vector) {
+ unsigned getRegisterBitWidth(bool Vector) const {
if (Vector) {
if (ST->hasNEON())
return 128;
diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index 8084d368c80f..6f3742ed039b 100644
--- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -9,7 +9,7 @@
//
/// \file
/// This pass marks all internal functions as always_inline and creates
-/// duplicates of all other functions a marks the duplicates as always_inline.
+/// duplicates of all other functions and marks the duplicates as always_inline.
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index 3c788fa1dcea..6f002860044c 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -107,7 +107,7 @@ bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
DFS(Start, Checklist);
for (auto &BB : Checklist) {
- BasicBlock::iterator StartIt = (BB == Load->getParent()) ?
+ BasicBlock::iterator StartIt = (!L && (BB == Load->getParent())) ?
BasicBlock::iterator(Load) : BB->end();
if (MDR->getPointerDependencyFrom(MemoryLocation(Ptr),
true, StartIt, BB, Load).isClobber())
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 251c2f9bb25a..f235313e4853 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -138,7 +138,10 @@ private:
bool SelectMUBUFIntrinsicVOffset(SDValue Offset, SDValue &SOffset,
SDValue &ImmOffset, SDValue &VOffset) const;
- bool SelectFlat(SDValue Addr, SDValue &VAddr, SDValue &SLC) const;
+ bool SelectFlatAtomic(SDValue Addr, SDValue &VAddr,
+ SDValue &Offset, SDValue &SLC) const;
+ bool SelectFlatOffset(SDValue Addr, SDValue &VAddr,
+ SDValue &Offset, SDValue &SLC) const;
bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
bool &Imm) const;
@@ -1313,14 +1316,37 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset,
return true;
}
-bool AMDGPUDAGToDAGISel::SelectFlat(SDValue Addr,
- SDValue &VAddr,
- SDValue &SLC) const {
+bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDValue Addr,
+ SDValue &VAddr,
+ SDValue &Offset,
+ SDValue &SLC) const {
+ int64_t OffsetVal = 0;
+
+ if (Subtarget->hasFlatInstOffsets() &&
+ CurDAG->isBaseWithConstantOffset(Addr)) {
+ SDValue N0 = Addr.getOperand(0);
+ SDValue N1 = Addr.getOperand(1);
+ uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getZExtValue();
+ if (isUInt<12>(COffsetVal)) {
+ Addr = N0;
+ OffsetVal = COffsetVal;
+ }
+ }
+
VAddr = Addr;
+ Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1);
+
return true;
}
+bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDValue Addr,
+ SDValue &VAddr,
+ SDValue &Offset,
+ SDValue &SLC) const {
+ return SelectFlatOffset(Addr, VAddr, Offset, SLC);
+}
+
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
SDValue &Offset, bool &Imm) const {
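SelectFlatOffset above folds a constant addend into the new offset operand only when it fits the unsigned 12-bit field. A standalone sketch of that range check and its effect (mirroring the isUInt<12> test; the example addresses are illustrative):

    #include <cstdint>

    // Unsigned 12-bit range: 0..4095, as required by isUInt<12> above.
    bool fitsFlatOffsetU12(uint64_t Off) { return Off < (1ull << 12); }

    // fitsFlatOffsetU12(4095) -> true:  base+4095 folds into the offset field.
    // fitsFlatOffsetU12(4096) -> false: the add stays a separate address op.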
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index a7eac080f885..e54c887d6090 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -126,8 +126,9 @@ bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(AMDGPU::FLAT_STORE_DWORD))
.add(I.getOperand(1))
.add(I.getOperand(0))
- .addImm(0)
- .addImm(0);
+ .addImm(0) // offset
+ .addImm(0) // glc
+ .addImm(0); // slc
// Now that we selected an opcode, we need to constrain the register
@@ -392,8 +393,9 @@ bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const {
MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode))
.add(I.getOperand(0))
.addReg(PtrReg)
- .addImm(0)
- .addImm(0);
+ .addImm(0) // offset
+ .addImm(0) // glc
+ .addImm(0); // slc
bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI);
I.eraseFromParent();
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index b889788c3426..790a69b84397 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -34,6 +34,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
const LLT P1 = LLT::pointer(1, 64);
const LLT P2 = LLT::pointer(2, 64);
+ setAction({G_ADD, S32}, Legal);
+
// FIXME: i1 operands to intrinsics should always be legal, but other i1
// values may not be legal. We need to figure out how to distinguish
// between these two scenarios.
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index dee3d2856701..0d6689bd04c4 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -195,7 +195,7 @@ unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) {
return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}
-unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) {
+unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) const {
return Vector ? 0 : 32;
}
@@ -489,6 +489,19 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
return false;
}
+bool AMDGPUTTIImpl::isAlwaysUniform(const Value *V) const {
+ if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
+ switch (Intrinsic->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_readlane:
+ return true;
+ }
+ }
+ return false;
+}
+
unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
if (ST->hasVOP3PInsts()) {
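isAlwaysUniform reports readfirstlane/readlane results as uniform because they broadcast a single lane's value across the wavefront. A toy model over a 4-lane wave (illustrative only; real GCN waves have 64 lanes):

    #include <array>

    using Wave = std::array<int, 4>; // toy 4-lane wave

    // readfirstlane hands lane 0's value to every lane, so all lanes see
    // the same result -- the definition of a uniform value.
    int readfirstlane(const Wave &Lanes) { return Lanes[0]; }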
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index e0024e21e82b..a60b1bb1b59c 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -76,7 +76,7 @@ public:
}
unsigned getNumberOfRegisters(bool Vector);
- unsigned getRegisterBitWidth(bool Vector);
+ unsigned getRegisterBitWidth(bool Vector) const;
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
@@ -103,6 +103,7 @@ public:
int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
bool isSourceOfDivergence(const Value *V) const;
+ bool isAlwaysUniform(const Value *V) const;
unsigned getFlatAddressSpace() const {
// Don't bother running InferAddressSpaces pass on graphics shaders which
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 16e3b7b4ebee..392e9d89bd9b 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -285,6 +285,9 @@ public:
bool isOffset() const { return isImmTy(ImmTyOffset) && isUInt<16>(getImm()); }
bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<16>(getImm()); }
bool isOffset1() const { return isImmTy(ImmTyOffset1) && isUInt<8>(getImm()); }
+
+ bool isOffsetU12() const { return isImmTy(ImmTyOffset) && isUInt<12>(getImm()); }
+ bool isOffsetS13() const { return isImmTy(ImmTyOffset) && isInt<13>(getImm()); }
bool isGDS() const { return isImmTy(ImmTyGDS); }
bool isGLC() const { return isImmTy(ImmTyGLC); }
bool isSLC() const { return isImmTy(ImmTySLC); }
@@ -886,6 +889,10 @@ public:
return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm];
}
+ bool hasFlatOffsets() const {
+ return getFeatureBits()[AMDGPU::FeatureFlatInstOffsets];
+ }
+
bool hasSGPR102_SGPR103() const {
return !isVI();
}
@@ -1034,6 +1041,7 @@ public:
AMDGPUOperand::Ptr defaultSMRDOffset8() const;
AMDGPUOperand::Ptr defaultSMRDOffset20() const;
AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const;
+ AMDGPUOperand::Ptr defaultOffsetU12() const;
OperandMatchResultTy parseOModOperand(OperandVector &Operands);
@@ -1970,6 +1978,15 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
}
}
+ if ((TSFlags & SIInstrFlags::FLAT) && !hasFlatOffsets()) {
+ // FIXME: Produces error without correct column reported.
+ auto OpNum =
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::offset);
+ const auto &Op = Inst.getOperand(OpNum);
+ if (Op.getImm() != 0)
+ return Match_InvalidOperand;
+ }
+
return Match_Success;
}
@@ -3849,6 +3866,10 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDLiteralOffset() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
}
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetU12() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
+}
+
//===----------------------------------------------------------------------===//
// vop3
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td
index 8ba9efd42c70..98eda288bcac 100644
--- a/lib/Target/AMDGPU/FLATInstructions.td
+++ b/lib/Target/AMDGPU/FLATInstructions.td
@@ -7,7 +7,8 @@
//
//===----------------------------------------------------------------------===//
-def FLATAtomic : ComplexPattern<i64, 2, "SelectFlat">;
+def FLATAtomic : ComplexPattern<i64, 3, "SelectFlatAtomic", [], [], -10>;
+def FLATOffset : ComplexPattern<i64, 3, "SelectFlatOffset", [], [], -10>;
//===----------------------------------------------------------------------===//
// FLAT classes
@@ -55,6 +56,8 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
let AsmMatchConverter = ps.AsmMatchConverter;
+ let TSFlags = ps.TSFlags;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
// encoding fields
bits<8> vaddr;
@@ -63,10 +66,23 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
bits<1> slc;
bits<1> glc;
+ // Only valid on gfx9
+ bits<1> lds = 0; // XXX - What does this actually do?
+ bits<2> seg; // Segment, 00=flat, 01=scratch, 10=global, 11=reserved
+
+ // Signed offset. Highest bit ignored for flat and treated as 12-bit
+ // unsigned for flat accesses.
+ bits<13> offset;
+ bits<1> nv = 0; // XXX - What does this actually do?
+
// We don't use tfe right now, and it was removed in gfx9.
bits<1> tfe = 0;
- // 15-0 is reserved.
+ // Only valid on GFX9+
+ let Inst{12-0} = offset;
+ let Inst{13} = lds;
+ let Inst{15-14} = 0;
+
let Inst{16} = !if(ps.has_glc, glc, ps.glcValue);
let Inst{17} = slc;
let Inst{24-18} = op;
@@ -74,24 +90,30 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
let Inst{39-32} = vaddr;
let Inst{47-40} = !if(ps.has_data, vdata, ?);
// 54-48 is reserved.
- let Inst{55} = tfe;
+ let Inst{55} = nv; // nv on GFX9+, TFE before.
let Inst{63-56} = !if(ps.has_vdst, vdst, ?);
}
-class FLAT_Load_Pseudo <string opName, RegisterClass regClass> : FLAT_Pseudo<
+class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
+ bit HasSignedOffset = 0> : FLAT_Pseudo<
opName,
(outs regClass:$vdst),
- (ins VReg_64:$vaddr, GLC:$glc, slc:$slc),
- " $vdst, $vaddr$glc$slc"> {
+ !if(HasSignedOffset,
+ (ins VReg_64:$vaddr, offset_s13:$offset, GLC:$glc, slc:$slc),
+ (ins VReg_64:$vaddr, offset_u12:$offset, GLC:$glc, slc:$slc)),
+ " $vdst, $vaddr$offset$glc$slc"> {
let has_data = 0;
let mayLoad = 1;
}
-class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass> : FLAT_Pseudo<
+class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
+ bit HasSignedOffset = 0> : FLAT_Pseudo<
opName,
(outs),
- (ins VReg_64:$vaddr, vdataClass:$vdata, GLC:$glc, slc:$slc),
- " $vaddr, $vdata$glc$slc"> {
+ !if(HasSignedOffset,
+ (ins VReg_64:$vaddr, vdataClass:$vdata, offset_s13:$offset, GLC:$glc, slc:$slc),
+ (ins VReg_64:$vaddr, vdataClass:$vdata, offset_u12:$offset, GLC:$glc, slc:$slc)),
+ " $vaddr, $vdata$offset$glc$slc"> {
let mayLoad = 0;
let mayStore = 1;
let has_vdst = 0;
@@ -103,12 +125,15 @@ multiclass FLAT_Atomic_Pseudo<
ValueType vt,
SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
- RegisterClass data_rc = vdst_rc> {
+ RegisterClass data_rc = vdst_rc,
+ bit HasSignedOffset = 0> {
def "" : FLAT_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc),
- " $vaddr, $vdata$slc",
+ !if(HasSignedOffset,
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc)),
+ " $vaddr, $vdata$offset$slc",
[]>,
AtomicNoRet <NAME, 0> {
let mayLoad = 1;
@@ -121,10 +146,12 @@ multiclass FLAT_Atomic_Pseudo<
def _RTN : FLAT_Pseudo <opName,
(outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc),
- " $vdst, $vaddr, $vdata glc$slc",
+ !if(HasSignedOffset,
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc)),
+ " $vdst, $vaddr, $vdata$offset glc$slc",
[(set vt:$vdst,
- (atomic (FLATAtomic i64:$vaddr, i1:$slc), data_vt:$vdata))]>,
+ (atomic (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
AtomicNoRet <NAME, 1> {
let mayLoad = 1;
let mayStore = 1;
@@ -312,31 +339,31 @@ def flat_truncstorei16 : flat_st <truncstorei16>;
// Patterns for global loads with no offset.
class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
- (vt (node i64:$addr)),
- (inst $addr, 0, 0)
+ (vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc))),
+ (inst $vaddr, $offset, 0, $slc)
>;
class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
- (vt (node i64:$addr)),
- (inst $addr, 1, 0)
+ (vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc))),
+ (inst $vaddr, $offset, 1, $slc)
>;
class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
- (node vt:$data, i64:$addr),
- (inst $addr, $data, 0, 0)
+ (node vt:$data, (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc)),
+ (inst $vaddr, $data, $offset, 0, $slc)
>;
class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
// atomic store follows atomic binop convention so the address comes
// first.
- (node i64:$addr, vt:$data),
- (inst $addr, $data, 1, 0)
+ (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data),
+ (inst $vaddr, $data, $offset, 1, $slc)
>;
class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
ValueType data_vt = vt> : Pat <
- (vt (node i64:$addr, data_vt:$data)),
- (inst $addr, $data, 0)
+ (vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$data)),
+ (inst $vaddr, $data, $offset, $slc)
>;
let Predicates = [isCIVI] in {
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 599ee942d738..441f1ef4bd04 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -567,9 +567,17 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
}
bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
- // Flat instructions do not have offsets, and only have the register
- // address.
- return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
+ if (!Subtarget->hasFlatInstOffsets()) {
+ // Flat instructions do not have offsets, and only have the register
+ // address.
+ return AM.BaseOffs == 0 && AM.Scale == 0;
+ }
+
+ // GFX9 added a 13-bit signed offset. When using regular flat instructions,
+ // the sign bit is ignored and the offset is treated as 12-bit unsigned.
+
+ // Just r + i
+ return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
}
bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
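As the comment in isLegalFlatAddressingMode explains, GFX9's offset field is 13 bits signed, but plain flat accesses ignore the sign bit and can only use 12 unsigned bits. The two usable ranges, as a sketch:

    #include <cstdint>

    // Full 13-bit signed field: -4096 .. 4095.
    bool fitsS13(int64_t Off) { return Off >= -4096 && Off <= 4095; }

    // The flat segment ignores the sign bit, leaving 0 .. 4095 -- hence the
    // isUInt<12>(AM.BaseOffs) check above.
    bool fitsU12(int64_t Off) { return Off >= 0 && Off <= 4095; }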
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 58c05cf16f15..1097814e99ce 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -468,13 +468,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
- if (Idx == SubIndices.size() - 1)
- Builder.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
-
if (Idx == 0)
Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
- Builder.addReg(SrcReg, RegState::Implicit);
+ bool UseKill = KillSrc && Idx == SubIndices.size() - 1;
+ Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
}
}
@@ -2331,11 +2329,12 @@ static bool isSubRegOf(const SIRegisterInfo &TRI,
bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
StringRef &ErrInfo) const {
uint16_t Opcode = MI.getOpcode();
-
if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
return true;
- const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+
int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
@@ -2565,6 +2564,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ if (isFLAT(MI) && !MF->getSubtarget<SISubtarget>().hasFlatInstOffsets()) {
+ const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
+ if (Offset->getImm() != 0) {
+ ErrInfo = "subtarget does not support offsets in flat instructions";
+ return false;
+ }
+ }
+
return true;
}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 445bf79a7814..470a47b02443 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -492,11 +492,21 @@ class NamedOperandU8<string Name, AsmOperandClass MatchClass> : Operand<i8> {
let ParserMatchClass = MatchClass;
}
+class NamedOperandU12<string Name, AsmOperandClass MatchClass> : Operand<i16> {
+ let PrintMethod = "print"#Name;
+ let ParserMatchClass = MatchClass;
+}
+
class NamedOperandU16<string Name, AsmOperandClass MatchClass> : Operand<i16> {
let PrintMethod = "print"#Name;
let ParserMatchClass = MatchClass;
}
+class NamedOperandS13<string Name, AsmOperandClass MatchClass> : Operand<i16> {
+ let PrintMethod = "print"#Name;
+ let ParserMatchClass = MatchClass;
+}
+
class NamedOperandU32<string Name, AsmOperandClass MatchClass> : Operand<i32> {
let PrintMethod = "print"#Name;
let ParserMatchClass = MatchClass;
@@ -514,6 +524,8 @@ def offen : NamedOperandBit<"Offen", NamedMatchClass<"Offen">>;
def idxen : NamedOperandBit<"Idxen", NamedMatchClass<"Idxen">>;
def addr64 : NamedOperandBit<"Addr64", NamedMatchClass<"Addr64">>;
+def offset_u12 : NamedOperandU12<"Offset", NamedMatchClass<"OffsetU12">>;
+def offset_s13 : NamedOperandS13<"Offset", NamedMatchClass<"OffsetS13">>;
def offset : NamedOperandU16<"Offset", NamedMatchClass<"Offset">>;
def offset0 : NamedOperandU8<"Offset0", NamedMatchClass<"Offset0">>;
def offset1 : NamedOperandU8<"Offset1", NamedMatchClass<"Offset1">>;
diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp
index a7ac9a1dca6e..e498f70b820d 100644
--- a/lib/Target/ARM/ARMCallLowering.cpp
+++ b/lib/Target/ARM/ARMCallLowering.cpp
@@ -35,9 +35,19 @@ ARMCallLowering::ARMCallLowering(const ARMTargetLowering &TLI)
static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI,
Type *T) {
- if (T->isArrayTy() || T->isStructTy())
+ if (T->isArrayTy())
return true;
+ if (T->isStructTy()) {
+ // For now we only allow homogeneous structs that we can manipulate with
+ // G_MERGE_VALUES and G_UNMERGE_VALUES
+ auto StructT = cast<StructType>(T);
+ for (unsigned i = 1, e = StructT->getNumElements(); i != e; ++i)
+ if (StructT->getElementType(i) != StructT->getElementType(0))
+ return false;
+ return true;
+ }
+
EVT VT = TLI.getValueType(DL, T, true);
if (!VT.isSimple() || VT.isVector() ||
!(VT.isInteger() || VT.isFloatingPoint()))
@@ -220,12 +230,16 @@ bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder,
return false;
SmallVector<ArgInfo, 4> SplitVTs;
+ SmallVector<unsigned, 4> Regs;
ArgInfo RetInfo(VReg, Val->getType());
setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
splitToValueTypes(RetInfo, SplitVTs, MF, [&](unsigned Reg, uint64_t Offset) {
- MIRBuilder.buildExtract(Reg, VReg, Offset);
+ Regs.push_back(Reg);
});
+ if (Regs.size() > 1)
+ MIRBuilder.buildUnmerge(Regs, VReg);
+
CCAssignFn *AssignFn =
TLI.CCAssignFnForReturn(F.getCallingConv(), F.isVarArg());
@@ -344,26 +358,6 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
return 1;
}
- /// Merge the values in \p SrcRegs into \p DstReg at offsets \p SrcOffsets.
- /// Note that the source registers are not required to have homogeneous types,
- /// so we use G_INSERT rather than G_MERGE_VALUES.
- // FIXME: Use G_MERGE_VALUES if the types are homogeneous.
- void mergeRegisters(unsigned DstReg, ArrayRef<unsigned> SrcRegs,
- ArrayRef<uint64_t> SrcOffsets) {
- LLT Ty = MRI.getType(DstReg);
-
- unsigned Dst = MRI.createGenericVirtualRegister(Ty);
- MIRBuilder.buildUndef(Dst);
-
- for (unsigned i = 0; i < SrcRegs.size(); ++i) {
- unsigned Tmp = MRI.createGenericVirtualRegister(Ty);
- MIRBuilder.buildInsert(Tmp, Dst, SrcRegs[i], SrcOffsets[i]);
- Dst = Tmp;
- }
-
- MIRBuilder.buildCopy(DstReg, Dst);
- }
-
/// Marking a physical register as used is different between formal
/// parameters, where it's a basic block live-in, and call returns, where it's
/// an implicit-def of the call instruction.
@@ -413,22 +407,19 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
SmallVector<ArgInfo, 8> ArgInfos;
SmallVector<unsigned, 4> SplitRegs;
- SmallVector<uint64_t, 4> RegOffsets;
unsigned Idx = 0;
for (auto &Arg : F.args()) {
ArgInfo AInfo(VRegs[Idx], Arg.getType());
setArgFlags(AInfo, Idx + AttributeList::FirstArgIndex, DL, F);
SplitRegs.clear();
- RegOffsets.clear();
splitToValueTypes(AInfo, ArgInfos, MF, [&](unsigned Reg, uint64_t Offset) {
SplitRegs.push_back(Reg);
- RegOffsets.push_back(Offset);
});
if (!SplitRegs.empty())
- ArgHandler.mergeRegisters(VRegs[Idx], SplitRegs, RegOffsets);
+ MIRBuilder.buildMerge(VRegs[Idx], SplitRegs);
Idx++;
}
@@ -490,9 +481,13 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
if (!Arg.IsFixed)
return false;
+ SmallVector<unsigned, 8> Regs;
splitToValueTypes(Arg, ArgInfos, MF, [&](unsigned Reg, uint64_t Offset) {
- MIRBuilder.buildExtract(Reg, Arg.Reg, Offset);
+ Regs.push_back(Reg);
});
+
+ if (Regs.size() > 1)
+ MIRBuilder.buildUnmerge(Regs, Arg.Reg);
}
auto ArgAssignFn = TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
@@ -508,11 +503,9 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
return false;
ArgInfos.clear();
- SmallVector<uint64_t, 8> RegOffsets;
SmallVector<unsigned, 8> SplitRegs;
splitToValueTypes(OrigRet, ArgInfos, MF,
[&](unsigned Reg, uint64_t Offset) {
- RegOffsets.push_back(Offset);
SplitRegs.push_back(Reg);
});
@@ -521,10 +514,10 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
if (!handleAssignments(MIRBuilder, ArgInfos, RetHandler))
return false;
- if (!RegOffsets.empty()) {
+ if (!SplitRegs.empty()) {
// We have split the value and allocated each individual piece, now build
// it up again.
- RetHandler.mergeRegisters(OrigRet.Reg, SplitRegs, RegOffsets);
+ MIRBuilder.buildMerge(OrigRet.Reg, SplitRegs);
}
}
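The isSupportedType change admits only homogeneous structs because G_MERGE_VALUES and G_UNMERGE_VALUES require every piece to have the same type. For example (sketch):

    struct Supported   { int A; int B; };   // homogeneous: splits cleanly
    struct Unsupported { int A; float B; }; // mixed element types: rejected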
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index 817b567db767..5d887c4fcbf2 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -2010,7 +2010,8 @@ def VFNMAD : ADbI<0b11101, 0b01, 1, 0,
[(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+ Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>,
+ Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -2018,7 +2019,8 @@ def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
[(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
SPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+ Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>,
+ Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines.
}
@@ -2028,7 +2030,8 @@ def VFNMAH : AHbI<0b11101, 0b01, 1, 0,
IIC_fpFMAC16, "vfnma", ".f16\t$Sd, $Sn, $Sm",
[]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasFullFP16,UseFusedMAC]>;
+ Requires<[HasFullFP16,UseFusedMAC]>,
+ Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
(VFNMAD DPR:$dstin, DPR:$a, DPR:$b)>,
@@ -2059,14 +2062,16 @@ def VFNMSD : ADbI<0b11101, 0b01, 0, 0,
[(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+ Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>,
+ Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def VFNMSS : ASbI<0b11101, 0b01, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
IIC_fpFMAC32, "vfnms", ".f32\t$Sd, $Sn, $Sm",
[(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+ Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>,
+ Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines.
}
@@ -2076,7 +2081,8 @@ def VFNMSH : AHbI<0b11101, 0b01, 0, 0,
IIC_fpFMAC16, "vfnms", ".f16\t$Sd, $Sn, $Sm",
[]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasFullFP16,UseFusedMAC]>;
+ Requires<[HasFullFP16,UseFusedMAC]>,
+ Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
(VFNMSD DPR:$dstin, DPR:$a, DPR:$b)>,
diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp
index 2d490b7c303e..a706079d9866 100644
--- a/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -12,8 +12,10 @@
//===----------------------------------------------------------------------===//
#include "ARMLegalizerInfo.h"
+#include "ARMCallLowering.h"
#include "ARMSubtarget.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/LowLevelType.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
@@ -63,6 +65,16 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
setAction({Op, s32}, Libcall);
}
+ // FIXME: Support s8 and s16 as well
+ for (unsigned Op : {G_SREM, G_UREM})
+ if (ST.hasDivideInARMMode())
+ setAction({Op, s32}, Lower);
+ else if (ST.isTargetAEABI() || ST.isTargetGNUAEABI() ||
+ ST.isTargetMuslAEABI())
+ setAction({Op, s32}, Custom);
+ else
+ setAction({Op, s32}, Libcall);
+
for (unsigned Op : {G_SEXT, G_ZEXT}) {
setAction({Op, s32}, Legal);
for (auto Ty : {s1, s8, s16})
@@ -134,5 +146,38 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI,
}
return true;
}
+ case G_SREM:
+ case G_UREM: {
+ unsigned OriginalResult = MI.getOperand(0).getReg();
+ auto Size = MRI.getType(OriginalResult).getSizeInBits();
+ if (Size != 32)
+ return false;
+
+ auto Libcall =
+ MI.getOpcode() == G_SREM ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32;
+
+ // Our divmod libcalls return a struct containing the quotient and the
+ // remainder. We need to create a virtual register for it.
+ auto &Ctx = MIRBuilder.getMF().getFunction()->getContext();
+ Type *ArgTy = Type::getInt32Ty(Ctx);
+ StructType *RetTy = StructType::get(Ctx, {ArgTy, ArgTy}, /* Packed */ true);
+ auto RetVal = MRI.createGenericVirtualRegister(
+ getLLTForType(*RetTy, MIRBuilder.getMF().getDataLayout()));
+
+ auto Status = replaceWithLibcall(MI, MIRBuilder, Libcall, {RetVal, RetTy},
+ {{MI.getOperand(1).getReg(), ArgTy},
+ {MI.getOperand(2).getReg(), ArgTy}});
+ if (Status != LegalizerHelper::Legalized)
+ return false;
+
+ // The remainder is the second result of divmod. Split the return value into
+ // a new, unused register for the quotient and the destination of the
+ // original instruction for the remainder.
+ MIRBuilder.buildUnmerge(
+ {MRI.createGenericVirtualRegister(LLT::scalar(32)), OriginalResult},
+ RetVal);
+
+ return LegalizerHelper::Legalized;
+ }
}
}
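The new G_SREM/G_UREM path leans on the EABI divmod helpers returning quotient and remainder together. A sketch of the assumed shape (mirroring the packed {i32, i32} struct built in legalizeCustom; the real helper returns the pair in r0/r1):

    // Assumed result layout for this sketch; not a declaration from a header.
    struct QuotRem { int Quot; int Rem; };
    extern "C" QuotRem __aeabi_idivmod(int Numerator, int Denominator);

    // G_SREM thus becomes: libcall, then G_UNMERGE_VALUES, keeping only the
    // second element (the remainder) and discarding the quotient.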
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h
index 7de0543dfa5e..8a1a37863877 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -78,7 +78,7 @@ public:
return 13;
}
- unsigned getRegisterBitWidth(bool Vector) {
+ unsigned getRegisterBitWidth(bool Vector) const {
if (Vector) {
if (ST->hasNEON())
return 128;
diff --git a/lib/Target/BPF/BPFAsmPrinter.cpp b/lib/Target/BPF/BPFAsmPrinter.cpp
index fcd903b7a4a8..9397c78f3dff 100644
--- a/lib/Target/BPF/BPFAsmPrinter.cpp
+++ b/lib/Target/BPF/BPFAsmPrinter.cpp
@@ -35,14 +35,15 @@ using namespace llvm;
namespace {
class BPFAsmPrinter : public AsmPrinter {
public:
- explicit BPFAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
+ explicit BPFAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
: AsmPrinter(TM, std::move(Streamer)) {}
StringRef getPassName() const override { return "BPF Assembly Printer"; }
void EmitInstruction(const MachineInstr *MI) override;
};
-}
+} // namespace
void BPFAsmPrinter::EmitInstruction(const MachineInstr *MI) {
diff --git a/lib/Target/BPF/BPFISelDAGToDAG.cpp b/lib/Target/BPF/BPFISelDAGToDAG.cpp
index 279cdb1a89b4..7d5fb6ca17b9 100644
--- a/lib/Target/BPF/BPFISelDAGToDAG.cpp
+++ b/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -22,11 +22,14 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/Endian.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
+
using namespace llvm;
#define DEBUG_TYPE "bpf-isel"
@@ -42,6 +45,8 @@ public:
return "BPF DAG->DAG Pattern Instruction Selection";
}
+ void PreprocessISelDAG() override;
+
private:
// Include the pieces autogenerated from the target description.
#include "BPFGenDAGISel.inc"
@@ -51,15 +56,31 @@ private:
// Complex Pattern for address selection.
bool SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset);
bool SelectFIAddr(SDValue Addr, SDValue &Base, SDValue &Offset);
+
+ // Find constants from a constant structure
+ typedef std::vector<unsigned char> val_vec_type;
+ bool fillGenericConstant(const DataLayout &DL, const Constant *CV,
+ val_vec_type &Vals, uint64_t Offset);
+ bool fillConstantDataArray(const DataLayout &DL, const ConstantDataArray *CDA,
+ val_vec_type &Vals, int Offset);
+ bool fillConstantArray(const DataLayout &DL, const ConstantArray *CA,
+ val_vec_type &Vals, int Offset);
+ bool fillConstantStruct(const DataLayout &DL, const ConstantStruct *CS,
+ val_vec_type &Vals, int Offset);
+ bool getConstantFieldValue(const GlobalAddressSDNode *Node, uint64_t Offset,
+ uint64_t Size, unsigned char *ByteSeq);
+
+ // Mapping from ConstantStruct global value to corresponding byte-list values
+ std::map<const void *, val_vec_type> cs_vals_;
};
-}
+} // namespace
// ComplexPattern used on BPF Load/Store instructions
bool BPFDAGToDAGISel::SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset) {
// if Address is FI, get the TargetFrameIndex.
SDLoc DL(Addr);
if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
- Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
Offset = CurDAG->getTargetConstant(0, DL, MVT::i64);
return true;
}
@@ -85,13 +106,14 @@ bool BPFDAGToDAGISel::SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset) {
}
}
- Base = Addr;
+ Base = Addr;
Offset = CurDAG->getTargetConstant(0, DL, MVT::i64);
return true;
}
// ComplexPattern used on BPF FI instruction
-bool BPFDAGToDAGISel::SelectFIAddr(SDValue Addr, SDValue &Base, SDValue &Offset) {
+bool BPFDAGToDAGISel::SelectFIAddr(SDValue Addr, SDValue &Base,
+ SDValue &Offset) {
SDLoc DL(Addr);
if (!CurDAG->isBaseWithConstantOffset(Addr))
@@ -102,8 +124,7 @@ bool BPFDAGToDAGISel::SelectFIAddr(SDValue Addr, SDValue &Base, SDValue &Offset)
if (isInt<16>(CN->getSExtValue())) {
// If the first operand is a FI, get the TargetFI Node
- if (FrameIndexSDNode *FIN =
- dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
else
return false;
@@ -129,7 +150,8 @@ void BPFDAGToDAGISel::Select(SDNode *Node) {
// tablegen selection should be handled here.
switch (Opcode) {
- default: break;
+ default:
+ break;
case ISD::SDIV: {
DebugLoc Empty;
const DebugLoc &DL = Node->getDebugLoc();
@@ -181,6 +203,210 @@ void BPFDAGToDAGISel::Select(SDNode *Node) {
SelectCode(Node);
}
+void BPFDAGToDAGISel::PreprocessISelDAG() {
+ // Iterate through all nodes; we only care about loads from ConstantStruct.
+ // ConstantArray should have been converted by IR->DAG processing.
+ for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+ E = CurDAG->allnodes_end();
+ I != E;) {
+ SDNode *Node = &*I++;
+ unsigned Opcode = Node->getOpcode();
+ if (Opcode != ISD::LOAD)
+ continue;
+
+ unsigned char new_val[8]; // holds the constant value that replaces the load.
+ bool to_replace = false;
+ SDLoc DL(Node);
+ const LoadSDNode *LD = cast<LoadSDNode>(Node);
+ uint64_t size = LD->getMemOperand()->getSize();
+ if (!size || size > 8 || (size & (size - 1)))
+ continue;
+
+ SDNode *LDAddrNode = LD->getOperand(1).getNode();
+ // Match LDAddr against either global_addr or (global_addr + offset)
+ unsigned opcode = LDAddrNode->getOpcode();
+ if (opcode == ISD::ADD) {
+ SDValue OP1 = LDAddrNode->getOperand(0);
+ SDValue OP2 = LDAddrNode->getOperand(1);
+
+ // We want to find the pattern global_addr + offset
+ SDNode *OP1N = OP1.getNode();
+ if (OP1N->getOpcode() <= ISD::BUILTIN_OP_END ||
+ OP1N->getNumOperands() == 0)
+ continue;
+
+ DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n');
+
+ const GlobalAddressSDNode *GADN =
+ dyn_cast<GlobalAddressSDNode>(OP1N->getOperand(0).getNode());
+ const ConstantSDNode *CDN = dyn_cast<ConstantSDNode>(OP2.getNode());
+ if (GADN && CDN)
+ to_replace =
+ getConstantFieldValue(GADN, CDN->getZExtValue(), size, new_val);
+ } else if (LDAddrNode->getOpcode() > ISD::BUILTIN_OP_END &&
+ LDAddrNode->getNumOperands() > 0) {
+ DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n');
+
+ SDValue OP1 = LDAddrNode->getOperand(0);
+ if (const GlobalAddressSDNode *GADN =
+ dyn_cast<GlobalAddressSDNode>(OP1.getNode()))
+ to_replace = getConstantFieldValue(GADN, 0, size, new_val);
+ }
+
+ if (!to_replace)
+ continue;
+
+ // replacing the old with a new value
+ uint64_t val;
+ if (size == 1)
+ val = *(uint8_t *)new_val;
+ else if (size == 2)
+ val = *(uint16_t *)new_val;
+ else if (size == 4)
+ val = *(uint32_t *)new_val;
+ else {
+ val = *(uint64_t *)new_val;
+ }
+
+ DEBUG(dbgs() << "Replacing load of size " << size << " with constant "
+ << val << '\n');
+ SDValue NVal = CurDAG->getConstant(val, DL, MVT::i64);
+
+ // After replacement, the current node is dead; step the iterator back
+ // one so it remains valid.
+ I--;
+ SDValue From[] = {SDValue(Node, 0), SDValue(Node, 1)};
+ SDValue To[] = {NVal, NVal};
+ CurDAG->ReplaceAllUsesOfValuesWith(From, To, 2);
+ I++;
+ // It is safe to delete node now
+ CurDAG->DeleteNode(Node);
+ }
+}
+
+bool BPFDAGToDAGISel::getConstantFieldValue(const GlobalAddressSDNode *Node,
+ uint64_t Offset, uint64_t Size,
+ unsigned char *ByteSeq) {
+ const GlobalVariable *V = dyn_cast<GlobalVariable>(Node->getGlobal());
+
+ if (!V || !V->hasInitializer())
+ return false;
+
+ const Constant *Init = V->getInitializer();
+ const DataLayout &DL = CurDAG->getDataLayout();
+ val_vec_type TmpVal;
+
+ auto it = cs_vals_.find(static_cast<const void *>(Init));
+ if (it != cs_vals_.end()) {
+ TmpVal = it->second;
+ } else {
+ uint64_t total_size = 0;
+ if (const ConstantStruct *CS = dyn_cast<ConstantStruct>(Init))
+ total_size =
+ DL.getStructLayout(cast<StructType>(CS->getType()))->getSizeInBytes();
+ else if (const ConstantArray *CA = dyn_cast<ConstantArray>(Init))
+ total_size = DL.getTypeAllocSize(CA->getType()->getElementType()) *
+ CA->getNumOperands();
+ else
+ return false;
+
+ val_vec_type Vals(total_size, 0);
+ if (fillGenericConstant(DL, Init, Vals, 0) == false)
+ return false;
+ cs_vals_[static_cast<const void *>(Init)] = Vals;
+ TmpVal = std::move(Vals);
+ }
+
+ // test whether host endianness matches target
+ uint8_t test_buf[2];
+ uint16_t test_val = 0x2345;
+ if (DL.isLittleEndian())
+ support::endian::write16le(test_buf, test_val);
+ else
+ support::endian::write16be(test_buf, test_val);
+
+ bool endian_match = *(uint16_t *)test_buf == test_val;
+ for (uint64_t i = Offset, j = 0; i < Offset + Size; i++, j++)
+ ByteSeq[j] = endian_match ? TmpVal[i] : TmpVal[Offset + Size - 1 - j];
+
+ return true;
+}
+
+bool BPFDAGToDAGISel::fillGenericConstant(const DataLayout &DL,
+ const Constant *CV,
+ val_vec_type &Vals, uint64_t Offset) {
+ uint64_t Size = DL.getTypeAllocSize(CV->getType());
+
+ if (isa<ConstantAggregateZero>(CV) || isa<UndefValue>(CV))
+ return true; // already done
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
+ uint64_t val = CI->getZExtValue();
+ DEBUG(dbgs() << "Byte array at offset " << Offset << " with value " << val
+ << '\n');
+
+ if (Size > 8 || (Size & (Size - 1)))
+ return false;
+
+ // Store based on target endian
+ for (uint64_t i = 0; i < Size; ++i) {
+ Vals[Offset + i] = DL.isLittleEndian()
+ ? ((val >> (i * 8)) & 0xFF)
+ : ((val >> ((Size - i - 1) * 8)) & 0xFF);
+ }
+ return true;
+ }
+
+ if (const ConstantDataArray *CDA = dyn_cast<ConstantDataArray>(CV))
+ return fillConstantDataArray(DL, CDA, Vals, Offset);
+
+ if (const ConstantArray *CA = dyn_cast<ConstantArray>(CV))
+ return fillConstantArray(DL, CA, Vals, Offset);
+
+ if (const ConstantStruct *CVS = dyn_cast<ConstantStruct>(CV))
+ return fillConstantStruct(DL, CVS, Vals, Offset);
+
+ return false;
+}
+
+bool BPFDAGToDAGISel::fillConstantDataArray(const DataLayout &DL,
+ const ConstantDataArray *CDA,
+ val_vec_type &Vals, int Offset) {
+ for (unsigned i = 0, e = CDA->getNumElements(); i != e; ++i) {
+ if (fillGenericConstant(DL, CDA->getElementAsConstant(i), Vals, Offset) ==
+ false)
+ return false;
+ Offset += DL.getTypeAllocSize(CDA->getElementAsConstant(i)->getType());
+ }
+
+ return true;
+}
+
+bool BPFDAGToDAGISel::fillConstantArray(const DataLayout &DL,
+ const ConstantArray *CA,
+ val_vec_type &Vals, int Offset) {
+ for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i) {
+ if (fillGenericConstant(DL, CA->getOperand(i), Vals, Offset) == false)
+ return false;
+ Offset += DL.getTypeAllocSize(CA->getOperand(i)->getType());
+ }
+
+ return true;
+}
+
+bool BPFDAGToDAGISel::fillConstantStruct(const DataLayout &DL,
+ const ConstantStruct *CS,
+ val_vec_type &Vals, int Offset) {
+ const StructLayout *Layout = DL.getStructLayout(CS->getType());
+ for (unsigned i = 0, e = CS->getNumOperands(); i != e; ++i) {
+ const Constant *Field = CS->getOperand(i);
+ uint64_t SizeSoFar = Layout->getElementOffset(i);
+ if (fillGenericConstant(DL, Field, Vals, Offset + SizeSoFar) == false)
+ return false;
+ }
+ return true;
+}
+
FunctionPass *llvm::createBPFISelDag(BPFTargetMachine &TM) {
return new BPFDAGToDAGISel(TM);
}
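PreprocessISelDAG above rewrites loads from read-only aggregate initializers into immediates. A hypothetical input it targets (names are illustrative):

    // With a constant initializer, the field load can be folded at ISel
    // time into the immediate 42 -- no memory access remains.
    struct Config { int Enabled; int Level; };
    const Config Cfg = {1, 42};

    int getLevel() { return Cfg.Level; } // load at Cfg+4 becomes 42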
diff --git a/lib/Target/BPF/BPFInstrInfo.td b/lib/Target/BPF/BPFInstrInfo.td
index c6c0ff587c6b..5ad777268208 100644
--- a/lib/Target/BPF/BPFInstrInfo.td
+++ b/lib/Target/BPF/BPFInstrInfo.td
@@ -51,7 +51,7 @@ def u64imm : Operand<i64> {
let PrintMethod = "printImm64Operand";
}
-def i64immSExt32 : PatLeaf<(imm),
+def i64immSExt32 : PatLeaf<(i64 imm),
[{return isInt<32>(N->getSExtValue()); }]>;
// Addressing modes.
@@ -67,17 +67,17 @@ def MEMri : Operand<i64> {
}
// Conditional code predicates - used for pattern matching for jump instructions
-def BPF_CC_EQ : PatLeaf<(imm),
+def BPF_CC_EQ : PatLeaf<(i64 imm),
[{return (N->getZExtValue() == ISD::SETEQ);}]>;
-def BPF_CC_NE : PatLeaf<(imm),
+def BPF_CC_NE : PatLeaf<(i64 imm),
[{return (N->getZExtValue() == ISD::SETNE);}]>;
-def BPF_CC_GE : PatLeaf<(imm),
+def BPF_CC_GE : PatLeaf<(i64 imm),
[{return (N->getZExtValue() == ISD::SETGE);}]>;
-def BPF_CC_GT : PatLeaf<(imm),
+def BPF_CC_GT : PatLeaf<(i64 imm),
[{return (N->getZExtValue() == ISD::SETGT);}]>;
-def BPF_CC_GTU : PatLeaf<(imm),
+def BPF_CC_GTU : PatLeaf<(i64 imm),
[{return (N->getZExtValue() == ISD::SETUGT);}]>;
-def BPF_CC_GEU : PatLeaf<(imm),
+def BPF_CC_GEU : PatLeaf<(i64 imm),
[{return (N->getZExtValue() == ISD::SETUGE);}]>;
// jump instructions
diff --git a/lib/Target/Hexagon/HexagonGenMux.cpp b/lib/Target/Hexagon/HexagonGenMux.cpp
index 3c37d9ebb0eb..11ac5454f604 100644
--- a/lib/Target/Hexagon/HexagonGenMux.cpp
+++ b/lib/Target/Hexagon/HexagonGenMux.cpp
@@ -59,9 +59,7 @@ namespace {
public:
static char ID;
- HexagonGenMux() : MachineFunctionPass(ID), HII(nullptr), HRI(nullptr) {
- initializeHexagonGenMuxPass(*PassRegistry::getPassRegistry());
- }
+ HexagonGenMux() : MachineFunctionPass(ID) {}
StringRef getPassName() const override {
return "Hexagon generate mux instructions";
@@ -79,8 +77,8 @@ namespace {
}
private:
- const HexagonInstrInfo *HII;
- const HexagonRegisterInfo *HRI;
+ const HexagonInstrInfo *HII = nullptr;
+ const HexagonRegisterInfo *HRI = nullptr;
struct CondsetInfo {
unsigned PredR = 0;
@@ -134,7 +132,7 @@ namespace {
} // end anonymous namespace
-INITIALIZE_PASS(HexagonGenMux, "hexagon-mux",
+INITIALIZE_PASS(HexagonGenMux, "hexagon-gen-mux",
"Hexagon generate mux instructions", false, false)
void HexagonGenMux::getSubRegs(unsigned Reg, BitVector &SRs) const {
@@ -297,12 +295,15 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) {
unsigned SR1 = Src1->isReg() ? Src1->getReg() : 0;
unsigned SR2 = Src2->isReg() ? Src2->getReg() : 0;
bool Failure = false, CanUp = true, CanDown = true;
+ bool Used1 = false, Used2 = false;
for (unsigned X = MinX+1; X < MaxX; X++) {
const DefUseInfo &DU = DUM.lookup(X);
if (DU.Defs[PR] || DU.Defs[DR] || DU.Uses[DR]) {
Failure = true;
break;
}
+ Used1 |= DU.Uses[SR1];
+ Used2 |= DU.Uses[SR2];
if (CanDown && DU.Defs[SR1])
CanDown = false;
if (CanUp && DU.Defs[SR2])
@@ -316,6 +317,45 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) {
// Prefer "down", since this will move the MUX farther away from the
// predicate definition.
MachineBasicBlock::iterator At = CanDown ? Def2 : Def1;
+ if (CanDown) {
+ // If the MUX is placed "down", we need to make sure that there aren't
+ // any kills of the source registers between the two defs.
+ if (Used1 || Used2) {
+ auto ResetKill = [this] (unsigned Reg, MachineInstr &MI) -> bool {
+ if (MachineOperand *Op = MI.findRegisterUseOperand(Reg, true, HRI)) {
+ Op->setIsKill(false);
+ return true;
+ }
+ return false;
+ };
+ bool KilledSR1 = false, KilledSR2 = false;
+ for (MachineInstr &MJ : make_range(std::next(It1), It2)) {
+ if (SR1)
+ KilledSR1 |= ResetKill(SR1, MJ);
+ if (SR2)
+ KilledSR2 |= ResetKill(SR2, MJ);
+ }
+ // If any of the source registers were killed in this range, transfer
+ // the kills to the source operands: they will be "moved" to the
+ // resulting MUX and their parent instructions will be deleted.
+ if (KilledSR1) {
+ assert(Src1->isReg());
+ Src1->setIsKill(true);
+ }
+ if (KilledSR2) {
+ assert(Src2->isReg());
+ Src2->setIsKill(true);
+ }
+ }
+ } else {
+ // If the MUX is placed "up", it shouldn't kill any source registers
+ // that are still used afterwards. We can reset the kill flags directly
+ // on the operands, because the source instructions will be erased.
+ if (Used1 && Src1->isReg())
+ Src1->setIsKill(false);
+ if (Used2 && Src2->isReg())
+ Src2->setIsKill(false);
+ }
ML.push_back(MuxInfo(At, DR, PR, SrcT, SrcF, Def1, Def2));
}
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index e4434136bf86..e5f49ca77a91 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -124,6 +124,7 @@ private:
bool keepsLowBits(const SDValue &Val, unsigned NumBits, SDValue &Src);
bool isOrEquivalentToAdd(const SDNode *N) const;
bool isAlignedMemNode(const MemSDNode *N) const;
+ bool isSmallStackStore(const StoreSDNode *N) const;
bool isPositiveHalfWord(const SDNode *N) const;
// DAG preprocessing functions.
@@ -1462,6 +1463,20 @@ bool HexagonDAGToDAGISel::isAlignedMemNode(const MemSDNode *N) const {
return N->getAlignment() >= N->getMemoryVT().getStoreSize();
}
+bool HexagonDAGToDAGISel::isSmallStackStore(const StoreSDNode *N) const {
+ unsigned StackSize = MF->getFrameInfo().estimateStackSize(*MF);
+ switch (N->getMemoryVT().getStoreSize()) {
+ case 1:
+ return StackSize <= 56; // 1*2^6 - 8
+ case 2:
+ return StackSize <= 120; // 2*2^6 - 8
+ case 4:
+ return StackSize <= 248; // 4*2^6 - 8
+ default:
+ return false;
+ }
+}
+
// Return true when the given node fits in a positive half word.
bool HexagonDAGToDAGISel::isPositiveHalfWord(const SDNode *N) const {
if (const ConstantSDNode *CN = dyn_cast<const ConstantSDNode>(N)) {
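The isSmallStackStore cutoffs are size*64 - 8: the store-immediate forms encode a 6-bit offset scaled by the access size, with 8 bytes held back (an assumption: headroom so the final frame-relative offset still encodes after frame layout). As arithmetic:

    // 6-bit scaled offset => 64 slots of Size bytes, minus 8 bytes of slack:
    // 1 -> 56, 2 -> 120, 4 -> 248, matching the switch above.
    unsigned maxSmallStackSize(unsigned StoreSizeBytes) {
      return StoreSizeBytes * 64 - 8;
    }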
diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index b748b58bc0ae..f82ad6cb3da6 100644
--- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -1744,7 +1744,8 @@ bool PolynomialMultiplyRecognize::recognize() {
// wide as the target's pmpy instruction.
if (!promoteTypes(LoopB, ExitB))
return false;
- convertShiftsToLeft(LoopB, ExitB, IterCount);
+ if (!convertShiftsToLeft(LoopB, ExitB, IterCount))
+ return false;
cleanupLoopBody(LoopB);
}
diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td
index f269b74fc447..689419638f54 100644
--- a/lib/Target/Hexagon/HexagonPatterns.td
+++ b/lib/Target/Hexagon/HexagonPatterns.td
@@ -401,6 +401,11 @@ def Aext64: PatFrag<(ops node:$Rs), (i64 (anyext node:$Rs))>;
def Zext64: PatFrag<(ops node:$Rs), (i64 (zext node:$Rs))>;
def Sext64: PatLeaf<(i64 Usxtw:$Rs)>;
+def: Pat<(i32 (trunc (sra (mul Sext64:$Rs, Sext64:$Rt), (i32 32)))),
+ (M2_mpy_up (LoReg Sext64:$Rs), (LoReg Sext64:$Rt))>;
+def: Pat<(i32 (trunc (srl (mul Sext64:$Rs, Sext64:$Rt), (i32 32)))),
+ (M2_mpy_up (LoReg Sext64:$Rs), (LoReg Sext64:$Rt))>;
+
def: Pat<(mul (Aext64 I32:$Rs), (Aext64 I32:$Rt)),
(M2_dpmpyuu_s0 I32:$Rs, I32:$Rt)>;
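
Both new M2_mpy_up patterns above exist because, with both operands known to
be sign-extended 32-bit values, the sra and srl forms truncate to the same
high word of the product. In source terms, roughly:

  #include <cstdint>

  // High word of a widening 32x32 signed multiply; this is the sra form of
  // the pattern above, and the srl form truncates to the same 32 bits.
  int32_t mulHighSigned(int32_t a, int32_t b) {
    int64_t prod = (int64_t)a * (int64_t)b; // mul Sext64:$Rs, Sext64:$Rt
    return (int32_t)(prod >> 32);           // trunc (sra ..., (i32 32))
  }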
@@ -1470,16 +1475,22 @@ def i32in8ImmPred: PatLeaf<(i32 imm), [{
return v == (int64_t)(int8_t)v;
}]>;
+class SmallStackStore<PatFrag Store>
+ : PatFrag<(ops node:$Val, node:$Addr), (Store node:$Val, node:$Addr), [{
+ return isSmallStackStore(cast<StoreSDNode>(N));
+}]>;
let AddedComplexity = 40 in {
// Even though the offset is not extendable in the store-immediate, we
// can still generate the fi# in the base address. If the final offset
// is not valid for the instruction, we will replace it with a scratch
// register.
-// def: Storexm_fi_pat <truncstorei8, s32_0ImmPred, ToImmByte, S4_storeirb_io>;
-// def: Storexm_fi_pat <truncstorei16, i16in8ImmPred, ToImmHalf,
-// S4_storeirh_io>;
-// def: Storexm_fi_pat <store, i32in8ImmPred, ToImmWord, S4_storeiri_io>;
+ def: Storexm_fi_pat <SmallStackStore<truncstorei8>, s32_0ImmPred,
+ ToImmByte, S4_storeirb_io>;
+ def: Storexm_fi_pat <SmallStackStore<truncstorei16>, i16in8ImmPred,
+ ToImmHalf, S4_storeirh_io>;
+ def: Storexm_fi_pat <SmallStackStore<store>, i32in8ImmPred,
+ ToImmWord, S4_storeiri_io>;
// defm: Storexm_fi_add_pat <truncstorei8, s32_0ImmPred, u6_0ImmPred, ToImmByte,
// S4_storeirb_io>;
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index c757b6ecdd00..e507a797871f 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -111,6 +111,7 @@ namespace llvm {
extern char &HexagonExpandCondsetsID;
void initializeHexagonExpandCondsetsPass(PassRegistry&);
void initializeHexagonLoopIdiomRecognizePass(PassRegistry&);
+ void initializeHexagonGenMuxPass(PassRegistry&);
void initializeHexagonOptAddrModePass(PassRegistry&);
Pass *createHexagonLoopIdiomPass();
@@ -152,8 +153,11 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
extern "C" void LLVMInitializeHexagonTarget() {
// Register the target.
RegisterTargetMachine<HexagonTargetMachine> X(getTheHexagonTarget());
- initializeHexagonLoopIdiomRecognizePass(*PassRegistry::getPassRegistry());
- initializeHexagonOptAddrModePass(*PassRegistry::getPassRegistry());
+
+ PassRegistry &PR = *PassRegistry::getPassRegistry();
+ initializeHexagonLoopIdiomRecognizePass(PR);
+ initializeHexagonGenMuxPass(PR);
+ initializeHexagonOptAddrModePass(PR);
}
HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index f2193013b7aa..68708dc4f50f 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -364,6 +364,18 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
setOperationAction(ISD::UDIV, MVT::i64, Expand);
setOperationAction(ISD::UREM, MVT::i64, Expand);
+ if (!(Subtarget.hasDSP() && Subtarget.hasMips32r2())) {
+ setOperationAction(ISD::ADDC, MVT::i32, Expand);
+ setOperationAction(ISD::ADDE, MVT::i32, Expand);
+ }
+
+ setOperationAction(ISD::ADDC, MVT::i64, Expand);
+ setOperationAction(ISD::ADDE, MVT::i64, Expand);
+ setOperationAction(ISD::SUBC, MVT::i32, Expand);
+ setOperationAction(ISD::SUBE, MVT::i32, Expand);
+ setOperationAction(ISD::SUBC, MVT::i64, Expand);
+ setOperationAction(ISD::SUBE, MVT::i64, Expand);
+
// Operations not directly supported by Mips.
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
@@ -469,6 +481,7 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::AssertZext);
setTargetDAGCombine(ISD::SHL);
@@ -918,14 +931,130 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
}
}
+static SDValue performMADD_MSUBCombine(SDNode *ROOTNode, SelectionDAG &CurDAG,
+ const MipsSubtarget &Subtarget) {
+ // ROOTNode must have a multiplication as an operand for the match to be
+ // successful.
+ if (ROOTNode->getOperand(0).getOpcode() != ISD::MUL &&
+ ROOTNode->getOperand(1).getOpcode() != ISD::MUL)
+ return SDValue();
+
+ // We don't handle vector types here.
+ if (ROOTNode->getValueType(0).isVector())
+ return SDValue();
+
+  // For MIPS64, madd / msub instructions are inefficient to use with 64 bit
+  // arithmetic. E.g.
+  // (add (mul a b) c) =>
+  // let res = (madd (mthi (drotr c 32)) (mtlo c) a b) in
+  // MIPS64: (or (dsll (mfhi res) 32) (dsrl (dsll (mflo res) 32) 32))
+  // or
+  // MIPS64R2: (dins (mflo res) (mfhi res) 32 32)
+  //
+  // The overhead of setting up the Hi/Lo registers and reassembling the
+  // result makes this a dubious optimization for MIPS64. The core of the
+ // problem is that Hi/Lo contain the upper and lower 32 bits of the
+ // operand and result.
+ //
+ // It requires a chain of 4 add/mul for MIPS64R2 to get better code
+ // density than doing it naively, 5 for MIPS64. Additionally, using
+ // madd/msub on MIPS64 requires the operands actually be 32 bit sign
+ // extended operands, not true 64 bit values.
+ //
+ // FIXME: For the moment, disable this completely for MIPS64.
+ if (Subtarget.hasMips64())
+ return SDValue();
+
+ SDValue Mult = ROOTNode->getOperand(0).getOpcode() == ISD::MUL
+ ? ROOTNode->getOperand(0)
+ : ROOTNode->getOperand(1);
+
+ SDValue AddOperand = ROOTNode->getOperand(0).getOpcode() == ISD::MUL
+ ? ROOTNode->getOperand(1)
+ : ROOTNode->getOperand(0);
+
+  // Transform this to a MADD only if the add is the sole user of the
+  // multiply. If the mul has other users, this function returns here.
+ if (!Mult.hasOneUse())
+ return SDValue();
+
+ // maddu and madd are unusual instructions in that on MIPS64 bits 63..31
+ // must be in canonical form, i.e. sign extended. For MIPS32, the operands
+ // of the multiply must have 32 or more sign bits, otherwise we cannot
+ // perform this optimization. We have to check this here as we're performing
+ // this optimization pre-legalization.
+ SDValue MultLHS = Mult->getOperand(0);
+ SDValue MultRHS = Mult->getOperand(1);
+ unsigned LHSSB = CurDAG.ComputeNumSignBits(MultLHS);
+ unsigned RHSSB = CurDAG.ComputeNumSignBits(MultRHS);
+
+ if (LHSSB < 32 || RHSSB < 32)
+ return SDValue();
+
+ APInt HighMask =
+ APInt::getHighBitsSet(Mult->getValueType(0).getScalarSizeInBits(), 32);
+ bool IsUnsigned = CurDAG.MaskedValueIsZero(Mult->getOperand(0), HighMask) &&
+ CurDAG.MaskedValueIsZero(Mult->getOperand(1), HighMask) &&
+ CurDAG.MaskedValueIsZero(AddOperand, HighMask);
+
+ // Initialize accumulator.
+ SDLoc DL(ROOTNode);
+ SDValue TopHalf;
+ SDValue BottomHalf;
+ BottomHalf = CurDAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, AddOperand,
+ CurDAG.getIntPtrConstant(0, DL));
+
+ TopHalf = CurDAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, AddOperand,
+ CurDAG.getIntPtrConstant(1, DL));
+ SDValue ACCIn = CurDAG.getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
+ BottomHalf,
+ TopHalf);
+
+ // Create MipsMAdd(u) / MipsMSub(u) node.
+ bool IsAdd = ROOTNode->getOpcode() == ISD::ADD;
+ unsigned Opcode = IsAdd ? (IsUnsigned ? MipsISD::MAddu : MipsISD::MAdd)
+ : (IsUnsigned ? MipsISD::MSubu : MipsISD::MSub);
+ SDValue MAddOps[3] = {
+ CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mult->getOperand(0)),
+ CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mult->getOperand(1)), ACCIn};
+ EVT VTs[2] = {MVT::i32, MVT::i32};
+ SDValue MAdd = CurDAG.getNode(Opcode, DL, VTs, MAddOps);
+
+ SDValue ResLo = CurDAG.getNode(MipsISD::MFLO, DL, MVT::i32, MAdd);
+ SDValue ResHi = CurDAG.getNode(MipsISD::MFHI, DL, MVT::i32, MAdd);
+ SDValue Combined =
+ CurDAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResLo, ResHi);
+ return Combined;
+}
+
+static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const MipsSubtarget &Subtarget) {
+ // (sub v0 (mul v1, v2)) => (msub v1, v2, v0)
+ if (DCI.isBeforeLegalizeOps()) {
+ if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() &&
+ !Subtarget.inMips16Mode() && N->getValueType(0) == MVT::i64)
+ return performMADD_MSUBCombine(N, DAG, Subtarget);
+
+ return SDValue();
+ }
+
+ return SDValue();
+}
+
static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const MipsSubtarget &Subtarget) {
- // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt))
+ // (add v0 (mul v1, v2)) => (madd v1, v2, v0)
+ if (DCI.isBeforeLegalizeOps()) {
+ if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() &&
+ !Subtarget.inMips16Mode() && N->getValueType(0) == MVT::i64)
+ return performMADD_MSUBCombine(N, DAG, Subtarget);
- if (DCI.isBeforeLegalizeOps())
return SDValue();
+ }
+ // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt))
SDValue Add = N->getOperand(1);
if (Add.getOpcode() != ISD::ADD)
@@ -1053,6 +1182,8 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
return performAssertZextCombine(N, DAG, DCI, Subtarget);
case ISD::SHL:
return performSHLCombine(N, DAG, DCI, Subtarget);
+ case ISD::SUB:
+ return performSUBCombine(N, DAG, DCI, Subtarget);
}
return SDValue();
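
For reference, the shape of code the new ADD/SUB combines target on MIPS32
(pre-R6, outside MIPS16 mode) is a 64-bit accumulation of a 32x32 multiply;
the int32_t operands below satisfy the 32-sign-bit requirement by
construction. A sketch:

  #include <cstdint>

  // (add v0 (mul v1, v2)) with an i64 result -> madd v1, v2, v0
  int64_t macc(int64_t acc, int32_t a, int32_t b) {
    return acc + (int64_t)a * b;
  }

  // (sub v0 (mul v1, v2)) with an i64 result -> msub v1, v2, v0
  int64_t msac(int64_t acc, int32_t a, int32_t b) {
    return acc - (int64_t)a * b;
  }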
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index df62c66b75a3..4adf77f8d9a9 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -103,12 +103,9 @@ void MipsInstrInfo::BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineInstrBuilder MIB = BuildMI(&MBB, DL, MCID);
for (unsigned i = 1; i < Cond.size(); ++i) {
- if (Cond[i].isReg())
- MIB.addReg(Cond[i].getReg());
- else if (Cond[i].isImm())
- MIB.addImm(Cond[i].getImm());
- else
- assert(false && "Cannot copy operand");
+ assert((Cond[i].isImm() || Cond[i].isReg()) &&
+ "Cannot copy operand for conditional branch!");
+ MIB.add(Cond[i]);
}
MIB.addMBB(TBB);
}
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
index b95f1158fa56..272595af5f6f 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsLongBranch.cpp
@@ -274,8 +274,8 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
if (IsPIC) {
MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(FallThroughMBB, BalTgtMBB);
- LongBrMBB->addSuccessor(BalTgtMBB);
- BalTgtMBB->addSuccessor(TgtMBB);
+ LongBrMBB->addSuccessor(BalTgtMBB, BranchProbability::getOne());
+ BalTgtMBB->addSuccessor(&*FallThroughMBB, BranchProbability::getOne());
// We must select between the MIPS32r6/MIPS64r6 BAL (which is a normal
// instruction) and the pre-MIPS32r6/MIPS64r6 definition (which is an
@@ -342,8 +342,8 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
.addReg(Mips::SP).addImm(8);
if (Subtarget.hasMips32r6())
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR))
- .addReg(Mips::ZERO).addReg(Mips::AT);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR), Mips::ZERO)
+ .addReg(Mips::AT);
else
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR)).addReg(Mips::AT);
@@ -415,8 +415,8 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
.addReg(Mips::SP_64).addImm(0);
if (Subtarget.hasMips64r6())
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR64))
- .addReg(Mips::ZERO_64).addReg(Mips::AT_64);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR64), Mips::ZERO_64)
+ .addReg(Mips::AT_64);
else
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR64)).addReg(Mips::AT_64);
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index 49ae6dd4cd39..4be26dd25dc0 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -245,46 +245,64 @@ void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
}
}
-void MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag,
- SDValue CmpLHS, const SDLoc &DL,
- SDNode *Node) const {
- unsigned Opc = InFlag.getOpcode(); (void)Opc;
-
- assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) ||
- (Opc == ISD::SUBC || Opc == ISD::SUBE)) &&
- "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn");
-
- unsigned SLTuOp = Mips::SLTu, ADDuOp = Mips::ADDu;
- if (Subtarget->isGP64bit()) {
- SLTuOp = Mips::SLTu64;
- ADDuOp = Mips::DADDu;
- }
-
- SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) };
+void MipsSEDAGToDAGISel::selectAddE(SDNode *Node, const SDLoc &DL) const {
+ SDValue InFlag = Node->getOperand(2);
+ unsigned Opc = InFlag.getOpcode();
SDValue LHS = Node->getOperand(0), RHS = Node->getOperand(1);
EVT VT = LHS.getValueType();
- SDNode *Carry = CurDAG->getMachineNode(SLTuOp, DL, VT, Ops);
-
- if (Subtarget->isGP64bit()) {
- // On 64-bit targets, sltu produces an i64 but our backend currently says
- // that SLTu64 produces an i32. We need to fix this in the long run but for
- // now, just make the DAG type-correct by asserting the upper bits are zero.
- Carry = CurDAG->getMachineNode(Mips::SUBREG_TO_REG, DL, VT,
- CurDAG->getTargetConstant(0, DL, VT),
- SDValue(Carry, 0),
- CurDAG->getTargetConstant(Mips::sub_32, DL,
- VT));
+ // In the base case, we can rely on the carry bit from the addsc
+ // instruction.
+ if (Opc == ISD::ADDC) {
+ SDValue Ops[3] = {LHS, RHS, InFlag};
+ CurDAG->SelectNodeTo(Node, Mips::ADDWC, VT, MVT::Glue, Ops);
+ return;
}
- // Generate a second addition only if we know that RHS is not a
- // constant-zero node.
- SDNode *AddCarry = Carry;
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS);
- if (!C || C->getZExtValue())
- AddCarry = CurDAG->getMachineNode(ADDuOp, DL, VT, SDValue(Carry, 0), RHS);
+ assert(Opc == ISD::ADDE && "ISD::ADDE not in a chain of ADDE nodes!");
+
+ // The more complex case is when there is a chain of ISD::ADDE nodes like:
+ // (adde (adde (adde (addc a b) c) d) e).
+ //
+  // The addwc instruction does not write to the carry bit; instead it writes
+  // to bit 20 of the DSP control register. To match this series of nodes, each
+ // intermediate adde node must be expanded to write the carry bit before the
+ // addition.
+
+ // Start by reading the overflow field for addsc and moving the value to the
+ // carry field. The usage of 1 here with MipsISD::RDDSP / Mips::WRDSP
+ // corresponds to reading/writing the entire control register to/from a GPR.
+
+ SDValue CstOne = CurDAG->getTargetConstant(1, DL, MVT::i32);
+
+ SDValue OuFlag = CurDAG->getTargetConstant(20, DL, MVT::i32);
+
+ SDNode *DSPCtrlField =
+ CurDAG->getMachineNode(Mips::RDDSP, DL, MVT::i32, MVT::Glue, CstOne, InFlag);
+
+ SDNode *Carry = CurDAG->getMachineNode(
+ Mips::EXT, DL, MVT::i32, SDValue(DSPCtrlField, 0), OuFlag, CstOne);
- CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS, SDValue(AddCarry, 0));
+ SDValue Ops[4] = {SDValue(DSPCtrlField, 0),
+ CurDAG->getTargetConstant(6, DL, MVT::i32), CstOne,
+ SDValue(Carry, 0)};
+ SDNode *DSPCFWithCarry = CurDAG->getMachineNode(Mips::INS, DL, MVT::i32, Ops);
+
+  // My reading of the MIPS DSP 3.01 specification isn't as clear as I
+ // would like about whether bit 20 always gets overwritten by addwc.
+ // Hence take an extremely conservative view and presume it's sticky. We
+ // therefore need to clear it.
+
+ SDValue Zero = CurDAG->getRegister(Mips::ZERO, MVT::i32);
+
+ SDValue InsOps[4] = {Zero, OuFlag, CstOne, SDValue(DSPCFWithCarry, 0)};
+ SDNode *DSPCtrlFinal = CurDAG->getMachineNode(Mips::INS, DL, MVT::i32, InsOps);
+
+ SDNode *WrDSP = CurDAG->getMachineNode(Mips::WRDSP, DL, MVT::Glue,
+ SDValue(DSPCtrlFinal, 0), CstOne);
+
+ SDValue Operands[3] = {LHS, RHS, SDValue(WrDSP, 0)};
+ CurDAG->SelectNodeTo(Node, Mips::ADDWC, VT, MVT::Glue, Operands);
}
/// Match frameindex
@@ -765,19 +783,8 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
switch(Opcode) {
default: break;
- case ISD::SUBE: {
- SDValue InFlag = Node->getOperand(2);
- unsigned Opc = Subtarget->isGP64bit() ? Mips::DSUBu : Mips::SUBu;
- selectAddESubE(Opc, InFlag, InFlag.getOperand(0), DL, Node);
- return true;
- }
-
case ISD::ADDE: {
- if (Subtarget->hasDSP()) // Select DSP instructions, ADDSC and ADDWC.
- break;
- SDValue InFlag = Node->getOperand(2);
- unsigned Opc = Subtarget->isGP64bit() ? Mips::DADDu : Mips::ADDu;
- selectAddESubE(Opc, InFlag, InFlag.getValue(0), DL, Node);
+ selectAddE(Node, DL);
return true;
}
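
A rough C++ model of the control-register fixup built above, with uint32_t
standing in for the DSP control register and the bit positions left as
parameters (the exact field layout comes from the DSP specification, so any
constants passed in are assumptions, not part of this patch):

  #include <cstdint>

  // Mirrors the RDDSP / EXT / INS / INS / WRDSP sequence: copy the addsc
  // overflow flag into the carry field that addwc consumes, then clear the
  // (presumed sticky) overflow flag.
  uint32_t fixupDSPControl(uint32_t DSPCtrl, unsigned OuFlagBit,
                           unsigned CarryBit) {
    uint32_t Carry = (DSPCtrl >> OuFlagBit) & 1u;                  // EXT
    DSPCtrl = (DSPCtrl & ~(1u << CarryBit)) | (Carry << CarryBit); // INS
    DSPCtrl &= ~(1u << OuFlagBit);              // INS with $zero as source
    return DSPCtrl;                             // written back via WRDSP
  }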
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h
index f89a350cab04..6f38289c5a45 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h
@@ -41,8 +41,7 @@ private:
const SDLoc &dl, EVT Ty, bool HasLo,
bool HasHi);
- void selectAddESubE(unsigned MOp, SDValue InFlag, SDValue CmpLHS,
- const SDLoc &DL, SDNode *Node) const;
+ void selectAddE(SDNode *Node, const SDLoc &DL) const;
bool selectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset) const;
bool selectAddrFrameIndexOffset(SDValue Addr, SDValue &Base, SDValue &Offset,
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index bf7f079e3105..2382ea271661 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -179,8 +179,6 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::i32, Custom);
setOperationAction(ISD::STORE, MVT::i32, Custom);
- setTargetDAGCombine(ISD::ADDE);
- setTargetDAGCombine(ISD::SUBE);
setTargetDAGCombine(ISD::MUL);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
@@ -421,163 +419,6 @@ SDValue MipsSETargetLowering::LowerOperation(SDValue Op,
return MipsTargetLowering::LowerOperation(Op, DAG);
}
-// selectMADD -
-// Transforms a subgraph in CurDAG if the following pattern is found:
-// (addc multLo, Lo0), (adde multHi, Hi0),
-// where,
-// multHi/Lo: product of multiplication
-// Lo0: initial value of Lo register
-// Hi0: initial value of Hi register
-// Return true if pattern matching was successful.
-static bool selectMADD(SDNode *ADDENode, SelectionDAG *CurDAG) {
- // ADDENode's second operand must be a flag output of an ADDC node in order
- // for the matching to be successful.
- SDNode *ADDCNode = ADDENode->getOperand(2).getNode();
-
- if (ADDCNode->getOpcode() != ISD::ADDC)
- return false;
-
- SDValue MultHi = ADDENode->getOperand(0);
- SDValue MultLo = ADDCNode->getOperand(0);
- SDNode *MultNode = MultHi.getNode();
- unsigned MultOpc = MultHi.getOpcode();
-
- // MultHi and MultLo must be generated by the same node,
- if (MultLo.getNode() != MultNode)
- return false;
-
- // and it must be a multiplication.
- if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI)
- return false;
-
- // MultLo amd MultHi must be the first and second output of MultNode
- // respectively.
- if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0)
- return false;
-
- // Transform this to a MADD only if ADDENode and ADDCNode are the only users
- // of the values of MultNode, in which case MultNode will be removed in later
- // phases.
- // If there exist users other than ADDENode or ADDCNode, this function returns
- // here, which will result in MultNode being mapped to a single MULT
- // instruction node rather than a pair of MULT and MADD instructions being
- // produced.
- if (!MultHi.hasOneUse() || !MultLo.hasOneUse())
- return false;
-
- SDLoc DL(ADDENode);
-
- // Initialize accumulator.
- SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
- ADDCNode->getOperand(1),
- ADDENode->getOperand(1));
-
- // create MipsMAdd(u) node
- MultOpc = MultOpc == ISD::UMUL_LOHI ? MipsISD::MAddu : MipsISD::MAdd;
-
- SDValue MAdd = CurDAG->getNode(MultOpc, DL, MVT::Untyped,
- MultNode->getOperand(0),// Factor 0
- MultNode->getOperand(1),// Factor 1
- ACCIn);
-
- // replace uses of adde and addc here
- if (!SDValue(ADDCNode, 0).use_empty()) {
- SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MAdd);
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDCNode, 0), LoOut);
- }
- if (!SDValue(ADDENode, 0).use_empty()) {
- SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MAdd);
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDENode, 0), HiOut);
- }
-
- return true;
-}
-
-// selectMSUB -
-// Transforms a subgraph in CurDAG if the following pattern is found:
-// (addc Lo0, multLo), (sube Hi0, multHi),
-// where,
-// multHi/Lo: product of multiplication
-// Lo0: initial value of Lo register
-// Hi0: initial value of Hi register
-// Return true if pattern matching was successful.
-static bool selectMSUB(SDNode *SUBENode, SelectionDAG *CurDAG) {
- // SUBENode's second operand must be a flag output of an SUBC node in order
- // for the matching to be successful.
- SDNode *SUBCNode = SUBENode->getOperand(2).getNode();
-
- if (SUBCNode->getOpcode() != ISD::SUBC)
- return false;
-
- SDValue MultHi = SUBENode->getOperand(1);
- SDValue MultLo = SUBCNode->getOperand(1);
- SDNode *MultNode = MultHi.getNode();
- unsigned MultOpc = MultHi.getOpcode();
-
- // MultHi and MultLo must be generated by the same node,
- if (MultLo.getNode() != MultNode)
- return false;
-
- // and it must be a multiplication.
- if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI)
- return false;
-
- // MultLo amd MultHi must be the first and second output of MultNode
- // respectively.
- if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0)
- return false;
-
- // Transform this to a MSUB only if SUBENode and SUBCNode are the only users
- // of the values of MultNode, in which case MultNode will be removed in later
- // phases.
- // If there exist users other than SUBENode or SUBCNode, this function returns
- // here, which will result in MultNode being mapped to a single MULT
- // instruction node rather than a pair of MULT and MSUB instructions being
- // produced.
- if (!MultHi.hasOneUse() || !MultLo.hasOneUse())
- return false;
-
- SDLoc DL(SUBENode);
-
- // Initialize accumulator.
- SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
- SUBCNode->getOperand(0),
- SUBENode->getOperand(0));
-
- // create MipsSub(u) node
- MultOpc = MultOpc == ISD::UMUL_LOHI ? MipsISD::MSubu : MipsISD::MSub;
-
- SDValue MSub = CurDAG->getNode(MultOpc, DL, MVT::Glue,
- MultNode->getOperand(0),// Factor 0
- MultNode->getOperand(1),// Factor 1
- ACCIn);
-
- // replace uses of sube and subc here
- if (!SDValue(SUBCNode, 0).use_empty()) {
- SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MSub);
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBCNode, 0), LoOut);
- }
- if (!SDValue(SUBENode, 0).use_empty()) {
- SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MSub);
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBENode, 0), HiOut);
- }
-
- return true;
-}
-
-static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const MipsSubtarget &Subtarget) {
- if (DCI.isBeforeLegalize())
- return SDValue();
-
- if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() &&
- N->getValueType(0) == MVT::i32 && selectMADD(N, &DAG))
- return SDValue(N, 0);
-
- return SDValue();
-}
-
// Fold zero extensions into MipsISD::VEXTRACT_[SZ]EXT_ELT
//
// Performs the following transformations:
@@ -820,19 +661,6 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue performSUBECombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const MipsSubtarget &Subtarget) {
- if (DCI.isBeforeLegalize())
- return SDValue();
-
- if (Subtarget.hasMips32() && N->getValueType(0) == MVT::i32 &&
- selectMSUB(N, &DAG))
- return SDValue(N, 0);
-
- return SDValue();
-}
-
static SDValue genConstMult(SDValue X, uint64_t C, const SDLoc &DL, EVT VT,
EVT ShiftTy, SelectionDAG &DAG) {
// Clear the upper (64 - VT.sizeInBits) bits.
@@ -1110,16 +938,12 @@ MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
SDValue Val;
switch (N->getOpcode()) {
- case ISD::ADDE:
- return performADDECombine(N, DAG, DCI, Subtarget);
case ISD::AND:
Val = performANDCombine(N, DAG, DCI, Subtarget);
break;
case ISD::OR:
Val = performORCombine(N, DAG, DCI, Subtarget);
break;
- case ISD::SUBE:
- return performSUBECombine(N, DAG, DCI, Subtarget);
case ISD::MUL:
return performMULCombine(N, DAG, DCI, this);
case ISD::SHL:
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index 625a652a0ca0..ccd47f00c0d3 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -78,7 +78,7 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
// IsNan2008 - IEEE 754-2008 NaN encoding.
bool IsNaN2008bit;
- // IsFP64bit - General-purpose registers are 64 bits wide
+ // IsGP64bit - General-purpose registers are 64 bits wide
bool IsGP64bit;
// IsPTR64bit - Pointers are 64 bit wide
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 28d496ee9ca1..afd2e87078a9 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -2907,19 +2907,6 @@ SDValue PPCDAGToDAGISel::get64BitZExtCompare(SDValue LHS, SDValue RHS,
getI64Imm(58, dl), getI64Imm(63, dl)),
0);
}
- case ISD::SETNE: {
- // {addc.reg, addc.CA} = (addcarry (xor %a, %b), -1)
- // (zext (setcc %a, %b, setne)) -> (sube addc.reg, addc.reg, addc.CA)
- // {addcz.reg, addcz.CA} = (addcarry %a, -1)
- // (zext (setcc %a, 0, setne)) -> (sube addcz.reg, addcz.reg, addcz.CA)
- SDValue Xor = IsRHSZero ? LHS :
- SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0);
- SDValue AC =
- SDValue(CurDAG->getMachineNode(PPC::ADDIC8, dl, MVT::i64, MVT::Glue,
- Xor, getI32Imm(~0U, dl)), 0);
- return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, AC,
- Xor, AC.getValue(1)), 0);
- }
}
}
@@ -2944,19 +2931,6 @@ SDValue PPCDAGToDAGISel::get64BitSExtCompare(SDValue LHS, SDValue RHS,
return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, Addic,
Addic, Addic.getValue(1)), 0);
}
- case ISD::SETNE: {
- // {subfc.reg, subfc.CA} = (subcarry 0, (xor %a, %b))
- // (sext (setcc %a, %b, setne)) -> (sube subfc.reg, subfc.reg, subfc.CA)
- // {subfcz.reg, subfcz.CA} = (subcarry 0, %a)
- // (sext (setcc %a, 0, setne)) -> (sube subfcz.reg, subfcz.reg, subfcz.CA)
- SDValue Xor = IsRHSZero ? LHS :
- SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0);
- SDValue SC =
- SDValue(CurDAG->getMachineNode(PPC::SUBFIC8, dl, MVT::i64, MVT::Glue,
- Xor, getI32Imm(0, dl)), 0);
- return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, SC,
- SC, SC.getValue(1)), 0);
- }
}
}
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index bda4e5e81734..662550f7a396 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -136,7 +136,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
}
- // PowerPC has an i16 but no i8 (or i1) SEXTLOAD
+ // PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
for (MVT VT : MVT::integer_valuetypes()) {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
@@ -175,7 +175,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom);
}
- // PowerPC does not support direct load / store of condition registers
+ // PowerPC does not support direct load/store of condition registers.
setOperationAction(ISD::LOAD, MVT::i1, Custom);
setOperationAction(ISD::STORE, MVT::i1, Custom);
@@ -204,11 +204,23 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand);
setOperationAction(ISD::FREM, MVT::ppcf128, Expand);
- // PowerPC has no SREM/UREM instructions
- setOperationAction(ISD::SREM, MVT::i32, Expand);
- setOperationAction(ISD::UREM, MVT::i32, Expand);
- setOperationAction(ISD::SREM, MVT::i64, Expand);
- setOperationAction(ISD::UREM, MVT::i64, Expand);
+  // PowerPC has no SREM/UREM instructions unless we are on P9. On P9 we may
+  // use a hardware instruction to compute the remainder.
+ // The instructions are not legalized directly because in the cases where the
+ // result of both the remainder and the division is required it is more
+ // efficient to compute the remainder from the result of the division rather
+ // than use the remainder instruction.
+ if (Subtarget.isISA3_0()) {
+ setOperationAction(ISD::SREM, MVT::i32, Custom);
+ setOperationAction(ISD::UREM, MVT::i32, Custom);
+ setOperationAction(ISD::SREM, MVT::i64, Custom);
+ setOperationAction(ISD::UREM, MVT::i64, Custom);
+ } else {
+ setOperationAction(ISD::SREM, MVT::i32, Expand);
+ setOperationAction(ISD::UREM, MVT::i32, Expand);
+ setOperationAction(ISD::SREM, MVT::i64, Expand);
+ setOperationAction(ISD::UREM, MVT::i64, Expand);
+ }
// Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
@@ -1116,6 +1128,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::VPERM: return "PPCISD::VPERM";
case PPCISD::XXSPLT: return "PPCISD::XXSPLT";
case PPCISD::XXINSERT: return "PPCISD::XXINSERT";
+ case PPCISD::XXREVERSE: return "PPCISD::XXREVERSE";
case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI";
case PPCISD::VECSHL: return "PPCISD::VECSHL";
case PPCISD::CMPB: return "PPCISD::CMPB";
@@ -1598,22 +1611,34 @@ bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
return true;
}
-// Check that the mask is shuffling N byte elements.
-static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width) {
+/// Check that the mask is shuffling N byte elements. Within each N byte
+/// element of the mask, the indices could be either in increasing or
+/// decreasing order as long as they are consecutive.
+/// \param[in] N the shuffle vector SD Node to analyze
+/// \param[in] Width the element width in bytes, could be 2/4/8/16 (HalfWord/
+/// Word/DoubleWord/QuadWord).
+/// \param[in] StepLen the delta between consecutive indices within each
+/// N byte element: 1 if the mask is in increasing order, -1 if it is in
+/// decreasing order.
+/// \return true iff the mask is shuffling N byte elements.
+static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width,
+ int StepLen) {
assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) &&
"Unexpected element width.");
+  assert((StepLen == 1 || StepLen == -1) && "Unexpected step length.");
unsigned NumOfElem = 16 / Width;
unsigned MaskVal[16]; // Width is never greater than 16
for (unsigned i = 0; i < NumOfElem; ++i) {
MaskVal[0] = N->getMaskElt(i * Width);
- if (MaskVal[0] % Width) {
+ if ((StepLen == 1) && (MaskVal[0] % Width)) {
+ return false;
+ } else if ((StepLen == -1) && ((MaskVal[0] + 1) % Width)) {
return false;
}
for (unsigned int j = 1; j < Width; ++j) {
MaskVal[j] = N->getMaskElt(i * Width + j);
- if (MaskVal[j] != MaskVal[j-1] + 1) {
+ if (MaskVal[j] != MaskVal[j-1] + StepLen) {
return false;
}
}
@@ -1624,7 +1649,7 @@ static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width) {
bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
unsigned &InsertAtByte, bool &Swap, bool IsLE) {
- if (!isNByteElemShuffleMask(N, 4))
+ if (!isNByteElemShuffleMask(N, 4, 1))
return false;
// Now we look at mask elements 0,4,8,12
@@ -1701,7 +1726,7 @@ bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
bool &Swap, bool IsLE) {
assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
// Ensure each byte index of the word is consecutive.
- if (!isNByteElemShuffleMask(N, 4))
+ if (!isNByteElemShuffleMask(N, 4, 1))
return false;
// Now we look at mask elements 0,4,8,12, which are the beginning of words.
@@ -1759,6 +1784,35 @@ bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
}
}
+static bool isXXBRShuffleMaskHelper(ShuffleVectorSDNode *N, int Width) {
+ assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
+
+ if (!isNByteElemShuffleMask(N, Width, -1))
+ return false;
+
+ for (int i = 0; i < 16; i += Width)
+ if (N->getMaskElt(i) != i + Width - 1)
+ return false;
+
+ return true;
+}
+
+bool PPC::isXXBRHShuffleMask(ShuffleVectorSDNode *N) {
+ return isXXBRShuffleMaskHelper(N, 2);
+}
+
+bool PPC::isXXBRWShuffleMask(ShuffleVectorSDNode *N) {
+ return isXXBRShuffleMaskHelper(N, 4);
+}
+
+bool PPC::isXXBRDShuffleMask(ShuffleVectorSDNode *N) {
+ return isXXBRShuffleMaskHelper(N, 8);
+}
+
+bool PPC::isXXBRQShuffleMask(ShuffleVectorSDNode *N) {
+ return isXXBRShuffleMaskHelper(N, 16);
+}
+
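
The masks these helpers accept run downward within each element: element i
starts at index i * Width + Width - 1 and steps by -1. Enumerating them:

  #include <cstdio>

  int main() {
    // v16i8 shuffle indices accepted by isXXBR{H,W,D,Q}ShuffleMask.
    for (int Width : {2, 4, 8, 16}) {
      std::printf("XXBR width %2d:", Width);
      for (int i = 0; i < 16; i += Width)         // one element per pass
        for (int j = 0; j < Width; ++j)
          std::printf(" %2d", i + Width - 1 - j); // decreasing in element
      std::printf("\n");
    }
    return 0;
  }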
/// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap
/// if the inputs to the instruction should be swapped and set \p DM to the
/// value for the immediate.
@@ -1772,7 +1826,7 @@ bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM,
assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
// Ensure each byte index of the double word is consecutive.
- if (!isNByteElemShuffleMask(N, 8))
+ if (!isNByteElemShuffleMask(N, 8, 1))
return false;
unsigned M0 = N->getMaskElt(0) / 8;
@@ -6819,6 +6873,7 @@ bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
// Given the head of the old chain, ResChain, insert a token factor containing
// it and NewResChain, and make users of ResChain now be users of that token
// factor.
+// TODO: Remove and use DAG::makeEquivalentMemoryOrdering() instead.
void PPCTargetLowering::spliceIntoChain(SDValue ResChain,
SDValue NewResChain,
SelectionDAG &DAG) const {
@@ -7846,6 +7901,26 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, PermDI);
}
+ if (Subtarget.hasP9Vector()) {
+ if (PPC::isXXBRHShuffleMask(SVOp)) {
+ SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
+ SDValue ReveHWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v8i16, Conv);
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveHWord);
+ } else if (PPC::isXXBRWShuffleMask(SVOp)) {
+ SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
+ SDValue ReveWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v4i32, Conv);
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveWord);
+ } else if (PPC::isXXBRDShuffleMask(SVOp)) {
+ SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1);
+ SDValue ReveDWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Conv);
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveDWord);
+ } else if (PPC::isXXBRQShuffleMask(SVOp)) {
+ SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, V1);
+ SDValue ReveQWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v1i128, Conv);
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveQWord);
+ }
+ }
+
if (Subtarget.hasVSX()) {
if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG);
@@ -8393,6 +8468,18 @@ SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return SDValue();
}
+SDValue PPCTargetLowering::LowerREM(SDValue Op, SelectionDAG &DAG) const {
+ // Check for a DIV with the same operands as this REM.
+ for (auto UI : Op.getOperand(1)->uses()) {
+ if ((Op.getOpcode() == ISD::SREM && UI->getOpcode() == ISD::SDIV) ||
+ (Op.getOpcode() == ISD::UREM && UI->getOpcode() == ISD::UDIV))
+ if (UI->getOperand(0) == Op.getOperand(0) &&
+ UI->getOperand(1) == Op.getOperand(1))
+ return SDValue();
+ }
+ return Op;
+}
+
SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -8861,6 +8948,9 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INTRINSIC_VOID:
return LowerINTRINSIC_VOID(Op, DAG);
+ case ISD::SREM:
+ case ISD::UREM:
+ return LowerREM(Op, DAG);
}
}
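
In source terms, the effect of LowerREM above: a remainder whose operands
also feed a matching division gets expanded, so the quotient can be reused
with a multiply-subtract, while a lone remainder keeps the ISA 3.0 mod
instruction. A hedged illustration (the noted instruction choices are
expectations, not guarantees):

  #include <cstdint>

  // Lone remainder: expected to stay as a single moduw on ISA 3.0.
  uint32_t loneRem(uint32_t a, uint32_t b) { return a % b; }

  // A division with the same operands is present: the remainder is cheaper
  // as a - (a / b) * b, reusing the divwu result.
  uint32_t divRem(uint32_t a, uint32_t b, uint32_t *q) {
    *q = a / b;
    return a - *q * b;
  }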
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 7982a4a9e9fb..a5108727bb4b 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -86,6 +86,10 @@ namespace llvm {
///
XXINSERT,
+ /// XXREVERSE - The PPC VSX reverse instruction
+ ///
+ XXREVERSE,
+
/// VECSHL - The PPC VSX shift left instruction
///
VECSHL,
@@ -458,6 +462,23 @@ namespace llvm {
/// for a XXSLDWI instruction.
bool isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
bool &Swap, bool IsLE);
+
+ /// isXXBRHShuffleMask - Return true if this is a shuffle mask suitable
+ /// for a XXBRH instruction.
+ bool isXXBRHShuffleMask(ShuffleVectorSDNode *N);
+
+ /// isXXBRWShuffleMask - Return true if this is a shuffle mask suitable
+ /// for a XXBRW instruction.
+ bool isXXBRWShuffleMask(ShuffleVectorSDNode *N);
+
+ /// isXXBRDShuffleMask - Return true if this is a shuffle mask suitable
+ /// for a XXBRD instruction.
+ bool isXXBRDShuffleMask(ShuffleVectorSDNode *N);
+
+ /// isXXBRQShuffleMask - Return true if this is a shuffle mask suitable
+ /// for a XXBRQ instruction.
+ bool isXXBRQShuffleMask(ShuffleVectorSDNode *N);
+
/// isXXPERMDIShuffleMask - Return true if this is a shuffle mask suitable
/// for a XXPERMDI instruction.
bool isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
@@ -918,6 +939,7 @@ namespace llvm {
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerREM(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index 295590b2acf6..70536a6039b8 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -683,6 +683,16 @@ def DIVDE : XOForm_1<31, 425, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
"divde $rT, $rA, $rB", IIC_IntDivD,
[(set i64:$rT, (int_ppc_divde g8rc:$rA, g8rc:$rB))]>,
isPPC64, Requires<[HasExtDiv]>;
+
+let Predicates = [IsISA3_0] in {
+def MODSD : XForm_8<31, 777, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+ "modsd $rT, $rA, $rB", IIC_IntDivW,
+ [(set i64:$rT, (srem i64:$rA, i64:$rB))]>;
+def MODUD : XForm_8<31, 265, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+ "modud $rT, $rA, $rB", IIC_IntDivW,
+ [(set i64:$rT, (urem i64:$rA, i64:$rB))]>;
+}
+
let Defs = [CR0] in
def DIVDEo : XOForm_1<31, 425, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
"divde. $rT, $rA, $rB", IIC_IntDivD,
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index f3c68c443b1b..236e513bec23 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -1964,7 +1964,7 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
case PPC::CFENCE8: {
auto Val = MI.getOperand(0).getReg();
- BuildMI(MBB, MI, DL, get(PPC::CMPW), PPC::CR7).addReg(Val).addReg(Val);
+ BuildMI(MBB, MI, DL, get(PPC::CMPD), PPC::CR7).addReg(Val).addReg(Val);
BuildMI(MBB, MI, DL, get(PPC::CTRL_DEP))
.addImm(PPC::PRED_NE_MINUS)
.addReg(PPC::CR7)
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 8223aa655e38..47d59c25392a 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -53,6 +53,10 @@ def SDT_PPCVecInsert : SDTypeProfile<1, 3, [ SDTCisVec<0>,
SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>
]>;
+def SDT_PPCVecReverse: SDTypeProfile<1, 1, [ SDTCisVec<0>,
+ SDTCisVec<1>
+]>;
+
def SDT_PPCxxpermdi: SDTypeProfile<1, 3, [ SDTCisVec<0>,
SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>
]>;
@@ -174,6 +178,7 @@ def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>;
def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>;
def PPCxxsplt : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>;
def PPCxxinsert : SDNode<"PPCISD::XXINSERT", SDT_PPCVecInsert, []>;
+def PPCxxreverse : SDNode<"PPCISD::XXREVERSE", SDT_PPCVecReverse, []>;
def PPCxxpermdi : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>;
def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>;
@@ -2544,6 +2549,14 @@ let Uses = [RM] in {
"mffs. $rT", IIC_IntMFFS, []>, isDOT;
}
+let Predicates = [IsISA3_0] in {
+def MODSW : XForm_8<31, 779, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+ "modsw $rT, $rA, $rB", IIC_IntDivW,
+ [(set i32:$rT, (srem i32:$rA, i32:$rB))]>;
+def MODUW : XForm_8<31, 267, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+ "moduw $rT, $rA, $rB", IIC_IntDivW,
+ [(set i32:$rT, (urem i32:$rA, i32:$rB))]>;
+}
let PPC970_Unit = 1, hasSideEffects = 0 in { // FXU Operations.
// XO-Form instructions. Arithmetic instructions that can set overflow bit
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index e214d26c063b..9cfc897cdb3f 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -2340,6 +2340,16 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
def XXBRD : XX2_XT6_XO5_XB6<60, 23, 475, "xxbrd", vsrc, []>;
def XXBRQ : XX2_XT6_XO5_XB6<60, 31, 475, "xxbrq", vsrc, []>;
+ // Vector Reverse
+ def : Pat<(v8i16 (PPCxxreverse v8i16 :$A)),
+ (v8i16 (COPY_TO_REGCLASS (XXBRH (COPY_TO_REGCLASS $A, VSRC)), VRRC))>;
+ def : Pat<(v4i32 (PPCxxreverse v4i32 :$A)),
+ (v4i32 (XXBRW $A))>;
+ def : Pat<(v2i64 (PPCxxreverse v2i64 :$A)),
+ (v2i64 (XXBRD $A))>;
+ def : Pat<(v1i128 (PPCxxreverse v1i128 :$A)),
+ (v1i128 (COPY_TO_REGCLASS (XXBRQ (COPY_TO_REGCLASS $A, VSRC)), VRRC))>;
+
// Vector Permute
def XXPERM : XX3_XT5_XA5_XB5<60, 26, "xxperm" , vsrc, vsrc, vsrc,
IIC_VecPerm, []>;
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index aad913924692..637e52bbdbee 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -273,6 +273,20 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
+bool PPCRegisterInfo::isCallerPreservedPhysReg(unsigned PhysReg,
+ const MachineFunction &MF) const {
+ assert(TargetRegisterInfo::isPhysicalRegister(PhysReg));
+ if (TM.isELFv2ABI() && PhysReg == PPC::X2) {
+ // X2 is guaranteed to be preserved within a function if it is reserved.
+ // The reason it's reserved is that it's the TOC pointer (and the function
+ // uses the TOC). In functions where it isn't reserved (i.e. leaf functions
+ // with no TOC access), we can't claim that it is preserved.
+ return (getReservedRegs(MF).test(PPC::X2));
+ } else {
+ return false;
+ }
+}
+
unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
const PPCFrameLowering *TFI = getFrameLowering(MF);
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
index 4a96327fe552..0bbb71fdf9fb 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -83,6 +83,7 @@ public:
void adjustStackMapLiveOutMask(uint32_t *Mask) const override;
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ bool isCallerPreservedPhysReg(unsigned PhysReg, const MachineFunction &MF) const override;
/// We require the register scavenger.
bool requiresRegisterScavenging(const MachineFunction &MF) const override {
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 5559cdc5fe46..3dbd5f5b9a92 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -230,7 +230,7 @@ unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
return ST->hasVSX() ? 64 : 32;
}
-unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) {
+unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const {
if (Vector) {
if (ST->hasQPX()) return 256;
if (ST->hasAltivec()) return 128;
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 2e0116fee04c..758c335def08 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -63,7 +63,7 @@ public:
bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize);
bool enableInterleavedAccessVectorization();
unsigned getNumberOfRegisters(bool Vector);
- unsigned getRegisterBitWidth(bool Vector);
+ unsigned getRegisterBitWidth(bool Vector) const;
unsigned getCacheLineSize();
unsigned getPrefetchDistance();
unsigned getMaxInterleaveFactor(unsigned VF);
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 6a3dc6799c43..422c16b8eb62 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -302,7 +302,7 @@ unsigned SystemZTTIImpl::getNumberOfRegisters(bool Vector) {
return 0;
}
-unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) {
+unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) const {
if (!Vector)
return 64;
if (ST->hasVector())
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index ad597f5c65f0..bdba7601eb78 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -53,7 +53,7 @@ public:
/// @{
unsigned getNumberOfRegisters(bool Vector);
- unsigned getRegisterBitWidth(bool Vector);
+ unsigned getRegisterBitWidth(bool Vector) const;
bool prefersVectorizedAddressing() { return false; }
bool supportsEfficientVectorElementLoadStore() { return true; }
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
index ddf964e7dbb7..5ad147e5e596 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
@@ -46,9 +46,7 @@ public:
/// .functype
virtual void emitIndirectFunctionType(StringRef name,
SmallVectorImpl<MVT> &Params,
- SmallVectorImpl<MVT> &Results) {
- llvm_unreachable("emitIndirectFunctionType not implemented");
- }
+ SmallVectorImpl<MVT> &Results) = 0;
/// .indidx
virtual void emitIndIdx(const MCExpr *Value) = 0;
/// .import_global
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
index 27c01cb8acf7..19e14f3261aa 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
@@ -16,11 +16,15 @@
#include "MCTargetDesc/WebAssemblyFixupKinds.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "llvm/BinaryFormat/Wasm.h"
+#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCSymbolWasm.h"
#include "llvm/MC/MCWasmObjectWriter.h"
+#include "llvm/MC/MCValue.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
+
using namespace llvm;
namespace {
@@ -29,8 +33,8 @@ public:
explicit WebAssemblyWasmObjectWriter(bool Is64Bit);
private:
- unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
- const MCFixup &Fixup, bool IsPCRel) const override;
+ unsigned getRelocType(const MCValue &Target,
+ const MCFixup &Fixup) const override;
};
} // end anonymous namespace
@@ -39,16 +43,13 @@ WebAssemblyWasmObjectWriter::WebAssemblyWasmObjectWriter(bool Is64Bit)
// Test whether the given expression computes a function address.
static bool IsFunctionExpr(const MCExpr *Expr) {
- if (const MCSymbolRefExpr *SyExp =
- dyn_cast<MCSymbolRefExpr>(Expr))
+ if (auto SyExp = dyn_cast<MCSymbolRefExpr>(Expr))
return cast<MCSymbolWasm>(SyExp->getSymbol()).isFunction();
- if (const MCBinaryExpr *BinOp =
- dyn_cast<MCBinaryExpr>(Expr))
+ if (auto BinOp = dyn_cast<MCBinaryExpr>(Expr))
return IsFunctionExpr(BinOp->getLHS()) != IsFunctionExpr(BinOp->getRHS());
- if (const MCUnaryExpr *UnOp =
- dyn_cast<MCUnaryExpr>(Expr))
+ if (auto UnOp = dyn_cast<MCUnaryExpr>(Expr))
return IsFunctionExpr(UnOp->getSubExpr());
return false;
@@ -59,15 +60,13 @@ static bool IsFunctionType(const MCValue &Target) {
return RefA && RefA->getKind() == MCSymbolRefExpr::VK_WebAssembly_TYPEINDEX;
}
-unsigned WebAssemblyWasmObjectWriter::getRelocType(MCContext &Ctx,
- const MCValue &Target,
- const MCFixup &Fixup,
- bool IsPCRel) const {
+unsigned
+WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target,
+ const MCFixup &Fixup) const {
// WebAssembly functions are not allocated in the data address space. To
// resolve a pointer to a function, we must use a special relocation type.
bool IsFunction = IsFunctionExpr(Fixup.getValue());
- assert(!IsPCRel);
switch (unsigned(Fixup.getKind())) {
case WebAssembly::fixup_code_sleb128_i32:
if (IsFunction)
diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 4178ec0b28f0..b999091e2d29 100644
--- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -33,6 +33,8 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/MC/MCSymbolELF.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
@@ -218,9 +220,13 @@ void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MCExpr *WebAssemblyAsmPrinter::lowerConstant(const Constant *CV) {
if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV))
- if (GV->getValueType()->isFunctionTy())
+ if (GV->getValueType()->isFunctionTy()) {
+ MCSymbol* Sym = getSymbol(GV);
+ if (!isa<MCSymbolELF>(Sym))
+ cast<MCSymbolWasm>(Sym)->setIsFunction(true);
return MCSymbolRefExpr::create(
- getSymbol(GV), MCSymbolRefExpr::VK_WebAssembly_FUNCTION, OutContext);
+ Sym, MCSymbolRefExpr::VK_WebAssembly_FUNCTION, OutContext);
+ }
return AsmPrinter::lowerConstant(CV);
}
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 47aadf99e860..b3ce4bd27460 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -36,7 +36,7 @@ unsigned WebAssemblyTTIImpl::getNumberOfRegisters(bool Vector) {
return Result;
}
-unsigned WebAssemblyTTIImpl::getRegisterBitWidth(bool Vector) {
+unsigned WebAssemblyTTIImpl::getRegisterBitWidth(bool Vector) const {
if (Vector && getST()->hasSIMD128())
return 128;
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index f658609f8930..7b35fc916133 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -55,7 +55,7 @@ public:
/// @{
unsigned getNumberOfRegisters(bool Vector);
- unsigned getRegisterBitWidth(bool Vector);
+ unsigned getRegisterBitWidth(bool Vector) const;
unsigned getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 831e9bdab0e1..172eba0002d4 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1,4 +1,3 @@
-
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
// The LLVM Compiler Infrastructure
@@ -5314,20 +5313,37 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
unsigned NumElts = SizeInBits / EltSizeInBits;
- unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
- unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
+ // Bitcast a source array of element bits to the target size.
+ auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
+ unsigned NumSrcElts = UndefSrcElts.getBitWidth();
+ unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
+ assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
+ "Constant bit sizes don't match");
- // Extract all the undef/constant element data and pack into single bitsets.
- APInt UndefBits(SizeInBits, 0);
- APInt MaskBits(SizeInBits, 0);
-
- // Split the undef/constant single bitset data into the target elements.
- auto SplitBitData = [&]() {
// Don't split if we don't allow undef bits.
bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
- if (UndefBits.getBoolValue() && !AllowUndefs)
+ if (UndefSrcElts.getBoolValue() && !AllowUndefs)
return false;
+ // If we're already the right size, don't bother bitcasting.
+ if (NumSrcElts == NumElts) {
+ UndefElts = UndefSrcElts;
+ EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
+ return true;
+ }
+
+ // Extract all the undef/constant element data and pack into single bitsets.
+ APInt UndefBits(SizeInBits, 0);
+ APInt MaskBits(SizeInBits, 0);
+
+ for (unsigned i = 0; i != NumSrcElts; ++i) {
+ unsigned BitOffset = i * SrcEltSizeInBits;
+ if (UndefSrcElts[i])
+ UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
+ MaskBits.insertBits(SrcEltBits[i], BitOffset);
+ }
+
+ // Split the undef/constant single bitset data into the target elements.
UndefElts = APInt(NumElts, 0);
EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
@@ -5356,20 +5372,19 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
// Collect constant bits and insert into mask/undef bit masks.
auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
- unsigned BitOffset) {
+ unsigned UndefBitIndex) {
if (!Cst)
return false;
if (isa<UndefValue>(Cst)) {
- unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits();
- Undefs.setBits(BitOffset, BitOffset + CstSizeInBits);
+ Undefs.setBit(UndefBitIndex);
return true;
}
if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
- Mask.insertBits(CInt->getValue(), BitOffset);
+ Mask = CInt->getValue();
return true;
}
if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
- Mask.insertBits(CFP->getValueAPF().bitcastToAPInt(), BitOffset);
+ Mask = CFP->getValueAPF().bitcastToAPInt();
return true;
}
return false;
@@ -5377,18 +5392,21 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
// Extract constant bits from build vector.
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
+ unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
+ unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
+
+ APInt UndefSrcElts(NumSrcElts, 0);
+ SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
const SDValue &Src = Op.getOperand(i);
- unsigned BitOffset = i * SrcEltSizeInBits;
if (Src.isUndef()) {
- UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
+ UndefSrcElts.setBit(i);
continue;
}
auto *Cst = cast<ConstantSDNode>(Src);
- APInt Bits = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
- MaskBits.insertBits(Bits, BitOffset);
+ SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
}
- return SplitBitData();
+ return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract constant bits from constant pool vector.
@@ -5397,27 +5415,33 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
return false;
- unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
- for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i)
- if (!CollectConstantBits(Cst->getAggregateElement(i), MaskBits, UndefBits,
- i * CstEltSizeInBits))
+ unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
+ unsigned NumSrcElts = CstTy->getVectorNumElements();
+
+ APInt UndefSrcElts(NumSrcElts, 0);
+ SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
+ for (unsigned i = 0; i != NumSrcElts; ++i)
+ if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
+ UndefSrcElts, i))
return false;
- return SplitBitData();
+ return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract constant bits from a broadcasted constant pool scalar.
if (Op.getOpcode() == X86ISD::VBROADCAST &&
- EltSizeInBits <= SrcEltSizeInBits) {
+ EltSizeInBits <= VT.getScalarSizeInBits()) {
if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
- APInt Bits(SizeInBits, 0);
- APInt Undefs(SizeInBits, 0);
- if (CollectConstantBits(Broadcast, Bits, Undefs, 0)) {
- for (unsigned i = 0; i != NumSrcElts; ++i) {
- MaskBits |= Bits.shl(i * SrcEltSizeInBits);
- UndefBits |= Undefs.shl(i * SrcEltSizeInBits);
- }
- return SplitBitData();
+ unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
+ unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
+
+ APInt UndefSrcElts(NumSrcElts, 0);
+ SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
+ if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
+ if (UndefSrcElts[0])
+ UndefSrcElts.setBits(0, NumSrcElts);
+ SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
+ return CastBitData(UndefSrcElts, SrcEltBits);
}
}
}
@@ -5426,10 +5450,15 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
+ unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
+ unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
+
+ APInt UndefSrcElts(NumSrcElts, 0);
+ SmallVector<APInt, 64> SrcEltBits;
auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
- MaskBits = CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
- MaskBits = MaskBits.zext(SizeInBits);
- return SplitBitData();
+ SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
+ SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
+ return CastBitData(UndefSrcElts, SrcEltBits);
}
return false;
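
The hunks above recast constant data between element widths by packing the
source elements into one flat bit buffer and re-slicing it at the target
width. A minimal standalone sketch of that pack-then-split step, assuming
LLVM's APInt API; the helper name recastElts is hypothetical and not part of
this patch:

// Hypothetical sketch: recast constants from one element width to another
// by packing them into a single APInt and slicing it back apart.
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

static SmallVector<APInt, 8> recastElts(ArrayRef<APInt> SrcElts,
                                        unsigned DstEltSizeInBits) {
  unsigned SrcEltSizeInBits = SrcElts[0].getBitWidth();
  unsigned SizeInBits = SrcEltSizeInBits * SrcElts.size();
  // Pack all source elements into one flat bitset (element 0 in the low bits).
  APInt Bits(SizeInBits, 0);
  for (unsigned i = 0; i != SrcElts.size(); ++i)
    Bits.insertBits(SrcElts[i], i * SrcEltSizeInBits);
  // Split the flat bitset back into elements of the destination width.
  SmallVector<APInt, 8> DstElts;
  for (unsigned i = 0; i != SizeInBits / DstEltSizeInBits; ++i)
    DstElts.push_back(Bits.extractBits(DstEltSizeInBits, i * DstEltSizeInBits));
  return DstElts;
}
// e.g. two i64 constants become four i32 elements, low bits first.
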
@@ -6491,16 +6520,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
SDValue NewLd =
DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
-
- if (LDBase->hasAnyUseOfValue(1)) {
- SDValue NewChain =
- DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
- SDValue(NewLd.getNode(), 1));
- DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
- DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
- SDValue(NewLd.getNode(), 1));
- }
-
+ DAG.makeEquivalentMemoryOrdering(LDBase, NewLd);
return NewLd;
};
@@ -6565,19 +6585,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
LDBase->getAlignment(),
false/*isVolatile*/, true/*ReadMem*/,
false/*WriteMem*/);
-
- // Make sure the newly-created LOAD is in the same position as LDBase in
- // terms of dependency. We create a TokenFactor for LDBase and ResNode,
- // and update uses of LDBase's output chain to use the TokenFactor.
- if (LDBase->hasAnyUseOfValue(1)) {
- SDValue NewChain =
- DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
- SDValue(ResNode.getNode(), 1));
- DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
- DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
- SDValue(ResNode.getNode(), 1));
- }
-
+ DAG.makeEquivalentMemoryOrdering(LDBase, ResNode);
return DAG.getBitcast(VT, ResNode);
}
}
@@ -9930,17 +9938,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
DAG.getMachineFunction().getMachineMemOperand(
Ld->getMemOperand(), Offset, SVT.getStoreSize()));
-
- // Make sure the newly-created LOAD is in the same position as Ld in
- // terms of dependency. We create a TokenFactor for Ld and V,
- // and update uses of Ld's output chain to use the TokenFactor.
- if (Ld->hasAnyUseOfValue(1)) {
- SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
- SDValue(Ld, 1), SDValue(V.getNode(), 1));
- DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
- DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
- SDValue(V.getNode(), 1));
- }
+ DAG.makeEquivalentMemoryOrdering(Ld, V);
} else if (!BroadcastFromReg) {
// We can't broadcast from a vector register.
return SDValue();
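
The three hunks above replace the same hand-rolled TokenFactor surgery with a
single call to SelectionDAG::makeEquivalentMemoryOrdering. Judging only from
the deleted code, the helper presumably centralizes this pattern; a sketch
under that assumption (the function body below is inferred from the deleted
hunks, not quoted from the patch):

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Make users of OldLoad's output chain also depend on NewMemOp, so the new
// memory operation keeps OldLoad's position in the dependency graph.
static SDValue makeEquivalentMemoryOrderingSketch(SelectionDAG &DAG,
                                                  const SDLoc &DL,
                                                  LoadSDNode *OldLoad,
                                                  SDValue NewMemOp) {
  SDValue OldChain = SDValue(OldLoad, 1);
  SDValue NewChain = SDValue(NewMemOp.getNode(), 1);
  SDValue TokenFactor =
      DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OldChain, NewChain);
  DAG.ReplaceAllUsesOfValueWith(OldChain, TokenFactor);
  // The RAUW above also rewrote the TokenFactor's own operand; restore it so
  // the node doesn't end up depending on itself.
  DAG.UpdateNodeOperands(TokenFactor.getNode(), OldChain, NewChain);
  return TokenFactor;
}
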
@@ -10891,9 +10889,10 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
"We need to be changing the number of flipped inputs!");
int PSHUFHalfMask[] = {0, 1, 2, 3};
std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
- V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
- MVT::v8i16, V,
- getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
+ V = DAG.getNode(
+ FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
+ MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
+ getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
for (int &M : Mask)
if (M >= 0 && M == FixIdx)
@@ -12007,18 +12006,22 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
// subvector.
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
- // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
+ // With AVX2, use VPERMQ/VPERMPD to allow memory folding.
if (Subtarget.hasAVX2() && V2.isUndef())
return SDValue();
- MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
- VT.getVectorNumElements() / 2);
- SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
- DAG.getIntPtrConstant(0, DL));
- SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
- OnlyUsesV1 ? V1 : V2,
- DAG.getIntPtrConstant(0, DL));
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+ // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
+ // this will likely become vinsertf128 which can't fold a 256-bit memop.
+ if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
+ VT.getVectorNumElements() / 2);
+ SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+ OnlyUsesV1 ? V1 : V2,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+ }
}
}
@@ -19117,7 +19120,7 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
if (Op.getOpcode() == X86ISD::FSETCCM ||
- Op.getOpcode() == X86ISD::FSETCCM_RND)
+ Op.getOpcode() == X86ISD::FSETCCM_RND)
return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
if (Op.getOpcode() == X86ISD::VFPCLASSS)
return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
@@ -27968,28 +27971,45 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
OpMask.size() % RootMask.size() == 0) ||
OpMask.size() == RootMask.size()) &&
"The smaller number of elements must divide the larger.");
- int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
- int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
- int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
- assert(((RootRatio == 1 && OpRatio == 1) ||
- (RootRatio == 1) != (OpRatio == 1)) &&
+
+  // This function can be performance-critical, so we rely on the mask sizes
+  // being powers of 2 to replace div/rem ops with bit-masks and shifts.
+ assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
+ assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
+ unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
+ unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
+
+ unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
+ unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
+ unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
+ assert((RootRatio == 1 || OpRatio == 1) &&
"Must not have a ratio for both incoming and op masks!");
- SmallVector<int, 64> Mask((unsigned)MaskWidth, SM_SentinelUndef);
+ assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
+ assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
+ assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
+ unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
+ unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
+
+ SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
// Merge this shuffle operation's mask into our accumulated mask. Note that
// this shuffle's mask will be the first applied to the input, followed by the
// root mask to get us all the way to the root value arrangement. The reason
// for this order is that we are recursing up the operation chain.
- for (int i = 0; i < MaskWidth; ++i) {
- int RootIdx = i / RootRatio;
+ for (unsigned i = 0; i < MaskWidth; ++i) {
+ unsigned RootIdx = i >> RootRatioLog2;
if (RootMask[RootIdx] < 0) {
      // This is a zero or undef lane; we're done.
Mask[i] = RootMask[RootIdx];
continue;
}
- int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
+ unsigned RootMaskedIdx =
+ RootRatio == 1
+ ? RootMask[RootIdx]
+ : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
// Just insert the scaled root mask value if it references an input other
// than the SrcOp we're currently inserting.
@@ -27999,9 +28019,8 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
continue;
}
- RootMaskedIdx %= MaskWidth;
-
- int OpIdx = RootMaskedIdx / OpRatio;
+ RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
+ unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
if (OpMask[OpIdx] < 0) {
      // The incoming lanes are zero or undef; it doesn't matter which ones we
// are using.
@@ -28010,9 +28029,12 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
}
    // OK, we have non-zero lanes; map them through to one of the Op's inputs.
- int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
- OpMaskedIdx %= MaskWidth;
+ unsigned OpMaskedIdx =
+ OpRatio == 1
+ ? OpMask[OpIdx]
+ : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
+ OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
if (OpMask[OpIdx] < (int)OpMask.size()) {
assert(0 <= InputIdx0 && "Unknown target shuffle input");
OpMaskedIdx += InputIdx0 * MaskWidth;
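
The index arithmetic above is the standard power-of-2 strength reduction:
for a power-of-2 N, i / N == i >> log2(N) and i % N == i & (N - 1). A tiny
standalone illustration (the helper name splitIndex is hypothetical):

#include "llvm/Support/MathExtras.h"
#include <cassert>
using namespace llvm;

// Split an index into quotient and remainder by a power-of-2 divisor without
// emitting div/rem, exactly as the shuffle-mask loop above does.
static void splitIndex(unsigned i, unsigned N, unsigned &Quot, unsigned &Rem) {
  assert(isPowerOf2_32(N) && "strength reduction requires a power-of-2 divisor");
  unsigned Log2N = countTrailingZeros(N);
  Quot = i >> Log2N;   // i / N
  Rem = i & (N - 1);   // i % N
}
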
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index d8702693884d..2620679df251 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -1631,6 +1631,7 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2)))))],
IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+ let isCommutable = IsCommutable in
def rrk : AVX512BI<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
@@ -1764,6 +1765,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
(_.VT (bitconvert (_.LdFrag addr:$src2))),
imm:$cc))],
IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+ let isCommutable = 1 in
def rrik : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
AVX512ICC:$cc),
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 5224a16613cb..c28b35b22977 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -737,19 +737,15 @@ def alignedloadv8f64 : PatFrag<(ops node:$ptr),
def alignedloadv8i64 : PatFrag<(ops node:$ptr),
(v8i64 (alignedload512 node:$ptr))>;
-// Like 'load', but uses special alignment checks suitable for use in
+// Like 'vec128load', but uses special alignment checks suitable for use in
// memory operands in most SSE instructions, which are required to
// be naturally aligned on some targets but not on others. If the subtarget
// allows unaligned accesses, match any load, though this may require
// setting a feature bit in the processor (on startup, for example).
// Opteron 10h and later implement such a feature.
-// Avoid non-temporal aligned loads on supported targets.
-def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return (Subtarget->hasSSEUnalignedMem() ||
- cast<LoadSDNode>(N)->getAlignment() >= 16) &&
- (!Subtarget->hasSSE41() ||
- !(cast<LoadSDNode>(N)->getAlignment() >= 16 &&
- cast<LoadSDNode>(N)->isNonTemporal()));
+def memop : PatFrag<(ops node:$ptr), (vec128load node:$ptr), [{
+ return Subtarget->hasSSEUnalignedMem() ||
+ cast<LoadSDNode>(N)->getAlignment() >= 16;
}]>;
// 128-bit memop pattern fragments
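
Restated outside TableGen, the rewritten 'memop' predicate reduces to a
single alignment check; a hedged paraphrase in plain C++, not additional
patch code (HasSSEUnalignedMem stands in for Subtarget->hasSSEUnalignedMem()):

// Accept a 128-bit load as an SSE memory operand if the subtarget tolerates
// unaligned vector loads or the load is 16-byte aligned.
static bool memopMatches(bool HasSSEUnalignedMem, unsigned LoadAlignment) {
  return HasSSEUnalignedMem || LoadAlignment >= 16;
}
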
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index ff5d90c4e78b..f3094b781c49 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -898,10 +898,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPABSDZrr, X86::VPABSDZrm, 0 },
{ X86::VPABSQZrr, X86::VPABSQZrm, 0 },
{ X86::VPABSWZrr, X86::VPABSWZrm, 0 },
+ { X86::VPCONFLICTDZrr, X86::VPCONFLICTDZrm, 0 },
+ { X86::VPCONFLICTQZrr, X86::VPCONFLICTQZrm, 0 },
{ X86::VPERMILPDZri, X86::VPERMILPDZmi, 0 },
{ X86::VPERMILPSZri, X86::VPERMILPSZmi, 0 },
{ X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
{ X86::VPERMQZri, X86::VPERMQZmi, 0 },
+ { X86::VPLZCNTDZrr, X86::VPLZCNTDZrm, 0 },
+ { X86::VPLZCNTQZrr, X86::VPLZCNTQZrm, 0 },
{ X86::VPMOVSXBDZrr, X86::VPMOVSXBDZrm, 0 },
{ X86::VPMOVSXBQZrr, X86::VPMOVSXBQZrm, TB_NO_REVERSE },
{ X86::VPMOVSXBWZrr, X86::VPMOVSXBWZrm, 0 },
@@ -948,10 +952,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPABSDZ256rr, X86::VPABSDZ256rm, 0 },
{ X86::VPABSQZ256rr, X86::VPABSQZ256rm, 0 },
{ X86::VPABSWZ256rr, X86::VPABSWZ256rm, 0 },
+ { X86::VPCONFLICTDZ256rr, X86::VPCONFLICTDZ256rm, 0 },
+ { X86::VPCONFLICTQZ256rr, X86::VPCONFLICTQZ256rm, 0 },
{ X86::VPERMILPDZ256ri, X86::VPERMILPDZ256mi, 0 },
{ X86::VPERMILPSZ256ri, X86::VPERMILPSZ256mi, 0 },
{ X86::VPERMPDZ256ri, X86::VPERMPDZ256mi, 0 },
{ X86::VPERMQZ256ri, X86::VPERMQZ256mi, 0 },
+ { X86::VPLZCNTDZ256rr, X86::VPLZCNTDZ256rm, 0 },
+ { X86::VPLZCNTQZ256rr, X86::VPLZCNTQZ256rm, 0 },
{ X86::VPMOVSXBDZ256rr, X86::VPMOVSXBDZ256rm, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ256rr, X86::VPMOVSXBQZ256rm, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ256rr, X86::VPMOVSXBWZ256rm, 0 },
@@ -995,8 +1003,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPABSDZ128rr, X86::VPABSDZ128rm, 0 },
{ X86::VPABSQZ128rr, X86::VPABSQZ128rm, 0 },
{ X86::VPABSWZ128rr, X86::VPABSWZ128rm, 0 },
+ { X86::VPCONFLICTDZ128rr, X86::VPCONFLICTDZ128rm, 0 },
+ { X86::VPCONFLICTQZ128rr, X86::VPCONFLICTQZ128rm, 0 },
{ X86::VPERMILPDZ128ri, X86::VPERMILPDZ128mi, 0 },
{ X86::VPERMILPSZ128ri, X86::VPERMILPSZ128mi, 0 },
+ { X86::VPLZCNTDZ128rr, X86::VPLZCNTDZ128rm, 0 },
+ { X86::VPLZCNTQZ128rr, X86::VPLZCNTQZ128rm, 0 },
{ X86::VPMOVSXBDZ128rr, X86::VPMOVSXBDZ128rm, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ128rr, X86::VPMOVSXBQZ128rm, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ128rr, X86::VPMOVSXBWZ128rm, TB_NO_REVERSE },
@@ -2312,10 +2324,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPABSDZrrkz, X86::VPABSDZrmkz, 0 },
{ X86::VPABSQZrrkz, X86::VPABSQZrmkz, 0 },
{ X86::VPABSWZrrkz, X86::VPABSWZrmkz, 0 },
+ { X86::VPCONFLICTDZrrkz, X86::VPCONFLICTDZrmkz, 0 },
+ { X86::VPCONFLICTQZrrkz, X86::VPCONFLICTQZrmkz, 0 },
{ X86::VPERMILPDZrikz, X86::VPERMILPDZmikz, 0 },
{ X86::VPERMILPSZrikz, X86::VPERMILPSZmikz, 0 },
{ X86::VPERMPDZrikz, X86::VPERMPDZmikz, 0 },
{ X86::VPERMQZrikz, X86::VPERMQZmikz, 0 },
+ { X86::VPLZCNTDZrrkz, X86::VPLZCNTDZrmkz, 0 },
+ { X86::VPLZCNTQZrrkz, X86::VPLZCNTQZrmkz, 0 },
{ X86::VPMOVSXBDZrrkz, X86::VPMOVSXBDZrmkz, 0 },
{ X86::VPMOVSXBQZrrkz, X86::VPMOVSXBQZrmkz, TB_NO_REVERSE },
{ X86::VPMOVSXBWZrrkz, X86::VPMOVSXBWZrmkz, 0 },
@@ -2350,10 +2366,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPABSDZ256rrkz, X86::VPABSDZ256rmkz, 0 },
{ X86::VPABSQZ256rrkz, X86::VPABSQZ256rmkz, 0 },
{ X86::VPABSWZ256rrkz, X86::VPABSWZ256rmkz, 0 },
+ { X86::VPCONFLICTDZ256rrkz, X86::VPCONFLICTDZ256rmkz, 0 },
+ { X86::VPCONFLICTQZ256rrkz, X86::VPCONFLICTQZ256rmkz, 0 },
{ X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz, 0 },
{ X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz, 0 },
{ X86::VPERMPDZ256rikz, X86::VPERMPDZ256mikz, 0 },
{ X86::VPERMQZ256rikz, X86::VPERMQZ256mikz, 0 },
+ { X86::VPLZCNTDZ256rrkz, X86::VPLZCNTDZ256rmkz, 0 },
+ { X86::VPLZCNTQZ256rrkz, X86::VPLZCNTQZ256rmkz, 0 },
{ X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ256rrkz, X86::VPMOVSXBQZ256rmkz, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ256rrkz, X86::VPMOVSXBWZ256rmkz, 0 },
@@ -2385,8 +2405,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPABSDZ128rrkz, X86::VPABSDZ128rmkz, 0 },
{ X86::VPABSQZ128rrkz, X86::VPABSQZ128rmkz, 0 },
{ X86::VPABSWZ128rrkz, X86::VPABSWZ128rmkz, 0 },
+ { X86::VPCONFLICTDZ128rrkz, X86::VPCONFLICTDZ128rmkz, 0 },
+ { X86::VPCONFLICTQZ128rrkz, X86::VPCONFLICTQZ128rmkz, 0 },
{ X86::VPERMILPDZ128rikz, X86::VPERMILPDZ128mikz, 0 },
{ X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz, 0 },
+ { X86::VPLZCNTDZ128rrkz, X86::VPLZCNTDZ128rmkz, 0 },
+ { X86::VPLZCNTQZ128rrkz, X86::VPLZCNTQZ128rmkz, 0 },
{ X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ128rrkz, X86::VPMOVSXBQZ128rmkz, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ128rrkz, X86::VPMOVSXBWZ128rmkz, TB_NO_REVERSE },
@@ -2935,10 +2959,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPABSDZrrk, X86::VPABSDZrmk, 0 },
{ X86::VPABSQZrrk, X86::VPABSQZrmk, 0 },
{ X86::VPABSWZrrk, X86::VPABSWZrmk, 0 },
+ { X86::VPCONFLICTDZrrk, X86::VPCONFLICTDZrmk, 0 },
+ { X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmk, 0 },
{ X86::VPERMILPDZrik, X86::VPERMILPDZmik, 0 },
{ X86::VPERMILPSZrik, X86::VPERMILPSZmik, 0 },
{ X86::VPERMPDZrik, X86::VPERMPDZmik, 0 },
{ X86::VPERMQZrik, X86::VPERMQZmik, 0 },
+ { X86::VPLZCNTDZrrk, X86::VPLZCNTDZrmk, 0 },
+ { X86::VPLZCNTQZrrk, X86::VPLZCNTQZrmk, 0 },
{ X86::VPMOVSXBDZrrk, X86::VPMOVSXBDZrmk, 0 },
{ X86::VPMOVSXBQZrrk, X86::VPMOVSXBQZrmk, TB_NO_REVERSE },
{ X86::VPMOVSXBWZrrk, X86::VPMOVSXBWZrmk, 0 },
@@ -2973,10 +3001,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPABSDZ256rrk, X86::VPABSDZ256rmk, 0 },
{ X86::VPABSQZ256rrk, X86::VPABSQZ256rmk, 0 },
{ X86::VPABSWZ256rrk, X86::VPABSWZ256rmk, 0 },
+ { X86::VPCONFLICTDZ256rrk, X86::VPCONFLICTDZ256rmk, 0 },
+ { X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmk, 0 },
{ X86::VPERMILPDZ256rik, X86::VPERMILPDZ256mik, 0 },
{ X86::VPERMILPSZ256rik, X86::VPERMILPSZ256mik, 0 },
{ X86::VPERMPDZ256rik, X86::VPERMPDZ256mik, 0 },
{ X86::VPERMQZ256rik, X86::VPERMQZ256mik, 0 },
+ { X86::VPLZCNTDZ256rrk, X86::VPLZCNTDZ256rmk, 0 },
+ { X86::VPLZCNTQZ256rrk, X86::VPLZCNTQZ256rmk, 0 },
{ X86::VPMOVSXBDZ256rrk, X86::VPMOVSXBDZ256rmk, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ256rrk, X86::VPMOVSXBQZ256rmk, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ256rrk, X86::VPMOVSXBWZ256rmk, 0 },
@@ -3008,8 +3040,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPABSDZ128rrk, X86::VPABSDZ128rmk, 0 },
{ X86::VPABSQZ128rrk, X86::VPABSQZ128rmk, 0 },
{ X86::VPABSWZ128rrk, X86::VPABSWZ128rmk, 0 },
+ { X86::VPCONFLICTDZ128rrk, X86::VPCONFLICTDZ128rmk, 0 },
+ { X86::VPCONFLICTQZ128rrk, X86::VPCONFLICTQZ128rmk, 0 },
{ X86::VPERMILPDZ128rik, X86::VPERMILPDZ128mik, 0 },
{ X86::VPERMILPSZ128rik, X86::VPERMILPSZ128mik, 0 },
+ { X86::VPLZCNTDZ128rrk, X86::VPLZCNTDZ128rmk, 0 },
+ { X86::VPLZCNTQZ128rrk, X86::VPLZCNTQZ128rmk, 0 },
{ X86::VPMOVSXBDZ128rrk, X86::VPMOVSXBDZ128rmk, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ128rrk, X86::VPMOVSXBQZ128rmk, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ128rrk, X86::VPMOVSXBWZ128rmk, TB_NO_REVERSE },
@@ -3034,6 +3070,64 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSRLDZ128rik, X86::VPSRLDZ128mik, 0 },
{ X86::VPSRLQZ128rik, X86::VPSRLQZ128mik, 0 },
{ X86::VPSRLWZ128rik, X86::VPSRLWZ128mik, 0 },
+
+ // AVX-512 masked compare instructions
+ { X86::VCMPPDZ128rrik, X86::VCMPPDZ128rmik, 0 },
+ { X86::VCMPPSZ128rrik, X86::VCMPPSZ128rmik, 0 },
+ { X86::VCMPPDZ256rrik, X86::VCMPPDZ256rmik, 0 },
+ { X86::VCMPPSZ256rrik, X86::VCMPPSZ256rmik, 0 },
+ { X86::VCMPPDZrrik, X86::VCMPPDZrmik, 0 },
+ { X86::VCMPPSZrrik, X86::VCMPPSZrmik, 0 },
+ { X86::VCMPSDZrr_Intk, X86::VCMPSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VCMPSSZrr_Intk, X86::VCMPSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VPCMPBZ128rrik, X86::VPCMPBZ128rmik, 0 },
+ { X86::VPCMPBZ256rrik, X86::VPCMPBZ256rmik, 0 },
+ { X86::VPCMPBZrrik, X86::VPCMPBZrmik, 0 },
+ { X86::VPCMPDZ128rrik, X86::VPCMPDZ128rmik, 0 },
+ { X86::VPCMPDZ256rrik, X86::VPCMPDZ256rmik, 0 },
+ { X86::VPCMPDZrrik, X86::VPCMPDZrmik, 0 },
+ { X86::VPCMPEQBZ128rrk, X86::VPCMPEQBZ128rmk, 0 },
+ { X86::VPCMPEQBZ256rrk, X86::VPCMPEQBZ256rmk, 0 },
+ { X86::VPCMPEQBZrrk, X86::VPCMPEQBZrmk, 0 },
+ { X86::VPCMPEQDZ128rrk, X86::VPCMPEQDZ128rmk, 0 },
+ { X86::VPCMPEQDZ256rrk, X86::VPCMPEQDZ256rmk, 0 },
+ { X86::VPCMPEQDZrrk, X86::VPCMPEQDZrmk, 0 },
+ { X86::VPCMPEQQZ128rrk, X86::VPCMPEQQZ128rmk, 0 },
+ { X86::VPCMPEQQZ256rrk, X86::VPCMPEQQZ256rmk, 0 },
+ { X86::VPCMPEQQZrrk, X86::VPCMPEQQZrmk, 0 },
+ { X86::VPCMPEQWZ128rrk, X86::VPCMPEQWZ128rmk, 0 },
+ { X86::VPCMPEQWZ256rrk, X86::VPCMPEQWZ256rmk, 0 },
+ { X86::VPCMPEQWZrrk, X86::VPCMPEQWZrmk, 0 },
+ { X86::VPCMPGTBZ128rrk, X86::VPCMPGTBZ128rmk, 0 },
+ { X86::VPCMPGTBZ256rrk, X86::VPCMPGTBZ256rmk, 0 },
+ { X86::VPCMPGTBZrrk, X86::VPCMPGTBZrmk, 0 },
+ { X86::VPCMPGTDZ128rrk, X86::VPCMPGTDZ128rmk, 0 },
+ { X86::VPCMPGTDZ256rrk, X86::VPCMPGTDZ256rmk, 0 },
+ { X86::VPCMPGTDZrrk, X86::VPCMPGTDZrmk, 0 },
+ { X86::VPCMPGTQZ128rrk, X86::VPCMPGTQZ128rmk, 0 },
+ { X86::VPCMPGTQZ256rrk, X86::VPCMPGTQZ256rmk, 0 },
+ { X86::VPCMPGTQZrrk, X86::VPCMPGTQZrmk, 0 },
+ { X86::VPCMPGTWZ128rrk, X86::VPCMPGTWZ128rmk, 0 },
+ { X86::VPCMPGTWZ256rrk, X86::VPCMPGTWZ256rmk, 0 },
+ { X86::VPCMPGTWZrrk, X86::VPCMPGTWZrmk, 0 },
+ { X86::VPCMPQZ128rrik, X86::VPCMPQZ128rmik, 0 },
+ { X86::VPCMPQZ256rrik, X86::VPCMPQZ256rmik, 0 },
+ { X86::VPCMPQZrrik, X86::VPCMPQZrmik, 0 },
+ { X86::VPCMPUBZ128rrik, X86::VPCMPUBZ128rmik, 0 },
+ { X86::VPCMPUBZ256rrik, X86::VPCMPUBZ256rmik, 0 },
+ { X86::VPCMPUBZrrik, X86::VPCMPUBZrmik, 0 },
+ { X86::VPCMPUDZ128rrik, X86::VPCMPUDZ128rmik, 0 },
+ { X86::VPCMPUDZ256rrik, X86::VPCMPUDZ256rmik, 0 },
+ { X86::VPCMPUDZrrik, X86::VPCMPUDZrmik, 0 },
+ { X86::VPCMPUQZ128rrik, X86::VPCMPUQZ128rmik, 0 },
+ { X86::VPCMPUQZ256rrik, X86::VPCMPUQZ256rmik, 0 },
+ { X86::VPCMPUQZrrik, X86::VPCMPUQZrmik, 0 },
+ { X86::VPCMPUWZ128rrik, X86::VPCMPUWZ128rmik, 0 },
+ { X86::VPCMPUWZ256rrik, X86::VPCMPUWZ256rmik, 0 },
+ { X86::VPCMPUWZrrik, X86::VPCMPUWZrmik, 0 },
+ { X86::VPCMPWZ128rrik, X86::VPCMPWZ128rmik, 0 },
+ { X86::VPCMPWZ256rrik, X86::VPCMPWZ256rmik, 0 },
+ { X86::VPCMPWZrrik, X86::VPCMPWZrmik, 0 },
};
for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) {
@@ -5136,20 +5230,32 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return nullptr;
}
}
- case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri:
- case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri:
- case X86::VPCMPBZrri: case X86::VPCMPUBZrri:
- case X86::VPCMPDZ128rri: case X86::VPCMPUDZ128rri:
- case X86::VPCMPDZ256rri: case X86::VPCMPUDZ256rri:
- case X86::VPCMPDZrri: case X86::VPCMPUDZrri:
- case X86::VPCMPQZ128rri: case X86::VPCMPUQZ128rri:
- case X86::VPCMPQZ256rri: case X86::VPCMPUQZ256rri:
- case X86::VPCMPQZrri: case X86::VPCMPUQZrri:
- case X86::VPCMPWZ128rri: case X86::VPCMPUWZ128rri:
- case X86::VPCMPWZ256rri: case X86::VPCMPUWZ256rri:
- case X86::VPCMPWZrri: case X86::VPCMPUWZrri: {
+ case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri:
+ case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri:
+ case X86::VPCMPBZrri: case X86::VPCMPUBZrri:
+ case X86::VPCMPDZ128rri: case X86::VPCMPUDZ128rri:
+ case X86::VPCMPDZ256rri: case X86::VPCMPUDZ256rri:
+ case X86::VPCMPDZrri: case X86::VPCMPUDZrri:
+ case X86::VPCMPQZ128rri: case X86::VPCMPUQZ128rri:
+ case X86::VPCMPQZ256rri: case X86::VPCMPUQZ256rri:
+ case X86::VPCMPQZrri: case X86::VPCMPUQZrri:
+ case X86::VPCMPWZ128rri: case X86::VPCMPUWZ128rri:
+ case X86::VPCMPWZ256rri: case X86::VPCMPUWZ256rri:
+ case X86::VPCMPWZrri: case X86::VPCMPUWZrri:
+ case X86::VPCMPBZ128rrik: case X86::VPCMPUBZ128rrik:
+ case X86::VPCMPBZ256rrik: case X86::VPCMPUBZ256rrik:
+ case X86::VPCMPBZrrik: case X86::VPCMPUBZrrik:
+ case X86::VPCMPDZ128rrik: case X86::VPCMPUDZ128rrik:
+ case X86::VPCMPDZ256rrik: case X86::VPCMPUDZ256rrik:
+ case X86::VPCMPDZrrik: case X86::VPCMPUDZrrik:
+ case X86::VPCMPQZ128rrik: case X86::VPCMPUQZ128rrik:
+ case X86::VPCMPQZ256rrik: case X86::VPCMPUQZ256rrik:
+ case X86::VPCMPQZrrik: case X86::VPCMPUQZrrik:
+ case X86::VPCMPWZ128rrik: case X86::VPCMPUWZ128rrik:
+ case X86::VPCMPWZ256rrik: case X86::VPCMPUWZ256rrik:
+ case X86::VPCMPWZrrik: case X86::VPCMPUWZrrik: {
// Flip comparison mode immediate (if necessary).
- unsigned Imm = MI.getOperand(3).getImm() & 0x7;
+ unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7;
switch (Imm) {
default: llvm_unreachable("Unreachable!");
case 0x01: Imm = 0x06; break; // LT -> NLE
@@ -5163,7 +5269,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
break;
}
auto &WorkingMI = cloneIfNew(MI);
- WorkingMI.getOperand(3).setImm(Imm);
+ WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
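
For context: the AVX-512 VPCMP immediate encodes EQ(0), LT(1), LE(2),
FALSE(3), NE(4), NLT(5), NLE(6), TRUE(7), so commuting the sources swaps
LT<->NLE and LE<->NLT while the symmetric predicates stay put. The hunk above
only shows the LT case; the full mapping below follows from those encodings
rather than from the visible diff, so treat it as a sketch:

// Flip a VPCMP predicate immediate when the two source operands are swapped.
static unsigned commuteVPCMPImm(unsigned Imm) {
  switch (Imm & 0x7) {
  case 0x01: return 0x06; // LT  -> NLE  (a <  b  ==  b >  a)
  case 0x02: return 0x05; // LE  -> NLT  (a <= b  ==  b >= a)
  case 0x05: return 0x02; // NLT -> LE
  case 0x06: return 0x01; // NLE -> LT
  default:   return Imm & 0x7; // EQ, NE, FALSE, TRUE are symmetric.
  }
}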