-rw-r--r--  docs/ReleaseNotes.rst | 38
-rw-r--r--  include/llvm/IR/IntrinsicsAMDGPU.td | 20
-rw-r--r--  include/llvm/IR/IntrinsicsX86.td | 9
-rw-r--r--  include/llvm/MC/MCAsmMacro.h | 38
-rw-r--r--  include/llvm/MC/MCContext.h | 15
-rw-r--r--  include/llvm/Support/GenericDomTreeConstruction.h | 32
-rw-r--r--  lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp | 3
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 3
-rw-r--r--  lib/CodeGen/LivePhysRegs.cpp | 31
-rw-r--r--  lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4
-rw-r--r--  lib/CodeGen/SplitKit.cpp | 53
-rw-r--r--  lib/CodeGen/SplitKit.h | 6
-rw-r--r--  lib/IR/AutoUpgrade.cpp | 7
-rw-r--r--  lib/MC/MCParser/AsmParser.cpp | 56
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelLowering.h | 4
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstrInfo.cpp | 18
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstrInfo.h | 2
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstrInfo.td | 8
-rw-r--r--  lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.cpp | 52
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 18
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 1
-rw-r--r--  lib/Target/AMDGPU/VOP2Instructions.td | 8
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp | 10
-rw-r--r--  lib/Target/X86/X86AsmPrinter.cpp | 11
-rw-r--r--  lib/Target/X86/X86DomainReassignment.cpp | 12
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp | 217
-rw-r--r--  lib/Target/X86/X86IntrinsicsInfo.h | 5
-rw-r--r--  lib/Target/X86/X86RetpolineThunks.cpp | 68
-rw-r--r--  lib/Transforms/InstCombine/InstCombineCalls.cpp | 12
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll | 84
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll | 84
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll | 164
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll | 164
-rw-r--r--  test/CodeGen/ARM/pr25838.ll | 2
-rw-r--r--  test/CodeGen/ARM/splitkit.ll | 245
-rw-r--r--  test/CodeGen/Thumb/stm-scavenging.ll | 46
-rw-r--r--  test/CodeGen/X86/avx512-intrinsics-fast-isel.ll | 53
-rw-r--r--  test/CodeGen/X86/avx512-intrinsics-upgrade.ll | 14
-rw-r--r--  test/CodeGen/X86/avx512-intrinsics.ll | 15
-rw-r--r--  test/CodeGen/X86/avx512-mask-op.ll | 96
-rw-r--r--  test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll | 371
-rw-r--r--  test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll | 40
-rw-r--r--  test/CodeGen/X86/avx512bw-intrinsics.ll | 49
-rw-r--r--  test/CodeGen/X86/domain-reassignment.mir | 439
-rw-r--r--  test/CodeGen/X86/inline-asm-modifier-V.ll | 14
-rw-r--r--  test/CodeGen/X86/pr36199.ll | 22
-rw-r--r--  test/CodeGen/X86/retpoline-external.ll | 48
-rw-r--r--  test/CodeGen/X86/retpoline-regparm.ll | 42
-rw-r--r--  test/CodeGen/X86/retpoline.ll | 14
-rw-r--r--  test/DebugInfo/X86/void-typedef.ll | 88
-rw-r--r--  test/MC/AsmParser/inline_macro_duplication.ll | 8
-rw-r--r--  test/MC/X86/x86-64.s | 5
-rw-r--r--  test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll | 108
55 files changed, 2116 insertions, 866 deletions
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index c28a3829bfee..949ec85c270b 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -71,6 +71,13 @@ Non-comprehensive list of changes in this release
Changes to the LLVM IR
----------------------
+Changes to the AArch64 Target
+-----------------------------
+
+During this release the AArch64 target has:
+
+ * Enabled the new GlobalISel instruction selection framework by default at ``-O0``.
+
Changes to the ARM Target
-------------------------
@@ -80,6 +87,28 @@ During this release the ARM target has:
isn't the default.
+Changes to the Hexagon Target
+-----------------------------
+
+* The Hexagon backend now supports V65 ISA.
+
+* The ``-mhvx`` option now takes an optional value that specifies the ISA
+ version of the HVX coprocessor. The available values are v60, v62 and v65.
+ By default, the value is set to be the same as the CPU version.
+
+* The compiler option ``-mhvx-double`` is deprecated and will be removed in
+  the next release of the compiler. Programmers should use the ``-mhvx-length``
+ option to specify the desired vector length: ``-mhvx-length=64b`` for
+ 64-byte vectors and ``-mhvx-length=128b`` for 128-byte vectors. While the
+ current default vector length is 64 bytes, users should always specify the
+ length explicitly, since the default value may change in the future.
+
+* The target feature ``hvx-double`` is deprecated and will be removed in the
+ next release. LLVM IR generators should use target features ``hvx-length64b``
+ and ``hvx-length128b`` to indicate the vector length. The length should
+ always be specified when HVX code generation is enabled.
+
+
Changes to the MIPS Target
--------------------------
@@ -91,6 +120,15 @@ Changes to the PowerPC Target
During this release ...
+Changes to the SystemZ Target
+-----------------------------
+
+During this release the SystemZ target has:
+
+* Added support for 128-bit atomic operations.
+
+* Added support for the "o" constraint for inline asm statements.
+
Changes to the X86 Target
-------------------------
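
As a side illustration of the "128-bit atomic operations" item in the SystemZ section above, a minimal IR sketch (the function name is assumed, not part of this patch) of the kind of operation that note refers to:

  define i128 @cas128(i128* %p, i128 %cmp, i128 %new) {
    ; 128-bit compare-and-swap, as mentioned in the SystemZ release note above.
    %pair = cmpxchg i128* %p, i128 %cmp, i128 %new seq_cst seq_cst
    %old = extractvalue { i128, i1 } %pair, 0
    ret i128 %old
  }
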
diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td
index 22a3a0fe618f..3397fa41db1b 100644
--- a/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -238,6 +238,26 @@ def int_amdgcn_cvt_pkrtz : Intrinsic<
[IntrNoMem, IntrSpeculatable]
>;
+def int_amdgcn_cvt_pknorm_i16 : Intrinsic<
+ [llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable]
+>;
+
+def int_amdgcn_cvt_pknorm_u16 : Intrinsic<
+ [llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable]
+>;
+
+def int_amdgcn_cvt_pk_i16 : Intrinsic<
+ [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable]
+>;
+
+def int_amdgcn_cvt_pk_u16 : Intrinsic<
+ [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable]
+>;
+
def int_amdgcn_class : Intrinsic<
[llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]
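
To illustrate the four packing intrinsics declared above, a minimal IR sketch (the kernel name and values are assumed, not part of this patch); each intrinsic takes two scalars and returns them packed into a <2 x i16>:

  declare <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float, float)
  declare <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32, i32)

  define amdgpu_kernel void @pack_example(<2 x i16> addrspace(1)* %out, float %a, float %b, i32 %x, i32 %y) {
    ; Pack two floats as normalized signed 16-bit values (v_cvt_pknorm_i16_f32).
    %norm = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float %b)
    ; Pack two i32 values into unsigned 16-bit halves (v_cvt_pk_u16_u32).
    %pk = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %y)
    %sum = add <2 x i16> %norm, %pk
    store <2 x i16> %sum, <2 x i16> addrspace(1)* %out
    ret void
  }
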
diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index bd6177c5b3d9..7c000e2b1dc7 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -3738,6 +3738,15 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx512_kxnor_w : // TODO: remove this intrinsic
Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
[IntrNoMem]>;
+ def int_x86_avx512_kunpck_bw : GCCBuiltin<"__builtin_ia32_kunpckhi">,
+ Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
+ [IntrNoMem]>;
+ def int_x86_avx512_kunpck_wd : GCCBuiltin<"__builtin_ia32_kunpcksi">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx512_kunpck_dq : GCCBuiltin<"__builtin_ia32_kunpckdi">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
+ [IntrNoMem]>;
def int_x86_avx512_kortestz_w : GCCBuiltin<"__builtin_ia32_kortestzhi">,
Intrinsic<[llvm_i32_ty], [llvm_i16_ty, llvm_i16_ty],
[IntrNoMem]>;
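
For reference, a minimal IR sketch (the function name is assumed) of calling the reinstated mask-unpack intrinsics; each half of the result comes from the low half of one of the two operands, matching the KUNPCKBW/KUNPCKWD/KUNPCKDQ instructions:

  declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16)
  declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64)

  define i16 @unpack_byte_masks(i16 %a, i16 %b) {
    ; Concatenate the low 8 bits of %a and %b into a single 16-bit mask.
    %r = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a, i16 %b)
    ret i16 %r
  }
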
diff --git a/include/llvm/MC/MCAsmMacro.h b/include/llvm/MC/MCAsmMacro.h
new file mode 100644
index 000000000000..34d14abc9645
--- /dev/null
+++ b/include/llvm/MC/MCAsmMacro.h
@@ -0,0 +1,38 @@
+//===- MCAsmMacro.h - Assembly Macros ---------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MC_MCASMMACRO_H
+#define LLVM_MC_MCASMMACRO_H
+
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+
+namespace llvm {
+
+struct MCAsmMacroParameter {
+ StringRef Name;
+ std::vector<AsmToken> Value;
+ bool Required = false;
+ bool Vararg = false;
+
+ MCAsmMacroParameter() = default;
+};
+
+typedef std::vector<MCAsmMacroParameter> MCAsmMacroParameters;
+struct MCAsmMacro {
+ StringRef Name;
+ StringRef Body;
+ MCAsmMacroParameters Parameters;
+
+public:
+ MCAsmMacro(StringRef N, StringRef B, MCAsmMacroParameters P)
+ : Name(N), Body(B), Parameters(std::move(P)) {}
+};
+}; // namespace llvm
+
+#endif
diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h
index 432fc0ede072..358f67c4db6d 100644
--- a/include/llvm/MC/MCContext.h
+++ b/include/llvm/MC/MCContext.h
@@ -18,6 +18,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/MC/MCAsmMacro.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/SectionKind.h"
@@ -268,6 +269,9 @@ namespace llvm {
unsigned UniqueID,
const MCSymbolELF *Associated);
+ /// \brief Map of currently defined macros.
+ StringMap<MCAsmMacro> MacroMap;
+
public:
explicit MCContext(const MCAsmInfo *MAI, const MCRegisterInfo *MRI,
const MCObjectFileInfo *MOFI,
@@ -618,6 +622,17 @@ namespace llvm {
// FIXME: We should really do something about that.
LLVM_ATTRIBUTE_NORETURN void reportFatalError(SMLoc L,
const Twine &Msg);
+
+ const MCAsmMacro *lookupMacro(StringRef Name) {
+ StringMap<MCAsmMacro>::iterator I = MacroMap.find(Name);
+ return (I == MacroMap.end()) ? nullptr : &I->getValue();
+ }
+
+ void defineMacro(StringRef Name, MCAsmMacro Macro) {
+ MacroMap.insert(std::make_pair(Name, std::move(Macro)));
+ }
+
+ void undefineMacro(StringRef Name) { MacroMap.erase(Name); }
};
} // end namespace llvm
diff --git a/include/llvm/Support/GenericDomTreeConstruction.h b/include/llvm/Support/GenericDomTreeConstruction.h
index 25175fe66aa8..9438c9e08850 100644
--- a/include/llvm/Support/GenericDomTreeConstruction.h
+++ b/include/llvm/Support/GenericDomTreeConstruction.h
@@ -698,24 +698,20 @@ struct SemiNCAInfo {
return;
// Recalculate the set of roots.
- DT.Roots = FindRoots(DT, BUI);
- for (const NodePtr R : DT.Roots) {
- const TreeNodePtr TN = DT.getNode(R);
- // A CFG node was selected as a tree root, but the corresponding tree node
- // is not connected to the virtual root. This is because the incremental
- // algorithm does not really know or use the set of roots and can make a
- // different (implicit) decision about which nodes within an infinite loop
- // becomes a root.
- if (TN && !DT.isVirtualRoot(TN->getIDom())) {
- DEBUG(dbgs() << "Root " << BlockNamePrinter(R)
- << " is not virtual root's child\n"
- << "The entire tree needs to be rebuilt\n");
- // It should be possible to rotate the subtree instead of recalculating
- // the whole tree, but this situation happens extremely rarely in
- // practice.
- CalculateFromScratch(DT, BUI);
- return;
- }
+ auto Roots = FindRoots(DT, BUI);
+ if (DT.Roots.size() != Roots.size() ||
+ !std::is_permutation(DT.Roots.begin(), DT.Roots.end(), Roots.begin())) {
+ // The roots chosen in the CFG have changed. This is because the
+ // incremental algorithm does not really know or use the set of roots and
+ // can make a different (implicit) decision about which node within an
+ // infinite loop becomes a root.
+
+ DEBUG(dbgs() << "Roots are different in updated trees\n"
+ << "The entire tree needs to be rebuilt\n");
+ // It may be possible to update the tree without recalculating it, but
+      // we do not yet know how to do it, and it happens rarely in practice.
+ CalculateFromScratch(DT, BUI);
+ return;
}
}
diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index d94b0e5c2118..2e5c22447936 100644
--- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -163,7 +163,8 @@ uint64_t DebugHandlerBase::getBaseTypeSize(const DITypeRef TyRef) {
DIType *BaseType = DDTy->getBaseType().resolve();
- assert(BaseType && "Unexpected invalid base type");
+ if (!BaseType)
+ return 0;
// If this is a derived type, go ahead and get the base type, unless it's a
// reference then it's just the size of the field. Pointer types have no need
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 911e46235781..4ea59f504bd4 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -1391,7 +1391,8 @@ void DwarfUnit::constructMemberDIE(DIE &Buffer, const DIDerivedType *DT) {
if (!Name.empty())
addString(MemberDie, dwarf::DW_AT_name, Name);
- addType(MemberDie, resolve(DT->getBaseType()));
+ if (DIType *Resolved = resolve(DT->getBaseType()))
+ addType(MemberDie, Resolved);
addSourceLine(MemberDie, DT);
diff --git a/lib/CodeGen/LivePhysRegs.cpp b/lib/CodeGen/LivePhysRegs.cpp
index f4b43a9b8ead..277212cf7dac 100644
--- a/lib/CodeGen/LivePhysRegs.cpp
+++ b/lib/CodeGen/LivePhysRegs.cpp
@@ -205,14 +205,18 @@ void LivePhysRegs::addPristines(const MachineFunction &MF) {
}
void LivePhysRegs::addLiveOutsNoPristines(const MachineBasicBlock &MBB) {
- if (!MBB.succ_empty()) {
- // To get the live-outs we simply merge the live-ins of all successors.
- for (const MachineBasicBlock *Succ : MBB.successors())
- addBlockLiveIns(*Succ);
- } else if (MBB.isReturnBlock()) {
- // For the return block: Add all callee saved registers that are saved and
- // restored (somewhere); This does not include callee saved registers that
- // are unused and hence not saved and restored; they are called pristine.
+ // To get the live-outs we simply merge the live-ins of all successors.
+ for (const MachineBasicBlock *Succ : MBB.successors())
+ addBlockLiveIns(*Succ);
+ if (MBB.isReturnBlock()) {
+ // Return blocks are a special case because we currently don't mark up
+ // return instructions completely: specifically, there is no explicit
+ // use for callee-saved registers. So we add all callee saved registers
+ // that are saved and restored (somewhere). This does not include
+ // callee saved registers that are unused and hence not saved and
+ // restored; they are called pristine.
+ // FIXME: PEI should add explicit markings to return instructions
+ // instead of implicitly handling them here.
const MachineFunction &MF = *MBB.getParent();
const MachineFrameInfo &MFI = MF.getFrameInfo();
if (MFI.isCalleeSavedInfoValid()) {
@@ -225,15 +229,8 @@ void LivePhysRegs::addLiveOutsNoPristines(const MachineBasicBlock &MBB) {
void LivePhysRegs::addLiveOuts(const MachineBasicBlock &MBB) {
const MachineFunction &MF = *MBB.getParent();
- if (!MBB.succ_empty()) {
- addPristines(MF);
- addLiveOutsNoPristines(MBB);
- } else if (MBB.isReturnBlock()) {
- // For the return block: Add all callee saved registers.
- const MachineFrameInfo &MFI = MF.getFrameInfo();
- if (MFI.isCalleeSavedInfoValid())
- addCalleeSavedRegs(*this, MF);
- }
+ addPristines(MF);
+ addLiveOutsNoPristines(MBB);
}
void LivePhysRegs::addLiveIns(const MachineBasicBlock &MBB) {
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2c6b724c02df..03cb2e310c7e 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16409,7 +16409,9 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
N1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N1.getOperand(0).getOperand(1) == N2 &&
N1.getOperand(0).getOperand(0).getValueType().getVectorNumElements() ==
- VT.getVectorNumElements()) {
+ VT.getVectorNumElements() &&
+ N1.getOperand(0).getOperand(0).getValueType().getSizeInBits() ==
+ VT.getSizeInBits()) {
return DAG.getBitcast(VT, N1.getOperand(0).getOperand(0));
}
diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp
index c99c3b09d88a..1628ee28b8a3 100644
--- a/lib/CodeGen/SplitKit.cpp
+++ b/lib/CodeGen/SplitKit.cpp
@@ -491,9 +491,8 @@ VNInfo *SplitEditor::defValue(unsigned RegIdx,
return VNI;
}
-void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI) {
- assert(ParentVNI && "Mapping NULL value");
- ValueForcePair &VFP = Values[std::make_pair(RegIdx, ParentVNI->id)];
+void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo &ParentVNI) {
+ ValueForcePair &VFP = Values[std::make_pair(RegIdx, ParentVNI.id)];
VNInfo *VNI = VFP.getPointer();
// ParentVNI was either unmapped or already complex mapped. Either way, just
@@ -777,7 +776,7 @@ SlotIndex SplitEditor::leaveIntvAfter(SlotIndex Idx) {
// the source live range. The spiller also won't try to hoist this copy.
if (SpillMode && !SlotIndex::isSameInstr(ParentVNI->def, Idx) &&
MI->readsVirtualRegister(Edit->getReg())) {
- forceRecompute(0, ParentVNI);
+ forceRecompute(0, *ParentVNI);
defFromParent(0, ParentVNI, Idx, *MI->getParent(), MI);
return Idx;
}
@@ -835,7 +834,7 @@ void SplitEditor::overlapIntv(SlotIndex Start, SlotIndex End) {
// The complement interval will be extended as needed by LRCalc.extend().
if (ParentVNI)
- forceRecompute(0, ParentVNI);
+ forceRecompute(0, *ParentVNI);
DEBUG(dbgs() << " overlapIntv [" << Start << ';' << End << "):");
RegAssign.insert(Start, End, OpenIdx);
DEBUG(dump());
@@ -878,7 +877,7 @@ void SplitEditor::removeBackCopies(SmallVectorImpl<VNInfo*> &Copies) {
unsigned RegIdx = AssignI.value();
if (AtBegin || !MBBI->readsVirtualRegister(Edit->getReg())) {
DEBUG(dbgs() << " cannot find simple kill of RegIdx " << RegIdx << '\n');
- forceRecompute(RegIdx, Edit->getParent().getVNInfoAt(Def));
+ forceRecompute(RegIdx, *Edit->getParent().getVNInfoAt(Def));
} else {
SlotIndex Kill = LIS.getInstructionIndex(*MBBI).getRegSlot();
DEBUG(dbgs() << " move kill to " << Kill << '\t' << *MBBI);
@@ -982,7 +981,7 @@ void SplitEditor::computeRedundantBackCopies(
}
}
if (!DominatedVNIs.empty()) {
- forceRecompute(0, ParentVNI);
+ forceRecompute(0, *ParentVNI);
for (auto VNI : DominatedVNIs) {
BackCopies.push_back(VNI);
}
@@ -1102,7 +1101,7 @@ void SplitEditor::hoistCopies() {
NotToHoistSet.count(ParentVNI->id))
continue;
BackCopies.push_back(VNI);
- forceRecompute(0, ParentVNI);
+ forceRecompute(0, *ParentVNI);
}
// If it is not beneficial to hoist all the BackCopies, simply remove
@@ -1428,6 +1427,41 @@ void SplitEditor::deleteRematVictims() {
Edit->eliminateDeadDefs(Dead, None, &AA);
}
+void SplitEditor::forceRecomputeVNI(const VNInfo &ParentVNI) {
+ // Fast-path for common case.
+ if (!ParentVNI.isPHIDef()) {
+ for (unsigned I = 0, E = Edit->size(); I != E; ++I)
+ forceRecompute(I, ParentVNI);
+ return;
+ }
+
+ // Trace value through phis.
+ SmallPtrSet<const VNInfo *, 8> Visited; ///< whether VNI was/is in worklist.
+ SmallVector<const VNInfo *, 4> WorkList;
+ Visited.insert(&ParentVNI);
+ WorkList.push_back(&ParentVNI);
+
+ const LiveInterval &ParentLI = Edit->getParent();
+ const SlotIndexes &Indexes = *LIS.getSlotIndexes();
+ do {
+ const VNInfo &VNI = *WorkList.back();
+ WorkList.pop_back();
+ for (unsigned I = 0, E = Edit->size(); I != E; ++I)
+ forceRecompute(I, VNI);
+ if (!VNI.isPHIDef())
+ continue;
+
+ MachineBasicBlock &MBB = *Indexes.getMBBFromIndex(VNI.def);
+ for (const MachineBasicBlock *Pred : MBB.predecessors()) {
+ SlotIndex PredEnd = Indexes.getMBBEndIdx(Pred);
+ VNInfo *PredVNI = ParentLI.getVNInfoBefore(PredEnd);
+ assert(PredVNI && "Value available in PhiVNI predecessor");
+ if (Visited.insert(PredVNI).second)
+ WorkList.push_back(PredVNI);
+ }
+ } while(!WorkList.empty());
+}
+
void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
++NumFinished;
@@ -1444,8 +1478,7 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
// Force rematted values to be recomputed everywhere.
// The new live ranges may be truncated.
if (Edit->didRematerialize(ParentVNI))
- for (unsigned i = 0, e = Edit->size(); i != e; ++i)
- forceRecompute(i, ParentVNI);
+ forceRecomputeVNI(*ParentVNI);
}
// Hoist back-copies to the complement interval when in spill mode.
diff --git a/lib/CodeGen/SplitKit.h b/lib/CodeGen/SplitKit.h
index c0608893d4e5..2dafaf587801 100644
--- a/lib/CodeGen/SplitKit.h
+++ b/lib/CodeGen/SplitKit.h
@@ -357,7 +357,11 @@ private:
/// recomputed by LiveRangeCalc::extend regardless of the number of defs.
/// This is used for values whose live range doesn't match RegAssign exactly.
/// They could have rematerialized, or back-copies may have been moved.
- void forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI);
+ void forceRecompute(unsigned RegIdx, const VNInfo &ParentVNI);
+
+ /// Calls forceRecompute() on any affected regidx and on ParentVNI
+ /// predecessors in case of a phi definition.
+ void forceRecomputeVNI(const VNInfo &ParentVNI);
/// defFromParent - Define Reg from ParentVNI at UseIdx using either
/// rematerialization or a COPY from parent. Return the new value.
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index c258d1a4e3ad..c56a022c6705 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -75,7 +75,6 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
Name=="ssse3.pabs.d.128" || // Added in 6.0
Name.startswith("avx512.mask.shuf.i") || // Added in 6.0
Name.startswith("avx512.mask.shuf.f") || // Added in 6.0
- Name.startswith("avx512.kunpck") || //added in 6.0
Name.startswith("avx2.pabs.") || // Added in 6.0
Name.startswith("avx512.mask.pabs.") || // Added in 6.0
Name.startswith("avx512.broadcastm") || // Added in 6.0
@@ -1063,12 +1062,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0));
Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
CI->getArgOperand(1));
- } else if (IsX86 && (Name.startswith("avx512.kunpck"))) {
- uint64_t Shift = CI->getType()->getScalarSizeInBits() / 2;
- uint64_t And = (1ULL << Shift) - 1;
- Value* LowBits = Builder.CreateAnd(CI->getArgOperand(0), And);
- Value* HighBits = Builder.CreateShl(CI->getArgOperand(1), Shift);
- Rep = Builder.CreateOr(LowBits, HighBits);
} else if (IsX86 && (Name == "sse.add.ss" || Name == "sse2.add.sd")) {
Type *I32Ty = Type::getInt32Ty(C);
Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0),
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index 2259136c6ec4..ce3b70bed740 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -83,27 +83,6 @@ namespace {
typedef std::vector<AsmToken> MCAsmMacroArgument;
typedef std::vector<MCAsmMacroArgument> MCAsmMacroArguments;
-struct MCAsmMacroParameter {
- StringRef Name;
- MCAsmMacroArgument Value;
- bool Required = false;
- bool Vararg = false;
-
- MCAsmMacroParameter() = default;
-};
-
-typedef std::vector<MCAsmMacroParameter> MCAsmMacroParameters;
-
-struct MCAsmMacro {
- StringRef Name;
- StringRef Body;
- MCAsmMacroParameters Parameters;
-
-public:
- MCAsmMacro(StringRef N, StringRef B, MCAsmMacroParameters P)
- : Name(N), Body(B), Parameters(std::move(P)) {}
-};
-
/// \brief Helper class for storing information about an active macro
/// instantiation.
struct MacroInstantiation {
@@ -164,9 +143,6 @@ private:
/// addDirectiveHandler.
StringMap<ExtensionDirectiveHandler> ExtensionDirectiveMap;
- /// \brief Map of currently defined macros.
- StringMap<MCAsmMacro> MacroMap;
-
/// \brief Stack of active macro instantiations.
std::vector<MacroInstantiation*> ActiveMacros;
@@ -308,17 +284,6 @@ private:
/// \brief Control a flag in the parser that enables or disables macros.
void setMacrosEnabled(bool Flag) {MacrosEnabledFlag = Flag;}
- /// \brief Lookup a previously defined macro.
- /// \param Name Macro name.
- /// \returns Pointer to macro. NULL if no such macro was defined.
- const MCAsmMacro* lookupMacro(StringRef Name);
-
- /// \brief Define a new macro with the given name and information.
- void defineMacro(StringRef Name, MCAsmMacro Macro);
-
- /// \brief Undefine a macro. If no such macro was defined, it's a no-op.
- void undefineMacro(StringRef Name);
-
/// \brief Are we inside a macro instantiation?
bool isInsideMacroInstantiation() {return !ActiveMacros.empty();}
@@ -1841,7 +1806,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
// If macros are enabled, check to see if this is a macro instantiation.
if (areMacrosEnabled())
- if (const MCAsmMacro *M = lookupMacro(IDVal)) {
+ if (const MCAsmMacro *M = getContext().lookupMacro(IDVal)) {
return handleMacroEntry(M, IDLoc);
}
@@ -2720,17 +2685,6 @@ bool AsmParser::parseMacroArguments(const MCAsmMacro *M,
return TokError("too many positional arguments");
}
-const MCAsmMacro *AsmParser::lookupMacro(StringRef Name) {
- StringMap<MCAsmMacro>::iterator I = MacroMap.find(Name);
- return (I == MacroMap.end()) ? nullptr : &I->getValue();
-}
-
-void AsmParser::defineMacro(StringRef Name, MCAsmMacro Macro) {
- MacroMap.insert(std::make_pair(Name, std::move(Macro)));
-}
-
-void AsmParser::undefineMacro(StringRef Name) { MacroMap.erase(Name); }
-
bool AsmParser::handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) {
// Arbitrarily limit macro nesting depth (default matches 'as'). We can
// eliminate this, although we should protect against infinite loops.
@@ -4249,7 +4203,7 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
eatToEndOfStatement();
}
- if (lookupMacro(Name)) {
+ if (getContext().lookupMacro(Name)) {
return Error(DirectiveLoc, "macro '" + Name + "' is already defined");
}
@@ -4257,7 +4211,7 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
const char *BodyEnd = EndToken.getLoc().getPointer();
StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart);
checkForBadMacro(DirectiveLoc, Name, Body, Parameters);
- defineMacro(Name, MCAsmMacro(Name, Body, std::move(Parameters)));
+ getContext().defineMacro(Name, MCAsmMacro(Name, Body, std::move(Parameters)));
return false;
}
@@ -4416,10 +4370,10 @@ bool AsmParser::parseDirectivePurgeMacro(SMLoc DirectiveLoc) {
"unexpected token in '.purgem' directive"))
return true;
- if (!lookupMacro(Name))
+ if (!getContext().lookupMacro(Name))
return Error(DirectiveLoc, "macro '" + Name + "' is not defined");
- undefineMacro(Name);
+ getContext().undefineMacro(Name);
return false;
}
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 49929441ef21..21192a2c1cc8 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3957,6 +3957,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CVT_F32_UBYTE2)
NODE_NAME_CASE(CVT_F32_UBYTE3)
NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
+ NODE_NAME_CASE(CVT_PKNORM_I16_F32)
+ NODE_NAME_CASE(CVT_PKNORM_U16_F32)
+ NODE_NAME_CASE(CVT_PK_I16_I32)
+ NODE_NAME_CASE(CVT_PK_U16_U32)
NODE_NAME_CASE(FP_TO_FP16)
NODE_NAME_CASE(FP16_ZEXT)
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 5c31bddd9b1a..039ee174e5b7 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -417,6 +417,10 @@ enum NodeType : unsigned {
// Convert two float 32 numbers into a single register holding two packed f16
// with round to zero.
CVT_PKRTZ_F16_F32,
+ CVT_PKNORM_I16_F32,
+ CVT_PKNORM_U16_F32,
+ CVT_PK_I16_I32,
+ CVT_PK_U16_U32,
// Same as the standard node, except the high bits of the resulting integer
// are known 0.
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index 8156599528c2..61892efe39e0 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -108,3 +108,21 @@ int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
return MCOp;
}
+
+// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence.
+bool AMDGPUInstrInfo::isUniformMMO(const MachineMemOperand *MMO) {
+ const Value *Ptr = MMO->getValue();
+ // UndefValue means this is a load of a kernel input. These are uniform.
+ // Sometimes LDS instructions have constant pointers.
+ // If Ptr is null, then that means this mem operand contains a
+ // PseudoSourceValue like GOT.
+ if (!Ptr || isa<UndefValue>(Ptr) ||
+ isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
+ return true;
+
+ if (const Argument *Arg = dyn_cast<Argument>(Ptr))
+ return AMDGPU::isArgPassedInSGPR(Arg);
+
+ const Instruction *I = dyn_cast<Instruction>(Ptr);
+ return I && I->getMetadata("amdgpu.uniform");
+}
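
A hedged IR sketch (the kernel name and metadata node are assumptions) of the cases isUniformMMO() classifies as uniform: a load whose pointer is directly a kernel argument, and a load whose address is produced by an instruction tagged with !amdgpu.uniform metadata:

  define amdgpu_kernel void @uniform_loads(i32 addrspace(1)* %arg, i32 addrspace(1)* %out) {
    ; Pointer operand is an Argument (passed in SGPRs for kernels) -> uniform.
    %a = load i32, i32 addrspace(1)* %arg
    ; Address-producing instruction carries !amdgpu.uniform -> uniform.
    %gep = getelementptr i32, i32 addrspace(1)* %arg, i64 4, !amdgpu.uniform !0
    %b = load i32, i32 addrspace(1)* %gep
    %s = add i32 %a, %b
    store i32 %s, i32 addrspace(1)* %out
    ret void
  }

  !0 = !{}
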
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index a9fcd4834638..74e14ef8fbd8 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -50,6 +50,8 @@ public:
/// Return -1 if the target-specific opcode for the pseudo instruction does
/// not exist. If Opcode is not a pseudo instruction, this is identity.
int pseudoToMCOpcode(int Opcode) const;
+
+ static bool isUniformMMO(const MachineMemOperand *MMO);
};
} // End llvm namespace
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index c024010f3e96..65c483d85c5a 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -35,6 +35,10 @@ def AMDGPUFPPackOp : SDTypeProfile<1, 2,
[SDTCisFP<1>, SDTCisSameAs<1, 2>]
>;
+def AMDGPUIntPackOp : SDTypeProfile<1, 2,
+ [SDTCisInt<1>, SDTCisSameAs<1, 2>]
+>;
+
def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
[SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
>;
@@ -142,6 +146,10 @@ def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
+def AMDGPUpknorm_i16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>;
+def AMDGPUpknorm_u16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>;
+def AMDGPUpk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
+def AMDGPUpk_u16_u32 : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 1ed02fae085a..e3df6d9bee88 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -120,7 +120,7 @@ static bool isInstrUniform(const MachineInstr &MI) {
return false;
const MachineMemOperand *MMO = *MI.memoperands_begin();
- return AMDGPU::isUniformMMO(MMO);
+ return AMDGPUInstrInfo::isUniformMMO(MMO);
}
const RegisterBankInfo::InstructionMapping &
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 415d8a512aa8..6d89aa6968e9 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -205,6 +205,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -1085,7 +1086,7 @@ bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
const MemSDNode *MemNode = cast<MemSDNode>(N);
- return AMDGPU::isUniformMMO(MemNode->getMemOperand());
+ return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
}
TargetLoweringBase::LegalizeTypeAction
@@ -3517,7 +3518,8 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
}
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
- if (IID == Intrinsic::amdgcn_cvt_pkrtz) {
+ switch (IID) {
+ case Intrinsic::amdgcn_cvt_pkrtz: {
SDValue Src0 = N->getOperand(1);
SDValue Src1 = N->getOperand(2);
SDLoc SL(N);
@@ -3526,6 +3528,29 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
return;
}
+ case Intrinsic::amdgcn_cvt_pknorm_i16:
+ case Intrinsic::amdgcn_cvt_pknorm_u16:
+ case Intrinsic::amdgcn_cvt_pk_i16:
+ case Intrinsic::amdgcn_cvt_pk_u16: {
+ SDValue Src0 = N->getOperand(1);
+ SDValue Src1 = N->getOperand(2);
+ SDLoc SL(N);
+ unsigned Opcode;
+
+ if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
+ Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
+ else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
+ Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
+ else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
+ Opcode = AMDGPUISD::CVT_PK_I16_I32;
+ else
+ Opcode = AMDGPUISD::CVT_PK_U16_U32;
+
+ SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
+ return;
+ }
+ }
break;
}
case ISD::SELECT: {
@@ -4424,10 +4449,27 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_ubfe:
return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
- case Intrinsic::amdgcn_cvt_pkrtz: {
- // FIXME: Stop adding cast if v2f16 legal.
+ case Intrinsic::amdgcn_cvt_pkrtz:
+ case Intrinsic::amdgcn_cvt_pknorm_i16:
+ case Intrinsic::amdgcn_cvt_pknorm_u16:
+ case Intrinsic::amdgcn_cvt_pk_i16:
+ case Intrinsic::amdgcn_cvt_pk_u16: {
+ // FIXME: Stop adding cast if v2f16/v2i16 are legal.
EVT VT = Op.getValueType();
- SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32,
+ unsigned Opcode;
+
+ if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
+ Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
+ else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
+ Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
+ else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
+ Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
+ else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
+ Opcode = AMDGPUISD::CVT_PK_I16_I32;
+ else
+ Opcode = AMDGPUISD::CVT_PK_U16_U32;
+
+ SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
Op.getOperand(1), Op.getOperand(2));
return DAG.getNode(ISD::BITCAST, DL, VT, Node);
}
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 125a3b22d0cf..bf9d5bc6ebdc 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -871,24 +871,6 @@ bool isArgPassedInSGPR(const Argument *A) {
}
}
-// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence.
-bool isUniformMMO(const MachineMemOperand *MMO) {
- const Value *Ptr = MMO->getValue();
- // UndefValue means this is a load of a kernel input. These are uniform.
- // Sometimes LDS instructions have constant pointers.
- // If Ptr is null, then that means this mem operand contains a
- // PseudoSourceValue like GOT.
- if (!Ptr || isa<UndefValue>(Ptr) ||
- isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
- return true;
-
- if (const Argument *Arg = dyn_cast<Argument>(Ptr))
- return isArgPassedInSGPR(Arg);
-
- const Instruction *I = dyn_cast<Instruction>(Ptr);
- return I && I->getMetadata("amdgpu.uniform");
-}
-
int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
if (isGCN3Encoding(ST))
return ByteOffset;
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index a215b445378e..9515001b63d2 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -363,7 +363,6 @@ LLVM_READNONE
bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi);
bool isArgPassedInSGPR(const Argument *Arg);
-bool isUniformMMO(const MachineMemOperand *MMO);
/// \returns The encoding that will be used for \p ByteOffset in the SMRD
/// offset field.
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index ef90b68db1a8..56b934f92f61 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -407,11 +407,11 @@ defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32
defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>;
defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>;
defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst"
-defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>;
-defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>;
+defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_i16_f32>;
+defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_u16_f32>;
defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpkrtz_f16_f32>;
-defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>>;
-defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>>;
+defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_u16_u32>;
+defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_i16_i32>;
} // End SubtargetPredicate = isGCN
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index a7059c6914df..4ddc1f0ba429 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -396,10 +396,14 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// rip-relative addressing is actually relative to the *next* instruction.
// Since an immediate can follow the mod/rm byte for an instruction, this
- // means that we need to bias the immediate field of the instruction with
- // the size of the immediate field. If we have this case, add it into the
+ // means that we need to bias the displacement field of the instruction with
+ // the size of the immediate field. If we have this case, add it into the
// expression to emit.
- int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0;
+ // Note: rip-relative addressing using immediate displacement values should
+ // not be adjusted, assuming it was the user's intent.
+ int ImmSize = !Disp.isImm() && X86II::hasImm(TSFlags)
+ ? X86II::getSizeOfImm(TSFlags)
+ : 0;
EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind),
CurByte, OS, Fixups, -ImmSize);
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index 71526dd77f11..2a501efbc1bf 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -370,6 +370,8 @@ static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI,
static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
char Mode, raw_ostream &O) {
unsigned Reg = MO.getReg();
+ bool EmitPercent = true;
+
switch (Mode) {
default: return true; // Unknown mode.
case 'b': // Print QImode register
@@ -384,6 +386,9 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
case 'k': // Print SImode register
Reg = getX86SubSuperRegister(Reg, 32);
break;
+ case 'V':
+ EmitPercent = false;
+ LLVM_FALLTHROUGH;
case 'q':
// Print 64-bit register names if 64-bit integer registers are available.
// Otherwise, print 32-bit register names.
@@ -391,7 +396,10 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
break;
}
- O << '%' << X86ATTInstPrinter::getRegisterName(Reg);
+ if (EmitPercent)
+ O << '%';
+
+ O << X86ATTInstPrinter::getRegisterName(Reg);
return false;
}
@@ -464,6 +472,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
case 'w': // Print HImode register
case 'k': // Print SImode register
case 'q': // Print DImode register
+ case 'V': // Print native register without '%'
if (MO.isReg())
return printAsmMRegister(*this, MO, ExtraCode[0], O);
printOperand(*this, MI, OpNo, O);
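
A hedged IR sketch (the function and asm body are assumptions, not from this patch) of the new 'V' operand modifier: ${0:V} prints the selected register's bare name, e.g. rax instead of %rax, which is useful when an asm template needs to splice a register name into other text:

  define void @v_modifier_demo(i64 %val) {
    ; "$0" prints the operand as "%reg"; "${0:V}" prints the same register without the '%'.
    call void asm sideeffect "movq $0, %rdx  # bare name of operand 0: ${0:V}", "r,~{rdx}"(i64 %val)
    ret void
  }
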
diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp
index ba7280c29cc9..bc0f55f581ff 100644
--- a/lib/Target/X86/X86DomainReassignment.cpp
+++ b/lib/Target/X86/X86DomainReassignment.cpp
@@ -663,8 +663,10 @@ void X86DomainReassignment::initConverters() {
createReplacer(X86::XOR32rr, X86::KXORDrr);
createReplacer(X86::XOR64rr, X86::KXORQrr);
- createReplacer(X86::TEST32rr, X86::KTESTDrr);
- createReplacer(X86::TEST64rr, X86::KTESTQrr);
+ // TODO: KTEST is not a replacement for TEST due to flag differences. Need
+ // to prove only Z flag is used.
+ //createReplacer(X86::TEST32rr, X86::KTESTDrr);
+ //createReplacer(X86::TEST64rr, X86::KTESTQrr);
}
if (STI->hasDQI()) {
@@ -684,8 +686,10 @@ void X86DomainReassignment::initConverters() {
createReplacer(X86::SHR8ri, X86::KSHIFTRBri);
createReplacer(X86::SHL8ri, X86::KSHIFTLBri);
- createReplacer(X86::TEST8rr, X86::KTESTBrr);
- createReplacer(X86::TEST16rr, X86::KTESTWrr);
+ // TODO: KTEST is not a replacement for TEST due to flag differences. Need
+ // to prove only Z flag is used.
+ //createReplacer(X86::TEST8rr, X86::KTESTBrr);
+ //createReplacer(X86::TEST16rr, X86::KTESTWrr);
createReplacer(X86::XOR8rr, X86::KXORBrr);
}
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 38885c42b529..9237833a2cd0 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -17017,24 +17017,6 @@ static bool hasNonFlagsUse(SDValue Op) {
return false;
}
-// Emit KTEST instruction for bit vectors on AVX-512
-static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- if (Op.getOpcode() == ISD::BITCAST) {
- auto hasKTEST = [&](MVT VT) {
- unsigned SizeInBits = VT.getSizeInBits();
- return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
- (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
- };
- SDValue Op0 = Op.getOperand(0);
- MVT Op0VT = Op0.getValueType().getSimpleVT();
- if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
- hasKTEST(Op0VT))
- return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
- }
- return SDValue();
-}
-
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
@@ -17079,9 +17061,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
// doing a separate TEST. TEST always sets OF and CF to 0, so unless
// we prove that the arithmetic won't overflow, we can't use OF or CF.
if (Op.getResNo() != 0 || NeedOF || NeedCF) {
- // Emit KTEST for bit vectors
- if (auto Node = EmitKTEST(Op, DAG, Subtarget))
- return Node;
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
@@ -17310,10 +17289,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
}
if (Opcode == 0) {
- // Emit KTEST for bit vectors
- if (auto Node = EmitKTEST(Op, DAG, Subtarget))
- return Node;
-
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
@@ -18093,6 +18068,34 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
return Result;
}
+// Try to select this as a KTEST+SETCC if possible.
+static SDValue EmitKTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
+ const SDLoc &dl, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Only support equality comparisons.
+ if (CC != ISD::SETEQ && CC != ISD::SETNE)
+ return SDValue();
+
+ // Must be a bitcast from vXi1.
+ if (Op0.getOpcode() != ISD::BITCAST)
+ return SDValue();
+
+ Op0 = Op0.getOperand(0);
+ MVT VT = Op0.getSimpleValueType();
+ if (!(Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1)) &&
+ !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
+ return SDValue();
+
+ X86::CondCode X86CC;
+ if (isNullConstant(Op1)) {
+ X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
+ } else
+ return SDValue();
+
+ SDValue KTEST = DAG.getNode(X86ISD::KTEST, dl, MVT::i32, Op0, Op0);
+ return getSETCC(X86CC, KTEST, dl, DAG);
+}
+
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
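
A hedged IR sketch (function name assumed; assumes a subtarget with AVX512DQ so the v16i1 case applies) of a compare the new EmitKTEST path can select: an equality test of a vXi1 mask, bitcast to an integer, against zero:

  define i1 @mask_is_zero(<16 x i1> %m) {
    ; With +avx512dq this setcc can now be selected as KTESTW followed by SETE.
    %b = bitcast <16 x i1> %m to i16
    %c = icmp eq i16 %b, 0
    ret i1 %c
  }
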
@@ -18115,6 +18118,10 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return NewSetCC;
}
+ // Try to lower using KTEST.
+ if (SDValue NewSetCC = EmitKTEST(Op0, Op1, CC, dl, DAG, Subtarget))
+ return NewSetCC;
+
// Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
// these.
if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
@@ -20525,6 +20532,18 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Mask = DAG.getBitcast(MaskVT, Mask);
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
}
+ case KUNPCK: {
+ MVT VT = Op.getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
+
+ SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
+ SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
+ // Arguments should be swapped.
+ SDValue Res = DAG.getNode(IntrData->Opc0, dl,
+ MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
+ Src2, Src1);
+ return DAG.getBitcast(VT, Res);
+ }
case MASK_BINOP: {
MVT VT = Op.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
@@ -27094,28 +27113,57 @@ static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
unsigned Reg) {
+ if (Subtarget.useRetpolineExternalThunk()) {
+ // When using an external thunk for retpolines, we pick names that match the
+ // names GCC happens to use as well. This helps simplify the implementation
+ // of the thunks for kernels where they have no easy ability to create
+ // aliases and are doing non-trivial configuration of the thunk's body. For
+ // example, the Linux kernel will do boot-time hot patching of the thunk
+ // bodies and cannot easily export aliases of these to loaded modules.
+ //
+ // Note that at any point in the future, we may need to change the semantics
+ // of how we implement retpolines and at that time will likely change the
+ // name of the called thunk. Essentially, there is no hard guarantee that
+ // LLVM will generate calls to specific thunks, we merely make a best-effort
+ // attempt to help out kernels and other systems where duplicating the
+ // thunks is costly.
+ switch (Reg) {
+ case X86::EAX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__x86_indirect_thunk_eax";
+ case X86::ECX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__x86_indirect_thunk_ecx";
+ case X86::EDX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__x86_indirect_thunk_edx";
+ case X86::EDI:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__x86_indirect_thunk_edi";
+ case X86::R11:
+ assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
+ return "__x86_indirect_thunk_r11";
+ }
+ llvm_unreachable("unexpected reg for retpoline");
+ }
+
+ // When targeting an internal COMDAT thunk use an LLVM-specific name.
switch (Reg) {
- case 0:
- assert(!Subtarget.is64Bit() && "R11 should always be available on x64");
- return Subtarget.useRetpolineExternalThunk()
- ? "__llvm_external_retpoline_push"
- : "__llvm_retpoline_push";
case X86::EAX:
- return Subtarget.useRetpolineExternalThunk()
- ? "__llvm_external_retpoline_eax"
- : "__llvm_retpoline_eax";
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_eax";
case X86::ECX:
- return Subtarget.useRetpolineExternalThunk()
- ? "__llvm_external_retpoline_ecx"
- : "__llvm_retpoline_ecx";
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_ecx";
case X86::EDX:
- return Subtarget.useRetpolineExternalThunk()
- ? "__llvm_external_retpoline_edx"
- : "__llvm_retpoline_edx";
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_edx";
+ case X86::EDI:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_edi";
case X86::R11:
- return Subtarget.useRetpolineExternalThunk()
- ? "__llvm_external_retpoline_r11"
- : "__llvm_retpoline_r11";
+ assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
+ return "__llvm_retpoline_r11";
}
llvm_unreachable("unexpected reg for retpoline");
}
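
A hedged IR sketch (the function name and attribute spelling are assumptions) of an indirect call these thunks service; with the retpoline feature enabled, the callee address is moved into a scratch register and the call is emitted through the matching thunk named above instead of as a plain indirect call:

  define i32 @call_through_pointer(i32 ()* %fp) #0 {
    %r = call i32 %fp()
    ret i32 %r
  }

  ; Whether a __llvm_retpoline_* or __x86_indirect_thunk_* symbol is used
  ; depends on whether the external-thunk variant of the feature is enabled.
  attributes #0 = { "target-features"="+retpoline" }
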
@@ -27134,15 +27182,13 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
// just use R11, but we scan for uses anyway to ensure we don't generate
// incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
// already a register use operand to the call to hold the callee. If none
- // are available, push the callee instead. This is less efficient, but is
- // necessary for functions using 3 regparms. Such function calls are
- // (currently) not eligible for tail call optimization, because there is no
- // scratch register available to hold the address of the callee.
+ // are available, use EDI instead. EDI is chosen because EBX is the PIC base
+ // register and ESI is the base pointer to realigned stack frames with VLAs.
SmallVector<unsigned, 3> AvailableRegs;
if (Subtarget.is64Bit())
AvailableRegs.push_back(X86::R11);
else
- AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX});
+ AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
// Zero out any registers that are already used.
for (const auto &MO : MI.operands()) {
@@ -27160,30 +27206,18 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
break;
}
}
+ if (!AvailableReg)
+ report_fatal_error("calling convention incompatible with retpoline, no "
+ "available registers");
const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
- if (AvailableReg == 0) {
- // No register available. Use PUSH. This must not be a tailcall, and this
- // must not be x64.
- if (Subtarget.is64Bit())
- report_fatal_error(
- "Cannot make an indirect call on x86-64 using both retpoline and a "
- "calling convention that preservers r11");
- if (Opc != X86::CALLpcrel32)
- report_fatal_error("Cannot make an indirect tail call on x86 using "
- "retpoline without a preserved register");
- BuildMI(*BB, MI, DL, TII->get(X86::PUSH32r)).addReg(CalleeVReg);
- MI.getOperand(0).ChangeToES(Symbol);
- MI.setDesc(TII->get(Opc));
- } else {
- BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
- .addReg(CalleeVReg);
- MI.getOperand(0).ChangeToES(Symbol);
- MI.setDesc(TII->get(Opc));
- MachineInstrBuilder(*BB->getParent(), &MI)
- .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
- }
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
+ .addReg(CalleeVReg);
+ MI.getOperand(0).ChangeToES(Symbol);
+ MI.setDesc(TII->get(Opc));
+ MachineInstrBuilder(*BB->getParent(), &MI)
+ .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
return BB;
}
@@ -30432,53 +30466,6 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
SDValue N0 = BitCast.getOperand(0);
EVT VecVT = N0->getValueType(0);
- if (VT.isVector() && VecVT.isScalarInteger() && Subtarget.hasAVX512() &&
- N0->getOpcode() == ISD::OR) {
- SDValue Op0 = N0->getOperand(0);
- SDValue Op1 = N0->getOperand(1);
- MVT TrunckVT;
- MVT BitcastVT;
- switch (VT.getSimpleVT().SimpleTy) {
- default:
- return SDValue();
- case MVT::v16i1:
- TrunckVT = MVT::i8;
- BitcastVT = MVT::v8i1;
- break;
- case MVT::v32i1:
- TrunckVT = MVT::i16;
- BitcastVT = MVT::v16i1;
- break;
- case MVT::v64i1:
- TrunckVT = MVT::i32;
- BitcastVT = MVT::v32i1;
- break;
- }
- bool isArg0UndefRight = Op0->getOpcode() == ISD::SHL;
- bool isArg0UndefLeft =
- Op0->getOpcode() == ISD::ZERO_EXTEND || Op0->getOpcode() == ISD::AND;
- bool isArg1UndefRight = Op1->getOpcode() == ISD::SHL;
- bool isArg1UndefLeft =
- Op1->getOpcode() == ISD::ZERO_EXTEND || Op1->getOpcode() == ISD::AND;
- SDValue OpLeft;
- SDValue OpRight;
- if (isArg0UndefRight && isArg1UndefLeft) {
- OpLeft = Op0;
- OpRight = Op1;
- } else if (isArg1UndefRight && isArg0UndefLeft) {
- OpLeft = Op1;
- OpRight = Op0;
- } else
- return SDValue();
- SDLoc DL(BitCast);
- SDValue Shr = OpLeft->getOperand(0);
- SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, Shr);
- SDValue Bitcast1 = DAG.getBitcast(BitcastVT, Trunc1);
- SDValue Trunc2 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, OpRight);
- SDValue Bitcast2 = DAG.getBitcast(BitcastVT, Trunc2);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Bitcast1, Bitcast2);
- }
-
if (!VT.isScalarInteger() || !VecVT.isSimple())
return SDValue();
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 0782d5598746..fae0889950b2 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -36,7 +36,7 @@ enum IntrinsicType : uint16_t {
COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
EXPAND_FROM_MEM,
- TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
+ TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
FIXUPIMMS_MASKZ, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP,
ROUNDP, ROUNDS
};
@@ -479,6 +479,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
X86_INTRINSIC_DATA(avx512_kand_w, MASK_BINOP, ISD::AND, 0),
X86_INTRINSIC_DATA(avx512_kor_w, MASK_BINOP, ISD::OR, 0),
+ X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0),
+ X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0),
+ X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0),
X86_INTRINSIC_DATA(avx512_kxor_w, MASK_BINOP, ISD::XOR, 0),
X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD,
X86ISD::FADD_RND),
diff --git a/lib/Target/X86/X86RetpolineThunks.cpp b/lib/Target/X86/X86RetpolineThunks.cpp
index 223fa5771498..d03826bbe992 100644
--- a/lib/Target/X86/X86RetpolineThunks.cpp
+++ b/lib/Target/X86/X86RetpolineThunks.cpp
@@ -43,7 +43,7 @@ static const char R11ThunkName[] = "__llvm_retpoline_r11";
static const char EAXThunkName[] = "__llvm_retpoline_eax";
static const char ECXThunkName[] = "__llvm_retpoline_ecx";
static const char EDXThunkName[] = "__llvm_retpoline_edx";
-static const char PushThunkName[] = "__llvm_retpoline_push";
+static const char EDIThunkName[] = "__llvm_retpoline_edi";
namespace {
class X86RetpolineThunks : public MachineFunctionPass {
@@ -74,7 +74,6 @@ private:
void createThunkFunction(Module &M, StringRef Name);
void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg);
- void insert32BitPushReturnAddrClobber(MachineBasicBlock &MBB);
void populateThunk(MachineFunction &MF, Optional<unsigned> Reg = None);
};
@@ -127,7 +126,7 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
createThunkFunction(M, R11ThunkName);
else
for (StringRef Name :
- {EAXThunkName, ECXThunkName, EDXThunkName, PushThunkName})
+ {EAXThunkName, ECXThunkName, EDXThunkName, EDIThunkName})
createThunkFunction(M, Name);
InsertedThunks = true;
return true;
@@ -151,9 +150,8 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
populateThunk(MF, X86::R11);
} else {
// For 32-bit targets we need to emit a collection of thunks for various
- // possible scratch registers as well as a fallback that is used when
- // there are no scratch registers and assumes the retpoline target has
- // been pushed.
+ // possible scratch registers as well as a fallback that uses EDI, which is
+ // normally callee saved.
// __llvm_retpoline_eax:
// calll .Leax_call_target
// .Leax_capture_spec:
@@ -174,32 +172,18 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
// movl %edx, (%esp)
// retl
//
- // This last one is a bit more special and so needs a little extra
- // handling.
- // __llvm_retpoline_push:
- // calll .Lpush_call_target
- // .Lpush_capture_spec:
- // pause
- // lfence
- // jmp .Lpush_capture_spec
- // .align 16
- // .Lpush_call_target:
- // # Clear pause_loop return address.
- // addl $4, %esp
- // # Top of stack words are: Callee, RA. Exchange Callee and RA.
- // pushl 4(%esp) # Push callee
- // pushl 4(%esp) # Push RA
- // popl 8(%esp) # Pop RA to final RA
- // popl (%esp) # Pop callee to next top of stack
- // retl # Ret to callee
+ // __llvm_retpoline_edi:
+ // ... # Same setup
+ // movl %edi, (%esp)
+ // retl
if (MF.getName() == EAXThunkName)
populateThunk(MF, X86::EAX);
else if (MF.getName() == ECXThunkName)
populateThunk(MF, X86::ECX);
else if (MF.getName() == EDXThunkName)
populateThunk(MF, X86::EDX);
- else if (MF.getName() == PushThunkName)
- populateThunk(MF);
+ else if (MF.getName() == EDIThunkName)
+ populateThunk(MF, X86::EDI);
else
llvm_unreachable("Invalid thunk name on x86-32!");
}
@@ -240,31 +224,6 @@ void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB,
.addReg(Reg);
}
-void X86RetpolineThunks::insert32BitPushReturnAddrClobber(
- MachineBasicBlock &MBB) {
- // The instruction sequence we use to replace the return address without
- // a scratch register is somewhat complicated:
- // # Clear capture_spec from return address.
- // addl $4, %esp
- // # Top of stack words are: Callee, RA. Exchange Callee and RA.
- // pushl 4(%esp) # Push callee
- // pushl 4(%esp) # Push RA
- // popl 8(%esp) # Pop RA to final RA
- // popl (%esp) # Pop callee to next top of stack
- // retl # Ret to callee
- BuildMI(&MBB, DebugLoc(), TII->get(X86::ADD32ri), X86::ESP)
- .addReg(X86::ESP)
- .addImm(4);
- addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP,
- false, 4);
- addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP,
- false, 4);
- addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP,
- false, 8);
- addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP,
- false, 0);
-}
-
void X86RetpolineThunks::populateThunk(MachineFunction &MF,
Optional<unsigned> Reg) {
// Set MF properties. We never use vregs...
@@ -301,11 +260,6 @@ void X86RetpolineThunks::populateThunk(MachineFunction &MF,
CaptureSpec->addSuccessor(CaptureSpec);
CallTarget->setAlignment(4);
- if (Reg) {
- insertRegReturnAddrClobber(*CallTarget, *Reg);
- } else {
- assert(!Is64Bit && "We only support non-reg thunks on 32-bit x86!");
- insert32BitPushReturnAddrClobber(*CallTarget);
- }
+ insertRegReturnAddrClobber(*CallTarget, *Reg);
BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc));
}
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 40e52ee755e5..2f2f0696366c 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3264,6 +3264,18 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
+ case Intrinsic::amdgcn_cvt_pknorm_i16:
+ case Intrinsic::amdgcn_cvt_pknorm_u16:
+ case Intrinsic::amdgcn_cvt_pk_i16:
+ case Intrinsic::amdgcn_cvt_pk_u16: {
+ Value *Src0 = II->getArgOperand(0);
+ Value *Src1 = II->getArgOperand(1);
+
+ if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1))
+ return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
+
+ break;
+ }
case Intrinsic::amdgcn_ubfe:
case Intrinsic::amdgcn_sbfe: {
// Decompose simple cases into standard shifts.
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll
new file mode 100644
index 000000000000..241b708e7baf
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll
@@ -0,0 +1,84 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}s_cvt_pk_i16_i32:
+; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
+; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
+; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
+; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, [[X]], [[VY]]
+; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, [[X]], [[VY]]
+define amdgpu_kernel void @s_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+ %result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %y)
+ %r = bitcast <2 x i16> %result to i32
+ store i32 %r, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_cvt_pk_i16_samereg_i32:
+; GCN: s_load_dword [[X:s[0-9]+]]
+; GCN: v_cvt_pk_i16_i32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
+define amdgpu_kernel void @s_cvt_pk_i16_samereg_i32(i32 addrspace(1)* %out, i32 %x) #0 {
+ %result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %x)
+ %r = bitcast <2 x i16> %result to i32
+ store i32 %r, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pk_i16_i32:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, [[A]], [[B]]
+; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, [[A]], [[B]]
+define amdgpu_kernel void @v_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.gep
+ %b = load volatile i32, i32 addrspace(1)* %b.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %a, i32 %b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pk_i16_i32_reg_imm:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: v_cvt_pk_i16_i32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1
+define amdgpu_kernel void @v_cvt_pk_i16_i32_reg_imm(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %a, i32 1)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pk_i16_i32_imm_reg:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, 1, [[A]]
+; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, 1, [[A]]
+define amdgpu_kernel void @v_cvt_pk_i16_i32_imm_reg(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 1, i32 %a)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+declare <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32, i32) #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll
new file mode 100644
index 000000000000..8d5c9aa95219
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll
@@ -0,0 +1,84 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}s_cvt_pk_u16_u32:
+; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
+; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
+; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
+; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, [[X]], [[VY]]
+; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, [[X]], [[VY]]
+define amdgpu_kernel void @s_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+ %result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %y)
+ %r = bitcast <2 x i16> %result to i32
+ store i32 %r, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_cvt_pk_u16_samereg_i32:
+; GCN: s_load_dword [[X:s[0-9]+]]
+; GCN: v_cvt_pk_u16_u32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
+define amdgpu_kernel void @s_cvt_pk_u16_samereg_i32(i32 addrspace(1)* %out, i32 %x) #0 {
+ %result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %x)
+ %r = bitcast <2 x i16> %result to i32
+ store i32 %r, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pk_u16_u32:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, [[A]], [[B]]
+; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, [[A]], [[B]]
+define amdgpu_kernel void @v_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.gep
+ %b = load volatile i32, i32 addrspace(1)* %b.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %a, i32 %b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pk_u16_u32_reg_imm:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: v_cvt_pk_u16_u32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1
+define amdgpu_kernel void @v_cvt_pk_u16_u32_reg_imm(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %a, i32 1)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pk_u16_u32_imm_reg:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, 1, [[A]]
+; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, 1, [[A]]
+define amdgpu_kernel void @v_cvt_pk_u16_u32_imm_reg(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 1, i32 %a)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+declare <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32, i32) #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll
new file mode 100644
index 000000000000..822e8c2886ba
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll
@@ -0,0 +1,164 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}s_cvt_pknorm_i16_f32:
+; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
+; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
+; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
+; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
+; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, [[X]], [[VY]]
+define amdgpu_kernel void @s_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 {
+ %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %y)
+ %r = bitcast <2 x i16> %result to i32
+ store i32 %r, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_cvt_pknorm_i16_samereg_f32:
+; GCN: s_load_dword [[X:s[0-9]+]]
+; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
+define amdgpu_kernel void @s_cvt_pknorm_i16_samereg_f32(i32 addrspace(1)* %out, float %x) #0 {
+ %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %x)
+ %r = bitcast <2 x i16> %result to i32
+ store i32 %r, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, [[A]], [[B]]
+; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, [[A]], [[B]]
+define amdgpu_kernel void @v_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float %b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_reg_imm:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1.0
+define amdgpu_kernel void @v_cvt_pknorm_i16_f32_reg_imm(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float 1.0)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_imm_reg:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, 1.0, [[A]]
+; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, 1.0, [[A]]
+define amdgpu_kernel void @v_cvt_pknorm_i16_f32_imm_reg(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float 1.0, float %a)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_lo:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], [[B]]
+define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_lo(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %neg.a = fsub float -0.0, %a
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.a, float %b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_hi:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], -[[B]]
+define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %neg.b = fsub float -0.0, %b
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float %neg.b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_lo_hi:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], -[[B]]
+define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_lo_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %neg.a = fsub float -0.0, %a
+ %neg.b = fsub float -0.0, %b
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.a, float %neg.b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_fabs_lo_fneg_hi:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -|[[A]]|, -[[B]]
+define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_fabs_lo_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fabs.a = call float @llvm.fabs.f32(float %a)
+ %neg.fabs.a = fsub float -0.0, %fabs.a
+ %neg.b = fsub float -0.0, %b
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.fabs.a, float %neg.b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+declare <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float, float) #1
+declare float @llvm.fabs.f32(float) #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll
new file mode 100644
index 000000000000..c2b8f3cb28ca
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll
@@ -0,0 +1,164 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}s_cvt_pknorm_u16_f32:
+; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
+; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
+; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
+; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
+; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, [[X]], [[VY]]
+define amdgpu_kernel void @s_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 {
+ %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %y)
+ %r = bitcast <2 x i16> %result to i32
+ store i32 %r, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_cvt_pknorm_u16_samereg_f32:
+; GCN: s_load_dword [[X:s[0-9]+]]
+; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
+define amdgpu_kernel void @s_cvt_pknorm_u16_samereg_f32(i32 addrspace(1)* %out, float %x) #0 {
+ %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %x)
+ %r = bitcast <2 x i16> %result to i32
+ store i32 %r, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, [[A]], [[B]]
+; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, [[A]], [[B]]
+define amdgpu_kernel void @v_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float %b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_reg_imm:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1.0
+define amdgpu_kernel void @v_cvt_pknorm_u16_f32_reg_imm(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float 1.0)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_imm_reg:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, 1.0, [[A]]
+; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, 1.0, [[A]]
+define amdgpu_kernel void @v_cvt_pknorm_u16_f32_imm_reg(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float 1.0, float %a)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_lo:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], [[B]]
+define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_lo(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %neg.a = fsub float -0.0, %a
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.a, float %b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_hi:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], -[[B]]
+define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %neg.b = fsub float -0.0, %b
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float %neg.b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_lo_hi:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], -[[B]]
+define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_lo_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %neg.a = fsub float -0.0, %a
+ %neg.b = fsub float -0.0, %b
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.a, float %neg.b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_fabs_lo_fneg_hi:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -|[[A]]|, -[[B]]
+define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_fabs_lo_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fabs.a = call float @llvm.fabs.f32(float %a)
+ %neg.fabs.a = fsub float -0.0, %fabs.a
+ %neg.b = fsub float -0.0, %b
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.fabs.a, float %neg.b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+declare <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float, float) #1
+declare float @llvm.fabs.f32(float) #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/ARM/pr25838.ll b/test/CodeGen/ARM/pr25838.ll
index 0aa95fd2d720..f3bb98f4260c 100644
--- a/test/CodeGen/ARM/pr25838.ll
+++ b/test/CodeGen/ARM/pr25838.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s
+; RUN: llc -verify-machineinstrs < %s
; PR25838
target triple = "armv7--linux-android"
diff --git a/test/CodeGen/ARM/splitkit.ll b/test/CodeGen/ARM/splitkit.ll
new file mode 100644
index 000000000000..d51d35174450
--- /dev/null
+++ b/test/CodeGen/ARM/splitkit.ll
@@ -0,0 +1,245 @@
+; RUN: llc -o - %s | FileCheck %s
+; Make sure RegAllocGreedy/SplitKit do not produce invalid liveness information
+; and crash when splitting a liverange twice and rematerializing each time.
+; (Sorry for the testcase; this was run through bugpoint and then manually
+; reduced for several hours but is still big...)
+target triple = "thumbv7-apple-ios"
+
+%struct.ham = type { %struct.wombat.0 }
+%struct.wombat.0 = type { %struct.barney }
+%struct.barney = type { %struct.snork.1 }
+%struct.snork.1 = type { %struct.wobble.2 }
+%struct.wobble.2 = type { %struct.blam }
+%struct.blam = type { i32, i32, i8* }
+%struct.ham.3 = type { %struct.pluto }
+%struct.pluto = type { %struct.zot*, %struct.snork.5, %struct.wibble }
+%struct.zot = type { %struct.blam.4* }
+%struct.blam.4 = type <{ %struct.zot, %struct.blam.4*, %struct.zot*, i8, [3 x i8] }>
+%struct.snork.5 = type { %struct.quux }
+%struct.quux = type { %struct.zot }
+%struct.wibble = type { %struct.widget }
+%struct.widget = type { i32 }
+%struct.bar = type { %struct.spam }
+%struct.spam = type { %struct.zot*, %struct.wobble, %struct.zot.7 }
+%struct.wobble = type { %struct.wibble.6 }
+%struct.wibble.6 = type { %struct.zot }
+%struct.zot.7 = type { %struct.ham.8 }
+%struct.ham.8 = type { i32 }
+%struct.hoge = type { %struct.ham, %struct.foo }
+%struct.foo = type { float, float }
+%struct.wombat = type { %struct.ham, float }
+%struct.snork = type { %struct.ham.9, [11 x i8] }
+%struct.ham.9 = type { i8 }
+
+@global = external global i8
+@global.1 = private constant [20 x i8] c"aaaaaaaaaaaaaaaaaa0\00"
+@global.2 = external constant [27 x i8]
+@global.3 = external global %struct.ham
+@global.4 = external constant [47 x i8]
+@global.5 = external constant [61 x i8]
+@global.6 = external constant [40 x i8]
+@global.7 = external constant [24 x i8]
+@global.8 = external constant [20 x i8]
+@global.9 = external global %struct.ham
+@global.10 = external global %struct.ham
+@global.11 = external global %struct.ham
+@global.12 = external global %struct.ham
+@global.13 = external global %struct.ham
+@global.14 = external global %struct.ham
+@global.15 = external global %struct.ham
+@global.16 = external global %struct.ham
+@global.17 = external global %struct.ham
+@global.18 = external constant [35 x i8]
+@global.19 = external global %struct.ham
+@global.20 = external constant [53 x i8]
+@global.21 = external global %struct.ham
+@global.22 = external global %struct.ham
+@global.23 = external global %struct.ham
+@global.24 = external constant [32 x i8]
+@global.25 = external global %struct.ham
+@global.26 = external constant [47 x i8]
+@global.27 = external global %struct.ham
+@global.28 = external constant [45 x i8]
+@global.29 = external global %struct.ham
+@global.30 = external global %struct.ham
+@global.31 = external constant [24 x i8]
+@global.32 = external global %struct.ham
+@global.33 = external global %struct.ham
+@global.34 = external global %struct.ham
+@global.35 = external global %struct.ham
+@global.36 = external constant [27 x i8]
+@global.37 = external global %struct.ham
+@global.38 = external constant [10 x i8]
+@global.39 = external global %struct.ham
+@global.40 = external global %struct.ham
+@global.41 = external global %struct.ham
+@global.42 = external global %struct.ham
+@global.43 = external global %struct.ham
+@global.44 = external constant [41 x i8]
+@global.45 = external global %struct.ham
+@global.46 = external global %struct.ham
+@global.47 = external global %struct.ham
+@global.48 = external global %struct.ham
+@global.49 = external constant [52 x i8]
+@global.50 = external constant [47 x i8]
+@global.51 = external global %struct.ham
+@global.52 = external global %struct.ham
+@global.53 = external global %struct.ham
+@global.54 = external global %struct.ham
+@global.55 = external global %struct.ham.3
+@global.56 = external global %struct.bar
+@global.57 = external global i8
+
+declare %struct.ham* @bar(%struct.ham* returned)
+
+declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*)
+
+declare %struct.ham* @wobble(%struct.ham* returned, %struct.ham* )
+
+declare i32 @quux(...)
+
+declare i8* @_Znwm(i32)
+
+declare i32 @wobble.58(%struct.pluto*, [1 x i32], %struct.ham* , %struct.hoge* )
+
+declare i32 @widget(%struct.spam*, [1 x i32], %struct.ham* , %struct.wombat* )
+
+; Just check we didn't crash and did output something...
+; CHECK-LABEL: func:
+; CHECK: trap
+define internal void @func() section "__TEXT,__StaticInit,regular,pure_instructions" personality i32 (...)* @quux {
+ %tmp = tail call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.3 to i8*), i8* @global) #0
+ %tmp2 = invoke %struct.ham* @wobble(%struct.ham* undef, %struct.ham* @global.9)
+ to label %bb14 unwind label %bbunwind
+
+bb14:
+ %tmp15 = getelementptr i8, i8* undef, i32 12
+ store i8 0, i8* %tmp15
+ %tmp16 = icmp eq i8 undef, 0
+ br i1 %tmp16, label %bb28, label %bb18
+
+bb18:
+ br i1 undef, label %bb21, label %bb29
+
+bb21:
+ %tmp22 = call i8* @_Znwm(i32 16)
+ store i32 17, i32* getelementptr (%struct.ham, %struct.ham* @global.10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp23 = call i8* @_Znwm(i32 32)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([27 x i8], [27 x i8]* @global.2, i32 0, i32 0), i32 26, i32 1, i1 false)
+ store i32 33, i32* getelementptr (%struct.ham, %struct.ham* @global.11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ store i32 23, i32* getelementptr (%struct.ham, %struct.ham* @global.11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([24 x i8], [24 x i8]* @global.7, i32 0, i32 0), i32 23, i32 1, i1 false)
+ %tmp24 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.11 to i8*), i8* @global) #0
+ store i32 49, i32* getelementptr (%struct.ham, %struct.ham* @global.12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ store i32 37, i32* getelementptr (%struct.ham, %struct.ham* @global.13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
+ call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.14 to i8*), i8 0, i32 12, i32 1, i1 false)
+ %tmp25 = call i8* @_Znwm(i32 48)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp25, i8* align 1 getelementptr ([40 x i8], [40 x i8]* @global.6, i32 0, i32 0), i32 39, i32 1, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([47 x i8], [47 x i8]* @global.4, i32 0, i32 0), i32 46, i32 1, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([61 x i8], [61 x i8]* @global.5, i32 0, i32 0), i32 60, i32 1, i1 false)
+ %tmp26 = call i8* @_Znwm(i32 48)
+ store i32 65, i32* getelementptr (%struct.ham, %struct.ham* @global.15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp27 = icmp eq i8 undef, 0
+ br i1 %tmp27, label %bb30, label %bb33
+
+bb28:
+ call void @llvm.trap()
+ unreachable
+
+bb29:
+ call void @llvm.trap()
+ unreachable
+
+bb30:
+ %tmp31 = icmp eq i32 undef, 37
+ br i1 %tmp31, label %bb32, label %bb30
+
+bb32:
+ store i8 1, i8* @global.57
+ br label %bb33
+
+bb33:
+ %tmp34 = call i8* @_Znwm(i32 32)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([20 x i8], [20 x i8]* @global.1, i32 0, i32 0), i32 19, i32 1, i1 false)
+ store i32 17, i32* getelementptr (%struct.ham, %struct.ham* @global.16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ store i32 65, i32* getelementptr (%struct.ham, %struct.ham* @global.17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([35 x i8], [35 x i8]* @global.18, i32 0, i32 0), i32 34, i32 1, i1 false)
+ store i32 65, i32* getelementptr (%struct.ham, %struct.ham* @global.19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([53 x i8], [53 x i8]* @global.20, i32 0, i32 0), i32 52, i32 1, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([20 x i8], [20 x i8]* @global.8, i32 0, i32 0), i32 19, i32 1, i1 false)
+ store i32 37, i32* getelementptr (%struct.ham, %struct.ham* @global.21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
+ %tmp35 = call i8* @_Znwm(i32 32)
+ store i8 16, i8* bitcast (%struct.ham* @global.22 to i8*)
+ %tmp36 = call i8* @_Znwm(i32 32)
+ store i32 31, i32* getelementptr (%struct.ham, %struct.ham* @global.23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp36, i8* align 1 getelementptr ([32 x i8], [32 x i8]* @global.24, i32 0, i32 0), i32 31, i32 1, i1 false)
+ %tmp37 = getelementptr i8, i8* %tmp36, i32 31
+ store i8 0, i8* %tmp37
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([47 x i8], [47 x i8]* @global.26, i32 0, i32 0), i32 46, i32 1, i1 false)
+ %tmp38 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.25 to i8*), i8* @global) #0
+ %tmp39 = call i8* @_Znwm(i32 48)
+ store i32 44, i32* getelementptr (%struct.ham, %struct.ham* @global.27, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp39, i8* align 1 getelementptr ([45 x i8], [45 x i8]* @global.28, i32 0, i32 0), i32 44, i32 1, i1 false)
+ %tmp40 = getelementptr i8, i8* %tmp39, i32 44
+ store i8 0, i8* %tmp40
+ call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.29 to i8*), i8 0, i32 12, i32 1, i1 false)
+ %tmp41 = call i8* @_Znwm(i32 32)
+ store i32 23, i32* getelementptr (%struct.ham, %struct.ham* @global.30, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp41, i8* align 1 getelementptr ([24 x i8], [24 x i8]* @global.31, i32 0, i32 0), i32 23, i32 1, i1 false)
+ %tmp42 = getelementptr i8, i8* %tmp41, i32 23
+ store i8 0, i8* %tmp42
+ call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.32 to i8*), i8 0, i32 12, i32 1, i1 false)
+ store i8 16, i8* bitcast (%struct.ham* @global.32 to i8*)
+ %tmp43 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.33 to i8*), i8* @global) #0
+ %tmp44 = call i8* @_Znwm(i32 16)
+ call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.34 to i8*), i8 0, i32 12, i32 1, i1 false)
+ call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.9 to i8*), i8 0, i32 12, i32 1, i1 false)
+ %tmp45 = call i8* @_Znwm(i32 32)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp45, i8* align 1 getelementptr ([27 x i8], [27 x i8]* @global.36, i32 0, i32 0), i32 26, i32 1, i1 false)
+ call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.37 to i8*), i8 0, i32 12, i32 1, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 getelementptr (%struct.snork, %struct.snork* bitcast (%struct.ham* @global.37 to %struct.snork*), i32 0, i32 1, i32 0), i8* align 1 getelementptr ([10 x i8], [10 x i8]* @global.38, i32 0, i32 0), i32 9, i32 1, i1 false)
+ store i32 17, i32* getelementptr (%struct.ham, %struct.ham* @global.39, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp46 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.40 to i8*), i8* @global) #0
+ %tmp47 = call i8* @_Znwm(i32 32)
+ %tmp48 = getelementptr i8, i8* %tmp47, i32 21
+ store i8 0, i8* %tmp48
+ store i32 33, i32* getelementptr (%struct.ham, %struct.ham* @global.41, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ store i32 15, i32* getelementptr (%struct.ham, %struct.ham* @global.42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
+ %tmp49 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.43 to i8*), i8* @global) #0
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([41 x i8], [41 x i8]* @global.44, i32 0, i32 0), i32 40, i32 1, i1 false)
+ %tmp50 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.45 to i8*), i8* @global) #0
+ %tmp51 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.46 to i8*), i8* @global) #0
+ %tmp52 = call i8* @_Znwm(i32 32)
+ store i8* %tmp52, i8** getelementptr (%struct.ham, %struct.ham* @global.47, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([52 x i8], [52 x i8]* @global.49, i32 0, i32 0), i32 51, i32 1, i1 false)
+ %tmp53 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.48 to i8*), i8* @global) #0
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([47 x i8], [47 x i8]* @global.50, i32 0, i32 0), i32 46, i32 1, i1 false)
+ store i32 33, i32* getelementptr (%struct.ham, %struct.ham* @global.51, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ store i32 37, i32* getelementptr (%struct.ham, %struct.ham* @global.52, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
+ %tmp54 = invoke %struct.ham* @wobble(%struct.ham* undef, %struct.ham* @global.54)
+ to label %bb58 unwind label %bbunwind
+
+bb58:
+ %tmp59 = invoke i32 @wobble.58(%struct.pluto* getelementptr (%struct.ham.3, %struct.ham.3* @global.55, i32 0, i32 0), [1 x i32] [i32 ptrtoint (%struct.zot* getelementptr (%struct.ham.3, %struct.ham.3* @global.55, i32 0, i32 0, i32 1, i32 0, i32 0) to i32)], %struct.ham* undef, %struct.hoge* undef)
+ to label %bb71 unwind label %bbunwind
+
+bb71:
+ %tmp72 = invoke i32 @widget(%struct.spam* getelementptr (%struct.bar, %struct.bar* @global.56, i32 0, i32 0), [1 x i32] [i32 ptrtoint (%struct.zot* getelementptr (%struct.bar, %struct.bar* @global.56, i32 0, i32 0, i32 1, i32 0, i32 0) to i32)], %struct.ham* undef, %struct.wombat* undef)
+ to label %bb73 unwind label %bbunwind
+
+bb73:
+ ret void
+
+bbunwind:
+ %tmp75 = landingpad { i8*, i32 }
+ cleanup
+ resume { i8*, i32 } undef
+}
+
+declare void @llvm.trap()
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* , i8* , i32, i32, i1)
+
+declare void @llvm.memset.p0i8.i32(i8* , i8, i32, i32, i1)
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/Thumb/stm-scavenging.ll b/test/CodeGen/Thumb/stm-scavenging.ll
new file mode 100644
index 000000000000..3ed5763f2955
--- /dev/null
+++ b/test/CodeGen/Thumb/stm-scavenging.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s | FileCheck %s
+target triple = "thumbv6---gnueabi"
+
+; Use STM to save the three registers
+; CHECK-LABEL: use_stm:
+; CHECK: .save {r7, lr}
+; CHECK: .setfp r7, sp
+; CHECK: stm r3!, {r0, r1, r2}
+; CHECK: bl throws_1
+define void @use_stm(i32 %a, i32 %b, i32 %c, i32* %d) local_unnamed_addr noreturn "no-frame-pointer-elim"="true" {
+entry:
+ %arrayidx = getelementptr inbounds i32, i32* %d, i32 2
+ store i32 %a, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %d, i32 3
+ store i32 %b, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %d, i32 4
+ store i32 %c, i32* %arrayidx2, align 4
+ tail call void @throws_1(i32 %a, i32 %b, i32 %c) noreturn
+ unreachable
+}
+
+; Don't use STM: there is no available register to store
+; the address. We could transform this with some extra math, but
+; that currently isn't implemented.
+; CHECK-LABEL: no_stm:
+; CHECK: .save {r7, lr}
+; CHECK: .setfp r7, sp
+; CHECK: str r0,
+; CHECK: str r1,
+; CHECK: str r2,
+; CHECK: bl throws_2
+define void @no_stm(i32 %a, i32 %b, i32 %c, i32* %d) local_unnamed_addr noreturn "no-frame-pointer-elim"="true" {
+entry:
+ %arrayidx = getelementptr inbounds i32, i32* %d, i32 2
+ store i32 %a, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %d, i32 3
+ store i32 %b, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %d, i32 4
+ store i32 %c, i32* %arrayidx2, align 4
+ tail call void @throws_2(i32 %a, i32 %b, i32 %c, i32* %d) noreturn
+ unreachable
+}
+
+
+declare void @throws_1(i32, i32, i32) noreturn
+declare void @throws_2(i32, i32, i32, i32*) noreturn
diff --git a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index 50de773af001..80127f66bdfe 100644
--- a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -5,59 +5,6 @@
; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c
-define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) local_unnamed_addr #0 {
-; X32-LABEL: test_mm512_kunpackb:
-; X32: # %bb.0: # %entry
-; X32-NEXT: pushl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: .cfi_offset %ebp, -8
-; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: .cfi_def_cfa_register %ebp
-; X32-NEXT: andl $-64, %esp
-; X32-NEXT: subl $64, %esp
-; X32-NEXT: vmovdqa64 136(%ebp), %zmm3
-; X32-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
-; X32-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
-; X32-NEXT: kunpckbw %k0, %k1, %k1
-; X32-NEXT: vpcmpneqd 72(%ebp), %zmm3, %k0 {%k1}
-; X32-NEXT: kmovw %k0, %eax
-; X32-NEXT: movzwl %ax, %eax
-; X32-NEXT: movl %ebp, %esp
-; X32-NEXT: popl %ebp
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm512_kunpackb:
-; X64: # %bb.0: # %entry
-; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
-; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
-; X64-NEXT: kunpckbw %k0, %k1, %k1
-; X64-NEXT: vpcmpneqd %zmm5, %zmm4, %k0 {%k1}
-; X64-NEXT: kmovw %k0, %eax
-; X64-NEXT: movzwl %ax, %eax
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
-entry:
- %0 = bitcast <8 x i64> %__A to <16 x i32>
- %1 = bitcast <8 x i64> %__B to <16 x i32>
- %2 = icmp ne <16 x i32> %0, %1
- %3 = bitcast <16 x i1> %2 to i16
- %4 = bitcast <8 x i64> %__C to <16 x i32>
- %5 = bitcast <8 x i64> %__D to <16 x i32>
- %6 = icmp ne <16 x i32> %4, %5
- %7 = bitcast <16 x i1> %6 to i16
- %8 = and i16 %7, 255
- %shl.i = shl i16 %3, 8
- %or.i = or i16 %8, %shl.i
- %9 = bitcast <8 x i64> %__E to <16 x i32>
- %10 = bitcast <8 x i64> %__F to <16 x i32>
- %11 = icmp ne <16 x i32> %9, %10
- %12 = bitcast i16 %or.i to <16 x i1>
- %13 = and <16 x i1> %11, %12
- %14 = bitcast <16 x i1> %13 to i16
- ret i16 %14
-}
-
define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) {
; X32-LABEL: test_mm512_shuffle_f32x4:
; X32: # %bb.0: # %entry
diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index f3ca0644e463..378dbda2dc0a 100644
--- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -1,20 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
-
-define i16 @unpckbw_test(i16 %a0, i16 %a1) {
-; CHECK-LABEL: unpckbw_test:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: shll $8, %esi
-; CHECK-NEXT: orl %esi, %eax
-; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
-; CHECK-NEXT: retq
- %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
- ret i16 %res
-}
-
define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512:
; CHECK: ## %bb.0:
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 5faa202c30f3..628199d4ac9e 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -96,6 +96,21 @@ define i16 @test_kor(i16 %a0, i16 %a1) {
ret i16 %t2
}
+declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
+
+define i16 @unpckbw_test(i16 %a0, i16 %a1) {
+; CHECK-LABEL: unpckbw_test:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %edi, %k0
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: kunpckbw %k1, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: retq
+ %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
+ ret i16 %res
+}
+
declare i16 @llvm.x86.avx512.kxnor.w(i16, i16) nounwind readnone
; TODO: the two kxnor instructions here are a no-op and should be eliminated,
; probably by FoldConstantArithmetic in SelectionDAG.
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index 4877157d911d..d112577a6104 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -2775,3 +2775,99 @@ define i8 @test_v8i1_mul(i8 %x, i8 %y) {
%ret = bitcast <8 x i1> %m2 to i8
ret i8 %ret
}
+
+; Make sure we don't emit a ktest for signed comparisons.
+define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
+; KNL-LABEL: ktest_signed:
+; KNL: ## %bb.0:
+; KNL-NEXT: pushq %rax
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: vporq %zmm1, %zmm0, %zmm0
+; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: testw %ax, %ax
+; KNL-NEXT: jle LBB63_1
+; KNL-NEXT: ## %bb.2: ## %bb.2
+; KNL-NEXT: popq %rax
+; KNL-NEXT: vzeroupper
+; KNL-NEXT: retq
+; KNL-NEXT: LBB63_1: ## %bb.1
+; KNL-NEXT: vzeroupper
+; KNL-NEXT: callq _foo
+; KNL-NEXT: popq %rax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: ktest_signed:
+; SKX: ## %bb.0:
+; SKX-NEXT: pushq %rax
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: vporq %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; SKX-NEXT: kmovd %k0, %eax
+; SKX-NEXT: testw %ax, %ax
+; SKX-NEXT: jle LBB63_1
+; SKX-NEXT: ## %bb.2: ## %bb.2
+; SKX-NEXT: popq %rax
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+; SKX-NEXT: LBB63_1: ## %bb.1
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: callq _foo
+; SKX-NEXT: popq %rax
+; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: ktest_signed:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: pushq %rax
+; AVX512BW-NEXT: .cfi_def_cfa_offset 16
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: testw %ax, %ax
+; AVX512BW-NEXT: jle LBB63_1
+; AVX512BW-NEXT: ## %bb.2: ## %bb.2
+; AVX512BW-NEXT: popq %rax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+; AVX512BW-NEXT: LBB63_1: ## %bb.1
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: callq _foo
+; AVX512BW-NEXT: popq %rax
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: ktest_signed:
+; AVX512DQ: ## %bb.0:
+; AVX512DQ-NEXT: pushq %rax
+; AVX512DQ-NEXT: .cfi_def_cfa_offset 16
+; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512DQ-NEXT: kmovw %k0, %eax
+; AVX512DQ-NEXT: testw %ax, %ax
+; AVX512DQ-NEXT: jle LBB63_1
+; AVX512DQ-NEXT: ## %bb.2: ## %bb.2
+; AVX512DQ-NEXT: popq %rax
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+; AVX512DQ-NEXT: LBB63_1: ## %bb.1
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: callq _foo
+; AVX512DQ-NEXT: popq %rax
+; AVX512DQ-NEXT: retq
+ %a = icmp eq <16 x i32> %x, zeroinitializer
+ %b = icmp eq <16 x i32> %y, zeroinitializer
+ %c = and <16 x i1> %a, %b
+ %d = bitcast <16 x i1> %c to i16
+ %e = icmp sgt i16 %d, 0
+ br i1 %e, label %bb.2, label %bb.1
+bb.1:
+ call void @foo()
+ br label %bb.2
+bb.2:
+ ret void
+}
+declare void @foo()
+
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
index 1e754be6fe49..a56111f3453e 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
@@ -4,117 +4,6 @@
; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c
-define i64 @test_mm512_kunpackd(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
-; X32-LABEL: test_mm512_kunpackd:
-; X32: # %bb.0: # %entry
-; X32-NEXT: pushl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: .cfi_offset %ebp, -8
-; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: .cfi_def_cfa_register %ebp
-; X32-NEXT: andl $-64, %esp
-; X32-NEXT: subl $64, %esp
-; X32-NEXT: vmovdqa64 136(%ebp), %zmm3
-; X32-NEXT: vmovdqa64 72(%ebp), %zmm4
-; X32-NEXT: vmovdqa64 8(%ebp), %zmm5
-; X32-NEXT: vpcmpneqb %zmm0, %zmm1, %k0
-; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
-; X32-NEXT: vpcmpneqb %zmm5, %zmm2, %k0
-; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
-; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT: kunpckdq %k0, %k1, %k1
-; X32-NEXT: vpcmpneqb %zmm3, %zmm4, %k0 {%k1}
-; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %ebp, %esp
-; X32-NEXT: popl %ebp
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm512_kunpackd:
-; X64: # %bb.0: # %entry
-; X64-NEXT: vpcmpneqb %zmm0, %zmm1, %k0
-; X64-NEXT: vpcmpneqb %zmm3, %zmm2, %k1
-; X64-NEXT: kunpckdq %k0, %k1, %k1
-; X64-NEXT: vpcmpneqb %zmm5, %zmm4, %k0 {%k1}
-; X64-NEXT: kmovq %k0, %rax
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
-entry:
- %0 = bitcast <8 x i64> %__B to <64 x i8>
- %1 = bitcast <8 x i64> %__A to <64 x i8>
- %2 = icmp ne <64 x i8> %0, %1
- %3 = bitcast <64 x i1> %2 to i64
- %4 = bitcast <8 x i64> %__C to <64 x i8>
- %5 = bitcast <8 x i64> %__D to <64 x i8>
- %6 = icmp ne <64 x i8> %4, %5
- %7 = bitcast <64 x i1> %6 to i64
- %and.i = and i64 %7, 4294967295
- %shl.i = shl i64 %3, 32
- %or.i = or i64 %and.i, %shl.i
- %8 = bitcast <8 x i64> %__E to <64 x i8>
- %9 = bitcast <8 x i64> %__F to <64 x i8>
- %10 = icmp ne <64 x i8> %8, %9
- %11 = bitcast i64 %or.i to <64 x i1>
- %12 = and <64 x i1> %10, %11
- %13 = bitcast <64 x i1> %12 to i64
- ret i64 %13
-}
-
-define i32 @test_mm512_kunpackw(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
-; X32-LABEL: test_mm512_kunpackw:
-; X32: # %bb.0: # %entry
-; X32-NEXT: pushl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: .cfi_offset %ebp, -8
-; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: .cfi_def_cfa_register %ebp
-; X32-NEXT: andl $-64, %esp
-; X32-NEXT: subl $64, %esp
-; X32-NEXT: vmovdqa64 136(%ebp), %zmm3
-; X32-NEXT: vpcmpneqw %zmm0, %zmm1, %k0
-; X32-NEXT: vpcmpneqw 8(%ebp), %zmm2, %k1
-; X32-NEXT: kunpckwd %k0, %k1, %k1
-; X32-NEXT: vpcmpneqw 72(%ebp), %zmm3, %k0 {%k1}
-; X32-NEXT: kmovd %k0, %eax
-; X32-NEXT: movl %ebp, %esp
-; X32-NEXT: popl %ebp
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm512_kunpackw:
-; X64: # %bb.0: # %entry
-; X64-NEXT: vpcmpneqw %zmm0, %zmm1, %k0
-; X64-NEXT: vpcmpneqw %zmm3, %zmm2, %k1
-; X64-NEXT: kunpckwd %k0, %k1, %k1
-; X64-NEXT: vpcmpneqw %zmm5, %zmm4, %k0 {%k1}
-; X64-NEXT: kmovd %k0, %eax
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
-entry:
- %0 = bitcast <8 x i64> %__B to <32 x i16>
- %1 = bitcast <8 x i64> %__A to <32 x i16>
- %2 = icmp ne <32 x i16> %0, %1
- %3 = bitcast <32 x i1> %2 to i32
- %4 = bitcast <8 x i64> %__C to <32 x i16>
- %5 = bitcast <8 x i64> %__D to <32 x i16>
- %6 = icmp ne <32 x i16> %4, %5
- %7 = bitcast <32 x i1> %6 to i32
- %and.i = and i32 %7, 65535
- %shl.i = shl i32 %3, 16
- %or.i = or i32 %and.i, %shl.i
- %8 = bitcast <8 x i64> %__E to <32 x i16>
- %9 = bitcast <8 x i64> %__F to <32 x i16>
- %10 = icmp ne <32 x i16> %8, %9
- %11 = bitcast i32 %or.i to <32 x i1>
- %12 = and <32 x i1> %10, %11
- %13 = bitcast <32 x i1> %12 to i32
- ret i32 %13
-}
-
-
define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext %__A) {
; X32-LABEL: test_mm512_mask_set1_epi8:
; X32: # %bb.0: # %entry
@@ -189,46 +78,19 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: movb %ch, %al
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: kshiftlq $63, %k1, %k1
-; X32-NEXT: kshiftrq $55, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k0
-; X32-NEXT: kshiftrq $9, %k0, %k1
; X32-NEXT: andb $2, %al
; X32-NEXT: shrb %al
; X32-NEXT: kmovd %eax, %k2
-; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: kshiftlq $63, %k1, %k1
-; X32-NEXT: kshiftrq $54, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k0
-; X32-NEXT: kshiftrq $10, %k0, %k1
; X32-NEXT: movb %ch, %al
; X32-NEXT: andb $15, %al
; X32-NEXT: movl %eax, %edx
; X32-NEXT: shrb $2, %dl
-; X32-NEXT: kmovd %edx, %k2
-; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: kshiftlq $63, %k1, %k1
-; X32-NEXT: kshiftrq $53, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k0
-; X32-NEXT: kshiftrq $11, %k0, %k1
+; X32-NEXT: kmovd %edx, %k3
; X32-NEXT: shrb $3, %al
-; X32-NEXT: kmovd %eax, %k2
-; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: shrl $12, %eax
-; X32-NEXT: andl $15, %eax
-; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kmovd %eax, %k4
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $13, %eax
; X32-NEXT: andb $1, %al
-; X32-NEXT: kmovd %eax, %k3
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: shrl $14, %eax
-; X32-NEXT: andl $3, %eax
-; X32-NEXT: kmovd %eax, %k4
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: shrl $15, %eax
-; X32-NEXT: andl $1, %eax
; X32-NEXT: kmovd %eax, %k5
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrl $16, %edx
@@ -243,25 +105,52 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kmovd %eax, %k7
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $55, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $9, %k0, %k1
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $54, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $10, %k0, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $53, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $11, %k0, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $52, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $12, %k0, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $12, %esi
+; X32-NEXT: andl $15, %esi
+; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $51, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $13, %k0, %k1
-; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kxorq %k5, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $50, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $14, %k0, %k1
-; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $14, %esi
+; X32-NEXT: andl $3, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $49, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $15, %k0, %k1
-; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $15, %esi
+; X32-NEXT: andl $1, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $48, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@@ -494,22 +383,14 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $43, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: shrl $12, %esi
-; X32-NEXT: andl $15, %esi
-; X32-NEXT: kmovd %esi, %k2
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: shrl $14, %esi
-; X32-NEXT: andl $3, %esi
-; X32-NEXT: kmovd %esi, %k3
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: shrl $15, %esi
-; X32-NEXT: andl $1, %esi
-; X32-NEXT: kmovd %esi, %k4
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $20, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $44, %k0, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $12, %esi
+; X32-NEXT: andl $15, %esi
+; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $19, %k1, %k1
@@ -520,12 +401,20 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kshiftrq $18, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $46, %k0, %k1
-; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $14, %esi
+; X32-NEXT: andl $3, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $17, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $47, %k0, %k1
-; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $15, %esi
+; X32-NEXT: andl $1, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $16, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@@ -551,8 +440,8 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $12, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k4
-; X32-NEXT: kshiftrq $52, %k4, %k0
+; X32-NEXT: kxorq %k0, %k1, %k3
+; X32-NEXT: kshiftrq $52, %k3, %k0
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrb $4, %dl
; X32-NEXT: kmovd %edx, %k1
@@ -576,19 +465,19 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: andb $15, %cl
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrb $2, %dl
-; X32-NEXT: kmovd %edx, %k3
+; X32-NEXT: kmovd %edx, %k4
; X32-NEXT: kshiftlq $63, %k5, %k5
; X32-NEXT: kshiftrq $11, %k5, %k5
-; X32-NEXT: kxorq %k4, %k5, %k4
-; X32-NEXT: kshiftrq $53, %k4, %k5
+; X32-NEXT: kxorq %k3, %k5, %k3
+; X32-NEXT: kshiftrq $53, %k3, %k5
; X32-NEXT: kxorq %k6, %k5, %k5
; X32-NEXT: kshiftlq $63, %k5, %k5
; X32-NEXT: kshiftrq $10, %k5, %k5
-; X32-NEXT: kxorq %k4, %k5, %k5
-; X32-NEXT: kshiftrq $54, %k5, %k4
-; X32-NEXT: kxorq %k7, %k4, %k6
+; X32-NEXT: kxorq %k3, %k5, %k5
+; X32-NEXT: kshiftrq $54, %k5, %k3
+; X32-NEXT: kxorq %k7, %k3, %k6
; X32-NEXT: shrb $3, %cl
-; X32-NEXT: kmovd %ecx, %k4
+; X32-NEXT: kmovd %ecx, %k3
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $29, %ecx
; X32-NEXT: andb $1, %cl
@@ -603,12 +492,6 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kxorq %k5, %k0, %k0
; X32-NEXT: kshiftrq $56, %k0, %k5
; X32-NEXT: kxorq %k1, %k5, %k1
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: shrl $28, %ecx
-; X32-NEXT: kmovd %ecx, %k5
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: shrl $30, %ecx
-; X32-NEXT: kmovd %ecx, %k6
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $7, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@@ -618,17 +501,20 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kshiftrq $6, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $58, %k0, %k1
-; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $5, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $59, %k0, %k1
-; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $4, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $60, %k0, %k1
-; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $28, %ecx
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $3, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@@ -638,7 +524,10 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kshiftrq $2, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $62, %k0, %k1
-; X32-NEXT: kxorq %k6, %k1, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $30, %ecx
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: shrl $31, %eax
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
@@ -743,46 +632,19 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: movb %ch, %al
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: kshiftlq $63, %k1, %k1
-; X32-NEXT: kshiftrq $55, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k0
-; X32-NEXT: kshiftrq $9, %k0, %k1
; X32-NEXT: andb $2, %al
; X32-NEXT: shrb %al
; X32-NEXT: kmovd %eax, %k2
-; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: kshiftlq $63, %k1, %k1
-; X32-NEXT: kshiftrq $54, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k0
-; X32-NEXT: kshiftrq $10, %k0, %k1
; X32-NEXT: movb %ch, %al
; X32-NEXT: andb $15, %al
; X32-NEXT: movl %eax, %edx
; X32-NEXT: shrb $2, %dl
-; X32-NEXT: kmovd %edx, %k2
-; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: kshiftlq $63, %k1, %k1
-; X32-NEXT: kshiftrq $53, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k0
-; X32-NEXT: kshiftrq $11, %k0, %k1
+; X32-NEXT: kmovd %edx, %k3
; X32-NEXT: shrb $3, %al
-; X32-NEXT: kmovd %eax, %k2
-; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: shrl $12, %eax
-; X32-NEXT: andl $15, %eax
-; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kmovd %eax, %k4
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $13, %eax
; X32-NEXT: andb $1, %al
-; X32-NEXT: kmovd %eax, %k3
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: shrl $14, %eax
-; X32-NEXT: andl $3, %eax
-; X32-NEXT: kmovd %eax, %k4
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: shrl $15, %eax
-; X32-NEXT: andl $1, %eax
; X32-NEXT: kmovd %eax, %k5
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrl $16, %edx
@@ -797,25 +659,52 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kmovd %eax, %k7
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $55, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $9, %k0, %k1
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $54, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $10, %k0, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $53, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $11, %k0, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $52, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $12, %k0, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $12, %esi
+; X32-NEXT: andl $15, %esi
+; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $51, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $13, %k0, %k1
-; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kxorq %k5, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $50, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $14, %k0, %k1
-; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $14, %esi
+; X32-NEXT: andl $3, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $49, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $15, %k0, %k1
-; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $15, %esi
+; X32-NEXT: andl $1, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $48, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@@ -1048,22 +937,14 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $43, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: shrl $12, %esi
-; X32-NEXT: andl $15, %esi
-; X32-NEXT: kmovd %esi, %k2
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: shrl $14, %esi
-; X32-NEXT: andl $3, %esi
-; X32-NEXT: kmovd %esi, %k3
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: shrl $15, %esi
-; X32-NEXT: andl $1, %esi
-; X32-NEXT: kmovd %esi, %k4
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $20, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $44, %k0, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $12, %esi
+; X32-NEXT: andl $15, %esi
+; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $19, %k1, %k1
@@ -1074,12 +955,20 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kshiftrq $18, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $46, %k0, %k1
-; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $14, %esi
+; X32-NEXT: andl $3, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $17, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $47, %k0, %k1
-; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $15, %esi
+; X32-NEXT: andl $1, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $16, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@@ -1105,8 +994,8 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $12, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k4
-; X32-NEXT: kshiftrq $52, %k4, %k0
+; X32-NEXT: kxorq %k0, %k1, %k3
+; X32-NEXT: kshiftrq $52, %k3, %k0
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrb $4, %dl
; X32-NEXT: kmovd %edx, %k1
@@ -1130,19 +1019,19 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: andb $15, %cl
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrb $2, %dl
-; X32-NEXT: kmovd %edx, %k3
+; X32-NEXT: kmovd %edx, %k4
; X32-NEXT: kshiftlq $63, %k5, %k5
; X32-NEXT: kshiftrq $11, %k5, %k5
-; X32-NEXT: kxorq %k4, %k5, %k4
-; X32-NEXT: kshiftrq $53, %k4, %k5
+; X32-NEXT: kxorq %k3, %k5, %k3
+; X32-NEXT: kshiftrq $53, %k3, %k5
; X32-NEXT: kxorq %k6, %k5, %k5
; X32-NEXT: kshiftlq $63, %k5, %k5
; X32-NEXT: kshiftrq $10, %k5, %k5
-; X32-NEXT: kxorq %k4, %k5, %k5
-; X32-NEXT: kshiftrq $54, %k5, %k4
-; X32-NEXT: kxorq %k7, %k4, %k6
+; X32-NEXT: kxorq %k3, %k5, %k5
+; X32-NEXT: kshiftrq $54, %k5, %k3
+; X32-NEXT: kxorq %k7, %k3, %k6
; X32-NEXT: shrb $3, %cl
-; X32-NEXT: kmovd %ecx, %k4
+; X32-NEXT: kmovd %ecx, %k3
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $29, %ecx
; X32-NEXT: andb $1, %cl
@@ -1157,12 +1046,6 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kxorq %k5, %k0, %k0
; X32-NEXT: kshiftrq $56, %k0, %k5
; X32-NEXT: kxorq %k1, %k5, %k1
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: shrl $28, %ecx
-; X32-NEXT: kmovd %ecx, %k5
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: shrl $30, %ecx
-; X32-NEXT: kmovd %ecx, %k6
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $7, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@@ -1172,17 +1055,20 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kshiftrq $6, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $58, %k0, %k1
-; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $5, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $59, %k0, %k1
-; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $4, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $60, %k0, %k1
-; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $28, %ecx
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $3, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@@ -1192,7 +1078,10 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kshiftrq $2, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $62, %k0, %k1
-; X32-NEXT: kxorq %k6, %k1, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $30, %ecx
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: shrl $31, %eax
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index f19e09758f12..13aca464b9e2 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -2,46 +2,6 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
-declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32)
-
-define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) {
-; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd:
-; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: movzwl %di, %eax
-; AVX512BW-NEXT: shll $16, %esi
-; AVX512BW-NEXT: orl %esi, %eax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd:
-; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: shll $16, %eax
-; AVX512F-32-NEXT: orl %ecx, %eax
-; AVX512F-32-NEXT: retl
- %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1)
- ret i32 %res
-}
-
-declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64)
-
-define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) {
-; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd:
-; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: movl %edi, %eax
-; AVX512BW-NEXT: shlq $32, %rsi
-; AVX512BW-NEXT: orq %rsi, %rax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd:
-; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: retl
- %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1)
- ret i64 %res
-}
-
declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64)
define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) {
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 2fa7c2c5b8a8..7b5cc5feff0c 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -1455,6 +1455,55 @@ define <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8>
ret <8 x i64> %res2
}
+declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32)
+
+define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) {
+; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovd %edi, %k0
+; AVX512BW-NEXT: kmovd %esi, %k1
+; AVX512BW-NEXT: kunpckwd %k1, %k0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k0
+; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: kunpckwd %k0, %k1, %k0
+; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: retl
+ %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1)
+ ret i32 %res
+}
+
+declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64)
+
+define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) {
+; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovq %rdi, %k0
+; AVX512BW-NEXT: kmovq %rsi, %k1
+; AVX512BW-NEXT: kunpckdq %k1, %k0, %k0
+; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: subl $12, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k0
+; AVX512F-32-NEXT: kmovq %k0, (%esp)
+; AVX512F-32-NEXT: movl (%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: retl
+ %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1)
+ ret i64 %res
+}
+
declare i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8>)
define i64@test_int_x86_avx512_cvtb2mask_512(<64 x i8> %x0) {
diff --git a/test/CodeGen/X86/domain-reassignment.mir b/test/CodeGen/X86/domain-reassignment.mir
index 3cb4b5dd1396..7da9b083c22e 100644
--- a/test/CodeGen/X86/domain-reassignment.mir
+++ b/test/CodeGen/X86/domain-reassignment.mir
@@ -1,22 +1,23 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -run-pass x86-domain-reassignment -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq -o - %s | FileCheck %s
--- |
; ModuleID = '../test/CodeGen/X86/gpr-to-mask.ll'
source_filename = "../test/CodeGen/X86/gpr-to-mask.ll"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
-
+
define void @test_fcmp_storefloat(i1 %cond, float* %fptr, float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) #0 {
entry:
br i1 %cond, label %if, label %else
-
+
if: ; preds = %entry
%cmp1 = fcmp oeq float %f3, %f4
br label %exit
-
+
else: ; preds = %entry
%cmp2 = fcmp oeq float %f5, %f6
br label %exit
-
+
exit: ; preds = %else, %if
%val = phi i1 [ %cmp1, %if ], [ %cmp2, %else ]
%selected = select i1 %val, float %f1, float %f2
@@ -48,14 +49,13 @@
...
---
name: test_fcmp_storefloat
-# CHECK-LABEL: name: test_fcmp_storefloat
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
-registers:
+registers:
- { id: 0, class: gr8, preferred-register: '' }
- { id: 1, class: gr8, preferred-register: '' }
- { id: 2, class: gr8, preferred-register: '' }
@@ -79,7 +79,7 @@ registers:
- { id: 20, class: fr128, preferred-register: '' }
- { id: 21, class: fr128, preferred-register: '' }
- { id: 22, class: fr32x, preferred-register: '' }
-liveins:
+liveins:
- { reg: '%edi', virtual-reg: '%3' }
- { reg: '%rsi', virtual-reg: '%4' }
- { reg: '%xmm0', virtual-reg: '%5' }
@@ -88,7 +88,7 @@ liveins:
- { reg: '%xmm3', virtual-reg: '%8' }
- { reg: '%xmm4', virtual-reg: '%9' }
- { reg: '%xmm5', virtual-reg: '%10' }
-frameInfo:
+frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
@@ -105,14 +105,51 @@ frameInfo:
hasMustTailInVarArgFunc: false
savePoint: ''
restorePoint: ''
-fixedStack:
-stack:
-constants:
+fixedStack:
+stack:
+constants:
body: |
+ ; CHECK-LABEL: name: test_fcmp_storefloat
+ ; CHECK: bb.0.entry:
+ ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK: liveins: %edi, %rsi, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
+ ; CHECK: [[COPY:%[0-9]+]]:fr32x = COPY %xmm5
+ ; CHECK: [[COPY1:%[0-9]+]]:fr32x = COPY %xmm4
+ ; CHECK: [[COPY2:%[0-9]+]]:fr32x = COPY %xmm3
+ ; CHECK: [[COPY3:%[0-9]+]]:fr32x = COPY %xmm2
+ ; CHECK: [[COPY4:%[0-9]+]]:fr32x = COPY %xmm1
+ ; CHECK: [[COPY5:%[0-9]+]]:vr128x = COPY %xmm0
+ ; CHECK: [[COPY6:%[0-9]+]]:gr64 = COPY %rsi
+ ; CHECK: [[COPY7:%[0-9]+]]:gr32 = COPY %edi
+ ; CHECK: [[COPY8:%[0-9]+]]:gr8 = COPY [[COPY7]].sub_8bit
+ ; CHECK: TEST8ri killed [[COPY8]], 1, implicit-def %eflags
+ ; CHECK: JE_1 %bb.2, implicit %eflags
+ ; CHECK: JMP_1 %bb.1
+ ; CHECK: bb.1.if:
+ ; CHECK: successors: %bb.3(0x80000000)
+ ; CHECK: [[VCMPSSZrr:%[0-9]+]]:vk1 = VCMPSSZrr [[COPY3]], [[COPY2]], 0
+ ; CHECK: [[COPY9:%[0-9]+]]:vk32 = COPY [[VCMPSSZrr]]
+ ; CHECK: [[COPY10:%[0-9]+]]:vk8 = COPY [[COPY9]]
+ ; CHECK: JMP_1 %bb.3
+ ; CHECK: bb.2.else:
+ ; CHECK: successors: %bb.3(0x80000000)
+ ; CHECK: [[VCMPSSZrr1:%[0-9]+]]:vk1 = VCMPSSZrr [[COPY1]], [[COPY]], 0
+ ; CHECK: [[COPY11:%[0-9]+]]:vk32 = COPY [[VCMPSSZrr1]]
+ ; CHECK: [[COPY12:%[0-9]+]]:vk8 = COPY [[COPY11]]
+ ; CHECK: bb.3.exit:
+ ; CHECK: [[PHI:%[0-9]+]]:vk8 = PHI [[COPY12]], %bb.2, [[COPY10]], %bb.1
+ ; CHECK: [[COPY13:%[0-9]+]]:vk32 = COPY [[PHI]]
+ ; CHECK: [[COPY14:%[0-9]+]]:vk1wm = COPY [[COPY13]]
+ ; CHECK: [[COPY15:%[0-9]+]]:vr128x = COPY [[COPY4]]
+ ; CHECK: [[DEF:%[0-9]+]]:fr128 = IMPLICIT_DEF
+ ; CHECK: [[VMOVSSZrrk:%[0-9]+]]:fr128 = VMOVSSZrrk [[COPY15]], killed [[COPY14]], killed [[DEF]], [[COPY5]]
+ ; CHECK: [[COPY16:%[0-9]+]]:fr32x = COPY [[VMOVSSZrrk]]
+ ; CHECK: VMOVSSZmr [[COPY6]], 1, %noreg, 0, %noreg, killed [[COPY16]] :: (store 4 into %ir.fptr)
+ ; CHECK: RET 0
bb.0.entry:
successors: %bb.1(0x40000000), %bb.2(0x40000000)
liveins: %edi, %rsi, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
-
+
%10 = COPY %xmm5
%9 = COPY %xmm4
%8 = COPY %xmm3
@@ -125,38 +162,31 @@ body: |
TEST8ri killed %11, 1, implicit-def %eflags
JE_1 %bb.2, implicit %eflags
JMP_1 %bb.1
-
+
bb.1.if:
successors: %bb.3(0x80000000)
-
+
%14 = VCMPSSZrr %7, %8, 0
; check that cross domain copies are replaced with same domain copies.
- ; CHECK: %15:vk32 = COPY %14
- ; CHECK: %0:vk8 = COPY %15
-
+
%15 = COPY %14
%0 = COPY %15.sub_8bit
JMP_1 %bb.3
-
+
bb.2.else:
successors: %bb.3(0x80000000)
%12 = VCMPSSZrr %9, %10, 0
; check that cross domain copies are replaced with same domain copies.
- ; CHECK: %13:vk32 = COPY %12
- ; CHECK: %1:vk8 = COPY %13
%13 = COPY %12
%1 = COPY %13.sub_8bit
-
+
bb.3.exit:
; check PHI, IMPLICIT_DEF, and INSERT_SUBREG replacers.
- ; CHECK: %2:vk8 = PHI %1, %bb.2, %0, %bb.1
- ; CHECK: %16:vk32 = COPY %2
- ; CHECK: %18:vk1wm = COPY %16
-
+
%2 = PHI %1, %bb.2, %0, %bb.1
%17 = IMPLICIT_DEF
%16 = INSERT_SUBREG %17, %2, 1
@@ -171,14 +201,13 @@ body: |
...
---
name: test_8bitops
-# CHECK-LABEL: name: test_8bitops
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
-registers:
+registers:
- { id: 0, class: gr64, preferred-register: '' }
- { id: 1, class: vr512, preferred-register: '' }
- { id: 2, class: vr512, preferred-register: '' }
@@ -198,13 +227,13 @@ registers:
- { id: 16, class: gr8, preferred-register: '' }
- { id: 17, class: gr8, preferred-register: '' }
- { id: 18, class: gr8, preferred-register: '' }
-liveins:
+liveins:
- { reg: '%rdi', virtual-reg: '%0' }
- { reg: '%zmm0', virtual-reg: '%1' }
- { reg: '%zmm1', virtual-reg: '%2' }
- { reg: '%zmm2', virtual-reg: '%3' }
- { reg: '%zmm3', virtual-reg: '%4' }
-frameInfo:
+frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
@@ -221,32 +250,50 @@ frameInfo:
hasMustTailInVarArgFunc: false
savePoint: ''
restorePoint: ''
-fixedStack:
-stack:
-constants:
+fixedStack:
+stack:
+constants:
body: |
+ ; CHECK-LABEL: name: test_8bitops
+ ; CHECK: bb.0:
+ ; CHECK: successors: %bb.1(0x80000000)
+ ; CHECK: liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3
+ ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
+ ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0
+ ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1
+ ; CHECK: [[COPY3:%[0-9]+]]:vr512 = COPY %zmm2
+ ; CHECK: [[COPY4:%[0-9]+]]:vr512 = COPY %zmm3
+ ; CHECK: [[VCMPPDZrri:%[0-9]+]]:vk8 = VCMPPDZrri [[COPY3]], [[COPY4]], 0
+ ; CHECK: [[COPY5:%[0-9]+]]:vk32 = COPY [[VCMPPDZrri]]
+ ; CHECK: [[COPY6:%[0-9]+]]:vk8 = COPY [[COPY5]]
+ ; CHECK: [[KSHIFTRBri:%[0-9]+]]:vk8 = KSHIFTRBri [[COPY6]], 2
+ ; CHECK: [[KSHIFTLBri:%[0-9]+]]:vk8 = KSHIFTLBri [[KSHIFTRBri]], 1
+ ; CHECK: [[KNOTBrr:%[0-9]+]]:vk8 = KNOTBrr [[KSHIFTLBri]]
+ ; CHECK: [[KORBrr:%[0-9]+]]:vk8 = KORBrr [[KNOTBrr]], [[KSHIFTRBri]]
+ ; CHECK: [[KANDBrr:%[0-9]+]]:vk8 = KANDBrr [[KORBrr]], [[KSHIFTLBri]]
+ ; CHECK: [[KXORBrr:%[0-9]+]]:vk8 = KXORBrr [[KANDBrr]], [[KSHIFTRBri]]
+ ; CHECK: [[KADDBrr:%[0-9]+]]:vk8 = KADDBrr [[KXORBrr]], [[KNOTBrr]]
+ ; CHECK: [[COPY7:%[0-9]+]]:vk32 = COPY [[KADDBrr]]
+ ; CHECK: [[COPY8:%[0-9]+]]:vk8wm = COPY [[COPY7]]
+ ; CHECK: [[VMOVAPDZrrk:%[0-9]+]]:vr512 = VMOVAPDZrrk [[COPY2]], killed [[COPY8]], [[COPY1]]
+ ; CHECK: VMOVAPDZmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVAPDZrrk]]
+ ; CHECK: bb.1:
+ ; CHECK: successors: %bb.2(0x80000000)
+ ; CHECK: bb.2:
+ ; CHECK: RET 0
bb.0:
liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3
-
+
%0 = COPY %rdi
%1 = COPY %zmm0
%2 = COPY %zmm1
%3 = COPY %zmm2
%4 = COPY %zmm3
-
+
%5 = VCMPPDZrri %3, %4, 0
- ; CHECK: %6:vk32 = COPY %5
- ; CHECK: %7:vk8 = COPY %6
%6 = COPY %5
%7 = COPY %6.sub_8bit
- ; CHECK: %12:vk8 = KSHIFTRBri %7, 2
- ; CHECK: %13:vk8 = KSHIFTLBri %12, 1
- ; CHECK: %14:vk8 = KNOTBrr %13
- ; CHECK: %15:vk8 = KORBrr %14, %12
- ; CHECK: %16:vk8 = KANDBrr %15, %13
- ; CHECK: %17:vk8 = KXORBrr %16, %12
- ; CHECK: %18:vk8 = KADDBrr %17, %14
%12 = SHR8ri %7, 2, implicit-def dead %eflags
%13 = SHL8ri %12, 1, implicit-def dead %eflags
%14 = NOT8r %13
@@ -254,19 +301,17 @@ body: |
%16 = AND8rr %15, %13, implicit-def dead %eflags
%17 = XOR8rr %16, %12, implicit-def dead %eflags
%18 = ADD8rr %17, %14, implicit-def dead %eflags
-
- ; CHECK: %9:vk32 = COPY %18
- ; CHECK: %10:vk8wm = COPY %9
+
%8 = IMPLICIT_DEF
%9 = INSERT_SUBREG %8, %18, 1
%10 = COPY %9
%11 = VMOVAPDZrrk %2, killed %10, %1
- VMOVAPDZmr %0, 1, %noreg, 0, %noreg, killed %11
+ VMOVAPDZmr %0, 1, %noreg, 0, %noreg, killed %11
- ; CHECK: KTESTBrr %18, %18, implicit-def %eflags
- TEST8rr %18, %18, implicit-def %eflags
- JE_1 %bb.1, implicit %eflags
- JMP_1 %bb.2
+ ; FIXME We can't replace TEST with KTEST due to flag differences
+ ; TEST8rr %18, %18, implicit-def %eflags
+ ; JE_1 %bb.1, implicit %eflags
+ ; JMP_1 %bb.2
bb.1:
@@ -276,14 +321,13 @@ body: |
...
---
name: test_16bitops
-# CHECK-LABEL: name: test_16bitops
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
-registers:
+registers:
- { id: 0, class: gr64, preferred-register: '' }
- { id: 1, class: vr512, preferred-register: '' }
- { id: 2, class: vr512, preferred-register: '' }
@@ -302,13 +346,13 @@ registers:
- { id: 15, class: gr16, preferred-register: '' }
- { id: 16, class: gr16, preferred-register: '' }
- { id: 17, class: gr16, preferred-register: '' }
-liveins:
+liveins:
- { reg: '%rdi', virtual-reg: '%0' }
- { reg: '%zmm0', virtual-reg: '%1' }
- { reg: '%zmm1', virtual-reg: '%2' }
- { reg: '%zmm2', virtual-reg: '%3' }
- { reg: '%zmm3', virtual-reg: '%4' }
-frameInfo:
+frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
@@ -325,50 +369,66 @@ frameInfo:
hasMustTailInVarArgFunc: false
savePoint: ''
restorePoint: ''
-fixedStack:
-stack:
-constants:
+fixedStack:
+stack:
+constants:
body: |
+ ; CHECK-LABEL: name: test_16bitops
+ ; CHECK: bb.0:
+ ; CHECK: successors: %bb.1(0x80000000)
+ ; CHECK: liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3
+ ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
+ ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0
+ ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1
+ ; CHECK: [[COPY3:%[0-9]+]]:vr512 = COPY %zmm2
+ ; CHECK: [[COPY4:%[0-9]+]]:vr512 = COPY %zmm3
+ ; CHECK: [[VCMPPSZrri:%[0-9]+]]:vk16 = VCMPPSZrri [[COPY3]], [[COPY4]], 0
+ ; CHECK: [[COPY5:%[0-9]+]]:vk32 = COPY [[VCMPPSZrri]]
+ ; CHECK: [[COPY6:%[0-9]+]]:vk16 = COPY [[COPY5]]
+ ; CHECK: [[KSHIFTRWri:%[0-9]+]]:vk16 = KSHIFTRWri [[COPY6]], 2
+ ; CHECK: [[KSHIFTLWri:%[0-9]+]]:vk16 = KSHIFTLWri [[KSHIFTRWri]], 1
+ ; CHECK: [[KNOTWrr:%[0-9]+]]:vk16 = KNOTWrr [[KSHIFTLWri]]
+ ; CHECK: [[KORWrr:%[0-9]+]]:vk16 = KORWrr [[KNOTWrr]], [[KSHIFTRWri]]
+ ; CHECK: [[KANDWrr:%[0-9]+]]:vk16 = KANDWrr [[KORWrr]], [[KSHIFTLWri]]
+ ; CHECK: [[KXORWrr:%[0-9]+]]:vk16 = KXORWrr [[KANDWrr]], [[KSHIFTRWri]]
+ ; CHECK: [[COPY7:%[0-9]+]]:vk32 = COPY [[KXORWrr]]
+ ; CHECK: [[COPY8:%[0-9]+]]:vk16wm = COPY [[COPY7]]
+ ; CHECK: [[VMOVAPSZrrk:%[0-9]+]]:vr512 = VMOVAPSZrrk [[COPY2]], killed [[COPY8]], [[COPY1]]
+ ; CHECK: VMOVAPSZmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVAPSZrrk]]
+ ; CHECK: bb.1:
+ ; CHECK: successors: %bb.2(0x80000000)
+ ; CHECK: bb.2:
+ ; CHECK: RET 0
bb.0:
liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3
-
+
%0 = COPY %rdi
%1 = COPY %zmm0
%2 = COPY %zmm1
%3 = COPY %zmm2
%4 = COPY %zmm3
-
+
%5 = VCMPPSZrri %3, %4, 0
- ; CHECK: %6:vk32 = COPY %5
- ; CHECK: %7:vk16 = COPY %6
%6 = COPY %5
%7 = COPY %6.sub_16bit
- ; CHECK: %12:vk16 = KSHIFTRWri %7, 2
- ; CHECK: %13:vk16 = KSHIFTLWri %12, 1
- ; CHECK: %14:vk16 = KNOTWrr %13
- ; CHECK: %15:vk16 = KORWrr %14, %12
- ; CHECK: %16:vk16 = KANDWrr %15, %13
- ; CHECK: %17:vk16 = KXORWrr %16, %12
%12 = SHR16ri %7, 2, implicit-def dead %eflags
%13 = SHL16ri %12, 1, implicit-def dead %eflags
%14 = NOT16r %13
%15 = OR16rr %14, %12, implicit-def dead %eflags
%16 = AND16rr %15, %13, implicit-def dead %eflags
%17 = XOR16rr %16, %12, implicit-def dead %eflags
-
- ; CHECK: %9:vk32 = COPY %17
- ; CHECK: %10:vk16wm = COPY %9
+
%8 = IMPLICIT_DEF
%9 = INSERT_SUBREG %8, %17, 3
%10 = COPY %9
%11 = VMOVAPSZrrk %2, killed %10, %1
- VMOVAPSZmr %0, 1, %noreg, 0, %noreg, killed %11
+ VMOVAPSZmr %0, 1, %noreg, 0, %noreg, killed %11
- ; CHECK: KTESTWrr %17, %17, implicit-def %eflags
- TEST16rr %17, %17, implicit-def %eflags
- JE_1 %bb.1, implicit %eflags
- JMP_1 %bb.2
+ ; FIXME We can't replace TEST with KTEST due to flag differences
+ ; TEST16rr %17, %17, implicit-def %eflags
+ ; JE_1 %bb.1, implicit %eflags
+ ; JMP_1 %bb.2
bb.1:
@@ -378,14 +438,13 @@ body: |
...
---
name: test_32bitops
-# CHECK-LABEL: name: test_32bitops
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
-registers:
+registers:
- { id: 0, class: gr64, preferred-register: '' }
- { id: 1, class: vr512, preferred-register: '' }
- { id: 2, class: vr512, preferred-register: '' }
@@ -400,11 +459,11 @@ registers:
- { id: 11, class: gr32, preferred-register: '' }
- { id: 12, class: gr32, preferred-register: '' }
- { id: 13, class: gr32, preferred-register: '' }
-liveins:
+liveins:
- { reg: '%rdi', virtual-reg: '%0' }
- { reg: '%zmm0', virtual-reg: '%1' }
- { reg: '%zmm1', virtual-reg: '%2' }
-frameInfo:
+frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
@@ -421,26 +480,40 @@ frameInfo:
hasMustTailInVarArgFunc: false
savePoint: ''
restorePoint: ''
-fixedStack:
-stack:
-constants:
+fixedStack:
+stack:
+constants:
body: |
+ ; CHECK-LABEL: name: test_32bitops
+ ; CHECK: bb.0:
+ ; CHECK: successors: %bb.1(0x80000000)
+ ; CHECK: liveins: %rdi, %zmm0, %zmm1
+ ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
+ ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0
+ ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1
+ ; CHECK: [[KMOVDkm:%[0-9]+]]:vk32 = KMOVDkm [[COPY]], 1, %noreg, 0, %noreg
+ ; CHECK: [[KSHIFTRDri:%[0-9]+]]:vk32 = KSHIFTRDri [[KMOVDkm]], 2
+ ; CHECK: [[KSHIFTLDri:%[0-9]+]]:vk32 = KSHIFTLDri [[KSHIFTRDri]], 1
+ ; CHECK: [[KNOTDrr:%[0-9]+]]:vk32 = KNOTDrr [[KSHIFTLDri]]
+ ; CHECK: [[KORDrr:%[0-9]+]]:vk32 = KORDrr [[KNOTDrr]], [[KSHIFTRDri]]
+ ; CHECK: [[KANDDrr:%[0-9]+]]:vk32 = KANDDrr [[KORDrr]], [[KSHIFTLDri]]
+ ; CHECK: [[KXORDrr:%[0-9]+]]:vk32 = KXORDrr [[KANDDrr]], [[KSHIFTRDri]]
+ ; CHECK: [[KANDNDrr:%[0-9]+]]:vk32 = KANDNDrr [[KXORDrr]], [[KORDrr]]
+ ; CHECK: [[KADDDrr:%[0-9]+]]:vk32 = KADDDrr [[KANDNDrr]], [[KXORDrr]]
+ ; CHECK: [[COPY3:%[0-9]+]]:vk32wm = COPY [[KADDDrr]]
+ ; CHECK: [[VMOVDQU16Zrrk:%[0-9]+]]:vr512 = VMOVDQU16Zrrk [[COPY2]], killed [[COPY3]], [[COPY1]]
+ ; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU16Zrrk]]
+ ; CHECK: bb.1:
+ ; CHECK: successors: %bb.2(0x80000000)
+ ; CHECK: bb.2:
+ ; CHECK: RET 0
bb.0:
liveins: %rdi, %zmm0, %zmm1
-
+
%0 = COPY %rdi
%1 = COPY %zmm0
%2 = COPY %zmm1
-
- ; CHECK: %5:vk32 = KMOVDkm %0, 1, %noreg, 0, %noreg
- ; CHECK: %6:vk32 = KSHIFTRDri %5, 2
- ; CHECK: %7:vk32 = KSHIFTLDri %6, 1
- ; CHECK: %8:vk32 = KNOTDrr %7
- ; CHECK: %9:vk32 = KORDrr %8, %6
- ; CHECK: %10:vk32 = KANDDrr %9, %7
- ; CHECK: %11:vk32 = KXORDrr %10, %6
- ; CHECK: %12:vk32 = KANDNDrr %11, %9
- ; CHECK: %13:vk32 = KADDDrr %12, %11
+
%5 = MOV32rm %0, 1, %noreg, 0, %noreg
%6 = SHR32ri %5, 2, implicit-def dead %eflags
%7 = SHL32ri %6, 1, implicit-def dead %eflags
@@ -450,16 +523,15 @@ body: |
%11 = XOR32rr %10, %6, implicit-def dead %eflags
%12 = ANDN32rr %11, %9, implicit-def dead %eflags
%13 = ADD32rr %12, %11, implicit-def dead %eflags
-
- ; CHECK: %3:vk32wm = COPY %13
+
%3 = COPY %13
%4 = VMOVDQU16Zrrk %2, killed %3, %1
VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4
- ; CHECK: KTESTDrr %13, %13, implicit-def %eflags
- TEST32rr %13, %13, implicit-def %eflags
- JE_1 %bb.1, implicit %eflags
- JMP_1 %bb.2
+ ; FIXME We can't replace TEST with KTEST due to flag differences
+ ; TEST32rr %13, %13, implicit-def %eflags
+ ; JE_1 %bb.1, implicit %eflags
+ ; JMP_1 %bb.2
bb.1:
@@ -469,14 +541,13 @@ body: |
...
---
name: test_64bitops
-# CHECK-LABEL: name: test_64bitops
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
-registers:
+registers:
- { id: 0, class: gr64, preferred-register: '' }
- { id: 1, class: vr512, preferred-register: '' }
- { id: 2, class: vr512, preferred-register: '' }
@@ -491,11 +562,11 @@ registers:
- { id: 11, class: gr64, preferred-register: '' }
- { id: 12, class: gr64, preferred-register: '' }
- { id: 13, class: gr64, preferred-register: '' }
-liveins:
+liveins:
- { reg: '%rdi', virtual-reg: '%0' }
- { reg: '%zmm0', virtual-reg: '%1' }
- { reg: '%zmm1', virtual-reg: '%2' }
-frameInfo:
+frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
@@ -512,26 +583,40 @@ frameInfo:
hasMustTailInVarArgFunc: false
savePoint: ''
restorePoint: ''
-fixedStack:
-stack:
-constants:
+fixedStack:
+stack:
+constants:
body: |
+ ; CHECK-LABEL: name: test_64bitops
+ ; CHECK: bb.0:
+ ; CHECK: successors: %bb.1(0x80000000)
+ ; CHECK: liveins: %rdi, %zmm0, %zmm1
+ ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
+ ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0
+ ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1
+ ; CHECK: [[KMOVQkm:%[0-9]+]]:vk64 = KMOVQkm [[COPY]], 1, %noreg, 0, %noreg
+ ; CHECK: [[KSHIFTRQri:%[0-9]+]]:vk64 = KSHIFTRQri [[KMOVQkm]], 2
+ ; CHECK: [[KSHIFTLQri:%[0-9]+]]:vk64 = KSHIFTLQri [[KSHIFTRQri]], 1
+ ; CHECK: [[KNOTQrr:%[0-9]+]]:vk64 = KNOTQrr [[KSHIFTLQri]]
+ ; CHECK: [[KORQrr:%[0-9]+]]:vk64 = KORQrr [[KNOTQrr]], [[KSHIFTRQri]]
+ ; CHECK: [[KANDQrr:%[0-9]+]]:vk64 = KANDQrr [[KORQrr]], [[KSHIFTLQri]]
+ ; CHECK: [[KXORQrr:%[0-9]+]]:vk64 = KXORQrr [[KANDQrr]], [[KSHIFTRQri]]
+ ; CHECK: [[KANDNQrr:%[0-9]+]]:vk64 = KANDNQrr [[KXORQrr]], [[KORQrr]]
+ ; CHECK: [[KADDQrr:%[0-9]+]]:vk64 = KADDQrr [[KANDNQrr]], [[KXORQrr]]
+ ; CHECK: [[COPY3:%[0-9]+]]:vk64wm = COPY [[KADDQrr]]
+ ; CHECK: [[VMOVDQU8Zrrk:%[0-9]+]]:vr512 = VMOVDQU8Zrrk [[COPY2]], killed [[COPY3]], [[COPY1]]
+ ; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU8Zrrk]]
+ ; CHECK: bb.1:
+ ; CHECK: successors: %bb.2(0x80000000)
+ ; CHECK: bb.2:
+ ; CHECK: RET 0
bb.0:
liveins: %rdi, %zmm0, %zmm1
-
+
%0 = COPY %rdi
%1 = COPY %zmm0
%2 = COPY %zmm1
-
- ; CHECK: %5:vk64 = KMOVQkm %0, 1, %noreg, 0, %noreg
- ; CHECK: %6:vk64 = KSHIFTRQri %5, 2
- ; CHECK: %7:vk64 = KSHIFTLQri %6, 1
- ; CHECK: %8:vk64 = KNOTQrr %7
- ; CHECK: %9:vk64 = KORQrr %8, %6
- ; CHECK: %10:vk64 = KANDQrr %9, %7
- ; CHECK: %11:vk64 = KXORQrr %10, %6
- ; CHECK: %12:vk64 = KANDNQrr %11, %9
- ; CHECK: %13:vk64 = KADDQrr %12, %11
+
%5 = MOV64rm %0, 1, %noreg, 0, %noreg
%6 = SHR64ri %5, 2, implicit-def dead %eflags
%7 = SHL64ri %6, 1, implicit-def dead %eflags
@@ -541,16 +626,15 @@ body: |
%11 = XOR64rr %10, %6, implicit-def dead %eflags
%12 = ANDN64rr %11, %9, implicit-def dead %eflags
%13 = ADD64rr %12, %11, implicit-def dead %eflags
-
- ; CHECK: %3:vk64wm = COPY %13
+
%3 = COPY %13
%4 = VMOVDQU8Zrrk %2, killed %3, %1
VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4
- ; CHECK: KTESTQrr %13, %13, implicit-def %eflags
- TEST64rr %13, %13, implicit-def %eflags
- JE_1 %bb.1, implicit %eflags
- JMP_1 %bb.2
+ ; FIXME We can't replace TEST with KTEST due to flag differences
+ ; TEST64rr %13, %13, implicit-def %eflags
+ ; JE_1 %bb.1, implicit %eflags
+ ; JMP_1 %bb.2
bb.1:
@@ -560,14 +644,13 @@ body: |
...
---
name: test_16bitext
-# CHECK-LABEL: name: test_16bitext
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
-registers:
+registers:
- { id: 0, class: gr64, preferred-register: '' }
- { id: 1, class: vr512, preferred-register: '' }
- { id: 2, class: vr512, preferred-register: '' }
@@ -575,11 +658,11 @@ registers:
- { id: 4, class: vr512, preferred-register: '' }
- { id: 5, class: gr16, preferred-register: '' }
- { id: 6, class: gr16, preferred-register: '' }
-liveins:
+liveins:
- { reg: '%rdi', virtual-reg: '%0' }
- { reg: '%zmm0', virtual-reg: '%1' }
- { reg: '%zmm1', virtual-reg: '%2' }
-frameInfo:
+frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
@@ -596,24 +679,32 @@ frameInfo:
hasMustTailInVarArgFunc: false
savePoint: ''
restorePoint: ''
-fixedStack:
-stack:
-constants:
+fixedStack:
+stack:
+constants:
body: |
bb.0:
liveins: %rdi, %zmm0, %zmm1
-
+
+ ; CHECK-LABEL: name: test_16bitext
+ ; CHECK: liveins: %rdi, %zmm0, %zmm1
+ ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
+ ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0
+ ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1
+ ; CHECK: [[KMOVBkm:%[0-9]+]]:vk8 = KMOVBkm [[COPY]], 1, %noreg, 0, %noreg
+ ; CHECK: [[COPY3:%[0-9]+]]:vk16 = COPY [[KMOVBkm]]
+ ; CHECK: [[KNOTWrr:%[0-9]+]]:vk16 = KNOTWrr [[COPY3]]
+ ; CHECK: [[COPY4:%[0-9]+]]:vk16wm = COPY [[KNOTWrr]]
+ ; CHECK: [[VMOVAPSZrrk:%[0-9]+]]:vr512 = VMOVAPSZrrk [[COPY2]], killed [[COPY4]], [[COPY1]]
+ ; CHECK: VMOVAPSZmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVAPSZrrk]]
+ ; CHECK: RET 0
%0 = COPY %rdi
%1 = COPY %zmm0
%2 = COPY %zmm1
-
- ; CHECK: %7:vk8 = KMOVBkm %0, 1, %noreg, 0, %noreg
- ; CHECK: %5:vk16 = COPY %7
- ; CHECK: %6:vk16 = KNOTWrr %5
+
%5 = MOVZX16rm8 %0, 1, %noreg, 0, %noreg
%6 = NOT16r %5
- ; CHECK: %3:vk16wm = COPY %6
%3 = COPY %6
%4 = VMOVAPSZrrk %2, killed %3, %1
VMOVAPSZmr %0, 1, %noreg, 0, %noreg, killed %4
@@ -622,14 +713,13 @@ body: |
...
---
name: test_32bitext
-# CHECK-LABEL: name: test_32bitext
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
-registers:
+registers:
- { id: 0, class: gr64, preferred-register: '' }
- { id: 1, class: vr512, preferred-register: '' }
- { id: 2, class: vr512, preferred-register: '' }
@@ -638,11 +728,11 @@ registers:
- { id: 5, class: gr32, preferred-register: '' }
- { id: 6, class: gr32, preferred-register: '' }
- { id: 7, class: gr32, preferred-register: '' }
-liveins:
+liveins:
- { reg: '%rdi', virtual-reg: '%0' }
- { reg: '%zmm0', virtual-reg: '%1' }
- { reg: '%zmm1', virtual-reg: '%2' }
-frameInfo:
+frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
@@ -659,27 +749,35 @@ frameInfo:
hasMustTailInVarArgFunc: false
savePoint: ''
restorePoint: ''
-fixedStack:
-stack:
-constants:
+fixedStack:
+stack:
+constants:
body: |
bb.0:
liveins: %rdi, %zmm0, %zmm1
-
+
+ ; CHECK-LABEL: name: test_32bitext
+ ; CHECK: liveins: %rdi, %zmm0, %zmm1
+ ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
+ ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0
+ ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1
+ ; CHECK: [[KMOVBkm:%[0-9]+]]:vk8 = KMOVBkm [[COPY]], 1, %noreg, 0, %noreg
+ ; CHECK: [[COPY3:%[0-9]+]]:vk32 = COPY [[KMOVBkm]]
+ ; CHECK: [[KMOVWkm:%[0-9]+]]:vk16 = KMOVWkm [[COPY]], 1, %noreg, 0, %noreg
+ ; CHECK: [[COPY4:%[0-9]+]]:vk32 = COPY [[KMOVWkm]]
+ ; CHECK: [[KADDDrr:%[0-9]+]]:vk32 = KADDDrr [[COPY3]], [[COPY4]]
+ ; CHECK: [[COPY5:%[0-9]+]]:vk64wm = COPY [[KADDDrr]]
+ ; CHECK: [[VMOVDQU16Zrrk:%[0-9]+]]:vr512 = VMOVDQU16Zrrk [[COPY2]], killed [[COPY5]], [[COPY1]]
+ ; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU16Zrrk]]
+ ; CHECK: RET 0
%0 = COPY %rdi
%1 = COPY %zmm0
%2 = COPY %zmm1
-
- ; CHECK: %8:vk8 = KMOVBkm %0, 1, %noreg, 0, %noreg
- ; CHECK: %5:vk32 = COPY %8
- ; CHECK: %9:vk16 = KMOVWkm %0, 1, %noreg, 0, %noreg
- ; CHECK: %6:vk32 = COPY %9
- ; CHECK: %7:vk32 = KADDDrr %5, %6
+
%5 = MOVZX32rm8 %0, 1, %noreg, 0, %noreg
%6 = MOVZX32rm16 %0, 1, %noreg, 0, %noreg
%7 = ADD32rr %5, %6, implicit-def dead %eflags
- ; CHECK: %3:vk64wm = COPY %7
%3 = COPY %7
%4 = VMOVDQU16Zrrk %2, killed %3, %1
VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4
@@ -688,14 +786,13 @@ body: |
...
---
name: test_64bitext
-# CHECK-LABEL: name: test_64bitext
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
-registers:
+registers:
- { id: 0, class: gr64, preferred-register: '' }
- { id: 1, class: vr512, preferred-register: '' }
- { id: 2, class: vr512, preferred-register: '' }
@@ -704,11 +801,11 @@ registers:
- { id: 5, class: gr64, preferred-register: '' }
- { id: 6, class: gr64, preferred-register: '' }
- { id: 7, class: gr64, preferred-register: '' }
-liveins:
+liveins:
- { reg: '%rdi', virtual-reg: '%0' }
- { reg: '%zmm0', virtual-reg: '%1' }
- { reg: '%zmm1', virtual-reg: '%2' }
-frameInfo:
+frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
@@ -725,27 +822,35 @@ frameInfo:
hasMustTailInVarArgFunc: false
savePoint: ''
restorePoint: ''
-fixedStack:
-stack:
-constants:
+fixedStack:
+stack:
+constants:
body: |
bb.0:
liveins: %rdi, %zmm0, %zmm1
-
+
+ ; CHECK-LABEL: name: test_64bitext
+ ; CHECK: liveins: %rdi, %zmm0, %zmm1
+ ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
+ ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0
+ ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1
+ ; CHECK: [[KMOVBkm:%[0-9]+]]:vk8 = KMOVBkm [[COPY]], 1, %noreg, 0, %noreg
+ ; CHECK: [[COPY3:%[0-9]+]]:vk64 = COPY [[KMOVBkm]]
+ ; CHECK: [[KMOVWkm:%[0-9]+]]:vk16 = KMOVWkm [[COPY]], 1, %noreg, 0, %noreg
+ ; CHECK: [[COPY4:%[0-9]+]]:vk64 = COPY [[KMOVWkm]]
+ ; CHECK: [[KADDQrr:%[0-9]+]]:vk64 = KADDQrr [[COPY3]], [[COPY4]]
+ ; CHECK: [[COPY5:%[0-9]+]]:vk64wm = COPY [[KADDQrr]]
+ ; CHECK: [[VMOVDQU8Zrrk:%[0-9]+]]:vr512 = VMOVDQU8Zrrk [[COPY2]], killed [[COPY5]], [[COPY1]]
+ ; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU8Zrrk]]
+ ; CHECK: RET 0
%0 = COPY %rdi
%1 = COPY %zmm0
%2 = COPY %zmm1
-
- ; CHECK: %8:vk8 = KMOVBkm %0, 1, %noreg, 0, %noreg
- ; CHECK: %5:vk64 = COPY %8
- ; CHECK: %9:vk16 = KMOVWkm %0, 1, %noreg, 0, %noreg
- ; CHECK: %6:vk64 = COPY %9
- ; CHECK: %7:vk64 = KADDQrr %5, %6
+
%5 = MOVZX64rm8 %0, 1, %noreg, 0, %noreg
%6 = MOVZX64rm16 %0, 1, %noreg, 0, %noreg
%7 = ADD64rr %5, %6, implicit-def dead %eflags
- ; CHECK: %3:vk64wm = COPY %7
%3 = COPY %7
%4 = VMOVDQU8Zrrk %2, killed %3, %1
VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4
diff --git a/test/CodeGen/X86/inline-asm-modifier-V.ll b/test/CodeGen/X86/inline-asm-modifier-V.ll
new file mode 100644
index 000000000000..5a7f3fdd25fd
--- /dev/null
+++ b/test/CodeGen/X86/inline-asm-modifier-V.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=i686-- -no-integrated-as | FileCheck -check-prefix=X86 %s
+; RUN: llc < %s -mtriple=x86_64-- -no-integrated-as | FileCheck -check-prefix=X64 %s
+
+; If the target does not have 64-bit integer registers, emit 32-bit register
+; names.
+
+; X86: call __x86_indirect_thunk_e{{[abcd]}}x
+; X64: call __x86_indirect_thunk_r
+
+define void @q_modifier(i32* %p) {
+entry:
+ tail call void asm sideeffect "call __x86_indirect_thunk_${0:V}", "r,~{dirflag},~{fpsr},~{flags}"(i32* %p)
+ ret void
+}
diff --git a/test/CodeGen/X86/pr36199.ll b/test/CodeGen/X86/pr36199.ll
new file mode 100644
index 000000000000..84e17dba92e0
--- /dev/null
+++ b/test/CodeGen/X86/pr36199.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s
+
+define void @foo() unnamed_addr #0 {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vaddps %zmm0, %zmm0, %zmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; CHECK-NEXT: vmovups %zmm0, (%rax)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %1 = fadd <16 x float> undef, undef
+ %bc256 = bitcast <16 x float> %1 to <4 x i128>
+ %2 = extractelement <4 x i128> %bc256, i32 0
+ %3 = bitcast i128 %2 to <4 x float>
+ %4 = shufflevector <4 x float> %3, <4 x float> undef, <16 x i32> <i32 0, i32
+1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0,
+i32 1, i32 2, i32 3>
+ store <16 x float> %4, <16 x float>* undef, align 4
+ ret void
+}
diff --git a/test/CodeGen/X86/retpoline-external.ll b/test/CodeGen/X86/retpoline-external.ll
index 66d32ba5d73d..2f21bb2566de 100644
--- a/test/CodeGen/X86/retpoline-external.ll
+++ b/test/CodeGen/X86/retpoline-external.ll
@@ -23,18 +23,18 @@ entry:
; X64: callq bar
; X64-DAG: movl %[[x]], %edi
; X64-DAG: movq %[[fp]], %r11
-; X64: callq __llvm_external_retpoline_r11
+; X64: callq __x86_indirect_thunk_r11
; X64: movl %[[x]], %edi
; X64: callq bar
; X64-DAG: movl %[[x]], %edi
; X64-DAG: movq %[[fp]], %r11
-; X64: jmp __llvm_external_retpoline_r11 # TAILCALL
+; X64: jmp __x86_indirect_thunk_r11 # TAILCALL
; X64FAST-LABEL: icall_reg:
; X64FAST: callq bar
-; X64FAST: callq __llvm_external_retpoline_r11
+; X64FAST: callq __x86_indirect_thunk_r11
; X64FAST: callq bar
-; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL
+; X64FAST: jmp __x86_indirect_thunk_r11 # TAILCALL
; X86-LABEL: icall_reg:
; X86-DAG: movl 12(%esp), %[[fp:[^ ]*]]
@@ -43,19 +43,19 @@ entry:
; X86: calll bar
; X86: movl %[[fp]], %eax
; X86: pushl %[[x]]
-; X86: calll __llvm_external_retpoline_eax
+; X86: calll __x86_indirect_thunk_eax
; X86: pushl %[[x]]
; X86: calll bar
; X86: movl %[[fp]], %eax
; X86: pushl %[[x]]
-; X86: calll __llvm_external_retpoline_eax
+; X86: calll __x86_indirect_thunk_eax
; X86-NOT: # TAILCALL
; X86FAST-LABEL: icall_reg:
; X86FAST: calll bar
-; X86FAST: calll __llvm_external_retpoline_eax
+; X86FAST: calll __x86_indirect_thunk_eax
; X86FAST: calll bar
-; X86FAST: calll __llvm_external_retpoline_eax
+; X86FAST: calll __x86_indirect_thunk_eax
@global_fp = external global void (i32)*
@@ -72,28 +72,28 @@ define void @icall_global_fp(i32 %x, void (i32)** %fpp) #0 {
; X64-LABEL: icall_global_fp:
; X64-DAG: movl %edi, %[[x:[^ ]*]]
; X64-DAG: movq global_fp(%rip), %r11
-; X64: callq __llvm_external_retpoline_r11
+; X64: callq __x86_indirect_thunk_r11
; X64-DAG: movl %[[x]], %edi
; X64-DAG: movq global_fp(%rip), %r11
-; X64: jmp __llvm_external_retpoline_r11 # TAILCALL
+; X64: jmp __x86_indirect_thunk_r11 # TAILCALL
; X64FAST-LABEL: icall_global_fp:
; X64FAST: movq global_fp(%rip), %r11
-; X64FAST: callq __llvm_external_retpoline_r11
+; X64FAST: callq __x86_indirect_thunk_r11
; X64FAST: movq global_fp(%rip), %r11
-; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL
+; X64FAST: jmp __x86_indirect_thunk_r11 # TAILCALL
; X86-LABEL: icall_global_fp:
; X86: movl global_fp, %eax
; X86: pushl 4(%esp)
-; X86: calll __llvm_external_retpoline_eax
+; X86: calll __x86_indirect_thunk_eax
; X86: addl $4, %esp
; X86: movl global_fp, %eax
-; X86: jmp __llvm_external_retpoline_eax # TAILCALL
+; X86: jmp __x86_indirect_thunk_eax # TAILCALL
; X86FAST-LABEL: icall_global_fp:
-; X86FAST: calll __llvm_external_retpoline_eax
-; X86FAST: jmp __llvm_external_retpoline_eax # TAILCALL
+; X86FAST: calll __x86_indirect_thunk_eax
+; X86FAST: jmp __x86_indirect_thunk_eax # TAILCALL
%struct.Foo = type { void (%struct.Foo*)** }
@@ -114,14 +114,14 @@ define void @vcall(%struct.Foo* %obj) #0 {
; X64: movq (%[[obj]]), %[[vptr:[^ ]*]]
; X64: movq 8(%[[vptr]]), %[[fp:[^ ]*]]
; X64: movq %[[fp]], %r11
-; X64: callq __llvm_external_retpoline_r11
+; X64: callq __x86_indirect_thunk_r11
; X64-DAG: movq %[[obj]], %rdi
; X64-DAG: movq %[[fp]], %r11
-; X64: jmp __llvm_external_retpoline_r11 # TAILCALL
+; X64: jmp __x86_indirect_thunk_r11 # TAILCALL
; X64FAST-LABEL: vcall:
-; X64FAST: callq __llvm_external_retpoline_r11
-; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL
+; X64FAST: callq __x86_indirect_thunk_r11
+; X64FAST: jmp __x86_indirect_thunk_r11 # TAILCALL
; X86-LABEL: vcall:
; X86: movl 8(%esp), %[[obj:[^ ]*]]
@@ -129,14 +129,14 @@ define void @vcall(%struct.Foo* %obj) #0 {
; X86: movl 4(%[[vptr]]), %[[fp:[^ ]*]]
; X86: movl %[[fp]], %eax
; X86: pushl %[[obj]]
-; X86: calll __llvm_external_retpoline_eax
+; X86: calll __x86_indirect_thunk_eax
; X86: addl $4, %esp
; X86: movl %[[fp]], %eax
-; X86: jmp __llvm_external_retpoline_eax # TAILCALL
+; X86: jmp __x86_indirect_thunk_eax # TAILCALL
; X86FAST-LABEL: vcall:
-; X86FAST: calll __llvm_external_retpoline_eax
-; X86FAST: jmp __llvm_external_retpoline_eax # TAILCALL
+; X86FAST: calll __x86_indirect_thunk_eax
+; X86FAST: jmp __x86_indirect_thunk_eax # TAILCALL
declare void @direct_callee()
diff --git a/test/CodeGen/X86/retpoline-regparm.ll b/test/CodeGen/X86/retpoline-regparm.ll
new file mode 100644
index 000000000000..13b32740b287
--- /dev/null
+++ b/test/CodeGen/X86/retpoline-regparm.ll
@@ -0,0 +1,42 @@
+; RUN: llc -mtriple=i686-linux < %s | FileCheck --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" %s
+
+; Test 32-bit retpoline when -mregparm=3 is used. This case is interesting
+; because there are no available scratch registers. The Linux kernel builds
+; with -mregparm=3, so we need to support it. TCO should fail because we need
+; to restore EDI.
+
+define void @call_edi(void (i32, i32, i32)* %fp) #0 {
+entry:
+ tail call void %fp(i32 inreg 0, i32 inreg 0, i32 inreg 0)
+ ret void
+}
+
+; CHECK-LABEL: call_edi:
+; EDI is used, so it must be saved.
+; CHECK: pushl %edi
+; CHECK-DAG: xorl %eax, %eax
+; CHECK-DAG: xorl %edx, %edx
+; CHECK-DAG: xorl %ecx, %ecx
+; CHECK-DAG: movl {{.*}}, %edi
+; CHECK: calll __llvm_retpoline_edi
+; CHECK: popl %edi
+; CHECK: retl
+
+define void @edi_external(void (i32, i32, i32)* %fp) #1 {
+entry:
+ tail call void %fp(i32 inreg 0, i32 inreg 0, i32 inreg 0)
+ ret void
+}
+
+; CHECK-LABEL: edi_external:
+; CHECK: pushl %edi
+; CHECK-DAG: xorl %eax, %eax
+; CHECK-DAG: xorl %edx, %edx
+; CHECK-DAG: xorl %ecx, %ecx
+; CHECK-DAG: movl {{.*}}, %edi
+; CHECK: calll __x86_indirect_thunk_edi
+; CHECK: popl %edi
+; CHECK: retl
+
+attributes #0 = { "target-features"="+retpoline" }
+attributes #1 = { "target-features"="+retpoline-external-thunk" }
diff --git a/test/CodeGen/X86/retpoline.ll b/test/CodeGen/X86/retpoline.ll
index 57d3388b812a..477609e2d10b 100644
--- a/test/CodeGen/X86/retpoline.ll
+++ b/test/CodeGen/X86/retpoline.ll
@@ -340,10 +340,10 @@ latch:
; X86-NEXT: movl %edx, (%esp)
; X86-NEXT: retl
;
-; X86-LABEL: .section .text.__llvm_retpoline_push,{{.*}},__llvm_retpoline_push,comdat
-; X86-NEXT: .hidden __llvm_retpoline_push
-; X86-NEXT: .weak __llvm_retpoline_push
-; X86: __llvm_retpoline_push:
+; X86-LABEL: .section .text.__llvm_retpoline_edi,{{.*}},__llvm_retpoline_edi,comdat
+; X86-NEXT: .hidden __llvm_retpoline_edi
+; X86-NEXT: .weak __llvm_retpoline_edi
+; X86: __llvm_retpoline_edi:
; X86-NEXT: # {{.*}} # %entry
; X86-NEXT: calll [[CALL_TARGET:.*]]
; X86-NEXT: [[CAPTURE_SPEC:.*]]: # Block address taken
@@ -355,11 +355,7 @@ latch:
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: [[CALL_TARGET]]: # Block address taken
; X86-NEXT: # %entry
-; X86-NEXT: addl $4, %esp
-; X86-NEXT: pushl 4(%esp)
-; X86-NEXT: pushl 4(%esp)
-; X86-NEXT: popl 8(%esp)
-; X86-NEXT: popl (%esp)
+; X86-NEXT: movl %edi, (%esp)
; X86-NEXT: retl
diff --git a/test/DebugInfo/X86/void-typedef.ll b/test/DebugInfo/X86/void-typedef.ll
new file mode 100644
index 000000000000..2e6bf49bae78
--- /dev/null
+++ b/test/DebugInfo/X86/void-typedef.ll
@@ -0,0 +1,88 @@
+; Choosing CodeView generates debug metadata for class-scope typedefs that
+; Dwarf would normally omit. Choosing both CodeView and Dwarf triggered
+; assertion failures and crashes because the Dwarf handler wasn't prepared for
+; those records (in particular, ones with the void type represented by a
+; null pointer).
+;
+; This test was generated with:
+; clang++ -cc1 -emit-llvm -debug-info-kind=limited -dwarf-version=4 -gcodeview -x c++
+; on the following source code:
+;
+; class A {
+; typedef void _Nodeptr;
+; };
+; class B {
+; A FailedTestsCache;
+; bool m_fn1();
+; };
+; bool B::m_fn1() {}
+;
+; CodeView generates a DIDerivedType for the _Nodeptr typedef.
+;
+; RUN: llc %s -o - 2>&1 | FileCheck %s
+; CHECK-NOT: Assertion failed
+
+; ModuleID = 'bug.cpp'
+source_filename = "bug.cpp"
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+%class.B = type { %class.A }
+%class.A = type { i8 }
+
+; Function Attrs: noinline nounwind optnone
+define x86_thiscallcc zeroext i1 @"\01?m_fn1@B@@AAE_NXZ"(%class.B* %this) #0 align 2 !dbg !9 {
+entry:
+ %retval = alloca i1, align 1
+ %this.addr = alloca %class.B*, align 4
+ store %class.B* %this, %class.B** %this.addr, align 4
+ call void @llvm.dbg.declare(metadata %class.B** %this.addr, metadata !22, metadata !DIExpression()), !dbg !24
+ %this1 = load %class.B*, %class.B** %this.addr, align 4
+ call void @llvm.trap(), !dbg !25
+ unreachable, !dbg !25
+
+return: ; No predecessors!
+ %0 = load i1, i1* %retval, align 1, !dbg !25
+ ret i1 %0, !dbg !25
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: noreturn nounwind
+declare void @llvm.trap() #2
+
+attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone speculatable }
+attributes #2 = { noreturn nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6, !7}
+!llvm.ident = !{!8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 ", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "<stdin>", directory: "D:\5Csrc\5Cbug", checksumkind: CSK_MD5, checksum: "2216f11c5ddda8c48a6f92a6079ad4b6")
+!2 = !{}
+!3 = !{i32 1, !"NumRegisterParameters", i32 0}
+!4 = !{i32 2, !"Dwarf Version", i32 4}
+!5 = !{i32 2, !"CodeView", i32 1}
+!6 = !{i32 2, !"Debug Info Version", i32 3}
+!7 = !{i32 1, !"wchar_size", i32 2}
+!8 = !{!"clang version 6.0.0 "}
+!9 = distinct !DISubprogram(name: "m_fn1", linkageName: "\01?m_fn1@B@@AAE_NXZ", scope: !11, file: !10, line: 8, type: !18, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, unit: !0, declaration: !17, variables: !2)
+!10 = !DIFile(filename: "bug.cpp", directory: "D:\5Csrc\5Cbug", checksumkind: CSK_MD5, checksum: "2216f11c5ddda8c48a6f92a6079ad4b6")
+!11 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "B", file: !10, line: 4, size: 8, elements: !12, identifier: ".?AVB@@")
+!12 = !{!13, !17}
+!13 = !DIDerivedType(tag: DW_TAG_member, name: "FailedTestsCache", scope: !11, file: !10, line: 5, baseType: !14, size: 8)
+!14 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "A", file: !10, line: 1, size: 8, elements: !15, identifier: ".?AVA@@")
+!15 = !{!16}
+!16 = !DIDerivedType(tag: DW_TAG_typedef, name: "_Nodeptr", scope: !14, file: !10, line: 2, baseType: null)
+!17 = !DISubprogram(name: "m_fn1", linkageName: "\01?m_fn1@B@@AAE_NXZ", scope: !11, file: !10, line: 6, type: !18, isLocal: false, isDefinition: false, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: false)
+!18 = !DISubroutineType(cc: DW_CC_BORLAND_thiscall, types: !19)
+!19 = !{!20, !21}
+!20 = !DIBasicType(name: "bool", size: 8, encoding: DW_ATE_boolean)
+!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 32, flags: DIFlagArtificial | DIFlagObjectPointer)
+!22 = !DILocalVariable(name: "this", arg: 1, scope: !9, type: !23, flags: DIFlagArtificial | DIFlagObjectPointer)
+!23 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 32)
+!24 = !DILocation(line: 0, scope: !9)
+!25 = !DILocation(line: 8, scope: !9)
diff --git a/test/MC/AsmParser/inline_macro_duplication.ll b/test/MC/AsmParser/inline_macro_duplication.ll
new file mode 100644
index 000000000000..9d7e22fde7b6
--- /dev/null
+++ b/test/MC/AsmParser/inline_macro_duplication.ll
@@ -0,0 +1,8 @@
+; RUN: not llc < %s 2>&1 | FileCheck %s
+
+define void @test() {
+ call void asm sideeffect ".macro FOO\0A.endm", "~{dirflag},~{fpsr},~{flags}"() #1
+ call void asm sideeffect ".macro FOO\0A.endm", "~{dirflag},~{fpsr},~{flags}"() #1
+; CHECK: error: macro 'FOO' is already defined
+ ret void
+}
diff --git a/test/MC/X86/x86-64.s b/test/MC/X86/x86-64.s
index 378af768fa99..01cd6b6fa006 100644
--- a/test/MC/X86/x86-64.s
+++ b/test/MC/X86/x86-64.s
@@ -622,6 +622,11 @@ movl $12, foo(%rip)
// CHECK: encoding: [0xc7,0x05,A,A,A,A,0x0c,0x00,0x00,0x00]
// CHECK: fixup A - offset: 2, value: foo-8, kind: reloc_riprel_4byte
+// rdar://37247000
+movl $12, 1024(%rip)
+// CHECK: movl $12, 1024(%rip)
+// CHECK: encoding: [0xc7,0x05,0x00,0x04,0x00,0x00,0x0c,0x00,0x00,0x00]
+
movq $12, foo(%rip)
// CHECK: movq $12, foo(%rip)
// CHECK: encoding: [0x48,0xc7,0x05,A,A,A,A,0x0c,0x00,0x00,0x00]
diff --git a/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index f82bf81fbbf8..c8a05204bf5e 100644
--- a/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -723,6 +723,114 @@ define <2 x half> @constant_rtz_pkrtz() {
}
; --------------------------------------------------------------------
+; llvm.amdgcn.cvt.pknorm.i16
+; --------------------------------------------------------------------
+
+declare <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float, float) nounwind readnone
+
+; CHECK-LABEL: @undef_lhs_cvt_pknorm_i16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float %y)
+define <2 x i16> @undef_lhs_cvt_pknorm_i16(float %y) {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float %y)
+ ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_rhs_cvt_pknorm_i16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float undef)
+define <2 x i16> @undef_rhs_cvt_pknorm_i16(float %x) {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float undef)
+ ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_cvt_pknorm_i16(
+; CHECK: ret <2 x i16> undef
+define <2 x i16> @undef_cvt_pknorm_i16() {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float undef)
+ ret <2 x i16> %cvt
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.cvt.pknorm.u16
+; --------------------------------------------------------------------
+
+declare <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float, float) nounwind readnone
+
+; CHECK-LABEL: @undef_lhs_cvt_pknorm_u16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float %y)
+define <2 x i16> @undef_lhs_cvt_pknorm_u16(float %y) {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float %y)
+ ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_rhs_cvt_pknorm_u16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float undef)
+define <2 x i16> @undef_rhs_cvt_pknorm_u16(float %x) {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float undef)
+ ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_cvt_pknorm_u16(
+; CHECK: ret <2 x i16> undef
+define <2 x i16> @undef_cvt_pknorm_u16() {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float undef)
+ ret <2 x i16> %cvt
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.cvt.pk.i16
+; --------------------------------------------------------------------
+
+declare <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32, i32) nounwind readnone
+
+; CHECK-LABEL: @undef_lhs_cvt_pk_i16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 %y)
+define <2 x i16> @undef_lhs_cvt_pk_i16(i32 %y) {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 %y)
+ ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_rhs_cvt_pk_i16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 undef)
+define <2 x i16> @undef_rhs_cvt_pk_i16(i32 %x) {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 undef)
+ ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_cvt_pk_i16(
+; CHECK: ret <2 x i16> undef
+define <2 x i16> @undef_cvt_pk_i16() {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 undef)
+ ret <2 x i16> %cvt
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.cvt.pk.u16
+; --------------------------------------------------------------------
+
+declare <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32, i32) nounwind readnone
+
+; CHECK-LABEL: @undef_lhs_cvt_pk_u16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 %y)
+define <2 x i16> @undef_lhs_cvt_pk_u16(i32 %y) {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 %y)
+ ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_rhs_cvt_pk_u16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 undef)
+define <2 x i16> @undef_rhs_cvt_pk_u16(i32 %x) {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 undef)
+ ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_cvt_pk_u16(
+; CHECK: ret <2 x i16> undef
+define <2 x i16> @undef_cvt_pk_u16() {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 undef)
+ ret <2 x i16> %cvt
+}
+
+; --------------------------------------------------------------------
; llvm.amdgcn.ubfe
; --------------------------------------------------------------------