path: root/lib
author      Dimitry Andric <dim@FreeBSD.org>    2017-01-09 21:23:09 +0000
committer   Dimitry Andric <dim@FreeBSD.org>    2017-01-09 21:23:09 +0000
commit      909545a822eef491158f831688066f0ec2866938 (patch)
tree        5b0bf0e81294007a9b462b21031b3df272c655c3 /lib
parent      7e7b6700743285c0af506ac6299ddf82ebd434b9 (diff)
Diffstat (limited to 'lib')
-rw-r--r--  lib/Analysis/InstructionSimplify.cpp  20
-rw-r--r--  lib/Analysis/LoopInfo.cpp  6
-rw-r--r--  lib/Analysis/MemoryDependenceAnalysis.cpp  42
-rw-r--r--  lib/Analysis/ScalarEvolution.cpp  12
-rw-r--r--  lib/Analysis/ValueTracking.cpp  1
-rw-r--r--  lib/Bitcode/Reader/MetadataLoader.cpp  13
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp  4
-rw-r--r--  lib/CodeGen/StackSlotColoring.cpp  11
-rw-r--r--  lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp  44
-rw-r--r--  lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h  3
-rw-r--r--  lib/LTO/ThinLTOCodeGenerator.cpp  9
-rw-r--r--  lib/Object/MachOObjectFile.cpp  8
-rw-r--r--  lib/Object/ModuleSummaryIndexObjectFile.cpp  8
-rw-r--r--  lib/Support/CommandLine.cpp  2
-rw-r--r--  lib/Support/Path.cpp  10
-rw-r--r--  lib/Support/TarWriter.cpp  42
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp  4
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelLowering.cpp  10
-rw-r--r--  lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp  52
-rw-r--r--  lib/Target/AMDGPU/R600ISelLowering.cpp  281
-rw-r--r--  lib/Target/AMDGPU/R600Instructions.td  11
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.cpp  39
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.h  3
-rw-r--r--  lib/Target/AVR/AVRISelDAGToDAG.cpp  4
-rw-r--r--  lib/Target/AVR/AVRISelLowering.cpp  41
-rw-r--r--  lib/Target/AVR/AVRISelLowering.h  3
-rw-r--r--  lib/Target/BPF/BPFInstrInfo.cpp  16
-rw-r--r--  lib/Target/BPF/Disassembler/BPFDisassembler.cpp  12
-rw-r--r--  lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp  20
-rw-r--r--  lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp  11
-rw-r--r--  lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp  19
-rw-r--r--  lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp  9
-rw-r--r--  lib/Target/TargetMachineC.cpp  4
-rw-r--r--  lib/Target/WebAssembly/CMakeLists.txt  1
-rw-r--r--  lib/Target/WebAssembly/WebAssembly.h  1
-rw-r--r--  lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp  159
-rw-r--r--  lib/Target/WebAssembly/WebAssemblyInstrInteger.td  4
-rw-r--r--  lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp  4
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp  264
-rw-r--r--  lib/Target/X86/X86InstrAVX512.td  247
-rw-r--r--  lib/Target/X86/X86InstrInfo.cpp  19
-rw-r--r--  lib/Target/X86/X86InstrSSE.td  2
-rw-r--r--  lib/Target/X86/X86TargetTransformInfo.cpp  291
-rw-r--r--  lib/Transforms/IPO/LowerTypeTests.cpp  109
-rw-r--r--  lib/Transforms/IPO/PassManagerBuilder.cpp  3
-rw-r--r--  lib/Transforms/InstCombine/InstCombineCompares.cpp  10
-rw-r--r--  lib/Transforms/Instrumentation/AddressSanitizer.cpp  1
-rw-r--r--  lib/Transforms/Scalar/IndVarSimplify.cpp  2
-rw-r--r--  lib/Transforms/Scalar/LoopLoadElimination.cpp  4
-rw-r--r--  lib/Transforms/Scalar/LoopUnswitch.cpp  2
-rw-r--r--  lib/Transforms/Scalar/NewGVN.cpp  192
-rw-r--r--  lib/Transforms/Scalar/SCCP.cpp  18
-rw-r--r--  lib/Transforms/Utils/FunctionImportUtils.cpp  15
-rw-r--r--  lib/Transforms/Utils/SimplifyLibCalls.cpp  12
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp  34
55 files changed, 1411 insertions, 757 deletions
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index b4686a1ff1758..8da2f0981d0ca 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -1106,6 +1106,16 @@ static Value *SimplifyUDivInst(Value *Op0, Value *Op1, const Query &Q,
if (Value *V = SimplifyDiv(Instruction::UDiv, Op0, Op1, Q, MaxRecurse))
return V;
+ // udiv %V, C -> 0 if %V < C
+ if (MaxRecurse) {
+ if (Constant *C = dyn_cast_or_null<Constant>(SimplifyICmpInst(
+ ICmpInst::ICMP_ULT, Op0, Op1, Q, MaxRecurse - 1))) {
+ if (C->isAllOnesValue()) {
+ return Constant::getNullValue(Op0->getType());
+ }
+ }
+ }
+
return nullptr;
}
@@ -1247,6 +1257,16 @@ static Value *SimplifyURemInst(Value *Op0, Value *Op1, const Query &Q,
if (Value *V = SimplifyRem(Instruction::URem, Op0, Op1, Q, MaxRecurse))
return V;
+ // urem %V, C -> %V if %V < C
+ if (MaxRecurse) {
+ if (Constant *C = dyn_cast_or_null<Constant>(SimplifyICmpInst(
+ ICmpInst::ICMP_ULT, Op0, Op1, Q, MaxRecurse - 1))) {
+ if (C->isAllOnesValue()) {
+ return Op0;
+ }
+ }
+ }
+
return nullptr;
}
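
The two folds above rely on a basic unsigned-arithmetic fact: when the dividend is known to be strictly less than the divisor, the quotient is zero and the remainder is the dividend itself. A standalone C++ check of that fact (illustration only, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t V = 7, C = 100;
      assert(V < C);        // the premise the ICMP_ULT query establishes
      assert(V / C == 0);   // udiv %V, C -> 0   if %V < C
      assert(V % C == V);   // urem %V, C -> %V  if %V < C
      return 0;
    }
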
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp
index 19c0171740c92..3d85ef6988a9a 100644
--- a/lib/Analysis/LoopInfo.cpp
+++ b/lib/Analysis/LoopInfo.cpp
@@ -179,9 +179,9 @@ bool Loop::isLCSSAForm(DominatorTree &DT) const {
}
bool Loop::isRecursivelyLCSSAForm(DominatorTree &DT, const LoopInfo &LI) const {
- // For each block we check that it doesn't have any uses outside of it's
- // innermost loop. This process will transitivelly guarntee that current loop
- // and all of the nested loops are in the LCSSA form.
+ // For each block we check that it doesn't have any uses outside of its
+ // innermost loop. This process will transitively guarantee that the current
+ // loop and all of the nested loops are in LCSSA form.
return all_of(this->blocks(), [&](const BasicBlock *BB) {
return isBlockInLCSSAForm(*LI.getLoopFor(BB), *BB, DT);
});
diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp
index 2746361ab4b58..e7415e6231963 100644
--- a/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -344,38 +344,24 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI,
if (!InvariantGroupMD)
return MemDepResult::getUnknown();
- Value *LoadOperand = LI->getPointerOperand();
+  // Take the ptr operand after all casts and all-zero GEPs. This way we only
+  // need to search the cast graph downward.
+ Value *LoadOperand = LI->getPointerOperand()->stripPointerCasts();
+
// It is not safe to walk the use list of a global value, because function
// passes aren't allowed to look outside their functions.
+ // FIXME: this could be fixed by filtering instructions from outside
+ // of current function.
if (isa<GlobalValue>(LoadOperand))
return MemDepResult::getUnknown();
// Queue to process all pointers that are equivalent to load operand.
SmallVector<const Value *, 8> LoadOperandsQueue;
- SmallSet<const Value *, 14> SeenValues;
- auto TryInsertToQueue = [&](Value *V) {
- if (SeenValues.insert(V).second)
- LoadOperandsQueue.push_back(V);
- };
-
- TryInsertToQueue(LoadOperand);
+ LoadOperandsQueue.push_back(LoadOperand);
while (!LoadOperandsQueue.empty()) {
const Value *Ptr = LoadOperandsQueue.pop_back_val();
- assert(Ptr);
- if (isa<GlobalValue>(Ptr))
- continue;
-
- // Value comes from bitcast: Ptr = bitcast x. Insert x.
- if (auto *BCI = dyn_cast<BitCastInst>(Ptr))
- TryInsertToQueue(BCI->getOperand(0));
- // Gep with zeros is equivalent to bitcast.
- // FIXME: we are not sure if some bitcast should be canonicalized to gep 0
- // or gep 0 to bitcast because of SROA, so there are 2 forms. When typeless
- // pointers will be upstream then both cases will be gone (and this BFS
- // also won't be needed).
- if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr))
- if (GEP->hasAllZeroIndices())
- TryInsertToQueue(GEP->getOperand(0));
+ assert(Ptr && !isa<GlobalValue>(Ptr) &&
+ "Null or GlobalValue should not be inserted");
for (const Use &Us : Ptr->uses()) {
auto *U = dyn_cast<Instruction>(Us.getUser());
@@ -385,13 +371,17 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI,
// Bitcast or gep with zeros are using Ptr. Add to queue to check its
// users. U = bitcast Ptr
if (isa<BitCastInst>(U)) {
- TryInsertToQueue(U);
+ LoadOperandsQueue.push_back(U);
continue;
}
- // U = getelementptr Ptr, 0, 0...
+ // Gep with zeros is equivalent to bitcast.
+ // FIXME: we are not sure if some bitcast should be canonicalized to gep 0
+ // or gep 0 to bitcast because of SROA, so there are 2 forms. When
+      // typeless pointers are ready, both cases will be gone
+ // (and this BFS also won't be needed).
if (auto *GEP = dyn_cast<GetElementPtrInst>(U))
if (GEP->hasAllZeroIndices()) {
- TryInsertToQueue(U);
+ LoadOperandsQueue.push_back(U);
continue;
}
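
After stripping casts from the load operand, the search only ever moves down the use lists, so no visited-set is needed: each bitcast or all-zero GEP user is pushed at most once. A minimal sketch of that worklist pattern, with a hypothetical helper name and only the LLVM calls shown above (not the exact analysis code):

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Hypothetical helper: collect values that address the same memory as Root,
    // reached only through bitcasts and all-zero GEPs of Root.
    static void collectEquivalentPointers(Value *Root,
                                          SmallVectorImpl<const Value *> &Out) {
      SmallVector<const Value *, 8> Worklist;
      Worklist.push_back(Root->stripPointerCasts()); // start below all casts
      while (!Worklist.empty()) {
        const Value *Ptr = Worklist.pop_back_val();
        Out.push_back(Ptr);
        for (const Use &U : Ptr->uses()) {
          auto *I = dyn_cast<Instruction>(U.getUser());
          if (!I)
            continue;
          if (isa<BitCastInst>(I))
            Worklist.push_back(I);      // same address, different pointee type
          else if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
            if (GEP->hasAllZeroIndices())
              Worklist.push_back(GEP);  // gep 0,...,0 is equivalent to a bitcast
        }
      }
    }
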
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 5e566bcdaff4d..44f1a6dde0d21 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -10012,6 +10012,18 @@ void ScalarEvolution::verify() const {
// TODO: Verify more things.
}
+bool ScalarEvolution::invalidate(
+ Function &F, const PreservedAnalyses &PA,
+ FunctionAnalysisManager::Invalidator &Inv) {
+ // Invalidate the ScalarEvolution object whenever it isn't preserved or one
+ // of its dependencies is invalidated.
+ auto PAC = PA.getChecker<ScalarEvolutionAnalysis>();
+ return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>()) ||
+ Inv.invalidate<AssumptionAnalysis>(F, PA) ||
+ Inv.invalidate<DominatorTreeAnalysis>(F, PA) ||
+ Inv.invalidate<LoopAnalysis>(F, PA);
+}
+
AnalysisKey ScalarEvolutionAnalysis::Key;
ScalarEvolution ScalarEvolutionAnalysis::run(Function &F,
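
With this hook in place, ScalarEvolution survives a new-PM pass only if the pass also preserves the analyses SCEV depends on. A minimal sketch of how a pass would report that (hypothetical no-op pass, standard new pass manager API):

    #include "llvm/Analysis/AssumptionCache.h"
    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/Analysis/ScalarEvolution.h"
    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/PassManager.h"
    using namespace llvm;

    struct QuerySCEVPass : PassInfoMixin<QuerySCEVPass> {
      PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
        (void)AM.getResult<ScalarEvolutionAnalysis>(F); // use SCEV, change nothing
        PreservedAnalyses PA;
        PA.preserve<ScalarEvolutionAnalysis>();
        // ScalarEvolution::invalidate() also checks its inputs,
        // so they must be preserved as well for SCEV to stay cached.
        PA.preserve<AssumptionAnalysis>();
        PA.preserve<DominatorTreeAnalysis>();
        PA.preserve<LoopAnalysis>();
        return PA;
      }
    };
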
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 073b4e6ab26ae..d31472c0d33c1 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -3257,6 +3257,7 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V,
case Intrinsic::dbg_value:
return true;
+ case Intrinsic::bitreverse:
case Intrinsic::bswap:
case Intrinsic::ctlz:
case Intrinsic::ctpop:
diff --git a/lib/Bitcode/Reader/MetadataLoader.cpp b/lib/Bitcode/Reader/MetadataLoader.cpp
index 460d39cc28d87..4a5d18e2db750 100644
--- a/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -429,7 +429,7 @@ class MetadataLoader::MetadataLoaderImpl {
/// Populate the index above to enable lazily loading of metadata, and load
/// the named metadata as well as the transitively referenced global
/// Metadata.
- Expected<bool> lazyLoadModuleMetadataBlock(PlaceholderQueue &Placeholders);
+ Expected<bool> lazyLoadModuleMetadataBlock();
/// On-demand loading of a single metadata. Requires the index above to be
/// populated.
@@ -516,8 +516,8 @@ Error error(const Twine &Message) {
Message, make_error_code(BitcodeError::CorruptedBitcode));
}
-Expected<bool> MetadataLoader::MetadataLoaderImpl::lazyLoadModuleMetadataBlock(
- PlaceholderQueue &Placeholders) {
+Expected<bool>
+MetadataLoader::MetadataLoaderImpl::lazyLoadModuleMetadataBlock() {
IndexCursor = Stream;
SmallVector<uint64_t, 64> Record;
// Get the abbrevs, and preload record positions to make them lazy-loadable.
@@ -701,7 +701,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadata(bool ModuleLevel) {
// then load individual record as needed, starting with the named metadata.
if (ModuleLevel && IsImporting && MetadataList.empty() &&
!DisableLazyLoading) {
- auto SuccessOrErr = lazyLoadModuleMetadataBlock(Placeholders);
+ auto SuccessOrErr = lazyLoadModuleMetadataBlock();
if (!SuccessOrErr)
return SuccessOrErr.takeError();
if (SuccessOrErr.get()) {
@@ -1561,7 +1561,6 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment(
return error("Invalid record");
SmallVector<uint64_t, 64> Record;
-
PlaceholderQueue Placeholders;
while (true) {
@@ -1608,10 +1607,12 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment(
auto Idx = Record[i + 1];
if (Idx < (MDStringRef.size() + GlobalMetadataBitPosIndex.size()) &&
- !MetadataList.lookup(Idx))
+ !MetadataList.lookup(Idx)) {
// Load the attachment if it is in the lazy-loadable range and hasn't
// been loaded yet.
lazyLoadOneMetadata(Idx, Placeholders);
+ resolveForwardRefsAndPlaceholders(Placeholders);
+ }
Metadata *Node = MetadataList.getMetadataFwdRef(Idx);
if (isa<LocalAsMetadata>(Node))
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index a37f4e1116b43..6b62f11f12405 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -1714,7 +1714,7 @@ void DAGTypeLegalizer::ExpandIntRes_MINMAX(SDNode *N,
EVT CCT = getSetCCResultType(NVT);
// Hi part is always the same op
- Hi = DAG.getNode(N->getOpcode(), DL, {NVT, NVT}, {LHSH, RHSH});
+ Hi = DAG.getNode(N->getOpcode(), DL, NVT, {LHSH, RHSH});
// We need to know whether to select Lo part that corresponds to 'winning'
// Hi part or if Hi parts are equal.
@@ -1725,7 +1725,7 @@ void DAGTypeLegalizer::ExpandIntRes_MINMAX(SDNode *N,
SDValue LoCmp = DAG.getSelect(DL, NVT, IsHiLeft, LHSL, RHSL);
// Recursed Lo part if Hi parts are equal, this uses unsigned version
- SDValue LoMinMax = DAG.getNode(LoOpc, DL, {NVT, NVT}, {LHSL, RHSL});
+ SDValue LoMinMax = DAG.getNode(LoOpc, DL, NVT, {LHSL, RHSL});
Lo = DAG.getSelect(DL, NVT, IsHiEq, LoMinMax, LoCmp);
}
diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp
index bae828a2263c1..234b2043a6a14 100644
--- a/lib/CodeGen/StackSlotColoring.cpp
+++ b/lib/CodeGen/StackSlotColoring.cpp
@@ -381,7 +381,6 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
I != E; ++I) {
if (DCELimit != -1 && (int)NumDead >= DCELimit)
break;
-
int FirstSS, SecondSS;
if (TII->isStackSlotCopy(*I, FirstSS, SecondSS) && FirstSS == SecondSS &&
FirstSS != -1) {
@@ -392,12 +391,18 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
}
MachineBasicBlock::iterator NextMI = std::next(I);
- if (NextMI == MBB->end()) continue;
+ MachineBasicBlock::iterator ProbableLoadMI = I;
unsigned LoadReg = 0;
unsigned StoreReg = 0;
if (!(LoadReg = TII->isLoadFromStackSlot(*I, FirstSS)))
continue;
+ // Skip the ...pseudo debugging... instructions between a load and store.
+ while ((NextMI != E) && NextMI->isDebugValue()) {
+ ++NextMI;
+ ++I;
+ }
+ if (NextMI == E) continue;
if (!(StoreReg = TII->isStoreToStackSlot(*NextMI, SecondSS)))
continue;
if (FirstSS != SecondSS || LoadReg != StoreReg || FirstSS == -1) continue;
@@ -407,7 +412,7 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
if (NextMI->findRegisterUseOperandIdx(LoadReg, true, nullptr) != -1) {
++NumDead;
- toErase.push_back(&*I);
+ toErase.push_back(&*ProbableLoadMI);
}
toErase.push_back(&*NextMI);
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index a5a30fab5b698..8f6b1849169a9 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -896,6 +896,48 @@ uint32_t RuntimeDyldELF::getMatchingLoRelocation(uint32_t RelType,
return ELF::R_MIPS_NONE;
}
+// Sometimes we don't need to create a thunk for a branch.
+// This typically happens when the branch target is located
+// in the same object file. In that case the target is either
+// a weak symbol or a symbol in a different executable section.
+// This function checks whether the branch target is located in the
+// same object file and whether the distance between source and target
+// fits the R_AARCH64_CALL26 relocation. If both conditions are
+// met, it emits a direct jump to the target and returns true.
+// Otherwise false is returned and a thunk is created.
+bool RuntimeDyldELF::resolveAArch64ShortBranch(
+ unsigned SectionID, relocation_iterator RelI,
+ const RelocationValueRef &Value) {
+ uint64_t Address;
+ if (Value.SymbolName) {
+ auto Loc = GlobalSymbolTable.find(Value.SymbolName);
+
+ // Don't create direct branch for external symbols.
+ if (Loc == GlobalSymbolTable.end())
+ return false;
+
+ const auto &SymInfo = Loc->second;
+ Address =
+ uint64_t(Sections[SymInfo.getSectionID()].getLoadAddressWithOffset(
+ SymInfo.getOffset()));
+ } else {
+ Address = uint64_t(Sections[Value.SectionID].getLoadAddress());
+ }
+ uint64_t Offset = RelI->getOffset();
+ uint64_t SourceAddress = Sections[SectionID].getLoadAddressWithOffset(Offset);
+
+  // R_AARCH64_CALL26 requires the immediate to be in the range -2^27 <= imm < 2^27.
+  // If the distance between source and target is out of range then we should
+  // create a thunk.
+ if (!isInt<28>(Address + Value.Addend - SourceAddress))
+ return false;
+
+ resolveRelocation(Sections[SectionID], Offset, Address, RelI->getType(),
+ Value.Addend);
+
+ return true;
+}
+
Expected<relocation_iterator>
RuntimeDyldELF::processRelocationRef(
unsigned SectionID, relocation_iterator RelI, const ObjectFile &O,
@@ -1003,7 +1045,7 @@ RuntimeDyldELF::processRelocationRef(
(uint64_t)Section.getAddressWithOffset(i->second),
RelType, 0);
DEBUG(dbgs() << " Stub function found\n");
- } else {
+ } else if (!resolveAArch64ShortBranch(SectionID, RelI, Value)) {
// Create a new stub function.
DEBUG(dbgs() << " Create a new stub function\n");
Stubs[Value] = Section.getStubOffset();
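
The range check relies on R_AARCH64_CALL26 encoding a 26-bit word offset, i.e. a signed 28-bit byte offset (±128 MiB). A standalone illustration of the same check (plain C++, not RuntimeDyld code):

    #include <cstdint>
    #include <cstdio>

    // Returns true if a direct BL from Source to Target+Addend is encodable.
    static bool fitsInCall26(uint64_t Target, int64_t Addend, uint64_t Source) {
      int64_t Delta = (int64_t)(Target + Addend - Source);
      // 26-bit immediate, scaled by 4 => signed 28-bit byte range: [-2^27, 2^27).
      return Delta >= -(1LL << 27) && Delta < (1LL << 27);
    }

    int main() {
      printf("%d\n", fitsInCall26(0x140000000, 0, 0x140001000)); // 1: in range
      printf("%d\n", fitsInCall26(0x200000000, 0, 0x100000000)); // 0: needs a thunk
      return 0;
    }
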
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
index 796127ab92bd7..d1867d091fe29 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
@@ -40,6 +40,9 @@ class RuntimeDyldELF : public RuntimeDyldImpl {
void resolveAArch64Relocation(const SectionEntry &Section, uint64_t Offset,
uint64_t Value, uint32_t Type, int64_t Addend);
+ bool resolveAArch64ShortBranch(unsigned SectionID, relocation_iterator RelI,
+ const RelocationValueRef &Value);
+
void resolveARMRelocation(const SectionEntry &Section, uint64_t Offset,
uint32_t Value, uint32_t Type, int32_t Addend);
diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp
index 66ffe6db29d61..928f69a17de90 100644
--- a/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -196,8 +196,15 @@ crossImportIntoModule(Module &TheModule, const ModuleSummaryIndex &Index,
};
FunctionImporter Importer(Index, Loader);
- if (!Importer.importFunctions(TheModule, ImportList))
+ Expected<bool> Result = Importer.importFunctions(TheModule, ImportList);
+ if (!Result) {
+ handleAllErrors(Result.takeError(), [&](ErrorInfoBase &EIB) {
+ SMDiagnostic Err = SMDiagnostic(TheModule.getModuleIdentifier(),
+ SourceMgr::DK_Error, EIB.message());
+ Err.print("ThinLTO", errs());
+ });
report_fatal_error("importFunctions failed");
+ }
}
static void optimizeModule(Module &TheModule, TargetMachine &TM,
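
The call site now has to consume the Error carried by the Expected<bool> before aborting; the pattern is the same anywhere an Expected<T> is inspected. A small standalone sketch (hypothetical callee standing in for Importer.importFunctions, real llvm::Error API):

    #include "llvm/Support/Error.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    // Hypothetical stand-in for FunctionImporter::importFunctions(...).
    static Expected<bool> mayFail(bool Fail) {
      if (Fail)
        return make_error<StringError>("import failed", inconvertibleErrorCode());
      return true;
    }

    int main() {
      Expected<bool> R = mayFail(true);
      if (!R) {
        // The error must be handled (or taken) before R is destroyed.
        handleAllErrors(R.takeError(), [](const ErrorInfoBase &EIB) {
          errs() << "ThinLTO: " << EIB.message() << "\n";
        });
        return 1;
      }
      return 0;
    }
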
diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp
index 40105000c56ca..5b018676eba3d 100644
--- a/lib/Object/MachOObjectFile.cpp
+++ b/lib/Object/MachOObjectFile.cpp
@@ -2823,7 +2823,11 @@ StringRef MachORebaseEntry::typeName() const {
}
bool MachORebaseEntry::operator==(const MachORebaseEntry &Other) const {
+#ifdef EXPENSIVE_CHECKS
assert(Opcodes == Other.Opcodes && "compare iterators of different files");
+#else
+ assert(Opcodes.data() == Other.Opcodes.data() && "compare iterators of different files");
+#endif
return (Ptr == Other.Ptr) &&
(RemainingLoopCount == Other.RemainingLoopCount) &&
(Done == Other.Done);
@@ -3073,7 +3077,11 @@ uint32_t MachOBindEntry::flags() const { return Flags; }
int MachOBindEntry::ordinal() const { return Ordinal; }
bool MachOBindEntry::operator==(const MachOBindEntry &Other) const {
+#ifdef EXPENSIVE_CHECKS
assert(Opcodes == Other.Opcodes && "compare iterators of different files");
+#else
+ assert(Opcodes.data() == Other.Opcodes.data() && "compare iterators of different files");
+#endif
return (Ptr == Other.Ptr) &&
(RemainingLoopCount == Other.RemainingLoopCount) &&
(Done == Other.Done);
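
ArrayRef::operator== compares element by element, which is what the EXPENSIVE_CHECKS build keeps; the default build now only asserts that both iterators view the same underlying opcode buffer. A standalone contrast of the two checks (plain ArrayRef usage, not MachO code):

    #include "llvm/ADT/ArrayRef.h"
    #include <cassert>
    #include <cstdint>
    using namespace llvm;

    int main() {
      const uint8_t Buf1[] = {1, 2, 3, 4};
      const uint8_t Buf2[] = {1, 2, 3, 4};
      ArrayRef<uint8_t> A(Buf1), B(Buf1), C(Buf2);
      // Content comparison: O(n), true for any buffers holding equal bytes.
      assert(A.equals(B) && A.equals(C));
      // Identity comparison: O(1), true only when both views alias one buffer,
      // which is what iterators created from the same object file guarantee.
      assert(A.data() == B.data());
      assert(A.data() != C.data());
      return 0;
    }
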
diff --git a/lib/Object/ModuleSummaryIndexObjectFile.cpp b/lib/Object/ModuleSummaryIndexObjectFile.cpp
index 202783e7d993e..11ace84b9cebb 100644
--- a/lib/Object/ModuleSummaryIndexObjectFile.cpp
+++ b/lib/Object/ModuleSummaryIndexObjectFile.cpp
@@ -22,6 +22,12 @@
using namespace llvm;
using namespace object;
+static llvm::cl::opt<bool> IgnoreEmptyThinLTOIndexFile(
+ "ignore-empty-index-file", llvm::cl::ZeroOrMore,
+ llvm::cl::desc(
+ "Ignore an empty index file and perform non-ThinLTO compilation"),
+ llvm::cl::init(false));
+
ModuleSummaryIndexObjectFile::ModuleSummaryIndexObjectFile(
MemoryBufferRef Object, std::unique_ptr<ModuleSummaryIndex> I)
: SymbolicFile(Binary::ID_ModuleSummaryIndex, Object), Index(std::move(I)) {
@@ -97,6 +103,8 @@ llvm::getModuleSummaryIndexForFile(StringRef Path) {
if (EC)
return errorCodeToError(EC);
MemoryBufferRef BufferRef = (FileOrErr.get())->getMemBufferRef();
+ if (IgnoreEmptyThinLTOIndexFile && !BufferRef.getBufferSize())
+ return nullptr;
Expected<std::unique_ptr<object::ModuleSummaryIndexObjectFile>> ObjOrErr =
object::ModuleSummaryIndexObjectFile::create(BufferRef);
if (!ObjOrErr)
diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp
index 0a989706b4361..3889902eea54a 100644
--- a/lib/Support/CommandLine.cpp
+++ b/lib/Support/CommandLine.cpp
@@ -373,7 +373,7 @@ void Option::removeArgument() { GlobalParser->removeOption(this); }
void Option::setArgStr(StringRef S) {
if (FullyInitialized)
GlobalParser->updateArgStr(this, S);
- assert(S[0] != '-' && "Option can't start with '-");
+ assert((S.empty() || S[0] != '-') && "Option can't start with '-");
ArgStr = S;
}
diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp
index 0616d05aff573..4bb035eeccca8 100644
--- a/lib/Support/Path.cpp
+++ b/lib/Support/Path.cpp
@@ -571,6 +571,16 @@ void native(SmallVectorImpl<char> &Path) {
#endif
}
+std::string convert_to_slash(StringRef path) {
+#ifdef LLVM_ON_WIN32
+ std::string s = path.str();
+ std::replace(s.begin(), s.end(), '\\', '/');
+ return s;
+#else
+ return path;
+#endif
+}
+
StringRef filename(StringRef path) {
return *rbegin(path);
}
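
A minimal usage sketch of the new helper (behavior assumed from the implementation above: conversion only happens on a Windows host, elsewhere the input comes back unchanged):

    #include "llvm/Support/Path.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    int main() {
      // Prints "tmp/obj/foo.o" on Windows; on other hosts the backslashes are
      // kept, since '/' is already the native separator there.
      outs() << sys::path::convert_to_slash("tmp\\obj\\foo.o") << "\n";
      return 0;
    }
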
diff --git a/lib/Support/TarWriter.cpp b/lib/Support/TarWriter.cpp
index 5fc17d2763776..f79b364dc1f7b 100644
--- a/lib/Support/TarWriter.cpp
+++ b/lib/Support/TarWriter.cpp
@@ -26,6 +26,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Path.h"
using namespace llvm;
@@ -109,27 +110,44 @@ static void writePaxHeader(raw_fd_ostream &OS, StringRef Path) {
pad(OS);
}
+// In the Ustar header, a path can be split at any '/' to store
+// a path into UstarHeader::Name and UstarHeader::Prefix. This
+// function splits a given path for that purpose.
+static std::pair<StringRef, StringRef> splitPath(StringRef Path) {
+ if (Path.size() <= sizeof(UstarHeader::Name))
+ return {"", Path};
+ size_t Sep = Path.rfind('/', sizeof(UstarHeader::Name) + 1);
+ if (Sep == StringRef::npos)
+ return {"", Path};
+ return {Path.substr(0, Sep), Path.substr(Sep + 1)};
+}
+
+// Returns true if a given path can be stored to a Ustar header
+// without the PAX extension.
+static bool fitsInUstar(StringRef Path) {
+ StringRef Prefix;
+ StringRef Name;
+ std::tie(Prefix, Name) = splitPath(Path);
+ return Name.size() <= sizeof(UstarHeader::Name);
+}
+
// The PAX header is an extended format, so a PAX header needs
// to be followed by a "real" header.
static void writeUstarHeader(raw_fd_ostream &OS, StringRef Path, size_t Size) {
+ StringRef Prefix;
+ StringRef Name;
+ std::tie(Prefix, Name) = splitPath(Path);
+
UstarHeader Hdr = {};
- memcpy(Hdr.Name, Path.data(), Path.size());
+ memcpy(Hdr.Name, Name.data(), Name.size());
memcpy(Hdr.Mode, "0000664", 8);
snprintf(Hdr.Size, sizeof(Hdr.Size), "%011zo", Size);
memcpy(Hdr.Magic, "ustar", 6);
+ memcpy(Hdr.Prefix, Prefix.data(), Prefix.size());
computeChecksum(Hdr);
OS << StringRef(reinterpret_cast<char *>(&Hdr), sizeof(Hdr));
}
-// We want to use '/' as a path separator even on Windows.
-// This function canonicalizes a given path.
-static std::string canonicalize(std::string S) {
-#ifdef LLVM_ON_WIN32
- std::replace(S.begin(), S.end(), '\\', '/');
-#endif
- return S;
-}
-
// Creates a TarWriter instance and returns it.
Expected<std::unique_ptr<TarWriter>> TarWriter::create(StringRef OutputPath,
StringRef BaseDir) {
@@ -145,8 +163,8 @@ TarWriter::TarWriter(int FD, StringRef BaseDir)
// Append a given file to an archive.
void TarWriter::append(StringRef Path, StringRef Data) {
// Write Path and Data.
- std::string S = BaseDir + "/" + canonicalize(Path) + "\0";
- if (S.size() <= sizeof(UstarHeader::Name)) {
+ std::string S = BaseDir + "/" + sys::path::convert_to_slash(Path) + "\0";
+ if (fitsInUstar(S)) {
writeUstarHeader(OS, S, Data.size());
} else {
writePaxHeader(OS, S);
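
To see how splitPath() behaves for long names, here is a close analogue of the same logic in plain C++ (assuming the 100-byte UstarHeader::Name field and std::string in place of StringRef; illustration only):

    #include <cstdio>
    #include <string>
    #include <utility>

    // Everything after the chosen '/' must fit in the 100-byte Name field;
    // the part before it goes into the Prefix field.
    static std::pair<std::string, std::string> splitPath(const std::string &Path) {
      const size_t NameSize = 100; // sizeof(UstarHeader::Name)
      if (Path.size() <= NameSize)
        return {"", Path};
      size_t Sep = Path.rfind('/', NameSize + 1);
      if (Sep == std::string::npos)
        return {"", Path};
      return {Path.substr(0, Sep), Path.substr(Sep + 1)};
    }

    int main() {
      std::string Long(120, 'a');
      Long.insert(60, "/");            // 121-character path with one separator
      auto P = splitPath(Long);
      printf("prefix=%zu name=%zu\n", P.first.size(), P.second.size()); // 60 and 60
      return 0;
    }
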
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index ef3b44f7c2116..2b4fc5397b18d 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -608,6 +608,10 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
if ((C = dyn_cast<ConstantSDNode>(Addr))) {
Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
+ } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
+ (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
+ Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+ Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
} else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
(C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
Base = Addr.getOperand(0);
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 0b0a0e7d083ed..730bcdcf7afa5 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -172,16 +172,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v2f64, Promote);
AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
- setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
- setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
-
- setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
- setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
-
- setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
- setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
- setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
-
setTruncStoreAction(MVT::i64, MVT::i1, Expand);
setTruncStoreAction(MVT::i64, MVT::i8, Expand);
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index a6c31629e7c40..da9d009c542b6 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -822,6 +822,7 @@ public:
bool isForcedVOP3() const { return ForcedEncodingSize == 64; }
bool isForcedDPP() const { return ForcedDPP; }
bool isForcedSDWA() const { return ForcedSDWA; }
+ ArrayRef<unsigned> getMatchedVariants() const;
std::unique_ptr<AMDGPUOperand> parseRegister();
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
@@ -1630,31 +1631,44 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
return Match_Success;
}
+// What asm variants we should check
+ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const {
+ if (getForcedEncodingSize() == 32) {
+ static const unsigned Variants[] = {AMDGPUAsmVariants::DEFAULT};
+ return makeArrayRef(Variants);
+ }
+
+ if (isForcedVOP3()) {
+ static const unsigned Variants[] = {AMDGPUAsmVariants::VOP3};
+ return makeArrayRef(Variants);
+ }
+
+ if (isForcedSDWA()) {
+ static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA};
+ return makeArrayRef(Variants);
+ }
+
+ if (isForcedDPP()) {
+ static const unsigned Variants[] = {AMDGPUAsmVariants::DPP};
+ return makeArrayRef(Variants);
+ }
+
+ static const unsigned Variants[] = {
+ AMDGPUAsmVariants::DEFAULT, AMDGPUAsmVariants::VOP3,
+ AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::DPP
+ };
+
+ return makeArrayRef(Variants);
+}
+
bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
uint64_t &ErrorInfo,
bool MatchingInlineAsm) {
- // What asm variants we should check
- std::vector<unsigned> MatchedVariants;
- if (getForcedEncodingSize() == 32) {
- MatchedVariants = {AMDGPUAsmVariants::DEFAULT};
- } else if (isForcedVOP3()) {
- MatchedVariants = {AMDGPUAsmVariants::VOP3};
- } else if (isForcedSDWA()) {
- MatchedVariants = {AMDGPUAsmVariants::SDWA};
- } else if (isForcedDPP()) {
- MatchedVariants = {AMDGPUAsmVariants::DPP};
- } else {
- MatchedVariants = {AMDGPUAsmVariants::DEFAULT,
- AMDGPUAsmVariants::VOP3,
- AMDGPUAsmVariants::SDWA,
- AMDGPUAsmVariants::DPP};
- }
-
MCInst Inst;
unsigned Result = Match_Success;
- for (auto Variant : MatchedVariants) {
+ for (auto Variant : getMatchedVariants()) {
uint64_t EI;
auto R = MatchInstructionImpl(Operands, Inst, EI, MatchingInlineAsm,
Variant);
@@ -3486,7 +3500,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
for (unsigned E = Operands.size(); I != E; ++I) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
// Add the register arguments
- if ((BasicInstType == SIInstrFlags::VOPC ||
+ if ((BasicInstType == SIInstrFlags::VOPC ||
BasicInstType == SIInstrFlags::VOP2)&&
Op.isReg() &&
Op.Reg.RegNo == AMDGPU::VCC) {
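
Returning an ArrayRef over function-local static arrays, as getMatchedVariants() now does, hands back a view with no per-call heap allocation, unlike the previous std::vector construction. The idiom in isolation (generic sketch with arbitrary values, not the parser code):

    #include "llvm/ADT/ArrayRef.h"
    using namespace llvm;

    static ArrayRef<unsigned> selectVariants(bool OnlyDefault) {
      if (OnlyDefault) {
        static const unsigned One[] = {0};
        return makeArrayRef(One);   // safe: the array has static storage duration
      }
      static const unsigned All[] = {0, 1, 2, 3};
      return makeArrayRef(All);
    }
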
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 89c9266746ac4..de7ce5cb9e478 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -99,6 +99,18 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::i32, MVT::i8, Custom);
setTruncStoreAction(MVT::i32, MVT::i16, Custom);
+ // We need to include these since trunc STORES to PRIVATE need
+ // special handling to accommodate RMW
+ setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i16, Custom);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i16, Custom);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i16, Custom);
+ setTruncStoreAction(MVT::v32i32, MVT::v32i16, Custom);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i8, Custom);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i8, Custom);
+ setTruncStoreAction(MVT::v32i32, MVT::v32i8, Custom);
// Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
@@ -1087,79 +1099,114 @@ void R600TargetLowering::getStackAddress(unsigned StackWidth,
SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
SelectionDAG &DAG) const {
SDLoc DL(Store);
+ //TODO: Who creates the i8 stores?
+ assert(Store->isTruncatingStore()
+ || Store->getValue().getValueType() == MVT::i8);
+ assert(Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS);
- unsigned Mask = 0;
+ SDValue Mask;
if (Store->getMemoryVT() == MVT::i8) {
- Mask = 0xff;
+ assert(Store->getAlignment() >= 1);
+ Mask = DAG.getConstant(0xff, DL, MVT::i32);
} else if (Store->getMemoryVT() == MVT::i16) {
- Mask = 0xffff;
+ assert(Store->getAlignment() >= 2);
+    Mask = DAG.getConstant(0xffff, DL, MVT::i32);
+ } else {
+ llvm_unreachable("Unsupported private trunc store");
}
SDValue Chain = Store->getChain();
SDValue BasePtr = Store->getBasePtr();
+ SDValue Offset = Store->getOffset();
EVT MemVT = Store->getMemoryVT();
- SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
- DAG.getConstant(2, DL, MVT::i32));
- SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
- Chain, Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32));
+ SDValue LoadPtr = BasePtr;
+ if (!Offset.isUndef()) {
+ LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
+ }
+
+ // Get dword location
+ // TODO: this should be eliminated by the future SHR ptr, 2
+ SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
+ DAG.getConstant(0xfffffffc, DL, MVT::i32));
+
+ // Load dword
+ // TODO: can we be smarter about machine pointer info?
+ SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());
- SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
+ Chain = Dst.getValue(1);
+
+ // Get offset in dword
+ SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
DAG.getConstant(0x3, DL, MVT::i32));
+ // Convert byte offset to bit shift
SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
DAG.getConstant(3, DL, MVT::i32));
+  // TODO: Contrary to the name of the function,
+ // it also handles sub i32 non-truncating stores (like i1)
SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
Store->getValue());
+ // Mask the value to the right type
SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
+ // Shift the value in place
SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
MaskedValue, ShiftAmt);
- SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
- DAG.getConstant(Mask, DL, MVT::i32),
- ShiftAmt);
- DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
- DAG.getConstant(0xffffffff, DL, MVT::i32));
+ // Shift the mask in place
+ SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, Mask, ShiftAmt);
+
+ // Invert the mask. NOTE: if we had native ROL instructions we could
+ // use inverted mask
+ DstMask = DAG.getNOT(DL, DstMask, MVT::i32);
+
+ // Cleanup the target bits
Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
+ // Add the new bits
SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
- return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
- Chain, Value, Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32));
+
+ // Store dword
+ // TODO: Can we be smarter about MachinePointerInfo?
+ return DAG.getStore(Chain, DL, Value, Ptr, MachinePointerInfo());
}
SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
unsigned AS = StoreNode->getAddressSpace();
+
+ SDValue Chain = StoreNode->getChain();
+ SDValue Ptr = StoreNode->getBasePtr();
SDValue Value = StoreNode->getValue();
- EVT ValueVT = Value.getValueType();
+
+ EVT VT = Value.getValueType();
EVT MemVT = StoreNode->getMemoryVT();
- unsigned Align = StoreNode->getAlignment();
+ EVT PtrVT = Ptr.getValueType();
+ SDLoc DL(Op);
+
+ // Neither LOCAL nor PRIVATE can do vectors at the moment
if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
- ValueVT.isVector()) {
- return SplitVectorStore(Op, DAG);
+ VT.isVector()) {
+ return scalarizeVectorStore(StoreNode, DAG);
}
- // Private AS needs special fixes
- if (Align < MemVT.getStoreSize() && (AS != AMDGPUAS::PRIVATE_ADDRESS) &&
+ unsigned Align = StoreNode->getAlignment();
+ if (Align < MemVT.getStoreSize() &&
!allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
return expandUnalignedStore(StoreNode, DAG);
}
- SDLoc DL(Op);
- SDValue Chain = StoreNode->getChain();
- SDValue Ptr = StoreNode->getBasePtr();
+ SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr,
+ DAG.getConstant(2, DL, PtrVT));
if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
// It is beneficial to create MSKOR here instead of combiner to avoid
// artificial dependencies introduced by RMW
if (StoreNode->isTruncatingStore()) {
- EVT VT = Value.getValueType();
assert(VT.bitsLE(MVT::i32));
SDValue MaskConstant;
if (MemVT == MVT::i8) {
@@ -1169,15 +1216,19 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
assert(StoreNode->getAlignment() >= 2);
MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
}
- SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
- DAG.getConstant(2, DL, MVT::i32));
- SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
- DAG.getConstant(0x00000003, DL, VT));
+
+ SDValue ByteIndex = DAG.getNode(ISD::AND, DL, PtrVT, Ptr,
+ DAG.getConstant(0x00000003, DL, PtrVT));
+ SDValue BitShift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
+ DAG.getConstant(3, DL, VT));
+
+ // Put the mask in correct place
+ SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, BitShift);
+
+      // Mask the value down to the memory width
SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
- SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
- DAG.getConstant(3, DL, VT));
- SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
- SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
+ SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, BitShift);
+
// XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
// vector instead.
SDValue Src[4] = {
@@ -1191,12 +1242,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
Op->getVTList(), Args, MemVT,
StoreNode->getMemOperand());
- } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
- ValueVT.bitsGE(MVT::i32)) {
+ } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && VT.bitsGE(MVT::i32)) {
// Convert pointer from byte address to dword address.
- Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
- DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
- Ptr, DAG.getConstant(2, DL, MVT::i32)));
+ Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr);
if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
llvm_unreachable("Truncated and indexed stores not supported yet");
@@ -1207,49 +1255,22 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
}
}
+ // GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes
if (AS != AMDGPUAS::PRIVATE_ADDRESS)
return SDValue();
if (MemVT.bitsLT(MVT::i32))
return lowerPrivateTruncStore(StoreNode, DAG);
- // Lowering for indirect addressing
- const MachineFunction &MF = DAG.getMachineFunction();
- const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
- unsigned StackWidth = TFL->getStackWidth(MF);
-
- Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
-
- if (ValueVT.isVector()) {
- unsigned NumElemVT = ValueVT.getVectorNumElements();
- EVT ElemVT = ValueVT.getVectorElementType();
- SmallVector<SDValue, 4> Stores(NumElemVT);
-
- assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
- "vector width in load");
-
- for (unsigned i = 0; i < NumElemVT; ++i) {
- unsigned Channel, PtrIncr;
- getStackAddress(StackWidth, i, Channel, PtrIncr);
- Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
- DAG.getConstant(PtrIncr, DL, MVT::i32));
- SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
- Value, DAG.getConstant(i, DL, MVT::i32));
-
- Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
- Chain, Elem, Ptr,
- DAG.getTargetConstant(Channel, DL, MVT::i32));
- }
- Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
- } else {
- if (ValueVT == MVT::i8) {
- Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
- }
- Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
+ // Standard i32+ store, tag it with DWORDADDR to note that the address
+ // has been shifted
+ if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
+ Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr);
+ return DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
}
- return Chain;
+ // Tagged i32+ stores will be matched by patterns
+ return SDValue();
}
// return (512 + (kc_bank << 12)
@@ -1299,51 +1320,50 @@ SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
LoadSDNode *Load = cast<LoadSDNode>(Op);
ISD::LoadExtType ExtType = Load->getExtensionType();
EVT MemVT = Load->getMemoryVT();
+ assert(Load->getAlignment() >= MemVT.getStoreSize());
- // <SI && AS=PRIVATE && EXTLOAD && size < 32bit,
- // register (2-)byte extract.
+ SDValue BasePtr = Load->getBasePtr();
+ SDValue Chain = Load->getChain();
+ SDValue Offset = Load->getOffset();
- // Get Register holding the target.
- SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
- DAG.getConstant(2, DL, MVT::i32));
- // Load the Register.
- SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
- Load->getChain(),
- Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32),
- Op.getOperand(2));
+ SDValue LoadPtr = BasePtr;
+ if (!Offset.isUndef()) {
+ LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
+ }
+
+ // Get dword location
+ // NOTE: this should be eliminated by the future SHR ptr, 2
+ SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
+ DAG.getConstant(0xfffffffc, DL, MVT::i32));
+
+ // Load dword
+ // TODO: can we be smarter about machine pointer info?
+ SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());
// Get offset within the register.
SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
- Load->getBasePtr(),
- DAG.getConstant(0x3, DL, MVT::i32));
+ LoadPtr, DAG.getConstant(0x3, DL, MVT::i32));
// Bit offset of target byte (byteIdx * 8).
SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
DAG.getConstant(3, DL, MVT::i32));
// Shift to the right.
- Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
+ SDValue Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Read, ShiftAmt);
// Eliminate the upper bits by setting them to ...
EVT MemEltVT = MemVT.getScalarType();
- // ... ones.
- if (ExtType == ISD::SEXTLOAD) {
+ if (ExtType == ISD::SEXTLOAD) { // ... ones.
SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
-
- SDValue Ops[] = {
- DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
- Load->getChain()
- };
-
- return DAG.getMergeValues(Ops, DL);
+ Ret = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
+ } else { // ... or zeros.
+ Ret = DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
}
- // ... or zeros.
SDValue Ops[] = {
- DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
- Load->getChain()
+ Ret,
+ Read.getValue(1) // This should be our output chain
};
return DAG.getMergeValues(Ops, DL);
@@ -1365,12 +1385,10 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = LoadNode->getChain();
SDValue Ptr = LoadNode->getBasePtr();
- if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
- SDValue MergedValues[2] = {
- scalarizeVectorLoad(LoadNode, DAG),
- Chain
- };
- return DAG.getMergeValues(MergedValues, DL);
+ if ((LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ LoadNode->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
+ VT.isVector()) {
+ return scalarizeVectorLoad(LoadNode, DAG);
}
int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
@@ -1421,8 +1439,6 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
return DAG.getMergeValues(MergedValues, DL);
}
- SDValue LoweredLoad;
-
// For most operations returning SDValue() will result in the node being
// expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
// need to manually expand loads that may be legal in some address spaces and
@@ -1447,47 +1463,14 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
- // Lowering for indirect addressing
- const MachineFunction &MF = DAG.getMachineFunction();
- const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
- unsigned StackWidth = TFL->getStackWidth(MF);
-
- Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
-
- if (VT.isVector()) {
- unsigned NumElemVT = VT.getVectorNumElements();
- EVT ElemVT = VT.getVectorElementType();
- SDValue Loads[4];
-
- assert(NumElemVT <= 4);
- assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
- "vector width in load");
-
- for (unsigned i = 0; i < NumElemVT; ++i) {
- unsigned Channel, PtrIncr;
- getStackAddress(StackWidth, i, Channel, PtrIncr);
- Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
- DAG.getConstant(PtrIncr, DL, MVT::i32));
- Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
- Chain, Ptr,
- DAG.getTargetConstant(Channel, DL, MVT::i32),
- Op.getOperand(2));
- }
- EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT);
- LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT));
- } else {
- LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
- Chain, Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32), // Channel
- Op.getOperand(2));
+ // DWORDADDR ISD marks already shifted address
+ if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
+ assert(VT == MVT::i32);
+ Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(2, DL, MVT::i32));
+ Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, MVT::i32, Ptr);
+ return DAG.getLoad(MVT::i32, DL, Chain, Ptr, LoadNode->getMemOperand());
}
-
- SDValue Ops[2] = {
- LoweredLoad,
- Chain
- };
-
- return DAG.getMergeValues(Ops, DL);
+ return SDValue();
}
SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
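
lowerPrivateTruncStore() above is a read-modify-write of a byte or halfword inside an aligned 32-bit word: load the containing dword, clear the target bits with a shifted mask, OR in the shifted value, and store the dword back. The same arithmetic in plain C++ (illustration only, not target code):

    #include <cassert>
    #include <cstdint>

    // Store an 8-bit value at byte offset (Addr & 3) inside the dword at Addr & ~3.
    static void storeByteRMW(uint32_t *Mem, uint32_t Addr, uint8_t Val) {
      uint32_t Dword = Mem[Addr >> 2];          // load the containing dword
      uint32_t Shift = (Addr & 3) * 8;          // bit position of the target byte
      uint32_t Mask  = 0xffu << Shift;          // mask covering that byte
      Dword = (Dword & ~Mask) | (uint32_t(Val) << Shift); // clear, then insert
      Mem[Addr >> 2] = Dword;                   // store the dword back
    }

    int main() {
      uint32_t Mem[1] = {0xdeadbeef};
      storeByteRMW(Mem, 2, 0x42);
      assert(Mem[0] == 0xde42beef);  // byte 2 replaced, neighbours untouched
      return 0;
    }
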
diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td
index 3a72e0791fd68..19795bdde6473 100644
--- a/lib/Target/AMDGPU/R600Instructions.td
+++ b/lib/Target/AMDGPU/R600Instructions.td
@@ -1268,6 +1268,17 @@ let Predicates = [isR600] in {
defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>;
+// Hardcode channel to 0
+// NOTE: LSHR is not available here. LSHR is a per-family instruction.
+def : Pat <
+ (i32 (load_private ADDRIndirect:$addr) ),
+ (R600_RegisterLoad FRAMEri:$addr, (i32 0))
+>;
+def : Pat <
+ (store_private i32:$val, ADDRIndirect:$addr),
+ (R600_RegisterStore i32:$val, FRAMEri:$addr, (i32 0))
+>;
+
//===----------------------------------------------------------------------===//
// Pseudo instructions
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index c78e97dfd46f8..9140fe6cd1484 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -99,6 +99,18 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v16i32, Custom);
setOperationAction(ISD::STORE, MVT::i1, Custom);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
+ setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
+ setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
+
+
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
@@ -699,7 +711,8 @@ SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG,
SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Chain,
- unsigned Offset, bool Signed) const {
+ unsigned Offset, bool Signed,
+ const ISD::InputArg *Arg) const {
const DataLayout &DL = DAG.getDataLayout();
Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
@@ -713,20 +726,21 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
- SDValue Val;
+ SDValue Val = Load;
+ if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
+ VT.bitsLT(MemVT)) {
+ unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
+ Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
+ }
+
if (MemVT.isFloatingPoint())
- Val = getFPExtOrFPTrunc(DAG, Load, SL, VT);
+ Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
else if (Signed)
- Val = DAG.getSExtOrTrunc(Load, SL, VT);
+ Val = DAG.getSExtOrTrunc(Val, SL, VT);
else
- Val = DAG.getZExtOrTrunc(Load, SL, VT);
-
- SDValue Ops[] = {
- Val,
- Load.getValue(1)
- };
+ Val = DAG.getZExtOrTrunc(Val, SL, VT);
- return DAG.getMergeValues(Ops, SL);
+ return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
}
SDValue SITargetLowering::LowerFormalArguments(
@@ -899,7 +913,8 @@ SDValue SITargetLowering::LowerFormalArguments(
// The first 36 bytes of the input buffer contains information about
// thread group and global sizes.
SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain,
- Offset, Ins[i].Flags.isSExt());
+ Offset, Ins[i].Flags.isSExt(),
+ &Ins[i]);
Chains.push_back(Arg.getValue(1));
auto *ParamTy =
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 9583f6db6faaf..6c04e4f309773 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -24,7 +24,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain,
unsigned Offset) const;
SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL,
- SDValue Chain, unsigned Offset, bool Signed) const;
+ SDValue Chain, unsigned Offset, bool Signed,
+ const ISD::InputArg *Arg = nullptr) const;
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const override;
SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
diff --git a/lib/Target/AVR/AVRISelDAGToDAG.cpp b/lib/Target/AVR/AVRISelDAGToDAG.cpp
index 156a21dfecfea..462a7d57d2de5 100644
--- a/lib/Target/AVR/AVRISelDAGToDAG.cpp
+++ b/lib/Target/AVR/AVRISelDAGToDAG.cpp
@@ -203,8 +203,8 @@ unsigned AVRDAGToDAGISel::selectIndexedProgMemLoad(const LoadSDNode *LD,
bool AVRDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
unsigned ConstraintCode,
std::vector<SDValue> &OutOps) {
- assert(ConstraintCode == InlineAsm::Constraint_m ||
- ConstraintCode == InlineAsm::Constraint_Q &&
+ assert((ConstraintCode == InlineAsm::Constraint_m ||
+ ConstraintCode == InlineAsm::Constraint_Q) &&
"Unexpected asm memory constraint");
MachineRegisterInfo &RI = MF->getRegInfo();
diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp
index 53668f05b59ba..07fc3f6890b8d 100644
--- a/lib/Target/AVR/AVRISelLowering.cpp
+++ b/lib/Target/AVR/AVRISelLowering.cpp
@@ -14,6 +14,7 @@
#include "AVRISelLowering.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -1933,5 +1934,45 @@ void AVRTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
+unsigned AVRTargetLowering::getRegisterByName(const char *RegName,
+ EVT VT,
+ SelectionDAG &DAG) const {
+ unsigned Reg;
+
+ if (VT == MVT::i8) {
+ Reg = StringSwitch<unsigned>(RegName)
+ .Case("r0", AVR::R0).Case("r1", AVR::R1).Case("r2", AVR::R2)
+ .Case("r3", AVR::R3).Case("r4", AVR::R4).Case("r5", AVR::R5)
+ .Case("r6", AVR::R6).Case("r7", AVR::R7).Case("r8", AVR::R8)
+ .Case("r9", AVR::R9).Case("r10", AVR::R10).Case("r11", AVR::R11)
+ .Case("r12", AVR::R12).Case("r13", AVR::R13).Case("r14", AVR::R14)
+ .Case("r15", AVR::R15).Case("r16", AVR::R16).Case("r17", AVR::R17)
+ .Case("r18", AVR::R18).Case("r19", AVR::R19).Case("r20", AVR::R20)
+ .Case("r21", AVR::R21).Case("r22", AVR::R22).Case("r23", AVR::R23)
+ .Case("r24", AVR::R24).Case("r25", AVR::R25).Case("r26", AVR::R26)
+ .Case("r27", AVR::R27).Case("r28", AVR::R28).Case("r29", AVR::R29)
+ .Case("r30", AVR::R30).Case("r31", AVR::R31)
+ .Case("X", AVR::R27R26).Case("Y", AVR::R29R28).Case("Z", AVR::R31R30)
+ .Default(0);
+ } else {
+ Reg = StringSwitch<unsigned>(RegName)
+ .Case("r0", AVR::R1R0).Case("r2", AVR::R3R2)
+ .Case("r4", AVR::R5R4).Case("r6", AVR::R7R6)
+ .Case("r8", AVR::R9R8).Case("r10", AVR::R11R10)
+ .Case("r12", AVR::R13R12).Case("r14", AVR::R15R14)
+ .Case("r16", AVR::R17R16).Case("r18", AVR::R19R18)
+ .Case("r20", AVR::R21R20).Case("r22", AVR::R23R22)
+ .Case("r24", AVR::R25R24).Case("r26", AVR::R27R26)
+ .Case("r28", AVR::R29R28).Case("r30", AVR::R31R30)
+ .Case("X", AVR::R27R26).Case("Y", AVR::R29R28).Case("Z", AVR::R31R30)
+ .Default(0);
+ }
+
+ if (Reg)
+ return Reg;
+
+ report_fatal_error("Invalid register name global variable");
+}
+
} // end of namespace llvm
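
getRegisterByName() is built on llvm::StringSwitch, which compares the input against each Case in order and falls back to the Default value; that is how unknown names end up on the report_fatal_error path above. The idiom in isolation (generic sketch with arbitrary values):

    #include "llvm/ADT/StringSwitch.h"
    #include <cassert>
    using namespace llvm;

    static unsigned classify(const char *Name) {
      return StringSwitch<unsigned>(Name)
          .Case("X", 26)
          .Case("Y", 28)
          .Case("Z", 30)
          .Default(0);   // unknown names fall through to 0
    }

    int main() {
      assert(classify("Y") == 28);
      assert(classify("r7") == 0);  // would hit the fatal-error path above
      return 0;
    }
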
diff --git a/lib/Target/AVR/AVRISelLowering.h b/lib/Target/AVR/AVRISelLowering.h
index 17074e1b1eeea..a8cdc4e7ae234 100644
--- a/lib/Target/AVR/AVRISelLowering.h
+++ b/lib/Target/AVR/AVRISelLowering.h
@@ -116,6 +116,9 @@ public:
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const override;
+ unsigned getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const override;
+
private:
SDValue getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AVRcc,
SelectionDAG &DAG, SDLoc dl) const;
diff --git a/lib/Target/BPF/BPFInstrInfo.cpp b/lib/Target/BPF/BPFInstrInfo.cpp
index cbe4466164f91..e38facead9228 100644
--- a/lib/Target/BPF/BPFInstrInfo.cpp
+++ b/lib/Target/BPF/BPFInstrInfo.cpp
@@ -13,15 +13,13 @@
#include "BPF.h"
#include "BPFInstrInfo.h"
-#include "BPFSubtarget.h"
-#include "BPFTargetMachine.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
+#include <cassert>
+#include <iterator>
#define GET_INSTRINFO_CTOR_DTOR
#include "BPFGenInstrInfo.inc"
@@ -109,11 +107,11 @@ bool BPFInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
while (std::next(I) != MBB.end())
std::next(I)->eraseFromParent();
Cond.clear();
- FBB = 0;
+ FBB = nullptr;
// Delete the J if it's equivalent to a fall-through.
if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
- TBB = 0;
+ TBB = nullptr;
I->eraseFromParent();
I = MBB.end();
continue;
diff --git a/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
index b0037fbc16ac5..9beefcdcc1d5d 100644
--- a/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
+++ b/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
@@ -12,16 +12,15 @@
//===----------------------------------------------------------------------===//
#include "BPF.h"
-#include "BPFRegisterInfo.h"
#include "BPFSubtarget.h"
#include "MCTargetDesc/BPFMCTargetDesc.h"
-
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
+#include <cstdint>
using namespace llvm;
@@ -36,14 +35,15 @@ class BPFDisassembler : public MCDisassembler {
public:
BPFDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
: MCDisassembler(STI, Ctx) {}
- virtual ~BPFDisassembler() {}
+ ~BPFDisassembler() override = default;
DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
raw_ostream &VStream,
raw_ostream &CStream) const override;
};
-}
+
+} // end anonymous namespace
static MCDisassembler *createBPFDisassembler(const Target &T,
const MCSubtargetInfo &STI,
diff --git a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
index a6cd2002c12c6..afc321ea2c34e 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -8,28 +8,24 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCDirectives.h"
-#include "llvm/MC/MCELFObjectWriter.h"
-#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCObjectWriter.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
namespace {
+
class BPFAsmBackend : public MCAsmBackend {
public:
bool IsLittleEndian;
BPFAsmBackend(bool IsLittleEndian)
: MCAsmBackend(), IsLittleEndian(IsLittleEndian) {}
- ~BPFAsmBackend() override {}
+ ~BPFAsmBackend() override = default;
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
uint64_t Value, bool IsPCRel) const override;
@@ -53,6 +49,8 @@ public:
bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
};
+} // end anonymous namespace
+
bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
if ((Count % 8) != 0)
return false;
@@ -66,7 +64,6 @@ bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
unsigned DataSize, uint64_t Value,
bool IsPCRel) const {
-
if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) {
assert(Value == 0);
} else if (Fixup.getKind() == FK_Data_4 || Fixup.getKind() == FK_Data_8) {
@@ -92,7 +89,6 @@ void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
MCObjectWriter *BPFAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
return createBPFELFObjectWriter(OS, 0, IsLittleEndian);
}
-}
MCAsmBackend *llvm::createBPFAsmBackend(const Target &T,
const MCRegisterInfo &MRI,
diff --git a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
index 3d1c0eb55afa8..ebe9abd8ffac4 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
@@ -10,29 +10,30 @@
#include "MCTargetDesc/BPFMCTargetDesc.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCFixup.h"
+#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
+#include <cstdint>
using namespace llvm;
namespace {
+
class BPFELFObjectWriter : public MCELFObjectTargetWriter {
public:
BPFELFObjectWriter(uint8_t OSABI);
-
- ~BPFELFObjectWriter() override;
+ ~BPFELFObjectWriter() override = default;
protected:
unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup, bool IsPCRel) const override;
};
-}
+
+} // end anonymous namespace
BPFELFObjectWriter::BPFELFObjectWriter(uint8_t OSABI)
: MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_BPF,
/*HasRelocationAddend*/ false) {}
-BPFELFObjectWriter::~BPFELFObjectWriter() {}
-
unsigned BPFELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
index 47f16512a3972..e8c9744798287 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
@@ -12,24 +12,25 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Endian.h"
#include "llvm/Support/EndianStream.h"
-#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+
using namespace llvm;
#define DEBUG_TYPE "mccodeemitter"
namespace {
+
class BPFMCCodeEmitter : public MCCodeEmitter {
- BPFMCCodeEmitter(const BPFMCCodeEmitter &) = delete;
- void operator=(const BPFMCCodeEmitter &) = delete;
const MCInstrInfo &MCII;
const MCRegisterInfo &MRI;
bool IsLittleEndian;
@@ -38,8 +39,9 @@ public:
BPFMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
bool IsLittleEndian)
: MCII(mcii), MRI(mri), IsLittleEndian(IsLittleEndian) {}
-
- ~BPFMCCodeEmitter() {}
+ BPFMCCodeEmitter(const BPFMCCodeEmitter &) = delete;
+ void operator=(const BPFMCCodeEmitter &) = delete;
+ ~BPFMCCodeEmitter() override = default;
// getBinaryCodeForInstr - TableGen'erated function for getting the
// binary encoding for an instruction.
@@ -66,7 +68,8 @@ private:
void verifyInstructionPredicates(const MCInst &MI,
uint64_t AvailableFeatures) const;
};
-}
+
+} // end anonymous namespace
MCCodeEmitter *llvm::createBPFMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
index 55415f97396b4..b58409730de04 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
@@ -12,14 +12,13 @@
//===----------------------------------------------------------------------===//
#include "BPF.h"
-#include "BPFMCTargetDesc.h"
-#include "BPFMCAsmInfo.h"
#include "InstPrinter/BPFInstPrinter.h"
+#include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "MCTargetDesc/BPFMCAsmInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
#include "llvm/Support/TargetRegistry.h"
#define GET_INSTRINFO_MC_DESC
@@ -64,7 +63,7 @@ static MCInstPrinter *createBPFMCInstPrinter(const Triple &T,
const MCRegisterInfo &MRI) {
if (SyntaxVariant == 0)
return new BPFInstPrinter(MAI, MII, MRI);
- return 0;
+ return nullptr;
}
extern "C" void LLVMInitializeBPFTargetMC() {
diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp
index 5fb5b02278002..df12e0e88e3bb 100644
--- a/lib/Target/TargetMachineC.cpp
+++ b/lib/Target/TargetMachineC.cpp
@@ -101,7 +101,7 @@ LLVMBool LLVMTargetHasAsmBackend(LLVMTargetRef T) {
}
LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T,
- const char* Triple, const char* CPU, const char* Features,
+ const char *Triple, const char *CPU, const char *Features,
LLVMCodeGenOptLevel Level, LLVMRelocMode Reloc,
LLVMCodeModel CodeModel) {
Optional<Reloc::Model> RM;
@@ -139,7 +139,7 @@ LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T,
TargetOptions opt;
return wrap(unwrap(T)->createTargetMachine(Triple, CPU, Features, opt, RM,
- CM, OL));
+ CM, OL));
}
void LLVMDisposeTargetMachine(LLVMTargetMachineRef T) { delete unwrap(T); }
diff --git a/lib/Target/WebAssembly/CMakeLists.txt b/lib/Target/WebAssembly/CMakeLists.txt
index f4d46383e5bb8..d9c53ecc8d084 100644
--- a/lib/Target/WebAssembly/CMakeLists.txt
+++ b/lib/Target/WebAssembly/CMakeLists.txt
@@ -17,6 +17,7 @@ add_llvm_target(WebAssemblyCodeGen
WebAssemblyExplicitLocals.cpp
WebAssemblyFastISel.cpp
WebAssemblyFixIrreducibleControlFlow.cpp
+ WebAssemblyFixFunctionBitcasts.cpp
WebAssemblyFrameLowering.cpp
WebAssemblyISelDAGToDAG.cpp
WebAssemblyISelLowering.cpp
diff --git a/lib/Target/WebAssembly/WebAssembly.h b/lib/Target/WebAssembly/WebAssembly.h
index 09c35b4825fc2..8738263ad8473 100644
--- a/lib/Target/WebAssembly/WebAssembly.h
+++ b/lib/Target/WebAssembly/WebAssembly.h
@@ -28,6 +28,7 @@ class FunctionPass;
// LLVM IR passes.
ModulePass *createWebAssemblyLowerEmscriptenEHSjLj(bool DoEH, bool DoSjLj);
void initializeWebAssemblyLowerEmscriptenEHSjLjPass(PassRegistry &);
+ModulePass *createWebAssemblyFixFunctionBitcasts();
FunctionPass *createWebAssemblyOptimizeReturned();
// ISel and immediate followup passes.
diff --git a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
new file mode 100644
index 0000000000000..d5474a02ce01a
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
@@ -0,0 +1,159 @@
+//===-- WebAssemblyFixFunctionBitcasts.cpp - Fix function bitcasts --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Fix bitcasted functions.
+///
+/// WebAssembly requires caller and callee signatures to match, but LLVM
+/// permits some amount of slop between them. Detect mismatches by looking for
+/// bitcasts of functions and rewrite them to use wrapper functions instead.
+///
+/// This doesn't catch all cases, such as when a function's address is taken in
+/// one place and cast in another, but it works for many common cases.
+///
+/// Note that LLVM already optimizes away function bitcasts in common cases by
+/// dropping arguments as needed, so this pass only ends up getting used in less
+/// common cases.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-fix-function-bitcasts"
+
+namespace {
+class FixFunctionBitcasts final : public ModulePass {
+ StringRef getPassName() const override {
+ return "WebAssembly Fix Function Bitcasts";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ ModulePass::getAnalysisUsage(AU);
+ }
+
+ bool runOnModule(Module &M) override;
+
+public:
+ static char ID;
+ FixFunctionBitcasts() : ModulePass(ID) {}
+};
+} // End anonymous namespace
+
+char FixFunctionBitcasts::ID = 0;
+ModulePass *llvm::createWebAssemblyFixFunctionBitcasts() {
+ return new FixFunctionBitcasts();
+}
+
+// Recursively descend the def-use lists from V to find non-bitcast users of
+// bitcasts of V.
+static void FindUses(Value *V, Function &F,
+ SmallVectorImpl<std::pair<Use *, Function *>> &Uses) {
+ for (Use &U : V->uses()) {
+ if (BitCastOperator *BC = dyn_cast<BitCastOperator>(U.getUser()))
+ FindUses(BC, F, Uses);
+ else if (U.get()->getType() != F.getType())
+ Uses.push_back(std::make_pair(&U, &F));
+ }
+}
+
+// Create a wrapper function with type Ty that calls F (which may have a
+// different type). Attempt to support common bitcasted function idioms:
+// - Call with more arguments than needed: arguments are dropped
+// - Call with fewer arguments than needed: arguments are filled in with undef
+// - Return value is not needed: drop it
+// - Return value needed but not present: supply an undef
+//
+// For now, return nullptr without creating a wrapper if the wrapper cannot
+// be generated due to incompatible types.
+static Function *CreateWrapper(Function *F, FunctionType *Ty) {
+ Module *M = F->getParent();
+
+ Function *Wrapper =
+ Function::Create(Ty, Function::PrivateLinkage, "bitcast", M);
+ BasicBlock *BB = BasicBlock::Create(M->getContext(), "body", Wrapper);
+
+ // Determine what arguments to pass.
+ SmallVector<Value *, 4> Args;
+ Function::arg_iterator AI = Wrapper->arg_begin();
+ FunctionType::param_iterator PI = F->getFunctionType()->param_begin();
+ FunctionType::param_iterator PE = F->getFunctionType()->param_end();
+ for (; AI != Wrapper->arg_end() && PI != PE; ++AI, ++PI) {
+ if (AI->getType() != *PI) {
+ Wrapper->eraseFromParent();
+ return nullptr;
+ }
+ Args.push_back(&*AI);
+ }
+ for (; PI != PE; ++PI)
+ Args.push_back(UndefValue::get(*PI));
+
+ CallInst *Call = CallInst::Create(F, Args, "", BB);
+
+ // Determine what value to return.
+ if (Ty->getReturnType()->isVoidTy())
+ ReturnInst::Create(M->getContext(), BB);
+ else if (F->getFunctionType()->getReturnType()->isVoidTy())
+ ReturnInst::Create(M->getContext(), UndefValue::get(Ty->getReturnType()),
+ BB);
+ else if (F->getFunctionType()->getReturnType() == Ty->getReturnType())
+ ReturnInst::Create(M->getContext(), Call, BB);
+ else {
+ Wrapper->eraseFromParent();
+ return nullptr;
+ }
+
+ return Wrapper;
+}
+
+bool FixFunctionBitcasts::runOnModule(Module &M) {
+ SmallVector<std::pair<Use *, Function *>, 0> Uses;
+
+ // Collect all the places that need wrappers.
+ for (Function &F : M)
+ FindUses(&F, F, Uses);
+
+ DenseMap<std::pair<Function *, FunctionType *>, Function *> Wrappers;
+
+ for (auto &UseFunc : Uses) {
+ Use *U = UseFunc.first;
+ Function *F = UseFunc.second;
+ PointerType *PTy = cast<PointerType>(U->get()->getType());
+ FunctionType *Ty = dyn_cast<FunctionType>(PTy->getElementType());
+
+    // If the function is cast to something like i8* as a "generic pointer" to
+    // be cast to something else later, we can't generate a wrapper for it.
+    // Just ignore such casts for now.
+ if (!Ty)
+ continue;
+
+ auto Pair = Wrappers.insert(std::make_pair(std::make_pair(F, Ty), nullptr));
+ if (Pair.second)
+ Pair.first->second = CreateWrapper(F, Ty);
+
+ Function *Wrapper = Pair.first->second;
+ if (!Wrapper)
+ continue;
+
+ if (isa<Constant>(U->get()))
+ U->get()->replaceAllUsesWith(Wrapper);
+ else
+ U->set(Wrapper);
+ }
+
+ return true;
+}
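
For reference, a minimal C++ sketch (illustrative only, not part of this patch;
the names are hypothetical) of the kind of source-level mismatch that produces
the function bitcasts this pass wraps:

    // 'handler' is defined taking an int, but the caller holds it through a
    // zero-argument function pointer type, so the call site in IR goes
    // through a bitcast of the function.
    extern "C" void handler(int x) { (void)x; }

    typedef void (*VoidFn)();

    int main() {
      VoidFn fp = reinterpret_cast<VoidFn>(&handler); // bitcast of @handler
      fp(); // caller signature () vs. callee signature (int); WebAssembly
            // rejects this mismatch unless a wrapper supplies the missing
            // argument (as undef), which is what CreateWrapper above does.
      return 0;
    }
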
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
index 8a3248ee669ea..e872dc2198460 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
@@ -40,8 +40,8 @@ defm ROTL : BinaryInt<rotl, "rotl", 0x77, 0x89>;
defm ROTR : BinaryInt<rotr, "rotr", 0x78, 0x8a>;
let isCommutable = 1 in {
-defm EQ : ComparisonInt<SETEQ, "eq ", 0x46, 0x68>;
-defm NE : ComparisonInt<SETNE, "ne ", 0x47, 0x69>;
+defm EQ : ComparisonInt<SETEQ, "eq ", 0x46, 0x51>;
+defm NE : ComparisonInt<SETNE, "ne ", 0x47, 0x52>;
} // isCommutable = 1
defm LT_S : ComparisonInt<SETLT, "lt_s", 0x48, 0x53>;
defm LT_U : ComparisonInt<SETULT, "lt_u", 0x49, 0x54>;
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index b61bc0a081435..f5ef35a2ad40d 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -163,6 +163,10 @@ void WebAssemblyPassConfig::addIRPasses() {
// control specifically what gets lowered.
addPass(createAtomicExpandPass(TM));
+ // Fix function bitcasts, as WebAssembly requires caller and callee signatures
+ // to match.
+ addPass(createWebAssemblyFixFunctionBitcasts());
+
// Optimize "returned" function attributes.
if (getOptLevel() != CodeGenOpt::None)
addPass(createWebAssemblyOptimizeReturned());
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 7f72ab17f6194..db76ddf04c069 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6962,23 +6962,24 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
}
-/// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB
-/// node.
-static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
- const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+/// Returns true iff \p BV builds a vector whose result is equivalent to the
+/// result of an ADDSUB operation.
+/// If true is returned, the operands of the ADDSUB (ADDSUB = Opnd0 +- Opnd1)
+/// are written to the parameters \p Opnd0 and \p Opnd1.
+static bool isAddSub(const BuildVectorSDNode *BV,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ SDValue &Opnd0, SDValue &Opnd1) {
+
MVT VT = BV->getSimpleValueType(0);
if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
- (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
- return SDValue();
+ (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
+ (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
+ return false;
- SDLoc DL(BV);
unsigned NumElts = VT.getVectorNumElements();
SDValue InVec0 = DAG.getUNDEF(VT);
SDValue InVec1 = DAG.getUNDEF(VT);
- assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
- VT == MVT::v2f64) && "build_vector with an invalid type found!");
-
// Odd-numbered elements in the input build vector are obtained from
// adding two integer/float elements.
// Even-numbered elements in the input build vector are obtained from
@@ -7000,7 +7001,7 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
// Early exit if we found an unexpected opcode.
if (Opcode != ExpectedOpcode)
- return SDValue();
+ return false;
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
@@ -7013,11 +7014,11 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
!isa<ConstantSDNode>(Op0.getOperand(1)) ||
!isa<ConstantSDNode>(Op1.getOperand(1)) ||
Op0.getOperand(1) != Op1.getOperand(1))
- return SDValue();
+ return false;
unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
if (I0 != i)
- return SDValue();
+ return false;
// We found a valid add/sub node. Update the information accordingly.
if (i & 1)
@@ -7029,39 +7030,118 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
if (InVec0.isUndef()) {
InVec0 = Op0.getOperand(0);
if (InVec0.getSimpleValueType() != VT)
- return SDValue();
+ return false;
}
if (InVec1.isUndef()) {
InVec1 = Op1.getOperand(0);
if (InVec1.getSimpleValueType() != VT)
- return SDValue();
+ return false;
}
// Make sure that operands in input to each add/sub node always
// come from a same pair of vectors.
if (InVec0 != Op0.getOperand(0)) {
if (ExpectedOpcode == ISD::FSUB)
- return SDValue();
+ return false;
// FADD is commutable. Try to commute the operands
// and then test again.
std::swap(Op0, Op1);
if (InVec0 != Op0.getOperand(0))
- return SDValue();
+ return false;
}
if (InVec1 != Op1.getOperand(0))
- return SDValue();
+ return false;
// Update the pair of expected opcodes.
std::swap(ExpectedOpcode, NextExpectedOpcode);
}
// Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
- if (AddFound && SubFound && !InVec0.isUndef() && !InVec1.isUndef())
- return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
+ if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
+ return false;
- return SDValue();
+ Opnd0 = InVec0;
+ Opnd1 = InVec1;
+ return true;
+}
+
+/// Returns true if it is possible to fold MUL and an idiom that has already been
+/// recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
+/// If (and only if) true is returned, the operands of FMADDSUB are written to
+/// parameters \p Opnd0, \p Opnd1, \p Opnd2.
+///
+/// Prior to calling this function it should be known that there is some
+/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
+/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
+/// before such an SDNode is replaced with the ADDSUB operation, so the number
+/// of uses of \p Opnd0 is expected to be 2.
+/// For example, this function may be called for the following IR:
+/// %AB = fmul fast <2 x double> %A, %B
+/// %Sub = fsub fast <2 x double> %AB, %C
+/// %Add = fadd fast <2 x double> %AB, %C
+/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
+/// <2 x i32> <i32 0, i32 3>
+/// There is a def for %Addsub here, which potentially can be replaced by
+/// X86ISD::ADDSUB operation:
+/// %Addsub = X86ISD::ADDSUB %AB, %C
+/// and such ADDSUB can further be replaced with FMADDSUB:
+/// %Addsub = FMADDSUB %A, %B, %C.
+///
+/// The main reason this method is called before the recognized ADDSUB idiom is
+/// replaced with an ADDSUB operation is that such a replacement is sometimes
+/// illegal, e.g. 512-bit ADDSUB is not available, while 512-bit FMADDSUB is.
+static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
+ if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
+ !Subtarget.hasAnyFMA())
+ return false;
+
+ // FIXME: These checks must match the similar ones in
+ // DAGCombiner::visitFADDForFMACombine. It would be good to have one
+ // function that would answer if it is Ok to fuse MUL + ADD to FMADD
+ // or MUL + ADDSUB to FMADDSUB.
+ const TargetOptions &Options = DAG.getTarget().Options;
+ bool AllowFusion =
+ (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
+ if (!AllowFusion)
+ return false;
+
+ Opnd2 = Opnd1;
+ Opnd1 = Opnd0.getOperand(1);
+ Opnd0 = Opnd0.getOperand(0);
+
+ return true;
+}
+
+/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' operation
+/// into an X86ISD::ADDSUB or X86ISD::FMADDSUB node, respectively.
+static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue Opnd0, Opnd1;
+ if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
+ return SDValue();
+
+ MVT VT = BV->getSimpleValueType(0);
+ SDLoc DL(BV);
+
+ // Try to generate X86ISD::FMADDSUB node here.
+ SDValue Opnd2;
+ if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
+ return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
+
+ // Do not generate X86ISD::ADDSUB node for 512-bit types even though
+ // the ADDSUB idiom has been successfully recognized. There are no known
+ // X86 targets with 512-bit ADDSUB instructions!
+ // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
+ // recognition.
+ if (VT.is512BitVector())
+ return SDValue();
+
+ return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
@@ -7290,7 +7370,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return VectorConstant;
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
- if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
+ if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
return AddSub;
if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
return HorizontalOp;
@@ -12965,6 +13045,12 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (Subtarget.hasVBMI())
return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
+  // Try to create an in-lane repeating shuffle mask and then shuffle the
+  // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}
@@ -16985,9 +17071,16 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
}
- if (Cond.getOpcode() == ISD::SETCC)
- if (SDValue NewCond = LowerSETCC(Cond, DAG))
+ if (Cond.getOpcode() == ISD::SETCC) {
+ if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
Cond = NewCond;
+ // If the condition was updated, it's possible that the operands of the
+ // select were also updated (for example, EmitTest has a RAUW). Refresh
+ // the local references to the select operands in case they got stale.
+ Op1 = Op.getOperand(1);
+ Op2 = Op.getOperand(2);
+ }
+ }
// (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
// (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
@@ -17193,22 +17286,26 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
return SDValue();
- if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
+ if (VT.is512BitVector() && InVTElt != MVT::i1) {
if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
}
- assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
+ assert (InVTElt == MVT::i1 && "Unexpected vector type");
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
- SDValue NegOne = DAG.getConstant(
- APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
- SDValue Zero = DAG.getConstant(
- APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
+ SDValue V;
+ if (Subtarget.hasDQI()) {
+ V = DAG.getNode(X86ISD::VSEXT, dl, ExtVT, In);
+ assert(!VT.is512BitVector() && "Unexpected vector type");
+ } else {
+ SDValue NegOne = getOnesVector(ExtVT, Subtarget, DAG, dl);
+ SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
+ V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
+ if (VT.is512BitVector())
+ return V;
+ }
- SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
- if (VT.is512BitVector())
- return V;
return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
}
@@ -21528,6 +21625,23 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
}
+ // It's worth extending once and using the vXi16/vXi32 shifts for smaller
+ // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
+ // make the existing SSE solution better.
+ if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
+ (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
+ (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
+ (Subtarget.hasBWI() && VT == MVT::v32i8)) {
+ MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
+ MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
+ unsigned ExtOpc =
+ Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ R = DAG.getNode(ExtOpc, dl, ExtVT, R);
+ Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
+ }
+
if (VT == MVT::v16i8 ||
(VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) {
MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
@@ -21636,19 +21750,6 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
}
}
- // It's worth extending once and using the v8i32 shifts for 16-bit types, but
- // the extra overheads to get from v16i8 to v8i32 make the existing SSE
- // solution better.
- if (Subtarget.hasInt256() && VT == MVT::v8i16) {
- MVT ExtVT = MVT::v8i32;
- unsigned ExtOpc =
- Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
- R = DAG.getNode(ExtOpc, dl, ExtVT, R);
- Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
- return DAG.getNode(ISD::TRUNCATE, dl, VT,
- DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
- }
-
if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
MVT ExtVT = MVT::v8i32;
SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
@@ -27763,29 +27864,32 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
return SDValue();
}
-/// \brief Try to combine a shuffle into a target-specific add-sub node.
+/// Returns true iff the shuffle node \p N can be replaced with an ADDSUB
+/// operation. If true is returned, the operands of the ADDSUB operation
+/// are written to the parameters \p Opnd0 and \p Opnd1.
///
-/// We combine this directly on the abstract vector shuffle nodes so it is
-/// easier to generically match. We also insert dummy vector shuffle nodes for
-/// the operands which explicitly discard the lanes which are unused by this
-/// operation to try to flow through the rest of the combiner the fact that
-/// they're unused.
-static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- SDLoc DL(N);
+/// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes
+/// so it is easier to generically match. We also insert dummy vector shuffle
+/// nodes for the operands which explicitly discard the lanes which are unused
+/// by this operation to try to flow through the rest of the combiner
+/// the fact that they're unused.
+static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
+ SDValue &Opnd0, SDValue &Opnd1) {
+
EVT VT = N->getValueType(0);
if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
- (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
- return SDValue();
+ (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
+ (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
+ return false;
// We only handle target-independent shuffles.
// FIXME: It would be easy and harmless to use the target shuffle mask
// extraction tool to support more.
if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
- return SDValue();
+ return false;
ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
- SmallVector<int, 8> Mask(OrigMask.begin(), OrigMask.end());
+ SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
SDValue V1 = N->getOperand(0);
SDValue V2 = N->getOperand(1);
@@ -27796,27 +27900,57 @@ static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget,
ShuffleVectorSDNode::commuteMask(Mask);
std::swap(V1, V2);
} else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
- return SDValue();
+ return false;
// If there are other uses of these operations we can't fold them.
if (!V1->hasOneUse() || !V2->hasOneUse())
- return SDValue();
+ return false;
// Ensure that both operations have the same operands. Note that we can
// commute the FADD operands.
SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
(V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
- return SDValue();
+ return false;
// We're looking for blends between FADD and FSUB nodes. We insist on these
// nodes being lined up in a specific expected pattern.
if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
- isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
+ isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
+ isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
+ 8, 25, 10, 27, 12, 29, 14, 31})))
+ return false;
+
+ Opnd0 = LHS;
+ Opnd1 = RHS;
+ return true;
+}
+
+/// \brief Try to combine a shuffle into a target-specific add-sub or
+/// mul-add-sub node.
+static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue Opnd0, Opnd1;
+ if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // Try to generate X86ISD::FMADDSUB node here.
+ SDValue Opnd2;
+ if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
+ return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
+
+ // Do not generate X86ISD::ADDSUB node for 512-bit types even though
+ // the ADDSUB idiom has been successfully recognized. There are no known
+ // X86 targets with 512-bit ADDSUB instructions!
+ if (VT.is512BitVector())
return SDValue();
- return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
+ return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
// We are looking for a shuffle where both sources are concatenated with undef
@@ -27878,7 +28012,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
// If we have legalized the vector types, look for blends of FADD and FSUB
// nodes that we can fuse into an ADDSUB node.
if (TLI.isTypeLegal(VT))
- if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG))
+ if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
return AddSub;
// During Type Legalization, when promoting illegal vector types,
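
For intuition (not part of this patch), a scalar C++ sketch of the two idioms
being matched above, with plain float arrays standing in for vector lanes:

    #include <cstddef>

    // Model of X86ISD::ADDSUB: even lanes subtract, odd lanes add, which is
    // the blend of FSUB/FADD results that isAddSub() recognizes.
    void addsub(float *r, const float *a, const float *b, size_t n) {
      for (size_t i = 0; i < n; ++i)
        r[i] = (i & 1) ? a[i] + b[i] : a[i] - b[i];
    }

    // Model of X86ISD::FMADDSUB: the same pattern where both the FADD and the
    // FSUB consume a single two-use FMUL, which is what isFMAddSub() checks
    // before folding the multiply into the node.
    void fmaddsub(float *r, const float *a, const float *b, const float *c,
                  size_t n) {
      for (size_t i = 0; i < n; ++i)
        r[i] = (i & 1) ? a[i] * b[i] + c[i] : a[i] * b[i] - c[i];
    }
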
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 908053e1342d0..d44d1395f2437 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -443,6 +443,22 @@ def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "",
[(set VR512:$dst, (v16i32 immAllOnesV))]>;
}
+// Alias instructions that allow VPTERNLOG to be used with a mask to create
+// a mix of all ones and all zeros elements. This is done this way to force
+// the same register to be used as input for all three sources.
+let isPseudo = 1, Predicates = [HasAVX512] in {
+def AVX512_512_SEXT_MASK_32 : I<0, Pseudo, (outs VR512:$dst),
+ (ins VK16WM:$mask), "",
+ [(set VR512:$dst, (vselect (v16i1 VK16WM:$mask),
+ (v16i32 immAllOnesV),
+ (v16i32 immAllZerosV)))]>;
+def AVX512_512_SEXT_MASK_64 : I<0, Pseudo, (outs VR512:$dst),
+ (ins VK8WM:$mask), "",
+ [(set VR512:$dst, (vselect (v8i1 VK8WM:$mask),
+ (bc_v8i64 (v16i32 immAllOnesV)),
+ (bc_v8i64 (v16i32 immAllZerosV))))]>;
+}
+
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1, Predicates = [HasVLX], SchedRW = [WriteZero] in {
def AVX512_128_SET0 : I<0, Pseudo, (outs VR128X:$dst), (ins), "",
@@ -1064,10 +1080,10 @@ def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
(v8f32 VR256X:$src), 1)>;
def : Pat<(v8f64 (X86SubVBroadcast (v4f64 VR256X:$src))),
(VINSERTF64x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
- (v4f64 VR256X:$src), 1)>;
+ (v4f64 VR256X:$src), 1)>;
def : Pat<(v8i64 (X86SubVBroadcast (v4i64 VR256X:$src))),
(VINSERTI64x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
- (v4i64 VR256X:$src), 1)>;
+ (v4i64 VR256X:$src), 1)>;
def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
(VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(v8i32 VR256X:$src), 1)>;
@@ -1485,8 +1501,7 @@ defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd",
// AVX-512 - BLEND using mask
//
multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
- let ExeDomain = _.ExeDomain in {
- let hasSideEffects = 0 in
+ let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
@@ -1496,16 +1511,13 @@ multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
- [(set _.RC:$dst, (vselect _.KRCWM:$mask,
- (_.VT _.RC:$src2),
- (_.VT _.RC:$src1)))]>, EVEX_4V, EVEX_K;
- let hasSideEffects = 0 in
+ []>, EVEX_4V, EVEX_K;
def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
[]>, EVEX_4V, EVEX_KZ;
- let mayLoad = 1, hasSideEffects = 0 in
+ let mayLoad = 1 in {
def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
@@ -1515,38 +1527,32 @@ multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
(ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
- [(set _.RC:$dst, (vselect _.KRCWM:$mask,
- (_.VT (bitconvert (_.LdFrag addr:$src2))),
- (_.VT _.RC:$src1)))]>,
- EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>;
- let mayLoad = 1, hasSideEffects = 0 in
+ []>, EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>;
def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
[]>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>;
}
+ }
}
multiclass avx512_blendmask_rmb<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
+ let mayLoad = 1, hasSideEffects = 0 in {
def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
- [(set _.RC:$dst,(vselect _.KRCWM:$mask,
- (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
- (_.VT _.RC:$src1)))]>,
- EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+ []>, EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
- let mayLoad = 1, hasSideEffects = 0 in
def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
"$dst, $src1, ${src2}", _.BroadcastStr, "}"),
[]>, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
-
+ }
}
multiclass blendmask_dq <bits<8> opc, string OpcodeStr,
@@ -1582,21 +1588,6 @@ defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", avx512vl_i8_info>;
defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W;
-let Predicates = [HasAVX512, NoVLX] in {
-def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1),
- (v8f32 VR256X:$src2))),
- (EXTRACT_SUBREG
- (v16f32 (VBLENDMPSZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
- (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
- (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_ymm)>;
-
-def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
- (v8i32 VR256X:$src2))),
- (EXTRACT_SUBREG
- (v16i32 (VPBLENDMDZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_ymm)>;
-}
//===----------------------------------------------------------------------===//
// Compare Instructions
//===----------------------------------------------------------------------===//
@@ -2735,7 +2726,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
(ins _.KRCWM:$mask, _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
"${dst} {${mask}} {z}, $src}"),
- [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask,
+ [(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask,
(_.VT _.RC:$src),
_.ImmAllZerosV)))], _.ExeDomain>,
EVEX, EVEX_KZ;
@@ -2972,6 +2963,30 @@ def : Pat<(v16i32 (vselect (xor VK16:$mask, (v16i1 immAllOnesV)),
(v16i32 VR512:$src))),
(VMOVDQA32Zrrkz VK16WM:$mask, VR512:$src)>;
+// Patterns for handling v8i1 selects of 256-bit vectors when VLX isn't
+// available. Use a 512-bit operation and extract.
+let Predicates = [HasAVX512, NoVLX] in {
+def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1),
+ (v8f32 VR256X:$src0))),
+ (EXTRACT_SUBREG
+ (v16f32
+ (VMOVAPSZrrk
+ (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src0, sub_ymm)),
+ (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
+ (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))),
+ sub_ymm)>;
+
+def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
+ (v8i32 VR256X:$src0))),
+ (EXTRACT_SUBREG
+ (v16i32
+ (VMOVDQA32Zrrk
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src0, sub_ymm)),
+ (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))),
+ sub_ymm)>;
+}
+
let Predicates = [HasVLX, NoBWI] in {
// 128-bit load/store without BWI.
def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst),
@@ -3116,13 +3131,13 @@ let Predicates = [HasVLX] in {
(VMOVDQU32Z256mr addr:$dst, (v32i8 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
}
-
-// Move Int Doubleword to Packed Double Int
-//
-let ExeDomain = SSEPackedInt in {
-def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
- "vmovd\t{$src, $dst|$dst, $src}",
- [(set VR128X:$dst,
+
+// Move Int Doubleword to Packed Double Int
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst,
(v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
EVEX;
def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src),
@@ -3152,47 +3167,47 @@ def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src
def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(store (i64 (bitconvert FR64X:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>,
- EVEX_CD8<64, CD8VT1>;
-}
-} // ExeDomain = SSEPackedInt
-
-// Move Int Doubleword to Single Scalar
-//
-let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
-def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
- "vmovd\t{$src, $dst|$dst, $src}",
- [(set FR32X:$dst, (bitconvert GR32:$src))],
+ IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>,
+ EVEX_CD8<64, CD8VT1>;
+}
+} // ExeDomain = SSEPackedInt
+
+// Move Int Doubleword to Single Scalar
+//
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set FR32X:$dst, (bitconvert GR32:$src))],
IIC_SSE_MOVDQ>, EVEX;
def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src),
- "vmovd\t{$src, $dst|$dst, $src}",
- [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
- IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
-} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
-
-// Move doubleword from xmm register to r/m32
-//
-let ExeDomain = SSEPackedInt in {
-def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
- "vmovd\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (extractelt (v4i32 VR128X:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
+ IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+// Move doubleword from xmm register to r/m32
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (extractelt (v4i32 VR128X:$src),
(iPTR 0)))], IIC_SSE_MOVD_ToGP>,
EVEX;
def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
(ins i32mem:$dst, VR128X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
- [(store (i32 (extractelt (v4i32 VR128X:$src),
- (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
- EVEX, EVEX_CD8<32, CD8VT1>;
-} // ExeDomain = SSEPackedInt
-
-// Move quadword from xmm1 register to r/m64
-//
-let ExeDomain = SSEPackedInt in {
-def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
- "vmovq\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
+ [(store (i32 (extractelt (v4i32 VR128X:$src),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
+ EVEX, EVEX_CD8<32, CD8VT1>;
+} // ExeDomain = SSEPackedInt
+
+// Move quadword from xmm1 register to r/m64
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
(iPTR 0)))],
IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W,
Requires<[HasAVX512, In64BitMode]>;
@@ -3213,39 +3228,39 @@ def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs),
let hasSideEffects = 0 in
def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst),
- (ins VR128X:$src),
- "vmovq.s\t{$src, $dst|$dst, $src}",[]>,
- EVEX, VEX_W;
-} // ExeDomain = SSEPackedInt
-
-// Move Scalar Single to Double Int
-//
-let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
-def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst),
- (ins FR32X:$src),
- "vmovd\t{$src, $dst|$dst, $src}",
+ (ins VR128X:$src),
+ "vmovq.s\t{$src, $dst|$dst, $src}",[]>,
+ EVEX, VEX_W;
+} // ExeDomain = SSEPackedInt
+
+// Move Scalar Single to Double Int
+//
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst),
+ (ins FR32X:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (bitconvert FR32X:$src))],
IIC_SSE_MOVD_ToGP>, EVEX;
def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
(ins i32mem:$dst, FR32X:$src),
- "vmovd\t{$src, $dst|$dst, $src}",
- [(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
-} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
-
-// Move Quadword Int to Packed Quadword Int
-//
-let ExeDomain = SSEPackedInt in {
-def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
- (ins i64mem:$src),
- "vmovq\t{$src, $dst|$dst, $src}",
- [(set VR128X:$dst,
- (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
- EVEX, VEX_W, EVEX_CD8<8, CD8VT8>;
-} // ExeDomain = SSEPackedInt
-
-//===----------------------------------------------------------------------===//
-// AVX-512 MOVSS, MOVSD
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+// Move Quadword Int to Packed Quadword Int
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
+ (ins i64mem:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
+ EVEX, VEX_W, EVEX_CD8<8, CD8VT8>;
+} // ExeDomain = SSEPackedInt
+
+//===----------------------------------------------------------------------===//
+// AVX-512 MOVSS, MOVSD
//===----------------------------------------------------------------------===//
multiclass avx512_move_scalar<string asm, SDNode OpNode,
@@ -8646,6 +8661,28 @@ def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
(VMOVDDUPZ128rm addr:$src)>;
def : Pat<(v2f64 (X86VBroadcast f64:$src)),
(VMOVDDUPZ128rr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+
+def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)),
+ (v2f64 VR128X:$src0)),
+ (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)),
+ (bitconvert (v4i32 immAllZerosV))),
+ (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
+
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
+ (v2f64 VR128X:$src0)),
+ (VMOVDDUPZ128rrk VR128X:$src0, VK2WM:$mask,
+ (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
+ (bitconvert (v4i32 immAllZerosV))),
+ (VMOVDDUPZ128rrkz VK2WM:$mask, (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ (v2f64 VR128X:$src0)),
+ (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ (bitconvert (v4i32 immAllZerosV))),
+ (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 579359794fbdd..e3484d062bc80 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -543,7 +543,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::MOV8rr, X86::MOV8rm, 0 },
{ X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 },
{ X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 },
- { X86::MOVDDUPrr, X86::MOVDDUPrm, 0 },
+ { X86::MOVDDUPrr, X86::MOVDDUPrm, TB_NO_REVERSE },
{ X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
{ X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
{ X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 },
@@ -661,7 +661,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 },
{ X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 },
{ X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 },
- { X86::VMOVDDUPrr, X86::VMOVDDUPrm, 0 },
+ { X86::VMOVDDUPrr, X86::VMOVDDUPrm, TB_NO_REVERSE },
{ X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 },
{ X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 },
{ X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 },
@@ -6864,6 +6864,21 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
.addReg(Reg, RegState::Undef).addImm(0xff);
return true;
}
+ case X86::AVX512_512_SEXT_MASK_32:
+ case X86::AVX512_512_SEXT_MASK_64: {
+ unsigned Reg = MIB->getOperand(0).getReg();
+ unsigned MaskReg = MIB->getOperand(1).getReg();
+ unsigned MaskState = getRegState(MIB->getOperand(1));
+ unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ?
+ X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz;
+ MI.RemoveOperand(1);
+ MIB->setDesc(get(Opc));
+ // VPTERNLOG needs 3 register inputs and an immediate.
+ // 0xff will return 1s for any input.
+ MIB.addReg(Reg, RegState::Undef).addReg(MaskReg, MaskState)
+ .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xff);
+ return true;
+ }
case X86::VMOVAPSZ128rm_NOVLX:
return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
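
As background for the VPTERNLOG expansion above (an illustrative sketch, not
part of this patch), a per-bit C++ model of the instruction's truth-table
semantics, showing why an immediate of 0xff yields all ones for any inputs:

    #include <cstdint>

    // For each bit position, the three source bits form a 3-bit index into
    // the 8-bit immediate, and the selected immediate bit is the result bit.
    uint64_t ternlog(uint64_t a, uint64_t b, uint64_t c, uint8_t imm) {
      uint64_t r = 0;
      for (int i = 0; i < 64; ++i) {
        unsigned idx = (((a >> i) & 1) << 2) | (((b >> i) & 1) << 1) |
                       ((c >> i) & 1);
        r |= static_cast<uint64_t>((imm >> idx) & 1) << i;
      }
      return r;
    }

    // With imm == 0xff every table entry is 1, so the result is all ones
    // regardless of the register inputs; under the zeroing mask (rrikz forms)
    // this leaves -1 in the mask-selected lanes and 0 elsewhere, i.e. the
    // sign-extended mask the AVX512_512_SEXT_MASK_* pseudos materialize.
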
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 4cd6ae563f03d..09971d586a413 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -6397,7 +6397,7 @@ let Predicates = [HasAVX] in {
defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround",
int_x86_sse41_round_ss,
int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
- defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG;
+ defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG;
}
let Predicates = [UseAVX] in {
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index de4839432b9ad..107ed9359376b 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -144,6 +144,10 @@ int X86TTIImpl::getArithmeticInstrCost(
}
static const CostTblEntry AVX512BWUniformConstCostTable[] = {
+ { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
+ { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
+ { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
+
{ ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
{ ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
};
@@ -168,6 +172,10 @@ int X86TTIImpl::getArithmeticInstrCost(
}
static const CostTblEntry AVX2UniformConstCostTable[] = {
+ { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
+ { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
+ { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
+
{ ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
{ ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
@@ -184,6 +192,14 @@ int X86TTIImpl::getArithmeticInstrCost(
}
static const CostTblEntry SSE2UniformConstCostTable[] = {
+ { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
+ { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
+ { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
+
+ { ISD::SHL, MVT::v32i8, 4 }, // 2*(psllw + pand).
+ { ISD::SRL, MVT::v32i8, 4 }, // 2*(psrlw + pand).
+ { ISD::SRA, MVT::v32i8, 8 }, // 2*(psrlw, pand, pxor, psubb).
+
{ ISD::SDIV, MVT::v16i16, 12 }, // pmulhw sequence
{ ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
{ ISD::UDIV, MVT::v16i16, 12 }, // pmulhuw sequence
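
The new v16i8/v32i8 shift entries above reflect the classic SSE lowering: there
is no byte-granularity shift instruction, so a uniform-constant byte shift is a
word shift plus a mask that clears the bits dragged across byte boundaries. A
minimal intrinsics sketch of that two-instruction sequence (illustrative, not
part of the patch):

    #include <emmintrin.h>

    // Shift 16 packed i8 lanes left by the constant C: psllw, then pand with
    // a per-byte mask that drops the C low bits leaked in from the lower byte.
    template <int C>
    __m128i shl_v16i8(__m128i v) {
      __m128i shifted = _mm_slli_epi16(v, C);                  // psllw
      __m128i mask    = _mm_set1_epi8((char)(0xFFu << C));
      return _mm_and_si128(shifted, mask);                     // pand
    }
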
@@ -207,6 +223,43 @@ int X86TTIImpl::getArithmeticInstrCost(
return LT.first * Entry->Cost;
}
+ static const CostTblEntry AVX2UniformCostTable[] = {
+ // Uniform splats are cheaper for the following instructions.
+ { ISD::SHL, MVT::v16i16, 1 }, // psllw.
+ { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
+ { ISD::SRA, MVT::v16i16, 1 }, // psraw.
+ };
+
+ if (ST->hasAVX2() &&
+ ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
+ (Op2Info == TargetTransformInfo::OK_UniformValue))) {
+ if (const auto *Entry =
+ CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry SSE2UniformCostTable[] = {
+ // Uniform splats are cheaper for the following instructions.
+ { ISD::SHL, MVT::v8i16, 1 }, // psllw.
+ { ISD::SHL, MVT::v4i32, 1 }, // pslld
+ { ISD::SHL, MVT::v2i64, 1 }, // psllq.
+
+ { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
+ { ISD::SRL, MVT::v4i32, 1 }, // psrld.
+ { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
+
+ { ISD::SRA, MVT::v8i16, 1 }, // psraw.
+ { ISD::SRA, MVT::v4i32, 1 }, // psrad.
+ };
+
+ if (ST->hasSSE2() &&
+ ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
+ (Op2Info == TargetTransformInfo::OK_UniformValue))) {
+ if (const auto *Entry =
+ CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
static const CostTblEntry AVX512DQCostTable[] = {
{ ISD::MUL, MVT::v2i64, 1 },
{ ISD::MUL, MVT::v4i64, 1 },
@@ -219,6 +272,10 @@ int X86TTIImpl::getArithmeticInstrCost(
return LT.first * Entry->Cost;
static const CostTblEntry AVX512BWCostTable[] = {
+ { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
+
{ ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
@@ -259,7 +316,7 @@ int X86TTIImpl::getArithmeticInstrCost(
if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
- static const CostTblEntry AVX2CostTable[] = {
+ static const CostTblEntry AVX2ShiftCostTable[] = {
// Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
// customize them to detect the cases where shift amount is a scalar one.
{ ISD::SHL, MVT::v4i32, 1 },
@@ -283,11 +340,11 @@ int X86TTIImpl::getArithmeticInstrCost(
// is lowered into a vector multiply (vpmullw).
return LT.first;
- if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
+ if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
}
- static const CostTblEntry XOPCostTable[] = {
+ static const CostTblEntry XOPShiftCostTable[] = {
// 128bit shifts take 1cy, but right shifts require negation beforehand.
{ ISD::SHL, MVT::v16i8, 1 },
{ ISD::SRL, MVT::v16i8, 2 },
@@ -318,93 +375,20 @@ int X86TTIImpl::getArithmeticInstrCost(
// Look for XOP lowering tricks.
if (ST->hasXOP())
- if (const auto *Entry = CostTableLookup(XOPCostTable, ISD, LT.second))
+ if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
- static const CostTblEntry AVX2CustomCostTable[] = {
- { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
- { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
-
- { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
- { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
-
- { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
- { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
- { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
- { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
-
- { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v8i32, 1 }, // pmulld
- { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
-
- { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
- { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
- { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
- { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
- { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
- { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
- };
-
- // Look for AVX2 lowering tricks for custom cases.
- if (ST->hasAVX2())
- if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD,
- LT.second))
- return LT.first * Entry->Cost;
-
- static const CostTblEntry AVXCustomCostTable[] = {
- { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
-
- { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
- { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
- { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
- { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
- { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
- { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
-
- // Vectorizing division is a bad idea. See the SSE2 table for more comments.
- { ISD::SDIV, MVT::v32i8, 32*20 },
- { ISD::SDIV, MVT::v16i16, 16*20 },
- { ISD::SDIV, MVT::v8i32, 8*20 },
- { ISD::SDIV, MVT::v4i64, 4*20 },
- { ISD::UDIV, MVT::v32i8, 32*20 },
- { ISD::UDIV, MVT::v16i16, 16*20 },
- { ISD::UDIV, MVT::v8i32, 8*20 },
- { ISD::UDIV, MVT::v4i64, 4*20 },
- };
-
- // Look for AVX2 lowering tricks for custom cases.
- if (ST->hasAVX())
- if (const auto *Entry = CostTableLookup(AVXCustomCostTable, ISD,
- LT.second))
- return LT.first * Entry->Cost;
-
- static const CostTblEntry
- SSE2UniformCostTable[] = {
+ static const CostTblEntry SSE2UniformShiftCostTable[] = {
// Uniform splats are cheaper for the following instructions.
- { ISD::SHL, MVT::v16i8, 1 }, // psllw.
- { ISD::SHL, MVT::v32i8, 2 }, // psllw.
- { ISD::SHL, MVT::v8i16, 1 }, // psllw.
{ ISD::SHL, MVT::v16i16, 2 }, // psllw.
- { ISD::SHL, MVT::v4i32, 1 }, // pslld
{ ISD::SHL, MVT::v8i32, 2 }, // pslld
- { ISD::SHL, MVT::v2i64, 1 }, // psllq.
{ ISD::SHL, MVT::v4i64, 2 }, // psllq.
- { ISD::SRL, MVT::v16i8, 1 }, // psrlw.
- { ISD::SRL, MVT::v32i8, 2 }, // psrlw.
- { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
{ ISD::SRL, MVT::v16i16, 2 }, // psrlw.
- { ISD::SRL, MVT::v4i32, 1 }, // psrld.
{ ISD::SRL, MVT::v8i32, 2 }, // psrld.
- { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
{ ISD::SRL, MVT::v4i64, 2 }, // psrlq.
- { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
- { ISD::SRA, MVT::v32i8, 8 }, // psrlw, pand, pxor, psubb.
- { ISD::SRA, MVT::v8i16, 1 }, // psraw.
{ ISD::SRA, MVT::v16i16, 2 }, // psraw.
- { ISD::SRA, MVT::v4i32, 1 }, // psrad.
{ ISD::SRA, MVT::v8i32, 2 }, // psrad.
{ ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle.
{ ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle.
@@ -414,7 +398,7 @@ int X86TTIImpl::getArithmeticInstrCost(
((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
(Op2Info == TargetTransformInfo::OK_UniformValue))) {
if (const auto *Entry =
- CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
+ CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
}
@@ -422,24 +406,98 @@ int X86TTIImpl::getArithmeticInstrCost(
Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
MVT VT = LT.second;
// Vector shift left by non uniform constant can be lowered
- // into vector multiply (pmullw/pmulld).
- if ((VT == MVT::v8i16 && ST->hasSSE2()) ||
- (VT == MVT::v4i32 && ST->hasSSE41()))
- return LT.first;
-
- // v16i16 and v8i32 shifts by non-uniform constants are lowered into a
- // sequence of extract + two vector multiply + insert.
- if ((VT == MVT::v8i32 || VT == MVT::v16i16) &&
- (ST->hasAVX() && !ST->hasAVX2()))
- ISD = ISD::MUL;
-
- // A vector shift left by non uniform constant is converted
- // into a vector multiply; the new multiply is eventually
- // lowered into a sequence of shuffles and 2 x pmuludq.
- if (VT == MVT::v4i32 && ST->hasSSE2())
+ // into vector multiply.
+ if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
+ ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
ISD = ISD::MUL;
}
+ static const CostTblEntry AVX2CostTable[] = {
+ { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
+ { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
+
+ { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
+ { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
+
+ { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
+ { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
+ { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
+ { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
+
+ { ISD::SUB, MVT::v32i8, 1 }, // psubb
+ { ISD::ADD, MVT::v32i8, 1 }, // paddb
+ { ISD::SUB, MVT::v16i16, 1 }, // psubw
+ { ISD::ADD, MVT::v16i16, 1 }, // paddw
+ { ISD::SUB, MVT::v8i32, 1 }, // psubd
+ { ISD::ADD, MVT::v8i32, 1 }, // paddd
+ { ISD::SUB, MVT::v4i64, 1 }, // psubq
+ { ISD::ADD, MVT::v4i64, 1 }, // paddq
+
+ { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v16i16, 1 }, // pmullw
+ { ISD::MUL, MVT::v8i32, 1 }, // pmulld
+ { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
+
+ { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
+ };
+
+ // Look for AVX2 lowering tricks for custom cases.
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry AVX1CostTable[] = {
+ // We don't have to scalarize unsupported ops. We can issue two half-sized
+ // operations and we only need to extract the upper YMM half.
+ // Two ops + 1 extract + 1 insert = 4.
+ { ISD::MUL, MVT::v16i16, 4 },
+ { ISD::MUL, MVT::v8i32, 4 },
+ { ISD::SUB, MVT::v32i8, 4 },
+ { ISD::ADD, MVT::v32i8, 4 },
+ { ISD::SUB, MVT::v16i16, 4 },
+ { ISD::ADD, MVT::v16i16, 4 },
+ { ISD::SUB, MVT::v8i32, 4 },
+ { ISD::ADD, MVT::v8i32, 4 },
+ { ISD::SUB, MVT::v4i64, 4 },
+ { ISD::ADD, MVT::v4i64, 4 },
+
+ // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
+ // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
+ // Because we believe v4i64 to be a legal type, we must also include the
+ // extract+insert in the cost table. Therefore, the cost here is 18
+ // instead of 8.
+ { ISD::MUL, MVT::v4i64, 18 },
+
+ { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
+
+ { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
+
+ // Vectorizing division is a bad idea. See the SSE2 table for more comments.
+ { ISD::SDIV, MVT::v32i8, 32*20 },
+ { ISD::SDIV, MVT::v16i16, 16*20 },
+ { ISD::SDIV, MVT::v8i32, 8*20 },
+ { ISD::SDIV, MVT::v4i64, 4*20 },
+ { ISD::UDIV, MVT::v32i8, 32*20 },
+ { ISD::UDIV, MVT::v16i16, 16*20 },
+ { ISD::UDIV, MVT::v8i32, 8*20 },
+ { ISD::UDIV, MVT::v4i64, 4*20 },
+ };
+
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
static const CostTblEntry SSE42CostTable[] = {
{ ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
@@ -456,6 +514,8 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SHL, MVT::v32i8, 2*11 }, // pblendvb sequence.
{ ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
{ ISD::SHL, MVT::v16i16, 2*14 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
+ { ISD::SHL, MVT::v8i32, 2*4 }, // pslld/paddd/cvttps2dq/pmulld
{ ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
{ ISD::SRL, MVT::v32i8, 2*12 }, // pblendvb sequence.
@@ -501,6 +561,7 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence.
{ ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v8i16, 1 }, // pmullw
{ ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
{ ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
@@ -516,46 +577,19 @@ int X86TTIImpl::getArithmeticInstrCost(
// generally a bad idea. Assume somewhat arbitrarily that we have to be able
// to hide "20 cycles" for each lane.
{ ISD::SDIV, MVT::v16i8, 16*20 },
- { ISD::SDIV, MVT::v8i16, 8*20 },
- { ISD::SDIV, MVT::v4i32, 4*20 },
- { ISD::SDIV, MVT::v2i64, 2*20 },
+ { ISD::SDIV, MVT::v8i16, 8*20 },
+ { ISD::SDIV, MVT::v4i32, 4*20 },
+ { ISD::SDIV, MVT::v2i64, 2*20 },
{ ISD::UDIV, MVT::v16i8, 16*20 },
- { ISD::UDIV, MVT::v8i16, 8*20 },
- { ISD::UDIV, MVT::v4i32, 4*20 },
- { ISD::UDIV, MVT::v2i64, 2*20 },
+ { ISD::UDIV, MVT::v8i16, 8*20 },
+ { ISD::UDIV, MVT::v4i32, 4*20 },
+ { ISD::UDIV, MVT::v2i64, 2*20 },
};
if (ST->hasSSE2())
if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
- static const CostTblEntry AVX1CostTable[] = {
- // We don't have to scalarize unsupported ops. We can issue two half-sized
- // operations and we only need to extract the upper YMM half.
- // Two ops + 1 extract + 1 insert = 4.
- { ISD::MUL, MVT::v16i16, 4 },
- { ISD::MUL, MVT::v8i32, 4 },
- { ISD::SUB, MVT::v32i8, 4 },
- { ISD::ADD, MVT::v32i8, 4 },
- { ISD::SUB, MVT::v16i16, 4 },
- { ISD::ADD, MVT::v16i16, 4 },
- { ISD::SUB, MVT::v8i32, 4 },
- { ISD::ADD, MVT::v8i32, 4 },
- { ISD::SUB, MVT::v4i64, 4 },
- { ISD::ADD, MVT::v4i64, 4 },
- // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
- // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
- // Because we believe v4i64 to be a legal type, we must also include the
- // extract+insert in the cost table. Therefore, the cost here is 18
- // instead of 8.
- { ISD::MUL, MVT::v4i64, 18 },
- };
-
- // Look for AVX1 lowering tricks.
- if (ST->hasAVX() && !ST->hasAVX2())
- if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
- return LT.first * Entry->Cost;
-
static const CostTblEntry SSE1CostTable[] = {
{ ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
@@ -639,8 +673,7 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw
{ TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw
- { TTI::SK_Reverse, MVT::v64i8, 6 }, // vextracti64x4 + 2*vperm2i128
- // + 2*pshufb + vinserti64x4
+ { TTI::SK_Reverse, MVT::v64i8, 2 }, // pshufb + vshufi64x2
{ TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw
{ TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw
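The reshuffled tables above all follow the same X86TTI lookup pattern: each ISA level owns a table of (opcode, type, cost) entries, the tables are consulted from the newest feature the subtarget supports down to SSE1, and the first hit wins. A minimal standalone sketch of that first-hit lookup, using illustrative names rather than LLVM's own CostTblEntry/CostTableLookup helpers:

#include <cstddef>

struct CostEntry { int Opcode; int Type; unsigned Cost; };

// First-hit lookup; returning nullptr lets the caller fall through to the
// table for the next older ISA level, and finally to the generic cost model.
template <std::size_t N>
const CostEntry *lookupCost(const CostEntry (&Table)[N], int Opcode, int Type) {
  for (const CostEntry &E : Table)
    if (E.Opcode == Opcode && E.Type == Type)
      return &E;
  return nullptr;
}

In getArithmeticInstrCost the hit is then multiplied by LT.first, the number of legal-type operations the original IR operation is split into during type legalization.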
diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp
index f4742aaf748f1..82daf754be0dd 100644
--- a/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -42,6 +42,8 @@
using namespace llvm;
using namespace lowertypetests;
+using SummaryAction = LowerTypeTestsSummaryAction;
+
#define DEBUG_TYPE "lowertypetests"
STATISTIC(ByteArraySizeBits, "Byte array size in bits");
@@ -55,9 +57,15 @@ static cl::opt<bool> AvoidReuse(
cl::desc("Try to avoid reuse of byte array addresses using aliases"),
cl::Hidden, cl::init(true));
-static cl::opt<std::string> ClSummaryAction(
+static cl::opt<SummaryAction> ClSummaryAction(
"lowertypetests-summary-action",
- cl::desc("What to do with the summary when running this pass"), cl::Hidden);
+ cl::desc("What to do with the summary when running this pass"),
+ cl::values(clEnumValN(SummaryAction::None, "none", "Do nothing"),
+ clEnumValN(SummaryAction::Import, "import",
+ "Import typeid resolutions from summary and globals"),
+ clEnumValN(SummaryAction::Export, "export",
+ "Export typeid resolutions to summary and globals")),
+ cl::Hidden);
static cl::opt<std::string> ClReadSummary(
"lowertypetests-read-summary",
@@ -226,8 +234,8 @@ public:
class LowerTypeTestsModule {
Module &M;
- // This is for testing purposes only.
- std::unique_ptr<ModuleSummaryIndex> OwnedSummary;
+ SummaryAction Action;
+ ModuleSummaryIndex *Summary;
bool LinkerSubsectionsViaSymbols;
Triple::ArchType Arch;
@@ -319,21 +327,38 @@ class LowerTypeTestsModule {
void createJumpTable(Function *F, ArrayRef<GlobalTypeMember *> Functions);
public:
- LowerTypeTestsModule(Module &M);
- ~LowerTypeTestsModule();
+ LowerTypeTestsModule(Module &M, SummaryAction Action,
+ ModuleSummaryIndex *Summary);
bool lower();
+
+ // Lower the module using the action and summary passed as command line
+ // arguments. For testing purposes only.
+ static bool runForTesting(Module &M);
};
struct LowerTypeTests : public ModulePass {
static char ID;
- LowerTypeTests() : ModulePass(ID) {
+
+ bool UseCommandLine = false;
+
+ SummaryAction Action;
+ ModuleSummaryIndex *Summary;
+
+ LowerTypeTests() : ModulePass(ID), UseCommandLine(true) {
+ initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
+ }
+
+ LowerTypeTests(SummaryAction Action, ModuleSummaryIndex *Summary)
+ : ModulePass(ID), Action(Action), Summary(Summary) {
initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
}
bool runOnModule(Module &M) override {
if (skipModule(M))
return false;
- return LowerTypeTestsModule(M).lower();
+ if (UseCommandLine)
+ return LowerTypeTestsModule::runForTesting(M);
+ return LowerTypeTestsModule(M, Action, Summary).lower();
}
};
@@ -343,7 +368,10 @@ INITIALIZE_PASS(LowerTypeTests, "lowertypetests", "Lower type metadata", false,
false)
char LowerTypeTests::ID = 0;
-ModulePass *llvm::createLowerTypeTestsPass() { return new LowerTypeTests; }
+ModulePass *llvm::createLowerTypeTestsPass(SummaryAction Action,
+ ModuleSummaryIndex *Summary) {
+ return new LowerTypeTests(Action, Summary);
+}
/// Build a bit set for TypeId using the object layouts in
/// GlobalLayout.
@@ -1145,22 +1173,12 @@ void LowerTypeTestsModule::buildBitSetsFromDisjointSet(
}
/// Lower all type tests in this module.
-LowerTypeTestsModule::LowerTypeTestsModule(Module &M) : M(M) {
- // Handle the command-line summary arguments. This code is for testing
- // purposes only, so we handle errors directly.
- if (!ClSummaryAction.empty()) {
- OwnedSummary = make_unique<ModuleSummaryIndex>();
- if (!ClReadSummary.empty()) {
- ExitOnError ExitOnErr("-lowertypetests-read-summary: " + ClReadSummary +
- ": ");
- auto ReadSummaryFile =
- ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary)));
-
- yaml::Input In(ReadSummaryFile->getBuffer());
- In >> *OwnedSummary;
- ExitOnErr(errorCodeToError(In.error()));
- }
- }
+LowerTypeTestsModule::LowerTypeTestsModule(Module &M, SummaryAction Action,
+ ModuleSummaryIndex *Summary)
+ : M(M), Action(Action), Summary(Summary) {
+ // FIXME: Use these fields.
+ (void)this->Action;
+ (void)this->Summary;
Triple TargetTriple(M.getTargetTriple());
LinkerSubsectionsViaSymbols = TargetTriple.isMacOSX();
@@ -1169,18 +1187,36 @@ LowerTypeTestsModule::LowerTypeTestsModule(Module &M) : M(M) {
ObjectFormat = TargetTriple.getObjectFormat();
}
-LowerTypeTestsModule::~LowerTypeTestsModule() {
- if (ClSummaryAction.empty() || ClWriteSummary.empty())
- return;
+bool LowerTypeTestsModule::runForTesting(Module &M) {
+ ModuleSummaryIndex Summary;
- ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary +
- ": ");
- std::error_code EC;
- raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text);
- ExitOnErr(errorCodeToError(EC));
+ // Handle the command-line summary arguments. This code is for testing
+ // purposes only, so we handle errors directly.
+ if (!ClReadSummary.empty()) {
+ ExitOnError ExitOnErr("-lowertypetests-read-summary: " + ClReadSummary +
+ ": ");
+ auto ReadSummaryFile =
+ ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary)));
+
+ yaml::Input In(ReadSummaryFile->getBuffer());
+ In >> Summary;
+ ExitOnErr(errorCodeToError(In.error()));
+ }
+
+ bool Changed = LowerTypeTestsModule(M, ClSummaryAction, &Summary).lower();
+
+ if (!ClWriteSummary.empty()) {
+ ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary +
+ ": ");
+ std::error_code EC;
+ raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text);
+ ExitOnErr(errorCodeToError(EC));
+
+ yaml::Output Out(OS);
+ Out << Summary;
+ }
- yaml::Output Out(OS);
- Out << *OwnedSummary;
+ return Changed;
}
bool LowerTypeTestsModule::lower() {
@@ -1313,7 +1349,8 @@ bool LowerTypeTestsModule::lower() {
PreservedAnalyses LowerTypeTestsPass::run(Module &M,
ModuleAnalysisManager &AM) {
- bool Changed = LowerTypeTestsModule(M).lower();
+ bool Changed =
+ LowerTypeTestsModule(M, SummaryAction::None, /*Summary=*/nullptr).lower();
if (!Changed)
return PreservedAnalyses::all();
return PreservedAnalyses::none();
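The -lowertypetests-summary-action change above moves from a string option to an enum-valued cl::opt. A small self-contained sketch of that CommandLine pattern, with an illustrative enum and flag name rather than the pass's own:

#include "llvm/Support/CommandLine.h"
using namespace llvm;

enum class SummaryMode { None, Import, Export };

// clEnumValN binds an enum constant to a command-line spelling, so
// "-example-summary-mode=import" parses directly into SummaryMode::Import.
static cl::opt<SummaryMode> ExampleSummaryMode(
    "example-summary-mode",
    cl::desc("Illustrative enum-valued option"),
    cl::values(clEnumValN(SummaryMode::None, "none", "Do nothing"),
               clEnumValN(SummaryMode::Import, "import", "Import resolutions"),
               clEnumValN(SummaryMode::Export, "export", "Export resolutions")),
    cl::Hidden, cl::init(SummaryMode::None));

Unlike the old cl::opt<std::string>, an unrecognized spelling is rejected by the option parser itself, so the pass no longer needs to validate the string.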
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 293ddf21a68f7..d086ee05a64fe 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -857,7 +857,8 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) {
// Lower type metadata and the type.test intrinsic. This pass supports Clang's
// control flow integrity mechanisms (-fsanitize=cfi*) and needs to run at
// link time if CFI is enabled. The pass does nothing if CFI is disabled.
- PM.add(createLowerTypeTestsPass());
+ PM.add(createLowerTypeTestsPass(LowerTypeTestsSummaryAction::None,
+ /*Summary=*/nullptr));
if (OptLevel != 0)
addLateLTOOptimizationPasses(PM);
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 012bfc7b4944c..013159cde7740 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1903,7 +1903,7 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
return foldICmpShlOne(Cmp, Shl, C);
// Check that the shift amount is in range. If not, don't perform undefined
- // shifts. When the shift is visited it will be simplified.
+ // shifts. When the shift is visited, it will be simplified.
unsigned TypeBits = C->getBitWidth();
if (ShiftAmt->uge(TypeBits))
return nullptr;
@@ -1923,7 +1923,7 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
return new ICmpInst(Pred, X, LShrC);
if (Shl->hasOneUse()) {
- // Otherwise strength reduce the shift into an and.
+ // Otherwise, strength reduce the shift into an and.
Constant *Mask = ConstantInt::get(Shl->getType(),
APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt->getZExtValue()));
@@ -1951,7 +1951,7 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
}
// When the shift is nuw and pred is >u or <=u, comparison only really happens
- // in the pre-shifted bits. Since InstSimplify canoncalizes <=u into <u, the
+ // in the pre-shifted bits. Since InstSimplify canonicalizes <=u into <u, the
// <=u case can be further converted to match <u (see below).
if (Shl->hasNoUnsignedWrap() &&
(Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULT)) {
@@ -1970,9 +1970,9 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
// Transform (icmp pred iM (shl iM %v, N), C)
// -> (icmp pred i(M-N) (trunc %v iM to i(M-N)), (trunc (C>>N))
// Transform the shl to a trunc if (trunc (C>>N)) has no loss and M-N.
- // This enables us to get rid of the shift in favor of a trunc which can be
+ // This enables us to get rid of the shift in favor of a trunc that may be
// free on the target. It has the additional benefit of comparing to a
- // smaller constant, which will be target friendly.
+ // smaller constant that may be more target-friendly.
unsigned Amt = ShiftAmt->getLimitedValue(TypeBits - 1);
if (Shl->hasOneUse() && Amt != 0 && C->countTrailingZeros() >= Amt &&
DL.isLegalInteger(TypeBits - Amt)) {
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 1d55283987766..54bdc9e0772b5 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1818,6 +1818,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
RegisteredFlag = new GlobalVariable(
M, IntptrTy, false, GlobalVariable::CommonLinkage,
ConstantInt::get(IntptrTy, 0), kAsanGlobalsRegisteredFlagName);
+ RegisteredFlag->setVisibility(GlobalVariable::HiddenVisibility);
// Update llvm.compiler.used, adding the new liveness globals. This is
// needed so that during LTO these variables stay alive. The alternative
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 6aeb5237ffe35..68faa886060a1 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -1423,7 +1423,7 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
if (widenLoopCompare(DU))
return nullptr;
- // This user does not evaluate to a recurence after widening, so don't
+ // This user does not evaluate to a recurrence after widening, so don't
// follow it. Instead insert a Trunc to kill off the original use,
// eventually isolating the original narrow IV so it can be removed.
truncateIVUse(DU, DT, LI);
diff --git a/lib/Transforms/Scalar/LoopLoadElimination.cpp b/lib/Transforms/Scalar/LoopLoadElimination.cpp
index 08e7acdaaf72c..8fb580183e307 100644
--- a/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -415,7 +415,9 @@ public:
Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(),
PH->getTerminator());
Value *Initial =
- new LoadInst(InitialPtr, "load_initial", PH->getTerminator());
+ new LoadInst(InitialPtr, "load_initial", /* isVolatile */ false,
+ Cand.Load->getAlignment(), PH->getTerminator());
+
PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded",
&L->getHeader()->front());
PHI->addIncoming(Initial, PH);
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index 6f7682c96cefb..76fe91884c7b8 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -1382,8 +1382,8 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
Pred->getInstList().splice(BI->getIterator(), Succ->getInstList(),
Succ->begin(), Succ->end());
LPM->deleteSimpleAnalysisValue(BI, L);
- BI->eraseFromParent();
RemoveFromWorklist(BI, Worklist);
+ BI->eraseFromParent();
// Remove Succ from the loop tree.
LI->removeBlock(Succ);
diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index 8b8236390bf47..eef7db08cd460 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -79,7 +79,8 @@ STATISTIC(NumGVNInstrDeleted, "Number of instructions deleted");
STATISTIC(NumGVNBlocksDeleted, "Number of blocks deleted");
STATISTIC(NumGVNOpsSimplified, "Number of Expressions simplified");
STATISTIC(NumGVNPhisAllSame, "Number of PHIs whose arguments are all the same");
-STATISTIC(NumGVNMaxIterations, "Maximum Number of iterations it took to converge GVN");
+STATISTIC(NumGVNMaxIterations,
+ "Maximum Number of iterations it took to converge GVN");
//===----------------------------------------------------------------------===//
// GVN Pass
@@ -327,7 +328,7 @@ private:
// Elimination.
struct ValueDFS;
void convertDenseToDFSOrdered(CongruenceClass::MemberSet &,
- std::vector<ValueDFS> &);
+ SmallVectorImpl<ValueDFS> &);
bool eliminateInstructions(Function &);
void replaceInstruction(Instruction *, Value *);
@@ -336,8 +337,11 @@ private:
// New instruction creation.
void handleNewInstruction(Instruction *){};
+
+ // Various instruction touch utilities
void markUsersTouched(Value *);
void markMemoryUsersTouched(MemoryAccess *);
+ void markLeaderChangeTouched(CongruenceClass *CC);
// Utilities.
void cleanupTables();
@@ -390,10 +394,10 @@ INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_END(NewGVN, "newgvn", "Global Value Numbering", false, false)
PHIExpression *NewGVN::createPHIExpression(Instruction *I) {
- BasicBlock *PhiBlock = I->getParent();
+ BasicBlock *PHIBlock = I->getParent();
auto *PN = cast<PHINode>(I);
- auto *E = new (ExpressionAllocator)
- PHIExpression(PN->getNumOperands(), I->getParent());
+ auto *E =
+ new (ExpressionAllocator) PHIExpression(PN->getNumOperands(), PHIBlock);
E->allocateOperands(ArgRecycler, ExpressionAllocator);
E->setType(I->getType());
@@ -408,10 +412,10 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I) {
std::transform(Filtered.begin(), Filtered.end(), op_inserter(E),
[&](const Use &U) -> Value * {
- // Don't try to transform self-defined phis
+ // Don't try to transform self-defined phis.
if (U == PN)
return PN;
- const BasicBlockEdge BBE(PN->getIncomingBlock(U), PhiBlock);
+ const BasicBlockEdge BBE(PN->getIncomingBlock(U), PHIBlock);
return lookupOperandLeader(U, I, BBE);
});
return E;
@@ -710,6 +714,15 @@ const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI,
return E;
}
+// Utility function to check whether the congruence class has a member other
+// than the given instruction.
+bool hasMemberOtherThanUs(const CongruenceClass *CC, Instruction *I) {
+ // Either it has more than one member, in which case it must contain something
+ // other than us (because it's indexed by value), or if it only has one member
+ // right now, that member should not be us.
+ return CC->Members.size() > 1 || CC->Members.count(I) == 0;
+}
+
const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I,
const BasicBlock *B) {
// Unlike loads, we never try to eliminate stores, so we do not check if they
@@ -725,8 +738,12 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I,
cast<MemoryDef>(StoreAccess)->getDefiningAccess());
const Expression *OldStore = createStoreExpression(SI, StoreRHS, B);
CongruenceClass *CC = ExpressionToClass.lookup(OldStore);
+ // Basically, check if the congruence class the store is in is defined by a
+ // store that isn't us, and has the same value. MemorySSA takes care of
+ // ensuring the store has the same memory state as us already.
if (CC && CC->DefiningExpr && isa<StoreExpression>(CC->DefiningExpr) &&
- CC->RepLeader == lookupOperandLeader(SI->getValueOperand(), SI, B))
+ CC->RepLeader == lookupOperandLeader(SI->getValueOperand(), SI, B) &&
+ hasMemberOtherThanUs(CC, I))
return createStoreExpression(SI, StoreRHS, B);
}
@@ -810,36 +827,50 @@ bool NewGVN::setMemoryAccessEquivTo(MemoryAccess *From, MemoryAccess *To) {
const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I,
const BasicBlock *B) {
auto *E = cast<PHIExpression>(createPHIExpression(I));
- if (E->op_empty()) {
+ // We match the semantics of SimplifyPhiNode from InstructionSimplify here.
+
+ // See if all arguments are the same.
+ // We track if any were undef because they need special handling.
+ bool HasUndef = false;
+ auto Filtered = make_filter_range(E->operands(), [&](const Value *Arg) {
+ if (Arg == I)
+ return false;
+ if (isa<UndefValue>(Arg)) {
+ HasUndef = true;
+ return false;
+ }
+ return true;
+ });
+ // If we are left with no operands, it's undef
+ if (Filtered.begin() == Filtered.end()) {
DEBUG(dbgs() << "Simplified PHI node " << *I << " to undef"
<< "\n");
E->deallocateOperands(ArgRecycler);
ExpressionAllocator.Deallocate(E);
return createConstantExpression(UndefValue::get(I->getType()));
}
-
- Value *AllSameValue = E->getOperand(0);
-
- // See if all arguments are the same, ignoring undef arguments, because we can
- // choose a value that is the same for them.
- for (const Value *Arg : E->operands())
- if (Arg != AllSameValue && !isa<UndefValue>(Arg)) {
- AllSameValue = nullptr;
- break;
+ Value *AllSameValue = *(Filtered.begin());
+ ++Filtered.begin();
+ // Can't use std::equal here, sadly, because filter.begin moves.
+ if (llvm::all_of(Filtered, [AllSameValue](const Value *V) {
+ return V == AllSameValue;
+ })) {
+ // In LLVM's non-standard representation of phi nodes, it's possible to have
+ // phi nodes with cycles (IE dependent on other phis that are .... dependent
+ // on the original phi node), especially in weird CFG's where some arguments
+ // are unreachable, or uninitialized along certain paths. This can cause
+ // infinite loops during evaluation. We work around this by not trying to
+ // really evaluate them independently, but instead using a variable
+ // expression to say if one is equivalent to the other.
+ // We also special case undef, so that if we have an undef, we can't use the
+ // common value unless it dominates the phi block.
+ if (HasUndef) {
+ // Only have to check for instructions
+ if (auto *AllSameInst = dyn_cast<Instruction>(AllSameValue))
+ if (!DT->dominates(AllSameInst, I))
+ return E;
}
- if (AllSameValue) {
- // It's possible to have phi nodes with cycles (IE dependent on
- // other phis that are .... dependent on the original phi node),
- // especially in weird CFG's where some arguments are unreachable, or
- // uninitialized along certain paths.
- // This can cause infinite loops during evaluation (even if you disable
- // the recursion below, you will simply ping-pong between congruence
- // classes). If a phi node symbolically evaluates to another phi node,
- // just leave it alone. If they are really the same, we will still
- // eliminate them in favor of each other.
- if (isa<PHINode>(AllSameValue))
- return E;
NumGVNPhisAllSame++;
DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue
<< "\n");
@@ -1007,12 +1038,22 @@ void NewGVN::markMemoryUsersTouched(MemoryAccess *MA) {
}
}
+// Touch the instructions that need to be updated after a congruence class has a
+// leader change, and mark changed values.
+void NewGVN::markLeaderChangeTouched(CongruenceClass *CC) {
+ for (auto M : CC->Members) {
+ if (auto *I = dyn_cast<Instruction>(M))
+ TouchedInstructions.set(InstrDFS[I]);
+ ChangedValues.insert(M);
+ }
+}
+
// Perform congruence finding on a given value numbering expression.
void NewGVN::performCongruenceFinding(Value *V, const Expression *E) {
-
ValueToExpression[V] = E;
// This is guaranteed to return something, since it will at least find
// INITIAL.
+
CongruenceClass *VClass = ValueToClass[V];
assert(VClass && "Should have found a vclass");
// Dead classes should have been eliminated from the mapping.
@@ -1031,14 +1072,17 @@ void NewGVN::performCongruenceFinding(Value *V, const Expression *E) {
place->second = NewClass;
// Constants and variables should always be made the leader.
- if (const auto *CE = dyn_cast<ConstantExpression>(E))
+ if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
NewClass->RepLeader = CE->getConstantValue();
- else if (const auto *VE = dyn_cast<VariableExpression>(E))
- NewClass->RepLeader = VE->getVariableValue();
- else if (const auto *SE = dyn_cast<StoreExpression>(E))
- NewClass->RepLeader = SE->getStoreInst()->getValueOperand();
- else
+ } else if (const auto *SE = dyn_cast<StoreExpression>(E)) {
+ StoreInst *SI = SE->getStoreInst();
+ NewClass->RepLeader =
+ lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent());
+ } else {
NewClass->RepLeader = V;
+ }
+ assert(!isa<VariableExpression>(E) &&
+ "VariableExpression should have been handled already");
EClass = NewClass;
DEBUG(dbgs() << "Created new congruence class for " << *V
@@ -1077,14 +1121,11 @@ void NewGVN::performCongruenceFinding(Value *V, const Expression *E) {
ExpressionToClass.erase(VClass->DefiningExpr);
}
} else if (VClass->RepLeader == V) {
- // FIXME: When the leader changes, the value numbering of
- // everything may change, so we need to reprocess.
+ // When the leader changes, the value numbering of
+ // everything may change due to symbolization changes, so we need to
+ // reprocess.
VClass->RepLeader = *(VClass->Members.begin());
- for (auto M : VClass->Members) {
- if (auto *I = dyn_cast<Instruction>(M))
- TouchedInstructions.set(InstrDFS[I]);
- ChangedValues.insert(M);
- }
+ markLeaderChangeTouched(VClass);
}
}
@@ -1106,6 +1147,27 @@ void NewGVN::performCongruenceFinding(Value *V, const Expression *E) {
markMemoryUsersTouched(MA);
}
}
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(V)) {
+ // There is, sadly, one complicating thing for stores. Stores do not
+ // produce values, only consume them. However, in order to make loads and
+ // stores value number the same, we ignore the value operand of the store.
+ // But the value operand will still be the leader of our class, and thus, it
+ // may change. Because the store is a use, the store will get reprocessed,
+ // but nothing will change about it, and so nothing above will catch it
+ // (since the class will not change). In order to make sure everything ends
+ // up okay, we need to recheck the leader of the class. Since stores of
+ // different values value number differently due to different memorydefs, we
+ // are guaranteed the leader is always the same between stores in the same
+ // class.
+ DEBUG(dbgs() << "Checking store leader\n");
+ auto ProperLeader =
+ lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent());
+ if (EClass->RepLeader != ProperLeader) {
+ DEBUG(dbgs() << "Store leader changed, fixing\n");
+ EClass->RepLeader = ProperLeader;
+ markLeaderChangeTouched(EClass);
+ markMemoryUsersTouched(MSSA->getMemoryAccess(SI));
+ }
}
}
@@ -1708,8 +1770,9 @@ struct NewGVN::ValueDFS {
}
};
-void NewGVN::convertDenseToDFSOrdered(CongruenceClass::MemberSet &Dense,
- std::vector<ValueDFS> &DFSOrderedSet) {
+void NewGVN::convertDenseToDFSOrdered(
+ CongruenceClass::MemberSet &Dense,
+ SmallVectorImpl<ValueDFS> &DFSOrderedSet) {
for (auto D : Dense) {
// First add the value.
BasicBlock *BB = getBlockForValue(D);
@@ -1972,21 +2035,25 @@ bool NewGVN::eliminateInstructions(Function &F) {
ValueDFSStack EliminationStack;
// Convert the members to DFS ordered sets and then merge them.
- std::vector<ValueDFS> DFSOrderedSet;
+ SmallVector<ValueDFS, 8> DFSOrderedSet;
convertDenseToDFSOrdered(CC->Members, DFSOrderedSet);
// Sort the whole thing.
- sort(DFSOrderedSet.begin(), DFSOrderedSet.end());
-
- for (auto &C : DFSOrderedSet) {
- int MemberDFSIn = C.DFSIn;
- int MemberDFSOut = C.DFSOut;
- Value *Member = C.Val;
- Use *MemberUse = C.U;
-
- // We ignore void things because we can't get a value from them.
- if (Member && Member->getType()->isVoidTy())
- continue;
+ std::sort(DFSOrderedSet.begin(), DFSOrderedSet.end());
+
+ for (auto &VD : DFSOrderedSet) {
+ int MemberDFSIn = VD.DFSIn;
+ int MemberDFSOut = VD.DFSOut;
+ Value *Member = VD.Val;
+ Use *MemberUse = VD.U;
+
+ if (Member) {
+ // We ignore void things because we can't get a value from them.
+ // FIXME: We could actually use this to kill dead stores that are
+ // dominated by equivalent earlier stores.
+ if (Member->getType()->isVoidTy())
+ continue;
+ }
if (EliminationStack.empty()) {
DEBUG(dbgs() << "Elimination Stack is empty\n");
@@ -1995,8 +2062,6 @@ bool NewGVN::eliminateInstructions(Function &F) {
<< EliminationStack.dfs_back().first << ","
<< EliminationStack.dfs_back().second << ")\n");
}
- if (Member && isa<Constant>(Member))
- assert(isa<Constant>(CC->RepLeader));
DEBUG(dbgs() << "Current DFS numbers are (" << MemberDFSIn << ","
<< MemberDFSOut << ")\n");
@@ -2037,11 +2102,8 @@ bool NewGVN::eliminateInstructions(Function &F) {
continue;
Value *Result = EliminationStack.back();
- // Don't replace our existing users with ourselves, and don't replace
- // phi node arguments with the result of the same phi node.
- // IE tmp = phi(tmp11, undef); tmp11 = foo -> tmp = phi(tmp, undef)
- if (MemberUse->get() == Result ||
- (isa<PHINode>(Result) && MemberUse->getUser() == Result))
+ // Don't replace our existing users with ourselves.
+ if (MemberUse->get() == Result)
continue;
DEBUG(dbgs() << "Found replacement " << *Result << " for "
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 8a6be97d08c7c..34be90692481b 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -511,9 +511,6 @@ private:
void visitSelectInst(SelectInst &I);
void visitBinaryOperator(Instruction &I);
void visitCmpInst(CmpInst &I);
- void visitExtractElementInst(ExtractElementInst &I);
- void visitInsertElementInst(InsertElementInst &I);
- void visitShuffleVectorInst(ShuffleVectorInst &I);
void visitExtractValueInst(ExtractValueInst &EVI);
void visitInsertValueInst(InsertValueInst &IVI);
void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); }
@@ -970,21 +967,6 @@ void SCCPSolver::visitCmpInst(CmpInst &I) {
markOverdefined(&I);
}
-void SCCPSolver::visitExtractElementInst(ExtractElementInst &I) {
- // TODO : SCCP does not handle vectors properly.
- return markOverdefined(&I);
-}
-
-void SCCPSolver::visitInsertElementInst(InsertElementInst &I) {
- // TODO : SCCP does not handle vectors properly.
- return markOverdefined(&I);
-}
-
-void SCCPSolver::visitShuffleVectorInst(ShuffleVectorInst &I) {
- // TODO : SCCP does not handle vectors properly.
- return markOverdefined(&I);
-}
-
// Handle getelementptr instructions. If all operands are constants then we
// can turn this into a getelementptr ConstantExpr.
//
diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp
index 678d02e05d423..9844190ef84a2 100644
--- a/lib/Transforms/Utils/FunctionImportUtils.cpp
+++ b/lib/Transforms/Utils/FunctionImportUtils.cpp
@@ -67,12 +67,15 @@ bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal(
return true;
}
- // When exporting, consult the index.
- auto Summaries = ImportIndex.findGlobalValueSummaryList(SGV->getGUID());
- assert(Summaries != ImportIndex.end() &&
- "Missing summary for global value when exporting");
- assert(Summaries->second.size() == 1 && "Local has more than one summary");
- auto Linkage = Summaries->second.front()->linkage();
+ // When exporting, consult the index. We can have more than one local
+ // with the same GUID, in the case of same-named locals in different but
+ // same-named source files that were compiled in their respective directories
+ // (so the source file name and resulting GUID is the same). Find the one
+ // in this module.
+ auto Summary = ImportIndex.findSummaryInModule(
+ SGV->getGUID(), SGV->getParent()->getModuleIdentifier());
+ assert(Summary && "Missing summary for global value when exporting");
+ auto Linkage = Summary->linkage();
if (!GlobalValue::isLocalLinkage(Linkage)) {
assert(!isNonRenamableLocal(*SGV) &&
"Attempting to promote non-renamable local");
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index c8f030f7eb835..11d54bcf4f89d 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -1189,19 +1189,11 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) {
Value *LibCallSimplifier::optimizeFabs(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
- Value *Ret = nullptr;
StringRef Name = Callee->getName();
if (Name == "fabs" && hasFloatVersion(Name))
- Ret = optimizeUnaryDoubleFP(CI, B, false);
+ return optimizeUnaryDoubleFP(CI, B, false);
- Value *Op = CI->getArgOperand(0);
- if (Instruction *I = dyn_cast<Instruction>(Op)) {
- // Fold fabs(x * x) -> x * x; any squared FP value must already be positive.
- if (I->getOpcode() == Instruction::FMul)
- if (I->getOperand(0) == I->getOperand(1))
- return Op;
- }
- return Ret;
+ return nullptr;
}
Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) {
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 31daba2248aae..578c65daf7c0f 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -783,6 +783,10 @@ protected:
// Similarly, we create a new latch condition when setting up the structure
// of the new loop, so the old one can become dead.
SmallPtrSet<Instruction *, 4> DeadInstructions;
+
+ // Holds the end values for each induction variable. We save the end values
+ // so we can later fix up the external users of the induction variables.
+ DenseMap<PHINode *, Value *> IVEndValues;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -1879,13 +1883,6 @@ public:
unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
unsigned LoopCost);
- /// \return The most profitable unroll factor.
- /// This method finds the best unroll-factor based on register pressure and
- /// other parameters. VF and LoopCost are the selected vectorization factor
- /// and the cost of the selected VF.
- unsigned computeInterleaveCount(bool OptForSize, unsigned VF,
- unsigned LoopCost);
-
/// \brief A struct that represents some properties of the register usage
/// of a loop.
struct RegisterUsage {
@@ -3424,7 +3421,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
// Create phi nodes to merge from the backedge-taken check block.
PHINode *BCResumeVal = PHINode::Create(
OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
- Value *EndValue;
+ Value *&EndValue = IVEndValues[OrigPhi];
if (OrigPhi == OldInduction) {
// We know what the end value is.
EndValue = CountRoundDown;
@@ -3443,9 +3440,6 @@ void InnerLoopVectorizer::createEmptyLoop() {
// or the value at the end of the vectorized loop.
BCResumeVal->addIncoming(EndValue, MiddleBlock);
- // Fix up external users of the induction variable.
- fixupIVUsers(OrigPhi, II, CountRoundDown, EndValue, MiddleBlock);
-
// Fix the scalar body counter (PHI node).
unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
@@ -4116,11 +4110,23 @@ void InnerLoopVectorizer::vectorizeLoop() {
Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
} // end of for each Phi in PHIsToFix.
- fixLCSSAPHIs();
-
- // Make sure DomTree is updated.
+ // Update the dominator tree.
+ //
+ // FIXME: After creating the structure of the new loop, the dominator tree is
+ // no longer up-to-date, and it remains that way until we update it
+ // here. An out-of-date dominator tree is problematic for SCEV,
+ // because SCEVExpander uses it to guide code generation. The
+ // vectorizer uses SCEVExpanders in several places. Instead, we should
+ // keep the dominator tree up-to-date as we go.
updateAnalysis();
+ // Fix up external users of the induction variables.
+ for (auto &Entry : *Legal->getInductionVars())
+ fixupIVUsers(Entry.first, Entry.second,
+ getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
+ IVEndValues[Entry.first], LoopMiddleBlock);
+
+ fixLCSSAPHIs();
predicateInstructions();
// Remove redundant induction instructions.