diff options
Diffstat (limited to 'lib')
55 files changed, 1411 insertions, 757 deletions
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index b4686a1ff1758..8da2f0981d0ca 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -1106,6 +1106,16 @@ static Value *SimplifyUDivInst(Value *Op0, Value *Op1, const Query &Q, if (Value *V = SimplifyDiv(Instruction::UDiv, Op0, Op1, Q, MaxRecurse)) return V; + // udiv %V, C -> 0 if %V < C + if (MaxRecurse) { + if (Constant *C = dyn_cast_or_null<Constant>(SimplifyICmpInst( + ICmpInst::ICMP_ULT, Op0, Op1, Q, MaxRecurse - 1))) { + if (C->isAllOnesValue()) { + return Constant::getNullValue(Op0->getType()); + } + } + } + return nullptr; } @@ -1247,6 +1257,16 @@ static Value *SimplifyURemInst(Value *Op0, Value *Op1, const Query &Q, if (Value *V = SimplifyRem(Instruction::URem, Op0, Op1, Q, MaxRecurse)) return V; + // urem %V, C -> %V if %V < C + if (MaxRecurse) { + if (Constant *C = dyn_cast_or_null<Constant>(SimplifyICmpInst( + ICmpInst::ICMP_ULT, Op0, Op1, Q, MaxRecurse - 1))) { + if (C->isAllOnesValue()) { + return Op0; + } + } + } + return nullptr; } diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp index 19c0171740c92..3d85ef6988a9a 100644 --- a/lib/Analysis/LoopInfo.cpp +++ b/lib/Analysis/LoopInfo.cpp @@ -179,9 +179,9 @@ bool Loop::isLCSSAForm(DominatorTree &DT) const { } bool Loop::isRecursivelyLCSSAForm(DominatorTree &DT, const LoopInfo &LI) const { - // For each block we check that it doesn't have any uses outside of it's - // innermost loop. This process will transitivelly guarntee that current loop - // and all of the nested loops are in the LCSSA form. + // For each block we check that it doesn't have any uses outside of its + // innermost loop. This process will transitively guarantee that the current + // loop and all of the nested loops are in LCSSA form. 
return all_of(this->blocks(), [&](const BasicBlock *BB) { return isBlockInLCSSAForm(*LI.getLoopFor(BB), *BB, DT); }); diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp index 2746361ab4b58..e7415e6231963 100644 --- a/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -344,38 +344,24 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI, if (!InvariantGroupMD) return MemDepResult::getUnknown(); - Value *LoadOperand = LI->getPointerOperand(); + // Take the ptr operand after all casts and geps 0. This way we can search + // cast graph down only. + Value *LoadOperand = LI->getPointerOperand()->stripPointerCasts(); + // It's is not safe to walk the use list of global value, because function // passes aren't allowed to look outside their functions. + // FIXME: this could be fixed by filtering instructions from outside + // of current function. if (isa<GlobalValue>(LoadOperand)) return MemDepResult::getUnknown(); // Queue to process all pointers that are equivalent to load operand. SmallVector<const Value *, 8> LoadOperandsQueue; - SmallSet<const Value *, 14> SeenValues; - auto TryInsertToQueue = [&](Value *V) { - if (SeenValues.insert(V).second) - LoadOperandsQueue.push_back(V); - }; - - TryInsertToQueue(LoadOperand); + LoadOperandsQueue.push_back(LoadOperand); while (!LoadOperandsQueue.empty()) { const Value *Ptr = LoadOperandsQueue.pop_back_val(); - assert(Ptr); - if (isa<GlobalValue>(Ptr)) - continue; - - // Value comes from bitcast: Ptr = bitcast x. Insert x. - if (auto *BCI = dyn_cast<BitCastInst>(Ptr)) - TryInsertToQueue(BCI->getOperand(0)); - // Gep with zeros is equivalent to bitcast. - // FIXME: we are not sure if some bitcast should be canonicalized to gep 0 - // or gep 0 to bitcast because of SROA, so there are 2 forms. When typeless - // pointers will be upstream then both cases will be gone (and this BFS - // also won't be needed). 
- if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) - if (GEP->hasAllZeroIndices()) - TryInsertToQueue(GEP->getOperand(0)); + assert(Ptr && !isa<GlobalValue>(Ptr) && + "Null or GlobalValue should not be inserted"); for (const Use &Us : Ptr->uses()) { auto *U = dyn_cast<Instruction>(Us.getUser()); @@ -385,13 +371,17 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI, // Bitcast or gep with zeros are using Ptr. Add to queue to check it's // users. U = bitcast Ptr if (isa<BitCastInst>(U)) { - TryInsertToQueue(U); + LoadOperandsQueue.push_back(U); continue; } - // U = getelementptr Ptr, 0, 0... + // Gep with zeros is equivalent to bitcast. + // FIXME: we are not sure if some bitcast should be canonicalized to gep 0 + // or gep 0 to bitcast because of SROA, so there are 2 forms. When + // typeless pointers will be ready then both cases will be gone + // (and this BFS also won't be needed). if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) if (GEP->hasAllZeroIndices()) { - TryInsertToQueue(U); + LoadOperandsQueue.push_back(U); continue; } diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 5e566bcdaff4d..44f1a6dde0d21 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -10012,6 +10012,18 @@ void ScalarEvolution::verify() const { // TODO: Verify more things. } +bool ScalarEvolution::invalidate( + Function &F, const PreservedAnalyses &PA, + FunctionAnalysisManager::Invalidator &Inv) { + // Invalidate the ScalarEvolution object whenever it isn't preserved or one + // of its dependencies is invalidated. 
+ auto PAC = PA.getChecker<ScalarEvolutionAnalysis>(); + return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>()) || + Inv.invalidate<AssumptionAnalysis>(F, PA) || + Inv.invalidate<DominatorTreeAnalysis>(F, PA) || + Inv.invalidate<LoopAnalysis>(F, PA); +} + AnalysisKey ScalarEvolutionAnalysis::Key; ScalarEvolution ScalarEvolutionAnalysis::run(Function &F, diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index 073b4e6ab26ae..d31472c0d33c1 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -3257,6 +3257,7 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V, case Intrinsic::dbg_value: return true; + case Intrinsic::bitreverse: case Intrinsic::bswap: case Intrinsic::ctlz: case Intrinsic::ctpop: diff --git a/lib/Bitcode/Reader/MetadataLoader.cpp b/lib/Bitcode/Reader/MetadataLoader.cpp index 460d39cc28d87..4a5d18e2db750 100644 --- a/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/lib/Bitcode/Reader/MetadataLoader.cpp @@ -429,7 +429,7 @@ class MetadataLoader::MetadataLoaderImpl { /// Populate the index above to enable lazily loading of metadata, and load /// the named metadata as well as the transitively referenced global /// Metadata. - Expected<bool> lazyLoadModuleMetadataBlock(PlaceholderQueue &Placeholders); + Expected<bool> lazyLoadModuleMetadataBlock(); /// On-demand loading of a single metadata. Requires the index above to be /// populated. @@ -516,8 +516,8 @@ Error error(const Twine &Message) { Message, make_error_code(BitcodeError::CorruptedBitcode)); } -Expected<bool> MetadataLoader::MetadataLoaderImpl::lazyLoadModuleMetadataBlock( - PlaceholderQueue &Placeholders) { +Expected<bool> +MetadataLoader::MetadataLoaderImpl::lazyLoadModuleMetadataBlock() { IndexCursor = Stream; SmallVector<uint64_t, 64> Record; // Get the abbrevs, and preload record positions to make them lazy-loadable. 
@@ -701,7 +701,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadata(bool ModuleLevel) { // then load individual record as needed, starting with the named metadata. if (ModuleLevel && IsImporting && MetadataList.empty() && !DisableLazyLoading) { - auto SuccessOrErr = lazyLoadModuleMetadataBlock(Placeholders); + auto SuccessOrErr = lazyLoadModuleMetadataBlock(); if (!SuccessOrErr) return SuccessOrErr.takeError(); if (SuccessOrErr.get()) { @@ -1561,7 +1561,6 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment( return error("Invalid record"); SmallVector<uint64_t, 64> Record; - PlaceholderQueue Placeholders; while (true) { @@ -1608,10 +1607,12 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment( auto Idx = Record[i + 1]; if (Idx < (MDStringRef.size() + GlobalMetadataBitPosIndex.size()) && - !MetadataList.lookup(Idx)) + !MetadataList.lookup(Idx)) { // Load the attachment if it is in the lazy-loadable range and hasn't // been loaded yet. lazyLoadOneMetadata(Idx, Placeholders); + resolveForwardRefsAndPlaceholders(Placeholders); + } Metadata *Node = MetadataList.getMetadataFwdRef(Idx); if (isa<LocalAsMetadata>(Node)) diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index a37f4e1116b43..6b62f11f12405 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -1714,7 +1714,7 @@ void DAGTypeLegalizer::ExpandIntRes_MINMAX(SDNode *N, EVT CCT = getSetCCResultType(NVT); // Hi part is always the same op - Hi = DAG.getNode(N->getOpcode(), DL, {NVT, NVT}, {LHSH, RHSH}); + Hi = DAG.getNode(N->getOpcode(), DL, NVT, {LHSH, RHSH}); // We need to know whether to select Lo part that corresponds to 'winning' // Hi part or if Hi parts are equal. 
@@ -1725,7 +1725,7 @@ void DAGTypeLegalizer::ExpandIntRes_MINMAX(SDNode *N, SDValue LoCmp = DAG.getSelect(DL, NVT, IsHiLeft, LHSL, RHSL); // Recursed Lo part if Hi parts are equal, this uses unsigned version - SDValue LoMinMax = DAG.getNode(LoOpc, DL, {NVT, NVT}, {LHSL, RHSL}); + SDValue LoMinMax = DAG.getNode(LoOpc, DL, NVT, {LHSL, RHSL}); Lo = DAG.getSelect(DL, NVT, IsHiEq, LoMinMax, LoCmp); } diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp index bae828a2263c1..234b2043a6a14 100644 --- a/lib/CodeGen/StackSlotColoring.cpp +++ b/lib/CodeGen/StackSlotColoring.cpp @@ -381,7 +381,6 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) { I != E; ++I) { if (DCELimit != -1 && (int)NumDead >= DCELimit) break; - int FirstSS, SecondSS; if (TII->isStackSlotCopy(*I, FirstSS, SecondSS) && FirstSS == SecondSS && FirstSS != -1) { @@ -392,12 +391,18 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) { } MachineBasicBlock::iterator NextMI = std::next(I); - if (NextMI == MBB->end()) continue; + MachineBasicBlock::iterator ProbableLoadMI = I; unsigned LoadReg = 0; unsigned StoreReg = 0; if (!(LoadReg = TII->isLoadFromStackSlot(*I, FirstSS))) continue; + // Skip the ...pseudo debugging... instructions between a load and store. 
+ while ((NextMI != E) && NextMI->isDebugValue()) { + ++NextMI; + ++I; + } + if (NextMI == E) continue; if (!(StoreReg = TII->isStoreToStackSlot(*NextMI, SecondSS))) continue; if (FirstSS != SecondSS || LoadReg != StoreReg || FirstSS == -1) continue; @@ -407,7 +412,7 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) { if (NextMI->findRegisterUseOperandIdx(LoadReg, true, nullptr) != -1) { ++NumDead; - toErase.push_back(&*I); + toErase.push_back(&*ProbableLoadMI); } toErase.push_back(&*NextMI); diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index a5a30fab5b698..8f6b1849169a9 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -896,6 +896,48 @@ uint32_t RuntimeDyldELF::getMatchingLoRelocation(uint32_t RelType, return ELF::R_MIPS_NONE; } +// Sometimes we don't need to create thunk for a branch. +// This typically happens when branch target is located +// in the same object file. In such case target is either +// a weak symbol or symbol in a different executable section. +// This function checks if branch target is located in the +// same object file and if distance between source and target +// fits R_AARCH64_CALL26 relocation. If both conditions are +// met, it emits direct jump to the target and returns true. +// Otherwise false is returned and thunk is created. +bool RuntimeDyldELF::resolveAArch64ShortBranch( + unsigned SectionID, relocation_iterator RelI, + const RelocationValueRef &Value) { + uint64_t Address; + if (Value.SymbolName) { + auto Loc = GlobalSymbolTable.find(Value.SymbolName); + + // Don't create direct branch for external symbols. 
+ if (Loc == GlobalSymbolTable.end()) + return false; + + const auto &SymInfo = Loc->second; + Address = + uint64_t(Sections[SymInfo.getSectionID()].getLoadAddressWithOffset( + SymInfo.getOffset())); + } else { + Address = uint64_t(Sections[Value.SectionID].getLoadAddress()); + } + uint64_t Offset = RelI->getOffset(); + uint64_t SourceAddress = Sections[SectionID].getLoadAddressWithOffset(Offset); + + // R_AARCH64_CALL26 requires immediate to be in range -2^27 <= imm < 2^27 + // If distance between source and target is out of range then we should + // create thunk. + if (!isInt<28>(Address + Value.Addend - SourceAddress)) + return false; + + resolveRelocation(Sections[SectionID], Offset, Address, RelI->getType(), + Value.Addend); + + return true; +} + Expected<relocation_iterator> RuntimeDyldELF::processRelocationRef( unsigned SectionID, relocation_iterator RelI, const ObjectFile &O, @@ -1003,7 +1045,7 @@ RuntimeDyldELF::processRelocationRef( (uint64_t)Section.getAddressWithOffset(i->second), RelType, 0); DEBUG(dbgs() << " Stub function found\n"); - } else { + } else if (!resolveAArch64ShortBranch(SectionID, RelI, Value)) { // Create a new stub function. 
DEBUG(dbgs() << " Create a new stub function\n"); Stubs[Value] = Section.getStubOffset(); diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h index 796127ab92bd7..d1867d091fe29 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h @@ -40,6 +40,9 @@ class RuntimeDyldELF : public RuntimeDyldImpl { void resolveAArch64Relocation(const SectionEntry &Section, uint64_t Offset, uint64_t Value, uint32_t Type, int64_t Addend); + bool resolveAArch64ShortBranch(unsigned SectionID, relocation_iterator RelI, + const RelocationValueRef &Value); + void resolveARMRelocation(const SectionEntry &Section, uint64_t Offset, uint32_t Value, uint32_t Type, int32_t Addend); diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp index 66ffe6db29d61..928f69a17de90 100644 --- a/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/lib/LTO/ThinLTOCodeGenerator.cpp @@ -196,8 +196,15 @@ crossImportIntoModule(Module &TheModule, const ModuleSummaryIndex &Index, }; FunctionImporter Importer(Index, Loader); - if (!Importer.importFunctions(TheModule, ImportList)) + Expected<bool> Result = Importer.importFunctions(TheModule, ImportList); + if (!Result) { + handleAllErrors(Result.takeError(), [&](ErrorInfoBase &EIB) { + SMDiagnostic Err = SMDiagnostic(TheModule.getModuleIdentifier(), + SourceMgr::DK_Error, EIB.message()); + Err.print("ThinLTO", errs()); + }); report_fatal_error("importFunctions failed"); + } } static void optimizeModule(Module &TheModule, TargetMachine &TM, diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp index 40105000c56ca..5b018676eba3d 100644 --- a/lib/Object/MachOObjectFile.cpp +++ b/lib/Object/MachOObjectFile.cpp @@ -2823,7 +2823,11 @@ StringRef MachORebaseEntry::typeName() const { } bool MachORebaseEntry::operator==(const MachORebaseEntry &Other) const { +#ifdef EXPENSIVE_CHECKS assert(Opcodes == Other.Opcodes && 
"compare iterators of different files"); +#else + assert(Opcodes.data() == Other.Opcodes.data() && "compare iterators of different files"); +#endif return (Ptr == Other.Ptr) && (RemainingLoopCount == Other.RemainingLoopCount) && (Done == Other.Done); @@ -3073,7 +3077,11 @@ uint32_t MachOBindEntry::flags() const { return Flags; } int MachOBindEntry::ordinal() const { return Ordinal; } bool MachOBindEntry::operator==(const MachOBindEntry &Other) const { +#ifdef EXPENSIVE_CHECKS assert(Opcodes == Other.Opcodes && "compare iterators of different files"); +#else + assert(Opcodes.data() == Other.Opcodes.data() && "compare iterators of different files"); +#endif return (Ptr == Other.Ptr) && (RemainingLoopCount == Other.RemainingLoopCount) && (Done == Other.Done); diff --git a/lib/Object/ModuleSummaryIndexObjectFile.cpp b/lib/Object/ModuleSummaryIndexObjectFile.cpp index 202783e7d993e..11ace84b9cebb 100644 --- a/lib/Object/ModuleSummaryIndexObjectFile.cpp +++ b/lib/Object/ModuleSummaryIndexObjectFile.cpp @@ -22,6 +22,12 @@ using namespace llvm; using namespace object; +static llvm::cl::opt<bool> IgnoreEmptyThinLTOIndexFile( + "ignore-empty-index-file", llvm::cl::ZeroOrMore, + llvm::cl::desc( + "Ignore an empty index file and perform non-ThinLTO compilation"), + llvm::cl::init(false)); + ModuleSummaryIndexObjectFile::ModuleSummaryIndexObjectFile( MemoryBufferRef Object, std::unique_ptr<ModuleSummaryIndex> I) : SymbolicFile(Binary::ID_ModuleSummaryIndex, Object), Index(std::move(I)) { @@ -97,6 +103,8 @@ llvm::getModuleSummaryIndexForFile(StringRef Path) { if (EC) return errorCodeToError(EC); MemoryBufferRef BufferRef = (FileOrErr.get())->getMemBufferRef(); + if (IgnoreEmptyThinLTOIndexFile && !BufferRef.getBufferSize()) + return nullptr; Expected<std::unique_ptr<object::ModuleSummaryIndexObjectFile>> ObjOrErr = object::ModuleSummaryIndexObjectFile::create(BufferRef); if (!ObjOrErr) diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp index 
0a989706b4361..3889902eea54a 100644 --- a/lib/Support/CommandLine.cpp +++ b/lib/Support/CommandLine.cpp @@ -373,7 +373,7 @@ void Option::removeArgument() { GlobalParser->removeOption(this); } void Option::setArgStr(StringRef S) { if (FullyInitialized) GlobalParser->updateArgStr(this, S); - assert(S[0] != '-' && "Option can't start with '-"); + assert((S.empty() || S[0] != '-') && "Option can't start with '-"); ArgStr = S; } diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp index 0616d05aff573..4bb035eeccca8 100644 --- a/lib/Support/Path.cpp +++ b/lib/Support/Path.cpp @@ -571,6 +571,16 @@ void native(SmallVectorImpl<char> &Path) { #endif } +std::string convert_to_slash(StringRef path) { +#ifdef LLVM_ON_WIN32 + std::string s = path.str(); + std::replace(s.begin(), s.end(), '\\', '/'); + return s; +#else + return path; +#endif +} + StringRef filename(StringRef path) { return *rbegin(path); } diff --git a/lib/Support/TarWriter.cpp b/lib/Support/TarWriter.cpp index 5fc17d2763776..f79b364dc1f7b 100644 --- a/lib/Support/TarWriter.cpp +++ b/lib/Support/TarWriter.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/Path.h" using namespace llvm; @@ -109,27 +110,44 @@ static void writePaxHeader(raw_fd_ostream &OS, StringRef Path) { pad(OS); } +// In the Ustar header, a path can be split at any '/' to store +// a path into UstarHeader::Name and UstarHeader::Prefix. This +// function splits a given path for that purpose. +static std::pair<StringRef, StringRef> splitPath(StringRef Path) { + if (Path.size() <= sizeof(UstarHeader::Name)) + return {"", Path}; + size_t Sep = Path.rfind('/', sizeof(UstarHeader::Name) + 1); + if (Sep == StringRef::npos) + return {"", Path}; + return {Path.substr(0, Sep), Path.substr(Sep + 1)}; +} + +// Returns true if a given path can be stored to a Ustar header +// without the PAX extension. 
+static bool fitsInUstar(StringRef Path) { + StringRef Prefix; + StringRef Name; + std::tie(Prefix, Name) = splitPath(Path); + return Name.size() <= sizeof(UstarHeader::Name); +} + // The PAX header is an extended format, so a PAX header needs // to be followed by a "real" header. static void writeUstarHeader(raw_fd_ostream &OS, StringRef Path, size_t Size) { + StringRef Prefix; + StringRef Name; + std::tie(Prefix, Name) = splitPath(Path); + UstarHeader Hdr = {}; - memcpy(Hdr.Name, Path.data(), Path.size()); + memcpy(Hdr.Name, Name.data(), Name.size()); memcpy(Hdr.Mode, "0000664", 8); snprintf(Hdr.Size, sizeof(Hdr.Size), "%011zo", Size); memcpy(Hdr.Magic, "ustar", 6); + memcpy(Hdr.Prefix, Prefix.data(), Prefix.size()); computeChecksum(Hdr); OS << StringRef(reinterpret_cast<char *>(&Hdr), sizeof(Hdr)); } -// We want to use '/' as a path separator even on Windows. -// This function canonicalizes a given path. -static std::string canonicalize(std::string S) { -#ifdef LLVM_ON_WIN32 - std::replace(S.begin(), S.end(), '\\', '/'); -#endif - return S; -} - // Creates a TarWriter instance and returns it. Expected<std::unique_ptr<TarWriter>> TarWriter::create(StringRef OutputPath, StringRef BaseDir) { @@ -145,8 +163,8 @@ TarWriter::TarWriter(int FD, StringRef BaseDir) // Append a given file to an archive. void TarWriter::append(StringRef Path, StringRef Data) { // Write Path and Data. 
- std::string S = BaseDir + "/" + canonicalize(Path) + "\0"; - if (S.size() <= sizeof(UstarHeader::Name)) { + std::string S = BaseDir + "/" + sys::path::convert_to_slash(Path) + "\0"; + if (fitsInUstar(S)) { writeUstarHeader(OS, S, Data.size()); } else { writePaxHeader(OS, S); diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index ef3b44f7c2116..2b4fc5397b18d 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -608,6 +608,10 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, if ((C = dyn_cast<ConstantSDNode>(Addr))) { Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); + } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) && + (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) { + Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) { Base = Addr.getOperand(0); diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 0b0a0e7d083ed..730bcdcf7afa5 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -172,16 +172,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v2f64, Promote); AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32); - setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); - setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); - - setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); - setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); - - setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); - setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); - setTruncStoreAction(MVT::v16i32, 
MVT::v16i16, Expand); - setTruncStoreAction(MVT::i64, MVT::i1, Expand); setTruncStoreAction(MVT::i64, MVT::i8, Expand); setTruncStoreAction(MVT::i64, MVT::i16, Expand); diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index a6c31629e7c40..da9d009c542b6 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -822,6 +822,7 @@ public: bool isForcedVOP3() const { return ForcedEncodingSize == 64; } bool isForcedDPP() const { return ForcedDPP; } bool isForcedSDWA() const { return ForcedSDWA; } + ArrayRef<unsigned> getMatchedVariants() const; std::unique_ptr<AMDGPUOperand> parseRegister(); bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; @@ -1630,31 +1631,44 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { return Match_Success; } +// What asm variants we should check +ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const { + if (getForcedEncodingSize() == 32) { + static const unsigned Variants[] = {AMDGPUAsmVariants::DEFAULT}; + return makeArrayRef(Variants); + } + + if (isForcedVOP3()) { + static const unsigned Variants[] = {AMDGPUAsmVariants::VOP3}; + return makeArrayRef(Variants); + } + + if (isForcedSDWA()) { + static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA}; + return makeArrayRef(Variants); + } + + if (isForcedDPP()) { + static const unsigned Variants[] = {AMDGPUAsmVariants::DPP}; + return makeArrayRef(Variants); + } + + static const unsigned Variants[] = { + AMDGPUAsmVariants::DEFAULT, AMDGPUAsmVariants::VOP3, + AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::DPP + }; + + return makeArrayRef(Variants); +} + bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { - // What asm variants we should check - std::vector<unsigned> MatchedVariants; - if 
(getForcedEncodingSize() == 32) { - MatchedVariants = {AMDGPUAsmVariants::DEFAULT}; - } else if (isForcedVOP3()) { - MatchedVariants = {AMDGPUAsmVariants::VOP3}; - } else if (isForcedSDWA()) { - MatchedVariants = {AMDGPUAsmVariants::SDWA}; - } else if (isForcedDPP()) { - MatchedVariants = {AMDGPUAsmVariants::DPP}; - } else { - MatchedVariants = {AMDGPUAsmVariants::DEFAULT, - AMDGPUAsmVariants::VOP3, - AMDGPUAsmVariants::SDWA, - AMDGPUAsmVariants::DPP}; - } - MCInst Inst; unsigned Result = Match_Success; - for (auto Variant : MatchedVariants) { + for (auto Variant : getMatchedVariants()) { uint64_t EI; auto R = MatchInstructionImpl(Operands, Inst, EI, MatchingInlineAsm, Variant); @@ -3486,7 +3500,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, for (unsigned E = Operands.size(); I != E; ++I) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); // Add the register arguments - if ((BasicInstType == SIInstrFlags::VOPC || + if ((BasicInstType == SIInstrFlags::VOPC || BasicInstType == SIInstrFlags::VOP2)&& Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) { diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index 89c9266746ac4..de7ce5cb9e478 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -99,6 +99,18 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::i32, MVT::i8, Custom); setTruncStoreAction(MVT::i32, MVT::i16, Custom); + // We need to include these since trunc STORES to PRIVATE need + // special handling to accommodate RMW + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Custom); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Custom); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Custom); + setTruncStoreAction(MVT::v32i32, MVT::v32i16, Custom); + setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); + 
setTruncStoreAction(MVT::v8i32, MVT::v8i8, Custom); + setTruncStoreAction(MVT::v16i32, MVT::v16i8, Custom); + setTruncStoreAction(MVT::v32i32, MVT::v32i8, Custom); // Workaround for LegalizeDAG asserting on expansion of i1 vector stores. setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand); @@ -1087,79 +1099,114 @@ void R600TargetLowering::getStackAddress(unsigned StackWidth, SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store, SelectionDAG &DAG) const { SDLoc DL(Store); + //TODO: Who creates the i8 stores? + assert(Store->isTruncatingStore() + || Store->getValue().getValueType() == MVT::i8); + assert(Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS); - unsigned Mask = 0; + SDValue Mask; if (Store->getMemoryVT() == MVT::i8) { - Mask = 0xff; + assert(Store->getAlignment() >= 1); + Mask = DAG.getConstant(0xff, DL, MVT::i32); } else if (Store->getMemoryVT() == MVT::i16) { - Mask = 0xffff; + assert(Store->getAlignment() >= 2); + Mask = DAG.getConstant(0xffff, DL, MVT::i32);; + } else { + llvm_unreachable("Unsupported private trunc store"); } SDValue Chain = Store->getChain(); SDValue BasePtr = Store->getBasePtr(); + SDValue Offset = Store->getOffset(); EVT MemVT = Store->getMemoryVT(); - SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr, - DAG.getConstant(2, DL, MVT::i32)); - SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, - Chain, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32)); + SDValue LoadPtr = BasePtr; + if (!Offset.isUndef()) { + LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset); + } + + // Get dword location + // TODO: this should be eliminated by the future SHR ptr, 2 + SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr, + DAG.getConstant(0xfffffffc, DL, MVT::i32)); + + // Load dword + // TODO: can we be smarter about machine pointer info? 
+ SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo()); - SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr, + Chain = Dst.getValue(1); + + // Get offset in dword + SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr, DAG.getConstant(0x3, DL, MVT::i32)); + // Convert byte offset to bit shift SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, DAG.getConstant(3, DL, MVT::i32)); + // TODO: Contrary to the name of the function, + // it also handles sub i32 non-truncating stores (like i1) SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Store->getValue()); + // Mask the value to the right type SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT); + // Shift the value in place SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, MaskedValue, ShiftAmt); - SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, - DAG.getConstant(Mask, DL, MVT::i32), - ShiftAmt); - DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask, - DAG.getConstant(0xffffffff, DL, MVT::i32)); + // Shift the mask in place + SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, Mask, ShiftAmt); + + // Invert the mask. NOTE: if we had native ROL instructions we could + // use inverted mask + DstMask = DAG.getNOT(DL, DstMask, MVT::i32); + + // Cleanup the target bits Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); + // Add the new bits SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); - return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, - Chain, Value, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32)); + + // Store dword + // TODO: Can we be smarter about MachinePointerInfo? 
+ return DAG.getStore(Chain, DL, Value, Ptr, MachinePointerInfo()); } SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { StoreSDNode *StoreNode = cast<StoreSDNode>(Op); unsigned AS = StoreNode->getAddressSpace(); + + SDValue Chain = StoreNode->getChain(); + SDValue Ptr = StoreNode->getBasePtr(); SDValue Value = StoreNode->getValue(); - EVT ValueVT = Value.getValueType(); + + EVT VT = Value.getValueType(); EVT MemVT = StoreNode->getMemoryVT(); - unsigned Align = StoreNode->getAlignment(); + EVT PtrVT = Ptr.getValueType(); + SDLoc DL(Op); + + // Neither LOCAL nor PRIVATE can do vectors at the moment if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) && - ValueVT.isVector()) { - return SplitVectorStore(Op, DAG); + VT.isVector()) { + return scalarizeVectorStore(StoreNode, DAG); } - // Private AS needs special fixes - if (Align < MemVT.getStoreSize() && (AS != AMDGPUAS::PRIVATE_ADDRESS) && + unsigned Align = StoreNode->getAlignment(); + if (Align < MemVT.getStoreSize() && !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) { return expandUnalignedStore(StoreNode, DAG); } - SDLoc DL(Op); - SDValue Chain = StoreNode->getChain(); - SDValue Ptr = StoreNode->getBasePtr(); + SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr, + DAG.getConstant(2, DL, PtrVT)); if (AS == AMDGPUAS::GLOBAL_ADDRESS) { // It is beneficial to create MSKOR here instead of combiner to avoid // artificial dependencies introduced by RMW if (StoreNode->isTruncatingStore()) { - EVT VT = Value.getValueType(); assert(VT.bitsLE(MVT::i32)); SDValue MaskConstant; if (MemVT == MVT::i8) { @@ -1169,15 +1216,19 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { assert(StoreNode->getAlignment() >= 2); MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32); } - SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr, - DAG.getConstant(2, DL, MVT::i32)); - SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr, - 
DAG.getConstant(0x00000003, DL, VT)); + + SDValue ByteIndex = DAG.getNode(ISD::AND, DL, PtrVT, Ptr, + DAG.getConstant(0x00000003, DL, PtrVT)); + SDValue BitShift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex, + DAG.getConstant(3, DL, VT)); + + // Put the mask in correct place + SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, BitShift); + + // Put the mask in correct place SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant); - SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex, - DAG.getConstant(3, DL, VT)); - SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift); - SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift); + SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, BitShift); + // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32 // vector instead. SDValue Src[4] = { @@ -1191,12 +1242,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL, Op->getVTList(), Args, MemVT, StoreNode->getMemOperand()); - } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && - ValueVT.bitsGE(MVT::i32)) { + } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && VT.bitsGE(MVT::i32)) { // Convert pointer from byte address to dword address. 
- Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(), - DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), - Ptr, DAG.getConstant(2, DL, MVT::i32))); + Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr); if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { llvm_unreachable("Truncated and indexed stores not supported yet"); @@ -1207,49 +1255,22 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { } } + // GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes if (AS != AMDGPUAS::PRIVATE_ADDRESS) return SDValue(); if (MemVT.bitsLT(MVT::i32)) return lowerPrivateTruncStore(StoreNode, DAG); - // Lowering for indirect addressing - const MachineFunction &MF = DAG.getMachineFunction(); - const R600FrameLowering *TFL = getSubtarget()->getFrameLowering(); - unsigned StackWidth = TFL->getStackWidth(MF); - - Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); - - if (ValueVT.isVector()) { - unsigned NumElemVT = ValueVT.getVectorNumElements(); - EVT ElemVT = ValueVT.getVectorElementType(); - SmallVector<SDValue, 4> Stores(NumElemVT); - - assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " - "vector width in load"); - - for (unsigned i = 0; i < NumElemVT; ++i) { - unsigned Channel, PtrIncr; - getStackAddress(StackWidth, i, Channel, PtrIncr); - Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, - DAG.getConstant(PtrIncr, DL, MVT::i32)); - SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, - Value, DAG.getConstant(i, DL, MVT::i32)); - - Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, - Chain, Elem, Ptr, - DAG.getTargetConstant(Channel, DL, MVT::i32)); - } - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); - } else { - if (ValueVT == MVT::i8) { - Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value); - } - Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32)); // Channel + // 
Standard i32+ store, tag it with DWORDADDR to note that the address + // has been shifted + if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) { + Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr); + return DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); } - return Chain; + // Tagged i32+ stores will be matched by patterns + return SDValue(); } // return (512 + (kc_bank << 12) @@ -1299,51 +1320,50 @@ SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op, LoadSDNode *Load = cast<LoadSDNode>(Op); ISD::LoadExtType ExtType = Load->getExtensionType(); EVT MemVT = Load->getMemoryVT(); + assert(Load->getAlignment() >= MemVT.getStoreSize()); - // <SI && AS=PRIVATE && EXTLOAD && size < 32bit, - // register (2-)byte extract. + SDValue BasePtr = Load->getBasePtr(); + SDValue Chain = Load->getChain(); + SDValue Offset = Load->getOffset(); - // Get Register holding the target. - SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(), - DAG.getConstant(2, DL, MVT::i32)); - // Load the Register. - SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), - Load->getChain(), - Ptr, - DAG.getTargetConstant(0, DL, MVT::i32), - Op.getOperand(2)); + SDValue LoadPtr = BasePtr; + if (!Offset.isUndef()) { + LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset); + } + + // Get dword location + // NOTE: this should be eliminated by the future SHR ptr, 2 + SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr, + DAG.getConstant(0xfffffffc, DL, MVT::i32)); + + // Load dword + // TODO: can we be smarter about machine pointer info? + SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo()); // Get offset within the register. SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, - Load->getBasePtr(), - DAG.getConstant(0x3, DL, MVT::i32)); + LoadPtr, DAG.getConstant(0x3, DL, MVT::i32)); // Bit offset of target byte (byteIdx * 8). 
SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, DAG.getConstant(3, DL, MVT::i32)); // Shift to the right. - Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt); + SDValue Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Read, ShiftAmt); // Eliminate the upper bits by setting them to ... EVT MemEltVT = MemVT.getScalarType(); - // ... ones. - if (ExtType == ISD::SEXTLOAD) { + if (ExtType == ISD::SEXTLOAD) { // ... ones. SDValue MemEltVTNode = DAG.getValueType(MemEltVT); - - SDValue Ops[] = { - DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode), - Load->getChain() - }; - - return DAG.getMergeValues(Ops, DL); + Ret = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode); + } else { // ... or zeros. + Ret = DAG.getZeroExtendInReg(Ret, DL, MemEltVT); } - // ... or zeros. SDValue Ops[] = { - DAG.getZeroExtendInReg(Ret, DL, MemEltVT), - Load->getChain() + Ret, + Read.getValue(1) // This should be our output chain }; return DAG.getMergeValues(Ops, DL); @@ -1365,12 +1385,10 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = LoadNode->getChain(); SDValue Ptr = LoadNode->getBasePtr(); - if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) { - SDValue MergedValues[2] = { - scalarizeVectorLoad(LoadNode, DAG), - Chain - }; - return DAG.getMergeValues(MergedValues, DL); + if ((LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + LoadNode->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && + VT.isVector()) { + return scalarizeVectorLoad(LoadNode, DAG); } int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); @@ -1421,8 +1439,6 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues(MergedValues, DL); } - SDValue LoweredLoad; - // For most operations returning SDValue() will result in the node being // expanded by the DAG Legalizer. 
This is not the case for ISD::LOAD, so we // need to manually expand loads that may be legal in some address spaces and @@ -1447,47 +1463,14 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } - // Lowering for indirect addressing - const MachineFunction &MF = DAG.getMachineFunction(); - const R600FrameLowering *TFL = getSubtarget()->getFrameLowering(); - unsigned StackWidth = TFL->getStackWidth(MF); - - Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); - - if (VT.isVector()) { - unsigned NumElemVT = VT.getVectorNumElements(); - EVT ElemVT = VT.getVectorElementType(); - SDValue Loads[4]; - - assert(NumElemVT <= 4); - assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " - "vector width in load"); - - for (unsigned i = 0; i < NumElemVT; ++i) { - unsigned Channel, PtrIncr; - getStackAddress(StackWidth, i, Channel, PtrIncr); - Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, - DAG.getConstant(PtrIncr, DL, MVT::i32)); - Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT, - Chain, Ptr, - DAG.getTargetConstant(Channel, DL, MVT::i32), - Op.getOperand(2)); - } - EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT); - LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT)); - } else { - LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, - Chain, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32), // Channel - Op.getOperand(2)); + // DWORDADDR ISD marks already shifted address + if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) { + assert(VT == MVT::i32); + Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(2, DL, MVT::i32)); + Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, MVT::i32, Ptr); + return DAG.getLoad(MVT::i32, DL, Chain, Ptr, LoadNode->getMemOperand()); } - - SDValue Ops[2] = { - LoweredLoad, - Chain - }; - - return DAG.getMergeValues(Ops, DL); + return SDValue(); } SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { diff 
--git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td index 3a72e0791fd68..19795bdde6473 100644 --- a/lib/Target/AMDGPU/R600Instructions.td +++ b/lib/Target/AMDGPU/R600Instructions.td @@ -1268,6 +1268,17 @@ let Predicates = [isR600] in { defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>; +// Hardcode channel to 0 +// NOTE: LSHR is not available here. LSHR is per family instruction +def : Pat < + (i32 (load_private ADDRIndirect:$addr) ), + (R600_RegisterLoad FRAMEri:$addr, (i32 0)) +>; +def : Pat < + (store_private i32:$val, ADDRIndirect:$addr), + (R600_RegisterStore i32:$val, FRAMEri:$addr, (i32 0)) +>; + //===----------------------------------------------------------------------===// // Pseudo instructions diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index c78e97dfd46f8..9140fe6cd1484 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -99,6 +99,18 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v16i32, Custom); setOperationAction(ISD::STORE, MVT::i1, Custom); + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); + setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand); + setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand); + setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand); + setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); + setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand); + + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); @@ -699,7 +711,8 @@ SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG, SDValue 
SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, - unsigned Offset, bool Signed) const { + unsigned Offset, bool Signed, + const ISD::InputArg *Arg) const { const DataLayout &DL = DAG.getDataLayout(); Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); @@ -713,20 +726,21 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); - SDValue Val; + SDValue Val = Load; + if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && + VT.bitsLT(MemVT)) { + unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext; + Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT)); + } + if (MemVT.isFloatingPoint()) - Val = getFPExtOrFPTrunc(DAG, Load, SL, VT); + Val = getFPExtOrFPTrunc(DAG, Val, SL, VT); else if (Signed) - Val = DAG.getSExtOrTrunc(Load, SL, VT); + Val = DAG.getSExtOrTrunc(Val, SL, VT); else - Val = DAG.getZExtOrTrunc(Load, SL, VT); - - SDValue Ops[] = { - Val, - Load.getValue(1) - }; + Val = DAG.getZExtOrTrunc(Val, SL, VT); - return DAG.getMergeValues(Ops, SL); + return DAG.getMergeValues({ Val, Load.getValue(1) }, SL); } SDValue SITargetLowering::LowerFormalArguments( @@ -899,7 +913,8 @@ SDValue SITargetLowering::LowerFormalArguments( // The first 36 bytes of the input buffer contains information about // thread group and global sizes. 
SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain, - Offset, Ins[i].Flags.isSExt()); + Offset, Ins[i].Flags.isSExt(), + &Ins[i]); Chains.push_back(Arg.getValue(1)); auto *ParamTy = diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index 9583f6db6faaf..6c04e4f309773 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -24,7 +24,8 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue LowerParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, unsigned Offset) const; SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, - SDValue Chain, unsigned Offset, bool Signed) const; + SDValue Chain, unsigned Offset, bool Signed, + const ISD::InputArg *Arg = nullptr) const; SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, diff --git a/lib/Target/AVR/AVRISelDAGToDAG.cpp b/lib/Target/AVR/AVRISelDAGToDAG.cpp index 156a21dfecfea..462a7d57d2de5 100644 --- a/lib/Target/AVR/AVRISelDAGToDAG.cpp +++ b/lib/Target/AVR/AVRISelDAGToDAG.cpp @@ -203,8 +203,8 @@ unsigned AVRDAGToDAGISel::selectIndexedProgMemLoad(const LoadSDNode *LD, bool AVRDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintCode, std::vector<SDValue> &OutOps) { - assert(ConstraintCode == InlineAsm::Constraint_m || - ConstraintCode == InlineAsm::Constraint_Q && + assert((ConstraintCode == InlineAsm::Constraint_m || + ConstraintCode == InlineAsm::Constraint_Q) && "Unexpected asm memory constraint"); MachineRegisterInfo &RI = MF->getRegInfo(); diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp index 53668f05b59ba..07fc3f6890b8d 100644 --- a/lib/Target/AVR/AVRISelLowering.cpp +++ b/lib/Target/AVR/AVRISelLowering.cpp @@ -14,6 +14,7 @@ #include "AVRISelLowering.h" +#include "llvm/ADT/StringSwitch.h" #include 
"llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -1933,5 +1934,45 @@ void AVRTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } +unsigned AVRTargetLowering::getRegisterByName(const char *RegName, + EVT VT, + SelectionDAG &DAG) const { + unsigned Reg; + + if (VT == MVT::i8) { + Reg = StringSwitch<unsigned>(RegName) + .Case("r0", AVR::R0).Case("r1", AVR::R1).Case("r2", AVR::R2) + .Case("r3", AVR::R3).Case("r4", AVR::R4).Case("r5", AVR::R5) + .Case("r6", AVR::R6).Case("r7", AVR::R7).Case("r8", AVR::R8) + .Case("r9", AVR::R9).Case("r10", AVR::R10).Case("r11", AVR::R11) + .Case("r12", AVR::R12).Case("r13", AVR::R13).Case("r14", AVR::R14) + .Case("r15", AVR::R15).Case("r16", AVR::R16).Case("r17", AVR::R17) + .Case("r18", AVR::R18).Case("r19", AVR::R19).Case("r20", AVR::R20) + .Case("r21", AVR::R21).Case("r22", AVR::R22).Case("r23", AVR::R23) + .Case("r24", AVR::R24).Case("r25", AVR::R25).Case("r26", AVR::R26) + .Case("r27", AVR::R27).Case("r28", AVR::R28).Case("r29", AVR::R29) + .Case("r30", AVR::R30).Case("r31", AVR::R31) + .Case("X", AVR::R27R26).Case("Y", AVR::R29R28).Case("Z", AVR::R31R30) + .Default(0); + } else { + Reg = StringSwitch<unsigned>(RegName) + .Case("r0", AVR::R1R0).Case("r2", AVR::R3R2) + .Case("r4", AVR::R5R4).Case("r6", AVR::R7R6) + .Case("r8", AVR::R9R8).Case("r10", AVR::R11R10) + .Case("r12", AVR::R13R12).Case("r14", AVR::R15R14) + .Case("r16", AVR::R17R16).Case("r18", AVR::R19R18) + .Case("r20", AVR::R21R20).Case("r22", AVR::R23R22) + .Case("r24", AVR::R25R24).Case("r26", AVR::R27R26) + .Case("r28", AVR::R29R28).Case("r30", AVR::R31R30) + .Case("X", AVR::R27R26).Case("Y", AVR::R29R28).Case("Z", AVR::R31R30) + .Default(0); + } + + if (Reg) + return Reg; + + report_fatal_error("Invalid register name global variable"); +} + } // end of namespace llvm diff --git 
a/lib/Target/AVR/AVRISelLowering.h b/lib/Target/AVR/AVRISelLowering.h index 17074e1b1eeea..a8cdc4e7ae234 100644 --- a/lib/Target/AVR/AVRISelLowering.h +++ b/lib/Target/AVR/AVRISelLowering.h @@ -116,6 +116,9 @@ public: std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; + unsigned getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const override; + private: SDValue getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AVRcc, SelectionDAG &DAG, SDLoc dl) const; diff --git a/lib/Target/BPF/BPFInstrInfo.cpp b/lib/Target/BPF/BPFInstrInfo.cpp index cbe4466164f91..e38facead9228 100644 --- a/lib/Target/BPF/BPFInstrInfo.cpp +++ b/lib/Target/BPF/BPFInstrInfo.cpp @@ -13,15 +13,13 @@ #include "BPF.h" #include "BPFInstrInfo.h" -#include "BPFSubtarget.h" -#include "BPFTargetMachine.h" -#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallVector.h" +#include <cassert> +#include <iterator> #define GET_INSTRINFO_CTOR_DTOR #include "BPFGenInstrInfo.inc" @@ -109,11 +107,11 @@ bool BPFInstrInfo::analyzeBranch(MachineBasicBlock &MBB, while (std::next(I) != MBB.end()) std::next(I)->eraseFromParent(); Cond.clear(); - FBB = 0; + FBB = nullptr; // Delete the J if it's equivalent to a fall-through. 
if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) { - TBB = 0; + TBB = nullptr; I->eraseFromParent(); I = MBB.end(); continue; diff --git a/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/lib/Target/BPF/Disassembler/BPFDisassembler.cpp index b0037fbc16ac5..9beefcdcc1d5d 100644 --- a/lib/Target/BPF/Disassembler/BPFDisassembler.cpp +++ b/lib/Target/BPF/Disassembler/BPFDisassembler.cpp @@ -12,16 +12,15 @@ //===----------------------------------------------------------------------===// #include "BPF.h" -#include "BPFRegisterInfo.h" #include "BPFSubtarget.h" #include "MCTargetDesc/BPFMCTargetDesc.h" - +#include "llvm/ADT/ArrayRef.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/TargetRegistry.h" +#include <cstdint> using namespace llvm; @@ -36,14 +35,15 @@ class BPFDisassembler : public MCDisassembler { public: BPFDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : MCDisassembler(STI, Ctx) {} - virtual ~BPFDisassembler() {} + ~BPFDisassembler() override = default; DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, raw_ostream &VStream, raw_ostream &CStream) const override; }; -} + +} // end anonymous namespace static MCDisassembler *createBPFDisassembler(const Target &T, const MCSubtargetInfo &STI, diff --git a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp index a6cd2002c12c6..afc321ea2c34e 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp +++ b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp @@ -8,28 +8,24 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/BPFMCTargetDesc.h" +#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCAsmBackend.h" -#include "llvm/MC/MCAssembler.h" -#include 
"llvm/MC/MCDirectives.h" -#include "llvm/MC/MCELFObjectWriter.h" -#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCFixup.h" #include "llvm/MC/MCObjectWriter.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <cstdint> using namespace llvm; namespace { + class BPFAsmBackend : public MCAsmBackend { public: bool IsLittleEndian; BPFAsmBackend(bool IsLittleEndian) : MCAsmBackend(), IsLittleEndian(IsLittleEndian) {} - ~BPFAsmBackend() override {} + ~BPFAsmBackend() override = default; void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, bool IsPCRel) const override; @@ -53,6 +49,8 @@ public: bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; }; +} // end anonymous namespace + bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { if ((Count % 8) != 0) return false; @@ -66,7 +64,6 @@ bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, bool IsPCRel) const { - if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) { assert(Value == 0); } else if (Fixup.getKind() == FK_Data_4 || Fixup.getKind() == FK_Data_8) { @@ -92,7 +89,6 @@ void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, MCObjectWriter *BPFAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const { return createBPFELFObjectWriter(OS, 0, IsLittleEndian); } -} MCAsmBackend *llvm::createBPFAsmBackend(const Target &T, const MCRegisterInfo &MRI, diff --git a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp index 3d1c0eb55afa8..ebe9abd8ffac4 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp +++ b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp @@ -10,29 +10,30 @@ 
#include "MCTargetDesc/BPFMCTargetDesc.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCFixup.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" +#include <cstdint> using namespace llvm; namespace { + class BPFELFObjectWriter : public MCELFObjectTargetWriter { public: BPFELFObjectWriter(uint8_t OSABI); - - ~BPFELFObjectWriter() override; + ~BPFELFObjectWriter() override = default; protected: unsigned getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const override; }; -} + +} // end anonymous namespace BPFELFObjectWriter::BPFELFObjectWriter(uint8_t OSABI) : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_BPF, /*HasRelocationAddend*/ false) {} -BPFELFObjectWriter::~BPFELFObjectWriter() {} - unsigned BPFELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp index 47f16512a3972..e8c9744798287 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp +++ b/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp @@ -12,24 +12,25 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/BPFMCTargetDesc.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCFixup.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/EndianStream.h" -#include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <cstdint> + using namespace llvm; #define DEBUG_TYPE "mccodeemitter" namespace { + class BPFMCCodeEmitter : public MCCodeEmitter { - BPFMCCodeEmitter(const BPFMCCodeEmitter &) = delete; - void operator=(const BPFMCCodeEmitter &) = delete; const 
MCInstrInfo &MCII; const MCRegisterInfo &MRI; bool IsLittleEndian; @@ -38,8 +39,9 @@ public: BPFMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, bool IsLittleEndian) : MCII(mcii), MRI(mri), IsLittleEndian(IsLittleEndian) {} - - ~BPFMCCodeEmitter() {} + BPFMCCodeEmitter(const BPFMCCodeEmitter &) = delete; + void operator=(const BPFMCCodeEmitter &) = delete; + ~BPFMCCodeEmitter() override = default; // getBinaryCodeForInstr - TableGen'erated function for getting the // binary encoding for an instruction. @@ -66,7 +68,8 @@ private: void verifyInstructionPredicates(const MCInst &MI, uint64_t AvailableFeatures) const; }; -} + +} // end anonymous namespace MCCodeEmitter *llvm::createBPFMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp index 55415f97396b4..b58409730de04 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp +++ b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp @@ -12,14 +12,13 @@ //===----------------------------------------------------------------------===// #include "BPF.h" -#include "BPFMCTargetDesc.h" -#include "BPFMCAsmInfo.h" #include "InstPrinter/BPFInstPrinter.h" +#include "MCTargetDesc/BPFMCTargetDesc.h" +#include "MCTargetDesc/BPFMCAsmInfo.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Host.h" #include "llvm/Support/TargetRegistry.h" #define GET_INSTRINFO_MC_DESC @@ -64,7 +63,7 @@ static MCInstPrinter *createBPFMCInstPrinter(const Triple &T, const MCRegisterInfo &MRI) { if (SyntaxVariant == 0) return new BPFInstPrinter(MAI, MII, MRI); - return 0; + return nullptr; } extern "C" void LLVMInitializeBPFTargetMC() { diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp index 5fb5b02278002..df12e0e88e3bb 100644 --- 
a/lib/Target/TargetMachineC.cpp +++ b/lib/Target/TargetMachineC.cpp @@ -101,7 +101,7 @@ LLVMBool LLVMTargetHasAsmBackend(LLVMTargetRef T) { } LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T, - const char* Triple, const char* CPU, const char* Features, + const char *Triple, const char *CPU, const char *Features, LLVMCodeGenOptLevel Level, LLVMRelocMode Reloc, LLVMCodeModel CodeModel) { Optional<Reloc::Model> RM; @@ -139,7 +139,7 @@ LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T, TargetOptions opt; return wrap(unwrap(T)->createTargetMachine(Triple, CPU, Features, opt, RM, - CM, OL)); + CM, OL)); } void LLVMDisposeTargetMachine(LLVMTargetMachineRef T) { delete unwrap(T); } diff --git a/lib/Target/WebAssembly/CMakeLists.txt b/lib/Target/WebAssembly/CMakeLists.txt index f4d46383e5bb8..d9c53ecc8d084 100644 --- a/lib/Target/WebAssembly/CMakeLists.txt +++ b/lib/Target/WebAssembly/CMakeLists.txt @@ -17,6 +17,7 @@ add_llvm_target(WebAssemblyCodeGen WebAssemblyExplicitLocals.cpp WebAssemblyFastISel.cpp WebAssemblyFixIrreducibleControlFlow.cpp + WebAssemblyFixFunctionBitcasts.cpp WebAssemblyFrameLowering.cpp WebAssemblyISelDAGToDAG.cpp WebAssemblyISelLowering.cpp diff --git a/lib/Target/WebAssembly/WebAssembly.h b/lib/Target/WebAssembly/WebAssembly.h index 09c35b4825fc2..8738263ad8473 100644 --- a/lib/Target/WebAssembly/WebAssembly.h +++ b/lib/Target/WebAssembly/WebAssembly.h @@ -28,6 +28,7 @@ class FunctionPass; // LLVM IR passes. ModulePass *createWebAssemblyLowerEmscriptenEHSjLj(bool DoEH, bool DoSjLj); void initializeWebAssemblyLowerEmscriptenEHSjLjPass(PassRegistry &); +ModulePass *createWebAssemblyFixFunctionBitcasts(); FunctionPass *createWebAssemblyOptimizeReturned(); // ISel and immediate followup passes. 
diff --git a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp new file mode 100644 index 0000000000000..d5474a02ce01a --- /dev/null +++ b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp @@ -0,0 +1,159 @@ +//===-- WebAssemblyFixFunctionBitcasts.cpp - Fix function bitcasts --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief Fix bitcasted functions. +/// +/// WebAssembly requires caller and callee signatures to match, however in LLVM, +/// some amount of slop is vaguely permitted. Detect mismatch by looking for +/// bitcasts of functions and rewrite them to use wrapper functions instead. +/// +/// This doesn't catch all cases, such as when a function's address is taken in +/// one place and casted in another, but it works for many common cases. +/// +/// Note that LLVM already optimizes away function bitcasts in common cases by +/// dropping arguments as needed, so this pass only ends up getting used in less +/// common cases. 
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-fix-function-bitcasts" + +namespace { +class FixFunctionBitcasts final : public ModulePass { + StringRef getPassName() const override { + return "WebAssembly Fix Function Bitcasts"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + ModulePass::getAnalysisUsage(AU); + } + + bool runOnModule(Module &M) override; + +public: + static char ID; + FixFunctionBitcasts() : ModulePass(ID) {} +}; +} // End anonymous namespace + +char FixFunctionBitcasts::ID = 0; +ModulePass *llvm::createWebAssemblyFixFunctionBitcasts() { + return new FixFunctionBitcasts(); +} + +// Recursively descend the def-use lists from V to find non-bitcast users of +// bitcasts of V. +static void FindUses(Value *V, Function &F, + SmallVectorImpl<std::pair<Use *, Function *>> &Uses) { + for (Use &U : V->uses()) { + if (BitCastOperator *BC = dyn_cast<BitCastOperator>(U.getUser())) + FindUses(BC, F, Uses); + else if (U.get()->getType() != F.getType()) + Uses.push_back(std::make_pair(&U, &F)); + } +} + +// Create a wrapper function with type Ty that calls F (which may have a +// different type). Attempt to support common bitcasted function idioms: +// - Call with more arguments than needed: arguments are dropped +// - Call with fewer arguments than needed: arguments are filled in with undef +// - Return value is not needed: drop it +// - Return value needed but not present: supply an undef +// +// For now, return nullptr without creating a wrapper if the wrapper cannot +// be generated due to incompatible types. 
+static Function *CreateWrapper(Function *F, FunctionType *Ty) { + Module *M = F->getParent(); + + Function *Wrapper = + Function::Create(Ty, Function::PrivateLinkage, "bitcast", M); + BasicBlock *BB = BasicBlock::Create(M->getContext(), "body", Wrapper); + + // Determine what arguments to pass. + SmallVector<Value *, 4> Args; + Function::arg_iterator AI = Wrapper->arg_begin(); + FunctionType::param_iterator PI = F->getFunctionType()->param_begin(); + FunctionType::param_iterator PE = F->getFunctionType()->param_end(); + for (; AI != Wrapper->arg_end() && PI != PE; ++AI, ++PI) { + if (AI->getType() != *PI) { + Wrapper->eraseFromParent(); + return nullptr; + } + Args.push_back(&*AI); + } + for (; PI != PE; ++PI) + Args.push_back(UndefValue::get(*PI)); + + CallInst *Call = CallInst::Create(F, Args, "", BB); + + // Determine what value to return. + if (Ty->getReturnType()->isVoidTy()) + ReturnInst::Create(M->getContext(), BB); + else if (F->getFunctionType()->getReturnType()->isVoidTy()) + ReturnInst::Create(M->getContext(), UndefValue::get(Ty->getReturnType()), + BB); + else if (F->getFunctionType()->getReturnType() == Ty->getReturnType()) + ReturnInst::Create(M->getContext(), Call, BB); + else { + Wrapper->eraseFromParent(); + return nullptr; + } + + return Wrapper; +} + +bool FixFunctionBitcasts::runOnModule(Module &M) { + SmallVector<std::pair<Use *, Function *>, 0> Uses; + + // Collect all the places that need wrappers. + for (Function &F : M) + FindUses(&F, F, Uses); + + DenseMap<std::pair<Function *, FunctionType *>, Function *> Wrappers; + + for (auto &UseFunc : Uses) { + Use *U = UseFunc.first; + Function *F = UseFunc.second; + PointerType *PTy = cast<PointerType>(U->get()->getType()); + FunctionType *Ty = dyn_cast<FunctionType>(PTy->getElementType()); + + // If the function is casted to something like i8* as a "generic pointer" + // to be later casted to something else, we can't generate a wrapper for it. + // Just ignore such casts for now. 
+ if (!Ty) + continue; + + auto Pair = Wrappers.insert(std::make_pair(std::make_pair(F, Ty), nullptr)); + if (Pair.second) + Pair.first->second = CreateWrapper(F, Ty); + + Function *Wrapper = Pair.first->second; + if (!Wrapper) + continue; + + if (isa<Constant>(U->get())) + U->get()->replaceAllUsesWith(Wrapper); + else + U->set(Wrapper); + } + + return true; +} diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td index 8a3248ee669ea..e872dc2198460 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td @@ -40,8 +40,8 @@ defm ROTL : BinaryInt<rotl, "rotl", 0x77, 0x89>; defm ROTR : BinaryInt<rotr, "rotr", 0x78, 0x8a>; let isCommutable = 1 in { -defm EQ : ComparisonInt<SETEQ, "eq ", 0x46, 0x68>; -defm NE : ComparisonInt<SETNE, "ne ", 0x47, 0x69>; +defm EQ : ComparisonInt<SETEQ, "eq ", 0x46, 0x51>; +defm NE : ComparisonInt<SETNE, "ne ", 0x47, 0x52>; } // isCommutable = 1 defm LT_S : ComparisonInt<SETLT, "lt_s", 0x48, 0x53>; defm LT_U : ComparisonInt<SETULT, "lt_u", 0x49, 0x54>; diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index b61bc0a081435..f5ef35a2ad40d 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -163,6 +163,10 @@ void WebAssemblyPassConfig::addIRPasses() { // control specifically what gets lowered. addPass(createAtomicExpandPass(TM)); + // Fix function bitcasts, as WebAssembly requires caller and callee signatures + // to match. + addPass(createWebAssemblyFixFunctionBitcasts()); + // Optimize "returned" function attributes. 
if (getOptLevel() != CodeGenOpt::None) addPass(createWebAssemblyOptimizeReturned()); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 7f72ab17f6194..db76ddf04c069 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -6962,23 +6962,24 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI); } -/// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB -/// node. -static SDValue LowerToAddSub(const BuildVectorSDNode *BV, - const X86Subtarget &Subtarget, SelectionDAG &DAG) { +/// Returns true iff \p BV builds a vector with the result equivalent to +/// the result of ADDSUB operation. +/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation +/// are written to the parameters \p Opnd0 and \p Opnd1. +static bool isAddSub(const BuildVectorSDNode *BV, + const X86Subtarget &Subtarget, SelectionDAG &DAG, + SDValue &Opnd0, SDValue &Opnd1) { + MVT VT = BV->getSimpleValueType(0); if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && - (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) - return SDValue(); + (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) && + (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64))) + return false; - SDLoc DL(BV); unsigned NumElts = VT.getVectorNumElements(); SDValue InVec0 = DAG.getUNDEF(VT); SDValue InVec1 = DAG.getUNDEF(VT); - assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 || - VT == MVT::v2f64) && "build_vector with an invalid type found!"); - // Odd-numbered elements in the input build vector are obtained from // adding two integer/float elements. // Even-numbered elements in the input build vector are obtained from @@ -7000,7 +7001,7 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV, // Early exit if we found an unexpected opcode. 
if (Opcode != ExpectedOpcode) - return SDValue(); + return false; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); @@ -7013,11 +7014,11 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV, !isa<ConstantSDNode>(Op0.getOperand(1)) || !isa<ConstantSDNode>(Op1.getOperand(1)) || Op0.getOperand(1) != Op1.getOperand(1)) - return SDValue(); + return false; unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); if (I0 != i) - return SDValue(); + return false; // We found a valid add/sub node. Update the information accordingly. if (i & 1) @@ -7029,39 +7030,118 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV, if (InVec0.isUndef()) { InVec0 = Op0.getOperand(0); if (InVec0.getSimpleValueType() != VT) - return SDValue(); + return false; } if (InVec1.isUndef()) { InVec1 = Op1.getOperand(0); if (InVec1.getSimpleValueType() != VT) - return SDValue(); + return false; } // Make sure that operands in input to each add/sub node always // come from a same pair of vectors. if (InVec0 != Op0.getOperand(0)) { if (ExpectedOpcode == ISD::FSUB) - return SDValue(); + return false; // FADD is commutable. Try to commute the operands // and then test again. std::swap(Op0, Op1); if (InVec0 != Op0.getOperand(0)) - return SDValue(); + return false; } if (InVec1 != Op1.getOperand(0)) - return SDValue(); + return false; // Update the pair of expected opcodes. std::swap(ExpectedOpcode, NextExpectedOpcode); } // Don't try to fold this build_vector into an ADDSUB if the inputs are undef. - if (AddFound && SubFound && !InVec0.isUndef() && !InVec1.isUndef()) - return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1); + if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef()) + return false; - return SDValue(); + Opnd0 = InVec0; + Opnd1 = InVec1; + return true; +} + +/// Returns true if is possible to fold MUL and an idiom that has already been +/// recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1). 
+/// If (and only if) true is returned, the operands of FMADDSUB are written to +/// parameters \p Opnd0, \p Opnd1, \p Opnd2. +/// +/// Prior to calling this function it should be known that there is some +/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation +/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called +/// before replacement of such SDNode with ADDSUB operation. Thus the number +/// of \p Opnd0 uses is expected to be equal to 2. +/// For example, this function may be called for the following IR: +/// %AB = fmul fast <2 x double> %A, %B +/// %Sub = fsub fast <2 x double> %AB, %C +/// %Add = fadd fast <2 x double> %AB, %C +/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, +/// <2 x i32> <i32 0, i32 3> +/// There is a def for %Addsub here, which potentially can be replaced by +/// X86ISD::ADDSUB operation: +/// %Addsub = X86ISD::ADDSUB %AB, %C +/// and such ADDSUB can further be replaced with FMADDSUB: +/// %Addsub = FMADDSUB %A, %B, %C. +/// +/// The main reason why this method is called before the replacement of the +/// recognized ADDSUB idiom with ADDSUB operation is that such replacement +/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit +/// FMADDSUB is. +static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG, + SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) { + if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 || + !Subtarget.hasAnyFMA()) + return false; + + // FIXME: These checks must match the similar ones in + // DAGCombiner::visitFADDForFMACombine. It would be good to have one + // function that would answer if it is Ok to fuse MUL + ADD to FMADD + // or MUL + ADDSUB to FMADDSUB. 
+ const TargetOptions &Options = DAG.getTarget().Options; + bool AllowFusion = + (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); + if (!AllowFusion) + return false; + + Opnd2 = Opnd1; + Opnd1 = Opnd0.getOperand(1); + Opnd0 = Opnd0.getOperand(0); + + return true; +} + +/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' operation +/// accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB node. +static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + SDValue Opnd0, Opnd1; + if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1)) + return SDValue(); + + MVT VT = BV->getSimpleValueType(0); + SDLoc DL(BV); + + // Try to generate X86ISD::FMADDSUB node here. + SDValue Opnd2; + if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2)) + return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2); + + // Do not generate X86ISD::ADDSUB node for 512-bit types even though + // the ADDSUB idiom has been successfully recognized. There are no known + // X86 targets with 512-bit ADDSUB instructions! + // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom + // recognition. + if (VT.is512BitVector()) + return SDValue(); + + return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1); } /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible. 
@@ -7290,7 +7370,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return VectorConstant; BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode()); - if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG)) + if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG)) return AddSub; if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG)) return HorizontalOp; @@ -12965,6 +13045,12 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (Subtarget.hasVBMI()) return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG); + // Try to create an in-lane repeating shuffle mask and then shuffle the + // the results into the target lanes. + if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( + DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) + return V; + // FIXME: Implement direct support for this type! return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); } @@ -16985,9 +17071,16 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst); } - if (Cond.getOpcode() == ISD::SETCC) - if (SDValue NewCond = LowerSETCC(Cond, DAG)) + if (Cond.getOpcode() == ISD::SETCC) { + if (SDValue NewCond = LowerSETCC(Cond, DAG)) { Cond = NewCond; + // If the condition was updated, it's possible that the operands of the + // select were also updated (for example, EmitTest has a RAUW). Refresh + // the local references to the select operands in case they got stale. 
+ Op1 = Op.getOperand(1); + Op2 = Op.getOperand(2); + } + } // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y @@ -17193,22 +17286,26 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI()) return SDValue(); - if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) { + if (VT.is512BitVector() && InVTElt != MVT::i1) { if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT) return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0)); return DAG.getNode(X86ISD::VSEXT, dl, VT, In); } - assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); + assert (InVTElt == MVT::i1 && "Unexpected vector type"); MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); - SDValue NegOne = DAG.getConstant( - APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl, ExtVT); - SDValue Zero = DAG.getConstant( - APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT); + SDValue V; + if (Subtarget.hasDQI()) { + V = DAG.getNode(X86ISD::VSEXT, dl, ExtVT, In); + assert(!VT.is512BitVector() && "Unexpected vector type"); + } else { + SDValue NegOne = getOnesVector(ExtVT, Subtarget, DAG, dl); + SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl); + V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero); + if (VT.is512BitVector()) + return V; + } - SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero); - if (VT.is512BitVector()) - return V; return DAG.getNode(X86ISD::VTRUNC, dl, VT, V); } @@ -21528,6 +21625,23 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7}); } + // It's worth extending once and using the vXi16/vXi32 shifts for smaller + // types, but without AVX512 the extra overheads to get from vXi8 to vXi32 + // make the existing SSE solution better. 
+ if ((Subtarget.hasInt256() && VT == MVT::v8i16) || + (Subtarget.hasAVX512() && VT == MVT::v16i16) || + (Subtarget.hasAVX512() && VT == MVT::v16i8) || + (Subtarget.hasBWI() && VT == MVT::v32i8)) { + MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32); + MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements()); + unsigned ExtOpc = + Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + R = DAG.getNode(ExtOpc, dl, ExtVT, R); + Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt); + return DAG.getNode(ISD::TRUNCATE, dl, VT, + DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt)); + } + if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) { MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); @@ -21636,19 +21750,6 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, } } - // It's worth extending once and using the v8i32 shifts for 16-bit types, but - // the extra overheads to get from v16i8 to v8i32 make the existing SSE - // solution better. - if (Subtarget.hasInt256() && VT == MVT::v8i16) { - MVT ExtVT = MVT::v8i32; - unsigned ExtOpc = - Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - R = DAG.getNode(ExtOpc, dl, ExtVT, R); - Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt); - return DAG.getNode(ISD::TRUNCATE, dl, VT, - DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt)); - } - if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) { MVT ExtVT = MVT::v8i32; SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); @@ -27763,29 +27864,32 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, return SDValue(); } -/// \brief Try to combine a shuffle into a target-specific add-sub node. +/// Returns true iff the shuffle node \p N can be replaced with ADDSUB +/// operation. If true is returned then the operands of ADDSUB operation +/// are written to the parameters \p Opnd0 and \p Opnd1. 
/// -/// We combine this directly on the abstract vector shuffle nodes so it is -/// easier to generically match. We also insert dummy vector shuffle nodes for -/// the operands which explicitly discard the lanes which are unused by this -/// operation to try to flow through the rest of the combiner the fact that -/// they're unused. -static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - SDLoc DL(N); +/// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes +/// so it is easier to generically match. We also insert dummy vector shuffle +/// nodes for the operands which explicitly discard the lanes which are unused +/// by this operation to try to flow through the rest of the combiner +/// the fact that they're unused. +static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget, + SDValue &Opnd0, SDValue &Opnd1) { + EVT VT = N->getValueType(0); if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && - (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) - return SDValue(); + (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) && + (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64))) + return false; // We only handle target-independent shuffles. // FIXME: It would be easy and harmless to use the target shuffle mask // extraction tool to support more. 
if (N->getOpcode() != ISD::VECTOR_SHUFFLE) - return SDValue(); + return false; ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask(); - SmallVector<int, 8> Mask(OrigMask.begin(), OrigMask.end()); + SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end()); SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); @@ -27796,27 +27900,57 @@ static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget, ShuffleVectorSDNode::commuteMask(Mask); std::swap(V1, V2); } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD) - return SDValue(); + return false; // If there are other uses of these operations we can't fold them. if (!V1->hasOneUse() || !V2->hasOneUse()) - return SDValue(); + return false; // Ensure that both operations have the same operands. Note that we can // commute the FADD operands. SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1); if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) && (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS)) - return SDValue(); + return false; // We're looking for blends between FADD and FSUB nodes. We insist on these // nodes being lined up in a specific expected pattern. if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) || isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) || - isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}))) + isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) || + isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23, + 8, 25, 10, 27, 12, 29, 14, 31}))) + return false; + + Opnd0 = LHS; + Opnd1 = RHS; + return true; +} + +/// \brief Try to combine a shuffle into a target-specific add-sub or +/// mul-add-sub node. +static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + SDValue Opnd0, Opnd1; + if (!isAddSub(N, Subtarget, Opnd0, Opnd1)) + return SDValue(); + + EVT VT = N->getValueType(0); + SDLoc DL(N); + + // Try to generate X86ISD::FMADDSUB node here. 
+ SDValue Opnd2; + if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2)) + return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2); + + // Do not generate X86ISD::ADDSUB node for 512-bit types even though + // the ADDSUB idiom has been successfully recognized. There are no known + // X86 targets with 512-bit ADDSUB instructions! + if (VT.is512BitVector()) return SDValue(); - return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS); + return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1); } // We are looking for a shuffle where both sources are concatenated with undef @@ -27878,7 +28012,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, // If we have legalized the vector types, look for blends of FADD and FSUB // nodes that we can fuse into an ADDSUB node. if (TLI.isTypeLegal(VT)) - if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG)) + if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG)) return AddSub; // During Type Legalization, when promoting illegal vector types, diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 908053e1342d0..d44d1395f2437 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -443,6 +443,22 @@ def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "", [(set VR512:$dst, (v16i32 immAllOnesV))]>; } +// Alias instructions that allow VPTERNLOG to be used with a mask to create +// a mix of all ones and all zeros elements. This is done this way to force +// the same register to be used as input for all three sources. 
+let isPseudo = 1, Predicates = [HasAVX512] in { +def AVX512_512_SEXT_MASK_32 : I<0, Pseudo, (outs VR512:$dst), + (ins VK16WM:$mask), "", + [(set VR512:$dst, (vselect (v16i1 VK16WM:$mask), + (v16i32 immAllOnesV), + (v16i32 immAllZerosV)))]>; +def AVX512_512_SEXT_MASK_64 : I<0, Pseudo, (outs VR512:$dst), + (ins VK8WM:$mask), "", + [(set VR512:$dst, (vselect (v8i1 VK8WM:$mask), + (bc_v8i64 (v16i32 immAllOnesV)), + (bc_v8i64 (v16i32 immAllZerosV))))]>; +} + let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isPseudo = 1, Predicates = [HasVLX], SchedRW = [WriteZero] in { def AVX512_128_SET0 : I<0, Pseudo, (outs VR128X:$dst), (ins), "", @@ -1064,10 +1080,10 @@ def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))), (v8f32 VR256X:$src), 1)>; def : Pat<(v8f64 (X86SubVBroadcast (v4f64 VR256X:$src))), (VINSERTF64x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), - (v4f64 VR256X:$src), 1)>; + (v4f64 VR256X:$src), 1)>; def : Pat<(v8i64 (X86SubVBroadcast (v4i64 VR256X:$src))), (VINSERTI64x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), - (v4i64 VR256X:$src), 1)>; + (v4i64 VR256X:$src), 1)>; def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))), (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (v8i32 VR256X:$src), 1)>; @@ -1485,8 +1501,7 @@ defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd", // AVX-512 - BLEND using mask // multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in { - let hasSideEffects = 0 in + let ExeDomain = _.ExeDomain, hasSideEffects = 0 in { def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, @@ -1496,16 +1511,13 @@ multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), - [(set _.RC:$dst, (vselect 
_.KRCWM:$mask, - (_.VT _.RC:$src2), - (_.VT _.RC:$src1)))]>, EVEX_4V, EVEX_K; - let hasSideEffects = 0 in + []>, EVEX_4V, EVEX_K; def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"), []>, EVEX_4V, EVEX_KZ; - let mayLoad = 1, hasSideEffects = 0 in + let mayLoad = 1 in { def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, @@ -1515,38 +1527,32 @@ multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), - [(set _.RC:$dst, (vselect _.KRCWM:$mask, - (_.VT (bitconvert (_.LdFrag addr:$src2))), - (_.VT _.RC:$src1)))]>, - EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>; - let mayLoad = 1, hasSideEffects = 0 in + []>, EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>; def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"), []>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>; } + } } multiclass avx512_blendmask_rmb<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { + let mayLoad = 1, hasSideEffects = 0 in { def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2), !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), - [(set _.RC:$dst,(vselect _.KRCWM:$mask, - (X86VBroadcast (_.ScalarLdFrag addr:$src2)), - (_.VT _.RC:$src1)))]>, - EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + []>, EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; - let mayLoad = 1, hasSideEffects = 0 in def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.RC:$src1, 
_.ScalarMemOp:$src2), !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst|", "$dst, $src1, ${src2}", _.BroadcastStr, "}"), []>, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; - + } } multiclass blendmask_dq <bits<8> opc, string OpcodeStr, @@ -1582,21 +1588,6 @@ defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", avx512vl_i8_info>; defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W; -let Predicates = [HasAVX512, NoVLX] in { -def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1), - (v8f32 VR256X:$src2))), - (EXTRACT_SUBREG - (v16f32 (VBLENDMPSZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), - (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), - (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_ymm)>; - -def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1), - (v8i32 VR256X:$src2))), - (EXTRACT_SUBREG - (v16i32 (VPBLENDMDZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_ymm)>; -} //===----------------------------------------------------------------------===// // Compare Instructions //===----------------------------------------------------------------------===// @@ -2735,7 +2726,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, (ins _.KRCWM:$mask, _.RC:$src), !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", "${dst} {${mask}} {z}, $src}"), - [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask, + [(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask, (_.VT _.RC:$src), _.ImmAllZerosV)))], _.ExeDomain>, EVEX, EVEX_KZ; @@ -2972,6 +2963,30 @@ def : Pat<(v16i32 (vselect (xor VK16:$mask, (v16i1 immAllOnesV)), (v16i32 VR512:$src))), (VMOVDQA32Zrrkz VK16WM:$mask, VR512:$src)>; +// Patterns for handling v8i1 selects of 256-bit vectors when VLX isn't +// available. Use a 512-bit operation and extract. 
+let Predicates = [HasAVX512, NoVLX] in { +def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1), + (v8f32 VR256X:$src0))), + (EXTRACT_SUBREG + (v16f32 + (VMOVAPSZrrk + (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src0, sub_ymm)), + (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), + (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), + sub_ymm)>; + +def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1), + (v8i32 VR256X:$src0))), + (EXTRACT_SUBREG + (v16i32 + (VMOVDQA32Zrrk + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src0, sub_ymm)), + (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), + sub_ymm)>; +} + let Predicates = [HasVLX, NoBWI] in { // 128-bit load/store without BWI. def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst), @@ -3116,13 +3131,13 @@ let Predicates = [HasVLX] in { (VMOVDQU32Z256mr addr:$dst, (v32i8 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; } -
-// Move Int Doubleword to Packed Double Int
-//
-let ExeDomain = SSEPackedInt in {
-def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
- "vmovd\t{$src, $dst|$dst, $src}",
- [(set VR128X:$dst,
+ +// Move Int Doubleword to Packed Double Int +// +let ExeDomain = SSEPackedInt in { +def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>, EVEX; def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src), @@ -3152,47 +3167,47 @@ def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$src), "vmovq\t{$src, $dst|$dst, $src}", [(store (i64 (bitconvert FR64X:$src)), addr:$dst)], - IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>,
- EVEX_CD8<64, CD8VT1>;
-}
-} // ExeDomain = SSEPackedInt
-
-// Move Int Doubleword to Single Scalar
-//
-let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
-def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
- "vmovd\t{$src, $dst|$dst, $src}",
- [(set FR32X:$dst, (bitconvert GR32:$src))],
+ IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>, + EVEX_CD8<64, CD8VT1>; +} +} // ExeDomain = SSEPackedInt + +// Move Int Doubleword to Single Scalar +// +let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { +def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(set FR32X:$dst, (bitconvert GR32:$src))], IIC_SSE_MOVDQ>, EVEX; def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src), - "vmovd\t{$src, $dst|$dst, $src}",
- [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
- IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
-} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
-
-// Move doubleword from xmm register to r/m32
-//
-let ExeDomain = SSEPackedInt in {
-def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
- "vmovd\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (extractelt (v4i32 VR128X:$src),
+ "vmovd\t{$src, $dst|$dst, $src}", + [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))], + IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; +} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 + +// Move doubleword from xmm register to r/m32 +// +let ExeDomain = SSEPackedInt in { +def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (extractelt (v4i32 VR128X:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>, EVEX; def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128X:$src), "vmovd\t{$src, $dst|$dst, $src}", - [(store (i32 (extractelt (v4i32 VR128X:$src),
- (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
- EVEX, EVEX_CD8<32, CD8VT1>;
-} // ExeDomain = SSEPackedInt
-
-// Move quadword from xmm1 register to r/m64
-//
-let ExeDomain = SSEPackedInt in {
-def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
- "vmovq\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
+ [(store (i32 (extractelt (v4i32 VR128X:$src), + (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, + EVEX, EVEX_CD8<32, CD8VT1>; +} // ExeDomain = SSEPackedInt + +// Move quadword from xmm1 register to r/m64 +// +let ExeDomain = SSEPackedInt in { +def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (extractelt (v2i64 VR128X:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W, Requires<[HasAVX512, In64BitMode]>; @@ -3213,39 +3228,39 @@ def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs), let hasSideEffects = 0 in def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst), - (ins VR128X:$src),
- "vmovq.s\t{$src, $dst|$dst, $src}",[]>,
- EVEX, VEX_W;
-} // ExeDomain = SSEPackedInt
-
-// Move Scalar Single to Double Int
-//
-let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
-def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst),
- (ins FR32X:$src),
- "vmovd\t{$src, $dst|$dst, $src}",
+ (ins VR128X:$src), + "vmovq.s\t{$src, $dst|$dst, $src}",[]>, + EVEX, VEX_W; +} // ExeDomain = SSEPackedInt + +// Move Scalar Single to Double Int +// +let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { +def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), + (ins FR32X:$src), + "vmovd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bitconvert FR32X:$src))], IIC_SSE_MOVD_ToGP>, EVEX; def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32X:$src), - "vmovd\t{$src, $dst|$dst, $src}",
- [(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
-} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
-
-// Move Quadword Int to Packed Quadword Int
-//
-let ExeDomain = SSEPackedInt in {
-def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
- (ins i64mem:$src),
- "vmovq\t{$src, $dst|$dst, $src}",
- [(set VR128X:$dst,
- (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
- EVEX, VEX_W, EVEX_CD8<8, CD8VT8>;
-} // ExeDomain = SSEPackedInt
-
-//===----------------------------------------------------------------------===//
-// AVX-512 MOVSS, MOVSD
+ "vmovd\t{$src, $dst|$dst, $src}", + [(store (i32 (bitconvert FR32X:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; +} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 + +// Move Quadword Int to Packed Quadword Int +// +let ExeDomain = SSEPackedInt in { +def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst), + (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, + (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, + EVEX, VEX_W, EVEX_CD8<8, CD8VT8>; +} // ExeDomain = SSEPackedInt + +//===----------------------------------------------------------------------===// +// AVX-512 MOVSS, MOVSD //===----------------------------------------------------------------------===// multiclass avx512_move_scalar<string asm, SDNode OpNode, @@ -8646,6 +8661,28 @@ def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), (VMOVDDUPZ128rm addr:$src)>; def : Pat<(v2f64 (X86VBroadcast f64:$src)), (VMOVDDUPZ128rr (COPY_TO_REGCLASS FR64X:$src, VR128X))>; + +def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)), + (v2f64 VR128X:$src0)), + (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; +def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)), + (bitconvert (v4i32 immAllZerosV))), + (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>; + +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)), + (v2f64 VR128X:$src0)), + (VMOVDDUPZ128rrk VR128X:$src0, VK2WM:$mask, + (COPY_TO_REGCLASS FR64X:$src, VR128X))>; +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)), + (bitconvert (v4i32 immAllZerosV))), + (VMOVDDUPZ128rrkz VK2WM:$mask, (COPY_TO_REGCLASS FR64X:$src, VR128X))>; + +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))), + (v2f64 VR128X:$src0)), + (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))), + (bitconvert (v4i32 immAllZerosV))), + 
(VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 579359794fbdd..e3484d062bc80 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -543,7 +543,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::MOV8rr, X86::MOV8rm, 0 }, { X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 }, { X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 }, - { X86::MOVDDUPrr, X86::MOVDDUPrm, 0 }, + { X86::MOVDDUPrr, X86::MOVDDUPrm, TB_NO_REVERSE }, { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 }, { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 }, { X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 }, @@ -661,7 +661,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 }, { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 }, { X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 }, - { X86::VMOVDDUPrr, X86::VMOVDDUPrm, 0 }, + { X86::VMOVDDUPrr, X86::VMOVDDUPrm, TB_NO_REVERSE }, { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 }, { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 }, { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 }, @@ -6864,6 +6864,21 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { .addReg(Reg, RegState::Undef).addImm(0xff); return true; } + case X86::AVX512_512_SEXT_MASK_32: + case X86::AVX512_512_SEXT_MASK_64: { + unsigned Reg = MIB->getOperand(0).getReg(); + unsigned MaskReg = MIB->getOperand(1).getReg(); + unsigned MaskState = getRegState(MIB->getOperand(1)); + unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ? + X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz; + MI.RemoveOperand(1); + MIB->setDesc(get(Opc)); + // VPTERNLOG needs 3 register inputs and an immediate. + // 0xff will return 1s for any input. 
+ MIB.addReg(Reg, RegState::Undef).addReg(MaskReg, MaskState) + .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xff); + return true; + } case X86::VMOVAPSZ128rm_NOVLX: return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm), get(X86::VBROADCASTF32X4rm), X86::sub_xmm); diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 4cd6ae563f03d..09971d586a413 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -6397,7 +6397,7 @@ let Predicates = [HasAVX] in { defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", int_x86_sse41_round_ss, int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG; - defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG; + defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG; } let Predicates = [UseAVX] in { diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index de4839432b9ad..107ed9359376b 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -144,6 +144,10 @@ int X86TTIImpl::getArithmeticInstrCost( } static const CostTblEntry AVX512BWUniformConstCostTable[] = { + { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand. + { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand. + { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb. + { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence }; @@ -168,6 +172,10 @@ int X86TTIImpl::getArithmeticInstrCost( } static const CostTblEntry AVX2UniformConstCostTable[] = { + { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand. + { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand. + { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb. + { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle. 
{ ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence @@ -184,6 +192,14 @@ int X86TTIImpl::getArithmeticInstrCost( } static const CostTblEntry SSE2UniformConstCostTable[] = { + { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand. + { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand. + { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. + + { ISD::SHL, MVT::v32i8, 4 }, // 2*(psllw + pand). + { ISD::SRL, MVT::v32i8, 4 }, // 2*(psrlw + pand). + { ISD::SRA, MVT::v32i8, 8 }, // 2*(psrlw, pand, pxor, psubb). + { ISD::SDIV, MVT::v16i16, 12 }, // pmulhw sequence { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence { ISD::UDIV, MVT::v16i16, 12 }, // pmulhuw sequence @@ -207,6 +223,43 @@ int X86TTIImpl::getArithmeticInstrCost( return LT.first * Entry->Cost; } + static const CostTblEntry AVX2UniformCostTable[] = { + // Uniform splats are cheaper for the following instructions. + { ISD::SHL, MVT::v16i16, 1 }, // psllw. + { ISD::SRL, MVT::v16i16, 1 }, // psrlw. + { ISD::SRA, MVT::v16i16, 1 }, // psraw. + }; + + if (ST->hasAVX2() && + ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || + (Op2Info == TargetTransformInfo::OK_UniformValue))) { + if (const auto *Entry = + CostTableLookup(AVX2UniformCostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry SSE2UniformCostTable[] = { + // Uniform splats are cheaper for the following instructions. + { ISD::SHL, MVT::v8i16, 1 }, // psllw. + { ISD::SHL, MVT::v4i32, 1 }, // pslld + { ISD::SHL, MVT::v2i64, 1 }, // psllq. + + { ISD::SRL, MVT::v8i16, 1 }, // psrlw. + { ISD::SRL, MVT::v4i32, 1 }, // psrld. + { ISD::SRL, MVT::v2i64, 1 }, // psrlq. + + { ISD::SRA, MVT::v8i16, 1 }, // psraw. + { ISD::SRA, MVT::v4i32, 1 }, // psrad. 
+ }; + + if (ST->hasSSE2() && + ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || + (Op2Info == TargetTransformInfo::OK_UniformValue))) { + if (const auto *Entry = + CostTableLookup(SSE2UniformCostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + static const CostTblEntry AVX512DQCostTable[] = { { ISD::MUL, MVT::v2i64, 1 }, { ISD::MUL, MVT::v4i64, 1 }, @@ -219,6 +272,10 @@ int X86TTIImpl::getArithmeticInstrCost( return LT.first * Entry->Cost; static const CostTblEntry AVX512BWCostTable[] = { + { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw + { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw + { ISD::SRA, MVT::v32i16, 1 }, // vpsravw + { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence. { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence. { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence. @@ -259,7 +316,7 @@ int X86TTIImpl::getArithmeticInstrCost( if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second)) return LT.first * Entry->Cost; - static const CostTblEntry AVX2CostTable[] = { + static const CostTblEntry AVX2ShiftCostTable[] = { // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to // customize them to detect the cases where shift amount is a scalar one. { ISD::SHL, MVT::v4i32, 1 }, @@ -283,11 +340,11 @@ int X86TTIImpl::getArithmeticInstrCost( // is lowered into a vector multiply (vpmullw). return LT.first; - if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) + if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second)) return LT.first * Entry->Cost; } - static const CostTblEntry XOPCostTable[] = { + static const CostTblEntry XOPShiftCostTable[] = { // 128bit shifts take 1cy, but right shifts require negation beforehand. { ISD::SHL, MVT::v16i8, 1 }, { ISD::SRL, MVT::v16i8, 2 }, @@ -318,93 +375,20 @@ int X86TTIImpl::getArithmeticInstrCost( // Look for XOP lowering tricks. 
if (ST->hasXOP()) - if (const auto *Entry = CostTableLookup(XOPCostTable, ISD, LT.second)) + if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second)) return LT.first * Entry->Cost; - static const CostTblEntry AVX2CustomCostTable[] = { - { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence. - { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. - - { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence. - { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. - - { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence. - { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence. - { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence. - { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence. - - { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence. - { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence. - { ISD::MUL, MVT::v8i32, 1 }, // pmulld - { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add - - { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/ - { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ - { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/ - { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/ - { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/ - { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/ - }; - - // Look for AVX2 lowering tricks for custom cases. - if (ST->hasAVX2()) - if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD, - LT.second)) - return LT.first * Entry->Cost; - - static const CostTblEntry AVXCustomCostTable[] = { - { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence. 
- - { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/ - { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/ - { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/ - { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/ - { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/ - { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/ - - // Vectorizing division is a bad idea. See the SSE2 table for more comments. - { ISD::SDIV, MVT::v32i8, 32*20 }, - { ISD::SDIV, MVT::v16i16, 16*20 }, - { ISD::SDIV, MVT::v8i32, 8*20 }, - { ISD::SDIV, MVT::v4i64, 4*20 }, - { ISD::UDIV, MVT::v32i8, 32*20 }, - { ISD::UDIV, MVT::v16i16, 16*20 }, - { ISD::UDIV, MVT::v8i32, 8*20 }, - { ISD::UDIV, MVT::v4i64, 4*20 }, - }; - - // Look for AVX2 lowering tricks for custom cases. - if (ST->hasAVX()) - if (const auto *Entry = CostTableLookup(AVXCustomCostTable, ISD, - LT.second)) - return LT.first * Entry->Cost; - - static const CostTblEntry - SSE2UniformCostTable[] = { + static const CostTblEntry SSE2UniformShiftCostTable[] = { // Uniform splats are cheaper for the following instructions. - { ISD::SHL, MVT::v16i8, 1 }, // psllw. - { ISD::SHL, MVT::v32i8, 2 }, // psllw. - { ISD::SHL, MVT::v8i16, 1 }, // psllw. { ISD::SHL, MVT::v16i16, 2 }, // psllw. - { ISD::SHL, MVT::v4i32, 1 }, // pslld { ISD::SHL, MVT::v8i32, 2 }, // pslld - { ISD::SHL, MVT::v2i64, 1 }, // psllq. { ISD::SHL, MVT::v4i64, 2 }, // psllq. - { ISD::SRL, MVT::v16i8, 1 }, // psrlw. - { ISD::SRL, MVT::v32i8, 2 }, // psrlw. - { ISD::SRL, MVT::v8i16, 1 }, // psrlw. { ISD::SRL, MVT::v16i16, 2 }, // psrlw. - { ISD::SRL, MVT::v4i32, 1 }, // psrld. { ISD::SRL, MVT::v8i32, 2 }, // psrld. - { ISD::SRL, MVT::v2i64, 1 }, // psrlq. { ISD::SRL, MVT::v4i64, 2 }, // psrlq. - { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. - { ISD::SRA, MVT::v32i8, 8 }, // psrlw, pand, pxor, psubb. - { ISD::SRA, MVT::v8i16, 1 }, // psraw. { ISD::SRA, MVT::v16i16, 2 }, // psraw. 
- { ISD::SRA, MVT::v4i32, 1 }, // psrad. { ISD::SRA, MVT::v8i32, 2 }, // psrad. { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle. { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle. @@ -414,7 +398,7 @@ int X86TTIImpl::getArithmeticInstrCost( ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || (Op2Info == TargetTransformInfo::OK_UniformValue))) { if (const auto *Entry = - CostTableLookup(SSE2UniformCostTable, ISD, LT.second)) + CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second)) return LT.first * Entry->Cost; } @@ -422,24 +406,98 @@ int X86TTIImpl::getArithmeticInstrCost( Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) { MVT VT = LT.second; // Vector shift left by non uniform constant can be lowered - // into vector multiply (pmullw/pmulld). - if ((VT == MVT::v8i16 && ST->hasSSE2()) || - (VT == MVT::v4i32 && ST->hasSSE41())) - return LT.first; - - // v16i16 and v8i32 shifts by non-uniform constants are lowered into a - // sequence of extract + two vector multiply + insert. - if ((VT == MVT::v8i32 || VT == MVT::v16i16) && - (ST->hasAVX() && !ST->hasAVX2())) - ISD = ISD::MUL; - - // A vector shift left by non uniform constant is converted - // into a vector multiply; the new multiply is eventually - // lowered into a sequence of shuffles and 2 x pmuludq. - if (VT == MVT::v4i32 && ST->hasSSE2()) + // into vector multiply. + if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) || + ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX())) ISD = ISD::MUL; } + static const CostTblEntry AVX2CostTable[] = { + { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence. + { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. + + { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence. + { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. + + { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence. + { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence. + { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence. 
+ { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence. + + { ISD::SUB, MVT::v32i8, 1 }, // psubb + { ISD::ADD, MVT::v32i8, 1 }, // paddb + { ISD::SUB, MVT::v16i16, 1 }, // psubw + { ISD::ADD, MVT::v16i16, 1 }, // paddw + { ISD::SUB, MVT::v8i32, 1 }, // psubd + { ISD::ADD, MVT::v8i32, 1 }, // paddd + { ISD::SUB, MVT::v4i64, 1 }, // psubq + { ISD::ADD, MVT::v4i64, 1 }, // paddq + + { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v16i16, 1 }, // pmullw + { ISD::MUL, MVT::v8i32, 1 }, // pmulld + { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add + + { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/ + { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ + { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/ + { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/ + { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/ + { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/ + }; + + // Look for AVX2 lowering tricks for custom cases. + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry AVX1CostTable[] = { + // We don't have to scalarize unsupported ops. We can issue two half-sized + // operations and we only need to extract the upper YMM half. + // Two ops + 1 extract + 1 insert = 4. 
+ { ISD::MUL, MVT::v16i16, 4 }, + { ISD::MUL, MVT::v8i32, 4 }, + { ISD::SUB, MVT::v32i8, 4 }, + { ISD::ADD, MVT::v32i8, 4 }, + { ISD::SUB, MVT::v16i16, 4 }, + { ISD::ADD, MVT::v16i16, 4 }, + { ISD::SUB, MVT::v8i32, 4 }, + { ISD::ADD, MVT::v8i32, 4 }, + { ISD::SUB, MVT::v4i64, 4 }, + { ISD::ADD, MVT::v4i64, 4 }, + + // A v4i64 multiply is custom lowered as two split v2i64 vectors that then + // are lowered as a series of long multiplies(3), shifts(3) and adds(2) + // Because we believe v4i64 to be a legal type, we must also include the + // extract+insert in the cost table. Therefore, the cost here is 18 + // instead of 8. + { ISD::MUL, MVT::v4i64, 18 }, + + { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence. + + { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/ + { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/ + { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/ + { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/ + { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/ + { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/ + + // Vectorizing division is a bad idea. See the SSE2 table for more comments. + { ISD::SDIV, MVT::v32i8, 32*20 }, + { ISD::SDIV, MVT::v16i16, 16*20 }, + { ISD::SDIV, MVT::v8i32, 8*20 }, + { ISD::SDIV, MVT::v4i64, 4*20 }, + { ISD::UDIV, MVT::v32i8, 32*20 }, + { ISD::UDIV, MVT::v16i16, 16*20 }, + { ISD::UDIV, MVT::v8i32, 8*20 }, + { ISD::UDIV, MVT::v4i64, 4*20 }, + }; + + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + static const CostTblEntry SSE42CostTable[] = { { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/ { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/ @@ -456,6 +514,8 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::SHL, MVT::v32i8, 2*11 }, // pblendvb sequence. { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence. 
{ ISD::SHL, MVT::v16i16, 2*14 }, // pblendvb sequence. + { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld + { ISD::SHL, MVT::v8i32, 2*4 }, // pslld/paddd/cvttps2dq/pmulld { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence. { ISD::SRL, MVT::v32i8, 2*12 }, // pblendvb sequence. @@ -501,6 +561,7 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence. { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v8i16, 1 }, // pmullw { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add @@ -516,46 +577,19 @@ int X86TTIImpl::getArithmeticInstrCost( // generally a bad idea. Assume somewhat arbitrarily that we have to be able // to hide "20 cycles" for each lane. { ISD::SDIV, MVT::v16i8, 16*20 }, - { ISD::SDIV, MVT::v8i16, 8*20 }, - { ISD::SDIV, MVT::v4i32, 4*20 }, - { ISD::SDIV, MVT::v2i64, 2*20 }, + { ISD::SDIV, MVT::v8i16, 8*20 }, + { ISD::SDIV, MVT::v4i32, 4*20 }, + { ISD::SDIV, MVT::v2i64, 2*20 }, { ISD::UDIV, MVT::v16i8, 16*20 }, - { ISD::UDIV, MVT::v8i16, 8*20 }, - { ISD::UDIV, MVT::v4i32, 4*20 }, - { ISD::UDIV, MVT::v2i64, 2*20 }, + { ISD::UDIV, MVT::v8i16, 8*20 }, + { ISD::UDIV, MVT::v4i32, 4*20 }, + { ISD::UDIV, MVT::v2i64, 2*20 }, }; if (ST->hasSSE2()) if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) return LT.first * Entry->Cost; - static const CostTblEntry AVX1CostTable[] = { - // We don't have to scalarize unsupported ops. We can issue two half-sized - // operations and we only need to extract the upper YMM half. - // Two ops + 1 extract + 1 insert = 4. 
- { ISD::MUL, MVT::v16i16, 4 }, - { ISD::MUL, MVT::v8i32, 4 }, - { ISD::SUB, MVT::v32i8, 4 }, - { ISD::ADD, MVT::v32i8, 4 }, - { ISD::SUB, MVT::v16i16, 4 }, - { ISD::ADD, MVT::v16i16, 4 }, - { ISD::SUB, MVT::v8i32, 4 }, - { ISD::ADD, MVT::v8i32, 4 }, - { ISD::SUB, MVT::v4i64, 4 }, - { ISD::ADD, MVT::v4i64, 4 }, - // A v4i64 multiply is custom lowered as two split v2i64 vectors that then - // are lowered as a series of long multiplies(3), shifts(3) and adds(2) - // Because we believe v4i64 to be a legal type, we must also include the - // extract+insert in the cost table. Therefore, the cost here is 18 - // instead of 8. - { ISD::MUL, MVT::v4i64, 18 }, - }; - - // Look for AVX1 lowering tricks. - if (ST->hasAVX() && !ST->hasAVX2()) - if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second)) - return LT.first * Entry->Cost; - static const CostTblEntry SSE1CostTable[] = { { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/ { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/ @@ -639,8 +673,7 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw - { TTI::SK_Reverse, MVT::v64i8, 6 }, // vextracti64x4 + 2*vperm2i128 - // + 2*pshufb + vinserti64x4 + { TTI::SK_Reverse, MVT::v64i8, 2 }, // pshufb + vshufi64x2 { TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw { TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp index f4742aaf748f1..82daf754be0dd 100644 --- a/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/lib/Transforms/IPO/LowerTypeTests.cpp @@ -42,6 +42,8 @@ using namespace llvm; using namespace lowertypetests; +using SummaryAction = LowerTypeTestsSummaryAction; + #define DEBUG_TYPE "lowertypetests" STATISTIC(ByteArraySizeBits, "Byte array size in bits"); @@ -55,9 +57,15 @@ static cl::opt<bool> AvoidReuse( 
cl::desc("Try to avoid reuse of byte array addresses using aliases"), cl::Hidden, cl::init(true)); -static cl::opt<std::string> ClSummaryAction( +static cl::opt<SummaryAction> ClSummaryAction( "lowertypetests-summary-action", - cl::desc("What to do with the summary when running this pass"), cl::Hidden); + cl::desc("What to do with the summary when running this pass"), + cl::values(clEnumValN(SummaryAction::None, "none", "Do nothing"), + clEnumValN(SummaryAction::Import, "import", + "Import typeid resolutions from summary and globals"), + clEnumValN(SummaryAction::Export, "export", + "Export typeid resolutions to summary and globals")), + cl::Hidden); static cl::opt<std::string> ClReadSummary( "lowertypetests-read-summary", @@ -226,8 +234,8 @@ public: class LowerTypeTestsModule { Module &M; - // This is for testing purposes only. - std::unique_ptr<ModuleSummaryIndex> OwnedSummary; + SummaryAction Action; + ModuleSummaryIndex *Summary; bool LinkerSubsectionsViaSymbols; Triple::ArchType Arch; @@ -319,21 +327,38 @@ class LowerTypeTestsModule { void createJumpTable(Function *F, ArrayRef<GlobalTypeMember *> Functions); public: - LowerTypeTestsModule(Module &M); - ~LowerTypeTestsModule(); + LowerTypeTestsModule(Module &M, SummaryAction Action, + ModuleSummaryIndex *Summary); bool lower(); + + // Lower the module using the action and summary passed as command line + // arguments. For testing purposes only. 
+ static bool runForTesting(Module &M); }; struct LowerTypeTests : public ModulePass { static char ID; - LowerTypeTests() : ModulePass(ID) { + + bool UseCommandLine = false; + + SummaryAction Action; + ModuleSummaryIndex *Summary; + + LowerTypeTests() : ModulePass(ID), UseCommandLine(true) { + initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry()); + } + + LowerTypeTests(SummaryAction Action, ModuleSummaryIndex *Summary) + : ModulePass(ID), Action(Action), Summary(Summary) { initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry()); } bool runOnModule(Module &M) override { if (skipModule(M)) return false; - return LowerTypeTestsModule(M).lower(); + if (UseCommandLine) + return LowerTypeTestsModule::runForTesting(M); + return LowerTypeTestsModule(M, Action, Summary).lower(); } }; @@ -343,7 +368,10 @@ INITIALIZE_PASS(LowerTypeTests, "lowertypetests", "Lower type metadata", false, false) char LowerTypeTests::ID = 0; -ModulePass *llvm::createLowerTypeTestsPass() { return new LowerTypeTests; } +ModulePass *llvm::createLowerTypeTestsPass(SummaryAction Action, + ModuleSummaryIndex *Summary) { + return new LowerTypeTests(Action, Summary); +} /// Build a bit set for TypeId using the object layouts in /// GlobalLayout. @@ -1145,22 +1173,12 @@ void LowerTypeTestsModule::buildBitSetsFromDisjointSet( } /// Lower all type tests in this module. -LowerTypeTestsModule::LowerTypeTestsModule(Module &M) : M(M) { - // Handle the command-line summary arguments. This code is for testing - // purposes only, so we handle errors directly. 
- if (!ClSummaryAction.empty()) { - OwnedSummary = make_unique<ModuleSummaryIndex>(); - if (!ClReadSummary.empty()) { - ExitOnError ExitOnErr("-lowertypetests-read-summary: " + ClReadSummary + - ": "); - auto ReadSummaryFile = - ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary))); - - yaml::Input In(ReadSummaryFile->getBuffer()); - In >> *OwnedSummary; - ExitOnErr(errorCodeToError(In.error())); - } - } +LowerTypeTestsModule::LowerTypeTestsModule(Module &M, SummaryAction Action, + ModuleSummaryIndex *Summary) + : M(M), Action(Action), Summary(Summary) { + // FIXME: Use these fields. + (void)this->Action; + (void)this->Summary; Triple TargetTriple(M.getTargetTriple()); LinkerSubsectionsViaSymbols = TargetTriple.isMacOSX(); @@ -1169,18 +1187,36 @@ LowerTypeTestsModule::LowerTypeTestsModule(Module &M) : M(M) { ObjectFormat = TargetTriple.getObjectFormat(); } -LowerTypeTestsModule::~LowerTypeTestsModule() { - if (ClSummaryAction.empty() || ClWriteSummary.empty()) - return; +bool LowerTypeTestsModule::runForTesting(Module &M) { + ModuleSummaryIndex Summary; - ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary + - ": "); - std::error_code EC; - raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text); - ExitOnErr(errorCodeToError(EC)); + // Handle the command-line summary arguments. This code is for testing + // purposes only, so we handle errors directly. 
+ if (!ClReadSummary.empty()) { + ExitOnError ExitOnErr("-lowertypetests-read-summary: " + ClReadSummary + + ": "); + auto ReadSummaryFile = + ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary))); + + yaml::Input In(ReadSummaryFile->getBuffer()); + In >> Summary; + ExitOnErr(errorCodeToError(In.error())); + } + + bool Changed = LowerTypeTestsModule(M, ClSummaryAction, &Summary).lower(); + + if (!ClWriteSummary.empty()) { + ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary + + ": "); + std::error_code EC; + raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text); + ExitOnErr(errorCodeToError(EC)); + + yaml::Output Out(OS); + Out << Summary; + } - yaml::Output Out(OS); - Out << *OwnedSummary; + return Changed; } bool LowerTypeTestsModule::lower() { @@ -1313,7 +1349,8 @@ bool LowerTypeTestsModule::lower() { PreservedAnalyses LowerTypeTestsPass::run(Module &M, ModuleAnalysisManager &AM) { - bool Changed = LowerTypeTestsModule(M).lower(); + bool Changed = + LowerTypeTestsModule(M, SummaryAction::None, /*Summary=*/nullptr).lower(); if (!Changed) return PreservedAnalyses::all(); return PreservedAnalyses::none(); diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 293ddf21a68f7..d086ee05a64fe 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -857,7 +857,8 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) { // Lower type metadata and the type.test intrinsic. This pass supports Clang's // control flow integrity mechanisms (-fsanitize=cfi*) and needs to run at // link time if CFI is enabled. The pass does nothing if CFI is disabled. 
- PM.add(createLowerTypeTestsPass()); + PM.add(createLowerTypeTestsPass(LowerTypeTestsSummaryAction::None, + /*Summary=*/nullptr)); if (OptLevel != 0) addLateLTOOptimizationPasses(PM); diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 012bfc7b4944c..013159cde7740 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1903,7 +1903,7 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp, return foldICmpShlOne(Cmp, Shl, C); // Check that the shift amount is in range. If not, don't perform undefined - // shifts. When the shift is visited it will be simplified. + // shifts. When the shift is visited, it will be simplified. unsigned TypeBits = C->getBitWidth(); if (ShiftAmt->uge(TypeBits)) return nullptr; @@ -1923,7 +1923,7 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp, return new ICmpInst(Pred, X, LShrC); if (Shl->hasOneUse()) { - // Otherwise strength reduce the shift into an and. + // Otherwise, strength reduce the shift into an and. Constant *Mask = ConstantInt::get(Shl->getType(), APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt->getZExtValue())); @@ -1951,7 +1951,7 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp, } // When the shift is nuw and pred is >u or <=u, comparison only really happens - // in the pre-shifted bits. Since InstSimplify canoncalizes <=u into <u, the + // in the pre-shifted bits. Since InstSimplify canonicalizes <=u into <u, the // <=u case can be further converted to match <u (see below). if (Shl->hasNoUnsignedWrap() && (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULT)) { @@ -1970,9 +1970,9 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp, // Transform (icmp pred iM (shl iM %v, N), C) // -> (icmp pred i(M-N) (trunc %v iM to i(M-N)), (trunc (C>>N)) // Transform the shl to a trunc if (trunc (C>>N)) has no loss and M-N. 
- // This enables us to get rid of the shift in favor of a trunc which can be + // This enables us to get rid of the shift in favor of a trunc that may be // free on the target. It has the additional benefit of comparing to a - // smaller constant, which will be target friendly. + // smaller constant that may be more target-friendly. unsigned Amt = ShiftAmt->getLimitedValue(TypeBits - 1); if (Shl->hasOneUse() && Amt != 0 && C->countTrailingZeros() >= Amt && DL.isLegalInteger(TypeBits - Amt)) { diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 1d55283987766..54bdc9e0772b5 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1818,6 +1818,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) { RegisteredFlag = new GlobalVariable( M, IntptrTy, false, GlobalVariable::CommonLinkage, ConstantInt::get(IntptrTy, 0), kAsanGlobalsRegisteredFlagName); + RegisteredFlag->setVisibility(GlobalVariable::HiddenVisibility); // Update llvm.compiler.used, adding the new liveness globals. This is // needed so that during LTO these variables stay alive. The alternative diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 6aeb5237ffe35..68faa886060a1 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -1423,7 +1423,7 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { if (widenLoopCompare(DU)) return nullptr; - // This user does not evaluate to a recurence after widening, so don't + // This user does not evaluate to a recurrence after widening, so don't // follow it. Instead insert a Trunc to kill off the original use, // eventually isolating the original narrow IV so it can be removed. 
truncateIVUse(DU, DT, LI); diff --git a/lib/Transforms/Scalar/LoopLoadElimination.cpp b/lib/Transforms/Scalar/LoopLoadElimination.cpp index 08e7acdaaf72c..8fb580183e307 100644 --- a/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -415,7 +415,9 @@ public: Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(), PH->getTerminator()); Value *Initial = - new LoadInst(InitialPtr, "load_initial", PH->getTerminator()); + new LoadInst(InitialPtr, "load_initial", /* isVolatile */ false, + Cand.Load->getAlignment(), PH->getTerminator()); + PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded", &L->getHeader()->front()); PHI->addIncoming(Initial, PH); diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 6f7682c96cefb..76fe91884c7b8 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -1382,8 +1382,8 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { Pred->getInstList().splice(BI->getIterator(), Succ->getInstList(), Succ->begin(), Succ->end()); LPM->deleteSimpleAnalysisValue(BI, L); - BI->eraseFromParent(); RemoveFromWorklist(BI, Worklist); + BI->eraseFromParent(); // Remove Succ from the loop tree. 
LI->removeBlock(Succ); diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp index 8b8236390bf47..eef7db08cd460 100644 --- a/lib/Transforms/Scalar/NewGVN.cpp +++ b/lib/Transforms/Scalar/NewGVN.cpp @@ -79,7 +79,8 @@ STATISTIC(NumGVNInstrDeleted, "Number of instructions deleted"); STATISTIC(NumGVNBlocksDeleted, "Number of blocks deleted"); STATISTIC(NumGVNOpsSimplified, "Number of Expressions simplified"); STATISTIC(NumGVNPhisAllSame, "Number of PHIs whos arguments are all the same"); -STATISTIC(NumGVNMaxIterations, "Maximum Number of iterations it took to converge GVN"); +STATISTIC(NumGVNMaxIterations, + "Maximum Number of iterations it took to converge GVN"); //===----------------------------------------------------------------------===// // GVN Pass @@ -327,7 +328,7 @@ private: // Elimination. struct ValueDFS; void convertDenseToDFSOrdered(CongruenceClass::MemberSet &, - std::vector<ValueDFS> &); + SmallVectorImpl<ValueDFS> &); bool eliminateInstructions(Function &); void replaceInstruction(Instruction *, Value *); @@ -336,8 +337,11 @@ private: // New instruction creation. void handleNewInstruction(Instruction *){}; + + // Various instruction touch utilities void markUsersTouched(Value *); void markMemoryUsersTouched(MemoryAccess *); + void markLeaderChangeTouched(CongruenceClass *CC); // Utilities. 
void cleanupTables(); @@ -390,10 +394,10 @@ INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(NewGVN, "newgvn", "Global Value Numbering", false, false) PHIExpression *NewGVN::createPHIExpression(Instruction *I) { - BasicBlock *PhiBlock = I->getParent(); + BasicBlock *PHIBlock = I->getParent(); auto *PN = cast<PHINode>(I); - auto *E = new (ExpressionAllocator) - PHIExpression(PN->getNumOperands(), I->getParent()); + auto *E = + new (ExpressionAllocator) PHIExpression(PN->getNumOperands(), PHIBlock); E->allocateOperands(ArgRecycler, ExpressionAllocator); E->setType(I->getType()); @@ -408,10 +412,10 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I) { std::transform(Filtered.begin(), Filtered.end(), op_inserter(E), [&](const Use &U) -> Value * { - // Don't try to transform self-defined phis + // Don't try to transform self-defined phis. if (U == PN) return PN; - const BasicBlockEdge BBE(PN->getIncomingBlock(U), PhiBlock); + const BasicBlockEdge BBE(PN->getIncomingBlock(U), PHIBlock); return lookupOperandLeader(U, I, BBE); }); return E; @@ -710,6 +714,15 @@ const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI, return E; } +// Utility function to check whether the congruence class has a member other +// than the given instruction. +bool hasMemberOtherThanUs(const CongruenceClass *CC, Instruction *I) { + // Either it has more than one member, in which case it must contain something + // other than us (because it's indexed by value), or if it only has one member + // right now, that member should not be us. 
+ return CC->Members.size() > 1 || CC->Members.count(I) == 0; +} + const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I, const BasicBlock *B) { // Unlike loads, we never try to eliminate stores, so we do not check if they @@ -725,8 +738,12 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I, cast<MemoryDef>(StoreAccess)->getDefiningAccess()); const Expression *OldStore = createStoreExpression(SI, StoreRHS, B); CongruenceClass *CC = ExpressionToClass.lookup(OldStore); + // Basically, check if the congruence class the store is in is defined by a + // store that isn't us, and has the same value. MemorySSA takes care of + // ensuring the store has the same memory state as us already. if (CC && CC->DefiningExpr && isa<StoreExpression>(CC->DefiningExpr) && - CC->RepLeader == lookupOperandLeader(SI->getValueOperand(), SI, B)) + CC->RepLeader == lookupOperandLeader(SI->getValueOperand(), SI, B) && + hasMemberOtherThanUs(CC, I)) return createStoreExpression(SI, StoreRHS, B); } @@ -810,36 +827,50 @@ bool NewGVN::setMemoryAccessEquivTo(MemoryAccess *From, MemoryAccess *To) { const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I, const BasicBlock *B) { auto *E = cast<PHIExpression>(createPHIExpression(I)); - if (E->op_empty()) { + // We match the semantics of SimplifyPhiNode from InstructionSimplify here. + + // See if all arguaments are the same. + // We track if any were undef because they need special handling. 
+ bool HasUndef = false; + auto Filtered = make_filter_range(E->operands(), [&](const Value *Arg) { + if (Arg == I) + return false; + if (isa<UndefValue>(Arg)) { + HasUndef = true; + return false; + } + return true; + }); + // If we are left with no operands, it's undef + if (Filtered.begin() == Filtered.end()) { DEBUG(dbgs() << "Simplified PHI node " << *I << " to undef" << "\n"); E->deallocateOperands(ArgRecycler); ExpressionAllocator.Deallocate(E); return createConstantExpression(UndefValue::get(I->getType())); } - - Value *AllSameValue = E->getOperand(0); - - // See if all arguments are the same, ignoring undef arguments, because we can - // choose a value that is the same for them. - for (const Value *Arg : E->operands()) - if (Arg != AllSameValue && !isa<UndefValue>(Arg)) { - AllSameValue = nullptr; - break; + Value *AllSameValue = *(Filtered.begin()); + ++Filtered.begin(); + // Can't use std::equal here, sadly, because filter.begin moves. + if (llvm::all_of(Filtered, [AllSameValue](const Value *V) { + return V == AllSameValue; + })) { + // In LLVM's non-standard representation of phi nodes, it's possible to have + // phi nodes with cycles (IE dependent on other phis that are .... dependent + // on the original phi node), especially in weird CFG's where some arguments + // are unreachable, or uninitialized along certain paths. This can cause + // infinite loops during evaluation. We work around this by not trying to + // really evaluate them independently, but instead using a variable + // expression to say if one is equivalent to the other. + // We also special case undef, so that if we have an undef, we can't use the + // common value unless it dominates the phi block. + if (HasUndef) { + // Only have to check for instructions + if (auto *AllSameInst = dyn_cast<Instruction>(AllSameValue)) + if (!DT->dominates(AllSameInst, I)) + return E; } - if (AllSameValue) { - // It's possible to have phi nodes with cycles (IE dependent on - // other phis that are .... 
dependent on the original phi node), - // especially in weird CFG's where some arguments are unreachable, or - // uninitialized along certain paths. - // This can cause infinite loops during evaluation (even if you disable - // the recursion below, you will simply ping-pong between congruence - // classes). If a phi node symbolically evaluates to another phi node, - // just leave it alone. If they are really the same, we will still - // eliminate them in favor of each other. - if (isa<PHINode>(AllSameValue)) - return E; NumGVNPhisAllSame++; DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue << "\n"); @@ -1007,12 +1038,22 @@ void NewGVN::markMemoryUsersTouched(MemoryAccess *MA) { } } +// Touch the instructions that need to be updated after a congruence class has a +// leader change, and mark changed values. +void NewGVN::markLeaderChangeTouched(CongruenceClass *CC) { + for (auto M : CC->Members) { + if (auto *I = dyn_cast<Instruction>(M)) + TouchedInstructions.set(InstrDFS[I]); + ChangedValues.insert(M); + } +} + // Perform congruence finding on a given value numbering expression. void NewGVN::performCongruenceFinding(Value *V, const Expression *E) { - ValueToExpression[V] = E; // This is guaranteed to return something, since it will at least find // INITIAL. + CongruenceClass *VClass = ValueToClass[V]; assert(VClass && "Should have found a vclass"); // Dead classes should have been eliminated from the mapping. @@ -1031,14 +1072,17 @@ void NewGVN::performCongruenceFinding(Value *V, const Expression *E) { place->second = NewClass; // Constants and variables should always be made the leader. 
- if (const auto *CE = dyn_cast<ConstantExpression>(E)) + if (const auto *CE = dyn_cast<ConstantExpression>(E)) { NewClass->RepLeader = CE->getConstantValue(); - else if (const auto *VE = dyn_cast<VariableExpression>(E)) - NewClass->RepLeader = VE->getVariableValue(); - else if (const auto *SE = dyn_cast<StoreExpression>(E)) - NewClass->RepLeader = SE->getStoreInst()->getValueOperand(); - else + } else if (const auto *SE = dyn_cast<StoreExpression>(E)) { + StoreInst *SI = SE->getStoreInst(); + NewClass->RepLeader = + lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent()); + } else { NewClass->RepLeader = V; + } + assert(!isa<VariableExpression>(E) && + "VariableExpression should have been handled already"); EClass = NewClass; DEBUG(dbgs() << "Created new congruence class for " << *V @@ -1077,14 +1121,11 @@ void NewGVN::performCongruenceFinding(Value *V, const Expression *E) { ExpressionToClass.erase(VClass->DefiningExpr); } } else if (VClass->RepLeader == V) { - // FIXME: When the leader changes, the value numbering of - // everything may change, so we need to reprocess. + // When the leader changes, the value numbering of + // everything may change due to symbolization changes, so we need to + // reprocess. VClass->RepLeader = *(VClass->Members.begin()); - for (auto M : VClass->Members) { - if (auto *I = dyn_cast<Instruction>(M)) - TouchedInstructions.set(InstrDFS[I]); - ChangedValues.insert(M); - } + markLeaderChangeTouched(VClass); } } @@ -1106,6 +1147,27 @@ void NewGVN::performCongruenceFinding(Value *V, const Expression *E) { markMemoryUsersTouched(MA); } } + } else if (StoreInst *SI = dyn_cast<StoreInst>(V)) { + // There is, sadly, one complicating thing for stores. Stores do not + // produce values, only consume them. However, in order to make loads and + // stores value number the same, we ignore the value operand of the store. + // But the value operand will still be the leader of our class, and thus, it + // may change. 
Because the store is a use, the store will get reprocessed, + // but nothing will change about it, and so nothing above will catch it + // (since the class will not change). In order to make sure everything ends + // up okay, we need to recheck the leader of the class. Since stores of + // different values value number differently due to different memorydefs, we + // are guaranteed the leader is always the same between stores in the same + // class. + DEBUG(dbgs() << "Checking store leader\n"); + auto ProperLeader = + lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent()); + if (EClass->RepLeader != ProperLeader) { + DEBUG(dbgs() << "Store leader changed, fixing\n"); + EClass->RepLeader = ProperLeader; + markLeaderChangeTouched(EClass); + markMemoryUsersTouched(MSSA->getMemoryAccess(SI)); + } } } @@ -1708,8 +1770,9 @@ struct NewGVN::ValueDFS { } }; -void NewGVN::convertDenseToDFSOrdered(CongruenceClass::MemberSet &Dense, - std::vector<ValueDFS> &DFSOrderedSet) { +void NewGVN::convertDenseToDFSOrdered( + CongruenceClass::MemberSet &Dense, + SmallVectorImpl<ValueDFS> &DFSOrderedSet) { for (auto D : Dense) { // First add the value. BasicBlock *BB = getBlockForValue(D); @@ -1972,21 +2035,25 @@ bool NewGVN::eliminateInstructions(Function &F) { ValueDFSStack EliminationStack; // Convert the members to DFS ordered sets and then merge them. - std::vector<ValueDFS> DFSOrderedSet; + SmallVector<ValueDFS, 8> DFSOrderedSet; convertDenseToDFSOrdered(CC->Members, DFSOrderedSet); // Sort the whole thing. - sort(DFSOrderedSet.begin(), DFSOrderedSet.end()); - - for (auto &C : DFSOrderedSet) { - int MemberDFSIn = C.DFSIn; - int MemberDFSOut = C.DFSOut; - Value *Member = C.Val; - Use *MemberUse = C.U; - - // We ignore void things because we can't get a value from them. 
- if (Member && Member->getType()->isVoidTy()) - continue; + std::sort(DFSOrderedSet.begin(), DFSOrderedSet.end()); + + for (auto &VD : DFSOrderedSet) { + int MemberDFSIn = VD.DFSIn; + int MemberDFSOut = VD.DFSOut; + Value *Member = VD.Val; + Use *MemberUse = VD.U; + + if (Member) { + // We ignore void things because we can't get a value from them. + // FIXME: We could actually use this to kill dead stores that are + // dominated by equivalent earlier stores. + if (Member->getType()->isVoidTy()) + continue; + } if (EliminationStack.empty()) { DEBUG(dbgs() << "Elimination Stack is empty\n"); @@ -1995,8 +2062,6 @@ bool NewGVN::eliminateInstructions(Function &F) { << EliminationStack.dfs_back().first << "," << EliminationStack.dfs_back().second << ")\n"); } - if (Member && isa<Constant>(Member)) - assert(isa<Constant>(CC->RepLeader)); DEBUG(dbgs() << "Current DFS numbers are (" << MemberDFSIn << "," << MemberDFSOut << ")\n"); @@ -2037,11 +2102,8 @@ bool NewGVN::eliminateInstructions(Function &F) { continue; Value *Result = EliminationStack.back(); - // Don't replace our existing users with ourselves, and don't replace - // phi node arguments with the result of the same phi node. - // IE tmp = phi(tmp11, undef); tmp11 = foo -> tmp = phi(tmp, undef) - if (MemberUse->get() == Result || - (isa<PHINode>(Result) && MemberUse->getUser() == Result)) + // Don't replace our existing users with ourselves. 
+ if (MemberUse->get() == Result) continue; DEBUG(dbgs() << "Found replacement " << *Result << " for " diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 8a6be97d08c7c..34be90692481b 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -511,9 +511,6 @@ private: void visitSelectInst(SelectInst &I); void visitBinaryOperator(Instruction &I); void visitCmpInst(CmpInst &I); - void visitExtractElementInst(ExtractElementInst &I); - void visitInsertElementInst(InsertElementInst &I); - void visitShuffleVectorInst(ShuffleVectorInst &I); void visitExtractValueInst(ExtractValueInst &EVI); void visitInsertValueInst(InsertValueInst &IVI); void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); } @@ -970,21 +967,6 @@ void SCCPSolver::visitCmpInst(CmpInst &I) { markOverdefined(&I); } -void SCCPSolver::visitExtractElementInst(ExtractElementInst &I) { - // TODO : SCCP does not handle vectors properly. - return markOverdefined(&I); -} - -void SCCPSolver::visitInsertElementInst(InsertElementInst &I) { - // TODO : SCCP does not handle vectors properly. - return markOverdefined(&I); -} - -void SCCPSolver::visitShuffleVectorInst(ShuffleVectorInst &I) { - // TODO : SCCP does not handle vectors properly. - return markOverdefined(&I); -} - // Handle getelementptr instructions. If all operands are constants then we // can turn this into a getelementptr ConstantExpr. // diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp index 678d02e05d423..9844190ef84a2 100644 --- a/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -67,12 +67,15 @@ bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal( return true; } - // When exporting, consult the index. 
- auto Summaries = ImportIndex.findGlobalValueSummaryList(SGV->getGUID()); - assert(Summaries != ImportIndex.end() && - "Missing summary for global value when exporting"); - assert(Summaries->second.size() == 1 && "Local has more than one summary"); - auto Linkage = Summaries->second.front()->linkage(); + // When exporting, consult the index. We can have more than one local + // with the same GUID, in the case of same-named locals in different but + // same-named source files that were compiled in their respective directories + // (so the source file name and resulting GUID is the same). Find the one + // in this module. + auto Summary = ImportIndex.findSummaryInModule( + SGV->getGUID(), SGV->getParent()->getModuleIdentifier()); + assert(Summary && "Missing summary for global value when exporting"); + auto Linkage = Summary->linkage(); if (!GlobalValue::isLocalLinkage(Linkage)) { assert(!isNonRenamableLocal(*SGV) && "Attempting to promote non-renamable local"); diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index c8f030f7eb835..11d54bcf4f89d 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -1189,19 +1189,11 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeFabs(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - Value *Ret = nullptr; StringRef Name = Callee->getName(); if (Name == "fabs" && hasFloatVersion(Name)) - Ret = optimizeUnaryDoubleFP(CI, B, false); + return optimizeUnaryDoubleFP(CI, B, false); - Value *Op = CI->getArgOperand(0); - if (Instruction *I = dyn_cast<Instruction>(Op)) { - // Fold fabs(x * x) -> x * x; any squared FP value must already be positive. 
- if (I->getOpcode() == Instruction::FMul) - if (I->getOperand(0) == I->getOperand(1)) - return Op; - } - return Ret; + return nullptr; } Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) { diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 31daba2248aae..578c65daf7c0f 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -783,6 +783,10 @@ protected: // Similarly, we create a new latch condition when setting up the structure // of the new loop, so the old one can become dead. SmallPtrSet<Instruction *, 4> DeadInstructions; + + // Holds the end values for each induction variable. We save the end values + // so we can later fix-up the external users of the induction variables. + DenseMap<PHINode *, Value *> IVEndValues; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -1879,13 +1883,6 @@ public: unsigned selectInterleaveCount(bool OptForSize, unsigned VF, unsigned LoopCost); - /// \return The most profitable unroll factor. - /// This method finds the best unroll-factor based on register pressure and - /// other parameters. VF and LoopCost are the selected vectorization factor - /// and the cost of the selected VF. - unsigned computeInterleaveCount(bool OptForSize, unsigned VF, - unsigned LoopCost); - /// \brief A struct that represents some properties of the register usage /// of a loop. struct RegisterUsage { @@ -3424,7 +3421,7 @@ void InnerLoopVectorizer::createEmptyLoop() { // Create phi nodes to merge from the backedge-taken check block. PHINode *BCResumeVal = PHINode::Create( OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator()); - Value *EndValue; + Value *&EndValue = IVEndValues[OrigPhi]; if (OrigPhi == OldInduction) { // We know what the end value is. EndValue = CountRoundDown; @@ -3443,9 +3440,6 @@ void InnerLoopVectorizer::createEmptyLoop() { // or the value at the end of the vectorized loop. 
BCResumeVal->addIncoming(EndValue, MiddleBlock); - // Fix up external users of the induction variable. - fixupIVUsers(OrigPhi, II, CountRoundDown, EndValue, MiddleBlock); - // Fix the scalar body counter (PHI node). unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); @@ -4116,11 +4110,23 @@ void InnerLoopVectorizer::vectorizeLoop() { Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); } // end of for each Phi in PHIsToFix. - fixLCSSAPHIs(); - - // Make sure DomTree is updated. + // Update the dominator tree. + // + // FIXME: After creating the structure of the new loop, the dominator tree is + // no longer up-to-date, and it remains that way until we update it + // here. An out-of-date dominator tree is problematic for SCEV, + // because SCEVExpander uses it to guide code generation. The + // vectorizer use SCEVExpanders in several places. Instead, we should + // keep the dominator tree up-to-date as we go. updateAnalysis(); + // Fix-up external users of the induction variables. + for (auto &Entry : *Legal->getInductionVars()) + fixupIVUsers(Entry.first, Entry.second, + getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), + IVEndValues[Entry.first], LoopMiddleBlock); + + fixLCSSAPHIs(); predicateInstructions(); // Remove redundant induction instructions. |