| author    | Dimitry Andric <dim@FreeBSD.org> | 2016-01-06 20:01:02 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2016-01-06 20:01:02 +0000 |
| commit    | 8a6c1c25bce0267ee4072bd7b786b921e8a66a35 (patch) | |
| tree      | ea70b740d40cffe568a990c7aecd1acb5f83f786 /lib/Target | |
| parent    | 84fe440ded1bfc237d720c49408b36798d67ceff (diff) | |
Diffstat (limited to 'lib/Target')
49 files changed, 980 insertions, 596 deletions
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index 0bff9b592c15..46ef2c111bae 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -124,6 +124,14 @@ def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", FeaturePerfMon, FeatureZCRegMove, FeatureZCZeroing]>; +def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1", + "Samsung Exynos-M1 processors", + [FeatureFPARMv8, + FeatureNEON, + FeatureCrypto, + FeatureCRC, + FeaturePerfMon]>; + def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, FeatureNEON, FeatureCRC, @@ -136,6 +144,8 @@ def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; // FIXME: Cortex-A72 is currently modelled as an Cortex-A57. def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA57]>; def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; +// FIXME: Exynos-M1 is currently modelled without a specific SchedModel. +def : ProcessorModel<"exynos-m1", NoSchedModel, [ProcExynosM1]>; //===----------------------------------------------------------------------===// // Assembly parser diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp index 79a84ad8c6c5..3d1ab4e3fc2b 100644 --- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp +++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -158,7 +158,7 @@ INITIALIZE_PASS_END(AArch64A57FPLoadBalancing, DEBUG_TYPE, "AArch64 A57 FP Load-Balancing", false, false) namespace { -/// A Chain is a sequence of instructions that are linked together by +/// A Chain is a sequence of instructions that are linked together by /// an accumulation operand. For example: /// /// fmul d0<def>, ? @@ -285,7 +285,7 @@ public: std::string str() const { std::string S; raw_string_ostream OS(S); - + OS << "{"; StartInst->print(OS, /* SkipOpers= */true); OS << " -> "; @@ -427,7 +427,7 @@ Chain *AArch64A57FPLoadBalancing::getAndEraseNext(Color PreferredColor, return Ch; } } - + // Bailout case - just return the first item. Chain *Ch = L.front(); L.erase(L.begin()); @@ -495,7 +495,7 @@ int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C, RS.enterBasicBlock(&MBB); RS.forward(MachineBasicBlock::iterator(G->getStart())); - // Can we find an appropriate register that is available throughout the life + // Can we find an appropriate register that is available throughout the life // of the chain? unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass; BitVector AvailableRegs = RS.getRegsAvailable(TRI->getRegClass(RegClassID)); diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 9f5beff12100..4ecfbe9e2280 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2426,7 +2426,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( continue; } - + if (VA.isRegLoc()) { // Arguments stored in registers. EVT RegVT = VA.getLocVT(); @@ -5074,7 +5074,7 @@ static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT, // The index of an EXT is the first element if it is not UNDEF. // Watch out for the beginning UNDEFs. The EXT index should be the expected - // value of the first element. E.g. + // value of the first element. E.g. // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. // ExpectedElt is the last mask index plus 1. 
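The isEXTMask hunk above keeps the comment describing how leading undef lanes are skipped when deriving the EXT start index: <-1, -1, 3, ...> is read as <1, 2, 3, ...>, and <-1, -1, 0, 1, ...> as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. The sketch below illustrates only that idea on plain integers; it is not the LLVM isEXTMask routine, and the helper name is made up.

```cpp
#include <cstdio>
#include <vector>

// Derive the EXT start index implied by a shuffle mask that may begin with
// undef (-1) lanes: take the first defined element, subtract its position,
// and wrap modulo 2*NumElts (the two concatenated input vectors).
static int extStartIndex(const std::vector<int> &Mask, int NumElts) {
  for (int I = 0, E = (int)Mask.size(); I != E; ++I) {
    if (Mask[I] < 0)
      continue;                                      // undef lane, keep scanning
    return (Mask[I] - I + 2 * NumElts) % (2 * NumElts);
  }
  return 0;                                          // all-undef mask
}

int main() {
  std::printf("%d\n", extStartIndex({-1, -1, 3, 4}, 4)); // 1, as in the comment
  std::printf("%d\n", extStartIndex({-1, -1, 0, 1}, 4)); // 6 == 2*NumElts-2
}
```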
@@ -9491,6 +9491,103 @@ static SDValue performBRCONDCombine(SDNode *N, return SDValue(); } +// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test +// as well as whether the test should be inverted. This code is required to +// catch these cases (as opposed to standard dag combines) because +// AArch64ISD::TBZ is matched during legalization. +static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, + SelectionDAG &DAG) { + + if (!Op->hasOneUse()) + return Op; + + // We don't handle undef/constant-fold cases below, as they should have + // already been taken care of (e.g. and of 0, test of undefined shifted bits, + // etc.) + + // (tbz (trunc x), b) -> (tbz x, b) + // This case is just here to enable more of the below cases to be caught. + if (Op->getOpcode() == ISD::TRUNCATE && + Bit < Op->getValueType(0).getSizeInBits()) { + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + } + + if (Op->getNumOperands() != 2) + return Op; + + auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1)); + if (!C) + return Op; + + switch (Op->getOpcode()) { + default: + return Op; + + // (tbz (and x, m), b) -> (tbz x, b) + case ISD::AND: + if ((C->getZExtValue() >> Bit) & 1) + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + return Op; + + // (tbz (shl x, c), b) -> (tbz x, b-c) + case ISD::SHL: + if (C->getZExtValue() <= Bit && + (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { + Bit = Bit - C->getZExtValue(); + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + } + return Op; + + // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x + case ISD::SRA: + Bit = Bit + C->getZExtValue(); + if (Bit >= Op->getValueType(0).getSizeInBits()) + Bit = Op->getValueType(0).getSizeInBits() - 1; + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + + // (tbz (srl x, c), b) -> (tbz x, b+c) + case ISD::SRL: + if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { + Bit = Bit + C->getZExtValue(); + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + } + return Op; + + // (tbz (xor x, -1), b) -> (tbnz x, b) + case ISD::XOR: + if ((C->getZExtValue() >> Bit) & 1) + Invert = !Invert; + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + } +} + +// Optimize test single bit zero/non-zero and branch. 
+static SDValue performTBZCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); + bool Invert = false; + SDValue TestSrc = N->getOperand(1); + SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG); + + if (TestSrc == NewTestSrc) + return SDValue(); + + unsigned NewOpc = N->getOpcode(); + if (Invert) { + if (NewOpc == AArch64ISD::TBZ) + NewOpc = AArch64ISD::TBNZ; + else { + assert(NewOpc == AArch64ISD::TBNZ); + NewOpc = AArch64ISD::TBZ; + } + } + + SDLoc DL(N); + return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc, + DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3)); +} + // vselect (v1i1 setcc) -> // vselect (v1iXX setcc) (XX is the size of the compared operand type) // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as @@ -9642,6 +9739,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performSTORECombine(N, DCI, DAG, Subtarget); case AArch64ISD::BRCOND: return performBRCONDCombine(N, DCI, DAG); + case AArch64ISD::TBNZ: + case AArch64ISD::TBZ: + return performTBZCombine(N, DCI, DAG); case AArch64ISD::CSEL: return performCONDCombine(N, DCI, DAG, 2, 3); case AArch64ISD::DUP: diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 566aa2c9a9ba..43664df3b861 100644 --- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -613,21 +613,6 @@ static bool isLdOffsetInRangeOfSt(MachineInstr *LoadInst, (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize)); } -// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI. -static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0, - MachineInstr *Op1) { - assert(MI->memoperands_empty() && "expected a new machineinstr"); - size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) + - (Op1->memoperands_end() - Op1->memoperands_begin()); - - MachineFunction *MF = MI->getParent()->getParent(); - MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs); - MachineSDNode::mmo_iterator MemEnd = - std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin); - MemEnd = std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd); - MI->setMemRefs(MemBegin, MemEnd); -} - MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, @@ -692,10 +677,8 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, TII->get(NewOpc)) .addOperand(getLdStRegOp(RtNewDest)) .addOperand(BaseRegOp) - .addImm(OffsetImm); - - // Copy MachineMemOperands from the original loads. - concatenateMemOperands(NewMemMI, I, Paired); + .addImm(OffsetImm) + .setMemRefs(I->mergeMemRefsWith(*Paired)); DEBUG( dbgs() @@ -786,9 +769,8 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, TII->get(NewOpc)) .addOperand(getLdStRegOp(I)) .addOperand(BaseRegOp) - .addImm(OffsetImm); - // Copy MachineMemOperands from the original stores. 
- concatenateMemOperands(MIB, I, Paired); + .addImm(OffsetImm) + .setMemRefs(I->mergeMemRefsWith(*Paired)); } else { // Handle Unscaled if (IsUnscaled) diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 1b8b9b27719c..151133b2f32c 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -33,7 +33,14 @@ class Triple; class AArch64Subtarget : public AArch64GenSubtargetInfo { protected: - enum ARMProcFamilyEnum {Others, CortexA35, CortexA53, CortexA57, Cyclone}; + enum ARMProcFamilyEnum { + Others, + CortexA35, + CortexA53, + CortexA57, + Cyclone, + ExynosM1 + }; /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others. ARMProcFamilyEnum ARMProcFamily; @@ -143,6 +150,7 @@ public: bool isCyclone() const { return CPUString == "cyclone"; } bool isCortexA57() const { return CPUString == "cortex-a57"; } bool isCortexA53() const { return CPUString == "cortex-a53"; } + bool isExynosM1() const { return CPUString == "exynos-m1"; } bool useAA() const override { return isCortexA53(); } diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index 78f5289ec26d..cde1c6df2608 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -834,7 +834,7 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegMappings }; uint32_t -AArch64SysReg::SysRegMapper::fromString(StringRef Name, +AArch64SysReg::SysRegMapper::fromString(StringRef Name, const FeatureBitset& FeatureBits, bool &Valid) const { std::string NameLower = Name.lower(); @@ -878,7 +878,7 @@ AArch64SysReg::SysRegMapper::fromString(StringRef Name, } std::string -AArch64SysReg::SysRegMapper::toString(uint32_t Bits, +AArch64SysReg::SysRegMapper::toString(uint32_t Bits, const FeatureBitset& FeatureBits) const { // First search the registers shared by all for (unsigned i = 0; i < array_lengthof(SysRegMappings); ++i) { diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h index f649cb9b8a8d..e63627eae123 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -285,17 +285,17 @@ struct AArch64NamedImmMapper { // Zero value of FeatureBitSet means the mapping is always available FeatureBitset FeatureBitSet; - bool isNameEqual(std::string Other, + bool isNameEqual(std::string Other, const FeatureBitset& FeatureBits) const { - if (FeatureBitSet.any() && + if (FeatureBitSet.any() && (FeatureBitSet & FeatureBits).none()) return false; return Name == Other; } - bool isValueEqual(uint32_t Other, + bool isValueEqual(uint32_t Other, const FeatureBitset& FeatureBits) const { - if (FeatureBitSet.any() && + if (FeatureBitSet.any() && (FeatureBitSet & FeatureBits).none()) return false; return Value == Other; @@ -310,7 +310,7 @@ struct AArch64NamedImmMapper { StringRef toString(uint32_t Value, const FeatureBitset& FeatureBits, bool &Valid) const; // Maps string to value, depending on availability for FeatureBits given - uint32_t fromString(StringRef Name, const FeatureBitset& FeatureBits, + uint32_t fromString(StringRef Name, const FeatureBitset& FeatureBits, bool &Valid) const; /// Many of the instructions allow an alternative assembly form consisting of @@ -1322,7 +1322,7 @@ namespace AArch64TLBI { return true; } } -} +} namespace AArch64II { /// Target Operand Flag enum. 
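That closes out the AArch64 files; the most substantial change among them is the new TBZ/TBNZ combine, where getTestBitOperand walks through TRUNCATE, AND, SHL, SRA, SRL, and XOR-with-all-ones nodes, adjusting the tested bit (and flipping between TBZ and TBNZ for the XOR case) so the test applies directly to the original operand. The bit identities it relies on can be checked in isolation; the snippet below is only a sanity-check sketch of those identities, not code from the patch.

```cpp
#include <cassert>
#include <cstdint>

// testBit(x, b) stands for the condition a TBZ/TBNZ evaluates on bit b of x.
static bool testBit(uint64_t X, unsigned B) { return (X >> B) & 1; }

int main() {
  uint64_t X = 0xDEADBEEFCAFEBABEull;

  // (tbz (and x, m), b) -> (tbz x, b), when bit b of m is set.
  uint64_t M = 1ull << 7;
  assert(testBit(X & M, 7) == testBit(X, 7));

  // (tbz (shl x, c), b) -> (tbz x, b - c), when c <= b.
  assert(testBit(X << 5, 12) == testBit(X, 12 - 5));

  // (tbz (srl x, c), b) -> (tbz x, b + c), when b + c stays in range.
  assert(testBit(X >> 5, 12) == testBit(X, 12 + 5));

  // (tbz (xor x, -1), b) -> (tbnz x, b): the sense of the test flips.
  assert(testBit(X ^ ~0ull, 3) == !testBit(X, 3));

  return 0;
}
```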
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index d4af8d2e48d1..db869cf7dd8b 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -118,6 +118,11 @@ def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", "true", "Support flat address space">; +def FeatureXNACK : SubtargetFeature<"xnack", + "EnableXNACK", + "true", + "Enable XNACK support">; + def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling", "EnableVGPRSpilling", "true", diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index ba71dc05a8fc..9c3790264377 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -417,13 +417,13 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, } } - if (VCCUsed || FlatUsed) + if (VCCUsed || FlatUsed || STM.isXNACKEnabled()) { MaxSGPR += 2; - if (FlatUsed) { - MaxSGPR += 2; - // 2 additional for VI+. - if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (FlatUsed) + MaxSGPR += 2; + + if (STM.isXNACKEnabled()) MaxSGPR += 2; } @@ -620,6 +620,9 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, if (MFI->hasDispatchPtr()) header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + if (STM.isXNACKEnabled()) + header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; + header.kernarg_segment_byte_size = MFI->ABIArgOffset; header.wavefront_sgpr_count = KernelInfo.NumSGPR; header.workitem_vgpr_count = KernelInfo.NumVGPR; diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index 11f6139deddd..2a7ce6a47176 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -204,14 +204,6 @@ def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ return isGlobalLoad(dyn_cast<LoadSDNode>(N)); }]>; -def az_extloadi8_flat : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ - return isFlatLoad(dyn_cast<LoadSDNode>(N)); -}]>; - -def sextloadi8_flat : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ - return isFlatLoad(dyn_cast<LoadSDNode>(N)); -}]>; - def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); }]>; @@ -243,14 +235,6 @@ def sextloadi16_global : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ return isGlobalLoad(dyn_cast<LoadSDNode>(N)); }]>; -def az_extloadi16_flat : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ - return isFlatLoad(dyn_cast<LoadSDNode>(N)); -}]>; - -def sextloadi16_flat : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ - return isFlatLoad(dyn_cast<LoadSDNode>(N)); -}]>; - def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); }]>; @@ -299,16 +283,6 @@ def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr), return isGlobalStore(dyn_cast<StoreSDNode>(N)); }]>; -def truncstorei8_flat : PatFrag<(ops node:$val, node:$ptr), - (truncstorei8 node:$val, node:$ptr), [{ - return isFlatStore(dyn_cast<StoreSDNode>(N)); -}]>; - -def truncstorei16_flat : PatFrag<(ops node:$val, node:$ptr), - (truncstorei16 node:$val, node:$ptr), [{ - return isFlatStore(dyn_cast<StoreSDNode>(N)); -}]>; - def local_store : PatFrag<(ops node:$val, node:$ptr), (store node:$val, node:$ptr), [{ return isLocalStore(dyn_cast<StoreSDNode>(N)); @@ -385,15 +359,6 @@ multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> { defm 
atomic_cmp_swap : AtomicCmpSwapLocal <atomic_cmp_swap>; -def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isFlatLoad(dyn_cast<LoadSDNode>(N)); -}]>; - -def flat_store : PatFrag<(ops node:$val, node:$ptr), - (store node:$val, node:$ptr), [{ - return isFlatStore(dyn_cast<StoreSDNode>(N)); -}]>; - def mskor_flat : PatFrag<(ops node:$val, node:$ptr), (AMDGPUstore_mskor node:$val, node:$ptr), [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 44e0c47877a9..c6af5b93d257 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -73,6 +73,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, CaymanISA(false), FlatAddressSpace(false), FlatForGlobal(false), EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false), + EnableXNACK(false), WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false), GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0), diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 9c7bb88f8f4a..d3712276d5e7 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -76,6 +76,7 @@ private: bool EnableIfCvt; bool EnableLoadStoreOpt; bool EnableUnsafeDSOffsetFolding; + bool EnableXNACK; unsigned WavefrontSize; bool CFALUBug; int LocalMemorySize; @@ -290,6 +291,10 @@ public: } bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const; + bool isXNACKEnabled() const { + return EnableXNACK; + } + unsigned getMaxWavesPerCU() const { if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) return 10; diff --git a/lib/Target/AMDGPU/CIInstructions.td b/lib/Target/AMDGPU/CIInstructions.td index 88a090d3df35..c543814cae0d 100644 --- a/lib/Target/AMDGPU/CIInstructions.td +++ b/lib/Target/AMDGPU/CIInstructions.td @@ -264,42 +264,6 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC < } // End let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst -//===----------------------------------------------------------------------===// -// Flat Patterns -//===----------------------------------------------------------------------===// - -let Predicates = [HasFlatAddressSpace] in { - -class FLATLoad_Pattern <FLAT Instr_ADDR64, ValueType vt, - PatFrag flat_ld> : - Pat <(vt (flat_ld i64:$ptr)), - (Instr_ADDR64 $ptr, 0, 0, 0) ->; - -def : FLATLoad_Pattern <FLAT_LOAD_SBYTE, i32, sextloadi8_flat>; -def : FLATLoad_Pattern <FLAT_LOAD_UBYTE, i32, az_extloadi8_flat>; -def : FLATLoad_Pattern <FLAT_LOAD_SSHORT, i32, sextloadi16_flat>; -def : FLATLoad_Pattern <FLAT_LOAD_USHORT, i32, az_extloadi16_flat>; -def : FLATLoad_Pattern <FLAT_LOAD_DWORD, i32, flat_load>; -def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, flat_load>; -def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, az_extloadi32_flat>; -def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, v2i32, flat_load>; -def : FLATLoad_Pattern <FLAT_LOAD_DWORDX4, v4i32, flat_load>; - -class FLATStore_Pattern <FLAT Instr, ValueType vt, PatFrag st> : - Pat <(st vt:$value, i64:$ptr), - (Instr $value, $ptr, 0, 0, 0) - >; - -def : FLATStore_Pattern <FLAT_STORE_BYTE, i32, truncstorei8_flat>; -def : FLATStore_Pattern <FLAT_STORE_SHORT, i32, truncstorei16_flat>; -def : FLATStore_Pattern <FLAT_STORE_DWORD, i32, flat_store>; -def : 
FLATStore_Pattern <FLAT_STORE_DWORDX2, i64, flat_store>; -def : FLATStore_Pattern <FLAT_STORE_DWORDX2, v2i32, flat_store>; -def : FLATStore_Pattern <FLAT_STORE_DWORDX4, v4i32, flat_store>; - -} // End HasFlatAddressSpace predicate - let Predicates = [isCI] in { // Convert (x - floor(x)) to fract(x) @@ -320,20 +284,10 @@ def : Pat < //===----------------------------------------------------------------------===// -// Patterns to generate flat for global +// Flat Patterns //===----------------------------------------------------------------------===// -def useFlatForGlobal : Predicate < - "Subtarget->useFlatForGlobal() || " - "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">; - -let Predicates = [useFlatForGlobal] in { - -// 1. Offset as 20bit DWORD immediate -def : Pat < - (SIload_constant v4i32:$sbase, IMM20bit:$offset), - (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset)) ->; +let Predicates = [isCIVI] in { // Patterns for global loads with no offset class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < @@ -341,24 +295,24 @@ class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < (inst $addr, 0, 0, 0) >; -def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>; -def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>; -def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>; -def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>; -def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>; +def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i32>; +def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i32>; +def : FlatLoadPat <FLAT_LOAD_USHORT, flat_az_extloadi16, i32>; +def : FlatLoadPat <FLAT_LOAD_SSHORT, flat_sextloadi16, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORD, flat_load, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX2, flat_load, v2i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX4, flat_load, v4i32>; class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < (node vt:$data, i64:$addr), (inst $data, $addr, 0, 0, 0) >; -def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>; -def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>; -def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>; -def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>; -def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>; +def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i32>; +def : FlatStorePat <FLAT_STORE_SHORT, flat_truncstorei16, i32>; +def : FlatStorePat <FLAT_STORE_DWORD, flat_store, i32>; +def : FlatStorePat <FLAT_STORE_DWORDX2, flat_store, v2i32>; +def : FlatStorePat <FLAT_STORE_DWORDX4, flat_store, v4i32>; class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < (vt (node i64:$addr, vt:$data)), @@ -376,4 +330,4 @@ def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>; def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>; def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>; -} // End Predicates = [useFlatForGlobal] +} // End Predicates = [isCIVI] diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index 6b3c81c3af74..7d20509c464d 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -105,51 +105,53 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, MBB.addLiveIn(PreloadedPrivateBufferReg); } - // We 
reserved the last registers for this. Shift it down to the end of those - // which were actually used. - // - // FIXME: It might be safer to use a pseudoregister before replacement. - - // FIXME: We should be able to eliminate unused input registers. We only - // cannot do this for the resources required for scratch access. For now we - // skip over user SGPRs and may leave unused holes. - - // We find the resource first because it has an alignment requirement. - if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) { - MachineRegisterInfo &MRI = MF.getRegInfo(); - - unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4; - // Skip the last 2 elements because the last one is reserved for VCC, and - // this is the 2nd to last element already. - for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) { - // Pick the first unallocated one. Make sure we don't clobber the other - // reserved input we needed. - if (!MRI.isPhysRegUsed(Reg)) { - assert(MRI.isAllocatable(Reg)); - MRI.replaceRegWith(ScratchRsrcReg, Reg); - ScratchRsrcReg = Reg; - MFI->setScratchRSrcReg(ScratchRsrcReg); - break; + if (!ST.hasSGPRInitBug()) { + // We reserved the last registers for this. Shift it down to the end of those + // which were actually used. + // + // FIXME: It might be safer to use a pseudoregister before replacement. + + // FIXME: We should be able to eliminate unused input registers. We only + // cannot do this for the resources required for scratch access. For now we + // skip over user SGPRs and may leave unused holes. + + // We find the resource first because it has an alignment requirement. + if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4; + // Skip the last 2 elements because the last one is reserved for VCC, and + // this is the 2nd to last element already. + for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) { + // Pick the first unallocated one. Make sure we don't clobber the other + // reserved input we needed. + if (!MRI.isPhysRegUsed(Reg)) { + assert(MRI.isAllocatable(Reg)); + MRI.replaceRegWith(ScratchRsrcReg, Reg); + ScratchRsrcReg = Reg; + MFI->setScratchRSrcReg(ScratchRsrcReg); + break; + } } } - } - if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) { - MachineRegisterInfo &MRI = MF.getRegInfo(); - // Skip the last 2 elements because the last one is reserved for VCC, and - // this is the 2nd to last element already. - unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); - for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) { - // Pick the first unallocated SGPR. Be careful not to pick an alias of the - // scratch descriptor, since we haven’t added its uses yet. - if (!MRI.isPhysRegUsed(Reg)) { - assert(MRI.isAllocatable(Reg) && - !TRI->isSubRegisterEq(ScratchRsrcReg, Reg)); - - MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); - ScratchWaveOffsetReg = Reg; - MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg); - break; + if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + // Skip the last 2 elements because the last one is reserved for VCC, and + // this is the 2nd to last element already. + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); + for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) { + // Pick the first unallocated SGPR. 
Be careful not to pick an alias of the + // scratch descriptor, since we haven’t added its uses yet. + if (!MRI.isPhysRegUsed(Reg)) { + assert(MRI.isAllocatable(Reg) && + !TRI->isSubRegisterEq(ScratchRsrcReg, Reg)); + + MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); + ScratchWaveOffsetReg = Reg; + MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + break; + } } } } diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 10f2adde4867..8735277149a6 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -134,6 +134,34 @@ def SIconstdata_ptr : SDNode< SDTCisVT<0, i64>]> >; +//===----------------------------------------------------------------------===// +// PatFrags for FLAT instructions +//===----------------------------------------------------------------------===// + +class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr), + (ld node:$ptr), [{ + return isFlatLoad(dyn_cast<LoadSDNode>(N)) || + isGlobalLoad(dyn_cast<LoadSDNode>(N)) || + isConstantLoad(cast<LoadSDNode>(N), -1); +}]>; + +def flat_load : flat_ld <load>; +def flat_az_extloadi8 : flat_ld <az_extloadi8>; +def flat_sextloadi8 : flat_ld <sextloadi8>; +def flat_az_extloadi16 : flat_ld <az_extloadi16>; +def flat_sextloadi16 : flat_ld <sextloadi16>; + +class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast<StoreSDNode>(N)) || + isGlobalStore(dyn_cast<StoreSDNode>(N)); +}]>; + +def flat_store: flat_st <store>; +def flat_truncstorei8 : flat_st <truncstorei8>; +def flat_truncstorei16 : flat_st <truncstorei16>; + + def mubuf_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isGlobalLoad(cast<LoadSDNode>(N)) || isConstantLoad(cast<LoadSDNode>(N), -1); diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 6f653c70aca0..b7df058b7c0c 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -59,8 +59,6 @@ defm EXP : EXP_m; // SMRD Instructions //===----------------------------------------------------------------------===// -let mayLoad = 1 in { - // We are using the SGPR_32 and not the SReg_32 register class for 32-bit // SMRD instructions, because the SGPR_32 register class does not include M0 // and writing to M0 from an SMRD instruction will hang the GPU. @@ -90,8 +88,6 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < smrd<0x0c>, "s_buffer_load_dwordx16", SReg_128, SReg_512 >; -} // mayLoad = 1 - //def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>; defm S_DCACHE_INV : SMRD_Inval <smrd<0x1f, 0x20>, "s_dcache_inv", diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 935aad427198..bf15516bea7b 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -156,6 +156,17 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( if (!LaneVGPRs.count(LaneVGPRIdx)) { unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass); + + if (LaneVGPR == AMDGPU::NoRegister) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("Ran out of VGPRs for spilling SGPR"); + + // When compiling from inside Mesa, the compilation continues. + // Select an arbitrary register to avoid triggering assertions + // during subsequent passes. 
+ LaneVGPR = AMDGPU::VGPR0; + } + LaneVGPRs[LaneVGPRIdx] = LaneVGPR; // Add this register as live-in to all blocks to avoid machine verifer diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 3cdffef05583..2afa00996609 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -37,13 +37,17 @@ unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); if (ST.hasSGPRInitBug()) { unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 4; + if (ST.isXNACKEnabled()) + BaseIdx -= 4; + unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); } if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - // 98/99 need to be reserved for flat_scr, and 100/101 for vcc. This is the - // next sgpr128 down. + // 98/99 need to be reserved for flat_scr or 96/97 for flat_scr and + // 98/99 for xnack_mask, and 100/101 for vcc. This is the next sgpr128 down + // either way. return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95; } @@ -54,13 +58,25 @@ unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( const MachineFunction &MF) const { const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); if (ST.hasSGPRInitBug()) { - unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 5; + unsigned Idx; + + if (!ST.isXNACKEnabled()) + Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 5; + else + Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1; + return AMDGPU::SGPR_32RegClass.getRegister(Idx); } if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - // Next register before reservations for flat_scr and vcc. - return AMDGPU::SGPR97; + if (!ST.isXNACKEnabled()) { + // Next register before reservations for flat_scr and vcc. + return AMDGPU::SGPR97; + } else { + // Next register before reservations for flat_scr, xnack_mask, vcc, + // and scratch resource. + return AMDGPU::SGPR91; + } } return AMDGPU::SGPR95; @@ -86,6 +102,9 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // for VCC/FLAT_SCR. reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99); reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101); + + if (ST.isXNACKEnabled()) + reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97); } // Tonga and Iceland can only allocate a fixed number of SGPRs due @@ -93,9 +112,11 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { if (ST.hasSGPRInitBug()) { unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); // Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs). - // Assume XNACK_MASK is unused. 
unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4; + if (ST.isXNACKEnabled()) + Limit -= 2; + for (unsigned i = Limit; i < NumSGPRs; ++i) { unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); reserveRegisterTuples(Reserved, Reg); @@ -282,11 +303,6 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, struct SIMachineFunctionInfo::SpilledReg Spill = MFI->getSpilledReg(MF, Index, i); - if (Spill.VGPR == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("Ran out of VGPRs for spilling SGPR"); - } - BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), Spill.VGPR) @@ -315,11 +331,6 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, struct SIMachineFunctionInfo::SpilledReg Spill = MFI->getSpilledReg(MF, Index, i); - if (Spill.VGPR == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("Ran out of VGPRs for spilling SGPR"); - } - BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), SubReg) diff --git a/lib/Target/AMDGPU/VIInstructions.td b/lib/Target/AMDGPU/VIInstructions.td index 20a026a822e2..1a7801c92bd7 100644 --- a/lib/Target/AMDGPU/VIInstructions.td +++ b/lib/Target/AMDGPU/VIInstructions.td @@ -101,3 +101,12 @@ def S_DCACHE_WB_VOL : SMEM_Inval <0x23, } // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI +let Predicates = [isVI] in { + +// 1. Offset as 20bit DWORD immediate +def : Pat < + (SIload_constant v4i32:$sbase, IMM20bit:$offset), + (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset)) +>; + +} // End Predicates = [isVI] diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index a44dc830a673..c171656b48ab 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -252,6 +252,8 @@ def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait", def ProcSwift : SubtargetFeature<"swift", "ARMProcFamily", "Swift", "Swift ARM processors", []>; +def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1", + "Samsung Exynos-M1 processors", []>; def ProcR4 : SubtargetFeature<"r4", "ARMProcFamily", "CortexR4", "Cortex-R4 ARM processors", []>; @@ -649,6 +651,12 @@ def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift, FeatureCrypto, FeatureZCZeroing]>; +def : ProcNoItin<"exynos-m1", [ARMv8a, ProcExynosM1, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; //===----------------------------------------------------------------------===// // Register File Description diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index e89757c19ecc..55c1684028c2 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -340,12 +340,12 @@ namespace { /// verify - check BBOffsets, BBSizes, alignment of islands void ARMConstantIslands::verify() { #ifndef NDEBUG - for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); - MBBI != E; ++MBBI) { - MachineBasicBlock *MBB = &*MBBI; - unsigned MBBId = MBB->getNumber(); - assert(!MBBId || BBInfo[MBBId - 1].postOffset() <= BBInfo[MBBId].Offset); - } + assert(std::is_sorted(MF->begin(), MF->end(), + [this](const MachineBasicBlock &LHS, + const MachineBasicBlock &RHS) { + return BBInfo[LHS.getNumber()].postOffset() < + BBInfo[RHS.getNumber()].postOffset(); + })); DEBUG(dbgs() << "Verifying " << CPUsers.size() << " CP users.\n"); for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) { CPUser &U = 
CPUsers[i]; diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 725b8383c961..6e7e47b8706a 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -1986,23 +1986,6 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base, return AddedRegPressure.size() <= MemRegs.size() * 2; } - -/// Copy \p Op0 and \p Op1 operands into a new array assigned to MI. -static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0, - MachineInstr *Op1) { - assert(MI->memoperands_empty() && "expected a new machineinstr"); - size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) - + (Op1->memoperands_end() - Op1->memoperands_begin()); - - MachineFunction *MF = MI->getParent()->getParent(); - MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs); - MachineSDNode::mmo_iterator MemEnd = - std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin); - MemEnd = - std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd); - MI->setMemRefs(MemBegin, MemEnd); -} - bool ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl, unsigned &NewOpc, @@ -2196,7 +2179,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, if (!isT2) MIB.addReg(0); MIB.addImm(Offset).addImm(Pred).addReg(PredReg); - concatenateMemOperands(MIB, Op0, Op1); + MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1)); DEBUG(dbgs() << "Formed " << *MIB << "\n"); ++NumLDRDFormed; } else { @@ -2210,7 +2193,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, if (!isT2) MIB.addReg(0); MIB.addImm(Offset).addImm(Pred).addReg(PredReg); - concatenateMemOperands(MIB, Op0, Op1); + MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1)); DEBUG(dbgs() << "Formed " << *MIB << "\n"); ++NumSTRDFormed; } diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index a8b28018f1b2..4d54e5751473 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -44,7 +44,7 @@ protected: enum ARMProcFamilyEnum { Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, CortexA15, CortexA17, CortexR4, CortexR4F, CortexR5, CortexR7, CortexA35, CortexA53, - CortexA57, CortexA72, Krait, Swift + CortexA57, CortexA72, Krait, Swift, ExynosM1 }; enum ARMProcClassEnum { None, AClass, RClass, MClass diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td index 1189cfd488ee..5a7eb215de42 100644 --- a/lib/Target/Hexagon/Hexagon.td +++ b/lib/Target/Hexagon/Hexagon.td @@ -251,6 +251,10 @@ def : Proc<"hexagonv60", HexagonModelV60, // Declare the target which we are implementing //===----------------------------------------------------------------------===// +def HexagonAsmParser : AsmParser { + bit HasMnemonicFirst = 0; +} + def HexagonAsmParserVariant : AsmParserVariant { int Variant = 0; string TokenizingCharacters = "#()=:.<>!+*"; @@ -259,5 +263,6 @@ def HexagonAsmParserVariant : AsmParserVariant { def Hexagon : Target { // Pull in Instruction Info: let InstructionSet = HexagonInstrInfo; + let AssemblyParsers = [HexagonAsmParser]; let AssemblyParserVariants = [HexagonAsmParserVariant]; } diff --git a/lib/Target/Hexagon/HexagonInstrInfo.td b/lib/Target/Hexagon/HexagonInstrInfo.td index 5cfeba720d90..421403f49724 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.td +++ b/lib/Target/Hexagon/HexagonInstrInfo.td @@ -5807,3 +5807,5 @@ include "HexagonInstrInfoV60.td" include "HexagonInstrInfoVector.td" include 
"HexagonInstrAlias.td" +include "HexagonSystemInst.td" + diff --git a/lib/Target/Hexagon/HexagonSystemInst.td b/lib/Target/Hexagon/HexagonSystemInst.td new file mode 100644 index 000000000000..784686a437ad --- /dev/null +++ b/lib/Target/Hexagon/HexagonSystemInst.td @@ -0,0 +1,113 @@ +//==- HexagonSystemInst.td - System Instructions for Hexagon -*- tablegen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Hexagon instructions in TableGen format. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Cache manipulation instructions. +//===----------------------------------------------------------------------===// +let mayStore = 1 in +class ST_MISC_CACHEOP<dag outs, dag ins, + string asmstr, list<dag> pattern = [], + bits<3> amode, bits<3> type, bits<1> un> + : ST0Inst<outs, ins, asmstr, pattern, "", ST_tc_ld_SLOT0> { + + bits<5> Rs; + bits<5> Rt; + bits<5> Rd; + let Inst{31-28} = 0b1010; + let Inst{27-25} = amode; + let Inst{24-22} = type; + let Inst{21} = un; + let Inst{20-16} = Rs; + let Inst{12-8} = Rt; + let Inst{4-0} = Rd; +} + +let mayStore = 1 in +class ST_MISC_CACHEOP_SYS<dag outs, dag ins, + string asmstr, list<dag> pattern = [], + bits<3> amode, bits<3> type, bits<1> un> + : SYSInst<outs, ins, asmstr, pattern, ""> { + + bits<5> Rs; + bits<5> Rt; + bits<5> Rd; + let Inst{31-28} = 0b1010; + let Inst{27-25} = amode; + let Inst{24-22} = type; + let Inst{21} = un; + let Inst{20-16} = Rs; + let Inst{12-8} = Rt; + let Inst{4-0} = Rd; +} + + +let isSolo = 1, Rs = 0, Rt = 0, Rd = 0 in { +def Y2_syncht: ST_MISC_CACHEOP <(outs), (ins), + "syncht" , [], 0b100, 0b001, 0b0>; +} + +let Rt = 0, Rd = 0 in { +let isSoloAin1 = 1 in { + def Y2_dccleana: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs), + "dccleana($Rs)", [], 0b000, 0b000, 0b0>; + def Y2_dcinva: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs), + "dcinva($Rs)", [], 0b000, 0b000, 0b1>; + def Y2_dccleaninva: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs), + "dccleaninva($Rs)", [], 0b000, 0b001, 0b0>; + } +} + +let isSoloAX = 1, hasSideEffects = 1, Rd = 0 in { + def Y4_l2fetch: ST_MISC_CACHEOP_SYS<(outs), (ins IntRegs:$Rs, IntRegs:$Rt), + "l2fetch($Rs, $Rt)", [], 0b011, 0b000, 0b0>; + def Y5_l2fetch: ST_MISC_CACHEOP_SYS<(outs), (ins IntRegs:$Rs, DoubleRegs:$Rt), + "l2fetch($Rs, $Rt)", [], 0b011, 0b010, 0b0>; +} + +let hasSideEffects = 0, isSolo = 1 in +class Y2_INVALIDATE_CACHE<string mnemonic, bit MajOp> + : JRInst < + (outs), (ins IntRegs:$Rs), + #mnemonic#"($Rs)" > { + bits<5> Rs; + + let IClass = 0b0101; + let Inst{27-21} = 0b0110110; + let Inst{20-16} = Rs; + let Inst{13-12} = 0b00; + let Inst{11} = MajOp; + } +// Instruction cache invalidate +def Y2_icinva : Y2_INVALIDATE_CACHE<"icinva", 0b0>; + +// Zero an aligned 32-byte cacheline. +let isSoloAin1 = 1 in +def Y2_dczeroa: ST0Inst <(outs), (ins IntRegs:$Rs), + "dczeroa($Rs)"> { + bits<5> Rs; + let IClass = 0b1010; + let Inst{27-21} = 0b0000110; + let Inst{13} = 0b0; + let Inst{20-16} = Rs; + } + +// Memory synchronization. 
+let hasSideEffects = 0, isSolo = 1 in +def Y2_isync: JRInst <(outs), (ins), + "isync"> { + let IClass = 0b0101; + let Inst{27-16} = 0b011111000000; + let Inst{13} = 0b0; + let Inst{9-0} = 0b0000000010; + } + diff --git a/lib/Target/WebAssembly/known_gcc_test_failures.txt b/lib/Target/WebAssembly/known_gcc_test_failures.txt index ee9d060f339e..92ecde3f90d6 100644 --- a/lib/Target/WebAssembly/known_gcc_test_failures.txt +++ b/lib/Target/WebAssembly/known_gcc_test_failures.txt @@ -5,6 +5,23 @@ pr38151.c va-arg-22.c +# WebAssemblyRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator, int, unsigned int, llvm::RegScavenger *) const: Assertion `MI.getOperand(1).getImm() == 0 && "Can't eliminate FI yet if offset is already set"' +20030313-1.c +20030916-1.c +20031012-1.c +20041126-1.c +20060420-1.c +20071202-1.c +20120808-1.c +pr20527-1.c +pr27073.c +pr36339.c +pr37573.c +pr43236.c +pr43835.c +pr45070.c +pr51933.c + # TargetRegisterInfo.h:315: static unsigned int llvm::TargetRegisterInfo::virtReg2Index(unsigned int): Assertion `isVirtualRegister(Reg) && "Not a virtual register"' failed. struct-ret-1.c va-arg-11.c diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index b23f5c353013..55949155da9e 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -27,6 +27,7 @@ set(sources X86PadShortFunction.cpp X86RegisterInfo.cpp X86SelectionDAGInfo.cpp + X86ShuffleDecodeConstantPool.cpp X86Subtarget.cpp X86TargetMachine.cpp X86TargetObjectFile.cpp diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index 82f0ee5a5ebc..73f654cba38c 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -32,7 +32,6 @@ static unsigned getVectorRegSize(unsigned RegNo) { return 64; llvm_unreachable("Unknown vector reg!"); - return 0; } static MVT getRegOperandVectorVT(const MCInst *MI, const MVT &ScalarVT, diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp index 4fdd527d87c8..619f7c8d25df 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #include "X86ShuffleDecode.h" -#include "llvm/IR/Constants.h" #include "llvm/CodeGen/MachineValueType.h" //===----------------------------------------------------------------------===// @@ -296,54 +295,6 @@ void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, } } -void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { - Type *MaskTy = C->getType(); - // It is not an error for the PSHUFB mask to not be a vector of i8 because the - // constant pool uniques constants by their bit representation. - // e.g. the following take up the same space in the constant pool: - // i128 -170141183420855150465331762880109871104 - // - // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160> - // - // <4 x i32> <i32 -2147483648, i32 -2147483648, - // i32 -2147483648, i32 -2147483648> - -#ifndef NDEBUG - unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); - assert(MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512); -#endif - - // This is a straightforward byte vector. 
- if (MaskTy->isVectorTy() && MaskTy->getVectorElementType()->isIntegerTy(8)) { - int NumElements = MaskTy->getVectorNumElements(); - ShuffleMask.reserve(NumElements); - - for (int i = 0; i < NumElements; ++i) { - // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte - // lane of the vector we're inside. - int Base = i & ~0xf; - Constant *COp = C->getAggregateElement(i); - if (!COp) { - ShuffleMask.clear(); - return; - } else if (isa<UndefValue>(COp)) { - ShuffleMask.push_back(SM_SentinelUndef); - continue; - } - uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); - // If the high bit (7) of the byte is set, the element is zeroed. - if (Element & (1 << 7)) - ShuffleMask.push_back(SM_SentinelZero); - else { - // Only the least significant 4 bits of the byte are used. - int Index = Base + (Element & 0xf); - ShuffleMask.push_back(Index); - } - } - } - // TODO: Handle funny-looking vectors too. -} - void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, SmallVectorImpl<int> &ShuffleMask) { for (int i = 0, e = RawMask.size(); i < e; ++i) { @@ -388,68 +339,6 @@ void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { } } -void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, - SmallVectorImpl<int> &ShuffleMask) { - Type *MaskTy = C->getType(); - // It is not an error for the PSHUFB mask to not be a vector of i8 because the - // constant pool uniques constants by their bit representation. - // e.g. the following take up the same space in the constant pool: - // i128 -170141183420855150465331762880109871104 - // - // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160> - // - // <4 x i32> <i32 -2147483648, i32 -2147483648, - // i32 -2147483648, i32 -2147483648> - - unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); - - if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512. - return; - - // Only support vector types. - if (!MaskTy->isVectorTy()) - return; - - // Make sure its an integer type. - Type *VecEltTy = MaskTy->getVectorElementType(); - if (!VecEltTy->isIntegerTy()) - return; - - // Support any element type from byte up to element size. - // This is necesary primarily because 64-bit elements get split to 32-bit - // in the constant pool on 32-bit target. - unsigned EltTySize = VecEltTy->getIntegerBitWidth(); - if (EltTySize < 8 || EltTySize > ElSize) - return; - - unsigned NumElements = MaskTySize / ElSize; - assert((NumElements == 2 || NumElements == 4 || NumElements == 8) && - "Unexpected number of vector elements."); - ShuffleMask.reserve(NumElements); - unsigned NumElementsPerLane = 128 / ElSize; - unsigned Factor = ElSize / EltTySize; - - for (unsigned i = 0; i < NumElements; ++i) { - Constant *COp = C->getAggregateElement(i * Factor); - if (!COp) { - ShuffleMask.clear(); - return; - } else if (isa<UndefValue>(COp)) { - ShuffleMask.push_back(SM_SentinelUndef); - continue; - } - int Index = i & ~(NumElementsPerLane - 1); - uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); - if (ElSize == 64) - Index += (Element >> 1) & 0x1; - else - Index += Element & 0x3; - ShuffleMask.push_back(Index); - } - - // TODO: Handle funny-looking vectors too. 
-} - void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, SmallVectorImpl<int> &Mask) { unsigned NumDstElts = DstVT.getVectorNumElements(); unsigned SrcScalarBits = SrcVT.getScalarSizeInBits(); @@ -572,58 +461,4 @@ void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, } } -void DecodeVPERMVMask(const Constant *C, MVT VT, - SmallVectorImpl<int> &ShuffleMask) { - Type *MaskTy = C->getType(); - if (MaskTy->isVectorTy()) { - unsigned NumElements = MaskTy->getVectorNumElements(); - if (NumElements == VT.getVectorNumElements()) { - for (unsigned i = 0; i < NumElements; ++i) { - Constant *COp = C->getAggregateElement(i); - if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) { - ShuffleMask.clear(); - return; - } - if (isa<UndefValue>(COp)) - ShuffleMask.push_back(SM_SentinelUndef); - else { - uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); - Element &= (1 << NumElements) - 1; - ShuffleMask.push_back(Element); - } - } - } - return; - } - // Scalar value; just broadcast it - if (!isa<ConstantInt>(C)) - return; - uint64_t Element = cast<ConstantInt>(C)->getZExtValue(); - int NumElements = VT.getVectorNumElements(); - Element &= (1 << NumElements) - 1; - for (int i = 0; i < NumElements; ++i) - ShuffleMask.push_back(Element); -} - -void DecodeVPERMV3Mask(const Constant *C, MVT VT, - SmallVectorImpl<int> &ShuffleMask) { - Type *MaskTy = C->getType(); - unsigned NumElements = MaskTy->getVectorNumElements(); - if (NumElements == VT.getVectorNumElements()) { - for (unsigned i = 0; i < NumElements; ++i) { - Constant *COp = C->getAggregateElement(i); - if (!COp) { - ShuffleMask.clear(); - return; - } - if (isa<UndefValue>(COp)) - ShuffleMask.push_back(SM_SentinelUndef); - else { - uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); - Element &= (1 << NumElements*2) - 1; - ShuffleMask.push_back(Element); - } - } - } -} } // llvm namespace diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h index ab18e6438ec9..72db6a81912b 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -23,7 +23,6 @@ //===----------------------------------------------------------------------===// namespace llvm { -class Constant; class MVT; enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 }; @@ -72,9 +71,6 @@ void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); /// different datatypes and vector widths. void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); -/// \brief Decode a PSHUFB mask from an IR-level vector constant. -void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask); - /// \brief Decode a PSHUFB mask from a raw array of constants such as from /// BUILD_VECTOR. void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, @@ -95,10 +91,6 @@ void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm, /// No VT provided since it only works on 256-bit, 4 element vectors. void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); -/// \brief Decode a VPERMILP variable mask from an IR-level vector constant. -void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, - SmallVectorImpl<int> &ShuffleMask); - /// \brief Decode a zero extension instruction as a shuffle mask. void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, SmallVectorImpl<int> &ShuffleMask); @@ -118,18 +110,10 @@ void DecodeEXTRQIMask(int Len, int Idx, void DecodeINSERTQIMask(int Len, int Idx, SmallVectorImpl<int> &ShuffleMask); -/// \brief Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant. 
-void DecodeVPERMVMask(const Constant *C, MVT VT, - SmallVectorImpl<int> &ShuffleMask); - /// \brief Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants. void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask, SmallVectorImpl<int> &ShuffleMask); -/// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant. -void DecodeVPERMV3Mask(const Constant *C, MVT VT, - SmallVectorImpl<int> &ShuffleMask); - /// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants. void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, SmallVectorImpl<int> &ShuffleMask); diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index de94a138d865..629d4d3565f2 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -1098,9 +1098,9 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { RetRegs.push_back(VA.getLocReg()); } - // All x86 ABIs require that for returning structs by value we copy
- // the sret argument into %rax/%eax (depending on ABI) for the return.
- // We saved the argument into a virtual register in the entry block,
+ // All x86 ABIs require that for returning structs by value we copy + // the sret argument into %rax/%eax (depending on ABI) for the return. + // We saved the argument into a virtual register in the entry block, // so now we copy the value out and into %rax/%eax. if (F.hasStructRetAttr()) { unsigned Reg = X86MFInfo->getSRetReturnReg(); diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 242d0333ef9a..8b5fd27b4775 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -78,27 +78,6 @@ X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const { MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences(); } -/// usesTheStack - This function checks if any of the users of EFLAGS -/// copies the EFLAGS. We know that the code that lowers COPY of EFLAGS has -/// to use the stack, and if we don't adjust the stack we clobber the first -/// frame index. -/// See X86InstrInfo::copyPhysReg. -static bool usesTheStack(const MachineFunction &MF) { - const MachineRegisterInfo &MRI = MF.getRegInfo(); - - // Conservativley assume that inline assembly might use the stack. - if (MF.hasInlineAsm()) - return true; - - return any_of(MRI.reg_instructions(X86::EFLAGS), - [](const MachineInstr &RI) { return RI.isCopy(); }); -} - -static bool doesStackUseImplyFP(const MachineFunction &MF) { - bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); - return IsWin64Prologue && usesTheStack(MF); -} - /// hasFP - Return true if the specified function should have a dedicated frame /// pointer register. This is true if the function has variable sized allocas /// or if frame pointer elimination is disabled. @@ -112,8 +91,7 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const { MFI->isFrameAddressTaken() || MFI->hasOpaqueSPAdjustment() || MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || MMI.callsUnwindInit() || MMI.hasEHFunclets() || MMI.callsEHReturn() || - MFI->hasStackMap() || MFI->hasPatchPoint() || - doesStackUseImplyFP(MF)); + MFI->hasStackMap() || MFI->hasPatchPoint()); } static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) { @@ -965,11 +943,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // push and pop from the stack. if (Is64Bit && !Fn->hasFnAttribute(Attribute::NoRedZone) && !TRI->needsStackRealignment(MF) && - !MFI->hasVarSizedObjects() && // No dynamic alloca. - !MFI->adjustsStack() && // No calls. - !IsWin64CC && // Win64 has no Red Zone - !usesTheStack(MF) && // Don't push and pop. - !MF.shouldSplitStack()) { // Regular stack + !MFI->hasVarSizedObjects() && // No dynamic alloca. + !MFI->adjustsStack() && // No calls. + !IsWin64CC && // Win64 has no Red Zone + !MFI->hasOpaqueSPAdjustment() && // Don't push and pop. + !MF.shouldSplitStack()) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); if (HasFP) MinSize += SlotSize; StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 4414e478b99b..868ae4e19e55 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -157,13 +157,9 @@ namespace { /// performance. bool OptForSize; - /// If true, selector should try to optimize for minimum code size. 
- bool OptForMinSize; - public: explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), OptForSize(false), - OptForMinSize(false) {} + : SelectionDAGISel(tm, OptLevel), OptForSize(false) {} const char *getPassName() const override { return "X86 DAG->DAG Instruction Selection"; @@ -535,10 +531,8 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { } void X86DAGToDAGISel::PreprocessISelDAG() { - // OptFor[Min]Size are used in pattern predicates that isel is matching. + // OptForSize is used in pattern predicates that isel is matching. OptForSize = MF->getFunction()->optForSize(); - OptForMinSize = MF->getFunction()->optForMinSize(); - assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize"); for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E; ) { diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 0927c2f4fa50..d31aab0fa141 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -18,6 +18,7 @@ #include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" +#include "X86ShuffleDecodeConstantPool.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" #include "llvm/ADT/SmallBitVector.h" @@ -4556,6 +4557,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32; SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8); + Result = DAG.getBitcast(CastVT, Result); Vec256 = DAG.getBitcast(CastVT, Vec256); Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask); return DAG.getBitcast(ResultVT, Vec256); @@ -4851,8 +4853,6 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { DecodePSHUFBMask(C, Mask); - if (Mask.empty()) - return false; break; } @@ -4870,7 +4870,6 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, case X86ISD::VPERM2X128: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); - if (Mask.empty()) return false; // Mask only contains negative index if an element is zero. if (std::any_of(Mask.begin(), Mask.end(), [](int M){ return M == SM_SentinelZero; })) @@ -4948,8 +4947,6 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { DecodeVPERMVMask(C, VT, Mask); - if (Mask.empty()) - return false; break; } return false; @@ -5000,8 +4997,6 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { DecodeVPERMV3Mask(C, VT, Mask); - if (Mask.empty()) - return false; break; } return false; @@ -5009,6 +5004,10 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, default: llvm_unreachable("unknown target shuffle node"); } + // Empty mask indicates the decode failed. + if (Mask.empty()) + return false; + // If we have a fake unary shuffle, the shuffle mask is spread across two // inputs that are actually the same node. Re-map the mask to always point // into the first input. 
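For reference, the "fake unary" remapping described in the comment above folds indices that select from the second shuffle operand back into the first operand once both operands are known to be the same node. A minimal sketch, assuming a NumElts-wide mask where indices >= NumElts address the second input; the helper name is illustrative and not part of this patch:

#include "llvm/ADT/SmallVector.h"

// Illustrative only: when a two-operand shuffle really reads the same node
// twice ("fake unary"), remap second-operand indices so the whole mask points
// into the first operand. Negative sentinels (undef/zero) are left untouched.
static void remapFakeUnaryMask(unsigned NumElts,
                               llvm::SmallVectorImpl<int> &Mask) {
  for (int &M : Mask)
    if (M >= (int)NumElts) // element taken from the "second" input
      M -= NumElts;        // ...which is the same vector as the first
}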
@@ -17372,6 +17371,18 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, if (!IntrData) { if (IntNo == llvm::Intrinsic::x86_seh_ehregnode) return MarkEHRegistrationNode(Op, DAG); + if (IntNo == llvm::Intrinsic::x86_flags_read_u32 || + IntNo == llvm::Intrinsic::x86_flags_read_u64 || + IntNo == llvm::Intrinsic::x86_flags_write_u32 || + IntNo == llvm::Intrinsic::x86_flags_write_u64) { + // We need a frame pointer because this will get lowered to a PUSH/POP + // sequence. + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setHasOpaqueSPAdjustment(true); + // Don't do anything here, we will expand these intrinsics out later + // during ExpandISelPseudos in EmitInstrWithCustomInserter. + return SDValue(); + } return SDValue(); } @@ -21144,6 +21155,47 @@ static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB, return BB; } +static MachineBasicBlock *EmitWRPKRU(MachineInstr *MI, MachineBasicBlock *BB, + const X86Subtarget *Subtarget) { + DebugLoc dl = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + + // insert input VAL into EAX + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) + .addReg(MI->getOperand(0).getReg()); + // insert zero to ECX + BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX) + .addReg(X86::ECX) + .addReg(X86::ECX); + // insert zero to EDX + BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::EDX) + .addReg(X86::EDX) + .addReg(X86::EDX); + // insert WRPKRU instruction + BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr)); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; +} + +static MachineBasicBlock *EmitRDPKRU(MachineInstr *MI, MachineBasicBlock *BB, + const X86Subtarget *Subtarget) { + DebugLoc dl = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + + // insert zero to ECX + BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX) + .addReg(X86::ECX) + .addReg(X86::ECX); + // insert RDPKRU instruction + BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr)); + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) + .addReg(X86::EAX); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; +} + static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB, const X86Subtarget *Subtarget) { DebugLoc dl = MI->getDebugLoc(); @@ -22495,6 +22547,36 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::CMOV_V64I1: return EmitLoweredSelect(MI, BB); + case X86::RDFLAGS32: + case X86::RDFLAGS64: { + DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + unsigned PushF = + MI->getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64; + unsigned Pop = + MI->getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r; + BuildMI(*BB, MI, DL, TII->get(PushF)); + BuildMI(*BB, MI, DL, TII->get(Pop), MI->getOperand(0).getReg()); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; + } + + case X86::WRFLAGS32: + case X86::WRFLAGS64: { + DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + unsigned Push = + MI->getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r; + unsigned PopF = + MI->getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64; + BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI->getOperand(0).getReg()); + BuildMI(*BB, MI, DL, TII->get(PopF)); + + MI->eraseFromParent(); // The pseudo is gone now. 
+ return BB; + } + case X86::RELEASE_FADD32mr: case X86::RELEASE_FADD64mr: return EmitLoweredAtomicFP(MI, BB); @@ -22611,7 +22693,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // Thread synchronization. case X86::MONITOR: return EmitMonitor(MI, BB, Subtarget); - + // PKU feature + case X86::WRPKRU: + return EmitWRPKRU(MI, BB, Subtarget); + case X86::RDPKRU: + return EmitRDPKRU(MI, BB, Subtarget); // xbegin case X86::XBEGIN: return EmitXBegin(MI, BB, Subtarget->getInstrInfo()); @@ -23480,6 +23566,31 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, } return SDValue(); } + case X86ISD::BLENDI: { + SDValue V0 = N->getOperand(0); + SDValue V1 = N->getOperand(1); + assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() && + "Unexpected input vector types"); + + // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector + // operands and changing the mask to 1. This saves us a bunch of + // pattern-matching possibilities related to scalar math ops in SSE/AVX. + // x86InstrInfo knows how to commute this back after instruction selection + // if it would help register allocation. + + // TODO: If optimizing for size or a processor that doesn't suffer from + // partial register update stalls, this should be transformed into a MOVSD + // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD. + + if (VT == MVT::v2f64) + if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2))) + if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) { + SDValue NewMask = DAG.getConstant(1, DL, MVT::i8); + return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask); + } + + return SDValue(); + } default: return SDValue(); } @@ -23573,9 +23684,13 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, /// the operands which explicitly discard the lanes which are unused by this /// operation to try to flow through the rest of the combiner the fact that /// they're unused. -static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) { +static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDLoc DL(N); EVT VT = N->getValueType(0); + if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && + (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) + return SDValue(); // We only handle target-independent shuffles. // FIXME: It would be easy and harmless to use the target shuffle mask @@ -23617,12 +23732,6 @@ static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) { isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}))) return SDValue(); - // Only specific types are legal at this point, assert so we notice if and - // when these change. - assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 || - VT == MVT::v4f64) && - "Unknown vector type encountered!"); - return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS); } @@ -23642,8 +23751,8 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, // If we have legalized the vector types, look for blends of FADD and FSUB // nodes that we can fuse into an ADDSUB node. - if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3()) - if (SDValue AddSub = combineShuffleToAddSub(N, DAG)) + if (TLI.isTypeLegal(VT)) + if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG)) return AddSub; // Combine 256-bit vector shuffles. 
This is only profitable when in AVX mode @@ -27310,7 +27419,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, // from AH (which we otherwise need to do contortions to access). if (N0.getOpcode() == ISD::UDIVREM && N0.getResNo() == 1 && N0.getValueType() == MVT::i8 && - (VT == MVT::i32 || VT == MVT::i64)) { + VT == MVT::i32) { SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys, N0.getOperand(0), N0.getOperand(1)); @@ -27382,32 +27491,6 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) { - SDValue V0 = N->getOperand(0); - SDValue V1 = N->getOperand(1); - SDLoc DL(N); - EVT VT = N->getValueType(0); - - // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector - // operands and changing the mask to 1. This saves us a bunch of - // pattern-matching possibilities related to scalar math ops in SSE/AVX. - // x86InstrInfo knows how to commute this back after instruction selection - // if it would help register allocation. - - // TODO: If optimizing for size or a processor that doesn't suffer from - // partial register update stalls, this should be transformed into a MOVSD - // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD. - - if (VT == MVT::v2f64) - if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2))) - if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) { - SDValue NewMask = DAG.getConstant(1, DL, MVT::i8); - return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask); - } - - return SDValue(); -} - static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); // Gather and Scatter instructions use k-registers for masks. The type of @@ -27840,6 +27923,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FANDN: return PerformFANDNCombine(N, DAG, Subtarget); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); +// TODO: refactor the [SU]DIVREM8_[SZ]EXT_HREG code so that it's not duplicated. case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); @@ -27851,6 +27935,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::PALIGNR: + case X86ISD::BLENDI: case X86ISD::UNPCKH: case X86ISD::UNPCKL: case X86ISD::MOVHLPS: @@ -27865,7 +27950,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::VPERM2X128: case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget); - case X86ISD::BLENDI: return PerformBLENDICombine(N, DAG); case ISD::MGATHER: case ISD::MSCATTER: return PerformGatherScatterCombine(N, DAG); } @@ -27902,6 +27986,18 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { } } +/// This function checks if any of the users of EFLAGS copies the EFLAGS. We +/// know that the code that lowers COPY of EFLAGS has to use the stack, and if +/// we don't adjust the stack we clobber the first frame index. +/// See X86InstrInfo::copyPhysReg. 
+bool X86TargetLowering::hasCopyImplyingStackAdjustment( + MachineFunction *MF) const { + const MachineRegisterInfo &MRI = MF->getRegInfo(); + + return any_of(MRI.reg_instructions(X86::EFLAGS), + [](const MachineInstr &RI) { return RI.isCopy(); }); +} + /// IsDesirableToPromoteOp - This method query the target whether it is /// beneficial for dag combiner to promote the specified node. If true, it /// should return the desired promotion type by reference. diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index a29dc9af54f6..8bb0e5f8bd36 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -697,6 +697,10 @@ namespace llvm { /// and some i16 instructions are slow. bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override; + /// Return true if the MachineFunction contains a COPY which would imply + /// HasOpaqueSPAdjustment. + bool hasCopyImplyingStackAdjustment(MachineFunction *MF) const override; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const override; diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 8bf2925a75db..0a27c33f033e 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -2366,6 +2366,7 @@ def : Pat<(xor (xor VK1:$src1, VK1:$src2), (i1 1)), multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT, RegisterClass KRCSrc, Predicate prd> { let Predicates = [prd] in { + let hasSideEffects = 0 in def rr : I<0x4b, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2), "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 5d7283f7bd57..96a29ca8c370 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -250,7 +250,7 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), // Alias instruction mapping movr0 to xor. // FIXME: remove when we can teach regalloc that xor reg, reg is ok. let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1, - isPseudo = 1, AddedComplexity = 20 in + isPseudo = 1 in def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "", [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>; @@ -263,7 +263,7 @@ def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> { } let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode], - AddedComplexity = 15 in { + AddedComplexity = 1 in { // Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC, // which only require 3 bytes compared to MOV32ri which requires 5. let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in { @@ -278,24 +278,12 @@ let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode], def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>; } -let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 10 in { -// AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1. -// FIXME: Add itinerary class and Schedule. -def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "", - [(set GR32:$dst, i32immSExt8:$src)]>, - Requires<[OptForMinSize]>; -def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "", - [(set GR64:$dst, i64immSExt8:$src)]>, - Requires<[OptForMinSize, NotWin64WithoutFP]>; -} - // Materialize i64 constant where top 32-bits are zero. 
This could theoretically // use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however // that would make it more difficult to rematerialize. -let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1, - isCodeGenOnly = 1, hasSideEffects = 0 in -def MOV32ri64 : Ii32<0xb8, AddRegFrm, (outs GR32:$dst), (ins i64i32imm:$src), - "", [], IIC_ALU_NONMEM>, Sched<[WriteALU]>; +let isReMaterializable = 1, isAsCheapAsAMove = 1, + isPseudo = 1, hasSideEffects = 0 in +def MOV32ri64 : I<0, Pseudo, (outs GR32:$dst), (ins i64i32imm:$src), "", []>; // This 64-bit pseudo-move can be used for both a 64-bit constant that is // actually the zero-extension of a 32-bit constant and for labels in the @@ -566,8 +554,8 @@ let usesCustomInserter = 1, Uses = [EFLAGS] in { // TODO: Get this to fold the constant into the instruction. let isCodeGenOnly = 1, Defs = [EFLAGS] in def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero), - "or{l}\t{$zero, $dst|$dst, $zero}", - [], IIC_ALU_MEM>, Requires<[Not64BitMode]>, LOCK, + "or{l}\t{$zero, $dst|$dst, $zero}", [], + IIC_ALU_MEM>, Requires<[Not64BitMode]>, OpSize32, LOCK, Sched<[WriteALULd, WriteRMW]>; let hasSideEffects = 1 in diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 63e78de69bc9..246804e34289 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -23,7 +23,6 @@ #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DerivedTypes.h" @@ -4453,7 +4452,8 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // such as TF/IF/DF, which LLVM doesn't model. // // Notice that we have to adjust the stack if we don't want to clobber the - // first frame index. See X86FrameLowering.cpp - usesTheStack. + // first frame index. + // See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment. bool AXDead = (Reg == AX) || @@ -4465,6 +4465,10 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // (unnecessarily) saving+restoring a dead register. However the // MachineVerifier expects operands that read from dead registers // to be marked with the "undef" flag. + // An example of this can be found in + // test/CodeGen/X86/peephole-na-phys-copy-folding.ll and + // test/CodeGen/X86/cmpxchg-clobber-flags.ll when using + // -verify-machineinstrs. BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true)); } if (FromEFLAGS) { @@ -5309,50 +5313,6 @@ static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, return true; } -bool X86InstrInfo::ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const { - MachineBasicBlock &MBB = *MIB->getParent(); - DebugLoc DL = MIB->getDebugLoc(); - int64_t Imm = MIB->getOperand(1).getImm(); - assert(Imm != 0 && "Using push/pop for 0 is not efficient."); - MachineBasicBlock::iterator I = MIB.getInstr(); - - int StackAdjustment; - - if (Subtarget.is64Bit()) { - assert(MIB->getOpcode() == X86::MOV64ImmSExti8 || - MIB->getOpcode() == X86::MOV32ImmSExti8); - // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and - // widen the register if necessary. 
- StackAdjustment = 8; - BuildMI(MBB, I, DL, get(X86::PUSH64i8)).addImm(Imm); - MIB->setDesc(get(X86::POP64r)); - MIB->getOperand(0) - .setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), 64)); - } else { - assert(MIB->getOpcode() == X86::MOV32ImmSExti8); - StackAdjustment = 4; - BuildMI(MBB, I, DL, get(X86::PUSH32i8)).addImm(Imm); - MIB->setDesc(get(X86::POP32r)); - } - - // Build CFI if necessary. - MachineFunction &MF = *MBB.getParent(); - const X86FrameLowering *TFL = Subtarget.getFrameLowering(); - bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); - bool NeedsDwarfCFI = - !IsWin64Prologue && - (MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry()); - bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI; - if (EmitCFI) { - TFL->BuildCFI(MBB, I, DL, - MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment)); - TFL->BuildCFI(MBB, std::next(I), DL, - MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment)); - } - - return true; -} - // LoadStackGuard has so far only been implemented for 64-bit MachO. Different // code sequence is needed for other targets. static void expandLoadStackGuard(MachineInstrBuilder &MIB, @@ -5385,9 +5345,6 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { return expandMOV32r1(MIB, *this, /*MinusOne=*/ false); case X86::MOV32r_1: return expandMOV32r1(MIB, *this, /*MinusOne=*/ true); - case X86::MOV32ImmSExti8: - case X86::MOV64ImmSExti8: - return ExpandMOVImmSExti8(MIB); case X86::SETB_C8r: return Expand2AddrUndef(MIB, get(X86::SBB8rr)); case X86::SETB_C16r: @@ -5412,7 +5369,10 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { case X86::TEST8ri_NOREX: MI->setDesc(get(X86::TEST8ri)); return true; - + case X86::MOV32ri64: + MI->setDesc(get(X86::MOV32ri)); + return true; + // KNL does not recognize dependency-breaking idioms for mask registers, // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1. // Using %k0 as the undef input register is a performance heuristic based diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 9d40334206b2..edd09d617595 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -23,7 +23,6 @@ #include "X86GenInstrInfo.inc" namespace llvm { - class MachineInstrBuilder; class X86RegisterInfo; class X86Subtarget; @@ -565,9 +564,6 @@ private: /// operand and follow operands form a reference to the stack frame. bool isFrameOperand(const MachineInstr *MI, unsigned int Op, int &FrameIndex) const; - - /// Expand the MOVImmSExti8 pseudo-instructions. 
- bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const; }; } // End llvm namespace diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index f4ca2b880bad..ea8e56206ce6 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -822,8 +822,6 @@ def In32BitMode : Predicate<"Subtarget->is32Bit()">, AssemblerPredicate<"Mode32Bit", "32-bit mode">; def IsWin64 : Predicate<"Subtarget->isTargetWin64()">; def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">; -def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||" - "Subtarget->getFrameLowering()->hasFP(*MF)">; def IsPS4 : Predicate<"Subtarget->isTargetPS4()">; def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">; def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; @@ -837,7 +835,6 @@ def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||" def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">; def IsNotPIC : Predicate<"TM.getRelocationModel() != Reloc::PIC_">; def OptForSize : Predicate<"OptForSize">; -def OptForMinSize : Predicate<"OptForMinSize">; def OptForSpeed : Predicate<"!OptForSize">; def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">; def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">; @@ -1093,6 +1090,32 @@ def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[], } +let mayLoad = 1, mayStore = 1, usesCustomInserter = 1, + SchedRW = [WriteRMW], Defs = [ESP] in { + let Uses = [ESP, EFLAGS] in + def RDFLAGS32 : PseudoI<(outs GR32:$dst), (ins), + [(set GR32:$dst, (int_x86_flags_read_u32))]>, + Requires<[Not64BitMode]>; + + let Uses = [RSP, EFLAGS] in + def RDFLAGS64 : PseudoI<(outs GR64:$dst), (ins), + [(set GR64:$dst, (int_x86_flags_read_u64))]>, + Requires<[In64BitMode]>; +} + +let mayLoad = 1, mayStore = 1, usesCustomInserter = 1, + SchedRW = [WriteRMW] in { + let Defs = [ESP, EFLAGS], Uses = [ESP] in + def WRFLAGS32 : PseudoI<(outs), (ins GR32:$src), + [(int_x86_flags_write_u32 GR32:$src)]>, + Requires<[Not64BitMode]>; + + let Defs = [RSP, EFLAGS], Uses = [RSP] in + def WRFLAGS64 : PseudoI<(outs), (ins GR64:$src), + [(int_x86_flags_write_u64 GR64:$src)]>, + Requires<[In64BitMode]>; +} + let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, hasSideEffects=0, SchedRW = [WriteLoad] in { def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, @@ -1133,7 +1156,8 @@ def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [], let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1, SchedRW = [WriteStore] in { def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm), - "push{q}\t$imm", [], IIC_PUSH_IMM>, Requires<[In64BitMode]>; + "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32, + Requires<[In64BitMode]>; def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm), "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32, Requires<[In64BitMode]>; diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index 11dc1e7d466b..83f9b1409f61 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -651,7 +651,7 @@ def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))), // Misc. 
let SchedRW = [WriteShuffle] in { -let Uses = [EDI], Predicates = [HasSSE1,In32BitMode] in +let Uses = [EDI], Predicates = [HasSSE1,Not64BitMode] in def MMX_MASKMOVQ : MMXI32<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), "maskmovq\t{$mask, $src|$src, $mask}", [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)], diff --git a/lib/Target/X86/X86InstrMPX.td b/lib/Target/X86/X86InstrMPX.td index cf5e2e38fe58..31608cd4c128 100644 --- a/lib/Target/X86/X86InstrMPX.td +++ b/lib/Target/X86/X86InstrMPX.td @@ -63,8 +63,8 @@ def BNDMOVMR64mr : RI<0x1B, MRMDestMem, (outs i128mem:$dst), (ins BNDR:$src), Requires<[HasMPX, In64BitMode]>; def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src), - "bndstx \t{$src, $dst|$dst, $src}", []>, TB, + "bndstx \t{$src, $dst|$dst, $src}", []>, PS, Requires<[HasMPX]>; def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src), - "bndldx \t{$src, $dst|$dst, $src}", []>, TB, - Requires<[HasMPX]>;
\ No newline at end of file + "bndldx \t{$src, $dst|$dst, $src}", []>, PS, + Requires<[HasMPX]>; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 7a44212bd829..624b9316e6fd 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -1466,6 +1466,8 @@ def SSE_CVT_SD2SI : OpndItins< IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM >; +// FIXME: We probably want to match the rm form only when optimizing for +// size, to avoid false depenendecies (see sse_fp_unop_s for details) multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, string asm, OpndItins itins> { @@ -1489,6 +1491,8 @@ let hasSideEffects = 0 in { } } +// FIXME: We probably want to match the rm form only when optimizing for +// size, to avoid false depenendecies (see sse_fp_unop_s for details) multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, X86MemOperand x86memop, string asm> { let hasSideEffects = 0, Predicates = [UseAVX] in { @@ -1626,6 +1630,8 @@ def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}", // Conversion Instructions Intrinsics - Match intrinsics which expect MM // and/or XMM operand(s). +// FIXME: We probably want to match the rm form only when optimizing for +// size, to avoid false depenendecies (see sse_fp_unop_s for details) multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, Intrinsic Int, Operand memop, ComplexPattern mem_cpat, string asm, OpndItins itins> { @@ -3387,9 +3393,18 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, def : Pat<(Intr (load addr:$src)), (vt (COPY_TO_REGCLASS(!cast<Instruction>(NAME#Suffix##m) addr:$src), VR128))>; - def : Pat<(Intr mem_cpat:$src), - (!cast<Instruction>(NAME#Suffix##m_Int) - (vt (IMPLICIT_DEF)), mem_cpat:$src)>; + } + // We don't want to fold scalar loads into these instructions unless + // optimizing for size. This is because the folded instruction will have a + // partial register update, while the unfolded sequence will not, e.g. + // movss mem, %xmm0 + // rcpss %xmm0, %xmm0 + // which has a clobber before the rcp, vs. + // rcpss mem, %xmm0 + let Predicates = [target, OptForSize] in { + def : Pat<(Intr mem_cpat:$src), + (!cast<Instruction>(NAME#Suffix##m_Int) + (vt (IMPLICIT_DEF)), mem_cpat:$src)>; } } @@ -3420,28 +3435,37 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, } } + // We don't want to fold scalar loads into these instructions unless + // optimizing for size. This is because the folded instruction will have a + // partial register update, while the unfolded sequence will not, e.g. + // vmovss mem, %xmm0 + // vrcpss %xmm0, %xmm0, %xmm0 + // which has a clobber before the rcp, vs. + // vrcpss mem, %xmm0, %xmm0 + // TODO: In theory, we could fold the load, and avoid the stall caused by + // the partial register store, either in ExeDepFix or with smarter RA. 
let Predicates = [UseAVX] in { def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r) (ScalarVT (IMPLICIT_DEF)), RC:$src)>; - - def : Pat<(vt (OpNode mem_cpat:$src)), - (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)), - mem_cpat:$src)>; - } let Predicates = [HasAVX] in { def : Pat<(Intr VR128:$src), (!cast<Instruction>("V"#NAME#Suffix##r_Int) (vt (IMPLICIT_DEF)), VR128:$src)>; - - def : Pat<(Intr mem_cpat:$src), - (!cast<Instruction>("V"#NAME#Suffix##m_Int) + } + let Predicates = [HasAVX, OptForSize] in { + def : Pat<(Intr mem_cpat:$src), + (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)), mem_cpat:$src)>; } - let Predicates = [UseAVX, OptForSize] in - def : Pat<(ScalarVT (OpNode (load addr:$src))), - (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)), - addr:$src)>; + let Predicates = [UseAVX, OptForSize] in { + def : Pat<(ScalarVT (OpNode (load addr:$src))), + (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)), + addr:$src)>; + def : Pat<(vt (OpNode mem_cpat:$src)), + (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)), + mem_cpat:$src)>; + } } /// sse1_fp_unop_p - SSE1 unops in packed form. diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index 85e17f516f91..a97d1e5c86d0 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -498,10 +498,10 @@ let Predicates = [HasXSAVE] in { let Predicates = [HasXSAVEOPT] in { def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaque512mem:$dst), "xsaveopt\t$dst", - [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, TB; + [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, PS; def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaque512mem:$dst), "xsaveopt64\t$dst", - [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; + [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, PS, Requires<[In64BitMode]>; } let Predicates = [HasXSAVEC] in { def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaque512mem:$dst), @@ -551,10 +551,17 @@ let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in def MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB; //==-----------------------------------------------------------------------===// // PKU - enable protection key +let usesCustomInserter = 1 in { + def WRPKRU : PseudoI<(outs), (ins GR32:$src), + [(int_x86_wrpkru GR32:$src)]>; + def RDPKRU : PseudoI<(outs GR32:$dst), (ins), + [(set GR32:$dst, (int_x86_rdpkru))]>; +} + let Defs = [EAX, EDX], Uses = [ECX] in - def RDPKRU : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB; + def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB; let Uses = [EAX, ECX, EDX] in - def WRPKRU : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB; + def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB; //===----------------------------------------------------------------------===// // FS/GS Base Instructions diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index dc6d85d582c8..646b556faa8f 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -1208,19 +1208,55 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_pshuf_b_512, INTR_TYPE_2OP_MASK, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(avx512_mask_psll_d, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_di_128, 
INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx512_mask_psll_q, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx512_mask_pslli_d, VSHIFT_MASK, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx512_mask_pslli_q, VSHIFT_MASK, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx512_mask_psllv_d, INTR_TYPE_2OP_MASK, ISD::SHL, 0), X86_INTRINSIC_DATA(avx512_mask_psllv_q, INTR_TYPE_2OP_MASK, ISD::SHL, 0), X86_INTRINSIC_DATA(avx512_mask_psra_d, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_di_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), X86_INTRINSIC_DATA(avx512_mask_psra_q, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_wi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_wi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), X86_INTRINSIC_DATA(avx512_mask_psrai_d, VSHIFT_MASK, X86ISD::VSRAI, 0), X86_INTRINSIC_DATA(avx512_mask_psrai_q, VSHIFT_MASK, X86ISD::VSRAI, 0), X86_INTRINSIC_DATA(avx512_mask_psrav_d, INTR_TYPE_2OP_MASK, ISD::SRA, 0), X86_INTRINSIC_DATA(avx512_mask_psrav_q, INTR_TYPE_2OP_MASK, ISD::SRA, 0), X86_INTRINSIC_DATA(avx512_mask_psrl_d, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_di_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx512_mask_psrl_q, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + 
X86_INTRINSIC_DATA(avx512_mask_psrl_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx512_mask_psrl_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), X86_INTRINSIC_DATA(avx512_mask_psrl_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), X86_INTRINSIC_DATA(avx512_mask_psrl_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), @@ -1229,6 +1265,13 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_psrl_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx512_mask_psrli_d, VSHIFT_MASK, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx512_mask_psrli_q, VSHIFT_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv16_hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv2_di, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv32hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv4_di, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv4_si, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv8_hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv8_si, INTR_TYPE_2OP_MASK, ISD::SRL, 0), X86_INTRINSIC_DATA(avx512_mask_psrlv_d, INTR_TYPE_2OP_MASK, ISD::SRL, 0), X86_INTRINSIC_DATA(avx512_mask_psrlv_q, INTR_TYPE_2OP_MASK, ISD::SRL, 0), X86_INTRINSIC_DATA(avx512_mask_psub_b_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0), diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index e186f7039b43..e1ca558f0f2c 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -14,6 +14,7 @@ #include "X86AsmPrinter.h" #include "X86RegisterInfo.h" +#include "X86ShuffleDecodeConstantPool.h" #include "InstPrinter/X86ATTInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" #include "Utils/X86ShuffleDecode.h" @@ -454,10 +455,6 @@ ReSimplify: "LEA has segment specified!"); break; - case X86::MOV32ri64: - OutMI.setOpcode(X86::MOV32ri); - break; - // Commute operands to get a smaller encoding by using VEX.R instead of VEX.B // if one of the registers is extended, but other isn't. case X86::VMOVZPQILo2PQIrr: diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp new file mode 100644 index 000000000000..ef16c5bdbfd8 --- /dev/null +++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp @@ -0,0 +1,190 @@ +//===-- X86ShuffleDecodeConstantPool.cpp - X86 shuffle decode -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Define several functions to decode x86 specific shuffle semantics using +// constants from the constant pool. 
+// +//===----------------------------------------------------------------------===// + +#include "X86ShuffleDecodeConstantPool.h" +#include "Utils/X86ShuffleDecode.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/IR/Constants.h" + +//===----------------------------------------------------------------------===// +// Vector Mask Decoding +//===----------------------------------------------------------------------===// + +namespace llvm { + +void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + // It is not an error for the PSHUFB mask to not be a vector of i8 because the + // constant pool uniques constants by their bit representation. + // e.g. the following take up the same space in the constant pool: + // i128 -170141183420855150465331762880109871104 + // + // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160> + // + // <4 x i32> <i32 -2147483648, i32 -2147483648, + // i32 -2147483648, i32 -2147483648> + +#ifndef NDEBUG + unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); + assert(MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512); +#endif + + // This is a straightforward byte vector. + if (MaskTy->isVectorTy() && MaskTy->getVectorElementType()->isIntegerTy(8)) { + int NumElements = MaskTy->getVectorNumElements(); + ShuffleMask.reserve(NumElements); + + for (int i = 0; i < NumElements; ++i) { + // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte + // lane of the vector we're inside. + int Base = i & ~0xf; + Constant *COp = C->getAggregateElement(i); + if (!COp) { + ShuffleMask.clear(); + return; + } else if (isa<UndefValue>(COp)) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; + } + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + // If the high bit (7) of the byte is set, the element is zeroed. + if (Element & (1 << 7)) + ShuffleMask.push_back(SM_SentinelZero); + else { + // Only the least significant 4 bits of the byte are used. + int Index = Base + (Element & 0xf); + ShuffleMask.push_back(Index); + } + } + } + // TODO: Handle funny-looking vectors too. +} + +void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, + SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + // It is not an error for the PSHUFB mask to not be a vector of i8 because the + // constant pool uniques constants by their bit representation. + // e.g. the following take up the same space in the constant pool: + // i128 -170141183420855150465331762880109871104 + // + // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160> + // + // <4 x i32> <i32 -2147483648, i32 -2147483648, + // i32 -2147483648, i32 -2147483648> + + unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); + + if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512. + return; + + // Only support vector types. + if (!MaskTy->isVectorTy()) + return; + + // Make sure its an integer type. + Type *VecEltTy = MaskTy->getVectorElementType(); + if (!VecEltTy->isIntegerTy()) + return; + + // Support any element type from byte up to element size. + // This is necesary primarily because 64-bit elements get split to 32-bit + // in the constant pool on 32-bit target. 
+ unsigned EltTySize = VecEltTy->getIntegerBitWidth(); + if (EltTySize < 8 || EltTySize > ElSize) + return; + + unsigned NumElements = MaskTySize / ElSize; + assert((NumElements == 2 || NumElements == 4 || NumElements == 8) && + "Unexpected number of vector elements."); + ShuffleMask.reserve(NumElements); + unsigned NumElementsPerLane = 128 / ElSize; + unsigned Factor = ElSize / EltTySize; + + for (unsigned i = 0; i < NumElements; ++i) { + Constant *COp = C->getAggregateElement(i * Factor); + if (!COp) { + ShuffleMask.clear(); + return; + } else if (isa<UndefValue>(COp)) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; + } + int Index = i & ~(NumElementsPerLane - 1); + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + if (ElSize == 64) + Index += (Element >> 1) & 0x1; + else + Index += Element & 0x3; + ShuffleMask.push_back(Index); + } + + // TODO: Handle funny-looking vectors too. +} + +void DecodeVPERMVMask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + if (MaskTy->isVectorTy()) { + unsigned NumElements = MaskTy->getVectorNumElements(); + if (NumElements == VT.getVectorNumElements()) { + for (unsigned i = 0; i < NumElements; ++i) { + Constant *COp = C->getAggregateElement(i); + if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) { + ShuffleMask.clear(); + return; + } + if (isa<UndefValue>(COp)) + ShuffleMask.push_back(SM_SentinelUndef); + else { + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + Element &= (1 << NumElements) - 1; + ShuffleMask.push_back(Element); + } + } + } + return; + } + // Scalar value; just broadcast it + if (!isa<ConstantInt>(C)) + return; + uint64_t Element = cast<ConstantInt>(C)->getZExtValue(); + int NumElements = VT.getVectorNumElements(); + Element &= (1 << NumElements) - 1; + for (int i = 0; i < NumElements; ++i) + ShuffleMask.push_back(Element); +} + +void DecodeVPERMV3Mask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + unsigned NumElements = MaskTy->getVectorNumElements(); + if (NumElements == VT.getVectorNumElements()) { + for (unsigned i = 0; i < NumElements; ++i) { + Constant *COp = C->getAggregateElement(i); + if (!COp) { + ShuffleMask.clear(); + return; + } + if (isa<UndefValue>(COp)) + ShuffleMask.push_back(SM_SentinelUndef); + else { + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + Element &= (1 << NumElements*2) - 1; + ShuffleMask.push_back(Element); + } + } + } +} +} // llvm namespace diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/lib/Target/X86/X86ShuffleDecodeConstantPool.h new file mode 100644 index 000000000000..bcf46322c8cd --- /dev/null +++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.h @@ -0,0 +1,45 @@ +//===-- X86ShuffleDecodeConstantPool.h - X86 shuffle decode -----*-C++-*---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Define several functions to decode x86 specific shuffle semantics using +// constants from the constant pool. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H +#define LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H + +#include "llvm/ADT/SmallVector.h" + +//===----------------------------------------------------------------------===// +// Vector Mask Decoding +//===----------------------------------------------------------------------===// + +namespace llvm { +class Constant; +class MVT; + +/// \brief Decode a PSHUFB mask from an IR-level vector constant. +void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERMILP variable mask from an IR-level vector constant. +void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant. +void DecodeVPERMVMask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant. +void DecodeVPERMV3Mask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask); + +} // llvm namespace + +#endif
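The byte-level rule implemented by DecodePSHUFBMask above can be stated compactly: if bit 7 of a control byte is set, the corresponding destination byte is zeroed; otherwise the low four bits select a source byte within the same 16-byte lane. A minimal standalone sketch of that rule, using plain integers instead of IR constants (the function name and the -2 zero marker are illustrative, mirroring SM_SentinelZero):

#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative sketch of the per-byte PSHUFB decode: bit 7 set -> zeroed
// element (marked -2 here); otherwise the low four bits index a byte inside
// the 16-byte lane that contains position i.
static std::vector<int> decodePshufbBytes(const std::vector<uint8_t> &Ctrl) {
  std::vector<int> Mask;
  Mask.reserve(Ctrl.size());
  for (std::size_t i = 0; i != Ctrl.size(); ++i) {
    unsigned LaneBase = i & ~0xfu;           // start of the 16-byte lane
    uint8_t B = Ctrl[i];
    if (B & 0x80)
      Mask.push_back(-2);                    // zeroed destination byte
    else
      Mask.push_back(LaneBase + (B & 0xf));  // stays within the same lane
  }
  return Mask;
}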