Diffstat (limited to 'lib/Target/X86')
 lib/Target/X86/AsmParser/X86AsmParser.cpp |  94
 lib/Target/X86/AsmParser/X86Operand.h     |  14
 lib/Target/X86/X86AsmPrinter.h            |   1
 lib/Target/X86/X86FrameLowering.cpp       |   4
 lib/Target/X86/X86ISelLowering.cpp        | 273
 lib/Target/X86/X86InstrAVX512.td          |  39
 lib/Target/X86/X86InstrInfo.td            |   6
 lib/Target/X86/X86InstrSSE.td             |   9
 lib/Target/X86/X86InstructionSelector.cpp |  30
 lib/Target/X86/X86LegalizerInfo.cpp       | 102
 lib/Target/X86/X86LegalizerInfo.h         |   5
 lib/Target/X86/X86MCInstLower.cpp         |  80
 lib/Target/X86/X86OptimizeLEAs.cpp        |   8
 lib/Target/X86/X86RegisterBankInfo.cpp    |  23
 lib/Target/X86/X86RegisterBankInfo.h      |   7
 lib/Target/X86/X86Subtarget.cpp           |  10
 lib/Target/X86/X86TargetTransformInfo.cpp |  32
17 files changed, 442 insertions, 295 deletions
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index c1cfc82b4a812..32ab475f11867 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -776,11 +776,6 @@ private:
   bool ParseZ(std::unique_ptr<X86Operand> &Z, const SMLoc &StartLoc);
 
-  /// MS-compatibility:
-  /// Obtain an appropriate size qualifier, when facing its absence,
-  /// upon AVX512 vector/broadcast memory operand
-  unsigned AdjustAVX512Mem(unsigned Size, X86Operand* UnsizedMemOpNext);
-
   bool is64BitMode() const {
     // FIXME: Can tablegen auto-generate this?
     return getSTI().getFeatureBits()[X86::Mode64Bit];
   }
@@ -1206,27 +1201,16 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm(
                                  Identifier, Info.OpDecl);
   }
 
+
   // We either have a direct symbol reference, or an offset from a symbol. The
   // parser always puts the symbol on the LHS, so look there for size
   // calculation purposes.
+  unsigned FrontendSize = 0;
   const MCBinaryExpr *BinOp = dyn_cast<MCBinaryExpr>(Disp);
   bool IsSymRef = isa<MCSymbolRefExpr>(BinOp ? BinOp->getLHS() : Disp);
-  if (IsSymRef) {
-    if (!Size) {
-      Size = Info.Type * 8; // Size is in terms of bits in this context.
-      if (Size)
-        InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start,
-                                            /*Len=*/0, Size);
-      if (AllowBetterSizeMatch)
-        // Handle cases where size qualifier is absent, upon an indirect symbol
-        // reference - e.g. "vaddps zmm1, zmm2, [var]"
-        // set Size to zero to allow matching mechansim to try and find a better
-        // size qualifier than our initial guess, based on available variants of
-        // the given instruction
-        Size = 0;
-    }
-  }
+  if (IsSymRef && !Size && Info.Type)
+    FrontendSize = Info.Type * 8; // Size is in terms of bits in this context.
 
   // When parsing inline assembly we set the base register to a non-zero value
   // if we don't know the actual value at this time. This is necessary to
@@ -1234,7 +1218,7 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm(
   BaseReg = BaseReg ? BaseReg : 1;
   return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg,
                                IndexReg, Scale, Start, End, Size, Identifier,
-                               Info.OpDecl);
+                               Info.OpDecl, FrontendSize);
 }
 
 static void
@@ -2884,23 +2868,6 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
   return true;
 }
 
-unsigned X86AsmParser::AdjustAVX512Mem(unsigned Size,
-                                       X86Operand* UnsizedMemOpNext) {
-  // Check for the existence of an AVX512 platform
-  if (!getSTI().getFeatureBits()[X86::FeatureAVX512])
-    return 0;
-  // Allow adjusting upon a (x|y|z)mm
-  if (Size == 512 || Size == 256 || Size == 128)
-    return Size;
-  // This is an allegadly broadcasting mem op adjustment,
-  // allow some more inquiring to validate it
-  if (Size == 64 || Size == 32)
-    return UnsizedMemOpNext && UnsizedMemOpNext->isToken() &&
-           UnsizedMemOpNext->getToken().substr(0, 4).equals("{1to") ? Size : 0;
-  // Do not allow any other type of adjustments
-  return 0;
-}
-
 bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
                                                 OperandVector &Operands,
                                                 MCStreamer &Out,
@@ -2920,19 +2887,14 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
 
   // Find one unsized memory operand, if present.
   X86Operand *UnsizedMemOp = nullptr;
-  // If unsized memory operand was found - obtain following operand.
-  // For use in AdjustAVX512Mem
-  X86Operand *UnsizedMemOpNext = nullptr;
   for (const auto &Op : Operands) {
     X86Operand *X86Op = static_cast<X86Operand *>(Op.get());
-    if (UnsizedMemOp) {
-      UnsizedMemOpNext = X86Op;
+    if (X86Op->isMemUnsized()) {
+      UnsizedMemOp = X86Op;
       // Have we found an unqualified memory operand,
       // break. IA allows only one memory operand.
       break;
     }
-    if (X86Op->isMemUnsized())
-      UnsizedMemOp = X86Op;
   }
 
   // Allow some instructions to have implicitly pointer-sized operands. This is
@@ -2978,7 +2940,6 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
   // If an unsized memory operand is present, try to match with each memory
   // operand size. In Intel assembly, the size is not part of the instruction
   // mnemonic.
-  unsigned MatchedSize = 0;
   if (UnsizedMemOp && UnsizedMemOp->isMemUnsized()) {
     static const unsigned MopSizes[] = {8, 16, 32, 64, 80, 128, 256, 512};
     for (unsigned Size : MopSizes) {
@@ -2993,17 +2954,10 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
       // If this returned as a missing feature failure, remember that.
       if (Match.back() == Match_MissingFeature)
         ErrorInfoMissingFeature = ErrorInfoIgnore;
-      if (M == Match_Success)
-        // MS-compatability:
-        // Adjust AVX512 vector/broadcast memory operand,
-        // when facing the absence of a size qualifier.
-        // Match GCC behavior on respective cases.
-        MatchedSize = AdjustAVX512Mem(Size, UnsizedMemOpNext);
     }
 
     // Restore the size of the unsized memory operand if we modified it.
-    if (UnsizedMemOp)
-      UnsizedMemOp->Mem.Size = 0;
+    UnsizedMemOp->Mem.Size = 0;
   }
 
   // If we haven't matched anything yet, this is not a basic integer or FPU
@@ -3027,20 +2981,30 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
                  Op.getLocRange(), MatchingInlineAsm);
   }
 
+  unsigned NumSuccessfulMatches =
+      std::count(std::begin(Match), std::end(Match), Match_Success);
+
+  // If matching was ambiguous and we had size information from the frontend,
+  // try again with that. This handles cases like "movxz eax, m8/m16".
+  if (UnsizedMemOp && NumSuccessfulMatches > 1 &&
+      UnsizedMemOp->getMemFrontendSize()) {
+    UnsizedMemOp->Mem.Size = UnsizedMemOp->getMemFrontendSize();
+    unsigned M = MatchInstruction(
+        Operands, Inst, ErrorInfo, MatchingInlineAsm, isParsingIntelSyntax());
+    if (M == Match_Success)
+      NumSuccessfulMatches = 1;
+
+    // Add a rewrite that encodes the size information we used from the
+    // frontend.
+    InstInfo->AsmRewrites->emplace_back(
+        AOK_SizeDirective, UnsizedMemOp->getStartLoc(),
+        /*Len=*/0, UnsizedMemOp->getMemFrontendSize());
+  }
+
   // If exactly one matched, then we treat that as a successful match (and the
   // instruction will already have been filled in correctly, since the failing
   // matches won't have modified it).
-  unsigned NumSuccessfulMatches =
-      std::count(std::begin(Match), std::end(Match), Match_Success);
   if (NumSuccessfulMatches == 1) {
-    if (MatchedSize && isParsingInlineAsm() && isParsingIntelSyntax())
-      // MS compatibility -
-      // Fix the rewrite according to the matched memory size
-      // MS inline assembly only
-      for (AsmRewrite &AR : *InstInfo->AsmRewrites)
-        if ((AR.Loc.getPointer() == UnsizedMemOp->StartLoc.getPointer()) &&
-            (AR.Kind == AOK_SizeDirective))
-          AR.Val = MatchedSize;
     // Some instructions need post-processing to, for example, tweak which
     // encoding is selected. Loop on it while changes happen so the individual
     // transformations can chain off each other.
@@ -3057,7 +3021,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
            "multiple matches only possible with unsized memory operands");
     return Error(UnsizedMemOp->getStartLoc(),
                  "ambiguous operand size for instruction '" + Mnemonic + "\'",
-                 UnsizedMemOp->getLocRange(), MatchingInlineAsm);
+                 UnsizedMemOp->getLocRange());
   }
 
   // If one instruction matched with a missing feature, report this as a
diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h
index 9f1fa6c659070..33eff14b82157 100644
--- a/lib/Target/X86/AsmParser/X86Operand.h
+++ b/lib/Target/X86/AsmParser/X86Operand.h
@@ -62,6 +62,10 @@ struct X86Operand : public MCParsedAsmOperand {
     unsigned Scale;
     unsigned Size;
     unsigned ModeSize;
+
+    /// If the memory operand is unsized and there are multiple instruction
+    /// matches, prefer the one with this size.
+    unsigned FrontendSize;
   };
 
   union {
@@ -136,6 +140,10 @@ struct X86Operand : public MCParsedAsmOperand {
     assert(Kind == Memory && "Invalid access!");
     return Mem.ModeSize;
   }
+  unsigned getMemFrontendSize() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.FrontendSize;
+  }
 
   bool isToken() const override {return Kind == Token; }
 
@@ -512,7 +520,7 @@ struct X86Operand : public MCParsedAsmOperand {
   static std::unique_ptr<X86Operand>
   CreateMem(unsigned ModeSize, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc,
             unsigned Size = 0, StringRef SymName = StringRef(),
-            void *OpDecl = nullptr) {
+            void *OpDecl = nullptr, unsigned FrontendSize = 0) {
     auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
     Res->Mem.SegReg = 0;
     Res->Mem.Disp = Disp;
@@ -521,6 +529,7 @@ struct X86Operand : public MCParsedAsmOperand {
     Res->Mem.Scale = 1;
     Res->Mem.Size = Size;
     Res->Mem.ModeSize = ModeSize;
+    Res->Mem.FrontendSize = FrontendSize;
     Res->SymName = SymName;
     Res->OpDecl = OpDecl;
     Res->AddressOf = false;
@@ -532,7 +541,7 @@ struct X86Operand : public MCParsedAsmOperand {
   CreateMem(unsigned ModeSize, unsigned SegReg, const MCExpr *Disp,
             unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc StartLoc,
             SMLoc EndLoc, unsigned Size = 0, StringRef SymName = StringRef(),
-            void *OpDecl = nullptr) {
+            void *OpDecl = nullptr, unsigned FrontendSize = 0) {
     // We should never just have a displacement, that should be parsed as an
     // absolute memory operand.
     assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
@@ -548,6 +557,7 @@ struct X86Operand : public MCParsedAsmOperand {
     Res->Mem.Scale = Scale;
     Res->Mem.Size = Size;
     Res->Mem.ModeSize = ModeSize;
+    Res->Mem.FrontendSize = FrontendSize;
     Res->SymName = SymName;
     Res->OpDecl = OpDecl;
     Res->AddressOf = false;
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index 44bc373b0394c..d7c3b74d3efb2 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -91,6 +91,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
                              X86MCInstLower &MCIL);
   void LowerPATCHABLE_RET(const MachineInstr &MI, X86MCInstLower &MCIL);
   void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
+  void LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
 
   void LowerFENTRY_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
 
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index a94045cd536d1..331e56976db7c 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -2990,6 +2990,10 @@ unsigned X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF)
 
 void X86FrameLowering::processFunctionBeforeFrameFinalized(
     MachineFunction &MF, RegScavenger *RS) const {
+  // Mark the function as not having WinCFI. We will set it back to true in
+  // emitPrologue if it gets called and emits CFI.
+  MF.setHasWinCFI(false);
+
   // If this function isn't doing Win64-style C++ EH, we don't need to do
   // anything.
   const Function *Fn = MF.getFunction();
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 83542aaa013bd..9ee2234595f9a 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1224,10 +1224,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
     setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
     setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom);
-    setOperationAction(ISD::VSELECT, MVT::v8i1, Expand);
-    setOperationAction(ISD::VSELECT, MVT::v16i1, Expand);
+
     if (Subtarget.hasDQI()) {
       for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) {
         setOperationAction(ISD::SINT_TO_FP, VT, Legal);
@@ -1243,8 +1240,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       }
     }
     if (Subtarget.hasVLX()) {
-      setOperationAction(ISD::ABS, MVT::v4i64, Legal);
-      setOperationAction(ISD::ABS, MVT::v2i64, Legal);
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
@@ -1270,8 +1265,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
     }
 
-    setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
-    setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
     setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
     setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
     setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
@@ -1304,33 +1297,34 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
 
-    setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
-    setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
-
     setOperationAction(ISD::MUL, MVT::v8i64, Custom);
 
-    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
-    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
     setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
-    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
-    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
-    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
-    setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
     setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
     setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
     setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
-    setOperationAction(ISD::SELECT, MVT::v16i1, Custom);
-    setOperationAction(ISD::SELECT, MVT::v8i1, Custom);
-
-    setOperationAction(ISD::ADD, MVT::v8i1, Custom);
-    setOperationAction(ISD::ADD, MVT::v16i1, Custom);
-    setOperationAction(ISD::SUB, MVT::v8i1, Custom);
-    setOperationAction(ISD::SUB, MVT::v16i1, Custom);
-    setOperationAction(ISD::MUL, MVT::v8i1, Custom);
-    setOperationAction(ISD::MUL, MVT::v16i1, Custom);
 
     setOperationAction(ISD::MUL, MVT::v16i32, Legal);
 
+    // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
+    setOperationAction(ISD::ABS, MVT::v4i64, Legal);
+    setOperationAction(ISD::ABS, MVT::v2i64, Legal);
+
+    for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
+      setOperationAction(ISD::ADD, VT, Custom);
+      setOperationAction(ISD::SUB, VT, Custom);
+      setOperationAction(ISD::MUL, VT, Custom);
+      setOperationAction(ISD::SETCC, VT, Custom);
+      setOperationAction(ISD::SELECT, VT, Custom);
+      setOperationAction(ISD::TRUNCATE, VT, Custom);
+
+      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+      setOperationAction(ISD::VSELECT, VT, Expand);
+    }
+
     for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
       setOperationAction(ISD::SMAX, VT, Legal);
       setOperationAction(ISD::UMAX, VT, Legal);
@@ -1352,33 +1346,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
 
     if (Subtarget.hasCDI()) {
-      setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
-      setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
-
-      setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
-      setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
-      setOperationAction(ISD::CTLZ, MVT::v16i16, Custom);
-      setOperationAction(ISD::CTLZ, MVT::v32i8, Custom);
-
-      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom);
-      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom);
-
-      if (Subtarget.hasVLX()) {
-        setOperationAction(ISD::CTLZ, MVT::v4i64, Legal);
-        setOperationAction(ISD::CTLZ, MVT::v8i32, Legal);
-        setOperationAction(ISD::CTLZ, MVT::v2i64, Legal);
-        setOperationAction(ISD::CTLZ, MVT::v4i32, Legal);
-      } else {
-        setOperationAction(ISD::CTLZ, MVT::v4i64, Custom);
-        setOperationAction(ISD::CTLZ, MVT::v8i32, Custom);
-        setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
-        setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
+      // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
+      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
+                      MVT::v4i64, MVT::v8i64}) {
+        setOperationAction(ISD::CTLZ, VT, Legal);
+        setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
       }
-
-      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
-      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
-      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
-      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
     } // Subtarget.hasCDI()
 
     if (Subtarget.hasDQI()) {
@@ -6070,7 +6043,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
                                      unsigned NumNonZero, unsigned NumZero,
                                      SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
-  if (NumNonZero > 8)
+  if (NumNonZero > 8 && !Subtarget.hasSSE41())
     return SDValue();
 
   SDLoc dl(Op);
@@ -6158,7 +6131,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
                                      unsigned NumNonZero, unsigned NumZero,
                                      SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
-  if (NumNonZero > 4)
+  if (NumNonZero > 4 && !Subtarget.hasSSE41())
     return SDValue();
 
   SDLoc dl(Op);
@@ -6241,7 +6214,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
 
     Elt = Op->getOperand(EltIdx);
     // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
-    EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
+    EltMaskIdx = Elt.getConstantOperandVal(1);
     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
       break;
     Mask[EltIdx] = EltIdx;
@@ -6272,8 +6245,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
       SDValue SrcVector = Current->getOperand(0);
       if (!V1.getNode())
         V1 = SrcVector;
-      CanFold = SrcVector == V1 &&
-        cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
+      CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
     }
 
     if (!CanFold)
@@ -20944,54 +20916,62 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
 }
 
+// Split an unary integer op into 2 half sized ops.
+static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+  unsigned SizeInBits = VT.getSizeInBits();
+
+  // Extract the Lo/Hi vectors
+  SDLoc dl(Op);
+  SDValue Src = Op.getOperand(0);
+  SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
+  SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
+
+  MVT EltVT = VT.getVectorElementType();
+  MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
+}
+
+// Decompose 256-bit ops into smaller 128-bit ops.
+static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
+  assert(Op.getSimpleValueType().is256BitVector() &&
+         Op.getSimpleValueType().isInteger() &&
+         "Only handle AVX 256-bit vector integer operation");
+  return LowerVectorIntUnary(Op, DAG);
+}
+
+// Decompose 512-bit ops into smaller 256-bit ops.
+static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
+  assert(Op.getSimpleValueType().is512BitVector() &&
+         Op.getSimpleValueType().isInteger() &&
+         "Only handle AVX 512-bit vector integer operation");
+  return LowerVectorIntUnary(Op, DAG);
+}
+
 /// \brief Lower a vector CTLZ using native supported vector CTLZ instruction.
 //
-// 1. i32/i64 128/256-bit vector (native support require VLX) are expended
-//    to 512-bit vector.
-// 2. i8/i16 vector implemented using dword LZCNT vector instruction
-//    ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
-//    split the vector, perform operation on it's Lo a Hi part and
-//    concatenate the results.
-static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
+// i8/i16 vector implemented using dword LZCNT vector instruction
+//    ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
+//    split the vector, perform operation on it's Lo a Hi part and
+//    concatenate the results.
+static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
   assert(Op.getOpcode() == ISD::CTLZ);
   SDLoc dl(Op);
   MVT VT = Op.getSimpleValueType();
   MVT EltVT = VT.getVectorElementType();
   unsigned NumElems = VT.getVectorNumElements();
 
-  if (EltVT == MVT::i64 || EltVT == MVT::i32) {
-    // Extend to 512 bit vector.
-    assert((VT.is256BitVector() || VT.is128BitVector()) &&
-           "Unsupported value type for operation");
-
-    MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
-    SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
-                                 DAG.getUNDEF(NewVT),
-                                 Op.getOperand(0),
-                                 DAG.getIntPtrConstant(0, dl));
-    SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512);
-
-    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode,
-                       DAG.getIntPtrConstant(0, dl));
-  }
-
   assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
          "Unsupported element type");
 
-  if (16 < NumElems) {
-    // Split vector, it's Lo and Hi parts will be handled in next iteration.
-    SDValue Lo, Hi;
-    std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
-    MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
-
-    Lo = DAG.getNode(ISD::CTLZ, dl, OutVT, Lo);
-    Hi = DAG.getNode(ISD::CTLZ, dl, OutVT, Hi);
-
-    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
-  }
+  // Split vector, it's Lo and Hi parts will be handled in next iteration.
+  if (16 < NumElems)
+    return LowerVectorIntUnary(Op, DAG);
 
   MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
-
   assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
          "Unsupported value type for operation");
@@ -21078,23 +21058,17 @@ static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
                                const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
-  SDValue Op0 = Op.getOperand(0);
 
-  if (Subtarget.hasAVX512())
-    return LowerVectorCTLZ_AVX512(Op, DAG);
+  if (Subtarget.hasCDI())
+    return LowerVectorCTLZ_AVX512CDI(Op, DAG);
 
   // Decompose 256-bit ops into smaller 128-bit ops.
-  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
-    unsigned NumElems = VT.getVectorNumElements();
+  if (VT.is256BitVector() && !Subtarget.hasInt256())
+    return Lower256IntUnary(Op, DAG);
 
-    // Extract each 128-bit vector, perform ctlz and concat the result.
-    SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
-    SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
-
-    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
-                       DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
-                       DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
-  }
+  // Decompose 512-bit ops into smaller 256-bit ops.
+  if (VT.is512BitVector() && !Subtarget.hasBWI())
+    return Lower512IntUnary(Op, DAG);
 
   assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
   return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
@@ -21258,19 +21232,7 @@ static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
   assert(Op.getSimpleValueType().is256BitVector() &&
          Op.getSimpleValueType().isInteger() &&
          "Only handle AVX 256-bit vector integer operation");
-  MVT VT = Op.getSimpleValueType();
-  unsigned NumElems = VT.getVectorNumElements();
-
-  SDLoc dl(Op);
-  SDValue Src = Op.getOperand(0);
-  SDValue Lo = extract128BitVector(Src, 0, DAG, dl);
-  SDValue Hi = extract128BitVector(Src, NumElems / 2, DAG, dl);
-
-  MVT EltVT = VT.getVectorElementType();
-  MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
-  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
-                     DAG.getNode(ISD::ABS, dl, NewVT, Lo),
-                     DAG.getNode(ISD::ABS, dl, NewVT, Hi));
+  return Lower256IntUnary(Op, DAG);
 }
 
 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
@@ -23049,29 +23011,13 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
     return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
   }
 
-  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
-    unsigned NumElems = VT.getVectorNumElements();
-
-    // Extract each 128-bit vector, compute pop count and concat the result.
-    SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
-    SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
-
-    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
-                       LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
-                       LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
-  }
-
-  if (VT.is512BitVector() && !Subtarget.hasBWI()) {
-    unsigned NumElems = VT.getVectorNumElements();
-
-    // Extract each 256-bit vector, compute pop count and concat the result.
-    SDValue LHS = extract256BitVector(Op0, 0, DAG, DL);
-    SDValue RHS = extract256BitVector(Op0, NumElems / 2, DAG, DL);
+  // Decompose 256-bit ops into smaller 128-bit ops.
+  if (VT.is256BitVector() && !Subtarget.hasInt256())
+    return Lower256IntUnary(Op, DAG);
 
-    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
-                       LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
-                       LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
-  }
+  // Decompose 512-bit ops into smaller 256-bit ops.
+  if (VT.is512BitVector() && !Subtarget.hasBWI())
+    return Lower512IntUnary(Op, DAG);
 
   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
 }
@@ -23098,20 +23044,12 @@ static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
                        DAG.getIntPtrConstant(0, DL));
   }
 
-  MVT SVT = VT.getVectorElementType();
   int NumElts = VT.getVectorNumElements();
   int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
 
   // Decompose 256-bit ops into smaller 128-bit ops.
-  if (VT.is256BitVector()) {
-    SDValue Lo = extract128BitVector(In, 0, DAG, DL);
-    SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
-
-    MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2);
-    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
-                       DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo),
-                       DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi));
-  }
+  if (VT.is256BitVector())
+    return Lower256IntUnary(Op, DAG);
 
   assert(VT.is128BitVector() &&
          "Only 128-bit vector bitreverse lowering supported.");
@@ -23152,14 +23090,8 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
          "Only byte vector BITREVERSE supported");
 
   // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
-  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
-    MVT HalfVT = MVT::getVectorVT(MVT::i8, NumElts / 2);
-    SDValue Lo = extract128BitVector(In, 0, DAG, DL);
-    SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
-    Lo = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo);
-    Hi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi);
-    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
-  }
+  if (VT.is256BitVector() && !Subtarget.hasInt256())
+    return Lower256IntUnary(Op, DAG);
 
   // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
   // two nibbles and a PSHUFB lookup to find the bitreverse of each
@@ -26585,6 +26517,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   case TargetOpcode::STACKMAP:
   case TargetOpcode::PATCHPOINT:
     return emitPatchPoint(MI, BB);
+
+  case TargetOpcode::PATCHABLE_EVENT_CALL:
+    // Do nothing here, handle in xray instrumentation pass.
+    return BB;
 
   case X86::LCMPXCHG8B: {
     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
@@ -26667,7 +26603,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
          "Should use MaskedValueIsZero if you don't know whether Op"
          " is a target node!");
 
-  Known.Zero.clearAllBits(); Known.One.clearAllBits();
+  Known.resetAll();
   switch (Opc) {
   default: break;
   case X86ISD::ADD:
@@ -26697,7 +26633,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
   case X86ISD::VSRLI: {
     if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
       if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
-        Known.Zero.setAllBits();
+        Known.setAllZero();
         break;
       }
 
@@ -26729,8 +26665,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
     Known = KnownBits(InBitWidth);
     APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
     DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
-    Known.One = Known.One.zext(BitWidth);
-    Known.Zero = Known.Zero.zext(BitWidth);
+    Known = Known.zext(BitWidth);
     Known.Zero.setBitsFrom(InBitWidth);
     break;
   }
@@ -31671,10 +31606,9 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
     if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
       if (auto *AmtConst = AmtBV->getConstantSplatNode())
         SraAmt = AmtConst->getZExtValue();
-  } else if (Mask.getOpcode() == X86ISD::VSRAI) {
-    SDValue SraC = Mask.getOperand(1);
-    SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
-  }
+  } else if (Mask.getOpcode() == X86ISD::VSRAI)
+    SraAmt = Mask.getConstantOperandVal(1);
+
   if ((SraAmt + 1) != EltBits)
     return SDValue();
 
@@ -31708,7 +31642,9 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
     V = Y;
 
   if (V) {
-    assert(EltBits == 8 || EltBits == 16 || EltBits == 32);
+    if (EltBits != 8 && EltBits != 16 && EltBits != 32)
+      return SDValue();
+
     SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
     SDValue SubOp2 = Mask;
 
@@ -34488,8 +34424,7 @@ static SDValue combineX86ADD(SDNode *N, SelectionDAG &DAG,
     if (Carry.getOpcode() == ISD::SETCC ||
         Carry.getOpcode() == X86ISD::SETCC ||
         Carry.getOpcode() == X86ISD::SETCC_CARRY) {
-      auto *Cond = cast<ConstantSDNode>(Carry.getOperand(0));
-      if (Cond->getZExtValue() == X86::COND_B)
+      if (Carry.getConstantOperandVal(0) == X86::COND_B)
         return DCI.CombineTo(N, SDValue(N, 0), Carry.getOperand(1));
     }
   }
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index c38c13bb97571..71d395244b4ad 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -8631,6 +8631,20 @@ multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
 
 defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs>;
 
+// VPABS: Use 512bit version to implement 128/256 bit in case NoVLX.
+let Predicates = [HasAVX512, NoVLX] in {
+  def : Pat<(v4i64 (abs VR256X:$src)),
+            (EXTRACT_SUBREG
+                (VPABSQZrr
+                    (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)),
+                sub_ymm)>;
+  def : Pat<(v2i64 (abs VR128X:$src)),
+            (EXTRACT_SUBREG
+                (VPABSQZrr
+                    (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)),
+                sub_xmm)>;
+}
+
 multiclass avx512_ctlz<bits<8> opc, string OpcodeStr, Predicate prd>{
   defm NAME :     avx512_unary_rm_vl_dq<opc, opc, OpcodeStr, ctlz, prd>;
 
@@ -8639,6 +8653,31 @@ multiclass avx512_ctlz<bits<8> opc, string OpcodeStr, Predicate prd>{
 defm VPLZCNT    : avx512_ctlz<0x44, "vplzcnt", HasCDI>;
 defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict, HasCDI>;
 
+// VPLZCNT: Use 512bit version to implement 128/256 bit in case NoVLX.
+let Predicates = [HasCDI, NoVLX] in {
+  def : Pat<(v4i64 (ctlz VR256X:$src)),
+            (EXTRACT_SUBREG
+                (VPLZCNTQZrr
+                    (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)),
+                sub_ymm)>;
+  def : Pat<(v2i64 (ctlz VR128X:$src)),
+            (EXTRACT_SUBREG
+                (VPLZCNTQZrr
+                    (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)),
+                sub_xmm)>;
+
+  def : Pat<(v8i32 (ctlz VR256X:$src)),
+            (EXTRACT_SUBREG
+                (VPLZCNTDZrr
+                    (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)),
+                sub_ymm)>;
+  def : Pat<(v4i32 (ctlz VR128X:$src)),
+            (EXTRACT_SUBREG
+                (VPLZCNTDZrr
+                    (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)),
+                sub_xmm)>;
+}
+
 //===---------------------------------------------------------------------===//
 // Replicate Single FP - MOVSHDUP and MOVSLDUP
 //===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index cdf7ce19cdc8f..902b0c2c04e3e 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -1995,11 +1995,11 @@ def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>,
                      Requires<[In64BitMode]>;
 
 // Data16 instruction prefix
-def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>, 
+def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>,
                       Requires<[Not16BitMode]>;
 
 // Data instruction prefix
-def DATA32_PREFIX : I<0x66, RawFrm, (outs), (ins), "data32", []>, 
+def DATA32_PREFIX : I<0x66, RawFrm, (outs), (ins), "data32", []>,
                       Requires<[In16BitMode]>;
 
 // Repeat string operation instruction prefixes
@@ -2518,7 +2518,7 @@ let SchedRW = [ WriteSystem ] in {
 }
 
 let Uses = [ ECX, EAX, EBX ] in {
-  def MWAITXrrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx", 
+  def MWAITXrrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx",
                     [(int_x86_mwaitx ECX, EAX, EBX)], IIC_SSE_MWAITX>,
                     TB, Requires<[ HasMWAITX ]>;
 }
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index f22a50200c9a3..48da2fa607af0 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -6718,22 +6718,23 @@ let Constraints = "$src1 = $dst" in {
                                 SSE_INTMUL_ITINS_P, 1>;
 }
 
-let Predicates = [HasAVX, NoVLX] in {
+let Predicates = [HasAVX, NoVLX] in
   defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
                                  loadv2i64, i128mem, 0, SSE_PMULLD_ITINS>,
                                  VEX_4V, VEX_WIG;
+let Predicates = [HasAVX] in
   defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_WIG;
-}
-
-let Predicates = [HasAVX2] in {
+
+let Predicates = [HasAVX2, NoVLX] in
   defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
                                   loadv4i64, i256mem, 0, SSE_PMULLD_ITINS>,
                                   VEX_4V, VEX_L, VEX_WIG;
+let Predicates = [HasAVX2] in
   defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                   loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                   VEX_4V, VEX_L, VEX_WIG;
-}
 
 let Constraints = "$src1 = $dst" in {
   defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
index 38f7bc0af5c72..d65eb1de8d094 100644
--- a/lib/Target/X86/X86InstructionSelector.cpp
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -65,8 +65,8 @@ private:
                          MachineFunction &MF) const;
   bool selectLoadStoreOp(MachineInstr &I, MachineRegisterInfo &MRI,
                          MachineFunction &MF) const;
-  bool selectFrameIndex(MachineInstr &I, MachineRegisterInfo &MRI,
-                        MachineFunction &MF) const;
+  bool selectFrameIndexOrGep(MachineInstr &I, MachineRegisterInfo &MRI,
+                             MachineFunction &MF) const;
   bool selectConstant(MachineInstr &I, MachineRegisterInfo &MRI,
                       MachineFunction &MF) const;
   bool selectTrunc(MachineInstr &I, MachineRegisterInfo &MRI,
@@ -235,7 +235,7 @@ bool X86InstructionSelector::select(MachineInstr &I) const {
     return true;
   if (selectLoadStoreOp(I, MRI, MF))
    return true;
-  if (selectFrameIndex(I, MRI, MF))
+  if (selectFrameIndexOrGep(I, MRI, MF))
    return true;
  if (selectConstant(I, MRI, MF))
     return true;
@@ -427,27 +427,37 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
 }
 
-bool X86InstructionSelector::selectFrameIndex(MachineInstr &I,
-                                              MachineRegisterInfo &MRI,
-                                              MachineFunction &MF) const {
-  if (I.getOpcode() != TargetOpcode::G_FRAME_INDEX)
+bool X86InstructionSelector::selectFrameIndexOrGep(MachineInstr &I,
+                                                   MachineRegisterInfo &MRI,
+                                                   MachineFunction &MF) const {
+  unsigned Opc = I.getOpcode();
+
+  if (Opc != TargetOpcode::G_FRAME_INDEX && Opc != TargetOpcode::G_GEP)
     return false;
 
   const unsigned DefReg = I.getOperand(0).getReg();
   LLT Ty = MRI.getType(DefReg);
 
-  // Use LEA to calculate frame index.
+  // Use LEA to calculate frame index and GEP
   unsigned NewOpc;
   if (Ty == LLT::pointer(0, 64))
     NewOpc = X86::LEA64r;
   else if (Ty == LLT::pointer(0, 32))
    NewOpc = STI.isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r;
   else
-    llvm_unreachable("Can't select G_FRAME_INDEX, unsupported type.");
+    llvm_unreachable("Can't select G_FRAME_INDEX/G_GEP, unsupported type.");
 
   I.setDesc(TII.get(NewOpc));
   MachineInstrBuilder MIB(MF, I);
-  addOffset(MIB, 0);
+
+  if (Opc == TargetOpcode::G_FRAME_INDEX) {
+    addOffset(MIB, 0);
+  } else {
+    MachineOperand &InxOp = I.getOperand(2);
+    I.addOperand(InxOp);        // set IndexReg
+    InxOp.ChangeToImmediate(1); // set Scale
+    MIB.addImm(0).addReg(0);
+  }
 
   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
 }
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index a437f6bf4714d..4f5e70414aa93 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -34,6 +34,11 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
   setLegalizerInfo64bit();
   setLegalizerInfoSSE1();
   setLegalizerInfoSSE2();
+  setLegalizerInfoSSE41();
+  setLegalizerInfoAVX2();
+  setLegalizerInfoAVX512();
+  setLegalizerInfoAVX512DQ();
+  setLegalizerInfoAVX512BW();
 
   computeTables();
 }
@@ -50,7 +55,7 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
   const LLT s32 = LLT::scalar(32);
   const LLT s64 = LLT::scalar(64);
 
-  for (unsigned BinOp : {G_ADD, G_SUB})
+  for (unsigned BinOp : {G_ADD, G_SUB, G_MUL})
     for (auto Ty : {s8, s16, s32})
       setAction({BinOp, Ty}, Legal);
 
@@ -65,6 +70,12 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
   // Pointer-handling
   setAction({G_FRAME_INDEX, p0}, Legal);
 
+  setAction({G_GEP, p0}, Legal);
+  setAction({G_GEP, 1, s32}, Legal);
+
+  for (auto Ty : {s1, s8, s16})
+    setAction({G_GEP, 1, Ty}, WidenScalar);
+
   // Constants
   for (auto Ty : {s8, s16, s32, p0})
     setAction({TargetOpcode::G_CONSTANT, Ty}, Legal);
@@ -94,7 +105,7 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
   const LLT s32 = LLT::scalar(32);
   const LLT s64 = LLT::scalar(64);
 
-  for (unsigned BinOp : {G_ADD, G_SUB})
+  for (unsigned BinOp : {G_ADD, G_SUB, G_MUL})
     for (auto Ty : {s8, s16, s32, s64})
       setAction({BinOp, Ty}, Legal);
 
@@ -109,6 +120,13 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
   // Pointer-handling
   setAction({G_FRAME_INDEX, p0}, Legal);
 
+  setAction({G_GEP, p0}, Legal);
+  setAction({G_GEP, 1, s32}, Legal);
+  setAction({G_GEP, 1, s64}, Legal);
+
+  for (auto Ty : {s1, s8, s16})
+    setAction({G_GEP, 1, Ty}, WidenScalar);
+
   // Constants
   for (auto Ty : {s8, s16, s32, s64, p0})
     setAction({TargetOpcode::G_CONSTANT, Ty}, Legal);
@@ -149,6 +167,7 @@ void X86LegalizerInfo::setLegalizerInfoSSE2() {
     return;
 
   const LLT s64 = LLT::scalar(64);
+  const LLT v8s16 = LLT::vector(8, 16);
   const LLT v4s32 = LLT::vector(4, 32);
   const LLT v2s64 = LLT::vector(2, 64);
 
@@ -159,4 +178,83 @@ void X86LegalizerInfo::setLegalizerInfoSSE2() {
   for (unsigned BinOp : {G_ADD, G_SUB})
     for (auto Ty : {v4s32})
       setAction({BinOp, Ty}, Legal);
+
+  setAction({G_MUL, v8s16}, Legal);
+}
+
+void X86LegalizerInfo::setLegalizerInfoSSE41() {
+  if (!Subtarget.hasSSE41())
+    return;
+
+  const LLT v4s32 = LLT::vector(4, 32);
+
+  setAction({G_MUL, v4s32}, Legal);
+}
+
+void X86LegalizerInfo::setLegalizerInfoAVX2() {
+  if (!Subtarget.hasAVX2())
+    return;
+
+  const LLT v16s16 = LLT::vector(16, 16);
+  const LLT v8s32 = LLT::vector(8, 32);
+
+  for (auto Ty : {v16s16, v8s32})
+    setAction({G_MUL, Ty}, Legal);
+}
+
+void X86LegalizerInfo::setLegalizerInfoAVX512() {
+  if (!Subtarget.hasAVX512())
+    return;
+
+  const LLT v16s32 = LLT::vector(16, 32);
+
+  setAction({G_MUL, v16s32}, Legal);
+
+  /************ VLX *******************/
+  if (!Subtarget.hasVLX())
+    return;
+
+  const LLT v4s32 = LLT::vector(4, 32);
+  const LLT v8s32 = LLT::vector(8, 32);
+
+  for (auto Ty : {v4s32, v8s32})
+    setAction({G_MUL, Ty}, Legal);
+}
+
+void X86LegalizerInfo::setLegalizerInfoAVX512DQ() {
+  if (!(Subtarget.hasAVX512() && Subtarget.hasDQI()))
+    return;
+
+  const LLT v8s64 = LLT::vector(8, 64);
+
+  setAction({G_MUL, v8s64}, Legal);
+
+  /************ VLX *******************/
+  if (!Subtarget.hasVLX())
+    return;
+
+  const LLT v2s64 = LLT::vector(2, 64);
+  const LLT v4s64 = LLT::vector(4, 64);
+
+  for (auto Ty : {v2s64, v4s64})
+    setAction({G_MUL, Ty}, Legal);
+}
+
+void X86LegalizerInfo::setLegalizerInfoAVX512BW() {
+  if (!(Subtarget.hasAVX512() && Subtarget.hasBWI()))
+    return;
+
+  const LLT v32s16 = LLT::vector(32, 16);
+
+  setAction({G_MUL, v32s16}, Legal);
+
+  /************ VLX *******************/
+  if (!Subtarget.hasVLX())
+    return;
+
+  const LLT v8s16 = LLT::vector(8, 16);
+  const LLT v16s16 = LLT::vector(16, 16);
+
+  for (auto Ty : {v8s16, v16s16})
+    setAction({G_MUL, Ty}, Legal);
 }
diff --git a/lib/Target/X86/X86LegalizerInfo.h b/lib/Target/X86/X86LegalizerInfo.h
index 3f00898b42322..ab5405a704273 100644
--- a/lib/Target/X86/X86LegalizerInfo.h
+++ b/lib/Target/X86/X86LegalizerInfo.h
@@ -38,6 +38,11 @@ private:
   void setLegalizerInfo64bit();
   void setLegalizerInfoSSE1();
   void setLegalizerInfoSSE2();
+  void setLegalizerInfoSSE41();
+  void setLegalizerInfoAVX2();
+  void setLegalizerInfoAVX512();
+  void setLegalizerInfoAVX512DQ();
+  void setLegalizerInfoAVX512BW();
 };
 } // namespace llvm
 #endif
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 550e3543a71e8..598d88d8b9c3b 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -1040,6 +1040,83 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
                    getSubtargetInfo());
 }
 
+void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
+                                              X86MCInstLower &MCIL) {
+  assert(Subtarget->is64Bit() && "XRay custom events only suports X86-64");
+
+  // We want to emit the following pattern, which follows the x86 calling
+  // convention to prepare for the trampoline call to be patched in.
+  //
+  //   <args placement according SysV64 calling convention>
+  //   .p2align 1, ...
+  //   .Lxray_event_sled_N:
+  //   jmp +N                    // jump across the call instruction
+  //   callq __xray_CustomEvent  // force relocation to symbol
+  //   <args cleanup, jump to here>
+  //
+  // The relative jump needs to jump forward 24 bytes:
+  // 10 (args) + 5 (nops) + 9 (cleanup)
+  //
+  // After patching, it would look something like:
+  //
+  //   nopw (2-byte nop)
+  //   callq __xrayCustomEvent  // already lowered
+  //
+  // ---
+  // First we emit the label and the jump.
+  auto CurSled = OutContext.createTempSymbol("xray_event_sled_", true);
+  OutStreamer->AddComment("# XRay Custom Event Log");
+  OutStreamer->EmitCodeAlignment(2);
+  OutStreamer->EmitLabel(CurSled);
+
+  // Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
+  // an operand (computed as an offset from the jmp instruction).
+  // FIXME: Find another less hacky way do force the relative jump.
+  OutStreamer->EmitBytes("\xeb\x14");
+
+  // The default C calling convention will place two arguments into %rcx and
+  // %rdx -- so we only work with those.
+  unsigned UsedRegs[] = {X86::RDI, X86::RSI, X86::RAX};
+
+  // Because we will use %rax, we preserve that across the call.
+  EmitAndCountInstruction(MCInstBuilder(X86::PUSH64r).addReg(X86::RAX));
+
+  // Then we put the operands in the %rdi and %rsi registers.
+  for (unsigned I = 0; I < MI.getNumOperands(); ++I)
+    if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I))) {
+      if (Op->isImm())
+        EmitAndCountInstruction(MCInstBuilder(X86::MOV64ri)
+                                    .addReg(UsedRegs[I])
+                                    .addImm(Op->getImm()));
+      else if (Op->isReg()) {
+        if (Op->getReg() != UsedRegs[I])
+          EmitAndCountInstruction(MCInstBuilder(X86::MOV64rr)
+                                      .addReg(UsedRegs[I])
+                                      .addReg(Op->getReg()));
+        else
+          EmitNops(*OutStreamer, 3, Subtarget->is64Bit(), getSubtargetInfo());
+      }
+    }
+
+  // We emit a hard dependency on the __xray_CustomEvent symbol, which is the
+  // name of the trampoline to be implemented by the XRay runtime. We put this
+  // explicitly in the %rax register.
+  auto TSym = OutContext.getOrCreateSymbol("__xray_CustomEvent");
+  MachineOperand TOp = MachineOperand::CreateMCSymbol(TSym);
+  EmitAndCountInstruction(MCInstBuilder(X86::MOV64ri)
+                              .addReg(X86::RAX)
+                              .addOperand(MCIL.LowerSymbolOperand(TOp, TSym)));
+
+  // Emit the call instruction.
+  EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(X86::RAX));
+
+  // Restore caller-saved and used registers.
+  OutStreamer->AddComment("xray custom event end.");
+  EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(X86::RAX));
+
+  recordSled(CurSled, MI, SledKind::CUSTOM_EVENT);
+}
+
 void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
                                                   X86MCInstLower &MCIL) {
   // We want to emit the following pattern:
@@ -1415,6 +1492,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
   case TargetOpcode::PATCHABLE_TAIL_CALL:
     return LowerPATCHABLE_TAIL_CALL(*MI, MCInstLowering);
+
+  case TargetOpcode::PATCHABLE_EVENT_CALL:
+    return LowerPATCHABLE_EVENT_CALL(*MI, MCInstLowering);
 
   case X86::MORESTACK_RET:
     EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
diff --git a/lib/Target/X86/X86OptimizeLEAs.cpp b/lib/Target/X86/X86OptimizeLEAs.cpp
index 7be0a7fd40678..aabbf67a16b62 100644
--- a/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -223,8 +223,6 @@ public:
 
   StringRef getPassName() const override { return "X86 LEA Optimize"; }
 
-  bool doInitialization(Module &M) override;
-
   /// \brief Loop over all of the basic blocks, replacing address
   /// calculations in load and store instructions, if it's already
   /// been calculated by LEA. Also, remove redundant LEAs.
@@ -280,7 +278,6 @@ private:
   MachineRegisterInfo *MRI;
   const X86InstrInfo *TII;
   const X86RegisterInfo *TRI;
-  Module *TheModule;
 
   static char ID;
 };
@@ -649,11 +646,6 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
   return Changed;
 }
 
-bool OptimizeLEAPass::doInitialization(Module &M) {
-  TheModule = &M;
-  return false;
-}
-
 bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
   bool Changed = false;
 
diff --git a/lib/Target/X86/X86RegisterBankInfo.cpp b/lib/Target/X86/X86RegisterBankInfo.cpp
index 0f8a750a02352..efd3df26dd424 100644
--- a/lib/Target/X86/X86RegisterBankInfo.cpp
+++ b/lib/Target/X86/X86RegisterBankInfo.cpp
@@ -139,8 +139,9 @@ bool X86RegisterBankInfo::getInstrValueMapping(
   return true;
 }
 
-RegisterBankInfo::InstructionMapping
-X86RegisterBankInfo::getSameOperandsMapping(const MachineInstr &MI, bool isFP) {
+const RegisterBankInfo::InstructionMapping &
+X86RegisterBankInfo::getSameOperandsMapping(const MachineInstr &MI,
+                                            bool isFP) const {
   const MachineFunction &MF = *MI.getParent()->getParent();
   const MachineRegisterInfo &MRI = MF.getRegInfo();
 
@@ -152,10 +153,10 @@ X86RegisterBankInfo::getSameOperandsMapping(const MachineInstr &MI, bool isFP) {
     llvm_unreachable("Unsupported operand mapping yet.");
 
   auto Mapping = getValueMapping(getPartialMappingIdx(Ty, isFP), 3);
-  return InstructionMapping{DefaultMappingID, 1, Mapping, NumOperands};
+  return getInstructionMapping(DefaultMappingID, 1, Mapping, NumOperands);
 }
 
-RegisterBankInfo::InstructionMapping
+const RegisterBankInfo::InstructionMapping &
 X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   const MachineFunction &MF = *MI.getParent()->getParent();
   const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -164,7 +165,7 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   // Try the default logic for non-generic instructions that are either copies
   // or already have some operands assigned to banks.
   if (!isPreISelGenericOpcode(Opc)) {
-    InstructionMapping Mapping = getInstrMappingImpl(MI);
+    const InstructionMapping &Mapping = getInstrMappingImpl(MI);
     if (Mapping.isValid())
       return Mapping;
   }
@@ -193,10 +194,10 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   // Finally construct the computed mapping.
   SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands);
   if (!getInstrValueMapping(MI, OpRegBankIdx, OpdsMapping))
-    return InstructionMapping();
+    return getInvalidInstructionMapping();
 
-  return InstructionMapping{DefaultMappingID, /* Cost */ 1,
-                            getOperandsMapping(OpdsMapping), NumOperands};
+  return getInstructionMapping(DefaultMappingID, /* Cost */ 1,
+                               getOperandsMapping(OpdsMapping), NumOperands);
 }
 
 void X86RegisterBankInfo::applyMappingImpl(
@@ -231,10 +232,10 @@ X86RegisterBankInfo::getInstrAlternativeMappings(const MachineInstr &MI) const {
     if (!getInstrValueMapping(MI, OpRegBankIdx, OpdsMapping))
       break;
 
-    RegisterBankInfo::InstructionMapping Mapping = InstructionMapping{
-        /*ID*/ 1, /*Cost*/ 1, getOperandsMapping(OpdsMapping), NumOperands};
+    const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
+        /*ID*/ 1, /*Cost*/ 1, getOperandsMapping(OpdsMapping), NumOperands);
     InstructionMappings AltMappings;
-    AltMappings.emplace_back(std::move(Mapping));
+    AltMappings.push_back(&Mapping);
     return AltMappings;
   }
   default:
diff --git a/lib/Target/X86/X86RegisterBankInfo.h b/lib/Target/X86/X86RegisterBankInfo.h
index a1e01a9ab9497..e227880427f3c 100644
--- a/lib/Target/X86/X86RegisterBankInfo.h
+++ b/lib/Target/X86/X86RegisterBankInfo.h
@@ -46,8 +46,8 @@ private:
   /// Get an instruction mapping.
   /// \return An InstructionMappings with a statically allocated
   /// OperandsMapping.
-  static InstructionMapping getSameOperandsMapping(const MachineInstr &MI,
-                                                   bool isFP);
+  const InstructionMapping &getSameOperandsMapping(const MachineInstr &MI,
+                                                   bool isFP) const;
 
   /// Track the bank of each instruction operand(register)
   static void
@@ -74,7 +74,8 @@ public:
   /// See RegisterBankInfo::applyMapping.
   void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
 
-  InstructionMapping getInstrMapping(const MachineInstr &MI) const override;
+  const InstructionMapping &
+  getInstrMapping(const MachineInstr &MI) const override;
 };
 
 } // namespace llvm
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 9ab751e2b002a..d66d39dcee174 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -139,12 +139,18 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
     return X86II::MO_NO_FLAG;
 
   assert(!isTargetCOFF());
+  const Function *F = dyn_cast_or_null<Function>(GV);
 
-  if (isTargetELF())
+  if (isTargetELF()) {
+    if (is64Bit() && F && (CallingConv::X86_RegCall == F->getCallingConv()))
+      // According to psABI, PLT stub clobbers XMM8-XMM15.
+      // In Regcall calling convention those registers are used for passing
+      // parameters. Thus we need to prevent lazy binding in Regcall.
+      return X86II::MO_GOTPCREL;
     return X86II::MO_PLT;
+  }
 
   if (is64Bit()) {
-    auto *F = dyn_cast_or_null<Function>(GV);
     if (F && F->hasFnAttribute(Attribute::NonLazyBind))
       // If the function is marked as non-lazy, generate an indirect call
      // which loads from the GOT directly. This avoids runtime overhead
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index b742fb472372c..f3b619a2956a0 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1426,25 +1426,25 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
   };
   static const CostTblEntry AVX1CostTbl[] = {
-    { ISD::BITREVERSE, MVT::v4i64, 10 },
-    { ISD::BITREVERSE, MVT::v8i32, 10 },
-    { ISD::BITREVERSE, MVT::v16i16, 10 },
-    { ISD::BITREVERSE, MVT::v32i8, 10 },
+    { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
+    { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
+    { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
+    { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
     { ISD::BSWAP, MVT::v4i64, 4 },
     { ISD::BSWAP, MVT::v8i32, 4 },
     { ISD::BSWAP, MVT::v16i16, 4 },
-    { ISD::CTLZ, MVT::v4i64, 46 },
-    { ISD::CTLZ, MVT::v8i32, 36 },
-    { ISD::CTLZ, MVT::v16i16, 28 },
-    { ISD::CTLZ, MVT::v32i8, 18 },
-    { ISD::CTPOP, MVT::v4i64, 14 },
-    { ISD::CTPOP, MVT::v8i32, 22 },
-    { ISD::CTPOP, MVT::v16i16, 18 },
-    { ISD::CTPOP, MVT::v32i8, 12 },
-    { ISD::CTTZ, MVT::v4i64, 20 },
-    { ISD::CTTZ, MVT::v8i32, 28 },
-    { ISD::CTTZ, MVT::v16i16, 24 },
-    { ISD::CTTZ, MVT::v32i8, 18 },
+    { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
     { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
     { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
     { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
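
A minimal sketch of the kind of source the X86AsmParser change above is aimed at: an MS-style inline-asm memory operand with no explicit size qualifier, where the frontend-reported type size (Mem.FrontendSize) breaks the tie between otherwise ambiguous matches such as "movzx r32, m8" and "movzx r32, m16". The variable name, function name, and surrounding C++ are hypothetical and only illustrate the scenario; they are not part of the patch.

  // Hypothetical 32-bit MSVC-style example. "movzx eax, w" has no size
  // qualifier, so both the m8 and m16 forms of MOVZX match; the 16-bit
  // frontend size of 'w' selects the m16 encoding.
  unsigned short w = 0x1234;

  unsigned load_w() {
    __asm movzx eax, w   // disambiguated via the recorded frontend size
  }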