Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/X86')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp  4986
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h  44
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86Operand.h  718
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp  2362
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h  647
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/ImmutableGraph.h  445
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp  498
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h  119
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp  1603
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h  1226
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp  345
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h  40
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp  1461
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.h  26
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp  389
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h  43
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp  454
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h  138
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp  169
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h  66
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp  1840
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h  79
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp  790
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h  145
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp  603
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp  571
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.h  166
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h  34
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp  113
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp  80
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp  461
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp  28
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.h  21
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86.h  186
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86.td  1477
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.cpp  802
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.h  155
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp  735
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp  135
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86CallFrameOptimization.cpp  640
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.cpp  489
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.h  54
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.cpp  344
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.h  33
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.td  1175
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86CmovConversion.cpp  861
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp  184
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp  793
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp  295
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp  539
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp  4028
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86FixupBWInsts.cpp  459
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp  702
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86FixupSetCC.cpp  133
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp  984
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp  1730
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp  3597
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h  257
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86GenRegisterBankInfo.def  99
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp  6020
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp  51718
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h  1713
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp  175
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86IndirectThunks.cpp  269
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InsertPrefetch.cpp  253
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InsertWait.cpp  147
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp  2017
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86Instr3DNow.td  112
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrAMX.td  149
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td  12239
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td  1545
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrBuilder.h  232
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrCMovSetCC.td  127
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td  2179
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrControl.td  430
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrExtension.td  222
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA.td  640
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.cpp  164
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.h  97
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrFPStack.td  815
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.cpp  5697
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.h  97
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td  1011
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td  1195
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp  9065
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h  634
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td  3740
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrKL.td  86
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrMMX.td  582
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrMPX.td  77
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrSGX.td  29
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrSNP.td  47
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td  7995
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrSVM.td  72
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrShiftRotate.td  1033
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td  755
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrTDX.td  39
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrTSX.td  59
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrVMX.td  87
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrVecCompiler.td  459
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrXOP.td  473
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstructionSelector.cpp  1693
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp  848
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86IntrinsicsInfo.h  1177
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.cpp  526
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.h  51
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp  816
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp  120
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86LowerAMXType.cpp  351
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp  2631
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp  30
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86MachineFunctionInfo.h  230
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.cpp  74
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.h  31
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86OptimizeLEAs.cpp  723
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86PadShortFunction.cpp  230
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86PartialReduction.cpp  487
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86PfmCounters.td  235
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86PreTileConfig.cpp  265
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86RegisterBankInfo.cpp  315
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86RegisterBankInfo.h  81
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86RegisterBanks.td  16
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.cpp  935
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.h  156
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.td  646
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86SchedBroadwell.td  1733
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86SchedHaswell.td  2008
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86SchedPredicates.td  143
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86SchedSandyBridge.td  1226
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeClient.td  1894
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeServer.td  2618
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86Schedule.td  731
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleAtom.td  908
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBdVer2.td  1462
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBtVer2.td  1049
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleSLM.td  474
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver1.td  1561
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver2.td  1548
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp  325
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.h  45
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp  296
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h  43
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp  182
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp  2278
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.cpp  352
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.h  949
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp  584
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.h  63
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.cpp  58
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.h  52
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp  4761
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.h  256
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86TileConfig.cpp  248
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86VZeroUpper.cpp  358
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86WinAllocaExpander.cpp  302
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86WinEHState.cpp  789
156 files changed, 198054 insertions, 0 deletions
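The same per-file summary can be regenerated from a checkout of the tree with a plain git invocation along the following lines (a sketch only; the two revision names are placeholders, not values taken from this page):

git diff --stat <parent-revision> <import-revision> -- contrib/llvm-project/llvm/lib/Target/X86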
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
new file mode 100644
index 000000000000..9d9a20183f0f
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -0,0 +1,4986 @@
+//===-- X86AsmParser.cpp - Parse X86 assembly to MCInst instructions ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86IntelInstPrinter.h"
+#include "MCTargetDesc/X86MCExpr.h"
+#include "MCTargetDesc/X86TargetStreamer.h"
+#include "TargetInfo/X86TargetInfo.h"
+#include "X86AsmParserCommon.h"
+#include "X86Operand.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <memory>
+
+using namespace llvm;
+
+static cl::opt<bool> LVIInlineAsmHardening(
+ "x86-experimental-lvi-inline-asm-hardening",
+ cl::desc("Harden inline assembly code that may be vulnerable to Load Value"
+ " Injection (LVI). This feature is experimental."), cl::Hidden);
+
+static bool checkScale(unsigned Scale, StringRef &ErrMsg) {
+ if (Scale != 1 && Scale != 2 && Scale != 4 && Scale != 8) {
+ ErrMsg = "scale factor in address must be 1, 2, 4 or 8";
+ return true;
+ }
+ return false;
+}
+
+namespace {
+
+static const char OpPrecedence[] = {
+ 0, // IC_OR
+ 1, // IC_XOR
+ 2, // IC_AND
+ 4, // IC_LSHIFT
+ 4, // IC_RSHIFT
+ 5, // IC_PLUS
+ 5, // IC_MINUS
+ 6, // IC_MULTIPLY
+ 6, // IC_DIVIDE
+ 6, // IC_MOD
+ 7, // IC_NOT
+ 8, // IC_NEG
+ 9, // IC_RPAREN
+ 10, // IC_LPAREN
+ 0, // IC_IMM
+ 0, // IC_REGISTER
+ 3, // IC_EQ
+ 3, // IC_NE
+ 3, // IC_LT
+ 3, // IC_LE
+ 3, // IC_GT
+ 3 // IC_GE
+};
+
+class X86AsmParser : public MCTargetAsmParser {
+ ParseInstructionInfo *InstInfo;
+ bool Code16GCC;
+ unsigned ForcedDataPrefix = 0;
+
+ enum VEXEncoding {
+ VEXEncoding_Default,
+ VEXEncoding_VEX,
+ VEXEncoding_VEX2,
+ VEXEncoding_VEX3,
+ VEXEncoding_EVEX,
+ };
+
+ VEXEncoding ForcedVEXEncoding = VEXEncoding_Default;
+
+ enum DispEncoding {
+ DispEncoding_Default,
+ DispEncoding_Disp8,
+ DispEncoding_Disp32,
+ };
+
+ DispEncoding ForcedDispEncoding = DispEncoding_Default;
+
+private:
+ SMLoc consumeToken() {
+ MCAsmParser &Parser = getParser();
+ SMLoc Result = Parser.getTok().getLoc();
+ Parser.Lex();
+ return Result;
+ }
+
+ X86TargetStreamer &getTargetStreamer() {
+ assert(getParser().getStreamer().getTargetStreamer() &&
+ "do not have a target streamer");
+ MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
+ return static_cast<X86TargetStreamer &>(TS);
+ }
+
+ unsigned MatchInstruction(const OperandVector &Operands, MCInst &Inst,
+ uint64_t &ErrorInfo, FeatureBitset &MissingFeatures,
+ bool matchingInlineAsm, unsigned VariantID = 0) {
+ // In Code16GCC mode, match as 32-bit.
+ if (Code16GCC)
+ SwitchMode(X86::Mode32Bit);
+ unsigned rv = MatchInstructionImpl(Operands, Inst, ErrorInfo,
+ MissingFeatures, matchingInlineAsm,
+ VariantID);
+ if (Code16GCC)
+ SwitchMode(X86::Mode16Bit);
+ return rv;
+ }
+
+ enum InfixCalculatorTok {
+ IC_OR = 0,
+ IC_XOR,
+ IC_AND,
+ IC_LSHIFT,
+ IC_RSHIFT,
+ IC_PLUS,
+ IC_MINUS,
+ IC_MULTIPLY,
+ IC_DIVIDE,
+ IC_MOD,
+ IC_NOT,
+ IC_NEG,
+ IC_RPAREN,
+ IC_LPAREN,
+ IC_IMM,
+ IC_REGISTER,
+ IC_EQ,
+ IC_NE,
+ IC_LT,
+ IC_LE,
+ IC_GT,
+ IC_GE
+ };
+
+ enum IntelOperatorKind {
+ IOK_INVALID = 0,
+ IOK_LENGTH,
+ IOK_SIZE,
+ IOK_TYPE,
+ };
+
+ enum MasmOperatorKind {
+ MOK_INVALID = 0,
+ MOK_LENGTHOF,
+ MOK_SIZEOF,
+ MOK_TYPE,
+ };
+
+ class InfixCalculator {
+ typedef std::pair< InfixCalculatorTok, int64_t > ICToken;
+ SmallVector<InfixCalculatorTok, 4> InfixOperatorStack;
+ SmallVector<ICToken, 4> PostfixStack;
+
+ bool isUnaryOperator(InfixCalculatorTok Op) const {
+ return Op == IC_NEG || Op == IC_NOT;
+ }
+
+ public:
+ int64_t popOperand() {
+ assert (!PostfixStack.empty() && "Popped an empty stack!");
+ ICToken Op = PostfixStack.pop_back_val();
+ if (!(Op.first == IC_IMM || Op.first == IC_REGISTER))
+ return -1; // The invalid Scale value will be caught later by checkScale
+ return Op.second;
+ }
+ void pushOperand(InfixCalculatorTok Op, int64_t Val = 0) {
+ assert ((Op == IC_IMM || Op == IC_REGISTER) &&
+ "Unexpected operand!");
+ PostfixStack.push_back(std::make_pair(Op, Val));
+ }
+
+ void popOperator() { InfixOperatorStack.pop_back(); }
+ void pushOperator(InfixCalculatorTok Op) {
+ // Push the new operator if the stack is empty.
+ if (InfixOperatorStack.empty()) {
+ InfixOperatorStack.push_back(Op);
+ return;
+ }
+
+ // Push the new operator if it has a higher precedence than the operator
+ // on the top of the stack or the operator on the top of the stack is a
+ // left parenthesis.
+ unsigned Idx = InfixOperatorStack.size() - 1;
+ InfixCalculatorTok StackOp = InfixOperatorStack[Idx];
+ if (OpPrecedence[Op] > OpPrecedence[StackOp] || StackOp == IC_LPAREN) {
+ InfixOperatorStack.push_back(Op);
+ return;
+ }
+
+ // The operator on the top of the stack has higher precedence than the
+ // new operator.
+ unsigned ParenCount = 0;
+ while (1) {
+ // Nothing to process.
+ if (InfixOperatorStack.empty())
+ break;
+
+ Idx = InfixOperatorStack.size() - 1;
+ StackOp = InfixOperatorStack[Idx];
+ if (!(OpPrecedence[StackOp] >= OpPrecedence[Op] || ParenCount))
+ break;
+
+ // If the parenthesis count is zero and we see a left parenthesis,
+ // then stop processing.
+ if (!ParenCount && StackOp == IC_LPAREN)
+ break;
+
+ if (StackOp == IC_RPAREN) {
+ ++ParenCount;
+ InfixOperatorStack.pop_back();
+ } else if (StackOp == IC_LPAREN) {
+ --ParenCount;
+ InfixOperatorStack.pop_back();
+ } else {
+ InfixOperatorStack.pop_back();
+ PostfixStack.push_back(std::make_pair(StackOp, 0));
+ }
+ }
+ // Push the new operator.
+ InfixOperatorStack.push_back(Op);
+ }
+
+ int64_t execute() {
+ // Push any remaining operators onto the postfix stack.
+ while (!InfixOperatorStack.empty()) {
+ InfixCalculatorTok StackOp = InfixOperatorStack.pop_back_val();
+ if (StackOp != IC_LPAREN && StackOp != IC_RPAREN)
+ PostfixStack.push_back(std::make_pair(StackOp, 0));
+ }
+
+ if (PostfixStack.empty())
+ return 0;
+
+ SmallVector<ICToken, 16> OperandStack;
+ for (unsigned i = 0, e = PostfixStack.size(); i != e; ++i) {
+ ICToken Op = PostfixStack[i];
+ if (Op.first == IC_IMM || Op.first == IC_REGISTER) {
+ OperandStack.push_back(Op);
+ } else if (isUnaryOperator(Op.first)) {
+ assert (OperandStack.size() > 0 && "Too few operands.");
+ ICToken Operand = OperandStack.pop_back_val();
+ assert (Operand.first == IC_IMM &&
+ "Unary operation with a register!");
+ switch (Op.first) {
+ default:
+ report_fatal_error("Unexpected operator!");
+ break;
+ case IC_NEG:
+ OperandStack.push_back(std::make_pair(IC_IMM, -Operand.second));
+ break;
+ case IC_NOT:
+ OperandStack.push_back(std::make_pair(IC_IMM, ~Operand.second));
+ break;
+ }
+ } else {
+ assert (OperandStack.size() > 1 && "Too few operands.");
+ int64_t Val;
+ ICToken Op2 = OperandStack.pop_back_val();
+ ICToken Op1 = OperandStack.pop_back_val();
+ switch (Op.first) {
+ default:
+ report_fatal_error("Unexpected operator!");
+ break;
+ case IC_PLUS:
+ Val = Op1.second + Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_MINUS:
+ Val = Op1.second - Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_MULTIPLY:
+ assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Multiply operation with an immediate and a register!");
+ Val = Op1.second * Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_DIVIDE:
+ assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Divide operation with an immediate and a register!");
+ assert (Op2.second != 0 && "Division by zero!");
+ Val = Op1.second / Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_MOD:
+ assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Modulo operation with an immediate and a register!");
+ Val = Op1.second % Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_OR:
+ assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Or operation with an immediate and a register!");
+ Val = Op1.second | Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_XOR:
+ assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Xor operation with an immediate and a register!");
+ Val = Op1.second ^ Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_AND:
+ assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "And operation with an immediate and a register!");
+ Val = Op1.second & Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_LSHIFT:
+ assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Left shift operation with an immediate and a register!");
+ Val = Op1.second << Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_RSHIFT:
+ assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Right shift operation with an immediate and a register!");
+ Val = Op1.second >> Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_EQ:
+ assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Equals operation with an immediate and a register!");
+ Val = (Op1.second == Op2.second) ? -1 : 0;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_NE:
+ assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Not-equals operation with an immediate and a register!");
+ Val = (Op1.second != Op2.second) ? -1 : 0;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_LT:
+ assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Less-than operation with an immediate and a register!");
+ Val = (Op1.second < Op2.second) ? -1 : 0;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_LE:
+ assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Less-than-or-equal operation with an immediate and a "
+ "register!");
+ Val = (Op1.second <= Op2.second) ? -1 : 0;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_GT:
+ assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Greater-than operation with an immediate and a register!");
+ Val = (Op1.second > Op2.second) ? -1 : 0;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_GE:
+ assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Greater-than-or-equal operation with an immediate and a "
+ "register!");
+ Val = (Op1.second >= Op2.second) ? -1 : 0;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ }
+ }
+ }
+ assert (OperandStack.size() == 1 && "Expected a single result.");
+ return OperandStack.pop_back_val().second;
+ }
+ };
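+ // (Editorial annotation, not part of the upstream LLVM source.) A minimal
+ // worked example of the shunting-yard evaluation implemented above: for an
+ // Intel expression such as "2 + 3 * 4" the parser issues
+ // pushOperand(IC_IMM, 2), pushOperator(IC_PLUS), pushOperand(IC_IMM, 3),
+ // pushOperator(IC_MULTIPLY), pushOperand(IC_IMM, 4); execute() then drains
+ // the remaining operators onto the postfix stack, yielding "2 3 4 * +",
+ // and evaluates it to 14, with the OpPrecedence table ensuring IC_MULTIPLY
+ // binds tighter than IC_PLUS.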
+
+ enum IntelExprState {
+ IES_INIT,
+ IES_OR,
+ IES_XOR,
+ IES_AND,
+ IES_EQ,
+ IES_NE,
+ IES_LT,
+ IES_LE,
+ IES_GT,
+ IES_GE,
+ IES_LSHIFT,
+ IES_RSHIFT,
+ IES_PLUS,
+ IES_MINUS,
+ IES_OFFSET,
+ IES_CAST,
+ IES_NOT,
+ IES_MULTIPLY,
+ IES_DIVIDE,
+ IES_MOD,
+ IES_LBRAC,
+ IES_RBRAC,
+ IES_LPAREN,
+ IES_RPAREN,
+ IES_REGISTER,
+ IES_INTEGER,
+ IES_IDENTIFIER,
+ IES_ERROR
+ };
+
+ class IntelExprStateMachine {
+ IntelExprState State, PrevState;
+ unsigned BaseReg, IndexReg, TmpReg, Scale;
+ int64_t Imm;
+ const MCExpr *Sym;
+ StringRef SymName;
+ InfixCalculator IC;
+ InlineAsmIdentifierInfo Info;
+ short BracCount;
+ bool MemExpr;
+ bool OffsetOperator;
+ SMLoc OffsetOperatorLoc;
+ AsmTypeInfo CurType;
+
+ bool setSymRef(const MCExpr *Val, StringRef ID, StringRef &ErrMsg) {
+ if (Sym) {
+ ErrMsg = "cannot use more than one symbol in memory operand";
+ return true;
+ }
+ Sym = Val;
+ SymName = ID;
+ return false;
+ }
+
+ public:
+ IntelExprStateMachine()
+ : State(IES_INIT), PrevState(IES_ERROR), BaseReg(0), IndexReg(0),
+ TmpReg(0), Scale(0), Imm(0), Sym(nullptr), BracCount(0),
+ MemExpr(false), OffsetOperator(false) {}
+
+ void addImm(int64_t imm) { Imm += imm; }
+ short getBracCount() const { return BracCount; }
+ bool isMemExpr() const { return MemExpr; }
+ bool isOffsetOperator() const { return OffsetOperator; }
+ SMLoc getOffsetLoc() const { return OffsetOperatorLoc; }
+ unsigned getBaseReg() const { return BaseReg; }
+ unsigned getIndexReg() const { return IndexReg; }
+ unsigned getScale() const { return Scale; }
+ const MCExpr *getSym() const { return Sym; }
+ StringRef getSymName() const { return SymName; }
+ StringRef getType() const { return CurType.Name; }
+ unsigned getSize() const { return CurType.Size; }
+ unsigned getElementSize() const { return CurType.ElementSize; }
+ unsigned getLength() const { return CurType.Length; }
+ int64_t getImm() { return Imm + IC.execute(); }
+ bool isValidEndState() const {
+ return State == IES_RBRAC || State == IES_INTEGER;
+ }
+ bool hadError() const { return State == IES_ERROR; }
+ const InlineAsmIdentifierInfo &getIdentifierInfo() const { return Info; }
+
+ void onOr() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_OR;
+ IC.pushOperator(IC_OR);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onXor() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_XOR;
+ IC.pushOperator(IC_XOR);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onAnd() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_AND;
+ IC.pushOperator(IC_AND);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onEq() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_EQ;
+ IC.pushOperator(IC_EQ);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onNE() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_NE;
+ IC.pushOperator(IC_NE);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onLT() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_LT;
+ IC.pushOperator(IC_LT);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onLE() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_LE;
+ IC.pushOperator(IC_LE);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onGT() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_GT;
+ IC.pushOperator(IC_GT);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onGE() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_GE;
+ IC.pushOperator(IC_GE);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onLShift() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_LSHIFT;
+ IC.pushOperator(IC_LSHIFT);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onRShift() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_RSHIFT;
+ IC.pushOperator(IC_RSHIFT);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ bool onPlus(StringRef &ErrMsg) {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ case IES_OFFSET:
+ State = IES_PLUS;
+ IC.pushOperator(IC_PLUS);
+ if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+ // If we already have a BaseReg, then assume this is the IndexReg with
+ // no explicit scale.
+ if (!BaseReg) {
+ BaseReg = TmpReg;
+ } else {
+ if (IndexReg) {
+ ErrMsg = "BaseReg/IndexReg already set!";
+ return true;
+ }
+ IndexReg = TmpReg;
+ Scale = 0;
+ }
+ }
+ break;
+ }
+ PrevState = CurrState;
+ return false;
+ }
+ bool onMinus(StringRef &ErrMsg) {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_OR:
+ case IES_XOR:
+ case IES_AND:
+ case IES_EQ:
+ case IES_NE:
+ case IES_LT:
+ case IES_LE:
+ case IES_GT:
+ case IES_GE:
+ case IES_LSHIFT:
+ case IES_RSHIFT:
+ case IES_PLUS:
+ case IES_NOT:
+ case IES_MULTIPLY:
+ case IES_DIVIDE:
+ case IES_MOD:
+ case IES_LPAREN:
+ case IES_RPAREN:
+ case IES_LBRAC:
+ case IES_RBRAC:
+ case IES_INTEGER:
+ case IES_REGISTER:
+ case IES_INIT:
+ case IES_OFFSET:
+ State = IES_MINUS;
+ // Push the minus operator if it is not a negate operator.
+ if (CurrState == IES_REGISTER || CurrState == IES_RPAREN ||
+ CurrState == IES_INTEGER || CurrState == IES_RBRAC ||
+ CurrState == IES_OFFSET)
+ IC.pushOperator(IC_MINUS);
+ else if (PrevState == IES_REGISTER && CurrState == IES_MULTIPLY) {
+ // We have negate operator for Scale: it's illegal
+ ErrMsg = "Scale can't be negative";
+ return true;
+ } else
+ IC.pushOperator(IC_NEG);
+ if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+ // If we already have a BaseReg, then assume this is the IndexReg with
+ // no explicit scale.
+ if (!BaseReg) {
+ BaseReg = TmpReg;
+ } else {
+ if (IndexReg) {
+ ErrMsg = "BaseReg/IndexReg already set!";
+ return true;
+ }
+ IndexReg = TmpReg;
+ Scale = 0;
+ }
+ }
+ break;
+ }
+ PrevState = CurrState;
+ return false;
+ }
+ void onNot() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_OR:
+ case IES_XOR:
+ case IES_AND:
+ case IES_EQ:
+ case IES_NE:
+ case IES_LT:
+ case IES_LE:
+ case IES_GT:
+ case IES_GE:
+ case IES_LSHIFT:
+ case IES_RSHIFT:
+ case IES_PLUS:
+ case IES_MINUS:
+ case IES_NOT:
+ case IES_MULTIPLY:
+ case IES_DIVIDE:
+ case IES_MOD:
+ case IES_LPAREN:
+ case IES_LBRAC:
+ case IES_INIT:
+ State = IES_NOT;
+ IC.pushOperator(IC_NOT);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ bool onRegister(unsigned Reg, StringRef &ErrMsg) {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_PLUS:
+ case IES_LPAREN:
+ case IES_LBRAC:
+ State = IES_REGISTER;
+ TmpReg = Reg;
+ IC.pushOperand(IC_REGISTER);
+ break;
+ case IES_MULTIPLY:
+ // Index Register - Scale * Register
+ if (PrevState == IES_INTEGER) {
+ if (IndexReg) {
+ ErrMsg = "BaseReg/IndexReg already set!";
+ return true;
+ }
+ State = IES_REGISTER;
+ IndexReg = Reg;
+ // Get the scale and replace the 'Scale * Register' with '0'.
+ Scale = IC.popOperand();
+ if (checkScale(Scale, ErrMsg))
+ return true;
+ IC.pushOperand(IC_IMM);
+ IC.popOperator();
+ } else {
+ State = IES_ERROR;
+ }
+ break;
+ }
+ PrevState = CurrState;
+ return false;
+ }
+ bool onIdentifierExpr(const MCExpr *SymRef, StringRef SymRefName,
+ const InlineAsmIdentifierInfo &IDInfo,
+ const AsmTypeInfo &Type, bool ParsingMSInlineAsm,
+ StringRef &ErrMsg) {
+ // InlineAsm: Treat an enum value as an integer
+ if (ParsingMSInlineAsm)
+ if (IDInfo.isKind(InlineAsmIdentifierInfo::IK_EnumVal))
+ return onInteger(IDInfo.Enum.EnumVal, ErrMsg);
+ // Treat a symbolic constant like an integer
+ if (auto *CE = dyn_cast<MCConstantExpr>(SymRef))
+ return onInteger(CE->getValue(), ErrMsg);
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_CAST:
+ case IES_PLUS:
+ case IES_MINUS:
+ case IES_NOT:
+ case IES_INIT:
+ case IES_LBRAC:
+ case IES_LPAREN:
+ if (setSymRef(SymRef, SymRefName, ErrMsg))
+ return true;
+ MemExpr = true;
+ State = IES_INTEGER;
+ IC.pushOperand(IC_IMM);
+ if (ParsingMSInlineAsm)
+ Info = IDInfo;
+ setTypeInfo(Type);
+ break;
+ }
+ return false;
+ }
+ bool onInteger(int64_t TmpInt, StringRef &ErrMsg) {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_PLUS:
+ case IES_MINUS:
+ case IES_NOT:
+ case IES_OR:
+ case IES_XOR:
+ case IES_AND:
+ case IES_EQ:
+ case IES_NE:
+ case IES_LT:
+ case IES_LE:
+ case IES_GT:
+ case IES_GE:
+ case IES_LSHIFT:
+ case IES_RSHIFT:
+ case IES_DIVIDE:
+ case IES_MOD:
+ case IES_MULTIPLY:
+ case IES_LPAREN:
+ case IES_INIT:
+ case IES_LBRAC:
+ State = IES_INTEGER;
+ if (PrevState == IES_REGISTER && CurrState == IES_MULTIPLY) {
+ // Index Register - Register * Scale
+ if (IndexReg) {
+ ErrMsg = "BaseReg/IndexReg already set!";
+ return true;
+ }
+ IndexReg = TmpReg;
+ Scale = TmpInt;
+ if (checkScale(Scale, ErrMsg))
+ return true;
+ // Get the scale and replace the 'Register * Scale' with '0'.
+ IC.popOperator();
+ } else {
+ IC.pushOperand(IC_IMM, TmpInt);
+ }
+ break;
+ }
+ PrevState = CurrState;
+ return false;
+ }
+ void onStar() {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_REGISTER:
+ case IES_RPAREN:
+ State = IES_MULTIPLY;
+ IC.pushOperator(IC_MULTIPLY);
+ break;
+ }
+ }
+ void onDivide() {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ State = IES_DIVIDE;
+ IC.pushOperator(IC_DIVIDE);
+ break;
+ }
+ }
+ void onMod() {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ State = IES_MOD;
+ IC.pushOperator(IC_MOD);
+ break;
+ }
+ }
+ bool onLBrac() {
+ if (BracCount)
+ return true;
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_RBRAC:
+ case IES_INTEGER:
+ case IES_RPAREN:
+ State = IES_PLUS;
+ IC.pushOperator(IC_PLUS);
+ CurType.Length = 1;
+ CurType.Size = CurType.ElementSize;
+ break;
+ case IES_INIT:
+ case IES_CAST:
+ assert(!BracCount && "BracCount should be zero on parsing's start");
+ State = IES_LBRAC;
+ break;
+ }
+ MemExpr = true;
+ BracCount++;
+ return false;
+ }
+ bool onRBrac() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_OFFSET:
+ case IES_REGISTER:
+ case IES_RPAREN:
+ if (BracCount-- != 1)
+ return true;
+ State = IES_RBRAC;
+ if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+ // If we already have a BaseReg, then assume this is the IndexReg with
+ // no explicit scale.
+ if (!BaseReg) {
+ BaseReg = TmpReg;
+ } else {
+ assert (!IndexReg && "BaseReg/IndexReg already set!");
+ IndexReg = TmpReg;
+ Scale = 0;
+ }
+ }
+ break;
+ }
+ PrevState = CurrState;
+ return false;
+ }
+ void onLParen() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_PLUS:
+ case IES_MINUS:
+ case IES_NOT:
+ case IES_OR:
+ case IES_XOR:
+ case IES_AND:
+ case IES_EQ:
+ case IES_NE:
+ case IES_LT:
+ case IES_LE:
+ case IES_GT:
+ case IES_GE:
+ case IES_LSHIFT:
+ case IES_RSHIFT:
+ case IES_MULTIPLY:
+ case IES_DIVIDE:
+ case IES_MOD:
+ case IES_LPAREN:
+ case IES_INIT:
+ case IES_LBRAC:
+ State = IES_LPAREN;
+ IC.pushOperator(IC_LPAREN);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onRParen() {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_OFFSET:
+ case IES_REGISTER:
+ case IES_RBRAC:
+ case IES_RPAREN:
+ State = IES_RPAREN;
+ IC.pushOperator(IC_RPAREN);
+ break;
+ }
+ }
+ bool onOffset(const MCExpr *Val, SMLoc OffsetLoc, StringRef ID,
+ const InlineAsmIdentifierInfo &IDInfo,
+ bool ParsingMSInlineAsm, StringRef &ErrMsg) {
+ PrevState = State;
+ switch (State) {
+ default:
+ ErrMsg = "unexpected offset operator expression";
+ return true;
+ case IES_PLUS:
+ case IES_INIT:
+ case IES_LBRAC:
+ if (setSymRef(Val, ID, ErrMsg))
+ return true;
+ OffsetOperator = true;
+ OffsetOperatorLoc = OffsetLoc;
+ State = IES_OFFSET;
+ // As we cannot yet resolve the actual value (offset), we retain
+ // the requested semantics by pushing a '0' to the operands stack
+ IC.pushOperand(IC_IMM);
+ if (ParsingMSInlineAsm) {
+ Info = IDInfo;
+ }
+ break;
+ }
+ return false;
+ }
+ void onCast(AsmTypeInfo Info) {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_LPAREN:
+ setTypeInfo(Info);
+ State = IES_CAST;
+ break;
+ }
+ }
+ void setTypeInfo(AsmTypeInfo Type) { CurType = Type; }
+ };
+
+ bool Error(SMLoc L, const Twine &Msg, SMRange Range = None,
+ bool MatchingInlineAsm = false) {
+ MCAsmParser &Parser = getParser();
+ if (MatchingInlineAsm) {
+ if (!getLexer().isAtStartOfStatement())
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+ return Parser.Error(L, Msg, Range);
+ }
+
+ bool MatchRegisterByName(unsigned &RegNo, StringRef RegName, SMLoc StartLoc,
+ SMLoc EndLoc);
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc,
+ bool RestoreOnFailure);
+
+ std::unique_ptr<X86Operand> DefaultMemSIOperand(SMLoc Loc);
+ std::unique_ptr<X86Operand> DefaultMemDIOperand(SMLoc Loc);
+ bool IsSIReg(unsigned Reg);
+ unsigned GetSIDIForRegClass(unsigned RegClassID, unsigned Reg, bool IsSIReg);
+ void
+ AddDefaultSrcDestOperands(OperandVector &Operands,
+ std::unique_ptr<llvm::MCParsedAsmOperand> &&Src,
+ std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst);
+ bool VerifyAndAdjustOperands(OperandVector &OrigOperands,
+ OperandVector &FinalOperands);
+ bool ParseOperand(OperandVector &Operands);
+ bool ParseATTOperand(OperandVector &Operands);
+ bool ParseIntelOperand(OperandVector &Operands);
+ bool ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID,
+ InlineAsmIdentifierInfo &Info, SMLoc &End);
+ bool ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End);
+ unsigned IdentifyIntelInlineAsmOperator(StringRef Name);
+ unsigned ParseIntelInlineAsmOperator(unsigned OpKind);
+ unsigned IdentifyMasmOperator(StringRef Name);
+ bool ParseMasmOperator(unsigned OpKind, int64_t &Val);
+ bool ParseRoundingModeOp(SMLoc Start, OperandVector &Operands);
+ bool ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM,
+ bool &ParseError, SMLoc &End);
+ bool ParseMasmNamedOperator(StringRef Name, IntelExprStateMachine &SM,
+ bool &ParseError, SMLoc &End);
+ void RewriteIntelExpression(IntelExprStateMachine &SM, SMLoc Start,
+ SMLoc End);
+ bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
+ bool ParseIntelInlineAsmIdentifier(const MCExpr *&Val, StringRef &Identifier,
+ InlineAsmIdentifierInfo &Info,
+ bool IsUnevaluatedOperand, SMLoc &End,
+ bool IsParsingOffsetOperator = false);
+
+ bool ParseMemOperand(unsigned SegReg, const MCExpr *Disp, SMLoc StartLoc,
+ SMLoc EndLoc, OperandVector &Operands);
+
+ X86::CondCode ParseConditionCode(StringRef CCode);
+
+ bool ParseIntelMemoryOperandSize(unsigned &Size);
+ bool CreateMemForMSInlineAsm(unsigned SegReg, const MCExpr *Disp,
+ unsigned BaseReg, unsigned IndexReg,
+ unsigned Scale, SMLoc Start, SMLoc End,
+ unsigned Size, StringRef Identifier,
+ const InlineAsmIdentifierInfo &Info,
+ OperandVector &Operands);
+
+ bool parseDirectiveArch();
+ bool parseDirectiveNops(SMLoc L);
+ bool parseDirectiveEven(SMLoc L);
+ bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
+
+ /// CodeView FPO data directives.
+ bool parseDirectiveFPOProc(SMLoc L);
+ bool parseDirectiveFPOSetFrame(SMLoc L);
+ bool parseDirectiveFPOPushReg(SMLoc L);
+ bool parseDirectiveFPOStackAlloc(SMLoc L);
+ bool parseDirectiveFPOStackAlign(SMLoc L);
+ bool parseDirectiveFPOEndPrologue(SMLoc L);
+ bool parseDirectiveFPOEndProc(SMLoc L);
+
+ /// SEH directives.
+ bool parseSEHRegisterNumber(unsigned RegClassID, unsigned &RegNo);
+ bool parseDirectiveSEHPushReg(SMLoc);
+ bool parseDirectiveSEHSetFrame(SMLoc);
+ bool parseDirectiveSEHSaveReg(SMLoc);
+ bool parseDirectiveSEHSaveXMM(SMLoc);
+ bool parseDirectiveSEHPushFrame(SMLoc);
+
+ unsigned checkTargetMatchPredicate(MCInst &Inst) override;
+
+ bool validateInstruction(MCInst &Inst, const OperandVector &Ops);
+ bool processInstruction(MCInst &Inst, const OperandVector &Ops);
+
+ // Load Value Injection (LVI) Mitigations for machine code
+ void emitWarningForSpecialLVIInstruction(SMLoc Loc);
+ void applyLVICFIMitigation(MCInst &Inst, MCStreamer &Out);
+ void applyLVILoadHardeningMitigation(MCInst &Inst, MCStreamer &Out);
+
+ /// Wrapper around MCStreamer::emitInstruction(). Possibly adds
+ /// instrumentation around Inst.
+ void emitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out);
+
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) override;
+
+ void MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, OperandVector &Operands,
+ MCStreamer &Out, bool MatchingInlineAsm);
+
+ bool ErrorMissingFeature(SMLoc IDLoc, const FeatureBitset &MissingFeatures,
+ bool MatchingInlineAsm);
+
+ bool MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm);
+
+ bool MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm);
+
+ bool OmitRegisterFromClobberLists(unsigned RegNo) override;
+
+ /// Parses AVX512 specific operand primitives: masked registers ({%k<NUM>}, {z})
+ /// and memory broadcasting ({1to<NUM>}) primitives, updating Operands vector if required.
+ /// return false if no parsing errors occurred, true otherwise.
+ bool HandleAVX512Operand(OperandVector &Operands);
+
+ bool ParseZ(std::unique_ptr<X86Operand> &Z, const SMLoc &StartLoc);
+
+ bool is64BitMode() const {
+ // FIXME: Can tablegen auto-generate this?
+ return getSTI().getFeatureBits()[X86::Mode64Bit];
+ }
+ bool is32BitMode() const {
+ // FIXME: Can tablegen auto-generate this?
+ return getSTI().getFeatureBits()[X86::Mode32Bit];
+ }
+ bool is16BitMode() const {
+ // FIXME: Can tablegen auto-generate this?
+ return getSTI().getFeatureBits()[X86::Mode16Bit];
+ }
+ void SwitchMode(unsigned mode) {
+ MCSubtargetInfo &STI = copySTI();
+ FeatureBitset AllModes({X86::Mode64Bit, X86::Mode32Bit, X86::Mode16Bit});
+ FeatureBitset OldMode = STI.getFeatureBits() & AllModes;
+ FeatureBitset FB = ComputeAvailableFeatures(
+ STI.ToggleFeature(OldMode.flip(mode)));
+ setAvailableFeatures(FB);
+
+ assert(FeatureBitset({mode}) == (STI.getFeatureBits() & AllModes));
+ }
+
+ unsigned getPointerWidth() {
+ if (is16BitMode()) return 16;
+ if (is32BitMode()) return 32;
+ if (is64BitMode()) return 64;
+ llvm_unreachable("invalid mode");
+ }
+
+ bool isParsingIntelSyntax() {
+ return getParser().getAssemblerDialect();
+ }
+
+ /// @name Auto-generated Matcher Functions
+ /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "X86GenAsmMatcher.inc"
+
+ /// }
+
+public:
+ enum X86MatchResultTy {
+ Match_Unsupported = FIRST_TARGET_MATCH_RESULT_TY,
+#define GET_OPERAND_DIAGNOSTIC_TYPES
+#include "X86GenAsmMatcher.inc"
+ };
+
+ X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser,
+ const MCInstrInfo &mii, const MCTargetOptions &Options)
+ : MCTargetAsmParser(Options, sti, mii), InstInfo(nullptr),
+ Code16GCC(false) {
+
+ Parser.addAliasForDirective(".word", ".2byte");
+
+ // Initialize the set of available features.
+ setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+ }
+
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
+
+ bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) override;
+
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override;
+
+ bool ParseDirective(AsmToken DirectiveID) override;
+};
+} // end anonymous namespace
+
+/// @name Auto-generated Match Functions
+/// {
+
+static unsigned MatchRegisterName(StringRef Name);
+
+/// }
+
+static bool CheckBaseRegAndIndexRegAndScale(unsigned BaseReg, unsigned IndexReg,
+ unsigned Scale, bool Is64BitMode,
+ StringRef &ErrMsg) {
+ // If we have both a base register and an index register make sure they are
+ // both 64-bit or 32-bit registers.
+ // To support VSIB, IndexReg can be 128-bit or 256-bit registers.
+
+ if (BaseReg != 0 &&
+ !(BaseReg == X86::RIP || BaseReg == X86::EIP ||
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) ||
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg))) {
+ ErrMsg = "invalid base+index expression";
+ return true;
+ }
+
+ if (IndexReg != 0 &&
+ !(IndexReg == X86::EIZ || IndexReg == X86::RIZ ||
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::VR128XRegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::VR256XRegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::VR512RegClassID].contains(IndexReg))) {
+ ErrMsg = "invalid base+index expression";
+ return true;
+ }
+
+ if (((BaseReg == X86::RIP || BaseReg == X86::EIP) && IndexReg != 0) ||
+ IndexReg == X86::EIP || IndexReg == X86::RIP ||
+ IndexReg == X86::ESP || IndexReg == X86::RSP) {
+ ErrMsg = "invalid base+index expression";
+ return true;
+ }
+
+ // Check for use of invalid 16-bit registers. Only BX/BP/SI/DI are allowed,
+ // and then only in non-64-bit modes.
+ if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
+ (Is64BitMode || (BaseReg != X86::BX && BaseReg != X86::BP &&
+ BaseReg != X86::SI && BaseReg != X86::DI))) {
+ ErrMsg = "invalid 16-bit base register";
+ return true;
+ }
+
+ if (BaseReg == 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)) {
+ ErrMsg = "16-bit memory operand may not include only index register";
+ return true;
+ }
+
+ if (BaseReg != 0 && IndexReg != 0) {
+ if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) &&
+ (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) ||
+ IndexReg == X86::EIZ)) {
+ ErrMsg = "base register is 64-bit, but index register is not";
+ return true;
+ }
+ if (X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) &&
+ (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg) ||
+ IndexReg == X86::RIZ)) {
+ ErrMsg = "base register is 32-bit, but index register is not";
+ return true;
+ }
+ if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg)) {
+ if (X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) {
+ ErrMsg = "base register is 16-bit, but index register is not";
+ return true;
+ }
+ if ((BaseReg != X86::BX && BaseReg != X86::BP) ||
+ (IndexReg != X86::SI && IndexReg != X86::DI)) {
+ ErrMsg = "invalid 16-bit base/index register combination";
+ return true;
+ }
+ }
+ }
+
+ // RIP/EIP-relative addressing is only supported in 64-bit mode.
+ if (!Is64BitMode && BaseReg != 0 &&
+ (BaseReg == X86::RIP || BaseReg == X86::EIP)) {
+ ErrMsg = "IP-relative addressing requires 64-bit mode";
+ return true;
+ }
+
+ return checkScale(Scale, ErrMsg);
+}
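+// (Editorial annotation, not part of the upstream LLVM source.) For example,
+// under these checks AT&T-style operands such as "(%rax,%rbx,2)" or
+// "(%eax,%ebx,4)" are accepted, "(%rax,%ebx)" is rejected with "base register
+// is 64-bit, but index register is not", and %rsp/%esp are never valid as the
+// index register.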
+
+bool X86AsmParser::MatchRegisterByName(unsigned &RegNo, StringRef RegName,
+ SMLoc StartLoc, SMLoc EndLoc) {
+ // If we encounter a %, ignore it. This code handles registers with and
+ // without the prefix, unprefixed registers can occur in cfi directives.
+ RegName.consume_front("%");
+
+ RegNo = MatchRegisterName(RegName);
+
+ // If the match failed, try the register name as lowercase.
+ if (RegNo == 0)
+ RegNo = MatchRegisterName(RegName.lower());
+
+ // The "flags" and "mxcsr" registers cannot be referenced directly.
+ // Treat them as identifiers instead.
+ if (isParsingMSInlineAsm() && isParsingIntelSyntax() &&
+ (RegNo == X86::EFLAGS || RegNo == X86::MXCSR))
+ RegNo = 0;
+
+ if (!is64BitMode()) {
+ // FIXME: This should be done using Requires<Not64BitMode> and
+ // Requires<In64BitMode> so "eiz" usage in 64-bit instructions can be also
+ // checked.
+ if (RegNo == X86::RIZ || RegNo == X86::RIP ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) ||
+ X86II::isX86_64NonExtLowByteReg(RegNo) ||
+ X86II::isX86_64ExtendedReg(RegNo)) {
+ return Error(StartLoc,
+ "register %" + RegName + " is only available in 64-bit mode",
+ SMRange(StartLoc, EndLoc));
+ }
+ }
+
+ // If this is "db[0-15]", match it as an alias
+ // for dr[0-15].
+ if (RegNo == 0 && RegName.startswith("db")) {
+ if (RegName.size() == 3) {
+ switch (RegName[2]) {
+ case '0':
+ RegNo = X86::DR0;
+ break;
+ case '1':
+ RegNo = X86::DR1;
+ break;
+ case '2':
+ RegNo = X86::DR2;
+ break;
+ case '3':
+ RegNo = X86::DR3;
+ break;
+ case '4':
+ RegNo = X86::DR4;
+ break;
+ case '5':
+ RegNo = X86::DR5;
+ break;
+ case '6':
+ RegNo = X86::DR6;
+ break;
+ case '7':
+ RegNo = X86::DR7;
+ break;
+ case '8':
+ RegNo = X86::DR8;
+ break;
+ case '9':
+ RegNo = X86::DR9;
+ break;
+ }
+ } else if (RegName.size() == 4 && RegName[2] == '1') {
+ switch (RegName[3]) {
+ case '0':
+ RegNo = X86::DR10;
+ break;
+ case '1':
+ RegNo = X86::DR11;
+ break;
+ case '2':
+ RegNo = X86::DR12;
+ break;
+ case '3':
+ RegNo = X86::DR13;
+ break;
+ case '4':
+ RegNo = X86::DR14;
+ break;
+ case '5':
+ RegNo = X86::DR15;
+ break;
+ }
+ }
+ }
+
+ if (RegNo == 0) {
+ if (isParsingIntelSyntax())
+ return true;
+ return Error(StartLoc, "invalid register name", SMRange(StartLoc, EndLoc));
+ }
+ return false;
+}
+
+bool X86AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc, bool RestoreOnFailure) {
+ MCAsmParser &Parser = getParser();
+ MCAsmLexer &Lexer = getLexer();
+ RegNo = 0;
+
+ SmallVector<AsmToken, 5> Tokens;
+ auto OnFailure = [RestoreOnFailure, &Lexer, &Tokens]() {
+ if (RestoreOnFailure) {
+ while (!Tokens.empty()) {
+ Lexer.UnLex(Tokens.pop_back_val());
+ }
+ }
+ };
+
+ const AsmToken &PercentTok = Parser.getTok();
+ StartLoc = PercentTok.getLoc();
+
+ // If we encounter a %, ignore it. This code handles registers with and
+ // without the prefix, unprefixed registers can occur in cfi directives.
+ if (!isParsingIntelSyntax() && PercentTok.is(AsmToken::Percent)) {
+ Tokens.push_back(PercentTok);
+ Parser.Lex(); // Eat percent token.
+ }
+
+ const AsmToken &Tok = Parser.getTok();
+ EndLoc = Tok.getEndLoc();
+
+ if (Tok.isNot(AsmToken::Identifier)) {
+ OnFailure();
+ if (isParsingIntelSyntax()) return true;
+ return Error(StartLoc, "invalid register name",
+ SMRange(StartLoc, EndLoc));
+ }
+
+ if (MatchRegisterByName(RegNo, Tok.getString(), StartLoc, EndLoc)) {
+ OnFailure();
+ return true;
+ }
+
+ // Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens.
+ if (RegNo == X86::ST0) {
+ Tokens.push_back(Tok);
+ Parser.Lex(); // Eat 'st'
+
+ // Check to see if we have '(4)' after %st.
+ if (Lexer.isNot(AsmToken::LParen))
+ return false;
+ // Lex the paren.
+ Tokens.push_back(Parser.getTok());
+ Parser.Lex();
+
+ const AsmToken &IntTok = Parser.getTok();
+ if (IntTok.isNot(AsmToken::Integer)) {
+ OnFailure();
+ return Error(IntTok.getLoc(), "expected stack index");
+ }
+ switch (IntTok.getIntVal()) {
+ case 0: RegNo = X86::ST0; break;
+ case 1: RegNo = X86::ST1; break;
+ case 2: RegNo = X86::ST2; break;
+ case 3: RegNo = X86::ST3; break;
+ case 4: RegNo = X86::ST4; break;
+ case 5: RegNo = X86::ST5; break;
+ case 6: RegNo = X86::ST6; break;
+ case 7: RegNo = X86::ST7; break;
+ default:
+ OnFailure();
+ return Error(IntTok.getLoc(), "invalid stack index");
+ }
+
+ // Lex IntTok
+ Tokens.push_back(IntTok);
+ Parser.Lex();
+ if (Lexer.isNot(AsmToken::RParen)) {
+ OnFailure();
+ return Error(Parser.getTok().getLoc(), "expected ')'");
+ }
+
+ EndLoc = Parser.getTok().getEndLoc();
+ Parser.Lex(); // Eat ')'
+ return false;
+ }
+
+ EndLoc = Parser.getTok().getEndLoc();
+
+ if (RegNo == 0) {
+ OnFailure();
+ if (isParsingIntelSyntax()) return true;
+ return Error(StartLoc, "invalid register name",
+ SMRange(StartLoc, EndLoc));
+ }
+
+ Parser.Lex(); // Eat identifier token.
+ return false;
+}
+
+bool X86AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ return ParseRegister(RegNo, StartLoc, EndLoc, /*RestoreOnFailure=*/false);
+}
+
+OperandMatchResultTy X86AsmParser::tryParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ bool Result =
+ ParseRegister(RegNo, StartLoc, EndLoc, /*RestoreOnFailure=*/true);
+ bool PendingErrors = getParser().hasPendingError();
+ getParser().clearPendingErrors();
+ if (PendingErrors)
+ return MatchOperand_ParseFail;
+ if (Result)
+ return MatchOperand_NoMatch;
+ return MatchOperand_Success;
+}
+
+std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
+ bool Parse32 = is32BitMode() || Code16GCC;
+ unsigned Basereg = is64BitMode() ? X86::RSI : (Parse32 ? X86::ESI : X86::SI);
+ const MCExpr *Disp = MCConstantExpr::create(0, getContext());
+ return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp,
+ /*BaseReg=*/Basereg, /*IndexReg=*/0, /*Scale=*/1,
+ Loc, Loc, 0);
+}
+
+std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) {
+ bool Parse32 = is32BitMode() || Code16GCC;
+ unsigned Basereg = is64BitMode() ? X86::RDI : (Parse32 ? X86::EDI : X86::DI);
+ const MCExpr *Disp = MCConstantExpr::create(0, getContext());
+ return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp,
+ /*BaseReg=*/Basereg, /*IndexReg=*/0, /*Scale=*/1,
+ Loc, Loc, 0);
+}
+
+bool X86AsmParser::IsSIReg(unsigned Reg) {
+ switch (Reg) {
+ default: llvm_unreachable("Only (R|E)SI and (R|E)DI are expected!");
+ case X86::RSI:
+ case X86::ESI:
+ case X86::SI:
+ return true;
+ case X86::RDI:
+ case X86::EDI:
+ case X86::DI:
+ return false;
+ }
+}
+
+unsigned X86AsmParser::GetSIDIForRegClass(unsigned RegClassID, unsigned Reg,
+ bool IsSIReg) {
+ switch (RegClassID) {
+ default: llvm_unreachable("Unexpected register class");
+ case X86::GR64RegClassID:
+ return IsSIReg ? X86::RSI : X86::RDI;
+ case X86::GR32RegClassID:
+ return IsSIReg ? X86::ESI : X86::EDI;
+ case X86::GR16RegClassID:
+ return IsSIReg ? X86::SI : X86::DI;
+ }
+}
+
+void X86AsmParser::AddDefaultSrcDestOperands(
+ OperandVector& Operands, std::unique_ptr<llvm::MCParsedAsmOperand> &&Src,
+ std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst) {
+ if (isParsingIntelSyntax()) {
+ Operands.push_back(std::move(Dst));
+ Operands.push_back(std::move(Src));
+ }
+ else {
+ Operands.push_back(std::move(Src));
+ Operands.push_back(std::move(Dst));
+ }
+}
+
+bool X86AsmParser::VerifyAndAdjustOperands(OperandVector &OrigOperands,
+ OperandVector &FinalOperands) {
+
+ if (OrigOperands.size() > 1) {
+ // Check if sizes match, OrigOperands also contains the instruction name
+ assert(OrigOperands.size() == FinalOperands.size() + 1 &&
+ "Operand size mismatch");
+
+ SmallVector<std::pair<SMLoc, std::string>, 2> Warnings;
+ // Verify types match
+ int RegClassID = -1;
+ for (unsigned int i = 0; i < FinalOperands.size(); ++i) {
+ X86Operand &OrigOp = static_cast<X86Operand &>(*OrigOperands[i + 1]);
+ X86Operand &FinalOp = static_cast<X86Operand &>(*FinalOperands[i]);
+
+ if (FinalOp.isReg() &&
+ (!OrigOp.isReg() || FinalOp.getReg() != OrigOp.getReg()))
+ // Return false and let a normal complaint about bogus operands happen
+ return false;
+
+ if (FinalOp.isMem()) {
+
+ if (!OrigOp.isMem())
+ // Return false and let a normal complaint about bogus operands happen
+ return false;
+
+ unsigned OrigReg = OrigOp.Mem.BaseReg;
+ unsigned FinalReg = FinalOp.Mem.BaseReg;
+
+ // If we've already encountered a register class, make sure all register
+ // bases are of the same register class.
+ if (RegClassID != -1 &&
+ !X86MCRegisterClasses[RegClassID].contains(OrigReg)) {
+ return Error(OrigOp.getStartLoc(),
+ "mismatching source and destination index registers");
+ }
+
+ if (X86MCRegisterClasses[X86::GR64RegClassID].contains(OrigReg))
+ RegClassID = X86::GR64RegClassID;
+ else if (X86MCRegisterClasses[X86::GR32RegClassID].contains(OrigReg))
+ RegClassID = X86::GR32RegClassID;
+ else if (X86MCRegisterClasses[X86::GR16RegClassID].contains(OrigReg))
+ RegClassID = X86::GR16RegClassID;
+ else
+ // Unexpected register class type
+ // Return false and let a normal complaint about bogus operands happen
+ return false;
+
+ bool IsSI = IsSIReg(FinalReg);
+ FinalReg = GetSIDIForRegClass(RegClassID, FinalReg, IsSI);
+
+ if (FinalReg != OrigReg) {
+ std::string RegName = IsSI ? "ES:(R|E)SI" : "ES:(R|E)DI";
+ Warnings.push_back(std::make_pair(
+ OrigOp.getStartLoc(),
+ "memory operand is only for determining the size, " + RegName +
+ " will be used for the location"));
+ }
+
+ FinalOp.Mem.Size = OrigOp.Mem.Size;
+ FinalOp.Mem.SegReg = OrigOp.Mem.SegReg;
+ FinalOp.Mem.BaseReg = FinalReg;
+ }
+ }
+
+ // Produce warnings only if all the operands passed the adjustment; this
+ // prevents legal cases like "movsd (%rax), %xmm0" from mistakenly producing
+ // warnings.
+ for (auto &WarningMsg : Warnings) {
+ Warning(WarningMsg.first, WarningMsg.second);
+ }
+
+ // Remove old operands
+ for (unsigned int i = 0; i < FinalOperands.size(); ++i)
+ OrigOperands.pop_back();
+ }
+ // OrigOperands.append(FinalOperands.begin(), FinalOperands.end());
+ for (unsigned int i = 0; i < FinalOperands.size(); ++i)
+ OrigOperands.push_back(std::move(FinalOperands[i]));
+
+ return false;
+}
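+
+// Worked example (illustrative, not part of the original comments): for the
+// Intel-syntax form "movsb [eax], [ebx]", the explicit memory operands only
+// determine the operand size; the bases are rewritten to the canonical
+// (R|E)SI/(R|E)DI pair and a warning is emitted because the written
+// registers differ from the ones actually used.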
+
+bool X86AsmParser::ParseOperand(OperandVector &Operands) {
+ if (isParsingIntelSyntax())
+ return ParseIntelOperand(Operands);
+
+ return ParseATTOperand(Operands);
+}
+
+bool X86AsmParser::CreateMemForMSInlineAsm(
+ unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg,
+ unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier,
+ const InlineAsmIdentifierInfo &Info, OperandVector &Operands) {
+ // If we found a decl other than a VarDecl, then assume it is a FuncDecl or
+ // some other label reference.
+ if (Info.isKind(InlineAsmIdentifierInfo::IK_Label)) {
+ // Insert an explicit size if the user didn't have one.
+ if (!Size) {
+ Size = getPointerWidth();
+ InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start,
+ /*Len=*/0, Size);
+ }
+ // Create an absolute memory reference in order to match against
+ // instructions taking a PC relative operand.
+ Operands.push_back(X86Operand::CreateMem(getPointerWidth(), Disp, Start,
+ End, Size, Identifier,
+ Info.Label.Decl));
+ return false;
+ }
+ // We either have a direct symbol reference, or an offset from a symbol. The
+ // parser always puts the symbol on the LHS, so look there for size
+ // calculation purposes.
+ unsigned FrontendSize = 0;
+ void *Decl = nullptr;
+ bool IsGlobalLV = false;
+ if (Info.isKind(InlineAsmIdentifierInfo::IK_Var)) {
+ // Size is in terms of bits in this context.
+ FrontendSize = Info.Var.Type * 8;
+ Decl = Info.Var.Decl;
+ IsGlobalLV = Info.Var.IsGlobalLV;
+ }
+ // It is common for MS InlineAsm to use a global variable and one or two
+ // registers in a memory expression, even though such an expression cannot
+ // be addressed relative to rip/eip.
+ if (IsGlobalLV && (BaseReg || IndexReg)) {
+ Operands.push_back(
+ X86Operand::CreateMem(getPointerWidth(), Disp, Start, End));
+ return false;
+ }
+ // Otherwise, we set the base register to a non-zero value
+ // if we don't know the actual value at this time. This is necessary to
+ // get the matching correct in some cases.
+ BaseReg = BaseReg ? BaseReg : 1;
+ Operands.push_back(X86Operand::CreateMem(
+ getPointerWidth(), SegReg, Disp, BaseReg, IndexReg, Scale, Start, End,
+ Size,
+ /*DefaultBaseReg=*/X86::RIP, Identifier, Decl, FrontendSize));
+ return false;
+}
+
+// Some binary bitwise operators have named synonyms. Query a candidate
+// string for being such a named operator and, if so, invoke the appropriate
+// handler.
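+// Illustrative inputs (assumed examples): "mov eax, 1 shl 2" and
+// "mov eax, 5 mod 2" reach this point with Name == "shl" / "mod".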
+bool X86AsmParser::ParseIntelNamedOperator(StringRef Name,
+ IntelExprStateMachine &SM,
+ bool &ParseError, SMLoc &End) {
+ // A named operator should be either all lower case or all upper case, but
+ // not a mix... except in MASM, which uses full case-insensitivity.
+ if (Name.compare(Name.lower()) && Name.compare(Name.upper()) &&
+ !getParser().isParsingMasm())
+ return false;
+ if (Name.equals_lower("not")) {
+ SM.onNot();
+ } else if (Name.equals_lower("or")) {
+ SM.onOr();
+ } else if (Name.equals_lower("shl")) {
+ SM.onLShift();
+ } else if (Name.equals_lower("shr")) {
+ SM.onRShift();
+ } else if (Name.equals_lower("xor")) {
+ SM.onXor();
+ } else if (Name.equals_lower("and")) {
+ SM.onAnd();
+ } else if (Name.equals_lower("mod")) {
+ SM.onMod();
+ } else if (Name.equals_lower("offset")) {
+ SMLoc OffsetLoc = getTok().getLoc();
+ const MCExpr *Val = nullptr;
+ StringRef ID;
+ InlineAsmIdentifierInfo Info;
+ ParseError = ParseIntelOffsetOperator(Val, ID, Info, End);
+ if (ParseError)
+ return true;
+ StringRef ErrMsg;
+ ParseError =
+ SM.onOffset(Val, OffsetLoc, ID, Info, isParsingMSInlineAsm(), ErrMsg);
+ if (ParseError)
+ return Error(SMLoc::getFromPointer(Name.data()), ErrMsg);
+ } else {
+ return false;
+ }
+ if (!Name.equals_lower("offset"))
+ End = consumeToken();
+ return true;
+}
+bool X86AsmParser::ParseMasmNamedOperator(StringRef Name,
+ IntelExprStateMachine &SM,
+ bool &ParseError, SMLoc &End) {
+ if (Name.equals_lower("eq")) {
+ SM.onEq();
+ } else if (Name.equals_lower("ne")) {
+ SM.onNE();
+ } else if (Name.equals_lower("lt")) {
+ SM.onLT();
+ } else if (Name.equals_lower("le")) {
+ SM.onLE();
+ } else if (Name.equals_lower("gt")) {
+ SM.onGT();
+ } else if (Name.equals_lower("ge")) {
+ SM.onGE();
+ } else {
+ return false;
+ }
+ End = consumeToken();
+ return true;
+}
+
+bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
+ MCAsmParser &Parser = getParser();
+ StringRef ErrMsg;
+
+ AsmToken::TokenKind PrevTK = AsmToken::Error;
+ bool Done = false;
+ while (!Done) {
+ // Get a fresh reference on each loop iteration in case the previous
+ // iteration moved the token storage during UnLex().
+ const AsmToken &Tok = Parser.getTok();
+
+ bool UpdateLocLex = true;
+ AsmToken::TokenKind TK = getLexer().getKind();
+
+ switch (TK) {
+ default:
+ if ((Done = SM.isValidEndState()))
+ break;
+ return Error(Tok.getLoc(), "unknown token in expression");
+ case AsmToken::Error:
+ return Error(getLexer().getErrLoc(), getLexer().getErr());
+ break;
+ case AsmToken::EndOfStatement:
+ Done = true;
+ break;
+ case AsmToken::Real:
+ // DotOperator: [ebx].0
+ UpdateLocLex = false;
+ if (ParseIntelDotOperator(SM, End))
+ return true;
+ break;
+ case AsmToken::Dot:
+ if (!Parser.isParsingMasm()) {
+ if ((Done = SM.isValidEndState()))
+ break;
+ return Error(Tok.getLoc(), "unknown token in expression");
+ }
+ // MASM allows spaces around the dot operator (e.g., "var . x")
+ Lex();
+ UpdateLocLex = false;
+ if (ParseIntelDotOperator(SM, End))
+ return true;
+ break;
+ case AsmToken::Dollar:
+ if (!Parser.isParsingMasm()) {
+ if ((Done = SM.isValidEndState()))
+ break;
+ return Error(Tok.getLoc(), "unknown token in expression");
+ }
+ LLVM_FALLTHROUGH;
+ case AsmToken::String: {
+ if (Parser.isParsingMasm()) {
+ // MASM parsers handle strings in expressions as constants.
+ SMLoc ValueLoc = Tok.getLoc();
+ int64_t Res;
+ const MCExpr *Val;
+ if (Parser.parsePrimaryExpr(Val, End, nullptr))
+ return true;
+ UpdateLocLex = false;
+ if (!Val->evaluateAsAbsolute(Res, getStreamer().getAssemblerPtr()))
+ return Error(ValueLoc, "expected absolute value");
+ if (SM.onInteger(Res, ErrMsg))
+ return Error(ValueLoc, ErrMsg);
+ break;
+ }
+ LLVM_FALLTHROUGH;
+ }
+ case AsmToken::At:
+ case AsmToken::Identifier: {
+ SMLoc IdentLoc = Tok.getLoc();
+ StringRef Identifier = Tok.getString();
+ UpdateLocLex = false;
+ if (Parser.isParsingMasm()) {
+ size_t DotOffset = Identifier.find_first_of('.');
+ if (DotOffset != StringRef::npos) {
+ consumeToken();
+ StringRef LHS = Identifier.slice(0, DotOffset);
+ StringRef Dot = Identifier.slice(DotOffset, DotOffset + 1);
+ StringRef RHS = Identifier.slice(DotOffset + 1, StringRef::npos);
+ if (!RHS.empty()) {
+ getLexer().UnLex(AsmToken(AsmToken::Identifier, RHS));
+ }
+ getLexer().UnLex(AsmToken(AsmToken::Dot, Dot));
+ if (!LHS.empty()) {
+ getLexer().UnLex(AsmToken(AsmToken::Identifier, LHS));
+ }
+ break;
+ }
+ }
+ // (MASM only) <TYPE> PTR operator
+ if (Parser.isParsingMasm()) {
+ const AsmToken &NextTok = getLexer().peekTok();
+ if (NextTok.is(AsmToken::Identifier) &&
+ NextTok.getIdentifier().equals_lower("ptr")) {
+ AsmTypeInfo Info;
+ if (Parser.lookUpType(Identifier, Info))
+ return Error(Tok.getLoc(), "unknown type");
+ SM.onCast(Info);
+ // Eat type and PTR.
+ consumeToken();
+ End = consumeToken();
+ break;
+ }
+ }
+ // Register, or (MASM only) <register>.<field>
+ unsigned Reg;
+ if (Tok.is(AsmToken::Identifier)) {
+ if (!ParseRegister(Reg, IdentLoc, End, /*RestoreOnFailure=*/true)) {
+ if (SM.onRegister(Reg, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
+ break;
+ }
+ if (Parser.isParsingMasm()) {
+ const std::pair<StringRef, StringRef> IDField =
+ Tok.getString().split('.');
+ const StringRef ID = IDField.first, Field = IDField.second;
+ SMLoc IDEndLoc = SMLoc::getFromPointer(ID.data() + ID.size());
+ if (!Field.empty() &&
+ !MatchRegisterByName(Reg, ID, IdentLoc, IDEndLoc)) {
+ if (SM.onRegister(Reg, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
+
+ AsmFieldInfo Info;
+ SMLoc FieldStartLoc = SMLoc::getFromPointer(Field.data());
+ if (Parser.lookUpField(Field, Info))
+ return Error(FieldStartLoc, "unknown offset");
+ else if (SM.onPlus(ErrMsg))
+ return Error(getTok().getLoc(), ErrMsg);
+ else if (SM.onInteger(Info.Offset, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
+ SM.setTypeInfo(Info.Type);
+
+ End = consumeToken();
+ break;
+ }
+ }
+ }
+ // Operator synonyms ("not", "or", etc.)
+ bool ParseError = false;
+ if (ParseIntelNamedOperator(Identifier, SM, ParseError, End)) {
+ if (ParseError)
+ return true;
+ break;
+ }
+ if (Parser.isParsingMasm() &&
+ ParseMasmNamedOperator(Identifier, SM, ParseError, End)) {
+ if (ParseError)
+ return true;
+ break;
+ }
+ // Symbol reference, when parsing assembly content
+ InlineAsmIdentifierInfo Info;
+ AsmFieldInfo FieldInfo;
+ const MCExpr *Val;
+ if (isParsingMSInlineAsm() || Parser.isParsingMasm()) {
+ // MS Dot Operator expression
+ if (Identifier.count('.') &&
+ (PrevTK == AsmToken::RBrac || PrevTK == AsmToken::RParen)) {
+ if (ParseIntelDotOperator(SM, End))
+ return true;
+ break;
+ }
+ }
+ if (isParsingMSInlineAsm()) {
+ // MS InlineAsm operators (TYPE/LENGTH/SIZE)
+ if (unsigned OpKind = IdentifyIntelInlineAsmOperator(Identifier)) {
+ if (int64_t Val = ParseIntelInlineAsmOperator(OpKind)) {
+ if (SM.onInteger(Val, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
+ } else {
+ return true;
+ }
+ break;
+ }
+ // MS InlineAsm identifier
+ // Call parseIdentifier() to combine @ with the identifier behind it.
+ if (TK == AsmToken::At && Parser.parseIdentifier(Identifier))
+ return Error(IdentLoc, "expected identifier");
+ if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info, false, End))
+ return true;
+ else if (SM.onIdentifierExpr(Val, Identifier, Info, FieldInfo.Type,
+ true, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
+ break;
+ }
+ if (Parser.isParsingMasm()) {
+ if (unsigned OpKind = IdentifyMasmOperator(Identifier)) {
+ int64_t Val;
+ if (ParseMasmOperator(OpKind, Val))
+ return true;
+ if (SM.onInteger(Val, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
+ break;
+ }
+ if (!getParser().lookUpType(Identifier, FieldInfo.Type)) {
+ // Field offset immediate; <TYPE>.<field specification>
+ Lex(); // eat type
+ bool EndDot = parseOptionalToken(AsmToken::Dot);
+ while (EndDot || (getTok().is(AsmToken::Identifier) &&
+ getTok().getString().startswith("."))) {
+ getParser().parseIdentifier(Identifier);
+ if (!EndDot)
+ Identifier.consume_front(".");
+ EndDot = Identifier.consume_back(".");
+ if (getParser().lookUpField(FieldInfo.Type.Name, Identifier,
+ FieldInfo)) {
+ SMLoc IDEnd =
+ SMLoc::getFromPointer(Identifier.data() + Identifier.size());
+ return Error(IdentLoc, "Unable to lookup field reference!",
+ SMRange(IdentLoc, IDEnd));
+ }
+ if (!EndDot)
+ EndDot = parseOptionalToken(AsmToken::Dot);
+ }
+ if (SM.onInteger(FieldInfo.Offset, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
+ break;
+ }
+ }
+ if (getParser().parsePrimaryExpr(Val, End, &FieldInfo.Type)) {
+ return Error(Tok.getLoc(), "Unexpected identifier!");
+ } else if (SM.onIdentifierExpr(Val, Identifier, Info, FieldInfo.Type,
+ false, ErrMsg)) {
+ return Error(IdentLoc, ErrMsg);
+ }
+ break;
+ }
+ case AsmToken::Integer: {
+ // Look for 'b' or 'f' following an Integer as a directional label
+ SMLoc Loc = getTok().getLoc();
+ int64_t IntVal = getTok().getIntVal();
+ End = consumeToken();
+ UpdateLocLex = false;
+ if (getLexer().getKind() == AsmToken::Identifier) {
+ StringRef IDVal = getTok().getString();
+ if (IDVal == "f" || IDVal == "b") {
+ MCSymbol *Sym =
+ getContext().getDirectionalLocalSymbol(IntVal, IDVal == "b");
+ MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+ const MCExpr *Val =
+ MCSymbolRefExpr::create(Sym, Variant, getContext());
+ if (IDVal == "b" && Sym->isUndefined())
+ return Error(Loc, "invalid reference to undefined symbol");
+ StringRef Identifier = Sym->getName();
+ InlineAsmIdentifierInfo Info;
+ AsmTypeInfo Type;
+ if (SM.onIdentifierExpr(Val, Identifier, Info, Type,
+ isParsingMSInlineAsm(), ErrMsg))
+ return Error(Loc, ErrMsg);
+ End = consumeToken();
+ } else {
+ if (SM.onInteger(IntVal, ErrMsg))
+ return Error(Loc, ErrMsg);
+ }
+ } else {
+ if (SM.onInteger(IntVal, ErrMsg))
+ return Error(Loc, ErrMsg);
+ }
+ break;
+ }
+ case AsmToken::Plus:
+ if (SM.onPlus(ErrMsg))
+ return Error(getTok().getLoc(), ErrMsg);
+ break;
+ case AsmToken::Minus:
+ if (SM.onMinus(ErrMsg))
+ return Error(getTok().getLoc(), ErrMsg);
+ break;
+ case AsmToken::Tilde: SM.onNot(); break;
+ case AsmToken::Star: SM.onStar(); break;
+ case AsmToken::Slash: SM.onDivide(); break;
+ case AsmToken::Percent: SM.onMod(); break;
+ case AsmToken::Pipe: SM.onOr(); break;
+ case AsmToken::Caret: SM.onXor(); break;
+ case AsmToken::Amp: SM.onAnd(); break;
+ case AsmToken::LessLess:
+ SM.onLShift(); break;
+ case AsmToken::GreaterGreater:
+ SM.onRShift(); break;
+ case AsmToken::LBrac:
+ if (SM.onLBrac())
+ return Error(Tok.getLoc(), "unexpected bracket encountered");
+ break;
+ case AsmToken::RBrac:
+ if (SM.onRBrac())
+ return Error(Tok.getLoc(), "unexpected bracket encountered");
+ break;
+ case AsmToken::LParen: SM.onLParen(); break;
+ case AsmToken::RParen: SM.onRParen(); break;
+ }
+ if (SM.hadError())
+ return Error(Tok.getLoc(), "unknown token in expression");
+
+ if (!Done && UpdateLocLex)
+ End = consumeToken();
+
+ PrevTK = TK;
+ }
+ return false;
+}
+
+void X86AsmParser::RewriteIntelExpression(IntelExprStateMachine &SM,
+ SMLoc Start, SMLoc End) {
+ SMLoc Loc = Start;
+ unsigned ExprLen = End.getPointer() - Start.getPointer();
+ // Skip everything before a symbol displacement (if we have one)
+ if (SM.getSym() && !SM.isOffsetOperator()) {
+ StringRef SymName = SM.getSymName();
+ if (unsigned Len = SymName.data() - Start.getPointer())
+ InstInfo->AsmRewrites->emplace_back(AOK_Skip, Start, Len);
+ Loc = SMLoc::getFromPointer(SymName.data() + SymName.size());
+ ExprLen = End.getPointer() - (SymName.data() + SymName.size());
+ // If we have only a symbol, then there's no need for a complex rewrite;
+ // simply skip everything after it.
+ if (!(SM.getBaseReg() || SM.getIndexReg() || SM.getImm())) {
+ if (ExprLen)
+ InstInfo->AsmRewrites->emplace_back(AOK_Skip, Loc, ExprLen);
+ return;
+ }
+ }
+ // Build an Intel Expression rewrite
+ StringRef BaseRegStr;
+ StringRef IndexRegStr;
+ StringRef OffsetNameStr;
+ if (SM.getBaseReg())
+ BaseRegStr = X86IntelInstPrinter::getRegisterName(SM.getBaseReg());
+ if (SM.getIndexReg())
+ IndexRegStr = X86IntelInstPrinter::getRegisterName(SM.getIndexReg());
+ if (SM.isOffsetOperator())
+ OffsetNameStr = SM.getSymName();
+ // Emit it
+ IntelExpr Expr(BaseRegStr, IndexRegStr, SM.getScale(), OffsetNameStr,
+ SM.getImm(), SM.isMemExpr());
+ InstInfo->AsmRewrites->emplace_back(Loc, ExprLen, Expr);
+}
+
+// Inline assembly may use variable names with namespace alias qualifiers.
+bool X86AsmParser::ParseIntelInlineAsmIdentifier(
+ const MCExpr *&Val, StringRef &Identifier, InlineAsmIdentifierInfo &Info,
+ bool IsUnevaluatedOperand, SMLoc &End, bool IsParsingOffsetOperator) {
+ MCAsmParser &Parser = getParser();
+ assert(isParsingMSInlineAsm() && "Expected to be parsing inline assembly.");
+ Val = nullptr;
+
+ StringRef LineBuf(Identifier.data());
+ SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand);
+
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc Loc = Tok.getLoc();
+
+ // Advance the token stream until the end of the current token is
+ // after the end of what the frontend claimed.
+ const char *EndPtr = Tok.getLoc().getPointer() + LineBuf.size();
+ do {
+ End = Tok.getEndLoc();
+ getLexer().Lex();
+ } while (End.getPointer() < EndPtr);
+ Identifier = LineBuf;
+
+ // The frontend should end parsing on an assembler token boundary, unless it
+ // failed parsing.
+ assert((End.getPointer() == EndPtr ||
+ Info.isKind(InlineAsmIdentifierInfo::IK_Invalid)) &&
+ "frontend claimed part of a token?");
+
+ // If the identifier lookup was unsuccessful, assume that we are dealing with
+ // a label.
+ if (Info.isKind(InlineAsmIdentifierInfo::IK_Invalid)) {
+ StringRef InternalName =
+ SemaCallback->LookupInlineAsmLabel(Identifier, getSourceManager(),
+ Loc, false);
+ assert(InternalName.size() && "We should have an internal name here.");
+ // Push a rewrite for replacing the identifier name with the internal name,
+ // unless we are parsing the operand of an offset operator
+ if (!IsParsingOffsetOperator)
+ InstInfo->AsmRewrites->emplace_back(AOK_Label, Loc, Identifier.size(),
+ InternalName);
+ else
+ Identifier = InternalName;
+ } else if (Info.isKind(InlineAsmIdentifierInfo::IK_EnumVal))
+ return false;
+ // Create the symbol reference.
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
+ MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+ Val = MCSymbolRefExpr::create(Sym, Variant, getParser().getContext());
+ return false;
+}
+
+// ParseRoundingModeOp - Parse an AVX-512 rounding mode operand.
+bool X86AsmParser::ParseRoundingModeOp(SMLoc Start, OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+ // Eat "{" and mark the current place.
+ const SMLoc consumedToken = consumeToken();
+ if (Tok.isNot(AsmToken::Identifier))
+ return Error(Tok.getLoc(), "Expected an identifier after {");
+ if (Tok.getIdentifier().startswith("r")){
+ int rndMode = StringSwitch<int>(Tok.getIdentifier())
+ .Case("rn", X86::STATIC_ROUNDING::TO_NEAREST_INT)
+ .Case("rd", X86::STATIC_ROUNDING::TO_NEG_INF)
+ .Case("ru", X86::STATIC_ROUNDING::TO_POS_INF)
+ .Case("rz", X86::STATIC_ROUNDING::TO_ZERO)
+ .Default(-1);
+ if (-1 == rndMode)
+ return Error(Tok.getLoc(), "Invalid rounding mode.");
+ Parser.Lex(); // Eat "r*" of r*-sae
+ if (!getLexer().is(AsmToken::Minus))
+ return Error(Tok.getLoc(), "Expected - at this point");
+ Parser.Lex(); // Eat "-"
+ Parser.Lex(); // Eat the sae
+ if (!getLexer().is(AsmToken::RCurly))
+ return Error(Tok.getLoc(), "Expected } at this point");
+ SMLoc End = Tok.getEndLoc();
+ Parser.Lex(); // Eat "}"
+ const MCExpr *RndModeOp =
+ MCConstantExpr::create(rndMode, Parser.getContext());
+ Operands.push_back(X86Operand::CreateImm(RndModeOp, Start, End));
+ return false;
+ }
+ if(Tok.getIdentifier().equals("sae")){
+ Parser.Lex(); // Eat the sae
+ if (!getLexer().is(AsmToken::RCurly))
+ return Error(Tok.getLoc(), "Expected } at this point");
+ Parser.Lex(); // Eat "}"
+ Operands.push_back(X86Operand::CreateToken("{sae}", consumedToken));
+ return false;
+ }
+ return Error(Tok.getLoc(), "unknown token in expression");
+}
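+
+// Accepted forms, as handled above (illustrative): "{rn-sae}", "{rd-sae}",
+// "{ru-sae}" and "{rz-sae}" become an immediate rounding-mode operand, while
+// a bare "{sae}" becomes the "{sae}" token operand.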
+
+/// Parse the '.' operator.
+bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM,
+ SMLoc &End) {
+ const AsmToken &Tok = getTok();
+ AsmFieldInfo Info;
+
+ // Drop the optional '.'.
+ StringRef DotDispStr = Tok.getString();
+ if (DotDispStr.startswith("."))
+ DotDispStr = DotDispStr.drop_front(1);
+ StringRef TrailingDot;
+
+ // .Imm gets lexed as a real.
+ if (Tok.is(AsmToken::Real)) {
+ APInt DotDisp;
+ DotDispStr.getAsInteger(10, DotDisp);
+ Info.Offset = DotDisp.getZExtValue();
+ } else if ((isParsingMSInlineAsm() || getParser().isParsingMasm()) &&
+ Tok.is(AsmToken::Identifier)) {
+ if (DotDispStr.endswith(".")) {
+ TrailingDot = DotDispStr.substr(DotDispStr.size() - 1);
+ DotDispStr = DotDispStr.drop_back(1);
+ }
+ const std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.');
+ const StringRef Base = BaseMember.first, Member = BaseMember.second;
+ if (getParser().lookUpField(SM.getType(), DotDispStr, Info) &&
+ getParser().lookUpField(SM.getSymName(), DotDispStr, Info) &&
+ getParser().lookUpField(DotDispStr, Info) &&
+ (!SemaCallback ||
+ SemaCallback->LookupInlineAsmField(Base, Member, Info.Offset)))
+ return Error(Tok.getLoc(), "Unable to lookup field reference!");
+ } else {
+ return Error(Tok.getLoc(), "Unexpected token type!");
+ }
+
+ // Eat the DotExpression and update End
+ End = SMLoc::getFromPointer(DotDispStr.data());
+ const char *DotExprEndLoc = DotDispStr.data() + DotDispStr.size();
+ while (Tok.getLoc().getPointer() < DotExprEndLoc)
+ Lex();
+ if (!TrailingDot.empty())
+ getLexer().UnLex(AsmToken(AsmToken::Dot, TrailingDot));
+ SM.addImm(Info.Offset);
+ SM.setTypeInfo(Info.Type);
+ return false;
+}
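+
+// Illustrative inputs: "[ebx].4" folds the literal 4 into the displacement;
+// under MASM or MS inline asm, "[ebx].myStruct.myField" (hypothetical names)
+// resolves the field offset through the lookups above.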
+
+/// Parse the 'offset' operator.
+/// This operator is used to specify the location of a given operand
+bool X86AsmParser::ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID,
+ InlineAsmIdentifierInfo &Info,
+ SMLoc &End) {
+ // Eat offset, mark start of identifier.
+ SMLoc Start = Lex().getLoc();
+ ID = getTok().getString();
+ if (!isParsingMSInlineAsm()) {
+ if ((getTok().isNot(AsmToken::Identifier) &&
+ getTok().isNot(AsmToken::String)) ||
+ getParser().parsePrimaryExpr(Val, End, nullptr))
+ return Error(Start, "unexpected token!");
+ } else if (ParseIntelInlineAsmIdentifier(Val, ID, Info, false, End, true)) {
+ return Error(Start, "unable to lookup expression");
+ } else if (Info.isKind(InlineAsmIdentifierInfo::IK_EnumVal)) {
+ return Error(Start, "offset operator cannot yet handle constants");
+ }
+ return false;
+}
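+
+// Illustrative input: "mov eax, offset myVar" (myVar being a hypothetical
+// symbol) ends up here, producing a symbol reference whose address is used
+// rather than its contents.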
+
+// Query a candidate string for being an Intel assembly operator.
+// Report back its kind, or IOK_INVALID if it does not evaluate to a known one.
+unsigned X86AsmParser::IdentifyIntelInlineAsmOperator(StringRef Name) {
+ return StringSwitch<unsigned>(Name)
+ .Cases("TYPE","type",IOK_TYPE)
+ .Cases("SIZE","size",IOK_SIZE)
+ .Cases("LENGTH","length",IOK_LENGTH)
+ .Default(IOK_INVALID);
+}
+
+/// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators. The LENGTH operator
+/// returns the number of elements in an array. It returns the value 1 for
+/// non-array variables. The SIZE operator returns the size of a C or C++
+/// variable. A variable's size is the product of its LENGTH and TYPE. The
+/// TYPE operator returns the size of a C or C++ type or variable. If the
+/// variable is an array, TYPE returns the size of a single element.
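+// Illustrative values (assuming a 4-byte int): for "int arr[10]" in the
+// enclosing C/C++ code, LENGTH arr == 10, TYPE arr == 4, SIZE arr == 40.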
+unsigned X86AsmParser::ParseIntelInlineAsmOperator(unsigned OpKind) {
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+ Parser.Lex(); // Eat operator.
+
+ const MCExpr *Val = nullptr;
+ InlineAsmIdentifierInfo Info;
+ SMLoc Start = Tok.getLoc(), End;
+ StringRef Identifier = Tok.getString();
+ if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info,
+ /*IsUnevaluatedOperand=*/true, End))
+ return 0;
+
+ if (!Info.isKind(InlineAsmIdentifierInfo::IK_Var)) {
+ Error(Start, "unable to lookup expression");
+ return 0;
+ }
+
+ unsigned CVal = 0;
+ switch(OpKind) {
+ default: llvm_unreachable("Unexpected operand kind!");
+ case IOK_LENGTH: CVal = Info.Var.Length; break;
+ case IOK_SIZE: CVal = Info.Var.Size; break;
+ case IOK_TYPE: CVal = Info.Var.Type; break;
+ }
+
+ return CVal;
+}
+
+// Query a candidate string for being a MASM operator.
+// Report back its kind, or MOK_INVALID if it does not evaluate to a known one.
+unsigned X86AsmParser::IdentifyMasmOperator(StringRef Name) {
+ return StringSwitch<unsigned>(Name.lower())
+ .Case("type", MOK_TYPE)
+ .Cases("size", "sizeof", MOK_SIZEOF)
+ .Cases("length", "lengthof", MOK_LENGTHOF)
+ .Default(MOK_INVALID);
+}
+
+/// Parse the 'LENGTHOF', 'SIZEOF', and 'TYPE' operators. The LENGTHOF operator
+/// returns the number of elements in an array. It returns the value 1 for
+/// non-array variables. The SIZEOF operator returns the size of a type or
+/// variable in bytes. A variable's size is the product of its LENGTH and TYPE.
+/// The TYPE operator returns the size of a variable. If the variable is an
+/// array, TYPE returns the size of a single element.
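+// Illustrative values (assumed MASM data definition "arr DWORD 10 DUP (?)"):
+// LENGTHOF arr == 10, TYPE arr == 4, SIZEOF arr == 40.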
+bool X86AsmParser::ParseMasmOperator(unsigned OpKind, int64_t &Val) {
+ MCAsmParser &Parser = getParser();
+ SMLoc OpLoc = Parser.getTok().getLoc();
+ Parser.Lex(); // Eat operator.
+
+ Val = 0;
+ if (OpKind == MOK_SIZEOF || OpKind == MOK_TYPE) {
+ // Check for SIZEOF(<type>) and TYPE(<type>).
+ bool InParens = Parser.getTok().is(AsmToken::LParen);
+ const AsmToken &IDTok = InParens ? getLexer().peekTok() : Parser.getTok();
+ AsmTypeInfo Type;
+ if (IDTok.is(AsmToken::Identifier) &&
+ !Parser.lookUpType(IDTok.getIdentifier(), Type)) {
+ Val = Type.Size;
+
+ // Eat tokens.
+ if (InParens)
+ parseToken(AsmToken::LParen);
+ parseToken(AsmToken::Identifier);
+ if (InParens)
+ parseToken(AsmToken::RParen);
+ }
+ }
+
+ if (!Val) {
+ IntelExprStateMachine SM;
+ SMLoc End, Start = Parser.getTok().getLoc();
+ if (ParseIntelExpression(SM, End))
+ return true;
+
+ switch (OpKind) {
+ default:
+ llvm_unreachable("Unexpected operand kind!");
+ case MOK_SIZEOF:
+ Val = SM.getSize();
+ break;
+ case MOK_LENGTHOF:
+ Val = SM.getLength();
+ break;
+ case MOK_TYPE:
+ Val = SM.getElementSize();
+ break;
+ }
+
+ if (!Val)
+ return Error(OpLoc, "expression has unknown type", SMRange(Start, End));
+ }
+
+ return false;
+}
+
+bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) {
+ Size = StringSwitch<unsigned>(getTok().getString())
+ .Cases("BYTE", "byte", 8)
+ .Cases("WORD", "word", 16)
+ .Cases("DWORD", "dword", 32)
+ .Cases("FLOAT", "float", 32)
+ .Cases("LONG", "long", 32)
+ .Cases("FWORD", "fword", 48)
+ .Cases("DOUBLE", "double", 64)
+ .Cases("QWORD", "qword", 64)
+ .Cases("MMWORD","mmword", 64)
+ .Cases("XWORD", "xword", 80)
+ .Cases("TBYTE", "tbyte", 80)
+ .Cases("XMMWORD", "xmmword", 128)
+ .Cases("YMMWORD", "ymmword", 256)
+ .Cases("ZMMWORD", "zmmword", 512)
+ .Default(0);
+ if (Size) {
+ const AsmToken &Tok = Lex(); // Eat operand size (e.g., byte, word).
+ if (!(Tok.getString().equals("PTR") || Tok.getString().equals("ptr")))
+ return Error(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!");
+ Lex(); // Eat ptr.
+ }
+ return false;
+}
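+
+// Illustrative mappings from the table above: "byte ptr [eax]" gives
+// Size == 8, "qword ptr [rax]" gives 64, "zmmword ptr [rax]" gives 512.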
+
+bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc Start, End;
+
+ // Parse optional Size directive.
+ unsigned Size;
+ if (ParseIntelMemoryOperandSize(Size))
+ return true;
+ bool PtrInOperand = bool(Size);
+
+ Start = Tok.getLoc();
+
+ // Rounding mode operand.
+ if (getLexer().is(AsmToken::LCurly))
+ return ParseRoundingModeOp(Start, Operands);
+
+ // Register operand.
+ unsigned RegNo = 0;
+ if (Tok.is(AsmToken::Identifier) && !ParseRegister(RegNo, Start, End)) {
+ if (RegNo == X86::RIP)
+ return Error(Start, "rip can only be used as a base register");
+ // A Register followed by ':' is considered a segment override
+ if (Tok.isNot(AsmToken::Colon)) {
+ if (PtrInOperand)
+ return Error(Start, "expected memory operand after 'ptr', "
+ "found register operand instead");
+ Operands.push_back(X86Operand::CreateReg(RegNo, Start, End));
+ return false;
+ }
+ // An alleged segment override. Check if we have a valid segment register.
+ if (!X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo))
+ return Error(Start, "invalid segment register");
+ // Eat ':' and update Start location
+ Start = Lex().getLoc();
+ }
+
+ // Immediates and Memory
+ IntelExprStateMachine SM;
+ if (ParseIntelExpression(SM, End))
+ return true;
+
+ if (isParsingMSInlineAsm())
+ RewriteIntelExpression(SM, Start, Tok.getLoc());
+
+ int64_t Imm = SM.getImm();
+ const MCExpr *Disp = SM.getSym();
+ const MCExpr *ImmDisp = MCConstantExpr::create(Imm, getContext());
+ if (Disp && Imm)
+ Disp = MCBinaryExpr::createAdd(Disp, ImmDisp, getContext());
+ if (!Disp)
+ Disp = ImmDisp;
+
+ // A non-zero RegNo at this point holds a valid segment register, and we are
+ // parsing a segment override.
+ if (!SM.isMemExpr() && !RegNo) {
+ if (isParsingMSInlineAsm() && SM.isOffsetOperator()) {
+ const InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
+ if (Info.isKind(InlineAsmIdentifierInfo::IK_Var)) {
+ // Disp includes the address of a variable; make sure this is recorded
+ // for later handling.
+ Operands.push_back(X86Operand::CreateImm(Disp, Start, End,
+ SM.getSymName(), Info.Var.Decl,
+ Info.Var.IsGlobalLV));
+ return false;
+ }
+ }
+
+ Operands.push_back(X86Operand::CreateImm(Disp, Start, End));
+ return false;
+ }
+
+ StringRef ErrMsg;
+ unsigned BaseReg = SM.getBaseReg();
+ unsigned IndexReg = SM.getIndexReg();
+ unsigned Scale = SM.getScale();
+ if (!PtrInOperand)
+ Size = SM.getElementSize() << 3;
+
+ if (Scale == 0 && BaseReg != X86::ESP && BaseReg != X86::RSP &&
+ (IndexReg == X86::ESP || IndexReg == X86::RSP))
+ std::swap(BaseReg, IndexReg);
+
+ // If BaseReg is a vector register and IndexReg is not, swap them unless
+ // Scale was specified in which case it would be an error.
+ if (Scale == 0 &&
+ !(X86MCRegisterClasses[X86::VR128XRegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::VR256XRegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::VR512RegClassID].contains(IndexReg)) &&
+ (X86MCRegisterClasses[X86::VR128XRegClassID].contains(BaseReg) ||
+ X86MCRegisterClasses[X86::VR256XRegClassID].contains(BaseReg) ||
+ X86MCRegisterClasses[X86::VR512RegClassID].contains(BaseReg)))
+ std::swap(BaseReg, IndexReg);
+
+ if (Scale != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg))
+ return Error(Start, "16-bit addresses cannot have a scale");
+
+ // If there was no explicit scale specified, change it to 1.
+ if (Scale == 0)
+ Scale = 1;
+
+ // If this is a 16-bit addressing mode with the base and index in the wrong
+ // order, swap them so CheckBaseRegAndIndexRegAndScale doesn't fail. This
+ // code is shared with AT&T syntax, where the order matters.
+ if ((BaseReg == X86::SI || BaseReg == X86::DI) &&
+ (IndexReg == X86::BX || IndexReg == X86::BP))
+ std::swap(BaseReg, IndexReg);
+
+ if ((BaseReg || IndexReg) &&
+ CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, is64BitMode(),
+ ErrMsg))
+ return Error(Start, ErrMsg);
+ if (isParsingMSInlineAsm())
+ return CreateMemForMSInlineAsm(RegNo, Disp, BaseReg, IndexReg, Scale, Start,
+ End, Size, SM.getSymName(),
+ SM.getIdentifierInfo(), Operands);
+
+ // When parsing x64 MS-style assembly, all memory operands default to
+ // RIP-relative when interpreted as non-absolute references.
+ if (Parser.isParsingMasm() && is64BitMode()) {
+ Operands.push_back(X86Operand::CreateMem(getPointerWidth(), RegNo, Disp,
+ BaseReg, IndexReg, Scale, Start,
+ End, Size,
+ /*DefaultBaseReg=*/X86::RIP));
+ return false;
+ }
+
+ if ((BaseReg || IndexReg || RegNo))
+ Operands.push_back(X86Operand::CreateMem(getPointerWidth(), RegNo, Disp,
+ BaseReg, IndexReg, Scale, Start,
+ End, Size));
+ else
+ Operands.push_back(
+ X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size));
+ return false;
+}
+
+bool X86AsmParser::ParseATTOperand(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ switch (getLexer().getKind()) {
+ case AsmToken::Dollar: {
+ // $42 or $ID -> immediate.
+ SMLoc Start = Parser.getTok().getLoc(), End;
+ Parser.Lex();
+ const MCExpr *Val;
+ // This is an immediate, so we should not parse a register. Do a precheck
+ // for '%' to supersede intra-register parse errors.
+ SMLoc L = Parser.getTok().getLoc();
+ if (check(getLexer().is(AsmToken::Percent), L,
+ "expected immediate expression") ||
+ getParser().parseExpression(Val, End) ||
+ check(isa<X86MCExpr>(Val), L, "expected immediate expression"))
+ return true;
+ Operands.push_back(X86Operand::CreateImm(Val, Start, End));
+ return false;
+ }
+ case AsmToken::LCurly: {
+ SMLoc Start = Parser.getTok().getLoc();
+ return ParseRoundingModeOp(Start, Operands);
+ }
+ default: {
+ // This is a memory operand or a register. We have some parsing complications
+ // as a '(' may be part of an immediate expression or the addressing mode
+ // block. This is complicated by the fact that an assembler-level variable
+ // may refer either to a register or an immediate expression.
+
+ SMLoc Loc = Parser.getTok().getLoc(), EndLoc;
+ const MCExpr *Expr = nullptr;
+ unsigned Reg = 0;
+ if (getLexer().isNot(AsmToken::LParen)) {
+ // No '(' so this is either a displacement expression or a register.
+ if (Parser.parseExpression(Expr, EndLoc))
+ return true;
+ if (auto *RE = dyn_cast<X86MCExpr>(Expr)) {
+ // Segment Register. Reset Expr and copy value to register.
+ Expr = nullptr;
+ Reg = RE->getRegNo();
+
+ // Sanity check register.
+ if (Reg == X86::EIZ || Reg == X86::RIZ)
+ return Error(
+ Loc, "%eiz and %riz can only be used as index registers",
+ SMRange(Loc, EndLoc));
+ if (Reg == X86::RIP)
+ return Error(Loc, "%rip can only be used as a base register",
+ SMRange(Loc, EndLoc));
+ // Return registers that are not segment prefixes immediately.
+ if (!Parser.parseOptionalToken(AsmToken::Colon)) {
+ Operands.push_back(X86Operand::CreateReg(Reg, Loc, EndLoc));
+ return false;
+ }
+ if (!X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(Reg))
+ return Error(Loc, "invalid segment register");
+ // Accept a '*' absolute memory reference after the segment. Place it
+ // before the full memory operand.
+ if (getLexer().is(AsmToken::Star))
+ Operands.push_back(X86Operand::CreateToken("*", consumeToken()));
+ }
+ }
+ // This is a Memory operand.
+ return ParseMemOperand(Reg, Expr, Loc, EndLoc, Operands);
+ }
+ }
+}
+
+// X86::COND_INVALID if not a recognized condition code or alternate mnemonic,
+// otherwise the EFLAGS Condition Code enumerator.
+X86::CondCode X86AsmParser::ParseConditionCode(StringRef CC) {
+ return StringSwitch<X86::CondCode>(CC)
+ .Case("o", X86::COND_O) // Overflow
+ .Case("no", X86::COND_NO) // No Overflow
+ .Cases("b", "nae", X86::COND_B) // Below/Neither Above nor Equal
+ .Cases("ae", "nb", X86::COND_AE) // Above or Equal/Not Below
+ .Cases("e", "z", X86::COND_E) // Equal/Zero
+ .Cases("ne", "nz", X86::COND_NE) // Not Equal/Not Zero
+ .Cases("be", "na", X86::COND_BE) // Below or Equal/Not Above
+ .Cases("a", "nbe", X86::COND_A) // Above/Neither Below nor Equal
+ .Case("s", X86::COND_S) // Sign
+ .Case("ns", X86::COND_NS) // No Sign
+ .Cases("p", "pe", X86::COND_P) // Parity/Parity Even
+ .Cases("np", "po", X86::COND_NP) // No Parity/Parity Odd
+ .Cases("l", "nge", X86::COND_L) // Less/Neither Greater nor Equal
+ .Cases("ge", "nl", X86::COND_GE) // Greater or Equal/Not Less
+ .Cases("le", "ng", X86::COND_LE) // Less or Equal/Not Greater
+ .Cases("g", "nle", X86::COND_G) // Greater/Neither Less nor Equal
+ .Default(X86::COND_INVALID);
+}
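+
+// Illustrative mappings from the table above: the suffixes "ne" and "nz"
+// both map to X86::COND_NE, and "nae" is accepted as an alternate spelling
+// of "b" (X86::COND_B).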
+
+// Returns true on failure, false otherwise.
+// If no {z} mark was found, the parser does not advance.
+bool X86AsmParser::ParseZ(std::unique_ptr<X86Operand> &Z,
+ const SMLoc &StartLoc) {
+ MCAsmParser &Parser = getParser();
+ // Assuming we have just passed the '{' mark, query the next token.
+ // If no {z} is found, return false, as no parsing error was encountered.
+ if (!(getLexer().is(AsmToken::Identifier) &&
+ (getLexer().getTok().getIdentifier() == "z")))
+ return false;
+ Parser.Lex(); // Eat z
+ // Query and eat the '}' mark
+ if (!getLexer().is(AsmToken::RCurly))
+ return Error(getLexer().getLoc(), "Expected } at this point");
+ Parser.Lex(); // Eat '}'
+ // Assign Z the {z} mark operand.
+ Z = X86Operand::CreateToken("{z}", StartLoc);
+ return false;
+}
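+
+// Illustrative input: in "vaddps zmm1 {k1} {z}, zmm2, zmm3" (Intel syntax),
+// the trailing "{z}" is captured here as a "{z}" token operand.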
+
+// Returns true on failure, false otherwise.
+bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ if (getLexer().is(AsmToken::LCurly)) {
+ // Eat "{" and mark the current place.
+ const SMLoc consumedToken = consumeToken();
+ // Distinguish {1to<NUM>} from {%k<NUM>}.
+ if(getLexer().is(AsmToken::Integer)) {
+ // Parse memory broadcasting ({1to<NUM>}).
+ if (getLexer().getTok().getIntVal() != 1)
+ return TokError("Expected 1to<NUM> at this point");
+ StringRef Prefix = getLexer().getTok().getString();
+ Parser.Lex(); // Eat first token of 1to8
+ if (!getLexer().is(AsmToken::Identifier))
+ return TokError("Expected 1to<NUM> at this point");
+ // Recognize only reasonable suffixes.
+ SmallVector<char, 5> BroadcastVector;
+ StringRef BroadcastString = (Prefix + getLexer().getTok().getIdentifier())
+ .toStringRef(BroadcastVector);
+ if (!BroadcastString.startswith("1to"))
+ return TokError("Expected 1to<NUM> at this point");
+ const char *BroadcastPrimitive =
+ StringSwitch<const char *>(BroadcastString)
+ .Case("1to2", "{1to2}")
+ .Case("1to4", "{1to4}")
+ .Case("1to8", "{1to8}")
+ .Case("1to16", "{1to16}")
+ .Default(nullptr);
+ if (!BroadcastPrimitive)
+ return TokError("Invalid memory broadcast primitive.");
+ Parser.Lex(); // Eat trailing token of 1toN
+ if (!getLexer().is(AsmToken::RCurly))
+ return TokError("Expected } at this point");
+ Parser.Lex(); // Eat "}"
+ Operands.push_back(X86Operand::CreateToken(BroadcastPrimitive,
+ consumedToken));
+ // No AVX512-specific primitives can follow a memory broadcast,
+ // so return.
+ return false;
+ } else {
+ // Parse either {k}{z}, {z}{k}, {k}, or {z}; the last one has no meaning,
+ // but GCC accepts it.
+ // At this point we have just passed a '{' mark.
+ std::unique_ptr<X86Operand> Z;
+ if (ParseZ(Z, consumedToken))
+ return true;
+ // Reaching here means that parsing of the alleged '{z}' mark yielded
+ // no errors.
+ // Check whether a {%k<NUM>} mark still needs to be parsed.
+ if (!Z || getLexer().is(AsmToken::LCurly)) {
+ SMLoc StartLoc = Z ? consumeToken() : consumedToken;
+ // Parse an op-mask register mark ({%k<NUM>}), which is now to be
+ // expected
+ unsigned RegNo;
+ SMLoc RegLoc;
+ if (!ParseRegister(RegNo, RegLoc, StartLoc) &&
+ X86MCRegisterClasses[X86::VK1RegClassID].contains(RegNo)) {
+ if (RegNo == X86::K0)
+ return Error(RegLoc, "Register k0 can't be used as write mask");
+ if (!getLexer().is(AsmToken::RCurly))
+ return Error(getLexer().getLoc(), "Expected } at this point");
+ Operands.push_back(X86Operand::CreateToken("{", StartLoc));
+ Operands.push_back(
+ X86Operand::CreateReg(RegNo, StartLoc, StartLoc));
+ Operands.push_back(X86Operand::CreateToken("}", consumeToken()));
+ } else
+ return Error(getLexer().getLoc(),
+ "Expected an op-mask register at this point");
+ // A {%k<NUM>} mark was found; check for a trailing {z}.
+ if (getLexer().is(AsmToken::LCurly) && !Z) {
+ // If we hit a parsing error, or found no (expected) {z} mark,
+ // report an error.
+ if (ParseZ(Z, consumeToken()) || !Z)
+ return Error(getLexer().getLoc(),
+ "Expected a {z} mark at this point");
+
+ }
+ // '{z}' on its own is meaningless and hence should be ignored;
+ // when accompanied by a K register, however, it is allowed.
+ if (Z)
+ Operands.push_back(std::move(Z));
+ }
+ }
+ }
+ return false;
+}
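+
+// Illustrative inputs handled above: the "{1to16}" in
+// "vaddps zmm0, zmm1, dword ptr [rax]{1to16}" becomes a "{1to16}" token, and
+// "{k2}{z}" after a destination becomes the "{", %k2, "}" and "{z}" operands.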
+
+/// ParseMemOperand: 'seg : disp(basereg, indexreg, scale)'. The '%ds:' prefix
+/// has already been parsed if present. disp may be provided as well.
+bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp,
+ SMLoc StartLoc, SMLoc EndLoc,
+ OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ SMLoc Loc;
+ // Based on the initially passed values, we are in one of the following
+ // cases (with the current position marked by (*)):
+
+ // 1. seg : * disp (base-index-scale-expr)
+ // 2. seg : *(disp) (base-index-scale-expr)
+ // 3. seg : *(base-index-scale-expr)
+ // 4. disp *(base-index-scale-expr)
+ // 5. *(disp) (base-index-scale-expr)
+ // 6. *(base-index-scale-expr)
+ // 7. disp *
+ // 8. *(disp)
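+ // Illustrative AT&T inputs for a few of these cases (assumed examples):
+ //   case 4: "12(%ebp)", case 6: "(%eax,%ebx,4)", case 8: "(12345)".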
+
+ // If we do not have a displacement yet, check if we're in cases 4 or 6 by
+ // checking if the first object after the parenthesis is a register (or an
+ // identifier referring to a register) and parse the displacement or default
+ // to 0 as appropriate.
+ auto isAtMemOperand = [this]() {
+ if (this->getLexer().isNot(AsmToken::LParen))
+ return false;
+ AsmToken Buf[2];
+ StringRef Id;
+ auto TokCount = this->getLexer().peekTokens(Buf, true);
+ if (TokCount == 0)
+ return false;
+ switch (Buf[0].getKind()) {
+ case AsmToken::Percent:
+ case AsmToken::Comma:
+ return true;
+ // These lower cases are doing a peekIdentifier.
+ case AsmToken::At:
+ case AsmToken::Dollar:
+ if ((TokCount > 1) &&
+ (Buf[1].is(AsmToken::Identifier) || Buf[1].is(AsmToken::String)) &&
+ (Buf[0].getLoc().getPointer() + 1 == Buf[1].getLoc().getPointer()))
+ Id = StringRef(Buf[0].getLoc().getPointer(),
+ Buf[1].getIdentifier().size() + 1);
+ break;
+ case AsmToken::Identifier:
+ case AsmToken::String:
+ Id = Buf[0].getIdentifier();
+ break;
+ default:
+ return false;
+ }
+ // We have an ID. Check if it is bound to a register.
+ if (!Id.empty()) {
+ MCSymbol *Sym = this->getContext().getOrCreateSymbol(Id);
+ if (Sym->isVariable()) {
+ auto V = Sym->getVariableValue(/*SetUsed*/ false);
+ return isa<X86MCExpr>(V);
+ }
+ }
+ return false;
+ };
+
+ if (!Disp) {
+ // Parse immediate if we're not at a mem operand yet.
+ if (!isAtMemOperand()) {
+ if (Parser.parseTokenLoc(Loc) || Parser.parseExpression(Disp, EndLoc))
+ return true;
+ assert(!isa<X86MCExpr>(Disp) && "Expected non-register here.");
+ } else {
+ // Disp is implicitly zero if we haven't parsed it yet.
+ Disp = MCConstantExpr::create(0, Parser.getContext());
+ }
+ }
+
+ // We are now either at the end of the operand or at the '(' at the start of a
+ // base-index-scale-expr.
+
+ if (!parseOptionalToken(AsmToken::LParen)) {
+ if (SegReg == 0)
+ Operands.push_back(
+ X86Operand::CreateMem(getPointerWidth(), Disp, StartLoc, EndLoc));
+ else
+ Operands.push_back(X86Operand::CreateMem(getPointerWidth(), SegReg, Disp,
+ 0, 0, 1, StartLoc, EndLoc));
+ return false;
+ }
+
+ // If we reached here, then eat the '(' and Process
+ // the rest of the memory operand.
+ unsigned BaseReg = 0, IndexReg = 0, Scale = 1;
+ SMLoc BaseLoc = getLexer().getLoc();
+ const MCExpr *E;
+ StringRef ErrMsg;
+
+ // Parse BaseReg if one is provided.
+ if (getLexer().isNot(AsmToken::Comma) && getLexer().isNot(AsmToken::RParen)) {
+ if (Parser.parseExpression(E, EndLoc) ||
+ check(!isa<X86MCExpr>(E), BaseLoc, "expected register here"))
+ return true;
+
+ // Sanity check register.
+ BaseReg = cast<X86MCExpr>(E)->getRegNo();
+ if (BaseReg == X86::EIZ || BaseReg == X86::RIZ)
+ return Error(BaseLoc, "eiz and riz can only be used as index registers",
+ SMRange(BaseLoc, EndLoc));
+ }
+
+ if (parseOptionalToken(AsmToken::Comma)) {
+ // Following the comma we should have either an index register or a scale
+ // value. We don't support the latter form, but we want to parse it
+ // correctly.
+ //
+ // Even though it would be completely consistent to support syntax like
+ // "1(%eax,,1)", the assembler doesn't. Use "eiz" or "riz" for this.
+ if (getLexer().isNot(AsmToken::RParen)) {
+ if (Parser.parseTokenLoc(Loc) || Parser.parseExpression(E, EndLoc))
+ return true;
+
+ if (!isa<X86MCExpr>(E)) {
+ // We've parsed an unexpected Scale Value instead of an index
+ // register. Interpret it as an absolute.
+ int64_t ScaleVal;
+ if (!E->evaluateAsAbsolute(ScaleVal, getStreamer().getAssemblerPtr()))
+ return Error(Loc, "expected absolute expression");
+ if (ScaleVal != 1)
+ Warning(Loc, "scale factor without index register is ignored");
+ Scale = 1;
+ } else { // IndexReg Found.
+ IndexReg = cast<X86MCExpr>(E)->getRegNo();
+
+ if (BaseReg == X86::RIP)
+ return Error(Loc,
+ "%rip as base register can not have an index register");
+ if (IndexReg == X86::RIP)
+ return Error(Loc, "%rip is not allowed as an index register");
+
+ if (parseOptionalToken(AsmToken::Comma)) {
+ // Parse the scale amount:
+ // ::= ',' [scale-expression]
+
+ // A scale amount without an index is ignored.
+ if (getLexer().isNot(AsmToken::RParen)) {
+ int64_t ScaleVal;
+ if (Parser.parseTokenLoc(Loc) ||
+ Parser.parseAbsoluteExpression(ScaleVal))
+ return Error(Loc, "expected scale expression");
+ Scale = (unsigned)ScaleVal;
+ // Validate the scale amount.
+ if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
+ Scale != 1)
+ return Error(Loc, "scale factor in 16-bit address must be 1");
+ if (checkScale(Scale, ErrMsg))
+ return Error(Loc, ErrMsg);
+ }
+ }
+ }
+ }
+ }
+
+ // Ok, we've eaten the memory operand, verify we have a ')' and eat it too.
+ if (parseToken(AsmToken::RParen, "unexpected token in memory operand"))
+ return true;
+
+ // This is to support the otherwise illegal operand (%dx) found in various
+ // unofficial manual examples (e.g. "out[s]?[bwl]? %al, (%dx)"), which must
+ // be supported. Mark such DX variants separately and fix them up only in
+ // special cases.
+ if (BaseReg == X86::DX && IndexReg == 0 && Scale == 1 && SegReg == 0 &&
+ isa<MCConstantExpr>(Disp) &&
+ cast<MCConstantExpr>(Disp)->getValue() == 0) {
+ Operands.push_back(X86Operand::CreateDXReg(BaseLoc, BaseLoc));
+ return false;
+ }
+
+ if (CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, is64BitMode(),
+ ErrMsg))
+ return Error(BaseLoc, ErrMsg);
+
+ if (SegReg || BaseReg || IndexReg)
+ Operands.push_back(X86Operand::CreateMem(getPointerWidth(), SegReg, Disp,
+ BaseReg, IndexReg, Scale, StartLoc,
+ EndLoc));
+ else
+ Operands.push_back(
+ X86Operand::CreateMem(getPointerWidth(), Disp, StartLoc, EndLoc));
+ return false;
+}
+
+// Parse either a standard primary expression or a register.
+bool X86AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
+ MCAsmParser &Parser = getParser();
+ // See if this is a register first.
+ if (getTok().is(AsmToken::Percent) ||
+ (isParsingIntelSyntax() && getTok().is(AsmToken::Identifier) &&
+ MatchRegisterName(Parser.getTok().getString()))) {
+ SMLoc StartLoc = Parser.getTok().getLoc();
+ unsigned RegNo;
+ if (ParseRegister(RegNo, StartLoc, EndLoc))
+ return true;
+ Res = X86MCExpr::create(RegNo, Parser.getContext());
+ return false;
+ }
+ return Parser.parsePrimaryExpr(Res, EndLoc, nullptr);
+}
+
+bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ InstInfo = &Info;
+
+ // Reset the forced VEX encoding.
+ ForcedVEXEncoding = VEXEncoding_Default;
+ ForcedDispEncoding = DispEncoding_Default;
+
+ // Parse pseudo prefixes.
+ while (1) {
+ if (Name == "{") {
+ if (getLexer().isNot(AsmToken::Identifier))
+ return Error(Parser.getTok().getLoc(), "Unexpected token after '{'");
+ std::string Prefix = Parser.getTok().getString().lower();
+ Parser.Lex(); // Eat identifier.
+ if (getLexer().isNot(AsmToken::RCurly))
+ return Error(Parser.getTok().getLoc(), "Expected '}'");
+ Parser.Lex(); // Eat curly.
+
+ if (Prefix == "vex")
+ ForcedVEXEncoding = VEXEncoding_VEX;
+ else if (Prefix == "vex2")
+ ForcedVEXEncoding = VEXEncoding_VEX2;
+ else if (Prefix == "vex3")
+ ForcedVEXEncoding = VEXEncoding_VEX3;
+ else if (Prefix == "evex")
+ ForcedVEXEncoding = VEXEncoding_EVEX;
+ else if (Prefix == "disp8")
+ ForcedDispEncoding = DispEncoding_Disp8;
+ else if (Prefix == "disp32")
+ ForcedDispEncoding = DispEncoding_Disp32;
+ else
+ return Error(NameLoc, "unknown prefix");
+
+ NameLoc = Parser.getTok().getLoc();
+ if (getLexer().is(AsmToken::LCurly)) {
+ Parser.Lex();
+ Name = "{";
+ } else {
+ if (getLexer().isNot(AsmToken::Identifier))
+ return Error(Parser.getTok().getLoc(), "Expected identifier");
+ // FIXME: The mnemonic won't match correctly if it's not in lower case.
+ Name = Parser.getTok().getString();
+ Parser.Lex();
+ }
+ continue;
+ }
+ // Parse MASM style pseudo prefixes.
+ if (isParsingMSInlineAsm()) {
+ if (Name.equals_lower("vex"))
+ ForcedVEXEncoding = VEXEncoding_VEX;
+ else if (Name.equals_lower("vex2"))
+ ForcedVEXEncoding = VEXEncoding_VEX2;
+ else if (Name.equals_lower("vex3"))
+ ForcedVEXEncoding = VEXEncoding_VEX3;
+ else if (Name.equals_lower("evex"))
+ ForcedVEXEncoding = VEXEncoding_EVEX;
+
+ if (ForcedVEXEncoding != VEXEncoding_Default) {
+ if (getLexer().isNot(AsmToken::Identifier))
+ return Error(Parser.getTok().getLoc(), "Expected identifier");
+ // FIXME: The mnemonic won't match correctly if it's not in lower case.
+ Name = Parser.getTok().getString();
+ NameLoc = Parser.getTok().getLoc();
+ Parser.Lex();
+ }
+ }
+ break;
+ }
+
+ // Support the suffix syntax for overriding displacement size as well.
+ if (Name.consume_back(".d32")) {
+ ForcedDispEncoding = DispEncoding_Disp32;
+ } else if (Name.consume_back(".d8")) {
+ ForcedDispEncoding = DispEncoding_Disp8;
+ }
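+
+ // Illustrative (assumed) usage: writing a mnemonic with a ".d8" or ".d32"
+ // suffix, e.g. a hypothetical "movl.d32 %eax, (%rbx)", forces the chosen
+ // displacement size, mirroring the "{disp8}"/"{disp32}" pseudo prefixes.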
+
+ StringRef PatchedName = Name;
+
+ // Hack to skip "short" following Jcc.
+ if (isParsingIntelSyntax() &&
+ (PatchedName == "jmp" || PatchedName == "jc" || PatchedName == "jnc" ||
+ PatchedName == "jcxz" || PatchedName == "jexcz" ||
+ (PatchedName.startswith("j") &&
+ ParseConditionCode(PatchedName.substr(1)) != X86::COND_INVALID))) {
+ StringRef NextTok = Parser.getTok().getString();
+ if (NextTok == "short") {
+ SMLoc NameEndLoc =
+ NameLoc.getFromPointer(NameLoc.getPointer() + Name.size());
+ // Eat the short keyword.
+ Parser.Lex();
+ // MS and GAS ignore the short keyword; they both determine the jmp type
+ // based on the distance of the label. (NASM does emit different code with
+ // and without "short," though.)
+ InstInfo->AsmRewrites->emplace_back(AOK_Skip, NameEndLoc,
+ NextTok.size() + 1);
+ }
+ }
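+
+ // Illustrative effect: in Intel syntax, "jmp short target" is rewritten so
+ // that it matches exactly like "jmp target" ("target" being a hypothetical
+ // label).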
+
+ // FIXME: Hack to recognize setneb as setne.
+ if (PatchedName.startswith("set") && PatchedName.endswith("b") &&
+ PatchedName != "setb" && PatchedName != "setnb")
+ PatchedName = PatchedName.substr(0, Name.size()-1);
+
+ unsigned ComparisonPredicate = ~0U;
+
+ // FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}.
+ if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) &&
+ (PatchedName.endswith("ss") || PatchedName.endswith("sd") ||
+ PatchedName.endswith("ps") || PatchedName.endswith("pd"))) {
+ bool IsVCMP = PatchedName[0] == 'v';
+ unsigned CCIdx = IsVCMP ? 4 : 3;
+ unsigned CC = StringSwitch<unsigned>(
+ PatchedName.slice(CCIdx, PatchedName.size() - 2))
+ .Case("eq", 0x00)
+ .Case("eq_oq", 0x00)
+ .Case("lt", 0x01)
+ .Case("lt_os", 0x01)
+ .Case("le", 0x02)
+ .Case("le_os", 0x02)
+ .Case("unord", 0x03)
+ .Case("unord_q", 0x03)
+ .Case("neq", 0x04)
+ .Case("neq_uq", 0x04)
+ .Case("nlt", 0x05)
+ .Case("nlt_us", 0x05)
+ .Case("nle", 0x06)
+ .Case("nle_us", 0x06)
+ .Case("ord", 0x07)
+ .Case("ord_q", 0x07)
+ /* AVX only from here */
+ .Case("eq_uq", 0x08)
+ .Case("nge", 0x09)
+ .Case("nge_us", 0x09)
+ .Case("ngt", 0x0A)
+ .Case("ngt_us", 0x0A)
+ .Case("false", 0x0B)
+ .Case("false_oq", 0x0B)
+ .Case("neq_oq", 0x0C)
+ .Case("ge", 0x0D)
+ .Case("ge_os", 0x0D)
+ .Case("gt", 0x0E)
+ .Case("gt_os", 0x0E)
+ .Case("true", 0x0F)
+ .Case("true_uq", 0x0F)
+ .Case("eq_os", 0x10)
+ .Case("lt_oq", 0x11)
+ .Case("le_oq", 0x12)
+ .Case("unord_s", 0x13)
+ .Case("neq_us", 0x14)
+ .Case("nlt_uq", 0x15)
+ .Case("nle_uq", 0x16)
+ .Case("ord_s", 0x17)
+ .Case("eq_us", 0x18)
+ .Case("nge_uq", 0x19)
+ .Case("ngt_uq", 0x1A)
+ .Case("false_os", 0x1B)
+ .Case("neq_os", 0x1C)
+ .Case("ge_oq", 0x1D)
+ .Case("gt_oq", 0x1E)
+ .Case("true_us", 0x1F)
+ .Default(~0U);
+ if (CC != ~0U && (IsVCMP || CC < 8)) {
+ if (PatchedName.endswith("ss"))
+ PatchedName = IsVCMP ? "vcmpss" : "cmpss";
+ else if (PatchedName.endswith("sd"))
+ PatchedName = IsVCMP ? "vcmpsd" : "cmpsd";
+ else if (PatchedName.endswith("ps"))
+ PatchedName = IsVCMP ? "vcmpps" : "cmpps";
+ else if (PatchedName.endswith("pd"))
+ PatchedName = IsVCMP ? "vcmppd" : "cmppd";
+ else
+ llvm_unreachable("Unexpected suffix!");
+
+ ComparisonPredicate = CC;
+ }
+ }
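+
+ // Illustrative rewrites performed above: "vcmpltps" becomes "vcmpps" with
+ // comparison predicate 0x01, and "cmpeqsd" becomes "cmpsd" with predicate
+ // 0x00; the predicate is pushed as an immediate operand later on.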
+
+ // FIXME: Hack to recognize vpcmp<comparison code>{ub,uw,ud,uq,b,w,d,q}.
+ if (PatchedName.startswith("vpcmp") &&
+ (PatchedName.back() == 'b' || PatchedName.back() == 'w' ||
+ PatchedName.back() == 'd' || PatchedName.back() == 'q')) {
+ unsigned SuffixSize = PatchedName.drop_back().back() == 'u' ? 2 : 1;
+ unsigned CC = StringSwitch<unsigned>(
+ PatchedName.slice(5, PatchedName.size() - SuffixSize))
+ .Case("eq", 0x0) // Only allowed on unsigned. Checked below.
+ .Case("lt", 0x1)
+ .Case("le", 0x2)
+ //.Case("false", 0x3) // Not a documented alias.
+ .Case("neq", 0x4)
+ .Case("nlt", 0x5)
+ .Case("nle", 0x6)
+ //.Case("true", 0x7) // Not a documented alias.
+ .Default(~0U);
+ if (CC != ~0U && (CC != 0 || SuffixSize == 2)) {
+ switch (PatchedName.back()) {
+ default: llvm_unreachable("Unexpected character!");
+ case 'b': PatchedName = SuffixSize == 2 ? "vpcmpub" : "vpcmpb"; break;
+ case 'w': PatchedName = SuffixSize == 2 ? "vpcmpuw" : "vpcmpw"; break;
+ case 'd': PatchedName = SuffixSize == 2 ? "vpcmpud" : "vpcmpd"; break;
+ case 'q': PatchedName = SuffixSize == 2 ? "vpcmpuq" : "vpcmpq"; break;
+ }
+ // Set up the immediate to push into the operands later.
+ ComparisonPredicate = CC;
+ }
+ }
+
+ // FIXME: Hack to recognize vpcom<comparison code>{ub,uw,ud,uq,b,w,d,q}.
+ if (PatchedName.startswith("vpcom") &&
+ (PatchedName.back() == 'b' || PatchedName.back() == 'w' ||
+ PatchedName.back() == 'd' || PatchedName.back() == 'q')) {
+ unsigned SuffixSize = PatchedName.drop_back().back() == 'u' ? 2 : 1;
+ unsigned CC = StringSwitch<unsigned>(
+ PatchedName.slice(5, PatchedName.size() - SuffixSize))
+ .Case("lt", 0x0)
+ .Case("le", 0x1)
+ .Case("gt", 0x2)
+ .Case("ge", 0x3)
+ .Case("eq", 0x4)
+ .Case("neq", 0x5)
+ .Case("false", 0x6)
+ .Case("true", 0x7)
+ .Default(~0U);
+ if (CC != ~0U) {
+ switch (PatchedName.back()) {
+ default: llvm_unreachable("Unexpected character!");
+ case 'b': PatchedName = SuffixSize == 2 ? "vpcomub" : "vpcomb"; break;
+ case 'w': PatchedName = SuffixSize == 2 ? "vpcomuw" : "vpcomw"; break;
+ case 'd': PatchedName = SuffixSize == 2 ? "vpcomud" : "vpcomd"; break;
+ case 'q': PatchedName = SuffixSize == 2 ? "vpcomuq" : "vpcomq"; break;
+ }
+ // Set up the immediate to push into the operands later.
+ ComparisonPredicate = CC;
+ }
+ }
+
+
+ // Determine whether this is an instruction prefix.
+ // FIXME:
+ // Enhance the robustness of the prefix integrity checks. For example, the
+ // following forms are currently tolerated:
+ // repz repnz <insn> ; GAS errors for the use of two similar prefixes
+ // lock addq %rax, %rbx ; Destination operand must be of memory type
+ // xacquire <insn> ; xacquire must be accompanied by 'lock'
+ bool IsPrefix =
+ StringSwitch<bool>(Name)
+ .Cases("cs", "ds", "es", "fs", "gs", "ss", true)
+ .Cases("rex64", "data32", "data16", "addr32", "addr16", true)
+ .Cases("xacquire", "xrelease", true)
+ .Cases("acquire", "release", isParsingIntelSyntax())
+ .Default(false);
+
+ auto isLockRepeatNtPrefix = [](StringRef N) {
+ return StringSwitch<bool>(N)
+ .Cases("lock", "rep", "repe", "repz", "repne", "repnz", "notrack", true)
+ .Default(false);
+ };
+
+ bool CurlyAsEndOfStatement = false;
+
+ unsigned Flags = X86::IP_NO_PREFIX;
+ while (isLockRepeatNtPrefix(Name.lower())) {
+ unsigned Prefix =
+ StringSwitch<unsigned>(Name)
+ .Cases("lock", "lock", X86::IP_HAS_LOCK)
+ .Cases("rep", "repe", "repz", X86::IP_HAS_REPEAT)
+ .Cases("repne", "repnz", X86::IP_HAS_REPEAT_NE)
+ .Cases("notrack", "notrack", X86::IP_HAS_NOTRACK)
+ .Default(X86::IP_NO_PREFIX); // Invalid prefix (impossible)
+ Flags |= Prefix;
+ if (getLexer().is(AsmToken::EndOfStatement)) {
+ // We don't have a real instruction with the given prefix;
+ // use the prefix as the instruction.
+ // TODO: there could be several prefixes one after another.
+ Flags = X86::IP_NO_PREFIX;
+ break;
+ }
+ // FIXME: The mnemonic won't match correctly if it's not in lower case.
+ Name = Parser.getTok().getString();
+ Parser.Lex(); // eat the prefix
+ // Hack: we could have something like "rep # some comment" or
+ // "lock; cmpxchg16b $1" or "lock\0A\09incl" or "lock/incl"
+ while (Name.startswith(";") || Name.startswith("\n") ||
+ Name.startswith("#") || Name.startswith("\t") ||
+ Name.startswith("/")) {
+      // FIXME: The mnemonic won't match correctly if it's not in lower case.
+ Name = Parser.getTok().getString();
+ Parser.Lex(); // go to next prefix or instr
+ }
+ }
+
+ if (Flags)
+ PatchedName = Name;
+
+ // Hacks to handle 'data16' and 'data32'
+ if (PatchedName == "data16" && is16BitMode()) {
+ return Error(NameLoc, "redundant data16 prefix");
+ }
+ if (PatchedName == "data32") {
+ if (is32BitMode())
+ return Error(NameLoc, "redundant data32 prefix");
+ if (is64BitMode())
+ return Error(NameLoc, "'data32' is not supported in 64-bit mode");
+    // Hack: switch to 'data16' for the table lookup.
+ PatchedName = "data16";
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ StringRef Next = Parser.getTok().getString();
+ getLexer().Lex();
+ // data32 effectively changes the instruction suffix.
+ // TODO Generalize.
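+      // For example, in 16-bit mode "data32 callw foo" becomes "calll foo"
+      // and is matched with ForcedDataPrefix set below.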
+ if (Next == "callw")
+ Next = "calll";
+ if (Next == "ljmpw")
+ Next = "ljmpl";
+
+ Name = Next;
+ PatchedName = Name;
+ ForcedDataPrefix = X86::Mode32Bit;
+ IsPrefix = false;
+ }
+ }
+
+ Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc));
+
+ // Push the immediate if we extracted one from the mnemonic.
+ if (ComparisonPredicate != ~0U && !isParsingIntelSyntax()) {
+ const MCExpr *ImmOp = MCConstantExpr::create(ComparisonPredicate,
+ getParser().getContext());
+ Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
+ }
+
+ // This does the actual operand parsing. Don't parse any more if we have a
+ // prefix juxtaposed with an operation like "lock incl 4(%rax)", because we
+ // just want to parse the "lock" as the first instruction and the "incl" as
+ // the next one.
+ if (getLexer().isNot(AsmToken::EndOfStatement) && !IsPrefix) {
+ // Parse '*' modifier.
+ if (getLexer().is(AsmToken::Star))
+ Operands.push_back(X86Operand::CreateToken("*", consumeToken()));
+
+ // Read the operands.
+ while(1) {
+ if (ParseOperand(Operands))
+ return true;
+ if (HandleAVX512Operand(Operands))
+ return true;
+
+ // check for comma and eat it
+ if (getLexer().is(AsmToken::Comma))
+ Parser.Lex();
+ else
+ break;
+ }
+
+ // In MS inline asm curly braces mark the beginning/end of a block,
+    // therefore they should be interpreted as end of statement
+ CurlyAsEndOfStatement =
+ isParsingIntelSyntax() && isParsingMSInlineAsm() &&
+ (getLexer().is(AsmToken::LCurly) || getLexer().is(AsmToken::RCurly));
+ if (getLexer().isNot(AsmToken::EndOfStatement) && !CurlyAsEndOfStatement)
+ return TokError("unexpected token in argument list");
+ }
+
+ // Push the immediate if we extracted one from the mnemonic.
+ if (ComparisonPredicate != ~0U && isParsingIntelSyntax()) {
+ const MCExpr *ImmOp = MCConstantExpr::create(ComparisonPredicate,
+ getParser().getContext());
+ Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
+ }
+
+ // Consume the EndOfStatement or the prefix separator Slash
+ if (getLexer().is(AsmToken::EndOfStatement) ||
+ (IsPrefix && getLexer().is(AsmToken::Slash)))
+ Parser.Lex();
+ else if (CurlyAsEndOfStatement)
+ // Add an actual EndOfStatement before the curly brace
+ Info.AsmRewrites->emplace_back(AOK_EndOfStatement,
+ getLexer().getTok().getLoc(), 0);
+
+ // This is for gas compatibility and cannot be done in td.
+  // Add a "p" suffix to certain no-argument floating point instructions.
+ // For example: fsub --> fsubp
+ bool IsFp =
+ Name == "fsub" || Name == "fdiv" || Name == "fsubr" || Name == "fdivr";
+ if (IsFp && Operands.size() == 1) {
+ const char *Repl = StringSwitch<const char *>(Name)
+ .Case("fsub", "fsubp")
+ .Case("fdiv", "fdivp")
+ .Case("fsubr", "fsubrp")
+ .Case("fdivr", "fdivrp");
+ static_cast<X86Operand &>(*Operands[0]).setTokenValue(Repl);
+ }
+
+ if ((Name == "mov" || Name == "movw" || Name == "movl") &&
+ (Operands.size() == 3)) {
+ X86Operand &Op1 = (X86Operand &)*Operands[1];
+ X86Operand &Op2 = (X86Operand &)*Operands[2];
+ SMLoc Loc = Op1.getEndLoc();
+    // Moving a 32-bit or 16-bit value into a segment register has the same
+    // behavior. Modify such instructions to always use the shorter form.
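+    // For example, in 32-bit mode "movw %ax, %ds" becomes "movl %eax, %ds",
+    // which needs no operand-size prefix.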
+ if (Op1.isReg() && Op2.isReg() &&
+ X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(
+ Op2.getReg()) &&
+ (X86MCRegisterClasses[X86::GR16RegClassID].contains(Op1.getReg()) ||
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(Op1.getReg()))) {
+ // Change instruction name to match new instruction.
+ if (Name != "mov" && Name[3] == (is16BitMode() ? 'l' : 'w')) {
+ Name = is16BitMode() ? "movw" : "movl";
+ Operands[0] = X86Operand::CreateToken(Name, NameLoc);
+ }
+ // Select the correct equivalent 16-/32-bit source register.
+ unsigned Reg =
+ getX86SubSuperRegisterOrZero(Op1.getReg(), is16BitMode() ? 16 : 32);
+ Operands[1] = X86Operand::CreateReg(Reg, Loc, Loc);
+ }
+ }
+
+ // This is a terrible hack to handle "out[s]?[bwl]? %al, (%dx)" ->
+ // "outb %al, %dx". Out doesn't take a memory form, but this is a widely
+ // documented form in various unofficial manuals, so a lot of code uses it.
+ if ((Name == "outb" || Name == "outsb" || Name == "outw" || Name == "outsw" ||
+ Name == "outl" || Name == "outsl" || Name == "out" || Name == "outs") &&
+ Operands.size() == 3) {
+ X86Operand &Op = (X86Operand &)*Operands.back();
+ if (Op.isDXReg())
+ Operands.back() = X86Operand::CreateReg(X86::DX, Op.getStartLoc(),
+ Op.getEndLoc());
+ }
+ // Same hack for "in[s]?[bwl]? (%dx), %al" -> "inb %dx, %al".
+ if ((Name == "inb" || Name == "insb" || Name == "inw" || Name == "insw" ||
+ Name == "inl" || Name == "insl" || Name == "in" || Name == "ins") &&
+ Operands.size() == 3) {
+ X86Operand &Op = (X86Operand &)*Operands[1];
+ if (Op.isDXReg())
+ Operands[1] = X86Operand::CreateReg(X86::DX, Op.getStartLoc(),
+ Op.getEndLoc());
+ }
+
+ SmallVector<std::unique_ptr<MCParsedAsmOperand>, 2> TmpOperands;
+ bool HadVerifyError = false;
+
+ // Append default arguments to "ins[bwld]"
+ if (Name.startswith("ins") &&
+ (Operands.size() == 1 || Operands.size() == 3) &&
+ (Name == "insb" || Name == "insw" || Name == "insl" || Name == "insd" ||
+ Name == "ins")) {
+
+ AddDefaultSrcDestOperands(TmpOperands,
+ X86Operand::CreateReg(X86::DX, NameLoc, NameLoc),
+ DefaultMemDIOperand(NameLoc));
+ HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+ }
+
+ // Append default arguments to "outs[bwld]"
+ if (Name.startswith("outs") &&
+ (Operands.size() == 1 || Operands.size() == 3) &&
+ (Name == "outsb" || Name == "outsw" || Name == "outsl" ||
+ Name == "outsd" || Name == "outs")) {
+ AddDefaultSrcDestOperands(TmpOperands, DefaultMemSIOperand(NameLoc),
+ X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
+ HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+ }
+
+ // Transform "lods[bwlq]" into "lods[bwlq] ($SIREG)" for appropriate
+ // values of $SIREG according to the mode. It would be nice if this
+ // could be achieved with InstAlias in the tables.
+ if (Name.startswith("lods") &&
+ (Operands.size() == 1 || Operands.size() == 2) &&
+ (Name == "lods" || Name == "lodsb" || Name == "lodsw" ||
+ Name == "lodsl" || Name == "lodsd" || Name == "lodsq")) {
+ TmpOperands.push_back(DefaultMemSIOperand(NameLoc));
+ HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+ }
+
+ // Transform "stos[bwlq]" into "stos[bwlq] ($DIREG)" for appropriate
+ // values of $DIREG according to the mode. It would be nice if this
+ // could be achieved with InstAlias in the tables.
+ if (Name.startswith("stos") &&
+ (Operands.size() == 1 || Operands.size() == 2) &&
+ (Name == "stos" || Name == "stosb" || Name == "stosw" ||
+ Name == "stosl" || Name == "stosd" || Name == "stosq")) {
+ TmpOperands.push_back(DefaultMemDIOperand(NameLoc));
+ HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+ }
+
+ // Transform "scas[bwlq]" into "scas[bwlq] ($DIREG)" for appropriate
+ // values of $DIREG according to the mode. It would be nice if this
+ // could be achieved with InstAlias in the tables.
+ if (Name.startswith("scas") &&
+ (Operands.size() == 1 || Operands.size() == 2) &&
+ (Name == "scas" || Name == "scasb" || Name == "scasw" ||
+ Name == "scasl" || Name == "scasd" || Name == "scasq")) {
+ TmpOperands.push_back(DefaultMemDIOperand(NameLoc));
+ HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+ }
+
+ // Add default SI and DI operands to "cmps[bwlq]".
+ if (Name.startswith("cmps") &&
+ (Operands.size() == 1 || Operands.size() == 3) &&
+ (Name == "cmps" || Name == "cmpsb" || Name == "cmpsw" ||
+ Name == "cmpsl" || Name == "cmpsd" || Name == "cmpsq")) {
+ AddDefaultSrcDestOperands(TmpOperands, DefaultMemDIOperand(NameLoc),
+ DefaultMemSIOperand(NameLoc));
+ HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+ }
+
+ // Add default SI and DI operands to "movs[bwlq]".
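+  // For example, a bare "movsb" in 32-bit mode is given the default operands
+  // "(%esi), (%edi)" before matching.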
+ if (((Name.startswith("movs") &&
+ (Name == "movs" || Name == "movsb" || Name == "movsw" ||
+ Name == "movsl" || Name == "movsd" || Name == "movsq")) ||
+ (Name.startswith("smov") &&
+ (Name == "smov" || Name == "smovb" || Name == "smovw" ||
+ Name == "smovl" || Name == "smovd" || Name == "smovq"))) &&
+ (Operands.size() == 1 || Operands.size() == 3)) {
+ if (Name == "movsd" && Operands.size() == 1 && !isParsingIntelSyntax())
+ Operands.back() = X86Operand::CreateToken("movsl", NameLoc);
+ AddDefaultSrcDestOperands(TmpOperands, DefaultMemSIOperand(NameLoc),
+ DefaultMemDIOperand(NameLoc));
+ HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+ }
+
+  // Check if we encountered an error for one of the string instructions.
+ if (HadVerifyError) {
+ return HadVerifyError;
+ }
+
+  // Transform "xlat mem8" into "xlatb".
+ if ((Name == "xlat" || Name == "xlatb") && Operands.size() == 2) {
+ X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]);
+ if (Op1.isMem8()) {
+ Warning(Op1.getStartLoc(), "memory operand is only for determining the "
+ "size, (R|E)BX will be used for the location");
+ Operands.pop_back();
+ static_cast<X86Operand &>(*Operands[0]).setTokenValue("xlatb");
+ }
+ }
+
+ if (Flags)
+ Operands.push_back(X86Operand::CreatePrefix(Flags, NameLoc, NameLoc));
+ return false;
+}
+
+bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
+ const MCRegisterInfo *MRI = getContext().getRegisterInfo();
+
+ switch (Inst.getOpcode()) {
+ default: return false;
+ case X86::JMP_1:
+ // {disp32} forces a larger displacement as if the instruction was relaxed.
+ // NOTE: 16-bit mode uses 16-bit displacement even though it says {disp32}.
+ // This matches GNU assembler.
+ if (ForcedDispEncoding == DispEncoding_Disp32) {
+ Inst.setOpcode(is16BitMode() ? X86::JMP_2 : X86::JMP_4);
+ return true;
+ }
+
+ return false;
+ case X86::JCC_1:
+ // {disp32} forces a larger displacement as if the instruction was relaxed.
+ // NOTE: 16-bit mode uses 16-bit displacement even though it says {disp32}.
+ // This matches GNU assembler.
+ if (ForcedDispEncoding == DispEncoding_Disp32) {
+ Inst.setOpcode(is16BitMode() ? X86::JCC_2 : X86::JCC_4);
+ return true;
+ }
+
+ return false;
+ case X86::VMOVZPQILo2PQIrr:
+ case X86::VMOVAPDrr:
+ case X86::VMOVAPDYrr:
+ case X86::VMOVAPSrr:
+ case X86::VMOVAPSYrr:
+ case X86::VMOVDQArr:
+ case X86::VMOVDQAYrr:
+ case X86::VMOVDQUrr:
+ case X86::VMOVDQUYrr:
+ case X86::VMOVUPDrr:
+ case X86::VMOVUPDYrr:
+ case X86::VMOVUPSrr:
+ case X86::VMOVUPSYrr: {
+ // We can get a smaller encoding by using VEX.R instead of VEX.B if one of
+    // the registers is extended, but the other isn't.
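+    // For example, "vmovaps %xmm8, %xmm0" is switched to the _REV form below
+    // so the extended register is encoded with VEX.R and the shorter 2-byte
+    // VEX prefix can be used.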
+ if (ForcedVEXEncoding == VEXEncoding_VEX3 ||
+ MRI->getEncodingValue(Inst.getOperand(0).getReg()) >= 8 ||
+ MRI->getEncodingValue(Inst.getOperand(1).getReg()) < 8)
+ return false;
+
+ unsigned NewOpc;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break;
+ case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break;
+ case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
+ case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break;
+ case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
+ case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break;
+ case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
+ case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break;
+ case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
+ case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break;
+ case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
+ case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break;
+ case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
+ }
+ Inst.setOpcode(NewOpc);
+ return true;
+ }
+ case X86::VMOVSDrr:
+ case X86::VMOVSSrr: {
+ // We can get a smaller encoding by using VEX.R instead of VEX.B if one of
+    // the registers is extended, but the other isn't.
+ if (ForcedVEXEncoding == VEXEncoding_VEX3 ||
+ MRI->getEncodingValue(Inst.getOperand(0).getReg()) >= 8 ||
+ MRI->getEncodingValue(Inst.getOperand(2).getReg()) < 8)
+ return false;
+
+ unsigned NewOpc;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break;
+ case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break;
+ }
+ Inst.setOpcode(NewOpc);
+ return true;
+ }
+ case X86::RCR8ri: case X86::RCR16ri: case X86::RCR32ri: case X86::RCR64ri:
+ case X86::RCL8ri: case X86::RCL16ri: case X86::RCL32ri: case X86::RCL64ri:
+ case X86::ROR8ri: case X86::ROR16ri: case X86::ROR32ri: case X86::ROR64ri:
+ case X86::ROL8ri: case X86::ROL16ri: case X86::ROL32ri: case X86::ROL64ri:
+ case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri: case X86::SAR64ri:
+ case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri: case X86::SHR64ri:
+ case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri: case X86::SHL64ri: {
+ // Optimize s{hr,ar,hl} $1, <op> to "shift <op>". Similar for rotate.
+ // FIXME: It would be great if we could just do this with an InstAlias.
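+    // For example, "shrl $1, %eax" (SHR32ri, C1 /5 ib) is re-encoded as the
+    // one-byte-shorter shift-by-one form SHR32r1 (D1 /5).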
+ if (!Inst.getOperand(2).isImm() || Inst.getOperand(2).getImm() != 1)
+ return false;
+
+ unsigned NewOpc;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::RCR8ri: NewOpc = X86::RCR8r1; break;
+ case X86::RCR16ri: NewOpc = X86::RCR16r1; break;
+ case X86::RCR32ri: NewOpc = X86::RCR32r1; break;
+ case X86::RCR64ri: NewOpc = X86::RCR64r1; break;
+ case X86::RCL8ri: NewOpc = X86::RCL8r1; break;
+ case X86::RCL16ri: NewOpc = X86::RCL16r1; break;
+ case X86::RCL32ri: NewOpc = X86::RCL32r1; break;
+ case X86::RCL64ri: NewOpc = X86::RCL64r1; break;
+ case X86::ROR8ri: NewOpc = X86::ROR8r1; break;
+ case X86::ROR16ri: NewOpc = X86::ROR16r1; break;
+ case X86::ROR32ri: NewOpc = X86::ROR32r1; break;
+ case X86::ROR64ri: NewOpc = X86::ROR64r1; break;
+ case X86::ROL8ri: NewOpc = X86::ROL8r1; break;
+ case X86::ROL16ri: NewOpc = X86::ROL16r1; break;
+ case X86::ROL32ri: NewOpc = X86::ROL32r1; break;
+ case X86::ROL64ri: NewOpc = X86::ROL64r1; break;
+ case X86::SAR8ri: NewOpc = X86::SAR8r1; break;
+ case X86::SAR16ri: NewOpc = X86::SAR16r1; break;
+ case X86::SAR32ri: NewOpc = X86::SAR32r1; break;
+ case X86::SAR64ri: NewOpc = X86::SAR64r1; break;
+ case X86::SHR8ri: NewOpc = X86::SHR8r1; break;
+ case X86::SHR16ri: NewOpc = X86::SHR16r1; break;
+ case X86::SHR32ri: NewOpc = X86::SHR32r1; break;
+ case X86::SHR64ri: NewOpc = X86::SHR64r1; break;
+ case X86::SHL8ri: NewOpc = X86::SHL8r1; break;
+ case X86::SHL16ri: NewOpc = X86::SHL16r1; break;
+ case X86::SHL32ri: NewOpc = X86::SHL32r1; break;
+ case X86::SHL64ri: NewOpc = X86::SHL64r1; break;
+ }
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(NewOpc);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(1));
+ Inst = TmpInst;
+ return true;
+ }
+ case X86::RCR8mi: case X86::RCR16mi: case X86::RCR32mi: case X86::RCR64mi:
+ case X86::RCL8mi: case X86::RCL16mi: case X86::RCL32mi: case X86::RCL64mi:
+ case X86::ROR8mi: case X86::ROR16mi: case X86::ROR32mi: case X86::ROR64mi:
+ case X86::ROL8mi: case X86::ROL16mi: case X86::ROL32mi: case X86::ROL64mi:
+ case X86::SAR8mi: case X86::SAR16mi: case X86::SAR32mi: case X86::SAR64mi:
+ case X86::SHR8mi: case X86::SHR16mi: case X86::SHR32mi: case X86::SHR64mi:
+ case X86::SHL8mi: case X86::SHL16mi: case X86::SHL32mi: case X86::SHL64mi: {
+ // Optimize s{hr,ar,hl} $1, <op> to "shift <op>". Similar for rotate.
+ // FIXME: It would be great if we could just do this with an InstAlias.
+ if (!Inst.getOperand(X86::AddrNumOperands).isImm() ||
+ Inst.getOperand(X86::AddrNumOperands).getImm() != 1)
+ return false;
+
+ unsigned NewOpc;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::RCR8mi: NewOpc = X86::RCR8m1; break;
+ case X86::RCR16mi: NewOpc = X86::RCR16m1; break;
+ case X86::RCR32mi: NewOpc = X86::RCR32m1; break;
+ case X86::RCR64mi: NewOpc = X86::RCR64m1; break;
+ case X86::RCL8mi: NewOpc = X86::RCL8m1; break;
+ case X86::RCL16mi: NewOpc = X86::RCL16m1; break;
+ case X86::RCL32mi: NewOpc = X86::RCL32m1; break;
+ case X86::RCL64mi: NewOpc = X86::RCL64m1; break;
+ case X86::ROR8mi: NewOpc = X86::ROR8m1; break;
+ case X86::ROR16mi: NewOpc = X86::ROR16m1; break;
+ case X86::ROR32mi: NewOpc = X86::ROR32m1; break;
+ case X86::ROR64mi: NewOpc = X86::ROR64m1; break;
+ case X86::ROL8mi: NewOpc = X86::ROL8m1; break;
+ case X86::ROL16mi: NewOpc = X86::ROL16m1; break;
+ case X86::ROL32mi: NewOpc = X86::ROL32m1; break;
+ case X86::ROL64mi: NewOpc = X86::ROL64m1; break;
+ case X86::SAR8mi: NewOpc = X86::SAR8m1; break;
+ case X86::SAR16mi: NewOpc = X86::SAR16m1; break;
+ case X86::SAR32mi: NewOpc = X86::SAR32m1; break;
+ case X86::SAR64mi: NewOpc = X86::SAR64m1; break;
+ case X86::SHR8mi: NewOpc = X86::SHR8m1; break;
+ case X86::SHR16mi: NewOpc = X86::SHR16m1; break;
+ case X86::SHR32mi: NewOpc = X86::SHR32m1; break;
+ case X86::SHR64mi: NewOpc = X86::SHR64m1; break;
+ case X86::SHL8mi: NewOpc = X86::SHL8m1; break;
+ case X86::SHL16mi: NewOpc = X86::SHL16m1; break;
+ case X86::SHL32mi: NewOpc = X86::SHL32m1; break;
+ case X86::SHL64mi: NewOpc = X86::SHL64m1; break;
+ }
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(NewOpc);
+ for (int i = 0; i != X86::AddrNumOperands; ++i)
+ TmpInst.addOperand(Inst.getOperand(i));
+ Inst = TmpInst;
+ return true;
+ }
+ case X86::INT: {
+ // Transforms "int $3" into "int3" as a size optimization. We can't write an
+    // InstAlias with an immediate operand yet.
+ if (!Inst.getOperand(0).isImm() || Inst.getOperand(0).getImm() != 3)
+ return false;
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::INT3);
+ Inst = TmpInst;
+ return true;
+ }
+ }
+}
+
+bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) {
+ const MCRegisterInfo *MRI = getContext().getRegisterInfo();
+
+ switch (Inst.getOpcode()) {
+ case X86::VGATHERDPDYrm:
+ case X86::VGATHERDPDrm:
+ case X86::VGATHERDPSYrm:
+ case X86::VGATHERDPSrm:
+ case X86::VGATHERQPDYrm:
+ case X86::VGATHERQPDrm:
+ case X86::VGATHERQPSYrm:
+ case X86::VGATHERQPSrm:
+ case X86::VPGATHERDDYrm:
+ case X86::VPGATHERDDrm:
+ case X86::VPGATHERDQYrm:
+ case X86::VPGATHERDQrm:
+ case X86::VPGATHERQDYrm:
+ case X86::VPGATHERQDrm:
+ case X86::VPGATHERQQYrm:
+ case X86::VPGATHERQQrm: {
+ unsigned Dest = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+ unsigned Mask = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+ unsigned Index =
+ MRI->getEncodingValue(Inst.getOperand(3 + X86::AddrIndexReg).getReg());
+ if (Dest == Mask || Dest == Index || Mask == Index)
+ return Warning(Ops[0]->getStartLoc(), "mask, index, and destination "
+ "registers should be distinct");
+ break;
+ }
+ case X86::VGATHERDPDZ128rm:
+ case X86::VGATHERDPDZ256rm:
+ case X86::VGATHERDPDZrm:
+ case X86::VGATHERDPSZ128rm:
+ case X86::VGATHERDPSZ256rm:
+ case X86::VGATHERDPSZrm:
+ case X86::VGATHERQPDZ128rm:
+ case X86::VGATHERQPDZ256rm:
+ case X86::VGATHERQPDZrm:
+ case X86::VGATHERQPSZ128rm:
+ case X86::VGATHERQPSZ256rm:
+ case X86::VGATHERQPSZrm:
+ case X86::VPGATHERDDZ128rm:
+ case X86::VPGATHERDDZ256rm:
+ case X86::VPGATHERDDZrm:
+ case X86::VPGATHERDQZ128rm:
+ case X86::VPGATHERDQZ256rm:
+ case X86::VPGATHERDQZrm:
+ case X86::VPGATHERQDZ128rm:
+ case X86::VPGATHERQDZ256rm:
+ case X86::VPGATHERQDZrm:
+ case X86::VPGATHERQQZ128rm:
+ case X86::VPGATHERQQZ256rm:
+ case X86::VPGATHERQQZrm: {
+ unsigned Dest = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+ unsigned Index =
+ MRI->getEncodingValue(Inst.getOperand(4 + X86::AddrIndexReg).getReg());
+ if (Dest == Index)
+ return Warning(Ops[0]->getStartLoc(), "index and destination registers "
+ "should be distinct");
+ break;
+ }
+ case X86::V4FMADDPSrm:
+ case X86::V4FMADDPSrmk:
+ case X86::V4FMADDPSrmkz:
+ case X86::V4FMADDSSrm:
+ case X86::V4FMADDSSrmk:
+ case X86::V4FMADDSSrmkz:
+ case X86::V4FNMADDPSrm:
+ case X86::V4FNMADDPSrmk:
+ case X86::V4FNMADDPSrmkz:
+ case X86::V4FNMADDSSrm:
+ case X86::V4FNMADDSSrmk:
+ case X86::V4FNMADDSSrmkz:
+ case X86::VP4DPWSSDSrm:
+ case X86::VP4DPWSSDSrmk:
+ case X86::VP4DPWSSDSrmkz:
+ case X86::VP4DPWSSDrm:
+ case X86::VP4DPWSSDrmk:
+ case X86::VP4DPWSSDrmkz: {
+ unsigned Src2 = Inst.getOperand(Inst.getNumOperands() -
+ X86::AddrNumOperands - 1).getReg();
+ unsigned Src2Enc = MRI->getEncodingValue(Src2);
+ if (Src2Enc % 4 != 0) {
+ StringRef RegName = X86IntelInstPrinter::getRegisterName(Src2);
+ unsigned GroupStart = (Src2Enc / 4) * 4;
+ unsigned GroupEnd = GroupStart + 3;
+ return Warning(Ops[0]->getStartLoc(),
+ "source register '" + RegName + "' implicitly denotes '" +
+ RegName.take_front(3) + Twine(GroupStart) + "' to '" +
+ RegName.take_front(3) + Twine(GroupEnd) +
+ "' source group");
+ }
+ break;
+ }
+ }
+
+ const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
+  // Check that we aren't mixing AH/BH/CH/DH with a REX prefix. We only need to
+  // check this with the legacy encoding; VEX/EVEX/XOP don't use REX.
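+  // For example, "addb %ah, %r8b" is rejected below because %r8b requires a
+  // REX prefix, which makes %ah unencodable.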
+ if ((MCID.TSFlags & X86II::EncodingMask) == 0) {
+ MCPhysReg HReg = X86::NoRegister;
+ bool UsesRex = MCID.TSFlags & X86II::REX_W;
+ unsigned NumOps = Inst.getNumOperands();
+ for (unsigned i = 0; i != NumOps; ++i) {
+ const MCOperand &MO = Inst.getOperand(i);
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH)
+ HReg = Reg;
+ if (X86II::isX86_64NonExtLowByteReg(Reg) ||
+ X86II::isX86_64ExtendedReg(Reg))
+ UsesRex = true;
+ }
+
+ if (UsesRex && HReg != X86::NoRegister) {
+ StringRef RegName = X86IntelInstPrinter::getRegisterName(HReg);
+ return Error(Ops[0]->getStartLoc(),
+ "can't encode '" + RegName + "' in an instruction requiring "
+ "REX prefix");
+ }
+ }
+
+ return false;
+}
+
+static const char *getSubtargetFeatureName(uint64_t Val);
+
+void X86AsmParser::emitWarningForSpecialLVIInstruction(SMLoc Loc) {
+ Warning(Loc, "Instruction may be vulnerable to LVI and "
+ "requires manual mitigation");
+ Note(SMLoc(), "See https://software.intel.com/"
+ "security-software-guidance/insights/"
+ "deep-dive-load-value-injection#specialinstructions"
+ " for more information");
+}
+
+/// RET instructions, and instructions that perform indirect calls or jumps
+/// through memory, combine a load and a branch within a single instruction.
+/// To mitigate these instructions against LVI, they must be decomposed into
+/// separate load and branch instructions, with an LFENCE in between. For more
+/// details, see:
+/// - X86LoadValueInjectionRetHardening.cpp
+/// - X86LoadValueInjectionIndirectThunks.cpp
+/// - https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection
+///
+/// Applies the mitigation, or emits a warning where manual mitigation is
+/// required; the original instruction is still emitted by the caller.
+void X86AsmParser::applyLVICFIMitigation(MCInst &Inst, MCStreamer &Out) {
+ // Information on control-flow instructions that require manual mitigation can
+ // be found here:
+ // https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions
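+  // For returns, the code below emits "shlq $0, (%rsp)" (in 64-bit mode)
+  // followed by an LFENCE before the RET itself is emitted by the caller, so
+  // the loaded return address cannot be consumed speculatively.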
+ switch (Inst.getOpcode()) {
+ case X86::RETW:
+ case X86::RETL:
+ case X86::RETQ:
+ case X86::RETIL:
+ case X86::RETIQ:
+ case X86::RETIW: {
+ MCInst ShlInst, FenceInst;
+ bool Parse32 = is32BitMode() || Code16GCC;
+ unsigned Basereg =
+ is64BitMode() ? X86::RSP : (Parse32 ? X86::ESP : X86::SP);
+ const MCExpr *Disp = MCConstantExpr::create(0, getContext());
+ auto ShlMemOp = X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp,
+ /*BaseReg=*/Basereg, /*IndexReg=*/0,
+ /*Scale=*/1, SMLoc{}, SMLoc{}, 0);
+ ShlInst.setOpcode(X86::SHL64mi);
+ ShlMemOp->addMemOperands(ShlInst, 5);
+ ShlInst.addOperand(MCOperand::createImm(0));
+ FenceInst.setOpcode(X86::LFENCE);
+ Out.emitInstruction(ShlInst, getSTI());
+ Out.emitInstruction(FenceInst, getSTI());
+ return;
+ }
+ case X86::JMP16m:
+ case X86::JMP32m:
+ case X86::JMP64m:
+ case X86::CALL16m:
+ case X86::CALL32m:
+ case X86::CALL64m:
+ emitWarningForSpecialLVIInstruction(Inst.getLoc());
+ return;
+ }
+}
+
+/// To mitigate LVI, every instruction that performs a load can be followed by
+/// an LFENCE instruction to squash any potential mis-speculation. There are
+/// some instructions that require additional considerations, and may require
+/// manual mitigation. For more details, see:
+/// https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection
+///
+/// Emits an LFENCE after the instruction, or a warning where manual
+/// mitigation is required.
+void X86AsmParser::applyLVILoadHardeningMitigation(MCInst &Inst,
+ MCStreamer &Out) {
+ auto Opcode = Inst.getOpcode();
+ auto Flags = Inst.getFlags();
+ if ((Flags & X86::IP_HAS_REPEAT) || (Flags & X86::IP_HAS_REPEAT_NE)) {
+ // Information on REP string instructions that require manual mitigation can
+ // be found here:
+ // https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions
+ switch (Opcode) {
+ case X86::CMPSB:
+ case X86::CMPSW:
+ case X86::CMPSL:
+ case X86::CMPSQ:
+ case X86::SCASB:
+ case X86::SCASW:
+ case X86::SCASL:
+ case X86::SCASQ:
+ emitWarningForSpecialLVIInstruction(Inst.getLoc());
+ return;
+ }
+ } else if (Opcode == X86::REP_PREFIX || Opcode == X86::REPNE_PREFIX) {
+ // If a REP instruction is found on its own line, it may or may not be
+ // followed by a vulnerable instruction. Emit a warning just in case.
+ emitWarningForSpecialLVIInstruction(Inst.getLoc());
+ return;
+ }
+
+ const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
+
+ // Can't mitigate after terminators or calls. A control flow change may have
+ // already occurred.
+ if (MCID.isTerminator() || MCID.isCall())
+ return;
+
+ // LFENCE has the mayLoad property, don't double fence.
+ if (MCID.mayLoad() && Inst.getOpcode() != X86::LFENCE) {
+ MCInst FenceInst;
+ FenceInst.setOpcode(X86::LFENCE);
+ Out.emitInstruction(FenceInst, getSTI());
+ }
+}
+
+void X86AsmParser::emitInstruction(MCInst &Inst, OperandVector &Operands,
+ MCStreamer &Out) {
+ if (LVIInlineAsmHardening &&
+ getSTI().getFeatureBits()[X86::FeatureLVIControlFlowIntegrity])
+ applyLVICFIMitigation(Inst, Out);
+
+ Out.emitInstruction(Inst, getSTI());
+
+ if (LVIInlineAsmHardening &&
+ getSTI().getFeatureBits()[X86::FeatureLVILoadHardening])
+ applyLVILoadHardeningMitigation(Inst, Out);
+}
+
+bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out, uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) {
+ if (isParsingIntelSyntax())
+ return MatchAndEmitIntelInstruction(IDLoc, Opcode, Operands, Out, ErrorInfo,
+ MatchingInlineAsm);
+ return MatchAndEmitATTInstruction(IDLoc, Opcode, Operands, Out, ErrorInfo,
+ MatchingInlineAsm);
+}
+
+void X86AsmParser::MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op,
+ OperandVector &Operands, MCStreamer &Out,
+ bool MatchingInlineAsm) {
+ // FIXME: This should be replaced with a real .td file alias mechanism.
+ // Also, MatchInstructionImpl should actually *do* the EmitInstruction
+ // call.
+ const char *Repl = StringSwitch<const char *>(Op.getToken())
+ .Case("finit", "fninit")
+ .Case("fsave", "fnsave")
+ .Case("fstcw", "fnstcw")
+ .Case("fstcww", "fnstcw")
+ .Case("fstenv", "fnstenv")
+ .Case("fstsw", "fnstsw")
+ .Case("fstsww", "fnstsw")
+ .Case("fclex", "fnclex")
+ .Default(nullptr);
+ if (Repl) {
+ MCInst Inst;
+ Inst.setOpcode(X86::WAIT);
+ Inst.setLoc(IDLoc);
+ if (!MatchingInlineAsm)
+ emitInstruction(Inst, Operands, Out);
+ Operands[0] = X86Operand::CreateToken(Repl, IDLoc);
+ }
+}
+
+bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc,
+ const FeatureBitset &MissingFeatures,
+ bool MatchingInlineAsm) {
+ assert(MissingFeatures.any() && "Unknown missing feature!");
+ SmallString<126> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "instruction requires:";
+ for (unsigned i = 0, e = MissingFeatures.size(); i != e; ++i) {
+ if (MissingFeatures[i])
+ OS << ' ' << getSubtargetFeatureName(i);
+ }
+ return Error(IDLoc, OS.str(), SMRange(), MatchingInlineAsm);
+}
+
+static unsigned getPrefixes(OperandVector &Operands) {
+ unsigned Result = 0;
+ X86Operand &Prefix = static_cast<X86Operand &>(*Operands.back());
+ if (Prefix.isPrefix()) {
+ Result = Prefix.getPrefix();
+ Operands.pop_back();
+ }
+ return Result;
+}
+
+unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) {
+ unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &MCID = MII.get(Opc);
+
+ if (ForcedVEXEncoding == VEXEncoding_EVEX &&
+ (MCID.TSFlags & X86II::EncodingMask) != X86II::EVEX)
+ return Match_Unsupported;
+
+ if ((ForcedVEXEncoding == VEXEncoding_VEX ||
+ ForcedVEXEncoding == VEXEncoding_VEX2 ||
+ ForcedVEXEncoding == VEXEncoding_VEX3) &&
+ (MCID.TSFlags & X86II::EncodingMask) != X86II::VEX)
+ return Match_Unsupported;
+
+ // These instructions are only available with {vex}, {vex2} or {vex3} prefix
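+  // (e.g. the VEX-encoded AVX-VNNI forms, which would otherwise be shadowed
+  // by their EVEX counterparts in the match table).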
+ if (MCID.TSFlags & X86II::ExplicitVEXPrefix &&
+ (ForcedVEXEncoding != VEXEncoding_VEX &&
+ ForcedVEXEncoding != VEXEncoding_VEX2 &&
+ ForcedVEXEncoding != VEXEncoding_VEX3))
+ return Match_Unsupported;
+
+ // These instructions match ambiguously with their VEX encoded counterparts
+ // and appear first in the matching table. Reject them unless we're forcing
+ // EVEX encoding.
+ // FIXME: We really need a way to break the ambiguity.
+ switch (Opc) {
+ case X86::VCVTSD2SIZrm_Int:
+ case X86::VCVTSD2SI64Zrm_Int:
+ case X86::VCVTSS2SIZrm_Int:
+ case X86::VCVTSS2SI64Zrm_Int:
+ case X86::VCVTTSD2SIZrm: case X86::VCVTTSD2SIZrm_Int:
+ case X86::VCVTTSD2SI64Zrm: case X86::VCVTTSD2SI64Zrm_Int:
+ case X86::VCVTTSS2SIZrm: case X86::VCVTTSS2SIZrm_Int:
+ case X86::VCVTTSS2SI64Zrm: case X86::VCVTTSS2SI64Zrm_Int:
+ if (ForcedVEXEncoding != VEXEncoding_EVEX)
+ return Match_Unsupported;
+ break;
+ }
+
+ return Match_Success;
+}
+
+bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) {
+  assert(!Operands.empty() && "Unexpected empty operand list!");
+ assert((*Operands[0]).isToken() && "Leading operand should always be a mnemonic!");
+ SMRange EmptyRange = None;
+
+ // First, handle aliases that expand to multiple instructions.
+ MatchFPUWaitAlias(IDLoc, static_cast<X86Operand &>(*Operands[0]), Operands,
+ Out, MatchingInlineAsm);
+ X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
+ unsigned Prefixes = getPrefixes(Operands);
+
+ MCInst Inst;
+
+ // If VEX/EVEX encoding is forced, we need to pass the USE_* flag to the
+ // encoder and printer.
+ if (ForcedVEXEncoding == VEXEncoding_VEX)
+ Prefixes |= X86::IP_USE_VEX;
+ else if (ForcedVEXEncoding == VEXEncoding_VEX2)
+ Prefixes |= X86::IP_USE_VEX2;
+ else if (ForcedVEXEncoding == VEXEncoding_VEX3)
+ Prefixes |= X86::IP_USE_VEX3;
+ else if (ForcedVEXEncoding == VEXEncoding_EVEX)
+ Prefixes |= X86::IP_USE_EVEX;
+
+ // Set encoded flags for {disp8} and {disp32}.
+ if (ForcedDispEncoding == DispEncoding_Disp8)
+ Prefixes |= X86::IP_USE_DISP8;
+ else if (ForcedDispEncoding == DispEncoding_Disp32)
+ Prefixes |= X86::IP_USE_DISP32;
+
+ if (Prefixes)
+ Inst.setFlags(Prefixes);
+
+ // In 16-bit mode, if data32 is specified, temporarily switch to 32-bit mode
+ // when matching the instruction.
+ if (ForcedDataPrefix == X86::Mode32Bit)
+ SwitchMode(X86::Mode32Bit);
+ // First, try a direct match.
+ FeatureBitset MissingFeatures;
+ unsigned OriginalError = MatchInstruction(Operands, Inst, ErrorInfo,
+ MissingFeatures, MatchingInlineAsm,
+ isParsingIntelSyntax());
+ if (ForcedDataPrefix == X86::Mode32Bit) {
+ SwitchMode(X86::Mode16Bit);
+ ForcedDataPrefix = 0;
+ }
+ switch (OriginalError) {
+ default: llvm_unreachable("Unexpected match result!");
+ case Match_Success:
+ if (!MatchingInlineAsm && validateInstruction(Inst, Operands))
+ return true;
+ // Some instructions need post-processing to, for example, tweak which
+ // encoding is selected. Loop on it while changes happen so the
+ // individual transformations can chain off each other.
+ if (!MatchingInlineAsm)
+ while (processInstruction(Inst, Operands))
+ ;
+
+ Inst.setLoc(IDLoc);
+ if (!MatchingInlineAsm)
+ emitInstruction(Inst, Operands, Out);
+ Opcode = Inst.getOpcode();
+ return false;
+ case Match_InvalidImmUnsignedi4: {
+ SMLoc ErrorLoc = ((X86Operand &)*Operands[ErrorInfo]).getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ return Error(ErrorLoc, "immediate must be an integer in range [0, 15]",
+ EmptyRange, MatchingInlineAsm);
+ }
+ case Match_MissingFeature:
+ return ErrorMissingFeature(IDLoc, MissingFeatures, MatchingInlineAsm);
+ case Match_InvalidOperand:
+ case Match_MnemonicFail:
+ case Match_Unsupported:
+ break;
+ }
+ if (Op.getToken().empty()) {
+ Error(IDLoc, "instruction must have size higher than 0", EmptyRange,
+ MatchingInlineAsm);
+ return true;
+ }
+
+ // FIXME: Ideally, we would only attempt suffix matches for things which are
+ // valid prefixes, and we could just infer the right unambiguous
+ // type. However, that requires substantially more matcher support than the
+ // following hack.
+
+ // Change the operand to point to a temporary token.
+ StringRef Base = Op.getToken();
+ SmallString<16> Tmp;
+ Tmp += Base;
+ Tmp += ' ';
+ Op.setTokenValue(Tmp);
+
+ // If this instruction starts with an 'f', then it is a floating point stack
+ // instruction. These come in up to three forms for 32-bit, 64-bit, and
+ // 80-bit floating point, which use the suffixes s,l,t respectively.
+ //
+ // Otherwise, we assume that this may be an integer instruction, which comes
+ // in 8/16/32/64-bit forms using the b,w,l,q suffixes respectively.
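+  //
+  // For example, an unsuffixed "inc (%rax)" fails the direct match, is then
+  // retried as "incb"/"incw"/"incl"/"incq", and since all four succeed it
+  // produces the ambiguous-suffix diagnostic further down.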
+ const char *Suffixes = Base[0] != 'f' ? "bwlq" : "slt\0";
+ // MemSize corresponding to Suffixes. { 8, 16, 32, 64 } { 32, 64, 80, 0 }
+ const char *MemSize = Base[0] != 'f' ? "\x08\x10\x20\x40" : "\x20\x40\x50\0";
+
+ // Check for the various suffix matches.
+ uint64_t ErrorInfoIgnore;
+ FeatureBitset ErrorInfoMissingFeatures; // Init suppresses compiler warnings.
+ unsigned Match[4];
+
+  // Some instructions, like VPMULDQ, are NOT suffixed variants of another
+  // mnemonic (VPMULD) but distinct instructions. So the suffix matcher should
+  // only consider memory variants whose size matches the suffix.
+  // FIXME: This flag is a workaround for legacy instructions that didn't
+  // declare a non-suffixed variant of the assembly.
+ bool HasVectorReg = false;
+ X86Operand *MemOp = nullptr;
+ for (const auto &Op : Operands) {
+ X86Operand *X86Op = static_cast<X86Operand *>(Op.get());
+ if (X86Op->isVectorReg())
+ HasVectorReg = true;
+ else if (X86Op->isMem()) {
+ MemOp = X86Op;
+ assert(MemOp->Mem.Size == 0 && "Memory size always 0 under ATT syntax");
+      // We have found an unqualified memory operand; break. IA allows only
+      // one memory operand.
+ break;
+ }
+ }
+
+ for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) {
+ Tmp.back() = Suffixes[I];
+ if (MemOp && HasVectorReg)
+ MemOp->Mem.Size = MemSize[I];
+ Match[I] = Match_MnemonicFail;
+ if (MemOp || !HasVectorReg) {
+ Match[I] =
+ MatchInstruction(Operands, Inst, ErrorInfoIgnore, MissingFeatures,
+ MatchingInlineAsm, isParsingIntelSyntax());
+ // If this returned as a missing feature failure, remember that.
+ if (Match[I] == Match_MissingFeature)
+ ErrorInfoMissingFeatures = MissingFeatures;
+ }
+ }
+
+ // Restore the old token.
+ Op.setTokenValue(Base);
+
+ // If exactly one matched, then we treat that as a successful match (and the
+ // instruction will already have been filled in correctly, since the failing
+ // matches won't have modified it).
+ unsigned NumSuccessfulMatches =
+ std::count(std::begin(Match), std::end(Match), Match_Success);
+ if (NumSuccessfulMatches == 1) {
+ if (!MatchingInlineAsm && validateInstruction(Inst, Operands))
+ return true;
+ // Some instructions need post-processing to, for example, tweak which
+ // encoding is selected. Loop on it while changes happen so the
+ // individual transformations can chain off each other.
+ if (!MatchingInlineAsm)
+ while (processInstruction(Inst, Operands))
+ ;
+
+ Inst.setLoc(IDLoc);
+ if (!MatchingInlineAsm)
+ emitInstruction(Inst, Operands, Out);
+ Opcode = Inst.getOpcode();
+ return false;
+ }
+
+ // Otherwise, the match failed, try to produce a decent error message.
+
+ // If we had multiple suffix matches, then identify this as an ambiguous
+ // match.
+ if (NumSuccessfulMatches > 1) {
+ char MatchChars[4];
+ unsigned NumMatches = 0;
+ for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I)
+ if (Match[I] == Match_Success)
+ MatchChars[NumMatches++] = Suffixes[I];
+
+ SmallString<126> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "ambiguous instructions require an explicit suffix (could be ";
+ for (unsigned i = 0; i != NumMatches; ++i) {
+ if (i != 0)
+ OS << ", ";
+ if (i + 1 == NumMatches)
+ OS << "or ";
+ OS << "'" << Base << MatchChars[i] << "'";
+ }
+ OS << ")";
+ Error(IDLoc, OS.str(), EmptyRange, MatchingInlineAsm);
+ return true;
+ }
+
+ // Okay, we know that none of the variants matched successfully.
+
+ // If all of the instructions reported an invalid mnemonic, then the original
+ // mnemonic was invalid.
+ if (std::count(std::begin(Match), std::end(Match), Match_MnemonicFail) == 4) {
+ if (OriginalError == Match_MnemonicFail)
+ return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'",
+ Op.getLocRange(), MatchingInlineAsm);
+
+ if (OriginalError == Match_Unsupported)
+ return Error(IDLoc, "unsupported instruction", EmptyRange,
+ MatchingInlineAsm);
+
+ assert(OriginalError == Match_InvalidOperand && "Unexpected error");
+ // Recover location info for the operand if we know which was the problem.
+ if (ErrorInfo != ~0ULL) {
+ if (ErrorInfo >= Operands.size())
+ return Error(IDLoc, "too few operands for instruction", EmptyRange,
+ MatchingInlineAsm);
+
+ X86Operand &Operand = (X86Operand &)*Operands[ErrorInfo];
+ if (Operand.getStartLoc().isValid()) {
+ SMRange OperandRange = Operand.getLocRange();
+ return Error(Operand.getStartLoc(), "invalid operand for instruction",
+ OperandRange, MatchingInlineAsm);
+ }
+ }
+
+ return Error(IDLoc, "invalid operand for instruction", EmptyRange,
+ MatchingInlineAsm);
+ }
+
+ // If one instruction matched as unsupported, report this as unsupported.
+ if (std::count(std::begin(Match), std::end(Match),
+ Match_Unsupported) == 1) {
+ return Error(IDLoc, "unsupported instruction", EmptyRange,
+ MatchingInlineAsm);
+ }
+
+ // If one instruction matched with a missing feature, report this as a
+ // missing feature.
+ if (std::count(std::begin(Match), std::end(Match),
+ Match_MissingFeature) == 1) {
+ ErrorInfo = Match_MissingFeature;
+ return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeatures,
+ MatchingInlineAsm);
+ }
+
+ // If one instruction matched with an invalid operand, report this as an
+ // operand failure.
+ if (std::count(std::begin(Match), std::end(Match),
+ Match_InvalidOperand) == 1) {
+ return Error(IDLoc, "invalid operand for instruction", EmptyRange,
+ MatchingInlineAsm);
+ }
+
+ // If all of these were an outright failure, report it in a useless way.
+ Error(IDLoc, "unknown use of instruction mnemonic without a size suffix",
+ EmptyRange, MatchingInlineAsm);
+ return true;
+}
+
+bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) {
+  assert(!Operands.empty() && "Unexpected empty operand list!");
+ assert((*Operands[0]).isToken() && "Leading operand should always be a mnemonic!");
+ StringRef Mnemonic = (static_cast<X86Operand &>(*Operands[0])).getToken();
+ SMRange EmptyRange = None;
+ StringRef Base = (static_cast<X86Operand &>(*Operands[0])).getToken();
+ unsigned Prefixes = getPrefixes(Operands);
+
+ // First, handle aliases that expand to multiple instructions.
+ MatchFPUWaitAlias(IDLoc, static_cast<X86Operand &>(*Operands[0]), Operands, Out, MatchingInlineAsm);
+ X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
+
+ MCInst Inst;
+
+ // If VEX/EVEX encoding is forced, we need to pass the USE_* flag to the
+ // encoder and printer.
+ if (ForcedVEXEncoding == VEXEncoding_VEX)
+ Prefixes |= X86::IP_USE_VEX;
+ else if (ForcedVEXEncoding == VEXEncoding_VEX2)
+ Prefixes |= X86::IP_USE_VEX2;
+ else if (ForcedVEXEncoding == VEXEncoding_VEX3)
+ Prefixes |= X86::IP_USE_VEX3;
+ else if (ForcedVEXEncoding == VEXEncoding_EVEX)
+ Prefixes |= X86::IP_USE_EVEX;
+
+ // Set encoded flags for {disp8} and {disp32}.
+ if (ForcedDispEncoding == DispEncoding_Disp8)
+ Prefixes |= X86::IP_USE_DISP8;
+ else if (ForcedDispEncoding == DispEncoding_Disp32)
+ Prefixes |= X86::IP_USE_DISP32;
+
+ if (Prefixes)
+ Inst.setFlags(Prefixes);
+
+ // Find one unsized memory operand, if present.
+ X86Operand *UnsizedMemOp = nullptr;
+ for (const auto &Op : Operands) {
+ X86Operand *X86Op = static_cast<X86Operand *>(Op.get());
+ if (X86Op->isMemUnsized()) {
+ UnsizedMemOp = X86Op;
+      // We have found an unqualified memory operand; break. IA allows only
+      // one memory operand.
+ break;
+ }
+ }
+
+ // Allow some instructions to have implicitly pointer-sized operands. This is
+ // compatible with gas.
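+  // For example, "call [rax]" in 64-bit mode is treated as
+  // "call qword ptr [rax]".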
+ if (UnsizedMemOp) {
+ static const char *const PtrSizedInstrs[] = {"call", "jmp", "push"};
+ for (const char *Instr : PtrSizedInstrs) {
+ if (Mnemonic == Instr) {
+ UnsizedMemOp->Mem.Size = getPointerWidth();
+ break;
+ }
+ }
+ }
+
+ SmallVector<unsigned, 8> Match;
+ FeatureBitset ErrorInfoMissingFeatures;
+ FeatureBitset MissingFeatures;
+
+  // If an unsized push has an immediate operand, default its size to the
+  // pointer width.
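+  // For example, "push 2" in 64-bit mode is first tried as a "pushq" so the
+  // 64-bit push form is selected.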
+ if (Mnemonic == "push" && Operands.size() == 2) {
+ auto *X86Op = static_cast<X86Operand *>(Operands[1].get());
+ if (X86Op->isImm()) {
+      // If it's not a constant, fall through and let the remainder take care
+      // of it.
+ const auto *CE = dyn_cast<MCConstantExpr>(X86Op->getImm());
+ unsigned Size = getPointerWidth();
+ if (CE &&
+ (isIntN(Size, CE->getValue()) || isUIntN(Size, CE->getValue()))) {
+ SmallString<16> Tmp;
+ Tmp += Base;
+ Tmp += (is64BitMode())
+ ? "q"
+ : (is32BitMode()) ? "l" : (is16BitMode()) ? "w" : " ";
+ Op.setTokenValue(Tmp);
+ // Do match in ATT mode to allow explicit suffix usage.
+ Match.push_back(MatchInstruction(Operands, Inst, ErrorInfo,
+ MissingFeatures, MatchingInlineAsm,
+ false /*isParsingIntelSyntax()*/));
+ Op.setTokenValue(Base);
+ }
+ }
+ }
+
+ // If an unsized memory operand is present, try to match with each memory
+ // operand size. In Intel assembly, the size is not part of the instruction
+ // mnemonic.
+ if (UnsizedMemOp && UnsizedMemOp->isMemUnsized()) {
+ static const unsigned MopSizes[] = {8, 16, 32, 64, 80, 128, 256, 512};
+ for (unsigned Size : MopSizes) {
+ UnsizedMemOp->Mem.Size = Size;
+ uint64_t ErrorInfoIgnore;
+ unsigned LastOpcode = Inst.getOpcode();
+ unsigned M = MatchInstruction(Operands, Inst, ErrorInfoIgnore,
+ MissingFeatures, MatchingInlineAsm,
+ isParsingIntelSyntax());
+ if (Match.empty() || LastOpcode != Inst.getOpcode())
+ Match.push_back(M);
+
+ // If this returned as a missing feature failure, remember that.
+ if (Match.back() == Match_MissingFeature)
+ ErrorInfoMissingFeatures = MissingFeatures;
+ }
+
+ // Restore the size of the unsized memory operand if we modified it.
+ UnsizedMemOp->Mem.Size = 0;
+ }
+
+ // If we haven't matched anything yet, this is not a basic integer or FPU
+ // operation. There shouldn't be any ambiguity in our mnemonic table, so try
+ // matching with the unsized operand.
+ if (Match.empty()) {
+ Match.push_back(MatchInstruction(
+ Operands, Inst, ErrorInfo, MissingFeatures, MatchingInlineAsm,
+ isParsingIntelSyntax()));
+ // If this returned as a missing feature failure, remember that.
+ if (Match.back() == Match_MissingFeature)
+ ErrorInfoMissingFeatures = MissingFeatures;
+ }
+
+ // Restore the size of the unsized memory operand if we modified it.
+ if (UnsizedMemOp)
+ UnsizedMemOp->Mem.Size = 0;
+
+ // If it's a bad mnemonic, all results will be the same.
+ if (Match.back() == Match_MnemonicFail) {
+ return Error(IDLoc, "invalid instruction mnemonic '" + Mnemonic + "'",
+ Op.getLocRange(), MatchingInlineAsm);
+ }
+
+ unsigned NumSuccessfulMatches =
+ std::count(std::begin(Match), std::end(Match), Match_Success);
+
+ // If matching was ambiguous and we had size information from the frontend,
+  // try again with that. This handles cases like "movzx eax, m8/m16".
+ if (UnsizedMemOp && NumSuccessfulMatches > 1 &&
+ UnsizedMemOp->getMemFrontendSize()) {
+ UnsizedMemOp->Mem.Size = UnsizedMemOp->getMemFrontendSize();
+ unsigned M = MatchInstruction(
+ Operands, Inst, ErrorInfo, MissingFeatures, MatchingInlineAsm,
+ isParsingIntelSyntax());
+ if (M == Match_Success)
+ NumSuccessfulMatches = 1;
+
+ // Add a rewrite that encodes the size information we used from the
+ // frontend.
+ InstInfo->AsmRewrites->emplace_back(
+ AOK_SizeDirective, UnsizedMemOp->getStartLoc(),
+ /*Len=*/0, UnsizedMemOp->getMemFrontendSize());
+ }
+
+ // If exactly one matched, then we treat that as a successful match (and the
+ // instruction will already have been filled in correctly, since the failing
+ // matches won't have modified it).
+ if (NumSuccessfulMatches == 1) {
+ if (!MatchingInlineAsm && validateInstruction(Inst, Operands))
+ return true;
+ // Some instructions need post-processing to, for example, tweak which
+ // encoding is selected. Loop on it while changes happen so the individual
+ // transformations can chain off each other.
+ if (!MatchingInlineAsm)
+ while (processInstruction(Inst, Operands))
+ ;
+ Inst.setLoc(IDLoc);
+ if (!MatchingInlineAsm)
+ emitInstruction(Inst, Operands, Out);
+ Opcode = Inst.getOpcode();
+ return false;
+ } else if (NumSuccessfulMatches > 1) {
+ assert(UnsizedMemOp &&
+ "multiple matches only possible with unsized memory operands");
+ return Error(UnsizedMemOp->getStartLoc(),
+ "ambiguous operand size for instruction '" + Mnemonic + "\'",
+ UnsizedMemOp->getLocRange());
+ }
+
+ // If one instruction matched as unsupported, report this as unsupported.
+ if (std::count(std::begin(Match), std::end(Match),
+ Match_Unsupported) == 1) {
+ return Error(IDLoc, "unsupported instruction", EmptyRange,
+ MatchingInlineAsm);
+ }
+
+ // If one instruction matched with a missing feature, report this as a
+ // missing feature.
+ if (std::count(std::begin(Match), std::end(Match),
+ Match_MissingFeature) == 1) {
+ ErrorInfo = Match_MissingFeature;
+ return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeatures,
+ MatchingInlineAsm);
+ }
+
+ // If one instruction matched with an invalid operand, report this as an
+ // operand failure.
+ if (std::count(std::begin(Match), std::end(Match),
+ Match_InvalidOperand) == 1) {
+ return Error(IDLoc, "invalid operand for instruction", EmptyRange,
+ MatchingInlineAsm);
+ }
+
+ if (std::count(std::begin(Match), std::end(Match),
+ Match_InvalidImmUnsignedi4) == 1) {
+ SMLoc ErrorLoc = ((X86Operand &)*Operands[ErrorInfo]).getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ return Error(ErrorLoc, "immediate must be an integer in range [0, 15]",
+ EmptyRange, MatchingInlineAsm);
+ }
+
+ // If all of these were an outright failure, report it in a useless way.
+ return Error(IDLoc, "unknown instruction mnemonic", EmptyRange,
+ MatchingInlineAsm);
+}
+
+bool X86AsmParser::OmitRegisterFromClobberLists(unsigned RegNo) {
+ return X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo);
+}
+
+bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
+ MCAsmParser &Parser = getParser();
+ StringRef IDVal = DirectiveID.getIdentifier();
+ if (IDVal.startswith(".arch"))
+ return parseDirectiveArch();
+ if (IDVal.startswith(".code"))
+ return ParseDirectiveCode(IDVal, DirectiveID.getLoc());
+ else if (IDVal.startswith(".att_syntax")) {
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ if (Parser.getTok().getString() == "prefix")
+ Parser.Lex();
+ else if (Parser.getTok().getString() == "noprefix")
+ return Error(DirectiveID.getLoc(), "'.att_syntax noprefix' is not "
+ "supported: registers must have a "
+ "'%' prefix in .att_syntax");
+ }
+ getParser().setAssemblerDialect(0);
+ return false;
+ } else if (IDVal.startswith(".intel_syntax")) {
+ getParser().setAssemblerDialect(1);
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ if (Parser.getTok().getString() == "noprefix")
+ Parser.Lex();
+ else if (Parser.getTok().getString() == "prefix")
+ return Error(DirectiveID.getLoc(), "'.intel_syntax prefix' is not "
+ "supported: registers must not have "
+ "a '%' prefix in .intel_syntax");
+ }
+ return false;
+ } else if (IDVal == ".nops")
+ return parseDirectiveNops(DirectiveID.getLoc());
+ else if (IDVal == ".even")
+ return parseDirectiveEven(DirectiveID.getLoc());
+ else if (IDVal == ".cv_fpo_proc")
+ return parseDirectiveFPOProc(DirectiveID.getLoc());
+ else if (IDVal == ".cv_fpo_setframe")
+ return parseDirectiveFPOSetFrame(DirectiveID.getLoc());
+ else if (IDVal == ".cv_fpo_pushreg")
+ return parseDirectiveFPOPushReg(DirectiveID.getLoc());
+ else if (IDVal == ".cv_fpo_stackalloc")
+ return parseDirectiveFPOStackAlloc(DirectiveID.getLoc());
+ else if (IDVal == ".cv_fpo_stackalign")
+ return parseDirectiveFPOStackAlign(DirectiveID.getLoc());
+ else if (IDVal == ".cv_fpo_endprologue")
+ return parseDirectiveFPOEndPrologue(DirectiveID.getLoc());
+ else if (IDVal == ".cv_fpo_endproc")
+ return parseDirectiveFPOEndProc(DirectiveID.getLoc());
+ else if (IDVal == ".seh_pushreg" ||
+ (Parser.isParsingMasm() && IDVal.equals_lower(".pushreg")))
+ return parseDirectiveSEHPushReg(DirectiveID.getLoc());
+ else if (IDVal == ".seh_setframe" ||
+ (Parser.isParsingMasm() && IDVal.equals_lower(".setframe")))
+ return parseDirectiveSEHSetFrame(DirectiveID.getLoc());
+ else if (IDVal == ".seh_savereg" ||
+ (Parser.isParsingMasm() && IDVal.equals_lower(".savereg")))
+ return parseDirectiveSEHSaveReg(DirectiveID.getLoc());
+ else if (IDVal == ".seh_savexmm" ||
+ (Parser.isParsingMasm() && IDVal.equals_lower(".savexmm128")))
+ return parseDirectiveSEHSaveXMM(DirectiveID.getLoc());
+ else if (IDVal == ".seh_pushframe" ||
+ (Parser.isParsingMasm() && IDVal.equals_lower(".pushframe")))
+ return parseDirectiveSEHPushFrame(DirectiveID.getLoc());
+
+ return true;
+}
+
+bool X86AsmParser::parseDirectiveArch() {
+ // Ignore .arch for now.
+ getParser().parseStringToEndOfStatement();
+ return false;
+}
+
+/// parseDirectiveNops
+/// ::= .nops size[, control]
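+/// For example, ".nops 16, 4" emits 16 bytes of padding using NOP
+/// instructions that are each at most 4 bytes long.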
+bool X86AsmParser::parseDirectiveNops(SMLoc L) {
+ int64_t NumBytes = 0, Control = 0;
+ SMLoc NumBytesLoc, ControlLoc;
+ const MCSubtargetInfo STI = getSTI();
+ NumBytesLoc = getTok().getLoc();
+ if (getParser().checkForValidSection() ||
+ getParser().parseAbsoluteExpression(NumBytes))
+ return true;
+
+ if (parseOptionalToken(AsmToken::Comma)) {
+ ControlLoc = getTok().getLoc();
+ if (getParser().parseAbsoluteExpression(Control))
+ return true;
+ }
+ if (getParser().parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.nops' directive"))
+ return true;
+
+ if (NumBytes <= 0) {
+ Error(NumBytesLoc, "'.nops' directive with non-positive size");
+ return false;
+ }
+
+ if (Control < 0) {
+ Error(ControlLoc, "'.nops' directive with negative NOP size");
+ return false;
+ }
+
+ /// Emit nops
+ getParser().getStreamer().emitNops(NumBytes, Control, L);
+
+ return false;
+}
+
+/// parseDirectiveEven
+/// ::= .even
+bool X86AsmParser::parseDirectiveEven(SMLoc L) {
+ if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive"))
+ return false;
+
+ const MCSection *Section = getStreamer().getCurrentSectionOnly();
+ if (!Section) {
+ getStreamer().InitSections(false);
+ Section = getStreamer().getCurrentSectionOnly();
+ }
+ if (Section->UseCodeAlign())
+ getStreamer().emitCodeAlignment(2, 0);
+ else
+ getStreamer().emitValueToAlignment(2, 0, 1, 0);
+ return false;
+}
+
+/// ParseDirectiveCode
+/// ::= .code16 | .code32 | .code64
+bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
+ MCAsmParser &Parser = getParser();
+ Code16GCC = false;
+ if (IDVal == ".code16") {
+ Parser.Lex();
+ if (!is16BitMode()) {
+ SwitchMode(X86::Mode16Bit);
+ getParser().getStreamer().emitAssemblerFlag(MCAF_Code16);
+ }
+ } else if (IDVal == ".code16gcc") {
+ // .code16gcc parses as if in 32-bit mode, but emits code in 16-bit mode.
+ Parser.Lex();
+ Code16GCC = true;
+ if (!is16BitMode()) {
+ SwitchMode(X86::Mode16Bit);
+ getParser().getStreamer().emitAssemblerFlag(MCAF_Code16);
+ }
+ } else if (IDVal == ".code32") {
+ Parser.Lex();
+ if (!is32BitMode()) {
+ SwitchMode(X86::Mode32Bit);
+ getParser().getStreamer().emitAssemblerFlag(MCAF_Code32);
+ }
+ } else if (IDVal == ".code64") {
+ Parser.Lex();
+ if (!is64BitMode()) {
+ SwitchMode(X86::Mode64Bit);
+ getParser().getStreamer().emitAssemblerFlag(MCAF_Code64);
+ }
+ } else {
+ Error(L, "unknown directive " + IDVal);
+ return false;
+ }
+
+ return false;
+}
+
+// .cv_fpo_proc foo
+bool X86AsmParser::parseDirectiveFPOProc(SMLoc L) {
+ MCAsmParser &Parser = getParser();
+ StringRef ProcName;
+ int64_t ParamsSize;
+ if (Parser.parseIdentifier(ProcName))
+ return Parser.TokError("expected symbol name");
+ if (Parser.parseIntToken(ParamsSize, "expected parameter byte count"))
+ return true;
+ if (!isUIntN(32, ParamsSize))
+ return Parser.TokError("parameters size out of range");
+ if (Parser.parseEOL("unexpected tokens"))
+ return addErrorSuffix(" in '.cv_fpo_proc' directive");
+ MCSymbol *ProcSym = getContext().getOrCreateSymbol(ProcName);
+ return getTargetStreamer().emitFPOProc(ProcSym, ParamsSize, L);
+}
+
+// .cv_fpo_setframe ebp
+bool X86AsmParser::parseDirectiveFPOSetFrame(SMLoc L) {
+ MCAsmParser &Parser = getParser();
+ unsigned Reg;
+ SMLoc DummyLoc;
+ if (ParseRegister(Reg, DummyLoc, DummyLoc) ||
+ Parser.parseEOL("unexpected tokens"))
+ return addErrorSuffix(" in '.cv_fpo_setframe' directive");
+ return getTargetStreamer().emitFPOSetFrame(Reg, L);
+}
+
+// .cv_fpo_pushreg ebx
+bool X86AsmParser::parseDirectiveFPOPushReg(SMLoc L) {
+ MCAsmParser &Parser = getParser();
+ unsigned Reg;
+ SMLoc DummyLoc;
+ if (ParseRegister(Reg, DummyLoc, DummyLoc) ||
+ Parser.parseEOL("unexpected tokens"))
+ return addErrorSuffix(" in '.cv_fpo_pushreg' directive");
+ return getTargetStreamer().emitFPOPushReg(Reg, L);
+}
+
+// .cv_fpo_stackalloc 20
+bool X86AsmParser::parseDirectiveFPOStackAlloc(SMLoc L) {
+ MCAsmParser &Parser = getParser();
+ int64_t Offset;
+ if (Parser.parseIntToken(Offset, "expected offset") ||
+ Parser.parseEOL("unexpected tokens"))
+ return addErrorSuffix(" in '.cv_fpo_stackalloc' directive");
+ return getTargetStreamer().emitFPOStackAlloc(Offset, L);
+}
+
+// .cv_fpo_stackalign 8
+bool X86AsmParser::parseDirectiveFPOStackAlign(SMLoc L) {
+ MCAsmParser &Parser = getParser();
+ int64_t Offset;
+ if (Parser.parseIntToken(Offset, "expected offset") ||
+ Parser.parseEOL("unexpected tokens"))
+ return addErrorSuffix(" in '.cv_fpo_stackalign' directive");
+ return getTargetStreamer().emitFPOStackAlign(Offset, L);
+}
+
+// .cv_fpo_endprologue
+bool X86AsmParser::parseDirectiveFPOEndPrologue(SMLoc L) {
+ MCAsmParser &Parser = getParser();
+ if (Parser.parseEOL("unexpected tokens"))
+ return addErrorSuffix(" in '.cv_fpo_endprologue' directive");
+ return getTargetStreamer().emitFPOEndPrologue(L);
+}
+
+// .cv_fpo_endproc
+bool X86AsmParser::parseDirectiveFPOEndProc(SMLoc L) {
+ MCAsmParser &Parser = getParser();
+ if (Parser.parseEOL("unexpected tokens"))
+ return addErrorSuffix(" in '.cv_fpo_endproc' directive");
+ return getTargetStreamer().emitFPOEndProc(L);
+}
+
+bool X86AsmParser::parseSEHRegisterNumber(unsigned RegClassID,
+ unsigned &RegNo) {
+ SMLoc startLoc = getLexer().getLoc();
+ const MCRegisterInfo *MRI = getContext().getRegisterInfo();
+
+ // Try parsing the argument as a register first.
+ if (getLexer().getTok().isNot(AsmToken::Integer)) {
+ SMLoc endLoc;
+ if (ParseRegister(RegNo, startLoc, endLoc))
+ return true;
+
+ if (!X86MCRegisterClasses[RegClassID].contains(RegNo)) {
+ return Error(startLoc,
+ "register is not supported for use with this directive");
+ }
+ } else {
+ // Otherwise, an integer number matching the encoding of the desired
+ // register may appear.
+ int64_t EncodedReg;
+ if (getParser().parseAbsoluteExpression(EncodedReg))
+ return true;
+
+ // The SEH register number is the same as the encoding register number. Map
+ // from the encoding back to the LLVM register number.
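+    // For example, '.seh_pushreg 5' resolves to RBP, whose hardware encoding
+    // is 5, and is therefore equivalent to naming the register directly.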
+ RegNo = 0;
+ for (MCPhysReg Reg : X86MCRegisterClasses[RegClassID]) {
+ if (MRI->getEncodingValue(Reg) == EncodedReg) {
+ RegNo = Reg;
+ break;
+ }
+ }
+ if (RegNo == 0) {
+ return Error(startLoc,
+ "incorrect register number for use with this directive");
+ }
+ }
+
+ return false;
+}
+
+bool X86AsmParser::parseDirectiveSEHPushReg(SMLoc Loc) {
+ unsigned Reg = 0;
+ if (parseSEHRegisterNumber(X86::GR64RegClassID, Reg))
+ return true;
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in directive");
+
+ getParser().Lex();
+ getStreamer().EmitWinCFIPushReg(Reg, Loc);
+ return false;
+}
+
+bool X86AsmParser::parseDirectiveSEHSetFrame(SMLoc Loc) {
+ unsigned Reg = 0;
+ int64_t Off;
+ if (parseSEHRegisterNumber(X86::GR64RegClassID, Reg))
+ return true;
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("you must specify a stack pointer offset");
+
+ getParser().Lex();
+ if (getParser().parseAbsoluteExpression(Off))
+ return true;
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in directive");
+
+ getParser().Lex();
+ getStreamer().EmitWinCFISetFrame(Reg, Off, Loc);
+ return false;
+}
+
+bool X86AsmParser::parseDirectiveSEHSaveReg(SMLoc Loc) {
+ unsigned Reg = 0;
+ int64_t Off;
+ if (parseSEHRegisterNumber(X86::GR64RegClassID, Reg))
+ return true;
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("you must specify an offset on the stack");
+
+ getParser().Lex();
+ if (getParser().parseAbsoluteExpression(Off))
+ return true;
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in directive");
+
+ getParser().Lex();
+ getStreamer().EmitWinCFISaveReg(Reg, Off, Loc);
+ return false;
+}
+
+bool X86AsmParser::parseDirectiveSEHSaveXMM(SMLoc Loc) {
+ unsigned Reg = 0;
+ int64_t Off;
+ if (parseSEHRegisterNumber(X86::VR128XRegClassID, Reg))
+ return true;
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("you must specify an offset on the stack");
+
+ getParser().Lex();
+ if (getParser().parseAbsoluteExpression(Off))
+ return true;
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in directive");
+
+ getParser().Lex();
+ getStreamer().EmitWinCFISaveXMM(Reg, Off, Loc);
+ return false;
+}
+
+bool X86AsmParser::parseDirectiveSEHPushFrame(SMLoc Loc) {
+ bool Code = false;
+ StringRef CodeID;
+ if (getLexer().is(AsmToken::At)) {
+ SMLoc startLoc = getLexer().getLoc();
+ getParser().Lex();
+ if (!getParser().parseIdentifier(CodeID)) {
+ if (CodeID != "code")
+ return Error(startLoc, "expected @code");
+ Code = true;
+ }
+ }
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in directive");
+
+ getParser().Lex();
+ getStreamer().EmitWinCFIPushFrame(Code, Loc);
+ return false;
+}
+
+// Force static initialization.
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86AsmParser() {
+ RegisterMCAsmParser<X86AsmParser> X(getTheX86_32Target());
+ RegisterMCAsmParser<X86AsmParser> Y(getTheX86_64Target());
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#define GET_SUBTARGET_FEATURE_NAME
+#include "X86GenAsmMatcher.inc"
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h
new file mode 100644
index 000000000000..e9be28ca77b0
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h
@@ -0,0 +1,44 @@
+//===-- X86AsmParserCommon.h - Common functions for X86AsmParser ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMPARSERCOMMON_H
+#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMPARSERCOMMON_H
+
+#include "llvm/Support/MathExtras.h"
+
+namespace llvm {
+
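+// The isImmSExt* predicates below check whether a value can be encoded as a
+// sign-extended 8-bit (or 32-bit) immediate of the indicated operand width.
+// For example, isImmSExti16i8Value accepts both -12 and 0xfff4 (the 16-bit
+// two's-complement form of -12) but rejects 0x1234, which no sign-extended
+// 8-bit immediate can represent.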
+inline bool isImmSExti16i8Value(uint64_t Value) {
+ return isInt<8>(Value) ||
+ (isUInt<16>(Value) && isInt<8>(static_cast<int16_t>(Value)));
+}
+
+inline bool isImmSExti32i8Value(uint64_t Value) {
+ return isInt<8>(Value) ||
+ (isUInt<32>(Value) && isInt<8>(static_cast<int32_t>(Value)));
+}
+
+inline bool isImmSExti64i8Value(uint64_t Value) {
+ return isInt<8>(Value);
+}
+
+inline bool isImmSExti64i32Value(uint64_t Value) {
+ return isInt<32>(Value);
+}
+
+inline bool isImmUnsignedi8Value(uint64_t Value) {
+ return isUInt<8>(Value) || isInt<8>(Value);
+}
+
+inline bool isImmUnsignedi4Value(uint64_t Value) {
+ return isUInt<4>(Value);
+}
+
+} // End of namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86Operand.h b/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86Operand.h
new file mode 100644
index 000000000000..e32335331879
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86Operand.h
@@ -0,0 +1,718 @@
+//===- X86Operand.h - Parsed X86 machine instruction ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
+#define LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
+
+#include "MCTargetDesc/X86IntelInstPrinter.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86AsmParserCommon.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/SMLoc.h"
+#include <cassert>
+#include <memory>
+
+namespace llvm {
+
+/// X86Operand - Instances of this class represent a parsed X86 machine
+/// instruction.
+struct X86Operand final : public MCParsedAsmOperand {
+ enum KindTy { Token, Register, Immediate, Memory, Prefix, DXRegister } Kind;
+
+ SMLoc StartLoc, EndLoc;
+ SMLoc OffsetOfLoc;
+ StringRef SymName;
+ void *OpDecl;
+ bool AddressOf;
+ bool CallOperand;
+
+ struct TokOp {
+ const char *Data;
+ unsigned Length;
+ };
+
+ struct RegOp {
+ unsigned RegNo;
+ };
+
+ struct PrefOp {
+ unsigned Prefixes;
+ };
+
+ struct ImmOp {
+ const MCExpr *Val;
+ bool LocalRef;
+ };
+
+ struct MemOp {
+ unsigned SegReg;
+ const MCExpr *Disp;
+ unsigned BaseReg;
+ unsigned DefaultBaseReg;
+ unsigned IndexReg;
+ unsigned Scale;
+ unsigned Size;
+ unsigned ModeSize;
+
+ /// If the memory operand is unsized and there are multiple instruction
+ /// matches, prefer the one with this size.
+ unsigned FrontendSize;
+ };
+
+ union {
+ struct TokOp Tok;
+ struct RegOp Reg;
+ struct ImmOp Imm;
+ struct MemOp Mem;
+ struct PrefOp Pref;
+ };
+
+ X86Operand(KindTy K, SMLoc Start, SMLoc End)
+ : Kind(K), StartLoc(Start), EndLoc(End), CallOperand(false) {}
+
+ StringRef getSymName() override { return SymName; }
+ void *getOpDecl() override { return OpDecl; }
+
+ /// getStartLoc - Get the location of the first token of this operand.
+ SMLoc getStartLoc() const override { return StartLoc; }
+
+ /// getEndLoc - Get the location of the last token of this operand.
+ SMLoc getEndLoc() const override { return EndLoc; }
+
+ /// getLocRange - Get the range between the first and last token of this
+ /// operand.
+ SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
+
+ /// getOffsetOfLoc - Get the location of the offset operator.
+ SMLoc getOffsetOfLoc() const override { return OffsetOfLoc; }
+
+ void print(raw_ostream &OS) const override {
+
+ auto PrintImmValue = [&](const MCExpr *Val, const char *VName) {
+ if (Val->getKind() == MCExpr::Constant) {
+ if (auto Imm = cast<MCConstantExpr>(Val)->getValue())
+ OS << VName << Imm;
+ } else if (Val->getKind() == MCExpr::SymbolRef) {
+ if (auto *SRE = dyn_cast<MCSymbolRefExpr>(Val)) {
+ const MCSymbol &Sym = SRE->getSymbol();
+ if (const char *SymNameStr = Sym.getName().data())
+ OS << VName << SymNameStr;
+ }
+ }
+ };
+
+ switch (Kind) {
+ case Token:
+ OS << Tok.Data;
+ break;
+ case Register:
+ OS << "Reg:" << X86IntelInstPrinter::getRegisterName(Reg.RegNo);
+ break;
+ case DXRegister:
+ OS << "DXReg";
+ break;
+ case Immediate:
+ PrintImmValue(Imm.Val, "Imm:");
+ break;
+ case Prefix:
+ OS << "Prefix:" << Pref.Prefixes;
+ break;
+ case Memory:
+ OS << "Memory: ModeSize=" << Mem.ModeSize;
+ if (Mem.Size)
+ OS << ",Size=" << Mem.Size;
+ if (Mem.BaseReg)
+ OS << ",BaseReg=" << X86IntelInstPrinter::getRegisterName(Mem.BaseReg);
+ if (Mem.IndexReg)
+ OS << ",IndexReg="
+ << X86IntelInstPrinter::getRegisterName(Mem.IndexReg);
+ if (Mem.Scale)
+ OS << ",Scale=" << Mem.Scale;
+ if (Mem.Disp)
+ PrintImmValue(Mem.Disp, ",Disp=");
+ if (Mem.SegReg)
+ OS << ",SegReg=" << X86IntelInstPrinter::getRegisterName(Mem.SegReg);
+ break;
+ }
+ }
+
+ StringRef getToken() const {
+ assert(Kind == Token && "Invalid access!");
+ return StringRef(Tok.Data, Tok.Length);
+ }
+ void setTokenValue(StringRef Value) {
+ assert(Kind == Token && "Invalid access!");
+ Tok.Data = Value.data();
+ Tok.Length = Value.size();
+ }
+
+ unsigned getReg() const override {
+ assert(Kind == Register && "Invalid access!");
+ return Reg.RegNo;
+ }
+
+ unsigned getPrefix() const {
+ assert(Kind == Prefix && "Invalid access!");
+ return Pref.Prefixes;
+ }
+
+ const MCExpr *getImm() const {
+ assert(Kind == Immediate && "Invalid access!");
+ return Imm.Val;
+ }
+
+ const MCExpr *getMemDisp() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.Disp;
+ }
+ unsigned getMemSegReg() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.SegReg;
+ }
+ unsigned getMemBaseReg() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.BaseReg;
+ }
+ unsigned getMemDefaultBaseReg() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.DefaultBaseReg;
+ }
+ unsigned getMemIndexReg() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.IndexReg;
+ }
+ unsigned getMemScale() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.Scale;
+ }
+ unsigned getMemModeSize() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.ModeSize;
+ }
+ unsigned getMemFrontendSize() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.FrontendSize;
+ }
+
+  bool isToken() const override { return Kind == Token; }
+
+ bool isImm() const override { return Kind == Immediate; }
+
+ bool isImmSExti16i8() const {
+ if (!isImm())
+ return false;
+
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return true;
+
+ // Otherwise, check the value is in a range that makes sense for this
+ // extension.
+ return isImmSExti16i8Value(CE->getValue());
+ }
+ bool isImmSExti32i8() const {
+ if (!isImm())
+ return false;
+
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return true;
+
+ // Otherwise, check the value is in a range that makes sense for this
+ // extension.
+ return isImmSExti32i8Value(CE->getValue());
+ }
+ bool isImmSExti64i8() const {
+ if (!isImm())
+ return false;
+
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return true;
+
+ // Otherwise, check the value is in a range that makes sense for this
+ // extension.
+ return isImmSExti64i8Value(CE->getValue());
+ }
+ bool isImmSExti64i32() const {
+ if (!isImm())
+ return false;
+
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return true;
+
+ // Otherwise, check the value is in a range that makes sense for this
+ // extension.
+ return isImmSExti64i32Value(CE->getValue());
+ }
+
+ bool isImmUnsignedi4() const {
+ if (!isImm()) return false;
+ // If this isn't a constant expr, reject it. The immediate byte is shared
+ // with a register encoding. We can't have it affected by a relocation.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ return isImmUnsignedi4Value(CE->getValue());
+ }
+
+ bool isImmUnsignedi8() const {
+ if (!isImm()) return false;
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return true;
+ return isImmUnsignedi8Value(CE->getValue());
+ }
+
+ bool isOffsetOfLocal() const override { return isImm() && Imm.LocalRef; }
+
+ bool needAddressOf() const override { return AddressOf; }
+
+ bool isMem() const override { return Kind == Memory; }
+ bool isMemUnsized() const {
+ return Kind == Memory && Mem.Size == 0;
+ }
+ bool isMem8() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 8);
+ }
+ bool isMem16() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 16);
+ }
+ bool isMem32() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 32);
+ }
+ bool isMem64() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 64);
+ }
+ bool isMem80() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 80);
+ }
+ bool isMem128() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 128);
+ }
+ bool isMem256() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 256);
+ }
+ bool isMem512() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 512);
+ }
+
+ bool isSibMem() const {
+ return isMem() && Mem.BaseReg != X86::RIP && Mem.BaseReg != X86::EIP;
+ }
+
+ bool isMemIndexReg(unsigned LowR, unsigned HighR) const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.IndexReg >= LowR && Mem.IndexReg <= HighR;
+ }
+
+ bool isMem64_RC128() const {
+ return isMem64() && isMemIndexReg(X86::XMM0, X86::XMM15);
+ }
+ bool isMem128_RC128() const {
+ return isMem128() && isMemIndexReg(X86::XMM0, X86::XMM15);
+ }
+ bool isMem128_RC256() const {
+ return isMem128() && isMemIndexReg(X86::YMM0, X86::YMM15);
+ }
+ bool isMem256_RC128() const {
+ return isMem256() && isMemIndexReg(X86::XMM0, X86::XMM15);
+ }
+ bool isMem256_RC256() const {
+ return isMem256() && isMemIndexReg(X86::YMM0, X86::YMM15);
+ }
+
+ bool isMem64_RC128X() const {
+ return isMem64() && isMemIndexReg(X86::XMM0, X86::XMM31);
+ }
+ bool isMem128_RC128X() const {
+ return isMem128() && isMemIndexReg(X86::XMM0, X86::XMM31);
+ }
+ bool isMem128_RC256X() const {
+ return isMem128() && isMemIndexReg(X86::YMM0, X86::YMM31);
+ }
+ bool isMem256_RC128X() const {
+ return isMem256() && isMemIndexReg(X86::XMM0, X86::XMM31);
+ }
+ bool isMem256_RC256X() const {
+ return isMem256() && isMemIndexReg(X86::YMM0, X86::YMM31);
+ }
+ bool isMem256_RC512() const {
+ return isMem256() && isMemIndexReg(X86::ZMM0, X86::ZMM31);
+ }
+ bool isMem512_RC256X() const {
+ return isMem512() && isMemIndexReg(X86::YMM0, X86::YMM31);
+ }
+ bool isMem512_RC512() const {
+ return isMem512() && isMemIndexReg(X86::ZMM0, X86::ZMM31);
+ }
+
+ bool isAbsMem() const {
+ return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
+ !getMemIndexReg() && getMemScale() == 1;
+ }
+  bool isAVX512RC() const {
+ return isImm();
+ }
+
+ bool isAbsMem16() const {
+ return isAbsMem() && Mem.ModeSize == 16;
+ }
+
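+  // The SrcIdx/DstIdx predicates match the implicit source and destination
+  // operands of the string instructions (movs, lods, stos, cmps, ...): a bare
+  // SI/ESI/RSI or DI/EDI/RDI base register with no index, scale 1, and a zero
+  // displacement.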
+ bool isSrcIdx() const {
+ return !getMemIndexReg() && getMemScale() == 1 &&
+ (getMemBaseReg() == X86::RSI || getMemBaseReg() == X86::ESI ||
+ getMemBaseReg() == X86::SI) && isa<MCConstantExpr>(getMemDisp()) &&
+ cast<MCConstantExpr>(getMemDisp())->getValue() == 0;
+ }
+ bool isSrcIdx8() const {
+ return isMem8() && isSrcIdx();
+ }
+ bool isSrcIdx16() const {
+ return isMem16() && isSrcIdx();
+ }
+ bool isSrcIdx32() const {
+ return isMem32() && isSrcIdx();
+ }
+ bool isSrcIdx64() const {
+ return isMem64() && isSrcIdx();
+ }
+
+ bool isDstIdx() const {
+ return !getMemIndexReg() && getMemScale() == 1 &&
+ (getMemSegReg() == 0 || getMemSegReg() == X86::ES) &&
+ (getMemBaseReg() == X86::RDI || getMemBaseReg() == X86::EDI ||
+ getMemBaseReg() == X86::DI) && isa<MCConstantExpr>(getMemDisp()) &&
+ cast<MCConstantExpr>(getMemDisp())->getValue() == 0;
+ }
+ bool isDstIdx8() const {
+ return isMem8() && isDstIdx();
+ }
+ bool isDstIdx16() const {
+ return isMem16() && isDstIdx();
+ }
+ bool isDstIdx32() const {
+ return isMem32() && isDstIdx();
+ }
+ bool isDstIdx64() const {
+ return isMem64() && isDstIdx();
+ }
+
+ bool isMemOffs() const {
+ return Kind == Memory && !getMemBaseReg() && !getMemIndexReg() &&
+ getMemScale() == 1;
+ }
+
+ bool isMemOffs16_8() const {
+ return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 8);
+ }
+ bool isMemOffs16_16() const {
+ return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 16);
+ }
+ bool isMemOffs16_32() const {
+ return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 32);
+ }
+ bool isMemOffs32_8() const {
+ return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 8);
+ }
+ bool isMemOffs32_16() const {
+ return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 16);
+ }
+ bool isMemOffs32_32() const {
+ return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 32);
+ }
+ bool isMemOffs32_64() const {
+ return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 64);
+ }
+ bool isMemOffs64_8() const {
+ return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 8);
+ }
+ bool isMemOffs64_16() const {
+ return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 16);
+ }
+ bool isMemOffs64_32() const {
+ return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 32);
+ }
+ bool isMemOffs64_64() const {
+ return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 64);
+ }
+
+ bool isPrefix() const { return Kind == Prefix; }
+ bool isReg() const override { return Kind == Register; }
+ bool isDXReg() const { return Kind == DXRegister; }
+
+ bool isGR32orGR64() const {
+ return Kind == Register &&
+ (X86MCRegisterClasses[X86::GR32RegClassID].contains(getReg()) ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg()));
+ }
+
+ bool isGR16orGR32orGR64() const {
+ return Kind == Register &&
+ (X86MCRegisterClasses[X86::GR16RegClassID].contains(getReg()) ||
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(getReg()) ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg()));
+ }
+
+ bool isVectorReg() const {
+ return Kind == Register &&
+ (X86MCRegisterClasses[X86::VR64RegClassID].contains(getReg()) ||
+ X86MCRegisterClasses[X86::VR128XRegClassID].contains(getReg()) ||
+ X86MCRegisterClasses[X86::VR256XRegClassID].contains(getReg()) ||
+ X86MCRegisterClasses[X86::VR512RegClassID].contains(getReg()));
+ }
+
+ bool isVK1Pair() const {
+ return Kind == Register &&
+ X86MCRegisterClasses[X86::VK1RegClassID].contains(getReg());
+ }
+
+ bool isVK2Pair() const {
+ return Kind == Register &&
+ X86MCRegisterClasses[X86::VK2RegClassID].contains(getReg());
+ }
+
+ bool isVK4Pair() const {
+ return Kind == Register &&
+ X86MCRegisterClasses[X86::VK4RegClassID].contains(getReg());
+ }
+
+ bool isVK8Pair() const {
+ return Kind == Register &&
+ X86MCRegisterClasses[X86::VK8RegClassID].contains(getReg());
+ }
+
+ bool isVK16Pair() const {
+ return Kind == Register &&
+ X86MCRegisterClasses[X86::VK16RegClassID].contains(getReg());
+ }
+
+ void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+ // Add as immediates when possible.
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+ Inst.addOperand(MCOperand::createImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::createExpr(Expr));
+ }
+
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getReg()));
+ }
+
+ void addGR32orGR64Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ MCRegister RegNo = getReg();
+ if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo))
+ RegNo = getX86SubSuperRegister(RegNo, 32);
+ Inst.addOperand(MCOperand::createReg(RegNo));
+ }
+
+ void addGR16orGR32orGR64Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ MCRegister RegNo = getReg();
+ if (X86MCRegisterClasses[X86::GR32RegClassID].contains(RegNo) ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo))
+ RegNo = getX86SubSuperRegister(RegNo, 16);
+ Inst.addOperand(MCOperand::createReg(RegNo));
+ }
+
+ void addAVX512RCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addMaskPairOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ unsigned Reg = getReg();
+ switch (Reg) {
+ case X86::K0:
+ case X86::K1:
+ Reg = X86::K0_K1;
+ break;
+ case X86::K2:
+ case X86::K3:
+ Reg = X86::K2_K3;
+ break;
+ case X86::K4:
+ case X86::K5:
+ Reg = X86::K4_K5;
+ break;
+ case X86::K6:
+ case X86::K7:
+ Reg = X86::K6_K7;
+ break;
+ }
+ Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ void addMemOperands(MCInst &Inst, unsigned N) const {
+ assert((N == 5) && "Invalid number of operands!");
+ if (getMemBaseReg())
+ Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
+ else
+ Inst.addOperand(MCOperand::createReg(getMemDefaultBaseReg()));
+ Inst.addOperand(MCOperand::createImm(getMemScale()));
+ Inst.addOperand(MCOperand::createReg(getMemIndexReg()));
+ addExpr(Inst, getMemDisp());
+ Inst.addOperand(MCOperand::createReg(getMemSegReg()));
+ }
+
+ void addAbsMemOperands(MCInst &Inst, unsigned N) const {
+ assert((N == 1) && "Invalid number of operands!");
+ // Add as immediates when possible.
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp()))
+ Inst.addOperand(MCOperand::createImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::createExpr(getMemDisp()));
+ }
+
+ void addSrcIdxOperands(MCInst &Inst, unsigned N) const {
+ assert((N == 2) && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
+ Inst.addOperand(MCOperand::createReg(getMemSegReg()));
+ }
+
+ void addDstIdxOperands(MCInst &Inst, unsigned N) const {
+ assert((N == 1) && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
+ }
+
+ void addMemOffsOperands(MCInst &Inst, unsigned N) const {
+ assert((N == 2) && "Invalid number of operands!");
+ // Add as immediates when possible.
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp()))
+ Inst.addOperand(MCOperand::createImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::createExpr(getMemDisp()));
+ Inst.addOperand(MCOperand::createReg(getMemSegReg()));
+ }
+
+ static std::unique_ptr<X86Operand> CreateToken(StringRef Str, SMLoc Loc) {
+ SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size());
+ auto Res = std::make_unique<X86Operand>(Token, Loc, EndLoc);
+ Res->Tok.Data = Str.data();
+ Res->Tok.Length = Str.size();
+ return Res;
+ }
+
+ static std::unique_ptr<X86Operand>
+ CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc,
+ bool AddressOf = false, SMLoc OffsetOfLoc = SMLoc(),
+ StringRef SymName = StringRef(), void *OpDecl = nullptr) {
+ auto Res = std::make_unique<X86Operand>(Register, StartLoc, EndLoc);
+ Res->Reg.RegNo = RegNo;
+ Res->AddressOf = AddressOf;
+ Res->OffsetOfLoc = OffsetOfLoc;
+ Res->SymName = SymName;
+ Res->OpDecl = OpDecl;
+ return Res;
+ }
+
+ static std::unique_ptr<X86Operand>
+ CreateDXReg(SMLoc StartLoc, SMLoc EndLoc) {
+ return std::make_unique<X86Operand>(DXRegister, StartLoc, EndLoc);
+ }
+
+ static std::unique_ptr<X86Operand>
+ CreatePrefix(unsigned Prefixes, SMLoc StartLoc, SMLoc EndLoc) {
+ auto Res = std::make_unique<X86Operand>(Prefix, StartLoc, EndLoc);
+ Res->Pref.Prefixes = Prefixes;
+ return Res;
+ }
+
+ static std::unique_ptr<X86Operand> CreateImm(const MCExpr *Val,
+ SMLoc StartLoc, SMLoc EndLoc,
+ StringRef SymName = StringRef(),
+ void *OpDecl = nullptr,
+ bool GlobalRef = true) {
+ auto Res = std::make_unique<X86Operand>(Immediate, StartLoc, EndLoc);
+ Res->Imm.Val = Val;
+ Res->Imm.LocalRef = !GlobalRef;
+ Res->SymName = SymName;
+ Res->OpDecl = OpDecl;
+ Res->AddressOf = true;
+ return Res;
+ }
+
+ /// Create an absolute memory operand.
+ static std::unique_ptr<X86Operand>
+ CreateMem(unsigned ModeSize, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc,
+ unsigned Size = 0, StringRef SymName = StringRef(),
+ void *OpDecl = nullptr, unsigned FrontendSize = 0) {
+ auto Res = std::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
+ Res->Mem.SegReg = 0;
+ Res->Mem.Disp = Disp;
+ Res->Mem.BaseReg = 0;
+ Res->Mem.DefaultBaseReg = 0;
+ Res->Mem.IndexReg = 0;
+ Res->Mem.Scale = 1;
+ Res->Mem.Size = Size;
+ Res->Mem.ModeSize = ModeSize;
+ Res->Mem.FrontendSize = FrontendSize;
+ Res->SymName = SymName;
+ Res->OpDecl = OpDecl;
+ Res->AddressOf = false;
+ return Res;
+ }
+
+ /// Create a generalized memory operand.
+ static std::unique_ptr<X86Operand>
+ CreateMem(unsigned ModeSize, unsigned SegReg, const MCExpr *Disp,
+ unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc StartLoc,
+ SMLoc EndLoc, unsigned Size = 0,
+ unsigned DefaultBaseReg = X86::NoRegister,
+ StringRef SymName = StringRef(), void *OpDecl = nullptr,
+ unsigned FrontendSize = 0) {
+    // We should never have just a displacement; that should be parsed as an
+    // absolute memory operand.
+ assert((SegReg || BaseReg || IndexReg || DefaultBaseReg) &&
+ "Invalid memory operand!");
+
+ // The scale should always be one of {1,2,4,8}.
+ assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) &&
+ "Invalid scale!");
+ auto Res = std::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
+ Res->Mem.SegReg = SegReg;
+ Res->Mem.Disp = Disp;
+ Res->Mem.BaseReg = BaseReg;
+ Res->Mem.DefaultBaseReg = DefaultBaseReg;
+ Res->Mem.IndexReg = IndexReg;
+ Res->Mem.Scale = Scale;
+ Res->Mem.Size = Size;
+ Res->Mem.ModeSize = ModeSize;
+ Res->Mem.FrontendSize = FrontendSize;
+ Res->SymName = SymName;
+ Res->OpDecl = OpDecl;
+ Res->AddressOf = false;
+ return Res;
+ }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
new file mode 100644
index 000000000000..05e482a6b66e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -0,0 +1,2362 @@
+//===-- X86Disassembler.cpp - Disassembler for x86 and x86_64 -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains code to translate the data produced by the decoder into
+// MCInsts.
+//
+//
+// The X86 disassembler is a table-driven disassembler for the 16-, 32-, and
+// 64-bit X86 instruction sets. The main decode sequence for an assembly
+// instruction in this disassembler is:
+//
+// 1. Read the prefix bytes and determine the attributes of the instruction.
+// These attributes, recorded in enum attributeBits
+// (X86DisassemblerDecoderCommon.h), form a bitmask. The table CONTEXTS_SYM
+// provides a mapping from bitmasks to contexts, which are represented by
+// enum InstructionContext (ibid.).
+//
+// 2. Read the opcode, and determine what kind of opcode it is. The
+// disassembler distinguishes four kinds of opcodes, which are enumerated in
+// OpcodeType (X86DisassemblerDecoderCommon.h): one-byte (0xnn), two-byte
+// (0x0f 0xnn), three-byte-38 (0x0f 0x38 0xnn), or three-byte-3a
+// (0x0f 0x3a 0xnn). Mandatory prefixes are treated as part of the context.
+//
+// 3. Depending on the opcode type, look in one of four ClassDecision structures
+// (X86DisassemblerDecoderCommon.h). Use the opcode class to determine which
+//    OpcodeDecision (ibid.) to look the opcode up in, then look up the opcode
+//    to get a ModRMDecision (ibid.).
+//
+// 4. Some instructions, such as escape opcodes or extended opcodes, or even
+// instructions that have ModRM*Reg / ModRM*Mem forms in LLVM, need the
+// ModR/M byte to complete decode. The ModRMDecision's type is an entry from
+// ModRMDecisionType (X86DisassemblerDecoderCommon.h) that indicates if the
+// ModR/M byte is required and how to interpret it.
+//
+// 5. After resolving the ModRMDecision, the disassembler has a unique ID
+// of type InstrUID (X86DisassemblerDecoderCommon.h). Looking this ID up in
+// INSTRUCTIONS_SYM yields the name of the instruction and the encodings and
+// meanings of its operands.
+//
+// 6. For each operand, its encoding is an entry from OperandEncoding
+// (X86DisassemblerDecoderCommon.h) and its type is an entry from
+// OperandType (ibid.). The encoding indicates how to read it from the
+// instruction; the type indicates how to interpret the value once it has
+// been read. For example, a register operand could be stored in the R/M
+// field of the ModR/M byte, the REG field of the ModR/M byte, or added to
+// the main opcode. This is orthogonal from its meaning (an GPR or an XMM
+// register, for instance). Given this information, the operands can be
+// extracted and interpreted.
+//
+// 7. As the last step, the disassembler translates the instruction information
+// and operands into a format understandable by the client - in this case, an
+// MCInst for use by the MC infrastructure.
+//
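+// As a rough illustration of this sequence, consider the bytes 66 0f 58 c1 in
+// 64-bit mode: 0x66 is consumed as an operand-size override and, because it
+// immediately precedes the 0x0f escape, also becomes the mandatory prefix that
+// feeds into the instruction context (step 1); 0x0f 0x58 selects opcode 0x58
+// in the two-byte map (step 2); the ModRMDecision for that context and opcode
+// requires the ModR/M byte, so 0xc1 (mod=0b11) is read and resolves to the
+// InstrUID of the register form of ADDPD (steps 3-5); the reg and R/M fields
+// (0 and 1) are fixed up to XMM0 and XMM1 (step 6); and the result is emitted
+// as an MCInst, roughly ADDPDrr with XMM0 as destination and XMM1 as source
+// (step 7).
+//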
+// The disassembler is broken broadly into two parts: the table emitter that
+// emits the instruction decode tables discussed above during compilation, and
+// the disassembler itself. The table emitter is documented in more detail in
+// utils/TableGen/X86DisassemblerEmitter.h.
+//
+// X86Disassembler.cpp contains the code responsible for step 7, and for
+// invoking the decoder to execute steps 1-6.
+// X86DisassemblerDecoderCommon.h contains the definitions needed by both the
+// table emitter and the disassembler.
+// X86DisassemblerDecoder.h contains the public interface of the decoder,
+// factored out into C for possible use by other projects.
+// X86DisassemblerDecoder.c contains the source code of the decoder, which is
+// responsible for steps 1-6.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "TargetInfo/X86TargetInfo.h"
+#include "X86DisassemblerDecoder.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::X86Disassembler;
+
+#define DEBUG_TYPE "x86-disassembler"
+
+#define debug(s) LLVM_DEBUG(dbgs() << __LINE__ << ": " << s);
+
+// Specifies whether a ModR/M byte is needed and (if so) which
+// instruction each possible value of the ModR/M byte corresponds to. Once
+// this information is known, we have narrowed down to a single instruction.
+struct ModRMDecision {
+ uint8_t modrm_type;
+ uint16_t instructionIDs;
+};
+
+// Specifies which set of ModR/M->instruction tables to look at
+// given a particular opcode.
+struct OpcodeDecision {
+ ModRMDecision modRMDecisions[256];
+};
+
+// Specifies which opcode->instruction tables to look at given
+// a particular context (set of attributes). Since there are many possible
+// contexts, the decoder first uses CONTEXTS_SYM to determine which context
+// applies given a specific set of attributes. Hence there are only IC_max
+// entries in this table, rather than 2^(ATTR_max).
+struct ContextDecision {
+ OpcodeDecision opcodeDecisions[IC_max];
+};
+
+#include "X86GenDisassemblerTables.inc"
+
+static InstrUID decode(OpcodeType type, InstructionContext insnContext,
+ uint8_t opcode, uint8_t modRM) {
+ const struct ModRMDecision *dec;
+
+ switch (type) {
+ case ONEBYTE:
+ dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case TWOBYTE:
+ dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case THREEBYTE_38:
+ dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case THREEBYTE_3A:
+ dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case XOP8_MAP:
+ dec = &XOP8_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case XOP9_MAP:
+ dec = &XOP9_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case XOPA_MAP:
+ dec = &XOPA_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case THREEDNOW_MAP:
+ dec =
+ &THREEDNOW_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ }
+
+ switch (dec->modrm_type) {
+ default:
+ llvm_unreachable("Corrupt table! Unknown modrm_type");
+ return 0;
+ case MODRM_ONEENTRY:
+ return modRMTable[dec->instructionIDs];
+ case MODRM_SPLITRM:
+ if (modFromModRM(modRM) == 0x3)
+ return modRMTable[dec->instructionIDs + 1];
+ return modRMTable[dec->instructionIDs];
+ case MODRM_SPLITREG:
+ if (modFromModRM(modRM) == 0x3)
+ return modRMTable[dec->instructionIDs + ((modRM & 0x38) >> 3) + 8];
+ return modRMTable[dec->instructionIDs + ((modRM & 0x38) >> 3)];
+ case MODRM_SPLITMISC:
+ if (modFromModRM(modRM) == 0x3)
+ return modRMTable[dec->instructionIDs + (modRM & 0x3f) + 8];
+ return modRMTable[dec->instructionIDs + ((modRM & 0x38) >> 3)];
+ case MODRM_FULL:
+ return modRMTable[dec->instructionIDs + modRM];
+ }
+}
+
+static bool peek(struct InternalInstruction *insn, uint8_t &byte) {
+ uint64_t offset = insn->readerCursor - insn->startLocation;
+ if (offset >= insn->bytes.size())
+ return true;
+ byte = insn->bytes[offset];
+ return false;
+}
+
+template <typename T> static bool consume(InternalInstruction *insn, T &ptr) {
+ auto r = insn->bytes;
+ uint64_t offset = insn->readerCursor - insn->startLocation;
+ if (offset + sizeof(T) > r.size())
+ return true;
+ T ret = 0;
+ for (unsigned i = 0; i < sizeof(T); ++i)
+ ret |= (uint64_t)r[offset + i] << (i * 8);
+ ptr = ret;
+ insn->readerCursor += sizeof(T);
+ return false;
+}
+
+static bool isREX(struct InternalInstruction *insn, uint8_t prefix) {
+ return insn->mode == MODE_64BIT && prefix >= 0x40 && prefix <= 0x4f;
+}
+
+// Consumes all of an instruction's prefix bytes, and marks the
+// instruction as having them. Also sets the instruction's default operand,
+// address, and other relevant data sizes to report operands correctly.
+//
+// insn must not be empty.
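+//
+// For example, given the bytes 66 67 8b 07 in 32-bit mode, both the
+// operand-size (0x66) and address-size (0x67) overrides are consumed here,
+// leaving registerSize, immediateSize, addressSize and displacementSize all
+// at 2 before the opcode byte 0x8b is read.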
+static int readPrefixes(struct InternalInstruction *insn) {
+ bool isPrefix = true;
+ uint8_t byte = 0;
+ uint8_t nextByte;
+
+ LLVM_DEBUG(dbgs() << "readPrefixes()");
+
+ while (isPrefix) {
+ // If we fail reading prefixes, just stop here and let the opcode reader
+ // deal with it.
+ if (consume(insn, byte))
+ break;
+
+ // If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then
+ // break and let it be disassembled as a normal "instruction".
+ if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0) // LOCK
+ break;
+
+ if ((byte == 0xf2 || byte == 0xf3) && !peek(insn, nextByte)) {
+ // If the byte is 0xf2 or 0xf3, and any of the following conditions are
+ // met:
+ // - it is followed by a LOCK (0xf0) prefix
+ // - it is followed by an xchg instruction
+ // then it should be disassembled as a xacquire/xrelease not repne/rep.
+ if (((nextByte == 0xf0) ||
+ ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90))) {
+ insn->xAcquireRelease = true;
+ if (!(byte == 0xf3 && nextByte == 0x90)) // PAUSE instruction support
+ break;
+ }
+ // Also if the byte is 0xf3, and the following condition is met:
+ // - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or
+ // "mov mem, imm" (opcode 0xc6/0xc7) instructions.
+ // then it should be disassembled as an xrelease not rep.
+ if (byte == 0xf3 && (nextByte == 0x88 || nextByte == 0x89 ||
+ nextByte == 0xc6 || nextByte == 0xc7)) {
+ insn->xAcquireRelease = true;
+ break;
+ }
+ if (isREX(insn, nextByte)) {
+ uint8_t nnextByte;
+ // Go to REX prefix after the current one
+ if (consume(insn, nnextByte))
+ return -1;
+ // We should be able to read next byte after REX prefix
+ if (peek(insn, nnextByte))
+ return -1;
+ --insn->readerCursor;
+ }
+ }
+
+ switch (byte) {
+ case 0xf0: // LOCK
+ insn->hasLockPrefix = true;
+ break;
+ case 0xf2: // REPNE/REPNZ
+ case 0xf3: { // REP or REPE/REPZ
+ uint8_t nextByte;
+ if (peek(insn, nextByte))
+ break;
+ // TODO:
+ // 1. There could be several 0x66
+ // 2. if (nextByte == 0x66) and nextNextByte != 0x0f then
+ // it's not mandatory prefix
+ // 3. if (nextByte >= 0x40 && nextByte <= 0x4f) it's REX and we need
+ // 0x0f exactly after it to be mandatory prefix
+ if (isREX(insn, nextByte) || nextByte == 0x0f || nextByte == 0x66)
+        // The last of 0xf2/0xf3 is the mandatory prefix.
+ insn->mandatoryPrefix = byte;
+ insn->repeatPrefix = byte;
+ break;
+ }
+ case 0x2e: // CS segment override -OR- Branch not taken
+ insn->segmentOverride = SEG_OVERRIDE_CS;
+ break;
+ case 0x36: // SS segment override -OR- Branch taken
+ insn->segmentOverride = SEG_OVERRIDE_SS;
+ break;
+ case 0x3e: // DS segment override
+ insn->segmentOverride = SEG_OVERRIDE_DS;
+ break;
+ case 0x26: // ES segment override
+ insn->segmentOverride = SEG_OVERRIDE_ES;
+ break;
+ case 0x64: // FS segment override
+ insn->segmentOverride = SEG_OVERRIDE_FS;
+ break;
+ case 0x65: // GS segment override
+ insn->segmentOverride = SEG_OVERRIDE_GS;
+ break;
+    case 0x66: { // Operand-size override
+ uint8_t nextByte;
+ insn->hasOpSize = true;
+ if (peek(insn, nextByte))
+ break;
+ // 0x66 can't overwrite existing mandatory prefix and should be ignored
+ if (!insn->mandatoryPrefix && (nextByte == 0x0f || isREX(insn, nextByte)))
+ insn->mandatoryPrefix = byte;
+ break;
+ }
+ case 0x67: // Address-size override
+ insn->hasAdSize = true;
+ break;
+ default: // Not a prefix byte
+ isPrefix = false;
+ break;
+ }
+
+ if (isPrefix)
+ LLVM_DEBUG(dbgs() << format("Found prefix 0x%hhx", byte));
+ }
+
+ insn->vectorExtensionType = TYPE_NO_VEX_XOP;
+
+ if (byte == 0x62) {
+ uint8_t byte1, byte2;
+ if (consume(insn, byte1)) {
+ LLVM_DEBUG(dbgs() << "Couldn't read second byte of EVEX prefix");
+ return -1;
+ }
+
+ if (peek(insn, byte2)) {
+ LLVM_DEBUG(dbgs() << "Couldn't read third byte of EVEX prefix");
+ return -1;
+ }
+
+ if ((insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) &&
+ ((~byte1 & 0xc) == 0xc) && ((byte2 & 0x4) == 0x4)) {
+ insn->vectorExtensionType = TYPE_EVEX;
+ } else {
+ --insn->readerCursor; // unconsume byte1
+ --insn->readerCursor; // unconsume byte
+ }
+
+ if (insn->vectorExtensionType == TYPE_EVEX) {
+ insn->vectorExtensionPrefix[0] = byte;
+ insn->vectorExtensionPrefix[1] = byte1;
+ if (consume(insn, insn->vectorExtensionPrefix[2])) {
+ LLVM_DEBUG(dbgs() << "Couldn't read third byte of EVEX prefix");
+ return -1;
+ }
+ if (consume(insn, insn->vectorExtensionPrefix[3])) {
+ LLVM_DEBUG(dbgs() << "Couldn't read fourth byte of EVEX prefix");
+ return -1;
+ }
+
+ // We simulate the REX prefix for simplicity's sake
+ if (insn->mode == MODE_64BIT) {
+ insn->rexPrefix = 0x40 |
+ (wFromEVEX3of4(insn->vectorExtensionPrefix[2]) << 3) |
+ (rFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 2) |
+ (xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 1) |
+ (bFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 0);
+ }
+
+ LLVM_DEBUG(
+ dbgs() << format(
+ "Found EVEX prefix 0x%hhx 0x%hhx 0x%hhx 0x%hhx",
+ insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
+ insn->vectorExtensionPrefix[2], insn->vectorExtensionPrefix[3]));
+ }
+ } else if (byte == 0xc4) {
+ uint8_t byte1;
+ if (peek(insn, byte1)) {
+ LLVM_DEBUG(dbgs() << "Couldn't read second byte of VEX");
+ return -1;
+ }
+
+ if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0)
+ insn->vectorExtensionType = TYPE_VEX_3B;
+ else
+ --insn->readerCursor;
+
+ if (insn->vectorExtensionType == TYPE_VEX_3B) {
+ insn->vectorExtensionPrefix[0] = byte;
+ consume(insn, insn->vectorExtensionPrefix[1]);
+ consume(insn, insn->vectorExtensionPrefix[2]);
+
+ // We simulate the REX prefix for simplicity's sake
+
+ if (insn->mode == MODE_64BIT)
+ insn->rexPrefix = 0x40 |
+ (wFromVEX3of3(insn->vectorExtensionPrefix[2]) << 3) |
+ (rFromVEX2of3(insn->vectorExtensionPrefix[1]) << 2) |
+ (xFromVEX2of3(insn->vectorExtensionPrefix[1]) << 1) |
+ (bFromVEX2of3(insn->vectorExtensionPrefix[1]) << 0);
+
+ LLVM_DEBUG(dbgs() << format("Found VEX prefix 0x%hhx 0x%hhx 0x%hhx",
+ insn->vectorExtensionPrefix[0],
+ insn->vectorExtensionPrefix[1],
+ insn->vectorExtensionPrefix[2]));
+ }
+ } else if (byte == 0xc5) {
+ uint8_t byte1;
+ if (peek(insn, byte1)) {
+ LLVM_DEBUG(dbgs() << "Couldn't read second byte of VEX");
+ return -1;
+ }
+
+ if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0)
+ insn->vectorExtensionType = TYPE_VEX_2B;
+ else
+ --insn->readerCursor;
+
+ if (insn->vectorExtensionType == TYPE_VEX_2B) {
+ insn->vectorExtensionPrefix[0] = byte;
+ consume(insn, insn->vectorExtensionPrefix[1]);
+
+ if (insn->mode == MODE_64BIT)
+ insn->rexPrefix =
+ 0x40 | (rFromVEX2of2(insn->vectorExtensionPrefix[1]) << 2);
+
+ switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) {
+ default:
+ break;
+ case VEX_PREFIX_66:
+ insn->hasOpSize = true;
+ break;
+ }
+
+ LLVM_DEBUG(dbgs() << format("Found VEX prefix 0x%hhx 0x%hhx",
+ insn->vectorExtensionPrefix[0],
+ insn->vectorExtensionPrefix[1]));
+ }
+ } else if (byte == 0x8f) {
+ uint8_t byte1;
+ if (peek(insn, byte1)) {
+ LLVM_DEBUG(dbgs() << "Couldn't read second byte of XOP");
+ return -1;
+ }
+
+ if ((byte1 & 0x38) != 0x0) // 0 in these 3 bits is a POP instruction.
+ insn->vectorExtensionType = TYPE_XOP;
+ else
+ --insn->readerCursor;
+
+ if (insn->vectorExtensionType == TYPE_XOP) {
+ insn->vectorExtensionPrefix[0] = byte;
+ consume(insn, insn->vectorExtensionPrefix[1]);
+ consume(insn, insn->vectorExtensionPrefix[2]);
+
+ // We simulate the REX prefix for simplicity's sake
+
+ if (insn->mode == MODE_64BIT)
+ insn->rexPrefix = 0x40 |
+ (wFromXOP3of3(insn->vectorExtensionPrefix[2]) << 3) |
+ (rFromXOP2of3(insn->vectorExtensionPrefix[1]) << 2) |
+ (xFromXOP2of3(insn->vectorExtensionPrefix[1]) << 1) |
+ (bFromXOP2of3(insn->vectorExtensionPrefix[1]) << 0);
+
+ switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) {
+ default:
+ break;
+ case VEX_PREFIX_66:
+ insn->hasOpSize = true;
+ break;
+ }
+
+ LLVM_DEBUG(dbgs() << format("Found XOP prefix 0x%hhx 0x%hhx 0x%hhx",
+ insn->vectorExtensionPrefix[0],
+ insn->vectorExtensionPrefix[1],
+ insn->vectorExtensionPrefix[2]));
+ }
+ } else if (isREX(insn, byte)) {
+ if (peek(insn, nextByte))
+ return -1;
+ insn->rexPrefix = byte;
+ LLVM_DEBUG(dbgs() << format("Found REX prefix 0x%hhx", byte));
+ } else
+ --insn->readerCursor;
+
+ if (insn->mode == MODE_16BIT) {
+ insn->registerSize = (insn->hasOpSize ? 4 : 2);
+ insn->addressSize = (insn->hasAdSize ? 4 : 2);
+ insn->displacementSize = (insn->hasAdSize ? 4 : 2);
+ insn->immediateSize = (insn->hasOpSize ? 4 : 2);
+ } else if (insn->mode == MODE_32BIT) {
+ insn->registerSize = (insn->hasOpSize ? 2 : 4);
+ insn->addressSize = (insn->hasAdSize ? 2 : 4);
+ insn->displacementSize = (insn->hasAdSize ? 2 : 4);
+ insn->immediateSize = (insn->hasOpSize ? 2 : 4);
+ } else if (insn->mode == MODE_64BIT) {
+ if (insn->rexPrefix && wFromREX(insn->rexPrefix)) {
+ insn->registerSize = 8;
+ insn->addressSize = (insn->hasAdSize ? 4 : 8);
+ insn->displacementSize = 4;
+ insn->immediateSize = 4;
+ } else {
+ insn->registerSize = (insn->hasOpSize ? 2 : 4);
+ insn->addressSize = (insn->hasAdSize ? 4 : 8);
+ insn->displacementSize = (insn->hasOpSize ? 2 : 4);
+ insn->immediateSize = (insn->hasOpSize ? 2 : 4);
+ }
+ }
+
+ return 0;
+}
+
+// Consumes the SIB byte to determine addressing information.
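+//
+// For example, with a 32-bit address size a SIB byte of 0x4c (scale=0b01,
+// index=0b001, base=0b100) denotes [esp + ecx*2], with any displacement still
+// governed by the mod field of the ModR/M byte.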
+static int readSIB(struct InternalInstruction *insn) {
+ SIBBase sibBaseBase = SIB_BASE_NONE;
+ uint8_t index, base;
+
+ LLVM_DEBUG(dbgs() << "readSIB()");
+ switch (insn->addressSize) {
+ case 2:
+ default:
+ llvm_unreachable("SIB-based addressing doesn't work in 16-bit mode");
+ case 4:
+ insn->sibIndexBase = SIB_INDEX_EAX;
+ sibBaseBase = SIB_BASE_EAX;
+ break;
+ case 8:
+ insn->sibIndexBase = SIB_INDEX_RAX;
+ sibBaseBase = SIB_BASE_RAX;
+ break;
+ }
+
+ if (consume(insn, insn->sib))
+ return -1;
+
+ index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3);
+
+ if (index == 0x4) {
+ insn->sibIndex = SIB_INDEX_NONE;
+ } else {
+ insn->sibIndex = (SIBIndex)(insn->sibIndexBase + index);
+ }
+
+ insn->sibScale = 1 << scaleFromSIB(insn->sib);
+
+ base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3);
+
+ switch (base) {
+ case 0x5:
+ case 0xd:
+ switch (modFromModRM(insn->modRM)) {
+ case 0x0:
+ insn->eaDisplacement = EA_DISP_32;
+ insn->sibBase = SIB_BASE_NONE;
+ break;
+ case 0x1:
+ insn->eaDisplacement = EA_DISP_8;
+ insn->sibBase = (SIBBase)(sibBaseBase + base);
+ break;
+ case 0x2:
+ insn->eaDisplacement = EA_DISP_32;
+ insn->sibBase = (SIBBase)(sibBaseBase + base);
+ break;
+ default:
+ llvm_unreachable("Cannot have Mod = 0b11 and a SIB byte");
+ }
+ break;
+ default:
+ insn->sibBase = (SIBBase)(sibBaseBase + base);
+ break;
+ }
+
+ return 0;
+}
+
+static int readDisplacement(struct InternalInstruction *insn) {
+ int8_t d8;
+ int16_t d16;
+ int32_t d32;
+ LLVM_DEBUG(dbgs() << "readDisplacement()");
+
+ insn->displacementOffset = insn->readerCursor - insn->startLocation;
+ switch (insn->eaDisplacement) {
+ case EA_DISP_NONE:
+ break;
+ case EA_DISP_8:
+ if (consume(insn, d8))
+ return -1;
+ insn->displacement = d8;
+ break;
+ case EA_DISP_16:
+ if (consume(insn, d16))
+ return -1;
+ insn->displacement = d16;
+ break;
+ case EA_DISP_32:
+ if (consume(insn, d32))
+ return -1;
+ insn->displacement = d32;
+ break;
+ }
+
+ return 0;
+}
+
+// Consumes all addressing information (ModR/M byte, SIB byte, and displacement).
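+//
+// For example, in 32-bit addressing a ModR/M byte of 0x45 (mod=0b01,
+// reg=0b000, rm=0b101) selects [ebp + disp8] as the effective address, while
+// the reg field is initially reported relative to the general-purpose base
+// (EAX) until fixupReg() adjusts it for the actual operand type.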
+static int readModRM(struct InternalInstruction *insn) {
+ uint8_t mod, rm, reg, evexrm;
+ LLVM_DEBUG(dbgs() << "readModRM()");
+
+ if (insn->consumedModRM)
+ return 0;
+
+ if (consume(insn, insn->modRM))
+ return -1;
+ insn->consumedModRM = true;
+
+ mod = modFromModRM(insn->modRM);
+ rm = rmFromModRM(insn->modRM);
+ reg = regFromModRM(insn->modRM);
+
+ // This goes by insn->registerSize to pick the correct register, which messes
+ // up if we're using (say) XMM or 8-bit register operands. That gets fixed in
+ // fixupReg().
+ switch (insn->registerSize) {
+ case 2:
+ insn->regBase = MODRM_REG_AX;
+ insn->eaRegBase = EA_REG_AX;
+ break;
+ case 4:
+ insn->regBase = MODRM_REG_EAX;
+ insn->eaRegBase = EA_REG_EAX;
+ break;
+ case 8:
+ insn->regBase = MODRM_REG_RAX;
+ insn->eaRegBase = EA_REG_RAX;
+ break;
+ }
+
+ reg |= rFromREX(insn->rexPrefix) << 3;
+ rm |= bFromREX(insn->rexPrefix) << 3;
+
+ evexrm = 0;
+ if (insn->vectorExtensionType == TYPE_EVEX && insn->mode == MODE_64BIT) {
+ reg |= r2FromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
+ evexrm = xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
+ }
+
+ insn->reg = (Reg)(insn->regBase + reg);
+
+ switch (insn->addressSize) {
+ case 2: {
+ EABase eaBaseBase = EA_BASE_BX_SI;
+
+ switch (mod) {
+ case 0x0:
+ if (rm == 0x6) {
+ insn->eaBase = EA_BASE_NONE;
+ insn->eaDisplacement = EA_DISP_16;
+ if (readDisplacement(insn))
+ return -1;
+ } else {
+ insn->eaBase = (EABase)(eaBaseBase + rm);
+ insn->eaDisplacement = EA_DISP_NONE;
+ }
+ break;
+ case 0x1:
+ insn->eaBase = (EABase)(eaBaseBase + rm);
+ insn->eaDisplacement = EA_DISP_8;
+ insn->displacementSize = 1;
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ case 0x2:
+ insn->eaBase = (EABase)(eaBaseBase + rm);
+ insn->eaDisplacement = EA_DISP_16;
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ case 0x3:
+ insn->eaBase = (EABase)(insn->eaRegBase + rm);
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ }
+ break;
+ }
+ case 4:
+ case 8: {
+ EABase eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX);
+
+ switch (mod) {
+ case 0x0:
+ insn->eaDisplacement = EA_DISP_NONE; // readSIB may override this
+ // In determining whether RIP-relative mode is used (rm=5),
+ // or whether a SIB byte is present (rm=4),
+ // the extension bits (REX.b and EVEX.x) are ignored.
+ switch (rm & 7) {
+ case 0x4: // SIB byte is present
+ insn->eaBase = (insn->addressSize == 4 ? EA_BASE_sib : EA_BASE_sib64);
+ if (readSIB(insn) || readDisplacement(insn))
+ return -1;
+ break;
+ case 0x5: // RIP-relative
+ insn->eaBase = EA_BASE_NONE;
+ insn->eaDisplacement = EA_DISP_32;
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ default:
+ insn->eaBase = (EABase)(eaBaseBase + rm);
+ break;
+ }
+ break;
+ case 0x1:
+ insn->displacementSize = 1;
+ LLVM_FALLTHROUGH;
+ case 0x2:
+ insn->eaDisplacement = (mod == 0x1 ? EA_DISP_8 : EA_DISP_32);
+ switch (rm & 7) {
+ case 0x4: // SIB byte is present
+ insn->eaBase = EA_BASE_sib;
+ if (readSIB(insn) || readDisplacement(insn))
+ return -1;
+ break;
+ default:
+ insn->eaBase = (EABase)(eaBaseBase + rm);
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ }
+ break;
+ case 0x3:
+ insn->eaDisplacement = EA_DISP_NONE;
+ insn->eaBase = (EABase)(insn->eaRegBase + rm + evexrm);
+ break;
+ }
+ break;
+ }
+ } // switch (insn->addressSize)
+
+ return 0;
+}
+
+#define GENERIC_FIXUP_FUNC(name, base, prefix, mask) \
+ static uint16_t name(struct InternalInstruction *insn, OperandType type, \
+ uint8_t index, uint8_t *valid) { \
+ *valid = 1; \
+ switch (type) { \
+ default: \
+ debug("Unhandled register type"); \
+ *valid = 0; \
+ return 0; \
+ case TYPE_Rv: \
+ return base + index; \
+ case TYPE_R8: \
+ index &= mask; \
+ if (index > 0xf) \
+ *valid = 0; \
+ if (insn->rexPrefix && index >= 4 && index <= 7) { \
+ return prefix##_SPL + (index - 4); \
+ } else { \
+ return prefix##_AL + index; \
+ } \
+ case TYPE_R16: \
+ index &= mask; \
+ if (index > 0xf) \
+ *valid = 0; \
+ return prefix##_AX + index; \
+ case TYPE_R32: \
+ index &= mask; \
+ if (index > 0xf) \
+ *valid = 0; \
+ return prefix##_EAX + index; \
+ case TYPE_R64: \
+ index &= mask; \
+ if (index > 0xf) \
+ *valid = 0; \
+ return prefix##_RAX + index; \
+ case TYPE_ZMM: \
+ return prefix##_ZMM0 + index; \
+ case TYPE_YMM: \
+ return prefix##_YMM0 + index; \
+ case TYPE_XMM: \
+ return prefix##_XMM0 + index; \
+ case TYPE_TMM: \
+ if (index > 7) \
+ *valid = 0; \
+ return prefix##_TMM0 + index; \
+ case TYPE_VK: \
+ index &= 0xf; \
+ if (index > 7) \
+ *valid = 0; \
+ return prefix##_K0 + index; \
+ case TYPE_VK_PAIR: \
+ if (index > 7) \
+ *valid = 0; \
+ return prefix##_K0_K1 + (index / 2); \
+ case TYPE_MM64: \
+ return prefix##_MM0 + (index & 0x7); \
+ case TYPE_SEGMENTREG: \
+ if ((index & 7) > 5) \
+ *valid = 0; \
+ return prefix##_ES + (index & 7); \
+ case TYPE_DEBUGREG: \
+ return prefix##_DR0 + index; \
+ case TYPE_CONTROLREG: \
+ return prefix##_CR0 + index; \
+ case TYPE_BNDR: \
+ if (index > 3) \
+ *valid = 0; \
+ return prefix##_BND0 + index; \
+ case TYPE_MVSIBX: \
+ return prefix##_XMM0 + index; \
+ case TYPE_MVSIBY: \
+ return prefix##_YMM0 + index; \
+ case TYPE_MVSIBZ: \
+ return prefix##_ZMM0 + index; \
+ } \
+ }
+
+// Consult an operand type to determine the meaning of the reg or R/M field. If
+// the operand is an XMM operand, for example, the register would be XMM0
+// instead of AX, which is what readModRM() would otherwise report.
+//
+// @param insn - The instruction containing the operand.
+// @param type - The operand type.
+// @param index - The existing value of the field as reported by readModRM().
+// @param valid - The address of a uint8_t. The target is set to 1 if the
+// field is valid for the register class; 0 if not.
+// @return - The proper value.
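+//
+// For example, if the reg field holds 1 and REX.R is set, readModRM() reports
+// index 9; when the operand type is TYPE_XMM, fixupRegValue() remaps that to
+// MODRM_REG_XMM0 + 9 (XMM9) rather than the general-purpose register it would
+// otherwise denote.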
+GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase, MODRM_REG, 0x1f)
+GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG, 0xf)
+
+// Consult an operand specifier to determine which of the fixup*Value functions
+// to use in correcting readModRM()'s interpretation.
+//
+// @param insn - See fixup*Value().
+// @param op - The operand specifier.
+// @return - 0 if fixup was successful; -1 if the register returned was
+// invalid for its class.
+static int fixupReg(struct InternalInstruction *insn,
+ const struct OperandSpecifier *op) {
+ uint8_t valid;
+ LLVM_DEBUG(dbgs() << "fixupReg()");
+
+ switch ((OperandEncoding)op->encoding) {
+ default:
+ debug("Expected a REG or R/M encoding in fixupReg");
+ return -1;
+ case ENCODING_VVVV:
+ insn->vvvv =
+ (Reg)fixupRegValue(insn, (OperandType)op->type, insn->vvvv, &valid);
+ if (!valid)
+ return -1;
+ break;
+ case ENCODING_REG:
+ insn->reg = (Reg)fixupRegValue(insn, (OperandType)op->type,
+ insn->reg - insn->regBase, &valid);
+ if (!valid)
+ return -1;
+ break;
+ case ENCODING_SIB:
+ CASE_ENCODING_RM:
+ if (insn->eaBase >= insn->eaRegBase) {
+ insn->eaBase = (EABase)fixupRMValue(
+ insn, (OperandType)op->type, insn->eaBase - insn->eaRegBase, &valid);
+ if (!valid)
+ return -1;
+ }
+ break;
+ }
+
+ return 0;
+}
+
+// Read the opcode (except the ModR/M byte in the case of extended or escape
+// opcodes).
+static bool readOpcode(struct InternalInstruction *insn) {
+ uint8_t current;
+ LLVM_DEBUG(dbgs() << "readOpcode()");
+
+ insn->opcodeType = ONEBYTE;
+ if (insn->vectorExtensionType == TYPE_EVEX) {
+ switch (mmFromEVEX2of4(insn->vectorExtensionPrefix[1])) {
+ default:
+ LLVM_DEBUG(
+ dbgs() << format("Unhandled mm field for instruction (0x%hhx)",
+ mmFromEVEX2of4(insn->vectorExtensionPrefix[1])));
+ return true;
+ case VEX_LOB_0F:
+ insn->opcodeType = TWOBYTE;
+ return consume(insn, insn->opcode);
+ case VEX_LOB_0F38:
+ insn->opcodeType = THREEBYTE_38;
+ return consume(insn, insn->opcode);
+ case VEX_LOB_0F3A:
+ insn->opcodeType = THREEBYTE_3A;
+ return consume(insn, insn->opcode);
+ }
+ } else if (insn->vectorExtensionType == TYPE_VEX_3B) {
+ switch (mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])) {
+ default:
+ LLVM_DEBUG(
+ dbgs() << format("Unhandled m-mmmm field for instruction (0x%hhx)",
+ mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])));
+ return true;
+ case VEX_LOB_0F:
+ insn->opcodeType = TWOBYTE;
+ return consume(insn, insn->opcode);
+ case VEX_LOB_0F38:
+ insn->opcodeType = THREEBYTE_38;
+ return consume(insn, insn->opcode);
+ case VEX_LOB_0F3A:
+ insn->opcodeType = THREEBYTE_3A;
+ return consume(insn, insn->opcode);
+ }
+ } else if (insn->vectorExtensionType == TYPE_VEX_2B) {
+ insn->opcodeType = TWOBYTE;
+ return consume(insn, insn->opcode);
+ } else if (insn->vectorExtensionType == TYPE_XOP) {
+ switch (mmmmmFromXOP2of3(insn->vectorExtensionPrefix[1])) {
+ default:
+ LLVM_DEBUG(
+ dbgs() << format("Unhandled m-mmmm field for instruction (0x%hhx)",
+                           mmmmmFromXOP2of3(insn->vectorExtensionPrefix[1])));
+ return true;
+ case XOP_MAP_SELECT_8:
+ insn->opcodeType = XOP8_MAP;
+ return consume(insn, insn->opcode);
+ case XOP_MAP_SELECT_9:
+ insn->opcodeType = XOP9_MAP;
+ return consume(insn, insn->opcode);
+ case XOP_MAP_SELECT_A:
+ insn->opcodeType = XOPA_MAP;
+ return consume(insn, insn->opcode);
+ }
+ }
+
+ if (consume(insn, current))
+ return true;
+
+ if (current == 0x0f) {
+ LLVM_DEBUG(
+ dbgs() << format("Found a two-byte escape prefix (0x%hhx)", current));
+ if (consume(insn, current))
+ return true;
+
+ if (current == 0x38) {
+ LLVM_DEBUG(dbgs() << format("Found a three-byte escape prefix (0x%hhx)",
+ current));
+ if (consume(insn, current))
+ return true;
+
+ insn->opcodeType = THREEBYTE_38;
+ } else if (current == 0x3a) {
+ LLVM_DEBUG(dbgs() << format("Found a three-byte escape prefix (0x%hhx)",
+ current));
+ if (consume(insn, current))
+ return true;
+
+ insn->opcodeType = THREEBYTE_3A;
+ } else if (current == 0x0f) {
+ LLVM_DEBUG(
+ dbgs() << format("Found a 3dnow escape prefix (0x%hhx)", current));
+
+ // Consume operands before the opcode to comply with the 3DNow encoding
+ if (readModRM(insn))
+ return true;
+
+ if (consume(insn, current))
+ return true;
+
+ insn->opcodeType = THREEDNOW_MAP;
+ } else {
+ LLVM_DEBUG(dbgs() << "Didn't find a three-byte escape prefix");
+ insn->opcodeType = TWOBYTE;
+ }
+ } else if (insn->mandatoryPrefix)
+    // An opcode with a mandatory prefix must start with an opcode escape.
+    // If it doesn't, the prefix byte was a legacy repeat prefix instead.
+ insn->mandatoryPrefix = 0;
+
+ // At this point we have consumed the full opcode.
+ // Anything we consume from here on must be unconsumed.
+ insn->opcode = current;
+
+ return false;
+}
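+
+// For example, the byte sequences below select the following opcode maps:
+//   0f 58         -> TWOBYTE,       opcode 0x58
+//   0f 38 00      -> THREEBYTE_38,  opcode 0x00
+//   0f 3a 0f      -> THREEBYTE_3A,  opcode 0x0f
+//   0f 0f .. xx   -> THREEDNOW_MAP; the ModR/M byte (and any memory-operand
+//                    bytes) are consumed before the trailing 3DNow opcode xx.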
+
+// Determine whether equiv is the 16-bit equivalent of orig (32-bit or 64-bit).
+static bool is16BitEquivalent(const char *orig, const char *equiv) {
+ for (int i = 0;; i++) {
+ if (orig[i] == '\0' && equiv[i] == '\0')
+ return true;
+ if (orig[i] == '\0' || equiv[i] == '\0')
+ return false;
+ if (orig[i] != equiv[i]) {
+ if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W')
+ continue;
+ if ((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1')
+ continue;
+ if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6')
+ continue;
+ return false;
+ }
+ }
+}
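+
+// For example, assuming the usual ADD*rr spellings from the X86 instruction
+// tables: is16BitEquivalent("ADD32rr", "ADD16rr") and
+// is16BitEquivalent("ADD64rr", "ADD16rr") both return true via the digit
+// substitutions above, while names differing in any other character, such as
+// "ADD32rr" vs. "ADD32ri", do not match.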
+
+// Determine whether this instruction is a 64-bit instruction.
+static bool is64Bit(const char *name) {
+ for (int i = 0;; ++i) {
+ if (name[i] == '\0')
+ return false;
+ if (name[i] == '6' && name[i + 1] == '4')
+ return true;
+ }
+}
+
+// Determine the ID of an instruction, consuming the ModR/M byte as appropriate
+// for extended and escape opcodes, and using a supplied attribute mask.
+static int getInstructionIDWithAttrMask(uint16_t *instructionID,
+ struct InternalInstruction *insn,
+ uint16_t attrMask) {
+ auto insnCtx = InstructionContext(x86DisassemblerContexts[attrMask]);
+ const ContextDecision *decision;
+ switch (insn->opcodeType) {
+ case ONEBYTE:
+ decision = &ONEBYTE_SYM;
+ break;
+ case TWOBYTE:
+ decision = &TWOBYTE_SYM;
+ break;
+ case THREEBYTE_38:
+ decision = &THREEBYTE38_SYM;
+ break;
+ case THREEBYTE_3A:
+ decision = &THREEBYTE3A_SYM;
+ break;
+ case XOP8_MAP:
+ decision = &XOP8_MAP_SYM;
+ break;
+ case XOP9_MAP:
+ decision = &XOP9_MAP_SYM;
+ break;
+ case XOPA_MAP:
+ decision = &XOPA_MAP_SYM;
+ break;
+ case THREEDNOW_MAP:
+ decision = &THREEDNOW_MAP_SYM;
+ break;
+ }
+
+ if (decision->opcodeDecisions[insnCtx]
+ .modRMDecisions[insn->opcode]
+ .modrm_type != MODRM_ONEENTRY) {
+ if (readModRM(insn))
+ return -1;
+ *instructionID =
+ decode(insn->opcodeType, insnCtx, insn->opcode, insn->modRM);
+ } else {
+ *instructionID = decode(insn->opcodeType, insnCtx, insn->opcode, 0);
+ }
+
+ return 0;
+}
+
+// Determine the ID of an instruction, consuming the ModR/M byte as appropriate
+// for extended and escape opcodes. Determines the attributes and context for
+// the instruction before doing so.
+static int getInstructionID(struct InternalInstruction *insn,
+ const MCInstrInfo *mii) {
+ uint16_t attrMask;
+ uint16_t instructionID;
+
+ LLVM_DEBUG(dbgs() << "getID()");
+
+ attrMask = ATTR_NONE;
+
+ if (insn->mode == MODE_64BIT)
+ attrMask |= ATTR_64BIT;
+
+ if (insn->vectorExtensionType != TYPE_NO_VEX_XOP) {
+ attrMask |= (insn->vectorExtensionType == TYPE_EVEX) ? ATTR_EVEX : ATTR_VEX;
+
+ if (insn->vectorExtensionType == TYPE_EVEX) {
+ switch (ppFromEVEX3of4(insn->vectorExtensionPrefix[2])) {
+ case VEX_PREFIX_66:
+ attrMask |= ATTR_OPSIZE;
+ break;
+ case VEX_PREFIX_F3:
+ attrMask |= ATTR_XS;
+ break;
+ case VEX_PREFIX_F2:
+ attrMask |= ATTR_XD;
+ break;
+ }
+
+ if (zFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_EVEXKZ;
+ if (bFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_EVEXB;
+ if (aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_EVEXK;
+ if (lFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_VEXL;
+ if (l2FromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_EVEXL2;
+ } else if (insn->vectorExtensionType == TYPE_VEX_3B) {
+ switch (ppFromVEX3of3(insn->vectorExtensionPrefix[2])) {
+ case VEX_PREFIX_66:
+ attrMask |= ATTR_OPSIZE;
+ break;
+ case VEX_PREFIX_F3:
+ attrMask |= ATTR_XS;
+ break;
+ case VEX_PREFIX_F2:
+ attrMask |= ATTR_XD;
+ break;
+ }
+
+ if (lFromVEX3of3(insn->vectorExtensionPrefix[2]))
+ attrMask |= ATTR_VEXL;
+ } else if (insn->vectorExtensionType == TYPE_VEX_2B) {
+ switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) {
+ case VEX_PREFIX_66:
+ attrMask |= ATTR_OPSIZE;
+ break;
+ case VEX_PREFIX_F3:
+ attrMask |= ATTR_XS;
+ break;
+ case VEX_PREFIX_F2:
+ attrMask |= ATTR_XD;
+ break;
+ }
+
+ if (lFromVEX2of2(insn->vectorExtensionPrefix[1]))
+ attrMask |= ATTR_VEXL;
+ } else if (insn->vectorExtensionType == TYPE_XOP) {
+ switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) {
+ case VEX_PREFIX_66:
+ attrMask |= ATTR_OPSIZE;
+ break;
+ case VEX_PREFIX_F3:
+ attrMask |= ATTR_XS;
+ break;
+ case VEX_PREFIX_F2:
+ attrMask |= ATTR_XD;
+ break;
+ }
+
+ if (lFromXOP3of3(insn->vectorExtensionPrefix[2]))
+ attrMask |= ATTR_VEXL;
+ } else {
+ return -1;
+ }
+ } else if (!insn->mandatoryPrefix) {
+    // If we don't have a mandatory prefix, use the legacy prefixes here.
+ if (insn->hasOpSize && (insn->mode != MODE_16BIT))
+ attrMask |= ATTR_OPSIZE;
+ if (insn->hasAdSize)
+ attrMask |= ATTR_ADSIZE;
+ if (insn->opcodeType == ONEBYTE) {
+ if (insn->repeatPrefix == 0xf3 && (insn->opcode == 0x90))
+ // Special support for PAUSE
+ attrMask |= ATTR_XS;
+ } else {
+ if (insn->repeatPrefix == 0xf2)
+ attrMask |= ATTR_XD;
+ else if (insn->repeatPrefix == 0xf3)
+ attrMask |= ATTR_XS;
+ }
+ } else {
+ switch (insn->mandatoryPrefix) {
+ case 0xf2:
+ attrMask |= ATTR_XD;
+ break;
+ case 0xf3:
+ attrMask |= ATTR_XS;
+ break;
+ case 0x66:
+ if (insn->mode != MODE_16BIT)
+ attrMask |= ATTR_OPSIZE;
+ break;
+ case 0x67:
+ attrMask |= ATTR_ADSIZE;
+ break;
+ }
+ }
+
+ if (insn->rexPrefix & 0x08) {
+ attrMask |= ATTR_REXW;
+ attrMask &= ~ATTR_ADSIZE;
+ }
+
+ if (insn->mode == MODE_16BIT) {
+ // JCXZ/JECXZ need special handling for 16-bit mode because the meaning
+ // of the AdSize prefix is inverted w.r.t. 32-bit mode.
+ if (insn->opcodeType == ONEBYTE && insn->opcode == 0xE3)
+ attrMask ^= ATTR_ADSIZE;
+ // If we're in 16-bit mode and this is one of the relative jumps and opsize
+ // prefix isn't present, we need to force the opsize attribute since the
+ // prefix is inverted relative to 32-bit mode.
+ if (!insn->hasOpSize && insn->opcodeType == ONEBYTE &&
+ (insn->opcode == 0xE8 || insn->opcode == 0xE9))
+ attrMask |= ATTR_OPSIZE;
+
+ if (!insn->hasOpSize && insn->opcodeType == TWOBYTE &&
+ insn->opcode >= 0x80 && insn->opcode <= 0x8F)
+ attrMask |= ATTR_OPSIZE;
+ }
+
+ if (getInstructionIDWithAttrMask(&instructionID, insn, attrMask))
+ return -1;
+
+ // The following clauses compensate for limitations of the tables.
+
+ if (insn->mode != MODE_64BIT &&
+ insn->vectorExtensionType != TYPE_NO_VEX_XOP) {
+    // The tables can't distinguish between cases where the W bit is used to
+    // select the register size and cases where it is a required part of the
+    // opcode.
+ if ((insn->vectorExtensionType == TYPE_EVEX &&
+ wFromEVEX3of4(insn->vectorExtensionPrefix[2])) ||
+ (insn->vectorExtensionType == TYPE_VEX_3B &&
+ wFromVEX3of3(insn->vectorExtensionPrefix[2])) ||
+ (insn->vectorExtensionType == TYPE_XOP &&
+ wFromXOP3of3(insn->vectorExtensionPrefix[2]))) {
+
+ uint16_t instructionIDWithREXW;
+ if (getInstructionIDWithAttrMask(&instructionIDWithREXW, insn,
+ attrMask | ATTR_REXW)) {
+ insn->instructionID = instructionID;
+ insn->spec = &INSTRUCTIONS_SYM[instructionID];
+ return 0;
+ }
+
+ auto SpecName = mii->getName(instructionIDWithREXW);
+      // If the REX.W form is not a 64-bit instruction, switch to it.
+ if (!is64Bit(SpecName.data())) {
+ insn->instructionID = instructionIDWithREXW;
+ insn->spec = &INSTRUCTIONS_SYM[instructionIDWithREXW];
+ return 0;
+ }
+ }
+ }
+
+  // Absolute moves, umonitor, and movdir64b need special handling.
+  // - In 16-bit mode, because the meaning of the AdSize and OpSize prefixes is
+  //   inverted w.r.t. 32-bit mode.
+  // - In 32-bit mode, we need to ensure the AdSize prefix is observed in any
+  //   position.
+ if ((insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0)) ||
+ (insn->opcodeType == TWOBYTE && (insn->opcode == 0xAE)) ||
+ (insn->opcodeType == THREEBYTE_38 && insn->opcode == 0xF8)) {
+ // Make sure we observed the prefixes in any position.
+ if (insn->hasAdSize)
+ attrMask |= ATTR_ADSIZE;
+ if (insn->hasOpSize)
+ attrMask |= ATTR_OPSIZE;
+
+ // In 16-bit, invert the attributes.
+ if (insn->mode == MODE_16BIT) {
+ attrMask ^= ATTR_ADSIZE;
+
+ // The OpSize attribute is only valid with the absolute moves.
+ if (insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0))
+ attrMask ^= ATTR_OPSIZE;
+ }
+
+ if (getInstructionIDWithAttrMask(&instructionID, insn, attrMask))
+ return -1;
+
+ insn->instructionID = instructionID;
+ insn->spec = &INSTRUCTIONS_SYM[instructionID];
+ return 0;
+ }
+
+ if ((insn->mode == MODE_16BIT || insn->hasOpSize) &&
+ !(attrMask & ATTR_OPSIZE)) {
+    // The instruction tables make no distinction between instructions that
+    // allow OpSize anywhere (i.e., 16-bit operations) and those that need it
+    // in a particular spot (e.g., many MMX operations). In general we're
+    // conservative, but in the specific case where OpSize is present but not
+    // in the right place we check whether there's a 16-bit operation.
+ const struct InstructionSpecifier *spec;
+ uint16_t instructionIDWithOpsize;
+ llvm::StringRef specName, specWithOpSizeName;
+
+ spec = &INSTRUCTIONS_SYM[instructionID];
+
+ if (getInstructionIDWithAttrMask(&instructionIDWithOpsize, insn,
+ attrMask | ATTR_OPSIZE)) {
+ // ModRM required with OpSize but not present. Give up and return the
+ // version without OpSize set.
+ insn->instructionID = instructionID;
+ insn->spec = spec;
+ return 0;
+ }
+
+ specName = mii->getName(instructionID);
+ specWithOpSizeName = mii->getName(instructionIDWithOpsize);
+
+ if (is16BitEquivalent(specName.data(), specWithOpSizeName.data()) &&
+ (insn->mode == MODE_16BIT) ^ insn->hasOpSize) {
+ insn->instructionID = instructionIDWithOpsize;
+ insn->spec = &INSTRUCTIONS_SYM[instructionIDWithOpsize];
+ } else {
+ insn->instructionID = instructionID;
+ insn->spec = spec;
+ }
+ return 0;
+ }
+
+ if (insn->opcodeType == ONEBYTE && insn->opcode == 0x90 &&
+ insn->rexPrefix & 0x01) {
+    // NOOP shouldn't decode as NOOP if REX.B is set. Instead it should decode
+    // as XCHG %r8d, %eax.
+ const struct InstructionSpecifier *spec;
+ uint16_t instructionIDWithNewOpcode;
+ const struct InstructionSpecifier *specWithNewOpcode;
+
+ spec = &INSTRUCTIONS_SYM[instructionID];
+
+ // Borrow opcode from one of the other XCHGar opcodes
+ insn->opcode = 0x91;
+
+ if (getInstructionIDWithAttrMask(&instructionIDWithNewOpcode, insn,
+ attrMask)) {
+ insn->opcode = 0x90;
+
+ insn->instructionID = instructionID;
+ insn->spec = spec;
+ return 0;
+ }
+
+ specWithNewOpcode = &INSTRUCTIONS_SYM[instructionIDWithNewOpcode];
+
+ // Change back
+ insn->opcode = 0x90;
+
+ insn->instructionID = instructionIDWithNewOpcode;
+ insn->spec = specWithNewOpcode;
+
+ return 0;
+ }
+
+ insn->instructionID = instructionID;
+ insn->spec = &INSTRUCTIONS_SYM[insn->instructionID];
+
+ return 0;
+}
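+
+// For example, for the bytes "66 0f 58 c1" the 0x66 prefix contributes
+// ATTR_OPSIZE, so the TWOBYTE table entry for opcode 0x58 resolves to ADDPD
+// rather than ADDPS; an F3 or F2 mandatory prefix selects ADDSS or ADDSD
+// instead.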
+
+// Read an operand from the opcode field of an instruction and interpret it
+// appropriately given the operand width. Handles AddRegFrm instructions.
+//
+// @param insn - the instruction whose opcode field is to be read.
+// @param size - The width (in bytes) of the register being specified.
+// 1 means AL and friends, 2 means AX, 4 means EAX, and 8 means
+// RAX.
+// @return - 0 on success; nonzero otherwise.
+static int readOpcodeRegister(struct InternalInstruction *insn, uint8_t size) {
+ LLVM_DEBUG(dbgs() << "readOpcodeRegister()");
+
+ if (size == 0)
+ size = insn->registerSize;
+
+ switch (size) {
+ case 1:
+ insn->opcodeRegister = (Reg)(
+ MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3) | (insn->opcode & 7)));
+ if (insn->rexPrefix && insn->opcodeRegister >= MODRM_REG_AL + 0x4 &&
+ insn->opcodeRegister < MODRM_REG_AL + 0x8) {
+ insn->opcodeRegister =
+ (Reg)(MODRM_REG_SPL + (insn->opcodeRegister - MODRM_REG_AL - 4));
+ }
+
+ break;
+ case 2:
+ insn->opcodeRegister = (Reg)(
+ MODRM_REG_AX + ((bFromREX(insn->rexPrefix) << 3) | (insn->opcode & 7)));
+ break;
+ case 4:
+ insn->opcodeRegister =
+ (Reg)(MODRM_REG_EAX +
+ ((bFromREX(insn->rexPrefix) << 3) | (insn->opcode & 7)));
+ break;
+ case 8:
+ insn->opcodeRegister =
+ (Reg)(MODRM_REG_RAX +
+ ((bFromREX(insn->rexPrefix) << 3) | (insn->opcode & 7)));
+ break;
+ }
+
+ return 0;
+}
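+
+// For example, for the bytes "41 b9 ..." (REX.B, then opcode 0xb9 of a
+// mov r32, imm32) with size 4, the result is
+// MODRM_REG_EAX + ((1 << 3) | (0xb9 & 7)) = MODRM_REG_EAX + 9, i.e. R9D.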
+
+// Consume an immediate operand from an instruction, given the desired operand
+// size.
+//
+// @param insn - The instruction whose operand is to be read.
+// @param size - The width (in bytes) of the operand.
+// @return - 0 if the immediate was successfully consumed; nonzero
+// otherwise.
+static int readImmediate(struct InternalInstruction *insn, uint8_t size) {
+ uint8_t imm8;
+ uint16_t imm16;
+ uint32_t imm32;
+ uint64_t imm64;
+
+ LLVM_DEBUG(dbgs() << "readImmediate()");
+
+ assert(insn->numImmediatesConsumed < 2 && "Already consumed two immediates");
+
+ insn->immediateSize = size;
+ insn->immediateOffset = insn->readerCursor - insn->startLocation;
+
+ switch (size) {
+ case 1:
+ if (consume(insn, imm8))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm8;
+ break;
+ case 2:
+ if (consume(insn, imm16))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm16;
+ break;
+ case 4:
+ if (consume(insn, imm32))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm32;
+ break;
+ case 8:
+ if (consume(insn, imm64))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm64;
+ break;
+ default:
+ llvm_unreachable("invalid size");
+ }
+
+ insn->numImmediatesConsumed++;
+
+ return 0;
+}
+
+// Consume vvvv from an instruction if it has a VEX prefix.
+static int readVVVV(struct InternalInstruction *insn) {
+ LLVM_DEBUG(dbgs() << "readVVVV()");
+
+ int vvvv;
+ if (insn->vectorExtensionType == TYPE_EVEX)
+ vvvv = (v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4 |
+ vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]));
+ else if (insn->vectorExtensionType == TYPE_VEX_3B)
+ vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]);
+ else if (insn->vectorExtensionType == TYPE_VEX_2B)
+ vvvv = vvvvFromVEX2of2(insn->vectorExtensionPrefix[1]);
+ else if (insn->vectorExtensionType == TYPE_XOP)
+ vvvv = vvvvFromXOP3of3(insn->vectorExtensionPrefix[2]);
+ else
+ return -1;
+
+ if (insn->mode != MODE_64BIT)
+ vvvv &= 0xf; // Can only clear bit 4. Bit 3 must be cleared later.
+
+ insn->vvvv = static_cast<Reg>(vvvv);
+ return 0;
+}
+
+// Read a mask register from the EVEX prefix of an instruction.
+//
+// @param insn  - The instruction whose EVEX prefix is to be read.
+// @return - 0 on success; nonzero otherwise.
+static int readMaskRegister(struct InternalInstruction *insn) {
+ LLVM_DEBUG(dbgs() << "readMaskRegister()");
+
+ if (insn->vectorExtensionType != TYPE_EVEX)
+ return -1;
+
+ insn->writemask =
+ static_cast<Reg>(aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]));
+ return 0;
+}
+
+// Consults the specifier for an instruction and consumes all
+// operands for that instruction, interpreting them as it goes.
+static int readOperands(struct InternalInstruction *insn) {
+ int hasVVVV, needVVVV;
+ int sawRegImm = 0;
+
+ LLVM_DEBUG(dbgs() << "readOperands()");
+
+ // If non-zero vvvv specified, make sure one of the operands uses it.
+ hasVVVV = !readVVVV(insn);
+ needVVVV = hasVVVV && (insn->vvvv != 0);
+
+ for (const auto &Op : x86OperandSets[insn->spec->operands]) {
+ switch (Op.encoding) {
+ case ENCODING_NONE:
+ case ENCODING_SI:
+ case ENCODING_DI:
+ break;
+ CASE_ENCODING_VSIB:
+ // VSIB can use the V2 bit so check only the other bits.
+ if (needVVVV)
+ needVVVV = hasVVVV & ((insn->vvvv & 0xf) != 0);
+ if (readModRM(insn))
+ return -1;
+
+ // Reject if SIB wasn't used.
+ if (insn->eaBase != EA_BASE_sib && insn->eaBase != EA_BASE_sib64)
+ return -1;
+
+ // If sibIndex was set to SIB_INDEX_NONE, index offset is 4.
+ if (insn->sibIndex == SIB_INDEX_NONE)
+ insn->sibIndex = (SIBIndex)(insn->sibIndexBase + 4);
+
+ // If EVEX.v2 is set this is one of the 16-31 registers.
+ if (insn->vectorExtensionType == TYPE_EVEX && insn->mode == MODE_64BIT &&
+ v2FromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ insn->sibIndex = (SIBIndex)(insn->sibIndex + 16);
+
+ // Adjust the index register to the correct size.
+ switch ((OperandType)Op.type) {
+ default:
+ debug("Unhandled VSIB index type");
+ return -1;
+ case TYPE_MVSIBX:
+ insn->sibIndex =
+ (SIBIndex)(SIB_INDEX_XMM0 + (insn->sibIndex - insn->sibIndexBase));
+ break;
+ case TYPE_MVSIBY:
+ insn->sibIndex =
+ (SIBIndex)(SIB_INDEX_YMM0 + (insn->sibIndex - insn->sibIndexBase));
+ break;
+ case TYPE_MVSIBZ:
+ insn->sibIndex =
+ (SIBIndex)(SIB_INDEX_ZMM0 + (insn->sibIndex - insn->sibIndexBase));
+ break;
+ }
+
+ // Apply the AVX512 compressed displacement scaling factor.
+ if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8)
+ insn->displacement *= 1 << (Op.encoding - ENCODING_VSIB);
+ break;
+ case ENCODING_SIB:
+ // Reject if SIB wasn't used.
+ if (insn->eaBase != EA_BASE_sib && insn->eaBase != EA_BASE_sib64)
+ return -1;
+ if (readModRM(insn))
+ return -1;
+ if (fixupReg(insn, &Op))
+ return -1;
+ break;
+ case ENCODING_REG:
+ CASE_ENCODING_RM:
+ if (readModRM(insn))
+ return -1;
+ if (fixupReg(insn, &Op))
+ return -1;
+ // Apply the AVX512 compressed displacement scaling factor.
+ if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8)
+ insn->displacement *= 1 << (Op.encoding - ENCODING_RM);
+ break;
+ case ENCODING_IB:
+ if (sawRegImm) {
+ // Saw a register immediate so don't read again and instead split the
+ // previous immediate. FIXME: This is a hack.
+ insn->immediates[insn->numImmediatesConsumed] =
+ insn->immediates[insn->numImmediatesConsumed - 1] & 0xf;
+ ++insn->numImmediatesConsumed;
+ break;
+ }
+ if (readImmediate(insn, 1))
+ return -1;
+ if (Op.type == TYPE_XMM || Op.type == TYPE_YMM)
+ sawRegImm = 1;
+ break;
+ case ENCODING_IW:
+ if (readImmediate(insn, 2))
+ return -1;
+ break;
+ case ENCODING_ID:
+ if (readImmediate(insn, 4))
+ return -1;
+ break;
+ case ENCODING_IO:
+ if (readImmediate(insn, 8))
+ return -1;
+ break;
+ case ENCODING_Iv:
+ if (readImmediate(insn, insn->immediateSize))
+ return -1;
+ break;
+ case ENCODING_Ia:
+ if (readImmediate(insn, insn->addressSize))
+ return -1;
+ break;
+ case ENCODING_IRC:
+ insn->RC = (l2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 1) |
+ lFromEVEX4of4(insn->vectorExtensionPrefix[3]);
+ break;
+ case ENCODING_RB:
+ if (readOpcodeRegister(insn, 1))
+ return -1;
+ break;
+ case ENCODING_RW:
+ if (readOpcodeRegister(insn, 2))
+ return -1;
+ break;
+ case ENCODING_RD:
+ if (readOpcodeRegister(insn, 4))
+ return -1;
+ break;
+ case ENCODING_RO:
+ if (readOpcodeRegister(insn, 8))
+ return -1;
+ break;
+ case ENCODING_Rv:
+ if (readOpcodeRegister(insn, 0))
+ return -1;
+ break;
+ case ENCODING_CC:
+ insn->immediates[1] = insn->opcode & 0xf;
+ break;
+ case ENCODING_FP:
+ break;
+ case ENCODING_VVVV:
+ needVVVV = 0; // Mark that we have found a VVVV operand.
+ if (!hasVVVV)
+ return -1;
+ if (insn->mode != MODE_64BIT)
+ insn->vvvv = static_cast<Reg>(insn->vvvv & 0x7);
+ if (fixupReg(insn, &Op))
+ return -1;
+ break;
+ case ENCODING_WRITEMASK:
+ if (readMaskRegister(insn))
+ return -1;
+ break;
+ case ENCODING_DUP:
+ break;
+ default:
+ LLVM_DEBUG(dbgs() << "Encountered an operand with an unknown encoding.");
+ return -1;
+ }
+ }
+
+  // If we didn't find an ENCODING_VVVV operand but a non-zero vvvv was
+  // present, fail.
+ if (needVVVV)
+ return -1;
+
+ return 0;
+}
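+
+// Note on the compressed-displacement scaling above: assuming the
+// ENCODING_RM_CD* / ENCODING_VSIB_CD* values are ordered by log2 of the
+// compression factor (as in X86DisassemblerDecoderCommon.h), an operand
+// encoded as ENCODING_RM_CD64 with an 8-bit displacement of 0x02 ends up with
+// a final displacement of 0x02 * 64 = 128.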
+
+namespace llvm {
+
+// Fill-ins to make the compiler happy. These constants are never actually
+// assigned; they are just filler to make an automatically-generated switch
+// statement work.
+namespace X86 {
+ enum {
+ BX_SI = 500,
+ BX_DI = 501,
+ BP_SI = 502,
+ BP_DI = 503,
+ sib = 504,
+ sib64 = 505
+ };
+} // namespace X86
+
+} // namespace llvm
+
+static bool translateInstruction(MCInst &target,
+ InternalInstruction &source,
+ const MCDisassembler *Dis);
+
+namespace {
+
+/// Generic disassembler for all X86 platforms. All a platform-specific
+/// subclass should have to do is provide its own constructor with a different
+/// disassemblerMode value.
+class X86GenericDisassembler : public MCDisassembler {
+ std::unique_ptr<const MCInstrInfo> MII;
+public:
+ X86GenericDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+ std::unique_ptr<const MCInstrInfo> MII);
+public:
+ DecodeStatus getInstruction(MCInst &instr, uint64_t &size,
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &cStream) const override;
+
+private:
+ DisassemblerMode fMode;
+};
+
+} // namespace
+
+X86GenericDisassembler::X86GenericDisassembler(
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx,
+ std::unique_ptr<const MCInstrInfo> MII)
+ : MCDisassembler(STI, Ctx), MII(std::move(MII)) {
+ const FeatureBitset &FB = STI.getFeatureBits();
+ if (FB[X86::Mode16Bit]) {
+ fMode = MODE_16BIT;
+ return;
+ } else if (FB[X86::Mode32Bit]) {
+ fMode = MODE_32BIT;
+ return;
+ } else if (FB[X86::Mode64Bit]) {
+ fMode = MODE_64BIT;
+ return;
+ }
+
+ llvm_unreachable("Invalid CPU mode");
+}
+
+MCDisassembler::DecodeStatus X86GenericDisassembler::getInstruction(
+ MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &CStream) const {
+ CommentStream = &CStream;
+
+ InternalInstruction Insn;
+ memset(&Insn, 0, sizeof(InternalInstruction));
+ Insn.bytes = Bytes;
+ Insn.startLocation = Address;
+ Insn.readerCursor = Address;
+ Insn.mode = fMode;
+
+ if (Bytes.empty() || readPrefixes(&Insn) || readOpcode(&Insn) ||
+ getInstructionID(&Insn, MII.get()) || Insn.instructionID == 0 ||
+ readOperands(&Insn)) {
+ Size = Insn.readerCursor - Address;
+ return Fail;
+ }
+
+ Insn.operands = x86OperandSets[Insn.spec->operands];
+ Insn.length = Insn.readerCursor - Insn.startLocation;
+ Size = Insn.length;
+ if (Size > 15)
+ LLVM_DEBUG(dbgs() << "Instruction exceeds 15-byte limit");
+
+ bool Ret = translateInstruction(Instr, Insn, this);
+ if (!Ret) {
+ unsigned Flags = X86::IP_NO_PREFIX;
+ if (Insn.hasAdSize)
+ Flags |= X86::IP_HAS_AD_SIZE;
+ if (!Insn.mandatoryPrefix) {
+ if (Insn.hasOpSize)
+ Flags |= X86::IP_HAS_OP_SIZE;
+ if (Insn.repeatPrefix == 0xf2)
+ Flags |= X86::IP_HAS_REPEAT_NE;
+ else if (Insn.repeatPrefix == 0xf3 &&
+ // It should not be 'pause' f3 90
+ Insn.opcode != 0x90)
+ Flags |= X86::IP_HAS_REPEAT;
+ if (Insn.hasLockPrefix)
+ Flags |= X86::IP_HAS_LOCK;
+ }
+ Instr.setFlags(Flags);
+ }
+ return (!Ret) ? Success : Fail;
+}
+
+//
+// Private code that translates from struct InternalInstructions to MCInsts.
+//
+
+/// translateRegister - Translates an internal register to the appropriate LLVM
+/// register, and appends it as an operand to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param reg - The Reg to append.
+static void translateRegister(MCInst &mcInst, Reg reg) {
+#define ENTRY(x) X86::x,
+ static constexpr MCPhysReg llvmRegnums[] = {ALL_REGS};
+#undef ENTRY
+
+ MCPhysReg llvmRegnum = llvmRegnums[reg];
+ mcInst.addOperand(MCOperand::createReg(llvmRegnum));
+}
+
+/// tryAddingSymbolicOperand - tries to add a symbolic operand in place of the
+/// immediate Value in the MCInst.
+///
+/// @param Value - The immediate Value, has had any PC adjustment made by
+/// the caller.
+/// @param isBranch - If the instruction is a branch instruction
+/// @param Address - The starting address of the instruction
+/// @param Offset - The byte offset to this immediate in the instruction
+/// @param Width - The byte width of this immediate in the instruction
+///
+/// If the getOpInfo() function was set when setupForSymbolicDisassembly() was
+/// called then that function is called to get any symbolic information for the
+/// immediate in the instruction using the Address, Offset and Width. If that
+/// returns non-zero then the symbolic information it returns is used to create
+/// an MCExpr and that is added as an operand to the MCInst. If getOpInfo()
+/// returns zero and isBranch is true then a symbol look up for immediate Value
+/// is done and if a symbol is found an MCExpr is created with that, else
+/// an MCExpr with the immediate Value is created. This function returns true
+/// if it adds an operand to the MCInst and false otherwise.
+static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
+ uint64_t Address, uint64_t Offset,
+ uint64_t Width, MCInst &MI,
+ const MCDisassembler *Dis) {
+ return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch,
+ Offset, Width);
+}
+
+/// tryAddingPcLoadReferenceComment - tries to add a comment as to what is being
+/// referenced by a load instruction with the base register that is the rip.
+/// These can often be addresses in a literal pool. The Address of the
+/// instruction and its immediate Value are used to determine the address
+/// being referenced in the literal pool entry. The SymbolLookUp call back will
+/// return a pointer to a literal 'C' string if the referenced address is an
+/// address into a section with 'C' string literals.
+static void tryAddingPcLoadReferenceComment(uint64_t Address, uint64_t Value,
+ const void *Decoder) {
+ const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+ Dis->tryAddingPcLoadReferenceComment(Value, Address);
+}
+
+static const uint8_t segmentRegnums[SEG_OVERRIDE_max] = {
+ 0, // SEG_OVERRIDE_NONE
+ X86::CS,
+ X86::SS,
+ X86::DS,
+ X86::ES,
+ X86::FS,
+ X86::GS
+};
+
+/// translateSrcIndex - Appends a source index operand to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param insn - The internal instruction.
+static bool translateSrcIndex(MCInst &mcInst, InternalInstruction &insn) {
+ unsigned baseRegNo;
+
+ if (insn.mode == MODE_64BIT)
+ baseRegNo = insn.hasAdSize ? X86::ESI : X86::RSI;
+ else if (insn.mode == MODE_32BIT)
+ baseRegNo = insn.hasAdSize ? X86::SI : X86::ESI;
+ else {
+ assert(insn.mode == MODE_16BIT);
+ baseRegNo = insn.hasAdSize ? X86::ESI : X86::SI;
+ }
+ MCOperand baseReg = MCOperand::createReg(baseRegNo);
+ mcInst.addOperand(baseReg);
+
+ MCOperand segmentReg;
+ segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]);
+ mcInst.addOperand(segmentReg);
+ return false;
+}
+
+/// translateDstIndex - Appends a destination index operand to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param insn - The internal instruction.
+static bool translateDstIndex(MCInst &mcInst, InternalInstruction &insn) {
+ unsigned baseRegNo;
+
+ if (insn.mode == MODE_64BIT)
+ baseRegNo = insn.hasAdSize ? X86::EDI : X86::RDI;
+ else if (insn.mode == MODE_32BIT)
+ baseRegNo = insn.hasAdSize ? X86::DI : X86::EDI;
+ else {
+ assert(insn.mode == MODE_16BIT);
+ baseRegNo = insn.hasAdSize ? X86::EDI : X86::DI;
+ }
+ MCOperand baseReg = MCOperand::createReg(baseRegNo);
+ mcInst.addOperand(baseReg);
+ return false;
+}
+
+/// translateImmediate - Appends an immediate operand to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param immediate - The immediate value to append.
+/// @param operand - The operand, as stored in the descriptor table.
+/// @param insn - The internal instruction.
+static void translateImmediate(MCInst &mcInst, uint64_t immediate,
+ const OperandSpecifier &operand,
+ InternalInstruction &insn,
+ const MCDisassembler *Dis) {
+ // Sign-extend the immediate if necessary.
+
+ OperandType type = (OperandType)operand.type;
+
+ bool isBranch = false;
+ uint64_t pcrel = 0;
+ if (type == TYPE_REL) {
+ isBranch = true;
+ pcrel = insn.startLocation +
+ insn.immediateOffset + insn.immediateSize;
+ switch (operand.encoding) {
+ default:
+ break;
+ case ENCODING_Iv:
+ switch (insn.displacementSize) {
+ default:
+ break;
+ case 1:
+ if(immediate & 0x80)
+ immediate |= ~(0xffull);
+ break;
+ case 2:
+ if(immediate & 0x8000)
+ immediate |= ~(0xffffull);
+ break;
+ case 4:
+ if(immediate & 0x80000000)
+ immediate |= ~(0xffffffffull);
+ break;
+ case 8:
+ break;
+ }
+ break;
+ case ENCODING_IB:
+ if(immediate & 0x80)
+ immediate |= ~(0xffull);
+ break;
+ case ENCODING_IW:
+ if(immediate & 0x8000)
+ immediate |= ~(0xffffull);
+ break;
+ case ENCODING_ID:
+ if(immediate & 0x80000000)
+ immediate |= ~(0xffffffffull);
+ break;
+ }
+ }
+ // By default sign-extend all X86 immediates based on their encoding.
+ else if (type == TYPE_IMM) {
+ switch (operand.encoding) {
+ default:
+ break;
+ case ENCODING_IB:
+ if(immediate & 0x80)
+ immediate |= ~(0xffull);
+ break;
+ case ENCODING_IW:
+ if(immediate & 0x8000)
+ immediate |= ~(0xffffull);
+ break;
+ case ENCODING_ID:
+ if(immediate & 0x80000000)
+ immediate |= ~(0xffffffffull);
+ break;
+ case ENCODING_IO:
+ break;
+ }
+ }
+
+ switch (type) {
+ case TYPE_XMM:
+ mcInst.addOperand(MCOperand::createReg(X86::XMM0 + (immediate >> 4)));
+ return;
+ case TYPE_YMM:
+ mcInst.addOperand(MCOperand::createReg(X86::YMM0 + (immediate >> 4)));
+ return;
+ case TYPE_ZMM:
+ mcInst.addOperand(MCOperand::createReg(X86::ZMM0 + (immediate >> 4)));
+ return;
+ default:
+ // operand is 64 bits wide. Do nothing.
+ break;
+ }
+
+ if(!tryAddingSymbolicOperand(immediate + pcrel, isBranch, insn.startLocation,
+ insn.immediateOffset, insn.immediateSize,
+ mcInst, Dis))
+ mcInst.addOperand(MCOperand::createImm(immediate));
+
+ if (type == TYPE_MOFFS) {
+ MCOperand segmentReg;
+ segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]);
+ mcInst.addOperand(segmentReg);
+ }
+}
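+
+// For example, an ENCODING_IB immediate byte 0x80 of TYPE_IMM is sign-extended
+// to 0xffffffffffffff80 before being added as an operand; for TYPE_REL the
+// value plus the end-of-immediate PC is what gets offered to
+// tryAddingSymbolicOperand as the branch target.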
+
+/// translateRMRegister - Translates a register stored in the R/M field of the
+/// ModR/M byte to its LLVM equivalent and appends it to an MCInst.
+/// @param mcInst - The MCInst to append to.
+/// @param insn - The internal instruction to extract the R/M field
+/// from.
+/// @return           - false on success; true otherwise
+static bool translateRMRegister(MCInst &mcInst,
+ InternalInstruction &insn) {
+ if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) {
+ debug("A R/M register operand may not have a SIB byte");
+ return true;
+ }
+
+ switch (insn.eaBase) {
+ default:
+ debug("Unexpected EA base register");
+ return true;
+ case EA_BASE_NONE:
+ debug("EA_BASE_NONE for ModR/M base");
+ return true;
+#define ENTRY(x) case EA_BASE_##x:
+ ALL_EA_BASES
+#undef ENTRY
+ debug("A R/M register operand may not have a base; "
+ "the operand must be a register.");
+ return true;
+#define ENTRY(x) \
+ case EA_REG_##x: \
+ mcInst.addOperand(MCOperand::createReg(X86::x)); break;
+ ALL_REGS
+#undef ENTRY
+ }
+
+ return false;
+}
+
+/// translateRMMemory - Translates a memory operand stored in the Mod and R/M
+/// fields of an internal instruction (and possibly its SIB byte) to a memory
+/// operand in LLVM's format, and appends it to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param insn - The instruction to extract Mod, R/M, and SIB fields
+/// from.
+/// @param ForceSIB - The instruction must use SIB.
+/// @return           - false on success; true otherwise
+static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
+ const MCDisassembler *Dis,
+ bool ForceSIB = false) {
+ // Addresses in an MCInst are represented as five operands:
+ // 1. basereg (register) The R/M base, or (if there is a SIB) the
+ // SIB base
+ // 2. scaleamount (immediate) 1, or (if there is a SIB) the specified
+ // scale amount
+ // 3. indexreg (register) x86_registerNONE, or (if there is a SIB)
+ // the index (which is multiplied by the
+ // scale amount)
+ // 4. displacement (immediate) 0, or the displacement if there is one
+ // 5. segmentreg (register) x86_registerNONE for now, but could be set
+ // if we have segment overrides
+
+ MCOperand baseReg;
+ MCOperand scaleAmount;
+ MCOperand indexReg;
+ MCOperand displacement;
+ MCOperand segmentReg;
+ uint64_t pcrel = 0;
+
+ if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) {
+ if (insn.sibBase != SIB_BASE_NONE) {
+ switch (insn.sibBase) {
+ default:
+ debug("Unexpected sibBase");
+ return true;
+#define ENTRY(x) \
+ case SIB_BASE_##x: \
+ baseReg = MCOperand::createReg(X86::x); break;
+ ALL_SIB_BASES
+#undef ENTRY
+ }
+ } else {
+ baseReg = MCOperand::createReg(X86::NoRegister);
+ }
+
+ if (insn.sibIndex != SIB_INDEX_NONE) {
+ switch (insn.sibIndex) {
+ default:
+ debug("Unexpected sibIndex");
+ return true;
+#define ENTRY(x) \
+ case SIB_INDEX_##x: \
+ indexReg = MCOperand::createReg(X86::x); break;
+ EA_BASES_32BIT
+ EA_BASES_64BIT
+ REGS_XMM
+ REGS_YMM
+ REGS_ZMM
+#undef ENTRY
+ }
+ } else {
+ // Use EIZ/RIZ for a few ambiguous cases where the SIB byte is present,
+ // but no index is used and modrm alone should have been enough.
+ // -No base register in 32-bit mode. In 64-bit mode this is used to
+ // avoid rip-relative addressing.
+ // -Any base register used other than ESP/RSP/R12D/R12. Using these as a
+ // base always requires a SIB byte.
+ // -A scale other than 1 is used.
+ if (!ForceSIB &&
+ (insn.sibScale != 1 ||
+ (insn.sibBase == SIB_BASE_NONE && insn.mode != MODE_64BIT) ||
+ (insn.sibBase != SIB_BASE_NONE &&
+ insn.sibBase != SIB_BASE_ESP && insn.sibBase != SIB_BASE_RSP &&
+ insn.sibBase != SIB_BASE_R12D && insn.sibBase != SIB_BASE_R12))) {
+ indexReg = MCOperand::createReg(insn.addressSize == 4 ? X86::EIZ :
+ X86::RIZ);
+ } else
+ indexReg = MCOperand::createReg(X86::NoRegister);
+ }
+
+ scaleAmount = MCOperand::createImm(insn.sibScale);
+ } else {
+ switch (insn.eaBase) {
+ case EA_BASE_NONE:
+ if (insn.eaDisplacement == EA_DISP_NONE) {
+ debug("EA_BASE_NONE and EA_DISP_NONE for ModR/M base");
+ return true;
+ }
+ if (insn.mode == MODE_64BIT){
+ pcrel = insn.startLocation +
+ insn.displacementOffset + insn.displacementSize;
+ tryAddingPcLoadReferenceComment(insn.startLocation +
+ insn.displacementOffset,
+ insn.displacement + pcrel, Dis);
+ // Section 2.2.1.6
+ baseReg = MCOperand::createReg(insn.addressSize == 4 ? X86::EIP :
+ X86::RIP);
+ }
+ else
+ baseReg = MCOperand::createReg(X86::NoRegister);
+
+ indexReg = MCOperand::createReg(X86::NoRegister);
+ break;
+ case EA_BASE_BX_SI:
+ baseReg = MCOperand::createReg(X86::BX);
+ indexReg = MCOperand::createReg(X86::SI);
+ break;
+ case EA_BASE_BX_DI:
+ baseReg = MCOperand::createReg(X86::BX);
+ indexReg = MCOperand::createReg(X86::DI);
+ break;
+ case EA_BASE_BP_SI:
+ baseReg = MCOperand::createReg(X86::BP);
+ indexReg = MCOperand::createReg(X86::SI);
+ break;
+ case EA_BASE_BP_DI:
+ baseReg = MCOperand::createReg(X86::BP);
+ indexReg = MCOperand::createReg(X86::DI);
+ break;
+ default:
+ indexReg = MCOperand::createReg(X86::NoRegister);
+ switch (insn.eaBase) {
+ default:
+ debug("Unexpected eaBase");
+ return true;
+ // Here, we will use the fill-ins defined above. However,
+ // BX_SI, BX_DI, BP_SI, and BP_DI are all handled above and
+ // sib and sib64 were handled in the top-level if, so they're only
+ // placeholders to keep the compiler happy.
+#define ENTRY(x) \
+ case EA_BASE_##x: \
+ baseReg = MCOperand::createReg(X86::x); break;
+ ALL_EA_BASES
+#undef ENTRY
+#define ENTRY(x) case EA_REG_##x:
+ ALL_REGS
+#undef ENTRY
+ debug("A R/M memory operand may not be a register; "
+ "the base field must be a base.");
+ return true;
+ }
+ }
+
+ scaleAmount = MCOperand::createImm(1);
+ }
+
+ displacement = MCOperand::createImm(insn.displacement);
+
+ segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]);
+
+ mcInst.addOperand(baseReg);
+ mcInst.addOperand(scaleAmount);
+ mcInst.addOperand(indexReg);
+ if(!tryAddingSymbolicOperand(insn.displacement + pcrel, false,
+ insn.startLocation, insn.displacementOffset,
+ insn.displacementSize, mcInst, Dis))
+ mcInst.addOperand(displacement);
+ mcInst.addOperand(segmentReg);
+ return false;
+}
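+
+// For example, the memory operand [rbx + rcx*4 + 0x10] becomes the five
+// operands described above: baseReg = RBX, scaleAmount = 4, indexReg = RCX,
+// displacement = 0x10, and segmentReg = NoRegister (no override present).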
+
+/// translateRM - Translates an operand stored in the R/M (and possibly SIB)
+/// byte of an instruction to LLVM form, and appends it to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param operand - The operand, as stored in the descriptor table.
+/// @param insn - The instruction to extract Mod, R/M, and SIB fields
+/// from.
+/// @return           - false on success; true otherwise
+static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
+ InternalInstruction &insn, const MCDisassembler *Dis) {
+ switch (operand.type) {
+ default:
+ debug("Unexpected type for a R/M operand");
+ return true;
+ case TYPE_R8:
+ case TYPE_R16:
+ case TYPE_R32:
+ case TYPE_R64:
+ case TYPE_Rv:
+ case TYPE_MM64:
+ case TYPE_XMM:
+ case TYPE_YMM:
+ case TYPE_ZMM:
+ case TYPE_TMM:
+ case TYPE_VK_PAIR:
+ case TYPE_VK:
+ case TYPE_DEBUGREG:
+ case TYPE_CONTROLREG:
+ case TYPE_BNDR:
+ return translateRMRegister(mcInst, insn);
+ case TYPE_M:
+ case TYPE_MVSIBX:
+ case TYPE_MVSIBY:
+ case TYPE_MVSIBZ:
+ return translateRMMemory(mcInst, insn, Dis);
+ case TYPE_MSIB:
+ return translateRMMemory(mcInst, insn, Dis, true);
+ }
+}
+
+/// translateFPRegister - Translates a stack position on the FPU stack to its
+/// LLVM form, and appends it to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param stackPos - The stack position to translate.
+static void translateFPRegister(MCInst &mcInst,
+ uint8_t stackPos) {
+ mcInst.addOperand(MCOperand::createReg(X86::ST0 + stackPos));
+}
+
+/// translateMaskRegister - Translates a 3-bit mask register number to
+/// LLVM form, and appends it to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param maskRegNum - Number of mask register from 0 to 7.
+/// @return - false on success; true otherwise.
+static bool translateMaskRegister(MCInst &mcInst,
+ uint8_t maskRegNum) {
+ if (maskRegNum >= 8) {
+ debug("Invalid mask register number");
+ return true;
+ }
+
+ mcInst.addOperand(MCOperand::createReg(X86::K0 + maskRegNum));
+ return false;
+}
+
+/// translateOperand - Translates an operand stored in an internal instruction
+/// to LLVM's format and appends it to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param operand - The operand, as stored in the descriptor table.
+/// @param insn - The internal instruction.
+/// @return - false on success; true otherwise.
+static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
+ InternalInstruction &insn,
+ const MCDisassembler *Dis) {
+ switch (operand.encoding) {
+ default:
+ debug("Unhandled operand encoding during translation");
+ return true;
+ case ENCODING_REG:
+ translateRegister(mcInst, insn.reg);
+ return false;
+ case ENCODING_WRITEMASK:
+ return translateMaskRegister(mcInst, insn.writemask);
+ case ENCODING_SIB:
+ CASE_ENCODING_RM:
+ CASE_ENCODING_VSIB:
+ return translateRM(mcInst, operand, insn, Dis);
+ case ENCODING_IB:
+ case ENCODING_IW:
+ case ENCODING_ID:
+ case ENCODING_IO:
+ case ENCODING_Iv:
+ case ENCODING_Ia:
+ translateImmediate(mcInst,
+ insn.immediates[insn.numImmediatesTranslated++],
+ operand,
+ insn,
+ Dis);
+ return false;
+ case ENCODING_IRC:
+ mcInst.addOperand(MCOperand::createImm(insn.RC));
+ return false;
+ case ENCODING_SI:
+ return translateSrcIndex(mcInst, insn);
+ case ENCODING_DI:
+ return translateDstIndex(mcInst, insn);
+ case ENCODING_RB:
+ case ENCODING_RW:
+ case ENCODING_RD:
+ case ENCODING_RO:
+ case ENCODING_Rv:
+ translateRegister(mcInst, insn.opcodeRegister);
+ return false;
+ case ENCODING_CC:
+ mcInst.addOperand(MCOperand::createImm(insn.immediates[1]));
+ return false;
+ case ENCODING_FP:
+ translateFPRegister(mcInst, insn.modRM & 7);
+ return false;
+ case ENCODING_VVVV:
+ translateRegister(mcInst, insn.vvvv);
+ return false;
+ case ENCODING_DUP:
+ return translateOperand(mcInst, insn.operands[operand.type - TYPE_DUP0],
+ insn, Dis);
+ }
+}
+
+/// translateInstruction - Translates an internal instruction and all its
+/// operands to an MCInst.
+///
+/// @param mcInst - The MCInst to populate with the instruction's data.
+/// @param insn - The internal instruction.
+/// @return - false on success; true otherwise.
+static bool translateInstruction(MCInst &mcInst,
+ InternalInstruction &insn,
+ const MCDisassembler *Dis) {
+ if (!insn.spec) {
+ debug("Instruction has no specification");
+ return true;
+ }
+
+ mcInst.clear();
+ mcInst.setOpcode(insn.instructionID);
+  // If, while reading the prefix bytes, we determined that the overlapping
+  // 0xf2 or 0xf3 prefix byte should be disassembled as xrelease or xacquire,
+  // set the opcode to those instead of the rep and repne opcodes.
+ if (insn.xAcquireRelease) {
+ if(mcInst.getOpcode() == X86::REP_PREFIX)
+ mcInst.setOpcode(X86::XRELEASE_PREFIX);
+ else if(mcInst.getOpcode() == X86::REPNE_PREFIX)
+ mcInst.setOpcode(X86::XACQUIRE_PREFIX);
+ }
+
+ insn.numImmediatesTranslated = 0;
+
+ for (const auto &Op : insn.operands) {
+ if (Op.encoding != ENCODING_NONE) {
+ if (translateOperand(mcInst, Op, insn, Dis)) {
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+static MCDisassembler *createX86Disassembler(const Target &T,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ std::unique_ptr<const MCInstrInfo> MII(T.createMCInstrInfo());
+ return new X86GenericDisassembler(STI, Ctx, std::move(MII));
+}
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Disassembler() {
+ // Register the disassembler.
+ TargetRegistry::RegisterMCDisassembler(getTheX86_32Target(),
+ createX86Disassembler);
+ TargetRegistry::RegisterMCDisassembler(getTheX86_64Target(),
+ createX86Disassembler);
+}
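+
+// A quick way to exercise the registered disassemblers (assuming a built LLVM
+// tree that includes the llvm-mc tool) is, for example:
+//   echo "0x0f 0x58 0xc1" | llvm-mc --disassemble --triple=x86_64
+// which should decode to "addps %xmm1, %xmm0".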
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
new file mode 100644
index 000000000000..4318c17f03a0
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -0,0 +1,647 @@
+//===-- X86DisassemblerDecoder.h - Disassembler decoder ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains the public interface of the instruction decoder.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODER_H
+#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODER_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/X86DisassemblerDecoderCommon.h"
+
+namespace llvm {
+namespace X86Disassembler {
+
+// Accessor functions for various fields of an Intel instruction
+#define modFromModRM(modRM) (((modRM) & 0xc0) >> 6)
+#define regFromModRM(modRM) (((modRM) & 0x38) >> 3)
+#define rmFromModRM(modRM) ((modRM) & 0x7)
+#define scaleFromSIB(sib) (((sib) & 0xc0) >> 6)
+#define indexFromSIB(sib) (((sib) & 0x38) >> 3)
+#define baseFromSIB(sib) ((sib) & 0x7)
+#define wFromREX(rex) (((rex) & 0x8) >> 3)
+#define rFromREX(rex) (((rex) & 0x4) >> 2)
+#define xFromREX(rex) (((rex) & 0x2) >> 1)
+#define bFromREX(rex) ((rex) & 0x1)
+
+#define rFromEVEX2of4(evex) (((~(evex)) & 0x80) >> 7)
+#define xFromEVEX2of4(evex) (((~(evex)) & 0x40) >> 6)
+#define bFromEVEX2of4(evex) (((~(evex)) & 0x20) >> 5)
+#define r2FromEVEX2of4(evex) (((~(evex)) & 0x10) >> 4)
+#define mmFromEVEX2of4(evex) ((evex) & 0x3)
+#define wFromEVEX3of4(evex) (((evex) & 0x80) >> 7)
+#define vvvvFromEVEX3of4(evex) (((~(evex)) & 0x78) >> 3)
+#define ppFromEVEX3of4(evex) ((evex) & 0x3)
+#define zFromEVEX4of4(evex) (((evex) & 0x80) >> 7)
+#define l2FromEVEX4of4(evex) (((evex) & 0x40) >> 6)
+#define lFromEVEX4of4(evex) (((evex) & 0x20) >> 5)
+#define bFromEVEX4of4(evex) (((evex) & 0x10) >> 4)
+#define v2FromEVEX4of4(evex) (((~evex) & 0x8) >> 3)
+#define aaaFromEVEX4of4(evex) ((evex) & 0x7)
+
+#define rFromVEX2of3(vex) (((~(vex)) & 0x80) >> 7)
+#define xFromVEX2of3(vex) (((~(vex)) & 0x40) >> 6)
+#define bFromVEX2of3(vex) (((~(vex)) & 0x20) >> 5)
+#define mmmmmFromVEX2of3(vex) ((vex) & 0x1f)
+#define wFromVEX3of3(vex) (((vex) & 0x80) >> 7)
+#define vvvvFromVEX3of3(vex) (((~(vex)) & 0x78) >> 3)
+#define lFromVEX3of3(vex) (((vex) & 0x4) >> 2)
+#define ppFromVEX3of3(vex) ((vex) & 0x3)
+
+#define rFromVEX2of2(vex) (((~(vex)) & 0x80) >> 7)
+#define vvvvFromVEX2of2(vex) (((~(vex)) & 0x78) >> 3)
+#define lFromVEX2of2(vex) (((vex) & 0x4) >> 2)
+#define ppFromVEX2of2(vex) ((vex) & 0x3)
+
+#define rFromXOP2of3(xop) (((~(xop)) & 0x80) >> 7)
+#define xFromXOP2of3(xop) (((~(xop)) & 0x40) >> 6)
+#define bFromXOP2of3(xop) (((~(xop)) & 0x20) >> 5)
+#define mmmmmFromXOP2of3(xop) ((xop) & 0x1f)
+#define wFromXOP3of3(xop) (((xop) & 0x80) >> 7)
+#define vvvvFromXOP3of3(vex) (((~(vex)) & 0x78) >> 3)
+#define lFromXOP3of3(xop) (((xop) & 0x4) >> 2)
+#define ppFromXOP3of3(xop) ((xop) & 0x3)
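+
+// A minimal compile-time illustration of the extractors above (example byte
+// values only): modRM 0x94 has mod=2, reg=2, rm=4 (a SIB byte follows), and
+// sib 0x88 has scale field 2 (i.e. x4), index 1, base 0.
+static_assert(modFromModRM(0x94) == 2 && regFromModRM(0x94) == 2 &&
+                  rmFromModRM(0x94) == 4,
+              "ModR/M field extraction example");
+static_assert(scaleFromSIB(0x88) == 2 && indexFromSIB(0x88) == 1 &&
+                  baseFromSIB(0x88) == 0,
+              "SIB field extraction example");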
+
+// These enums represent Intel registers for use by the decoder.
+#define REGS_8BIT \
+ ENTRY(AL) \
+ ENTRY(CL) \
+ ENTRY(DL) \
+ ENTRY(BL) \
+ ENTRY(AH) \
+ ENTRY(CH) \
+ ENTRY(DH) \
+ ENTRY(BH) \
+ ENTRY(R8B) \
+ ENTRY(R9B) \
+ ENTRY(R10B) \
+ ENTRY(R11B) \
+ ENTRY(R12B) \
+ ENTRY(R13B) \
+ ENTRY(R14B) \
+ ENTRY(R15B) \
+ ENTRY(SPL) \
+ ENTRY(BPL) \
+ ENTRY(SIL) \
+ ENTRY(DIL)
+
+#define EA_BASES_16BIT \
+ ENTRY(BX_SI) \
+ ENTRY(BX_DI) \
+ ENTRY(BP_SI) \
+ ENTRY(BP_DI) \
+ ENTRY(SI) \
+ ENTRY(DI) \
+ ENTRY(BP) \
+ ENTRY(BX) \
+ ENTRY(R8W) \
+ ENTRY(R9W) \
+ ENTRY(R10W) \
+ ENTRY(R11W) \
+ ENTRY(R12W) \
+ ENTRY(R13W) \
+ ENTRY(R14W) \
+ ENTRY(R15W)
+
+#define REGS_16BIT \
+ ENTRY(AX) \
+ ENTRY(CX) \
+ ENTRY(DX) \
+ ENTRY(BX) \
+ ENTRY(SP) \
+ ENTRY(BP) \
+ ENTRY(SI) \
+ ENTRY(DI) \
+ ENTRY(R8W) \
+ ENTRY(R9W) \
+ ENTRY(R10W) \
+ ENTRY(R11W) \
+ ENTRY(R12W) \
+ ENTRY(R13W) \
+ ENTRY(R14W) \
+ ENTRY(R15W)
+
+#define EA_BASES_32BIT \
+ ENTRY(EAX) \
+ ENTRY(ECX) \
+ ENTRY(EDX) \
+ ENTRY(EBX) \
+ ENTRY(sib) \
+ ENTRY(EBP) \
+ ENTRY(ESI) \
+ ENTRY(EDI) \
+ ENTRY(R8D) \
+ ENTRY(R9D) \
+ ENTRY(R10D) \
+ ENTRY(R11D) \
+ ENTRY(R12D) \
+ ENTRY(R13D) \
+ ENTRY(R14D) \
+ ENTRY(R15D)
+
+#define REGS_32BIT \
+ ENTRY(EAX) \
+ ENTRY(ECX) \
+ ENTRY(EDX) \
+ ENTRY(EBX) \
+ ENTRY(ESP) \
+ ENTRY(EBP) \
+ ENTRY(ESI) \
+ ENTRY(EDI) \
+ ENTRY(R8D) \
+ ENTRY(R9D) \
+ ENTRY(R10D) \
+ ENTRY(R11D) \
+ ENTRY(R12D) \
+ ENTRY(R13D) \
+ ENTRY(R14D) \
+ ENTRY(R15D)
+
+#define EA_BASES_64BIT \
+ ENTRY(RAX) \
+ ENTRY(RCX) \
+ ENTRY(RDX) \
+ ENTRY(RBX) \
+ ENTRY(sib64) \
+ ENTRY(RBP) \
+ ENTRY(RSI) \
+ ENTRY(RDI) \
+ ENTRY(R8) \
+ ENTRY(R9) \
+ ENTRY(R10) \
+ ENTRY(R11) \
+ ENTRY(R12) \
+ ENTRY(R13) \
+ ENTRY(R14) \
+ ENTRY(R15)
+
+#define REGS_64BIT \
+ ENTRY(RAX) \
+ ENTRY(RCX) \
+ ENTRY(RDX) \
+ ENTRY(RBX) \
+ ENTRY(RSP) \
+ ENTRY(RBP) \
+ ENTRY(RSI) \
+ ENTRY(RDI) \
+ ENTRY(R8) \
+ ENTRY(R9) \
+ ENTRY(R10) \
+ ENTRY(R11) \
+ ENTRY(R12) \
+ ENTRY(R13) \
+ ENTRY(R14) \
+ ENTRY(R15)
+
+#define REGS_MMX \
+ ENTRY(MM0) \
+ ENTRY(MM1) \
+ ENTRY(MM2) \
+ ENTRY(MM3) \
+ ENTRY(MM4) \
+ ENTRY(MM5) \
+ ENTRY(MM6) \
+ ENTRY(MM7)
+
+#define REGS_XMM \
+ ENTRY(XMM0) \
+ ENTRY(XMM1) \
+ ENTRY(XMM2) \
+ ENTRY(XMM3) \
+ ENTRY(XMM4) \
+ ENTRY(XMM5) \
+ ENTRY(XMM6) \
+ ENTRY(XMM7) \
+ ENTRY(XMM8) \
+ ENTRY(XMM9) \
+ ENTRY(XMM10) \
+ ENTRY(XMM11) \
+ ENTRY(XMM12) \
+ ENTRY(XMM13) \
+ ENTRY(XMM14) \
+ ENTRY(XMM15) \
+ ENTRY(XMM16) \
+ ENTRY(XMM17) \
+ ENTRY(XMM18) \
+ ENTRY(XMM19) \
+ ENTRY(XMM20) \
+ ENTRY(XMM21) \
+ ENTRY(XMM22) \
+ ENTRY(XMM23) \
+ ENTRY(XMM24) \
+ ENTRY(XMM25) \
+ ENTRY(XMM26) \
+ ENTRY(XMM27) \
+ ENTRY(XMM28) \
+ ENTRY(XMM29) \
+ ENTRY(XMM30) \
+ ENTRY(XMM31)
+
+#define REGS_YMM \
+ ENTRY(YMM0) \
+ ENTRY(YMM1) \
+ ENTRY(YMM2) \
+ ENTRY(YMM3) \
+ ENTRY(YMM4) \
+ ENTRY(YMM5) \
+ ENTRY(YMM6) \
+ ENTRY(YMM7) \
+ ENTRY(YMM8) \
+ ENTRY(YMM9) \
+ ENTRY(YMM10) \
+ ENTRY(YMM11) \
+ ENTRY(YMM12) \
+ ENTRY(YMM13) \
+ ENTRY(YMM14) \
+ ENTRY(YMM15) \
+ ENTRY(YMM16) \
+ ENTRY(YMM17) \
+ ENTRY(YMM18) \
+ ENTRY(YMM19) \
+ ENTRY(YMM20) \
+ ENTRY(YMM21) \
+ ENTRY(YMM22) \
+ ENTRY(YMM23) \
+ ENTRY(YMM24) \
+ ENTRY(YMM25) \
+ ENTRY(YMM26) \
+ ENTRY(YMM27) \
+ ENTRY(YMM28) \
+ ENTRY(YMM29) \
+ ENTRY(YMM30) \
+ ENTRY(YMM31)
+
+#define REGS_ZMM \
+ ENTRY(ZMM0) \
+ ENTRY(ZMM1) \
+ ENTRY(ZMM2) \
+ ENTRY(ZMM3) \
+ ENTRY(ZMM4) \
+ ENTRY(ZMM5) \
+ ENTRY(ZMM6) \
+ ENTRY(ZMM7) \
+ ENTRY(ZMM8) \
+ ENTRY(ZMM9) \
+ ENTRY(ZMM10) \
+ ENTRY(ZMM11) \
+ ENTRY(ZMM12) \
+ ENTRY(ZMM13) \
+ ENTRY(ZMM14) \
+ ENTRY(ZMM15) \
+ ENTRY(ZMM16) \
+ ENTRY(ZMM17) \
+ ENTRY(ZMM18) \
+ ENTRY(ZMM19) \
+ ENTRY(ZMM20) \
+ ENTRY(ZMM21) \
+ ENTRY(ZMM22) \
+ ENTRY(ZMM23) \
+ ENTRY(ZMM24) \
+ ENTRY(ZMM25) \
+ ENTRY(ZMM26) \
+ ENTRY(ZMM27) \
+ ENTRY(ZMM28) \
+ ENTRY(ZMM29) \
+ ENTRY(ZMM30) \
+ ENTRY(ZMM31)
+
+#define REGS_MASKS \
+ ENTRY(K0) \
+ ENTRY(K1) \
+ ENTRY(K2) \
+ ENTRY(K3) \
+ ENTRY(K4) \
+ ENTRY(K5) \
+ ENTRY(K6) \
+ ENTRY(K7)
+
+#define REGS_MASK_PAIRS \
+ ENTRY(K0_K1) \
+ ENTRY(K2_K3) \
+ ENTRY(K4_K5) \
+ ENTRY(K6_K7)
+
+#define REGS_SEGMENT \
+ ENTRY(ES) \
+ ENTRY(CS) \
+ ENTRY(SS) \
+ ENTRY(DS) \
+ ENTRY(FS) \
+ ENTRY(GS)
+
+#define REGS_DEBUG \
+ ENTRY(DR0) \
+ ENTRY(DR1) \
+ ENTRY(DR2) \
+ ENTRY(DR3) \
+ ENTRY(DR4) \
+ ENTRY(DR5) \
+ ENTRY(DR6) \
+ ENTRY(DR7) \
+ ENTRY(DR8) \
+ ENTRY(DR9) \
+ ENTRY(DR10) \
+ ENTRY(DR11) \
+ ENTRY(DR12) \
+ ENTRY(DR13) \
+ ENTRY(DR14) \
+ ENTRY(DR15)
+
+#define REGS_CONTROL \
+ ENTRY(CR0) \
+ ENTRY(CR1) \
+ ENTRY(CR2) \
+ ENTRY(CR3) \
+ ENTRY(CR4) \
+ ENTRY(CR5) \
+ ENTRY(CR6) \
+ ENTRY(CR7) \
+ ENTRY(CR8) \
+ ENTRY(CR9) \
+ ENTRY(CR10) \
+ ENTRY(CR11) \
+ ENTRY(CR12) \
+ ENTRY(CR13) \
+ ENTRY(CR14) \
+ ENTRY(CR15)
+
+#define REGS_BOUND \
+ ENTRY(BND0) \
+ ENTRY(BND1) \
+ ENTRY(BND2) \
+ ENTRY(BND3)
+
+#undef REGS_TMM
+#define REGS_TMM \
+ ENTRY(TMM0) \
+ ENTRY(TMM1) \
+ ENTRY(TMM2) \
+ ENTRY(TMM3) \
+ ENTRY(TMM4) \
+ ENTRY(TMM5) \
+ ENTRY(TMM6) \
+ ENTRY(TMM7)
+
+#define ALL_EA_BASES \
+ EA_BASES_16BIT \
+ EA_BASES_32BIT \
+ EA_BASES_64BIT
+
+#define ALL_SIB_BASES \
+ REGS_32BIT \
+ REGS_64BIT
+
+#define ALL_REGS \
+ REGS_8BIT \
+ REGS_16BIT \
+ REGS_32BIT \
+ REGS_64BIT \
+ REGS_MMX \
+ REGS_XMM \
+ REGS_YMM \
+ REGS_ZMM \
+ REGS_MASKS \
+ REGS_MASK_PAIRS \
+ REGS_SEGMENT \
+ REGS_DEBUG \
+ REGS_CONTROL \
+ REGS_BOUND \
+ REGS_TMM \
+ ENTRY(RIP)
+
+/// All possible values of the base field for effective-address
+/// computations, a.k.a. the Mod and R/M fields of the ModR/M byte.
+/// We distinguish between bases (EA_BASE_*) and registers that just happen
+/// to be referred to when Mod == 0b11 (EA_REG_*).
+enum EABase {
+ EA_BASE_NONE,
+#define ENTRY(x) EA_BASE_##x,
+ ALL_EA_BASES
+#undef ENTRY
+#define ENTRY(x) EA_REG_##x,
+ ALL_REGS
+#undef ENTRY
+ EA_max
+};
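+
+// For example, in 32-bit addressing, mod=0b00 with rm=0b001 yields the base
+// EA_BASE_ECX (i.e. [ecx]), while mod=0b11 with the same rm refers to the
+// register itself, EA_REG_ECX.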
+
+/// All possible values of the SIB index field.
+/// Borrows entries from ALL_EA_BASES, with the special case that
+/// sib is synonymous with NONE.
+/// Vector SIB: the index can be an XMM, YMM, or ZMM register.
+enum SIBIndex {
+ SIB_INDEX_NONE,
+#define ENTRY(x) SIB_INDEX_##x,
+ ALL_EA_BASES
+ REGS_XMM
+ REGS_YMM
+ REGS_ZMM
+#undef ENTRY
+ SIB_INDEX_max
+};
+
+/// All possible values of the SIB base field.
+enum SIBBase {
+ SIB_BASE_NONE,
+#define ENTRY(x) SIB_BASE_##x,
+ ALL_SIB_BASES
+#undef ENTRY
+ SIB_BASE_max
+};
+
+/// Possible displacement types for effective-address computations.
+enum EADisplacement {
+ EA_DISP_NONE,
+ EA_DISP_8,
+ EA_DISP_16,
+ EA_DISP_32
+};
+
+/// All possible values of the reg field in the ModR/M byte.
+enum Reg {
+#define ENTRY(x) MODRM_REG_##x,
+ ALL_REGS
+#undef ENTRY
+ MODRM_REG_max
+};
+
+/// All possible segment overrides.
+enum SegmentOverride {
+ SEG_OVERRIDE_NONE,
+ SEG_OVERRIDE_CS,
+ SEG_OVERRIDE_SS,
+ SEG_OVERRIDE_DS,
+ SEG_OVERRIDE_ES,
+ SEG_OVERRIDE_FS,
+ SEG_OVERRIDE_GS,
+ SEG_OVERRIDE_max
+};
+
+/// Possible values for the VEX.m-mmmm field
+enum VEXLeadingOpcodeByte {
+ VEX_LOB_0F = 0x1,
+ VEX_LOB_0F38 = 0x2,
+ VEX_LOB_0F3A = 0x3
+};
+
+enum XOPMapSelect {
+ XOP_MAP_SELECT_8 = 0x8,
+ XOP_MAP_SELECT_9 = 0x9,
+ XOP_MAP_SELECT_A = 0xA
+};
+
+/// Possible values for the VEX.pp/EVEX.pp field
+enum VEXPrefixCode {
+ VEX_PREFIX_NONE = 0x0,
+ VEX_PREFIX_66 = 0x1,
+ VEX_PREFIX_F3 = 0x2,
+ VEX_PREFIX_F2 = 0x3
+};
+
+enum VectorExtensionType {
+ TYPE_NO_VEX_XOP = 0x0,
+ TYPE_VEX_2B = 0x1,
+ TYPE_VEX_3B = 0x2,
+ TYPE_EVEX = 0x3,
+ TYPE_XOP = 0x4
+};
+
+/// The specification for how to extract and interpret a full instruction and
+/// its operands.
+struct InstructionSpecifier {
+ uint16_t operands;
+};
+
+/// The x86 internal instruction, which is produced by the decoder.
+struct InternalInstruction {
+ // Opaque value passed to the reader
+ llvm::ArrayRef<uint8_t> bytes;
+ // The address of the next byte to read via the reader
+ uint64_t readerCursor;
+
+ // General instruction information
+
+ // The mode to disassemble for (64-bit, protected, real)
+ DisassemblerMode mode;
+ // The start of the instruction, usable with the reader
+ uint64_t startLocation;
+ // The length of the instruction, in bytes
+ size_t length;
+
+ // Prefix state
+
+ // The possible mandatory prefix
+ uint8_t mandatoryPrefix;
+  // The value of the vector extension prefix (EVEX/VEX/XOP), if present
+ uint8_t vectorExtensionPrefix[4];
+ // The type of the vector extension prefix
+ VectorExtensionType vectorExtensionType;
+ // The value of the REX prefix, if present
+ uint8_t rexPrefix;
+ // The segment override type
+ SegmentOverride segmentOverride;
+  // True if the 0xf2 or 0xf3 prefix byte is xacquire or xrelease
+ bool xAcquireRelease;
+
+ // Address-size override
+ bool hasAdSize;
+ // Operand-size override
+ bool hasOpSize;
+ // Lock prefix
+ bool hasLockPrefix;
+ // The repeat prefix if any
+ uint8_t repeatPrefix;
+
+ // Sizes of various critical pieces of data, in bytes
+ uint8_t registerSize;
+ uint8_t addressSize;
+ uint8_t displacementSize;
+ uint8_t immediateSize;
+
+  // Offsets from the start of the instruction to the pieces of data, which are
+  // needed to find relocation entries for adding symbolic operands.
+ uint8_t displacementOffset;
+ uint8_t immediateOffset;
+
+ // opcode state
+
+ // The last byte of the opcode, not counting any ModR/M extension
+ uint8_t opcode;
+
+ // decode state
+
+ // The type of opcode, used for indexing into the array of decode tables
+ OpcodeType opcodeType;
+ // The instruction ID, extracted from the decode table
+ uint16_t instructionID;
+ // The specifier for the instruction, from the instruction info table
+ const InstructionSpecifier *spec;
+
+  // State for additional bytes consumed during operand decode. Pattern:
+  // consumed___ indicates that the byte was already consumed and does not
+  // need to be consumed again.
+
+ // The VEX.vvvv field, which contains a third register operand for some AVX
+ // instructions.
+ Reg vvvv;
+
+ // The writemask for AVX-512 instructions which is contained in EVEX.aaa
+ Reg writemask;
+
+ // The ModR/M byte, which contains most register operands and some portion of
+ // all memory operands.
+ bool consumedModRM;
+ uint8_t modRM;
+
+ // The SIB byte, used for more complex 32- or 64-bit memory operands
+ uint8_t sib;
+
+ // The displacement, used for memory operands
+ int32_t displacement;
+
+ // Immediates. There can be two in some cases
+ uint8_t numImmediatesConsumed;
+ uint8_t numImmediatesTranslated;
+ uint64_t immediates[2];
+
+ // A register or immediate operand encoded into the opcode
+ Reg opcodeRegister;
+
+ // Portions of the ModR/M byte
+
+ // These fields determine the allowable values for the ModR/M fields, which
+ // depend on operand and address widths.
+ EABase eaRegBase;
+ Reg regBase;
+
+ // The Mod and R/M fields can encode a base for an effective address, or a
+ // register. These are separated into two fields here.
+ EABase eaBase;
+ EADisplacement eaDisplacement;
+ // The reg field always encodes a register
+ Reg reg;
+
+ // SIB state
+ SIBIndex sibIndexBase;
+ SIBIndex sibIndex;
+ uint8_t sibScale;
+ SIBBase sibBase;
+
+ // Embedded rounding control.
+ uint8_t RC;
+
+ ArrayRef<OperandSpecifier> operands;
+};
+
+} // namespace X86Disassembler
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/ImmutableGraph.h b/contrib/llvm-project/llvm/lib/Target/X86/ImmutableGraph.h
new file mode 100644
index 000000000000..56738e9cfa73
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/ImmutableGraph.h
@@ -0,0 +1,445 @@
+//==========-- ImmutableGraph.h - A fast DAG implementation ---------=========//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Description: ImmutableGraph is a fast DAG implementation that cannot be
+/// modified, except by creating a new ImmutableGraph. ImmutableGraph is
+/// implemented as two arrays: one containing nodes, and one containing edges.
+/// The advantages to this implementation are two-fold:
+/// 1. Iteration and traversal operations benefit from cache locality.
+/// 2. Operations on sets of nodes/edges are efficient, and representations of
+/// those sets in memory are compact. For instance, a set of edges is
+/// implemented as a bit vector, wherein each bit corresponds to one edge in
+/// the edge array. This implies a lower bound of 64x spatial improvement
+/// over, e.g., an llvm::DenseSet or llvm::SmallSet. It also means that
+/// insert/erase/contains operations complete in negligible constant time:
+/// insert and erase require one load and one store, and contains requires
+/// just one load.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_IMMUTABLEGRAPH_H
+#define LLVM_LIB_TARGET_X86_IMMUTABLEGRAPH_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+#include <iterator>
+#include <utility>
+#include <vector>
+
+namespace llvm {
+
+template <typename NodeValueT, typename EdgeValueT> class ImmutableGraph {
+ using Traits = GraphTraits<ImmutableGraph<NodeValueT, EdgeValueT> *>;
+ template <typename> friend class ImmutableGraphBuilder;
+
+public:
+ using node_value_type = NodeValueT;
+ using edge_value_type = EdgeValueT;
+ using size_type = int;
+ class Node;
+ class Edge {
+ friend class ImmutableGraph;
+ template <typename> friend class ImmutableGraphBuilder;
+
+ const Node *Dest;
+ edge_value_type Value;
+
+ public:
+    const Node *getDest() const { return Dest; }
+ const edge_value_type &getValue() const { return Value; }
+ };
+ class Node {
+ friend class ImmutableGraph;
+ template <typename> friend class ImmutableGraphBuilder;
+
+ const Edge *Edges;
+ node_value_type Value;
+
+ public:
+ const node_value_type &getValue() const { return Value; }
+
+ const Edge *edges_begin() const { return Edges; }
+ // Nodes are allocated sequentially. Edges for a node are stored together.
+ // The end of this Node's edges is the beginning of the next node's edges.
+ // An extra node was allocated to hold the end pointer for the last real
+ // node.
+ const Edge *edges_end() const { return (this + 1)->Edges; }
+ ArrayRef<Edge> edges() const {
+ return makeArrayRef(edges_begin(), edges_end());
+ }
+ };
+
+protected:
+ ImmutableGraph(std::unique_ptr<Node[]> Nodes, std::unique_ptr<Edge[]> Edges,
+ size_type NodesSize, size_type EdgesSize)
+ : Nodes(std::move(Nodes)), Edges(std::move(Edges)), NodesSize(NodesSize),
+ EdgesSize(EdgesSize) {}
+ ImmutableGraph(const ImmutableGraph &) = delete;
+ ImmutableGraph(ImmutableGraph &&) = delete;
+ ImmutableGraph &operator=(const ImmutableGraph &) = delete;
+ ImmutableGraph &operator=(ImmutableGraph &&) = delete;
+
+public:
+ ArrayRef<Node> nodes() const { return makeArrayRef(Nodes.get(), NodesSize); }
+ const Node *nodes_begin() const { return nodes().begin(); }
+ const Node *nodes_end() const { return nodes().end(); }
+
+ ArrayRef<Edge> edges() const { return makeArrayRef(Edges.get(), EdgesSize); }
+ const Edge *edges_begin() const { return edges().begin(); }
+ const Edge *edges_end() const { return edges().end(); }
+
+ size_type nodes_size() const { return NodesSize; }
+ size_type edges_size() const { return EdgesSize; }
+
+ // Node N must belong to this ImmutableGraph.
+ size_type getNodeIndex(const Node &N) const {
+ return std::distance(nodes_begin(), &N);
+ }
+ // Edge E must belong to this ImmutableGraph.
+ size_type getEdgeIndex(const Edge &E) const {
+ return std::distance(edges_begin(), &E);
+ }
+
+ // FIXME: Could NodeSet and EdgeSet be templated to share code?
+ class NodeSet {
+ const ImmutableGraph &G;
+ BitVector V;
+
+ public:
+ NodeSet(const ImmutableGraph &G, bool ContainsAll = false)
+ : G{G}, V{static_cast<unsigned>(G.nodes_size()), ContainsAll} {}
+ bool insert(const Node &N) {
+ size_type Idx = G.getNodeIndex(N);
+ bool AlreadyExists = V.test(Idx);
+ V.set(Idx);
+ return !AlreadyExists;
+ }
+ void erase(const Node &N) {
+ size_type Idx = G.getNodeIndex(N);
+ V.reset(Idx);
+ }
+ bool contains(const Node &N) const {
+ size_type Idx = G.getNodeIndex(N);
+ return V.test(Idx);
+ }
+ void clear() { V.reset(); }
+    bool empty() const { return V.none(); }
+ /// Return the number of elements in the set
+ size_type count() const { return V.count(); }
+ /// Return the size of the set's domain
+ size_type size() const { return V.size(); }
+ /// Set union
+ NodeSet &operator|=(const NodeSet &RHS) {
+ assert(&this->G == &RHS.G);
+ V |= RHS.V;
+ return *this;
+ }
+ /// Set intersection
+ NodeSet &operator&=(const NodeSet &RHS) {
+ assert(&this->G == &RHS.G);
+ V &= RHS.V;
+ return *this;
+ }
+    /// Set symmetric difference
+ NodeSet &operator^=(const NodeSet &RHS) {
+ assert(&this->G == &RHS.G);
+ V ^= RHS.V;
+ return *this;
+ }
+
+ using index_iterator = typename BitVector::const_set_bits_iterator;
+ index_iterator index_begin() const { return V.set_bits_begin(); }
+ index_iterator index_end() const { return V.set_bits_end(); }
+ void set(size_type Idx) { V.set(Idx); }
+ void reset(size_type Idx) { V.reset(Idx); }
+
+ class iterator {
+ const NodeSet &Set;
+ size_type Current;
+
+ void advance() {
+ assert(Current != -1);
+ Current = Set.V.find_next(Current);
+ }
+
+ public:
+ iterator(const NodeSet &Set, size_type Begin)
+ : Set{Set}, Current{Begin} {}
+ iterator operator++(int) {
+ iterator Tmp = *this;
+ advance();
+ return Tmp;
+ }
+ iterator &operator++() {
+ advance();
+ return *this;
+ }
+      const Node *operator*() const {
+ assert(Current != -1);
+ return Set.G.nodes_begin() + Current;
+ }
+ bool operator==(const iterator &other) const {
+ assert(&this->Set == &other.Set);
+ return this->Current == other.Current;
+ }
+ bool operator!=(const iterator &other) const { return !(*this == other); }
+ };
+
+ iterator begin() const { return iterator{*this, V.find_first()}; }
+ iterator end() const { return iterator{*this, -1}; }
+ };
+
+ class EdgeSet {
+ const ImmutableGraph &G;
+ BitVector V;
+
+ public:
+ EdgeSet(const ImmutableGraph &G, bool ContainsAll = false)
+ : G{G}, V{static_cast<unsigned>(G.edges_size()), ContainsAll} {}
+ bool insert(const Edge &E) {
+ size_type Idx = G.getEdgeIndex(E);
+ bool AlreadyExists = V.test(Idx);
+ V.set(Idx);
+ return !AlreadyExists;
+ }
+ void erase(const Edge &E) {
+ size_type Idx = G.getEdgeIndex(E);
+ V.reset(Idx);
+ }
+ bool contains(const Edge &E) const {
+ size_type Idx = G.getEdgeIndex(E);
+ return V.test(Idx);
+ }
+ void clear() { V.reset(); }
+ bool empty() const { return V.none(); }
+ /// Return the number of elements in the set
+ size_type count() const { return V.count(); }
+ /// Return the size of the set's domain
+ size_type size() const { return V.size(); }
+ /// Set union
+ EdgeSet &operator|=(const EdgeSet &RHS) {
+ assert(&this->G == &RHS.G);
+ V |= RHS.V;
+ return *this;
+ }
+ /// Set intersection
+ EdgeSet &operator&=(const EdgeSet &RHS) {
+ assert(&this->G == &RHS.G);
+ V &= RHS.V;
+ return *this;
+ }
+    /// Set symmetric difference
+ EdgeSet &operator^=(const EdgeSet &RHS) {
+ assert(&this->G == &RHS.G);
+ V ^= RHS.V;
+ return *this;
+ }
+
+ using index_iterator = typename BitVector::const_set_bits_iterator;
+ index_iterator index_begin() const { return V.set_bits_begin(); }
+ index_iterator index_end() const { return V.set_bits_end(); }
+ void set(size_type Idx) { V.set(Idx); }
+ void reset(size_type Idx) { V.reset(Idx); }
+
+ class iterator {
+ const EdgeSet &Set;
+ size_type Current;
+
+ void advance() {
+ assert(Current != -1);
+ Current = Set.V.find_next(Current);
+ }
+
+ public:
+ iterator(const EdgeSet &Set, size_type Begin)
+ : Set{Set}, Current{Begin} {}
+ iterator operator++(int) {
+ iterator Tmp = *this;
+ advance();
+ return Tmp;
+ }
+ iterator &operator++() {
+ advance();
+ return *this;
+ }
+      const Edge *operator*() const {
+ assert(Current != -1);
+ return Set.G.edges_begin() + Current;
+ }
+ bool operator==(const iterator &other) const {
+ assert(&this->Set == &other.Set);
+ return this->Current == other.Current;
+ }
+ bool operator!=(const iterator &other) const { return !(*this == other); }
+ };
+
+ iterator begin() const { return iterator{*this, V.find_first()}; }
+ iterator end() const { return iterator{*this, -1}; }
+ };
+
+private:
+ std::unique_ptr<Node[]> Nodes;
+ std::unique_ptr<Edge[]> Edges;
+ size_type NodesSize;
+ size_type EdgesSize;
+};
+
+template <typename GraphT> class ImmutableGraphBuilder {
+ using node_value_type = typename GraphT::node_value_type;
+ using edge_value_type = typename GraphT::edge_value_type;
+ static_assert(
+ std::is_base_of<ImmutableGraph<node_value_type, edge_value_type>,
+ GraphT>::value,
+ "Template argument to ImmutableGraphBuilder must derive from "
+ "ImmutableGraph<>");
+ using size_type = typename GraphT::size_type;
+ using NodeSet = typename GraphT::NodeSet;
+ using Node = typename GraphT::Node;
+ using EdgeSet = typename GraphT::EdgeSet;
+ using Edge = typename GraphT::Edge;
+ using BuilderEdge = std::pair<edge_value_type, size_type>;
+ using EdgeList = std::vector<BuilderEdge>;
+ using BuilderVertex = std::pair<node_value_type, EdgeList>;
+ using VertexVec = std::vector<BuilderVertex>;
+
+public:
+ using BuilderNodeRef = size_type;
+
+ BuilderNodeRef addVertex(const node_value_type &V) {
+ auto I = AdjList.emplace(AdjList.end(), V, EdgeList{});
+ return std::distance(AdjList.begin(), I);
+ }
+
+ void addEdge(const edge_value_type &E, BuilderNodeRef From,
+ BuilderNodeRef To) {
+ AdjList[From].second.emplace_back(E, To);
+ }
+
+ bool empty() const { return AdjList.empty(); }
+
+ template <typename... ArgT> std::unique_ptr<GraphT> get(ArgT &&... Args) {
+ size_type VertexSize = AdjList.size(), EdgeSize = 0;
+ for (const auto &V : AdjList) {
+ EdgeSize += V.second.size();
+ }
+ auto VertexArray =
+ std::make_unique<Node[]>(VertexSize + 1 /* terminator node */);
+ auto EdgeArray = std::make_unique<Edge[]>(EdgeSize);
+ size_type VI = 0, EI = 0;
+ for (; VI < VertexSize; ++VI) {
+ VertexArray[VI].Value = std::move(AdjList[VI].first);
+ VertexArray[VI].Edges = &EdgeArray[EI];
+ auto NumEdges = static_cast<size_type>(AdjList[VI].second.size());
+ for (size_type VEI = 0; VEI < NumEdges; ++VEI, ++EI) {
+ auto &E = AdjList[VI].second[VEI];
+ EdgeArray[EI].Value = std::move(E.first);
+ EdgeArray[EI].Dest = &VertexArray[E.second];
+ }
+ }
+ assert(VI == VertexSize && EI == EdgeSize && "ImmutableGraph malformed");
+ VertexArray[VI].Edges = &EdgeArray[EdgeSize]; // terminator node
+ return std::make_unique<GraphT>(std::move(VertexArray),
+ std::move(EdgeArray), VertexSize, EdgeSize,
+ std::forward<ArgT>(Args)...);
+ }
+
+ template <typename... ArgT>
+ static std::unique_ptr<GraphT> trim(const GraphT &G, const NodeSet &TrimNodes,
+ const EdgeSet &TrimEdges,
+ ArgT &&... Args) {
+ size_type NewVertexSize = G.nodes_size() - TrimNodes.count();
+ size_type NewEdgeSize = G.edges_size() - TrimEdges.count();
+ auto NewVertexArray =
+ std::make_unique<Node[]>(NewVertexSize + 1 /* terminator node */);
+ auto NewEdgeArray = std::make_unique<Edge[]>(NewEdgeSize);
+
+ // Walk the nodes and determine the new index for each node.
+ size_type NewNodeIndex = 0;
+ std::vector<size_type> RemappedNodeIndex(G.nodes_size());
+ for (const Node &N : G.nodes()) {
+ if (TrimNodes.contains(N))
+ continue;
+ RemappedNodeIndex[G.getNodeIndex(N)] = NewNodeIndex++;
+ }
+ assert(NewNodeIndex == NewVertexSize &&
+ "Should have assigned NewVertexSize indices");
+
+ size_type VertexI = 0, EdgeI = 0;
+ for (const Node &N : G.nodes()) {
+ if (TrimNodes.contains(N))
+ continue;
+ NewVertexArray[VertexI].Value = N.getValue();
+ NewVertexArray[VertexI].Edges = &NewEdgeArray[EdgeI];
+ for (const Edge &E : N.edges()) {
+ if (TrimEdges.contains(E))
+ continue;
+ NewEdgeArray[EdgeI].Value = E.getValue();
+ size_type DestIdx = G.getNodeIndex(*E.getDest());
+ size_type NewIdx = RemappedNodeIndex[DestIdx];
+ assert(NewIdx < NewVertexSize);
+ NewEdgeArray[EdgeI].Dest = &NewVertexArray[NewIdx];
+ ++EdgeI;
+ }
+ ++VertexI;
+ }
+ assert(VertexI == NewVertexSize && EdgeI == NewEdgeSize &&
+ "Gadget graph malformed");
+ NewVertexArray[VertexI].Edges = &NewEdgeArray[NewEdgeSize]; // terminator
+ return std::make_unique<GraphT>(std::move(NewVertexArray),
+ std::move(NewEdgeArray), NewVertexSize,
+ NewEdgeSize, std::forward<ArgT>(Args)...);
+ }
+
+private:
+ VertexVec AdjList;
+};
+
+template <typename NodeValueT, typename EdgeValueT>
+struct GraphTraits<ImmutableGraph<NodeValueT, EdgeValueT> *> {
+ using GraphT = ImmutableGraph<NodeValueT, EdgeValueT>;
+ using NodeRef = typename GraphT::Node const *;
+ using EdgeRef = typename GraphT::Edge const &;
+
+ static NodeRef edge_dest(EdgeRef E) { return E.getDest(); }
+ using ChildIteratorType =
+ mapped_iterator<typename GraphT::Edge const *, decltype(&edge_dest)>;
+
+ static NodeRef getEntryNode(GraphT *G) { return G->nodes_begin(); }
+ static ChildIteratorType child_begin(NodeRef N) {
+ return {N->edges_begin(), &edge_dest};
+ }
+ static ChildIteratorType child_end(NodeRef N) {
+ return {N->edges_end(), &edge_dest};
+ }
+
+ static NodeRef getNode(typename GraphT::Node const &N) { return NodeRef{&N}; }
+ using nodes_iterator =
+ mapped_iterator<typename GraphT::Node const *, decltype(&getNode)>;
+ static nodes_iterator nodes_begin(GraphT *G) {
+ return {G->nodes_begin(), &getNode};
+ }
+ static nodes_iterator nodes_end(GraphT *G) {
+ return {G->nodes_end(), &getNode};
+ }
+
+ using ChildEdgeIteratorType = typename GraphT::Edge const *;
+
+ static ChildEdgeIteratorType child_edge_begin(NodeRef N) {
+ return N->edges_begin();
+ }
+ static ChildEdgeIteratorType child_edge_end(NodeRef N) {
+ return N->edges_end();
+ }
+ static typename GraphT::size_type size(GraphT *G) { return G->nodes_size(); }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_IMMUTABLEGRAPH_H
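For orientation, here is a minimal usage sketch of the builder defined above; MyGraph and buildExample are made-up names and the node/edge value type (int) is arbitrary. The pattern is: derive a concrete graph type that re-exposes the protected constructor, add vertices and edges through ImmutableGraphBuilder, then call get() to freeze everything into the node and edge arrays.

  #include "ImmutableGraph.h"
  #include <memory>

  // Hypothetical concrete graph type with int node and edge values.
  struct MyGraph : llvm::ImmutableGraph<int, int> {
    // Re-expose the protected base constructor so ImmutableGraphBuilder::get()
    // can construct us through std::make_unique.
    MyGraph(std::unique_ptr<Node[]> Nodes, std::unique_ptr<Edge[]> Edges,
            size_type NodesSize, size_type EdgesSize)
        : ImmutableGraph(std::move(Nodes), std::move(Edges), NodesSize,
                         EdgesSize) {}
  };

  std::unique_ptr<MyGraph> buildExample() {
    llvm::ImmutableGraphBuilder<MyGraph> Builder;
    auto A = Builder.addVertex(1); // BuilderNodeRef, i.e. an index
    auto B = Builder.addVertex(2);
    Builder.addEdge(/*edge value*/ 7, A, B);
    return Builder.get(); // materializes the immutable node/edge arrays
  }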
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
new file mode 100644
index 000000000000..c685d7e0db81
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
@@ -0,0 +1,498 @@
+//===-- X86ATTInstPrinter.cpp - AT&T assembly instruction printing --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as AT&T-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ATTInstPrinter.h"
+#include "X86BaseInfo.h"
+#include "X86InstComments.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
+#include "X86GenAsmWriter.inc"
+
+void X86ATTInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+ OS << markup("<reg:") << '%' << getRegisterName(RegNo) << markup(">");
+}
+
+void X86ATTInstPrinter::printInst(const MCInst *MI, uint64_t Address,
+ StringRef Annot, const MCSubtargetInfo &STI,
+ raw_ostream &OS) {
+ // If verbose assembly is enabled, we can print some informative comments.
+ if (CommentStream)
+ HasCustomInstComment = EmitAnyX86InstComments(MI, *CommentStream, MII);
+
+ printInstFlags(MI, OS);
+
+ // Output CALLpcrel32 as "callq" in 64-bit mode.
+ // In Intel annotation it's always emitted as "call".
+ //
+ // TODO: Probably this hack should be redesigned via InstAlias in
+ // InstrInfo.td as soon as Requires clause is supported properly
+ // for InstAlias.
+ if (MI->getOpcode() == X86::CALLpcrel32 &&
+ (STI.getFeatureBits()[X86::Mode64Bit])) {
+ OS << "\tcallq\t";
+ printPCRelImm(MI, Address, 0, OS);
+ }
+  // data16 and data32 both have the same encoding of 0x66. While data32 is
+  // valid only in 16-bit systems, data16 is valid in the rest.
+  // There seems to be a lack of support for the Requires clause that causes
+  // 0x66 to be interpreted as "data16" by the asm printer.
+  // Thus we add an adjustment here in order to print the "right" instruction.
+ else if (MI->getOpcode() == X86::DATA16_PREFIX &&
+ STI.getFeatureBits()[X86::Mode16Bit]) {
+ OS << "\tdata32";
+ }
+ // Try to print any aliases first.
+ else if (!printAliasInstr(MI, Address, OS) && !printVecCompareInstr(MI, OS))
+ printInstruction(MI, Address, OS);
+
+ // Next always print the annotation.
+ printAnnotation(OS, Annot);
+}
+
+bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI,
+ raw_ostream &OS) {
+ if (MI->getNumOperands() == 0 ||
+ !MI->getOperand(MI->getNumOperands() - 1).isImm())
+ return false;
+
+ int64_t Imm = MI->getOperand(MI->getNumOperands() - 1).getImm();
+
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+
+ // Custom print the vector compare instructions to get the immediate
+ // translated into the mnemonic.
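+  // For example, CMPPSrri with immediate 1 is printed as "cmpltps" rather
+  // than as "cmpps $1, ...".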
+ switch (MI->getOpcode()) {
+ case X86::CMPPDrmi: case X86::CMPPDrri:
+ case X86::CMPPSrmi: case X86::CMPPSrri:
+ case X86::CMPSDrm: case X86::CMPSDrr:
+ case X86::CMPSDrm_Int: case X86::CMPSDrr_Int:
+ case X86::CMPSSrm: case X86::CMPSSrr:
+ case X86::CMPSSrm_Int: case X86::CMPSSrr_Int:
+ if (Imm >= 0 && Imm <= 7) {
+ OS << '\t';
+ printCMPMnemonic(MI, /*IsVCMP*/false, OS);
+
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) {
+ if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS)
+ printdwordmem(MI, 2, OS);
+ else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD)
+ printqwordmem(MI, 2, OS);
+ else
+ printxmmwordmem(MI, 2, OS);
+ } else
+ printOperand(MI, 2, OS);
+
+      // Skip operand 1 as it's tied to the dest.
+
+ OS << ", ";
+ printOperand(MI, 0, OS);
+ return true;
+ }
+ break;
+
+ case X86::VCMPPDrmi: case X86::VCMPPDrri:
+ case X86::VCMPPDYrmi: case X86::VCMPPDYrri:
+ case X86::VCMPPDZ128rmi: case X86::VCMPPDZ128rri:
+ case X86::VCMPPDZ256rmi: case X86::VCMPPDZ256rri:
+ case X86::VCMPPDZrmi: case X86::VCMPPDZrri:
+ case X86::VCMPPSrmi: case X86::VCMPPSrri:
+ case X86::VCMPPSYrmi: case X86::VCMPPSYrri:
+ case X86::VCMPPSZ128rmi: case X86::VCMPPSZ128rri:
+ case X86::VCMPPSZ256rmi: case X86::VCMPPSZ256rri:
+ case X86::VCMPPSZrmi: case X86::VCMPPSZrri:
+ case X86::VCMPSDrm: case X86::VCMPSDrr:
+ case X86::VCMPSDZrm: case X86::VCMPSDZrr:
+ case X86::VCMPSDrm_Int: case X86::VCMPSDrr_Int:
+ case X86::VCMPSDZrm_Int: case X86::VCMPSDZrr_Int:
+ case X86::VCMPSSrm: case X86::VCMPSSrr:
+ case X86::VCMPSSZrm: case X86::VCMPSSZrr:
+ case X86::VCMPSSrm_Int: case X86::VCMPSSrr_Int:
+ case X86::VCMPSSZrm_Int: case X86::VCMPSSZrr_Int:
+ case X86::VCMPPDZ128rmik: case X86::VCMPPDZ128rrik:
+ case X86::VCMPPDZ256rmik: case X86::VCMPPDZ256rrik:
+ case X86::VCMPPDZrmik: case X86::VCMPPDZrrik:
+ case X86::VCMPPSZ128rmik: case X86::VCMPPSZ128rrik:
+ case X86::VCMPPSZ256rmik: case X86::VCMPPSZ256rrik:
+ case X86::VCMPPSZrmik: case X86::VCMPPSZrrik:
+ case X86::VCMPSDZrm_Intk: case X86::VCMPSDZrr_Intk:
+ case X86::VCMPSSZrm_Intk: case X86::VCMPSSZrr_Intk:
+ case X86::VCMPPDZ128rmbi: case X86::VCMPPDZ128rmbik:
+ case X86::VCMPPDZ256rmbi: case X86::VCMPPDZ256rmbik:
+ case X86::VCMPPDZrmbi: case X86::VCMPPDZrmbik:
+ case X86::VCMPPSZ128rmbi: case X86::VCMPPSZ128rmbik:
+ case X86::VCMPPSZ256rmbi: case X86::VCMPPSZ256rmbik:
+ case X86::VCMPPSZrmbi: case X86::VCMPPSZrmbik:
+ case X86::VCMPPDZrrib: case X86::VCMPPDZrribk:
+ case X86::VCMPPSZrrib: case X86::VCMPPSZrribk:
+ case X86::VCMPSDZrrb_Int: case X86::VCMPSDZrrb_Intk:
+ case X86::VCMPSSZrrb_Int: case X86::VCMPSSZrrb_Intk:
+ if (Imm >= 0 && Imm <= 31) {
+ OS << '\t';
+ printCMPMnemonic(MI, /*IsVCMP*/true, OS);
+
+ unsigned CurOp = (Desc.TSFlags & X86II::EVEX_K) ? 3 : 2;
+
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) {
+ if (Desc.TSFlags & X86II::EVEX_B) {
+ // Broadcast form.
+ // Load size is based on W-bit.
+ if (Desc.TSFlags & X86II::VEX_W)
+ printqwordmem(MI, CurOp--, OS);
+ else
+ printdwordmem(MI, CurOp--, OS);
+
+ // Print the number of elements broadcasted.
+ unsigned NumElts;
+ if (Desc.TSFlags & X86II::EVEX_L2)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 8 : 16;
+ else if (Desc.TSFlags & X86II::VEX_L)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8;
+ else
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 2 : 4;
+ OS << "{1to" << NumElts << "}";
+ } else {
+ if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS)
+ printdwordmem(MI, CurOp--, OS);
+ else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD)
+ printqwordmem(MI, CurOp--, OS);
+ else if (Desc.TSFlags & X86II::EVEX_L2)
+ printzmmwordmem(MI, CurOp--, OS);
+ else if (Desc.TSFlags & X86II::VEX_L)
+ printymmwordmem(MI, CurOp--, OS);
+ else
+ printxmmwordmem(MI, CurOp--, OS);
+ }
+ } else {
+ if (Desc.TSFlags & X86II::EVEX_B)
+ OS << "{sae}, ";
+ printOperand(MI, CurOp--, OS);
+ }
+
+ OS << ", ";
+ printOperand(MI, CurOp--, OS);
+ OS << ", ";
+ printOperand(MI, 0, OS);
+ if (CurOp > 0) {
+ // Print mask operand.
+ OS << " {";
+ printOperand(MI, CurOp--, OS);
+ OS << "}";
+ }
+
+ return true;
+ }
+ break;
+
+ case X86::VPCOMBmi: case X86::VPCOMBri:
+ case X86::VPCOMDmi: case X86::VPCOMDri:
+ case X86::VPCOMQmi: case X86::VPCOMQri:
+ case X86::VPCOMUBmi: case X86::VPCOMUBri:
+ case X86::VPCOMUDmi: case X86::VPCOMUDri:
+ case X86::VPCOMUQmi: case X86::VPCOMUQri:
+ case X86::VPCOMUWmi: case X86::VPCOMUWri:
+ case X86::VPCOMWmi: case X86::VPCOMWri:
+ if (Imm >= 0 && Imm <= 7) {
+ OS << '\t';
+ printVPCOMMnemonic(MI, OS);
+
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem)
+ printxmmwordmem(MI, 2, OS);
+ else
+ printOperand(MI, 2, OS);
+
+ OS << ", ";
+ printOperand(MI, 1, OS);
+ OS << ", ";
+ printOperand(MI, 0, OS);
+ return true;
+ }
+ break;
+
+ case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rri:
+ case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rri:
+ case X86::VPCMPBZrmi: case X86::VPCMPBZrri:
+ case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rri:
+ case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rri:
+ case X86::VPCMPDZrmi: case X86::VPCMPDZrri:
+ case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rri:
+ case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rri:
+ case X86::VPCMPQZrmi: case X86::VPCMPQZrri:
+ case X86::VPCMPUBZ128rmi: case X86::VPCMPUBZ128rri:
+ case X86::VPCMPUBZ256rmi: case X86::VPCMPUBZ256rri:
+ case X86::VPCMPUBZrmi: case X86::VPCMPUBZrri:
+ case X86::VPCMPUDZ128rmi: case X86::VPCMPUDZ128rri:
+ case X86::VPCMPUDZ256rmi: case X86::VPCMPUDZ256rri:
+ case X86::VPCMPUDZrmi: case X86::VPCMPUDZrri:
+ case X86::VPCMPUQZ128rmi: case X86::VPCMPUQZ128rri:
+ case X86::VPCMPUQZ256rmi: case X86::VPCMPUQZ256rri:
+ case X86::VPCMPUQZrmi: case X86::VPCMPUQZrri:
+ case X86::VPCMPUWZ128rmi: case X86::VPCMPUWZ128rri:
+ case X86::VPCMPUWZ256rmi: case X86::VPCMPUWZ256rri:
+ case X86::VPCMPUWZrmi: case X86::VPCMPUWZrri:
+ case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rri:
+ case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rri:
+ case X86::VPCMPWZrmi: case X86::VPCMPWZrri:
+ case X86::VPCMPBZ128rmik: case X86::VPCMPBZ128rrik:
+ case X86::VPCMPBZ256rmik: case X86::VPCMPBZ256rrik:
+ case X86::VPCMPBZrmik: case X86::VPCMPBZrrik:
+ case X86::VPCMPDZ128rmik: case X86::VPCMPDZ128rrik:
+ case X86::VPCMPDZ256rmik: case X86::VPCMPDZ256rrik:
+ case X86::VPCMPDZrmik: case X86::VPCMPDZrrik:
+ case X86::VPCMPQZ128rmik: case X86::VPCMPQZ128rrik:
+ case X86::VPCMPQZ256rmik: case X86::VPCMPQZ256rrik:
+ case X86::VPCMPQZrmik: case X86::VPCMPQZrrik:
+ case X86::VPCMPUBZ128rmik: case X86::VPCMPUBZ128rrik:
+ case X86::VPCMPUBZ256rmik: case X86::VPCMPUBZ256rrik:
+ case X86::VPCMPUBZrmik: case X86::VPCMPUBZrrik:
+ case X86::VPCMPUDZ128rmik: case X86::VPCMPUDZ128rrik:
+ case X86::VPCMPUDZ256rmik: case X86::VPCMPUDZ256rrik:
+ case X86::VPCMPUDZrmik: case X86::VPCMPUDZrrik:
+ case X86::VPCMPUQZ128rmik: case X86::VPCMPUQZ128rrik:
+ case X86::VPCMPUQZ256rmik: case X86::VPCMPUQZ256rrik:
+ case X86::VPCMPUQZrmik: case X86::VPCMPUQZrrik:
+ case X86::VPCMPUWZ128rmik: case X86::VPCMPUWZ128rrik:
+ case X86::VPCMPUWZ256rmik: case X86::VPCMPUWZ256rrik:
+ case X86::VPCMPUWZrmik: case X86::VPCMPUWZrrik:
+ case X86::VPCMPWZ128rmik: case X86::VPCMPWZ128rrik:
+ case X86::VPCMPWZ256rmik: case X86::VPCMPWZ256rrik:
+ case X86::VPCMPWZrmik: case X86::VPCMPWZrrik:
+ case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk:
+ case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk:
+ case X86::VPCMPDZrmib: case X86::VPCMPDZrmibk:
+ case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk:
+ case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk:
+ case X86::VPCMPQZrmib: case X86::VPCMPQZrmibk:
+ case X86::VPCMPUDZ128rmib: case X86::VPCMPUDZ128rmibk:
+ case X86::VPCMPUDZ256rmib: case X86::VPCMPUDZ256rmibk:
+ case X86::VPCMPUDZrmib: case X86::VPCMPUDZrmibk:
+ case X86::VPCMPUQZ128rmib: case X86::VPCMPUQZ128rmibk:
+ case X86::VPCMPUQZ256rmib: case X86::VPCMPUQZ256rmibk:
+ case X86::VPCMPUQZrmib: case X86::VPCMPUQZrmibk:
+ if ((Imm >= 0 && Imm <= 2) || (Imm >= 4 && Imm <= 6)) {
+ OS << '\t';
+ printVPCMPMnemonic(MI, OS);
+
+ unsigned CurOp = (Desc.TSFlags & X86II::EVEX_K) ? 3 : 2;
+
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) {
+ if (Desc.TSFlags & X86II::EVEX_B) {
+ // Broadcast form.
+ // Load size is based on W-bit as only D and Q are supported.
+ if (Desc.TSFlags & X86II::VEX_W)
+ printqwordmem(MI, CurOp--, OS);
+ else
+ printdwordmem(MI, CurOp--, OS);
+
+ // Print the number of elements broadcasted.
+ unsigned NumElts;
+ if (Desc.TSFlags & X86II::EVEX_L2)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 8 : 16;
+ else if (Desc.TSFlags & X86II::VEX_L)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8;
+ else
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 2 : 4;
+ OS << "{1to" << NumElts << "}";
+ } else {
+ if (Desc.TSFlags & X86II::EVEX_L2)
+ printzmmwordmem(MI, CurOp--, OS);
+ else if (Desc.TSFlags & X86II::VEX_L)
+ printymmwordmem(MI, CurOp--, OS);
+ else
+ printxmmwordmem(MI, CurOp--, OS);
+ }
+ } else {
+ printOperand(MI, CurOp--, OS);
+ }
+
+ OS << ", ";
+ printOperand(MI, CurOp--, OS);
+ OS << ", ";
+ printOperand(MI, 0, OS);
+ if (CurOp > 0) {
+ // Print mask operand.
+ OS << " {";
+ printOperand(MI, CurOp--, OS);
+ OS << "}";
+ }
+
+ return true;
+ }
+ break;
+ }
+
+ return false;
+}
+
+void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isReg()) {
+ printRegName(O, Op.getReg());
+ } else if (Op.isImm()) {
+ // Print immediates as signed values.
+ int64_t Imm = Op.getImm();
+ O << markup("<imm:") << '$' << formatImm(Imm) << markup(">");
+
+ // TODO: This should be in a helper function in the base class, so it can
+ // be used by other printers.
+
+ // If there are no instruction-specific comments, add a comment clarifying
+ // the hex value of the immediate operand when it isn't in the range
+ // [-256,255].
+ if (CommentStream && !HasCustomInstComment && (Imm > 255 || Imm < -256)) {
+ // Don't print unnecessary hex sign bits.
+ if (Imm == (int16_t)(Imm))
+ *CommentStream << format("imm = 0x%" PRIX16 "\n", (uint16_t)Imm);
+ else if (Imm == (int32_t)(Imm))
+ *CommentStream << format("imm = 0x%" PRIX32 "\n", (uint32_t)Imm);
+ else
+ *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Imm);
+ }
+ } else {
+ assert(Op.isExpr() && "unknown operand kind in printOperand");
+ O << markup("<imm:") << '$';
+ Op.getExpr()->print(O, &MAI);
+ O << markup(">");
+ }
+}
+
+void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ // Do not print the exact form of the memory operand if it references a known
+ // binary object.
+ if (SymbolizeOperands && MIA) {
+ uint64_t Target;
+ if (MIA->evaluateBranch(*MI, 0, 0, Target))
+ return;
+ if (MIA->evaluateMemoryOperandAddress(*MI, 0, 0))
+ return;
+ }
+
+ const MCOperand &BaseReg = MI->getOperand(Op + X86::AddrBaseReg);
+ const MCOperand &IndexReg = MI->getOperand(Op + X86::AddrIndexReg);
+ const MCOperand &DispSpec = MI->getOperand(Op + X86::AddrDisp);
+
+ O << markup("<mem:");
+
+ // If this has a segment register, print it.
+ printOptionalSegReg(MI, Op + X86::AddrSegmentReg, O);
+
+ if (DispSpec.isImm()) {
+ int64_t DispVal = DispSpec.getImm();
+ if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg()))
+ O << formatImm(DispVal);
+ } else {
+ assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
+ DispSpec.getExpr()->print(O, &MAI);
+ }
+
+ if (IndexReg.getReg() || BaseReg.getReg()) {
+ O << '(';
+ if (BaseReg.getReg())
+ printOperand(MI, Op + X86::AddrBaseReg, O);
+
+ if (IndexReg.getReg()) {
+ O << ',';
+ printOperand(MI, Op + X86::AddrIndexReg, O);
+ unsigned ScaleVal = MI->getOperand(Op + X86::AddrScaleAmt).getImm();
+ if (ScaleVal != 1) {
+ O << ',' << markup("<imm:") << ScaleVal // never printed in hex.
+ << markup(">");
+ }
+ }
+ O << ')';
+ }
+
+ O << markup(">");
+}
+
+void X86ATTInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ O << markup("<mem:");
+
+ // If this has a segment register, print it.
+ printOptionalSegReg(MI, Op + 1, O);
+
+ O << "(";
+ printOperand(MI, Op, O);
+ O << ")";
+
+ O << markup(">");
+}
+
+void X86ATTInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ O << markup("<mem:");
+
+ O << "%es:(";
+ printOperand(MI, Op, O);
+ O << ")";
+
+ O << markup(">");
+}
+
+void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &DispSpec = MI->getOperand(Op);
+
+ O << markup("<mem:");
+
+ // If this has a segment register, print it.
+ printOptionalSegReg(MI, Op + 1, O);
+
+ if (DispSpec.isImm()) {
+ O << formatImm(DispSpec.getImm());
+ } else {
+ assert(DispSpec.isExpr() && "non-immediate displacement?");
+ DispSpec.getExpr()->print(O, &MAI);
+ }
+
+ O << markup(">");
+}
+
+void X86ATTInstPrinter::printU8Imm(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ if (MI->getOperand(Op).isExpr())
+ return printOperand(MI, Op, O);
+
+ O << markup("<imm:") << '$' << formatImm(MI->getOperand(Op).getImm() & 0xff)
+ << markup(">");
+}
+
+void X86ATTInstPrinter::printSTiRegOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &OS) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ unsigned Reg = Op.getReg();
+  // Override the default printing to print st(0) instead of st.
+ if (Reg == X86::ST0)
+ OS << markup("<reg:") << "%st(0)" << markup(">");
+ else
+ printRegName(OS, Reg);
+}
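A note on the syntax printed by printMemReference above: AT&T memory operands take the form segment:disp(base,index,scale), and the components stand for a simple address computation. The helper below is purely illustrative and is not something this printer defines:

  #include <cstdint>

  // disp(base,index,scale) denotes base + index * scale + disp, where scale is
  // 1, 2, 4 or 8 and any component may be omitted.
  inline uint64_t effectiveAddress(uint64_t Base, uint64_t Index,
                                   unsigned Scale, int64_t Disp) {
    return Base + Index * Scale + Disp;
  }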
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
new file mode 100644
index 000000000000..f7a850571260
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
@@ -0,0 +1,119 @@
+//=- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax --*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an X86 MCInst to AT&T style .s file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86ATTINSTPRINTER_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86ATTINSTPRINTER_H
+
+#include "X86InstPrinterCommon.h"
+
+namespace llvm {
+
+class X86ATTInstPrinter final : public X86InstPrinterCommon {
+public:
+ X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI)
+ : X86InstPrinterCommon(MAI, MII, MRI), HasCustomInstComment(false) {}
+
+ void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+ void printInst(const MCInst *MI, uint64_t Address, StringRef Annot,
+ const MCSubtargetInfo &STI, raw_ostream &OS) override;
+ bool printVecCompareInstr(const MCInst *MI, raw_ostream &OS);
+
+ // Autogenerated by tblgen, returns true if we successfully printed an
+ // alias.
+ bool printAliasInstr(const MCInst *MI, uint64_t Address, raw_ostream &OS);
+ void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpIdx, unsigned PrintMethodIdx,
+ raw_ostream &O);
+
+ // Autogenerated by tblgen.
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
+ void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &OS);
+ static const char *getRegisterName(unsigned RegNo);
+
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS) override;
+ void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+ void printSrcIdx(const MCInst *MI, unsigned Op, raw_ostream &O);
+ void printDstIdx(const MCInst *MI, unsigned Op, raw_ostream &O);
+ void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printSTiRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+
+ void printbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printdwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printqwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printxmmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printymmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printzmmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printtbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+
+ void printSrcIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printSrcIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printSrcIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printSrcIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printDstIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printDstIdx(MI, OpNo, O);
+ }
+ void printDstIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printDstIdx(MI, OpNo, O);
+ }
+ void printDstIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printDstIdx(MI, OpNo, O);
+ }
+ void printDstIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printDstIdx(MI, OpNo, O);
+ }
+ void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemOffset(MI, OpNo, O);
+ }
+ void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemOffset(MI, OpNo, O);
+ }
+ void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemOffset(MI, OpNo, O);
+ }
+ void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemOffset(MI, OpNo, O);
+ }
+
+private:
+ bool HasCustomInstComment;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_MCTARGETDESC_X86ATTINSTPRINTER_H
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
new file mode 100644
index 000000000000..95012a148d83
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -0,0 +1,1603 @@
+//===-- X86AsmBackend.cpp - X86 Assembler Backend -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+/// A wrapper for holding a mask of the values from X86::AlignBranchBoundaryKind
+class X86AlignBranchKind {
+private:
+ uint8_t AlignBranchKind = 0;
+
+public:
+ void operator=(const std::string &Val) {
+ if (Val.empty())
+ return;
+ SmallVector<StringRef, 6> BranchTypes;
+ StringRef(Val).split(BranchTypes, '+', -1, false);
+ for (auto BranchType : BranchTypes) {
+ if (BranchType == "fused")
+ addKind(X86::AlignBranchFused);
+ else if (BranchType == "jcc")
+ addKind(X86::AlignBranchJcc);
+ else if (BranchType == "jmp")
+ addKind(X86::AlignBranchJmp);
+ else if (BranchType == "call")
+ addKind(X86::AlignBranchCall);
+ else if (BranchType == "ret")
+ addKind(X86::AlignBranchRet);
+ else if (BranchType == "indirect")
+ addKind(X86::AlignBranchIndirect);
+ else {
+ errs() << "invalid argument " << BranchType.str()
+ << " to -x86-align-branch=; each element must be one of: fused, "
+                  "jcc, jmp, call, ret, indirect (plus-separated).\n";
+ }
+ }
+ }
+
+ operator uint8_t() const { return AlignBranchKind; }
+ void addKind(X86::AlignBranchBoundaryKind Value) { AlignBranchKind |= Value; }
+};
+
+X86AlignBranchKind X86AlignBranchKindLoc;
+
+cl::opt<unsigned> X86AlignBranchBoundary(
+ "x86-align-branch-boundary", cl::init(0),
+ cl::desc(
+ "Control how the assembler should align branches with NOP. If the "
+ "boundary's size is not 0, it should be a power of 2 and no less "
+        "than 32. Branches will be aligned to prevent them from crossing or "
+        "ending against the boundary of the specified size. The default "
+        "value 0 does not align branches."));
+
+cl::opt<X86AlignBranchKind, true, cl::parser<std::string>> X86AlignBranch(
+ "x86-align-branch",
+ cl::desc(
+ "Specify types of branches to align (plus separated list of types):"
+ "\njcc indicates conditional jumps"
+ "\nfused indicates fused conditional jumps"
+ "\njmp indicates direct unconditional jumps"
+ "\ncall indicates direct and indirect calls"
+ "\nret indicates rets"
+ "\nindirect indicates indirect unconditional jumps"),
+ cl::location(X86AlignBranchKindLoc));
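+// The parser above expects a plus-separated list, so a typical combination of
+// these two options is, for example:
+//   --x86-align-branch-boundary=32 --x86-align-branch=fused+jcc+jmp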
+
+cl::opt<bool> X86AlignBranchWithin32BBoundaries(
+ "x86-branches-within-32B-boundaries", cl::init(false),
+ cl::desc(
+        "Align selected instructions to mitigate the negative performance "
+        "impact of Intel's microcode update for erratum SKX102. May break "
+ "assumptions about labels corresponding to particular instructions, "
+ "and should be used with caution."));
+
+cl::opt<unsigned> X86PadMaxPrefixSize(
+ "x86-pad-max-prefix-size", cl::init(0),
+ cl::desc("Maximum number of prefixes to use for padding"));
+
+cl::opt<bool> X86PadForAlign(
+ "x86-pad-for-align", cl::init(false), cl::Hidden,
+ cl::desc("Pad previous instructions to implement align directives"));
+
+cl::opt<bool> X86PadForBranchAlign(
+ "x86-pad-for-branch-align", cl::init(true), cl::Hidden,
+ cl::desc("Pad previous instructions to implement branch alignment"));
+
+class X86ELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+ X86ELFObjectWriter(bool is64Bit, uint8_t OSABI, uint16_t EMachine,
+ bool HasRelocationAddend, bool foobar)
+ : MCELFObjectTargetWriter(is64Bit, OSABI, EMachine, HasRelocationAddend) {}
+};
+
+class X86AsmBackend : public MCAsmBackend {
+ const MCSubtargetInfo &STI;
+ std::unique_ptr<const MCInstrInfo> MCII;
+ X86AlignBranchKind AlignBranchType;
+ Align AlignBoundary;
+ unsigned TargetPrefixMax = 0;
+
+ MCInst PrevInst;
+ MCBoundaryAlignFragment *PendingBA = nullptr;
+ std::pair<MCFragment *, size_t> PrevInstPosition;
+ bool CanPadInst;
+
+ uint8_t determinePaddingPrefix(const MCInst &Inst) const;
+ bool isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const;
+ bool needAlign(const MCInst &Inst) const;
+ bool canPadBranches(MCObjectStreamer &OS) const;
+ bool canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const;
+
+public:
+ X86AsmBackend(const Target &T, const MCSubtargetInfo &STI)
+ : MCAsmBackend(support::little), STI(STI),
+ MCII(T.createMCInstrInfo()) {
+ if (X86AlignBranchWithin32BBoundaries) {
+ // At the moment, this defaults to aligning fused branches, unconditional
+ // jumps, and (unfused) conditional jumps with nops. Both the
+ // instructions aligned and the alignment method (nop vs prefix) may
+ // change in the future.
+      AlignBoundary = assumeAligned(32);
+ AlignBranchType.addKind(X86::AlignBranchFused);
+ AlignBranchType.addKind(X86::AlignBranchJcc);
+ AlignBranchType.addKind(X86::AlignBranchJmp);
+ }
+ // Allow overriding defaults set by master flag
+ if (X86AlignBranchBoundary.getNumOccurrences())
+ AlignBoundary = assumeAligned(X86AlignBranchBoundary);
+ if (X86AlignBranch.getNumOccurrences())
+ AlignBranchType = X86AlignBranchKindLoc;
+ if (X86PadMaxPrefixSize.getNumOccurrences())
+ TargetPrefixMax = X86PadMaxPrefixSize;
+ }
+
+ bool allowAutoPadding() const override;
+ bool allowEnhancedRelaxation() const override;
+ void emitInstructionBegin(MCObjectStreamer &OS, const MCInst &Inst) override;
+ void emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst) override;
+
+ unsigned getNumFixupKinds() const override {
+ return X86::NumTargetFixupKinds;
+ }
+
+ Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
+
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+
+ bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target) override;
+
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override;
+
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override;
+
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const override;
+
+ void relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const override;
+
+ bool padInstructionViaRelaxation(MCRelaxableFragment &RF,
+ MCCodeEmitter &Emitter,
+ unsigned &RemainingSize) const;
+
+ bool padInstructionViaPrefix(MCRelaxableFragment &RF, MCCodeEmitter &Emitter,
+ unsigned &RemainingSize) const;
+
+ bool padInstructionEncoding(MCRelaxableFragment &RF, MCCodeEmitter &Emitter,
+ unsigned &RemainingSize) const;
+
+ void finishLayout(MCAssembler const &Asm, MCAsmLayout &Layout) const override;
+
+ unsigned getMaximumNopSize() const override;
+
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+};
+} // end anonymous namespace
+
+static unsigned getRelaxedOpcodeBranch(const MCInst &Inst, bool Is16BitMode) {
+ unsigned Op = Inst.getOpcode();
+ switch (Op) {
+ default:
+ return Op;
+ case X86::JCC_1:
+ return (Is16BitMode) ? X86::JCC_2 : X86::JCC_4;
+ case X86::JMP_1:
+ return (Is16BitMode) ? X86::JMP_2 : X86::JMP_4;
+ }
+}
+
+static unsigned getRelaxedOpcodeArith(const MCInst &Inst) {
+ unsigned Op = Inst.getOpcode();
+ switch (Op) {
+ default:
+ return Op;
+
+ // IMUL
+ case X86::IMUL16rri8: return X86::IMUL16rri;
+ case X86::IMUL16rmi8: return X86::IMUL16rmi;
+ case X86::IMUL32rri8: return X86::IMUL32rri;
+ case X86::IMUL32rmi8: return X86::IMUL32rmi;
+ case X86::IMUL64rri8: return X86::IMUL64rri32;
+ case X86::IMUL64rmi8: return X86::IMUL64rmi32;
+
+ // AND
+ case X86::AND16ri8: return X86::AND16ri;
+ case X86::AND16mi8: return X86::AND16mi;
+ case X86::AND32ri8: return X86::AND32ri;
+ case X86::AND32mi8: return X86::AND32mi;
+ case X86::AND64ri8: return X86::AND64ri32;
+ case X86::AND64mi8: return X86::AND64mi32;
+
+ // OR
+ case X86::OR16ri8: return X86::OR16ri;
+ case X86::OR16mi8: return X86::OR16mi;
+ case X86::OR32ri8: return X86::OR32ri;
+ case X86::OR32mi8: return X86::OR32mi;
+ case X86::OR64ri8: return X86::OR64ri32;
+ case X86::OR64mi8: return X86::OR64mi32;
+
+ // XOR
+ case X86::XOR16ri8: return X86::XOR16ri;
+ case X86::XOR16mi8: return X86::XOR16mi;
+ case X86::XOR32ri8: return X86::XOR32ri;
+ case X86::XOR32mi8: return X86::XOR32mi;
+ case X86::XOR64ri8: return X86::XOR64ri32;
+ case X86::XOR64mi8: return X86::XOR64mi32;
+
+ // ADD
+ case X86::ADD16ri8: return X86::ADD16ri;
+ case X86::ADD16mi8: return X86::ADD16mi;
+ case X86::ADD32ri8: return X86::ADD32ri;
+ case X86::ADD32mi8: return X86::ADD32mi;
+ case X86::ADD64ri8: return X86::ADD64ri32;
+ case X86::ADD64mi8: return X86::ADD64mi32;
+
+ // ADC
+ case X86::ADC16ri8: return X86::ADC16ri;
+ case X86::ADC16mi8: return X86::ADC16mi;
+ case X86::ADC32ri8: return X86::ADC32ri;
+ case X86::ADC32mi8: return X86::ADC32mi;
+ case X86::ADC64ri8: return X86::ADC64ri32;
+ case X86::ADC64mi8: return X86::ADC64mi32;
+
+ // SUB
+ case X86::SUB16ri8: return X86::SUB16ri;
+ case X86::SUB16mi8: return X86::SUB16mi;
+ case X86::SUB32ri8: return X86::SUB32ri;
+ case X86::SUB32mi8: return X86::SUB32mi;
+ case X86::SUB64ri8: return X86::SUB64ri32;
+ case X86::SUB64mi8: return X86::SUB64mi32;
+
+ // SBB
+ case X86::SBB16ri8: return X86::SBB16ri;
+ case X86::SBB16mi8: return X86::SBB16mi;
+ case X86::SBB32ri8: return X86::SBB32ri;
+ case X86::SBB32mi8: return X86::SBB32mi;
+ case X86::SBB64ri8: return X86::SBB64ri32;
+ case X86::SBB64mi8: return X86::SBB64mi32;
+
+ // CMP
+ case X86::CMP16ri8: return X86::CMP16ri;
+ case X86::CMP16mi8: return X86::CMP16mi;
+ case X86::CMP32ri8: return X86::CMP32ri;
+ case X86::CMP32mi8: return X86::CMP32mi;
+ case X86::CMP64ri8: return X86::CMP64ri32;
+ case X86::CMP64mi8: return X86::CMP64mi32;
+
+ // PUSH
+ case X86::PUSH32i8: return X86::PUSHi32;
+ case X86::PUSH16i8: return X86::PUSHi16;
+ case X86::PUSH64i8: return X86::PUSH64i32;
+ }
+}
+
+static unsigned getRelaxedOpcode(const MCInst &Inst, bool Is16BitMode) {
+ unsigned R = getRelaxedOpcodeArith(Inst);
+ if (R != Inst.getOpcode())
+ return R;
+ return getRelaxedOpcodeBranch(Inst, Is16BitMode);
+}
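+// For instance, JCC_1 (a Jcc with an 8-bit displacement) relaxes to JCC_4 in
+// 32/64-bit mode, and ADD64ri8 (a sign-extended 8-bit immediate) relaxes to
+// ADD64ri32; relaxation widens the fixup field when the assembler cannot prove
+// that the value fits in the smaller encoding.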
+
+static X86::CondCode getCondFromBranch(const MCInst &MI,
+ const MCInstrInfo &MCII) {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ default:
+ return X86::COND_INVALID;
+ case X86::JCC_1: {
+ const MCInstrDesc &Desc = MCII.get(Opcode);
+ return static_cast<X86::CondCode>(
+ MI.getOperand(Desc.getNumOperands() - 1).getImm());
+ }
+ }
+}
+
+static X86::SecondMacroFusionInstKind
+classifySecondInstInMacroFusion(const MCInst &MI, const MCInstrInfo &MCII) {
+ X86::CondCode CC = getCondFromBranch(MI, MCII);
+ return classifySecondCondCodeInMacroFusion(CC);
+}
+
+/// Check if the instruction uses RIP relative addressing.
+static bool isRIPRelative(const MCInst &MI, const MCInstrInfo &MCII) {
+ unsigned Opcode = MI.getOpcode();
+ const MCInstrDesc &Desc = MCII.get(Opcode);
+ uint64_t TSFlags = Desc.TSFlags;
+ unsigned CurOp = X86II::getOperandBias(Desc);
+ int MemoryOperand = X86II::getMemoryOperandNo(TSFlags);
+ if (MemoryOperand < 0)
+ return false;
+ unsigned BaseRegNum = MemoryOperand + CurOp + X86::AddrBaseReg;
+ unsigned BaseReg = MI.getOperand(BaseRegNum).getReg();
+ return (BaseReg == X86::RIP);
+}
+
+/// Check if the instruction is a prefix.
+static bool isPrefix(const MCInst &MI, const MCInstrInfo &MCII) {
+ return X86II::isPrefix(MCII.get(MI.getOpcode()).TSFlags);
+}
+
+/// Check if the instruction is valid as the first instruction in macro fusion.
+static bool isFirstMacroFusibleInst(const MCInst &Inst,
+ const MCInstrInfo &MCII) {
+ // An Intel instruction with RIP relative addressing is not macro fusible.
+ if (isRIPRelative(Inst, MCII))
+ return false;
+ X86::FirstMacroFusionInstKind FIK =
+ X86::classifyFirstOpcodeInMacroFusion(Inst.getOpcode());
+ return FIK != X86::FirstMacroFusionInstKind::Invalid;
+}
+
+/// X86 can reduce the number of NOP bytes needed by padding instructions with
+/// prefixes, which gives better performance in some cases. Here, we determine
+/// which prefix is the most suitable.
+///
+/// If the instruction has a segment override prefix, use the existing one.
+/// If the target is 64-bit, use CS.
+/// If the target is 32-bit,
+///   - If the instruction has an ESP/EBP base register, use SS.
+///   - Otherwise use DS.
+uint8_t X86AsmBackend::determinePaddingPrefix(const MCInst &Inst) const {
+ assert((STI.hasFeature(X86::Mode32Bit) || STI.hasFeature(X86::Mode64Bit)) &&
+ "Prefixes can be added only in 32-bit or 64-bit mode.");
+ const MCInstrDesc &Desc = MCII->get(Inst.getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+
+ // Determine where the memory operand starts, if present.
+ int MemoryOperand = X86II::getMemoryOperandNo(TSFlags);
+ if (MemoryOperand != -1)
+ MemoryOperand += X86II::getOperandBias(Desc);
+
+ unsigned SegmentReg = 0;
+ if (MemoryOperand >= 0) {
+ // Check for explicit segment override on memory operand.
+ SegmentReg = Inst.getOperand(MemoryOperand + X86::AddrSegmentReg).getReg();
+ }
+
+ switch (TSFlags & X86II::FormMask) {
+ default:
+ break;
+ case X86II::RawFrmDstSrc: {
+ // Check segment override opcode prefix as needed (not for %ds).
+ if (Inst.getOperand(2).getReg() != X86::DS)
+ SegmentReg = Inst.getOperand(2).getReg();
+ break;
+ }
+ case X86II::RawFrmSrc: {
+ // Check segment override opcode prefix as needed (not for %ds).
+ if (Inst.getOperand(1).getReg() != X86::DS)
+ SegmentReg = Inst.getOperand(1).getReg();
+ break;
+ }
+ case X86II::RawFrmMemOffs: {
+ // Check segment override opcode prefix as needed.
+ SegmentReg = Inst.getOperand(1).getReg();
+ break;
+ }
+ }
+
+ if (SegmentReg != 0)
+ return X86::getSegmentOverridePrefixForReg(SegmentReg);
+
+ if (STI.hasFeature(X86::Mode64Bit))
+ return X86::CS_Encoding;
+
+ if (MemoryOperand >= 0) {
+ unsigned BaseRegNum = MemoryOperand + X86::AddrBaseReg;
+ unsigned BaseReg = Inst.getOperand(BaseRegNum).getReg();
+ if (BaseReg == X86::ESP || BaseReg == X86::EBP)
+ return X86::SS_Encoding;
+ }
+ return X86::DS_Encoding;
+}
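+// The choices above are safe because these prefixes do not change semantics:
+// segment overrides such as CS (0x2e) are ignored in 64-bit mode, and in
+// 32-bit mode SS (0x36) is already the default segment for ESP/EBP-based
+// accesses while DS (0x3e) is the default otherwise, so repeating the default
+// (or duplicating a prefix the instruction already carries) is a no-op.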
+
+/// Check if the two instructions will be macro-fused on the target cpu.
+bool X86AsmBackend::isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const {
+ const MCInstrDesc &InstDesc = MCII->get(Jcc.getOpcode());
+ if (!InstDesc.isConditionalBranch())
+ return false;
+ if (!isFirstMacroFusibleInst(Cmp, *MCII))
+ return false;
+ const X86::FirstMacroFusionInstKind CmpKind =
+ X86::classifyFirstOpcodeInMacroFusion(Cmp.getOpcode());
+ const X86::SecondMacroFusionInstKind BranchKind =
+ classifySecondInstInMacroFusion(Jcc, *MCII);
+ return X86::isMacroFused(CmpKind, BranchKind);
+}
+
+/// Check if the instruction has a variant symbol operand.
+static bool hasVariantSymbol(const MCInst &MI) {
+ for (auto &Operand : MI) {
+ if (!Operand.isExpr())
+ continue;
+ const MCExpr &Expr = *Operand.getExpr();
+ if (Expr.getKind() == MCExpr::SymbolRef &&
+ cast<MCSymbolRefExpr>(Expr).getKind() != MCSymbolRefExpr::VK_None)
+ return true;
+ }
+ return false;
+}
+
+bool X86AsmBackend::allowAutoPadding() const {
+ return (AlignBoundary != Align(1) && AlignBranchType != X86::AlignBranchNone);
+}
+
+bool X86AsmBackend::allowEnhancedRelaxation() const {
+ return allowAutoPadding() && TargetPrefixMax != 0 && X86PadForBranchAlign;
+}
+
+/// X86 has certain instructions which enable interrupts exactly one
+/// instruction *after* the instruction which stores to SS. Return true if the
+/// given instruction has such an interrupt delay slot.
+static bool hasInterruptDelaySlot(const MCInst &Inst) {
+ switch (Inst.getOpcode()) {
+ case X86::POPSS16:
+ case X86::POPSS32:
+ case X86::STI:
+ return true;
+
+ case X86::MOV16sr:
+ case X86::MOV32sr:
+ case X86::MOV64sr:
+ case X86::MOV16sm:
+ if (Inst.getOperand(0).getReg() == X86::SS)
+ return true;
+ break;
+ }
+ return false;
+}
+
+/// Check if the instruction to be emitted is right after any data.
+static bool
+isRightAfterData(MCFragment *CurrentFragment,
+ const std::pair<MCFragment *, size_t> &PrevInstPosition) {
+ MCFragment *F = CurrentFragment;
+  // Empty data fragments may be created to prevent further data from being
+  // added into the previous fragment; we need to skip them since they
+  // have no contents.
+ for (; isa_and_nonnull<MCDataFragment>(F); F = F->getPrevNode())
+ if (cast<MCDataFragment>(F)->getContents().size() != 0)
+ break;
+
+ // Since data is always emitted into a DataFragment, our check strategy is
+ // simple here.
+ // - If the fragment is a DataFragment
+ // - If it's not the fragment where the previous instruction is,
+ // returns true.
+ // - If it's the fragment holding the previous instruction but its
+  //     size changed since the previous instruction was emitted into
+ // it, returns true.
+ // - Otherwise returns false.
+ // - If the fragment is not a DataFragment, returns false.
+ if (auto *DF = dyn_cast_or_null<MCDataFragment>(F))
+ return DF != PrevInstPosition.first ||
+ DF->getContents().size() != PrevInstPosition.second;
+
+ return false;
+}
+
+/// \returns the fragment size if it has instructions, otherwise returns 0.
+static size_t getSizeForInstFragment(const MCFragment *F) {
+ if (!F || !F->hasInstructions())
+ return 0;
+ // MCEncodedFragmentWithContents being templated makes this tricky.
+ switch (F->getKind()) {
+ default:
+ llvm_unreachable("Unknown fragment with instructions!");
+ case MCFragment::FT_Data:
+ return cast<MCDataFragment>(*F).getContents().size();
+ case MCFragment::FT_Relaxable:
+ return cast<MCRelaxableFragment>(*F).getContents().size();
+ case MCFragment::FT_CompactEncodedInst:
+ return cast<MCCompactEncodedInstFragment>(*F).getContents().size();
+ }
+}
+
+/// Return true if we can insert a NOP or prefixes automatically before the
+/// instruction to be emitted.
+bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const {
+ if (hasVariantSymbol(Inst))
+    // The linker may rewrite an instruction with a variant symbol operand
+    // (e.g. TLSCALL).
+ return false;
+
+ if (hasInterruptDelaySlot(PrevInst))
+    // If this instruction follows an interrupt-enabling instruction with a
+    // one-instruction delay, inserting a nop would change behavior.
+ return false;
+
+ if (isPrefix(PrevInst, *MCII))
+ // If this instruction follows a prefix, inserting a nop/prefix would change
+    // semantics.
+ return false;
+
+ if (isPrefix(Inst, *MCII))
+ // If this instruction is a prefix, inserting a prefix would change
+    // semantics.
+ return false;
+
+ if (isRightAfterData(OS.getCurrentFragment(), PrevInstPosition))
+ // If this instruction follows any data, there is no clear
+    // instruction boundary; inserting a nop/prefix would change semantics.
+ return false;
+
+ return true;
+}
+
+bool X86AsmBackend::canPadBranches(MCObjectStreamer &OS) const {
+ if (!OS.getAllowAutoPadding())
+ return false;
+ assert(allowAutoPadding() && "incorrect initialization!");
+
+ // We only pad in text section.
+ if (!OS.getCurrentSectionOnly()->getKind().isText())
+ return false;
+
+  // TODO: We currently don't deal with bundle cases.
+ if (OS.getAssembler().isBundlingEnabled())
+ return false;
+
+ // Branches only need to be aligned in 32-bit or 64-bit mode.
+ if (!(STI.hasFeature(X86::Mode64Bit) || STI.hasFeature(X86::Mode32Bit)))
+ return false;
+
+ return true;
+}
+
+/// Check if the instruction needs to be aligned.
+bool X86AsmBackend::needAlign(const MCInst &Inst) const {
+ const MCInstrDesc &Desc = MCII->get(Inst.getOpcode());
+ return (Desc.isConditionalBranch() &&
+ (AlignBranchType & X86::AlignBranchJcc)) ||
+ (Desc.isUnconditionalBranch() &&
+ (AlignBranchType & X86::AlignBranchJmp)) ||
+ (Desc.isCall() && (AlignBranchType & X86::AlignBranchCall)) ||
+ (Desc.isReturn() && (AlignBranchType & X86::AlignBranchRet)) ||
+ (Desc.isIndirectBranch() &&
+ (AlignBranchType & X86::AlignBranchIndirect));
+}
+
+/// Insert BoundaryAlignFragment before instructions to align branches.
+void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
+ const MCInst &Inst) {
+ CanPadInst = canPadInst(Inst, OS);
+
+ if (!canPadBranches(OS))
+ return;
+
+ if (!isMacroFused(PrevInst, Inst))
+    // Macro fusion doesn't actually happen, so clear the pending fragment.
+ PendingBA = nullptr;
+
+ if (!CanPadInst)
+ return;
+
+ if (PendingBA && OS.getCurrentFragment()->getPrevNode() == PendingBA) {
+ // Macro fusion actually happens and there is no other fragment inserted
+ // after the previous instruction.
+ //
+    // Do nothing here since we already inserted a BoundaryAlign fragment when
+ // we met the first instruction in the fused pair and we'll tie them
+ // together in emitInstructionEnd.
+ //
+ // Note: When there is at least one fragment, such as MCAlignFragment,
+ // inserted after the previous instruction, e.g.
+ //
+ // \code
+    //    cmp %rax, %rcx
+ // .align 16
+ // je .Label0
+    // \endcode
+ //
+    // We will treat the JCC as an unfused branch although it may be fused
+ // with the CMP.
+ return;
+ }
+
+ if (needAlign(Inst) || ((AlignBranchType & X86::AlignBranchFused) &&
+ isFirstMacroFusibleInst(Inst, *MCII))) {
+    // If we meet an unfused branch or the first instruction in a fusible pair,
+ // insert a BoundaryAlign fragment.
+ OS.insert(PendingBA = new MCBoundaryAlignFragment(AlignBoundary));
+ }
+}
+
+/// Set the last fragment to be aligned for the BoundaryAlignFragment.
+void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS,
+                                       const MCInst &Inst) {
+ PrevInst = Inst;
+ MCFragment *CF = OS.getCurrentFragment();
+ PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF));
+ if (auto *F = dyn_cast_or_null<MCRelaxableFragment>(CF))
+ F->setAllowAutoPadding(CanPadInst);
+
+ if (!canPadBranches(OS))
+ return;
+
+ if (!needAlign(Inst) || !PendingBA)
+ return;
+
+  // Tie the aligned instructions into the pending BoundaryAlign.
+ PendingBA->setLastFragment(CF);
+ PendingBA = nullptr;
+
+ // We need to ensure that further data isn't added to the current
+ // DataFragment, so that we can get the size of instructions later in
+ // MCAssembler::relaxBoundaryAlign. The easiest way is to insert a new empty
+ // DataFragment.
+ if (isa_and_nonnull<MCDataFragment>(CF))
+ OS.insert(new MCDataFragment());
+
+ // Update the maximum alignment on the current section if necessary.
+ MCSection *Sec = OS.getCurrentSectionOnly();
+ if (AlignBoundary.value() > Sec->getAlignment())
+ Sec->setAlignment(AlignBoundary);
+}
+
+Optional<MCFixupKind> X86AsmBackend::getFixupKind(StringRef Name) const {
+ if (STI.getTargetTriple().isOSBinFormatELF()) {
+ unsigned Type;
+ if (STI.getTargetTriple().getArch() == Triple::x86_64) {
+ Type = llvm::StringSwitch<unsigned>(Name)
+#define ELF_RELOC(X, Y) .Case(#X, Y)
+#include "llvm/BinaryFormat/ELFRelocs/x86_64.def"
+#undef ELF_RELOC
+ .Default(-1u);
+ } else {
+ Type = llvm::StringSwitch<unsigned>(Name)
+#define ELF_RELOC(X, Y) .Case(#X, Y)
+#include "llvm/BinaryFormat/ELFRelocs/i386.def"
+#undef ELF_RELOC
+ .Default(-1u);
+ }
+ if (Type == -1u)
+ return None;
+ return static_cast<MCFixupKind>(FirstLiteralRelocationKind + Type);
+ }
+ return MCAsmBackend::getFixupKind(Name);
+}
+
+const MCFixupKindInfo &X86AsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
+ const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = {
+ {"reloc_riprel_4byte", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"reloc_riprel_4byte_movq_load", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"reloc_riprel_4byte_relax", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"reloc_riprel_4byte_relax_rex", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"reloc_signed_4byte", 0, 32, 0},
+ {"reloc_signed_4byte_relax", 0, 32, 0},
+ {"reloc_global_offset_table", 0, 32, 0},
+ {"reloc_global_offset_table8", 0, 64, 0},
+ {"reloc_branch_4byte_pcrel", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ };
+
+ // Fixup kinds from .reloc directive are like R_386_NONE/R_X86_64_NONE. They
+ // do not require any extra processing.
+ if (Kind >= FirstLiteralRelocationKind)
+ return MCAsmBackend::getFixupKindInfo(FK_NONE);
+
+ if (Kind < FirstTargetFixupKind)
+ return MCAsmBackend::getFixupKindInfo(Kind);
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+ assert(Infos[Kind - FirstTargetFixupKind].Name && "Empty fixup name!");
+ return Infos[Kind - FirstTargetFixupKind];
+}
+
+bool X86AsmBackend::shouldForceRelocation(const MCAssembler &,
+ const MCFixup &Fixup,
+ const MCValue &) {
+ return Fixup.getKind() >= FirstLiteralRelocationKind;
+}
+
+static unsigned getFixupKindSize(unsigned Kind) {
+ switch (Kind) {
+ default:
+ llvm_unreachable("invalid fixup kind!");
+ case FK_NONE:
+ return 0;
+ case FK_PCRel_1:
+ case FK_SecRel_1:
+ case FK_Data_1:
+ return 1;
+ case FK_PCRel_2:
+ case FK_SecRel_2:
+ case FK_Data_2:
+ return 2;
+ case FK_PCRel_4:
+ case X86::reloc_riprel_4byte:
+ case X86::reloc_riprel_4byte_relax:
+ case X86::reloc_riprel_4byte_relax_rex:
+ case X86::reloc_riprel_4byte_movq_load:
+ case X86::reloc_signed_4byte:
+ case X86::reloc_signed_4byte_relax:
+ case X86::reloc_global_offset_table:
+ case X86::reloc_branch_4byte_pcrel:
+ case FK_SecRel_4:
+ case FK_Data_4:
+ return 4;
+ case FK_PCRel_8:
+ case FK_SecRel_8:
+ case FK_Data_8:
+ case X86::reloc_global_offset_table8:
+ return 8;
+ }
+}
+
+void X86AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data,
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const {
+ unsigned Kind = Fixup.getKind();
+ if (Kind >= FirstLiteralRelocationKind)
+ return;
+ unsigned Size = getFixupKindSize(Kind);
+
+ assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!");
+
+ int64_t SignedValue = static_cast<int64_t>(Value);
+ if ((Target.isAbsolute() || IsResolved) &&
+ getFixupKindInfo(Fixup.getKind()).Flags &
+ MCFixupKindInfo::FKF_IsPCRel) {
+ // check that PC relative fixup fits into the fixup size.
+ if (Size > 0 && !isIntN(Size * 8, SignedValue))
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "value of " + Twine(SignedValue) +
+ " is too large for field of " + Twine(Size) +
+ ((Size == 1) ? " byte." : " bytes."));
+ } else {
+    // Check that upper bits are either all zeros or all ones.
+ // Specifically ignore overflow/underflow as long as the leakage is
+ // limited to the lower bits. This is to remain compatible with
+ // other assemblers.
+ assert((Size == 0 || isIntN(Size * 8 + 1, SignedValue)) &&
+ "Value does not fit in the Fixup field");
+ }
+
+ for (unsigned i = 0; i != Size; ++i)
+ Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8));
+}
+
+bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
+ // Branches can always be relaxed in either mode.
+ if (getRelaxedOpcodeBranch(Inst, false) != Inst.getOpcode())
+ return true;
+
+ // Check if this instruction is ever relaxable.
+ if (getRelaxedOpcodeArith(Inst) == Inst.getOpcode())
+ return false;
+
+ // Check if the relaxable operand has an expression. For the current set of
+ // relaxable instructions, the relaxable operand is always the last operand.
+ unsigned RelaxableOp = Inst.getNumOperands() - 1;
+ if (Inst.getOperand(RelaxableOp).isExpr())
+ return true;
+
+ return false;
+}
+
+bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
+ uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const {
+ // Relax if the value is too big for a (signed) i8.
+ return !isInt<8>(Value);
+}
+
+// FIXME: Can tblgen help at all here to verify there aren't other instructions
+// we can relax?
+void X86AsmBackend::relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
+  // The only relaxation X86 does is from a 1-byte pcrel to a 4-byte pcrel.
+ bool Is16BitMode = STI.getFeatureBits()[X86::Mode16Bit];
+ unsigned RelaxedOp = getRelaxedOpcode(Inst, Is16BitMode);
+
+ if (RelaxedOp == Inst.getOpcode()) {
+ SmallString<256> Tmp;
+ raw_svector_ostream OS(Tmp);
+ Inst.dump_pretty(OS);
+ OS << "\n";
+ report_fatal_error("unexpected instruction to relax: " + OS.str());
+ }
+
+ Inst.setOpcode(RelaxedOp);
+}
+
+/// Return true if this instruction has been fully relaxed into its most
+/// general available form.
+static bool isFullyRelaxed(const MCRelaxableFragment &RF) {
+ auto &Inst = RF.getInst();
+ auto &STI = *RF.getSubtargetInfo();
+ bool Is16BitMode = STI.getFeatureBits()[X86::Mode16Bit];
+ return getRelaxedOpcode(Inst, Is16BitMode) == Inst.getOpcode();
+}
+
+bool X86AsmBackend::padInstructionViaPrefix(MCRelaxableFragment &RF,
+ MCCodeEmitter &Emitter,
+ unsigned &RemainingSize) const {
+ if (!RF.getAllowAutoPadding())
+ return false;
+ // If the instruction isn't fully relaxed, shifting it around might require a
+  // larger value for one of the fixups than can be encoded. The outer loop
+ // will also catch this before moving to the next instruction, but we need to
+ // prevent padding this single instruction as well.
+ if (!isFullyRelaxed(RF))
+ return false;
+
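+  // An x86 instruction can be at most 15 bytes long, so prefix padding may
+  // grow this instruction to at most 15 bytes in total.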
+ const unsigned OldSize = RF.getContents().size();
+ if (OldSize == 15)
+ return false;
+
+ const unsigned MaxPossiblePad = std::min(15 - OldSize, RemainingSize);
+ const unsigned RemainingPrefixSize = [&]() -> unsigned {
+ SmallString<15> Code;
+ raw_svector_ostream VecOS(Code);
+ Emitter.emitPrefix(RF.getInst(), VecOS, STI);
+ assert(Code.size() < 15 && "The number of prefixes must be less than 15.");
+
+ // TODO: It turns out we need a decent amount of plumbing for the target
+    // specific bits to determine how many prefixes it's safe to add. Various
+ // targets (older chips mostly, but also Atom family) encounter decoder
+ // stalls with too many prefixes. For testing purposes, we set the value
+ // externally for the moment.
+ unsigned ExistingPrefixSize = Code.size();
+ if (TargetPrefixMax <= ExistingPrefixSize)
+ return 0;
+ return TargetPrefixMax - ExistingPrefixSize;
+ }();
+ const unsigned PrefixBytesToAdd =
+ std::min(MaxPossiblePad, RemainingPrefixSize);
+ if (PrefixBytesToAdd == 0)
+ return false;
+
+ const uint8_t Prefix = determinePaddingPrefix(RF.getInst());
+
+ SmallString<256> Code;
+ Code.append(PrefixBytesToAdd, Prefix);
+ Code.append(RF.getContents().begin(), RF.getContents().end());
+ RF.getContents() = Code;
+
+ // Adjust the fixups for the change in offsets
+ for (auto &F : RF.getFixups()) {
+ F.setOffset(F.getOffset() + PrefixBytesToAdd);
+ }
+
+ RemainingSize -= PrefixBytesToAdd;
+ return true;
+}
+
+bool X86AsmBackend::padInstructionViaRelaxation(MCRelaxableFragment &RF,
+ MCCodeEmitter &Emitter,
+ unsigned &RemainingSize) const {
+ if (isFullyRelaxed(RF))
+ // TODO: There are lots of other tricks we could apply for increasing
+ // encoding size without impacting performance.
+ return false;
+
+ MCInst Relaxed = RF.getInst();
+ relaxInstruction(Relaxed, *RF.getSubtargetInfo());
+
+ SmallVector<MCFixup, 4> Fixups;
+ SmallString<15> Code;
+ raw_svector_ostream VecOS(Code);
+ Emitter.encodeInstruction(Relaxed, VecOS, Fixups, *RF.getSubtargetInfo());
+ const unsigned OldSize = RF.getContents().size();
+ const unsigned NewSize = Code.size();
+ assert(NewSize >= OldSize && "size decrease during relaxation?");
+ unsigned Delta = NewSize - OldSize;
+ if (Delta > RemainingSize)
+ return false;
+ RF.setInst(Relaxed);
+ RF.getContents() = Code;
+ RF.getFixups() = Fixups;
+ RemainingSize -= Delta;
+ return true;
+}
+
+bool X86AsmBackend::padInstructionEncoding(MCRelaxableFragment &RF,
+ MCCodeEmitter &Emitter,
+ unsigned &RemainingSize) const {
+ bool Changed = false;
+ if (RemainingSize != 0)
+ Changed |= padInstructionViaRelaxation(RF, Emitter, RemainingSize);
+ if (RemainingSize != 0)
+ Changed |= padInstructionViaPrefix(RF, Emitter, RemainingSize);
+ return Changed;
+}
+
+void X86AsmBackend::finishLayout(MCAssembler const &Asm,
+ MCAsmLayout &Layout) const {
+ // See if we can further relax some instructions to cut down on the number of
+ // nop bytes required for code alignment. The actual win is in reducing
+ // instruction count, not number of bytes. Modern X86-64 can easily end up
+ // decode limited. It is often better to reduce the number of instructions
+ // (i.e. eliminate nops) even at the cost of increasing the size and
+ // complexity of others.
+ if (!X86PadForAlign && !X86PadForBranchAlign)
+ return;
+
+  // The processed regions are delimited by LabeledFragments. -g may have more
+ // MCSymbols and therefore different relaxation results. X86PadForAlign is
+ // disabled by default to eliminate the -g vs non -g difference.
+ DenseSet<MCFragment *> LabeledFragments;
+ for (const MCSymbol &S : Asm.symbols())
+ LabeledFragments.insert(S.getFragment(false));
+
+ for (MCSection &Sec : Asm) {
+ if (!Sec.getKind().isText())
+ continue;
+
+ SmallVector<MCRelaxableFragment *, 4> Relaxable;
+ for (MCSection::iterator I = Sec.begin(), IE = Sec.end(); I != IE; ++I) {
+ MCFragment &F = *I;
+
+ if (LabeledFragments.count(&F))
+ Relaxable.clear();
+
+ if (F.getKind() == MCFragment::FT_Data ||
+ F.getKind() == MCFragment::FT_CompactEncodedInst)
+ // Skip and ignore
+ continue;
+
+ if (F.getKind() == MCFragment::FT_Relaxable) {
+ auto &RF = cast<MCRelaxableFragment>(*I);
+ Relaxable.push_back(&RF);
+ continue;
+ }
+
+ auto canHandle = [](MCFragment &F) -> bool {
+ switch (F.getKind()) {
+ default:
+ return false;
+ case MCFragment::FT_Align:
+ return X86PadForAlign;
+ case MCFragment::FT_BoundaryAlign:
+ return X86PadForBranchAlign;
+ }
+ };
+ // For any unhandled kind, assume we can't change layout.
+ if (!canHandle(F)) {
+ Relaxable.clear();
+ continue;
+ }
+
+#ifndef NDEBUG
+ const uint64_t OrigOffset = Layout.getFragmentOffset(&F);
+#endif
+ const uint64_t OrigSize = Asm.computeFragmentSize(Layout, F);
+
+ // To keep the effects local, prefer to relax instructions closest to
+ // the align directive. This is purely about human understandability
+ // of the resulting code. If we later find a reason to expand
+ // particular instructions over others, we can adjust.
+ MCFragment *FirstChangedFragment = nullptr;
+ unsigned RemainingSize = OrigSize;
+ while (!Relaxable.empty() && RemainingSize != 0) {
+ auto &RF = *Relaxable.pop_back_val();
+ // Give the backend a chance to play any tricks it wishes to increase
+ // the encoding size of the given instruction. Target independent code
+        // will try further relaxation, but targets may play further tricks.
+ if (padInstructionEncoding(RF, Asm.getEmitter(), RemainingSize))
+ FirstChangedFragment = &RF;
+
+ // If we have an instruction which hasn't been fully relaxed, we can't
+ // skip past it and insert bytes before it. Changing its starting
+ // offset might require a larger negative offset than it can encode.
+ // We don't need to worry about larger positive offsets as none of the
+ // possible offsets between this and our align are visible, and the
+ // ones afterwards aren't changing.
+ if (!isFullyRelaxed(RF))
+ break;
+ }
+ Relaxable.clear();
+
+ if (FirstChangedFragment) {
+        // Make sure the offsets for any fragments in the affected range get
+ // updated. Note that this (conservatively) invalidates the offsets of
+ // those following, but this is not required.
+ Layout.invalidateFragmentsFrom(FirstChangedFragment);
+ }
+
+      // BoundaryAlign explicitly tracks its size (unlike align).
+ if (F.getKind() == MCFragment::FT_BoundaryAlign)
+ cast<MCBoundaryAlignFragment>(F).setSize(RemainingSize);
+
+#ifndef NDEBUG
+ const uint64_t FinalOffset = Layout.getFragmentOffset(&F);
+ const uint64_t FinalSize = Asm.computeFragmentSize(Layout, F);
+ assert(OrigOffset + OrigSize == FinalOffset + FinalSize &&
+ "can't move start of next fragment!");
+ assert(FinalSize == RemainingSize && "inconsistent size computation?");
+#endif
+
+ // If we're looking at a boundary align, make sure we don't try to pad
+ // its target instructions for some following directive. Doing so would
+ // break the alignment of the current boundary align.
+ if (auto *BF = dyn_cast<MCBoundaryAlignFragment>(&F)) {
+ const MCFragment *LastFragment = BF->getLastFragment();
+ if (!LastFragment)
+ continue;
+ while (&*I != LastFragment)
+ ++I;
+ }
+ }
+ }
+
+ // The layout is done. Mark every fragment as valid.
+ for (unsigned int i = 0, n = Layout.getSectionOrder().size(); i != n; ++i) {
+ MCSection &Section = *Layout.getSectionOrder()[i];
+ Layout.getFragmentOffset(&*Section.getFragmentList().rbegin());
+ Asm.computeFragmentSize(Layout, *Section.getFragmentList().rbegin());
+ }
+}
+
+unsigned X86AsmBackend::getMaximumNopSize() const {
+ if (!STI.hasFeature(X86::FeatureNOPL) && !STI.hasFeature(X86::Mode64Bit))
+ return 1;
+ if (STI.getFeatureBits()[X86::FeatureFast7ByteNOP])
+ return 7;
+ if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP])
+ return 15;
+ if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP])
+ return 11;
+ // FIXME: handle 32-bit mode
+  // 15 bytes is the longest single NOP instruction, but 10 bytes is
+ // commonly the longest that can be efficiently decoded.
+ return 10;
+}
+
+/// Write a sequence of optimal nops to the output, covering \p Count
+/// bytes.
+/// \return - true on success, false on failure
+bool X86AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
+ static const char Nops[10][11] = {
+ // nop
+ "\x90",
+ // xchg %ax,%ax
+ "\x66\x90",
+ // nopl (%[re]ax)
+ "\x0f\x1f\x00",
+ // nopl 0(%[re]ax)
+ "\x0f\x1f\x40\x00",
+ // nopl 0(%[re]ax,%[re]ax,1)
+ "\x0f\x1f\x44\x00\x00",
+ // nopw 0(%[re]ax,%[re]ax,1)
+ "\x66\x0f\x1f\x44\x00\x00",
+ // nopl 0L(%[re]ax)
+ "\x0f\x1f\x80\x00\x00\x00\x00",
+ // nopl 0L(%[re]ax,%[re]ax,1)
+ "\x0f\x1f\x84\x00\x00\x00\x00\x00",
+ // nopw 0L(%[re]ax,%[re]ax,1)
+ "\x66\x0f\x1f\x84\x00\x00\x00\x00\x00",
+ // nopw %cs:0L(%[re]ax,%[re]ax,1)
+ "\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00",
+ };
+
+ uint64_t MaxNopLength = (uint64_t)getMaximumNopSize();
+
+ // Emit as many MaxNopLength NOPs as needed, then emit a NOP of the remaining
+ // length.
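+  // NOP lengths greater than 10 bytes are produced by prepending 0x66
+  // operand-size prefixes to the 10-byte NOP, since the table above only
+  // covers lengths 1 through 10.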
+ do {
+ const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength);
+ const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10;
+ for (uint8_t i = 0; i < Prefixes; i++)
+ OS << '\x66';
+ const uint8_t Rest = ThisNopLength - Prefixes;
+ if (Rest != 0)
+ OS.write(Nops[Rest - 1], Rest);
+ Count -= ThisNopLength;
+ } while (Count != 0);
+
+ return true;
+}
+
+/* *** */
+
+namespace {
+
+class ELFX86AsmBackend : public X86AsmBackend {
+public:
+ uint8_t OSABI;
+ ELFX86AsmBackend(const Target &T, uint8_t OSABI, const MCSubtargetInfo &STI)
+ : X86AsmBackend(T, STI), OSABI(OSABI) {}
+};
+
+class ELFX86_32AsmBackend : public ELFX86AsmBackend {
+public:
+ ELFX86_32AsmBackend(const Target &T, uint8_t OSABI,
+ const MCSubtargetInfo &STI)
+ : ELFX86AsmBackend(T, OSABI, STI) {}
+
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createX86ELFObjectWriter(/*IsELF64*/ false, OSABI, ELF::EM_386);
+ }
+};
+
+class ELFX86_X32AsmBackend : public ELFX86AsmBackend {
+public:
+ ELFX86_X32AsmBackend(const Target &T, uint8_t OSABI,
+ const MCSubtargetInfo &STI)
+ : ELFX86AsmBackend(T, OSABI, STI) {}
+
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createX86ELFObjectWriter(/*IsELF64*/ false, OSABI,
+ ELF::EM_X86_64);
+ }
+};
+
+class ELFX86_IAMCUAsmBackend : public ELFX86AsmBackend {
+public:
+ ELFX86_IAMCUAsmBackend(const Target &T, uint8_t OSABI,
+ const MCSubtargetInfo &STI)
+ : ELFX86AsmBackend(T, OSABI, STI) {}
+
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createX86ELFObjectWriter(/*IsELF64*/ false, OSABI,
+ ELF::EM_IAMCU);
+ }
+};
+
+class ELFX86_64AsmBackend : public ELFX86AsmBackend {
+public:
+ ELFX86_64AsmBackend(const Target &T, uint8_t OSABI,
+ const MCSubtargetInfo &STI)
+ : ELFX86AsmBackend(T, OSABI, STI) {}
+
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createX86ELFObjectWriter(/*IsELF64*/ true, OSABI, ELF::EM_X86_64);
+ }
+};
+
+class WindowsX86AsmBackend : public X86AsmBackend {
+ bool Is64Bit;
+
+public:
+ WindowsX86AsmBackend(const Target &T, bool is64Bit,
+ const MCSubtargetInfo &STI)
+ : X86AsmBackend(T, STI)
+ , Is64Bit(is64Bit) {
+ }
+
+ Optional<MCFixupKind> getFixupKind(StringRef Name) const override {
+ return StringSwitch<Optional<MCFixupKind>>(Name)
+ .Case("dir32", FK_Data_4)
+ .Case("secrel32", FK_SecRel_4)
+ .Case("secidx", FK_SecRel_2)
+ .Default(MCAsmBackend::getFixupKind(Name));
+ }
+
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createX86WinCOFFObjectWriter(Is64Bit);
+ }
+};
+
+namespace CU {
+
+ /// Compact unwind encoding values.
+ enum CompactUnwindEncodings {
+    /// [RE]BP based frame where [RE]BP is pushed on the stack immediately after
+ /// the return address, then [RE]SP is moved to [RE]BP.
+ UNWIND_MODE_BP_FRAME = 0x01000000,
+
+ /// A frameless function with a small constant stack size.
+ UNWIND_MODE_STACK_IMMD = 0x02000000,
+
+ /// A frameless function with a large constant stack size.
+ UNWIND_MODE_STACK_IND = 0x03000000,
+
+ /// No compact unwind encoding is available.
+ UNWIND_MODE_DWARF = 0x04000000,
+
+ /// Mask for encoding the frame registers.
+ UNWIND_BP_FRAME_REGISTERS = 0x00007FFF,
+
+ /// Mask for encoding the frameless registers.
+ UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF
+ };
+
+} // namespace CU
+
+class DarwinX86AsmBackend : public X86AsmBackend {
+ const MCRegisterInfo &MRI;
+
+ /// Number of registers that can be saved in a compact unwind encoding.
+ enum { CU_NUM_SAVED_REGS = 6 };
+
+ mutable unsigned SavedRegs[CU_NUM_SAVED_REGS];
+ Triple TT;
+ bool Is64Bit;
+
+  unsigned OffsetSize;                   ///< Size of a pushed stack slot.
+ unsigned MoveInstrSize; ///< Size of a "move" instruction.
+ unsigned StackDivide; ///< Amount to adjust stack size by.
+protected:
+ /// Size of a "push" instruction for the given register.
+ unsigned PushInstrSize(unsigned Reg) const {
+ switch (Reg) {
+ case X86::EBX:
+ case X86::ECX:
+ case X86::EDX:
+ case X86::EDI:
+ case X86::ESI:
+ case X86::EBP:
+ case X86::RBX:
+ case X86::RBP:
+ return 1;
+ case X86::R12:
+ case X86::R13:
+ case X86::R14:
+ case X86::R15:
+ return 2;
+ }
+ return 1;
+ }
+
+private:
+ /// Get the compact unwind number for a given register. The number
+ /// corresponds to the enum lists in compact_unwind_encoding.h.
+ int getCompactUnwindRegNum(unsigned Reg) const {
+ static const MCPhysReg CU32BitRegs[7] = {
+ X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
+ };
+ static const MCPhysReg CU64BitRegs[] = {
+ X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+ };
+ const MCPhysReg *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs;
+ for (int Idx = 1; *CURegs; ++CURegs, ++Idx)
+ if (*CURegs == Reg)
+ return Idx;
+
+ return -1;
+ }
+
+ /// Return the registers encoded for a compact encoding with a frame
+ /// pointer.
+ uint32_t encodeCompactUnwindRegistersWithFrame() const {
+ // Encode the registers in the order they were saved --- 3-bits per
+ // register. The list of saved registers is assumed to be in reverse
+ // order. The registers are numbered from 1 to CU_NUM_SAVED_REGS.
+ uint32_t RegEnc = 0;
+ for (int i = 0, Idx = 0; i != CU_NUM_SAVED_REGS; ++i) {
+ unsigned Reg = SavedRegs[i];
+ if (Reg == 0) break;
+
+ int CURegNum = getCompactUnwindRegNum(Reg);
+ if (CURegNum == -1) return ~0U;
+
+ // Encode the 3-bit register number in order, skipping over 3-bits for
+ // each register.
+ RegEnc |= (CURegNum & 0x7) << (Idx++ * 3);
+ }
+
+ assert((RegEnc & 0x3FFFF) == RegEnc &&
+ "Invalid compact register encoding!");
+ return RegEnc;
+ }
+
+ /// Create the permutation encoding used with frameless stacks. It is
+ /// passed the number of registers to be saved and an array of the registers
+ /// saved.
+ uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned RegCount) const {
+ // The saved registers are numbered from 1 to 6. In order to encode the
+ // order in which they were saved, we re-number them according to their
+ // place in the register order. The re-numbering is relative to the last
+ // re-numbered register. E.g., if we have registers {6, 2, 4, 5} saved in
+ // that order:
+ //
+ // Orig Re-Num
+ // ---- ------
+ // 6 6
+ // 2 2
+ // 4 3
+ // 5 3
+ //
+ for (unsigned i = 0; i < RegCount; ++i) {
+ int CUReg = getCompactUnwindRegNum(SavedRegs[i]);
+ if (CUReg == -1) return ~0U;
+ SavedRegs[i] = CUReg;
+ }
+
+ // Reverse the list.
+ std::reverse(&SavedRegs[0], &SavedRegs[CU_NUM_SAVED_REGS]);
+
+ uint32_t RenumRegs[CU_NUM_SAVED_REGS];
+ for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i){
+ unsigned Countless = 0;
+ for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j)
+ if (SavedRegs[j] < SavedRegs[i])
+ ++Countless;
+
+ RenumRegs[i] = SavedRegs[i] - Countless - 1;
+ }
+
+ // Take the renumbered values and encode them into a 10-bit number.
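+    // The coefficients below (120, 24, 6, 2, 1, ...) form a mixed-radix,
+    // Lehmer-code style encoding of the saved-register ordering that fits
+    // in 10 bits.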
+ uint32_t permutationEncoding = 0;
+ switch (RegCount) {
+ case 6:
+ permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1]
+ + 6 * RenumRegs[2] + 2 * RenumRegs[3]
+ + RenumRegs[4];
+ break;
+ case 5:
+ permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2]
+ + 6 * RenumRegs[3] + 2 * RenumRegs[4]
+ + RenumRegs[5];
+ break;
+ case 4:
+ permutationEncoding |= 60 * RenumRegs[2] + 12 * RenumRegs[3]
+ + 3 * RenumRegs[4] + RenumRegs[5];
+ break;
+ case 3:
+ permutationEncoding |= 20 * RenumRegs[3] + 4 * RenumRegs[4]
+ + RenumRegs[5];
+ break;
+ case 2:
+ permutationEncoding |= 5 * RenumRegs[4] + RenumRegs[5];
+ break;
+ case 1:
+ permutationEncoding |= RenumRegs[5];
+ break;
+ }
+
+ assert((permutationEncoding & 0x3FF) == permutationEncoding &&
+ "Invalid compact register encoding!");
+ return permutationEncoding;
+ }
+
+public:
+ DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI)
+ : X86AsmBackend(T, STI), MRI(MRI), TT(STI.getTargetTriple()),
+ Is64Bit(TT.isArch64Bit()) {
+ memset(SavedRegs, 0, sizeof(SavedRegs));
+ OffsetSize = Is64Bit ? 8 : 4;
+ MoveInstrSize = Is64Bit ? 3 : 2;
+ StackDivide = Is64Bit ? 8 : 4;
+ }
+
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ uint32_t CPUType = cantFail(MachO::getCPUType(TT));
+ uint32_t CPUSubType = cantFail(MachO::getCPUSubType(TT));
+ return createX86MachObjectWriter(Is64Bit, CPUType, CPUSubType);
+ }
+
+ /// Implementation of algorithm to generate the compact unwind encoding
+ /// for the CFI instructions.
+ uint32_t
+ generateCompactUnwindEncoding(ArrayRef<MCCFIInstruction> Instrs) const override {
+ if (Instrs.empty()) return 0;
+
+ // Reset the saved registers.
+ unsigned SavedRegIdx = 0;
+ memset(SavedRegs, 0, sizeof(SavedRegs));
+
+ bool HasFP = false;
+
+ // Encode that we are using EBP/RBP as the frame pointer.
+ uint32_t CompactUnwindEncoding = 0;
+
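+    // Byte offset of the stack-allocation immediate within a
+    // 'sub $nnnnnn, %[re]sp' instruction: the immediate follows the REX.W
+    // prefix, opcode and ModRM byte in 64-bit mode (3 bytes), or just the
+    // opcode and ModRM byte in 32-bit mode (2 bytes).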
+ unsigned SubtractInstrIdx = Is64Bit ? 3 : 2;
+ unsigned InstrOffset = 0;
+ unsigned StackAdjust = 0;
+ unsigned StackSize = 0;
+ unsigned NumDefCFAOffsets = 0;
+
+ for (unsigned i = 0, e = Instrs.size(); i != e; ++i) {
+ const MCCFIInstruction &Inst = Instrs[i];
+
+ switch (Inst.getOperation()) {
+ default:
+ // Any other CFI directives indicate a frame that we aren't prepared
+ // to represent via compact unwind, so just bail out.
+ return 0;
+ case MCCFIInstruction::OpDefCfaRegister: {
+ // Defines a frame pointer. E.g.
+ //
+ // movq %rsp, %rbp
+ // L0:
+ // .cfi_def_cfa_register %rbp
+ //
+ HasFP = true;
+
+ // If the frame pointer is other than esp/rsp, we do not have a way to
+ // generate a compact unwinding representation, so bail out.
+ if (*MRI.getLLVMRegNum(Inst.getRegister(), true) !=
+ (Is64Bit ? X86::RBP : X86::EBP))
+ return 0;
+
+ // Reset the counts.
+ memset(SavedRegs, 0, sizeof(SavedRegs));
+ StackAdjust = 0;
+ SavedRegIdx = 0;
+ InstrOffset += MoveInstrSize;
+ break;
+ }
+ case MCCFIInstruction::OpDefCfaOffset: {
+ // Defines a new offset for the CFA. E.g.
+ //
+ // With frame:
+ //
+ // pushq %rbp
+ // L0:
+ // .cfi_def_cfa_offset 16
+ //
+ // Without frame:
+ //
+ // subq $72, %rsp
+ // L0:
+ // .cfi_def_cfa_offset 80
+ //
+ StackSize = Inst.getOffset() / StackDivide;
+ ++NumDefCFAOffsets;
+ break;
+ }
+ case MCCFIInstruction::OpOffset: {
+ // Defines a "push" of a callee-saved register. E.g.
+ //
+ // pushq %r15
+ // pushq %r14
+ // pushq %rbx
+ // L0:
+ // subq $120, %rsp
+ // L1:
+ // .cfi_offset %rbx, -40
+ // .cfi_offset %r14, -32
+ // .cfi_offset %r15, -24
+ //
+ if (SavedRegIdx == CU_NUM_SAVED_REGS)
+ // If there are too many saved registers, we cannot use a compact
+ // unwind encoding.
+ return CU::UNWIND_MODE_DWARF;
+
+ unsigned Reg = *MRI.getLLVMRegNum(Inst.getRegister(), true);
+ SavedRegs[SavedRegIdx++] = Reg;
+ StackAdjust += OffsetSize;
+ InstrOffset += PushInstrSize(Reg);
+ break;
+ }
+ }
+ }
+
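+    // Convert the accumulated push bytes into a count of stack slots.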
+ StackAdjust /= StackDivide;
+
+ if (HasFP) {
+ if ((StackAdjust & 0xFF) != StackAdjust)
+ // Offset was too big for a compact unwind encoding.
+ return CU::UNWIND_MODE_DWARF;
+
+ // Get the encoding of the saved registers when we have a frame pointer.
+ uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame();
+ if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
+
+ CompactUnwindEncoding |= CU::UNWIND_MODE_BP_FRAME;
+ CompactUnwindEncoding |= (StackAdjust & 0xFF) << 16;
+ CompactUnwindEncoding |= RegEnc & CU::UNWIND_BP_FRAME_REGISTERS;
+ } else {
+ SubtractInstrIdx += InstrOffset;
+ ++StackAdjust;
+
+ if ((StackSize & 0xFF) == StackSize) {
+ // Frameless stack with a small stack size.
+ CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IMMD;
+
+ // Encode the stack size.
+ CompactUnwindEncoding |= (StackSize & 0xFF) << 16;
+ } else {
+ if ((StackAdjust & 0x7) != StackAdjust)
+ // The extra stack adjustments are too big for us to handle.
+ return CU::UNWIND_MODE_DWARF;
+
+ // Frameless stack with an offset too large for us to encode compactly.
+ CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IND;
+
+ // Encode the offset to the nnnnnn value in the 'subl $nnnnnn, ESP'
+ // instruction.
+ CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16;
+
+ // Encode any extra stack adjustments (done via push instructions).
+ CompactUnwindEncoding |= (StackAdjust & 0x7) << 13;
+ }
+
+ // Encode the number of registers saved. (Reverse the list first.)
+ std::reverse(&SavedRegs[0], &SavedRegs[SavedRegIdx]);
+ CompactUnwindEncoding |= (SavedRegIdx & 0x7) << 10;
+
+ // Get the encoding of the saved registers when we don't have a frame
+ // pointer.
+ uint32_t RegEnc = encodeCompactUnwindRegistersWithoutFrame(SavedRegIdx);
+ if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
+
+ // Encode the register encoding.
+ CompactUnwindEncoding |=
+ RegEnc & CU::UNWIND_FRAMELESS_STACK_REG_PERMUTATION;
+ }
+
+ return CompactUnwindEncoding;
+ }
+};
+
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
+ const MCTargetOptions &Options) {
+ const Triple &TheTriple = STI.getTargetTriple();
+ if (TheTriple.isOSBinFormatMachO())
+ return new DarwinX86AsmBackend(T, MRI, STI);
+
+ if (TheTriple.isOSWindows() && TheTriple.isOSBinFormatCOFF())
+ return new WindowsX86AsmBackend(T, false, STI);
+
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+
+ if (TheTriple.isOSIAMCU())
+ return new ELFX86_IAMCUAsmBackend(T, OSABI, STI);
+
+ return new ELFX86_32AsmBackend(T, OSABI, STI);
+}
+
+MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
+ const MCTargetOptions &Options) {
+ const Triple &TheTriple = STI.getTargetTriple();
+ if (TheTriple.isOSBinFormatMachO())
+ return new DarwinX86AsmBackend(T, MRI, STI);
+
+ if (TheTriple.isOSWindows() && TheTriple.isOSBinFormatCOFF())
+ return new WindowsX86AsmBackend(T, true, STI);
+
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+
+ if (TheTriple.getEnvironment() == Triple::GNUX32)
+ return new ELFX86_X32AsmBackend(T, OSABI, STI);
+ return new ELFX86_64AsmBackend(T, OSABI, STI);
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
new file mode 100644
index 000000000000..4db1bfc25177
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -0,0 +1,1226 @@
+//===-- X86BaseInfo.h - Top level definitions for X86 -------- --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the X86 target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86BASEINFO_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86BASEINFO_H
+
+#include "X86MCTargetDesc.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+namespace X86 {
+ // Enums for memory operand decoding. Each memory operand is represented with
+ // a 5 operand sequence in the form:
+ // [BaseReg, ScaleAmt, IndexReg, Disp, Segment]
+ // These enums help decode this.
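+  //  For example, the AT&T-syntax memory operand 4(%eax,%ebx,2) corresponds
+  //  to BaseReg = EAX, ScaleAmt = 2, IndexReg = EBX, Disp = 4, and no segment
+  //  override (Segment = 0).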
+ enum {
+ AddrBaseReg = 0,
+ AddrScaleAmt = 1,
+ AddrIndexReg = 2,
+ AddrDisp = 3,
+
+ /// AddrSegmentReg - The operand # of the segment in the memory operand.
+ AddrSegmentReg = 4,
+
+ /// AddrNumOperands - Total number of operands in a memory reference.
+ AddrNumOperands = 5
+ };
+
+ /// AVX512 static rounding constants. These need to match the values in
+ /// avx512fintrin.h.
+ enum STATIC_ROUNDING {
+ TO_NEAREST_INT = 0,
+ TO_NEG_INF = 1,
+ TO_POS_INF = 2,
+ TO_ZERO = 3,
+ CUR_DIRECTION = 4,
+ NO_EXC = 8
+ };
+
+  /// Constants describing the instruction prefixes present, if any.
+ enum IPREFIXES {
+ IP_NO_PREFIX = 0,
+ IP_HAS_OP_SIZE = 1U << 0,
+ IP_HAS_AD_SIZE = 1U << 1,
+ IP_HAS_REPEAT_NE = 1U << 2,
+ IP_HAS_REPEAT = 1U << 3,
+ IP_HAS_LOCK = 1U << 4,
+ IP_HAS_NOTRACK = 1U << 5,
+ IP_USE_VEX = 1U << 6,
+ IP_USE_VEX2 = 1U << 7,
+ IP_USE_VEX3 = 1U << 8,
+ IP_USE_EVEX = 1U << 9,
+ IP_USE_DISP8 = 1U << 10,
+ IP_USE_DISP32 = 1U << 11,
+ };
+
+ enum OperandType : unsigned {
+ /// AVX512 embedded rounding control. This should only have values 0-3.
+ OPERAND_ROUNDING_CONTROL = MCOI::OPERAND_FIRST_TARGET,
+ OPERAND_COND_CODE,
+ };
+
+ // X86 specific condition code. These correspond to X86_*_COND in
+  // X86InstrInfo.td. They must be kept in sync.
+ enum CondCode {
+ COND_O = 0,
+ COND_NO = 1,
+ COND_B = 2,
+ COND_AE = 3,
+ COND_E = 4,
+ COND_NE = 5,
+ COND_BE = 6,
+ COND_A = 7,
+ COND_S = 8,
+ COND_NS = 9,
+ COND_P = 10,
+ COND_NP = 11,
+ COND_L = 12,
+ COND_GE = 13,
+ COND_LE = 14,
+ COND_G = 15,
+ LAST_VALID_COND = COND_G,
+
+ // Artificial condition codes. These are used by analyzeBranch
+ // to indicate a block terminated with two conditional branches that together
+ // form a compound condition. They occur in code using FCMP_OEQ or FCMP_UNE,
+ // which can't be represented on x86 with a single condition. These
+ // are never used in MachineInstrs and are inverses of one another.
+ COND_NE_OR_P,
+ COND_E_AND_NP,
+
+ COND_INVALID
+ };
+
+ // The classification for the first instruction in macro fusion.
+ enum class FirstMacroFusionInstKind {
+ // TEST
+ Test,
+ // CMP
+ Cmp,
+ // AND
+ And,
+ // ADD, SUB
+ AddSub,
+ // INC, DEC
+ IncDec,
+ // Not valid as a first macro fusion instruction
+ Invalid
+ };
+
+ enum class SecondMacroFusionInstKind {
+ // JA, JB and variants.
+ AB,
+ // JE, JL, JG and variants.
+ ELG,
+ // JS, JP, JO and variants
+ SPO,
+ // Not a fusible jump.
+ Invalid,
+ };
+
+ /// \returns the type of the first instruction in macro-fusion.
+ inline FirstMacroFusionInstKind
+ classifyFirstOpcodeInMacroFusion(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ return FirstMacroFusionInstKind::Invalid;
+ // TEST
+ case X86::TEST16i16:
+ case X86::TEST16mr:
+ case X86::TEST16ri:
+ case X86::TEST16rr:
+ case X86::TEST32i32:
+ case X86::TEST32mr:
+ case X86::TEST32ri:
+ case X86::TEST32rr:
+ case X86::TEST64i32:
+ case X86::TEST64mr:
+ case X86::TEST64ri32:
+ case X86::TEST64rr:
+ case X86::TEST8i8:
+ case X86::TEST8mr:
+ case X86::TEST8ri:
+ case X86::TEST8rr:
+ return FirstMacroFusionInstKind::Test;
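+  // AND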
+ case X86::AND16i16:
+ case X86::AND16ri:
+ case X86::AND16ri8:
+ case X86::AND16rm:
+ case X86::AND16rr:
+ case X86::AND16rr_REV:
+ case X86::AND32i32:
+ case X86::AND32ri:
+ case X86::AND32ri8:
+ case X86::AND32rm:
+ case X86::AND32rr:
+ case X86::AND32rr_REV:
+ case X86::AND64i32:
+ case X86::AND64ri32:
+ case X86::AND64ri8:
+ case X86::AND64rm:
+ case X86::AND64rr:
+ case X86::AND64rr_REV:
+ case X86::AND8i8:
+ case X86::AND8ri:
+ case X86::AND8ri8:
+ case X86::AND8rm:
+ case X86::AND8rr:
+ case X86::AND8rr_REV:
+ return FirstMacroFusionInstKind::And;
+ // CMP
+ case X86::CMP16i16:
+ case X86::CMP16mr:
+ case X86::CMP16ri:
+ case X86::CMP16ri8:
+ case X86::CMP16rm:
+ case X86::CMP16rr:
+ case X86::CMP16rr_REV:
+ case X86::CMP32i32:
+ case X86::CMP32mr:
+ case X86::CMP32ri:
+ case X86::CMP32ri8:
+ case X86::CMP32rm:
+ case X86::CMP32rr:
+ case X86::CMP32rr_REV:
+ case X86::CMP64i32:
+ case X86::CMP64mr:
+ case X86::CMP64ri32:
+ case X86::CMP64ri8:
+ case X86::CMP64rm:
+ case X86::CMP64rr:
+ case X86::CMP64rr_REV:
+ case X86::CMP8i8:
+ case X86::CMP8mr:
+ case X86::CMP8ri:
+ case X86::CMP8ri8:
+ case X86::CMP8rm:
+ case X86::CMP8rr:
+ case X86::CMP8rr_REV:
+ return FirstMacroFusionInstKind::Cmp;
+ // ADD
+ case X86::ADD16i16:
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ case X86::ADD16rm:
+ case X86::ADD16rr:
+ case X86::ADD16rr_REV:
+ case X86::ADD32i32:
+ case X86::ADD32ri:
+ case X86::ADD32ri8:
+ case X86::ADD32rm:
+ case X86::ADD32rr:
+ case X86::ADD32rr_REV:
+ case X86::ADD64i32:
+ case X86::ADD64ri32:
+ case X86::ADD64ri8:
+ case X86::ADD64rm:
+ case X86::ADD64rr:
+ case X86::ADD64rr_REV:
+ case X86::ADD8i8:
+ case X86::ADD8ri:
+ case X86::ADD8ri8:
+ case X86::ADD8rm:
+ case X86::ADD8rr:
+ case X86::ADD8rr_REV:
+ // SUB
+ case X86::SUB16i16:
+ case X86::SUB16ri:
+ case X86::SUB16ri8:
+ case X86::SUB16rm:
+ case X86::SUB16rr:
+ case X86::SUB16rr_REV:
+ case X86::SUB32i32:
+ case X86::SUB32ri:
+ case X86::SUB32ri8:
+ case X86::SUB32rm:
+ case X86::SUB32rr:
+ case X86::SUB32rr_REV:
+ case X86::SUB64i32:
+ case X86::SUB64ri32:
+ case X86::SUB64ri8:
+ case X86::SUB64rm:
+ case X86::SUB64rr:
+ case X86::SUB64rr_REV:
+ case X86::SUB8i8:
+ case X86::SUB8ri:
+ case X86::SUB8ri8:
+ case X86::SUB8rm:
+ case X86::SUB8rr:
+ case X86::SUB8rr_REV:
+ return FirstMacroFusionInstKind::AddSub;
+ // INC
+ case X86::INC16r:
+ case X86::INC16r_alt:
+ case X86::INC32r:
+ case X86::INC32r_alt:
+ case X86::INC64r:
+ case X86::INC8r:
+ // DEC
+ case X86::DEC16r:
+ case X86::DEC16r_alt:
+ case X86::DEC32r:
+ case X86::DEC32r_alt:
+ case X86::DEC64r:
+ case X86::DEC8r:
+ return FirstMacroFusionInstKind::IncDec;
+ }
+ }
+
+ /// \returns the type of the second instruction in macro-fusion.
+ inline SecondMacroFusionInstKind
+ classifySecondCondCodeInMacroFusion(X86::CondCode CC) {
+ if (CC == X86::COND_INVALID)
+ return SecondMacroFusionInstKind::Invalid;
+
+ switch (CC) {
+ default:
+ return SecondMacroFusionInstKind::Invalid;
+ // JE,JZ
+ case X86::COND_E:
+ // JNE,JNZ
+ case X86::COND_NE:
+ // JL,JNGE
+ case X86::COND_L:
+ // JLE,JNG
+ case X86::COND_LE:
+ // JG,JNLE
+ case X86::COND_G:
+ // JGE,JNL
+ case X86::COND_GE:
+ return SecondMacroFusionInstKind::ELG;
+ // JB,JC
+ case X86::COND_B:
+ // JNA,JBE
+ case X86::COND_BE:
+ // JA,JNBE
+ case X86::COND_A:
+ // JAE,JNC,JNB
+ case X86::COND_AE:
+ return SecondMacroFusionInstKind::AB;
+ // JS
+ case X86::COND_S:
+ // JNS
+ case X86::COND_NS:
+ // JP,JPE
+ case X86::COND_P:
+ // JNP,JPO
+ case X86::COND_NP:
+ // JO
+ case X86::COND_O:
+ // JNO
+ case X86::COND_NO:
+ return SecondMacroFusionInstKind::SPO;
+ }
+ }
+
+ /// \param FirstKind kind of the first instruction in macro fusion.
+ /// \param SecondKind kind of the second instruction in macro fusion.
+ ///
+  /// \returns true if the two instructions can be macro fused.
+ inline bool isMacroFused(FirstMacroFusionInstKind FirstKind,
+ SecondMacroFusionInstKind SecondKind) {
+ switch (FirstKind) {
+ case X86::FirstMacroFusionInstKind::Test:
+ case X86::FirstMacroFusionInstKind::And:
+ return true;
+ case X86::FirstMacroFusionInstKind::Cmp:
+ case X86::FirstMacroFusionInstKind::AddSub:
+ return SecondKind == X86::SecondMacroFusionInstKind::AB ||
+ SecondKind == X86::SecondMacroFusionInstKind::ELG;
+ case X86::FirstMacroFusionInstKind::IncDec:
+ return SecondKind == X86::SecondMacroFusionInstKind::ELG;
+ case X86::FirstMacroFusionInstKind::Invalid:
+ return false;
+ }
+ llvm_unreachable("unknown fusion type");
+ }
+
+ /// Defines the possible values of the branch boundary alignment mask.
+ enum AlignBranchBoundaryKind : uint8_t {
+ AlignBranchNone = 0,
+ AlignBranchFused = 1U << 0,
+ AlignBranchJcc = 1U << 1,
+ AlignBranchJmp = 1U << 2,
+ AlignBranchCall = 1U << 3,
+ AlignBranchRet = 1U << 4,
+ AlignBranchIndirect = 1U << 5
+ };
+
+ /// Defines the encoding values for segment override prefix.
+ enum EncodingOfSegmentOverridePrefix : uint8_t {
+ CS_Encoding = 0x2E,
+ DS_Encoding = 0x3E,
+ ES_Encoding = 0x26,
+ FS_Encoding = 0x64,
+ GS_Encoding = 0x65,
+ SS_Encoding = 0x36
+ };
+
+ /// Given a segment register, return the encoding of the segment override
+ /// prefix for it.
+ inline EncodingOfSegmentOverridePrefix
+ getSegmentOverridePrefixForReg(unsigned Reg) {
+ switch (Reg) {
+ default:
+ llvm_unreachable("Unknown segment register!");
+ case X86::CS:
+ return CS_Encoding;
+ case X86::DS:
+ return DS_Encoding;
+ case X86::ES:
+ return ES_Encoding;
+ case X86::FS:
+ return FS_Encoding;
+ case X86::GS:
+ return GS_Encoding;
+ case X86::SS:
+ return SS_Encoding;
+ }
+ }
+
+} // end namespace X86
+
+/// X86II - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace X86II {
+ /// Target Operand Flag enum.
+ enum TOF {
+ //===------------------------------------------------------------------===//
+ // X86 Specific MachineOperand flags.
+
+ MO_NO_FLAG,
+
+ /// MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a
+ /// relocation of:
+ /// SYMBOL_LABEL + [. - PICBASELABEL]
+ MO_GOT_ABSOLUTE_ADDRESS,
+
+ /// MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the
+ /// immediate should get the value of the symbol minus the PIC base label:
+ /// SYMBOL_LABEL - PICBASELABEL
+ MO_PIC_BASE_OFFSET,
+
+ /// MO_GOT - On a symbol operand this indicates that the immediate is the
+ /// offset to the GOT entry for the symbol name from the base of the GOT.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @GOT
+ MO_GOT,
+
+ /// MO_GOTOFF - On a symbol operand this indicates that the immediate is
+ /// the offset to the location of the symbol name from the base of the GOT.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @GOTOFF
+ MO_GOTOFF,
+
+ /// MO_GOTPCREL - On a symbol operand this indicates that the immediate is
+ /// offset to the GOT entry for the symbol name from the current code
+ /// location.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @GOTPCREL
+ MO_GOTPCREL,
+
+ /// MO_PLT - On a symbol operand this indicates that the immediate is
+ /// offset to the PLT entry of symbol name from the current code location.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @PLT
+ MO_PLT,
+
+ /// MO_TLSGD - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the TLS index structure that contains
+ /// the module number and variable offset for the symbol. Used in the
+ /// general dynamic TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TLSGD
+ MO_TLSGD,
+
+ /// MO_TLSLD - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the TLS index for the module that
+ /// contains the symbol. When this index is passed to a call to
+ /// __tls_get_addr, the function will return the base address of the TLS
+ /// block for the symbol. Used in the x86-64 local dynamic TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TLSLD
+ MO_TLSLD,
+
+ /// MO_TLSLDM - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the TLS index for the module that
+ /// contains the symbol. When this index is passed to a call to
+ /// ___tls_get_addr, the function will return the base address of the TLS
+ /// block for the symbol. Used in the IA32 local dynamic TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TLSLDM
+ MO_TLSLDM,
+
+ /// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the thread-pointer offset for the
+ /// symbol. Used in the x86-64 initial exec TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @GOTTPOFF
+ MO_GOTTPOFF,
+
+ /// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is
+ /// the absolute address of the GOT entry with the negative thread-pointer
+ /// offset for the symbol. Used in the non-PIC IA32 initial exec TLS access
+ /// model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @INDNTPOFF
+ MO_INDNTPOFF,
+
+ /// MO_TPOFF - On a symbol operand this indicates that the immediate is
+ /// the thread-pointer offset for the symbol. Used in the x86-64 local
+ /// exec TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TPOFF
+ MO_TPOFF,
+
+ /// MO_DTPOFF - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the TLS offset of the symbol. Used
+ /// in the local dynamic TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @DTPOFF
+ MO_DTPOFF,
+
+ /// MO_NTPOFF - On a symbol operand this indicates that the immediate is
+ /// the negative thread-pointer offset for the symbol. Used in the IA32
+ /// local exec TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @NTPOFF
+ MO_NTPOFF,
+
+ /// MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the negative thread-pointer offset for
+ /// the symbol. Used in the PIC IA32 initial exec TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @GOTNTPOFF
+ MO_GOTNTPOFF,
+
+ /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the
+ /// reference is actually to the "__imp_FOO" symbol. This is used for
+ /// dllimport linkage on windows.
+ MO_DLLIMPORT,
+
+ /// MO_DARWIN_NONLAZY - On a symbol operand "FOO", this indicates that the
+ /// reference is actually to the "FOO$non_lazy_ptr" symbol, which is a
+ /// non-PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
+ MO_DARWIN_NONLAZY,
+
+ /// MO_DARWIN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this indicates
+ /// that the reference is actually to "FOO$non_lazy_ptr - PICBASE", which is
+ /// a PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
+ MO_DARWIN_NONLAZY_PIC_BASE,
+
+ /// MO_TLVP - On a symbol operand this indicates that the immediate is
+ /// some TLS offset.
+ ///
+ /// This is the TLS offset for the Darwin TLS mechanism.
+ MO_TLVP,
+
+ /// MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate
+ /// is some TLS offset from the picbase.
+ ///
+ /// This is the 32-bit TLS offset for Darwin TLS in PIC mode.
+ MO_TLVP_PIC_BASE,
+
+ /// MO_SECREL - On a symbol operand this indicates that the immediate is
+ /// the offset from beginning of section.
+ ///
+ /// This is the TLS offset for the COFF/Windows TLS mechanism.
+ MO_SECREL,
+
+ /// MO_ABS8 - On a symbol operand this indicates that the symbol is known
+ /// to be an absolute symbol in range [0,128), so we can use the @ABS8
+ /// symbol modifier.
+ MO_ABS8,
+
+ /// MO_COFFSTUB - On a symbol operand "FOO", this indicates that the
+ /// reference is actually to the ".refptr.FOO" symbol. This is used for
+ /// stub symbols on windows.
+ MO_COFFSTUB,
+ };
+
+ enum : uint64_t {
+ //===------------------------------------------------------------------===//
+ // Instruction encodings. These are the standard/most common forms for X86
+ // instructions.
+ //
+
+ // PseudoFrm - This represents an instruction that is a pseudo instruction
+ // or one that has not been implemented yet. It is illegal to code generate
+ // it, but tolerated for intermediate implementation stages.
+ Pseudo = 0,
+
+ /// Raw - This form is for instructions that don't have any operands, so
+ /// they are just a fixed opcode value, like 'leave'.
+ RawFrm = 1,
+
+ /// AddRegFrm - This form is used for instructions like 'push r32' that have
+ /// their one register operand added to their opcode.
+ AddRegFrm = 2,
+
+ /// RawFrmMemOffs - This form is for instructions that store an absolute
+ /// memory offset as an immediate with a possible segment override.
+ RawFrmMemOffs = 3,
+
+ /// RawFrmSrc - This form is for instructions that use the source index
+ /// register SI/ESI/RSI with a possible segment override.
+ RawFrmSrc = 4,
+
+ /// RawFrmDst - This form is for instructions that use the destination index
+ /// register DI/EDI/RDI.
+ RawFrmDst = 5,
+
+ /// RawFrmDstSrc - This form is for instructions that use the source index
+ /// register SI/ESI/RSI with a possible segment override, and also the
+ /// destination index register DI/EDI/RDI.
+ RawFrmDstSrc = 6,
+
+ /// RawFrmImm8 - This is used for the ENTER instruction, which has two
+ /// immediates, the first of which is a 16-bit immediate (specified by
+    /// the imm encoding) and the second is an 8-bit fixed value.
+ RawFrmImm8 = 7,
+
+ /// RawFrmImm16 - This is used for CALL FAR instructions, which have two
+ /// immediates, the first of which is a 16 or 32-bit immediate (specified by
+ /// the imm encoding) and the second is a 16-bit fixed value. In the AMD
+ /// manual, this operand is described as pntr16:32 and pntr16:16
+ RawFrmImm16 = 8,
+
+ /// AddCCFrm - This form is used for Jcc that encode the condition code
+ /// in the lower 4 bits of the opcode.
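+    /// For example, the one-byte Jcc opcodes are 0x70 + condition code, so
+    /// JE (COND_E == 4) is encoded as opcode 0x74.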
+ AddCCFrm = 9,
+
+ /// PrefixByte - This form is used for instructions that represent a prefix
+ /// byte like data16 or rep.
+ PrefixByte = 10,
+
+ /// MRM[0-7][rm] - These forms are used to represent instructions that use
+ /// a Mod/RM byte, and use the middle field to hold extended opcode
+ /// information. In the intel manual these are represented as /0, /1, ...
+ ///
+
+ // Instructions operate on a register Reg/Opcode operand not the r/m field.
+ MRMr0 = 21,
+
+    /// MRMSrcMemFSIB - Like MRMSrcMem, but forces use of the SIB field.
+ MRMSrcMemFSIB = 22,
+
+    /// MRMDestMemFSIB - Like MRMDestMem, but forces use of the SIB field.
+ MRMDestMemFSIB = 23,
+
+ /// MRMDestMem - This form is used for instructions that use the Mod/RM byte
+ /// to specify a destination, which in this case is memory.
+ ///
+ MRMDestMem = 24,
+
+ /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte
+ /// to specify a source, which in this case is memory.
+ ///
+ MRMSrcMem = 25,
+
+ /// MRMSrcMem4VOp3 - This form is used for instructions that encode
+ /// operand 3 with VEX.VVVV and load from memory.
+ ///
+ MRMSrcMem4VOp3 = 26,
+
+ /// MRMSrcMemOp4 - This form is used for instructions that use the Mod/RM
+ /// byte to specify the fourth source, which in this case is memory.
+ ///
+ MRMSrcMemOp4 = 27,
+
+ /// MRMSrcMemCC - This form is used for instructions that use the Mod/RM
+ /// byte to specify the operands and also encodes a condition code.
+ ///
+ MRMSrcMemCC = 28,
+
+    /// MRMXmCC - This form is used for instructions that use the Mod/RM byte
+    /// to specify a memory source, but don't use the middle field, and also
+    /// encode a condition code.
+ ///
+ MRMXmCC = 30,
+
+ /// MRMXm - This form is used for instructions that use the Mod/RM byte
+    /// to specify a memory source, but don't use the middle field.
+ ///
+ MRMXm = 31,
+
+ // Next, instructions that operate on a memory r/m operand...
+ MRM0m = 32, MRM1m = 33, MRM2m = 34, MRM3m = 35, // Format /0 /1 /2 /3
+ MRM4m = 36, MRM5m = 37, MRM6m = 38, MRM7m = 39, // Format /4 /5 /6 /7
+
+ /// MRMDestReg - This form is used for instructions that use the Mod/RM byte
+ /// to specify a destination, which in this case is a register.
+ ///
+ MRMDestReg = 40,
+
+ /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte
+ /// to specify a source, which in this case is a register.
+ ///
+ MRMSrcReg = 41,
+
+ /// MRMSrcReg4VOp3 - This form is used for instructions that encode
+ /// operand 3 with VEX.VVVV and do not load from memory.
+ ///
+ MRMSrcReg4VOp3 = 42,
+
+ /// MRMSrcRegOp4 - This form is used for instructions that use the Mod/RM
+ /// byte to specify the fourth source, which in this case is a register.
+ ///
+ MRMSrcRegOp4 = 43,
+
+ /// MRMSrcRegCC - This form is used for instructions that use the Mod/RM
+ /// byte to specify the operands and also encodes a condition code
+ ///
+ MRMSrcRegCC = 44,
+
+    /// MRMXrCC - This form is used for instructions that use the Mod/RM byte
+    /// to specify a register source, but don't use the middle field, and also
+    /// encode a condition code.
+ ///
+ MRMXrCC = 46,
+
+ /// MRMXr - This form is used for instructions that use the Mod/RM byte
+    /// to specify a register source, but don't use the middle field.
+ ///
+ MRMXr = 47,
+
+ // Instructions that operate on a register r/m operand...
+ MRM0r = 48, MRM1r = 49, MRM2r = 50, MRM3r = 51, // Format /0 /1 /2 /3
+ MRM4r = 52, MRM5r = 53, MRM6r = 54, MRM7r = 55, // Format /4 /5 /6 /7
+
+    // Instructions that have mod==11 and an opcode but ignore the r/m field.
+ MRM0X = 56, MRM1X = 57, MRM2X = 58, MRM3X = 59, // Format /0 /1 /2 /3
+ MRM4X = 60, MRM5X = 61, MRM6X = 62, MRM7X = 63, // Format /4 /5 /6 /7
+
+ /// MRM_XX - A mod/rm byte of exactly 0xXX.
+ MRM_C0 = 64, MRM_C1 = 65, MRM_C2 = 66, MRM_C3 = 67,
+ MRM_C4 = 68, MRM_C5 = 69, MRM_C6 = 70, MRM_C7 = 71,
+ MRM_C8 = 72, MRM_C9 = 73, MRM_CA = 74, MRM_CB = 75,
+ MRM_CC = 76, MRM_CD = 77, MRM_CE = 78, MRM_CF = 79,
+ MRM_D0 = 80, MRM_D1 = 81, MRM_D2 = 82, MRM_D3 = 83,
+ MRM_D4 = 84, MRM_D5 = 85, MRM_D6 = 86, MRM_D7 = 87,
+ MRM_D8 = 88, MRM_D9 = 89, MRM_DA = 90, MRM_DB = 91,
+ MRM_DC = 92, MRM_DD = 93, MRM_DE = 94, MRM_DF = 95,
+ MRM_E0 = 96, MRM_E1 = 97, MRM_E2 = 98, MRM_E3 = 99,
+ MRM_E4 = 100, MRM_E5 = 101, MRM_E6 = 102, MRM_E7 = 103,
+ MRM_E8 = 104, MRM_E9 = 105, MRM_EA = 106, MRM_EB = 107,
+ MRM_EC = 108, MRM_ED = 109, MRM_EE = 110, MRM_EF = 111,
+ MRM_F0 = 112, MRM_F1 = 113, MRM_F2 = 114, MRM_F3 = 115,
+ MRM_F4 = 116, MRM_F5 = 117, MRM_F6 = 118, MRM_F7 = 119,
+ MRM_F8 = 120, MRM_F9 = 121, MRM_FA = 122, MRM_FB = 123,
+ MRM_FC = 124, MRM_FD = 125, MRM_FE = 126, MRM_FF = 127,
+
+ FormMask = 127,
+
+ //===------------------------------------------------------------------===//
+ // Actual flags...
+
+ // OpSize - OpSizeFixed implies instruction never needs a 0x66 prefix.
+ // OpSize16 means this is a 16-bit instruction and needs 0x66 prefix in
+    // 32-bit mode. OpSize32 means this is a 32-bit instruction and needs a
+    // 0x66 prefix in 16-bit mode.
+ OpSizeShift = 7,
+ OpSizeMask = 0x3 << OpSizeShift,
+
+ OpSizeFixed = 0 << OpSizeShift,
+ OpSize16 = 1 << OpSizeShift,
+ OpSize32 = 2 << OpSizeShift,
+
+    // AdSize - AdSizeX implies this instruction determines its need of 0x67
+ // prefix from a normal ModRM memory operand. The other types indicate that
+ // an operand is encoded with a specific width and a prefix is needed if
+ // it differs from the current mode.
+ AdSizeShift = OpSizeShift + 2,
+ AdSizeMask = 0x3 << AdSizeShift,
+
+ AdSizeX = 0 << AdSizeShift,
+ AdSize16 = 1 << AdSizeShift,
+ AdSize32 = 2 << AdSizeShift,
+ AdSize64 = 3 << AdSizeShift,
+
+ //===------------------------------------------------------------------===//
+ // OpPrefix - There are several prefix bytes that are used as opcode
+ // extensions. These are 0x66, 0xF3, and 0xF2. If this field is 0 there is
+ // no prefix.
+ //
+ OpPrefixShift = AdSizeShift + 2,
+ OpPrefixMask = 0x3 << OpPrefixShift,
+
+ // PD - Prefix code for packed double precision vector floating point
+ // operations performed in the SSE registers.
+ PD = 1 << OpPrefixShift,
+
+ // XS, XD - These prefix codes are for single and double precision scalar
+ // floating point operations performed in the SSE registers.
+ XS = 2 << OpPrefixShift, XD = 3 << OpPrefixShift,
+
+ //===------------------------------------------------------------------===//
+ // OpMap - This field determines which opcode map this instruction
+ // belongs to. i.e. one-byte, two-byte, 0x0f 0x38, 0x0f 0x3a, etc.
+ //
+ OpMapShift = OpPrefixShift + 2,
+ OpMapMask = 0x7 << OpMapShift,
+
+ // OB - OneByte - Set if this instruction has a one byte opcode.
+ OB = 0 << OpMapShift,
+
+ // TB - TwoByte - Set if this instruction has a two byte opcode, which
+ // starts with a 0x0F byte before the real opcode.
+ TB = 1 << OpMapShift,
+
+ // T8, TA - Prefix after the 0x0F prefix.
+ T8 = 2 << OpMapShift, TA = 3 << OpMapShift,
+
+ // XOP8 - Prefix to include use of imm byte.
+ XOP8 = 4 << OpMapShift,
+
+ // XOP9 - Prefix to exclude use of imm byte.
+ XOP9 = 5 << OpMapShift,
+
+ // XOPA - Prefix to encode 0xA in VEX.MMMM of XOP instructions.
+ XOPA = 6 << OpMapShift,
+
+ /// ThreeDNow - This indicates that the instruction uses the
+ /// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents
+ /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction
+ /// storing a classifier in the imm8 field. To simplify our implementation,
+ /// we handle this by storing the classifier in the opcode field and using
+ /// this flag to indicate that the encoder should do the wacky 3DNow! thing.
+ ThreeDNow = 7 << OpMapShift,
+
+ //===------------------------------------------------------------------===//
+ // REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
+ // They are used to specify GPRs and SSE registers, 64-bit operand size,
+ // etc. We only care about the REX.W and REX.R bits, and only the former is
+ // statically determined.
+ //
+ REXShift = OpMapShift + 3,
+ REX_W = 1 << REXShift,
+
+ //===------------------------------------------------------------------===//
+ // This four-bit field describes the size of an immediate operand. Zero is
+ // unused so that we can tell if we forgot to set a value.
+ ImmShift = REXShift + 1,
+ ImmMask = 15 << ImmShift,
+ Imm8 = 1 << ImmShift,
+ Imm8PCRel = 2 << ImmShift,
+ Imm8Reg = 3 << ImmShift,
+ Imm16 = 4 << ImmShift,
+ Imm16PCRel = 5 << ImmShift,
+ Imm32 = 6 << ImmShift,
+ Imm32PCRel = 7 << ImmShift,
+ Imm32S = 8 << ImmShift,
+ Imm64 = 9 << ImmShift,
+
+ //===------------------------------------------------------------------===//
+ // FP Instruction Classification... Zero is non-fp instruction.
+
+ // FPTypeMask - Mask for all of the FP types...
+ FPTypeShift = ImmShift + 4,
+ FPTypeMask = 7 << FPTypeShift,
+
+ // NotFP - The default, set for instructions that do not use FP registers.
+ NotFP = 0 << FPTypeShift,
+
+ // ZeroArgFP - 0 arg FP instruction which implicitly pushes ST(0), f.e. fld0
+ ZeroArgFP = 1 << FPTypeShift,
+
+ // OneArgFP - 1 arg FP instructions which implicitly read ST(0), such as fst
+ OneArgFP = 2 << FPTypeShift,
+
+ // OneArgFPRW - 1 arg FP instruction which implicitly read ST(0) and write a
+ // result back to ST(0). For example, fcos, fsqrt, etc.
+ //
+ OneArgFPRW = 3 << FPTypeShift,
+
+ // TwoArgFP - 2 arg FP instructions which implicitly read ST(0), and an
+ // explicit argument, storing the result to either ST(0) or the implicit
+ // argument. For example: fadd, fsub, fmul, etc...
+ TwoArgFP = 4 << FPTypeShift,
+
+ // CompareFP - 2 arg FP instructions which implicitly read ST(0) and an
+ // explicit argument, but have no destination. Example: fucom, fucomi, ...
+ CompareFP = 5 << FPTypeShift,
+
+ // CondMovFP - "2 operand" floating point conditional move instructions.
+ CondMovFP = 6 << FPTypeShift,
+
+ // SpecialFP - Special instruction forms. Dispatch by opcode explicitly.
+ SpecialFP = 7 << FPTypeShift,
+
+ // Lock prefix
+ LOCKShift = FPTypeShift + 3,
+ LOCK = 1 << LOCKShift,
+
+ // REP prefix
+ REPShift = LOCKShift + 1,
+ REP = 1 << REPShift,
+
+ // Execution domain for SSE instructions.
+ // 0 means normal, non-SSE instruction.
+ SSEDomainShift = REPShift + 1,
+
+ // Encoding
+ EncodingShift = SSEDomainShift + 2,
+ EncodingMask = 0x3 << EncodingShift,
+
+ // VEX - encoding using 0xC4/0xC5
+ VEX = 1 << EncodingShift,
+
+ /// XOP - Opcode prefix used by XOP instructions.
+ XOP = 2 << EncodingShift,
+
+ // EVEX - Specifies that this instruction uses the EVEX form, which provides
+ // syntax support for up to 32 512-bit register operands and up to 7 16-bit
+ // mask operands, as well as source operand data swizzling/memory operand
+ // conversion, eviction hints, and rounding mode.
+ EVEX = 3 << EncodingShift,
+
+ // Opcode
+ OpcodeShift = EncodingShift + 2,
+
+ /// VEX_W - Has opcode-specific functionality, but is used in the same
+ /// way as REX_W is for regular SSE instructions.
+ VEX_WShift = OpcodeShift + 8,
+ VEX_W = 1ULL << VEX_WShift,
+
+ /// VEX_4V - Used to specify an additional AVX/SSE register. Several
+ /// two-address instructions in SSE are represented as three-address ones in
+ /// AVX, and the additional register is encoded in the VEX.VVVV prefix field.
+ VEX_4VShift = VEX_WShift + 1,
+ VEX_4V = 1ULL << VEX_4VShift,
+
+ /// VEX_L - Stands for a bit in the VEX opcode prefix meaning the current
+ /// instruction uses 256-bit wide registers. This is usually auto-detected
+ /// when a VR256 register is used, but some AVX instructions also have this
+ /// field marked when using 256-bit memory references.
+ VEX_LShift = VEX_4VShift + 1,
+ VEX_L = 1ULL << VEX_LShift,
+
+ // EVEX_K - Set if this instruction requires masking
+ EVEX_KShift = VEX_LShift + 1,
+ EVEX_K = 1ULL << EVEX_KShift,
+
+ // EVEX_Z - Set if this instruction has EVEX.Z field set.
+ EVEX_ZShift = EVEX_KShift + 1,
+ EVEX_Z = 1ULL << EVEX_ZShift,
+
+ // EVEX_L2 - Set if this instruction has EVEX.L' field set.
+ EVEX_L2Shift = EVEX_ZShift + 1,
+ EVEX_L2 = 1ULL << EVEX_L2Shift,
+
+ // EVEX_B - Set if this instruction has EVEX.B field set.
+ EVEX_BShift = EVEX_L2Shift + 1,
+ EVEX_B = 1ULL << EVEX_BShift,
+
+ // The scaling factor for AVX512's 8-bit compressed displacement.
+ CD8_Scale_Shift = EVEX_BShift + 1,
+ CD8_Scale_Mask = 127ULL << CD8_Scale_Shift,
+
+ /// Explicitly specified rounding control
+ EVEX_RCShift = CD8_Scale_Shift + 7,
+ EVEX_RC = 1ULL << EVEX_RCShift,
+
+ // NOTRACK prefix
+ NoTrackShift = EVEX_RCShift + 1,
+ NOTRACK = 1ULL << NoTrackShift,
+
+ // Force VEX encoding
+ ExplicitVEXShift = NoTrackShift + 1,
+ ExplicitVEXPrefix = 1ULL << ExplicitVEXShift
+ };
+
+ /// \returns true if the instruction described by the given TSFlags is a
+ /// prefix.
+ inline bool isPrefix(uint64_t TSFlags) {
+ return (TSFlags & X86II::FormMask) == PrefixByte;
+ }
+
+ /// \returns true if the instruction described by the given TSFlags is a
+ /// pseudo.
+ inline bool isPseudo(uint64_t TSFlags) {
+ return (TSFlags & X86II::FormMask) == Pseudo;
+ }
+
+ /// \returns the "base" X86 opcode for the specified machine
+ /// instruction.
+ inline uint8_t getBaseOpcodeFor(uint64_t TSFlags) {
+ return TSFlags >> X86II::OpcodeShift;
+ }
+
+ inline bool hasImm(uint64_t TSFlags) {
+ return (TSFlags & X86II::ImmMask) != 0;
+ }
+
+ /// Decode the "size of immediate" field from the TSFlags field of the
+ /// specified instruction.
+ inline unsigned getSizeOfImm(uint64_t TSFlags) {
+ switch (TSFlags & X86II::ImmMask) {
+ default: llvm_unreachable("Unknown immediate size");
+ case X86II::Imm8:
+ case X86II::Imm8PCRel:
+ case X86II::Imm8Reg: return 1;
+ case X86II::Imm16:
+ case X86II::Imm16PCRel: return 2;
+ case X86II::Imm32:
+ case X86II::Imm32S:
+ case X86II::Imm32PCRel: return 4;
+ case X86II::Imm64: return 8;
+ }
+ }
+
+ /// \returns true if the immediate of the specified instruction's TSFlags
+ /// indicates that it is pc relative.
+ inline bool isImmPCRel(uint64_t TSFlags) {
+ switch (TSFlags & X86II::ImmMask) {
+ default: llvm_unreachable("Unknown immediate size");
+ case X86II::Imm8PCRel:
+ case X86II::Imm16PCRel:
+ case X86II::Imm32PCRel:
+ return true;
+ case X86II::Imm8:
+ case X86II::Imm8Reg:
+ case X86II::Imm16:
+ case X86II::Imm32:
+ case X86II::Imm32S:
+ case X86II::Imm64:
+ return false;
+ }
+ }
+
+ /// \returns true if the immediate of the specified instruction's
+ /// TSFlags indicates that it is signed.
+ inline bool isImmSigned(uint64_t TSFlags) {
+ switch (TSFlags & X86II::ImmMask) {
+ default: llvm_unreachable("Unknown immediate signedness");
+ case X86II::Imm32S:
+ return true;
+ case X86II::Imm8:
+ case X86II::Imm8PCRel:
+ case X86II::Imm8Reg:
+ case X86II::Imm16:
+ case X86II::Imm16PCRel:
+ case X86II::Imm32:
+ case X86II::Imm32PCRel:
+ case X86II::Imm64:
+ return false;
+ }
+ }
+
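A minimal standalone sketch (not from the file above) of how the shifted-field layout is queried: the shift constants are re-derived exactly as in the enum, the flag word is illustrative, and the assertions spell out the masking arithmetic that getSizeOfImm() and isImmPCRel() perform for a 32-bit PC-relative immediate.

    #include <cassert>
    #include <cstdint>

    int main() {
      // Field offsets, derived the same way as in the X86II enum above.
      const unsigned OpSizeShift   = 7;                  // FormMask occupies bits 0-6
      const unsigned AdSizeShift   = OpSizeShift + 2;    // 9
      const unsigned OpPrefixShift = AdSizeShift + 2;    // 11
      const unsigned OpMapShift    = OpPrefixShift + 2;  // 13
      const unsigned REXShift      = OpMapShift + 3;     // 16
      const unsigned ImmShift      = REXShift + 1;       // 17
      const uint64_t ImmMask       = 15ULL << ImmShift;
      const uint64_t Imm32PCRel    = 7ULL << ImmShift;

      // A flag word carrying only a 32-bit PC-relative immediate field, as a
      // near call or jump encoding would.
      uint64_t TSFlags = Imm32PCRel;

      // For this value getSizeOfImm() would return 4 and isImmPCRel() true;
      // both reduce to a mask, a shift and a switch over the extracted field.
      assert((TSFlags & ImmMask) == Imm32PCRel);
      assert(((TSFlags & ImmMask) >> ImmShift) == 7);
      return 0;
    }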
+ /// Compute whether all of the def operands are repeated in the uses and
+ /// therefore should be skipped.
+ /// This determines the start of the unique operand list. We need to determine
+ /// if all of the defs have a corresponding tied operand in the uses.
+ /// Unfortunately, the tied operand information is encoded in the uses not
+ /// the defs so we have to use some heuristics to find which operands to
+ /// query.
+ inline unsigned getOperandBias(const MCInstrDesc& Desc) {
+ unsigned NumDefs = Desc.getNumDefs();
+ unsigned NumOps = Desc.getNumOperands();
+ switch (NumDefs) {
+ default: llvm_unreachable("Unexpected number of defs");
+ case 0:
+ return 0;
+ case 1:
+ // Common two addr case.
+ if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
+ return 1;
+ // Check for AVX-512 scatter which has a TIED_TO in the second to last
+ // operand.
+ if (NumOps == 8 &&
+ Desc.getOperandConstraint(6, MCOI::TIED_TO) == 0)
+ return 1;
+ return 0;
+ case 2:
+ // XCHG/XADD have two destinations and two sources.
+ if (NumOps >= 4 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+ Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1)
+ return 2;
+ // Check for gather. AVX-512 has the second tied operand early. AVX2
+ // has it as the last op.
+ if (NumOps == 9 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+ (Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1 ||
+ Desc.getOperandConstraint(8, MCOI::TIED_TO) == 1))
+ return 2;
+ return 0;
+ }
+ }
+
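To make the heuristics concrete (shapes taken from the cases above, not from verified opcode tables): a common two-address ALU instruction reports one def whose first use is tied back to it, i.e. Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0, so the bias is 1 and the unique operands start at index 1. XCHG/XADD-style instructions tie two defs to two uses and yield a bias of 2, while the gather/scatter cases are singled out only because their tied operand sits in an unusual position rather than immediately after the defs.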
+ /// The function returns the MCInst operand # for the first field of the
+ /// memory operand. If the instruction doesn't have a
+ /// memory operand, this returns -1.
+ ///
+ /// Note that this ignores tied operands. If there is a tied register which
+ /// is duplicated in the MCInst (e.g. "EAX = addl EAX, [mem]") it is only
+ /// counted as one operand.
+ ///
+ inline int getMemoryOperandNo(uint64_t TSFlags) {
+ bool HasVEX_4V = TSFlags & X86II::VEX_4V;
+ bool HasEVEX_K = TSFlags & X86II::EVEX_K;
+
+ switch (TSFlags & X86II::FormMask) {
+ default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!");
+ case X86II::Pseudo:
+ case X86II::RawFrm:
+ case X86II::AddRegFrm:
+ case X86II::RawFrmImm8:
+ case X86II::RawFrmImm16:
+ case X86II::RawFrmMemOffs:
+ case X86II::RawFrmSrc:
+ case X86II::RawFrmDst:
+ case X86II::RawFrmDstSrc:
+ case X86II::AddCCFrm:
+ case X86II::PrefixByte:
+ return -1;
+ case X86II::MRMDestMem:
+ case X86II::MRMDestMemFSIB:
+ return 0;
+ case X86II::MRMSrcMem:
+ case X86II::MRMSrcMemFSIB:
+ // Start from 1, skip any registers encoded in VEX_VVVV or I8IMM, or a
+ // mask register.
+ return 1 + HasVEX_4V + HasEVEX_K;
+ case X86II::MRMSrcMem4VOp3:
+ // Skip registers encoded in reg.
+ return 1 + HasEVEX_K;
+ case X86II::MRMSrcMemOp4:
+ // Skip registers encoded in reg, VEX_VVVV, and I8IMM.
+ return 3;
+ case X86II::MRMSrcMemCC:
+ // Start from 1, skip any registers encoded in VEX_VVVV or I8IMM, or a
+ // mask register.
+ return 1;
+ case X86II::MRMDestReg:
+ case X86II::MRMSrcReg:
+ case X86II::MRMSrcReg4VOp3:
+ case X86II::MRMSrcRegOp4:
+ case X86II::MRMSrcRegCC:
+ case X86II::MRMXrCC:
+ case X86II::MRMr0:
+ case X86II::MRMXr:
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r:
+ return -1;
+ case X86II::MRM0X: case X86II::MRM1X:
+ case X86II::MRM2X: case X86II::MRM3X:
+ case X86II::MRM4X: case X86II::MRM5X:
+ case X86II::MRM6X: case X86II::MRM7X:
+ return -1;
+ case X86II::MRMXmCC:
+ case X86II::MRMXm:
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m:
+ // Start from 0, skip registers encoded in VEX_VVVV or a mask register.
+ return 0 + HasVEX_4V + HasEVEX_K;
+ case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
+ case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5:
+ case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8:
+ case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB:
+ case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE:
+ case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1:
+ case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4:
+ case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7:
+ case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA:
+ case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD:
+ case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0:
+ case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3:
+ case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6:
+ case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9:
+ case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC:
+ case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF:
+ case X86II::MRM_F0: case X86II::MRM_F1: case X86II::MRM_F2:
+ case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5:
+ case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8:
+ case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB:
+ case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE:
+ case X86II::MRM_FF:
+ return -1;
+ }
+ }
+
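A hedged sketch of the usual caller pattern (the helper below and its name are illustrative, not code from this diff): the bias locates the first unique operand and the memory-operand number is relative to that point, and an X86 memory reference then occupies five consecutive MCOperands (base, scale, index, displacement, segment), which is the "5 memory address operands" mentioned later in this diff.

    #include "MCTargetDesc/X86BaseInfo.h"
    #include "llvm/MC/MCInstrDesc.h"

    // Illustrative helper: return the index of the first MCOperand of the
    // instruction's memory reference, or -1 if it has none.
    static int findMemoryOperandStart(const llvm::MCInstrDesc &Desc) {
      const uint64_t TSFlags = Desc.TSFlags;
      const int MemOp = llvm::X86II::getMemoryOperandNo(TSFlags);
      if (MemOp < 0)
        return -1; // this instruction form has no memory reference
      // getMemoryOperandNo() deliberately ignores tied defs, so add back the
      // bias derived from the operand constraints.
      return MemOp + static_cast<int>(llvm::X86II::getOperandBias(Desc));
    }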
+ /// \returns true if the given register is an x86-64 extended (r8 or
+ /// higher) register, e.g. r8, xmm8, xmm13, etc.
+ inline bool isX86_64ExtendedReg(unsigned RegNo) {
+ if ((RegNo >= X86::XMM8 && RegNo <= X86::XMM31) ||
+ (RegNo >= X86::YMM8 && RegNo <= X86::YMM31) ||
+ (RegNo >= X86::ZMM8 && RegNo <= X86::ZMM31))
+ return true;
+
+ switch (RegNo) {
+ default: break;
+ case X86::R8: case X86::R9: case X86::R10: case X86::R11:
+ case X86::R12: case X86::R13: case X86::R14: case X86::R15:
+ case X86::R8D: case X86::R9D: case X86::R10D: case X86::R11D:
+ case X86::R12D: case X86::R13D: case X86::R14D: case X86::R15D:
+ case X86::R8W: case X86::R9W: case X86::R10W: case X86::R11W:
+ case X86::R12W: case X86::R13W: case X86::R14W: case X86::R15W:
+ case X86::R8B: case X86::R9B: case X86::R10B: case X86::R11B:
+ case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B:
+ case X86::CR8: case X86::CR9: case X86::CR10: case X86::CR11:
+ case X86::CR12: case X86::CR13: case X86::CR14: case X86::CR15:
+ case X86::DR8: case X86::DR9: case X86::DR10: case X86::DR11:
+ case X86::DR12: case X86::DR13: case X86::DR14: case X86::DR15:
+ return true;
+ }
+ return false;
+ }
+
+ /// \returns true if the given register is an AVX-512 extended (xmm16,
+ /// ymm16, zmm16 or higher) register, e.g. zmm21.
+ static inline bool is32ExtendedReg(unsigned RegNo) {
+ return ((RegNo >= X86::XMM16 && RegNo <= X86::XMM31) ||
+ (RegNo >= X86::YMM16 && RegNo <= X86::YMM31) ||
+ (RegNo >= X86::ZMM16 && RegNo <= X86::ZMM31));
+ }
+
+
+ inline bool isX86_64NonExtLowByteReg(unsigned reg) {
+ return (reg == X86::SPL || reg == X86::BPL ||
+ reg == X86::SIL || reg == X86::DIL);
+ }
+
+ /// \returns true if this is a masked instruction.
+ inline bool isKMasked(uint64_t TSFlags) {
+ return (TSFlags & X86II::EVEX_K) != 0;
+ }
+
+ /// \returns true if this is a merge masked instruction.
+ inline bool isKMergeMasked(uint64_t TSFlags) {
+ return isKMasked(TSFlags) && (TSFlags & X86II::EVEX_Z) == 0;
+ }
+}
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
new file mode 100644
index 000000000000..fa937d381613
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -0,0 +1,345 @@
+//===-- X86ELFObjectWriter.cpp - X86 ELF Writer ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+
+namespace {
+
+class X86ELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+ X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine);
+ ~X86ELFObjectWriter() override = default;
+
+protected:
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
+};
+
+} // end anonymous namespace
+
+X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI,
+ uint16_t EMachine)
+ : MCELFObjectTargetWriter(IsELF64, OSABI, EMachine,
+ // Only i386 and IAMCU use Rel instead of RelA.
+ /*HasRelocationAddend*/
+ (EMachine != ELF::EM_386) &&
+ (EMachine != ELF::EM_IAMCU)) {}
+
+enum X86_64RelType { RT64_NONE, RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 };
+
+static X86_64RelType getType64(MCFixupKind Kind,
+ MCSymbolRefExpr::VariantKind &Modifier,
+ bool &IsPCRel) {
+ switch (unsigned(Kind)) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case FK_NONE:
+ return RT64_NONE;
+ case X86::reloc_global_offset_table8:
+ Modifier = MCSymbolRefExpr::VK_GOT;
+ IsPCRel = true;
+ return RT64_64;
+ case FK_Data_8:
+ return RT64_64;
+ case X86::reloc_signed_4byte:
+ case X86::reloc_signed_4byte_relax:
+ if (Modifier == MCSymbolRefExpr::VK_None && !IsPCRel)
+ return RT64_32S;
+ return RT64_32;
+ case X86::reloc_global_offset_table:
+ Modifier = MCSymbolRefExpr::VK_GOT;
+ IsPCRel = true;
+ return RT64_32;
+ case FK_Data_4:
+ case FK_PCRel_4:
+ case X86::reloc_riprel_4byte:
+ case X86::reloc_riprel_4byte_relax:
+ case X86::reloc_riprel_4byte_relax_rex:
+ case X86::reloc_riprel_4byte_movq_load:
+ return RT64_32;
+ case X86::reloc_branch_4byte_pcrel:
+ Modifier = MCSymbolRefExpr::VK_PLT;
+ return RT64_32;
+ case FK_PCRel_2:
+ case FK_Data_2:
+ return RT64_16;
+ case FK_PCRel_1:
+ case FK_Data_1:
+ return RT64_8;
+ }
+}
+
+static void checkIs32(MCContext &Ctx, SMLoc Loc, X86_64RelType Type) {
+ if (Type != RT64_32)
+ Ctx.reportError(Loc,
+ "32 bit reloc applied to a field with a different size");
+}
+
+static void checkIs64(MCContext &Ctx, SMLoc Loc, X86_64RelType Type) {
+ if (Type != RT64_64)
+ Ctx.reportError(Loc,
+ "64 bit reloc applied to a field with a different size");
+}
+
+static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
+ MCSymbolRefExpr::VariantKind Modifier,
+ X86_64RelType Type, bool IsPCRel,
+ MCFixupKind Kind) {
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case MCSymbolRefExpr::VK_None:
+ case MCSymbolRefExpr::VK_X86_ABS8:
+ switch (Type) {
+ case RT64_NONE:
+ if (Modifier == MCSymbolRefExpr::VK_None)
+ return ELF::R_X86_64_NONE;
+ llvm_unreachable("Unimplemented");
+ case RT64_64:
+ return IsPCRel ? ELF::R_X86_64_PC64 : ELF::R_X86_64_64;
+ case RT64_32:
+ return IsPCRel ? ELF::R_X86_64_PC32 : ELF::R_X86_64_32;
+ case RT64_32S:
+ return ELF::R_X86_64_32S;
+ case RT64_16:
+ return IsPCRel ? ELF::R_X86_64_PC16 : ELF::R_X86_64_16;
+ case RT64_8:
+ return IsPCRel ? ELF::R_X86_64_PC8 : ELF::R_X86_64_8;
+ }
+ llvm_unreachable("unexpected relocation type!");
+ case MCSymbolRefExpr::VK_GOT:
+ switch (Type) {
+ case RT64_64:
+ return IsPCRel ? ELF::R_X86_64_GOTPC64 : ELF::R_X86_64_GOT64;
+ case RT64_32:
+ return IsPCRel ? ELF::R_X86_64_GOTPC32 : ELF::R_X86_64_GOT32;
+ case RT64_32S:
+ case RT64_16:
+ case RT64_8:
+ case RT64_NONE:
+ llvm_unreachable("Unimplemented");
+ }
+ llvm_unreachable("unexpected relocation type!");
+ case MCSymbolRefExpr::VK_GOTOFF:
+ assert(Type == RT64_64);
+ assert(!IsPCRel);
+ return ELF::R_X86_64_GOTOFF64;
+ case MCSymbolRefExpr::VK_TPOFF:
+ assert(!IsPCRel);
+ switch (Type) {
+ case RT64_64:
+ return ELF::R_X86_64_TPOFF64;
+ case RT64_32:
+ return ELF::R_X86_64_TPOFF32;
+ case RT64_32S:
+ case RT64_16:
+ case RT64_8:
+ case RT64_NONE:
+ llvm_unreachable("Unimplemented");
+ }
+ llvm_unreachable("unexpected relocation type!");
+ case MCSymbolRefExpr::VK_DTPOFF:
+ assert(!IsPCRel);
+ switch (Type) {
+ case RT64_64:
+ return ELF::R_X86_64_DTPOFF64;
+ case RT64_32:
+ return ELF::R_X86_64_DTPOFF32;
+ case RT64_32S:
+ case RT64_16:
+ case RT64_8:
+ case RT64_NONE:
+ llvm_unreachable("Unimplemented");
+ }
+ llvm_unreachable("unexpected relocation type!");
+ case MCSymbolRefExpr::VK_SIZE:
+ assert(!IsPCRel);
+ switch (Type) {
+ case RT64_64:
+ return ELF::R_X86_64_SIZE64;
+ case RT64_32:
+ return ELF::R_X86_64_SIZE32;
+ case RT64_32S:
+ case RT64_16:
+ case RT64_8:
+ case RT64_NONE:
+ llvm_unreachable("Unimplemented");
+ }
+ llvm_unreachable("unexpected relocation type!");
+ case MCSymbolRefExpr::VK_TLSCALL:
+ return ELF::R_X86_64_TLSDESC_CALL;
+ case MCSymbolRefExpr::VK_TLSDESC:
+ return ELF::R_X86_64_GOTPC32_TLSDESC;
+ case MCSymbolRefExpr::VK_TLSGD:
+ checkIs32(Ctx, Loc, Type);
+ return ELF::R_X86_64_TLSGD;
+ case MCSymbolRefExpr::VK_GOTTPOFF:
+ checkIs32(Ctx, Loc, Type);
+ return ELF::R_X86_64_GOTTPOFF;
+ case MCSymbolRefExpr::VK_TLSLD:
+ checkIs32(Ctx, Loc, Type);
+ return ELF::R_X86_64_TLSLD;
+ case MCSymbolRefExpr::VK_PLT:
+ checkIs32(Ctx, Loc, Type);
+ return ELF::R_X86_64_PLT32;
+ case MCSymbolRefExpr::VK_GOTPCREL:
+ checkIs32(Ctx, Loc, Type);
+ // Older versions of ld.bfd/ld.gold/lld do not support GOTPCRELX /
+ // REX_GOTPCRELX, and we want to keep backwards compatibility.
+ if (!Ctx.getAsmInfo()->canRelaxRelocations())
+ return ELF::R_X86_64_GOTPCREL;
+ switch (unsigned(Kind)) {
+ default:
+ return ELF::R_X86_64_GOTPCREL;
+ case X86::reloc_riprel_4byte_relax:
+ return ELF::R_X86_64_GOTPCRELX;
+ case X86::reloc_riprel_4byte_relax_rex:
+ case X86::reloc_riprel_4byte_movq_load:
+ return ELF::R_X86_64_REX_GOTPCRELX;
+ }
+ llvm_unreachable("unexpected relocation type!");
+ case MCSymbolRefExpr::VK_X86_PLTOFF:
+ checkIs64(Ctx, Loc, Type);
+ return ELF::R_X86_64_PLTOFF64;
+ }
+}
+
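Tracing a few common cases through the two functions above: a plain 4-byte data fixup (FK_Data_4 with no modifier) classifies as RT64_32 in getType64() and becomes R_X86_64_32, or R_X86_64_PC32 when PC-relative; a branch fixup (X86::reloc_branch_4byte_pcrel) has its modifier rewritten to VK_PLT in getType64() and therefore selects R_X86_64_PLT32; and a GOTPCREL access only relaxes to R_X86_64_GOTPCRELX or R_X86_64_REX_GOTPCRELX when MCAsmInfo reports that relaxable relocations are acceptable, otherwise it falls back to plain R_X86_64_GOTPCREL.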
+enum X86_32RelType { RT32_NONE, RT32_32, RT32_16, RT32_8 };
+
+static X86_32RelType getType32(X86_64RelType T) {
+ switch (T) {
+ case RT64_NONE:
+ return RT32_NONE;
+ case RT64_64:
+ llvm_unreachable("Unimplemented");
+ case RT64_32:
+ case RT64_32S:
+ return RT32_32;
+ case RT64_16:
+ return RT32_16;
+ case RT64_8:
+ return RT32_8;
+ }
+ llvm_unreachable("unexpected relocation type!");
+}
+
+static unsigned getRelocType32(MCContext &Ctx,
+ MCSymbolRefExpr::VariantKind Modifier,
+ X86_32RelType Type, bool IsPCRel,
+ MCFixupKind Kind) {
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case MCSymbolRefExpr::VK_None:
+ case MCSymbolRefExpr::VK_X86_ABS8:
+ switch (Type) {
+ case RT32_NONE:
+ if (Modifier == MCSymbolRefExpr::VK_None)
+ return ELF::R_386_NONE;
+ llvm_unreachable("Unimplemented");
+ case RT32_32:
+ return IsPCRel ? ELF::R_386_PC32 : ELF::R_386_32;
+ case RT32_16:
+ return IsPCRel ? ELF::R_386_PC16 : ELF::R_386_16;
+ case RT32_8:
+ return IsPCRel ? ELF::R_386_PC8 : ELF::R_386_8;
+ }
+ llvm_unreachable("unexpected relocation type!");
+ case MCSymbolRefExpr::VK_GOT:
+ assert(Type == RT32_32);
+ if (IsPCRel)
+ return ELF::R_386_GOTPC;
+ // Older versions of ld.bfd/ld.gold/lld do not support R_386_GOT32X and we
+ // want to maintain compatibility.
+ if (!Ctx.getAsmInfo()->canRelaxRelocations())
+ return ELF::R_386_GOT32;
+
+ return Kind == MCFixupKind(X86::reloc_signed_4byte_relax)
+ ? ELF::R_386_GOT32X
+ : ELF::R_386_GOT32;
+ case MCSymbolRefExpr::VK_GOTOFF:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_GOTOFF;
+ case MCSymbolRefExpr::VK_TLSCALL:
+ return ELF::R_386_TLS_DESC_CALL;
+ case MCSymbolRefExpr::VK_TLSDESC:
+ return ELF::R_386_TLS_GOTDESC;
+ case MCSymbolRefExpr::VK_TPOFF:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_LE_32;
+ case MCSymbolRefExpr::VK_DTPOFF:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_LDO_32;
+ case MCSymbolRefExpr::VK_TLSGD:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_GD;
+ case MCSymbolRefExpr::VK_GOTTPOFF:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_IE_32;
+ case MCSymbolRefExpr::VK_PLT:
+ assert(Type == RT32_32);
+ return ELF::R_386_PLT32;
+ case MCSymbolRefExpr::VK_INDNTPOFF:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_IE;
+ case MCSymbolRefExpr::VK_NTPOFF:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_LE;
+ case MCSymbolRefExpr::VK_GOTNTPOFF:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_GOTIE;
+ case MCSymbolRefExpr::VK_TLSLDM:
+ assert(Type == RT32_32);
+ assert(!IsPCRel);
+ return ELF::R_386_TLS_LDM;
+ }
+}
+
+unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
+ MCFixupKind Kind = Fixup.getKind();
+ if (Kind >= FirstLiteralRelocationKind)
+ return Kind - FirstLiteralRelocationKind;
+ MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
+ X86_64RelType Type = getType64(Kind, Modifier, IsPCRel);
+ if (getEMachine() == ELF::EM_X86_64)
+ return getRelocType64(Ctx, Fixup.getLoc(), Modifier, Type, IsPCRel, Kind);
+
+ assert((getEMachine() == ELF::EM_386 || getEMachine() == ELF::EM_IAMCU) &&
+ "Unsupported ELF machine type.");
+ return getRelocType32(Ctx, Modifier, getType32(Type), IsPCRel, Kind);
+}
+
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createX86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine) {
+ return std::make_unique<X86ELFObjectWriter>(IsELF64, OSABI, EMachine);
+}
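The factory above is the only entry point a backend needs. A hedged sketch of how it might be instantiated follows; the wrapper function and its name are illustrative and not part of this diff, and it only reuses the ELF machine constants already referenced by getRelocType().

    #include "MCTargetDesc/X86MCTargetDesc.h"
    #include "llvm/BinaryFormat/ELF.h"
    #include "llvm/MC/MCObjectWriter.h"
    #include <cstdint>
    #include <memory>

    // Illustrative only: choose the ELF machine the same way the writer's
    // getRelocType() distinguishes them, then forward to the factory.
    static std::unique_ptr<llvm::MCObjectTargetWriter>
    makeX86ELFWriter(bool Is64Bit, uint8_t OSABI) {
      const uint16_t EMachine =
          Is64Bit ? llvm::ELF::EM_X86_64 : llvm::ELF::EM_386;
      return llvm::createX86ELFObjectWriter(/*IsELF64=*/Is64Bit, OSABI, EMachine);
    }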
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
new file mode 100644
index 000000000000..2d5217115d07
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
@@ -0,0 +1,40 @@
+//===-- X86FixupKinds.h - X86 Specific Fixup Entries ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86FIXUPKINDS_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86FIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace X86 {
+enum Fixups {
+ reloc_riprel_4byte = FirstTargetFixupKind, // 32-bit rip-relative
+ reloc_riprel_4byte_movq_load, // 32-bit rip-relative in movq
+ reloc_riprel_4byte_relax, // 32-bit rip-relative in relaxable
+ // instruction
+ reloc_riprel_4byte_relax_rex, // 32-bit rip-relative in relaxable
+ // instruction with rex prefix
+ reloc_signed_4byte, // 32-bit signed. Unlike FK_Data_4
+ // this will be sign extended at
+ // runtime.
+ reloc_signed_4byte_relax, // like reloc_signed_4byte, but
+ // in a relaxable instruction.
+ reloc_global_offset_table, // 32-bit, relative to the start
+ // of the instruction. Used only
+ // for _GLOBAL_OFFSET_TABLE_.
+ reloc_global_offset_table8, // 64-bit variant.
+ reloc_branch_4byte_pcrel, // 32-bit PC relative branch.
+ // Marker
+ LastTargetFixupKind,
+ NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+}
+}
+
+#endif
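These values start at FirstTargetFixupKind so generic MC code can tell them apart from the target-independent FK_* kinds, and the ELF object writer earlier in this diff is what eventually maps each of them onto a relocation. A small hedged sketch of attaching one (the helper is illustrative and assumes the usual MCFixup::create() factory; the cast through MCFixupKind follows the pattern used elsewhere in this diff):

    #include "MCTargetDesc/X86FixupKinds.h"
    #include "llvm/MC/MCFixup.h"
    #include <cstdint>

    // Illustrative only: record that four bytes at 'Offset' hold a
    // RIP-relative displacement computed from 'Expr'.  The ELF writer shown
    // earlier turns this fixup kind into R_X86_64_PC32, or R_X86_64_GOTPCREL
    // when the expression carries a @GOTPCREL modifier.
    static llvm::MCFixup makeRipRelFixup(uint32_t Offset,
                                         const llvm::MCExpr *Expr) {
      return llvm::MCFixup::create(
          Offset, Expr, llvm::MCFixupKind(llvm::X86::reloc_riprel_4byte));
    }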
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
new file mode 100644
index 000000000000..b51011e2c52f
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
@@ -0,0 +1,1461 @@
+//===-- X86InstComments.cpp - Generate verbose-asm comments for instrs ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This defines functionality used to emit comments about X86 instructions to
+// an output stream for -fverbose-asm.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstComments.h"
+#include "X86ATTInstPrinter.h"
+#include "X86BaseInfo.h"
+#include "X86MCTargetDesc.h"
+#include "X86ShuffleDecode.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define CASE_SSE_INS_COMMON(Inst, src) \
+ case X86::Inst##src:
+
+#define CASE_AVX_INS_COMMON(Inst, Suffix, src) \
+ case X86::V##Inst##Suffix##src:
+
+#define CASE_MASK_INS_COMMON(Inst, Suffix, src) \
+ case X86::V##Inst##Suffix##src##k:
+
+#define CASE_MASKZ_INS_COMMON(Inst, Suffix, src) \
+ case X86::V##Inst##Suffix##src##kz:
+
+#define CASE_AVX512_INS_COMMON(Inst, Suffix, src) \
+ CASE_AVX_INS_COMMON(Inst, Suffix, src) \
+ CASE_MASK_INS_COMMON(Inst, Suffix, src) \
+ CASE_MASKZ_INS_COMMON(Inst, Suffix, src)
+
+#define CASE_MOVDUP(Inst, src) \
+ CASE_AVX512_INS_COMMON(Inst, Z, r##src) \
+ CASE_AVX512_INS_COMMON(Inst, Z256, r##src) \
+ CASE_AVX512_INS_COMMON(Inst, Z128, r##src) \
+ CASE_AVX_INS_COMMON(Inst, , r##src) \
+ CASE_AVX_INS_COMMON(Inst, Y, r##src) \
+ CASE_SSE_INS_COMMON(Inst, r##src)
+
+#define CASE_MASK_MOVDUP(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z256, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z128, r##src)
+
+#define CASE_MASKZ_MOVDUP(Inst, src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z, r##src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z256, r##src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z128, r##src)
+
+#define CASE_PMOVZX(Inst, src) \
+ CASE_AVX512_INS_COMMON(Inst, Z, r##src) \
+ CASE_AVX512_INS_COMMON(Inst, Z256, r##src) \
+ CASE_AVX512_INS_COMMON(Inst, Z128, r##src) \
+ CASE_AVX_INS_COMMON(Inst, , r##src) \
+ CASE_AVX_INS_COMMON(Inst, Y, r##src) \
+ CASE_SSE_INS_COMMON(Inst, r##src)
+
+#define CASE_MASK_PMOVZX(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z256, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z128, r##src)
+
+#define CASE_MASKZ_PMOVZX(Inst, src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z, r##src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z256, r##src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z128, r##src)
+
+#define CASE_UNPCK(Inst, src) \
+ CASE_AVX512_INS_COMMON(Inst, Z, r##src) \
+ CASE_AVX512_INS_COMMON(Inst, Z256, r##src) \
+ CASE_AVX512_INS_COMMON(Inst, Z128, r##src) \
+ CASE_AVX_INS_COMMON(Inst, , r##src) \
+ CASE_AVX_INS_COMMON(Inst, Y, r##src) \
+ CASE_SSE_INS_COMMON(Inst, r##src)
+
+#define CASE_MASK_UNPCK(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z256, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z128, r##src)
+
+#define CASE_MASKZ_UNPCK(Inst, src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z, r##src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z256, r##src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z128, r##src)
+
+#define CASE_SHUF(Inst, suf) \
+ CASE_AVX512_INS_COMMON(Inst, Z, suf) \
+ CASE_AVX512_INS_COMMON(Inst, Z256, suf) \
+ CASE_AVX512_INS_COMMON(Inst, Z128, suf) \
+ CASE_AVX_INS_COMMON(Inst, , suf) \
+ CASE_AVX_INS_COMMON(Inst, Y, suf) \
+ CASE_SSE_INS_COMMON(Inst, suf)
+
+#define CASE_MASK_SHUF(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, r##src##i) \
+ CASE_MASK_INS_COMMON(Inst, Z256, r##src##i) \
+ CASE_MASK_INS_COMMON(Inst, Z128, r##src##i)
+
+#define CASE_MASKZ_SHUF(Inst, src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z, r##src##i) \
+ CASE_MASKZ_INS_COMMON(Inst, Z256, r##src##i) \
+ CASE_MASKZ_INS_COMMON(Inst, Z128, r##src##i)
+
+#define CASE_VPERMILPI(Inst, src) \
+ CASE_AVX512_INS_COMMON(Inst, Z, src##i) \
+ CASE_AVX512_INS_COMMON(Inst, Z256, src##i) \
+ CASE_AVX512_INS_COMMON(Inst, Z128, src##i) \
+ CASE_AVX_INS_COMMON(Inst, , src##i) \
+ CASE_AVX_INS_COMMON(Inst, Y, src##i)
+
+#define CASE_MASK_VPERMILPI(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, src##i) \
+ CASE_MASK_INS_COMMON(Inst, Z256, src##i) \
+ CASE_MASK_INS_COMMON(Inst, Z128, src##i)
+
+#define CASE_MASKZ_VPERMILPI(Inst, src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z, src##i) \
+ CASE_MASKZ_INS_COMMON(Inst, Z256, src##i) \
+ CASE_MASKZ_INS_COMMON(Inst, Z128, src##i)
+
+#define CASE_VPERM(Inst, src) \
+ CASE_AVX512_INS_COMMON(Inst, Z, src##i) \
+ CASE_AVX512_INS_COMMON(Inst, Z256, src##i) \
+ CASE_AVX_INS_COMMON(Inst, Y, src##i)
+
+#define CASE_MASK_VPERM(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, src##i) \
+ CASE_MASK_INS_COMMON(Inst, Z256, src##i)
+
+#define CASE_MASKZ_VPERM(Inst, src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z, src##i) \
+ CASE_MASKZ_INS_COMMON(Inst, Z256, src##i)
+
+#define CASE_VSHUF(Inst, src) \
+ CASE_AVX512_INS_COMMON(SHUFF##Inst, Z, r##src##i) \
+ CASE_AVX512_INS_COMMON(SHUFI##Inst, Z, r##src##i) \
+ CASE_AVX512_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \
+ CASE_AVX512_INS_COMMON(SHUFI##Inst, Z256, r##src##i)
+
+#define CASE_MASK_VSHUF(Inst, src) \
+ CASE_MASK_INS_COMMON(SHUFF##Inst, Z, r##src##i) \
+ CASE_MASK_INS_COMMON(SHUFI##Inst, Z, r##src##i) \
+ CASE_MASK_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \
+ CASE_MASK_INS_COMMON(SHUFI##Inst, Z256, r##src##i)
+
+#define CASE_MASKZ_VSHUF(Inst, src) \
+ CASE_MASKZ_INS_COMMON(SHUFF##Inst, Z, r##src##i) \
+ CASE_MASKZ_INS_COMMON(SHUFI##Inst, Z, r##src##i) \
+ CASE_MASKZ_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \
+ CASE_MASKZ_INS_COMMON(SHUFI##Inst, Z256, r##src##i)
+
+#define CASE_AVX512_FMA(Inst, suf) \
+ CASE_AVX512_INS_COMMON(Inst, Z, suf) \
+ CASE_AVX512_INS_COMMON(Inst, Z256, suf) \
+ CASE_AVX512_INS_COMMON(Inst, Z128, suf)
+
+#define CASE_FMA(Inst, suf) \
+ CASE_AVX512_FMA(Inst, suf) \
+ CASE_AVX_INS_COMMON(Inst, , suf) \
+ CASE_AVX_INS_COMMON(Inst, Y, suf)
+
+#define CASE_FMA_PACKED_REG(Inst) \
+ CASE_FMA(Inst##PD, r) \
+ CASE_FMA(Inst##PS, r)
+
+#define CASE_FMA_PACKED_MEM(Inst) \
+ CASE_FMA(Inst##PD, m) \
+ CASE_FMA(Inst##PS, m) \
+ CASE_AVX512_FMA(Inst##PD, mb) \
+ CASE_AVX512_FMA(Inst##PS, mb)
+
+#define CASE_FMA_SCALAR_REG(Inst) \
+ CASE_AVX_INS_COMMON(Inst##SD, , r) \
+ CASE_AVX_INS_COMMON(Inst##SS, , r) \
+ CASE_AVX_INS_COMMON(Inst##SD, , r_Int) \
+ CASE_AVX_INS_COMMON(Inst##SS, , r_Int) \
+ CASE_AVX_INS_COMMON(Inst##SD, Z, r) \
+ CASE_AVX_INS_COMMON(Inst##SS, Z, r) \
+ CASE_AVX512_INS_COMMON(Inst##SD, Z, r_Int) \
+ CASE_AVX512_INS_COMMON(Inst##SS, Z, r_Int)
+
+#define CASE_FMA_SCALAR_MEM(Inst) \
+ CASE_AVX_INS_COMMON(Inst##SD, , m) \
+ CASE_AVX_INS_COMMON(Inst##SS, , m) \
+ CASE_AVX_INS_COMMON(Inst##SD, , m_Int) \
+ CASE_AVX_INS_COMMON(Inst##SS, , m_Int) \
+ CASE_AVX_INS_COMMON(Inst##SD, Z, m) \
+ CASE_AVX_INS_COMMON(Inst##SS, Z, m) \
+ CASE_AVX512_INS_COMMON(Inst##SD, Z, m_Int) \
+ CASE_AVX512_INS_COMMON(Inst##SS, Z, m_Int)
+
+#define CASE_FMA4(Inst, suf) \
+ CASE_AVX_INS_COMMON(Inst, 4, suf) \
+ CASE_AVX_INS_COMMON(Inst, 4Y, suf)
+
+#define CASE_FMA4_PACKED_RR(Inst) \
+ CASE_FMA4(Inst##PD, rr) \
+ CASE_FMA4(Inst##PS, rr)
+
+#define CASE_FMA4_PACKED_RM(Inst) \
+ CASE_FMA4(Inst##PD, rm) \
+ CASE_FMA4(Inst##PS, rm)
+
+#define CASE_FMA4_PACKED_MR(Inst) \
+ CASE_FMA4(Inst##PD, mr) \
+ CASE_FMA4(Inst##PS, mr)
+
+#define CASE_FMA4_SCALAR_RR(Inst) \
+ CASE_AVX_INS_COMMON(Inst##SD4, , rr) \
+ CASE_AVX_INS_COMMON(Inst##SS4, , rr) \
+ CASE_AVX_INS_COMMON(Inst##SD4, , rr_Int) \
+ CASE_AVX_INS_COMMON(Inst##SS4, , rr_Int)
+
+#define CASE_FMA4_SCALAR_RM(Inst) \
+ CASE_AVX_INS_COMMON(Inst##SD4, , rm) \
+ CASE_AVX_INS_COMMON(Inst##SS4, , rm) \
+ CASE_AVX_INS_COMMON(Inst##SD4, , rm_Int) \
+ CASE_AVX_INS_COMMON(Inst##SS4, , rm_Int)
+
+#define CASE_FMA4_SCALAR_MR(Inst) \
+ CASE_AVX_INS_COMMON(Inst##SD4, , mr) \
+ CASE_AVX_INS_COMMON(Inst##SS4, , mr) \
+ CASE_AVX_INS_COMMON(Inst##SD4, , mr_Int) \
+ CASE_AVX_INS_COMMON(Inst##SS4, , mr_Int)
+
+static unsigned getVectorRegSize(unsigned RegNo) {
+ if (X86::ZMM0 <= RegNo && RegNo <= X86::ZMM31)
+ return 512;
+ if (X86::YMM0 <= RegNo && RegNo <= X86::YMM31)
+ return 256;
+ if (X86::XMM0 <= RegNo && RegNo <= X86::XMM31)
+ return 128;
+ if (X86::MM0 <= RegNo && RegNo <= X86::MM7)
+ return 64;
+
+ llvm_unreachable("Unknown vector reg!");
+}
+
+static unsigned getRegOperandNumElts(const MCInst *MI, unsigned ScalarSize,
+ unsigned OperandIndex) {
+ unsigned OpReg = MI->getOperand(OperandIndex).getReg();
+ return getVectorRegSize(OpReg) / ScalarSize;
+}
+
+static const char *getRegName(unsigned Reg) {
+ return X86ATTInstPrinter::getRegisterName(Reg);
+}
+
+/// Wraps the destination register name with AVX512 mask/maskz filtering.
+static void printMasking(raw_ostream &OS, const MCInst *MI,
+ const MCInstrInfo &MCII) {
+ const MCInstrDesc &Desc = MCII.get(MI->getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+
+ if (!(TSFlags & X86II::EVEX_K))
+ return;
+
+ bool MaskWithZero = (TSFlags & X86II::EVEX_Z);
+ unsigned MaskOp = Desc.getNumDefs();
+
+ if (Desc.getOperandConstraint(MaskOp, MCOI::TIED_TO) != -1)
+ ++MaskOp;
+
+ const char *MaskRegName = getRegName(MI->getOperand(MaskOp).getReg());
+
+ // MASK: zmmX {%kY}
+ OS << " {%" << MaskRegName << "}";
+
+ // MASKZ: zmmX {%kY} {z}
+ if (MaskWithZero)
+ OS << " {z}";
+}
+
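Concretely, for a merge-masked destination the caller first prints the register and this routine appends the mask, yielding operands of the shape "zmm0 {%k1}"; when EVEX.Z is also set, the zeroing marker is appended as well, giving "zmm0 {%k1} {z}" (register numbers are illustrative).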
+static bool printFMAComments(const MCInst *MI, raw_ostream &OS,
+ const MCInstrInfo &MCII) {
+ const char *Mul1Name = nullptr, *Mul2Name = nullptr, *AccName = nullptr;
+ unsigned NumOperands = MI->getNumOperands();
+ bool RegForm = false;
+ bool Negate = false;
+ StringRef AccStr = "+";
+
+ // The operands for FMA3 instructions without rounding fall into two forms:
+ // dest, src1, src2, src3
+ // dest, src1, mask, src2, src3
+ // Where src3 is either a register or 5 memory address operands. So to find
+ // dest and src1 we can index from the front. To find src2 and src3 we can
+ // index from the end by taking into account memory vs register form when
+ // finding src2.
+
+ // The operands for FMA4 instructions:
+ // dest, src1, src2, src3
+ // Where src2 OR src3 are either a register or 5 memory address operands. So
+ // to find dest and src1 we can index from the front, src2 (reg/mem) follows
+ // and then src3 (reg) will be at the end.
+
+ switch (MI->getOpcode()) {
+ default:
+ return false;
+
+ CASE_FMA4_PACKED_RR(FMADD)
+ CASE_FMA4_SCALAR_RR(FMADD)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_FMA4_PACKED_RM(FMADD)
+ CASE_FMA4_SCALAR_RM(FMADD)
+ Mul2Name = getRegName(MI->getOperand(2).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ break;
+ CASE_FMA4_PACKED_MR(FMADD)
+ CASE_FMA4_SCALAR_MR(FMADD)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ break;
+
+ CASE_FMA4_PACKED_RR(FMSUB)
+ CASE_FMA4_SCALAR_RR(FMSUB)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_FMA4_PACKED_RM(FMSUB)
+ CASE_FMA4_SCALAR_RM(FMSUB)
+ Mul2Name = getRegName(MI->getOperand(2).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ break;
+ CASE_FMA4_PACKED_MR(FMSUB)
+ CASE_FMA4_SCALAR_MR(FMSUB)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ break;
+
+ CASE_FMA4_PACKED_RR(FNMADD)
+ CASE_FMA4_SCALAR_RR(FNMADD)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_FMA4_PACKED_RM(FNMADD)
+ CASE_FMA4_SCALAR_RM(FNMADD)
+ Mul2Name = getRegName(MI->getOperand(2).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ Negate = true;
+ break;
+ CASE_FMA4_PACKED_MR(FNMADD)
+ CASE_FMA4_SCALAR_MR(FNMADD)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ Negate = true;
+ break;
+
+ CASE_FMA4_PACKED_RR(FNMSUB)
+ CASE_FMA4_SCALAR_RR(FNMSUB)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_FMA4_PACKED_RM(FNMSUB)
+ CASE_FMA4_SCALAR_RM(FNMSUB)
+ Mul2Name = getRegName(MI->getOperand(2).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ Negate = true;
+ break;
+ CASE_FMA4_PACKED_MR(FNMSUB)
+ CASE_FMA4_SCALAR_MR(FNMSUB)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ Negate = true;
+ break;
+
+ CASE_FMA4_PACKED_RR(FMADDSUB)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_FMA4_PACKED_RM(FMADDSUB)
+ Mul2Name = getRegName(MI->getOperand(2).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "+/-";
+ break;
+ CASE_FMA4_PACKED_MR(FMADDSUB)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "+/-";
+ break;
+
+ CASE_FMA4_PACKED_RR(FMSUBADD)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_FMA4_PACKED_RM(FMSUBADD)
+ Mul2Name = getRegName(MI->getOperand(2).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-/+";
+ break;
+ CASE_FMA4_PACKED_MR(FMSUBADD)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-/+";
+ break;
+
+ CASE_FMA_PACKED_REG(FMADD132)
+ CASE_FMA_SCALAR_REG(FMADD132)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMADD132)
+ CASE_FMA_SCALAR_MEM(FMADD132)
+ AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ break;
+
+ CASE_FMA_PACKED_REG(FMADD213)
+ CASE_FMA_SCALAR_REG(FMADD213)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMADD213)
+ CASE_FMA_SCALAR_MEM(FMADD213)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul2Name = getRegName(MI->getOperand(1).getReg());
+ break;
+
+ CASE_FMA_PACKED_REG(FMADD231)
+ CASE_FMA_SCALAR_REG(FMADD231)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMADD231)
+ CASE_FMA_SCALAR_MEM(FMADD231)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ AccName = getRegName(MI->getOperand(1).getReg());
+ break;
+
+ CASE_FMA_PACKED_REG(FMSUB132)
+ CASE_FMA_SCALAR_REG(FMSUB132)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMSUB132)
+ CASE_FMA_SCALAR_MEM(FMSUB132)
+ AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ break;
+
+ CASE_FMA_PACKED_REG(FMSUB213)
+ CASE_FMA_SCALAR_REG(FMSUB213)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMSUB213)
+ CASE_FMA_SCALAR_MEM(FMSUB213)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul2Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ break;
+
+ CASE_FMA_PACKED_REG(FMSUB231)
+ CASE_FMA_SCALAR_REG(FMSUB231)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMSUB231)
+ CASE_FMA_SCALAR_MEM(FMSUB231)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ AccName = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ break;
+
+ CASE_FMA_PACKED_REG(FNMADD132)
+ CASE_FMA_SCALAR_REG(FNMADD132)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FNMADD132)
+ CASE_FMA_SCALAR_MEM(FNMADD132)
+ AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ Negate = true;
+ break;
+
+ CASE_FMA_PACKED_REG(FNMADD213)
+ CASE_FMA_SCALAR_REG(FNMADD213)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FNMADD213)
+ CASE_FMA_SCALAR_MEM(FNMADD213)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul2Name = getRegName(MI->getOperand(1).getReg());
+ Negate = true;
+ break;
+
+ CASE_FMA_PACKED_REG(FNMADD231)
+ CASE_FMA_SCALAR_REG(FNMADD231)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FNMADD231)
+ CASE_FMA_SCALAR_MEM(FNMADD231)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ AccName = getRegName(MI->getOperand(1).getReg());
+ Negate = true;
+ break;
+
+ CASE_FMA_PACKED_REG(FNMSUB132)
+ CASE_FMA_SCALAR_REG(FNMSUB132)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FNMSUB132)
+ CASE_FMA_SCALAR_MEM(FNMSUB132)
+ AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ Negate = true;
+ break;
+
+ CASE_FMA_PACKED_REG(FNMSUB213)
+ CASE_FMA_SCALAR_REG(FNMSUB213)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FNMSUB213)
+ CASE_FMA_SCALAR_MEM(FNMSUB213)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul2Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ Negate = true;
+ break;
+
+ CASE_FMA_PACKED_REG(FNMSUB231)
+ CASE_FMA_SCALAR_REG(FNMSUB231)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FNMSUB231)
+ CASE_FMA_SCALAR_MEM(FNMSUB231)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ AccName = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ Negate = true;
+ break;
+
+ CASE_FMA_PACKED_REG(FMADDSUB132)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMADDSUB132)
+ AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "+/-";
+ break;
+
+ CASE_FMA_PACKED_REG(FMADDSUB213)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMADDSUB213)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul2Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "+/-";
+ break;
+
+ CASE_FMA_PACKED_REG(FMADDSUB231)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMADDSUB231)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ AccName = getRegName(MI->getOperand(1).getReg());
+ AccStr = "+/-";
+ break;
+
+ CASE_FMA_PACKED_REG(FMSUBADD132)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMSUBADD132)
+ AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-/+";
+ break;
+
+ CASE_FMA_PACKED_REG(FMSUBADD213)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMSUBADD213)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ Mul2Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-/+";
+ break;
+
+ CASE_FMA_PACKED_REG(FMSUBADD231)
+ Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+ CASE_FMA_PACKED_MEM(FMSUBADD231)
+ Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ AccName = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-/+";
+ break;
+ }
+
+ const char *DestName = getRegName(MI->getOperand(0).getReg());
+
+ if (!Mul1Name) Mul1Name = "mem";
+ if (!Mul2Name) Mul2Name = "mem";
+ if (!AccName) AccName = "mem";
+
+ OS << DestName;
+ printMasking(OS, MI, MCII);
+ OS << " = ";
+
+ if (Negate)
+ OS << '-';
+
+ OS << '(' << Mul1Name << " * " << Mul2Name << ") " << AccStr << ' '
+ << AccName;
+
+ return true;
+}
+
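Putting the pieces together (register numbers are illustrative, derived only from the printing logic above): a register-form 231 multiply-accumulate such as a masked VFMADD231PS with destination zmm0 and sources zmm1/zmm2 yields a comment of the shape "zmm0 {%k1} = (zmm1 * zmm2) + zmm0", while the FNMSUB variants negate both the product and the accumulator, producing a comment of the form "-(a * b) - c".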
+
+//===----------------------------------------------------------------------===//
+// Top Level Entrypoint
+//===----------------------------------------------------------------------===//
+
+/// EmitAnyX86InstComments - This function decodes x86 instructions and prints
+/// newline-terminated strings to the specified stream if desired. This
+/// information is shown in disassembly dumps when verbose assembly is enabled.
+bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
+ const MCInstrInfo &MCII) {
+ // If this is a shuffle operation, the switch should fill in this state.
+ SmallVector<int, 8> ShuffleMask;
+ const char *DestName = nullptr, *Src1Name = nullptr, *Src2Name = nullptr;
+ unsigned NumOperands = MI->getNumOperands();
+ bool RegForm = false;
+
+ if (printFMAComments(MI, OS, MCII))
+ return true;
+
+ switch (MI->getOpcode()) {
+ default:
+ // Not an instruction for which we can decode comments.
+ return false;
+
+ case X86::BLENDPDrri:
+ case X86::VBLENDPDrri:
+ case X86::VBLENDPDYrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ LLVM_FALLTHROUGH;
+ case X86::BLENDPDrmi:
+ case X86::VBLENDPDrmi:
+ case X86::VBLENDPDYrmi:
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodeBLENDMask(getRegOperandNumElts(MI, 64, 0),
+ MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::BLENDPSrri:
+ case X86::VBLENDPSrri:
+ case X86::VBLENDPSYrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ LLVM_FALLTHROUGH;
+ case X86::BLENDPSrmi:
+ case X86::VBLENDPSrmi:
+ case X86::VBLENDPSYrmi:
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodeBLENDMask(getRegOperandNumElts(MI, 32, 0),
+ MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::PBLENDWrri:
+ case X86::VPBLENDWrri:
+ case X86::VPBLENDWYrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ LLVM_FALLTHROUGH;
+ case X86::PBLENDWrmi:
+ case X86::VPBLENDWrmi:
+ case X86::VPBLENDWYrmi:
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodeBLENDMask(getRegOperandNumElts(MI, 16, 0),
+ MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::VPBLENDDrri:
+ case X86::VPBLENDDYrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ LLVM_FALLTHROUGH;
+ case X86::VPBLENDDrmi:
+ case X86::VPBLENDDYrmi:
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodeBLENDMask(getRegOperandNumElts(MI, 32, 0),
+ MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::INSERTPSrr:
+ case X86::VINSERTPSrr:
+ case X86::VINSERTPSZrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ LLVM_FALLTHROUGH;
+ case X86::INSERTPSrm:
+ case X86::VINSERTPSrm:
+ case X86::VINSERTPSZrm:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodeINSERTPSMask(MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ break;
+
+ case X86::MOVLHPSrr:
+ case X86::VMOVLHPSrr:
+ case X86::VMOVLHPSZrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeMOVLHPSMask(2, ShuffleMask);
+ break;
+
+ case X86::MOVHLPSrr:
+ case X86::VMOVHLPSrr:
+ case X86::VMOVHLPSZrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeMOVHLPSMask(2, ShuffleMask);
+ break;
+
+ case X86::MOVHPDrm:
+ case X86::VMOVHPDrm:
+ case X86::VMOVHPDZ128rm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeInsertElementMask(2, 1, 1, ShuffleMask);
+ break;
+
+ case X86::MOVHPSrm:
+ case X86::VMOVHPSrm:
+ case X86::VMOVHPSZ128rm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeInsertElementMask(4, 2, 2, ShuffleMask);
+ break;
+
+ case X86::MOVLPDrm:
+ case X86::VMOVLPDrm:
+ case X86::VMOVLPDZ128rm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeInsertElementMask(2, 0, 1, ShuffleMask);
+ break;
+
+ case X86::MOVLPSrm:
+ case X86::VMOVLPSrm:
+ case X86::VMOVLPSZ128rm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeInsertElementMask(4, 0, 2, ShuffleMask);
+ break;
+
+ CASE_MOVDUP(MOVSLDUP, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+
+ CASE_MOVDUP(MOVSLDUP, m)
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeMOVSLDUPMask(getRegOperandNumElts(MI, 32, 0), ShuffleMask);
+ break;
+
+ CASE_MOVDUP(MOVSHDUP, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+
+ CASE_MOVDUP(MOVSHDUP, m)
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeMOVSHDUPMask(getRegOperandNumElts(MI, 32, 0), ShuffleMask);
+ break;
+
+ CASE_MOVDUP(MOVDDUP, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+
+ CASE_MOVDUP(MOVDDUP, m)
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeMOVDDUPMask(getRegOperandNumElts(MI, 64, 0), ShuffleMask);
+ break;
+
+ case X86::PSLLDQri:
+ case X86::VPSLLDQri:
+ case X86::VPSLLDQYri:
+ case X86::VPSLLDQZ128ri:
+ case X86::VPSLLDQZ256ri:
+ case X86::VPSLLDQZri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ LLVM_FALLTHROUGH;
+ case X86::VPSLLDQZ128mi:
+ case X86::VPSLLDQZ256mi:
+ case X86::VPSLLDQZmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodePSLLDQMask(getRegOperandNumElts(MI, 8, 0),
+ MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ break;
+
+ case X86::PSRLDQri:
+ case X86::VPSRLDQri:
+ case X86::VPSRLDQYri:
+ case X86::VPSRLDQZ128ri:
+ case X86::VPSRLDQZ256ri:
+ case X86::VPSRLDQZri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ LLVM_FALLTHROUGH;
+ case X86::VPSRLDQZ128mi:
+ case X86::VPSRLDQZ256mi:
+ case X86::VPSRLDQZmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodePSRLDQMask(getRegOperandNumElts(MI, 8, 0),
+ MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ break;
+
+ CASE_SHUF(PALIGNR, rri)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+
+ CASE_SHUF(PALIGNR, rmi)
+ Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodePALIGNRMask(getRegOperandNumElts(MI, 8, 0),
+ MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ break;
+
+ CASE_AVX512_INS_COMMON(ALIGNQ, Z, rri)
+ CASE_AVX512_INS_COMMON(ALIGNQ, Z256, rri)
+ CASE_AVX512_INS_COMMON(ALIGNQ, Z128, rri)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+
+ CASE_AVX512_INS_COMMON(ALIGNQ, Z, rmi)
+ CASE_AVX512_INS_COMMON(ALIGNQ, Z256, rmi)
+ CASE_AVX512_INS_COMMON(ALIGNQ, Z128, rmi)
+ Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodeVALIGNMask(getRegOperandNumElts(MI, 64, 0),
+ MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ break;
+
+ CASE_AVX512_INS_COMMON(ALIGND, Z, rri)
+ CASE_AVX512_INS_COMMON(ALIGND, Z256, rri)
+ CASE_AVX512_INS_COMMON(ALIGND, Z128, rri)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+
+ CASE_AVX512_INS_COMMON(ALIGND, Z, rmi)
+ CASE_AVX512_INS_COMMON(ALIGND, Z256, rmi)
+ CASE_AVX512_INS_COMMON(ALIGND, Z128, rmi)
+ Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodeVALIGNMask(getRegOperandNumElts(MI, 32, 0),
+ MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ break;
+
+ CASE_SHUF(PSHUFD, ri)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+ LLVM_FALLTHROUGH;
+
+ CASE_SHUF(PSHUFD, mi)
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodePSHUFMask(getRegOperandNumElts(MI, 32, 0), 32,
+ MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ break;
+
+ CASE_SHUF(PSHUFHW, ri)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+ LLVM_FALLTHROUGH;
+
+ CASE_SHUF(PSHUFHW, mi)
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodePSHUFHWMask(getRegOperandNumElts(MI, 16, 0),
+ MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ break;
+
+ CASE_SHUF(PSHUFLW, ri)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+ LLVM_FALLTHROUGH;
+
+ CASE_SHUF(PSHUFLW, mi)
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodePSHUFLWMask(getRegOperandNumElts(MI, 16, 0),
+ MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ break;
+
+ case X86::MMX_PSHUFWri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ LLVM_FALLTHROUGH;
+
+ case X86::MMX_PSHUFWmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodePSHUFMask(4, 16, MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ break;
+
+ case X86::PSWAPDrr:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ LLVM_FALLTHROUGH;
+
+ case X86::PSWAPDrm:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodePSWAPMask(2, ShuffleMask);
+ break;
+
+ CASE_UNPCK(PUNPCKHBW, r)
+ case X86::MMX_PUNPCKHBWirr:
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+
+ CASE_UNPCK(PUNPCKHBW, m)
+ case X86::MMX_PUNPCKHBWirm:
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(getRegOperandNumElts(MI, 8, 0), 8, ShuffleMask);
+ break;
+
+ CASE_UNPCK(PUNPCKHWD, r)
+ case X86::MMX_PUNPCKHWDirr:
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+
+ CASE_UNPCK(PUNPCKHWD, m)
+ case X86::MMX_PUNPCKHWDirm:
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(getRegOperandNumElts(MI, 16, 0), 16, ShuffleMask);
+ break;
+
+ CASE_UNPCK(PUNPCKHDQ, r)
+ case X86::MMX_PUNPCKHDQirr:
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+
+ CASE_UNPCK(PUNPCKHDQ, m)
+ case X86::MMX_PUNPCKHDQirm:
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask);
+ break;
+
+ CASE_UNPCK(PUNPCKHQDQ, r)
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+
+ CASE_UNPCK(PUNPCKHQDQ, m)
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(getRegOperandNumElts(MI, 64, 0), 64, ShuffleMask);
+ break;
+
+ CASE_UNPCK(PUNPCKLBW, r)
+ case X86::MMX_PUNPCKLBWirr:
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+
+ CASE_UNPCK(PUNPCKLBW, m)
+ case X86::MMX_PUNPCKLBWirm:
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(getRegOperandNumElts(MI, 8, 0), 8, ShuffleMask);
+ break;
+
+ CASE_UNPCK(PUNPCKLWD, r)
+ case X86::MMX_PUNPCKLWDirr:
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+
+ CASE_UNPCK(PUNPCKLWD, m)
+ case X86::MMX_PUNPCKLWDirm:
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(getRegOperandNumElts(MI, 16, 0), 16, ShuffleMask);
+ break;
+
+ CASE_UNPCK(PUNPCKLDQ, r)
+ case X86::MMX_PUNPCKLDQirr:
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+
+ CASE_UNPCK(PUNPCKLDQ, m)
+ case X86::MMX_PUNPCKLDQirm:
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask);
+ break;
+
+ CASE_UNPCK(PUNPCKLQDQ, r)
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+
+ CASE_UNPCK(PUNPCKLQDQ, m)
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(getRegOperandNumElts(MI, 64, 0), 64, ShuffleMask);
+ break;
+
+ CASE_SHUF(SHUFPD, rri)
+ Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+
+ CASE_SHUF(SHUFPD, rmi)
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodeSHUFPMask(getRegOperandNumElts(MI, 64, 0), 64,
+ MI->getOperand(NumOperands - 1).getImm(), ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_SHUF(SHUFPS, rri)
+ Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+
+ CASE_SHUF(SHUFPS, rmi)
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodeSHUFPMask(getRegOperandNumElts(MI, 32, 0), 32,
+ MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_VSHUF(64X2, r)
+ Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+
+ CASE_VSHUF(64X2, m)
+ decodeVSHUF64x2FamilyMask(getRegOperandNumElts(MI, 64, 0), 64,
+ MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_VSHUF(32X4, r)
+ Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+
+ CASE_VSHUF(32X4, m)
+ decodeVSHUF64x2FamilyMask(getRegOperandNumElts(MI, 32, 0), 32,
+ MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_UNPCK(UNPCKLPD, r)
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+
+ CASE_UNPCK(UNPCKLPD, m)
+ DecodeUNPCKLMask(getRegOperandNumElts(MI, 64, 0), 64, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_UNPCK(UNPCKLPS, r)
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+
+ CASE_UNPCK(UNPCKLPS, m)
+ DecodeUNPCKLMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_UNPCK(UNPCKHPD, r)
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+
+ CASE_UNPCK(UNPCKHPD, m)
+ DecodeUNPCKHMask(getRegOperandNumElts(MI, 64, 0), 64, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_UNPCK(UNPCKHPS, r)
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+
+ CASE_UNPCK(UNPCKHPS, m)
+ DecodeUNPCKHMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_VPERMILPI(PERMILPS, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+ LLVM_FALLTHROUGH;
+
+ CASE_VPERMILPI(PERMILPS, m)
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodePSHUFMask(getRegOperandNumElts(MI, 32, 0), 32,
+ MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_VPERMILPI(PERMILPD, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+ LLVM_FALLTHROUGH;
+
+ CASE_VPERMILPI(PERMILPD, m)
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodePSHUFMask(getRegOperandNumElts(MI, 64, 0), 64,
+ MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::VPERM2F128rr:
+ case X86::VPERM2I128rr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ LLVM_FALLTHROUGH;
+
+ case X86::VPERM2F128rm:
+ case X86::VPERM2I128rm:
+    // For instruction comment purposes, assume the 256-bit vector is v4i64.
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodeVPERM2X128Mask(4, MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_VPERM(PERMPD, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+ LLVM_FALLTHROUGH;
+
+ CASE_VPERM(PERMPD, m)
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodeVPERMMask(getRegOperandNumElts(MI, 64, 0),
+ MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_VPERM(PERMQ, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+ LLVM_FALLTHROUGH;
+
+ CASE_VPERM(PERMQ, m)
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodeVPERMMask(getRegOperandNumElts(MI, 64, 0),
+ MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::MOVSDrr:
+ case X86::VMOVSDrr:
+ case X86::VMOVSDZrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ LLVM_FALLTHROUGH;
+
+ case X86::MOVSDrm_alt:
+ case X86::MOVSDrm:
+ case X86::VMOVSDrm_alt:
+ case X86::VMOVSDrm:
+ case X86::VMOVSDZrm:
+ case X86::VMOVSDZrm_alt:
+ DecodeScalarMoveMask(2, nullptr == Src2Name, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::MOVSSrr:
+ case X86::VMOVSSrr:
+ case X86::VMOVSSZrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ LLVM_FALLTHROUGH;
+
+ case X86::MOVSSrm:
+ case X86::MOVSSrm_alt:
+ case X86::VMOVSSrm:
+ case X86::VMOVSSrm_alt:
+ case X86::VMOVSSZrm:
+ case X86::VMOVSSZrm_alt:
+ DecodeScalarMoveMask(4, nullptr == Src2Name, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::MOVPQI2QIrr:
+ case X86::MOVZPQILo2PQIrr:
+ case X86::VMOVPQI2QIrr:
+ case X86::VMOVPQI2QIZrr:
+ case X86::VMOVZPQILo2PQIrr:
+ case X86::VMOVZPQILo2PQIZrr:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ LLVM_FALLTHROUGH;
+
+ case X86::MOVQI2PQIrm:
+ case X86::VMOVQI2PQIrm:
+ case X86::VMOVQI2PQIZrm:
+ DecodeZeroMoveLowMask(2, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::MOVDI2PDIrm:
+ case X86::VMOVDI2PDIrm:
+ case X86::VMOVDI2PDIZrm:
+ DecodeZeroMoveLowMask(4, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::EXTRQI:
+ if (MI->getOperand(2).isImm() &&
+ MI->getOperand(3).isImm())
+ DecodeEXTRQIMask(16, 8, MI->getOperand(2).getImm(),
+ MI->getOperand(3).getImm(), ShuffleMask);
+
+ DestName = getRegName(MI->getOperand(0).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ break;
+
+ case X86::INSERTQI:
+ if (MI->getOperand(3).isImm() &&
+ MI->getOperand(4).isImm())
+ DecodeINSERTQIMask(16, 8, MI->getOperand(3).getImm(),
+ MI->getOperand(4).getImm(), ShuffleMask);
+
+ DestName = getRegName(MI->getOperand(0).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ break;
+
+ case X86::VBROADCASTF128:
+ case X86::VBROADCASTI128:
+ CASE_AVX512_INS_COMMON(BROADCASTF64X2, Z128, rm)
+ CASE_AVX512_INS_COMMON(BROADCASTI64X2, Z128, rm)
+ DecodeSubVectorBroadcast(4, 2, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ CASE_AVX512_INS_COMMON(BROADCASTF64X2, , rm)
+ CASE_AVX512_INS_COMMON(BROADCASTI64X2, , rm)
+ DecodeSubVectorBroadcast(8, 2, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ CASE_AVX512_INS_COMMON(BROADCASTF64X4, , rm)
+ CASE_AVX512_INS_COMMON(BROADCASTI64X4, , rm)
+ DecodeSubVectorBroadcast(8, 4, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ CASE_AVX512_INS_COMMON(BROADCASTF32X4, Z256, rm)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X4, Z256, rm)
+ DecodeSubVectorBroadcast(8, 4, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ CASE_AVX512_INS_COMMON(BROADCASTF32X4, , rm)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X4, , rm)
+ DecodeSubVectorBroadcast(16, 4, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ CASE_AVX512_INS_COMMON(BROADCASTF32X8, , rm)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X8, , rm)
+ DecodeSubVectorBroadcast(16, 8, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, rr)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, rm)
+ DecodeSubVectorBroadcast(4, 2, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, rr)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, rr)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, rm)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, rm)
+ DecodeSubVectorBroadcast(8, 2, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, rr)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, rr)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, rm)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, rm)
+ DecodeSubVectorBroadcast(16, 2, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_PMOVZX(PMOVZXBW, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_PMOVZX(PMOVZXBW, m)
+ DecodeZeroExtendMask(8, 16, getRegOperandNumElts(MI, 16, 0), false,
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_PMOVZX(PMOVZXBD, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_PMOVZX(PMOVZXBD, m)
+ DecodeZeroExtendMask(8, 32, getRegOperandNumElts(MI, 32, 0), false,
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_PMOVZX(PMOVZXBQ, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_PMOVZX(PMOVZXBQ, m)
+ DecodeZeroExtendMask(8, 64, getRegOperandNumElts(MI, 64, 0), false,
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_PMOVZX(PMOVZXWD, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_PMOVZX(PMOVZXWD, m)
+ DecodeZeroExtendMask(16, 32, getRegOperandNumElts(MI, 32, 0), false,
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_PMOVZX(PMOVZXWQ, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_PMOVZX(PMOVZXWQ, m)
+ DecodeZeroExtendMask(16, 64, getRegOperandNumElts(MI, 64, 0), false,
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_PMOVZX(PMOVZXDQ, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_PMOVZX(PMOVZXDQ, m)
+ DecodeZeroExtendMask(32, 64, getRegOperandNumElts(MI, 64, 0), false,
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ }
+
+ // The only comments we decode are shuffles, so give up if we were unable to
+ // decode a shuffle mask.
+ if (ShuffleMask.empty())
+ return false;
+
+ if (!DestName) DestName = Src1Name;
+ if (DestName) {
+ OS << DestName;
+ printMasking(OS, MI, MCII);
+ } else
+ OS << "mem";
+
+ OS << " = ";
+
+ // If the two sources are the same, canonicalize the input elements to be
+ // from the first src so that we get larger element spans.
+ if (Src1Name == Src2Name) {
+ for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) {
+ if ((int)ShuffleMask[i] >= 0 && // Not sentinel.
+ ShuffleMask[i] >= (int)e) // From second mask.
+ ShuffleMask[i] -= e;
+ }
+ }
+
+ // The shuffle mask specifies which elements of the src1/src2 fill in the
+ // destination, with a few sentinel values. Loop through and print them
+ // out.
+ for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) {
+ if (i != 0)
+ OS << ',';
+ if (ShuffleMask[i] == SM_SentinelZero) {
+ OS << "zero";
+ continue;
+ }
+
+ // Otherwise, it must come from src1 or src2. Print the span of elements
+ // that comes from this src.
+ bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size();
+ const char *SrcName = isSrc1 ? Src1Name : Src2Name;
+ OS << (SrcName ? SrcName : "mem") << '[';
+ bool IsFirst = true;
+ while (i != e && (int)ShuffleMask[i] != SM_SentinelZero &&
+ (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) {
+ if (!IsFirst)
+ OS << ',';
+ else
+ IsFirst = false;
+ if (ShuffleMask[i] == SM_SentinelUndef)
+ OS << "u";
+ else
+ OS << ShuffleMask[i] % ShuffleMask.size();
+ ++i;
+ }
+ OS << ']';
+ --i; // For loop increments element #.
+ }
+ OS << '\n';
+
+ // We successfully added a comment to this instruction.
+ return true;
+}
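
The span-grouping loop above is the heart of the comment output: mask indices below the element count select from the first source, larger indices select from the second, and the zero/undef sentinels print as "zero" and "u". As a rough illustration only, the following standalone C++ sketch mirrors that rendering with plain standard-library types; the sentinel constants and the renderMask helper are hypothetical and not LLVM API.

#include <iostream>
#include <string>
#include <vector>

// Hypothetical sentinel values mirroring SM_SentinelUndef / SM_SentinelZero.
constexpr int kUndef = -1;
constexpr int kZero = -2;

// Render a decoded shuffle mask the way the comment loop above does: indices
// below Mask.size() select from Src1, larger indices select from Src2, and
// contiguous runs from the same source collapse into one "src[...]" group.
std::string renderMask(const std::vector<int> &Mask, const std::string &Src1,
                       const std::string &Src2) {
  std::string Out;
  const int E = (int)Mask.size();
  for (int I = 0; I != E; ++I) {
    if (I != 0)
      Out += ',';
    if (Mask[I] == kZero) {
      Out += "zero";
      continue;
    }
    const bool IsSrc1 = Mask[I] < E; // kUndef (-1) is grouped with Src1 runs.
    Out += (IsSrc1 ? Src1 : Src2) + "[";
    bool First = true;
    while (I != E && Mask[I] != kZero && (Mask[I] < E) == IsSrc1) {
      if (!First)
        Out += ',';
      First = false;
      Out += (Mask[I] == kUndef) ? std::string("u")
                                 : std::to_string(Mask[I] % E);
      ++I;
    }
    Out += ']';
    --I; // The for loop advances to the element after the run.
  }
  return Out;
}

int main() {
  // shufps $0x1b, %xmm1, %xmm0 decodes to the combined indices {3,2,5,4}:
  std::cout << "xmm0 = " << renderMask({3, 2, 5, 4}, "xmm0", "xmm1") << '\n';
  // Prints: xmm0 = xmm0[3,2],xmm1[1,0]
}
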
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.h
new file mode 100644
index 000000000000..96760664012a
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.h
@@ -0,0 +1,26 @@
+//=- X86InstComments.h - Generate verbose-asm comments for instrs -*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This defines functionality used to emit comments about X86 instructions to
+// an output stream for -fverbose-asm.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTCOMMENTS_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTCOMMENTS_H
+
+namespace llvm {
+
+ class MCInst;
+ class MCInstrInfo;
+ class raw_ostream;
+ bool EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
+ const MCInstrInfo &MCII);
+}
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
new file mode 100644
index 000000000000..d8dbbbbf2779
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -0,0 +1,389 @@
+//===--- X86InstPrinterCommon.cpp - X86 assembly instruction printing -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes common code for rendering MCInst instances as AT&T-style
+// and Intel-style assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstPrinterCommon.h"
+#include "X86BaseInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Casting.h"
+#include <cstdint>
+#include <cassert>
+
+using namespace llvm;
+
+void X86InstPrinterCommon::printCondCode(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm();
+ switch (Imm) {
+ default: llvm_unreachable("Invalid condcode argument!");
+ case 0: O << "o"; break;
+ case 1: O << "no"; break;
+ case 2: O << "b"; break;
+ case 3: O << "ae"; break;
+ case 4: O << "e"; break;
+ case 5: O << "ne"; break;
+ case 6: O << "be"; break;
+ case 7: O << "a"; break;
+ case 8: O << "s"; break;
+ case 9: O << "ns"; break;
+ case 0xa: O << "p"; break;
+ case 0xb: O << "np"; break;
+ case 0xc: O << "l"; break;
+ case 0xd: O << "ge"; break;
+ case 0xe: O << "le"; break;
+ case 0xf: O << "g"; break;
+ }
+}
+
+void X86InstPrinterCommon::printSSEAVXCC(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm();
+ switch (Imm) {
+ default: llvm_unreachable("Invalid ssecc/avxcc argument!");
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ case 8: O << "eq_uq"; break;
+ case 9: O << "nge"; break;
+ case 0xa: O << "ngt"; break;
+ case 0xb: O << "false"; break;
+ case 0xc: O << "neq_oq"; break;
+ case 0xd: O << "ge"; break;
+ case 0xe: O << "gt"; break;
+ case 0xf: O << "true"; break;
+ case 0x10: O << "eq_os"; break;
+ case 0x11: O << "lt_oq"; break;
+ case 0x12: O << "le_oq"; break;
+ case 0x13: O << "unord_s"; break;
+ case 0x14: O << "neq_us"; break;
+ case 0x15: O << "nlt_uq"; break;
+ case 0x16: O << "nle_uq"; break;
+ case 0x17: O << "ord_s"; break;
+ case 0x18: O << "eq_us"; break;
+ case 0x19: O << "nge_uq"; break;
+ case 0x1a: O << "ngt_uq"; break;
+ case 0x1b: O << "false_os"; break;
+ case 0x1c: O << "neq_os"; break;
+ case 0x1d: O << "ge_oq"; break;
+ case 0x1e: O << "gt_oq"; break;
+ case 0x1f: O << "true_us"; break;
+ }
+}
+
+void X86InstPrinterCommon::printVPCOMMnemonic(const MCInst *MI,
+ raw_ostream &OS) {
+ OS << "vpcom";
+
+ int64_t Imm = MI->getOperand(MI->getNumOperands() - 1).getImm();
+ switch (Imm) {
+ default: llvm_unreachable("Invalid vpcom argument!");
+ case 0: OS << "lt"; break;
+ case 1: OS << "le"; break;
+ case 2: OS << "gt"; break;
+ case 3: OS << "ge"; break;
+ case 4: OS << "eq"; break;
+ case 5: OS << "neq"; break;
+ case 6: OS << "false"; break;
+ case 7: OS << "true"; break;
+ }
+
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case X86::VPCOMBmi: case X86::VPCOMBri: OS << "b\t"; break;
+ case X86::VPCOMDmi: case X86::VPCOMDri: OS << "d\t"; break;
+ case X86::VPCOMQmi: case X86::VPCOMQri: OS << "q\t"; break;
+ case X86::VPCOMUBmi: case X86::VPCOMUBri: OS << "ub\t"; break;
+ case X86::VPCOMUDmi: case X86::VPCOMUDri: OS << "ud\t"; break;
+ case X86::VPCOMUQmi: case X86::VPCOMUQri: OS << "uq\t"; break;
+ case X86::VPCOMUWmi: case X86::VPCOMUWri: OS << "uw\t"; break;
+ case X86::VPCOMWmi: case X86::VPCOMWri: OS << "w\t"; break;
+ }
+}
+
+void X86InstPrinterCommon::printVPCMPMnemonic(const MCInst *MI,
+ raw_ostream &OS) {
+ OS << "vpcmp";
+
+ printSSEAVXCC(MI, MI->getNumOperands() - 1, OS);
+
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rri:
+ case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rri:
+ case X86::VPCMPBZrmi: case X86::VPCMPBZrri:
+ case X86::VPCMPBZ128rmik: case X86::VPCMPBZ128rrik:
+ case X86::VPCMPBZ256rmik: case X86::VPCMPBZ256rrik:
+ case X86::VPCMPBZrmik: case X86::VPCMPBZrrik:
+ OS << "b\t";
+ break;
+ case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rri:
+ case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rri:
+ case X86::VPCMPDZrmi: case X86::VPCMPDZrri:
+ case X86::VPCMPDZ128rmik: case X86::VPCMPDZ128rrik:
+ case X86::VPCMPDZ256rmik: case X86::VPCMPDZ256rrik:
+ case X86::VPCMPDZrmik: case X86::VPCMPDZrrik:
+ case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk:
+ case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk:
+ case X86::VPCMPDZrmib: case X86::VPCMPDZrmibk:
+ OS << "d\t";
+ break;
+ case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rri:
+ case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rri:
+ case X86::VPCMPQZrmi: case X86::VPCMPQZrri:
+ case X86::VPCMPQZ128rmik: case X86::VPCMPQZ128rrik:
+ case X86::VPCMPQZ256rmik: case X86::VPCMPQZ256rrik:
+ case X86::VPCMPQZrmik: case X86::VPCMPQZrrik:
+ case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk:
+ case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk:
+ case X86::VPCMPQZrmib: case X86::VPCMPQZrmibk:
+ OS << "q\t";
+ break;
+ case X86::VPCMPUBZ128rmi: case X86::VPCMPUBZ128rri:
+ case X86::VPCMPUBZ256rmi: case X86::VPCMPUBZ256rri:
+ case X86::VPCMPUBZrmi: case X86::VPCMPUBZrri:
+ case X86::VPCMPUBZ128rmik: case X86::VPCMPUBZ128rrik:
+ case X86::VPCMPUBZ256rmik: case X86::VPCMPUBZ256rrik:
+ case X86::VPCMPUBZrmik: case X86::VPCMPUBZrrik:
+ OS << "ub\t";
+ break;
+ case X86::VPCMPUDZ128rmi: case X86::VPCMPUDZ128rri:
+ case X86::VPCMPUDZ256rmi: case X86::VPCMPUDZ256rri:
+ case X86::VPCMPUDZrmi: case X86::VPCMPUDZrri:
+ case X86::VPCMPUDZ128rmik: case X86::VPCMPUDZ128rrik:
+ case X86::VPCMPUDZ256rmik: case X86::VPCMPUDZ256rrik:
+ case X86::VPCMPUDZrmik: case X86::VPCMPUDZrrik:
+ case X86::VPCMPUDZ128rmib: case X86::VPCMPUDZ128rmibk:
+ case X86::VPCMPUDZ256rmib: case X86::VPCMPUDZ256rmibk:
+ case X86::VPCMPUDZrmib: case X86::VPCMPUDZrmibk:
+ OS << "ud\t";
+ break;
+ case X86::VPCMPUQZ128rmi: case X86::VPCMPUQZ128rri:
+ case X86::VPCMPUQZ256rmi: case X86::VPCMPUQZ256rri:
+ case X86::VPCMPUQZrmi: case X86::VPCMPUQZrri:
+ case X86::VPCMPUQZ128rmik: case X86::VPCMPUQZ128rrik:
+ case X86::VPCMPUQZ256rmik: case X86::VPCMPUQZ256rrik:
+ case X86::VPCMPUQZrmik: case X86::VPCMPUQZrrik:
+ case X86::VPCMPUQZ128rmib: case X86::VPCMPUQZ128rmibk:
+ case X86::VPCMPUQZ256rmib: case X86::VPCMPUQZ256rmibk:
+ case X86::VPCMPUQZrmib: case X86::VPCMPUQZrmibk:
+ OS << "uq\t";
+ break;
+ case X86::VPCMPUWZ128rmi: case X86::VPCMPUWZ128rri:
+ case X86::VPCMPUWZ256rri: case X86::VPCMPUWZ256rmi:
+ case X86::VPCMPUWZrmi: case X86::VPCMPUWZrri:
+ case X86::VPCMPUWZ128rmik: case X86::VPCMPUWZ128rrik:
+ case X86::VPCMPUWZ256rrik: case X86::VPCMPUWZ256rmik:
+ case X86::VPCMPUWZrmik: case X86::VPCMPUWZrrik:
+ OS << "uw\t";
+ break;
+ case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rri:
+ case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rri:
+ case X86::VPCMPWZrmi: case X86::VPCMPWZrri:
+ case X86::VPCMPWZ128rmik: case X86::VPCMPWZ128rrik:
+ case X86::VPCMPWZ256rmik: case X86::VPCMPWZ256rrik:
+ case X86::VPCMPWZrmik: case X86::VPCMPWZrrik:
+ OS << "w\t";
+ break;
+ }
+}
+
+void X86InstPrinterCommon::printCMPMnemonic(const MCInst *MI, bool IsVCmp,
+ raw_ostream &OS) {
+ OS << (IsVCmp ? "vcmp" : "cmp");
+
+ printSSEAVXCC(MI, MI->getNumOperands() - 1, OS);
+
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case X86::CMPPDrmi: case X86::CMPPDrri:
+ case X86::VCMPPDrmi: case X86::VCMPPDrri:
+ case X86::VCMPPDYrmi: case X86::VCMPPDYrri:
+ case X86::VCMPPDZ128rmi: case X86::VCMPPDZ128rri:
+ case X86::VCMPPDZ256rmi: case X86::VCMPPDZ256rri:
+ case X86::VCMPPDZrmi: case X86::VCMPPDZrri:
+ case X86::VCMPPDZ128rmik: case X86::VCMPPDZ128rrik:
+ case X86::VCMPPDZ256rmik: case X86::VCMPPDZ256rrik:
+ case X86::VCMPPDZrmik: case X86::VCMPPDZrrik:
+ case X86::VCMPPDZ128rmbi: case X86::VCMPPDZ128rmbik:
+ case X86::VCMPPDZ256rmbi: case X86::VCMPPDZ256rmbik:
+ case X86::VCMPPDZrmbi: case X86::VCMPPDZrmbik:
+ case X86::VCMPPDZrrib: case X86::VCMPPDZrribk:
+ OS << "pd\t";
+ break;
+ case X86::CMPPSrmi: case X86::CMPPSrri:
+ case X86::VCMPPSrmi: case X86::VCMPPSrri:
+ case X86::VCMPPSYrmi: case X86::VCMPPSYrri:
+ case X86::VCMPPSZ128rmi: case X86::VCMPPSZ128rri:
+ case X86::VCMPPSZ256rmi: case X86::VCMPPSZ256rri:
+ case X86::VCMPPSZrmi: case X86::VCMPPSZrri:
+ case X86::VCMPPSZ128rmik: case X86::VCMPPSZ128rrik:
+ case X86::VCMPPSZ256rmik: case X86::VCMPPSZ256rrik:
+ case X86::VCMPPSZrmik: case X86::VCMPPSZrrik:
+ case X86::VCMPPSZ128rmbi: case X86::VCMPPSZ128rmbik:
+ case X86::VCMPPSZ256rmbi: case X86::VCMPPSZ256rmbik:
+ case X86::VCMPPSZrmbi: case X86::VCMPPSZrmbik:
+ case X86::VCMPPSZrrib: case X86::VCMPPSZrribk:
+ OS << "ps\t";
+ break;
+ case X86::CMPSDrm: case X86::CMPSDrr:
+ case X86::CMPSDrm_Int: case X86::CMPSDrr_Int:
+ case X86::VCMPSDrm: case X86::VCMPSDrr:
+ case X86::VCMPSDrm_Int: case X86::VCMPSDrr_Int:
+ case X86::VCMPSDZrm: case X86::VCMPSDZrr:
+ case X86::VCMPSDZrm_Int: case X86::VCMPSDZrr_Int:
+ case X86::VCMPSDZrm_Intk: case X86::VCMPSDZrr_Intk:
+ case X86::VCMPSDZrrb_Int: case X86::VCMPSDZrrb_Intk:
+ OS << "sd\t";
+ break;
+ case X86::CMPSSrm: case X86::CMPSSrr:
+ case X86::CMPSSrm_Int: case X86::CMPSSrr_Int:
+ case X86::VCMPSSrm: case X86::VCMPSSrr:
+ case X86::VCMPSSrm_Int: case X86::VCMPSSrr_Int:
+ case X86::VCMPSSZrm: case X86::VCMPSSZrr:
+ case X86::VCMPSSZrm_Int: case X86::VCMPSSZrr_Int:
+ case X86::VCMPSSZrm_Intk: case X86::VCMPSSZrr_Intk:
+ case X86::VCMPSSZrrb_Int: case X86::VCMPSSZrrb_Intk:
+ OS << "ss\t";
+ break;
+ }
+}
+
+void X86InstPrinterCommon::printRoundingControl(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm();
+ switch (Imm) {
+ default:
+ llvm_unreachable("Invalid rounding control!");
+ case X86::TO_NEAREST_INT:
+ O << "{rn-sae}";
+ break;
+ case X86::TO_NEG_INF:
+ O << "{rd-sae}";
+ break;
+ case X86::TO_POS_INF:
+ O << "{ru-sae}";
+ break;
+ case X86::TO_ZERO:
+ O << "{rz-sae}";
+ break;
+ }
+}
+
+/// Print an immediate value that ends up being encoded as a pc-relative
+/// value (e.g. for jumps and calls). In Intel-style these print slightly
+/// differently than normal immediates. For example, a $ is not emitted.
+///
+/// \p Address The address of the next instruction.
+/// \see MCInstPrinter::printInst
+void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, uint64_t Address,
+ unsigned OpNo, raw_ostream &O) {
+  // Do not print the numeric target address when symbolizing.
+ if (SymbolizeOperands)
+ return;
+
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isImm()) {
+ if (PrintBranchImmAsAddress) {
+ uint64_t Target = Address + Op.getImm();
+ if (MAI.getCodePointerSize() == 4)
+ Target &= 0xffffffff;
+ O << formatHex(Target);
+ } else
+ O << formatImm(Op.getImm());
+ } else {
+ assert(Op.isExpr() && "unknown pcrel immediate operand");
+ // If a symbolic branch target was added as a constant expression then print
+ // that address in hex.
+ const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
+ int64_t Address;
+ if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
+ O << formatHex((uint64_t)Address);
+ } else {
+ // Otherwise, just print the expression.
+ Op.getExpr()->print(O, &MAI);
+ }
+ }
+}
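
A quick standalone sketch of the target arithmetic used when PrintBranchImmAsAddress is set: the immediate is a displacement from the next instruction's address, and a 4-byte code pointer size wraps the result to 32 bits. The branchTarget helper below is a hypothetical illustration, not part of the printer.

#include <cinttypes>
#include <cstdint>
#include <cstdio>

// Address is the address of the *next* instruction, Disp the pc-relative
// immediate; a 4-byte code pointer size truncates the target to 32 bits.
uint64_t branchTarget(uint64_t Address, int64_t Disp,
                      unsigned CodePointerSize) {
  uint64_t Target = Address + (uint64_t)Disp;
  if (CodePointerSize == 4)
    Target &= 0xffffffffu;
  return Target;
}

int main() {
  // A backward jump whose encoding ends at 0x401005 with displacement -0x25:
  std::printf("0x%" PRIx64 "\n", branchTarget(0x401005, -0x25, 8)); // 0x400fe0
}
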
+
+void X86InstPrinterCommon::printOptionalSegReg(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getReg()) {
+ printOperand(MI, OpNo, O);
+ O << ':';
+ }
+}
+
+void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O) {
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+ unsigned Flags = MI->getFlags();
+
+ if ((TSFlags & X86II::LOCK) || (Flags & X86::IP_HAS_LOCK))
+ O << "\tlock\t";
+
+ if ((TSFlags & X86II::NOTRACK) || (Flags & X86::IP_HAS_NOTRACK))
+ O << "\tnotrack\t";
+
+ if (Flags & X86::IP_HAS_REPEAT_NE)
+ O << "\trepne\t";
+ else if (Flags & X86::IP_HAS_REPEAT)
+ O << "\trep\t";
+
+ // These all require a pseudo prefix
+ if ((Flags & X86::IP_USE_VEX) || (TSFlags & X86II::ExplicitVEXPrefix))
+ O << "\t{vex}";
+ else if (Flags & X86::IP_USE_VEX2)
+ O << "\t{vex2}";
+ else if (Flags & X86::IP_USE_VEX3)
+ O << "\t{vex3}";
+ else if (Flags & X86::IP_USE_EVEX)
+ O << "\t{evex}";
+
+ if (Flags & X86::IP_USE_DISP8)
+ O << "\t{disp8}";
+ else if (Flags & X86::IP_USE_DISP32)
+ O << "\t{disp32}";
+}
+
+void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo,
+ raw_ostream &OS) {
+  // In assembly listings, a pair is represented by one of its members, either
+  // of the two. Here we pick k0, k2, k4, k6, but we could just as well print
+  // K2_K3 as "k3". It would arguably make more sense if the assembly looked
+  // something like:
+  //   "vp2intersect %zmm5, %zmm7, {%k2, %k3}"
+  // but this works too.
+ switch (MI->getOperand(OpNo).getReg()) {
+ case X86::K0_K1:
+ printRegName(OS, X86::K0);
+ return;
+ case X86::K2_K3:
+ printRegName(OS, X86::K2);
+ return;
+ case X86::K4_K5:
+ printRegName(OS, X86::K4);
+ return;
+ case X86::K6_K7:
+ printRegName(OS, X86::K6);
+ return;
+ }
+ llvm_unreachable("Unknown mask pair register name");
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
new file mode 100644
index 000000000000..bb12ede3b729
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
@@ -0,0 +1,43 @@
+//===-- X86InstPrinterCommon.h - X86 assembly instruction printing --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes common code for rendering MCInst instances as AT&T-style
+// and Intel-style assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTPRINTERCOMMON_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTPRINTERCOMMON_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class X86InstPrinterCommon : public MCInstPrinter {
+public:
+ using MCInstPrinter::MCInstPrinter;
+
+ virtual void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) = 0;
+ void printCondCode(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printVPCOMMnemonic(const MCInst *MI, raw_ostream &OS);
+ void printVPCMPMnemonic(const MCInst *MI, raw_ostream &OS);
+ void printCMPMnemonic(const MCInst *MI, bool IsVCmp, raw_ostream &OS);
+ void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &O);
+ void printPCRelImm(const MCInst *MI, uint64_t Address, unsigned OpNo,
+ raw_ostream &O);
+
+protected:
+ void printInstFlags(const MCInst *MI, raw_ostream &O);
+ void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printVKPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTPRINTERCOMMON_H
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
new file mode 100644
index 000000000000..d5b205ad9a63
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
@@ -0,0 +1,454 @@
+//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as Intel-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86IntelInstPrinter.h"
+#include "X86BaseInfo.h"
+#include "X86InstComments.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
+#include "X86GenAsmWriter1.inc"
+
+void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+ OS << getRegisterName(RegNo);
+}
+
+void X86IntelInstPrinter::printInst(const MCInst *MI, uint64_t Address,
+ StringRef Annot, const MCSubtargetInfo &STI,
+ raw_ostream &OS) {
+ printInstFlags(MI, OS);
+
+ // In 16-bit mode, print data16 as data32.
+ if (MI->getOpcode() == X86::DATA16_PREFIX &&
+ STI.getFeatureBits()[X86::Mode16Bit]) {
+ OS << "\tdata32";
+ } else if (!printAliasInstr(MI, Address, OS) && !printVecCompareInstr(MI, OS))
+ printInstruction(MI, Address, OS);
+
+ // Next always print the annotation.
+ printAnnotation(OS, Annot);
+
+ // If verbose assembly is enabled, we can print some informative comments.
+ if (CommentStream)
+ EmitAnyX86InstComments(MI, *CommentStream, MII);
+}
+
+bool X86IntelInstPrinter::printVecCompareInstr(const MCInst *MI, raw_ostream &OS) {
+ if (MI->getNumOperands() == 0 ||
+ !MI->getOperand(MI->getNumOperands() - 1).isImm())
+ return false;
+
+ int64_t Imm = MI->getOperand(MI->getNumOperands() - 1).getImm();
+
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+
+ // Custom print the vector compare instructions to get the immediate
+ // translated into the mnemonic.
+ switch (MI->getOpcode()) {
+ case X86::CMPPDrmi: case X86::CMPPDrri:
+ case X86::CMPPSrmi: case X86::CMPPSrri:
+ case X86::CMPSDrm: case X86::CMPSDrr:
+ case X86::CMPSDrm_Int: case X86::CMPSDrr_Int:
+ case X86::CMPSSrm: case X86::CMPSSrr:
+ case X86::CMPSSrm_Int: case X86::CMPSSrr_Int:
+ if (Imm >= 0 && Imm <= 7) {
+ OS << '\t';
+ printCMPMnemonic(MI, /*IsVCMP*/false, OS);
+ printOperand(MI, 0, OS);
+ OS << ", ";
+      // Skip operand 1 as it's tied to the dest.
+
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) {
+ if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS)
+ printdwordmem(MI, 2, OS);
+ else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD)
+ printqwordmem(MI, 2, OS);
+ else
+ printxmmwordmem(MI, 2, OS);
+ } else
+ printOperand(MI, 2, OS);
+
+ return true;
+ }
+ break;
+
+ case X86::VCMPPDrmi: case X86::VCMPPDrri:
+ case X86::VCMPPDYrmi: case X86::VCMPPDYrri:
+ case X86::VCMPPDZ128rmi: case X86::VCMPPDZ128rri:
+ case X86::VCMPPDZ256rmi: case X86::VCMPPDZ256rri:
+ case X86::VCMPPDZrmi: case X86::VCMPPDZrri:
+ case X86::VCMPPSrmi: case X86::VCMPPSrri:
+ case X86::VCMPPSYrmi: case X86::VCMPPSYrri:
+ case X86::VCMPPSZ128rmi: case X86::VCMPPSZ128rri:
+ case X86::VCMPPSZ256rmi: case X86::VCMPPSZ256rri:
+ case X86::VCMPPSZrmi: case X86::VCMPPSZrri:
+ case X86::VCMPSDrm: case X86::VCMPSDrr:
+ case X86::VCMPSDZrm: case X86::VCMPSDZrr:
+ case X86::VCMPSDrm_Int: case X86::VCMPSDrr_Int:
+ case X86::VCMPSDZrm_Int: case X86::VCMPSDZrr_Int:
+ case X86::VCMPSSrm: case X86::VCMPSSrr:
+ case X86::VCMPSSZrm: case X86::VCMPSSZrr:
+ case X86::VCMPSSrm_Int: case X86::VCMPSSrr_Int:
+ case X86::VCMPSSZrm_Int: case X86::VCMPSSZrr_Int:
+ case X86::VCMPPDZ128rmik: case X86::VCMPPDZ128rrik:
+ case X86::VCMPPDZ256rmik: case X86::VCMPPDZ256rrik:
+ case X86::VCMPPDZrmik: case X86::VCMPPDZrrik:
+ case X86::VCMPPSZ128rmik: case X86::VCMPPSZ128rrik:
+ case X86::VCMPPSZ256rmik: case X86::VCMPPSZ256rrik:
+ case X86::VCMPPSZrmik: case X86::VCMPPSZrrik:
+ case X86::VCMPSDZrm_Intk: case X86::VCMPSDZrr_Intk:
+ case X86::VCMPSSZrm_Intk: case X86::VCMPSSZrr_Intk:
+ case X86::VCMPPDZ128rmbi: case X86::VCMPPDZ128rmbik:
+ case X86::VCMPPDZ256rmbi: case X86::VCMPPDZ256rmbik:
+ case X86::VCMPPDZrmbi: case X86::VCMPPDZrmbik:
+ case X86::VCMPPSZ128rmbi: case X86::VCMPPSZ128rmbik:
+ case X86::VCMPPSZ256rmbi: case X86::VCMPPSZ256rmbik:
+ case X86::VCMPPSZrmbi: case X86::VCMPPSZrmbik:
+ case X86::VCMPPDZrrib: case X86::VCMPPDZrribk:
+ case X86::VCMPPSZrrib: case X86::VCMPPSZrribk:
+ case X86::VCMPSDZrrb_Int: case X86::VCMPSDZrrb_Intk:
+ case X86::VCMPSSZrrb_Int: case X86::VCMPSSZrrb_Intk:
+ if (Imm >= 0 && Imm <= 31) {
+ OS << '\t';
+ printCMPMnemonic(MI, /*IsVCMP*/true, OS);
+
+ unsigned CurOp = 0;
+ printOperand(MI, CurOp++, OS);
+
+ if (Desc.TSFlags & X86II::EVEX_K) {
+ // Print mask operand.
+ OS << " {";
+ printOperand(MI, CurOp++, OS);
+ OS << "}";
+ }
+ OS << ", ";
+ printOperand(MI, CurOp++, OS);
+ OS << ", ";
+
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) {
+ if (Desc.TSFlags & X86II::EVEX_B) {
+ // Broadcast form.
+ // Load size is based on W-bit.
+ if (Desc.TSFlags & X86II::VEX_W)
+ printqwordmem(MI, CurOp++, OS);
+ else
+ printdwordmem(MI, CurOp++, OS);
+
+ // Print the number of elements broadcasted.
+ unsigned NumElts;
+ if (Desc.TSFlags & X86II::EVEX_L2)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 8 : 16;
+ else if (Desc.TSFlags & X86II::VEX_L)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8;
+ else
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 2 : 4;
+ OS << "{1to" << NumElts << "}";
+ } else {
+ if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS)
+ printdwordmem(MI, CurOp++, OS);
+ else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD)
+ printqwordmem(MI, CurOp++, OS);
+ else if (Desc.TSFlags & X86II::EVEX_L2)
+ printzmmwordmem(MI, CurOp++, OS);
+ else if (Desc.TSFlags & X86II::VEX_L)
+ printymmwordmem(MI, CurOp++, OS);
+ else
+ printxmmwordmem(MI, CurOp++, OS);
+ }
+ } else {
+ printOperand(MI, CurOp++, OS);
+ if (Desc.TSFlags & X86II::EVEX_B)
+ OS << ", {sae}";
+ }
+
+ return true;
+ }
+ break;
+
+ case X86::VPCOMBmi: case X86::VPCOMBri:
+ case X86::VPCOMDmi: case X86::VPCOMDri:
+ case X86::VPCOMQmi: case X86::VPCOMQri:
+ case X86::VPCOMUBmi: case X86::VPCOMUBri:
+ case X86::VPCOMUDmi: case X86::VPCOMUDri:
+ case X86::VPCOMUQmi: case X86::VPCOMUQri:
+ case X86::VPCOMUWmi: case X86::VPCOMUWri:
+ case X86::VPCOMWmi: case X86::VPCOMWri:
+ if (Imm >= 0 && Imm <= 7) {
+ OS << '\t';
+ printVPCOMMnemonic(MI, OS);
+ printOperand(MI, 0, OS);
+ OS << ", ";
+ printOperand(MI, 1, OS);
+ OS << ", ";
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem)
+ printxmmwordmem(MI, 2, OS);
+ else
+ printOperand(MI, 2, OS);
+ return true;
+ }
+ break;
+
+ case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rri:
+ case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rri:
+ case X86::VPCMPBZrmi: case X86::VPCMPBZrri:
+ case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rri:
+ case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rri:
+ case X86::VPCMPDZrmi: case X86::VPCMPDZrri:
+ case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rri:
+ case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rri:
+ case X86::VPCMPQZrmi: case X86::VPCMPQZrri:
+ case X86::VPCMPUBZ128rmi: case X86::VPCMPUBZ128rri:
+ case X86::VPCMPUBZ256rmi: case X86::VPCMPUBZ256rri:
+ case X86::VPCMPUBZrmi: case X86::VPCMPUBZrri:
+ case X86::VPCMPUDZ128rmi: case X86::VPCMPUDZ128rri:
+ case X86::VPCMPUDZ256rmi: case X86::VPCMPUDZ256rri:
+ case X86::VPCMPUDZrmi: case X86::VPCMPUDZrri:
+ case X86::VPCMPUQZ128rmi: case X86::VPCMPUQZ128rri:
+ case X86::VPCMPUQZ256rmi: case X86::VPCMPUQZ256rri:
+ case X86::VPCMPUQZrmi: case X86::VPCMPUQZrri:
+ case X86::VPCMPUWZ128rmi: case X86::VPCMPUWZ128rri:
+ case X86::VPCMPUWZ256rmi: case X86::VPCMPUWZ256rri:
+ case X86::VPCMPUWZrmi: case X86::VPCMPUWZrri:
+ case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rri:
+ case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rri:
+ case X86::VPCMPWZrmi: case X86::VPCMPWZrri:
+ case X86::VPCMPBZ128rmik: case X86::VPCMPBZ128rrik:
+ case X86::VPCMPBZ256rmik: case X86::VPCMPBZ256rrik:
+ case X86::VPCMPBZrmik: case X86::VPCMPBZrrik:
+ case X86::VPCMPDZ128rmik: case X86::VPCMPDZ128rrik:
+ case X86::VPCMPDZ256rmik: case X86::VPCMPDZ256rrik:
+ case X86::VPCMPDZrmik: case X86::VPCMPDZrrik:
+ case X86::VPCMPQZ128rmik: case X86::VPCMPQZ128rrik:
+ case X86::VPCMPQZ256rmik: case X86::VPCMPQZ256rrik:
+ case X86::VPCMPQZrmik: case X86::VPCMPQZrrik:
+ case X86::VPCMPUBZ128rmik: case X86::VPCMPUBZ128rrik:
+ case X86::VPCMPUBZ256rmik: case X86::VPCMPUBZ256rrik:
+ case X86::VPCMPUBZrmik: case X86::VPCMPUBZrrik:
+ case X86::VPCMPUDZ128rmik: case X86::VPCMPUDZ128rrik:
+ case X86::VPCMPUDZ256rmik: case X86::VPCMPUDZ256rrik:
+ case X86::VPCMPUDZrmik: case X86::VPCMPUDZrrik:
+ case X86::VPCMPUQZ128rmik: case X86::VPCMPUQZ128rrik:
+ case X86::VPCMPUQZ256rmik: case X86::VPCMPUQZ256rrik:
+ case X86::VPCMPUQZrmik: case X86::VPCMPUQZrrik:
+ case X86::VPCMPUWZ128rmik: case X86::VPCMPUWZ128rrik:
+ case X86::VPCMPUWZ256rmik: case X86::VPCMPUWZ256rrik:
+ case X86::VPCMPUWZrmik: case X86::VPCMPUWZrrik:
+ case X86::VPCMPWZ128rmik: case X86::VPCMPWZ128rrik:
+ case X86::VPCMPWZ256rmik: case X86::VPCMPWZ256rrik:
+ case X86::VPCMPWZrmik: case X86::VPCMPWZrrik:
+ case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk:
+ case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk:
+ case X86::VPCMPDZrmib: case X86::VPCMPDZrmibk:
+ case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk:
+ case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk:
+ case X86::VPCMPQZrmib: case X86::VPCMPQZrmibk:
+ case X86::VPCMPUDZ128rmib: case X86::VPCMPUDZ128rmibk:
+ case X86::VPCMPUDZ256rmib: case X86::VPCMPUDZ256rmibk:
+ case X86::VPCMPUDZrmib: case X86::VPCMPUDZrmibk:
+ case X86::VPCMPUQZ128rmib: case X86::VPCMPUQZ128rmibk:
+ case X86::VPCMPUQZ256rmib: case X86::VPCMPUQZ256rmibk:
+ case X86::VPCMPUQZrmib: case X86::VPCMPUQZrmibk:
+ if ((Imm >= 0 && Imm <= 2) || (Imm >= 4 && Imm <= 6)) {
+ OS << '\t';
+ printVPCMPMnemonic(MI, OS);
+
+ unsigned CurOp = 0;
+ printOperand(MI, CurOp++, OS);
+
+ if (Desc.TSFlags & X86II::EVEX_K) {
+ // Print mask operand.
+ OS << " {";
+ printOperand(MI, CurOp++, OS);
+ OS << "}";
+ }
+ OS << ", ";
+ printOperand(MI, CurOp++, OS);
+ OS << ", ";
+
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) {
+ if (Desc.TSFlags & X86II::EVEX_B) {
+ // Broadcast form.
+ // Load size is based on W-bit as only D and Q are supported.
+ if (Desc.TSFlags & X86II::VEX_W)
+ printqwordmem(MI, CurOp++, OS);
+ else
+ printdwordmem(MI, CurOp++, OS);
+
+ // Print the number of elements broadcasted.
+ unsigned NumElts;
+ if (Desc.TSFlags & X86II::EVEX_L2)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 8 : 16;
+ else if (Desc.TSFlags & X86II::VEX_L)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8;
+ else
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 2 : 4;
+ OS << "{1to" << NumElts << "}";
+ } else {
+ if (Desc.TSFlags & X86II::EVEX_L2)
+ printzmmwordmem(MI, CurOp++, OS);
+ else if (Desc.TSFlags & X86II::VEX_L)
+ printymmwordmem(MI, CurOp++, OS);
+ else
+ printxmmwordmem(MI, CurOp++, OS);
+ }
+ } else {
+ printOperand(MI, CurOp++, OS);
+ }
+
+ return true;
+ }
+ break;
+ }
+
+ return false;
+}
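
Both broadcast paths above derive the {1toN} count from the EVEX vector-length bits (EVEX_L2/VEX_L) and the element width implied by VEX_W. The table they encode is simply vector bits divided by element bits; here is a hedged standalone sketch of that computation (broadcastElts is a hypothetical helper, not LLVM's TSFlags handling).

#include <cstdio>

// Element count for an EVEX {1toN} broadcast operand. Vector width is
// 512/256/128 bits (EVEX_L2 / VEX_L / neither); element width is 64 bits
// when VEX_W is set, 32 bits otherwise.
unsigned broadcastElts(bool L2, bool L, bool W) {
  unsigned VecBits = L2 ? 512 : (L ? 256 : 128);
  unsigned EltBits = W ? 64 : 32;
  return VecBits / EltBits;
}

int main() {
  std::printf("{1to%u}\n", broadcastElts(true, false, false)); // {1to16}
  std::printf("{1to%u}\n", broadcastElts(false, true, true));  // {1to4}
}
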
+
+void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isReg()) {
+ printRegName(O, Op.getReg());
+ } else if (Op.isImm()) {
+ O << formatImm((int64_t)Op.getImm());
+ } else {
+ assert(Op.isExpr() && "unknown operand kind in printOperand");
+ O << "offset ";
+ Op.getExpr()->print(O, &MAI);
+ }
+}
+
+void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ // Do not print the exact form of the memory operand if it references a known
+ // binary object.
+ if (SymbolizeOperands && MIA) {
+ uint64_t Target;
+ if (MIA->evaluateBranch(*MI, 0, 0, Target))
+ return;
+ if (MIA->evaluateMemoryOperandAddress(*MI, 0, 0))
+ return;
+ }
+ const MCOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
+ unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
+ const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
+ const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
+
+ // If this has a segment register, print it.
+ printOptionalSegReg(MI, Op + X86::AddrSegmentReg, O);
+
+ O << '[';
+
+ bool NeedPlus = false;
+ if (BaseReg.getReg()) {
+ printOperand(MI, Op+X86::AddrBaseReg, O);
+ NeedPlus = true;
+ }
+
+ if (IndexReg.getReg()) {
+ if (NeedPlus) O << " + ";
+ if (ScaleVal != 1)
+ O << ScaleVal << '*';
+ printOperand(MI, Op+X86::AddrIndexReg, O);
+ NeedPlus = true;
+ }
+
+ if (!DispSpec.isImm()) {
+ if (NeedPlus) O << " + ";
+ assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
+ DispSpec.getExpr()->print(O, &MAI);
+ } else {
+ int64_t DispVal = DispSpec.getImm();
+ if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
+ if (NeedPlus) {
+ if (DispVal > 0)
+ O << " + ";
+ else {
+ O << " - ";
+ DispVal = -DispVal;
+ }
+ }
+ O << formatImm(DispVal);
+ }
+ }
+
+ O << ']';
+}
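
printMemReference above assembles the Intel-syntax operand as an optional segment override followed by "[base + scale*index +/- disp]", dropping absent pieces and printing a lone displacement when there is no base or index. Below is a simplified standalone sketch of that layout; intelMemRef is a hypothetical helper (decimal displacements only), not the printer's API.

#include <cstdlib>
#include <iostream>
#include <string>

// Sketch of the Intel-syntax memory-operand layout printed above.
std::string intelMemRef(const std::string &Seg, const std::string &Base,
                        unsigned Scale, const std::string &Index,
                        long long Disp) {
  std::string Out = Seg.empty() ? std::string() : Seg + ":";
  Out += '[';
  bool NeedPlus = false;
  if (!Base.empty()) {
    Out += Base;
    NeedPlus = true;
  }
  if (!Index.empty()) {
    if (NeedPlus)
      Out += " + ";
    if (Scale != 1)
      Out += std::to_string(Scale) + "*";
    Out += Index;
    NeedPlus = true;
  }
  // Print the displacement if it is nonzero, or if it is the whole operand.
  if (Disp || (Base.empty() && Index.empty())) {
    if (NeedPlus)
      Out += Disp > 0 ? " + " : " - ";
    Out += std::to_string(NeedPlus ? std::llabs(Disp) : Disp);
  }
  return Out + ']';
}

int main() {
  std::cout << intelMemRef("fs", "eax", 4, "ebx", -16) << '\n'; // fs:[eax + 4*ebx - 16]
  std::cout << intelMemRef("", "", 1, "", 0) << '\n';           // [0]
}
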
+
+void X86IntelInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ // If this has a segment register, print it.
+ printOptionalSegReg(MI, Op + 1, O);
+ O << '[';
+ printOperand(MI, Op, O);
+ O << ']';
+}
+
+void X86IntelInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ // DI accesses are always ES-based.
+ O << "es:[";
+ printOperand(MI, Op, O);
+ O << ']';
+}
+
+void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &DispSpec = MI->getOperand(Op);
+
+ // If this has a segment register, print it.
+ printOptionalSegReg(MI, Op + 1, O);
+
+ O << '[';
+
+ if (DispSpec.isImm()) {
+ O << formatImm(DispSpec.getImm());
+ } else {
+ assert(DispSpec.isExpr() && "non-immediate displacement?");
+ DispSpec.getExpr()->print(O, &MAI);
+ }
+
+ O << ']';
+}
+
+void X86IntelInstPrinter::printU8Imm(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ if (MI->getOperand(Op).isExpr())
+ return MI->getOperand(Op).getExpr()->print(O, &MAI);
+
+ O << formatImm(MI->getOperand(Op).getImm() & 0xff);
+}
+
+void X86IntelInstPrinter::printSTiRegOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &OS) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ unsigned Reg = Op.getReg();
+  // Override the default printing to print st(0) instead of st.
+ if (Reg == X86::ST0)
+ OS << "st(0)";
+ else
+ printRegName(OS, Reg);
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h
new file mode 100644
index 000000000000..aa4d0545ea46
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h
@@ -0,0 +1,138 @@
+//= X86IntelInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an X86 MCInst to Intel style .s file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INTELINSTPRINTER_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INTELINSTPRINTER_H
+
+#include "X86InstPrinterCommon.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+class X86IntelInstPrinter final : public X86InstPrinterCommon {
+public:
+ X86IntelInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI)
+ : X86InstPrinterCommon(MAI, MII, MRI) {}
+
+ void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+ void printInst(const MCInst *MI, uint64_t Address, StringRef Annot,
+ const MCSubtargetInfo &STI, raw_ostream &OS) override;
+ bool printVecCompareInstr(const MCInst *MI, raw_ostream &OS);
+
+ // Autogenerated by tblgen, returns true if we successfully printed an
+ // alias.
+ bool printAliasInstr(const MCInst *MI, uint64_t Address, raw_ostream &OS);
+ void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpIdx, unsigned PrintMethodIdx,
+ raw_ostream &O);
+
+ // Autogenerated by tblgen.
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
+ void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
+
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) override;
+ void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O);
+ void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &O);
+ void printSTiRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+
+ void printbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "byte ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "word ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printdwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "dword ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printqwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "qword ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printxmmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "xmmword ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printymmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "ymmword ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printzmmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "zmmword ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printtbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "tbyte ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+
+ void printSrcIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "byte ptr ";
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printSrcIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "word ptr ";
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printSrcIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "dword ptr ";
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printSrcIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "qword ptr ";
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printDstIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "byte ptr ";
+ printDstIdx(MI, OpNo, O);
+ }
+ void printDstIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "word ptr ";
+ printDstIdx(MI, OpNo, O);
+ }
+ void printDstIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "dword ptr ";
+ printDstIdx(MI, OpNo, O);
+ }
+ void printDstIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "qword ptr ";
+ printDstIdx(MI, OpNo, O);
+ }
+ void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "byte ptr ";
+ printMemOffset(MI, OpNo, O);
+ }
+ void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "word ptr ";
+ printMemOffset(MI, OpNo, O);
+ }
+ void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "dword ptr ";
+ printMemOffset(MI, OpNo, O);
+ }
+ void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "qword ptr ";
+ printMemOffset(MI, OpNo, O);
+ }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INTELINSTPRINTER_H
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
new file mode 100644
index 000000000000..c294da6baffa
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -0,0 +1,169 @@
+//===-- X86MCAsmInfo.cpp - X86 asm properties -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the X86MCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+enum AsmWriterFlavorTy {
+ // Note: This numbering has to match the GCC assembler dialects for inline
+ // asm alternatives to work right.
+ ATT = 0, Intel = 1
+};
+
+static cl::opt<AsmWriterFlavorTy> AsmWriterFlavor(
+ "x86-asm-syntax", cl::init(ATT), cl::Hidden,
+ cl::desc("Choose style of code to emit from X86 backend:"),
+ cl::values(clEnumValN(ATT, "att", "Emit AT&T-style assembly"),
+ clEnumValN(Intel, "intel", "Emit Intel-style assembly")));
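+// For example, the Intel flavor can typically be selected on the command line
+// with "llc -x86-asm-syntax=intel" (editor's note).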
+
+static cl::opt<bool>
+MarkedJTDataRegions("mark-data-regions", cl::init(true),
+ cl::desc("Mark code section jump table data regions."),
+ cl::Hidden);
+
+void X86MCAsmInfoDarwin::anchor() { }
+
+X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
+ bool is64Bit = T.getArch() == Triple::x86_64;
+ if (is64Bit)
+ CodePointerSize = CalleeSaveStackSlotSize = 8;
+
+ AssemblerDialect = AsmWriterFlavor;
+
+ TextAlignFillValue = 0x90;
+
+ if (!is64Bit)
+ Data64bitsDirective = nullptr; // we can't emit a 64-bit unit
+
+ // Use ## as a comment string so that .s files generated by llvm can go
+ // through the GCC preprocessor without causing an error. This is needed
+ // because "clang foo.s" runs the C preprocessor, which is usually reserved
+ // for .S files on other systems. Perhaps this is because the file system
+ // wasn't always case preserving or something.
+ CommentString = "##";
+
+ SupportsDebugInformation = true;
+ UseDataRegionDirectives = MarkedJTDataRegions;
+
+ // Exceptions handling
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+
+ // old assembler lacks some directives
+ // FIXME: this should really be a check on the assembler characteristics
+ // rather than OS version
+ if (T.isMacOSX() && T.isMacOSXVersionLT(10, 6))
+ HasWeakDefCanBeHiddenDirective = false;
+
+ // Assume ld64 is new enough that the abs-ified FDE relocs may be used
+ // (actually, must, since otherwise the non-extern relocations we produce
+ // overwhelm ld64's tiny little mind and it fails).
+ DwarfFDESymbolsUseAbsDiff = true;
+}
+
+X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple)
+ : X86MCAsmInfoDarwin(Triple) {
+}
+
+void X86ELFMCAsmInfo::anchor() { }
+
+X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
+ bool is64Bit = T.getArch() == Triple::x86_64;
+ bool isX32 = T.getEnvironment() == Triple::GNUX32;
+
+ // For ELF, x86-64 pointer size depends on the ABI.
+ // For x86-64 without the x32 ABI, pointer size is 8. For x86 and for x86-64
+ // with the x32 ABI, pointer size remains the default 4.
+ CodePointerSize = (is64Bit && !isX32) ? 8 : 4;
+
+ // OTOH, stack slot size is always 8 for x86-64, even with the x32 ABI.
+ CalleeSaveStackSlotSize = is64Bit ? 8 : 4;
+
+ AssemblerDialect = AsmWriterFlavor;
+
+ TextAlignFillValue = 0x90;
+
+ // Debug Information
+ SupportsDebugInformation = true;
+
+ // Exceptions handling
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+}
+
+const MCExpr *
+X86_64MCAsmInfoDarwin::getExprForPersonalitySymbol(const MCSymbol *Sym,
+ unsigned Encoding,
+ MCStreamer &Streamer) const {
+ MCContext &Context = Streamer.getContext();
+ const MCExpr *Res =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Context);
+ const MCExpr *Four = MCConstantExpr::create(4, Context);
+ return MCBinaryExpr::createAdd(Res, Four, Context);
+}
+
+void X86MCAsmInfoMicrosoft::anchor() { }
+
+X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
+ if (Triple.getArch() == Triple::x86_64) {
+ PrivateGlobalPrefix = ".L";
+ PrivateLabelPrefix = ".L";
+ CodePointerSize = 8;
+ WinEHEncodingType = WinEH::EncodingType::Itanium;
+ } else {
+ // 32-bit X86 doesn't use CFI, so this isn't a real encoding type. It's just
+ // a placeholder that the Windows EHStreamer looks for to suppress CFI
+ // output. In particular, usesWindowsCFI() returns false.
+ WinEHEncodingType = WinEH::EncodingType::X86;
+ }
+
+ ExceptionsType = ExceptionHandling::WinEH;
+
+ AssemblerDialect = AsmWriterFlavor;
+
+ TextAlignFillValue = 0x90;
+
+ AllowAtInName = true;
+}
+
+void X86MCAsmInfoMicrosoftMASM::anchor() { }
+
+X86MCAsmInfoMicrosoftMASM::X86MCAsmInfoMicrosoftMASM(const Triple &Triple)
+ : X86MCAsmInfoMicrosoft(Triple) {
+ DollarIsPC = true;
+ SeparatorString = "\n";
+ CommentString = ";";
+ AllowSymbolAtNameStart = true;
+}
+
+void X86MCAsmInfoGNUCOFF::anchor() { }
+
+X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) {
+ assert(Triple.isOSWindows() && "Windows is the only supported COFF target");
+ if (Triple.getArch() == Triple::x86_64) {
+ PrivateGlobalPrefix = ".L";
+ PrivateLabelPrefix = ".L";
+ CodePointerSize = 8;
+ WinEHEncodingType = WinEH::EncodingType::Itanium;
+ ExceptionsType = ExceptionHandling::WinEH;
+ } else {
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+ }
+
+ AssemblerDialect = AsmWriterFlavor;
+
+ TextAlignFillValue = 0x90;
+
+ AllowAtInName = true;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
new file mode 100644
index 000000000000..ce8e84fb96b9
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
@@ -0,0 +1,66 @@
+//===-- X86MCAsmInfo.h - X86 asm properties --------------------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the X86MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoCOFF.h"
+#include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+class X86MCAsmInfoDarwin : public MCAsmInfoDarwin {
+ virtual void anchor();
+
+public:
+ explicit X86MCAsmInfoDarwin(const Triple &Triple);
+};
+
+struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin {
+ explicit X86_64MCAsmInfoDarwin(const Triple &Triple);
+ const MCExpr *
+ getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
+ MCStreamer &Streamer) const override;
+};
+
+class X86ELFMCAsmInfo : public MCAsmInfoELF {
+ void anchor() override;
+
+public:
+ explicit X86ELFMCAsmInfo(const Triple &Triple);
+};
+
+class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft {
+ void anchor() override;
+
+public:
+ explicit X86MCAsmInfoMicrosoft(const Triple &Triple);
+};
+
+class X86MCAsmInfoMicrosoftMASM : public X86MCAsmInfoMicrosoft {
+ void anchor() override;
+
+public:
+ explicit X86MCAsmInfoMicrosoftMASM(const Triple &Triple);
+};
+
+class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF {
+ void anchor() override;
+
+public:
+ explicit X86MCAsmInfoGNUCOFF(const Triple &Triple);
+};
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
new file mode 100644
index 000000000000..260253a5302d
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -0,0 +1,1840 @@
+//===-- X86MCCodeEmitter.cpp - Convert X86 code to machine code -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86MCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+namespace {
+
+class X86MCCodeEmitter : public MCCodeEmitter {
+ const MCInstrInfo &MCII;
+ MCContext &Ctx;
+
+public:
+ X86MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
+ : MCII(mcii), Ctx(ctx) {}
+ X86MCCodeEmitter(const X86MCCodeEmitter &) = delete;
+ X86MCCodeEmitter &operator=(const X86MCCodeEmitter &) = delete;
+ ~X86MCCodeEmitter() override = default;
+
+ void emitPrefix(const MCInst &MI, raw_ostream &OS,
+ const MCSubtargetInfo &STI) const override;
+
+ void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+
+private:
+ unsigned getX86RegNum(const MCOperand &MO) const;
+
+ unsigned getX86RegEncoding(const MCInst &MI, unsigned OpNum) const;
+
+ /// \param MI a single low-level machine instruction.
+ /// \param OpNum the operand #.
+ /// \returns true if the OpNum'th operand of MI requires a bit to be set in
+ /// the REX prefix.
+ bool isREXExtendedReg(const MCInst &MI, unsigned OpNum) const;
+
+ void emitImmediate(const MCOperand &Disp, SMLoc Loc, unsigned ImmSize,
+ MCFixupKind FixupKind, uint64_t StartByte, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups, int ImmOffset = 0) const;
+
+ void emitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld,
+ raw_ostream &OS) const;
+
+ void emitSIBByte(unsigned SS, unsigned Index, unsigned Base,
+ raw_ostream &OS) const;
+
+ void emitMemModRMByte(const MCInst &MI, unsigned Op, unsigned RegOpcodeField,
+ uint64_t TSFlags, bool HasREX, uint64_t StartByte,
+ raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI,
+ bool ForceSIB = false) const;
+
+ bool emitPrefixImpl(unsigned &CurOp, const MCInst &MI,
+ const MCSubtargetInfo &STI, raw_ostream &OS) const;
+
+ void emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
+ raw_ostream &OS) const;
+
+ void emitSegmentOverridePrefix(unsigned SegOperand, const MCInst &MI,
+ raw_ostream &OS) const;
+
+ bool emitOpcodePrefix(int MemOperand, const MCInst &MI,
+ const MCSubtargetInfo &STI, raw_ostream &OS) const;
+
+ bool emitREXPrefix(int MemOperand, const MCInst &MI,
+ const MCSubtargetInfo &STI, raw_ostream &OS) const;
+};
+
+} // end anonymous namespace
+
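+// Editor's illustration: modRMByte(3, 2, 1) packs to 0b11'010'001 == 0xD1,
+// i.e. Mod=3 (register-direct), reg/opcode field 2, R/M field 1.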
+static uint8_t modRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) {
+ assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
+ return RM | (RegOpcode << 3) | (Mod << 6);
+}
+
+static void emitByte(uint8_t C, raw_ostream &OS) { OS << static_cast<char>(C); }
+
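+// Editor's illustration: emitConstant(0x12345678, 4, OS) writes the bytes
+// 78 56 34 12, in that order.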
+static void emitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) {
+ // Output the constant in little endian byte order.
+ for (unsigned i = 0; i != Size; ++i) {
+ emitByte(Val & 255, OS);
+ Val >>= 8;
+ }
+}
+
+/// Determine if this immediate can fit in a disp8 or a compressed disp8 for
+/// EVEX instructions. If so, \p ImmOffset is set to the value to pass to the
+/// ImmOffset parameter of emitImmediate.
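+///
+/// Editor's worked example: for an EVEX instruction whose CD8 scale is 64 and
+/// a displacement of 256, CDisp8 is 4 and ImmOffset becomes -252, so a single
+/// displacement byte of 0x04 is ultimately emitted.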
+static bool isDispOrCDisp8(uint64_t TSFlags, int Value, int &ImmOffset) {
+ bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX;
+
+ int CD8_Scale =
+ (TSFlags & X86II::CD8_Scale_Mask) >> X86II::CD8_Scale_Shift;
+ if (!HasEVEX || CD8_Scale == 0)
+ return isInt<8>(Value);
+
+ assert(isPowerOf2_32(CD8_Scale) && "Unexpected CD8 scale!");
+ if (Value & (CD8_Scale - 1)) // Unaligned offset
+ return false;
+
+ int CDisp8 = Value / CD8_Scale;
+ if (!isInt<8>(CDisp8))
+ return false;
+
+ // ImmOffset will be added to Value in emitImmediate leaving just CDisp8.
+ ImmOffset = CDisp8 - Value;
+ return true;
+}
+
+/// \returns the appropriate fixup kind to use for an immediate in an
+/// instruction with the specified TSFlags.
+static MCFixupKind getImmFixupKind(uint64_t TSFlags) {
+ unsigned Size = X86II::getSizeOfImm(TSFlags);
+ bool isPCRel = X86II::isImmPCRel(TSFlags);
+
+ if (X86II::isImmSigned(TSFlags)) {
+ switch (Size) {
+ default:
+ llvm_unreachable("Unsupported signed fixup size!");
+ case 4:
+ return MCFixupKind(X86::reloc_signed_4byte);
+ }
+ }
+ return MCFixup::getKindForSize(Size, isPCRel);
+}
+
+/// \param Op operand # of the memory operand.
+///
+/// \returns true if the specified instruction has a 16-bit memory operand.
+static bool is16BitMemOperand(const MCInst &MI, unsigned Op,
+ const MCSubtargetInfo &STI) {
+ const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg);
+ const MCOperand &Index = MI.getOperand(Op + X86::AddrIndexReg);
+
+ unsigned BaseReg = Base.getReg();
+ unsigned IndexReg = Index.getReg();
+
+ if (STI.hasFeature(X86::Mode16Bit) && BaseReg == 0 && IndexReg == 0)
+ return true;
+ if ((BaseReg != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg)) ||
+ (IndexReg != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)))
+ return true;
+ return false;
+}
+
+/// \param Op operand # of the memory operand.
+///
+/// \returns true if the specified instruction has a 32-bit memory operand.
+static bool is32BitMemOperand(const MCInst &MI, unsigned Op) {
+ const MCOperand &BaseReg = MI.getOperand(Op + X86::AddrBaseReg);
+ const MCOperand &IndexReg = MI.getOperand(Op + X86::AddrIndexReg);
+
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg())))
+ return true;
+ if (BaseReg.getReg() == X86::EIP) {
+ assert(IndexReg.getReg() == 0 && "Invalid eip-based address.");
+ return true;
+ }
+ if (IndexReg.getReg() == X86::EIZ)
+ return true;
+ return false;
+}
+
+/// \param Op operand # of the memory operand.
+///
+/// \returns true if the specified instruction has a 64-bit memory operand.
+#ifndef NDEBUG
+static bool is64BitMemOperand(const MCInst &MI, unsigned Op) {
+ const MCOperand &BaseReg = MI.getOperand(Op + X86::AddrBaseReg);
+ const MCOperand &IndexReg = MI.getOperand(Op + X86::AddrIndexReg);
+
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg())))
+ return true;
+ return false;
+}
+#endif
+
+enum GlobalOffsetTableExprKind { GOT_None, GOT_Normal, GOT_SymDiff };
+
+/// Check if this expression starts with _GLOBAL_OFFSET_TABLE_ and if it is
+/// of the form _GLOBAL_OFFSET_TABLE_-symbol. This is needed to support PIC on
+/// ELF i386 as _GLOBAL_OFFSET_TABLE_ is magical. We check only the simple cases
+/// that are known to be used: _GLOBAL_OFFSET_TABLE_ by itself or at the start of a
+/// binary expression.
+static GlobalOffsetTableExprKind
+startsWithGlobalOffsetTable(const MCExpr *Expr) {
+ const MCExpr *RHS = nullptr;
+ if (Expr->getKind() == MCExpr::Binary) {
+ const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Expr);
+ Expr = BE->getLHS();
+ RHS = BE->getRHS();
+ }
+
+ if (Expr->getKind() != MCExpr::SymbolRef)
+ return GOT_None;
+
+ const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr);
+ const MCSymbol &S = Ref->getSymbol();
+ if (S.getName() != "_GLOBAL_OFFSET_TABLE_")
+ return GOT_None;
+ if (RHS && RHS->getKind() == MCExpr::SymbolRef)
+ return GOT_SymDiff;
+ return GOT_Normal;
+}
+
+static bool hasSecRelSymbolRef(const MCExpr *Expr) {
+ if (Expr->getKind() == MCExpr::SymbolRef) {
+ const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr);
+ return Ref->getKind() == MCSymbolRefExpr::VK_SECREL;
+ }
+ return false;
+}
+
+static bool isPCRel32Branch(const MCInst &MI, const MCInstrInfo &MCII) {
+ unsigned Opcode = MI.getOpcode();
+ const MCInstrDesc &Desc = MCII.get(Opcode);
+ if ((Opcode != X86::CALL64pcrel32 && Opcode != X86::JMP_4 &&
+ Opcode != X86::JCC_4) ||
+ getImmFixupKind(Desc.TSFlags) != FK_PCRel_4)
+ return false;
+
+ unsigned CurOp = X86II::getOperandBias(Desc);
+ const MCOperand &Op = MI.getOperand(CurOp);
+ if (!Op.isExpr())
+ return false;
+
+ const MCSymbolRefExpr *Ref = dyn_cast<MCSymbolRefExpr>(Op.getExpr());
+ return Ref && Ref->getKind() == MCSymbolRefExpr::VK_None;
+}
+
+unsigned X86MCCodeEmitter::getX86RegNum(const MCOperand &MO) const {
+ return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7;
+}
+
+unsigned X86MCCodeEmitter::getX86RegEncoding(const MCInst &MI,
+ unsigned OpNum) const {
+ return Ctx.getRegisterInfo()->getEncodingValue(MI.getOperand(OpNum).getReg());
+}
+
+/// \param MI a single low-level machine instruction.
+/// \param OpNum the operand #.
+/// \returns true if the OpNum'th operand of MI requires a bit to be set in
+/// the REX prefix.
+bool X86MCCodeEmitter::isREXExtendedReg(const MCInst &MI,
+ unsigned OpNum) const {
+ return (getX86RegEncoding(MI, OpNum) >> 3) & 1;
+}
+
+void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc,
+ unsigned Size, MCFixupKind FixupKind,
+ uint64_t StartByte, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ int ImmOffset) const {
+ const MCExpr *Expr = nullptr;
+ if (DispOp.isImm()) {
+ // If this is a simple integer displacement that doesn't require a
+ // relocation, emit it now.
+ if (FixupKind != FK_PCRel_1 && FixupKind != FK_PCRel_2 &&
+ FixupKind != FK_PCRel_4) {
+ emitConstant(DispOp.getImm() + ImmOffset, Size, OS);
+ return;
+ }
+ Expr = MCConstantExpr::create(DispOp.getImm(), Ctx);
+ } else {
+ Expr = DispOp.getExpr();
+ }
+
+ // Rewrite the fixup kind for _GLOBAL_OFFSET_TABLE_ and @SECREL references.
+ if ((FixupKind == FK_Data_4 || FixupKind == FK_Data_8 ||
+ FixupKind == MCFixupKind(X86::reloc_signed_4byte))) {
+ GlobalOffsetTableExprKind Kind = startsWithGlobalOffsetTable(Expr);
+ if (Kind != GOT_None) {
+ assert(ImmOffset == 0);
+
+ if (Size == 8) {
+ FixupKind = MCFixupKind(X86::reloc_global_offset_table8);
+ } else {
+ assert(Size == 4);
+ FixupKind = MCFixupKind(X86::reloc_global_offset_table);
+ }
+
+ if (Kind == GOT_Normal)
+ ImmOffset = static_cast<int>(OS.tell() - StartByte);
+ } else if (Expr->getKind() == MCExpr::SymbolRef) {
+ if (hasSecRelSymbolRef(Expr)) {
+ FixupKind = MCFixupKind(FK_SecRel_4);
+ }
+ } else if (Expr->getKind() == MCExpr::Binary) {
+ const MCBinaryExpr *Bin = static_cast<const MCBinaryExpr *>(Expr);
+ if (hasSecRelSymbolRef(Bin->getLHS()) ||
+ hasSecRelSymbolRef(Bin->getRHS())) {
+ FixupKind = MCFixupKind(FK_SecRel_4);
+ }
+ }
+ }
+
+ // If the fixup is pc-relative, we need to bias the value to be relative to
+ // the start of the field, not the end of the field.
+ if (FixupKind == FK_PCRel_4 ||
+ FixupKind == MCFixupKind(X86::reloc_riprel_4byte) ||
+ FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load) ||
+ FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax) ||
+ FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax_rex) ||
+ FixupKind == MCFixupKind(X86::reloc_branch_4byte_pcrel)) {
+ ImmOffset -= 4;
+ // If this is a pc-relative load off _GLOBAL_OFFSET_TABLE_:
+ // leaq _GLOBAL_OFFSET_TABLE_(%rip), %r15
+ // this needs to be a GOTPC32 relocation.
+ if (startsWithGlobalOffsetTable(Expr) != GOT_None)
+ FixupKind = MCFixupKind(X86::reloc_global_offset_table);
+ }
+ if (FixupKind == FK_PCRel_2)
+ ImmOffset -= 2;
+ if (FixupKind == FK_PCRel_1)
+ ImmOffset -= 1;
+
+ if (ImmOffset)
+ Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(ImmOffset, Ctx),
+ Ctx);
+
+ // Emit a symbolic constant as a fixup followed by Size zero bytes.
+ Fixups.push_back(MCFixup::create(static_cast<uint32_t>(OS.tell() - StartByte),
+ Expr, FixupKind, Loc));
+ emitConstant(0, Size, OS);
+}
+
+void X86MCCodeEmitter::emitRegModRMByte(const MCOperand &ModRMReg,
+ unsigned RegOpcodeFld,
+ raw_ostream &OS) const {
+ emitByte(modRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)), OS);
+}
+
+void X86MCCodeEmitter::emitSIBByte(unsigned SS, unsigned Index, unsigned Base,
+ raw_ostream &OS) const {
+ // SIB byte is in the same format as the modRMByte.
+ emitByte(modRMByte(SS, Index, Base), OS);
+}
+
+void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
+ unsigned RegOpcodeField,
+ uint64_t TSFlags, bool HasREX,
+ uint64_t StartByte, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI,
+ bool ForceSIB) const {
+ const MCOperand &Disp = MI.getOperand(Op + X86::AddrDisp);
+ const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg);
+ const MCOperand &Scale = MI.getOperand(Op + X86::AddrScaleAmt);
+ const MCOperand &IndexReg = MI.getOperand(Op + X86::AddrIndexReg);
+ unsigned BaseReg = Base.getReg();
+
+ // Handle %rip relative addressing.
+ if (BaseReg == X86::RIP ||
+ BaseReg == X86::EIP) { // [disp32+rIP] in X86-64 mode
+ assert(STI.hasFeature(X86::Mode64Bit) &&
+ "Rip-relative addressing requires 64-bit mode");
+ assert(IndexReg.getReg() == 0 && !ForceSIB &&
+ "Invalid rip-relative address");
+ emitByte(modRMByte(0, RegOpcodeField, 5), OS);
+
+ unsigned Opcode = MI.getOpcode();
+ unsigned FixupKind = [&]() {
+ // Enable relaxed relocation only for a MCSymbolRefExpr. We cannot use a
+ // relaxed relocation if an offset is present (e.g. x@GOTPCREL+4).
+ if (!(Disp.isExpr() && isa<MCSymbolRefExpr>(Disp.getExpr())))
+ return X86::reloc_riprel_4byte;
+
+ // Certain loads for GOT references can be relocated against the symbol
+ // directly if the symbol ends up in the same linkage unit.
+ switch (Opcode) {
+ default:
+ return X86::reloc_riprel_4byte;
+ case X86::MOV64rm:
+ // movq loads are a subset of reloc_riprel_4byte_relax_rex. They are a
+ // special case because COFF and Mach-O don't support ELF's more
+ // flexible R_X86_64_REX_GOTPCRELX relaxation.
+ assert(HasREX);
+ return X86::reloc_riprel_4byte_movq_load;
+ case X86::ADC32rm:
+ case X86::ADD32rm:
+ case X86::AND32rm:
+ case X86::CMP32rm:
+ case X86::MOV32rm:
+ case X86::OR32rm:
+ case X86::SBB32rm:
+ case X86::SUB32rm:
+ case X86::TEST32mr:
+ case X86::XOR32rm:
+ case X86::CALL64m:
+ case X86::JMP64m:
+ case X86::TAILJMPm64:
+ case X86::TEST64mr:
+ case X86::ADC64rm:
+ case X86::ADD64rm:
+ case X86::AND64rm:
+ case X86::CMP64rm:
+ case X86::OR64rm:
+ case X86::SBB64rm:
+ case X86::SUB64rm:
+ case X86::XOR64rm:
+ return HasREX ? X86::reloc_riprel_4byte_relax_rex
+ : X86::reloc_riprel_4byte_relax;
+ }
+ }();
+
+ // rip-relative addressing is actually relative to the *next* instruction.
+ // Since an immediate can follow the mod/rm byte for an instruction, this
+ // means that we need to bias the displacement field of the instruction with
+ // the size of the immediate field. If we have this case, add it into the
+ // expression to emit.
+ // Note: rip-relative addressing using immediate displacement values should
+ // not be adjusted, assuming it was the user's intent.
+ int ImmSize = !Disp.isImm() && X86II::hasImm(TSFlags)
+ ? X86II::getSizeOfImm(TSFlags)
+ : 0;
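+ // Editor's note: e.g. with a trailing 1-byte immediate, ImmSize is 1, so the
+ // fixup below gets an extra -1 bias (on top of the -4 applied inside
+ // emitImmediate), making the displacement relative to the end of the
+ // instruction.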
+
+ emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), StartByte, OS,
+ Fixups, -ImmSize);
+ return;
+ }
+
+ unsigned BaseRegNo = BaseReg ? getX86RegNum(Base) : -1U;
+
+ // 16-bit addressing forms of the ModR/M byte have a different encoding for
+ // the R/M field and are far more limited in which registers can be used.
+ if (is16BitMemOperand(MI, Op, STI)) {
+ if (BaseReg) {
+ // For 32-bit addressing, the row and column values in Table 2-2 are
+ // basically the same. It's AX/CX/DX/BX/SP/BP/SI/DI in that order, with
+ // some special cases. And getX86RegNum reflects that numbering.
+ // For 16-bit addressing it's more fun, as shown in the SDM Vol 2A,
+ // Table 2-1 "16-Bit Addressing Forms with the ModR/M byte". We can only
+ // use SI/DI/BP/BX, which have "row" values 4-7 in no particular order,
+ // while values 0-3 indicate the allowed combinations (base+index) of
+ // those: 0 for BX+SI, 1 for BX+DI, 2 for BP+SI, 3 for BP+DI.
+ //
+ // R16Table[] is a lookup from the normal RegNo, to the row values from
+ // Table 2-1 for 16-bit addressing modes. Where zero means disallowed.
+ static const unsigned R16Table[] = {0, 0, 0, 7, 0, 6, 4, 5};
+ unsigned RMfield = R16Table[BaseRegNo];
+
+ assert(RMfield && "invalid 16-bit base register");
+
+ if (IndexReg.getReg()) {
+ unsigned IndexReg16 = R16Table[getX86RegNum(IndexReg)];
+
+ assert(IndexReg16 && "invalid 16-bit index register");
+ // We must have one of SI/DI (4,5), and one of BP/BX (6,7).
+ assert(((IndexReg16 ^ RMfield) & 2) &&
+ "invalid 16-bit base/index register combination");
+ assert(Scale.getImm() == 1 &&
+ "invalid scale for 16-bit memory reference");
+
+ // Allow base/index to appear in either order (although GAS doesn't).
+ if (IndexReg16 & 2)
+ RMfield = (RMfield & 1) | ((7 - IndexReg16) << 1);
+ else
+ RMfield = (IndexReg16 & 1) | ((7 - RMfield) << 1);
+ }
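+ // Editor's worked example of the remapping above: for [BX+SI], R16Table
+ // maps BX (reg num 3) to 7 and SI (reg num 6) to 4, and the final RMfield
+ // is 0, matching row 0 (BX+SI) of SDM Table 2-1.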
+
+ if (Disp.isImm() && isInt<8>(Disp.getImm())) {
+ if (Disp.getImm() == 0 && RMfield != 6) {
+ // There is no displacement; just the register.
+ emitByte(modRMByte(0, RegOpcodeField, RMfield), OS);
+ return;
+ }
+ // Use the [REG]+disp8 form, including for [BP] which cannot be encoded.
+ emitByte(modRMByte(1, RegOpcodeField, RMfield), OS);
+ emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups);
+ return;
+ }
+ // This is the [REG]+disp16 case.
+ emitByte(modRMByte(2, RegOpcodeField, RMfield), OS);
+ } else {
+ assert(IndexReg.getReg() == 0 && "Unexpected index register!");
+ // There is no BaseReg; this is the plain [disp16] case.
+ emitByte(modRMByte(0, RegOpcodeField, 6), OS);
+ }
+
+ // Emit 16-bit displacement for plain disp16 or [REG]+disp16 cases.
+ emitImmediate(Disp, MI.getLoc(), 2, FK_Data_2, StartByte, OS, Fixups);
+ return;
+ }
+
+ // Check for presence of {disp8} or {disp32} pseudo prefixes.
+ bool UseDisp8 = MI.getFlags() & X86::IP_USE_DISP8;
+ bool UseDisp32 = MI.getFlags() & X86::IP_USE_DISP32;
+
+ // We only allow no displacement if no pseudo prefix is present.
+ bool AllowNoDisp = !UseDisp8 && !UseDisp32;
+ // Disp8 is allowed unless the {disp32} prefix is present.
+ bool AllowDisp8 = !UseDisp32;
+
+ // Determine whether a SIB byte is needed.
+ if (// The SIB byte must be used if there is an index register or the
+ // encoding requires a SIB byte.
+ !ForceSIB && IndexReg.getReg() == 0 &&
+ // The SIB byte must be used if the base is ESP/RSP/R12, all of which
+ // encode to an R/M value of 4, which indicates that a SIB byte is
+ // present.
+ BaseRegNo != N86::ESP &&
+ // If there is no base register and we're in 64-bit mode, we need a SIB
+ // byte to emit an addr that is just 'disp32' (the non-RIP relative form).
+ (!STI.hasFeature(X86::Mode64Bit) || BaseReg != 0)) {
+
+ if (BaseReg == 0) { // [disp32] in X86-32 mode
+ emitByte(modRMByte(0, RegOpcodeField, 5), OS);
+ emitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, StartByte, OS, Fixups);
+ return;
+ }
+
+ // If the base is not EBP/ESP/R12/R13 and there is no displacement, use
+ // simple indirect register encoding, this handles addresses like [EAX].
+ // The encoding for [EBP] or [R13] with no displacement means [disp32], so we
+ // handle it by emitting a displacement of 0 later.
+ if (BaseRegNo != N86::EBP) {
+ if (Disp.isImm() && Disp.getImm() == 0 && AllowNoDisp) {
+ emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), OS);
+ return;
+ }
+
+ // If the displacement is @tlscall, treat it as a zero.
+ if (Disp.isExpr()) {
+ auto *Sym = dyn_cast<MCSymbolRefExpr>(Disp.getExpr());
+ if (Sym && Sym->getKind() == MCSymbolRefExpr::VK_TLSCALL) {
+ // This is exclusively used by call *a@tlscall(base). The relocation
+ // (R_386_TLSCALL or R_X86_64_TLSCALL) applies to the beginning.
+ Fixups.push_back(MCFixup::create(0, Sym, FK_NONE, MI.getLoc()));
+ emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), OS);
+ return;
+ }
+ }
+ }
+
+ // Otherwise, if the displacement fits in a byte, encode as [REG+disp8],
+ // including a compressed disp8 for EVEX instructions that support it.
+ // This also handles the 0 displacement for [EBP] or [R13]. We can't use
+ // disp8 if the {disp32} pseudo prefix is present.
+ if (Disp.isImm() && AllowDisp8) {
+ int ImmOffset = 0;
+ if (isDispOrCDisp8(TSFlags, Disp.getImm(), ImmOffset)) {
+ emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), OS);
+ emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups,
+ ImmOffset);
+ return;
+ }
+ }
+
+ // Otherwise, emit the most general non-SIB encoding: [REG+disp32].
+ // Displacement may be 0 for [EBP] or [R13] case if {disp32} pseudo prefix
+ // prevented using disp8 above.
+ emitByte(modRMByte(2, RegOpcodeField, BaseRegNo), OS);
+ unsigned Opcode = MI.getOpcode();
+ unsigned FixupKind = Opcode == X86::MOV32rm ? X86::reloc_signed_4byte_relax
+ : X86::reloc_signed_4byte;
+ emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), StartByte, OS,
+ Fixups);
+ return;
+ }
+
+ // We need a SIB byte, so start by outputting the ModR/M byte first
+ assert(IndexReg.getReg() != X86::ESP && IndexReg.getReg() != X86::RSP &&
+ "Cannot use ESP as index reg!");
+
+ bool ForceDisp32 = false;
+ bool ForceDisp8 = false;
+ int ImmOffset = 0;
+ if (BaseReg == 0) {
+ // If there is no base register, we emit the special case SIB byte with
+ // MOD=0, BASE=5, to JUST get the index, scale, and displacement.
+ BaseRegNo = 5;
+ emitByte(modRMByte(0, RegOpcodeField, 4), OS);
+ ForceDisp32 = true;
+ } else if (Disp.isImm() && Disp.getImm() == 0 && AllowNoDisp &&
+ // Base reg can't be EBP/RBP/R13 as that would end up with '5' as
+ // the base field, but that is the magic [*] nomenclature that
+ // indicates no base when mod=0. For these cases we'll emit a 0
+ // displacement instead.
+ BaseRegNo != N86::EBP) {
+ // Emit no displacement ModR/M byte
+ emitByte(modRMByte(0, RegOpcodeField, 4), OS);
+ } else if (Disp.isImm() && AllowDisp8 &&
+ isDispOrCDisp8(TSFlags, Disp.getImm(), ImmOffset)) {
+ // Displacement fits in a byte or matches an EVEX compressed disp8, use
+ // disp8 encoding. This also handles EBP/R13 base with 0 displacement unless
+ // {disp32} pseudo prefix was used.
+ emitByte(modRMByte(1, RegOpcodeField, 4), OS);
+ ForceDisp8 = true;
+ } else {
+ // Otherwise, emit the normal disp32 encoding.
+ emitByte(modRMByte(2, RegOpcodeField, 4), OS);
+ ForceDisp32 = true;
+ }
+
+ // Calculate what the SS field value should be...
+ static const unsigned SSTable[] = {~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3};
+ unsigned SS = SSTable[Scale.getImm()];
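+ // (Editor's note: e.g. a scale of 4 selects SS == 2.)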
+
+ unsigned IndexRegNo = IndexReg.getReg() ? getX86RegNum(IndexReg) : 4;
+
+ emitSIBByte(SS, IndexRegNo, BaseRegNo, OS);
+
+ // Do we need to output a displacement?
+ if (ForceDisp8)
+ emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups,
+ ImmOffset);
+ else if (ForceDisp32)
+ emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte),
+ StartByte, OS, Fixups);
+}
+
+/// Emit all instruction prefixes.
+///
+/// \returns true if REX prefix is used, otherwise returns false.
+bool X86MCCodeEmitter::emitPrefixImpl(unsigned &CurOp, const MCInst &MI,
+ const MCSubtargetInfo &STI,
+ raw_ostream &OS) const {
+ uint64_t TSFlags = MCII.get(MI.getOpcode()).TSFlags;
+ // Determine where the memory operand starts, if present.
+ int MemoryOperand = X86II::getMemoryOperandNo(TSFlags);
+ // Emit segment override opcode prefix as needed.
+ if (MemoryOperand != -1) {
+ MemoryOperand += CurOp;
+ emitSegmentOverridePrefix(MemoryOperand + X86::AddrSegmentReg, MI, OS);
+ }
+
+ // Emit the repeat opcode prefix as needed.
+ unsigned Flags = MI.getFlags();
+ if (TSFlags & X86II::REP || Flags & X86::IP_HAS_REPEAT)
+ emitByte(0xF3, OS);
+ if (Flags & X86::IP_HAS_REPEAT_NE)
+ emitByte(0xF2, OS);
+
+ // Emit the address size opcode prefix as needed.
+ bool NeedAddressOverride;
+ uint64_t AdSize = TSFlags & X86II::AdSizeMask;
+ if ((STI.hasFeature(X86::Mode16Bit) && AdSize == X86II::AdSize32) ||
+ (STI.hasFeature(X86::Mode32Bit) && AdSize == X86II::AdSize16) ||
+ (STI.hasFeature(X86::Mode64Bit) && AdSize == X86II::AdSize32)) {
+ NeedAddressOverride = true;
+ } else if (MemoryOperand < 0) {
+ NeedAddressOverride = false;
+ } else if (STI.hasFeature(X86::Mode64Bit)) {
+ assert(!is16BitMemOperand(MI, MemoryOperand, STI));
+ NeedAddressOverride = is32BitMemOperand(MI, MemoryOperand);
+ } else if (STI.hasFeature(X86::Mode32Bit)) {
+ assert(!is64BitMemOperand(MI, MemoryOperand));
+ NeedAddressOverride = is16BitMemOperand(MI, MemoryOperand, STI);
+ } else {
+ assert(STI.hasFeature(X86::Mode16Bit));
+ assert(!is64BitMemOperand(MI, MemoryOperand));
+ NeedAddressOverride = !is16BitMemOperand(MI, MemoryOperand, STI);
+ }
+
+ if (NeedAddressOverride)
+ emitByte(0x67, OS);
+
+ // Encoding type for this instruction.
+ uint64_t Encoding = TSFlags & X86II::EncodingMask;
+ bool HasREX = false;
+ if (Encoding)
+ emitVEXOpcodePrefix(MemoryOperand, MI, OS);
+ else
+ HasREX = emitOpcodePrefix(MemoryOperand, MI, STI, OS);
+
+ uint64_t Form = TSFlags & X86II::FormMask;
+ switch (Form) {
+ default:
+ break;
+ case X86II::RawFrmDstSrc: {
+ unsigned siReg = MI.getOperand(1).getReg();
+ assert(((siReg == X86::SI && MI.getOperand(0).getReg() == X86::DI) ||
+ (siReg == X86::ESI && MI.getOperand(0).getReg() == X86::EDI) ||
+ (siReg == X86::RSI && MI.getOperand(0).getReg() == X86::RDI)) &&
+ "SI and DI register sizes do not match");
+ // Emit segment override opcode prefix as needed (not for %ds).
+ if (MI.getOperand(2).getReg() != X86::DS)
+ emitSegmentOverridePrefix(2, MI, OS);
+ // Emit AdSize prefix as needed.
+ if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::ESI) ||
+ (STI.hasFeature(X86::Mode32Bit) && siReg == X86::SI))
+ emitByte(0x67, OS);
+ CurOp += 3; // Consume operands.
+ break;
+ }
+ case X86II::RawFrmSrc: {
+ unsigned siReg = MI.getOperand(0).getReg();
+ // Emit segment override opcode prefix as needed (not for %ds).
+ if (MI.getOperand(1).getReg() != X86::DS)
+ emitSegmentOverridePrefix(1, MI, OS);
+ // Emit AdSize prefix as needed.
+ if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::ESI) ||
+ (STI.hasFeature(X86::Mode32Bit) && siReg == X86::SI))
+ emitByte(0x67, OS);
+ CurOp += 2; // Consume operands.
+ break;
+ }
+ case X86II::RawFrmDst: {
+ unsigned siReg = MI.getOperand(0).getReg();
+ // Emit AdSize prefix as needed.
+ if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::EDI) ||
+ (STI.hasFeature(X86::Mode32Bit) && siReg == X86::DI))
+ emitByte(0x67, OS);
+ ++CurOp; // Consume operand.
+ break;
+ }
+ case X86II::RawFrmMemOffs: {
+ // Emit segment override opcode prefix as needed.
+ emitSegmentOverridePrefix(1, MI, OS);
+ break;
+ }
+ }
+
+ return HasREX;
+}
+
+/// AVX instructions are encoded using an opcode prefix called VEX.
+void X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
+ raw_ostream &OS) const {
+ const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+
+ assert(!(TSFlags & X86II::LOCK) && "Can't have LOCK VEX.");
+
+ uint64_t Encoding = TSFlags & X86II::EncodingMask;
+ bool HasEVEX_K = TSFlags & X86II::EVEX_K;
+ bool HasVEX_4V = TSFlags & X86II::VEX_4V;
+ bool HasEVEX_RC = TSFlags & X86II::EVEX_RC;
+
+ // VEX_R: opcode extension equivalent to REX.R in
+ // 1's complement (inverted) form
+ //
+ // 1: Same as REX_R=0 (must be 1 in 32-bit mode)
+ // 0: Same as REX_R=1 (64 bit mode only)
+ //
+ uint8_t VEX_R = 0x1;
+ uint8_t EVEX_R2 = 0x1;
+
+ // VEX_X: equivalent to REX.X, only used when a
+ // register is used for index in SIB Byte.
+ //
+ // 1: Same as REX.X=0 (must be 1 in 32-bit mode)
+ // 0: Same as REX.X=1 (64-bit mode only)
+ uint8_t VEX_X = 0x1;
+
+ // VEX_B:
+ //
+ // 1: Same as REX_B=0 (ignored in 32-bit mode)
+ // 0: Same as REX_B=1 (64 bit mode only)
+ //
+ uint8_t VEX_B = 0x1;
+
+ // VEX_W: opcode specific (use like REX.W, or used for
+ // opcode extension, or ignored, depending on the opcode byte)
+ uint8_t VEX_W = (TSFlags & X86II::VEX_W) ? 1 : 0;
+
+ // VEX_5M (VEX m-mmmmm field):
+ //
+ // 0b00000: Reserved for future use
+ // 0b00001: implied 0F leading opcode
+ // 0b00010: implied 0F 38 leading opcode bytes
+ // 0b00011: implied 0F 3A leading opcode bytes
+ // 0b00100-0b11111: Reserved for future use
+ // 0b01000: XOP map select - 08h instructions with imm byte
+ // 0b01001: XOP map select - 09h instructions with no imm byte
+ // 0b01010: XOP map select - 0Ah instructions with imm dword
+ uint8_t VEX_5M;
+ switch (TSFlags & X86II::OpMapMask) {
+ default:
+ llvm_unreachable("Invalid prefix!");
+ case X86II::TB:
+ VEX_5M = 0x1;
+ break; // 0F
+ case X86II::T8:
+ VEX_5M = 0x2;
+ break; // 0F 38
+ case X86II::TA:
+ VEX_5M = 0x3;
+ break; // 0F 3A
+ case X86II::XOP8:
+ VEX_5M = 0x8;
+ break;
+ case X86II::XOP9:
+ VEX_5M = 0x9;
+ break;
+ case X86II::XOPA:
+ VEX_5M = 0xA;
+ break;
+ }
+
+ // VEX_4V (VEX vvvv field): a register specifier
+ // (in 1's complement form) or 1111 if unused.
+ uint8_t VEX_4V = 0xf;
+ uint8_t EVEX_V2 = 0x1;
+
+ // EVEX_L2/VEX_L (Vector Length):
+ //
+ // L2 L
+ // 0 0: scalar or 128-bit vector
+ // 0 1: 256-bit vector
+ // 1 0: 512-bit vector
+ //
+ uint8_t VEX_L = (TSFlags & X86II::VEX_L) ? 1 : 0;
+ uint8_t EVEX_L2 = (TSFlags & X86II::EVEX_L2) ? 1 : 0;
+
+ // VEX_PP: opcode extension providing equivalent
+ // functionality of a SIMD prefix
+ //
+ // 0b00: None
+ // 0b01: 66
+ // 0b10: F3
+ // 0b11: F2
+ //
+ uint8_t VEX_PP = 0;
+ switch (TSFlags & X86II::OpPrefixMask) {
+ case X86II::PD:
+ VEX_PP = 0x1;
+ break; // 66
+ case X86II::XS:
+ VEX_PP = 0x2;
+ break; // F3
+ case X86II::XD:
+ VEX_PP = 0x3;
+ break; // F2
+ }
+
+ // EVEX_U
+ uint8_t EVEX_U = 1; // Always '1' so far
+
+ // EVEX_z
+ uint8_t EVEX_z = (HasEVEX_K && (TSFlags & X86II::EVEX_Z)) ? 1 : 0;
+
+ // EVEX_b
+ uint8_t EVEX_b = (TSFlags & X86II::EVEX_B) ? 1 : 0;
+
+ // EVEX_rc
+ uint8_t EVEX_rc = 0;
+
+ // EVEX_aaa
+ uint8_t EVEX_aaa = 0;
+
+ bool EncodeRC = false;
+
+ // Classify VEX_B, VEX_4V, VEX_R, VEX_X
+ unsigned NumOps = Desc.getNumOperands();
+ unsigned CurOp = X86II::getOperandBias(Desc);
+
+ switch (TSFlags & X86II::FormMask) {
+ default:
+ llvm_unreachable("Unexpected form in emitVEXOpcodePrefix!");
+ case X86II::MRM_C0:
+ case X86II::RawFrm:
+ case X86II::PrefixByte:
+ break;
+ case X86II::MRMDestMemFSIB:
+ case X86II::MRMDestMem: {
+ // MRMDestMem instruction forms:
+ // MemAddr, src1(ModR/M)
+ // MemAddr, src1(VEX_4V), src2(ModR/M)
+ // MemAddr, src1(ModR/M), imm8
+ //
+ unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
+ VEX_B = ~(BaseRegEnc >> 3) & 1;
+ unsigned IndexRegEnc =
+ getX86RegEncoding(MI, MemOperand + X86::AddrIndexReg);
+ VEX_X = ~(IndexRegEnc >> 3) & 1;
+ if (!HasVEX_4V) // Only needed with VSIB, which doesn't use VVVV.
+ EVEX_V2 = ~(IndexRegEnc >> 4) & 1;
+
+ CurOp += X86::AddrNumOperands;
+
+ if (HasEVEX_K)
+ EVEX_aaa = getX86RegEncoding(MI, CurOp++);
+
+ if (HasVEX_4V) {
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+ EVEX_V2 = ~(VRegEnc >> 4) & 1;
+ }
+
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+ EVEX_R2 = ~(RegEnc >> 4) & 1;
+ break;
+ }
+ case X86II::MRMSrcMemFSIB:
+ case X86II::MRMSrcMem: {
+ // MRMSrcMem instruction forms:
+ // src1(ModR/M), MemAddr
+ // src1(ModR/M), src2(VEX_4V), MemAddr
+ // src1(ModR/M), MemAddr, imm8
+ // src1(ModR/M), MemAddr, src2(Imm[7:4])
+ //
+ // FMA4:
+ // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(Imm[7:4])
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+ EVEX_R2 = ~(RegEnc >> 4) & 1;
+
+ if (HasEVEX_K)
+ EVEX_aaa = getX86RegEncoding(MI, CurOp++);
+
+ if (HasVEX_4V) {
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+ EVEX_V2 = ~(VRegEnc >> 4) & 1;
+ }
+
+ unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
+ VEX_B = ~(BaseRegEnc >> 3) & 1;
+ unsigned IndexRegEnc =
+ getX86RegEncoding(MI, MemOperand + X86::AddrIndexReg);
+ VEX_X = ~(IndexRegEnc >> 3) & 1;
+ if (!HasVEX_4V) // Only needed with VSIB, which doesn't use VVVV.
+ EVEX_V2 = ~(IndexRegEnc >> 4) & 1;
+
+ break;
+ }
+ case X86II::MRMSrcMem4VOp3: {
+ // Instruction format for 4VOp3:
+ // src1(ModR/M), MemAddr, src3(VEX_4V)
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+
+ unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
+ VEX_B = ~(BaseRegEnc >> 3) & 1;
+ unsigned IndexRegEnc =
+ getX86RegEncoding(MI, MemOperand + X86::AddrIndexReg);
+ VEX_X = ~(IndexRegEnc >> 3) & 1;
+
+ VEX_4V = ~getX86RegEncoding(MI, CurOp + X86::AddrNumOperands) & 0xf;
+ break;
+ }
+ case X86II::MRMSrcMemOp4: {
+ // dst(ModR/M.reg), src1(VEX_4V), src2(Imm[7:4]), src3(ModR/M),
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+
+ unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
+ VEX_B = ~(BaseRegEnc >> 3) & 1;
+ unsigned IndexRegEnc =
+ getX86RegEncoding(MI, MemOperand + X86::AddrIndexReg);
+ VEX_X = ~(IndexRegEnc >> 3) & 1;
+ break;
+ }
+ case X86II::MRM0m:
+ case X86II::MRM1m:
+ case X86II::MRM2m:
+ case X86II::MRM3m:
+ case X86II::MRM4m:
+ case X86II::MRM5m:
+ case X86II::MRM6m:
+ case X86II::MRM7m: {
+ // MRM0m-MRM7m instruction forms:
+ // MemAddr
+ // src1(VEX_4V), MemAddr
+ if (HasVEX_4V) {
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+ EVEX_V2 = ~(VRegEnc >> 4) & 1;
+ }
+
+ if (HasEVEX_K)
+ EVEX_aaa = getX86RegEncoding(MI, CurOp++);
+
+ unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
+ VEX_B = ~(BaseRegEnc >> 3) & 1;
+ unsigned IndexRegEnc =
+ getX86RegEncoding(MI, MemOperand + X86::AddrIndexReg);
+ VEX_X = ~(IndexRegEnc >> 3) & 1;
+ if (!HasVEX_4V) // Only needed with VSIB, which doesn't use VVVV.
+ EVEX_V2 = ~(IndexRegEnc >> 4) & 1;
+
+ break;
+ }
+ case X86II::MRMSrcReg: {
+ // MRMSrcReg instruction forms:
+ // dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(Imm[7:4])
+ // dst(ModR/M), src1(ModR/M)
+ // dst(ModR/M), src1(ModR/M), imm8
+ //
+ // FMA4:
+ // dst(ModR/M.reg), src1(VEX_4V), src2(Imm[7:4]), src3(ModR/M),
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+ EVEX_R2 = ~(RegEnc >> 4) & 1;
+
+ if (HasEVEX_K)
+ EVEX_aaa = getX86RegEncoding(MI, CurOp++);
+
+ if (HasVEX_4V) {
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+ EVEX_V2 = ~(VRegEnc >> 4) & 1;
+ }
+
+ RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_B = ~(RegEnc >> 3) & 1;
+ VEX_X = ~(RegEnc >> 4) & 1;
+
+ if (EVEX_b) {
+ if (HasEVEX_RC) {
+ unsigned RcOperand = NumOps - 1;
+ assert(RcOperand >= CurOp);
+ EVEX_rc = MI.getOperand(RcOperand).getImm();
+ assert(EVEX_rc <= 3 && "Invalid rounding control!");
+ }
+ EncodeRC = true;
+ }
+ break;
+ }
+ case X86II::MRMSrcReg4VOp3: {
+ // Instruction format for 4VOp3:
+ // src1(ModR/M), src2(ModR/M), src3(VEX_4V)
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+
+ RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_B = ~(RegEnc >> 3) & 1;
+
+ VEX_4V = ~getX86RegEncoding(MI, CurOp++) & 0xf;
+ break;
+ }
+ case X86II::MRMSrcRegOp4: {
+ // dst(ModR/M.reg), src1(VEX_4V), src2(Imm[7:4]), src3(ModR/M),
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+
+ // Skip second register source (encoded in Imm[7:4])
+ ++CurOp;
+
+ RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_B = ~(RegEnc >> 3) & 1;
+ VEX_X = ~(RegEnc >> 4) & 1;
+ break;
+ }
+ case X86II::MRMDestReg: {
+ // MRMDestReg instruction forms:
+ // dst(ModR/M), src(ModR/M)
+ // dst(ModR/M), src(ModR/M), imm8
+ // dst(ModR/M), src1(VEX_4V), src2(ModR/M)
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_B = ~(RegEnc >> 3) & 1;
+ VEX_X = ~(RegEnc >> 4) & 1;
+
+ if (HasEVEX_K)
+ EVEX_aaa = getX86RegEncoding(MI, CurOp++);
+
+ if (HasVEX_4V) {
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+ EVEX_V2 = ~(VRegEnc >> 4) & 1;
+ }
+
+ RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+ EVEX_R2 = ~(RegEnc >> 4) & 1;
+ if (EVEX_b)
+ EncodeRC = true;
+ break;
+ }
+ case X86II::MRMr0: {
+ // MRMr0 instruction forms:
+ // 11:rrr:000
+ // dst(ModR/M)
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+ EVEX_R2 = ~(RegEnc >> 4) & 1;
+ break;
+ }
+ case X86II::MRM0r:
+ case X86II::MRM1r:
+ case X86II::MRM2r:
+ case X86II::MRM3r:
+ case X86II::MRM4r:
+ case X86II::MRM5r:
+ case X86II::MRM6r:
+ case X86II::MRM7r: {
+ // MRM0r-MRM7r instruction forms:
+ // dst(VEX_4V), src(ModR/M), imm8
+ if (HasVEX_4V) {
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+ EVEX_V2 = ~(VRegEnc >> 4) & 1;
+ }
+ if (HasEVEX_K)
+ EVEX_aaa = getX86RegEncoding(MI, CurOp++);
+
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_B = ~(RegEnc >> 3) & 1;
+ VEX_X = ~(RegEnc >> 4) & 1;
+ break;
+ }
+ }
+
+ if (Encoding == X86II::VEX || Encoding == X86II::XOP) {
+ // VEX opcode prefix can have 2 or 3 bytes
+ //
+ // 3 bytes:
+ // +-----+ +--------------+ +-------------------+
+ // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
+ // +-----+ +--------------+ +-------------------+
+ // 2 bytes:
+ // +-----+ +-------------------+
+ // | C5h | | R | vvvv | L | pp |
+ // +-----+ +-------------------+
+ //
+ // XOP uses a similar prefix:
+ // +-----+ +--------------+ +-------------------+
+ // | 8Fh | | RXB | m-mmmm | | W | vvvv | L | pp |
+ // +-----+ +--------------+ +-------------------+
+ uint8_t LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
+
+ // Can we use the 2 byte VEX prefix?
+ if (!(MI.getFlags() & X86::IP_USE_VEX3) && Encoding == X86II::VEX &&
+ VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) {
+ emitByte(0xC5, OS);
+ emitByte(LastByte | (VEX_R << 7), OS);
+ return;
+ }
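+ // Editor's illustration: for a VEX.128.66.0F instruction with no extended
+ // registers, VEX.W clear and vvvv unused, the two-byte form above emits
+ // C5 F9.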
+
+ // 3 byte VEX prefix
+ emitByte(Encoding == X86II::XOP ? 0x8F : 0xC4, OS);
+ emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, OS);
+ emitByte(LastByte | (VEX_W << 7), OS);
+ } else {
+ assert(Encoding == X86II::EVEX && "unknown encoding!");
+ // The EVEX opcode prefix is always 4 bytes
+ //
+ // +-----+ +--------------+ +-------------------+ +------------------------+
+ // | 62h | | RXBR' | 00mm | | W | vvvv | U | pp | | z | L'L | b | v' | aaa |
+ // +-----+ +--------------+ +-------------------+ +------------------------+
+ assert((VEX_5M & 0x3) == VEX_5M &&
+ "More than 2 significant bits in VEX.m-mmmm fields for EVEX!");
+
+ emitByte(0x62, OS);
+ emitByte((VEX_R << 7) | (VEX_X << 6) | (VEX_B << 5) | (EVEX_R2 << 4) |
+ VEX_5M,
+ OS);
+ emitByte((VEX_W << 7) | (VEX_4V << 3) | (EVEX_U << 2) | VEX_PP, OS);
+ if (EncodeRC)
+ emitByte((EVEX_z << 7) | (EVEX_rc << 5) | (EVEX_b << 4) | (EVEX_V2 << 3) |
+ EVEX_aaa,
+ OS);
+ else
+ emitByte((EVEX_z << 7) | (EVEX_L2 << 6) | (VEX_L << 5) | (EVEX_b << 4) |
+ (EVEX_V2 << 3) | EVEX_aaa,
+ OS);
+ }
+}
+
+/// Emit REX prefix which specifies
+/// 1) 64-bit instructions,
+/// 2) non-default operand size, and
+/// 3) use of X86-64 extended registers.
+///
+/// \returns true if REX prefix is used, otherwise returns false.
+bool X86MCCodeEmitter::emitREXPrefix(int MemOperand, const MCInst &MI,
+ const MCSubtargetInfo &STI,
+ raw_ostream &OS) const {
+ uint8_t REX = [&, MemOperand]() {
+ uint8_t REX = 0;
+ bool UsesHighByteReg = false;
+
+ const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+
+ if (TSFlags & X86II::REX_W)
+ REX |= 1 << 3; // set REX.W
+
+ if (MI.getNumOperands() == 0)
+ return REX;
+
+ unsigned NumOps = MI.getNumOperands();
+ unsigned CurOp = X86II::getOperandBias(Desc);
+
+ // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
+ for (unsigned i = CurOp; i != NumOps; ++i) {
+ const MCOperand &MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ unsigned Reg = MO.getReg();
+ if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH ||
+ Reg == X86::DH)
+ UsesHighByteReg = true;
+ if (X86II::isX86_64NonExtLowByteReg(Reg))
+ // FIXME: The caller of determineREXPrefix slaps this prefix onto
+ // anything that returns non-zero.
+ REX |= 0x40; // REX fixed encoding prefix
+ } else if (MO.isExpr() &&
+ STI.getTargetTriple().getEnvironment() == Triple::GNUX32) {
+ // GOTTPOFF and TLSDESC relocations require a REX prefix to allow
+ // linker optimizations: even if the instructions we see may not require
+ // any prefix, they may be replaced by instructions that do. This is
+ // handled as a special case here so that it also works for hand-written
+ // assembly without the user needing to write REX, as with GNU as.
+ const auto *Ref = dyn_cast<MCSymbolRefExpr>(MO.getExpr());
+ if (Ref && (Ref->getKind() == MCSymbolRefExpr::VK_GOTTPOFF ||
+ Ref->getKind() == MCSymbolRefExpr::VK_TLSDESC)) {
+ REX |= 0x40; // REX fixed encoding prefix
+ }
+ }
+ }
+
+ switch (TSFlags & X86II::FormMask) {
+ case X86II::AddRegFrm:
+ REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
+ break;
+ case X86II::MRMSrcReg:
+ case X86II::MRMSrcRegCC:
+ REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
+ REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
+ break;
+ case X86II::MRMSrcMem:
+ case X86II::MRMSrcMemCC:
+ REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X
+ CurOp += X86::AddrNumOperands;
+ break;
+ case X86II::MRMDestReg:
+ REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
+ REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
+ break;
+ case X86II::MRMDestMem:
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X
+ CurOp += X86::AddrNumOperands;
+ REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
+ break;
+ case X86II::MRMXmCC:
+ case X86II::MRMXm:
+ case X86II::MRM0m:
+ case X86II::MRM1m:
+ case X86II::MRM2m:
+ case X86II::MRM3m:
+ case X86II::MRM4m:
+ case X86II::MRM5m:
+ case X86II::MRM6m:
+ case X86II::MRM7m:
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X
+ break;
+ case X86II::MRMXrCC:
+ case X86II::MRMXr:
+ case X86II::MRM0r:
+ case X86II::MRM1r:
+ case X86II::MRM2r:
+ case X86II::MRM3r:
+ case X86II::MRM4r:
+ case X86II::MRM5r:
+ case X86II::MRM6r:
+ case X86II::MRM7r:
+ REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
+ break;
+ case X86II::MRMr0:
+ REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
+ break;
+ case X86II::MRMDestMemFSIB:
+ llvm_unreachable("FSIB format never need REX prefix!");
+ }
+ if (REX && UsesHighByteReg)
+ report_fatal_error(
+ "Cannot encode high byte register in REX-prefixed instruction");
+ return REX;
+ }();
+
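+ // Editor's illustration: REX.W together with REX.B gives REX == 0x09, so the
+ // byte emitted below is 0x49 (e.g. a 64-bit operand instruction whose ModRM
+ // base register is one of r8-r15).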
+ if (!REX)
+ return false;
+
+ emitByte(0x40 | REX, OS);
+ return true;
+}
+
+/// Emit segment override opcode prefix as needed.
+void X86MCCodeEmitter::emitSegmentOverridePrefix(unsigned SegOperand,
+ const MCInst &MI,
+ raw_ostream &OS) const {
+ // Check for explicit segment override on memory operand.
+ if (unsigned Reg = MI.getOperand(SegOperand).getReg())
+ emitByte(X86::getSegmentOverridePrefixForReg(Reg), OS);
+}
+
+/// Emit all instruction prefixes prior to the opcode.
+///
+/// \param MemOperand the operand # of the start of a memory operand if present.
+/// If not present, it is -1.
+///
+/// \returns true if REX prefix is used, otherwise returns false.
+bool X86MCCodeEmitter::emitOpcodePrefix(int MemOperand, const MCInst &MI,
+ const MCSubtargetInfo &STI,
+ raw_ostream &OS) const {
+ const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+
+ // Emit the operand size opcode prefix as needed.
+ if ((TSFlags & X86II::OpSizeMask) ==
+ (STI.hasFeature(X86::Mode16Bit) ? X86II::OpSize32 : X86II::OpSize16))
+ emitByte(0x66, OS);
+
+ // Emit the LOCK opcode prefix.
+ if (TSFlags & X86II::LOCK || MI.getFlags() & X86::IP_HAS_LOCK)
+ emitByte(0xF0, OS);
+
+ // Emit the NOTRACK opcode prefix.
+ if (TSFlags & X86II::NOTRACK || MI.getFlags() & X86::IP_HAS_NOTRACK)
+ emitByte(0x3E, OS);
+
+ switch (TSFlags & X86II::OpPrefixMask) {
+ case X86II::PD: // 66
+ emitByte(0x66, OS);
+ break;
+ case X86II::XS: // F3
+ emitByte(0xF3, OS);
+ break;
+ case X86II::XD: // F2
+ emitByte(0xF2, OS);
+ break;
+ }
+
+ // Handle REX prefix.
+ assert((STI.hasFeature(X86::Mode64Bit) || !(TSFlags & X86II::REX_W)) &&
+ "REX.W requires 64bit mode.");
+ bool HasREX = STI.hasFeature(X86::Mode64Bit)
+ ? emitREXPrefix(MemOperand, MI, STI, OS)
+ : false;
+
+ // 0x0F escape code must be emitted just before the opcode.
+ switch (TSFlags & X86II::OpMapMask) {
+ case X86II::TB: // Two-byte opcode map
+ case X86II::T8: // 0F 38
+ case X86II::TA: // 0F 3A
+ case X86II::ThreeDNow: // 0F 0F, second 0F emitted by caller.
+ emitByte(0x0F, OS);
+ break;
+ }
+
+ switch (TSFlags & X86II::OpMapMask) {
+ case X86II::T8: // 0F 38
+ emitByte(0x38, OS);
+ break;
+ case X86II::TA: // 0F 3A
+ emitByte(0x3A, OS);
+ break;
+ }
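+ // Editor's note: e.g. an instruction from the 0F 38 map gets both escape
+ // bytes (0F, then 38) emitted by the two switches above, before the caller
+ // emits the base opcode byte.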
+
+ return HasREX;
+}
+
+void X86MCCodeEmitter::emitPrefix(const MCInst &MI, raw_ostream &OS,
+ const MCSubtargetInfo &STI) const {
+ unsigned Opcode = MI.getOpcode();
+ const MCInstrDesc &Desc = MCII.get(Opcode);
+ uint64_t TSFlags = Desc.TSFlags;
+
+ // Pseudo instructions don't get encoded.
+ if (X86II::isPseudo(TSFlags))
+ return;
+
+ unsigned CurOp = X86II::getOperandBias(Desc);
+
+ emitPrefixImpl(CurOp, MI, STI, OS);
+}
+
+void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ unsigned Opcode = MI.getOpcode();
+ const MCInstrDesc &Desc = MCII.get(Opcode);
+ uint64_t TSFlags = Desc.TSFlags;
+
+ // Pseudo instructions don't get encoded.
+ if (X86II::isPseudo(TSFlags))
+ return;
+
+ unsigned NumOps = Desc.getNumOperands();
+ unsigned CurOp = X86II::getOperandBias(Desc);
+
+ uint64_t StartByte = OS.tell();
+
+ bool HasREX = emitPrefixImpl(CurOp, MI, STI, OS);
+
+ // Does it use the VEX.VVVV field?
+ bool HasVEX_4V = TSFlags & X86II::VEX_4V;
+ bool HasVEX_I8Reg = (TSFlags & X86II::ImmMask) == X86II::Imm8Reg;
+
+ // Does it use the EVEX.aaa field?
+ bool HasEVEX_K = TSFlags & X86II::EVEX_K;
+ bool HasEVEX_RC = TSFlags & X86II::EVEX_RC;
+
+ // Used if a register is encoded in 7:4 of immediate.
+ unsigned I8RegNum = 0;
+
+ uint8_t BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
+
+ if ((TSFlags & X86II::OpMapMask) == X86II::ThreeDNow)
+ BaseOpcode = 0x0F; // Weird 3DNow! encoding.
+
+ unsigned OpcodeOffset = 0;
+
+ uint64_t Form = TSFlags & X86II::FormMask;
+ switch (Form) {
+ default:
+ errs() << "FORM: " << Form << "\n";
+ llvm_unreachable("Unknown FormMask value in X86MCCodeEmitter!");
+ case X86II::Pseudo:
+ llvm_unreachable("Pseudo instruction shouldn't be emitted");
+ case X86II::RawFrmDstSrc:
+ case X86II::RawFrmSrc:
+ case X86II::RawFrmDst:
+ case X86II::PrefixByte:
+ emitByte(BaseOpcode, OS);
+ break;
+ case X86II::AddCCFrm: {
+ // This will be added to the opcode in the fallthrough.
+ OpcodeOffset = MI.getOperand(NumOps - 1).getImm();
+ assert(OpcodeOffset < 16 && "Unexpected opcode offset!");
+ --NumOps; // Drop the operand from the end.
+ LLVM_FALLTHROUGH;
+ case X86II::RawFrm:
+ emitByte(BaseOpcode + OpcodeOffset, OS);
+
+ if (!STI.hasFeature(X86::Mode64Bit) || !isPCRel32Branch(MI, MCII))
+ break;
+
+ const MCOperand &Op = MI.getOperand(CurOp++);
+ emitImmediate(Op, MI.getLoc(), X86II::getSizeOfImm(TSFlags),
+ MCFixupKind(X86::reloc_branch_4byte_pcrel), StartByte, OS,
+ Fixups);
+ break;
+ }
+ case X86II::RawFrmMemOffs:
+ emitByte(BaseOpcode, OS);
+ emitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+ X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+ StartByte, OS, Fixups);
+ ++CurOp; // skip segment operand
+ break;
+ case X86II::RawFrmImm8:
+ emitByte(BaseOpcode, OS);
+ emitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+ X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+ StartByte, OS, Fixups);
+ emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, StartByte,
+ OS, Fixups);
+ break;
+ case X86II::RawFrmImm16:
+ emitByte(BaseOpcode, OS);
+ emitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+ X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+ StartByte, OS, Fixups);
+ emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, StartByte,
+ OS, Fixups);
+ break;
+
+ case X86II::AddRegFrm:
+ emitByte(BaseOpcode + getX86RegNum(MI.getOperand(CurOp++)), OS);
+ break;
+
+ case X86II::MRMDestReg: {
+ emitByte(BaseOpcode, OS);
+ unsigned SrcRegNum = CurOp + 1;
+
+ if (HasEVEX_K) // Skip writemask
+ ++SrcRegNum;
+
+ if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+ ++SrcRegNum;
+
+ emitRegModRMByte(MI.getOperand(CurOp),
+ getX86RegNum(MI.getOperand(SrcRegNum)), OS);
+ CurOp = SrcRegNum + 1;
+ break;
+ }
+ case X86II::MRMDestMemFSIB:
+ case X86II::MRMDestMem: {
+ emitByte(BaseOpcode, OS);
+ unsigned SrcRegNum = CurOp + X86::AddrNumOperands;
+
+ if (HasEVEX_K) // Skip writemask
+ ++SrcRegNum;
+
+ if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+ ++SrcRegNum;
+
+ bool ForceSIB = (Form == X86II::MRMDestMemFSIB);
+ emitMemModRMByte(MI, CurOp, getX86RegNum(MI.getOperand(SrcRegNum)), TSFlags,
+ HasREX, StartByte, OS, Fixups, STI, ForceSIB);
+ CurOp = SrcRegNum + 1;
+ break;
+ }
+ case X86II::MRMSrcReg: {
+ emitByte(BaseOpcode, OS);
+ unsigned SrcRegNum = CurOp + 1;
+
+ if (HasEVEX_K) // Skip writemask
+ ++SrcRegNum;
+
+ if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+ ++SrcRegNum;
+
+ emitRegModRMByte(MI.getOperand(SrcRegNum),
+ getX86RegNum(MI.getOperand(CurOp)), OS);
+ CurOp = SrcRegNum + 1;
+ if (HasVEX_I8Reg)
+ I8RegNum = getX86RegEncoding(MI, CurOp++);
+ // Do not count the rounding control operand.
+ if (HasEVEX_RC)
+ --NumOps;
+ break;
+ }
+ case X86II::MRMSrcReg4VOp3: {
+ emitByte(BaseOpcode, OS);
+ unsigned SrcRegNum = CurOp + 1;
+
+ emitRegModRMByte(MI.getOperand(SrcRegNum),
+ getX86RegNum(MI.getOperand(CurOp)), OS);
+ CurOp = SrcRegNum + 1;
+ ++CurOp; // Encoded in VEX.VVVV
+ break;
+ }
+ case X86II::MRMSrcRegOp4: {
+ emitByte(BaseOpcode, OS);
+ unsigned SrcRegNum = CurOp + 1;
+
+ // Skip 1st src (which is encoded in VEX_VVVV)
+ ++SrcRegNum;
+
+ // Capture 2nd src (which is encoded in Imm[7:4])
+ assert(HasVEX_I8Reg && "MRMSrcRegOp4 should imply VEX_I8Reg");
+ I8RegNum = getX86RegEncoding(MI, SrcRegNum++);
+
+ emitRegModRMByte(MI.getOperand(SrcRegNum),
+ getX86RegNum(MI.getOperand(CurOp)), OS);
+ CurOp = SrcRegNum + 1;
+ break;
+ }
+ case X86II::MRMSrcRegCC: {
+ unsigned FirstOp = CurOp++;
+ unsigned SecondOp = CurOp++;
+
+ unsigned CC = MI.getOperand(CurOp++).getImm();
+ emitByte(BaseOpcode + CC, OS);
+
+ emitRegModRMByte(MI.getOperand(SecondOp),
+ getX86RegNum(MI.getOperand(FirstOp)), OS);
+ break;
+ }
+ case X86II::MRMSrcMemFSIB:
+ case X86II::MRMSrcMem: {
+ unsigned FirstMemOp = CurOp + 1;
+
+ if (HasEVEX_K) // Skip writemask
+ ++FirstMemOp;
+
+ if (HasVEX_4V)
+ ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).
+
+ emitByte(BaseOpcode, OS);
+
+ bool ForceSIB = (Form == X86II::MRMSrcMemFSIB);
+ emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(CurOp)),
+ TSFlags, HasREX, StartByte, OS, Fixups, STI, ForceSIB);
+ CurOp = FirstMemOp + X86::AddrNumOperands;
+ if (HasVEX_I8Reg)
+ I8RegNum = getX86RegEncoding(MI, CurOp++);
+ break;
+ }
+ case X86II::MRMSrcMem4VOp3: {
+ unsigned FirstMemOp = CurOp + 1;
+
+ emitByte(BaseOpcode, OS);
+
+ emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(CurOp)),
+ TSFlags, HasREX, StartByte, OS, Fixups, STI);
+ CurOp = FirstMemOp + X86::AddrNumOperands;
+ ++CurOp; // Encoded in VEX.VVVV.
+ break;
+ }
+ case X86II::MRMSrcMemOp4: {
+ unsigned FirstMemOp = CurOp + 1;
+
+ ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).
+
+ // Capture second register source (encoded in Imm[7:4])
+ assert(HasVEX_I8Reg && "MRMSrcMemOp4 should imply VEX_I8Reg");
+ I8RegNum = getX86RegEncoding(MI, FirstMemOp++);
+
+ emitByte(BaseOpcode, OS);
+
+ emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(CurOp)),
+ TSFlags, HasREX, StartByte, OS, Fixups, STI);
+ CurOp = FirstMemOp + X86::AddrNumOperands;
+ break;
+ }
+ case X86II::MRMSrcMemCC: {
+ unsigned RegOp = CurOp++;
+ unsigned FirstMemOp = CurOp;
+ CurOp = FirstMemOp + X86::AddrNumOperands;
+
+ unsigned CC = MI.getOperand(CurOp++).getImm();
+ emitByte(BaseOpcode + CC, OS);
+
+ emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(RegOp)),
+ TSFlags, HasREX, StartByte, OS, Fixups, STI);
+ break;
+ }
+
+ case X86II::MRMXrCC: {
+ unsigned RegOp = CurOp++;
+
+ unsigned CC = MI.getOperand(CurOp++).getImm();
+ emitByte(BaseOpcode + CC, OS);
+ emitRegModRMByte(MI.getOperand(RegOp), 0, OS);
+ break;
+ }
+
+ case X86II::MRMXr:
+ case X86II::MRM0r:
+ case X86II::MRM1r:
+ case X86II::MRM2r:
+ case X86II::MRM3r:
+ case X86II::MRM4r:
+ case X86II::MRM5r:
+ case X86II::MRM6r:
+ case X86II::MRM7r:
+ if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+ ++CurOp;
+ if (HasEVEX_K) // Skip writemask
+ ++CurOp;
+ emitByte(BaseOpcode, OS);
+ emitRegModRMByte(MI.getOperand(CurOp++),
+ (Form == X86II::MRMXr) ? 0 : Form - X86II::MRM0r, OS);
+ break;
+ case X86II::MRMr0:
+ emitByte(BaseOpcode, OS);
+ emitByte(modRMByte(3, getX86RegNum(MI.getOperand(CurOp++)), 0), OS);
+ break;
+
+ case X86II::MRMXmCC: {
+ unsigned FirstMemOp = CurOp;
+ CurOp = FirstMemOp + X86::AddrNumOperands;
+
+ unsigned CC = MI.getOperand(CurOp++).getImm();
+ emitByte(BaseOpcode + CC, OS);
+
+ emitMemModRMByte(MI, FirstMemOp, 0, TSFlags, HasREX, StartByte, OS, Fixups,
+ STI);
+ break;
+ }
+
+ case X86II::MRMXm:
+ case X86II::MRM0m:
+ case X86II::MRM1m:
+ case X86II::MRM2m:
+ case X86II::MRM3m:
+ case X86II::MRM4m:
+ case X86II::MRM5m:
+ case X86II::MRM6m:
+ case X86II::MRM7m:
+ if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+ ++CurOp;
+ if (HasEVEX_K) // Skip writemask
+ ++CurOp;
+ emitByte(BaseOpcode, OS);
+ emitMemModRMByte(MI, CurOp,
+ (Form == X86II::MRMXm) ? 0 : Form - X86II::MRM0m, TSFlags,
+ HasREX, StartByte, OS, Fixups, STI);
+ CurOp += X86::AddrNumOperands;
+ break;
+
+ case X86II::MRM0X:
+ case X86II::MRM1X:
+ case X86II::MRM2X:
+ case X86II::MRM3X:
+ case X86II::MRM4X:
+ case X86II::MRM5X:
+ case X86II::MRM6X:
+ case X86II::MRM7X:
+ emitByte(BaseOpcode, OS);
+ emitByte(0xC0 + ((Form - X86II::MRM0X) << 3), OS);
+ break;
+
+ case X86II::MRM_C0:
+ case X86II::MRM_C1:
+ case X86II::MRM_C2:
+ case X86II::MRM_C3:
+ case X86II::MRM_C4:
+ case X86II::MRM_C5:
+ case X86II::MRM_C6:
+ case X86II::MRM_C7:
+ case X86II::MRM_C8:
+ case X86II::MRM_C9:
+ case X86II::MRM_CA:
+ case X86II::MRM_CB:
+ case X86II::MRM_CC:
+ case X86II::MRM_CD:
+ case X86II::MRM_CE:
+ case X86II::MRM_CF:
+ case X86II::MRM_D0:
+ case X86II::MRM_D1:
+ case X86II::MRM_D2:
+ case X86II::MRM_D3:
+ case X86II::MRM_D4:
+ case X86II::MRM_D5:
+ case X86II::MRM_D6:
+ case X86II::MRM_D7:
+ case X86II::MRM_D8:
+ case X86II::MRM_D9:
+ case X86II::MRM_DA:
+ case X86II::MRM_DB:
+ case X86II::MRM_DC:
+ case X86II::MRM_DD:
+ case X86II::MRM_DE:
+ case X86II::MRM_DF:
+ case X86II::MRM_E0:
+ case X86II::MRM_E1:
+ case X86II::MRM_E2:
+ case X86II::MRM_E3:
+ case X86II::MRM_E4:
+ case X86II::MRM_E5:
+ case X86II::MRM_E6:
+ case X86II::MRM_E7:
+ case X86II::MRM_E8:
+ case X86II::MRM_E9:
+ case X86II::MRM_EA:
+ case X86II::MRM_EB:
+ case X86II::MRM_EC:
+ case X86II::MRM_ED:
+ case X86II::MRM_EE:
+ case X86II::MRM_EF:
+ case X86II::MRM_F0:
+ case X86II::MRM_F1:
+ case X86II::MRM_F2:
+ case X86II::MRM_F3:
+ case X86II::MRM_F4:
+ case X86II::MRM_F5:
+ case X86II::MRM_F6:
+ case X86II::MRM_F7:
+ case X86II::MRM_F8:
+ case X86II::MRM_F9:
+ case X86II::MRM_FA:
+ case X86II::MRM_FB:
+ case X86II::MRM_FC:
+ case X86II::MRM_FD:
+ case X86II::MRM_FE:
+ case X86II::MRM_FF:
+ emitByte(BaseOpcode, OS);
+ emitByte(0xC0 + Form - X86II::MRM_C0, OS);
+ break;
+ }
+
+ if (HasVEX_I8Reg) {
+ // The last source register of a 4-operand AVX instruction is encoded in
+ // bits [7:4] of an immediate byte.
+ assert(I8RegNum < 16 && "Register encoding out of range");
+ I8RegNum <<= 4;
+ if (CurOp != NumOps) {
+ unsigned Val = MI.getOperand(CurOp++).getImm();
+ assert(Val < 16 && "Immediate operand value out of range");
+ I8RegNum |= Val;
+ }
+ emitImmediate(MCOperand::createImm(I8RegNum), MI.getLoc(), 1, FK_Data_1,
+ StartByte, OS, Fixups);
+ } else {
+ // If there is a remaining operand, it must be a trailing immediate. Emit it
+ // according to the right size for the instruction. Some instructions
+ // (SSE4a extrq and insertq) have two trailing immediates.
+ while (CurOp != NumOps && NumOps - CurOp <= 2) {
+ emitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+ X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+ StartByte, OS, Fixups);
+ }
+ }
+
+ if ((TSFlags & X86II::OpMapMask) == X86II::ThreeDNow)
+ emitByte(X86II::getBaseOpcodeFor(TSFlags), OS);
+
+ assert(OS.tell() - StartByte <= 15 &&
+        "The size of the instruction must not exceed 15 bytes.");
+#ifndef NDEBUG
+ // FIXME: Verify.
+ if (/*!Desc.isVariadic() &&*/ CurOp != NumOps) {
+ errs() << "Cannot encode all operands of: ";
+ MI.dump();
+ errs() << '\n';
+ abort();
+ }
+#endif
+}
+
+MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx) {
+ return new X86MCCodeEmitter(MCII, Ctx);
+}
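For context, a minimal sketch of how a client might drive the emitter created above once the target has been registered. This is an illustrative assumption, not part of this change: MCII, MRI, Ctx, STI and Inst are taken to be an already constructed MCInstrInfo, MCRegisterInfo, MCContext, MCSubtargetInfo and MCInst for an x86 triple.

    #include "llvm/ADT/SmallString.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/MC/MCCodeEmitter.h"
    #include "llvm/MC/MCFixup.h"
    #include "llvm/Support/raw_ostream.h"
    #include <memory>

    // Hypothetical usage sketch (not part of this patch).
    std::unique_ptr<llvm::MCCodeEmitter> CE(
        llvm::createX86MCCodeEmitter(MCII, MRI, Ctx));
    llvm::SmallString<32> Code;                 // receives the encoded bytes
    llvm::raw_svector_ostream VecOS(Code);      // stream over that buffer
    llvm::SmallVector<llvm::MCFixup, 4> Fixups; // fixups discovered while encoding
    CE->encodeInstruction(Inst, VecOS, Fixups, STI);
    // Code.size() is at most 15, matching the assertion in encodeInstruction().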
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h
new file mode 100644
index 000000000000..532fecd9951b
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h
@@ -0,0 +1,79 @@
+//=--- X86MCExpr.h - X86 specific MC expression classes ---*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes X86-specific MCExprs, i.e., registers used for
+// extended variable assignments.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCEXPR_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCEXPR_H
+
+#include "X86ATTInstPrinter.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+class X86MCExpr : public MCTargetExpr {
+
+private:
+ const int64_t RegNo; // The register number.
+
+ explicit X86MCExpr(int64_t R) : RegNo(R) {}
+
+public:
+ /// @name Construction
+ /// @{
+
+ static const X86MCExpr *create(int64_t RegNo, MCContext &Ctx) {
+ return new (Ctx) X86MCExpr(RegNo);
+ }
+
+ /// @}
+ /// @name Accessors
+ /// @{
+
+ /// getRegNo - Get the register number wrapped by this expression.
+ int64_t getRegNo() const { return RegNo; }
+
+ /// @}
+
+ void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override {
+ if (!MAI || MAI->getAssemblerDialect() == 0)
+ OS << '%';
+ OS << X86ATTInstPrinter::getRegisterName(RegNo);
+ }
+
+ bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const override {
+ return false;
+ }
+ // Register values should be inlined as they are not valid .set expressions.
+ bool inlineAssignedExpr() const override { return true; }
+ bool isEqualTo(const MCExpr *X) const override {
+ if (auto *E = dyn_cast<X86MCExpr>(X))
+ return getRegNo() == E->getRegNo();
+ return false;
+ }
+ void visitUsedExpr(MCStreamer &Streamer) const override {}
+ MCFragment *findAssociatedFragment() const override { return nullptr; }
+
+ // There are no TLS X86MCExprs at the moment.
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
+
+ static bool classof(const MCExpr *E) {
+ return E->getKind() == MCExpr::Target;
+ }
+};
+
+} // end namespace llvm
+
+#endif
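A short sketch of how this target expression is typically created and inspected; Ctx is assumed to be an existing MCContext, and the register enum comes from X86GenRegisterInfo.inc (illustrative only, not part of this change):

    // Wrap a register operand in an X86MCExpr so it can flow through MCExpr APIs.
    const llvm::X86MCExpr *RegExpr = llvm::X86MCExpr::create(X86::EAX, Ctx);
    assert(RegExpr->getRegNo() == X86::EAX);
    // With assembler dialect 0 (AT&T), printImpl() renders this as "%eax".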
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
new file mode 100644
index 000000000000..5cf8d77519d9
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -0,0 +1,790 @@
+//===-- X86MCTargetDesc.cpp - X86 Target Descriptions ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides X86 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCTargetDesc.h"
+#include "TargetInfo/X86TargetInfo.h"
+#include "X86ATTInstPrinter.h"
+#include "X86BaseInfo.h"
+#include "X86IntelInstPrinter.h"
+#include "X86MCAsmInfo.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_REGINFO_MC_DESC
+#include "X86GenRegisterInfo.inc"
+
+#define GET_INSTRINFO_MC_DESC
+#define GET_INSTRINFO_MC_HELPERS
+#include "X86GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "X86GenSubtargetInfo.inc"
+
+std::string X86_MC::ParseX86Triple(const Triple &TT) {
+ std::string FS;
+ // SSE2 should default to enabled in 64-bit mode, but can be turned off
+ // explicitly.
+ if (TT.isArch64Bit())
+ FS = "+64bit-mode,-32bit-mode,-16bit-mode,+sse2";
+ else if (TT.getEnvironment() != Triple::CODE16)
+ FS = "-64bit-mode,+32bit-mode,-16bit-mode";
+ else
+ FS = "-64bit-mode,-32bit-mode,+16bit-mode";
+
+ return FS;
+}
+
+unsigned X86_MC::getDwarfRegFlavour(const Triple &TT, bool isEH) {
+ if (TT.getArch() == Triple::x86_64)
+ return DWARFFlavour::X86_64;
+
+ if (TT.isOSDarwin())
+ return isEH ? DWARFFlavour::X86_32_DarwinEH : DWARFFlavour::X86_32_Generic;
+ if (TT.isOSCygMing())
+ // Not supported yet; just a quick fallback.
+ return DWARFFlavour::X86_32_Generic;
+ return DWARFFlavour::X86_32_Generic;
+}
+
+bool X86_MC::hasLockPrefix(const MCInst &MI) {
+ return MI.getFlags() & X86::IP_HAS_LOCK;
+}
+
+void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) {
+ // FIXME: TableGen these.
+ for (unsigned Reg = X86::NoRegister + 1; Reg < X86::NUM_TARGET_REGS; ++Reg) {
+ unsigned SEH = MRI->getEncodingValue(Reg);
+ MRI->mapLLVMRegToSEHReg(Reg, SEH);
+ }
+
+ // Mapping from CodeView to MC register id.
+ static const struct {
+ codeview::RegisterId CVReg;
+ MCPhysReg Reg;
+ } RegMap[] = {
+ {codeview::RegisterId::AL, X86::AL},
+ {codeview::RegisterId::CL, X86::CL},
+ {codeview::RegisterId::DL, X86::DL},
+ {codeview::RegisterId::BL, X86::BL},
+ {codeview::RegisterId::AH, X86::AH},
+ {codeview::RegisterId::CH, X86::CH},
+ {codeview::RegisterId::DH, X86::DH},
+ {codeview::RegisterId::BH, X86::BH},
+ {codeview::RegisterId::AX, X86::AX},
+ {codeview::RegisterId::CX, X86::CX},
+ {codeview::RegisterId::DX, X86::DX},
+ {codeview::RegisterId::BX, X86::BX},
+ {codeview::RegisterId::SP, X86::SP},
+ {codeview::RegisterId::BP, X86::BP},
+ {codeview::RegisterId::SI, X86::SI},
+ {codeview::RegisterId::DI, X86::DI},
+ {codeview::RegisterId::EAX, X86::EAX},
+ {codeview::RegisterId::ECX, X86::ECX},
+ {codeview::RegisterId::EDX, X86::EDX},
+ {codeview::RegisterId::EBX, X86::EBX},
+ {codeview::RegisterId::ESP, X86::ESP},
+ {codeview::RegisterId::EBP, X86::EBP},
+ {codeview::RegisterId::ESI, X86::ESI},
+ {codeview::RegisterId::EDI, X86::EDI},
+
+ {codeview::RegisterId::EFLAGS, X86::EFLAGS},
+
+ {codeview::RegisterId::ST0, X86::FP0},
+ {codeview::RegisterId::ST1, X86::FP1},
+ {codeview::RegisterId::ST2, X86::FP2},
+ {codeview::RegisterId::ST3, X86::FP3},
+ {codeview::RegisterId::ST4, X86::FP4},
+ {codeview::RegisterId::ST5, X86::FP5},
+ {codeview::RegisterId::ST6, X86::FP6},
+ {codeview::RegisterId::ST7, X86::FP7},
+
+ {codeview::RegisterId::MM0, X86::MM0},
+ {codeview::RegisterId::MM1, X86::MM1},
+ {codeview::RegisterId::MM2, X86::MM2},
+ {codeview::RegisterId::MM3, X86::MM3},
+ {codeview::RegisterId::MM4, X86::MM4},
+ {codeview::RegisterId::MM5, X86::MM5},
+ {codeview::RegisterId::MM6, X86::MM6},
+ {codeview::RegisterId::MM7, X86::MM7},
+
+ {codeview::RegisterId::XMM0, X86::XMM0},
+ {codeview::RegisterId::XMM1, X86::XMM1},
+ {codeview::RegisterId::XMM2, X86::XMM2},
+ {codeview::RegisterId::XMM3, X86::XMM3},
+ {codeview::RegisterId::XMM4, X86::XMM4},
+ {codeview::RegisterId::XMM5, X86::XMM5},
+ {codeview::RegisterId::XMM6, X86::XMM6},
+ {codeview::RegisterId::XMM7, X86::XMM7},
+
+ {codeview::RegisterId::XMM8, X86::XMM8},
+ {codeview::RegisterId::XMM9, X86::XMM9},
+ {codeview::RegisterId::XMM10, X86::XMM10},
+ {codeview::RegisterId::XMM11, X86::XMM11},
+ {codeview::RegisterId::XMM12, X86::XMM12},
+ {codeview::RegisterId::XMM13, X86::XMM13},
+ {codeview::RegisterId::XMM14, X86::XMM14},
+ {codeview::RegisterId::XMM15, X86::XMM15},
+
+ {codeview::RegisterId::SIL, X86::SIL},
+ {codeview::RegisterId::DIL, X86::DIL},
+ {codeview::RegisterId::BPL, X86::BPL},
+ {codeview::RegisterId::SPL, X86::SPL},
+ {codeview::RegisterId::RAX, X86::RAX},
+ {codeview::RegisterId::RBX, X86::RBX},
+ {codeview::RegisterId::RCX, X86::RCX},
+ {codeview::RegisterId::RDX, X86::RDX},
+ {codeview::RegisterId::RSI, X86::RSI},
+ {codeview::RegisterId::RDI, X86::RDI},
+ {codeview::RegisterId::RBP, X86::RBP},
+ {codeview::RegisterId::RSP, X86::RSP},
+ {codeview::RegisterId::R8, X86::R8},
+ {codeview::RegisterId::R9, X86::R9},
+ {codeview::RegisterId::R10, X86::R10},
+ {codeview::RegisterId::R11, X86::R11},
+ {codeview::RegisterId::R12, X86::R12},
+ {codeview::RegisterId::R13, X86::R13},
+ {codeview::RegisterId::R14, X86::R14},
+ {codeview::RegisterId::R15, X86::R15},
+ {codeview::RegisterId::R8B, X86::R8B},
+ {codeview::RegisterId::R9B, X86::R9B},
+ {codeview::RegisterId::R10B, X86::R10B},
+ {codeview::RegisterId::R11B, X86::R11B},
+ {codeview::RegisterId::R12B, X86::R12B},
+ {codeview::RegisterId::R13B, X86::R13B},
+ {codeview::RegisterId::R14B, X86::R14B},
+ {codeview::RegisterId::R15B, X86::R15B},
+ {codeview::RegisterId::R8W, X86::R8W},
+ {codeview::RegisterId::R9W, X86::R9W},
+ {codeview::RegisterId::R10W, X86::R10W},
+ {codeview::RegisterId::R11W, X86::R11W},
+ {codeview::RegisterId::R12W, X86::R12W},
+ {codeview::RegisterId::R13W, X86::R13W},
+ {codeview::RegisterId::R14W, X86::R14W},
+ {codeview::RegisterId::R15W, X86::R15W},
+ {codeview::RegisterId::R8D, X86::R8D},
+ {codeview::RegisterId::R9D, X86::R9D},
+ {codeview::RegisterId::R10D, X86::R10D},
+ {codeview::RegisterId::R11D, X86::R11D},
+ {codeview::RegisterId::R12D, X86::R12D},
+ {codeview::RegisterId::R13D, X86::R13D},
+ {codeview::RegisterId::R14D, X86::R14D},
+ {codeview::RegisterId::R15D, X86::R15D},
+ {codeview::RegisterId::AMD64_YMM0, X86::YMM0},
+ {codeview::RegisterId::AMD64_YMM1, X86::YMM1},
+ {codeview::RegisterId::AMD64_YMM2, X86::YMM2},
+ {codeview::RegisterId::AMD64_YMM3, X86::YMM3},
+ {codeview::RegisterId::AMD64_YMM4, X86::YMM4},
+ {codeview::RegisterId::AMD64_YMM5, X86::YMM5},
+ {codeview::RegisterId::AMD64_YMM6, X86::YMM6},
+ {codeview::RegisterId::AMD64_YMM7, X86::YMM7},
+ {codeview::RegisterId::AMD64_YMM8, X86::YMM8},
+ {codeview::RegisterId::AMD64_YMM9, X86::YMM9},
+ {codeview::RegisterId::AMD64_YMM10, X86::YMM10},
+ {codeview::RegisterId::AMD64_YMM11, X86::YMM11},
+ {codeview::RegisterId::AMD64_YMM12, X86::YMM12},
+ {codeview::RegisterId::AMD64_YMM13, X86::YMM13},
+ {codeview::RegisterId::AMD64_YMM14, X86::YMM14},
+ {codeview::RegisterId::AMD64_YMM15, X86::YMM15},
+ {codeview::RegisterId::AMD64_YMM16, X86::YMM16},
+ {codeview::RegisterId::AMD64_YMM17, X86::YMM17},
+ {codeview::RegisterId::AMD64_YMM18, X86::YMM18},
+ {codeview::RegisterId::AMD64_YMM19, X86::YMM19},
+ {codeview::RegisterId::AMD64_YMM20, X86::YMM20},
+ {codeview::RegisterId::AMD64_YMM21, X86::YMM21},
+ {codeview::RegisterId::AMD64_YMM22, X86::YMM22},
+ {codeview::RegisterId::AMD64_YMM23, X86::YMM23},
+ {codeview::RegisterId::AMD64_YMM24, X86::YMM24},
+ {codeview::RegisterId::AMD64_YMM25, X86::YMM25},
+ {codeview::RegisterId::AMD64_YMM26, X86::YMM26},
+ {codeview::RegisterId::AMD64_YMM27, X86::YMM27},
+ {codeview::RegisterId::AMD64_YMM28, X86::YMM28},
+ {codeview::RegisterId::AMD64_YMM29, X86::YMM29},
+ {codeview::RegisterId::AMD64_YMM30, X86::YMM30},
+ {codeview::RegisterId::AMD64_YMM31, X86::YMM31},
+ {codeview::RegisterId::AMD64_ZMM0, X86::ZMM0},
+ {codeview::RegisterId::AMD64_ZMM1, X86::ZMM1},
+ {codeview::RegisterId::AMD64_ZMM2, X86::ZMM2},
+ {codeview::RegisterId::AMD64_ZMM3, X86::ZMM3},
+ {codeview::RegisterId::AMD64_ZMM4, X86::ZMM4},
+ {codeview::RegisterId::AMD64_ZMM5, X86::ZMM5},
+ {codeview::RegisterId::AMD64_ZMM6, X86::ZMM6},
+ {codeview::RegisterId::AMD64_ZMM7, X86::ZMM7},
+ {codeview::RegisterId::AMD64_ZMM8, X86::ZMM8},
+ {codeview::RegisterId::AMD64_ZMM9, X86::ZMM9},
+ {codeview::RegisterId::AMD64_ZMM10, X86::ZMM10},
+ {codeview::RegisterId::AMD64_ZMM11, X86::ZMM11},
+ {codeview::RegisterId::AMD64_ZMM12, X86::ZMM12},
+ {codeview::RegisterId::AMD64_ZMM13, X86::ZMM13},
+ {codeview::RegisterId::AMD64_ZMM14, X86::ZMM14},
+ {codeview::RegisterId::AMD64_ZMM15, X86::ZMM15},
+ {codeview::RegisterId::AMD64_ZMM16, X86::ZMM16},
+ {codeview::RegisterId::AMD64_ZMM17, X86::ZMM17},
+ {codeview::RegisterId::AMD64_ZMM18, X86::ZMM18},
+ {codeview::RegisterId::AMD64_ZMM19, X86::ZMM19},
+ {codeview::RegisterId::AMD64_ZMM20, X86::ZMM20},
+ {codeview::RegisterId::AMD64_ZMM21, X86::ZMM21},
+ {codeview::RegisterId::AMD64_ZMM22, X86::ZMM22},
+ {codeview::RegisterId::AMD64_ZMM23, X86::ZMM23},
+ {codeview::RegisterId::AMD64_ZMM24, X86::ZMM24},
+ {codeview::RegisterId::AMD64_ZMM25, X86::ZMM25},
+ {codeview::RegisterId::AMD64_ZMM26, X86::ZMM26},
+ {codeview::RegisterId::AMD64_ZMM27, X86::ZMM27},
+ {codeview::RegisterId::AMD64_ZMM28, X86::ZMM28},
+ {codeview::RegisterId::AMD64_ZMM29, X86::ZMM29},
+ {codeview::RegisterId::AMD64_ZMM30, X86::ZMM30},
+ {codeview::RegisterId::AMD64_ZMM31, X86::ZMM31},
+ {codeview::RegisterId::AMD64_K0, X86::K0},
+ {codeview::RegisterId::AMD64_K1, X86::K1},
+ {codeview::RegisterId::AMD64_K2, X86::K2},
+ {codeview::RegisterId::AMD64_K3, X86::K3},
+ {codeview::RegisterId::AMD64_K4, X86::K4},
+ {codeview::RegisterId::AMD64_K5, X86::K5},
+ {codeview::RegisterId::AMD64_K6, X86::K6},
+ {codeview::RegisterId::AMD64_K7, X86::K7},
+ {codeview::RegisterId::AMD64_XMM16, X86::XMM16},
+ {codeview::RegisterId::AMD64_XMM17, X86::XMM17},
+ {codeview::RegisterId::AMD64_XMM18, X86::XMM18},
+ {codeview::RegisterId::AMD64_XMM19, X86::XMM19},
+ {codeview::RegisterId::AMD64_XMM20, X86::XMM20},
+ {codeview::RegisterId::AMD64_XMM21, X86::XMM21},
+ {codeview::RegisterId::AMD64_XMM22, X86::XMM22},
+ {codeview::RegisterId::AMD64_XMM23, X86::XMM23},
+ {codeview::RegisterId::AMD64_XMM24, X86::XMM24},
+ {codeview::RegisterId::AMD64_XMM25, X86::XMM25},
+ {codeview::RegisterId::AMD64_XMM26, X86::XMM26},
+ {codeview::RegisterId::AMD64_XMM27, X86::XMM27},
+ {codeview::RegisterId::AMD64_XMM28, X86::XMM28},
+ {codeview::RegisterId::AMD64_XMM29, X86::XMM29},
+ {codeview::RegisterId::AMD64_XMM30, X86::XMM30},
+ {codeview::RegisterId::AMD64_XMM31, X86::XMM31},
+
+ };
+ for (unsigned I = 0; I < array_lengthof(RegMap); ++I)
+ MRI->mapLLVMRegToCVReg(RegMap[I].Reg, static_cast<int>(RegMap[I].CVReg));
+}
+
+MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT,
+ StringRef CPU, StringRef FS) {
+ std::string ArchFS = X86_MC::ParseX86Triple(TT);
+ assert(!ArchFS.empty() && "Failed to parse X86 triple");
+ if (!FS.empty())
+ ArchFS = (Twine(ArchFS) + "," + FS).str();
+
+ if (CPU.empty())
+ CPU = "generic";
+
+ return createX86MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, ArchFS);
+}
+
+static MCInstrInfo *createX86MCInstrInfo() {
+ MCInstrInfo *X = new MCInstrInfo();
+ InitX86MCInstrInfo(X);
+ return X;
+}
+
+static MCRegisterInfo *createX86MCRegisterInfo(const Triple &TT) {
+ unsigned RA = (TT.getArch() == Triple::x86_64)
+ ? X86::RIP // Should have dwarf #16.
+ : X86::EIP; // Should have dwarf #8.
+
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitX86MCRegisterInfo(X, RA, X86_MC::getDwarfRegFlavour(TT, false),
+ X86_MC::getDwarfRegFlavour(TT, true), RA);
+ X86_MC::initLLVMToSEHAndCVRegMapping(X);
+ return X;
+}
+
+static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI,
+ const Triple &TheTriple,
+ const MCTargetOptions &Options) {
+ bool is64Bit = TheTriple.getArch() == Triple::x86_64;
+
+ MCAsmInfo *MAI;
+ if (TheTriple.isOSBinFormatMachO()) {
+ if (is64Bit)
+ MAI = new X86_64MCAsmInfoDarwin(TheTriple);
+ else
+ MAI = new X86MCAsmInfoDarwin(TheTriple);
+ } else if (TheTriple.isOSBinFormatELF()) {
+ // Force the use of an ELF container.
+ MAI = new X86ELFMCAsmInfo(TheTriple);
+ } else if (TheTriple.isWindowsMSVCEnvironment() ||
+ TheTriple.isWindowsCoreCLREnvironment()) {
+ if (Options.getAssemblyLanguage().equals_lower("masm"))
+ MAI = new X86MCAsmInfoMicrosoftMASM(TheTriple);
+ else
+ MAI = new X86MCAsmInfoMicrosoft(TheTriple);
+ } else if (TheTriple.isOSCygMing() ||
+ TheTriple.isWindowsItaniumEnvironment()) {
+ MAI = new X86MCAsmInfoGNUCOFF(TheTriple);
+ } else {
+ // The default is ELF.
+ MAI = new X86ELFMCAsmInfo(TheTriple);
+ }
+
+ // Set up the initial frame state.
+ // Calculate the number of bytes used for storing the return address.
+ int stackGrowth = is64Bit ? -8 : -4;
+
+ // Initial state of the frame pointer is esp+stackGrowth.
+ unsigned StackPtr = is64Bit ? X86::RSP : X86::ESP;
+ MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(
+ nullptr, MRI.getDwarfRegNum(StackPtr, true), -stackGrowth);
+ MAI->addInitialFrameState(Inst);
+
+ // Add return address to move list
+ unsigned InstPtr = is64Bit ? X86::RIP : X86::EIP;
+ MCCFIInstruction Inst2 = MCCFIInstruction::createOffset(
+ nullptr, MRI.getDwarfRegNum(InstPtr, true), stackGrowth);
+ MAI->addInitialFrameState(Inst2);
+
+ return MAI;
+}
+
+static MCInstPrinter *createX86MCInstPrinter(const Triple &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI) {
+ if (SyntaxVariant == 0)
+ return new X86ATTInstPrinter(MAI, MII, MRI);
+ if (SyntaxVariant == 1)
+ return new X86IntelInstPrinter(MAI, MII, MRI);
+ return nullptr;
+}
+
+static MCRelocationInfo *createX86MCRelocationInfo(const Triple &TheTriple,
+ MCContext &Ctx) {
+ // Default to the stock relocation info.
+ return llvm::createMCRelocationInfo(TheTriple, Ctx);
+}
+
+namespace llvm {
+namespace X86_MC {
+
+class X86MCInstrAnalysis : public MCInstrAnalysis {
+ X86MCInstrAnalysis(const X86MCInstrAnalysis &) = delete;
+ X86MCInstrAnalysis &operator=(const X86MCInstrAnalysis &) = delete;
+ virtual ~X86MCInstrAnalysis() = default;
+
+public:
+ X86MCInstrAnalysis(const MCInstrInfo *MCII) : MCInstrAnalysis(MCII) {}
+
+#define GET_STIPREDICATE_DECLS_FOR_MC_ANALYSIS
+#include "X86GenSubtargetInfo.inc"
+
+ bool clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst,
+ APInt &Mask) const override;
+ std::vector<std::pair<uint64_t, uint64_t>>
+ findPltEntries(uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents,
+ uint64_t GotSectionVA,
+ const Triple &TargetTriple) const override;
+
+ bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
+ uint64_t &Target) const override;
+ Optional<uint64_t> evaluateMemoryOperandAddress(const MCInst &Inst,
+ uint64_t Addr,
+ uint64_t Size) const override;
+};
+
+#define GET_STIPREDICATE_DEFS_FOR_MC_ANALYSIS
+#include "X86GenSubtargetInfo.inc"
+
+bool X86MCInstrAnalysis::clearsSuperRegisters(const MCRegisterInfo &MRI,
+ const MCInst &Inst,
+ APInt &Mask) const {
+ const MCInstrDesc &Desc = Info->get(Inst.getOpcode());
+ unsigned NumDefs = Desc.getNumDefs();
+ unsigned NumImplicitDefs = Desc.getNumImplicitDefs();
+ assert(Mask.getBitWidth() == NumDefs + NumImplicitDefs &&
+ "Unexpected number of bits in the mask!");
+
+ bool HasVEX = (Desc.TSFlags & X86II::EncodingMask) == X86II::VEX;
+ bool HasEVEX = (Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX;
+ bool HasXOP = (Desc.TSFlags & X86II::EncodingMask) == X86II::XOP;
+
+ const MCRegisterClass &GR32RC = MRI.getRegClass(X86::GR32RegClassID);
+ const MCRegisterClass &VR128XRC = MRI.getRegClass(X86::VR128XRegClassID);
+ const MCRegisterClass &VR256XRC = MRI.getRegClass(X86::VR256XRegClassID);
+
+ auto ClearsSuperReg = [=](unsigned RegID) {
+ // On X86-64, a general-purpose integer register is viewed as a 64-bit
+ // register internal to the processor.
+ // An update to the lower 32 bits of a 64-bit integer register is
+ // architecturally defined to zero the upper 32 bits.
+ if (GR32RC.contains(RegID))
+ return true;
+
+ // Early exit if this instruction has no vex/evex/xop prefix.
+ if (!HasEVEX && !HasVEX && !HasXOP)
+ return false;
+
+ // All VEX and EVEX encoded instructions are defined to zero the high bits
+ // of the destination register up to VLMAX (i.e. the maximum vector register
+ // width pertaining to the instruction).
+ // We assume the same behavior for XOP instructions too.
+ return VR128XRC.contains(RegID) || VR256XRC.contains(RegID);
+ };
+
+ Mask.clearAllBits();
+ for (unsigned I = 0, E = NumDefs; I < E; ++I) {
+ const MCOperand &Op = Inst.getOperand(I);
+ if (ClearsSuperReg(Op.getReg()))
+ Mask.setBit(I);
+ }
+
+ for (unsigned I = 0, E = NumImplicitDefs; I < E; ++I) {
+ const MCPhysReg Reg = Desc.getImplicitDefs()[I];
+ if (ClearsSuperReg(Reg))
+ Mask.setBit(NumDefs + I);
+ }
+
+ return Mask.getBoolValue();
+}
+
+static std::vector<std::pair<uint64_t, uint64_t>>
+findX86PltEntries(uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents,
+ uint64_t GotPltSectionVA) {
+ // Do a lightweight parsing of PLT entries.
+ std::vector<std::pair<uint64_t, uint64_t>> Result;
+ for (uint64_t Byte = 0, End = PltContents.size(); Byte + 6 < End; ) {
+ // Recognize a jmp.
+ if (PltContents[Byte] == 0xff && PltContents[Byte + 1] == 0xa3) {
+ // The jmp instruction at the beginning of each PLT entry jumps to the
+ // address of the base of the .got.plt section plus the immediate.
+ uint32_t Imm = support::endian::read32le(PltContents.data() + Byte + 2);
+ Result.push_back(
+ std::make_pair(PltSectionVA + Byte, GotPltSectionVA + Imm));
+ Byte += 6;
+ } else if (PltContents[Byte] == 0xff && PltContents[Byte + 1] == 0x25) {
+ // The jmp instruction at the beginning of each PLT entry jumps to the
+ // immediate.
+ uint32_t Imm = support::endian::read32le(PltContents.data() + Byte + 2);
+ Result.push_back(std::make_pair(PltSectionVA + Byte, Imm));
+ Byte += 6;
+ } else
+ Byte++;
+ }
+ return Result;
+}
+
+static std::vector<std::pair<uint64_t, uint64_t>>
+findX86_64PltEntries(uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents) {
+ // Do a lightweight parsing of PLT entries.
+ std::vector<std::pair<uint64_t, uint64_t>> Result;
+ for (uint64_t Byte = 0, End = PltContents.size(); Byte + 6 < End; ) {
+ // Recognize a jmp.
+ if (PltContents[Byte] == 0xff && PltContents[Byte + 1] == 0x25) {
+ // The jmp instruction at the beginning of each PLT entry jumps to the
+ // address of the next instruction plus the immediate.
+ uint32_t Imm = support::endian::read32le(PltContents.data() + Byte + 2);
+ Result.push_back(
+ std::make_pair(PltSectionVA + Byte, PltSectionVA + Byte + 6 + Imm));
+ Byte += 6;
+ } else
+ Byte++;
+ }
+ return Result;
+}
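To make the address arithmetic above concrete, consider a hypothetical x86-64 PLT entry; the bytes and addresses below are invented purely for illustration:

    // "jmp *0x201032(%rip)" encoded as ff 25 32 10 20 00, located at offset
    // 0x20 into a .plt section that starts at 0x401000.
    uint64_t PltSectionVA = 0x401000, Byte = 0x20;
    uint32_t Imm = 0x201032;                         // read32le of the 4 bytes after ff 25
    uint64_t Target = PltSectionVA + Byte + 6 + Imm; // = 0x602058, the .got.plt slot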
+
+std::vector<std::pair<uint64_t, uint64_t>> X86MCInstrAnalysis::findPltEntries(
+ uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents,
+ uint64_t GotPltSectionVA, const Triple &TargetTriple) const {
+ switch (TargetTriple.getArch()) {
+ case Triple::x86:
+ return findX86PltEntries(PltSectionVA, PltContents, GotPltSectionVA);
+ case Triple::x86_64:
+ return findX86_64PltEntries(PltSectionVA, PltContents);
+ default:
+ return {};
+ }
+}
+
+bool X86MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
+ uint64_t Size, uint64_t &Target) const {
+ if (Inst.getNumOperands() == 0 ||
+ Info->get(Inst.getOpcode()).OpInfo[0].OperandType != MCOI::OPERAND_PCREL)
+ return false;
+ Target = Addr + Size + Inst.getOperand(0).getImm();
+ return true;
+}
+
+Optional<uint64_t> X86MCInstrAnalysis::evaluateMemoryOperandAddress(
+ const MCInst &Inst, uint64_t Addr, uint64_t Size) const {
+ const MCInstrDesc &MCID = Info->get(Inst.getOpcode());
+ int MemOpStart = X86II::getMemoryOperandNo(MCID.TSFlags);
+ if (MemOpStart == -1)
+ return None;
+ MemOpStart += X86II::getOperandBias(MCID);
+
+ const MCOperand &SegReg = Inst.getOperand(MemOpStart + X86::AddrSegmentReg);
+ const MCOperand &BaseReg = Inst.getOperand(MemOpStart + X86::AddrBaseReg);
+ const MCOperand &IndexReg = Inst.getOperand(MemOpStart + X86::AddrIndexReg);
+ const MCOperand &ScaleAmt = Inst.getOperand(MemOpStart + X86::AddrScaleAmt);
+ const MCOperand &Disp = Inst.getOperand(MemOpStart + X86::AddrDisp);
+ if (SegReg.getReg() != 0 || IndexReg.getReg() != 0 || ScaleAmt.getImm() != 1 ||
+ !Disp.isImm())
+ return None;
+
+ // RIP-relative addressing.
+ if (BaseReg.getReg() == X86::RIP)
+ return Addr + Size + Disp.getImm();
+
+ return None;
+}
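As a worked example of the RIP-relative case handled above (the instruction and addresses are hypothetical):

    // "lea 0x100(%rip), %rax" is 48 8d 05 00 01 00 00 -- 7 bytes.
    // If it is decoded at Addr = 0x1000, the referenced address is the end of
    // the instruction plus the displacement:
    uint64_t Addr = 0x1000, Size = 7;
    int64_t Disp = 0x100;
    uint64_t Mem = Addr + Size + Disp; // = 0x1107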
+
+} // end of namespace X86_MC
+
+} // end of namespace llvm
+
+static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) {
+ return new X86_MC::X86MCInstrAnalysis(Info);
+}
+
+// Force static initialization.
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86TargetMC() {
+ for (Target *T : {&getTheX86_32Target(), &getTheX86_64Target()}) {
+ // Register the MC asm info.
+ RegisterMCAsmInfoFn X(*T, createX86MCAsmInfo);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(*T, createX86MCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(*T, createX86MCRegisterInfo);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(*T,
+ X86_MC::createX86MCSubtargetInfo);
+
+ // Register the MC instruction analyzer.
+ TargetRegistry::RegisterMCInstrAnalysis(*T, createX86MCInstrAnalysis);
+
+ // Register the code emitter.
+ TargetRegistry::RegisterMCCodeEmitter(*T, createX86MCCodeEmitter);
+
+ // Register the obj target streamer.
+ TargetRegistry::RegisterObjectTargetStreamer(*T,
+ createX86ObjectTargetStreamer);
+
+ // Register the asm target streamer.
+ TargetRegistry::RegisterAsmTargetStreamer(*T, createX86AsmTargetStreamer);
+
+ TargetRegistry::RegisterCOFFStreamer(*T, createX86WinCOFFStreamer);
+
+ // Register the MCInstPrinter.
+ TargetRegistry::RegisterMCInstPrinter(*T, createX86MCInstPrinter);
+
+ // Register the MC relocation info.
+ TargetRegistry::RegisterMCRelocationInfo(*T, createX86MCRelocationInfo);
+ }
+
+ // Register the asm backend.
+ TargetRegistry::RegisterMCAsmBackend(getTheX86_32Target(),
+ createX86_32AsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(getTheX86_64Target(),
+ createX86_64AsmBackend);
+}
+
+MCRegister llvm::getX86SubSuperRegisterOrZero(MCRegister Reg, unsigned Size,
+ bool High) {
+ switch (Size) {
+ default: return X86::NoRegister;
+ case 8:
+ if (High) {
+ switch (Reg.id()) {
+ default: return getX86SubSuperRegisterOrZero(Reg, 64);
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::SI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SP;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AH;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DH;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CH;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BH;
+ }
+ } else {
+ switch (Reg.id()) {
+ default: return X86::NoRegister;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AL;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DL;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CL;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BL;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::SIL;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DIL;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BPL;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SPL;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8B;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9B;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10B;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11B;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12B;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13B;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14B;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15B;
+ }
+ }
+ case 16:
+ switch (Reg.id()) {
+ default: return X86::NoRegister;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::SI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8W;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9W;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10W;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11W;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12W;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13W;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14W;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15W;
+ }
+ case 32:
+ switch (Reg.id()) {
+ default: return X86::NoRegister;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::EAX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::EDX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::ECX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::EBX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::ESI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::EDI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::EBP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::ESP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8D;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9D;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10D;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11D;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12D;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13D;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14D;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15D;
+ }
+ case 64:
+ switch (Reg.id()) {
+ default: return X86::NoRegister;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::RAX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::RDX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::RCX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::RBX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::RSI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::RDI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::RBP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::RSP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15;
+ }
+ }
+}
+
+MCRegister llvm::getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High) {
+ MCRegister Res = getX86SubSuperRegisterOrZero(Reg, Size, High);
+ assert(Res != X86::NoRegister && "Unexpected register or VT");
+ return Res;
+}
+
+
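A quick illustration of how the two helpers above behave; the expected values follow directly from the switch tables (sketch only, not part of this change):

    assert(llvm::getX86SubSuperRegister(X86::EAX, 16) == X86::AX);
    assert(llvm::getX86SubSuperRegister(X86::AX, 64) == X86::RAX);
    assert(llvm::getX86SubSuperRegister(X86::RCX, 8, /*High=*/true) == X86::CH);
    // The *OrZero variant reports failure instead of asserting:
    assert(llvm::getX86SubSuperRegisterOrZero(X86::XMM0, 16) == X86::NoRegister);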
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
new file mode 100644
index 000000000000..35604cd3ec0a
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -0,0 +1,145 @@
+//===-- X86MCTargetDesc.h - X86 Target Descriptions -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides X86 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H
+
+#include <memory>
+#include <string>
+
+namespace llvm {
+class formatted_raw_ostream;
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInst;
+class MCInstPrinter;
+class MCInstrInfo;
+class MCObjectTargetWriter;
+class MCObjectWriter;
+class MCRegister;
+class MCRegisterInfo;
+class MCStreamer;
+class MCSubtargetInfo;
+class MCTargetOptions;
+class MCTargetStreamer;
+class Target;
+class Triple;
+class StringRef;
+
+/// Flavour of DWARF register numbers
+///
+namespace DWARFFlavour {
+ enum {
+ X86_64 = 0, X86_32_DarwinEH = 1, X86_32_Generic = 2
+ };
+}
+
+/// Native X86 register numbers
+///
+namespace N86 {
+ enum {
+ EAX = 0, ECX = 1, EDX = 2, EBX = 3, ESP = 4, EBP = 5, ESI = 6, EDI = 7
+ };
+}
+
+namespace X86_MC {
+std::string ParseX86Triple(const Triple &TT);
+
+unsigned getDwarfRegFlavour(const Triple &TT, bool isEH);
+
+void initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI);
+
+/// Returns true if this instruction has a LOCK prefix.
+bool hasLockPrefix(const MCInst &MI);
+
+/// Create a X86 MCSubtargetInfo instance. This is exposed so Asm parser, etc.
+/// do not need to go through TargetRegistry.
+MCSubtargetInfo *createX86MCSubtargetInfo(const Triple &TT, StringRef CPU,
+ StringRef FS);
+}
+
+MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx);
+
+MCAsmBackend *createX86_32AsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
+ const MCTargetOptions &Options);
+MCAsmBackend *createX86_64AsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
+ const MCTargetOptions &Options);
+
+/// Implements X86-only directives for assembly emission.
+MCTargetStreamer *createX86AsmTargetStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS,
+ MCInstPrinter *InstPrinter,
+ bool IsVerboseAsm);
+
+/// Implements X86-only directives for object files.
+MCTargetStreamer *createX86ObjectTargetStreamer(MCStreamer &S,
+ const MCSubtargetInfo &STI);
+
+/// Construct an X86 Windows COFF machine code streamer which will generate
+/// PE/COFF format object files.
+///
+/// Takes ownership of \p AB and \p CE.
+MCStreamer *createX86WinCOFFStreamer(MCContext &C,
+ std::unique_ptr<MCAsmBackend> &&AB,
+ std::unique_ptr<MCObjectWriter> &&OW,
+ std::unique_ptr<MCCodeEmitter> &&CE,
+ bool RelaxAll,
+ bool IncrementalLinkerCompatible);
+
+/// Construct an X86 Mach-O object writer.
+std::unique_ptr<MCObjectTargetWriter>
+createX86MachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype);
+
+/// Construct an X86 ELF object writer.
+std::unique_ptr<MCObjectTargetWriter>
+createX86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine);
+/// Construct an X86 Win COFF object writer.
+std::unique_ptr<MCObjectTargetWriter>
+createX86WinCOFFObjectWriter(bool Is64Bit);
+
+/// Returns the sub or super register of a specific X86 register.
+/// e.g. getX86SubSuperRegister(X86::EAX, 16) returns X86::AX.
+/// Aborts on error.
+MCRegister getX86SubSuperRegister(MCRegister, unsigned, bool High=false);
+
+/// Returns the sub or super register of a specific X86 register.
+/// Like getX86SubSuperRegister() but returns 0 on error.
+MCRegister getX86SubSuperRegisterOrZero(MCRegister, unsigned,
+ bool High = false);
+
+} // End llvm namespace
+
+
+// Defines symbolic names for X86 registers. This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "X86GenRegisterInfo.inc"
+
+// Defines symbolic names for the X86 instructions.
+//
+#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
+#include "X86GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "X86GenSubtargetInfo.inc"
+
+#endif
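For orientation, a sketch of how a standalone tool would typically obtain the MC components declared in this header through the target registry; the triple string and CPU name are illustrative assumptions, and the initializer calls are the generic ones from llvm/Support/TargetSelect.h:

    #include "llvm/Support/TargetRegistry.h"
    #include "llvm/Support/TargetSelect.h"
    #include <memory>
    #include <string>

    // Hypothetical setup code, not part of this change.
    llvm::InitializeAllTargetInfos();
    llvm::InitializeAllTargetMCs(); // runs LLVMInitializeX86TargetMC() among others
    std::string Err;
    const llvm::Target *T =
        llvm::TargetRegistry::lookupTarget("x86_64-unknown-linux-gnu", Err);
    std::unique_ptr<llvm::MCRegisterInfo> MRI(
        T->createMCRegInfo("x86_64-unknown-linux-gnu"));
    std::unique_ptr<llvm::MCSubtargetInfo> STI(
        T->createMCSubtargetInfo("x86_64-unknown-linux-gnu", "generic", ""));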
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
new file mode 100644
index 000000000000..b98e58d653db
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -0,0 +1,603 @@
+//===-- X86MachObjectWriter.cpp - X86 Mach-O Writer -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+
+using namespace llvm;
+
+namespace {
+class X86MachObjectWriter : public MCMachObjectTargetWriter {
+ bool recordScatteredRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ unsigned Log2Size,
+ uint64_t &FixedValue);
+ void recordTLVPRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ uint64_t &FixedValue);
+
+ void RecordX86Relocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ uint64_t &FixedValue);
+ void RecordX86_64Relocation(MachObjectWriter *Writer, MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment, const MCFixup &Fixup,
+ MCValue Target, uint64_t &FixedValue);
+
+public:
+ X86MachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype)
+ : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype) {}
+
+ void recordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
+ const MCAsmLayout &Layout, const MCFragment *Fragment,
+ const MCFixup &Fixup, MCValue Target,
+ uint64_t &FixedValue) override {
+ if (Writer->is64Bit())
+ RecordX86_64Relocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+ FixedValue);
+ else
+ RecordX86Relocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+ FixedValue);
+ }
+};
+} // namespace
+
+static bool isFixupKindRIPRel(unsigned Kind) {
+ return Kind == X86::reloc_riprel_4byte ||
+ Kind == X86::reloc_riprel_4byte_movq_load ||
+ Kind == X86::reloc_riprel_4byte_relax ||
+ Kind == X86::reloc_riprel_4byte_relax_rex;
+}
+
+static unsigned getFixupKindLog2Size(unsigned Kind) {
+ switch (Kind) {
+ default:
+ llvm_unreachable("invalid fixup kind!");
+ case FK_PCRel_1:
+ case FK_Data_1: return 0;
+ case FK_PCRel_2:
+ case FK_Data_2: return 1;
+ case FK_PCRel_4:
+ // FIXME: Remove these!!!
+ case X86::reloc_riprel_4byte:
+ case X86::reloc_riprel_4byte_relax:
+ case X86::reloc_riprel_4byte_relax_rex:
+ case X86::reloc_riprel_4byte_movq_load:
+ case X86::reloc_signed_4byte:
+ case X86::reloc_signed_4byte_relax:
+ case X86::reloc_branch_4byte_pcrel:
+ case FK_Data_4: return 2;
+ case FK_Data_8: return 3;
+ }
+}
+
+void X86MachObjectWriter::RecordX86_64Relocation(
+ MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
+ uint64_t &FixedValue) {
+ unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+ unsigned IsRIPRel = isFixupKindRIPRel(Fixup.getKind());
+ unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+
+ // See <reloc.h>.
+ uint32_t FixupOffset =
+ Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+ uint32_t FixupAddress =
+ Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset();
+ int64_t Value = 0;
+ unsigned Index = 0;
+ unsigned IsExtern = 0;
+ unsigned Type = 0;
+ const MCSymbol *RelSymbol = nullptr;
+
+ Value = Target.getConstant();
+
+ if (IsPCRel) {
+ // Compensate for the relocation offset; Darwin x86_64 relocations only
+ // have the addend and appear to have attempted to define it to be the
+ // actual expression addend without the PCrel bias. However, instructions
+ // with data following the relocation are not accounted for (see the
+ // comment below regarding SIGNED{1,2,4}), so it isn't exactly that either.
+ Value += 1LL << Log2Size;
+ }
+
+ if (Target.isAbsolute()) { // constant
+ // SymbolNum of 0 indicates the absolute section.
+ Type = MachO::X86_64_RELOC_UNSIGNED;
+
+ // FIXME: I believe this is broken; I don't think the linker can understand
+ // it. I think it would require a local relocation, but I'm not sure if that
+ // would work either. The official way to get an absolute PCrel relocation
+ // is to use an absolute symbol (which we don't support yet).
+ if (IsPCRel) {
+ IsExtern = 1;
+ Type = MachO::X86_64_RELOC_BRANCH;
+ }
+ } else if (Target.getSymB()) { // A - B + constant
+ const MCSymbol *A = &Target.getSymA()->getSymbol();
+ if (A->isTemporary())
+ A = &Writer->findAliasedSymbol(*A);
+ const MCSymbol *A_Base = Asm.getAtom(*A);
+
+ const MCSymbol *B = &Target.getSymB()->getSymbol();
+ if (B->isTemporary())
+ B = &Writer->findAliasedSymbol(*B);
+ const MCSymbol *B_Base = Asm.getAtom(*B);
+
+ // Neither symbol can be modified.
+ if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None) {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "unsupported relocation of modified symbol");
+ return;
+ }
+
+ // We don't support PCrel relocations of differences. Darwin 'as' doesn't
+ // implement most of these correctly.
+ if (IsPCRel) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "unsupported pc-relative relocation of difference");
+ return;
+ }
+
+ // The support for the situation where one or both of the symbols would
+ // require a local relocation is handled just like if the symbols were
+ // external. This is certainly used in the case of debug sections where the
+ // section has only temporary symbols and thus the symbols don't have base
+ // symbols. This is encoded using the section ordinal and non-extern
+ // relocation entries.
+
+ // Darwin 'as' doesn't emit correct relocations for this (it ends up with a
+ // single SIGNED relocation); reject it for now, except in the case where
+ // neither symbol has a base (A_Base and B_Base are equal because both are
+ // null).
+ if (A_Base == B_Base && A_Base) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "unsupported relocation with identical base");
+ return;
+ }
+
+ // A subtraction expression where either symbol is undefined is a
+ // non-relocatable expression.
+ if (A->isUndefined() || B->isUndefined()) {
+ StringRef Name = A->isUndefined() ? A->getName() : B->getName();
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "unsupported relocation with subtraction expression, symbol '" +
+ Name + "' cannot be undefined in a subtraction expression");
+ return;
+ }
+
+ Value += Writer->getSymbolAddress(*A, Layout) -
+ (!A_Base ? 0 : Writer->getSymbolAddress(*A_Base, Layout));
+ Value -= Writer->getSymbolAddress(*B, Layout) -
+ (!B_Base ? 0 : Writer->getSymbolAddress(*B_Base, Layout));
+
+ if (!A_Base)
+ Index = A->getFragment()->getParent()->getOrdinal() + 1;
+ Type = MachO::X86_64_RELOC_UNSIGNED;
+
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 =
+ (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+ Writer->addRelocation(A_Base, Fragment->getParent(), MRE);
+
+ if (B_Base)
+ RelSymbol = B_Base;
+ else
+ Index = B->getFragment()->getParent()->getOrdinal() + 1;
+ Type = MachO::X86_64_RELOC_SUBTRACTOR;
+ } else {
+ const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
+ if (Symbol->isTemporary() && Value) {
+ const MCSection &Sec = Symbol->getSection();
+ if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec))
+ Symbol->setUsedInReloc();
+ }
+ RelSymbol = Asm.getAtom(*Symbol);
+
+ // Relocations inside debug sections always use local relocations when
+ // possible. This seems to be done because the debugger doesn't fully
+ // understand x86_64 relocation entries, and expects to find values that
+ // have already been fixed up.
+ if (Symbol->isInSection()) {
+ const MCSectionMachO &Section =
+ static_cast<const MCSectionMachO &>(*Fragment->getParent());
+ if (Section.hasAttribute(MachO::S_ATTR_DEBUG))
+ RelSymbol = nullptr;
+ }
+
+ // x86_64 almost always uses external relocations, except when there is no
+ // symbol to use as a base address (a local symbol with no preceding
+ // non-local symbol).
+ if (RelSymbol) {
+ // Add the local offset, if needed.
+ if (RelSymbol != Symbol)
+ Value += Layout.getSymbolOffset(*Symbol) -
+ Layout.getSymbolOffset(*RelSymbol);
+ } else if (Symbol->isInSection() && !Symbol->isVariable()) {
+ // The index is the section ordinal (1-based).
+ Index = Symbol->getFragment()->getParent()->getOrdinal() + 1;
+ Value += Writer->getSymbolAddress(*Symbol, Layout);
+
+ if (IsPCRel)
+ Value -= FixupAddress + (1 << Log2Size);
+ } else if (Symbol->isVariable()) {
+ const MCExpr *Value = Symbol->getVariableValue();
+ int64_t Res;
+ bool isAbs = Value->evaluateAsAbsolute(Res, Layout,
+ Writer->getSectionAddressMap());
+ if (isAbs) {
+ FixedValue = Res;
+ return;
+ } else {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "unsupported relocation of variable '" +
+ Symbol->getName() + "'");
+ return;
+ }
+ } else {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "unsupported relocation of undefined symbol '" +
+ Symbol->getName() + "'");
+ return;
+ }
+
+ MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind();
+ if (IsPCRel) {
+ if (IsRIPRel) {
+ if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) {
+ // x86_64 distinguishes movq foo@GOTPCREL so that the linker can
+ // rewrite the movq to an leaq at link time if the symbol ends up in
+ // the same linkage unit.
+ if (Fixup.getTargetKind() == X86::reloc_riprel_4byte_movq_load)
+ Type = MachO::X86_64_RELOC_GOT_LOAD;
+ else
+ Type = MachO::X86_64_RELOC_GOT;
+ } else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
+ Type = MachO::X86_64_RELOC_TLV;
+ } else if (Modifier != MCSymbolRefExpr::VK_None) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "unsupported symbol modifier in relocation");
+ return;
+ } else {
+ Type = MachO::X86_64_RELOC_SIGNED;
+
+ // The Darwin x86_64 relocation format has a problem where it cannot
+ // encode an address (L<foo> + <constant>) which is outside the atom
+ // containing L<foo>. Generally, this shouldn't occur but it does
+ // happen when we have a RIPrel instruction with data following the
+ // relocation entry (e.g., movb $012, L0(%rip)). Even with the PCrel
+ // adjustment Darwin x86_64 uses, the offset is still negative and the
+ // linker has no way to recognize this.
+ //
+ // To work around this, Darwin uses several special relocation types
+ // to indicate the offsets. However, the specification or
+ // implementation of these seems to also be incomplete; they should
+ // adjust the addend as well based on the actual encoded instruction
+ // (the additional bias), but instead appear to just look at the final
+ // offset.
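+ // For illustration: in the movb example above, one immediate byte follows
+ // the 4-byte fixup, so (assuming the usual code-emitter bias) the constant
+ // is -5 and -(-5 + (1LL << 2)) == 1, selecting X86_64_RELOC_SIGNED_1.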
+ switch (-(Target.getConstant() + (1LL << Log2Size))) {
+ case 1: Type = MachO::X86_64_RELOC_SIGNED_1; break;
+ case 2: Type = MachO::X86_64_RELOC_SIGNED_2; break;
+ case 4: Type = MachO::X86_64_RELOC_SIGNED_4; break;
+ }
+ }
+ } else {
+ if (Modifier != MCSymbolRefExpr::VK_None) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(),
+ "unsupported symbol modifier in branch relocation");
+ return;
+ }
+
+ Type = MachO::X86_64_RELOC_BRANCH;
+ }
+ } else {
+ if (Modifier == MCSymbolRefExpr::VK_GOT) {
+ Type = MachO::X86_64_RELOC_GOT;
+ } else if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) {
+ // GOTPCREL is allowed as a modifier on non-PCrel instructions, in which
+ // case all we do is set the PCrel bit in the relocation entry; this is
+ // used with exception handling, for example. The source is required to
+ // include any necessary offset directly.
+ Type = MachO::X86_64_RELOC_GOT;
+ IsPCRel = 1;
+ } else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "TLVP symbol modifier should have been rip-rel");
+ return;
+ } else if (Modifier != MCSymbolRefExpr::VK_None) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "unsupported symbol modifier in relocation");
+ return;
+ } else {
+ Type = MachO::X86_64_RELOC_UNSIGNED;
+ if (Fixup.getTargetKind() == X86::reloc_signed_4byte) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(),
+ "32-bit absolute addressing is not supported in 64-bit mode");
+ return;
+ }
+ }
+ }
+ }
+
+ // x86_64 always writes custom values into the fixups.
+ FixedValue = Value;
+
+ // struct relocation_info (8 bytes)
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 = (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+ (IsExtern << 27) | (Type << 28);
+ Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
+}
+
+bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ unsigned Log2Size,
+ uint64_t &FixedValue) {
+ uint64_t OriginalFixedValue = FixedValue;
+ uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+ unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+ unsigned Type = MachO::GENERIC_RELOC_VANILLA;
+
+ // See <reloc.h>.
+ const MCSymbol *A = &Target.getSymA()->getSymbol();
+
+ if (!A->getFragment()) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(),
+ "symbol '" + A->getName() +
+ "' can not be undefined in a subtraction expression");
+ return false;
+ }
+
+ uint32_t Value = Writer->getSymbolAddress(*A, Layout);
+ uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent());
+ FixedValue += SecAddr;
+ uint32_t Value2 = 0;
+
+ if (const MCSymbolRefExpr *B = Target.getSymB()) {
+ const MCSymbol *SB = &B->getSymbol();
+
+ if (!SB->getFragment()) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(),
+ "symbol '" + SB->getName() +
+ "' can not be undefined in a subtraction expression");
+ return false;
+ }
+
+ // Select the appropriate difference relocation type.
+ //
+ // Note that there is no longer any semantic difference between these two
+ // relocation types from the linker's point of view; this is done solely for
+ // pedantic compatibility with 'as'.
+ Type = A->isExternal() ? (unsigned)MachO::GENERIC_RELOC_SECTDIFF
+ : (unsigned)MachO::GENERIC_RELOC_LOCAL_SECTDIFF;
+ Value2 = Writer->getSymbolAddress(*SB, Layout);
+ FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent());
+ }
+
+ // Relocations are written out in reverse order, so the PAIR comes first.
+ if (Type == MachO::GENERIC_RELOC_SECTDIFF ||
+ Type == MachO::GENERIC_RELOC_LOCAL_SECTDIFF) {
+ // If the offset is too large to fit in a scattered relocation,
+ // we're hosed. It's an unfortunate limitation of the MachO format.
+ if (FixupOffset > 0xffffff) {
+ char Buffer[32];
+ format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer));
+ Asm.getContext().reportError(Fixup.getLoc(),
+ Twine("Section too large, can't encode "
+ "r_address (") + Buffer +
+ ") into 24 bits of scattered "
+ "relocation entry.");
+ return false;
+ }
+
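+ // struct scattered_relocation_info (8 bytes):
+ //   r_address:24, r_type:4, r_length:2, r_pcrel:1, r_scattered:1, followed
+ //   by the 32-bit r_value; the r_word0 values built below pack the fields in
+ //   that order.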
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = ((0 << 0) | // r_address
+ (MachO::GENERIC_RELOC_PAIR << 24) | // r_type
+ (Log2Size << 28) |
+ (IsPCRel << 30) |
+ MachO::R_SCATTERED);
+ MRE.r_word1 = Value2;
+ Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+ } else {
+ // If the offset is more than 24-bits, it won't fit in a scattered
+ // relocation offset field, so we fall back to using a non-scattered
+ // relocation. This is a bit risky, as if the offset reaches out of
+ // the block and the linker is doing scattered loading on this
+ // symbol, things can go badly.
+ //
+ // Required for 'as' compatibility.
+ if (FixupOffset > 0xffffff) {
+ FixedValue = OriginalFixedValue;
+ return false;
+ }
+ }
+
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = ((FixupOffset << 0) |
+ (Type << 24) |
+ (Log2Size << 28) |
+ (IsPCRel << 30) |
+ MachO::R_SCATTERED);
+ MRE.r_word1 = Value;
+ Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+ return true;
+}
+
+void X86MachObjectWriter::recordTLVPRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ uint64_t &FixedValue) {
+ const MCSymbolRefExpr *SymA = Target.getSymA();
+ assert(SymA->getKind() == MCSymbolRefExpr::VK_TLVP && !is64Bit() &&
+ "Should only be called with a 32-bit TLVP relocation!");
+
+ unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+ uint32_t Value = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+ unsigned IsPCRel = 0;
+
+ // We're only going to have a second symbol in PIC mode, and it'll be a
+ // subtraction from the picbase. For 32-bit PIC the addend is the difference
+ // between the picbase and the next address. For 32-bit static the addend is
+ // zero.
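+ // Illustrative (approximate syntax): a PIC access looks like
+ //   movl _x@TLVP-L0$pb(%esi), %eax
+ // where the picbase label (L0$pb here) is the SymB handled below.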
+ if (auto *SymB = Target.getSymB()) {
+ // If this is a subtraction then we're pcrel.
+ uint32_t FixupAddress =
+ Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset();
+ IsPCRel = 1;
+ FixedValue = FixupAddress -
+ Writer->getSymbolAddress(SymB->getSymbol(), Layout) +
+ Target.getConstant();
+ FixedValue += 1ULL << Log2Size;
+ } else {
+ FixedValue = 0;
+ }
+
+ // struct relocation_info (8 bytes)
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = Value;
+ MRE.r_word1 =
+ (IsPCRel << 24) | (Log2Size << 25) | (MachO::GENERIC_RELOC_TLV << 28);
+ Writer->addRelocation(&SymA->getSymbol(), Fragment->getParent(), MRE);
+}
+
+void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ uint64_t &FixedValue) {
+ unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+ unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+
+ // If this is a 32-bit TLVP reloc it's handled a bit differently.
+ if (Target.getSymA() &&
+ Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP) {
+ recordTLVPRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+ FixedValue);
+ return;
+ }
+
+ // If this is a difference or a defined symbol plus an offset, then we need a
+ // scattered relocation entry. Differences always require scattered
+ // relocations.
+ if (Target.getSymB()) {
+ recordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+ Target, Log2Size, FixedValue);
+ return;
+ }
+
+ // Get the symbol data, if any.
+ const MCSymbol *A = nullptr;
+ if (Target.getSymA())
+ A = &Target.getSymA()->getSymbol();
+
+ // If this is an internal relocation with an offset, it also needs a scattered
+ // relocation entry.
+ uint32_t Offset = Target.getConstant();
+ if (IsPCRel)
+ Offset += 1 << Log2Size;
+ // Try to record the scattered relocation if needed. Fall back to a
+ // non-scattered relocation if necessary (see the comments in
+ // recordScatteredRelocation() for details).
+ if (Offset && A && !Writer->doesSymbolRequireExternRelocation(*A) &&
+ recordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+ Log2Size, FixedValue))
+ return;
+
+ // See <reloc.h>.
+ uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+ unsigned Index = 0;
+ unsigned Type = 0;
+ const MCSymbol *RelSymbol = nullptr;
+
+ if (Target.isAbsolute()) { // constant
+ // SymbolNum of 0 indicates the absolute section.
+ //
+ // FIXME: Currently, these are never generated (see code below). I cannot
+ // find a case where they are actually emitted.
+ Type = MachO::GENERIC_RELOC_VANILLA;
+ } else {
+ // Resolve constant variables.
+ if (A->isVariable()) {
+ int64_t Res;
+ if (A->getVariableValue()->evaluateAsAbsolute(
+ Res, Layout, Writer->getSectionAddressMap())) {
+ FixedValue = Res;
+ return;
+ }
+ }
+
+ // Check whether we need an external or internal relocation.
+ if (Writer->doesSymbolRequireExternRelocation(*A)) {
+ RelSymbol = A;
+ // For external relocations, make sure to offset the fixup value to
+ // compensate for the addend of the symbol address, if it was
+ // undefined. This occurs with weak definitions, for example.
+ if (!A->isUndefined())
+ FixedValue -= Layout.getSymbolOffset(*A);
+ } else {
+ // The index is the section ordinal (1-based).
+ const MCSection &Sec = A->getSection();
+ Index = Sec.getOrdinal() + 1;
+ FixedValue += Writer->getSectionAddress(&Sec);
+ }
+ if (IsPCRel)
+ FixedValue -= Writer->getSectionAddress(Fragment->getParent());
+
+ Type = MachO::GENERIC_RELOC_VANILLA;
+ }
+
+ // struct relocation_info (8 bytes)
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 =
+ (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+ Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
+}
+
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createX86MachObjectWriter(bool Is64Bit, uint32_t CPUType,
+ uint32_t CPUSubtype) {
+ return std::make_unique<X86MachObjectWriter>(Is64Bit, CPUType, CPUSubtype);
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp
new file mode 100644
index 000000000000..201b22d6232d
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp
@@ -0,0 +1,571 @@
+//===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics into a
+// generic vector mask.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ShuffleDecode.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+
+//===----------------------------------------------------------------------===//
+// Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ // Default to copying the dest value.
+ ShuffleMask.push_back(0);
+ ShuffleMask.push_back(1);
+ ShuffleMask.push_back(2);
+ ShuffleMask.push_back(3);
+
+ // Decode the immediate.
+ unsigned ZMask = Imm & 15;
+ unsigned CountD = (Imm >> 4) & 3;
+ unsigned CountS = (Imm >> 6) & 3;
+
+ // CountS selects which input element to use.
+ unsigned InVal = 4 + CountS;
+ // CountD specifies which element of destination to update.
+ ShuffleMask[CountD] = InVal;
+ // ZMask zaps values, potentially overriding the CountD elt.
+ if (ZMask & 1) ShuffleMask[0] = SM_SentinelZero;
+ if (ZMask & 2) ShuffleMask[1] = SM_SentinelZero;
+ if (ZMask & 4) ShuffleMask[2] = SM_SentinelZero;
+ if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero;
+}
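+// For illustration: DecodeINSERTPSMask(0x40, Mask) gives CountS = 1,
+// CountD = 0 and ZMask = 0, producing the mask <5, 1, 2, 3>.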
+
+void DecodeInsertElementMask(unsigned NumElts, unsigned Idx, unsigned Len,
+ SmallVectorImpl<int> &ShuffleMask) {
+ assert((Idx + Len) <= NumElts && "Insertion out of range");
+
+ for (unsigned i = 0; i != NumElts; ++i)
+ ShuffleMask.push_back(i);
+ for (unsigned i = 0; i != Len; ++i)
+ ShuffleMask[Idx + i] = NumElts + i;
+}
+
+// <3,1> or <6,7,2,3>
+void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
+ for (unsigned i = NElts / 2; i != NElts; ++i)
+ ShuffleMask.push_back(NElts + i);
+
+ for (unsigned i = NElts / 2; i != NElts; ++i)
+ ShuffleMask.push_back(i);
+}
+
+// <0,2> or <0,1,4,5>
+void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
+ for (unsigned i = 0; i != NElts / 2; ++i)
+ ShuffleMask.push_back(i);
+
+ for (unsigned i = 0; i != NElts / 2; ++i)
+ ShuffleMask.push_back(NElts + i);
+}
+
+void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask) {
+ for (int i = 0, e = NumElts / 2; i < e; ++i) {
+ ShuffleMask.push_back(2 * i);
+ ShuffleMask.push_back(2 * i);
+ }
+}
+
+void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask) {
+ for (int i = 0, e = NumElts / 2; i < e; ++i) {
+ ShuffleMask.push_back(2 * i + 1);
+ ShuffleMask.push_back(2 * i + 1);
+ }
+}
+
+void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask) {
+ const unsigned NumLaneElts = 2;
+
+ for (unsigned l = 0; l < NumElts; l += NumLaneElts)
+ for (unsigned i = 0; i < NumLaneElts; ++i)
+ ShuffleMask.push_back(l);
+}
+
+void DecodePSLLDQMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ const unsigned NumLaneElts = 16;
+
+ for (unsigned l = 0; l < NumElts; l += NumLaneElts)
+ for (unsigned i = 0; i < NumLaneElts; ++i) {
+ int M = SM_SentinelZero;
+ if (i >= Imm) M = i - Imm + l;
+ ShuffleMask.push_back(M);
+ }
+}
+
+void DecodePSRLDQMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ const unsigned NumLaneElts = 16;
+
+ for (unsigned l = 0; l < NumElts; l += NumLaneElts)
+ for (unsigned i = 0; i < NumLaneElts; ++i) {
+ unsigned Base = i + Imm;
+ int M = Base + l;
+ if (Base >= NumLaneElts) M = SM_SentinelZero;
+ ShuffleMask.push_back(M);
+ }
+}
+
+void DecodePALIGNRMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ const unsigned NumLaneElts = 16;
+
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0; i != NumLaneElts; ++i) {
+ unsigned Base = i + Imm;
+ // if i+imm is out of this lane then we actually need the other source
+ if (Base >= NumLaneElts) Base += NumElts - NumLaneElts;
+ ShuffleMask.push_back(Base + l);
+ }
+ }
+}
+
+void DecodeVALIGNMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ // Not all bits of the immediate are used so mask it.
+ assert(isPowerOf2_32(NumElts) && "NumElts should be power of 2");
+ Imm = Imm & (NumElts - 1);
+ for (unsigned i = 0; i != NumElts; ++i)
+ ShuffleMask.push_back(i + Imm);
+}
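+// For illustration: with NumElts = 8 and Imm = 3 this produces
+// <3, 4, 5, 6, 7, 8, 9, 10>, where indices >= 8 refer to the second source.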
+
+void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned Size = NumElts * ScalarBits;
+ unsigned NumLanes = Size / 128;
+ if (NumLanes == 0) NumLanes = 1; // Handle MMX
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ uint32_t SplatImm = (Imm & 0xff) * 0x01010101;
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0; i != NumLaneElts; ++i) {
+ ShuffleMask.push_back(SplatImm % NumLaneElts + l);
+ SplatImm /= NumLaneElts;
+ }
+ }
+}
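+// For illustration: a v4i32 PSHUFD with Imm = 0x1B decodes to <3, 2, 1, 0>,
+// i.e. a full element reversal within the (single) 128-bit lane.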
+
+void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ unsigned NewImm = Imm;
+ for (unsigned i = 0, e = 4; i != e; ++i) {
+ ShuffleMask.push_back(l + i);
+ }
+ for (unsigned i = 4, e = 8; i != e; ++i) {
+ ShuffleMask.push_back(l + 4 + (NewImm & 3));
+ NewImm >>= 2;
+ }
+ }
+}
+
+void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ unsigned NewImm = Imm;
+ for (unsigned i = 0, e = 4; i != e; ++i) {
+ ShuffleMask.push_back(l + (NewImm & 3));
+ NewImm >>= 2;
+ }
+ for (unsigned i = 4, e = 8; i != e; ++i) {
+ ShuffleMask.push_back(l + i);
+ }
+ }
+}
+
+void DecodePSWAPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumHalfElts = NumElts / 2;
+
+ for (unsigned l = 0; l != NumHalfElts; ++l)
+ ShuffleMask.push_back(l + NumHalfElts);
+ for (unsigned h = 0; h != NumHalfElts; ++h)
+ ShuffleMask.push_back(h);
+}
+
+void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits,
+ unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumLaneElts = 128 / ScalarBits;
+
+ unsigned NewImm = Imm;
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ // Each half of a lane comes from a different source.
+ for (unsigned s = 0; s != NumElts * 2; s += NumElts) {
+ for (unsigned i = 0; i != NumLaneElts / 2; ++i) {
+ ShuffleMask.push_back(NewImm % NumLaneElts + s + l);
+ NewImm /= NumLaneElts;
+ }
+ }
+ if (NumLaneElts == 4) NewImm = Imm; // reload imm
+ }
+}
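+// For illustration: a v4f32 SHUFPS with Imm = 0x1B decodes to <3, 2, 5, 4>:
+// the low half comes from the first source, the high half from the second.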
+
+void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits,
+ SmallVectorImpl<int> &ShuffleMask) {
+ // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = (NumElts * ScalarBits) / 128;
+ if (NumLanes == 0) NumLanes = 1; // Handle MMX
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = l + NumLaneElts / 2, e = l + NumLaneElts; i != e; ++i) {
+ ShuffleMask.push_back(i); // Reads from dest/src1
+ ShuffleMask.push_back(i + NumElts); // Reads from src/src2
+ }
+ }
+}
+
+void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits,
+ SmallVectorImpl<int> &ShuffleMask) {
+ // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = (NumElts * ScalarBits) / 128;
+ if (NumLanes == 0 ) NumLanes = 1; // Handle MMX
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = l, e = l + NumLaneElts / 2; i != e; ++i) {
+ ShuffleMask.push_back(i); // Reads from dest/src1
+ ShuffleMask.push_back(i + NumElts); // Reads from src/src2
+ }
+ }
+}
+
+void DecodeVectorBroadcast(unsigned NumElts,
+ SmallVectorImpl<int> &ShuffleMask) {
+ ShuffleMask.append(NumElts, 0);
+}
+
+void DecodeSubVectorBroadcast(unsigned DstNumElts, unsigned SrcNumElts,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned Scale = DstNumElts / SrcNumElts;
+
+ for (unsigned i = 0; i != Scale; ++i)
+ for (unsigned j = 0; j != SrcNumElts; ++j)
+ ShuffleMask.push_back(j);
+}
+
+void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize,
+ unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElementsInLane = 128 / ScalarSize;
+ unsigned NumLanes = NumElts / NumElementsInLane;
+
+ for (unsigned l = 0; l != NumElts; l += NumElementsInLane) {
+ unsigned Index = (Imm % NumLanes) * NumElementsInLane;
+ Imm /= NumLanes; // Discard the bits we just used.
+ // We actually need the other source.
+ if (l >= (NumElts / 2))
+ Index += NumElts;
+ for (unsigned i = 0; i != NumElementsInLane; ++i)
+ ShuffleMask.push_back(Index + i);
+ }
+}
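+// For illustration: a 512-bit v8i64 shuffle (NumElts = 8, ScalarSize = 64)
+// with Imm = 0x44 decodes to <0, 1, 2, 3, 8, 9, 10, 11>, i.e. the two low
+// 128-bit lanes of each source.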
+
+void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned HalfSize = NumElts / 2;
+
+ for (unsigned l = 0; l != 2; ++l) {
+ unsigned HalfMask = Imm >> (l * 4);
+ unsigned HalfBegin = (HalfMask & 0x3) * HalfSize;
+ for (unsigned i = HalfBegin, e = HalfBegin + HalfSize; i != e; ++i)
+ ShuffleMask.push_back((HalfMask & 8) ? SM_SentinelZero : (int)i);
+ }
+}
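+// For illustration: VPERM2F128 with NumElts = 8 and Imm = 0x21 decodes to
+// <4, 5, 6, 7, 8, 9, 10, 11>; an immediate with bit 3 or bit 7 set zeroes
+// the corresponding half instead.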
+
+void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
+ SmallVectorImpl<int> &ShuffleMask) {
+ for (int i = 0, e = RawMask.size(); i < e; ++i) {
+ uint64_t M = RawMask[i];
+ if (UndefElts[i]) {
+ ShuffleMask.push_back(SM_SentinelUndef);
+ continue;
+ }
+ // For 256/512-bit vectors the base of the shuffle is the 128-bit
+ // subvector we're inside.
+ int Base = (i / 16) * 16;
+ // If the high bit (7) of the byte is set, the element is zeroed.
+ if (M & (1 << 7))
+ ShuffleMask.push_back(SM_SentinelZero);
+ else {
+ // Only the least significant 4 bits of the byte are used.
+ int Index = Base + (M & 0xf);
+ ShuffleMask.push_back(Index);
+ }
+ }
+}
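+// For illustration: in a v32i8 PSHUFB, a selector byte of 0x03 at element 20
+// picks index 16 + 3 = 19 (same 128-bit lane), while 0xFF (bit 7 set) yields
+// a zero element.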
+
+void DecodeBLENDMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ for (unsigned i = 0; i < NumElts; ++i) {
+ // If there are more than 8 elements in the vector, then any immediate blend
+ // mask wraps around.
+ unsigned Bit = i % 8;
+ ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElts + i : i);
+ }
+}
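+// For illustration: a v4f32 BLENDPS with Imm = 0x5 decodes to <4, 1, 6, 3>,
+// taking elements 0 and 2 from the second source.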
+
+void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
+ SmallVectorImpl<int> &ShuffleMask) {
+ assert(RawMask.size() == 16 && "Illegal VPPERM shuffle mask size");
+
+ // VPPERM Operation
+ // Bits[4:0] - Byte Index (0 - 31)
+ // Bits[7:5] - Permute Operation
+ //
+ // Permute Operation:
+ // 0 - Source byte (no logical operation).
+ // 1 - Invert source byte.
+ // 2 - Bit reverse of source byte.
+ // 3 - Bit reverse of inverted source byte.
+ // 4 - 00h (zero-fill).
+ // 5 - FFh (ones-fill).
+ // 6 - Most significant bit of source byte replicated in all bit positions.
+ // 7 - Invert most significant bit of source byte and replicate in all bit positions.
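+ //
+ // For illustration: a selector byte of 0x12 (PermuteOp 0) picks index 18,
+ // i.e. byte 2 of the second source; 0x80 (PermuteOp 4) produces a zero
+ // element; any other PermuteOp makes the mask undecodable and we bail out.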
+ for (int i = 0, e = RawMask.size(); i < e; ++i) {
+ if (UndefElts[i]) {
+ ShuffleMask.push_back(SM_SentinelUndef);
+ continue;
+ }
+
+ uint64_t M = RawMask[i];
+ uint64_t PermuteOp = (M >> 5) & 0x7;
+ if (PermuteOp == 4) {
+ ShuffleMask.push_back(SM_SentinelZero);
+ continue;
+ }
+ if (PermuteOp != 0) {
+ ShuffleMask.clear();
+ return;
+ }
+
+ uint64_t Index = M & 0x1F;
+ ShuffleMask.push_back((int)Index);
+ }
+}
+
+void DecodeVPERMMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ for (unsigned l = 0; l != NumElts; l += 4)
+ for (unsigned i = 0; i != 4; ++i)
+ ShuffleMask.push_back(l + ((Imm >> (2 * i)) & 3));
+}
+
+void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits,
+ unsigned NumDstElts, bool IsAnyExtend,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned Scale = DstScalarBits / SrcScalarBits;
+ assert(SrcScalarBits < DstScalarBits &&
+ "Expected zero extension mask to increase scalar size");
+
+ int Sentinel = IsAnyExtend ? SM_SentinelUndef : SM_SentinelZero;
+ for (unsigned i = 0; i != NumDstElts; i++) {
+ ShuffleMask.push_back(i);
+ ShuffleMask.append(Scale - 1, Sentinel);
+ }
+}
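+// For illustration: PMOVZXBW from 16 x i8 to 8 x i16 (SrcScalarBits = 8,
+// DstScalarBits = 16, NumDstElts = 8) decodes to <0, Z, 1, Z, ..., 7, Z>,
+// where Z is SM_SentinelZero (or SM_SentinelUndef for an any-extend).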
+
+void DecodeZeroMoveLowMask(unsigned NumElts,
+ SmallVectorImpl<int> &ShuffleMask) {
+ ShuffleMask.push_back(0);
+ ShuffleMask.append(NumElts - 1, SM_SentinelZero);
+}
+
+void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad,
+ SmallVectorImpl<int> &ShuffleMask) {
+ // The first element comes from the first element of the second source.
+ // Remaining elements: a load zero-extends them, a move copies them from the
+ // first source.
+ ShuffleMask.push_back(NumElts);
+ for (unsigned i = 1; i < NumElts; i++)
+ ShuffleMask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i);
+}
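+// For illustration: for a v4f32 MOVSS this gives <4, Z, Z, Z> for the load
+// form (IsLoad = true) and <4, 1, 2, 3> for the register form.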
+
+void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned HalfElts = NumElts / 2;
+
+ // Only the bottom 6 bits are valid for each immediate.
+ Len &= 0x3F;
+ Idx &= 0x3F;
+
+ // We can only decode this bit extraction instruction as a shuffle if both the
+ // length and index work with whole elements.
+ if (0 != (Len % EltSize) || 0 != (Idx % EltSize))
+ return;
+
+ // A length of zero is equivalent to a bit length of 64.
+ if (Len == 0)
+ Len = 64;
+
+ // If the length + index exceeds the bottom 64 bits the result is undefined.
+ if ((Len + Idx) > 64) {
+ ShuffleMask.append(NumElts, SM_SentinelUndef);
+ return;
+ }
+
+ // Convert the length and index to work with elements.
+ Len /= EltSize;
+ Idx /= EltSize;
+
+ // EXTRQ: Extract Len elements starting from Idx. Zero pad the remaining
+ // elements of the lower 64-bits. The upper 64-bits are undefined.
+ for (int i = 0; i != Len; ++i)
+ ShuffleMask.push_back(i + Idx);
+ for (int i = Len; i != (int)HalfElts; ++i)
+ ShuffleMask.push_back(SM_SentinelZero);
+ for (int i = HalfElts; i != (int)NumElts; ++i)
+ ShuffleMask.push_back(SM_SentinelUndef);
+}
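+// For illustration: EXTRQ on v16i8 (EltSize = 8) with a bit length of 16 and
+// a bit index of 8 decodes to <1, 2, Z, Z, Z, Z, Z, Z, U, U, U, U, U, U, U, U>.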
+
+void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned HalfElts = NumElts / 2;
+
+ // Only the bottom 6 bits are valid for each immediate.
+ Len &= 0x3F;
+ Idx &= 0x3F;
+
+ // We can only decode this bit insertion instruction as a shuffle if both the
+ // length and index work with whole elements.
+ if (0 != (Len % EltSize) || 0 != (Idx % EltSize))
+ return;
+
+ // A length of zero is equivalent to a bit length of 64.
+ if (Len == 0)
+ Len = 64;
+
+ // If the length + index exceeds the bottom 64 bits the result is undefined.
+ if ((Len + Idx) > 64) {
+ ShuffleMask.append(NumElts, SM_SentinelUndef);
+ return;
+ }
+
+ // Convert the length and index to work with elements.
+ Len /= EltSize;
+ Idx /= EltSize;
+
+ // INSERTQ: Extract lowest Len elements from lower half of second source and
+ // insert over first source starting at Idx element. The upper 64-bits are
+ // undefined.
+ for (int i = 0; i != Idx; ++i)
+ ShuffleMask.push_back(i);
+ for (int i = 0; i != Len; ++i)
+ ShuffleMask.push_back(i + NumElts);
+ for (int i = Idx + Len; i != (int)HalfElts; ++i)
+ ShuffleMask.push_back(i);
+ for (int i = HalfElts; i != (int)NumElts; ++i)
+ ShuffleMask.push_back(SM_SentinelUndef);
+}
+
+void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits,
+ ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned VecSize = NumElts * ScalarBits;
+ unsigned NumLanes = VecSize / 128;
+ unsigned NumEltsPerLane = NumElts / NumLanes;
+ assert((VecSize == 128 || VecSize == 256 || VecSize == 512) &&
+ "Unexpected vector size");
+ assert((ScalarBits == 32 || ScalarBits == 64) && "Unexpected element size");
+
+ for (unsigned i = 0, e = RawMask.size(); i < e; ++i) {
+ if (UndefElts[i]) {
+ ShuffleMask.push_back(SM_SentinelUndef);
+ continue;
+ }
+ uint64_t M = RawMask[i];
+ M = (ScalarBits == 64 ? ((M >> 1) & 0x1) : (M & 0x3));
+ unsigned LaneOffset = i & ~(NumEltsPerLane - 1);
+ ShuffleMask.push_back((int)(LaneOffset + M));
+ }
+}
+
+void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z,
+ ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned VecSize = NumElts * ScalarBits;
+ unsigned NumLanes = VecSize / 128;
+ unsigned NumEltsPerLane = NumElts / NumLanes;
+ assert((VecSize == 128 || VecSize == 256) && "Unexpected vector size");
+ assert((ScalarBits == 32 || ScalarBits == 64) && "Unexpected element size");
+ assert((NumElts == RawMask.size()) && "Unexpected mask size");
+
+ for (unsigned i = 0, e = RawMask.size(); i < e; ++i) {
+ if (UndefElts[i]) {
+ ShuffleMask.push_back(SM_SentinelUndef);
+ continue;
+ }
+
+ // VPERMIL2 Operation.
+ // Bits[3] - Match Bit.
+ // Bits[2:1] - (Per Lane) PD Shuffle Mask.
+ // Bits[2:0] - (Per Lane) PS Shuffle Mask.
+ uint64_t Selector = RawMask[i];
+ unsigned MatchBit = (Selector >> 3) & 0x1;
+
+ // M2Z[0:1] MatchBit
+ // 0Xb X Source selected by Selector index.
+ // 10b 0 Source selected by Selector index.
+ // 10b 1 Zero.
+ // 11b 0 Zero.
+ // 11b 1 Source selected by Selector index.
+ if ((M2Z & 0x2) != 0 && MatchBit != (M2Z & 0x1)) {
+ ShuffleMask.push_back(SM_SentinelZero);
+ continue;
+ }
+
+ int Index = i & ~(NumEltsPerLane - 1);
+ if (ScalarBits == 64)
+ Index += (Selector >> 1) & 0x1;
+ else
+ Index += Selector & 0x3;
+
+ int Src = (Selector >> 2) & 0x1;
+ Index += Src * NumElts;
+ ShuffleMask.push_back(Index);
+ }
+}
+
+void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
+ SmallVectorImpl<int> &ShuffleMask) {
+ uint64_t EltMaskSize = RawMask.size() - 1;
+ for (int i = 0, e = RawMask.size(); i != e; ++i) {
+ if (UndefElts[i]) {
+ ShuffleMask.push_back(SM_SentinelUndef);
+ continue;
+ }
+ uint64_t M = RawMask[i];
+ M &= EltMaskSize;
+ ShuffleMask.push_back((int)M);
+ }
+}
+
+void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
+ SmallVectorImpl<int> &ShuffleMask) {
+ uint64_t EltMaskSize = (RawMask.size() * 2) - 1;
+ for (int i = 0, e = RawMask.size(); i != e; ++i) {
+ if (UndefElts[i]) {
+ ShuffleMask.push_back(SM_SentinelUndef);
+ continue;
+ }
+ uint64_t M = RawMask[i];
+ M &= EltMaskSize;
+ ShuffleMask.push_back((int)M);
+ }
+}
+
+} // namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.h
new file mode 100644
index 000000000000..4ef9959f7a27
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.h
@@ -0,0 +1,166 @@
+//===-- X86ShuffleDecode.h - X86 shuffle decode logic -----------*-C++-*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics into a
+// generic vector mask.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
+#define LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
+
+#include <cstdint>
+
+//===----------------------------------------------------------------------===//
+// Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+class APInt;
+template <typename T> class ArrayRef;
+template <typename T> class SmallVectorImpl;
+
+enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 };
+
+/// Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
+void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// Insert the bottom Len elements from a second source into a vector starting
+/// at element Idx.
+void DecodeInsertElementMask(unsigned NumElts, unsigned Idx, unsigned Len,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
+/// i.e. <3,1> or <6,7,2,3>
+void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
+/// i.e. <0,2> or <0,1,4,5>
+void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSLLDQMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSRLDQMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePALIGNRMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeVALIGNMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
+void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes the shuffle masks for pshufhw.
+void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes the shuffle masks for pshuflw.
+void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes a PSWAPD 3DNow! instruction.
+void DecodePSWAPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes the shuffle masks for shufp*.
+void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
+void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
+void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes a broadcast of the first element of a vector.
+void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes a broadcast of a subvector to a larger vector type.
+void DecodeSubVectorBroadcast(unsigned DstNumElts, unsigned SrcNumElts,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a PSHUFB mask from a raw array of constants such as from
+/// BUILD_VECTOR.
+void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a BLEND immediate mask into a shuffle mask.
+void DecodeBLENDMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a shuffle of packed values at 128-bit granularity
+/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2)
+/// immediate mask into a shuffle mask.
+void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize,
+ unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes the shuffle masks for VPERMQ/VPERMPD.
+void DecodeVPERMMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPPERM mask from a raw array of constants such as from
+/// BUILD_VECTOR.
+/// This can only decode basic masks (permutes + zeros), not any of the other
+/// operations that VPPERM can perform.
+void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a zero extension instruction as a shuffle mask.
+void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits,
+ unsigned NumDstElts, bool IsAnyExtend,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a move lower and zero upper instruction as a shuffle mask.
+void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a scalar float move instruction as a shuffle mask.
+void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a SSE4A EXTRQ instruction as a shuffle mask.
+void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a SSE4A INSERTQ instruction as a shuffle mask.
+void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
+void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits,
+ ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
+void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z,
+ ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
+void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
+void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
+ SmallVectorImpl<int> &ShuffleMask);
+} // llvm namespace
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h
new file mode 100644
index 000000000000..3b1e9e7c34fb
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h
@@ -0,0 +1,34 @@
+//===- X86TargetStreamer.h ------------------------------*- C++ -*---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86TARGETSTREAMER_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86TARGETSTREAMER_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+
+/// X86 target streamer implementing x86-only assembly directives.
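+/// These hooks correspond to the .cv_fpo_* directives (.cv_fpo_proc,
+/// .cv_fpo_pushreg, .cv_fpo_stackalloc, .cv_fpo_stackalign, .cv_fpo_setframe,
+/// .cv_fpo_endprologue, .cv_fpo_endproc and .cv_fpo_data).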
+class X86TargetStreamer : public MCTargetStreamer {
+public:
+ X86TargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+
+ virtual bool emitFPOProc(const MCSymbol *ProcSym, unsigned ParamsSize,
+ SMLoc L = {}) = 0;
+ virtual bool emitFPOEndPrologue(SMLoc L = {}) = 0;
+ virtual bool emitFPOEndProc(SMLoc L = {}) = 0;
+ virtual bool emitFPOData(const MCSymbol *ProcSym, SMLoc L = {}) = 0;
+ virtual bool emitFPOPushReg(unsigned Reg, SMLoc L = {}) = 0;
+ virtual bool emitFPOStackAlloc(unsigned StackAlloc, SMLoc L = {}) = 0;
+ virtual bool emitFPOStackAlign(unsigned Align, SMLoc L = {}) = 0;
+ virtual bool emitFPOSetFrame(unsigned Reg, SMLoc L = {}) = 0;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
new file mode 100644
index 000000000000..760239f76505
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -0,0 +1,113 @@
+//===-- X86WinCOFFObjectWriter.cpp - X86 Win COFF Writer ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWinCOFFObjectWriter.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+
+class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
+public:
+ X86WinCOFFObjectWriter(bool Is64Bit);
+ ~X86WinCOFFObjectWriter() override = default;
+
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsCrossSection,
+ const MCAsmBackend &MAB) const override;
+};
+
+} // end anonymous namespace
+
+X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit)
+ : MCWinCOFFObjectTargetWriter(Is64Bit ? COFF::IMAGE_FILE_MACHINE_AMD64
+ : COFF::IMAGE_FILE_MACHINE_I386) {}
+
+unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx,
+ const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsCrossSection,
+ const MCAsmBackend &MAB) const {
+ unsigned FixupKind = Fixup.getKind();
+ if (IsCrossSection) {
+ if (FixupKind != FK_Data_4 && FixupKind != llvm::X86::reloc_signed_4byte) {
+ Ctx.reportError(Fixup.getLoc(), "Cannot represent this expression");
+ return COFF::IMAGE_REL_AMD64_ADDR32;
+ }
+ FixupKind = FK_PCRel_4;
+ }
+
+ MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
+ MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+
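+ // For illustration (the modifier spellings used by the MC asm parser):
+ // ".long foo@IMGREL" selects the image-relative (*_ADDR32NB / *_DIR32NB)
+ // relocations below, while ".long foo@SECREL32" selects the
+ // section-relative (*_SECREL) ones.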
+ if (getMachine() == COFF::IMAGE_FILE_MACHINE_AMD64) {
+ switch (FixupKind) {
+ case FK_PCRel_4:
+ case X86::reloc_riprel_4byte:
+ case X86::reloc_riprel_4byte_movq_load:
+ case X86::reloc_riprel_4byte_relax:
+ case X86::reloc_riprel_4byte_relax_rex:
+ case X86::reloc_branch_4byte_pcrel:
+ return COFF::IMAGE_REL_AMD64_REL32;
+ case FK_Data_4:
+ case X86::reloc_signed_4byte:
+ case X86::reloc_signed_4byte_relax:
+ if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32)
+ return COFF::IMAGE_REL_AMD64_ADDR32NB;
+ if (Modifier == MCSymbolRefExpr::VK_SECREL)
+ return COFF::IMAGE_REL_AMD64_SECREL;
+ return COFF::IMAGE_REL_AMD64_ADDR32;
+ case FK_Data_8:
+ return COFF::IMAGE_REL_AMD64_ADDR64;
+ case FK_SecRel_2:
+ return COFF::IMAGE_REL_AMD64_SECTION;
+ case FK_SecRel_4:
+ return COFF::IMAGE_REL_AMD64_SECREL;
+ default:
+ Ctx.reportError(Fixup.getLoc(), "unsupported relocation type");
+ return COFF::IMAGE_REL_AMD64_ADDR32;
+ }
+ } else if (getMachine() == COFF::IMAGE_FILE_MACHINE_I386) {
+ switch (FixupKind) {
+ case FK_PCRel_4:
+ case X86::reloc_riprel_4byte:
+ case X86::reloc_riprel_4byte_movq_load:
+ return COFF::IMAGE_REL_I386_REL32;
+ case FK_Data_4:
+ case X86::reloc_signed_4byte:
+ case X86::reloc_signed_4byte_relax:
+ if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32)
+ return COFF::IMAGE_REL_I386_DIR32NB;
+ if (Modifier == MCSymbolRefExpr::VK_SECREL)
+ return COFF::IMAGE_REL_I386_SECREL;
+ return COFF::IMAGE_REL_I386_DIR32;
+ case FK_SecRel_2:
+ return COFF::IMAGE_REL_I386_SECTION;
+ case FK_SecRel_4:
+ return COFF::IMAGE_REL_I386_SECREL;
+ default:
+ Ctx.reportError(Fixup.getLoc(), "unsupported relocation type");
+ return COFF::IMAGE_REL_I386_DIR32;
+ }
+ } else
+ llvm_unreachable("Unsupported COFF machine type.");
+}
+
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createX86WinCOFFObjectWriter(bool Is64Bit) {
+ return std::make_unique<X86WinCOFFObjectWriter>(Is64Bit);
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
new file mode 100644
index 000000000000..c29211246123
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -0,0 +1,80 @@
+//===-- X86WinCOFFStreamer.cpp - X86 Target WinCOFF Streamer ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCTargetDesc.h"
+#include "X86TargetStreamer.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCWin64EH.h"
+#include "llvm/MC/MCWinCOFFStreamer.h"
+
+using namespace llvm;
+
+namespace {
+class X86WinCOFFStreamer : public MCWinCOFFStreamer {
+ Win64EH::UnwindEmitter EHStreamer;
+public:
+ X86WinCOFFStreamer(MCContext &C, std::unique_ptr<MCAsmBackend> AB,
+ std::unique_ptr<MCCodeEmitter> CE,
+ std::unique_ptr<MCObjectWriter> OW)
+ : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {}
+
+ void EmitWinEHHandlerData(SMLoc Loc) override;
+ void EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) override;
+ void EmitWindowsUnwindTables() override;
+ void EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) override;
+ void finishImpl() override;
+};
+
+void X86WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) {
+ MCStreamer::EmitWinEHHandlerData(Loc);
+
+ // We have to emit the unwind info now, because this directive
+ // actually switches to the .xdata section.
+ if (WinEH::FrameInfo *CurFrame = getCurrentWinFrameInfo())
+ EHStreamer.EmitUnwindInfo(*this, CurFrame, /* HandlerData = */ true);
+}
+
+void X86WinCOFFStreamer::EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) {
+ EHStreamer.EmitUnwindInfo(*this, Frame, /* HandlerData = */ false);
+}
+
+void X86WinCOFFStreamer::EmitWindowsUnwindTables() {
+ if (!getNumWinFrameInfos())
+ return;
+ EHStreamer.Emit(*this);
+}
+
+void X86WinCOFFStreamer::EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) {
+ X86TargetStreamer *XTS =
+ static_cast<X86TargetStreamer *>(getTargetStreamer());
+ XTS->emitFPOData(ProcSym, Loc);
+}
+
+void X86WinCOFFStreamer::finishImpl() {
+ emitFrames(nullptr);
+ EmitWindowsUnwindTables();
+
+ MCWinCOFFStreamer::finishImpl();
+}
+} // namespace
+
+MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C,
+ std::unique_ptr<MCAsmBackend> &&AB,
+ std::unique_ptr<MCObjectWriter> &&OW,
+ std::unique_ptr<MCCodeEmitter> &&CE,
+ bool RelaxAll,
+ bool IncrementalLinkerCompatible) {
+ X86WinCOFFStreamer *S =
+ new X86WinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW));
+ S->getAssembler().setRelaxAll(RelaxAll);
+ S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
+ return S;
+}
+
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
new file mode 100644
index 000000000000..11251fb2b2ba
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
@@ -0,0 +1,461 @@
+//===-- X86WinCOFFTargetStreamer.cpp ----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCTargetDesc.h"
+#include "X86TargetStreamer.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/MC/MCCodeView.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+namespace {
+/// Implements Windows x86-only directives for assembly emission.
+class X86WinCOFFAsmTargetStreamer : public X86TargetStreamer {
+ formatted_raw_ostream &OS;
+ MCInstPrinter &InstPrinter;
+
+public:
+ X86WinCOFFAsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS,
+ MCInstPrinter &InstPrinter)
+ : X86TargetStreamer(S), OS(OS), InstPrinter(InstPrinter) {}
+
+ bool emitFPOProc(const MCSymbol *ProcSym, unsigned ParamsSize,
+ SMLoc L) override;
+ bool emitFPOEndPrologue(SMLoc L) override;
+ bool emitFPOEndProc(SMLoc L) override;
+ bool emitFPOData(const MCSymbol *ProcSym, SMLoc L) override;
+ bool emitFPOPushReg(unsigned Reg, SMLoc L) override;
+ bool emitFPOStackAlloc(unsigned StackAlloc, SMLoc L) override;
+ bool emitFPOStackAlign(unsigned Align, SMLoc L) override;
+ bool emitFPOSetFrame(unsigned Reg, SMLoc L) override;
+};
+
+/// Represents a single FPO directive.
+struct FPOInstruction {
+ MCSymbol *Label;
+ enum Operation {
+ PushReg,
+ StackAlloc,
+ StackAlign,
+ SetFrame,
+ } Op;
+ unsigned RegOrOffset;
+};
+
+struct FPOData {
+ const MCSymbol *Function = nullptr;
+ MCSymbol *Begin = nullptr;
+ MCSymbol *PrologueEnd = nullptr;
+ MCSymbol *End = nullptr;
+ unsigned ParamsSize = 0;
+
+ SmallVector<FPOInstruction, 5> Instructions;
+};
+
+/// Implements Windows x86-only directives for object emission.
+class X86WinCOFFTargetStreamer : public X86TargetStreamer {
+ /// Map from function symbol to its FPO data.
+ DenseMap<const MCSymbol *, std::unique_ptr<FPOData>> AllFPOData;
+
+ /// Current FPO data created by .cv_fpo_proc.
+ std::unique_ptr<FPOData> CurFPOData;
+
+ bool haveOpenFPOData() { return !!CurFPOData; }
+
+ /// Diagnoses an error at L if we are not in an FPO prologue. Return true on
+ /// error.
+ bool checkInFPOPrologue(SMLoc L);
+
+ MCSymbol *emitFPOLabel();
+
+ MCContext &getContext() { return getStreamer().getContext(); }
+
+public:
+ X86WinCOFFTargetStreamer(MCStreamer &S) : X86TargetStreamer(S) {}
+
+ bool emitFPOProc(const MCSymbol *ProcSym, unsigned ParamsSize,
+ SMLoc L) override;
+ bool emitFPOEndPrologue(SMLoc L) override;
+ bool emitFPOEndProc(SMLoc L) override;
+ bool emitFPOData(const MCSymbol *ProcSym, SMLoc L) override;
+ bool emitFPOPushReg(unsigned Reg, SMLoc L) override;
+ bool emitFPOStackAlloc(unsigned StackAlloc, SMLoc L) override;
+ bool emitFPOStackAlign(unsigned Align, SMLoc L) override;
+ bool emitFPOSetFrame(unsigned Reg, SMLoc L) override;
+};
+} // end namespace
+
+bool X86WinCOFFAsmTargetStreamer::emitFPOProc(const MCSymbol *ProcSym,
+ unsigned ParamsSize, SMLoc L) {
+ OS << "\t.cv_fpo_proc\t";
+ ProcSym->print(OS, getStreamer().getContext().getAsmInfo());
+ OS << ' ' << ParamsSize << '\n';
+ return false;
+}
+
+bool X86WinCOFFAsmTargetStreamer::emitFPOEndPrologue(SMLoc L) {
+ OS << "\t.cv_fpo_endprologue\n";
+ return false;
+}
+
+bool X86WinCOFFAsmTargetStreamer::emitFPOEndProc(SMLoc L) {
+ OS << "\t.cv_fpo_endproc\n";
+ return false;
+}
+
+bool X86WinCOFFAsmTargetStreamer::emitFPOData(const MCSymbol *ProcSym,
+ SMLoc L) {
+ OS << "\t.cv_fpo_data\t";
+ ProcSym->print(OS, getStreamer().getContext().getAsmInfo());
+ OS << '\n';
+ return false;
+}
+
+bool X86WinCOFFAsmTargetStreamer::emitFPOPushReg(unsigned Reg, SMLoc L) {
+ OS << "\t.cv_fpo_pushreg\t";
+ InstPrinter.printRegName(OS, Reg);
+ OS << '\n';
+ return false;
+}
+
+bool X86WinCOFFAsmTargetStreamer::emitFPOStackAlloc(unsigned StackAlloc,
+ SMLoc L) {
+ OS << "\t.cv_fpo_stackalloc\t" << StackAlloc << '\n';
+ return false;
+}
+
+bool X86WinCOFFAsmTargetStreamer::emitFPOStackAlign(unsigned Align, SMLoc L) {
+ OS << "\t.cv_fpo_stackalign\t" << Align << '\n';
+ return false;
+}
+
+bool X86WinCOFFAsmTargetStreamer::emitFPOSetFrame(unsigned Reg, SMLoc L) {
+ OS << "\t.cv_fpo_setframe\t";
+ InstPrinter.printRegName(OS, Reg);
+ OS << '\n';
+ return false;
+}
+
+bool X86WinCOFFTargetStreamer::checkInFPOPrologue(SMLoc L) {
+ if (!haveOpenFPOData() || CurFPOData->PrologueEnd) {
+ getContext().reportError(
+ L,
+ "directive must appear between .cv_fpo_proc and .cv_fpo_endprologue");
+ return true;
+ }
+ return false;
+}
+
+MCSymbol *X86WinCOFFTargetStreamer::emitFPOLabel() {
+ MCSymbol *Label = getContext().createTempSymbol("cfi", true);
+ getStreamer().emitLabel(Label);
+ return Label;
+}
+
+bool X86WinCOFFTargetStreamer::emitFPOProc(const MCSymbol *ProcSym,
+ unsigned ParamsSize, SMLoc L) {
+ if (haveOpenFPOData()) {
+ getContext().reportError(
+ L, "opening new .cv_fpo_proc before closing previous frame");
+ return true;
+ }
+ CurFPOData = std::make_unique<FPOData>();
+ CurFPOData->Function = ProcSym;
+ CurFPOData->Begin = emitFPOLabel();
+ CurFPOData->ParamsSize = ParamsSize;
+ return false;
+}
+
+bool X86WinCOFFTargetStreamer::emitFPOEndProc(SMLoc L) {
+ if (!haveOpenFPOData()) {
+ getContext().reportError(L, ".cv_fpo_endproc must appear after .cv_fpo_proc");
+ return true;
+ }
+ if (!CurFPOData->PrologueEnd) {
+ // Complain if there were prologue setup instructions but no end prologue.
+ if (!CurFPOData->Instructions.empty()) {
+ getContext().reportError(L, "missing .cv_fpo_endprologue");
+ CurFPOData->Instructions.clear();
+ }
+
+ // Claim there is a zero-length prologue to make the label math work out
+ // later.
+ CurFPOData->PrologueEnd = CurFPOData->Begin;
+ }
+
+ CurFPOData->End = emitFPOLabel();
+ const MCSymbol *Fn = CurFPOData->Function;
+ AllFPOData.insert({Fn, std::move(CurFPOData)});
+ return false;
+}
+
+bool X86WinCOFFTargetStreamer::emitFPOSetFrame(unsigned Reg, SMLoc L) {
+ if (checkInFPOPrologue(L))
+ return true;
+ FPOInstruction Inst;
+ Inst.Label = emitFPOLabel();
+ Inst.Op = FPOInstruction::SetFrame;
+ Inst.RegOrOffset = Reg;
+ CurFPOData->Instructions.push_back(Inst);
+ return false;
+}
+
+bool X86WinCOFFTargetStreamer::emitFPOPushReg(unsigned Reg, SMLoc L) {
+ if (checkInFPOPrologue(L))
+ return true;
+ FPOInstruction Inst;
+ Inst.Label = emitFPOLabel();
+ Inst.Op = FPOInstruction::PushReg;
+ Inst.RegOrOffset = Reg;
+ CurFPOData->Instructions.push_back(Inst);
+ return false;
+}
+
+bool X86WinCOFFTargetStreamer::emitFPOStackAlloc(unsigned StackAlloc, SMLoc L) {
+ if (checkInFPOPrologue(L))
+ return true;
+ FPOInstruction Inst;
+ Inst.Label = emitFPOLabel();
+ Inst.Op = FPOInstruction::StackAlloc;
+ Inst.RegOrOffset = StackAlloc;
+ CurFPOData->Instructions.push_back(Inst);
+ return false;
+}
+
+bool X86WinCOFFTargetStreamer::emitFPOStackAlign(unsigned Align, SMLoc L) {
+ if (checkInFPOPrologue(L))
+ return true;
+ if (!llvm::any_of(CurFPOData->Instructions, [](const FPOInstruction &Inst) {
+ return Inst.Op == FPOInstruction::SetFrame;
+ })) {
+ getContext().reportError(
+ L, "a frame register must be established before aligning the stack");
+ return true;
+ }
+ FPOInstruction Inst;
+ Inst.Label = emitFPOLabel();
+ Inst.Op = FPOInstruction::StackAlign;
+ Inst.RegOrOffset = Align;
+ CurFPOData->Instructions.push_back(Inst);
+ return false;
+}
+
+bool X86WinCOFFTargetStreamer::emitFPOEndPrologue(SMLoc L) {
+ if (checkInFPOPrologue(L))
+ return true;
+ CurFPOData->PrologueEnd = emitFPOLabel();
+ return false;
+}
+
+namespace {
+struct RegSaveOffset {
+ RegSaveOffset(unsigned Reg, unsigned Offset) : Reg(Reg), Offset(Offset) {}
+
+ unsigned Reg = 0;
+ unsigned Offset = 0;
+};
+
+struct FPOStateMachine {
+ explicit FPOStateMachine(const FPOData *FPO) : FPO(FPO) {}
+
+ const FPOData *FPO = nullptr;
+ unsigned FrameReg = 0;
+ unsigned FrameRegOff = 0;
+ unsigned CurOffset = 0;
+ unsigned LocalSize = 0;
+ unsigned SavedRegSize = 0;
+ unsigned StackOffsetBeforeAlign = 0;
+ unsigned StackAlign = 0;
+ unsigned Flags = 0; // FIXME: Set HasSEH / HasEH.
+
+ SmallString<128> FrameFunc;
+
+ SmallVector<RegSaveOffset, 4> RegSaveOffsets;
+
+ void emitFrameDataRecord(MCStreamer &OS, MCSymbol *Label);
+};
+} // end namespace
+
+static Printable printFPOReg(const MCRegisterInfo *MRI, unsigned LLVMReg) {
+ return Printable([MRI, LLVMReg](raw_ostream &OS) {
+ switch (LLVMReg) {
+ // MSVC only seems to emit symbolic register names for EIP, EBP, and ESP,
+ // but the format seems to support more than that, so we emit them.
+ case X86::EAX: OS << "$eax"; break;
+ case X86::EBX: OS << "$ebx"; break;
+ case X86::ECX: OS << "$ecx"; break;
+ case X86::EDX: OS << "$edx"; break;
+ case X86::EDI: OS << "$edi"; break;
+ case X86::ESI: OS << "$esi"; break;
+ case X86::ESP: OS << "$esp"; break;
+ case X86::EBP: OS << "$ebp"; break;
+ case X86::EIP: OS << "$eip"; break;
+ // Otherwise, get the codeview register number and print $N.
+ default:
+ OS << '$' << MRI->getCodeViewRegNum(LLVMReg);
+ break;
+ }
+ });
+}
+
+void FPOStateMachine::emitFrameDataRecord(MCStreamer &OS, MCSymbol *Label) {
+ unsigned CurFlags = Flags;
+ if (Label == FPO->Begin)
+ CurFlags |= FrameData::IsFunctionStart;
+
+ // Compute the new FrameFunc string.
+ FrameFunc.clear();
+ raw_svector_ostream FuncOS(FrameFunc);
+ const MCRegisterInfo *MRI = OS.getContext().getRegisterInfo();
+ assert((StackAlign == 0 || FrameReg != 0) &&
+ "cannot align stack without frame reg");
+ StringRef CFAVar = StackAlign == 0 ? "$T0" : "$T1";
+
+ if (FrameReg) {
+ // CFA is FrameReg + FrameRegOff.
+ FuncOS << CFAVar << ' ' << printFPOReg(MRI, FrameReg) << ' ' << FrameRegOff
+ << " + = ";
+
+ // Assign $T0, the VFRAME register, the value of ESP after it is aligned.
+ // Starting from the CFA, we subtract the size of all pushed registers, and
+ // align the result. While we don't store any CSRs in this area, $T0 is used
+ // by S_DEFRANGE_FRAMEPOINTER_REL records to find local variables.
+ if (StackAlign) {
+ FuncOS << "$T0 " << CFAVar << ' ' << StackOffsetBeforeAlign << " - "
+ << StackAlign << " @ = ";
+ }
+ } else {
+ // The address of the return address is ESP + CurOffset, but we use
+ // .raSearch to match MSVC. This seems to ask the debugger to subtract some
+ // combination of LocalSize and SavedRegSize from ESP and grovel around in
+ // that memory to find the address of a plausible return address.
+ FuncOS << CFAVar << " .raSearch = ";
+ }
+
+  // The caller's $eip should be the dereferenced CFA, and $esp should be CFA plus 4.
+ FuncOS << "$eip " << CFAVar << " ^ = ";
+ FuncOS << "$esp " << CFAVar << " 4 + = ";
+
+ // Each saved register is stored at an unchanging negative CFA offset.
+ for (RegSaveOffset RO : RegSaveOffsets)
+ FuncOS << printFPOReg(MRI, RO.Reg) << ' ' << CFAVar << ' ' << RO.Offset
+ << " - ^ = ";
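+
+  // Illustrative example (not taken from MSVC output): for a prologue that
+  // pushes EBP, establishes it as the frame register, and then pushes ESI,
+  // the program built above would be
+  //   "$T0 $ebp 4 + = $eip $T0 ^ = $esp $T0 4 + = $ebp $T0 4 - ^ = $esi $T0 8 - ^ = "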
+
+ // Add it to the CV string table.
+ CodeViewContext &CVCtx = OS.getContext().getCVContext();
+ unsigned FrameFuncStrTabOff = CVCtx.addToStringTable(FuncOS.str()).second;
+
+ // MSVC has only ever been observed to emit a MaxStackSize of zero.
+ unsigned MaxStackSize = 0;
+
+ // The FrameData record format is:
+ // ulittle32_t RvaStart;
+ // ulittle32_t CodeSize;
+ // ulittle32_t LocalSize;
+ // ulittle32_t ParamsSize;
+ // ulittle32_t MaxStackSize;
+ // ulittle32_t FrameFunc; // String table offset
+ // ulittle16_t PrologSize;
+ // ulittle16_t SavedRegsSize;
+ // ulittle32_t Flags;
+
+ OS.emitAbsoluteSymbolDiff(Label, FPO->Begin, 4); // RvaStart
+ OS.emitAbsoluteSymbolDiff(FPO->End, Label, 4); // CodeSize
+ OS.emitInt32(LocalSize);
+ OS.emitInt32(FPO->ParamsSize);
+ OS.emitInt32(MaxStackSize);
+ OS.emitInt32(FrameFuncStrTabOff); // FrameFunc
+ OS.emitAbsoluteSymbolDiff(FPO->PrologueEnd, Label, 2);
+ OS.emitInt16(SavedRegSize);
+ OS.emitInt32(CurFlags);
+}
+
+/// Compute and emit the real CodeView FrameData subsection.
+bool X86WinCOFFTargetStreamer::emitFPOData(const MCSymbol *ProcSym, SMLoc L) {
+ MCStreamer &OS = getStreamer();
+ MCContext &Ctx = OS.getContext();
+
+ auto I = AllFPOData.find(ProcSym);
+ if (I == AllFPOData.end()) {
+ Ctx.reportError(L, Twine("no FPO data found for symbol ") +
+ ProcSym->getName());
+ return true;
+ }
+ const FPOData *FPO = I->second.get();
+ assert(FPO->Begin && FPO->End && FPO->PrologueEnd && "missing FPO label");
+
+ MCSymbol *FrameBegin = Ctx.createTempSymbol(),
+ *FrameEnd = Ctx.createTempSymbol();
+
+ OS.emitInt32(unsigned(DebugSubsectionKind::FrameData));
+ OS.emitAbsoluteSymbolDiff(FrameEnd, FrameBegin, 4);
+ OS.emitLabel(FrameBegin);
+
+ // Start with the RVA of the function in question.
+ OS.emitValue(MCSymbolRefExpr::create(FPO->Function,
+ MCSymbolRefExpr::VK_COFF_IMGREL32, Ctx),
+ 4);
+
+ // Emit a sequence of FrameData records.
+ FPOStateMachine FSM(FPO);
+
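+  // Every FPO instruction that changes the unwind state gets its own FrameData
+  // record (stack allocations made after a frame register is established are
+  // skipped below): RvaStart advances to the instruction's label and CodeSize
+  // shrinks to cover the remainder of the function.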
+ FSM.emitFrameDataRecord(OS, FPO->Begin);
+ for (const FPOInstruction &Inst : FPO->Instructions) {
+ switch (Inst.Op) {
+ case FPOInstruction::PushReg:
+ FSM.CurOffset += 4;
+ FSM.SavedRegSize += 4;
+ FSM.RegSaveOffsets.push_back({Inst.RegOrOffset, FSM.CurOffset});
+ break;
+ case FPOInstruction::SetFrame:
+ FSM.FrameReg = Inst.RegOrOffset;
+ FSM.FrameRegOff = FSM.CurOffset;
+ break;
+ case FPOInstruction::StackAlign:
+ FSM.StackOffsetBeforeAlign = FSM.CurOffset;
+ FSM.StackAlign = Inst.RegOrOffset;
+ break;
+ case FPOInstruction::StackAlloc:
+ FSM.CurOffset += Inst.RegOrOffset;
+ FSM.LocalSize += Inst.RegOrOffset;
+ // No need to emit FrameData for stack allocations with a frame pointer.
+ if (FSM.FrameReg)
+ continue;
+ break;
+ }
+ FSM.emitFrameDataRecord(OS, Inst.Label);
+ }
+
+ OS.emitValueToAlignment(4, 0);
+ OS.emitLabel(FrameEnd);
+ return false;
+}
+
+MCTargetStreamer *llvm::createX86AsmTargetStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS,
+ MCInstPrinter *InstPrinter,
+ bool IsVerboseAsm) {
+ // FIXME: This makes it so we textually assemble COFF directives on ELF.
+ // That's kind of nonsensical.
+ return new X86WinCOFFAsmTargetStreamer(S, OS, *InstPrinter);
+}
+
+MCTargetStreamer *
+llvm::createX86ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+ // No need to register a target streamer.
+ if (!STI.getTargetTriple().isOSBinFormatCOFF())
+ return nullptr;
+ // Registers itself to the MCStreamer.
+ return new X86WinCOFFTargetStreamer(S);
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
new file mode 100644
index 000000000000..18cda8f591c3
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
@@ -0,0 +1,28 @@
+//===-- X86TargetInfo.cpp - X86 Target Implementation ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TargetInfo/X86TargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target &llvm::getTheX86_32Target() {
+ static Target TheX86_32Target;
+ return TheX86_32Target;
+}
+Target &llvm::getTheX86_64Target() {
+ static Target TheX86_64Target;
+ return TheX86_64Target;
+}
+
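+// Register both targets with the global TargetRegistry so tools and front
+// ends can look them up by triple (Triple::x86 and Triple::x86_64).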
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86TargetInfo() {
+ RegisterTarget<Triple::x86, /*HasJIT=*/true> X(
+ getTheX86_32Target(), "x86", "32-bit X86: Pentium-Pro and above", "X86");
+
+ RegisterTarget<Triple::x86_64, /*HasJIT=*/true> Y(
+ getTheX86_64Target(), "x86-64", "64-bit X86: EM64T and AMD64", "X86");
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.h
new file mode 100644
index 000000000000..caf6b8d424fc
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.h
@@ -0,0 +1,21 @@
+//===-- X86TargetInfo.h - X86 Target Implementation -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_TARGETINFO_X86TARGETINFO_H
+#define LLVM_LIB_TARGET_X86_TARGETINFO_X86TARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getTheX86_32Target();
+Target &getTheX86_64Target();
+
+}
+
+#endif // LLVM_LIB_TARGET_X86_TARGETINFO_X86TARGETINFO_H
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86.h b/contrib/llvm-project/llvm/lib/Target/X86/X86.h
new file mode 100644
index 000000000000..e17b9ba5500b
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86.h
@@ -0,0 +1,186 @@
+//===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the x86
+// target library, as used by the LLVM JIT.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86_H
+#define LLVM_LIB_TARGET_X86_X86_H
+
+#include "llvm/Support/CodeGen.h"
+
+namespace llvm {
+
+class FunctionPass;
+class InstructionSelector;
+class PassRegistry;
+class X86RegisterBankInfo;
+class X86Subtarget;
+class X86TargetMachine;
+
+/// This pass converts a legalized DAG into an X86-specific DAG, ready for
+/// instruction scheduling.
+FunctionPass *createX86ISelDag(X86TargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+
+/// This pass initializes a global base register for PIC on x86-32.
+FunctionPass *createX86GlobalBaseRegPass();
+
+/// This pass combines multiple accesses to local-dynamic TLS variables so that
+/// the TLS base address for the module is only fetched once per execution path
+/// through the function.
+FunctionPass *createCleanupLocalDynamicTLSPass();
+
+/// This function returns a pass which converts floating-point register
+/// references and pseudo instructions into floating-point stack references and
+/// physical instructions.
+FunctionPass *createX86FloatingPointStackifierPass();
+
+/// This pass inserts AVX vzeroupper instructions before each call to avoid
+/// transition penalty between functions encoded with AVX and SSE.
+FunctionPass *createX86IssueVZeroUpperPass();
+
+/// This pass inserts ENDBR instructions before indirect jump/call
+/// destinations as part of CET IBT mechanism.
+FunctionPass *createX86IndirectBranchTrackingPass();
+
+/// Return a pass that pads short functions with NOPs.
+/// This will prevent a stall when returning on the Atom.
+FunctionPass *createX86PadShortFunctions();
+
+/// Return a pass that selectively replaces certain instructions (like add,
+/// sub, inc, dec, some shifts, and some multiplies) by equivalent LEA
+/// instructions, in order to eliminate execution delays in some processors.
+FunctionPass *createX86FixupLEAs();
+
+/// Return a pass that removes redundant LEA instructions and redundant address
+/// recalculations.
+FunctionPass *createX86OptimizeLEAs();
+
+/// Return a pass that transforms setcc + movzx pairs into xor + setcc.
+FunctionPass *createX86FixupSetCC();
+
+/// Return a pass that avoids creating store-forwarding block issues in the hardware.
+FunctionPass *createX86AvoidStoreForwardingBlocks();
+
+/// Return a pass that lowers EFLAGS copy pseudo instructions.
+FunctionPass *createX86FlagsCopyLoweringPass();
+
+/// Return a pass that expands WinAlloca pseudo-instructions.
+FunctionPass *createX86WinAllocaExpander();
+
+FunctionPass *createX86TileConfigPass();
+
+FunctionPass *createX86PreTileConfigPass();
+
+/// Return a pass that inserts int3 at the end of the function if it ends with a
+/// CALL instruction. The pass does the same for each funclet as well. This
+/// ensures that the open interval of function start and end PCs contains all
+/// return addresses for the benefit of the Windows x64 unwinder.
+FunctionPass *createX86AvoidTrailingCallPass();
+
+/// Return a pass that optimizes the code-size of x86 call sequences. This is
+/// done by replacing esp-relative movs with pushes.
+FunctionPass *createX86CallFrameOptimization();
+
+/// Return an IR pass that inserts EH registration stack objects and explicit
+/// EH state updates. This pass must run after EH preparation, which does
+/// Windows-specific but architecture-neutral preparation.
+FunctionPass *createX86WinEHStatePass();
+
+/// Return a Machine IR pass that expands X86-specific pseudo
+/// instructions into a sequence of actual instructions. This pass
+/// must run after prologue/epilogue insertion and before lowering
+/// the MachineInstr to MC.
+FunctionPass *createX86ExpandPseudoPass();
+
+/// This pass converts X86 cmov instructions into branch when profitable.
+FunctionPass *createX86CmovConverterPass();
+
+/// Return a Machine IR pass that selectively replaces
+/// certain byte and word instructions with equivalent 32-bit instructions,
+/// in order to eliminate partial register usage, false dependences on
+/// the upper portions of registers, and to save code size.
+FunctionPass *createX86FixupBWInsts();
+
+/// Return a Machine IR pass that reassigns instruction chains from one domain
+/// to another, when profitable.
+FunctionPass *createX86DomainReassignmentPass();
+
+/// This pass replaces EVEX-encoded AVX-512 instructions with VEX encoding
+/// when possible in order to reduce code size.
+FunctionPass *createX86EvexToVexInsts();
+
+/// This pass creates the thunks for the retpoline feature.
+FunctionPass *createX86IndirectThunksPass();
+
+/// This pass ensures that instructions featuring a memory operand
+/// have distinct <LineNumber, Discriminator> pairs (with respect to each other).
+FunctionPass *createX86DiscriminateMemOpsPass();
+
+/// This pass applies profiling information to insert cache prefetches.
+FunctionPass *createX86InsertPrefetchPass();
+
+/// This pass inserts a wait instruction after X87 instructions which could
+/// raise FP exceptions when strict-fp is enabled.
+FunctionPass *createX86InsertX87waitPass();
+
+/// This pass optimizes arithmetic whose result is only used by a reduction
+/// sequence and is therefore safe to reassociate in interesting ways.
+FunctionPass *createX86PartialReductionPass();
+
+InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
+ X86Subtarget &,
+ X86RegisterBankInfo &);
+
+FunctionPass *createX86LoadValueInjectionLoadHardeningPass();
+FunctionPass *createX86LoadValueInjectionRetHardeningPass();
+FunctionPass *createX86SpeculativeLoadHardeningPass();
+FunctionPass *createX86SpeculativeExecutionSideEffectSuppression();
+
+void initializeEvexToVexInstPassPass(PassRegistry &);
+void initializeFixupBWInstPassPass(PassRegistry &);
+void initializeFixupLEAPassPass(PassRegistry &);
+void initializeFPSPass(PassRegistry &);
+void initializeWinEHStatePassPass(PassRegistry &);
+void initializeX86AvoidSFBPassPass(PassRegistry &);
+void initializeX86AvoidTrailingCallPassPass(PassRegistry &);
+void initializeX86CallFrameOptimizationPass(PassRegistry &);
+void initializeX86CmovConverterPassPass(PassRegistry &);
+void initializeX86DomainReassignmentPass(PassRegistry &);
+void initializeX86ExecutionDomainFixPass(PassRegistry &);
+void initializeX86ExpandPseudoPass(PassRegistry &);
+void initializeX86FixupSetCCPassPass(PassRegistry &);
+void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
+void initializeX86LoadValueInjectionLoadHardeningPassPass(PassRegistry &);
+void initializeX86LoadValueInjectionRetHardeningPassPass(PassRegistry &);
+void initializeX86OptimizeLEAPassPass(PassRegistry &);
+void initializeX86PartialReductionPass(PassRegistry &);
+void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
+void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &);
+void initializeX86PreTileConfigPass(PassRegistry &);
+void initializeX86TileConfigPass(PassRegistry &);
+void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &);
+
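+// LLVM IR address space numbers used by the X86 backend: 256-258 select
+// segment-relative addressing through GS, FS, and SS, while 270-272 model
+// 32-bit sign-extended, 32-bit zero-extended, and plain 64-bit pointers.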
+namespace X86AS {
+enum : unsigned {
+ GS = 256,
+ FS = 257,
+ SS = 258,
+ PTR32_SPTR = 270,
+ PTR32_UPTR = 271,
+ PTR64 = 272
+};
+} // End X86AS namespace
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86.td b/contrib/llvm-project/llvm/lib/Target/X86/X86.td
new file mode 100644
index 000000000000..c492d686c52e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86.td
@@ -0,0 +1,1477 @@
+//===-- X86.td - Target definition file for the Intel X86 --*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a target description file for the Intel i386 architecture, referred
+// to here as the "X86" architecture.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing...
+//
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// X86 Subtarget state
+//
+
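+// Exactly one of these mode features is set by X86Subtarget based on the
+// target triple; they gate the 16-, 32-, and 64-bit instruction forms.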
+def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true",
+ "64-bit mode (x86_64)">;
+def Mode32Bit : SubtargetFeature<"32bit-mode", "In32BitMode", "true",
+ "32-bit mode (80386)">;
+def Mode16Bit : SubtargetFeature<"16bit-mode", "In16BitMode", "true",
+ "16-bit mode (i8086)">;
+
+//===----------------------------------------------------------------------===//
+// X86 Subtarget features
+//===----------------------------------------------------------------------===//
+
+def FeatureX87 : SubtargetFeature<"x87","HasX87", "true",
+ "Enable X87 float instructions">;
+
+def FeatureNOPL : SubtargetFeature<"nopl", "HasNOPL", "true",
+ "Enable NOPL instruction">;
+
+def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true",
+ "Enable conditional move instructions">;
+
+def FeatureCMPXCHG8B : SubtargetFeature<"cx8", "HasCmpxchg8b", "true",
+ "Support CMPXCHG8B instructions">;
+
+def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true",
+ "Support POPCNT instruction">;
+
+def FeatureFXSR : SubtargetFeature<"fxsr", "HasFXSR", "true",
+ "Support fxsave/fxrestore instructions">;
+
+def FeatureXSAVE : SubtargetFeature<"xsave", "HasXSAVE", "true",
+ "Support xsave instructions">;
+
+def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true",
+ "Support xsaveopt instructions",
+ [FeatureXSAVE]>;
+
+def FeatureXSAVEC : SubtargetFeature<"xsavec", "HasXSAVEC", "true",
+ "Support xsavec instructions",
+ [FeatureXSAVE]>;
+
+def FeatureXSAVES : SubtargetFeature<"xsaves", "HasXSAVES", "true",
+ "Support xsaves instructions",
+ [FeatureXSAVE]>;
+
+def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
+ "Enable SSE instructions">;
+def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2",
+ "Enable SSE2 instructions",
+ [FeatureSSE1]>;
+def FeatureSSE3 : SubtargetFeature<"sse3", "X86SSELevel", "SSE3",
+ "Enable SSE3 instructions",
+ [FeatureSSE2]>;
+def FeatureSSSE3 : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
+ "Enable SSSE3 instructions",
+ [FeatureSSE3]>;
+def FeatureSSE41 : SubtargetFeature<"sse4.1", "X86SSELevel", "SSE41",
+ "Enable SSE 4.1 instructions",
+ [FeatureSSSE3]>;
+def FeatureSSE42 : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42",
+ "Enable SSE 4.2 instructions",
+ [FeatureSSE41]>;
+// The MMX subtarget feature is separate from the rest of the SSE features
+// because it's important (for odd compatibility reasons) to be able to
+// turn it off explicitly while allowing SSE+ to be on.
+def FeatureMMX : SubtargetFeature<"mmx","X863DNowLevel", "MMX",
+ "Enable MMX instructions">;
+def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
+ "Enable 3DNow! instructions",
+ [FeatureMMX]>;
+def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
+ "Enable 3DNow! Athlon instructions",
+ [Feature3DNow]>;
+// All x86-64 hardware has SSE2, but we don't mark SSE2 as an implied
+// feature, because SSE2 can be disabled (e.g. for compiling OS kernels)
+// without disabling 64-bit mode. Nothing should imply this feature bit. It
+// is used to enforce that only 64-bit capable CPUs are used in 64-bit mode.
+def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
+ "Support 64-bit instructions">;
+def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true",
+ "64-bit with cmpxchg16b",
+ [FeatureCMPXCHG8B]>;
+def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
+ "SHLD instruction is slow">;
+def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true",
+ "PMULLD instruction is slow">;
+def FeatureSlowPMADDWD : SubtargetFeature<"slow-pmaddwd", "IsPMADDWDSlow",
+ "true",
+ "PMADDWD is slower than PMULLD">;
+// FIXME: This should not apply to CPUs that do not have SSE.
+def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16",
+ "IsUAMem16Slow", "true",
+ "Slow unaligned 16-byte memory access">;
+def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32",
+ "IsUAMem32Slow", "true",
+ "Slow unaligned 32-byte memory access">;
+def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true",
+ "Support SSE 4a instructions",
+ [FeatureSSE3]>;
+
+def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX",
+ "Enable AVX instructions",
+ [FeatureSSE42]>;
+def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
+ "Enable AVX2 instructions",
+ [FeatureAVX]>;
+def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true",
+                                  "Enable three-operand fused multiply-add",
+ [FeatureAVX]>;
+def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
+ "Support 16-bit floating point conversion instructions",
+ [FeatureAVX]>;
+def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F",
+ "Enable AVX-512 instructions",
+ [FeatureAVX2, FeatureFMA, FeatureF16C]>;
+def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true",
+ "Enable AVX-512 Exponential and Reciprocal Instructions",
+ [FeatureAVX512]>;
+def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true",
+ "Enable AVX-512 Conflict Detection Instructions",
+ [FeatureAVX512]>;
+def FeatureVPOPCNTDQ : SubtargetFeature<"avx512vpopcntdq", "HasVPOPCNTDQ",
+ "true", "Enable AVX-512 Population Count Instructions",
+ [FeatureAVX512]>;
+def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true",
+ "Enable AVX-512 PreFetch Instructions",
+ [FeatureAVX512]>;
+def FeaturePREFETCHWT1 : SubtargetFeature<"prefetchwt1", "HasPREFETCHWT1",
+ "true",
+ "Prefetch with Intent to Write and T1 Hint">;
+def FeatureDQI : SubtargetFeature<"avx512dq", "HasDQI", "true",
+ "Enable AVX-512 Doubleword and Quadword Instructions",
+ [FeatureAVX512]>;
+def FeatureBWI : SubtargetFeature<"avx512bw", "HasBWI", "true",
+ "Enable AVX-512 Byte and Word Instructions",
+ [FeatureAVX512]>;
+def FeatureVLX : SubtargetFeature<"avx512vl", "HasVLX", "true",
+ "Enable AVX-512 Vector Length eXtensions",
+ [FeatureAVX512]>;
+def FeatureVBMI : SubtargetFeature<"avx512vbmi", "HasVBMI", "true",
+ "Enable AVX-512 Vector Byte Manipulation Instructions",
+ [FeatureBWI]>;
+def FeatureVBMI2 : SubtargetFeature<"avx512vbmi2", "HasVBMI2", "true",
+ "Enable AVX-512 further Vector Byte Manipulation Instructions",
+ [FeatureBWI]>;
+def FeatureIFMA : SubtargetFeature<"avx512ifma", "HasIFMA", "true",
+                      "Enable AVX-512 Integer Fused Multiply-Add",
+ [FeatureAVX512]>;
+def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true",
+ "Enable protection keys">;
+def FeatureVNNI : SubtargetFeature<"avx512vnni", "HasVNNI", "true",
+ "Enable AVX-512 Vector Neural Network Instructions",
+ [FeatureAVX512]>;
+def FeatureAVXVNNI : SubtargetFeature<"avxvnni", "HasAVXVNNI", "true",
+ "Support AVX_VNNI encoding",
+ [FeatureAVX2]>;
+def FeatureBF16 : SubtargetFeature<"avx512bf16", "HasBF16", "true",
+ "Support bfloat16 floating point",
+ [FeatureBWI]>;
+def FeatureBITALG : SubtargetFeature<"avx512bitalg", "HasBITALG", "true",
+ "Enable AVX-512 Bit Algorithms",
+ [FeatureBWI]>;
+def FeatureVP2INTERSECT : SubtargetFeature<"avx512vp2intersect",
+ "HasVP2INTERSECT", "true",
+ "Enable AVX-512 vp2intersect",
+ [FeatureAVX512]>;
+def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
+ "Enable packed carry-less multiplication instructions",
+ [FeatureSSE2]>;
+def FeatureGFNI : SubtargetFeature<"gfni", "HasGFNI", "true",
+ "Enable Galois Field Arithmetic Instructions",
+ [FeatureSSE2]>;
+def FeatureVPCLMULQDQ : SubtargetFeature<"vpclmulqdq", "HasVPCLMULQDQ", "true",
+ "Enable vpclmulqdq instructions",
+ [FeatureAVX, FeaturePCLMUL]>;
+def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true",
+                      "Enable four-operand fused multiply-add",
+ [FeatureAVX, FeatureSSE4A]>;
+def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true",
+ "Enable XOP instructions",
+ [FeatureFMA4]>;
+def FeatureSSEUnalignedMem : SubtargetFeature<"sse-unaligned-mem",
+ "HasSSEUnalignedMem", "true",
+ "Allow unaligned memory operands with SSE instructions">;
+def FeatureAES : SubtargetFeature<"aes", "HasAES", "true",
+ "Enable AES instructions",
+ [FeatureSSE2]>;
+def FeatureVAES : SubtargetFeature<"vaes", "HasVAES", "true",
+ "Promote selected AES instructions to AVX512/AVX registers",
+ [FeatureAVX, FeatureAES]>;
+def FeatureTBM : SubtargetFeature<"tbm", "HasTBM", "true",
+ "Enable TBM instructions">;
+def FeatureLWP : SubtargetFeature<"lwp", "HasLWP", "true",
+ "Enable LWP instructions">;
+def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true",
+ "Support MOVBE instruction">;
+def FeatureRDRAND : SubtargetFeature<"rdrnd", "HasRDRAND", "true",
+ "Support RDRAND instruction">;
+def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true",
+ "Support FS/GS Base instructions">;
+def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
+ "Support LZCNT instruction">;
+def FeatureBMI : SubtargetFeature<"bmi", "HasBMI", "true",
+ "Support BMI instructions">;
+def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true",
+ "Support BMI2 instructions">;
+def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true",
+ "Support RTM instructions">;
+def FeatureADX : SubtargetFeature<"adx", "HasADX", "true",
+ "Support ADX instructions">;
+def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true",
+ "Enable SHA instructions",
+ [FeatureSSE2]>;
+def FeatureSHSTK : SubtargetFeature<"shstk", "HasSHSTK", "true",
+ "Support CET Shadow-Stack instructions">;
+def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
+ "Support PRFCHW instructions">;
+def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true",
+ "Support RDSEED instruction">;
+def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF64", "true",
+ "Support LAHF and SAHF instructions in 64-bit mode">;
+def FeatureMWAITX : SubtargetFeature<"mwaitx", "HasMWAITX", "true",
+ "Enable MONITORX/MWAITX timer functionality">;
+def FeatureCLZERO : SubtargetFeature<"clzero", "HasCLZERO", "true",
+ "Enable Cache Line Zero">;
+def FeatureCLDEMOTE : SubtargetFeature<"cldemote", "HasCLDEMOTE", "true",
+ "Enable Cache Demote">;
+def FeaturePTWRITE : SubtargetFeature<"ptwrite", "HasPTWRITE", "true",
+ "Support ptwrite instruction">;
+def FeatureAMXTILE : SubtargetFeature<"amx-tile", "HasAMXTILE", "true",
+ "Support AMX-TILE instructions">;
+def FeatureAMXINT8 : SubtargetFeature<"amx-int8", "HasAMXINT8", "true",
+ "Support AMX-INT8 instructions",
+ [FeatureAMXTILE]>;
+def FeatureAMXBF16 : SubtargetFeature<"amx-bf16", "HasAMXBF16", "true",
+ "Support AMX-BF16 instructions",
+ [FeatureAMXTILE]>;
+def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
+ "Use LEA for adjusting the stack pointer">;
+def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb",
+ "HasSlowDivide32", "true",
+ "Use 8-bit divide for positive values less than 256">;
+def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl",
+ "HasSlowDivide64", "true",
+ "Use 32-bit divide for positive values less than 2^32">;
+def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
+ "PadShortFunctions", "true",
+ "Pad short functions">;
+def FeatureINVPCID : SubtargetFeature<"invpcid", "HasINVPCID", "true",
+ "Invalidate Process-Context Identifier">;
+def FeatureSGX : SubtargetFeature<"sgx", "HasSGX", "true",
+ "Enable Software Guard Extensions">;
+def FeatureCLFLUSHOPT : SubtargetFeature<"clflushopt", "HasCLFLUSHOPT", "true",
+ "Flush A Cache Line Optimized">;
+def FeatureCLWB : SubtargetFeature<"clwb", "HasCLWB", "true",
+ "Cache Line Write Back">;
+def FeatureWBNOINVD : SubtargetFeature<"wbnoinvd", "HasWBNOINVD", "true",
+ "Write Back No Invalidate">;
+def FeatureRDPID : SubtargetFeature<"rdpid", "HasRDPID", "true",
+ "Support RDPID instructions">;
+def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true",
+ "Wait and pause enhancements">;
+def FeatureENQCMD : SubtargetFeature<"enqcmd", "HasENQCMD", "true",
+ "Has ENQCMD instructions">;
+def FeatureKL : SubtargetFeature<"kl", "HasKL", "true",
+ "Support Key Locker kl Instructions",
+ [FeatureSSE2]>;
+def FeatureWIDEKL : SubtargetFeature<"widekl", "HasWIDEKL", "true",
+ "Support Key Locker wide Instructions",
+ [FeatureKL]>;
+def FeatureHRESET : SubtargetFeature<"hreset", "HasHRESET", "true",
+ "Has hreset instruction">;
+def FeatureSERIALIZE : SubtargetFeature<"serialize", "HasSERIALIZE", "true",
+ "Has serialize instruction">;
+def FeatureTSXLDTRK : SubtargetFeature<"tsxldtrk", "HasTSXLDTRK", "true",
+ "Support TSXLDTRK instructions">;
+def FeatureUINTR : SubtargetFeature<"uintr", "HasUINTR", "true",
+ "Has UINTR Instructions">;
+// On some processors, instructions that implicitly take two memory operands are
+// slow. In practice, this means that CALL, PUSH, and POP with memory operands
+// should be avoided in favor of a MOV + register CALL/PUSH/POP.
+def FeatureSlowTwoMemOps : SubtargetFeature<"slow-two-mem-ops",
+ "SlowTwoMemOps", "true",
+ "Two memory operand instructions are slow">;
+def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
+ "LEA instruction needs inputs at AG stage">;
+def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
+ "LEA instruction with certain arguments is slow">;
+def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true",
+ "LEA instruction with 3 ops or certain registers is slow">;
+def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
+ "INC and DEC instructions are slower than ADD and SUB">;
+def FeatureSoftFloat
+ : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
+ "Use software floating point features">;
+def FeaturePOPCNTFalseDeps : SubtargetFeature<"false-deps-popcnt",
+ "HasPOPCNTFalseDeps", "true",
+ "POPCNT has a false dependency on dest register">;
+def FeatureLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt",
+ "HasLZCNTFalseDeps", "true",
+ "LZCNT/TZCNT have a false dependency on dest register">;
+def FeaturePCONFIG : SubtargetFeature<"pconfig", "HasPCONFIG", "true",
+ "platform configuration instruction">;
+// On recent X86 (port bound) processors, it's preferable to combine into a
+// single shuffle using a variable mask rather than multiple fixed shuffles.
+def FeatureFastVariableShuffle
+ : SubtargetFeature<"fast-variable-shuffle",
+ "HasFastVariableShuffle",
+ "true", "Shuffles with variable masks are fast">;
+// On some X86 processors, a vzeroupper instruction should be inserted after
+// using ymm/zmm registers before executing code that may use SSE instructions.
+def FeatureInsertVZEROUPPER
+ : SubtargetFeature<"vzeroupper",
+ "InsertVZEROUPPER",
+ "true", "Should insert vzeroupper instructions">;
+// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
+// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
+// vector FSQRT has higher throughput than the corresponding NR code.
+// The idea is that throughput bound code is likely to be vectorized, so for
+// vectorized code we should care about the throughput of SQRT operations.
+// But if the code is scalar that probably means that the code has some kind of
+// dependency and we should care more about reducing the latency.
+def FeatureFastScalarFSQRT
+ : SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT",
+ "true", "Scalar SQRT is fast (disable Newton-Raphson)">;
+def FeatureFastVectorFSQRT
+ : SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT",
+ "true", "Vector SQRT is fast (disable Newton-Raphson)">;
+// If lzcnt has equivalent latency/throughput to most simple integer ops, it can
+// be used to replace test/set sequences.
+def FeatureFastLZCNT
+ : SubtargetFeature<
+ "fast-lzcnt", "HasFastLZCNT", "true",
+ "LZCNT instructions are as fast as most simple integer ops">;
+// If the target can efficiently decode NOPs up to 7 bytes in length.
+def FeatureFast7ByteNOP
+ : SubtargetFeature<
+ "fast-7bytenop", "HasFast7ByteNOP", "true",
+ "Target can quickly decode up to 7 byte NOPs">;
+// If the target can efficiently decode NOPs up to 11 bytes in length.
+def FeatureFast11ByteNOP
+ : SubtargetFeature<
+ "fast-11bytenop", "HasFast11ByteNOP", "true",
+ "Target can quickly decode up to 11 byte NOPs">;
+// If the target can efficiently decode NOPs up to 15 bytes in length.
+def FeatureFast15ByteNOP
+ : SubtargetFeature<
+ "fast-15bytenop", "HasFast15ByteNOP", "true",
+ "Target can quickly decode up to 15 byte NOPs">;
+// Sandy Bridge and newer processors can use SHLD with the same source on both
+// inputs to implement rotate to avoid the partial flag update of the normal
+// rotate instructions.
+def FeatureFastSHLDRotate
+ : SubtargetFeature<
+ "fast-shld-rotate", "HasFastSHLDRotate", "true",
+ "SHLD can be used as a faster rotate">;
+
+// Ivy Bridge and newer processors have enhanced REP MOVSB and STOSB (aka
+// "string operations"). See "REP String Enhancement" in the Intel Software
+// Development Manual. This feature essentially means that REP MOVSB will copy
+// using the largest available size instead of copying bytes one by one, making
+// it at least as fast as REPMOVS{W,D,Q}.
+def FeatureERMSB
+ : SubtargetFeature<
+ "ermsb", "HasERMSB", "true",
+ "REP MOVS/STOS are fast">;
+
+// Icelake and newer processors have Fast Short REP MOV.
+def FeatureFSRM
+ : SubtargetFeature<
+ "fsrm", "HasFSRM", "true",
+ "REP MOVSB of short lengths is faster">;
+
+// Bulldozer and newer processors can merge CMP/TEST (but not other
+// instructions) with conditional branches.
+def FeatureBranchFusion
+ : SubtargetFeature<"branchfusion", "HasBranchFusion", "true",
+ "CMP/TEST can be fused with conditional branches">;
+
+// Sandy Bridge and newer processors have many instructions that can be
+// fused with conditional branches and pass through the CPU as a single
+// operation.
+def FeatureMacroFusion
+ : SubtargetFeature<"macrofusion", "HasMacroFusion", "true",
+ "Various instructions can be fused with conditional branches">;
+
+// Gather is available since Haswell (AVX2 set). So technically, we can
+// generate Gathers on all AVX2 processors. But the overhead on HSW is high.
+// Skylake Client processor has faster Gathers than HSW and performance is
+// similar to Skylake Server (AVX-512).
+def FeatureHasFastGather
+ : SubtargetFeature<"fast-gather", "HasFastGather", "true",
+ "Indicates if gather is reasonably fast">;
+
+def FeaturePrefer128Bit
+ : SubtargetFeature<"prefer-128-bit", "Prefer128Bit", "true",
+ "Prefer 128-bit AVX instructions">;
+
+def FeaturePrefer256Bit
+ : SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true",
+ "Prefer 256-bit AVX instructions">;
+
+def FeaturePreferMaskRegisters
+ : SubtargetFeature<"prefer-mask-registers", "PreferMaskRegisters", "true",
+ "Prefer AVX512 mask registers over PTEST/MOVMSK">;
+
+// Lower indirect calls using a special construct called a `retpoline` to
+// mitigate potential Spectre v2 attacks against them.
+def FeatureRetpolineIndirectCalls
+ : SubtargetFeature<
+ "retpoline-indirect-calls", "UseRetpolineIndirectCalls", "true",
+ "Remove speculation of indirect calls from the generated code">;
+
+// Lower indirect branches and switches either using conditional branch trees
+// or using a special construct called a `retpoline` to mitigate potential
+// Spectre v2 attacks against them.
+def FeatureRetpolineIndirectBranches
+ : SubtargetFeature<
+ "retpoline-indirect-branches", "UseRetpolineIndirectBranches", "true",
+ "Remove speculation of indirect branches from the generated code">;
+
+// Deprecated umbrella feature for enabling both `retpoline-indirect-calls` and
+// `retpoline-indirect-branches` above.
+def FeatureRetpoline
+ : SubtargetFeature<"retpoline", "DeprecatedUseRetpoline", "true",
+ "Remove speculation of indirect branches from the "
+ "generated code, either by avoiding them entirely or "
+ "lowering them with a speculation blocking construct",
+ [FeatureRetpolineIndirectCalls,
+ FeatureRetpolineIndirectBranches]>;
+
+// Rely on external thunks for the emitted retpoline calls. This allows users
+// to provide their own custom thunk definitions in highly specialized
+// environments such as a kernel that does boot-time hot patching.
+def FeatureRetpolineExternalThunk
+ : SubtargetFeature<
+ "retpoline-external-thunk", "UseRetpolineExternalThunk", "true",
+ "When lowering an indirect call or branch using a `retpoline`, rely "
+ "on the specified user provided thunk rather than emitting one "
+ "ourselves. Only has effect when combined with some other retpoline "
+ "feature", [FeatureRetpolineIndirectCalls]>;
+
+// Mitigate LVI attacks against indirect calls/branches and call returns
+def FeatureLVIControlFlowIntegrity
+ : SubtargetFeature<
+ "lvi-cfi", "UseLVIControlFlowIntegrity", "true",
+ "Prevent indirect calls/branches from using a memory operand, and "
+ "precede all indirect calls/branches from a register with an "
+ "LFENCE instruction to serialize control flow. Also decompose RET "
+ "instructions into a POP+LFENCE+JMP sequence.">;
+
+// Enable SESES to mitigate speculative execution attacks
+def FeatureSpeculativeExecutionSideEffectSuppression
+ : SubtargetFeature<
+ "seses", "UseSpeculativeExecutionSideEffectSuppression", "true",
+ "Prevent speculative execution side channel timing attacks by "
+ "inserting a speculation barrier before memory reads, memory writes, "
+ "and conditional branches. Implies LVI Control Flow integrity.",
+ [FeatureLVIControlFlowIntegrity]>;
+
+// Mitigate LVI attacks against data loads
+def FeatureLVILoadHardening
+ : SubtargetFeature<
+ "lvi-load-hardening", "UseLVILoadHardening", "true",
+ "Insert LFENCE instructions to prevent data speculatively injected "
+ "into loads from being used maliciously.">;
+
+// Direct Move instructions.
+def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true",
+ "Support movdiri instruction">;
+def FeatureMOVDIR64B : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true",
+ "Support movdir64b instruction">;
+
+def FeatureFastBEXTR : SubtargetFeature<"fast-bextr", "HasFastBEXTR", "true",
+ "Indicates that the BEXTR instruction is implemented as a single uop "
+ "with good throughput">;
+
+// Combine vector math operations with shuffles into horizontal math
+// instructions if a CPU implements horizontal operations (introduced with
+// SSE3) with better latency/throughput than the alternative sequence.
+def FeatureFastHorizontalOps
+ : SubtargetFeature<
+ "fast-hops", "HasFastHorizontalOps", "true",
+ "Prefer horizontal vector math instructions (haddp, phsub, etc.) over "
+ "normal vector instructions with shuffles">;
+
+def FeatureFastScalarShiftMasks
+ : SubtargetFeature<
+ "fast-scalar-shift-masks", "HasFastScalarShiftMasks", "true",
+ "Prefer a left/right scalar logical shift pair over a shift+and pair">;
+
+def FeatureFastVectorShiftMasks
+ : SubtargetFeature<
+ "fast-vector-shift-masks", "HasFastVectorShiftMasks", "true",
+ "Prefer a left/right vector logical shift pair over a shift+and pair">;
+
+def FeatureUseGLMDivSqrtCosts
+ : SubtargetFeature<"use-glm-div-sqrt-costs", "UseGLMDivSqrtCosts", "true",
+ "Use Goldmont specific floating point div/sqrt costs">;
+
+// Enable use of alias analysis during code generation.
+def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
+ "Use alias analysis during codegen">;
+
+// Bonnell
+def ProcIntelAtom : SubtargetFeature<"", "X86ProcFamily", "IntelAtom", "">;
+// Silvermont
+def ProcIntelSLM : SubtargetFeature<"", "X86ProcFamily", "IntelSLM", "">;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "X86RegisterInfo.td"
+include "X86RegisterBanks.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "X86Schedule.td"
+include "X86InstrInfo.td"
+include "X86SchedPredicates.td"
+
+def X86InstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// X86 Scheduler Models
+//===----------------------------------------------------------------------===//
+
+include "X86ScheduleAtom.td"
+include "X86SchedSandyBridge.td"
+include "X86SchedHaswell.td"
+include "X86SchedBroadwell.td"
+include "X86ScheduleSLM.td"
+include "X86ScheduleZnver1.td"
+include "X86ScheduleZnver2.td"
+include "X86ScheduleBdVer2.td"
+include "X86ScheduleBtVer2.td"
+include "X86SchedSkylakeClient.td"
+include "X86SchedSkylakeServer.td"
+
+//===----------------------------------------------------------------------===//
+// X86 Processor Feature Lists
+//===----------------------------------------------------------------------===//
+
+def ProcessorFeatures {
+ // x86-64 and x86-64-v[234]
+ list<SubtargetFeature> X86_64V1Features = [
+ FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, FeatureMMX, FeatureSSE2,
+ FeatureFXSR, FeatureNOPL, Feature64Bit
+ ];
+ list<SubtargetFeature> X86_64V2Features = !listconcat(
+ X86_64V1Features,
+ [FeatureCMPXCHG16B, FeatureLAHFSAHF, FeaturePOPCNT, FeatureSSE42]);
+ list<SubtargetFeature> X86_64V3Features = !listconcat(X86_64V2Features, [
+ FeatureAVX2, FeatureBMI, FeatureBMI2, FeatureF16C, FeatureFMA, FeatureLZCNT,
+ FeatureMOVBE, FeatureXSAVE
+ ]);
+ list<SubtargetFeature> X86_64V4Features = !listconcat(X86_64V3Features, [
+ FeatureBWI,
+ FeatureCDI,
+ FeatureDQI,
+ FeatureVLX,
+ ]);
+
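+  // Most of the CPU definitions below are incremental: a CPU's feature list is
+  // its predecessor's list !listconcat'ed with a small *AdditionalFeatures
+  // delta, while scheduling/tuning flags are kept in separate *Tuning lists so
+  // tuning can be chosen independently of the ISA.
+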
+ // Nehalem
+ list<SubtargetFeature> NHMFeatures = X86_64V2Features;
+ list<SubtargetFeature> NHMTuning = [FeatureMacroFusion,
+ FeatureInsertVZEROUPPER];
+
+ // Westmere
+ list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
+ list<SubtargetFeature> WSMTuning = NHMTuning;
+ list<SubtargetFeature> WSMFeatures =
+ !listconcat(NHMFeatures, WSMAdditionalFeatures);
+
+ // Sandybridge
+ list<SubtargetFeature> SNBAdditionalFeatures = [FeatureAVX,
+ FeatureXSAVE,
+ FeatureXSAVEOPT];
+ list<SubtargetFeature> SNBTuning = [FeatureMacroFusion,
+ FeatureSlow3OpsLEA,
+ FeatureSlowDivide64,
+ FeatureSlowUAMem32,
+ FeatureFastScalarFSQRT,
+ FeatureFastSHLDRotate,
+ FeatureFast15ByteNOP,
+ FeaturePOPCNTFalseDeps,
+ FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> SNBFeatures =
+ !listconcat(WSMFeatures, SNBAdditionalFeatures);
+
+ // Ivybridge
+ list<SubtargetFeature> IVBAdditionalFeatures = [FeatureRDRAND,
+ FeatureF16C,
+ FeatureFSGSBase];
+ list<SubtargetFeature> IVBTuning = SNBTuning;
+ list<SubtargetFeature> IVBFeatures =
+ !listconcat(SNBFeatures, IVBAdditionalFeatures);
+
+ // Haswell
+ list<SubtargetFeature> HSWAdditionalFeatures = [FeatureAVX2,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureERMSB,
+ FeatureFMA,
+ FeatureINVPCID,
+ FeatureLZCNT,
+ FeatureMOVBE];
+ list<SubtargetFeature> HSWTuning = [FeatureMacroFusion,
+ FeatureSlow3OpsLEA,
+ FeatureSlowDivide64,
+ FeatureFastScalarFSQRT,
+ FeatureFastSHLDRotate,
+ FeatureFast15ByteNOP,
+ FeatureFastVariableShuffle,
+ FeaturePOPCNTFalseDeps,
+ FeatureLZCNTFalseDeps,
+ FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> HSWFeatures =
+ !listconcat(IVBFeatures, HSWAdditionalFeatures);
+
+ // Broadwell
+ list<SubtargetFeature> BDWAdditionalFeatures = [FeatureADX,
+ FeatureRDSEED,
+ FeaturePRFCHW];
+ list<SubtargetFeature> BDWTuning = HSWTuning;
+ list<SubtargetFeature> BDWFeatures =
+ !listconcat(HSWFeatures, BDWAdditionalFeatures);
+
+ // Skylake
+ list<SubtargetFeature> SKLAdditionalFeatures = [FeatureAES,
+ FeatureXSAVEC,
+ FeatureXSAVES,
+ FeatureCLFLUSHOPT,
+ FeatureSGX];
+ list<SubtargetFeature> SKLTuning = [FeatureHasFastGather,
+ FeatureMacroFusion,
+ FeatureSlow3OpsLEA,
+ FeatureSlowDivide64,
+ FeatureFastScalarFSQRT,
+ FeatureFastVectorFSQRT,
+ FeatureFastSHLDRotate,
+ FeatureFast15ByteNOP,
+ FeatureFastVariableShuffle,
+ FeaturePOPCNTFalseDeps,
+ FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> SKLFeatures =
+ !listconcat(BDWFeatures, SKLAdditionalFeatures);
+
+ // Skylake-AVX512
+ list<SubtargetFeature> SKXAdditionalFeatures = [FeatureAES,
+ FeatureXSAVEC,
+ FeatureXSAVES,
+ FeatureCLFLUSHOPT,
+ FeatureAVX512,
+ FeatureCDI,
+ FeatureDQI,
+ FeatureBWI,
+ FeatureVLX,
+ FeaturePKU,
+ FeatureCLWB];
+ list<SubtargetFeature> SKXTuning = [FeatureHasFastGather,
+ FeatureMacroFusion,
+ FeatureSlow3OpsLEA,
+ FeatureSlowDivide64,
+ FeatureFastScalarFSQRT,
+ FeatureFastVectorFSQRT,
+ FeatureFastSHLDRotate,
+ FeatureFast15ByteNOP,
+ FeatureFastVariableShuffle,
+ FeaturePrefer256Bit,
+ FeaturePOPCNTFalseDeps,
+ FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> SKXFeatures =
+ !listconcat(BDWFeatures, SKXAdditionalFeatures);
+
+ // Cascadelake
+ list<SubtargetFeature> CLXAdditionalFeatures = [FeatureVNNI];
+ list<SubtargetFeature> CLXTuning = SKXTuning;
+ list<SubtargetFeature> CLXFeatures =
+ !listconcat(SKXFeatures, CLXAdditionalFeatures);
+
+ // Cooperlake
+ list<SubtargetFeature> CPXAdditionalFeatures = [FeatureBF16];
+ list<SubtargetFeature> CPXTuning = SKXTuning;
+ list<SubtargetFeature> CPXFeatures =
+ !listconcat(CLXFeatures, CPXAdditionalFeatures);
+
+ // Cannonlake
+ list<SubtargetFeature> CNLAdditionalFeatures = [FeatureAVX512,
+ FeatureCDI,
+ FeatureDQI,
+ FeatureBWI,
+ FeatureVLX,
+ FeaturePKU,
+ FeatureVBMI,
+ FeatureIFMA,
+ FeatureSHA];
+ list<SubtargetFeature> CNLTuning = [FeatureHasFastGather,
+ FeatureMacroFusion,
+ FeatureSlow3OpsLEA,
+ FeatureSlowDivide64,
+ FeatureFastScalarFSQRT,
+ FeatureFastVectorFSQRT,
+ FeatureFastSHLDRotate,
+ FeatureFast15ByteNOP,
+ FeatureFastVariableShuffle,
+ FeaturePrefer256Bit,
+ FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> CNLFeatures =
+ !listconcat(SKLFeatures, CNLAdditionalFeatures);
+
+ // Icelake
+ list<SubtargetFeature> ICLAdditionalFeatures = [FeatureBITALG,
+ FeatureVAES,
+ FeatureVBMI2,
+ FeatureVNNI,
+ FeatureVPCLMULQDQ,
+ FeatureVPOPCNTDQ,
+ FeatureGFNI,
+ FeatureCLWB,
+ FeatureRDPID,
+ FeatureFSRM];
+ list<SubtargetFeature> ICLTuning = CNLTuning;
+ list<SubtargetFeature> ICLFeatures =
+ !listconcat(CNLFeatures, ICLAdditionalFeatures);
+
+ // Icelake Server
+ list<SubtargetFeature> ICXAdditionalFeatures = [FeaturePCONFIG,
+ FeatureWBNOINVD];
+ list<SubtargetFeature> ICXTuning = CNLTuning;
+ list<SubtargetFeature> ICXFeatures =
+ !listconcat(ICLFeatures, ICXAdditionalFeatures);
+
+  // Tigerlake
+ list<SubtargetFeature> TGLAdditionalFeatures = [FeatureVP2INTERSECT,
+ FeatureMOVDIRI,
+ FeatureMOVDIR64B,
+ FeatureSHSTK];
+ list<SubtargetFeature> TGLTuning = CNLTuning;
+ list<SubtargetFeature> TGLFeatures =
+    !listconcat(ICLFeatures, TGLAdditionalFeatures);
+
+  // Sapphirerapids
+ list<SubtargetFeature> SPRAdditionalFeatures = [FeatureAMXTILE,
+ FeatureAMXINT8,
+ FeatureAMXBF16,
+ FeatureBF16,
+ FeatureSERIALIZE,
+ FeatureCLDEMOTE,
+ FeatureWAITPKG,
+ FeaturePTWRITE,
+ FeatureAVXVNNI,
+ FeatureTSXLDTRK,
+ FeatureENQCMD,
+ FeatureSHSTK,
+ FeatureVP2INTERSECT,
+ FeatureMOVDIRI,
+ FeatureMOVDIR64B,
+ FeatureUINTR];
+ list<SubtargetFeature> SPRTuning = ICXTuning;
+ list<SubtargetFeature> SPRFeatures =
+ !listconcat(ICXFeatures, SPRAdditionalFeatures);
+
+ // Alderlake
+ list<SubtargetFeature> ADLAdditionalFeatures = [FeatureAVXVNNI,
+ FeatureCLDEMOTE,
+ FeatureHRESET,
+ FeaturePTWRITE,
+ FeatureSERIALIZE,
+ FeatureWAITPKG];
+ list<SubtargetFeature> ADLTuning = SKLTuning;
+ list<SubtargetFeature> ADLFeatures =
+ !listconcat(SKLFeatures, ADLAdditionalFeatures);
+
+ // Atom
+ list<SubtargetFeature> AtomFeatures = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureCMOV,
+ FeatureMMX,
+ FeatureSSSE3,
+ FeatureFXSR,
+ FeatureNOPL,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeatureMOVBE,
+ FeatureLAHFSAHF];
+ list<SubtargetFeature> AtomTuning = [ProcIntelAtom,
+ FeatureSlowUAMem16,
+ FeatureLEAForSP,
+ FeatureSlowDivide32,
+ FeatureSlowDivide64,
+ FeatureSlowTwoMemOps,
+ FeatureLEAUsesAG,
+ FeaturePadShortFunctions,
+ FeatureInsertVZEROUPPER];
+
+ // Silvermont
+ list<SubtargetFeature> SLMAdditionalFeatures = [FeatureSSE42,
+ FeaturePOPCNT,
+ FeaturePCLMUL,
+ FeaturePRFCHW,
+ FeatureRDRAND];
+ list<SubtargetFeature> SLMTuning = [ProcIntelSLM,
+ FeatureSlowTwoMemOps,
+ FeatureSlowLEA,
+ FeatureSlowIncDec,
+ FeatureSlowDivide64,
+ FeatureSlowPMULLD,
+ FeatureFast7ByteNOP,
+ FeaturePOPCNTFalseDeps,
+ FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> SLMFeatures =
+ !listconcat(AtomFeatures, SLMAdditionalFeatures);
+
+ // Goldmont
+ list<SubtargetFeature> GLMAdditionalFeatures = [FeatureAES,
+ FeatureSHA,
+ FeatureRDSEED,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureXSAVEC,
+ FeatureXSAVES,
+ FeatureCLFLUSHOPT,
+ FeatureFSGSBase];
+ list<SubtargetFeature> GLMTuning = [FeatureUseGLMDivSqrtCosts,
+ FeatureSlowTwoMemOps,
+ FeatureSlowLEA,
+ FeatureSlowIncDec,
+ FeaturePOPCNTFalseDeps,
+ FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> GLMFeatures =
+ !listconcat(SLMFeatures, GLMAdditionalFeatures);
+
+ // Goldmont Plus
+ list<SubtargetFeature> GLPAdditionalFeatures = [FeaturePTWRITE,
+ FeatureRDPID,
+ FeatureSGX];
+ list<SubtargetFeature> GLPTuning = [FeatureUseGLMDivSqrtCosts,
+ FeatureSlowTwoMemOps,
+ FeatureSlowLEA,
+ FeatureSlowIncDec,
+ FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> GLPFeatures =
+ !listconcat(GLMFeatures, GLPAdditionalFeatures);
+
+ // Tremont
+ list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLWB,
+ FeatureGFNI];
+ list<SubtargetFeature> TRMTuning = GLPTuning;
+ list<SubtargetFeature> TRMFeatures =
+ !listconcat(GLPFeatures, TRMAdditionalFeatures);
+
+ // Knights Landing
+ list<SubtargetFeature> KNLFeatures = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureCMOV,
+ FeatureMMX,
+ FeatureFXSR,
+ FeatureNOPL,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeaturePOPCNT,
+ FeaturePCLMUL,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureLAHFSAHF,
+ FeatureAES,
+ FeatureRDRAND,
+ FeatureF16C,
+ FeatureFSGSBase,
+ FeatureAVX512,
+ FeatureERI,
+ FeatureCDI,
+ FeaturePFI,
+ FeaturePREFETCHWT1,
+ FeatureADX,
+ FeatureRDSEED,
+ FeatureMOVBE,
+ FeatureLZCNT,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureFMA,
+ FeaturePRFCHW];
+ list<SubtargetFeature> KNLTuning = [FeatureSlowDivide64,
+ FeatureSlow3OpsLEA,
+ FeatureSlowIncDec,
+ FeatureSlowTwoMemOps,
+ FeaturePreferMaskRegisters,
+ FeatureHasFastGather,
+ FeatureSlowPMADDWD];
+ // TODO Add AVX5124FMAPS/AVX5124VNNIW features
+ list<SubtargetFeature> KNMFeatures =
+ !listconcat(KNLFeatures, [FeatureVPOPCNTDQ]);
+
+ // Barcelona
+ list<SubtargetFeature> BarcelonaFeatures = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureSSE4A,
+ Feature3DNowA,
+ FeatureFXSR,
+ FeatureNOPL,
+ FeatureCMPXCHG16B,
+ FeaturePRFCHW,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureLAHFSAHF,
+ FeatureCMOV,
+ Feature64Bit];
+ list<SubtargetFeature> BarcelonaTuning = [FeatureFastScalarShiftMasks,
+ FeatureSlowSHLD,
+ FeatureInsertVZEROUPPER];
+
+ // Bobcat
+ list<SubtargetFeature> BtVer1Features = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureCMOV,
+ FeatureMMX,
+ FeatureSSSE3,
+ FeatureSSE4A,
+ FeatureFXSR,
+ FeatureNOPL,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeaturePRFCHW,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureLAHFSAHF];
+ list<SubtargetFeature> BtVer1Tuning = [FeatureFast15ByteNOP,
+ FeatureFastScalarShiftMasks,
+ FeatureFastVectorShiftMasks,
+ FeatureSlowSHLD,
+ FeatureInsertVZEROUPPER];
+
+ // Jaguar
+ list<SubtargetFeature> BtVer2AdditionalFeatures = [FeatureAVX,
+ FeatureAES,
+ FeaturePCLMUL,
+ FeatureBMI,
+ FeatureF16C,
+ FeatureMOVBE,
+ FeatureXSAVE,
+ FeatureXSAVEOPT];
+ list<SubtargetFeature> BtVer2Tuning = [FeatureFastLZCNT,
+ FeatureFastBEXTR,
+ FeatureFastHorizontalOps,
+ FeatureFast15ByteNOP,
+ FeatureFastScalarShiftMasks,
+ FeatureFastVectorShiftMasks,
+ FeatureSlowSHLD];
+ list<SubtargetFeature> BtVer2Features =
+ !listconcat(BtVer1Features, BtVer2AdditionalFeatures);
+
+ // Bulldozer
+ list<SubtargetFeature> BdVer1Features = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureCMOV,
+ FeatureXOP,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeatureAES,
+ FeaturePRFCHW,
+ FeaturePCLMUL,
+ FeatureMMX,
+ FeatureFXSR,
+ FeatureNOPL,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureXSAVE,
+ FeatureLWP,
+ FeatureLAHFSAHF];
+ list<SubtargetFeature> BdVer1Tuning = [FeatureSlowSHLD,
+ FeatureFast11ByteNOP,
+ FeatureFastScalarShiftMasks,
+ FeatureBranchFusion,
+ FeatureInsertVZEROUPPER];
+
+ // PileDriver
+ list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
+ FeatureBMI,
+ FeatureTBM,
+ FeatureFMA,
+ FeatureFastBEXTR];
+ list<SubtargetFeature> BdVer2Tuning = BdVer1Tuning;
+ list<SubtargetFeature> BdVer2Features =
+ !listconcat(BdVer1Features, BdVer2AdditionalFeatures);
+
+ // Steamroller
+ list<SubtargetFeature> BdVer3AdditionalFeatures = [FeatureXSAVEOPT,
+ FeatureFSGSBase];
+ list<SubtargetFeature> BdVer3Tuning = BdVer2Tuning;
+ list<SubtargetFeature> BdVer3Features =
+ !listconcat(BdVer2Features, BdVer3AdditionalFeatures);
+
+ // Excavator
+ list<SubtargetFeature> BdVer4AdditionalFeatures = [FeatureAVX2,
+ FeatureBMI2,
+ FeatureMOVBE,
+ FeatureRDRAND,
+ FeatureMWAITX];
+ list<SubtargetFeature> BdVer4Tuning = BdVer3Tuning;
+ list<SubtargetFeature> BdVer4Features =
+ !listconcat(BdVer3Features, BdVer4AdditionalFeatures);
+
+ // AMD Zen Processors common ISAs
+ list<SubtargetFeature> ZNFeatures = [FeatureADX,
+ FeatureAES,
+ FeatureAVX2,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureCLFLUSHOPT,
+ FeatureCLZERO,
+ FeatureCMOV,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeatureF16C,
+ FeatureFMA,
+ FeatureFSGSBase,
+ FeatureFXSR,
+ FeatureNOPL,
+ FeatureLAHFSAHF,
+ FeatureLZCNT,
+ FeatureMMX,
+ FeatureMOVBE,
+ FeatureMWAITX,
+ FeaturePCLMUL,
+ FeaturePOPCNT,
+ FeaturePRFCHW,
+ FeatureRDRAND,
+ FeatureRDSEED,
+ FeatureSHA,
+ FeatureSSE4A,
+ FeatureX87,
+ FeatureXSAVE,
+ FeatureXSAVEC,
+ FeatureXSAVEOPT,
+ FeatureXSAVES];
+ list<SubtargetFeature> ZNTuning = [FeatureFastLZCNT,
+ FeatureFastBEXTR,
+ FeatureFast15ByteNOP,
+ FeatureBranchFusion,
+ FeatureFastScalarShiftMasks,
+ FeatureSlowSHLD,
+ FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
+ FeatureRDPID,
+ FeatureWBNOINVD];
+ list<SubtargetFeature> ZN2Tuning = ZNTuning;
+ list<SubtargetFeature> ZN2Features =
+ !listconcat(ZNFeatures, ZN2AdditionalFeatures);
+ list<SubtargetFeature> ZN3AdditionalFeatures = [FeatureFSRM,
+ FeatureINVPCID,
+ FeaturePKU,
+ FeatureVAES,
+ FeatureVPCLMULQDQ];
+ list<SubtargetFeature> ZN3Tuning = ZNTuning;
+ list<SubtargetFeature> ZN3Features =
+ !listconcat(ZN2Features, ZN3AdditionalFeatures);
+}
+
+//===----------------------------------------------------------------------===//
+// X86 processors supported.
+//===----------------------------------------------------------------------===//
+
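+// Proc pins a CPU to the generic scheduling model, while ProcModel lets a CPU
+// name a specific SchedMachineModel alongside its ISA and tuning features.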
+class Proc<string Name, list<SubtargetFeature> Features,
+ list<SubtargetFeature> TuneFeatures>
+ : ProcessorModel<Name, GenericModel, Features, TuneFeatures>;
+
+class ProcModel<string Name, SchedMachineModel Model,
+ list<SubtargetFeature> Features,
+ list<SubtargetFeature> TuneFeatures>
+ : ProcessorModel<Name, Model, Features, TuneFeatures>;
+
+// NOTE: CMPXCHG8B is here for legacy compatibility so that it is only disabled
+// if i386/i486 is specifically requested.
+// NOTE: 64Bit is here as "generic" is the default llc CPU. The X86Subtarget
+// constructor checks that any CPU used in 64-bit mode has Feature64Bit enabled.
+// It has no effect on code generation.
+def : ProcModel<"generic", SandyBridgeModel,
+ [FeatureX87, FeatureCMPXCHG8B, Feature64Bit],
+ [FeatureSlow3OpsLEA,
+ FeatureSlowDivide64,
+ FeatureSlowIncDec,
+ FeatureMacroFusion,
+ FeatureInsertVZEROUPPER]>;
+
+def : Proc<"i386", [FeatureX87],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"i486", [FeatureX87],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"i586", [FeatureX87, FeatureCMPXCHG8B],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"pentium", [FeatureX87, FeatureCMPXCHG8B],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"pentium-mmx", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+
+def : Proc<"i686", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"pentiumpro", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV,
+ FeatureNOPL],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+
+def : Proc<"pentium2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureCMOV,
+ FeatureFXSR, FeatureNOPL],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+
+foreach P = ["pentium3", "pentium3m"] in {
+ def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureMMX,
+ FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+}
+
+// Enable the PostRAScheduler for SSE2 and SSE3 class cpus.
+// The intent is to enable it for pentium4 which is the current default
+// processor in a vanilla 32-bit clang compilation when no specific
+// architecture is specified. This generally gives a nice performance
+// increase on silvermont, with largely neutral behavior on other
+// contemporary large core processors.
+// pentium-m, pentium4m, prescott and nocona are included as a preventative
+// measure to avoid performance surprises, in case clang's default cpu
+// changes slightly.
+
+def : ProcModel<"pentium-m", GenericPostRAModel,
+ [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2,
+ FeatureFXSR, FeatureNOPL, FeatureCMOV],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+
+foreach P = ["pentium4", "pentium4m"] in {
+ def : ProcModel<P, GenericPostRAModel,
+ [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2,
+ FeatureFXSR, FeatureNOPL, FeatureCMOV],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+}
+
+// Intel Quark.
+def : Proc<"lakemont", [FeatureCMPXCHG8B],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+
+// Intel Core Duo.
+def : ProcModel<"yonah", SandyBridgeModel,
+ [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3,
+ FeatureFXSR, FeatureNOPL, FeatureCMOV],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+
+// NetBurst.
+def : ProcModel<"prescott", GenericPostRAModel,
+ [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3,
+ FeatureFXSR, FeatureNOPL, FeatureCMOV],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : ProcModel<"nocona", GenericPostRAModel, [
+ FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureCMOV,
+ FeatureMMX,
+ FeatureSSE3,
+ FeatureFXSR,
+ FeatureNOPL,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+],
+[
+ FeatureSlowUAMem16,
+ FeatureInsertVZEROUPPER
+]>;
+
+// Intel Core 2 Solo/Duo.
+def : ProcModel<"core2", SandyBridgeModel, [
+ FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureCMOV,
+ FeatureMMX,
+ FeatureSSSE3,
+ FeatureFXSR,
+ FeatureNOPL,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeatureLAHFSAHF
+],
+[
+ FeatureMacroFusion,
+ FeatureSlowUAMem16,
+ FeatureInsertVZEROUPPER
+]>;
+def : ProcModel<"penryn", SandyBridgeModel, [
+ FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureCMOV,
+ FeatureMMX,
+ FeatureSSE41,
+ FeatureFXSR,
+ FeatureNOPL,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeatureLAHFSAHF
+],
+[
+ FeatureMacroFusion,
+ FeatureSlowUAMem16,
+ FeatureInsertVZEROUPPER
+]>;
+
+// Atom CPUs.
+foreach P = ["bonnell", "atom"] in {
+ def : ProcModel<P, AtomModel, ProcessorFeatures.AtomFeatures,
+ ProcessorFeatures.AtomTuning>;
+}
+
+foreach P = ["silvermont", "slm"] in {
+ def : ProcModel<P, SLMModel, ProcessorFeatures.SLMFeatures,
+ ProcessorFeatures.SLMTuning>;
+}
+
+def : ProcModel<"goldmont", SLMModel, ProcessorFeatures.GLMFeatures,
+ ProcessorFeatures.GLMTuning>;
+def : ProcModel<"goldmont-plus", SLMModel, ProcessorFeatures.GLPFeatures,
+ ProcessorFeatures.GLPTuning>;
+def : ProcModel<"tremont", SLMModel, ProcessorFeatures.TRMFeatures,
+ ProcessorFeatures.TRMTuning>;
+
+// "Arrandale" along with corei3 and corei5
+foreach P = ["nehalem", "corei7"] in {
+ def : ProcModel<P, SandyBridgeModel, ProcessorFeatures.NHMFeatures,
+ ProcessorFeatures.NHMTuning>;
+}
+
+// Westmere is the corei3/i5/i7 path from nehalem to sandybridge
+def : ProcModel<"westmere", SandyBridgeModel, ProcessorFeatures.WSMFeatures,
+ ProcessorFeatures.WSMTuning>;
+
+foreach P = ["sandybridge", "corei7-avx"] in {
+ def : ProcModel<P, SandyBridgeModel, ProcessorFeatures.SNBFeatures,
+ ProcessorFeatures.SNBTuning>;
+}
+
+foreach P = ["ivybridge", "core-avx-i"] in {
+ def : ProcModel<P, SandyBridgeModel, ProcessorFeatures.IVBFeatures,
+ ProcessorFeatures.IVBTuning>;
+}
+
+foreach P = ["haswell", "core-avx2"] in {
+ def : ProcModel<P, HaswellModel, ProcessorFeatures.HSWFeatures,
+ ProcessorFeatures.HSWTuning>;
+}
+
+def : ProcModel<"broadwell", BroadwellModel, ProcessorFeatures.BDWFeatures,
+ ProcessorFeatures.BDWTuning>;
+
+def : ProcModel<"skylake", SkylakeClientModel, ProcessorFeatures.SKLFeatures,
+ ProcessorFeatures.SKLTuning>;
+
+// FIXME: define KNL scheduler model
+def : ProcModel<"knl", HaswellModel, ProcessorFeatures.KNLFeatures,
+ ProcessorFeatures.KNLTuning>;
+def : ProcModel<"knm", HaswellModel, ProcessorFeatures.KNMFeatures,
+ ProcessorFeatures.KNLTuning>;
+
+foreach P = ["skylake-avx512", "skx"] in {
+ def : ProcModel<P, SkylakeServerModel, ProcessorFeatures.SKXFeatures,
+ ProcessorFeatures.SKXTuning>;
+}
+
+def : ProcModel<"cascadelake", SkylakeServerModel,
+ ProcessorFeatures.CLXFeatures, ProcessorFeatures.CLXTuning>;
+def : ProcModel<"cooperlake", SkylakeServerModel,
+ ProcessorFeatures.CPXFeatures, ProcessorFeatures.CPXTuning>;
+def : ProcModel<"cannonlake", SkylakeServerModel,
+ ProcessorFeatures.CNLFeatures, ProcessorFeatures.CNLTuning>;
+def : ProcModel<"icelake-client", SkylakeServerModel,
+ ProcessorFeatures.ICLFeatures, ProcessorFeatures.ICLTuning>;
+def : ProcModel<"icelake-server", SkylakeServerModel,
+ ProcessorFeatures.ICXFeatures, ProcessorFeatures.ICXTuning>;
+def : ProcModel<"tigerlake", SkylakeServerModel,
+ ProcessorFeatures.TGLFeatures, ProcessorFeatures.TGLTuning>;
+def : ProcModel<"sapphirerapids", SkylakeServerModel,
+ ProcessorFeatures.SPRFeatures, ProcessorFeatures.SPRTuning>;
+def : ProcModel<"alderlake", SkylakeClientModel,
+ ProcessorFeatures.ADLFeatures, ProcessorFeatures.ADLTuning>;
+
+// AMD CPUs.
+
+def : Proc<"k6", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"k6-2", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"k6-3", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+
+foreach P = ["athlon", "athlon-tbird"] in {
+ def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, Feature3DNowA,
+ FeatureNOPL],
+ [FeatureSlowSHLD, FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+}
+
+foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in {
+ def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV,
+ FeatureSSE1, Feature3DNowA, FeatureFXSR, FeatureNOPL],
+ [FeatureSlowSHLD, FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+}
+
+foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
+ def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE2, Feature3DNowA,
+ FeatureFXSR, FeatureNOPL, Feature64Bit, FeatureCMOV],
+ [FeatureFastScalarShiftMasks, FeatureSlowSHLD, FeatureSlowUAMem16,
+ FeatureInsertVZEROUPPER]>;
+}
+
+foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
+ def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE3, Feature3DNowA,
+ FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureCMOV,
+ Feature64Bit],
+ [FeatureFastScalarShiftMasks, FeatureSlowSHLD, FeatureSlowUAMem16,
+ FeatureInsertVZEROUPPER]>;
+}
+
+foreach P = ["amdfam10", "barcelona"] in {
+ def : Proc<P, ProcessorFeatures.BarcelonaFeatures,
+ ProcessorFeatures.BarcelonaTuning>;
+}
+
+// Bobcat
+def : Proc<"btver1", ProcessorFeatures.BtVer1Features,
+ ProcessorFeatures.BtVer1Tuning>;
+// Jaguar
+def : ProcModel<"btver2", BtVer2Model, ProcessorFeatures.BtVer2Features,
+ ProcessorFeatures.BtVer2Tuning>;
+
+// Bulldozer
+def : ProcModel<"bdver1", BdVer2Model, ProcessorFeatures.BdVer1Features,
+ ProcessorFeatures.BdVer1Tuning>;
+// Piledriver
+def : ProcModel<"bdver2", BdVer2Model, ProcessorFeatures.BdVer2Features,
+ ProcessorFeatures.BdVer2Tuning>;
+// Steamroller
+def : Proc<"bdver3", ProcessorFeatures.BdVer3Features,
+ ProcessorFeatures.BdVer3Tuning>;
+// Excavator
+def : Proc<"bdver4", ProcessorFeatures.BdVer4Features,
+ ProcessorFeatures.BdVer4Tuning>;
+
+def : ProcModel<"znver1", Znver1Model, ProcessorFeatures.ZNFeatures,
+ ProcessorFeatures.ZNTuning>;
+def : ProcModel<"znver2", Znver2Model, ProcessorFeatures.ZN2Features,
+ ProcessorFeatures.ZN2Tuning>;
+def : ProcModel<"znver3", Znver2Model, ProcessorFeatures.ZN3Features,
+ ProcessorFeatures.ZN3Tuning>;
+
+def : Proc<"geode", [FeatureX87, FeatureCMPXCHG8B, Feature3DNowA],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+
+def : Proc<"winchip-c6", [FeatureX87, FeatureMMX],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"winchip2", [FeatureX87, Feature3DNow],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"c3", [FeatureX87, Feature3DNow],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"c3-2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX,
+ FeatureSSE1, FeatureFXSR, FeatureCMOV],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+
+// We also provide a generic 64-bit specific x86 processor model which tries to
+// be good for modern chips without enabling instruction set encodings past the
+// basic SSE2 and 64-bit ones. It disables slow things from any mainstream and
+// modern 64-bit x86 chip, and enables features that are generally beneficial.
+//
+// We currently use the Sandy Bridge model as the default scheduling model as
+// we use it across Nehalem, Westmere, Sandy Bridge, and Ivy Bridge which
+// covers a huge swath of x86 processors. If there are specific scheduling
+// knobs which need to be tuned differently for AMD chips, we might consider
+// forming a common base for them.
+def : ProcModel<"x86-64", SandyBridgeModel, ProcessorFeatures.X86_64V1Features,
+[
+ FeatureSlow3OpsLEA,
+ FeatureSlowDivide64,
+ FeatureSlowIncDec,
+ FeatureMacroFusion,
+ FeatureInsertVZEROUPPER
+]>;
+
+// x86-64 micro-architecture levels.
+def : ProcModel<"x86-64-v2", SandyBridgeModel, ProcessorFeatures.X86_64V2Features,
+ ProcessorFeatures.SNBTuning>;
+// Close to Haswell.
+def : ProcModel<"x86-64-v3", HaswellModel, ProcessorFeatures.X86_64V3Features,
+ ProcessorFeatures.HSWTuning>;
+// Close to the AVX-512 level implemented by Xeon Scalable Processors.
+def : ProcModel<"x86-64-v4", HaswellModel, ProcessorFeatures.X86_64V4Features,
+ ProcessorFeatures.SKXTuning>;
+
+//===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "X86CallingConv.td"
+
+
+//===----------------------------------------------------------------------===//
+// Assembly Parser
+//===----------------------------------------------------------------------===//
+
+def ATTAsmParserVariant : AsmParserVariant {
+ int Variant = 0;
+
+ // Variant name.
+ string Name = "att";
+
+ // Discard comments in assembly strings.
+ string CommentDelimiter = "#";
+
+ // Recognize hard coded registers.
+ string RegisterPrefix = "%";
+}
+
+def IntelAsmParserVariant : AsmParserVariant {
+ int Variant = 1;
+
+ // Variant name.
+ string Name = "intel";
+
+ // Discard comments in assembly strings.
+ string CommentDelimiter = ";";
+
+ // Recognize hard coded registers.
+ string RegisterPrefix = "";
+}
+
+//===----------------------------------------------------------------------===//
+// Assembly Printers
+//===----------------------------------------------------------------------===//
+
+// The X86 target supports two different assembly syntaxes (AT&T and Intel)
+// when printing machine code. This is controlled by the
+// -x86-asm-syntax={att|intel} flag.
+def ATTAsmWriter : AsmWriter {
+ string AsmWriterClassName = "ATTInstPrinter";
+ int Variant = 0;
+}
+def IntelAsmWriter : AsmWriter {
+ string AsmWriterClassName = "IntelInstPrinter";
+ int Variant = 1;
+}
+
+def X86 : Target {
+ // Information about the instructions...
+ let InstructionSet = X86InstrInfo;
+ let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant];
+ let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter];
+ let AllowRegisterRenaming = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Pfm Counters
+//===----------------------------------------------------------------------===//
+
+include "X86PfmCounters.td"
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.cpp
new file mode 100644
index 000000000000..2d434bda5530
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -0,0 +1,802 @@
+//===-- X86AsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to X86 machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86AsmPrinter.h"
+#include "MCTargetDesc/X86ATTInstPrinter.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86TargetStreamer.h"
+#include "TargetInfo/X86TargetInfo.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachineValueType.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+X86AsmPrinter::X86AsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)), SM(*this), FM(*this) {}
+
+//===----------------------------------------------------------------------===//
+// Primitive Helper Functions.
+//===----------------------------------------------------------------------===//
+
+/// runOnMachineFunction - Emit the function body.
+///
+bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ Subtarget = &MF.getSubtarget<X86Subtarget>();
+
+ SMShadowTracker.startFunction(MF);
+ CodeEmitter.reset(TM.getTarget().createMCCodeEmitter(
+ *Subtarget->getInstrInfo(), *Subtarget->getRegisterInfo(),
+ MF.getContext()));
+
+ EmitFPOData =
+ Subtarget->isTargetWin32() && MF.getMMI().getModule()->getCodeViewFlag();
+
+ SetupMachineFunction(MF);
+
+ if (Subtarget->isTargetCOFF()) {
+ bool Local = MF.getFunction().hasLocalLinkage();
+ OutStreamer->BeginCOFFSymbolDef(CurrentFnSym);
+ OutStreamer->EmitCOFFSymbolStorageClass(
+ Local ? COFF::IMAGE_SYM_CLASS_STATIC : COFF::IMAGE_SYM_CLASS_EXTERNAL);
+ OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION
+ << COFF::SCT_COMPLEX_TYPE_SHIFT);
+ OutStreamer->EndCOFFSymbolDef();
+ }
+
+ // Emit the rest of the function body.
+ emitFunctionBody();
+
+ // Emit the XRay table for this function.
+ emitXRayTable();
+
+ EmitFPOData = false;
+
+ // We didn't modify anything.
+ return false;
+}
+
+void X86AsmPrinter::emitFunctionBodyStart() {
+ if (EmitFPOData) {
+ if (auto *XTS =
+ static_cast<X86TargetStreamer *>(OutStreamer->getTargetStreamer()))
+ XTS->emitFPOProc(
+ CurrentFnSym,
+ MF->getInfo<X86MachineFunctionInfo>()->getArgumentStackSize());
+ }
+}
+
+void X86AsmPrinter::emitFunctionBodyEnd() {
+ if (EmitFPOData) {
+ if (auto *XTS =
+ static_cast<X86TargetStreamer *>(OutStreamer->getTargetStreamer()))
+ XTS->emitFPOEndProc();
+ }
+}
+
+/// PrintSymbolOperand - Print a raw symbol reference operand. This handles
+/// jump tables, constant pools, global address and external symbols, all of
+/// which print to a label with various suffixes for relocation types etc.
+void X86AsmPrinter::PrintSymbolOperand(const MachineOperand &MO,
+ raw_ostream &O) {
+ switch (MO.getType()) {
+ default: llvm_unreachable("unknown symbol type!");
+ case MachineOperand::MO_ConstantPoolIndex:
+ GetCPISymbol(MO.getIndex())->print(O, MAI);
+ printOffset(MO.getOffset(), O);
+ break;
+ case MachineOperand::MO_GlobalAddress: {
+ const GlobalValue *GV = MO.getGlobal();
+
+ MCSymbol *GVSym;
+ if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE)
+ GVSym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ else
+ GVSym = getSymbolPreferLocal(*GV);
+
+ // Handle dllimport linkage.
+ if (MO.getTargetFlags() == X86II::MO_DLLIMPORT)
+ GVSym = OutContext.getOrCreateSymbol(Twine("__imp_") + GVSym->getName());
+ else if (MO.getTargetFlags() == X86II::MO_COFFSTUB)
+ GVSym =
+ OutContext.getOrCreateSymbol(Twine(".refptr.") + GVSym->getName());
+
+ if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) {
+ MCSymbol *Sym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym);
+ if (!StubSym.getPointer())
+ StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV),
+ !GV->hasInternalLinkage());
+ }
+
+ // If the name begins with a dollar-sign, enclose it in parens. We do this
+ // to avoid having it look like an integer immediate to the assembler.
+ if (GVSym->getName()[0] != '$')
+ GVSym->print(O, MAI);
+ else {
+ O << '(';
+ GVSym->print(O, MAI);
+ O << ')';
+ }
+ printOffset(MO.getOffset(), O);
+ break;
+ }
+ }
+
+ switch (MO.getTargetFlags()) {
+ default:
+ llvm_unreachable("Unknown target flag on GV operand");
+ case X86II::MO_NO_FLAG: // No flag.
+ break;
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DLLIMPORT:
+ case X86II::MO_COFFSTUB:
+ // These affect the name of the symbol, not any suffix.
+ break;
+ case X86II::MO_GOT_ABSOLUTE_ADDRESS:
+ O << " + [.-";
+ MF->getPICBaseSymbol()->print(O, MAI);
+ O << ']';
+ break;
+ case X86II::MO_PIC_BASE_OFFSET:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+ O << '-';
+ MF->getPICBaseSymbol()->print(O, MAI);
+ break;
+ case X86II::MO_TLSGD: O << "@TLSGD"; break;
+ case X86II::MO_TLSLD: O << "@TLSLD"; break;
+ case X86II::MO_TLSLDM: O << "@TLSLDM"; break;
+ case X86II::MO_GOTTPOFF: O << "@GOTTPOFF"; break;
+ case X86II::MO_INDNTPOFF: O << "@INDNTPOFF"; break;
+ case X86II::MO_TPOFF: O << "@TPOFF"; break;
+ case X86II::MO_DTPOFF: O << "@DTPOFF"; break;
+ case X86II::MO_NTPOFF: O << "@NTPOFF"; break;
+ case X86II::MO_GOTNTPOFF: O << "@GOTNTPOFF"; break;
+ case X86II::MO_GOTPCREL: O << "@GOTPCREL"; break;
+ case X86II::MO_GOT: O << "@GOT"; break;
+ case X86II::MO_GOTOFF: O << "@GOTOFF"; break;
+ case X86II::MO_PLT: O << "@PLT"; break;
+ case X86II::MO_TLVP: O << "@TLVP"; break;
+ case X86II::MO_TLVP_PIC_BASE:
+ O << "@TLVP" << '-';
+ MF->getPICBaseSymbol()->print(O, MAI);
+ break;
+ case X86II::MO_SECREL: O << "@SECREL32"; break;
+ }
+}
+
+void X86AsmPrinter::PrintOperand(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ const bool IsATT = MI->getInlineAsmDialect() == InlineAsm::AD_ATT;
+ switch (MO.getType()) {
+ default: llvm_unreachable("unknown operand type!");
+ case MachineOperand::MO_Register: {
+ if (IsATT)
+ O << '%';
+ O << X86ATTInstPrinter::getRegisterName(MO.getReg());
+ return;
+ }
+
+ case MachineOperand::MO_Immediate:
+ if (IsATT)
+ O << '$';
+ O << MO.getImm();
+ return;
+
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_GlobalAddress: {
+ switch (MI->getInlineAsmDialect()) {
+ case InlineAsm::AD_ATT:
+ O << '$';
+ break;
+ case InlineAsm::AD_Intel:
+ O << "offset ";
+ break;
+ }
+ PrintSymbolOperand(MO, O);
+ break;
+ }
+ case MachineOperand::MO_BlockAddress: {
+ MCSymbol *Sym = GetBlockAddressSymbol(MO.getBlockAddress());
+ Sym->print(O, MAI);
+ break;
+ }
+ }
+}
+
+/// PrintModifiedOperand - Print subregisters based on supplied modifier,
+/// deferring to PrintOperand() if no modifier was supplied or if operand is not
+/// a register.
+void X86AsmPrinter::PrintModifiedOperand(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O, const char *Modifier) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ if (!Modifier || MO.getType() != MachineOperand::MO_Register)
+ return PrintOperand(MI, OpNo, O);
+ if (MI->getInlineAsmDialect() == InlineAsm::AD_ATT)
+ O << '%';
+ Register Reg = MO.getReg();
+ if (strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
+ unsigned Size = (strcmp(Modifier+6,"64") == 0) ? 64 :
+ (strcmp(Modifier+6,"32") == 0) ? 32 :
+ (strcmp(Modifier+6,"16") == 0) ? 16 : 8;
+ Reg = getX86SubSuperRegister(Reg, Size);
+ }
+ O << X86ATTInstPrinter::getRegisterName(Reg);
+}
+
+/// PrintPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value. These print slightly differently, for
+/// example, a $ is not emitted.
+void X86AsmPrinter::PrintPCRelImm(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ switch (MO.getType()) {
+ default: llvm_unreachable("Unknown pcrel immediate operand");
+ case MachineOperand::MO_Register:
+ // pc-relativeness was handled when computing the value in the reg.
+ PrintOperand(MI, OpNo, O);
+ return;
+ case MachineOperand::MO_Immediate:
+ O << MO.getImm();
+ return;
+ case MachineOperand::MO_GlobalAddress:
+ PrintSymbolOperand(MO, O);
+ return;
+ }
+}
+
+void X86AsmPrinter::PrintLeaMemReference(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O, const char *Modifier) {
+ const MachineOperand &BaseReg = MI->getOperand(OpNo + X86::AddrBaseReg);
+ const MachineOperand &IndexReg = MI->getOperand(OpNo + X86::AddrIndexReg);
+ const MachineOperand &DispSpec = MI->getOperand(OpNo + X86::AddrDisp);
+
+ // If we really don't want to print out (rip), don't.
+ bool HasBaseReg = BaseReg.getReg() != 0;
+ if (HasBaseReg && Modifier && !strcmp(Modifier, "no-rip") &&
+ BaseReg.getReg() == X86::RIP)
+ HasBaseReg = false;
+
+ // HasParenPart - True if we will print out the () part of the mem ref.
+ bool HasParenPart = IndexReg.getReg() || HasBaseReg;
+
+ switch (DispSpec.getType()) {
+ default:
+ llvm_unreachable("unknown operand type!");
+ case MachineOperand::MO_Immediate: {
+ int DispVal = DispSpec.getImm();
+ if (DispVal || !HasParenPart)
+ O << DispVal;
+ break;
+ }
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ConstantPoolIndex:
+ PrintSymbolOperand(DispSpec, O);
+ break;
+ }
+
+ if (Modifier && strcmp(Modifier, "H") == 0)
+ O << "+8";
+
+ if (HasParenPart) {
+ assert(IndexReg.getReg() != X86::ESP &&
+ "X86 doesn't allow scaling by ESP");
+
+ O << '(';
+ if (HasBaseReg)
+ PrintModifiedOperand(MI, OpNo + X86::AddrBaseReg, O, Modifier);
+
+ if (IndexReg.getReg()) {
+ O << ',';
+ PrintModifiedOperand(MI, OpNo + X86::AddrIndexReg, O, Modifier);
+ unsigned ScaleVal = MI->getOperand(OpNo + X86::AddrScaleAmt).getImm();
+ if (ScaleVal != 1)
+ O << ',' << ScaleVal;
+ }
+ O << ')';
+ }
+}
+
+void X86AsmPrinter::PrintMemReference(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O, const char *Modifier) {
+ assert(isMem(*MI, OpNo) && "Invalid memory reference!");
+ const MachineOperand &Segment = MI->getOperand(OpNo + X86::AddrSegmentReg);
+ if (Segment.getReg()) {
+ PrintModifiedOperand(MI, OpNo + X86::AddrSegmentReg, O, Modifier);
+ O << ':';
+ }
+ PrintLeaMemReference(MI, OpNo, O, Modifier);
+}
+
+
+void X86AsmPrinter::PrintIntelMemReference(const MachineInstr *MI,
+ unsigned OpNo, raw_ostream &O,
+ const char *Modifier) {
+ const MachineOperand &BaseReg = MI->getOperand(OpNo + X86::AddrBaseReg);
+ unsigned ScaleVal = MI->getOperand(OpNo + X86::AddrScaleAmt).getImm();
+ const MachineOperand &IndexReg = MI->getOperand(OpNo + X86::AddrIndexReg);
+ const MachineOperand &DispSpec = MI->getOperand(OpNo + X86::AddrDisp);
+ const MachineOperand &SegReg = MI->getOperand(OpNo + X86::AddrSegmentReg);
+
+ // If we really don't want to print out (rip), don't.
+ bool HasBaseReg = BaseReg.getReg() != 0;
+ if (HasBaseReg && Modifier && !strcmp(Modifier, "no-rip") &&
+ BaseReg.getReg() == X86::RIP)
+ HasBaseReg = false;
+
+ // If this has a segment register, print it.
+ if (SegReg.getReg()) {
+ PrintOperand(MI, OpNo + X86::AddrSegmentReg, O);
+ O << ':';
+ }
+
+ O << '[';
+
+ bool NeedPlus = false;
+ if (HasBaseReg) {
+ PrintOperand(MI, OpNo + X86::AddrBaseReg, O);
+ NeedPlus = true;
+ }
+
+ if (IndexReg.getReg()) {
+ if (NeedPlus) O << " + ";
+ if (ScaleVal != 1)
+ O << ScaleVal << '*';
+ PrintOperand(MI, OpNo + X86::AddrIndexReg, O);
+ NeedPlus = true;
+ }
+
+ if (!DispSpec.isImm()) {
+ if (NeedPlus) O << " + ";
+ PrintOperand(MI, OpNo + X86::AddrDisp, O);
+ } else {
+ int64_t DispVal = DispSpec.getImm();
+ if (DispVal || (!IndexReg.getReg() && !HasBaseReg)) {
+ if (NeedPlus) {
+ if (DispVal > 0)
+ O << " + ";
+ else {
+ O << " - ";
+ DispVal = -DispVal;
+ }
+ }
+ O << DispVal;
+ }
+ }
+ O << ']';
+}
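
For a reference with base %rbx, index %rcx, scale 4 and displacement 8, the two printers above produce "8(%rbx,%rcx,4)" in AT&T syntax and "[rbx + 4*rcx + 8]" in Intel syntax. A minimal stand-alone sketch of the AT&T rules, with hypothetical names that are not part of the LLVM sources:

    // Illustrative helper that mirrors the AT&T formatting rules of
    // PrintLeaMemReference above.
    #include <string>

    std::string formatATTMem(long Disp, const std::string &Base,
                             const std::string &Index, unsigned Scale) {
      std::string Out;
      bool HasParenPart = !Base.empty() || !Index.empty();
      if (Disp != 0 || !HasParenPart)
        Out += std::to_string(Disp);          // displacement is printed first
      if (HasParenPart) {
        Out += '(';
        if (!Base.empty())
          Out += '%' + Base;                  // e.g. "%rbx"
        if (!Index.empty()) {
          Out += ",%" + Index;                // e.g. ",%rcx"
          if (Scale != 1)
            Out += ',' + std::to_string(Scale);
        }
        Out += ')';
      }
      return Out; // formatATTMem(8, "rbx", "rcx", 4) == "8(%rbx,%rcx,4)"
    }
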
+
+static bool printAsmMRegister(const X86AsmPrinter &P, const MachineOperand &MO,
+ char Mode, raw_ostream &O) {
+ Register Reg = MO.getReg();
+ bool EmitPercent = MO.getParent()->getInlineAsmDialect() == InlineAsm::AD_ATT;
+
+ if (!X86::GR8RegClass.contains(Reg) &&
+ !X86::GR16RegClass.contains(Reg) &&
+ !X86::GR32RegClass.contains(Reg) &&
+ !X86::GR64RegClass.contains(Reg))
+ return true;
+
+ switch (Mode) {
+ default: return true; // Unknown mode.
+ case 'b': // Print QImode register
+ Reg = getX86SubSuperRegister(Reg, 8);
+ break;
+ case 'h': // Print QImode high register
+ Reg = getX86SubSuperRegister(Reg, 8, true);
+ break;
+ case 'w': // Print HImode register
+ Reg = getX86SubSuperRegister(Reg, 16);
+ break;
+ case 'k': // Print SImode register
+ Reg = getX86SubSuperRegister(Reg, 32);
+ break;
+ case 'V':
+ EmitPercent = false;
+ LLVM_FALLTHROUGH;
+ case 'q':
+ // Print 64-bit register names if 64-bit integer registers are available.
+ // Otherwise, print 32-bit register names.
+ Reg = getX86SubSuperRegister(Reg, P.getSubtarget().is64Bit() ? 64 : 32);
+ break;
+ }
+
+ if (EmitPercent)
+ O << '%';
+
+ O << X86ATTInstPrinter::getRegisterName(Reg);
+ return false;
+}
+
+static bool printAsmVRegister(const MachineOperand &MO, char Mode,
+ raw_ostream &O) {
+ Register Reg = MO.getReg();
+ bool EmitPercent = MO.getParent()->getInlineAsmDialect() == InlineAsm::AD_ATT;
+
+ unsigned Index;
+ if (X86::VR128XRegClass.contains(Reg))
+ Index = Reg - X86::XMM0;
+ else if (X86::VR256XRegClass.contains(Reg))
+ Index = Reg - X86::YMM0;
+ else if (X86::VR512RegClass.contains(Reg))
+ Index = Reg - X86::ZMM0;
+ else
+ return true;
+
+ switch (Mode) {
+ default: // Unknown mode.
+ return true;
+ case 'x': // Print V4SFmode register
+ Reg = X86::XMM0 + Index;
+ break;
+ case 't': // Print V8SFmode register
+ Reg = X86::YMM0 + Index;
+ break;
+ case 'g': // Print V16SFmode register
+ Reg = X86::ZMM0 + Index;
+ break;
+ }
+
+ if (EmitPercent)
+ O << '%';
+
+ O << X86ATTInstPrinter::getRegisterName(Reg);
+ return false;
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *ExtraCode, raw_ostream &O) {
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ const MachineOperand &MO = MI->getOperand(OpNo);
+
+ switch (ExtraCode[0]) {
+ default:
+ // See if this is a generic print operand
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O);
+ case 'a': // This is an address. Currently only 'i' and 'r' are expected.
+ switch (MO.getType()) {
+ default:
+ return true;
+ case MachineOperand::MO_Immediate:
+ O << MO.getImm();
+ return false;
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_JumpTableIndex:
+ case MachineOperand::MO_ExternalSymbol:
+ llvm_unreachable("unexpected operand type!");
+ case MachineOperand::MO_GlobalAddress:
+ PrintSymbolOperand(MO, O);
+ if (Subtarget->isPICStyleRIPRel())
+ O << "(%rip)";
+ return false;
+ case MachineOperand::MO_Register:
+ O << '(';
+ PrintOperand(MI, OpNo, O);
+ O << ')';
+ return false;
+ }
+
+ case 'c': // Don't print "$" before a global var name or constant.
+ switch (MO.getType()) {
+ default:
+ PrintOperand(MI, OpNo, O);
+ break;
+ case MachineOperand::MO_Immediate:
+ O << MO.getImm();
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_JumpTableIndex:
+ case MachineOperand::MO_ExternalSymbol:
+ llvm_unreachable("unexpected operand type!");
+ case MachineOperand::MO_GlobalAddress:
+ PrintSymbolOperand(MO, O);
+ break;
+ }
+ return false;
+
+ case 'A': // Print '*' before a register (it must be a register)
+ if (MO.isReg()) {
+ O << '*';
+ PrintOperand(MI, OpNo, O);
+ return false;
+ }
+ return true;
+
+ case 'b': // Print QImode register
+ case 'h': // Print QImode high register
+ case 'w': // Print HImode register
+ case 'k': // Print SImode register
+ case 'q': // Print DImode register
+ case 'V': // Print native register without '%'
+ if (MO.isReg())
+ return printAsmMRegister(*this, MO, ExtraCode[0], O);
+ PrintOperand(MI, OpNo, O);
+ return false;
+
+ case 'x': // Print V4SFmode register
+ case 't': // Print V8SFmode register
+ case 'g': // Print V16SFmode register
+ if (MO.isReg())
+ return printAsmVRegister(MO, ExtraCode[0], O);
+ PrintOperand(MI, OpNo, O);
+ return false;
+
+ case 'P': // This is the operand of a call, treat specially.
+ PrintPCRelImm(MI, OpNo, O);
+ return false;
+
+ case 'n': // Negate the immediate or print a '-' before the operand.
+ // Note: this is a temporary solution. It should be handled target
+ // independently as part of the 'MC' work.
+ if (MO.isImm()) {
+ O << -MO.getImm();
+ return false;
+ }
+ O << '-';
+ }
+ }
+
+ PrintOperand(MI, OpNo, O);
+ return false;
+}
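
These single-letter codes are the GCC/Clang inline-assembly operand modifiers, so user code can ask for a specific sub-register width. A small hypothetical example that exercises the 'b' and 'k' paths handled above:

    // Hypothetical user code (not from the LLVM tree). %b1 prints the 8-bit
    // sub-register of operand 1 and %k0 prints the 32-bit sub-register of
    // operand 0 -- the 'b' and 'k' cases handled by printAsmMRegister above.
    unsigned zeroExtendLowByte(unsigned long X) {
      unsigned long Out;
      asm("movzbl %b1, %k0"   // may expand to e.g. "movzbl %dil, %eax"
          : "=r"(Out)
          : "q"(X));          // 'q' keeps the input in a byte-addressable GPR
      return static_cast<unsigned>(Out);
    }
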
+
+bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *ExtraCode,
+ raw_ostream &O) {
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'b': // Print QImode register
+ case 'h': // Print QImode high register
+ case 'w': // Print HImode register
+ case 'k': // Print SImode register
+    case 'q': // Print DImode register
+ // These only apply to registers, ignore on mem.
+ break;
+ case 'H':
+ if (MI->getInlineAsmDialect() == InlineAsm::AD_Intel) {
+ return true; // Unsupported modifier in Intel inline assembly.
+ } else {
+ PrintMemReference(MI, OpNo, O, "H");
+ }
+ return false;
+ case 'P': // Don't print @PLT, but do print as memory.
+ if (MI->getInlineAsmDialect() == InlineAsm::AD_Intel) {
+ PrintIntelMemReference(MI, OpNo, O, "no-rip");
+ } else {
+ PrintMemReference(MI, OpNo, O, "no-rip");
+ }
+ return false;
+ }
+ }
+ if (MI->getInlineAsmDialect() == InlineAsm::AD_Intel) {
+ PrintIntelMemReference(MI, OpNo, O, nullptr);
+ } else {
+ PrintMemReference(MI, OpNo, O, nullptr);
+ }
+ return false;
+}
+
+void X86AsmPrinter::emitStartOfAsmFile(Module &M) {
+ const Triple &TT = TM.getTargetTriple();
+
+ if (TT.isOSBinFormatELF()) {
+ // Assemble feature flags that may require creation of a note section.
+ unsigned FeatureFlagsAnd = 0;
+ if (M.getModuleFlag("cf-protection-branch"))
+ FeatureFlagsAnd |= ELF::GNU_PROPERTY_X86_FEATURE_1_IBT;
+ if (M.getModuleFlag("cf-protection-return"))
+ FeatureFlagsAnd |= ELF::GNU_PROPERTY_X86_FEATURE_1_SHSTK;
+
+ if (FeatureFlagsAnd) {
+ // Emit a .note.gnu.property section with the flags.
+ if (!TT.isArch32Bit() && !TT.isArch64Bit())
+ llvm_unreachable("CFProtection used on invalid architecture!");
+ MCSection *Cur = OutStreamer->getCurrentSectionOnly();
+ MCSection *Nt = MMI->getContext().getELFSection(
+ ".note.gnu.property", ELF::SHT_NOTE, ELF::SHF_ALLOC);
+ OutStreamer->SwitchSection(Nt);
+
+ // Emitting note header.
+ int WordSize = TT.isArch64Bit() ? 8 : 4;
+ emitAlignment(WordSize == 4 ? Align(4) : Align(8));
+ OutStreamer->emitIntValue(4, 4 /*size*/); // data size for "GNU\0"
+ OutStreamer->emitIntValue(8 + WordSize, 4 /*size*/); // Elf_Prop size
+ OutStreamer->emitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4 /*size*/);
+ OutStreamer->emitBytes(StringRef("GNU", 4)); // note name
+
+ // Emitting an Elf_Prop for the CET properties.
+ OutStreamer->emitInt32(ELF::GNU_PROPERTY_X86_FEATURE_1_AND);
+ OutStreamer->emitInt32(4); // data size
+ OutStreamer->emitInt32(FeatureFlagsAnd); // data
+ emitAlignment(WordSize == 4 ? Align(4) : Align(8)); // padding
+
+ OutStreamer->endSection(Nt);
+ OutStreamer->SwitchSection(Cur);
+ }
+ }
+
+ if (TT.isOSBinFormatMachO())
+ OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+
+ if (TT.isOSBinFormatCOFF()) {
+ // Emit an absolute @feat.00 symbol. This appears to be some kind of
+ // compiler features bitfield read by link.exe.
+ MCSymbol *S = MMI->getContext().getOrCreateSymbol(StringRef("@feat.00"));
+ OutStreamer->BeginCOFFSymbolDef(S);
+ OutStreamer->EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC);
+ OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL);
+ OutStreamer->EndCOFFSymbolDef();
+ int64_t Feat00Flags = 0;
+
+ if (TT.getArch() == Triple::x86) {
+ // According to the PE-COFF spec, the LSB of this value marks the object
+ // for "registered SEH". This means that all SEH handler entry points
+ // must be registered in .sxdata. Use of any unregistered handlers will
+ // cause the process to terminate immediately. LLVM does not know how to
+ // register any SEH handlers, so its object files should be safe.
+ Feat00Flags |= 1;
+ }
+
+ if (M.getModuleFlag("cfguard"))
+ Feat00Flags |= 0x800; // Object is CFG-aware.
+
+ OutStreamer->emitSymbolAttribute(S, MCSA_Global);
+ OutStreamer->emitAssignment(
+ S, MCConstantExpr::create(Feat00Flags, MMI->getContext()));
+ }
+ OutStreamer->emitSyntaxDirective();
+
+ // If this is not inline asm and we're in 16-bit
+ // mode prefix assembly with .code16.
+ bool is16 = TT.getEnvironment() == Triple::CODE16;
+ if (M.getModuleInlineAsm().empty() && is16)
+ OutStreamer->emitAssemblerFlag(MCAF_Code16);
+}
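
For a 64-bit ELF target the note emitted above has a fixed byte layout. The struct below is only an illustration of that layout (the type name is invented); the field values mirror the emitIntValue/emitInt32 calls in the function:

    // Sketch of the bytes produced for a 64-bit target (WordSize == 8).
    #include <cstdint>

    struct GnuPropertyNote64 {
      uint32_t namesz    = 4;                      // strlen("GNU") + 1
      uint32_t descsz    = 8 + 8;                  // Elf_Prop header + padded payload
      uint32_t type      = 5;                      // ELF::NT_GNU_PROPERTY_TYPE_0
      char     name[4]   = {'G', 'N', 'U', '\0'};
      uint32_t pr_type   = 0xc0000002;             // GNU_PROPERTY_X86_FEATURE_1_AND
      uint32_t pr_datasz = 4;
      uint32_t pr_data   = 0;                      // IBT and/or SHSTK bits
      uint32_t pad       = 0;                      // align the descriptor to 8 bytes
    };
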
+
+static void
+emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
+ MachineModuleInfoImpl::StubValueTy &MCSym) {
+ // L_foo$stub:
+ OutStreamer.emitLabel(StubLabel);
+ // .indirect_symbol _foo
+ OutStreamer.emitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol);
+
+ if (MCSym.getInt())
+ // External to current translation unit.
+ OutStreamer.emitIntValue(0, 4/*size*/);
+ else
+ // Internal to current translation unit.
+ //
+ // When we place the LSDA into the TEXT section, the type info
+ // pointers need to be indirect and pc-rel. We accomplish this by
+ // using NLPs; however, sometimes the types are local to the file.
+ // We need to fill in the value for the NLP in those cases.
+ OutStreamer.emitValue(
+ MCSymbolRefExpr::create(MCSym.getPointer(), OutStreamer.getContext()),
+ 4 /*size*/);
+}
+
+static void emitNonLazyStubs(MachineModuleInfo *MMI, MCStreamer &OutStreamer) {
+
+ MachineModuleInfoMachO &MMIMacho =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+ // Output stubs for dynamically-linked functions.
+ MachineModuleInfoMachO::SymbolListTy Stubs;
+
+ // Output stubs for external and common global variables.
+ Stubs = MMIMacho.GetGVStubList();
+ if (!Stubs.empty()) {
+ OutStreamer.SwitchSection(MMI->getContext().getMachOSection(
+ "__IMPORT", "__pointers", MachO::S_NON_LAZY_SYMBOL_POINTERS,
+ SectionKind::getMetadata()));
+
+ for (auto &Stub : Stubs)
+ emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second);
+
+ Stubs.clear();
+ OutStreamer.AddBlankLine();
+ }
+}
+
+void X86AsmPrinter::emitEndOfAsmFile(Module &M) {
+ const Triple &TT = TM.getTargetTriple();
+
+ if (TT.isOSBinFormatMachO()) {
+    // Mach-O uses non-lazy symbol stubs to encode per-TU information into a
+    // global table for symbol lookup.
+ emitNonLazyStubs(MMI, *OutStreamer);
+
+ // Emit stack and fault map information.
+ emitStackMaps(SM);
+ FM.serializeToFaultMapSection();
+
+    // This flag tells the linker that no global symbols contain code that falls
+ // through to other global symbols (e.g. an implementation of multiple entry
+ // points). If this doesn't occur, the linker can safely perform dead code
+ // stripping. Since LLVM never generates code that does this, it is always
+ // safe to set.
+ OutStreamer->emitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+ } else if (TT.isOSBinFormatCOFF()) {
+ if (MMI->usesMSVCFloatingPoint()) {
+ // In Windows' libcmt.lib, there is a file which is linked in only if the
+ // symbol _fltused is referenced. Linking this in causes some
+ // side-effects:
+ //
+ // 1. For x86-32, it will set the x87 rounding mode to 53-bit instead of
+ // 64-bit mantissas at program start.
+ //
+ // 2. It links in support routines for floating-point in scanf and printf.
+ //
+ // MSVC emits an undefined reference to _fltused when there are any
+ // floating point operations in the program (including calls). A program
+ // that only has: `scanf("%f", &global_float);` may fail to trigger this,
+ // but oh well...that's a documented issue.
+ StringRef SymbolName =
+ (TT.getArch() == Triple::x86) ? "__fltused" : "_fltused";
+ MCSymbol *S = MMI->getContext().getOrCreateSymbol(SymbolName);
+ OutStreamer->emitSymbolAttribute(S, MCSA_Global);
+ return;
+ }
+ emitStackMaps(SM);
+ } else if (TT.isOSBinFormatELF()) {
+ emitStackMaps(SM);
+ FM.serializeToFaultMapSection();
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Target Registry Stuff
+//===----------------------------------------------------------------------===//
+
+// Force static initialization.
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86AsmPrinter() {
+ RegisterAsmPrinter<X86AsmPrinter> X(getTheX86_32Target());
+ RegisterAsmPrinter<X86AsmPrinter> Y(getTheX86_64Target());
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.h
new file mode 100644
index 000000000000..a3b74c8ee387
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -0,0 +1,155 @@
+//===-- X86AsmPrinter.h - X86 implementation of AsmPrinter ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86ASMPRINTER_H
+#define LLVM_LIB_TARGET_X86_X86ASMPRINTER_H
+
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/FaultMaps.h"
+#include "llvm/CodeGen/StackMaps.h"
+
+// Implemented in X86MCInstLower.cpp
+namespace {
+ class X86MCInstLower;
+}
+
+namespace llvm {
+class MCCodeEmitter;
+class MCStreamer;
+class X86Subtarget;
+class TargetMachine;
+
+class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
+ const X86Subtarget *Subtarget = nullptr;
+ StackMaps SM;
+ FaultMaps FM;
+ std::unique_ptr<MCCodeEmitter> CodeEmitter;
+ bool EmitFPOData = false;
+ bool NeedsRetpoline = false;
+
+ // This utility class tracks the length of a stackmap instruction's 'shadow'.
+ // It is used by the X86AsmPrinter to ensure that the stackmap shadow
+ // invariants (i.e. no other stackmaps, patchpoints, or control flow within
+ // the shadow) are met, while outputting a minimal number of NOPs for padding.
+ //
+ // To minimise the number of NOPs used, the shadow tracker counts the number
+ // of instruction bytes output since the last stackmap. Only if there are too
+ // few instruction bytes to cover the shadow are NOPs used for padding.
+ class StackMapShadowTracker {
+ public:
+ void startFunction(MachineFunction &MF) {
+ this->MF = &MF;
+ }
+ void count(MCInst &Inst, const MCSubtargetInfo &STI,
+ MCCodeEmitter *CodeEmitter);
+
+ // Called to signal the start of a shadow of RequiredSize bytes.
+ void reset(unsigned RequiredSize) {
+ RequiredShadowSize = RequiredSize;
+ CurrentShadowSize = 0;
+ InShadow = true;
+ }
+
+ // Called before every stackmap/patchpoint, and at the end of basic blocks,
+ // to emit any necessary padding-NOPs.
+ void emitShadowPadding(MCStreamer &OutStreamer, const MCSubtargetInfo &STI);
+ private:
+ const MachineFunction *MF = nullptr;
+ bool InShadow = false;
+
+ // RequiredShadowSize holds the length of the shadow specified in the most
+ // recently encountered STACKMAP instruction.
+ // CurrentShadowSize counts the number of bytes encoded since the most
+ // recently encountered STACKMAP, stopping when that number is greater than
+ // or equal to RequiredShadowSize.
+ unsigned RequiredShadowSize = 0, CurrentShadowSize = 0;
+ };
+
+ StackMapShadowTracker SMShadowTracker;
+
+ // All instructions emitted by the X86AsmPrinter should use this helper
+ // method.
+ //
+ // This helper function invokes the SMShadowTracker on each instruction before
+ // outputting it to the OutStream. This allows the shadow tracker to minimise
+ // the number of NOPs used for stackmap padding.
+ void EmitAndCountInstruction(MCInst &Inst);
+ void LowerSTACKMAP(const MachineInstr &MI);
+ void LowerPATCHPOINT(const MachineInstr &MI, X86MCInstLower &MCIL);
+ void LowerSTATEPOINT(const MachineInstr &MI, X86MCInstLower &MCIL);
+ void LowerFAULTING_OP(const MachineInstr &MI, X86MCInstLower &MCIL);
+ void LowerPATCHABLE_OP(const MachineInstr &MI, X86MCInstLower &MCIL);
+
+ void LowerTlsAddr(X86MCInstLower &MCInstLowering, const MachineInstr &MI);
+
+ // XRay-specific lowering for X86.
+ void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
+ X86MCInstLower &MCIL);
+ void LowerPATCHABLE_RET(const MachineInstr &MI, X86MCInstLower &MCIL);
+ void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
+ void LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
+ void LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
+ X86MCInstLower &MCIL);
+
+ void LowerFENTRY_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
+
+ // Choose between emitting .seh_ directives and .cv_fpo_ directives.
+ void EmitSEHInstruction(const MachineInstr *MI);
+
+ void PrintSymbolOperand(const MachineOperand &MO, raw_ostream &O) override;
+ void PrintOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
+ void PrintModifiedOperand(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O, const char *Modifier);
+ void PrintPCRelImm(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
+ void PrintLeaMemReference(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O, const char *Modifier);
+ void PrintMemReference(const MachineInstr *MI, unsigned OpNo, raw_ostream &O,
+ const char *Modifier);
+ void PrintIntelMemReference(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O, const char *Modifier);
+
+public:
+ X86AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer);
+
+ StringRef getPassName() const override {
+ return "X86 Assembly Printer";
+ }
+
+ const X86Subtarget &getSubtarget() const { return *Subtarget; }
+
+ void emitStartOfAsmFile(Module &M) override;
+
+ void emitEndOfAsmFile(Module &M) override;
+
+ void emitInstruction(const MachineInstr *MI) override;
+
+ void emitBasicBlockEnd(const MachineBasicBlock &MBB) override {
+ AsmPrinter::emitBasicBlockEnd(MBB);
+ SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
+ }
+
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *ExtraCode, raw_ostream &O) override;
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *ExtraCode, raw_ostream &O) override;
+
+ bool doInitialization(Module &M) override {
+ SMShadowTracker.reset(0);
+ SM.reset();
+ FM.reset();
+ return AsmPrinter::doInitialization(M);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void emitFunctionBodyStart() override;
+ void emitFunctionBodyEnd() override;
+};
+
+} // end namespace llvm
+
+#endif
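
The StackMapShadowTracker declared above is pure byte accounting: reset() opens a shadow, count() charges each emitted instruction against it, and the padding step emits only the NOP bytes still missing. A tiny stand-alone model of that accounting, using a hypothetical class and no LLVM types:

    // Stand-alone model of the reset/count/padding accounting performed by
    // StackMapShadowTracker.
    #include <cstdio>

    class ShadowModel {
      unsigned Required = 0, Current = 0;
      bool InShadow = false;

    public:
      void reset(unsigned RequiredSize) { // a STACKMAP opened a shadow
        Required = RequiredSize;
        Current = 0;
        InShadow = true;
      }
      void count(unsigned InstBytes) {    // each emitted instruction reports its size
        if (InShadow)
          Current += InstBytes;
      }
      unsigned paddingNeeded() const {    // NOP bytes still owed at a flush point
        return InShadow && Required > Current ? Required - Current : 0;
      }
    };

    int main() {
      ShadowModel M;
      M.reset(8); // STACKMAP with an 8-byte shadow
      M.count(5); // a single 5-byte instruction was emitted after it
      std::printf("%u NOP bytes of padding\n", M.paddingNeeded()); // prints 3
      return 0;
    }
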
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
new file mode 100644
index 000000000000..fdc65acffe3d
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -0,0 +1,735 @@
+//===- X86AvoidStoreForwardingBlocks.cpp - Avoid HW Store Forward Block ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// If a load follows a store and reloads data that the store has written to
+// memory, Intel microarchitectures can in many cases forward the data directly
+// from the store to the load. This "store forwarding" saves cycles by enabling
+// the load to directly obtain the data instead of accessing the data from
+// cache or memory.
+// A "store forward block" occurs when a store cannot be forwarded to the
+// load. The most typical case on Intel Core microarchitectures is a small
+// store that cannot be forwarded to a large load.
+// The estimated penalty for a store forward block is ~13 cycles.
+//
+// This pass tries to recognize and handle cases where a "store forward block"
+// is created by the compiler when lowering memcpy calls to a sequence
+// of a load and a store.
+//
+// The pass currently only handles cases where the memcpy is lowered to
+// XMM/YMM registers; it tries to break the memcpy into smaller copies.
+// Breaking the memcpy should be possible since there is no atomicity
+// guarantee for loads and stores to XMM/YMM.
+//
+// It could be better for performance to solve the problem by loading
+// into XMM/YMM, inserting the partial store, and then storing back from XMM/YMM
+// to memory, but this would result in a more conservative optimization since it
+// requires proving that every memory access between the blocking store and the
+// load either must-aliases or does not alias the store before we can move the
+// store, whereas the transformation done here is correct regardless of other
+// memory accesses.
+//===----------------------------------------------------------------------===//
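
Concretely, the blocked pattern is a narrow store into part of a buffer followed by a wide XMM/YMM reload of the whole buffer, typically produced when a memcpy is lowered. The fix applied by buildCopies later in this file is a greedy 16/8/4/2/1-byte decomposition of the wide copy, sketched here as stand-alone code that is not part of the pass:

    // Stand-alone sketch of the greedy 16/8/4/2/1-byte breakdown that
    // X86AvoidSFBPass::buildCopies applies to a blocked copy.
    #include <cstdio>
    #include <vector>

    std::vector<int> splitCopy(int Size, bool IsYMMCopy) {
      static const int Chunks[] = {16, 8, 4, 2, 1};
      std::vector<int> Pieces;
      for (int C : Chunks) {
        if (C == 16 && !IsYMMCopy)
          continue; // 16-byte sub-copies are only used when splitting YMM copies
        while (Size >= C) {
          Pieces.push_back(C);
          Size -= C;
        }
      }
      return Pieces;
    }

    int main() {
      for (int P : splitCopy(32, /*IsYMMCopy=*/true)) // a blocked 32-byte YMM copy
        std::printf("%d ", P);                        // prints "16 16"
      std::printf("\n");
      return 0;
    }
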
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCInstrDesc.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-avoid-SFB"
+
+static cl::opt<bool> DisableX86AvoidStoreForwardBlocks(
+ "x86-disable-avoid-SFB", cl::Hidden,
+ cl::desc("X86: Disable Store Forwarding Blocks fixup."), cl::init(false));
+
+static cl::opt<unsigned> X86AvoidSFBInspectionLimit(
+ "x86-sfb-inspection-limit",
+ cl::desc("X86: Number of instructions backward to "
+ "inspect for store forwarding blocks."),
+ cl::init(20), cl::Hidden);
+
+namespace {
+
+using DisplacementSizeMap = std::map<int64_t, unsigned>;
+
+class X86AvoidSFBPass : public MachineFunctionPass {
+public:
+ static char ID;
+ X86AvoidSFBPass() : MachineFunctionPass(ID) { }
+
+ StringRef getPassName() const override {
+ return "X86 Avoid Store Forwarding Blocks";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<AAResultsWrapperPass>();
+ }
+
+private:
+ MachineRegisterInfo *MRI = nullptr;
+ const X86InstrInfo *TII = nullptr;
+ const X86RegisterInfo *TRI = nullptr;
+ SmallVector<std::pair<MachineInstr *, MachineInstr *>, 2>
+ BlockedLoadsStoresPairs;
+ SmallVector<MachineInstr *, 2> ForRemoval;
+ AliasAnalysis *AA = nullptr;
+
+  /// Finds pairs of a load followed by a store to memory that look
+  /// like a memcpy.
+ void findPotentiallylBlockedCopies(MachineFunction &MF);
+ /// Break the memcpy's load and store into smaller copies
+ /// such that each memory load that was blocked by a smaller store
+ /// would now be copied separately.
+ void breakBlockedCopies(MachineInstr *LoadInst, MachineInstr *StoreInst,
+ const DisplacementSizeMap &BlockingStoresDispSizeMap);
+ /// Break a copy of size Size to smaller copies.
+ void buildCopies(int Size, MachineInstr *LoadInst, int64_t LdDispImm,
+ MachineInstr *StoreInst, int64_t StDispImm,
+ int64_t LMMOffset, int64_t SMMOffset);
+
+ void buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, int64_t LoadDisp,
+ MachineInstr *StoreInst, unsigned NStoreOpcode,
+ int64_t StoreDisp, unsigned Size, int64_t LMMOffset,
+ int64_t SMMOffset);
+
+ bool alias(const MachineMemOperand &Op1, const MachineMemOperand &Op2) const;
+
+ unsigned getRegSizeInBytes(MachineInstr *Inst);
+};
+
+} // end anonymous namespace
+
+char X86AvoidSFBPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86AvoidSFBPass, DEBUG_TYPE,
+                      "X86 avoid store forwarding blocks", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(X86AvoidSFBPass, DEBUG_TYPE,
+                    "X86 avoid store forwarding blocks", false, false)
+
+FunctionPass *llvm::createX86AvoidStoreForwardingBlocks() {
+ return new X86AvoidSFBPass();
+}
+
+static bool isXMMLoadOpcode(unsigned Opcode) {
+ return Opcode == X86::MOVUPSrm || Opcode == X86::MOVAPSrm ||
+ Opcode == X86::VMOVUPSrm || Opcode == X86::VMOVAPSrm ||
+ Opcode == X86::VMOVUPDrm || Opcode == X86::VMOVAPDrm ||
+ Opcode == X86::VMOVDQUrm || Opcode == X86::VMOVDQArm ||
+ Opcode == X86::VMOVUPSZ128rm || Opcode == X86::VMOVAPSZ128rm ||
+ Opcode == X86::VMOVUPDZ128rm || Opcode == X86::VMOVAPDZ128rm ||
+ Opcode == X86::VMOVDQU64Z128rm || Opcode == X86::VMOVDQA64Z128rm ||
+ Opcode == X86::VMOVDQU32Z128rm || Opcode == X86::VMOVDQA32Z128rm;
+}
+static bool isYMMLoadOpcode(unsigned Opcode) {
+ return Opcode == X86::VMOVUPSYrm || Opcode == X86::VMOVAPSYrm ||
+ Opcode == X86::VMOVUPDYrm || Opcode == X86::VMOVAPDYrm ||
+ Opcode == X86::VMOVDQUYrm || Opcode == X86::VMOVDQAYrm ||
+ Opcode == X86::VMOVUPSZ256rm || Opcode == X86::VMOVAPSZ256rm ||
+ Opcode == X86::VMOVUPDZ256rm || Opcode == X86::VMOVAPDZ256rm ||
+ Opcode == X86::VMOVDQU64Z256rm || Opcode == X86::VMOVDQA64Z256rm ||
+ Opcode == X86::VMOVDQU32Z256rm || Opcode == X86::VMOVDQA32Z256rm;
+}
+
+static bool isPotentialBlockedMemCpyLd(unsigned Opcode) {
+ return isXMMLoadOpcode(Opcode) || isYMMLoadOpcode(Opcode);
+}
+
+static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode) {
+ switch (LdOpcode) {
+ case X86::MOVUPSrm:
+ case X86::MOVAPSrm:
+ return StOpcode == X86::MOVUPSmr || StOpcode == X86::MOVAPSmr;
+ case X86::VMOVUPSrm:
+ case X86::VMOVAPSrm:
+ return StOpcode == X86::VMOVUPSmr || StOpcode == X86::VMOVAPSmr;
+ case X86::VMOVUPDrm:
+ case X86::VMOVAPDrm:
+ return StOpcode == X86::VMOVUPDmr || StOpcode == X86::VMOVAPDmr;
+ case X86::VMOVDQUrm:
+ case X86::VMOVDQArm:
+ return StOpcode == X86::VMOVDQUmr || StOpcode == X86::VMOVDQAmr;
+ case X86::VMOVUPSZ128rm:
+ case X86::VMOVAPSZ128rm:
+ return StOpcode == X86::VMOVUPSZ128mr || StOpcode == X86::VMOVAPSZ128mr;
+ case X86::VMOVUPDZ128rm:
+ case X86::VMOVAPDZ128rm:
+ return StOpcode == X86::VMOVUPDZ128mr || StOpcode == X86::VMOVAPDZ128mr;
+ case X86::VMOVUPSYrm:
+ case X86::VMOVAPSYrm:
+ return StOpcode == X86::VMOVUPSYmr || StOpcode == X86::VMOVAPSYmr;
+ case X86::VMOVUPDYrm:
+ case X86::VMOVAPDYrm:
+ return StOpcode == X86::VMOVUPDYmr || StOpcode == X86::VMOVAPDYmr;
+ case X86::VMOVDQUYrm:
+ case X86::VMOVDQAYrm:
+ return StOpcode == X86::VMOVDQUYmr || StOpcode == X86::VMOVDQAYmr;
+ case X86::VMOVUPSZ256rm:
+ case X86::VMOVAPSZ256rm:
+ return StOpcode == X86::VMOVUPSZ256mr || StOpcode == X86::VMOVAPSZ256mr;
+ case X86::VMOVUPDZ256rm:
+ case X86::VMOVAPDZ256rm:
+ return StOpcode == X86::VMOVUPDZ256mr || StOpcode == X86::VMOVAPDZ256mr;
+ case X86::VMOVDQU64Z128rm:
+ case X86::VMOVDQA64Z128rm:
+ return StOpcode == X86::VMOVDQU64Z128mr || StOpcode == X86::VMOVDQA64Z128mr;
+ case X86::VMOVDQU32Z128rm:
+ case X86::VMOVDQA32Z128rm:
+ return StOpcode == X86::VMOVDQU32Z128mr || StOpcode == X86::VMOVDQA32Z128mr;
+ case X86::VMOVDQU64Z256rm:
+ case X86::VMOVDQA64Z256rm:
+ return StOpcode == X86::VMOVDQU64Z256mr || StOpcode == X86::VMOVDQA64Z256mr;
+ case X86::VMOVDQU32Z256rm:
+ case X86::VMOVDQA32Z256rm:
+ return StOpcode == X86::VMOVDQU32Z256mr || StOpcode == X86::VMOVDQA32Z256mr;
+ default:
+ return false;
+ }
+}
+
+static bool isPotentialBlockingStoreInst(unsigned Opcode, unsigned LoadOpcode) {
+ bool PBlock = false;
+ PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 ||
+ Opcode == X86::MOV32mr || Opcode == X86::MOV32mi ||
+ Opcode == X86::MOV16mr || Opcode == X86::MOV16mi ||
+ Opcode == X86::MOV8mr || Opcode == X86::MOV8mi;
+ if (isYMMLoadOpcode(LoadOpcode))
+ PBlock |= Opcode == X86::VMOVUPSmr || Opcode == X86::VMOVAPSmr ||
+ Opcode == X86::VMOVUPDmr || Opcode == X86::VMOVAPDmr ||
+ Opcode == X86::VMOVDQUmr || Opcode == X86::VMOVDQAmr ||
+ Opcode == X86::VMOVUPSZ128mr || Opcode == X86::VMOVAPSZ128mr ||
+ Opcode == X86::VMOVUPDZ128mr || Opcode == X86::VMOVAPDZ128mr ||
+ Opcode == X86::VMOVDQU64Z128mr ||
+ Opcode == X86::VMOVDQA64Z128mr ||
+ Opcode == X86::VMOVDQU32Z128mr || Opcode == X86::VMOVDQA32Z128mr;
+ return PBlock;
+}
+
+static const int MOV128SZ = 16;
+static const int MOV64SZ = 8;
+static const int MOV32SZ = 4;
+static const int MOV16SZ = 2;
+static const int MOV8SZ = 1;
+
+static unsigned getYMMtoXMMLoadOpcode(unsigned LoadOpcode) {
+ switch (LoadOpcode) {
+ case X86::VMOVUPSYrm:
+ case X86::VMOVAPSYrm:
+ return X86::VMOVUPSrm;
+ case X86::VMOVUPDYrm:
+ case X86::VMOVAPDYrm:
+ return X86::VMOVUPDrm;
+ case X86::VMOVDQUYrm:
+ case X86::VMOVDQAYrm:
+ return X86::VMOVDQUrm;
+ case X86::VMOVUPSZ256rm:
+ case X86::VMOVAPSZ256rm:
+ return X86::VMOVUPSZ128rm;
+ case X86::VMOVUPDZ256rm:
+ case X86::VMOVAPDZ256rm:
+ return X86::VMOVUPDZ128rm;
+ case X86::VMOVDQU64Z256rm:
+ case X86::VMOVDQA64Z256rm:
+ return X86::VMOVDQU64Z128rm;
+ case X86::VMOVDQU32Z256rm:
+ case X86::VMOVDQA32Z256rm:
+ return X86::VMOVDQU32Z128rm;
+ default:
+ llvm_unreachable("Unexpected Load Instruction Opcode");
+ }
+ return 0;
+}
+
+static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode) {
+ switch (StoreOpcode) {
+ case X86::VMOVUPSYmr:
+ case X86::VMOVAPSYmr:
+ return X86::VMOVUPSmr;
+ case X86::VMOVUPDYmr:
+ case X86::VMOVAPDYmr:
+ return X86::VMOVUPDmr;
+ case X86::VMOVDQUYmr:
+ case X86::VMOVDQAYmr:
+ return X86::VMOVDQUmr;
+ case X86::VMOVUPSZ256mr:
+ case X86::VMOVAPSZ256mr:
+ return X86::VMOVUPSZ128mr;
+ case X86::VMOVUPDZ256mr:
+ case X86::VMOVAPDZ256mr:
+ return X86::VMOVUPDZ128mr;
+ case X86::VMOVDQU64Z256mr:
+ case X86::VMOVDQA64Z256mr:
+ return X86::VMOVDQU64Z128mr;
+ case X86::VMOVDQU32Z256mr:
+ case X86::VMOVDQA32Z256mr:
+ return X86::VMOVDQU32Z128mr;
+ default:
+    llvm_unreachable("Unexpected Store Instruction Opcode");
+ }
+ return 0;
+}
+
+static int getAddrOffset(const MachineInstr *MI) {
+ const MCInstrDesc &Descl = MI->getDesc();
+ int AddrOffset = X86II::getMemoryOperandNo(Descl.TSFlags);
+ assert(AddrOffset != -1 && "Expected Memory Operand");
+ AddrOffset += X86II::getOperandBias(Descl);
+ return AddrOffset;
+}
+
+static MachineOperand &getBaseOperand(MachineInstr *MI) {
+ int AddrOffset = getAddrOffset(MI);
+ return MI->getOperand(AddrOffset + X86::AddrBaseReg);
+}
+
+static MachineOperand &getDispOperand(MachineInstr *MI) {
+ int AddrOffset = getAddrOffset(MI);
+ return MI->getOperand(AddrOffset + X86::AddrDisp);
+}
+
+// Relevant addressing modes contain only base register and immediate
+// displacement or frameindex and immediate displacement.
+// TODO: Consider expanding to other addressing modes in the future
+static bool isRelevantAddressingMode(MachineInstr *MI) {
+ int AddrOffset = getAddrOffset(MI);
+ const MachineOperand &Base = getBaseOperand(MI);
+ const MachineOperand &Disp = getDispOperand(MI);
+ const MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
+ const MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
+ const MachineOperand &Segment = MI->getOperand(AddrOffset + X86::AddrSegmentReg);
+
+ if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
+ return false;
+ if (!Disp.isImm())
+ return false;
+ if (Scale.getImm() != 1)
+ return false;
+ if (!(Index.isReg() && Index.getReg() == X86::NoRegister))
+ return false;
+ if (!(Segment.isReg() && Segment.getReg() == X86::NoRegister))
+ return false;
+ return true;
+}
+
+// Collect potentially blocking stores.
+// Limit the number of instructions we inspect backwards, since the effect of
+// the store forward block won't be visible if the store and load instructions
+// have enough other instructions in between to keep the core busy.
+static SmallVector<MachineInstr *, 2>
+findPotentialBlockers(MachineInstr *LoadInst) {
+ SmallVector<MachineInstr *, 2> PotentialBlockers;
+ unsigned BlockCount = 0;
+ const unsigned InspectionLimit = X86AvoidSFBInspectionLimit;
+ for (auto PBInst = std::next(MachineBasicBlock::reverse_iterator(LoadInst)),
+ E = LoadInst->getParent()->rend();
+ PBInst != E; ++PBInst) {
+ if (PBInst->isMetaInstruction())
+ continue;
+ BlockCount++;
+ if (BlockCount >= InspectionLimit)
+ break;
+ MachineInstr &MI = *PBInst;
+ if (MI.getDesc().isCall())
+ return PotentialBlockers;
+ PotentialBlockers.push_back(&MI);
+ }
+  // If we didn't reach the instruction limit, try the predecessor blocks.
+ // Ideally we should traverse the predecessor blocks in depth with some
+ // coloring algorithm, but for now let's just look at the first order
+ // predecessors.
+ if (BlockCount < InspectionLimit) {
+ MachineBasicBlock *MBB = LoadInst->getParent();
+ int LimitLeft = InspectionLimit - BlockCount;
+ for (MachineBasicBlock::pred_iterator PB = MBB->pred_begin(),
+ PE = MBB->pred_end();
+ PB != PE; ++PB) {
+ MachineBasicBlock *PMBB = *PB;
+ int PredCount = 0;
+ for (MachineBasicBlock::reverse_iterator PBInst = PMBB->rbegin(),
+ PME = PMBB->rend();
+ PBInst != PME; ++PBInst) {
+ if (PBInst->isMetaInstruction())
+ continue;
+ PredCount++;
+ if (PredCount >= LimitLeft)
+ break;
+ if (PBInst->getDesc().isCall())
+ break;
+ PotentialBlockers.push_back(&*PBInst);
+ }
+ }
+ }
+ return PotentialBlockers;
+}
+
+void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
+ int64_t LoadDisp, MachineInstr *StoreInst,
+ unsigned NStoreOpcode, int64_t StoreDisp,
+ unsigned Size, int64_t LMMOffset,
+ int64_t SMMOffset) {
+ MachineOperand &LoadBase = getBaseOperand(LoadInst);
+ MachineOperand &StoreBase = getBaseOperand(StoreInst);
+ MachineBasicBlock *MBB = LoadInst->getParent();
+ MachineMemOperand *LMMO = *LoadInst->memoperands_begin();
+ MachineMemOperand *SMMO = *StoreInst->memoperands_begin();
+
+ Register Reg1 = MRI->createVirtualRegister(
+ TII->getRegClass(TII->get(NLoadOpcode), 0, TRI, *(MBB->getParent())));
+ MachineInstr *NewLoad =
+ BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode),
+ Reg1)
+ .add(LoadBase)
+ .addImm(1)
+ .addReg(X86::NoRegister)
+ .addImm(LoadDisp)
+ .addReg(X86::NoRegister)
+ .addMemOperand(
+ MBB->getParent()->getMachineMemOperand(LMMO, LMMOffset, Size));
+ if (LoadBase.isReg())
+ getBaseOperand(NewLoad).setIsKill(false);
+ LLVM_DEBUG(NewLoad->dump());
+ // If the load and store are consecutive, use the loadInst location to
+ // reduce register pressure.
+ MachineInstr *StInst = StoreInst;
+ auto PrevInstrIt = prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
+ MBB->instr_begin());
+ if (PrevInstrIt.getNodePtr() == LoadInst)
+ StInst = LoadInst;
+ MachineInstr *NewStore =
+ BuildMI(*MBB, StInst, StInst->getDebugLoc(), TII->get(NStoreOpcode))
+ .add(StoreBase)
+ .addImm(1)
+ .addReg(X86::NoRegister)
+ .addImm(StoreDisp)
+ .addReg(X86::NoRegister)
+ .addReg(Reg1)
+ .addMemOperand(
+ MBB->getParent()->getMachineMemOperand(SMMO, SMMOffset, Size));
+ if (StoreBase.isReg())
+ getBaseOperand(NewStore).setIsKill(false);
+ MachineOperand &StoreSrcVReg = StoreInst->getOperand(X86::AddrNumOperands);
+ assert(StoreSrcVReg.isReg() && "Expected virtual register");
+ NewStore->getOperand(X86::AddrNumOperands).setIsKill(StoreSrcVReg.isKill());
+ LLVM_DEBUG(NewStore->dump());
+}
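+// Illustrative sketch (not from the original sources): a buildCopy call with
+// X86::MOV64rm / X86::MOV64mr and displacement 16 on both sides materializes
+// roughly
+//   movq 16(%loadbase), %newvreg
+//   movq %newvreg, 16(%storebase)
+// with each new instruction carrying a memory operand narrowed to the 8-byte
+// slice being copied.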
+
+void X86AvoidSFBPass::buildCopies(int Size, MachineInstr *LoadInst,
+ int64_t LdDispImm, MachineInstr *StoreInst,
+ int64_t StDispImm, int64_t LMMOffset,
+ int64_t SMMOffset) {
+ int LdDisp = LdDispImm;
+ int StDisp = StDispImm;
+ while (Size > 0) {
+ if ((Size - MOV128SZ >= 0) && isYMMLoadOpcode(LoadInst->getOpcode())) {
+ Size = Size - MOV128SZ;
+ buildCopy(LoadInst, getYMMtoXMMLoadOpcode(LoadInst->getOpcode()), LdDisp,
+ StoreInst, getYMMtoXMMStoreOpcode(StoreInst->getOpcode()),
+ StDisp, MOV128SZ, LMMOffset, SMMOffset);
+ LdDisp += MOV128SZ;
+ StDisp += MOV128SZ;
+ LMMOffset += MOV128SZ;
+ SMMOffset += MOV128SZ;
+ continue;
+ }
+ if (Size - MOV64SZ >= 0) {
+ Size = Size - MOV64SZ;
+ buildCopy(LoadInst, X86::MOV64rm, LdDisp, StoreInst, X86::MOV64mr, StDisp,
+ MOV64SZ, LMMOffset, SMMOffset);
+ LdDisp += MOV64SZ;
+ StDisp += MOV64SZ;
+ LMMOffset += MOV64SZ;
+ SMMOffset += MOV64SZ;
+ continue;
+ }
+ if (Size - MOV32SZ >= 0) {
+ Size = Size - MOV32SZ;
+ buildCopy(LoadInst, X86::MOV32rm, LdDisp, StoreInst, X86::MOV32mr, StDisp,
+ MOV32SZ, LMMOffset, SMMOffset);
+ LdDisp += MOV32SZ;
+ StDisp += MOV32SZ;
+ LMMOffset += MOV32SZ;
+ SMMOffset += MOV32SZ;
+ continue;
+ }
+ if (Size - MOV16SZ >= 0) {
+ Size = Size - MOV16SZ;
+ buildCopy(LoadInst, X86::MOV16rm, LdDisp, StoreInst, X86::MOV16mr, StDisp,
+ MOV16SZ, LMMOffset, SMMOffset);
+ LdDisp += MOV16SZ;
+ StDisp += MOV16SZ;
+ LMMOffset += MOV16SZ;
+ SMMOffset += MOV16SZ;
+ continue;
+ }
+ if (Size - MOV8SZ >= 0) {
+ Size = Size - MOV8SZ;
+ buildCopy(LoadInst, X86::MOV8rm, LdDisp, StoreInst, X86::MOV8mr, StDisp,
+ MOV8SZ, LMMOffset, SMMOffset);
+ LdDisp += MOV8SZ;
+ StDisp += MOV8SZ;
+ LMMOffset += MOV8SZ;
+ SMMOffset += MOV8SZ;
+ continue;
+ }
+ }
+ assert(Size == 0 && "Wrong size division");
+}
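+// Illustrative trace (not from the original sources): for a call with
+// Size == 10 (e.g. the slice between two blocking stores), the loop emits one
+// MOV64rm/MOV64mr pair for the first 8 bytes followed by one MOV16rm/MOV16mr
+// pair for the remaining 2 bytes, leaving Size == 0.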
+
+static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) {
+ MachineOperand &LoadBase = getBaseOperand(LoadInst);
+ MachineOperand &StoreBase = getBaseOperand(StoreInst);
+ auto *StorePrevNonDbgInstr =
+ prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
+ LoadInst->getParent()->instr_begin())
+ .getNodePtr();
+ if (LoadBase.isReg()) {
+ MachineInstr *LastLoad = LoadInst->getPrevNode();
+ // If the original load and store to xmm/ymm were consecutive
+ // then the partial copies were also created in
+ // a consecutive order to reduce register pressure,
+ // and the location of the last load is before the last store.
+ if (StorePrevNonDbgInstr == LoadInst)
+ LastLoad = LoadInst->getPrevNode()->getPrevNode();
+ getBaseOperand(LastLoad).setIsKill(LoadBase.isKill());
+ }
+ if (StoreBase.isReg()) {
+ MachineInstr *StInst = StoreInst;
+ if (StorePrevNonDbgInstr == LoadInst)
+ StInst = LoadInst;
+ getBaseOperand(StInst->getPrevNode()).setIsKill(StoreBase.isKill());
+ }
+}
+
+bool X86AvoidSFBPass::alias(const MachineMemOperand &Op1,
+ const MachineMemOperand &Op2) const {
+ if (!Op1.getValue() || !Op2.getValue())
+ return true;
+
+ int64_t MinOffset = std::min(Op1.getOffset(), Op2.getOffset());
+ int64_t Overlapa = Op1.getSize() + Op1.getOffset() - MinOffset;
+ int64_t Overlapb = Op2.getSize() + Op2.getOffset() - MinOffset;
+
+ AliasResult AAResult =
+ AA->alias(MemoryLocation(Op1.getValue(), Overlapa, Op1.getAAInfo()),
+ MemoryLocation(Op2.getValue(), Overlapb, Op2.getAAInfo()));
+ return AAResult != NoAlias;
+}
+
+void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
+ for (auto &MBB : MF)
+ for (auto &MI : MBB) {
+ if (!isPotentialBlockedMemCpyLd(MI.getOpcode()))
+ continue;
+ int DefVR = MI.getOperand(0).getReg();
+ if (!MRI->hasOneNonDBGUse(DefVR))
+ continue;
+ for (auto UI = MRI->use_nodbg_begin(DefVR), UE = MRI->use_nodbg_end();
+ UI != UE;) {
+ MachineOperand &StoreMO = *UI++;
+ MachineInstr &StoreMI = *StoreMO.getParent();
+ // Skip cases where the memcpy may overlap.
+ if (StoreMI.getParent() == MI.getParent() &&
+ isPotentialBlockedMemCpyPair(MI.getOpcode(), StoreMI.getOpcode()) &&
+ isRelevantAddressingMode(&MI) &&
+ isRelevantAddressingMode(&StoreMI) &&
+ MI.hasOneMemOperand() && StoreMI.hasOneMemOperand()) {
+ if (!alias(**MI.memoperands_begin(), **StoreMI.memoperands_begin()))
+ BlockedLoadsStoresPairs.push_back(std::make_pair(&MI, &StoreMI));
+ }
+ }
+ }
+}
+
+unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) {
+ const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI,
+ *LoadInst->getParent()->getParent());
+ return TRI->getRegSizeInBits(*TRC) / 8;
+}
+
+void X86AvoidSFBPass::breakBlockedCopies(
+ MachineInstr *LoadInst, MachineInstr *StoreInst,
+ const DisplacementSizeMap &BlockingStoresDispSizeMap) {
+ int64_t LdDispImm = getDispOperand(LoadInst).getImm();
+ int64_t StDispImm = getDispOperand(StoreInst).getImm();
+ int64_t LMMOffset = 0;
+ int64_t SMMOffset = 0;
+
+ int64_t LdDisp1 = LdDispImm;
+ int64_t LdDisp2 = 0;
+ int64_t StDisp1 = StDispImm;
+ int64_t StDisp2 = 0;
+ unsigned Size1 = 0;
+ unsigned Size2 = 0;
+ int64_t LdStDelta = StDispImm - LdDispImm;
+
+ for (auto DispSizePair : BlockingStoresDispSizeMap) {
+ LdDisp2 = DispSizePair.first;
+ StDisp2 = DispSizePair.first + LdStDelta;
+ Size2 = DispSizePair.second;
+ // Avoid copying overlapping areas.
+ if (LdDisp2 < LdDisp1) {
+ int OverlapDelta = LdDisp1 - LdDisp2;
+ LdDisp2 += OverlapDelta;
+ StDisp2 += OverlapDelta;
+ Size2 -= OverlapDelta;
+ }
+ Size1 = LdDisp2 - LdDisp1;
+
+ // Build a copy for the point until the current blocking store's
+ // displacement.
+ buildCopies(Size1, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
+ SMMOffset);
+ // Build a copy for the current blocking store.
+ buildCopies(Size2, LoadInst, LdDisp2, StoreInst, StDisp2, LMMOffset + Size1,
+ SMMOffset + Size1);
+ LdDisp1 = LdDisp2 + Size2;
+ StDisp1 = StDisp2 + Size2;
+ LMMOffset += Size1 + Size2;
+ SMMOffset += Size1 + Size2;
+ }
+ unsigned Size3 = (LdDispImm + getRegSizeInBytes(LoadInst)) - LdDisp1;
+ buildCopies(Size3, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
+ LMMOffset);
+}
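+// Illustrative walk-through (not from the original sources): for a 32-byte
+// load at displacement 0 with a single 4-byte blocking store at displacement
+// 8, the loop first copies bytes [0, 8), then the blocked range [8, 12), and
+// the final buildCopies call after the loop covers the remaining [12, 32).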
+
+static bool hasSameBaseOpValue(MachineInstr *LoadInst,
+ MachineInstr *StoreInst) {
+ const MachineOperand &LoadBase = getBaseOperand(LoadInst);
+ const MachineOperand &StoreBase = getBaseOperand(StoreInst);
+ if (LoadBase.isReg() != StoreBase.isReg())
+ return false;
+ if (LoadBase.isReg())
+ return LoadBase.getReg() == StoreBase.getReg();
+ return LoadBase.getIndex() == StoreBase.getIndex();
+}
+
+static bool isBlockingStore(int64_t LoadDispImm, unsigned LoadSize,
+ int64_t StoreDispImm, unsigned StoreSize) {
+ return ((StoreDispImm >= LoadDispImm) &&
+ (StoreDispImm <= LoadDispImm + (LoadSize - StoreSize)));
+}
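+// Example (illustrative only): for a 32-byte load at displacement 0, a 4-byte
+// store at displacement 8 is blocking (8 >= 0 and 8 <= 0 + (32 - 4)), whereas
+// a 4-byte store at displacement 32 is not, since it lies entirely outside the
+// loaded range.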
+
+// Keep track of all stores blocking a load
+static void
+updateBlockingStoresDispSizeMap(DisplacementSizeMap &BlockingStoresDispSizeMap,
+ int64_t DispImm, unsigned Size) {
+ if (BlockingStoresDispSizeMap.count(DispImm)) {
+ // Choose the smallest blocking store starting at this displacement.
+ if (BlockingStoresDispSizeMap[DispImm] > Size)
+ BlockingStoresDispSizeMap[DispImm] = Size;
+
+ } else
+ BlockingStoresDispSizeMap[DispImm] = Size;
+}
+
+// Remove blocking stores contained in each other.
+static void
+removeRedundantBlockingStores(DisplacementSizeMap &BlockingStoresDispSizeMap) {
+ if (BlockingStoresDispSizeMap.size() <= 1)
+ return;
+
+ SmallVector<std::pair<int64_t, unsigned>, 0> DispSizeStack;
+ for (auto DispSizePair : BlockingStoresDispSizeMap) {
+ int64_t CurrDisp = DispSizePair.first;
+ unsigned CurrSize = DispSizePair.second;
+ while (DispSizeStack.size()) {
+ int64_t PrevDisp = DispSizeStack.back().first;
+ unsigned PrevSize = DispSizeStack.back().second;
+ if (CurrDisp + CurrSize > PrevDisp + PrevSize)
+ break;
+ DispSizeStack.pop_back();
+ }
+ DispSizeStack.push_back(DispSizePair);
+ }
+ BlockingStoresDispSizeMap.clear();
+ for (auto Disp : DispSizeStack)
+ BlockingStoresDispSizeMap.insert(Disp);
+}
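+// Illustrative trace (not from the original sources, assuming the map iterates
+// in ascending displacement order): given entries {0 -> 16, 4 -> 4, 16 -> 8},
+// the 4-byte store at displacement 4 is contained in the 16-byte store at
+// displacement 0, so the containing entry is popped and the map is rebuilt as
+// {4 -> 4, 16 -> 8}.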
+
+bool X86AvoidSFBPass::runOnMachineFunction(MachineFunction &MF) {
+ bool Changed = false;
+
+ if (DisableX86AvoidStoreForwardBlocks || skipFunction(MF.getFunction()) ||
+ !MF.getSubtarget<X86Subtarget>().is64Bit())
+ return false;
+
+ MRI = &MF.getRegInfo();
+ assert(MRI->isSSA() && "Expected MIR to be in SSA form");
+ TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+ TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ LLVM_DEBUG(dbgs() << "Start X86AvoidStoreForwardBlocks\n";);
+ // Look for a load followed by a store to XMM/YMM that looks like a memcpy.
+ findPotentiallylBlockedCopies(MF);
+
+ for (auto LoadStoreInstPair : BlockedLoadsStoresPairs) {
+ MachineInstr *LoadInst = LoadStoreInstPair.first;
+ int64_t LdDispImm = getDispOperand(LoadInst).getImm();
+ DisplacementSizeMap BlockingStoresDispSizeMap;
+
+ SmallVector<MachineInstr *, 2> PotentialBlockers =
+ findPotentialBlockers(LoadInst);
+ for (auto *PBInst : PotentialBlockers) {
+ if (!isPotentialBlockingStoreInst(PBInst->getOpcode(),
+ LoadInst->getOpcode()) ||
+ !isRelevantAddressingMode(PBInst) || !PBInst->hasOneMemOperand())
+ continue;
+ int64_t PBstDispImm = getDispOperand(PBInst).getImm();
+ unsigned PBstSize = (*PBInst->memoperands_begin())->getSize();
+ // This check doesn't cover all cases, but it will suffice for now.
+ // TODO: take branch probability into consideration; if the blocking
+ // store is in a rarely reached block, breaking the memcpy could cost
+ // performance.
+ if (hasSameBaseOpValue(LoadInst, PBInst) &&
+ isBlockingStore(LdDispImm, getRegSizeInBytes(LoadInst), PBstDispImm,
+ PBstSize))
+ updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap, PBstDispImm,
+ PBstSize);
+ }
+
+ if (BlockingStoresDispSizeMap.empty())
+ continue;
+
+ // We found a store-forwarding block; break the memcpy's load and store
+ // into smaller copies so that each smaller store that was causing
+ // the block is now copied separately.
+ MachineInstr *StoreInst = LoadStoreInstPair.second;
+ LLVM_DEBUG(dbgs() << "Blocked load and store instructions: \n");
+ LLVM_DEBUG(LoadInst->dump());
+ LLVM_DEBUG(StoreInst->dump());
+ LLVM_DEBUG(dbgs() << "Replaced with:\n");
+ removeRedundantBlockingStores(BlockingStoresDispSizeMap);
+ breakBlockedCopies(LoadInst, StoreInst, BlockingStoresDispSizeMap);
+ updateKillStatus(LoadInst, StoreInst);
+ ForRemoval.push_back(LoadInst);
+ ForRemoval.push_back(StoreInst);
+ }
+ for (auto *RemovedInst : ForRemoval) {
+ RemovedInst->eraseFromParent();
+ }
+ ForRemoval.clear();
+ BlockedLoadsStoresPairs.clear();
+ LLVM_DEBUG(dbgs() << "End X86AvoidStoreForwardBlocks\n";);
+
+ return Changed;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp
new file mode 100644
index 000000000000..0899783d5f60
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp
@@ -0,0 +1,135 @@
+//===----- X86AvoidTrailingCall.cpp - Insert int3 after trailing calls ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The Windows x64 unwinder decodes the instruction stream during unwinding.
+// The unwinder decodes forward from the current PC to detect epilogue code
+// patterns.
+//
+// First, this means that there must be an instruction after every
+// call instruction for the unwinder to decode. LLVM must maintain the invariant
+// that the last instruction of a function or funclet is not a call, or the
+// unwinder may decode into the next function. Similarly, a call may not
+// immediately precede an epilogue code pattern. As of this writing, the
+// SEH_Epilogue pseudo instruction takes care of that.
+//
+// Second, all non-tail call jump targets must be within the *half-open*
+// interval of the bounds of the function. The unwinder distinguishes between
+// internal jump instructions and tail calls in an epilogue sequence by checking
+// the jump target against the function bounds from the .pdata section. This
+// means that the last regular MBB of an LLVM function must not be empty if
+// there are regular jumps targeting it.
+//
+// This pass upholds these invariants by ensuring that blocks at the end of a
+// function or funclet are a) not empty and b) do not end in a CALL instruction.
+//
+// Unwinder implementation for reference:
+// https://github.com/dotnet/coreclr/blob/a9f3fc16483eecfc47fb79c362811d870be02249/src/unwinder/amd64/unwinder_amd64.cpp#L1015
+//
+//===----------------------------------------------------------------------===//
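+// Illustrative sketch (hypothetical code, not from the original comment): a
+// function whose last block ends in a noreturn call such as
+//   callq abort
+// is rewritten by this pass to end in
+//   callq abort
+//   int3
+// so that the return address stays strictly inside the function bounds and the
+// unwinder never decodes into the next function.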
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+#define AVOIDCALL_DESC "X86 avoid trailing call pass"
+#define AVOIDCALL_NAME "x86-avoid-trailing-call"
+
+#define DEBUG_TYPE AVOIDCALL_NAME
+
+using namespace llvm;
+
+namespace {
+class X86AvoidTrailingCallPass : public MachineFunctionPass {
+public:
+ X86AvoidTrailingCallPass() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ static char ID;
+
+private:
+ StringRef getPassName() const override { return AVOIDCALL_DESC; }
+};
+} // end anonymous namespace
+
+char X86AvoidTrailingCallPass::ID = 0;
+
+FunctionPass *llvm::createX86AvoidTrailingCallPass() {
+ return new X86AvoidTrailingCallPass();
+}
+
+INITIALIZE_PASS(X86AvoidTrailingCallPass, AVOIDCALL_NAME, AVOIDCALL_DESC, false, false)
+
+// A real instruction is a non-meta, non-pseudo instruction. Some pseudos
+// expand to nothing, and some expand to code. This logic conservatively assumes
+// they might expand to nothing.
+static bool isRealInstruction(MachineInstr &MI) {
+ return !MI.isPseudo() && !MI.isMetaInstruction();
+}
+
+// Return true if this is a call instruction, but not a tail call.
+static bool isCallInstruction(const MachineInstr &MI) {
+ return MI.isCall() && !MI.isReturn();
+}
+
+bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) {
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ const X86InstrInfo &TII = *STI.getInstrInfo();
+ assert(STI.isTargetWin64() && "pass only runs on Win64");
+
+ // We don't need to worry about any of the invariants described above if there
+ // is no unwind info (CFI).
+ if (!MF.hasWinCFI())
+ return false;
+
+ // FIXME: Perhaps this pass should also replace SEH_Epilogue by inserting nops
+ // before epilogues.
+
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : MF) {
+ // Look for basic blocks that precede funclet entries or are at the end of
+ // the function.
+ MachineBasicBlock *NextMBB = MBB.getNextNode();
+ if (NextMBB && !NextMBB->isEHFuncletEntry())
+ continue;
+
+ // Find the last real instruction in this block.
+ auto LastRealInstr = llvm::find_if(reverse(MBB), isRealInstruction);
+
+ // If the block is empty or the last real instruction is a call instruction,
+ // insert an int3. If there is a call instruction, insert the int3 between
+ // the call and any labels or other meta instructions. If the block is
+ // empty, insert at block end.
+ bool IsEmpty = LastRealInstr == MBB.rend();
+ bool IsCall = !IsEmpty && isCallInstruction(*LastRealInstr);
+ if (IsEmpty || IsCall) {
+ LLVM_DEBUG({
+ if (IsCall) {
+ dbgs() << "inserting int3 after trailing call instruction:\n";
+ LastRealInstr->dump();
+ dbgs() << '\n';
+ } else {
+ dbgs() << "inserting int3 in trailing empty MBB:\n";
+ MBB.dump();
+ }
+ });
+
+ MachineBasicBlock::iterator MBBI = MBB.end();
+ DebugLoc DL;
+ if (IsCall) {
+ MBBI = std::next(LastRealInstr.getReverse());
+ DL = LastRealInstr->getDebugLoc();
+ }
+ BuildMI(MBB, MBBI, DL, TII.get(X86::INT3));
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
new file mode 100644
index 000000000000..fae4e688c8b4
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -0,0 +1,640 @@
+//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that optimizes call sequences on x86.
+// Currently, it converts movs of function parameters onto the stack into
+// pushes. This is beneficial for two main reasons:
+// 1) The push instruction encoding is much smaller than a stack-ptr-based mov.
+// 2) It is possible to push memory arguments directly. So, if the
+// transformation is performed pre-reg-alloc, it can help relieve
+// register pressure.
+//
+//===----------------------------------------------------------------------===//
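+// Illustrative before/after (hypothetical code, not from the original
+// comment): a 32-bit call sequence such as
+//   movl %eax, (%esp)
+//   movl $42, 4(%esp)
+//   calll f
+// can be rewritten by this pass as
+//   pushl $42
+//   pushl %eax
+//   calll f
+// with the pushes emitted in reverse argument order so the resulting stack
+// layout is unchanged.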
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86.h"
+#include "X86FrameLowering.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-cf-opt"
+
+static cl::opt<bool>
+ NoX86CFOpt("no-x86-call-frame-opt",
+ cl::desc("Avoid optimizing x86 call frames for size"),
+ cl::init(false), cl::Hidden);
+
+namespace {
+
+class X86CallFrameOptimization : public MachineFunctionPass {
+public:
+ X86CallFrameOptimization() : MachineFunctionPass(ID) { }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ static char ID;
+
+private:
+ // Information we know about a particular call site
+ struct CallContext {
+ CallContext() : FrameSetup(nullptr), ArgStoreVector(4, nullptr) {}
+
+ // Iterator referring to the frame setup instruction
+ MachineBasicBlock::iterator FrameSetup;
+
+ // Actual call instruction
+ MachineInstr *Call = nullptr;
+
+ // A copy of the stack pointer
+ MachineInstr *SPCopy = nullptr;
+
+ // The total displacement of all passed parameters
+ int64_t ExpectedDist = 0;
+
+ // The sequence of storing instructions used to pass the parameters
+ SmallVector<MachineInstr *, 4> ArgStoreVector;
+
+ // True if this call site has no stack parameters
+ bool NoStackParams = false;
+
+ // True if this call site can use push instructions
+ bool UsePush = false;
+ };
+
+ typedef SmallVector<CallContext, 8> ContextVector;
+
+ bool isLegal(MachineFunction &MF);
+
+ bool isProfitable(MachineFunction &MF, ContextVector &CallSeqMap);
+
+ void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, CallContext &Context);
+
+ void adjustCallSequence(MachineFunction &MF, const CallContext &Context);
+
+ MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
+ Register Reg);
+
+ enum InstClassification { Convert, Skip, Exit };
+
+ InstClassification classifyInstruction(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const X86RegisterInfo &RegInfo,
+ DenseSet<unsigned int> &UsedRegs);
+
+ StringRef getPassName() const override { return "X86 Optimize Call Frame"; }
+
+ const X86InstrInfo *TII = nullptr;
+ const X86FrameLowering *TFL = nullptr;
+ const X86Subtarget *STI = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ unsigned SlotSize = 0;
+ unsigned Log2SlotSize = 0;
+};
+
+} // end anonymous namespace
+char X86CallFrameOptimization::ID = 0;
+INITIALIZE_PASS(X86CallFrameOptimization, DEBUG_TYPE,
+ "X86 Call Frame Optimization", false, false)
+
+// This checks whether the transformation is legal.
+// Also returns false in cases where it's potentially legal, but
+// we don't even want to try.
+bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
+ if (NoX86CFOpt.getValue())
+ return false;
+
+ // We can't encode multiple DW_CFA_GNU_args_size or DW_CFA_def_cfa_offset
+ // in the compact unwind encoding that Darwin uses. So, bail if there
+ // is a danger of that being generated.
+ if (STI->isTargetDarwin() &&
+ (!MF.getLandingPads().empty() ||
+ (MF.getFunction().needsUnwindTableEntry() && !TFL->hasFP(MF))))
+ return false;
+
+ // It is not valid to change the stack pointer outside the prolog/epilog
+ // on 64-bit Windows.
+ if (STI->isTargetWin64())
+ return false;
+
+ // You would expect straight-line code between call-frame setup and
+ // call-frame destroy. You would be wrong. There are circumstances (e.g.
+ // CMOV_GR8 expansion of a select that feeds a function call!) where we can
+ // end up with the setup and the destroy in different basic blocks.
+ // This is bad, and breaks SP adjustment.
+ // So, check that all of the frames in the function are closed inside
+ // the same block, and, for good measure, that there are no nested frames.
+ //
+ // If any call allocates more argument stack memory than the stack
+ // probe size, don't do this optimization. Otherwise, this pass
+ // would need to synthesize additional stack probe calls to allocate
+ // memory for arguments.
+ unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
+ unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+ bool EmitStackProbeCall = STI->getTargetLowering()->hasStackProbeSymbol(MF);
+ unsigned StackProbeSize = STI->getTargetLowering()->getStackProbeSize(MF);
+ for (MachineBasicBlock &BB : MF) {
+ bool InsideFrameSequence = false;
+ for (MachineInstr &MI : BB) {
+ if (MI.getOpcode() == FrameSetupOpcode) {
+ if (TII->getFrameSize(MI) >= StackProbeSize && EmitStackProbeCall)
+ return false;
+ if (InsideFrameSequence)
+ return false;
+ InsideFrameSequence = true;
+ } else if (MI.getOpcode() == FrameDestroyOpcode) {
+ if (!InsideFrameSequence)
+ return false;
+ InsideFrameSequence = false;
+ }
+ }
+
+ if (InsideFrameSequence)
+ return false;
+ }
+
+ return true;
+}
+
+// Check whether this transformation is profitable for a particular
+// function - in terms of code size.
+bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
+ ContextVector &CallSeqVector) {
+ // This transformation is always a win when we do not expect to have
+ // a reserved call frame. Under other circumstances, it may be either
+ // a win or a loss, and requires a heuristic.
+ bool CannotReserveFrame = MF.getFrameInfo().hasVarSizedObjects();
+ if (CannotReserveFrame)
+ return true;
+
+ Align StackAlign = TFL->getStackAlign();
+
+ int64_t Advantage = 0;
+ for (const auto &CC : CallSeqVector) {
+ // Call sites where no parameters are passed on the stack
+ // do not affect the cost, since there needs to be no
+ // stack adjustment.
+ if (CC.NoStackParams)
+ continue;
+
+ if (!CC.UsePush) {
+ // If we don't use pushes for a particular call site,
+ // we pay for not having a reserved call frame with an
+ // additional sub/add esp pair. The cost is ~3 bytes per instruction,
+ // depending on the size of the constant.
+ // TODO: Callee-pop functions should have a smaller penalty, because
+ // an add is needed even with a reserved call frame.
+ Advantage -= 6;
+ } else {
+ // We can use pushes. First, account for the fixed costs.
+ // We'll need an add after the call.
+ Advantage -= 3;
+ // If we have to realign the stack, we'll also need a sub before the call.
+ if (!isAligned(StackAlign, CC.ExpectedDist))
+ Advantage -= 3;
+ // Now, for each push, we save ~3 bytes. For small constants, we actually
+ // save more (up to 5 bytes), but 3 should be a good approximation.
+ Advantage += (CC.ExpectedDist >> Log2SlotSize) * 3;
+ }
+ }
+
+ return Advantage >= 0;
+}
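+// Worked example (illustrative only, assuming 4-byte slots and a 16-byte stack
+// alignment): a single call site passing three stack arguments
+// (ExpectedDist == 12) scores -3 for the post-call add, a further -3 because
+// 12 is not stack-aligned, and +9 for the three pushes, giving a net advantage
+// of +3, so the transformation is considered profitable.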
+
+bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
+ STI = &MF.getSubtarget<X86Subtarget>();
+ TII = STI->getInstrInfo();
+ TFL = STI->getFrameLowering();
+ MRI = &MF.getRegInfo();
+
+ const X86RegisterInfo &RegInfo =
+ *static_cast<const X86RegisterInfo *>(STI->getRegisterInfo());
+ SlotSize = RegInfo.getSlotSize();
+ assert(isPowerOf2_32(SlotSize) && "Expect power of 2 stack slot size");
+ Log2SlotSize = Log2_32(SlotSize);
+
+ if (skipFunction(MF.getFunction()) || !isLegal(MF))
+ return false;
+
+ unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
+
+ bool Changed = false;
+
+ ContextVector CallSeqVector;
+
+ for (auto &MBB : MF)
+ for (auto &MI : MBB)
+ if (MI.getOpcode() == FrameSetupOpcode) {
+ CallContext Context;
+ collectCallInfo(MF, MBB, MI, Context);
+ CallSeqVector.push_back(Context);
+ }
+
+ if (!isProfitable(MF, CallSeqVector))
+ return false;
+
+ for (const auto &CC : CallSeqVector) {
+ if (CC.UsePush) {
+ adjustCallSequence(MF, CC);
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+X86CallFrameOptimization::InstClassification
+X86CallFrameOptimization::classifyInstruction(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const X86RegisterInfo &RegInfo, DenseSet<unsigned int> &UsedRegs) {
+ if (MI == MBB.end())
+ return Exit;
+
+ // The instructions we actually care about are movs onto the stack or special
+ // cases of constant-stores to stack
+ switch (MI->getOpcode()) {
+ case X86::AND16mi8:
+ case X86::AND32mi8:
+ case X86::AND64mi8: {
+ const MachineOperand &ImmOp = MI->getOperand(X86::AddrNumOperands);
+ return ImmOp.getImm() == 0 ? Convert : Exit;
+ }
+ case X86::OR16mi8:
+ case X86::OR32mi8:
+ case X86::OR64mi8: {
+ const MachineOperand &ImmOp = MI->getOperand(X86::AddrNumOperands);
+ return ImmOp.getImm() == -1 ? Convert : Exit;
+ }
+ case X86::MOV32mi:
+ case X86::MOV32mr:
+ case X86::MOV64mi32:
+ case X86::MOV64mr:
+ return Convert;
+ }
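+ // (Illustration, not part of the original sources: `andl $0, 8(%esp)` stores
+ // 0 and `orl $-1, 8(%esp)` stores -1, so either can later be turned into an
+ // immediate push just like a plain MOV of a constant.)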
+
+ // Not all calling conventions have only stack MOVs between the stack
+ // adjust and the call.
+
+ // We want to tolerate other instructions, to cover more cases.
+ // In particular:
+ // a) PCrel calls, where we expect an additional COPY of the basereg.
+ // b) Passing frame-index addresses.
+ // c) Calling conventions that have inreg parameters. These generate
+ // both copies and movs into registers.
+ // To avoid creating lots of special cases, allow any instruction
+ // that does not write into memory, does not def or use the stack
+ // pointer, and does not def any register that was used by a preceding
+ // push.
+ // (Reading from memory is allowed, even if referenced through a
+ // frame index, since these will get adjusted properly in PEI)
+
+ // The reason for the last condition is that the pushes can't replace
+ // the movs in place, because the order must be reversed.
+ // So if we have a MOV32mr that uses EDX, then an instruction that defs
+ // EDX, and then the call, after the transformation the push will use
+ // the modified version of EDX, and not the original one.
+ // Since we are still in SSA form at this point, we only need to
+ // make sure we don't clobber any *physical* registers that were
+ // used by an earlier mov that will become a push.
+
+ if (MI->isCall() || MI->mayStore())
+ return Exit;
+
+ for (const MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg.isPhysical())
+ continue;
+ if (RegInfo.regsOverlap(Reg, RegInfo.getStackRegister()))
+ return Exit;
+ if (MO.isDef()) {
+ for (unsigned int U : UsedRegs)
+ if (RegInfo.regsOverlap(Reg, U))
+ return Exit;
+ }
+ }
+
+ return Skip;
+}
+
+void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ CallContext &Context) {
+ // Check that this particular call sequence is amenable to the
+ // transformation.
+ const X86RegisterInfo &RegInfo =
+ *static_cast<const X86RegisterInfo *>(STI->getRegisterInfo());
+
+ // We expect to enter this at the beginning of a call sequence
+ assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
+ MachineBasicBlock::iterator FrameSetup = I++;
+ Context.FrameSetup = FrameSetup;
+
+ // How much do we adjust the stack? This puts an upper bound on
+ // the number of parameters actually passed on it.
+ unsigned int MaxAdjust = TII->getFrameSize(*FrameSetup) >> Log2SlotSize;
+
+ // A zero adjustment means no stack parameters
+ if (!MaxAdjust) {
+ Context.NoStackParams = true;
+ return;
+ }
+
+ // Skip over DEBUG_VALUE.
+ // For globals in PIC mode, we can have some LEAs here. Skip them as well.
+ // TODO: Extend this to something that covers more cases.
+ while (I->getOpcode() == X86::LEA32r || I->isDebugInstr())
+ ++I;
+
+ Register StackPtr = RegInfo.getStackRegister();
+ auto StackPtrCopyInst = MBB.end();
+ // SelectionDAG (but not FastISel) inserts a copy of ESP into a virtual
+ // register. If it's there, use that virtual register as stack pointer
+ // instead. Also, we need to locate this instruction so that we can later
+ // safely ignore it while doing the conservative processing of the call chain.
+ // The COPY can be located anywhere between the call-frame setup
+ // instruction and its first use. We use the call instruction as a boundary
+ // because it is usually cheaper to check if an instruction is a call than
+ // checking if an instruction uses a register.
+ for (auto J = I; !J->isCall(); ++J)
+ if (J->isCopy() && J->getOperand(0).isReg() && J->getOperand(1).isReg() &&
+ J->getOperand(1).getReg() == StackPtr) {
+ StackPtrCopyInst = J;
+ Context.SPCopy = &*J++;
+ StackPtr = Context.SPCopy->getOperand(0).getReg();
+ break;
+ }
+
+ // Scan the call setup sequence for the pattern we're looking for.
+ // We only handle a simple case - a sequence of store instructions that
+ // push a sequence of stack-slot-aligned values onto the stack, with
+ // no gaps between them.
+ if (MaxAdjust > 4)
+ Context.ArgStoreVector.resize(MaxAdjust, nullptr);
+
+ DenseSet<unsigned int> UsedRegs;
+
+ for (InstClassification Classification = Skip; Classification != Exit; ++I) {
+ // If this is the COPY of the stack pointer, it's ok to ignore.
+ if (I == StackPtrCopyInst)
+ continue;
+ Classification = classifyInstruction(MBB, I, RegInfo, UsedRegs);
+ if (Classification != Convert)
+ continue;
+ // We know the instruction has a supported store opcode.
+ // We only want movs of the form:
+ // mov imm/reg, k(%StackPtr)
+ // If we run into something else, bail.
+ // Note that AddrBaseReg may, counter to its name, not be a register,
+ // but rather a frame index.
+ // TODO: Support the fi case. This should probably work now that we
+ // have the infrastructure to track the stack pointer within a call
+ // sequence.
+ if (!I->getOperand(X86::AddrBaseReg).isReg() ||
+ (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
+ !I->getOperand(X86::AddrScaleAmt).isImm() ||
+ (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
+ (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
+ (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
+ !I->getOperand(X86::AddrDisp).isImm())
+ return;
+
+ int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
+ assert(StackDisp >= 0 &&
+ "Negative stack displacement when passing parameters");
+
+ // We really don't want to consider the unaligned case.
+ if (StackDisp & (SlotSize - 1))
+ return;
+ StackDisp >>= Log2SlotSize;
+
+ assert((size_t)StackDisp < Context.ArgStoreVector.size() &&
+ "Function call has more parameters than the stack is adjusted for.");
+
+ // If the same stack slot is being filled twice, something's fishy.
+ if (Context.ArgStoreVector[StackDisp] != nullptr)
+ return;
+ Context.ArgStoreVector[StackDisp] = &*I;
+
+ for (const MachineOperand &MO : I->uses()) {
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (Reg.isPhysical())
+ UsedRegs.insert(Reg);
+ }
+ }
+
+ --I;
+
+ // We now expect the end of the sequence. If we stopped early,
+ // or reached the end of the block without finding a call, bail.
+ if (I == MBB.end() || !I->isCall())
+ return;
+
+ Context.Call = &*I;
+ if ((++I)->getOpcode() != TII->getCallFrameDestroyOpcode())
+ return;
+
+ // Now, go through the vector, and see that we don't have any gaps,
+ // but only a series of storing instructions.
+ auto MMI = Context.ArgStoreVector.begin(), MME = Context.ArgStoreVector.end();
+ for (; MMI != MME; ++MMI, Context.ExpectedDist += SlotSize)
+ if (*MMI == nullptr)
+ break;
+
+ // If the call had no parameters, do nothing
+ if (MMI == Context.ArgStoreVector.begin())
+ return;
+
+ // We are either at the last parameter, or a gap.
+ // Make sure it's not a gap
+ for (; MMI != MME; ++MMI)
+ if (*MMI != nullptr)
+ return;
+
+ Context.UsePush = true;
+}
+
+void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
+ const CallContext &Context) {
+ // Ok, we can in fact do the transformation for this call.
+ // Do not remove the FrameSetup instruction, but adjust the parameters.
+ // PEI will end up finalizing the handling of this.
+ MachineBasicBlock::iterator FrameSetup = Context.FrameSetup;
+ MachineBasicBlock &MBB = *(FrameSetup->getParent());
+ TII->setFrameAdjustment(*FrameSetup, Context.ExpectedDist);
+
+ DebugLoc DL = FrameSetup->getDebugLoc();
+ bool Is64Bit = STI->is64Bit();
+ // Now, iterate through the vector in reverse order, and replace the store to
+ // stack with pushes. MOVmi/MOVmr doesn't have any defs, so no need to
+ // replace uses.
+ for (int Idx = (Context.ExpectedDist >> Log2SlotSize) - 1; Idx >= 0; --Idx) {
+ MachineBasicBlock::iterator Store = *Context.ArgStoreVector[Idx];
+ const MachineOperand &PushOp = Store->getOperand(X86::AddrNumOperands);
+ MachineBasicBlock::iterator Push = nullptr;
+ unsigned PushOpcode;
+ switch (Store->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected Opcode!");
+ case X86::AND16mi8:
+ case X86::AND32mi8:
+ case X86::AND64mi8:
+ case X86::OR16mi8:
+ case X86::OR32mi8:
+ case X86::OR64mi8:
+ case X86::MOV32mi:
+ case X86::MOV64mi32:
+ PushOpcode = Is64Bit ? X86::PUSH64i32 : X86::PUSHi32;
+ // If the operand is a small (8-bit) immediate, we can use a
+ // PUSH instruction with a shorter encoding.
+ // Note that isImm() may fail even though this is a MOVmi, because
+ // the operand can also be a symbol.
+ if (PushOp.isImm()) {
+ int64_t Val = PushOp.getImm();
+ if (isInt<8>(Val))
+ PushOpcode = Is64Bit ? X86::PUSH64i8 : X86::PUSH32i8;
+ }
+ Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)).add(PushOp);
+ Push->cloneMemRefs(MF, *Store);
+ break;
+ case X86::MOV32mr:
+ case X86::MOV64mr: {
+ Register Reg = PushOp.getReg();
+
+ // If storing a 32-bit vreg on 64-bit targets, extend to a 64-bit vreg
+ // in preparation for the PUSH64. The upper 32 bits can be undef.
+ if (Is64Bit && Store->getOpcode() == X86::MOV32mr) {
+ Register UndefReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ Reg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ BuildMI(MBB, Context.Call, DL, TII->get(X86::IMPLICIT_DEF), UndefReg);
+ BuildMI(MBB, Context.Call, DL, TII->get(X86::INSERT_SUBREG), Reg)
+ .addReg(UndefReg)
+ .add(PushOp)
+ .addImm(X86::sub_32bit);
+ }
+
+ // If PUSHrmm is not slow on this target, try to fold the source of the
+ // push into the instruction.
+ bool SlowPUSHrmm = STI->slowTwoMemOps();
+
+ // Check that this is legal to fold. Right now, we're extremely
+ // conservative about that.
+ MachineInstr *DefMov = nullptr;
+ if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
+ PushOpcode = Is64Bit ? X86::PUSH64rmm : X86::PUSH32rmm;
+ Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode));
+
+ unsigned NumOps = DefMov->getDesc().getNumOperands();
+ for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
+ Push->addOperand(DefMov->getOperand(i));
+ Push->cloneMergedMemRefs(MF, {DefMov, &*Store});
+ DefMov->eraseFromParent();
+ } else {
+ PushOpcode = Is64Bit ? X86::PUSH64r : X86::PUSH32r;
+ Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode))
+ .addReg(Reg)
+ .getInstr();
+ Push->cloneMemRefs(MF, *Store);
+ }
+ break;
+ }
+ }
+
+ // For debugging, when using SP-based CFA, we need to adjust the CFA
+ // offset after each push.
+ // TODO: This is needed only if we require precise CFA.
+ if (!TFL->hasFP(MF))
+ TFL->BuildCFI(
+ MBB, std::next(Push), DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr, SlotSize));
+
+ MBB.erase(Store);
+ }
+
+ // The stack-pointer copy is no longer used in the call sequences.
+ // There should not be any other users, but we can't commit to that, so:
+ if (Context.SPCopy && MRI->use_empty(Context.SPCopy->getOperand(0).getReg()))
+ Context.SPCopy->eraseFromParent();
+
+ // Once we've done this, we need to make sure PEI doesn't assume a reserved
+ // frame.
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ FuncInfo->setHasPushSequences(true);
+}
+
+MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
+ MachineBasicBlock::iterator FrameSetup, Register Reg) {
+ // Do an extremely restricted form of load folding.
+ // ISel will often create patterns like:
+ // movl 4(%edi), %eax
+ // movl 8(%edi), %ecx
+ // movl 12(%edi), %edx
+ // movl %edx, 8(%esp)
+ // movl %ecx, 4(%esp)
+ // movl %eax, (%esp)
+ // call
+ // Get rid of those with prejudice.
+ if (!Reg.isVirtual())
+ return nullptr;
+
+ // Make sure this is the only use of Reg.
+ if (!MRI->hasOneNonDBGUse(Reg))
+ return nullptr;
+
+ MachineInstr &DefMI = *MRI->getVRegDef(Reg);
+
+ // Make sure the def is a MOV from memory.
+ // If the def is in another block, give up.
+ if ((DefMI.getOpcode() != X86::MOV32rm &&
+ DefMI.getOpcode() != X86::MOV64rm) ||
+ DefMI.getParent() != FrameSetup->getParent())
+ return nullptr;
+
+ // Make sure we don't have any instructions between DefMI and the
+ // push that make folding the load illegal.
+ for (MachineBasicBlock::iterator I = DefMI; I != FrameSetup; ++I)
+ if (I->isLoadFoldBarrier())
+ return nullptr;
+
+ return &DefMI;
+}
+
+FunctionPass *llvm::createX86CallFrameOptimization() {
+ return new X86CallFrameOptimization();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.cpp
new file mode 100644
index 000000000000..53f57565d56e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.cpp
@@ -0,0 +1,489 @@
+//===- llvm/lib/Target/X86/X86CallLowering.cpp - Call lowering ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements the lowering of LLVM calls to machine code calls for
+/// GlobalISel.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86CallLowering.h"
+#include "X86CallingConv.h"
+#include "X86ISelLowering.h"
+#include "X86InstrInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/LowLevelTypeImpl.h"
+#include "llvm/Support/MachineValueType.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+
+X86CallLowering::X86CallLowering(const X86TargetLowering &TLI)
+ : CallLowering(&TLI) {}
+
+bool X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
+ SmallVectorImpl<ArgInfo> &SplitArgs,
+ const DataLayout &DL,
+ MachineRegisterInfo &MRI,
+ SplitArgTy PerformArgSplit) const {
+ const X86TargetLowering &TLI = *getTLI<X86TargetLowering>();
+ LLVMContext &Context = OrigArg.Ty->getContext();
+
+ SmallVector<EVT, 4> SplitVTs;
+ SmallVector<uint64_t, 4> Offsets;
+ ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
+ assert(OrigArg.Regs.size() == 1 && "Can't handle multiple regs yet");
+
+ if (OrigArg.Ty->isVoidTy())
+ return true;
+
+ EVT VT = SplitVTs[0];
+ unsigned NumParts = TLI.getNumRegisters(Context, VT);
+
+ if (NumParts == 1) {
+ // Replace the original type (pointer -> GPR).
+ SplitArgs.emplace_back(OrigArg.Regs[0], VT.getTypeForEVT(Context),
+ OrigArg.Flags, OrigArg.IsFixed);
+ return true;
+ }
+
+ SmallVector<Register, 8> SplitRegs;
+
+ EVT PartVT = TLI.getRegisterType(Context, VT);
+ Type *PartTy = PartVT.getTypeForEVT(Context);
+
+ for (unsigned i = 0; i < NumParts; ++i) {
+ ArgInfo Info =
+ ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*PartTy, DL)),
+ PartTy, OrigArg.Flags};
+ SplitArgs.push_back(Info);
+ SplitRegs.push_back(Info.Regs[0]);
+ }
+
+ PerformArgSplit(SplitRegs);
+ return true;
+}
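+// Illustrative example (not from the original sources): on x86-64 an i128
+// argument is legalized as two i64 parts, so splitToValueTypes produces two
+// ArgInfo entries backed by fresh 64-bit virtual registers and hands both
+// registers to PerformArgSplit (e.g. to build an unmerge of the original
+// value).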
+
+namespace {
+
+struct X86OutgoingValueHandler : public CallLowering::OutgoingValueHandler {
+ X86OutgoingValueHandler(MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI, MachineInstrBuilder &MIB,
+ CCAssignFn *AssignFn)
+ : OutgoingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
+ DL(MIRBuilder.getMF().getDataLayout()),
+ STI(MIRBuilder.getMF().getSubtarget<X86Subtarget>()) {}
+
+ Register getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO) override {
+ LLT p0 = LLT::pointer(0, DL.getPointerSizeInBits(0));
+ LLT SType = LLT::scalar(DL.getPointerSizeInBits(0));
+ auto SPReg =
+ MIRBuilder.buildCopy(p0, STI.getRegisterInfo()->getStackRegister());
+
+ auto OffsetReg = MIRBuilder.buildConstant(SType, Offset);
+
+ auto AddrReg = MIRBuilder.buildPtrAdd(p0, SPReg, OffsetReg);
+
+ MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
+ return AddrReg.getReg(0);
+ }
+
+ void assignValueToReg(Register ValVReg, Register PhysReg,
+ CCValAssign &VA) override {
+ MIB.addUse(PhysReg, RegState::Implicit);
+
+ Register ExtReg;
+ // If we are copying the value to a physical register with the
+ // size larger than the size of the value itself - build AnyExt
+ // to the size of the register first and only then do the copy.
+ // The example of that would be copying from s32 to xmm0, for which
+ // case ValVT == LocVT == MVT::f32. If LocSize and ValSize are not equal
+ // we expect normal extendRegister mechanism to work.
+ unsigned PhysRegSize =
+ MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI);
+ unsigned ValSize = VA.getValVT().getSizeInBits();
+ unsigned LocSize = VA.getLocVT().getSizeInBits();
+ if (PhysRegSize > ValSize && LocSize == ValSize) {
+ assert((PhysRegSize == 128 || PhysRegSize == 80) &&
+ "Expected a 128-bit or 80-bit physical register");
+ ExtReg =
+ MIRBuilder.buildAnyExt(LLT::scalar(PhysRegSize), ValVReg).getReg(0);
+ } else
+ ExtReg = extendRegister(ValVReg, VA);
+
+ MIRBuilder.buildCopy(PhysReg, ExtReg);
+ }
+
+ void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
+ MachinePointerInfo &MPO, CCValAssign &VA) override {
+ MachineFunction &MF = MIRBuilder.getMF();
+ Register ExtReg = extendRegister(ValVReg, VA);
+
+ auto *MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore,
+ VA.getLocVT().getStoreSize(),
+ inferAlignFromPtrInfo(MF, MPO));
+ MIRBuilder.buildStore(ExtReg, Addr, *MMO);
+ }
+
+ bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
+ CCState &State) override {
+ bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
+ StackSize = State.getNextStackOffset();
+
+ static const MCPhysReg XMMArgRegs[] = {X86::XMM0, X86::XMM1, X86::XMM2,
+ X86::XMM3, X86::XMM4, X86::XMM5,
+ X86::XMM6, X86::XMM7};
+ if (!Info.IsFixed)
+ NumXMMRegs = State.getFirstUnallocated(XMMArgRegs);
+
+ return Res;
+ }
+
+ uint64_t getStackSize() { return StackSize; }
+ uint64_t getNumXmmRegs() { return NumXMMRegs; }
+
+protected:
+ MachineInstrBuilder &MIB;
+ uint64_t StackSize = 0;
+ const DataLayout &DL;
+ const X86Subtarget &STI;
+ unsigned NumXMMRegs = 0;
+};
+
+} // end anonymous namespace
+
+bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+ const Value *Val, ArrayRef<Register> VRegs,
+ FunctionLoweringInfo &FLI) const {
+ assert(((Val && !VRegs.empty()) || (!Val && VRegs.empty())) &&
+ "Return value without a vreg");
+ auto MIB = MIRBuilder.buildInstrNoInsert(X86::RET).addImm(0);
+
+ if (!VRegs.empty()) {
+ MachineFunction &MF = MIRBuilder.getMF();
+ const Function &F = MF.getFunction();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const DataLayout &DL = MF.getDataLayout();
+ LLVMContext &Ctx = Val->getType()->getContext();
+ const X86TargetLowering &TLI = *getTLI<X86TargetLowering>();
+
+ SmallVector<EVT, 4> SplitEVTs;
+ ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs);
+ assert(VRegs.size() == SplitEVTs.size() &&
+ "For each split Type there should be exactly one VReg.");
+
+ SmallVector<ArgInfo, 8> SplitArgs;
+ for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
+ ArgInfo CurArgInfo = ArgInfo{VRegs[i], SplitEVTs[i].getTypeForEVT(Ctx)};
+ setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F);
+ if (!splitToValueTypes(CurArgInfo, SplitArgs, DL, MRI,
+ [&](ArrayRef<Register> Regs) {
+ MIRBuilder.buildUnmerge(Regs, VRegs[i]);
+ }))
+ return false;
+ }
+
+ X86OutgoingValueHandler Handler(MIRBuilder, MRI, MIB, RetCC_X86);
+ if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+ return false;
+ }
+
+ MIRBuilder.insertInstr(MIB);
+ return true;
+}
+
+namespace {
+
+struct X86IncomingValueHandler : public CallLowering::IncomingValueHandler {
+ X86IncomingValueHandler(MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI, CCAssignFn *AssignFn)
+ : IncomingValueHandler(MIRBuilder, MRI, AssignFn),
+ DL(MIRBuilder.getMF().getDataLayout()) {}
+
+ Register getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO) override {
+ auto &MFI = MIRBuilder.getMF().getFrameInfo();
+ int FI = MFI.CreateFixedObject(Size, Offset, true);
+ MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
+
+ return MIRBuilder
+ .buildFrameIndex(LLT::pointer(0, DL.getPointerSizeInBits(0)), FI)
+ .getReg(0);
+ }
+
+ void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
+ MachinePointerInfo &MPO, CCValAssign &VA) override {
+ MachineFunction &MF = MIRBuilder.getMF();
+ auto *MMO = MF.getMachineMemOperand(
+ MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
+ inferAlignFromPtrInfo(MF, MPO));
+ MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
+ }
+
+ void assignValueToReg(Register ValVReg, Register PhysReg,
+ CCValAssign &VA) override {
+ markPhysRegUsed(PhysReg);
+
+ switch (VA.getLocInfo()) {
+ default: {
+ // If we are copying the value from a physical register with the
+ // size larger than the size of the value itself - build the copy
+ // of the phys reg first and then build the truncation of that copy.
+ // The example of that would be copying from xmm0 to s32, for which
+ // case ValVT == LocVT == MVT::f32. If LocSize and ValSize are not equal
+ // we expect this to be handled in SExt/ZExt/AExt case.
+ unsigned PhysRegSize =
+ MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI);
+ unsigned ValSize = VA.getValVT().getSizeInBits();
+ unsigned LocSize = VA.getLocVT().getSizeInBits();
+ if (PhysRegSize > ValSize && LocSize == ValSize) {
+ auto Copy = MIRBuilder.buildCopy(LLT::scalar(PhysRegSize), PhysReg);
+ MIRBuilder.buildTrunc(ValVReg, Copy);
+ return;
+ }
+
+ MIRBuilder.buildCopy(ValVReg, PhysReg);
+ break;
+ }
+ case CCValAssign::LocInfo::SExt:
+ case CCValAssign::LocInfo::ZExt:
+ case CCValAssign::LocInfo::AExt: {
+ auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
+ MIRBuilder.buildTrunc(ValVReg, Copy);
+ break;
+ }
+ }
+ }
+
+ /// How the physical register gets marked varies between formal
+ /// parameters (it's a basic-block live-in), and a call instruction
+ /// (it's an implicit-def of the call instruction).
+ virtual void markPhysRegUsed(unsigned PhysReg) = 0;
+
+protected:
+ const DataLayout &DL;
+};
+
+struct FormalArgHandler : public X86IncomingValueHandler {
+ FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ CCAssignFn *AssignFn)
+ : X86IncomingValueHandler(MIRBuilder, MRI, AssignFn) {}
+
+ void markPhysRegUsed(unsigned PhysReg) override {
+ MIRBuilder.getMRI()->addLiveIn(PhysReg);
+ MIRBuilder.getMBB().addLiveIn(PhysReg);
+ }
+};
+
+struct CallReturnHandler : public X86IncomingValueHandler {
+ CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ CCAssignFn *AssignFn, MachineInstrBuilder &MIB)
+ : X86IncomingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+
+ void markPhysRegUsed(unsigned PhysReg) override {
+ MIB.addDef(PhysReg, RegState::Implicit);
+ }
+
+protected:
+ MachineInstrBuilder &MIB;
+};
+
+} // end anonymous namespace
+
+bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+ const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs,
+ FunctionLoweringInfo &FLI) const {
+ if (F.arg_empty())
+ return true;
+
+ // TODO: handle variadic function
+ if (F.isVarArg())
+ return false;
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ auto DL = MF.getDataLayout();
+
+ SmallVector<ArgInfo, 8> SplitArgs;
+ unsigned Idx = 0;
+ for (const auto &Arg : F.args()) {
+ // TODO: handle not simple cases.
+ if (Arg.hasAttribute(Attribute::ByVal) ||
+ Arg.hasAttribute(Attribute::InReg) ||
+ Arg.hasAttribute(Attribute::StructRet) ||
+ Arg.hasAttribute(Attribute::SwiftSelf) ||
+ Arg.hasAttribute(Attribute::SwiftError) ||
+ Arg.hasAttribute(Attribute::Nest) || VRegs[Idx].size() > 1)
+ return false;
+
+ ArgInfo OrigArg(VRegs[Idx], Arg.getType());
+ setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F);
+ if (!splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
+ [&](ArrayRef<Register> Regs) {
+ MIRBuilder.buildMerge(VRegs[Idx][0], Regs);
+ }))
+ return false;
+ Idx++;
+ }
+
+ MachineBasicBlock &MBB = MIRBuilder.getMBB();
+ if (!MBB.empty())
+ MIRBuilder.setInstr(*MBB.begin());
+
+ FormalArgHandler Handler(MIRBuilder, MRI, CC_X86);
+ if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+ return false;
+
+ // Move back to the end of the basic block.
+ MIRBuilder.setMBB(MBB);
+
+ return true;
+}
+
+bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ const Function &F = MF.getFunction();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ const TargetInstrInfo &TII = *STI.getInstrInfo();
+ const X86RegisterInfo *TRI = STI.getRegisterInfo();
+
+ // Handle only the Linux C and X86_64_SysV calling conventions for now.
+ if (!STI.isTargetLinux() || !(Info.CallConv == CallingConv::C ||
+ Info.CallConv == CallingConv::X86_64_SysV))
+ return false;
+
+ unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+ auto CallSeqStart = MIRBuilder.buildInstr(AdjStackDown);
+
+ // Create a temporarily-floating call instruction so we can add the implicit
+ // uses of arg registers.
+ bool Is64Bit = STI.is64Bit();
+ unsigned CallOpc = Info.Callee.isReg()
+ ? (Is64Bit ? X86::CALL64r : X86::CALL32r)
+ : (Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32);
+
+ auto MIB = MIRBuilder.buildInstrNoInsert(CallOpc)
+ .add(Info.Callee)
+ .addRegMask(TRI->getCallPreservedMask(MF, Info.CallConv));
+
+ SmallVector<ArgInfo, 8> SplitArgs;
+ for (const auto &OrigArg : Info.OrigArgs) {
+
+ // TODO: handle not simple cases.
+ if (OrigArg.Flags[0].isByVal())
+ return false;
+
+ if (OrigArg.Regs.size() > 1)
+ return false;
+
+ if (!splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
+ [&](ArrayRef<Register> Regs) {
+ MIRBuilder.buildUnmerge(Regs, OrigArg.Regs[0]);
+ }))
+ return false;
+ }
+ // Do the actual argument marshalling.
+ X86OutgoingValueHandler Handler(MIRBuilder, MRI, MIB, CC_X86);
+ if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+ return false;
+
+ bool IsFixed = Info.OrigArgs.empty() ? true : Info.OrigArgs.back().IsFixed;
+ if (STI.is64Bit() && !IsFixed && !STI.isCallingConvWin64(Info.CallConv)) {
+ // From AMD64 ABI document:
+ // For calls that may call functions that use varargs or stdargs
+ // (prototype-less calls or calls to functions containing ellipsis (...) in
+ // the declaration), %al is used as a hidden argument to specify the number
+ // of SSE registers used. The contents of %al do not need to match exactly
+ // the number of registers, but must be an upper bound on the number of SSE
+ // registers used, in the range 0 - 8 inclusive.
+
+ MIRBuilder.buildInstr(X86::MOV8ri)
+ .addDef(X86::AL)
+ .addImm(Handler.getNumXmmRegs());
+ MIB.addUse(X86::AL, RegState::Implicit);
+ }
+
+ // Now we can add the actual call instruction to the correct basic block.
+ MIRBuilder.insertInstr(MIB);
+
+ // If Callee is a reg, since it is used by a target specific
+ // instruction, it must have a register class matching the
+ // constraint of that instruction.
+ if (Info.Callee.isReg())
+ MIB->getOperand(0).setReg(constrainOperandRegClass(
+ MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
+ *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee,
+ 0));
+
+ // Finally, we can copy the returned value back into its virtual register. In
+ // symmetry with the arguments, the physical register must be an
+ // implicit-define of the call instruction.
+
+ if (!Info.OrigRet.Ty->isVoidTy()) {
+ if (Info.OrigRet.Regs.size() > 1)
+ return false;
+
+ SplitArgs.clear();
+ SmallVector<Register, 8> NewRegs;
+
+ if (!splitToValueTypes(Info.OrigRet, SplitArgs, DL, MRI,
+ [&](ArrayRef<Register> Regs) {
+ NewRegs.assign(Regs.begin(), Regs.end());
+ }))
+ return false;
+
+ CallReturnHandler Handler(MIRBuilder, MRI, RetCC_X86, MIB);
+ if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+ return false;
+
+ if (!NewRegs.empty())
+ MIRBuilder.buildMerge(Info.OrigRet.Regs[0], NewRegs);
+ }
+
+ CallSeqStart.addImm(Handler.getStackSize())
+ .addImm(0 /* see getFrameTotalSize */)
+ .addImm(0 /* see getFrameAdjustment */);
+
+ unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+ MIRBuilder.buildInstr(AdjStackUp)
+ .addImm(Handler.getStackSize())
+ .addImm(0 /* NumBytesForCalleeToPop */);
+
+ return true;
+}
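
The %al rule referenced in the ABI comment above can be illustrated with a small standalone sketch. This is not LLVM code: alForVarArgCall and NumXmmRegsUsed are hypothetical names, with NumXmmRegsUsed standing in for the count reported by the outgoing-value handler's getNumXmmRegs().

#include <cassert>
#include <cstdio>

// Any value that is >= the number of XMM registers actually used and <= 8 is
// a legal %al for a variadic SysV call; the lowering above uses the exact
// count, which is the tightest legal choice.
unsigned alForVarArgCall(unsigned NumXmmRegsUsed) {
  assert(NumXmmRegsUsed <= 8 && "SysV passes at most 8 FP args in XMM regs");
  return NumXmmRegsUsed;
}

int main() {
  // e.g. printf("%f %f", a, b): the two doubles travel in XMM0/XMM1, so %al = 2.
  std::printf("al = %u\n", alForVarArgCall(2));
}
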
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.h b/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.h
new file mode 100644
index 000000000000..9390122d7647
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.h
@@ -0,0 +1,54 @@
+//===- llvm/lib/Target/X86/X86CallLowering.h - Call lowering ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file describes how to lower LLVM calls to machine code calls.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86CALLLOWERING_H
+#define LLVM_LIB_TARGET_X86_X86CALLLOWERING_H
+
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include <functional>
+
+namespace llvm {
+
+template <typename T> class ArrayRef;
+class DataLayout;
+class MachineRegisterInfo;
+class X86TargetLowering;
+
+class X86CallLowering : public CallLowering {
+public:
+ X86CallLowering(const X86TargetLowering &TLI);
+
+ bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
+ ArrayRef<Register> VRegs,
+ FunctionLoweringInfo &FLI) const override;
+
+ bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs,
+ FunctionLoweringInfo &FLI) const override;
+
+ bool lowerCall(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info) const override;
+
+private:
+ /// A function of this type is used to perform the value-splitting action.
+ using SplitArgTy = std::function<void(ArrayRef<Register>)>;
+
+ bool splitToValueTypes(const ArgInfo &OrigArgInfo,
+ SmallVectorImpl<ArgInfo> &SplitArgs,
+ const DataLayout &DL, MachineRegisterInfo &MRI,
+ SplitArgTy SplitArg) const;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_X86CALLLOWERING_H
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.cpp
new file mode 100644
index 000000000000..c80a5d5bb332
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.cpp
@@ -0,0 +1,344 @@
+//=== X86CallingConv.cpp - X86 Custom Calling Convention Impl -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of custom routines for the X86
+// Calling Convention that aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86CallingConv.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+
+using namespace llvm;
+
+/// When the regcall calling convention is used on a 32-bit arch, special
+/// treatment is required for 64-bit masks.
+/// The value should be assigned to two GPRs.
+/// \return true if registers were allocated and false otherwise.
+static bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ // List of GPR registers that are available to store values in regcall
+ // calling convention.
+ static const MCPhysReg RegList[] = {X86::EAX, X86::ECX, X86::EDX, X86::EDI,
+ X86::ESI};
+
+ // The vector will save all the available registers for allocation.
+ SmallVector<unsigned, 5> AvailableRegs;
+
+ // Search for the available registers.
+ for (auto Reg : RegList) {
+ if (!State.isAllocated(Reg))
+ AvailableRegs.push_back(Reg);
+ }
+
+ const size_t RequiredGprsUponSplit = 2;
+ if (AvailableRegs.size() < RequiredGprsUponSplit)
+ return false; // Not enough free registers - continue the search.
+
+ // Allocating the available registers.
+ for (unsigned I = 0; I < RequiredGprsUponSplit; I++) {
+
+ // Mark the register as allocated.
+ unsigned Reg = State.AllocateReg(AvailableRegs[I]);
+
+ // Since we previously made sure that 2 registers are available
+ // we expect that a real register number will be returned.
+ assert(Reg && "Expecting a register will be available");
+
+ // Assign the value to the allocated register
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ }
+
+ // Successful in allocating registers - stop scanning next rules.
+ return true;
+}
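
The register-selection rule above -- scan EAX, ECX, EDX, EDI, ESI in order and claim the first two free registers for the two halves of the 64-bit mask -- can be modelled without any LLVM types. splitMaskToTwoGprs and its Allocated parameter are hypothetical names standing in for the CCState bookkeeping.

#include <cstdio>
#include <vector>

// Standalone model of CC_X86_32_RegCall_Assign2Regs' selection rule.
// Allocated[i] mirrors State.isAllocated() for {EAX, ECX, EDX, EDI, ESI}[i].
static bool splitMaskToTwoGprs(const std::vector<bool> &Allocated,
                               const char *&First, const char *&Second) {
  static const char *RegList[] = {"EAX", "ECX", "EDX", "EDI", "ESI"};
  std::vector<const char *> Free;
  for (unsigned I = 0; I < 5; ++I)
    if (!Allocated[I])
      Free.push_back(RegList[I]);
  if (Free.size() < 2)
    return false; // not enough free GPRs; let the following rules handle it
  First = Free[0];
  Second = Free[1];
  return true;
}

int main() {
  const char *First = nullptr, *Second = nullptr;
  // EAX is already taken by an earlier i32 argument.
  if (splitMaskToTwoGprs({true, false, false, false, false}, First, Second))
    std::printf("__mmask64 split across %s and %s\n", First, Second);
}
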
+
+static ArrayRef<MCPhysReg> CC_X86_VectorCallGetSSEs(const MVT &ValVT) {
+ if (ValVT.is512BitVector()) {
+ static const MCPhysReg RegListZMM[] = {X86::ZMM0, X86::ZMM1, X86::ZMM2,
+ X86::ZMM3, X86::ZMM4, X86::ZMM5};
+ return makeArrayRef(std::begin(RegListZMM), std::end(RegListZMM));
+ }
+
+ if (ValVT.is256BitVector()) {
+ static const MCPhysReg RegListYMM[] = {X86::YMM0, X86::YMM1, X86::YMM2,
+ X86::YMM3, X86::YMM4, X86::YMM5};
+ return makeArrayRef(std::begin(RegListYMM), std::end(RegListYMM));
+ }
+
+ static const MCPhysReg RegListXMM[] = {X86::XMM0, X86::XMM1, X86::XMM2,
+ X86::XMM3, X86::XMM4, X86::XMM5};
+ return makeArrayRef(std::begin(RegListXMM), std::end(RegListXMM));
+}
+
+static ArrayRef<MCPhysReg> CC_X86_64_VectorCallGetGPRs() {
+ static const MCPhysReg RegListGPR[] = {X86::RCX, X86::RDX, X86::R8, X86::R9};
+ return makeArrayRef(std::begin(RegListGPR), std::end(RegListGPR));
+}
+
+static bool CC_X86_VectorCallAssignRegister(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+
+ ArrayRef<MCPhysReg> RegList = CC_X86_VectorCallGetSSEs(ValVT);
+ bool Is64bit = static_cast<const X86Subtarget &>(
+ State.getMachineFunction().getSubtarget())
+ .is64Bit();
+
+ for (auto Reg : RegList) {
+ // If the register is not marked as allocated - assign to it.
+ if (!State.isAllocated(Reg)) {
+ unsigned AssignedReg = State.AllocateReg(Reg);
+ assert(AssignedReg == Reg && "Expecting a valid register allocation");
+ State.addLoc(
+ CCValAssign::getReg(ValNo, ValVT, AssignedReg, LocVT, LocInfo));
+ return true;
+ }
+ // If the register is marked as shadow allocated - assign to it.
+ if (Is64bit && State.IsShadowAllocatedReg(Reg)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true;
+ }
+ }
+
+ llvm_unreachable("Clang should ensure that hva marked vectors will have "
+ "an available register.");
+ return false;
+}
+
+/// Vectorcall calling convention has special handling for vector types or
+/// HVA for 64 bit arch.
+/// For HVAs shadow registers might be allocated on the first pass
+/// and actual XMM registers are allocated on the second pass.
+/// For vector types, actual XMM registers are allocated on the first pass.
+/// \return true if registers were allocated and false otherwise.
+static bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ // On the second pass, go through the HVAs only.
+ if (ArgFlags.isSecArgPass()) {
+ if (ArgFlags.isHva())
+ return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,
+ ArgFlags, State);
+ return true;
+ }
+
+ // Process only vector types as defined by vectorcall spec:
+ // "A vector type is either a floating-point type, for example,
+ // a float or double, or a SIMD vector type, for example, __m128 or __m256".
+ if (!(ValVT.isFloatingPoint() ||
+ (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) {
+ // If R9 was already assigned it means that we are after the fourth element
+ // and because this is not an HVA / Vector type, we need to allocate
+ // shadow XMM register.
+ if (State.isAllocated(X86::R9)) {
+ // Assign shadow XMM register.
+ (void)State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT));
+ }
+
+ return false;
+ }
+
+ if (!ArgFlags.isHva() || ArgFlags.isHvaStart()) {
+ // Assign shadow GPR register.
+ (void)State.AllocateReg(CC_X86_64_VectorCallGetGPRs());
+
+ // Assign XMM register - (shadow for HVA and non-shadow for non HVA).
+ if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) {
+ // In Vectorcall Calling convention, additional shadow stack can be
+ // created on top of the basic 32 bytes of win64.
+ // It can happen if the fifth or sixth argument is vector type or HVA.
+ // At that case for each argument a shadow stack of 8 bytes is allocated.
+ const TargetRegisterInfo *TRI =
+ State.getMachineFunction().getSubtarget().getRegisterInfo();
+ if (TRI->regsOverlap(Reg, X86::XMM4) ||
+ TRI->regsOverlap(Reg, X86::XMM5))
+ State.AllocateStack(8, Align(8));
+
+ if (!ArgFlags.isHva()) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true; // Allocated a register - Stop the search.
+ }
+ }
+ }
+
+ // If this is an HVA - Stop the search,
+ // otherwise continue the search.
+ return ArgFlags.isHva();
+}
+
+/// Vectorcall calling convention has special handling for vector types or
+/// HVA for 32 bit arch.
+/// For HVAs actual XMM registers are allocated on the second pass.
+/// For vector types, actual XMM registers are allocated on the first pass.
+/// \return true if registers were allocated and false otherwise.
+static bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ // On the second pass, go through the HVAs only.
+ if (ArgFlags.isSecArgPass()) {
+ if (ArgFlags.isHva())
+ return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,
+ ArgFlags, State);
+ return true;
+ }
+
+ // Process only vector types as defined by vectorcall spec:
+ // "A vector type is either a floating point type, for example,
+ // a float or double, or an SIMD vector type, for example, __m128 or __m256".
+ if (!(ValVT.isFloatingPoint() ||
+ (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) {
+ return false;
+ }
+
+ if (ArgFlags.isHva())
+ return true; // If this is an HVA - Stop the search.
+
+ // Assign XMM register.
+ if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true;
+ }
+
+ // In case we did not find an available XMM register for a vector -
+ // pass it indirectly.
+ // It is similar to CCPassIndirect, with the addition of inreg.
+ if (!ValVT.isFloatingPoint()) {
+ LocVT = MVT::i32;
+ LocInfo = CCValAssign::Indirect;
+ ArgFlags.setInReg();
+ }
+
+ return false; // No register was assigned - Continue the search.
+}
+
+static bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
+ CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
+ CCState &) {
+ llvm_unreachable("The AnyReg calling convention is only supported by the "
+ "stackmap and patchpoint intrinsics.");
+ // Gracefully fall back to the X86 C calling convention on Release builds.
+ return false;
+}
+
+static bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ // This is similar to CCAssignToReg<[EAX, EDX, ECX]>, but makes sure
+ // not to split i64 and double between a register and stack
+ static const MCPhysReg RegList[] = {X86::EAX, X86::EDX, X86::ECX};
+ static const unsigned NumRegs = sizeof(RegList) / sizeof(RegList[0]);
+
+ SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+ // If this is the first part of a double/i64/i128, or if we're already
+ // in the middle of a split, add to the pending list. If this is not
+ // the end of the split, return; otherwise go on to process the pending
+ // list.
+ if (ArgFlags.isSplit() || !PendingMembers.empty()) {
+ PendingMembers.push_back(
+ CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+ if (!ArgFlags.isSplitEnd())
+ return true;
+ }
+
+ // If there are no pending members, we are not in the middle of a split,
+ // so do the usual inreg stuff.
+ if (PendingMembers.empty()) {
+ if (unsigned Reg = State.AllocateReg(RegList)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true;
+ }
+ return false;
+ }
+
+ assert(ArgFlags.isSplitEnd());
+
+ // We now have the entire original argument in PendingMembers, so decide
+ // whether to use registers or the stack.
+ // Per the MCU ABI:
+ // a) To use registers, we need to have enough of them free to contain
+ // the entire argument.
+ // b) We never want to use more than 2 registers for a single argument.
+
+ unsigned FirstFree = State.getFirstUnallocated(RegList);
+ bool UseRegs = PendingMembers.size() <= std::min(2U, NumRegs - FirstFree);
+
+ for (auto &It : PendingMembers) {
+ if (UseRegs)
+ It.convertToReg(State.AllocateReg(RegList[FirstFree++]));
+ else
+ It.convertToMem(State.AllocateStack(4, Align(4)));
+ State.addLoc(It);
+ }
+
+ PendingMembers.clear();
+
+ return true;
+}
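
The register-versus-stack decision above reduces to a one-line predicate, sketched here as standalone code (no LLVM types; mcuUseRegsForSplitArg, NumPieces and FirstFree are hypothetical names): a split argument of N 32-bit pieces goes into registers only if it fits entirely in at most two of the registers still free in {EAX, EDX, ECX}; otherwise every piece goes to the stack.

#include <algorithm>
#include <cstdio>

// Standalone model of the MCU ABI rule implemented by CC_X86_32_MCUInReg.
// NumPieces: how many 32-bit pieces the original argument was split into.
// FirstFree: index of the first unallocated register in {EAX, EDX, ECX}.
static bool mcuUseRegsForSplitArg(unsigned NumPieces, unsigned FirstFree) {
  const unsigned NumRegs = 3; // EAX, EDX, ECX
  unsigned FreeRegs = NumRegs - std::min(FirstFree, NumRegs);
  return NumPieces <= std::min(2u, FreeRegs); // never more than 2 registers
}

int main() {
  std::printf("%d\n", mcuUseRegsForSplitArg(2, 0)); // i64, all regs free: EAX:EDX
  std::printf("%d\n", mcuUseRegsForSplitArg(2, 2)); // i64, only ECX left: stack
  std::printf("%d\n", mcuUseRegsForSplitArg(4, 0)); // i128 never fits in registers
}
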
+
+/// X86 interrupt handlers can only take one or two stack arguments, but if
+/// there are two arguments, they are in the opposite order from the standard
+/// convention. Therefore, we have to look at the argument count up front before
+/// allocating stack for each argument.
+static bool CC_X86_Intr(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ const MachineFunction &MF = State.getMachineFunction();
+ size_t ArgCount = State.getMachineFunction().getFunction().arg_size();
+ bool Is64Bit = static_cast<const X86Subtarget &>(MF.getSubtarget()).is64Bit();
+ unsigned SlotSize = Is64Bit ? 8 : 4;
+ unsigned Offset;
+ if (ArgCount == 1 && ValNo == 0) {
+ // If we have one argument, the argument is five stack slots big, at fixed
+ // offset zero.
+ Offset = State.AllocateStack(5 * SlotSize, Align(4));
+ } else if (ArgCount == 2 && ValNo == 0) {
+ // If we have two arguments, the stack slot is *after* the error code
+ // argument. Pretend it doesn't consume stack space, and account for it when
+ // we assign the second argument.
+ Offset = SlotSize;
+ } else if (ArgCount == 2 && ValNo == 1) {
+ // If this is the second of two arguments, it must be the error code. It
+ // appears first on the stack, and is then followed by the five slot
+ // interrupt struct.
+ Offset = 0;
+ (void)State.AllocateStack(6 * SlotSize, Align(4));
+ } else {
+ report_fatal_error("unsupported x86 interrupt prototype");
+ }
+
+ // FIXME: This should be accounted for in
+ // X86FrameLowering::getFrameIndexReference, not here.
+ if (Is64Bit && ArgCount == 2)
+ Offset += SlotSize;
+
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ return true;
+}
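
The offsets this rule produces can be seen from a standalone model (not LLVM code; intrArgOffset is a hypothetical helper): with one argument the interrupt frame is at offset 0; with two arguments the error code ends up at the lowest offset and the frame argument one slot above it, plus the extra slot adjustment noted in the FIXME on 64-bit targets.

#include <cstdio>

// Standalone model of CC_X86_Intr's offset computation. Other argument
// combinations are rejected outright by the real hook.
static unsigned intrArgOffset(unsigned ArgCount, unsigned ValNo, bool Is64Bit) {
  unsigned SlotSize = Is64Bit ? 8 : 4;
  unsigned Offset = 0;
  if (ArgCount == 1 && ValNo == 0)
    Offset = 0;        // the interrupt frame itself, five slots wide
  else if (ArgCount == 2 && ValNo == 0)
    Offset = SlotSize; // frame argument sits just above the error code
  else if (ArgCount == 2 && ValNo == 1)
    Offset = 0;        // error code is pushed last, so it has the lowest offset
  if (Is64Bit && ArgCount == 2)
    Offset += SlotSize; // mirrors the FIXME adjustment above
  return Offset;
}

int main() {
  std::printf("64-bit, 2 args: frame at %u, error code at %u\n",
              intrArgOffset(2, 0, true), intrArgOffset(2, 1, true));
  std::printf("32-bit, 1 arg:  frame at %u\n", intrArgOffset(1, 0, false));
}
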
+
+static bool CC_X86_64_Pointer(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ if (LocVT != MVT::i64) {
+ LocVT = MVT::i64;
+ LocInfo = CCValAssign::ZExt;
+ }
+ return false;
+}
+
+// Provides entry points of CC_X86 and RetCC_X86.
+#include "X86GenCallingConv.inc"
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.h b/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.h
new file mode 100644
index 000000000000..191e0fa619b2
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.h
@@ -0,0 +1,33 @@
+//=== X86CallingConv.h - X86 Custom Calling Convention Routines -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the X86 Calling Convention that
+// aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86CALLINGCONV_H
+#define LLVM_LIB_TARGET_X86_X86CALLINGCONV_H
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+
+namespace llvm {
+
+bool RetCC_X86(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State);
+
+bool CC_X86(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+} // end namespace llvm
+
+#endif
+
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.td
new file mode 100644
index 000000000000..3735fab818ce
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.td
@@ -0,0 +1,1175 @@
+//===-- X86CallingConv.td - Calling Conventions X86 32/64 --*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the X86-32 and X86-64
+// architectures.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("static_cast<const X86Subtarget&>"
+ "(State.getMachineFunction().getSubtarget()).", F),
+ A>;
+
+/// CCIfNotSubtarget - Match if the current subtarget doesn't have a feature F.
+class CCIfNotSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("!static_cast<const X86Subtarget&>"
+ "(State.getMachineFunction().getSubtarget()).", F),
+ A>;
+
+// Register classes for RegCall
+class RC_X86_RegCall {
+ list<Register> GPR_8 = [];
+ list<Register> GPR_16 = [];
+ list<Register> GPR_32 = [];
+ list<Register> GPR_64 = [];
+ list<Register> FP_CALL = [FP0];
+ list<Register> FP_RET = [FP0, FP1];
+ list<Register> XMM = [];
+ list<Register> YMM = [];
+ list<Register> ZMM = [];
+}
+
+// RegCall register classes for 32 bits
+def RC_X86_32_RegCall : RC_X86_RegCall {
+ let GPR_8 = [AL, CL, DL, DIL, SIL];
+ let GPR_16 = [AX, CX, DX, DI, SI];
+ let GPR_32 = [EAX, ECX, EDX, EDI, ESI];
+ let GPR_64 = [RAX]; ///< Not actually used, but AssignToReg can't handle []
+ ///< \todo Fix AssignToReg to enable empty lists
+ let XMM = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7];
+ let YMM = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7];
+ let ZMM = [ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7];
+}
+
+class RC_X86_64_RegCall : RC_X86_RegCall {
+ let XMM = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15];
+ let YMM = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
+ YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15];
+ let ZMM = [ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7,
+ ZMM8, ZMM9, ZMM10, ZMM11, ZMM12, ZMM13, ZMM14, ZMM15];
+}
+
+def RC_X86_64_RegCall_Win : RC_X86_64_RegCall {
+ let GPR_8 = [AL, CL, DL, DIL, SIL, R8B, R9B, R10B, R11B, R12B, R14B, R15B];
+ let GPR_16 = [AX, CX, DX, DI, SI, R8W, R9W, R10W, R11W, R12W, R14W, R15W];
+ let GPR_32 = [EAX, ECX, EDX, EDI, ESI, R8D, R9D, R10D, R11D, R12D, R14D, R15D];
+ let GPR_64 = [RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11, R12, R14, R15];
+}
+
+def RC_X86_64_RegCall_SysV : RC_X86_64_RegCall {
+ let GPR_8 = [AL, CL, DL, DIL, SIL, R8B, R9B, R12B, R13B, R14B, R15B];
+ let GPR_16 = [AX, CX, DX, DI, SI, R8W, R9W, R12W, R13W, R14W, R15W];
+ let GPR_32 = [EAX, ECX, EDX, EDI, ESI, R8D, R9D, R12D, R13D, R14D, R15D];
+ let GPR_64 = [RAX, RCX, RDX, RDI, RSI, R8, R9, R12, R13, R14, R15];
+}
+
+// Intel regcall calling convention (shared by the 32-bit and 64-bit variants).
+multiclass X86_RegCall_base<RC_X86_RegCall RC> {
+def CC_#NAME : CallingConv<[
+ // Handles byval parameters.
+ CCIfSubtarget<"is64Bit()", CCIfByVal<CCPassByVal<8, 8>>>,
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
+
+ // Promote v8i1/v16i1/v32i1 arguments to i32.
+ CCIfType<[v8i1, v16i1, v32i1], CCPromoteToType<i32>>,
+
+ // bool, char, int, enum, long, pointer --> GPR
+ CCIfType<[i32], CCAssignToReg<RC.GPR_32>>,
+
+ // long long, __int64 --> GPR
+ CCIfType<[i64], CCAssignToReg<RC.GPR_64>>,
+
+ // __mmask64 (v64i1) --> GPR64 (for x64) or 2 x GPR32 (for IA32)
+ CCIfType<[v64i1], CCPromoteToType<i64>>,
+ CCIfSubtarget<"is64Bit()", CCIfType<[i64],
+ CCAssignToReg<RC.GPR_64>>>,
+ CCIfSubtarget<"is32Bit()", CCIfType<[i64],
+ CCCustom<"CC_X86_32_RegCall_Assign2Regs">>>,
+
+ // float, double, float128 --> XMM
+ // In the case of SSE disabled --> save to stack
+ CCIfType<[f32, f64, f128],
+ CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>,
+
+ // long double --> FP
+ CCIfType<[f80], CCAssignToReg<RC.FP_CALL>>,
+
+ // __m128, __m128i, __m128d --> XMM
+ // In the case of SSE disabled --> save to stack
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>,
+
+ // __m256, __m256i, __m256d --> YMM
+ // In the case of SSE disabled --> save to stack
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfSubtarget<"hasAVX()", CCAssignToReg<RC.YMM>>>,
+
+ // __m512, __m512i, __m512d --> ZMM
+ // In the case of SSE disabled --> save to stack
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCIfSubtarget<"hasAVX512()",CCAssignToReg<RC.ZMM>>>,
+
+ // If no register was found -> assign to stack
+
+ // In 64 bit, assign 64/32 bit values to 8 byte stack
+ CCIfSubtarget<"is64Bit()", CCIfType<[i32, i64, f32, f64],
+ CCAssignToStack<8, 8>>>,
+
+ // In 32 bit, assign 64/32 bit values to 8/4 byte stack
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+ CCIfType<[i64, f64], CCAssignToStack<8, 4>>,
+
+ // MMX types get an 8-byte stack slot; the alignment depends on the target.
+ CCIfSubtarget<"is64Bit()", CCIfType<[x86mmx], CCAssignToStack<8, 8>>>,
+ CCIfType<[x86mmx], CCAssignToStack<8, 4>>,
+
+ // f80 and f128 get stack slots whose size and alignment depend
+ // on the subtarget.
+ CCIfType<[f80, f128], CCAssignToStack<0, 0>>,
+
+ // Vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToStack<16, 16>>,
+
+ // 256-bit vectors get 32-byte stack slots that are 32-byte aligned.
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCAssignToStack<32, 32>>,
+
+ // 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCAssignToStack<64, 64>>
+]>;
+
+def RetCC_#NAME : CallingConv<[
+ // Promote i1, v1i1, v8i1 arguments to i8.
+ CCIfType<[i1, v1i1, v8i1], CCPromoteToType<i8>>,
+
+ // Promote v16i1 arguments to i16.
+ CCIfType<[v16i1], CCPromoteToType<i16>>,
+
+ // Promote v32i1 arguments to i32.
+ CCIfType<[v32i1], CCPromoteToType<i32>>,
+
+ // bool, char, int, enum, long, pointer --> GPR
+ CCIfType<[i8], CCAssignToReg<RC.GPR_8>>,
+ CCIfType<[i16], CCAssignToReg<RC.GPR_16>>,
+ CCIfType<[i32], CCAssignToReg<RC.GPR_32>>,
+
+ // long long, __int64 --> GPR
+ CCIfType<[i64], CCAssignToReg<RC.GPR_64>>,
+
+ // __mmask64 (v64i1) --> GPR64 (for x64) or 2 x GPR32 (for IA32)
+ CCIfType<[v64i1], CCPromoteToType<i64>>,
+ CCIfSubtarget<"is64Bit()", CCIfType<[i64],
+ CCAssignToReg<RC.GPR_64>>>,
+ CCIfSubtarget<"is32Bit()", CCIfType<[i64],
+ CCCustom<"CC_X86_32_RegCall_Assign2Regs">>>,
+
+ // long double --> FP
+ CCIfType<[f80], CCAssignToReg<RC.FP_RET>>,
+
+ // float, double, float128 --> XMM
+ CCIfType<[f32, f64, f128],
+ CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>,
+
+ // __m128, __m128i, __m128d --> XMM
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>,
+
+ // __m256, __m256i, __m256d --> YMM
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfSubtarget<"hasAVX()", CCAssignToReg<RC.YMM>>>,
+
+ // __m512, __m512i, __m512d --> ZMM
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCIfSubtarget<"hasAVX512()", CCAssignToReg<RC.ZMM>>>
+]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Return-value conventions common to all X86 CC's.
+def RetCC_X86Common : CallingConv<[
+ // Scalar values are returned in AX first, then DX. For i8, the ABI
+ // requires the values to be in AL and AH, however this code uses AL and DL
+ // instead. This is because using AH for the second register conflicts with
+ // the way LLVM does multiple return values -- a return of {i16,i8} would end
+ // up in AX and AH, which overlap. Front-ends wishing to conform to the ABI
+ // for functions that return two i8 values are currently expected to pack the
+ // values into an i16 (which uses AX, and thus AL:AH).
+ //
+ // For code that doesn't care about the ABI, we allow returning more than two
+ // integer values in registers.
+ CCIfType<[v1i1], CCPromoteToType<i8>>,
+ CCIfType<[i1], CCPromoteToType<i8>>,
+ CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>,
+ CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
+ CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>,
+ CCIfType<[i64], CCAssignToReg<[RAX, RDX, RCX]>>,
+
+ // Boolean vectors of AVX-512 are returned in SIMD registers.
+ // The call from AVX to AVX-512 function should work,
+ // since the boolean types in AVX/AVX2 are promoted by default.
+ CCIfType<[v2i1], CCPromoteToType<v2i64>>,
+ CCIfType<[v4i1], CCPromoteToType<v4i32>>,
+ CCIfType<[v8i1], CCPromoteToType<v8i16>>,
+ CCIfType<[v16i1], CCPromoteToType<v16i8>>,
+ CCIfType<[v32i1], CCPromoteToType<v32i8>>,
+ CCIfType<[v64i1], CCPromoteToType<v64i8>>,
+
+ // Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3
+ // can only be used by ABI non-compliant code. If the target doesn't have XMM
+ // registers, it won't have vector types.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+ // 256-bit vectors are returned in YMM0 and YMM1, when they fit. YMM2 and YMM3
+ // can only be used by ABI non-compliant code. This vector type is only
+ // supported while using the AVX target feature.
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+ // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3
+ // can only be used by ABI non-compliant code. This vector type is only
+ // supported while using the AVX-512 target feature.
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
+
+ // MMX vector types are always returned in MM0. If the target doesn't have
+ // MM0, it doesn't support these vector types.
+ CCIfType<[x86mmx], CCAssignToReg<[MM0]>>,
+
+ // Long double types are always returned in FP0 (even with SSE),
+ // except on Win64.
+ CCIfNotSubtarget<"isTargetWin64()", CCIfType<[f80], CCAssignToReg<[FP0, FP1]>>>
+]>;
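
To stay ABI-conformant, the comment above asks front-ends that return two i8 values to pack them into a single i16 so both travel in AX (AL holding the low byte and AH the high byte) instead of relying on the AL/DL placement. A purely illustrative sketch of that packing (packTwoI8 is a hypothetical helper, not part of LLVM):

#include <cstdint>
#include <cstdio>

// Pack two 8-bit return values into one 16-bit value; when returned as i16,
// the low byte lands in AL and the high byte in AH.
static std::uint16_t packTwoI8(std::uint8_t Lo, std::uint8_t Hi) {
  return static_cast<std::uint16_t>(Lo | (Hi << 8));
}

int main() {
  std::printf("0x%04x\n", packTwoI8(0x34, 0x12)); // 0x1234
}
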
+
+// X86-32 C return-value convention.
+def RetCC_X86_32_C : CallingConv<[
+ // The X86-32 calling convention returns FP values in FP0, unless they are
+ // marked "inreg" (used here, somewhat oddly, to select the sse-regparm
+ // calling convention), in which case they are returned in XMM0. Otherwise
+ // it is the same as the common X86 calling convention.
+ CCIfInReg<CCIfSubtarget<"hasSSE2()",
+ CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+ CCIfType<[f32,f64], CCAssignToReg<[FP0, FP1]>>,
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-32 FastCC return-value convention.
+def RetCC_X86_32_Fast : CallingConv<[
+ // The X86-32 fastcc returns 1, 2, or 3 FP values in XMM0-2 if the target has
+ // SSE2.
+ // This can happen when a float, 2 x float, or 3 x float vector is split by
+ // target lowering, and is returned in 1-3 sse regs.
+ CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+ CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+
+ // For integers, ECX can be used as an extra return register
+ CCIfType<[i8], CCAssignToReg<[AL, DL, CL]>>,
+ CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
+ CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>,
+
+ // Otherwise, it is the same as the common X86 calling convention.
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// Intel_OCL_BI return-value convention.
+def RetCC_Intel_OCL_BI : CallingConv<[
+ // Vector types are returned in XMM0, XMM1, XMM2 and XMM3.
+ CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+ // 256-bit FP vectors
+ // No more than 4 registers
+ CCIfType<[v8f32, v4f64, v8i32, v4i64],
+ CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+ // 512-bit FP vectors
+ CCIfType<[v16f32, v8f64, v16i32, v8i64],
+ CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
+
+ // i32, i64 in the standard way
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-32 HiPE return-value convention.
+def RetCC_X86_32_HiPE : CallingConv<[
+ // Promote all types to i32
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Return: HP, P, VAL1, VAL2
+ CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>>
+]>;
+
+// X86-32 Vectorcall return-value convention.
+def RetCC_X86_32_VectorCall : CallingConv<[
+ // Floating-point types are returned in XMM0, XMM1, XMM2 and XMM3.
+ CCIfType<[f32, f64, f128],
+ CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+ // Return integers in the standard way.
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-64 C return-value convention.
+def RetCC_X86_64_C : CallingConv<[
+ // The X86-64 calling convention always returns FP values in XMM0.
+ CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>,
+ CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>,
+ CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>,
+
+ // MMX vector types are always returned in XMM0.
+ CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>,
+
+ // Pointers are always returned in full 64-bit registers.
+ CCIfPtr<CCCustom<"CC_X86_64_Pointer">>,
+
+ CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>,
+
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-Win64 C return-value convention.
+def RetCC_X86_Win64_C : CallingConv<[
+ // The X86-Win64 calling convention always returns __m64 values in RAX.
+ CCIfType<[x86mmx], CCBitConvertToType<i64>>,
+
+ // GCC returns FP values in RAX on Win64.
+ CCIfType<[f32], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i32>>>,
+ CCIfType<[f64], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i64>>>,
+
+ // Otherwise, everything is the same as 'normal' X86-64 C CC.
+ CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+// X86-64 vectorcall return-value convention.
+def RetCC_X86_64_Vectorcall : CallingConv<[
+ // Vectorcall calling convention always returns FP values in XMMs.
+ CCIfType<[f32, f64, f128],
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+ // Otherwise, everything is the same as Windows X86-64 C CC.
+ CCDelegateTo<RetCC_X86_Win64_C>
+]>;
+
+// X86-64 HiPE return-value convention.
+def RetCC_X86_64_HiPE : CallingConv<[
+ // Promote all types to i64
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Return: HP, P, VAL1, VAL2
+ CCIfType<[i64], CCAssignToReg<[R15, RBP, RAX, RDX]>>
+]>;
+
+// X86-64 WebKit_JS return-value convention.
+def RetCC_X86_64_WebKit_JS : CallingConv<[
+ // Promote all types to i64
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Return: RAX
+ CCIfType<[i64], CCAssignToReg<[RAX]>>
+]>;
+
+def RetCC_X86_64_Swift : CallingConv<[
+
+ CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>,
+
+ // For integers, ECX, R8D can be used as extra return registers.
+ CCIfType<[v1i1], CCPromoteToType<i8>>,
+ CCIfType<[i1], CCPromoteToType<i8>>,
+ CCIfType<[i8] , CCAssignToReg<[AL, DL, CL, R8B]>>,
+ CCIfType<[i16], CCAssignToReg<[AX, DX, CX, R8W]>>,
+ CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX, R8D]>>,
+ CCIfType<[i64], CCAssignToReg<[RAX, RDX, RCX, R8]>>,
+
+ // XMM0, XMM1, XMM2 and XMM3 can be used to return FP values.
+ CCIfType<[f32], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+ CCIfType<[f64], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+ CCIfType<[f128], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+ // MMX vector types are returned in XMM0, XMM1, XMM2 and XMM3.
+ CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-64 AnyReg return-value convention. No explicit register is specified for
+// the return-value. The register allocator is allowed and expected to choose
+// any free register.
+//
+// This calling convention is currently only supported by the stackmap and
+// patchpoint intrinsics. All other uses will result in an assert on Debug
+// builds. On Release builds we fall back to the X86 C calling convention.
+def RetCC_X86_64_AnyReg : CallingConv<[
+ CCCustom<"CC_X86_AnyReg_Error">
+]>;
+
+// X86-64 HHVM return-value convention.
+def RetCC_X86_64_HHVM: CallingConv<[
+ // Promote all types to i64
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Return: may use any GP register except RSP and R12.
+ CCIfType<[i64], CCAssignToReg<[RBX, RBP, RDI, RSI, RDX, RCX, R8, R9,
+ RAX, R10, R11, R13, R14, R15]>>
+]>;
+
+
+defm X86_32_RegCall :
+ X86_RegCall_base<RC_X86_32_RegCall>;
+defm X86_Win64_RegCall :
+ X86_RegCall_base<RC_X86_64_RegCall_Win>;
+defm X86_SysV64_RegCall :
+ X86_RegCall_base<RC_X86_64_RegCall_SysV>;
+
+// This is the root return-value convention for the X86-32 backend.
+def RetCC_X86_32 : CallingConv<[
+ // If FastCC, use RetCC_X86_32_Fast.
+ CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
+ CCIfCC<"CallingConv::Tail", CCDelegateTo<RetCC_X86_32_Fast>>,
+ // CFGuard_Check never returns a value so does not need a RetCC.
+ // If HiPE, use RetCC_X86_32_HiPE.
+ CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>,
+ CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>,
+ CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<RetCC_X86_32_RegCall>>,
+
+ // Otherwise, use RetCC_X86_32_C.
+ CCDelegateTo<RetCC_X86_32_C>
+]>;
+
+// This is the root return-value convention for the X86-64 backend.
+def RetCC_X86_64 : CallingConv<[
+ // HiPE uses RetCC_X86_64_HiPE
+ CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_64_HiPE>>,
+
+ // Handle JavaScript calls.
+ CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<RetCC_X86_64_WebKit_JS>>,
+ CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_X86_64_AnyReg>>,
+
+ // Handle Swift calls.
+ CCIfCC<"CallingConv::Swift", CCDelegateTo<RetCC_X86_64_Swift>>,
+
+ // Handle explicit CC selection
+ CCIfCC<"CallingConv::Win64", CCDelegateTo<RetCC_X86_Win64_C>>,
+ CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>,
+
+ // Handle Vectorcall CC
+ CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_64_Vectorcall>>,
+
+ // Handle HHVM calls.
+ CCIfCC<"CallingConv::HHVM", CCDelegateTo<RetCC_X86_64_HHVM>>,
+
+ CCIfCC<"CallingConv::X86_RegCall",
+ CCIfSubtarget<"isTargetWin64()",
+ CCDelegateTo<RetCC_X86_Win64_RegCall>>>,
+ CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<RetCC_X86_SysV64_RegCall>>,
+
+ // Mingw64 and native Win64 use Win64 CC
+ CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>,
+
+ // Otherwise, drop to normal X86-64 CC
+ CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+// This is the return-value convention used for the entire X86 backend.
+let Entry = 1 in
+def RetCC_X86 : CallingConv<[
+
+ // Check if this is the Intel OpenCL built-ins calling convention
+ CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>,
+
+ CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
+ CCDelegateTo<RetCC_X86_32>
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86-64 Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+def CC_X86_64_C : CallingConv<[
+ // Handles byval parameters.
+ CCIfByVal<CCPassByVal<8, 8>>,
+
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in R10.
+ CCIfNest<CCIfSubtarget<"isTarget64BitILP32()", CCAssignToReg<[R10D]>>>,
+ CCIfNest<CCAssignToReg<[R10]>>,
+
+ // Pass SwiftSelf in a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i64], CCAssignToReg<[R13]>>>,
+
+ // A SwiftError is passed in R12.
+ CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>,
+
+ // For Swift Calling Convention, pass sret in %rax.
+ CCIfCC<"CallingConv::Swift",
+ CCIfSRet<CCIfType<[i64], CCAssignToReg<[RAX]>>>>,
+
+ // Pointers are always passed in full 64-bit registers.
+ CCIfPtr<CCCustom<"CC_X86_64_Pointer">>,
+
+ // The first 6 integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>,
+ CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
+
+ // The first 8 MMX vector arguments are passed in XMM registers on Darwin.
+ CCIfType<[x86mmx],
+ CCIfSubtarget<"isTargetDarwin()",
+ CCIfSubtarget<"hasSSE2()",
+ CCPromoteToType<v2i64>>>>,
+
+ // Boolean vectors of AVX-512 are passed in SIMD registers.
+ // The call from AVX to AVX-512 function should work,
+ // since the boolean types in AVX/AVX2 are promoted by default.
+ CCIfType<[v2i1], CCPromoteToType<v2i64>>,
+ CCIfType<[v4i1], CCPromoteToType<v4i32>>,
+ CCIfType<[v8i1], CCPromoteToType<v8i16>>,
+ CCIfType<[v16i1], CCPromoteToType<v16i8>>,
+ CCIfType<[v32i1], CCPromoteToType<v32i8>>,
+ CCIfType<[v64i1], CCPromoteToType<v64i8>>,
+
+ // The first 8 FP/Vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfSubtarget<"hasSSE1()",
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>,
+
+ // The first 8 256-bit vector arguments are passed in YMM registers, unless
+ // this is a vararg function.
+ // FIXME: This isn't precisely correct; the x86-64 ABI document says that
+ // fixed arguments to vararg functions are supposed to be passed in
+ // registers. Actually modeling that would be a lot of work, though.
+ CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfSubtarget<"hasAVX()",
+ CCAssignToReg<[YMM0, YMM1, YMM2, YMM3,
+ YMM4, YMM5, YMM6, YMM7]>>>>,
+
+ // The first 8 512-bit vector arguments are passed in ZMM registers.
+ CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCIfSubtarget<"hasAVX512()",
+ CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7]>>>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+ // Long doubles get stack slots whose size and alignment depends on the
+ // subtarget.
+ CCIfType<[f80, f128], CCAssignToStack<0, 0>>,
+
+ // Vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+ // 256-bit vectors get 32-byte stack slots that are 32-byte aligned.
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCAssignToStack<32, 32>>,
+
+ // 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCAssignToStack<64, 64>>
+]>;
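
Ignoring the byval, nest, Swift, vector and pointer special cases, the integer/FP rules above amount to the small standalone model below (assignSysVArg and SysVState are hypothetical names, not LLVM API): integer arguments consume RDI, RSI, RDX, RCX, R8, R9 in order, FP arguments consume XMM0..XMM7, and anything left over takes an 8-byte, 8-byte-aligned stack slot.

#include <cstdio>
#include <string>

// Standalone model of CC_X86_64_C's core integer/FP assignment.
struct SysVState {
  unsigned NextGpr = 0, NextXmm = 0, StackOffset = 0;
};

static std::string assignSysVArg(SysVState &S, bool IsFP) {
  static const char *Gprs[] = {"RDI", "RSI", "RDX", "RCX", "R8", "R9"};
  static const char *Xmms[] = {"XMM0", "XMM1", "XMM2", "XMM3",
                               "XMM4", "XMM5", "XMM6", "XMM7"};
  if (!IsFP && S.NextGpr < 6)
    return Gprs[S.NextGpr++];
  if (IsFP && S.NextXmm < 8)
    return Xmms[S.NextXmm++];
  std::string Loc = "stack+" + std::to_string(S.StackOffset);
  S.StackOffset += 8; // 8-byte slot, 8-byte aligned
  return Loc;
}

int main() {
  SysVState S;
  // e.g. void f(long, double, long, long, long, long, long, double);
  const bool IsFP[] = {false, true, false, false, false, false, false, true};
  for (bool FP : IsFP)
    std::printf("%s ", assignSysVArg(S, FP).c_str());
  std::printf("\n"); // RDI XMM0 RSI RDX RCX R8 R9 XMM1
}
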
+
+// Calling convention for X86-64 HHVM.
+def CC_X86_64_HHVM : CallingConv<[
+ // Use all/any GP registers for args, except RSP.
+ CCIfType<[i64], CCAssignToReg<[RBX, R12, RBP, R15,
+ RDI, RSI, RDX, RCX, R8, R9,
+ RAX, R10, R11, R13, R14]>>
+]>;
+
+// Calling convention for helper functions in HHVM.
+def CC_X86_64_HHVM_C : CallingConv<[
+ // Pass the first argument in RBP.
+ CCIfType<[i64], CCAssignToReg<[RBP]>>,
+
+ // Otherwise it's the same as the regular C calling convention.
+ CCDelegateTo<CC_X86_64_C>
+]>;
+
+// Calling convention used on Win64
+def CC_X86_Win64_C : CallingConv<[
+ // FIXME: Handle varargs.
+
+ // Byval aggregates are passed by pointer
+ CCIfByVal<CCPassIndirect<i64>>,
+
+ // Promote i1/v1i1 arguments to i8.
+ CCIfType<[i1, v1i1], CCPromoteToType<i8>>,
+
+ // The 'nest' parameter, if any, is passed in R10.
+ CCIfNest<CCAssignToReg<[R10]>>,
+
+ // A SwiftError is passed in R12.
+ CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>,
+
+ // The 'CFGuardTarget' parameter, if any, is passed in RAX.
+ CCIfCFGuardTarget<CCAssignToReg<[RAX]>>,
+
+ // 128 bit vectors are passed by pointer
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>,
+
+ // 256 bit vectors are passed by pointer
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>,
+
+ // 512 bit vectors are passed by pointer
+ CCIfType<[v64i8, v32i16, v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>,
+
+ // Long doubles are passed by pointer
+ CCIfType<[f80], CCPassIndirect<i64>>,
+
+ // The first 4 MMX vector arguments are passed in GPRs.
+ CCIfType<[x86mmx], CCBitConvertToType<i64>>,
+
+ // If SSE was disabled, pass FP values of 64 bits or smaller as integers in
+ // GPRs or on the stack.
+ CCIfType<[f32], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i32>>>,
+ CCIfType<[f64], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i64>>>,
+
+ // The first 4 FP/Vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64],
+ CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3],
+ [RCX , RDX , R8 , R9 ]>>,
+
+ // The first 4 integer arguments are passed in integer registers.
+ CCIfType<[i8 ], CCAssignToRegWithShadow<[CL , DL , R8B , R9B ],
+ [XMM0, XMM1, XMM2, XMM3]>>,
+ CCIfType<[i16], CCAssignToRegWithShadow<[CX , DX , R8W , R9W ],
+ [XMM0, XMM1, XMM2, XMM3]>>,
+ CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ],
+ [XMM0, XMM1, XMM2, XMM3]>>,
+
+ // Do not pass the sret argument in RCX; the Win64 thiscall calling
+ // convention requires "this" to be passed in RCX.
+ CCIfCC<"CallingConv::X86_ThisCall",
+ CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[RDX , R8 , R9 ],
+ [XMM1, XMM2, XMM3]>>>>,
+
+ CCIfType<[i64], CCAssignToRegWithShadow<[RCX , RDX , R8 , R9 ],
+ [XMM0, XMM1, XMM2, XMM3]>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i8, i16, i32, i64, f32, f64], CCAssignToStack<8, 8>>
+]>;
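
The CCAssignToRegWithShadow pairs above encode a purely positional rule, modelled here without LLVM types (win64ArgLocation is a hypothetical helper): argument slot i of the first four owns both {RCX, RDX, R8, R9}[i] and {XMM0..XMM3}[i], whichever of the two the argument does not use is shadowed (burned), and everything past the fourth slot takes an 8-byte stack slot above the caller's 32-byte shadow area. This covers only the simple integer/FP cases; vectors and f80 are passed indirectly as shown above.

#include <cstdio>

// Standalone model of the Win64 slot/shadow rule for simple scalars.
static const char *win64ArgLocation(unsigned ArgIndex, bool IsFP) {
  static const char *Gprs[] = {"RCX", "RDX", "R8", "R9"};
  static const char *Xmms[] = {"XMM0", "XMM1", "XMM2", "XMM3"};
  if (ArgIndex >= 4)
    return "stack"; // 8-byte slot above the 32-byte shadow area
  return IsFP ? Xmms[ArgIndex] : Gprs[ArgIndex];
}

int main() {
  // e.g. void f(int, double, int, double, int);
  const bool IsFP[] = {false, true, false, true, false};
  for (unsigned I = 0; I < 5; ++I)
    std::printf("arg%u -> %s\n", I, win64ArgLocation(I, IsFP[I]));
  // arg0 -> RCX, arg1 -> XMM1, arg2 -> R8, arg3 -> XMM3, arg4 -> stack
}
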
+
+def CC_X86_Win64_VectorCall : CallingConv<[
+ CCCustom<"CC_X86_64_VectorCall">,
+
+ // Delegate to fastcall to handle integer types.
+ CCDelegateTo<CC_X86_Win64_C>
+]>;
+
+
+def CC_X86_64_GHC : CallingConv<[
+ // Promote i8/i16/i32 arguments to i64.
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim
+ CCIfType<[i64],
+ CCAssignToReg<[R13, RBP, R12, RBX, R14, RSI, RDI, R8, R9, R15]>>,
+
+ // Pass in STG registers: F1, F2, F3, F4, D1, D2
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfSubtarget<"hasSSE1()",
+ CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>>,
+ // AVX
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfSubtarget<"hasAVX()",
+ CCAssignToReg<[YMM1, YMM2, YMM3, YMM4, YMM5, YMM6]>>>,
+ // AVX-512
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCIfSubtarget<"hasAVX512()",
+ CCAssignToReg<[ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6]>>>
+]>;
+
+def CC_X86_64_HiPE : CallingConv<[
+ // Promote i8/i16/i32 arguments to i64.
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2, ARG3
+ CCIfType<[i64], CCAssignToReg<[R15, RBP, RSI, RDX, RCX, R8]>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
+]>;
+
+def CC_X86_64_WebKit_JS : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Only the first integer argument is passed in a register.
+ CCIfType<[i32], CCAssignToReg<[EAX]>>,
+ CCIfType<[i64], CCAssignToReg<[RAX]>>,
+
+ // The remaining integer arguments are passed on the stack. 32-bit integer and
+ // floating-point arguments are aligned to 4 bytes and stored in 4-byte slots.
+ // 64-bit integer and floating-point arguments are aligned to 8 bytes and stored
+ // in 8-byte stack slots.
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+ CCIfType<[i64, f64], CCAssignToStack<8, 8>>
+]>;
+
+// No explicit register is specified for the AnyReg calling convention. The
+// register allocator may assign the arguments to any free register.
+//
+// This calling convention is currently only supported by the stackmap and
+// patchpoint intrinsics. All other uses will result in an assert on Debug
+// builds. On Release builds we fall back to the X86 C calling convention.
+def CC_X86_64_AnyReg : CallingConv<[
+ CCCustom<"CC_X86_AnyReg_Error">
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86 C Calling Convention
+//===----------------------------------------------------------------------===//
+
+/// CC_X86_32_Vector_Common - In all X86-32 calling conventions, extra vector
+/// values are spilled on the stack.
+def CC_X86_32_Vector_Common : CallingConv<[
+ // Other SSE vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+ // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned.
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCAssignToStack<32, 32>>,
+
+ // 512-bit AVX-512 vectors get 64-byte stack slots that are 64-byte aligned.
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCAssignToStack<64, 64>>
+]>;
+
+// CC_X86_32_Vector_Standard - The first 3 vector arguments are passed in
+// vector registers
+def CC_X86_32_Vector_Standard : CallingConv<[
+ // SSE vector arguments are passed in XMM registers.
+ CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2]>>>,
+
+ // AVX 256-bit vector arguments are passed in YMM registers.
+ CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfSubtarget<"hasAVX()",
+ CCAssignToReg<[YMM0, YMM1, YMM2]>>>>,
+
+ // AVX 512-bit vector arguments are passed in ZMM registers.
+ CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCAssignToReg<[ZMM0, ZMM1, ZMM2]>>>,
+
+ CCDelegateTo<CC_X86_32_Vector_Common>
+]>;
+
+// CC_X86_32_Vector_Darwin - The first 4 vector arguments are passed in
+// vector registers.
+def CC_X86_32_Vector_Darwin : CallingConv<[
+ // SSE vector arguments are passed in XMM registers.
+ CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>,
+
+ // AVX 256-bit vector arguments are passed in YMM registers.
+ CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfSubtarget<"hasAVX()",
+ CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>,
+
+ // AVX 512-bit vector arguments are passed in ZMM registers.
+ CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>>,
+
+ CCDelegateTo<CC_X86_32_Vector_Common>
+]>;
+
+/// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP
+/// values are spilled on the stack.
+def CC_X86_32_Common : CallingConv<[
+ // Handles byval/preallocated parameters.
+ CCIfByVal<CCPassByVal<4, 4>>,
+ CCIfPreallocated<CCPassByVal<4, 4>>,
+
+ // The first 3 float or double arguments, if marked 'inreg' and if the call
+ // is not a vararg call and if SSE2 is available, are passed in SSE registers.
+ CCIfNotVarArg<CCIfInReg<CCIfType<[f32,f64],
+ CCIfSubtarget<"hasSSE2()",
+ CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>,
+
+ // The first 3 __m64 vector arguments are passed in mmx registers if the
+ // call is not a vararg call.
+ CCIfNotVarArg<CCIfType<[x86mmx],
+ CCAssignToReg<[MM0, MM1, MM2]>>>,
+
+ // Integer/Float values get stored in stack slots that are 4 bytes in
+ // size and 4-byte aligned.
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+ // Doubles get 8-byte slots that are 4-byte aligned.
+ CCIfType<[f64], CCAssignToStack<8, 4>>,
+
+ // Long doubles get slots whose size depends on the subtarget.
+ CCIfType<[f80], CCAssignToStack<0, 4>>,
+
+ // Boolean vectors of AVX-512 are passed in SIMD registers.
+ // The call from AVX to AVX-512 function should work,
+ // since the boolean types in AVX/AVX2 are promoted by default.
+ CCIfType<[v2i1], CCPromoteToType<v2i64>>,
+ CCIfType<[v4i1], CCPromoteToType<v4i32>>,
+ CCIfType<[v8i1], CCPromoteToType<v8i16>>,
+ CCIfType<[v16i1], CCPromoteToType<v16i8>>,
+ CCIfType<[v32i1], CCPromoteToType<v32i8>>,
+ CCIfType<[v64i1], CCPromoteToType<v64i8>>,
+
+ // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are
+ // passed in the parameter area.
+ CCIfType<[x86mmx], CCAssignToStack<8, 4>>,
+
+ // Darwin passes vectors in a form that differs from the i386 psABI
+ CCIfSubtarget<"isTargetDarwin()", CCDelegateTo<CC_X86_32_Vector_Darwin>>,
+
+ // Otherwise, drop to 'normal' X86-32 CC
+ CCDelegateTo<CC_X86_32_Vector_Standard>
+]>;
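
Leaving the register cases to the rules above, the scalar stack layout is a simple running offset, sketched here as standalone code (the Arg struct and the example arguments are illustrative only): i32/f32 take 4-byte slots, f64 and __m64 take 8-byte slots, and every slot is only 4-byte aligned, so no padding is ever inserted.

#include <cstdio>
#include <vector>

// Standalone model of CC_X86_32_Common's scalar stack rules. Offsets are
// relative to the start of the outgoing argument area.
struct Arg { const char *Name; unsigned Size; }; // 4 for i32/f32, 8 for f64/__m64

int main() {
  const std::vector<Arg> Args = {{"i32 a", 4}, {"double b", 8}, {"float c", 4}};
  unsigned Offset = 0;
  for (const Arg &A : Args) {
    std::printf("%-8s -> offset %u\n", A.Name, Offset);
    Offset += A.Size;
  }
  // i32 a    -> offset 0
  // double b -> offset 4
  // float c  -> offset 12
}
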
+
+def CC_X86_32_C : CallingConv<[
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in ECX.
+ CCIfNest<CCAssignToReg<[ECX]>>,
+
+ // The first 3 integer arguments, if marked 'inreg' and if the call is not
+ // a vararg call, are passed in integer registers.
+ CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_MCU : CallingConv<[
+ // Handles byval parameters. Note that, like FastCC, we can't rely on
+ // the delegation to CC_X86_32_Common because that happens after code that
+ // puts arguments in registers.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
+
+ // If the call is not a vararg call, some arguments may be passed
+ // in integer registers.
+ CCIfNotVarArg<CCIfType<[i32], CCCustom<"CC_X86_32_MCUInReg">>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_FastCall : CallingConv<[
+ // Promote i1 to i8.
+ CCIfType<[i1], CCPromoteToType<i8>>,
+
+ // The 'nest' parameter, if any, is passed in EAX.
+ CCIfNest<CCAssignToReg<[EAX]>>,
+
+ // The first 2 integer arguments are passed in ECX/EDX
+ CCIfInReg<CCIfType<[ i8], CCAssignToReg<[ CL, DL]>>>,
+ CCIfInReg<CCIfType<[i16], CCAssignToReg<[ CX, DX]>>>,
+ CCIfInReg<CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_Win32_VectorCall : CallingConv<[
+ // Pass floating point in XMMs
+ CCCustom<"CC_X86_32_VectorCall">,
+
+ // Delegate to fastcall to handle integer types.
+ CCDelegateTo<CC_X86_32_FastCall>
+]>;
+
+def CC_X86_32_ThisCall_Common : CallingConv<[
+ // The first integer argument is passed in ECX
+ CCIfType<[i32], CCAssignToReg<[ECX]>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_ThisCall_Mingw : CallingConv<[
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
+
+ CCDelegateTo<CC_X86_32_ThisCall_Common>
+]>;
+
+def CC_X86_32_ThisCall_Win : CallingConv<[
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
+
+ // Pass sret arguments indirectly through stack.
+ CCIfSRet<CCAssignToStack<4, 4>>,
+
+ CCDelegateTo<CC_X86_32_ThisCall_Common>
+]>;
+
+def CC_X86_32_ThisCall : CallingConv<[
+ CCIfSubtarget<"isTargetCygMing()", CCDelegateTo<CC_X86_32_ThisCall_Mingw>>,
+ CCDelegateTo<CC_X86_32_ThisCall_Win>
+]>;
+
+def CC_X86_32_FastCC : CallingConv<[
+ // Handles byval parameters. Note that we can't rely on the delegation
+ // to CC_X86_32_Common for this because that happens after code that
+ // puts arguments in registers.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in EAX.
+ CCIfNest<CCAssignToReg<[EAX]>>,
+
+ // The first 2 integer arguments are passed in ECX/EDX
+ CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>,
+
+ // The first 3 float or double arguments, if the call is not a vararg
+ // call and if SSE2 is available, are passed in SSE registers.
+ CCIfNotVarArg<CCIfType<[f32,f64],
+ CCIfSubtarget<"hasSSE2()",
+ CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+
+ // Doubles get 8-byte slots that are 8-byte aligned.
+ CCIfType<[f64], CCAssignToStack<8, 8>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_Win32_CFGuard_Check : CallingConv<[
+ // The CFGuard check call takes exactly one integer argument
+ // (i.e. the target function address), which is passed in ECX.
+ CCIfType<[i32], CCAssignToReg<[ECX]>>
+]>;
+
+def CC_X86_32_GHC : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Pass in STG registers: Base, Sp, Hp, R1
+ CCIfType<[i32], CCAssignToReg<[EBX, EBP, EDI, ESI]>>
+]>;
+
+def CC_X86_32_HiPE : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2
+ CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX, ECX]>>,
+
+ // Integer/Float values get stored in stack slots that are 4 bytes in
+ // size and 4-byte aligned.
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>
+]>;
+
+// X86-64 Intel OpenCL built-ins calling convention.
+def CC_Intel_OCL_BI : CallingConv<[
+
+ CCIfType<[i32], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[ECX, EDX, R8D, R9D]>>>,
+ CCIfType<[i64], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[RCX, RDX, R8, R9 ]>>>,
+
+ CCIfType<[i32], CCIfSubtarget<"is64Bit()", CCAssignToReg<[EDI, ESI, EDX, ECX]>>>,
+ CCIfType<[i64], CCIfSubtarget<"is64Bit()", CCAssignToReg<[RDI, RSI, RDX, RCX]>>>,
+
+ CCIfType<[i32], CCAssignToStack<4, 4>>,
+
+ // The SSE vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+ // The 256-bit vector arguments are passed in YMM registers.
+ CCIfType<[v8f32, v4f64, v8i32, v4i64],
+ CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>,
+
+ // The 512-bit vector arguments are passed in ZMM registers.
+ CCIfType<[v16f32, v8f64, v16i32, v8i64],
+ CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>,
+
+ // Pass masks in mask registers
+ CCIfType<[v16i1, v8i1], CCAssignToReg<[K1]>>,
+
+ CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
+ CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64_C>>,
+ CCDelegateTo<CC_X86_32_C>
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86 Root Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// This is the root argument convention for the X86-32 backend.
+def CC_X86_32 : CallingConv<[
+ // The X86_INTR calling convention is also valid on the MCU target and should
+ // override the MCU calling convention, so it must be checked before
+ // isTargetMCU().
+ CCIfCC<"CallingConv::X86_INTR", CCCustom<"CC_X86_Intr">>,
+ CCIfSubtarget<"isTargetMCU()", CCDelegateTo<CC_X86_32_MCU>>,
+ CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
+ CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win32_VectorCall>>,
+ CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
+ CCIfCC<"CallingConv::CFGuard_Check", CCDelegateTo<CC_X86_Win32_CFGuard_Check>>,
+ CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
+ CCIfCC<"CallingConv::Tail", CCDelegateTo<CC_X86_32_FastCC>>,
+ CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
+ CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>,
+ CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_32_RegCall>>,
+
+ // Otherwise, drop to normal X86-32 CC
+ CCDelegateTo<CC_X86_32_C>
+]>;
+
+// This is the root argument convention for the X86-64 backend.
+def CC_X86_64 : CallingConv<[
+ CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_64_GHC>>,
+ CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_64_HiPE>>,
+ CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<CC_X86_64_WebKit_JS>>,
+ CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_X86_64_AnyReg>>,
+ CCIfCC<"CallingConv::Win64", CCDelegateTo<CC_X86_Win64_C>>,
+ CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>,
+ CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>,
+ CCIfCC<"CallingConv::HHVM", CCDelegateTo<CC_X86_64_HHVM>>,
+ CCIfCC<"CallingConv::HHVM_C", CCDelegateTo<CC_X86_64_HHVM_C>>,
+ CCIfCC<"CallingConv::X86_RegCall",
+ CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_RegCall>>>,
+ CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_SysV64_RegCall>>,
+ CCIfCC<"CallingConv::X86_INTR", CCCustom<"CC_X86_Intr">>,
+
+ // Mingw64 and native Win64 use Win64 CC
+ CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
+
+ // Otherwise, drop to normal X86-64 CC
+ CCDelegateTo<CC_X86_64_C>
+]>;
+
+// This is the argument convention used for the entire X86 backend.
+let Entry = 1 in
+def CC_X86 : CallingConv<[
+ CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>,
+ CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>,
+ CCDelegateTo<CC_X86_32>
+]>;
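+
+// Example (for illustration): on a 32-bit target, an i32 argument of a
+// CallingConv::GHC call is resolved as CC_X86 -> CC_X86_32 (is64Bit() is
+// false) -> CC_X86_32_GHC, so the first four i32 arguments are assigned to
+// EBX, EBP, EDI and ESI. A calling convention with no explicit match falls
+// through to CC_X86_32_C.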
+
+//===----------------------------------------------------------------------===//
+// Callee-saved Registers.
+//===----------------------------------------------------------------------===//
+
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
+
+def CSR_32 : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>;
+def CSR_64 : CalleeSavedRegs<(add RBX, R12, R13, R14, R15, RBP)>;
+
+def CSR_64_SwiftError : CalleeSavedRegs<(sub CSR_64, R12)>;
+
+def CSR_32EHRet : CalleeSavedRegs<(add EAX, EDX, CSR_32)>;
+def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>;
+
+def CSR_Win64_NoSSE : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15)>;
+
+def CSR_Win64 : CalleeSavedRegs<(add CSR_Win64_NoSSE,
+ (sequence "XMM%u", 6, 15))>;
+
+def CSR_Win64_SwiftError : CalleeSavedRegs<(sub CSR_Win64, R12)>;
+
+// The function used by Darwin to obtain the address of a thread-local variable
+// uses rdi to pass a single parameter and rax for the return value. All other
+// GPRs are preserved.
+def CSR_64_TLS_Darwin : CalleeSavedRegs<(add CSR_64, RCX, RDX, RSI,
+ R8, R9, R10, R11)>;
+
+// CSRs that are handled by the prologue/epilogue.
+def CSR_64_CXX_TLS_Darwin_PE : CalleeSavedRegs<(add RBP)>;
+
+// CSRs that are handled explicitly via copies.
+def CSR_64_CXX_TLS_Darwin_ViaCopy : CalleeSavedRegs<(sub CSR_64_TLS_Darwin, RBP)>;
+
+// All GPRs - except r11
+def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI,
+ R8, R9, R10)>;
+
+// All registers - except r11
+def CSR_64_RT_AllRegs : CalleeSavedRegs<(add CSR_64_RT_MostRegs,
+ (sequence "XMM%u", 0, 15))>;
+def CSR_64_RT_AllRegs_AVX : CalleeSavedRegs<(add CSR_64_RT_MostRegs,
+ (sequence "YMM%u", 0, 15))>;
+
+def CSR_64_MostRegs : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10,
+ R11, R12, R13, R14, R15, RBP,
+ (sequence "XMM%u", 0, 15))>;
+
+def CSR_32_AllRegs : CalleeSavedRegs<(add EAX, EBX, ECX, EDX, EBP, ESI,
+ EDI)>;
+def CSR_32_AllRegs_SSE : CalleeSavedRegs<(add CSR_32_AllRegs,
+ (sequence "XMM%u", 0, 7))>;
+def CSR_32_AllRegs_AVX : CalleeSavedRegs<(add CSR_32_AllRegs,
+ (sequence "YMM%u", 0, 7))>;
+def CSR_32_AllRegs_AVX512 : CalleeSavedRegs<(add CSR_32_AllRegs,
+ (sequence "ZMM%u", 0, 7),
+ (sequence "K%u", 0, 7))>;
+
+def CSR_64_AllRegs : CalleeSavedRegs<(add CSR_64_MostRegs, RAX)>;
+def CSR_64_AllRegs_NoSSE : CalleeSavedRegs<(add RAX, RBX, RCX, RDX, RSI, RDI, R8, R9,
+ R10, R11, R12, R13, R14, R15, RBP)>;
+def CSR_64_AllRegs_AVX : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX,
+ (sequence "YMM%u", 0, 15)),
+ (sequence "XMM%u", 0, 15))>;
+def CSR_64_AllRegs_AVX512 : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX,
+ (sequence "ZMM%u", 0, 31),
+ (sequence "K%u", 0, 7)),
+ (sequence "XMM%u", 0, 15))>;
+
+// Standard C + YMM6-15
+def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12,
+ R13, R14, R15,
+ (sequence "YMM%u", 6, 15))>;
+
+def CSR_Win64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI,
+ R12, R13, R14, R15,
+ (sequence "ZMM%u", 6, 21),
+ K4, K5, K6, K7)>;
+// Standard C + XMM8-15
+def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64,
+ (sequence "XMM%u", 8, 15))>;
+
+// Standard C + YMM8-15
+def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64,
+ (sequence "YMM%u", 8, 15))>;
+
+def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RSI, R14, R15,
+ (sequence "ZMM%u", 16, 31),
+ K4, K5, K6, K7)>;
+
+// Only R12 is preserved for PHP calls in HHVM.
+def CSR_64_HHVM : CalleeSavedRegs<(add R12)>;
+
+// The RegCall calling convention preserves a few GPRs and XMM registers.
+def CSR_32_RegCall_NoSSE : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>;
+def CSR_32_RegCall : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE,
+ (sequence "XMM%u", 4, 7))>;
+def CSR_Win32_CFGuard_Check_NoSSE : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE, ECX)>;
+def CSR_Win32_CFGuard_Check : CalleeSavedRegs<(add CSR_32_RegCall, ECX)>;
+def CSR_Win64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP,
+ (sequence "R%u", 10, 15))>;
+def CSR_Win64_RegCall : CalleeSavedRegs<(add CSR_Win64_RegCall_NoSSE,
+ (sequence "XMM%u", 8, 15))>;
+def CSR_SysV64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP,
+ (sequence "R%u", 12, 15))>;
+def CSR_SysV64_RegCall : CalleeSavedRegs<(add CSR_SysV64_RegCall_NoSSE,
+ (sequence "XMM%u", 8, 15))>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CmovConversion.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86CmovConversion.cpp
new file mode 100644
index 000000000000..a2de0dc08292
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CmovConversion.cpp
@@ -0,0 +1,861 @@
+//====- X86CmovConversion.cpp - Convert Cmov to Branch --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements a pass that converts X86 cmov instructions into
+/// branches when profitable. This pass is conservative. It transforms if and
+/// only if it can guarantee a gain with high confidence.
+///
+/// Thus, the optimization applies under the following conditions:
+/// 1. Consider as candidates only CMOVs in innermost loops (assume that
+/// most hotspots are represented by these loops).
+/// 2. Given a group of CMOV instructions that are using the same EFLAGS def
+/// instruction:
+///      a. Consider them as candidates only if all have the same condition code
+///         or the opposite one, to prevent generating more than one conditional
+/// jump per EFLAGS def instruction.
+/// b. Consider them as candidates only if all are profitable to be
+/// converted (assume that one bad conversion may cause a degradation).
+/// 3. Apply conversion only for loops that are found profitable and only for
+/// CMOV candidates that were found profitable.
+/// a. A loop is considered profitable only if conversion will reduce its
+/// depth cost by some threshold.
+/// b. CMOV is considered profitable if the cost of its condition is higher
+/// than the average cost of its true-value and false-value by 25% of
+/// branch-misprediction-penalty. This assures no degradation even with
+/// 25% branch misprediction.
+///
+/// Note: This pass is assumed to run on SSA machine code.
+//
+//===----------------------------------------------------------------------===//
+//
+// External interfaces:
+// FunctionPass *llvm::createX86CmovConverterPass();
+// bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF);
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-cmov-conversion"
+
+STATISTIC(NumOfSkippedCmovGroups, "Number of unsupported CMOV-groups");
+STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates");
+STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops");
+STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups");
+
+// This internal switch can be used to turn off the cmov/branch optimization.
+static cl::opt<bool>
+ EnableCmovConverter("x86-cmov-converter",
+ cl::desc("Enable the X86 cmov-to-branch optimization."),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<unsigned>
+ GainCycleThreshold("x86-cmov-converter-threshold",
+ cl::desc("Minimum gain per loop (in cycles) threshold."),
+ cl::init(4), cl::Hidden);
+
+static cl::opt<bool> ForceMemOperand(
+ "x86-cmov-converter-force-mem-operand",
+ cl::desc("Convert cmovs to branches whenever they have memory operands."),
+ cl::init(true), cl::Hidden);
+
+namespace {
+
+/// Converts X86 cmov instructions into branches when profitable.
+class X86CmovConverterPass : public MachineFunctionPass {
+public:
+ X86CmovConverterPass() : MachineFunctionPass(ID) { }
+
+ StringRef getPassName() const override { return "X86 cmov Conversion"; }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ /// Pass identification, replacement for typeid.
+ static char ID;
+
+private:
+ MachineRegisterInfo *MRI = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ TargetSchedModel TSchedModel;
+
+ /// List of consecutive CMOV instructions.
+ using CmovGroup = SmallVector<MachineInstr *, 2>;
+ using CmovGroups = SmallVector<CmovGroup, 2>;
+
+ /// Collect all CMOV-group-candidates in \p CurrLoop and update \p
+ /// CmovInstGroups accordingly.
+ ///
+ /// \param Blocks List of blocks to process.
+ /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
+ /// \returns true iff it found any CMOV-group-candidate.
+ bool collectCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
+ CmovGroups &CmovInstGroups,
+ bool IncludeLoads = false);
+
+ /// Check if it is profitable to transform each CMOV-group-candidates into
+ /// branch. Remove all groups that are not profitable from \p CmovInstGroups.
+ ///
+ /// \param Blocks List of blocks to process.
+ /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
+  /// \returns true iff any CMOV-group-candidate remains.
+ bool checkForProfitableCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
+ CmovGroups &CmovInstGroups);
+
+ /// Convert the given list of consecutive CMOV instructions into a branch.
+ ///
+  /// \param Group Consecutive CMOV instructions to be converted into a branch.
+ void convertCmovInstsToBranches(SmallVectorImpl<MachineInstr *> &Group) const;
+};
+
+} // end anonymous namespace
+
+char X86CmovConverterPass::ID = 0;
+
+void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<MachineLoopInfo>();
+}
+
+bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+ if (!EnableCmovConverter)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
+ << "**********\n");
+
+ bool Changed = false;
+ MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ MRI = &MF.getRegInfo();
+ TII = STI.getInstrInfo();
+ TRI = STI.getRegisterInfo();
+ TSchedModel.init(&STI);
+
+  // Before we handle the more subtle cases of register-register CMOVs inside
+  // potentially hot loops, we want to quickly remove all CMOVs with a memory
+  // operand. Such a CMOV risks a stall while waiting for the load to
+  // complete, which speculative execution behind a branch is better suited to
+  // hide on modern x86 chips.
+ if (ForceMemOperand) {
+ CmovGroups AllCmovGroups;
+ SmallVector<MachineBasicBlock *, 4> Blocks;
+ for (auto &MBB : MF)
+ Blocks.push_back(&MBB);
+ if (collectCmovCandidates(Blocks, AllCmovGroups, /*IncludeLoads*/ true)) {
+ for (auto &Group : AllCmovGroups) {
+ // Skip any group that doesn't do at least one memory operand cmov.
+ if (!llvm::any_of(Group, [&](MachineInstr *I) { return I->mayLoad(); }))
+ continue;
+
+ // For CMOV groups which we can rewrite and which contain a memory load,
+ // always rewrite them. On x86, a CMOV will dramatically amplify any
+ // memory latency by blocking speculative execution.
+ Changed = true;
+ convertCmovInstsToBranches(Group);
+ }
+ }
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Register-operand Conversion Algorithm
+ // ---------
+ // For each inner most loop
+ // collectCmovCandidates() {
+ // Find all CMOV-group-candidates.
+ // }
+ //
+ // checkForProfitableCmovCandidates() {
+ // * Calculate both loop-depth and optimized-loop-depth.
+  //     * Use these depths to check for loop transformation profitability.
+ // * Check for CMOV-group-candidate transformation profitability.
+ // }
+ //
+ // For each profitable CMOV-group-candidate
+ // convertCmovInstsToBranches() {
+ // * Create FalseBB, SinkBB, Conditional branch to SinkBB.
+ // * Replace each CMOV instruction with a PHI instruction in SinkBB.
+ // }
+ //
+ // Note: For more details, see each function description.
+ //===--------------------------------------------------------------------===//
+
+ // Build up the loops in pre-order.
+ SmallVector<MachineLoop *, 4> Loops(MLI.begin(), MLI.end());
+ // Note that we need to check size on each iteration as we accumulate child
+ // loops.
+ for (int i = 0; i < (int)Loops.size(); ++i)
+ for (MachineLoop *Child : Loops[i]->getSubLoops())
+ Loops.push_back(Child);
+
+ for (MachineLoop *CurrLoop : Loops) {
+    // Optimize only innermost loops.
+ if (!CurrLoop->getSubLoops().empty())
+ continue;
+
+ // List of consecutive CMOV instructions to be processed.
+ CmovGroups CmovInstGroups;
+
+ if (!collectCmovCandidates(CurrLoop->getBlocks(), CmovInstGroups))
+ continue;
+
+ if (!checkForProfitableCmovCandidates(CurrLoop->getBlocks(),
+ CmovInstGroups))
+ continue;
+
+ Changed = true;
+ for (auto &Group : CmovInstGroups)
+ convertCmovInstsToBranches(Group);
+ }
+
+ return Changed;
+}
+
+bool X86CmovConverterPass::collectCmovCandidates(
+ ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups,
+ bool IncludeLoads) {
+ //===--------------------------------------------------------------------===//
+ // Collect all CMOV-group-candidates and add them into CmovInstGroups.
+ //
+ // CMOV-group:
+  //   CMOV instructions, in the same MBB, that use the same EFLAGS def
+  //   instruction.
+  //
+  // CMOV-group-candidate:
+  //   CMOV-group where all the CMOV instructions
+  //     1. are consecutive.
+  //     2. have the same condition code or the opposite one.
+  //     3. use only register operands (X86::CMOVrr).
+ //===--------------------------------------------------------------------===//
+ // List of possible improvement (TODO's):
+ // --------------------------------------
+ // TODO: Add support for X86::CMOVrm instructions.
+ // TODO: Add support for X86::SETcc instructions.
+  // TODO: Add support for CMOV-groups with non-consecutive CMOV instructions.
+ //===--------------------------------------------------------------------===//
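+  // Example (for illustration): two back-to-back register-only CMOVs guarded
+  // by the same CMP, one using condition AE and the other its opposite B,
+  // form a single CMOV-group-candidate. If an unrelated register-only MOV
+  // were interleaved between them, FoundNonCMOVInst would be set and the
+  // group would be skipped as non-consecutive.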
+
+ // Current processed CMOV-Group.
+ CmovGroup Group;
+ for (auto *MBB : Blocks) {
+ Group.clear();
+    // Condition code of the first CMOV instruction in the currently processed
+    // range, and its opposite condition code.
+ X86::CondCode FirstCC = X86::COND_INVALID, FirstOppCC = X86::COND_INVALID,
+ MemOpCC = X86::COND_INVALID;
+    // Indicator of a non-CMOVrr instruction in the currently processed range.
+ bool FoundNonCMOVInst = false;
+    // Indicates whether the currently processed CMOV-group should be skipped.
+ bool SkipGroup = false;
+
+ for (auto &I : *MBB) {
+ // Skip debug instructions.
+ if (I.isDebugInstr())
+ continue;
+ X86::CondCode CC = X86::getCondFromCMov(I);
+      // Check if we found an X86::CMOVrr instruction.
+ if (CC != X86::COND_INVALID && (IncludeLoads || !I.mayLoad())) {
+ if (Group.empty()) {
+          // We found the first CMOV in the range; reset the flags.
+ FirstCC = CC;
+ FirstOppCC = X86::GetOppositeBranchCondition(CC);
+ // Clear out the prior group's memory operand CC.
+ MemOpCC = X86::COND_INVALID;
+ FoundNonCMOVInst = false;
+ SkipGroup = false;
+ }
+ Group.push_back(&I);
+        // Check if it is a non-consecutive CMOV instruction or its condition
+        // code differs from both FirstCC and FirstOppCC.
+        if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC))
+          // Set the SkipGroup indicator to skip the currently processed
+          // CMOV-group.
+ SkipGroup = true;
+ if (I.mayLoad()) {
+ if (MemOpCC == X86::COND_INVALID)
+ // The first memory operand CMOV.
+ MemOpCC = CC;
+ else if (CC != MemOpCC)
+ // Can't handle mixed conditions with memory operands.
+ SkipGroup = true;
+ }
+ // Check if we were relying on zero-extending behavior of the CMOV.
+ if (!SkipGroup &&
+ llvm::any_of(
+ MRI->use_nodbg_instructions(I.defs().begin()->getReg()),
+ [&](MachineInstr &UseI) {
+ return UseI.getOpcode() == X86::SUBREG_TO_REG;
+ }))
+ // FIXME: We should model the cost of using an explicit MOV to handle
+ // the zero-extension rather than just refusing to handle this.
+ SkipGroup = true;
+ continue;
+ }
+      // If Group is empty, keep looking for the first CMOV in the range.
+ if (Group.empty())
+ continue;
+
+ // We found a non X86::CMOVrr instruction.
+ FoundNonCMOVInst = true;
+      // Check if this instruction defines EFLAGS to determine the end of the
+      // processed range, as there will be no more instructions using the
+      // current EFLAGS def.
+ if (I.definesRegister(X86::EFLAGS)) {
+        // If the currently processed CMOV-group should not be skipped, add it
+        // as a CMOV-group-candidate.
+ if (!SkipGroup)
+ CmovInstGroups.push_back(Group);
+ else
+ ++NumOfSkippedCmovGroups;
+ Group.clear();
+ }
+ }
+    // The end of the basic block is considered the end of the range; if the
+    // currently processed CMOV-group should not be skipped, add it as a
+    // CMOV-group-candidate.
+ if (Group.empty())
+ continue;
+ if (!SkipGroup)
+ CmovInstGroups.push_back(Group);
+ else
+ ++NumOfSkippedCmovGroups;
+ }
+
+ NumOfCmovGroupCandidate += CmovInstGroups.size();
+ return !CmovInstGroups.empty();
+}
+
+/// \returns Depth of CMOV instruction as if it was converted into a branch.
+/// \param TrueOpDepth depth cost of CMOV true value operand.
+/// \param FalseOpDepth depth cost of CMOV false value operand.
+static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) {
+ // The depth of the result after branch conversion is
+ // TrueOpDepth * TrueOpProbability + FalseOpDepth * FalseOpProbability.
+ // As we have no info about branch weight, we assume 75% for one and 25% for
+ // the other, and pick the result with the largest resulting depth.
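+  // Worked example (for illustration): with TrueOpDepth = 6 and
+  // FalseOpDepth = 2, this returns max(divideCeil(6*3 + 2, 4),
+  // divideCeil(2*3 + 6, 4)) = max(5, 3) = 5, i.e. the deeper operand
+  // dominates under the assumed 75%/25% split.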
+ return std::max(
+ divideCeil(TrueOpDepth * 3 + FalseOpDepth, 4),
+ divideCeil(FalseOpDepth * 3 + TrueOpDepth, 4));
+}
+
+bool X86CmovConverterPass::checkForProfitableCmovCandidates(
+ ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups) {
+ struct DepthInfo {
+ /// Depth of original loop.
+ unsigned Depth;
+ /// Depth of optimized loop.
+ unsigned OptDepth;
+ };
+  /// Number of loop iterations for which to calculate the depth.
+ static const unsigned LoopIterations = 2;
+ DenseMap<MachineInstr *, DepthInfo> DepthMap;
+ DepthInfo LoopDepth[LoopIterations] = {{0, 0}, {0, 0}};
+ enum { PhyRegType = 0, VirRegType = 1, RegTypeNum = 2 };
+ /// For each register type maps the register to its last def instruction.
+ DenseMap<unsigned, MachineInstr *> RegDefMaps[RegTypeNum];
+ /// Maps register operand to its def instruction, which can be nullptr if it
+ /// is unknown (e.g., operand is defined outside the loop).
+ DenseMap<MachineOperand *, MachineInstr *> OperandToDefMap;
+
+ // Set depth of unknown instruction (i.e., nullptr) to zero.
+ DepthMap[nullptr] = {0, 0};
+
+ SmallPtrSet<MachineInstr *, 4> CmovInstructions;
+ for (auto &Group : CmovInstGroups)
+ CmovInstructions.insert(Group.begin(), Group.end());
+
+ //===--------------------------------------------------------------------===//
+ // Step 1: Calculate instruction depth and loop depth.
+ // Optimized-Loop:
+ // loop with CMOV-group-candidates converted into branches.
+ //
+ // Instruction-Depth:
+ // instruction latency + max operand depth.
+ // * For CMOV instruction in optimized loop the depth is calculated as:
+ // CMOV latency + getDepthOfOptCmov(True-Op-Depth, False-Op-depth)
+ // TODO: Find a better way to estimate the latency of the branch instruction
+ // rather than using the CMOV latency.
+ //
+ // Loop-Depth:
+ // max instruction depth of all instructions in the loop.
+ // Note: instruction with max depth represents the critical-path in the loop.
+ //
+ // Loop-Depth[i]:
+ // Loop-Depth calculated for first `i` iterations.
+ // Note: it is enough to calculate depth for up to two iterations.
+ //
+ // Depth-Diff[i]:
+  //   Number of cycles saved in the first `i` iterations by optimizing the loop.
+ //===--------------------------------------------------------------------===//
+ for (unsigned I = 0; I < LoopIterations; ++I) {
+ DepthInfo &MaxDepth = LoopDepth[I];
+ for (auto *MBB : Blocks) {
+ // Clear physical registers Def map.
+ RegDefMaps[PhyRegType].clear();
+ for (MachineInstr &MI : *MBB) {
+ // Skip debug instructions.
+ if (MI.isDebugInstr())
+ continue;
+ unsigned MIDepth = 0;
+ unsigned MIDepthOpt = 0;
+ bool IsCMOV = CmovInstructions.count(&MI);
+ for (auto &MO : MI.uses()) {
+        // Check "isUse()" because "uses()" also returns implicit definitions.
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ Register Reg = MO.getReg();
+ auto &RDM = RegDefMaps[Reg.isVirtual()];
+ if (MachineInstr *DefMI = RDM.lookup(Reg)) {
+ OperandToDefMap[&MO] = DefMI;
+ DepthInfo Info = DepthMap.lookup(DefMI);
+ MIDepth = std::max(MIDepth, Info.Depth);
+ if (!IsCMOV)
+ MIDepthOpt = std::max(MIDepthOpt, Info.OptDepth);
+ }
+ }
+
+ if (IsCMOV)
+ MIDepthOpt = getDepthOfOptCmov(
+ DepthMap[OperandToDefMap.lookup(&MI.getOperand(1))].OptDepth,
+ DepthMap[OperandToDefMap.lookup(&MI.getOperand(2))].OptDepth);
+
+        // Iterate over all operands to handle implicit definitions as well.
+ for (auto &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ Register Reg = MO.getReg();
+ RegDefMaps[Reg.isVirtual()][Reg] = &MI;
+ }
+
+ unsigned Latency = TSchedModel.computeInstrLatency(&MI);
+ DepthMap[&MI] = {MIDepth += Latency, MIDepthOpt += Latency};
+ MaxDepth.Depth = std::max(MaxDepth.Depth, MIDepth);
+ MaxDepth.OptDepth = std::max(MaxDepth.OptDepth, MIDepthOpt);
+ }
+ }
+ }
+
+ unsigned Diff[LoopIterations] = {LoopDepth[0].Depth - LoopDepth[0].OptDepth,
+ LoopDepth[1].Depth - LoopDepth[1].OptDepth};
+
+ //===--------------------------------------------------------------------===//
+  // Step 2: Check if the loop is worth optimizing.
+ // Worth-Optimize-Loop:
+ // case 1: Diff[1] == Diff[0]
+  //              Critical-path is iteration independent - there is no
+  //              dependency of critical-path instructions on critical-path
+  //              instructions of the previous iteration.
+  //              Thus, it is enough to check the gain percent of the 1st
+  //              iteration - to be conservative, the optimized loop's depth
+  //              needs to be at least 12.5% (in cycles) less than the
+  //              original loop's depth, per iteration.
+ //
+ // case 2: Diff[1] > Diff[0]
+  //              Critical-path is iteration dependent - there is a dependency
+  //              of critical-path instructions on critical-path instructions
+  //              of the previous iteration.
+  //              Thus, check the gain percent of the 2nd iteration (similar to
+  //              the previous case), but it is also required to check the
+  //              gradient of the gain - the change in Depth-Diff compared to
+  //              the change in Loop-Depth between the 1st and 2nd iterations.
+  //              To be conservative, the gradient needs to be at least 50%.
+ //
+  // In addition, in order not to optimize loops with a very small gain, the
+  // gain (in cycles) after the 2nd iteration should not be less than a given
+  // threshold. Thus, the check (Diff[1] >= GainCycleThreshold) must apply.
+ //
+ // If loop is not worth optimizing, remove all CMOV-group-candidates.
+ //===--------------------------------------------------------------------===//
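+  // Worked example (for illustration, case 1): assume GainCycleThreshold = 4,
+  // LoopDepth[0] = {Depth: 32, OptDepth: 27} and
+  // LoopDepth[1] = {Depth: 64, OptDepth: 59}, so Diff = {5, 5}. The threshold
+  // check passes (5 >= 4) and, since Diff[1] == Diff[0], the loop is worth
+  // optimizing because Diff[0] * 8 = 40 >= LoopDepth[0].Depth = 32, i.e. the
+  // gain per iteration exceeds 12.5%.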
+ if (Diff[1] < GainCycleThreshold)
+ return false;
+
+ bool WorthOptLoop = false;
+ if (Diff[1] == Diff[0])
+ WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth;
+ else if (Diff[1] > Diff[0])
+ WorthOptLoop =
+ (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth) &&
+ (Diff[1] * 8 >= LoopDepth[1].Depth);
+
+ if (!WorthOptLoop)
+ return false;
+
+ ++NumOfLoopCandidate;
+
+ //===--------------------------------------------------------------------===//
+  // Step 3: Check for each CMOV-group-candidate whether it is worth optimizing.
+  // Worth-Optimize-Group:
+  //   Iff it is worth optimizing all CMOV instructions in the group.
+  //
+  // Worth-Optimize-CMOV:
+  //   A predicted branch is faster than a CMOV by the difference between the
+  //   depth of the condition operand and the depth of the taken (predicted)
+  //   value operand.
+  //   To be conservative, the gain of such a CMOV transformation should cover
+  //   at least 25% of the branch-misprediction penalty.
+ //===--------------------------------------------------------------------===//
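+  // Worked example (for illustration): with MispredictPenalty = 20 cycles,
+  // CondCost = 9 and ValCost = 3, the CMOV is worth converting because
+  // (9 - 3) * 4 = 24 >= 20; with CondCost = 7 the group would be rejected,
+  // since (7 - 3) * 4 = 16 < 20.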
+ unsigned MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
+ CmovGroups TempGroups;
+ std::swap(TempGroups, CmovInstGroups);
+ for (auto &Group : TempGroups) {
+ bool WorthOpGroup = true;
+ for (auto *MI : Group) {
+      // Avoid CMOV instructions whose value is used as a pointer to load from.
+      // This is another conservative check to avoid converting a CMOV
+      // instruction used with a tree-search-like algorithm, where the branch
+      // is unpredictable.
+ auto UIs = MRI->use_instructions(MI->defs().begin()->getReg());
+ if (!UIs.empty() && ++UIs.begin() == UIs.end()) {
+ unsigned Op = UIs.begin()->getOpcode();
+ if (Op == X86::MOV64rm || Op == X86::MOV32rm) {
+ WorthOpGroup = false;
+ break;
+ }
+ }
+
+ unsigned CondCost =
+ DepthMap[OperandToDefMap.lookup(&MI->getOperand(4))].Depth;
+ unsigned ValCost = getDepthOfOptCmov(
+ DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth,
+ DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth);
+ if (ValCost > CondCost || (CondCost - ValCost) * 4 < MispredictPenalty) {
+ WorthOpGroup = false;
+ break;
+ }
+ }
+
+ if (WorthOpGroup)
+ CmovInstGroups.push_back(Group);
+ }
+
+ return !CmovInstGroups.empty();
+}
+
+static bool checkEFLAGSLive(MachineInstr *MI) {
+ if (MI->killsRegister(X86::EFLAGS))
+ return false;
+
+ // The EFLAGS operand of MI might be missing a kill marker.
+  // Figure out whether EFLAGS should be live after the MI instruction.
+ MachineBasicBlock *BB = MI->getParent();
+ MachineBasicBlock::iterator ItrMI = MI;
+
+ // Scan forward through BB for a use/def of EFLAGS.
+ for (auto I = std::next(ItrMI), E = BB->end(); I != E; ++I) {
+ if (I->readsRegister(X86::EFLAGS))
+ return true;
+ if (I->definesRegister(X86::EFLAGS))
+ return false;
+ }
+
+ // We hit the end of the block, check whether EFLAGS is live into a successor.
+ for (auto I = BB->succ_begin(), E = BB->succ_end(); I != E; ++I) {
+ if ((*I)->isLiveIn(X86::EFLAGS))
+ return true;
+ }
+
+ return false;
+}
+
+/// Given \p First CMOV instruction and \p Last CMOV instruction representing a
+/// group of CMOV instructions, which may contain debug instructions in between,
+/// move all debug instructions to after the last CMOV instruction, making the
+/// CMOV group consecutive.
+static void packCmovGroup(MachineInstr *First, MachineInstr *Last) {
+ assert(X86::getCondFromCMov(*Last) != X86::COND_INVALID &&
+ "Last instruction in a CMOV group must be a CMOV instruction");
+
+ SmallVector<MachineInstr *, 2> DBGInstructions;
+ for (auto I = First->getIterator(), E = Last->getIterator(); I != E; I++) {
+ if (I->isDebugInstr())
+ DBGInstructions.push_back(&*I);
+ }
+
+ // Splice the debug instruction after the cmov group.
+ MachineBasicBlock *MBB = First->getParent();
+ for (auto *MI : DBGInstructions)
+ MBB->insertAfter(Last, MI->removeFromParent());
+}
+
+void X86CmovConverterPass::convertCmovInstsToBranches(
+ SmallVectorImpl<MachineInstr *> &Group) const {
+ assert(!Group.empty() && "No CMOV instructions to convert");
+ ++NumOfOptimizedCmovGroups;
+
+  // If the CMOV group is not packed, e.g., there are debug instructions between
+  // the first CMOV and the last CMOV, then pack the group and make the CMOV
+  // instructions consecutive by moving the debug instructions to after the
+  // last CMOV.
+ packCmovGroup(Group.front(), Group.back());
+
+ // To convert a CMOVcc instruction, we actually have to insert the diamond
+ // control-flow pattern. The incoming instruction knows the destination vreg
+ // to set, the condition code register to branch on, the true/false values to
+ // select between, and a branch opcode to use.
+
+ // Before
+ // -----
+ // MBB:
+ // cond = cmp ...
+ // v1 = CMOVge t1, f1, cond
+ // v2 = CMOVlt t2, f2, cond
+ // v3 = CMOVge v1, f3, cond
+ //
+ // After
+ // -----
+ // MBB:
+ // cond = cmp ...
+ // jge %SinkMBB
+ //
+ // FalseMBB:
+ // jmp %SinkMBB
+ //
+ // SinkMBB:
+ // %v1 = phi[%f1, %FalseMBB], [%t1, %MBB]
+ // %v2 = phi[%t2, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch
+ // ; true-value with false-value
+ // %v3 = phi[%f3, %FalseMBB], [%t1, %MBB] ; Phi instruction cannot use
+ // ; previous Phi instruction result
+
+ MachineInstr &MI = *Group.front();
+ MachineInstr *LastCMOV = Group.back();
+ DebugLoc DL = MI.getDebugLoc();
+
+ X86::CondCode CC = X86::CondCode(X86::getCondFromCMov(MI));
+ X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+ // Potentially swap the condition codes so that any memory operand to a CMOV
+ // is in the *false* position instead of the *true* position. We can invert
+ // any non-memory operand CMOV instructions to cope with this and we ensure
+ // memory operand CMOVs are only included with a single condition code.
+ if (llvm::any_of(Group, [&](MachineInstr *I) {
+ return I->mayLoad() && X86::getCondFromCMov(*I) == CC;
+ }))
+ std::swap(CC, OppCC);
+
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineFunction::iterator It = ++MBB->getIterator();
+ MachineFunction *F = MBB->getParent();
+ const BasicBlock *BB = MBB->getBasicBlock();
+
+ MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(BB);
+ F->insert(It, FalseMBB);
+ F->insert(It, SinkMBB);
+
+ // If the EFLAGS register isn't dead in the terminator, then claim that it's
+ // live into the sink and copy blocks.
+ if (checkEFLAGSLive(LastCMOV)) {
+ FalseMBB->addLiveIn(X86::EFLAGS);
+ SinkMBB->addLiveIn(X86::EFLAGS);
+ }
+
+ // Transfer the remainder of BB and its successor edges to SinkMBB.
+ SinkMBB->splice(SinkMBB->begin(), MBB,
+ std::next(MachineBasicBlock::iterator(LastCMOV)), MBB->end());
+ SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // Add the false and sink blocks as its successors.
+ MBB->addSuccessor(FalseMBB);
+ MBB->addSuccessor(SinkMBB);
+
+ // Create the conditional branch instruction.
+ BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
+
+ // Add the sink block to the false block successors.
+ FalseMBB->addSuccessor(SinkMBB);
+
+ MachineInstrBuilder MIB;
+ MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
+ MachineBasicBlock::iterator MIItEnd =
+ std::next(MachineBasicBlock::iterator(LastCMOV));
+ MachineBasicBlock::iterator FalseInsertionPoint = FalseMBB->begin();
+ MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
+
+ // First we need to insert an explicit load on the false path for any memory
+ // operand. We also need to potentially do register rewriting here, but it is
+ // simpler as the memory operands are always on the false path so we can
+ // simply take that input, whatever it is.
+ DenseMap<unsigned, unsigned> FalseBBRegRewriteTable;
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd;) {
+ auto &MI = *MIIt++;
+ // Skip any CMOVs in this group which don't load from memory.
+ if (!MI.mayLoad()) {
+ // Remember the false-side register input.
+ Register FalseReg =
+ MI.getOperand(X86::getCondFromCMov(MI) == CC ? 1 : 2).getReg();
+ // Walk back through any intermediate cmovs referenced.
+ while (true) {
+ auto FRIt = FalseBBRegRewriteTable.find(FalseReg);
+ if (FRIt == FalseBBRegRewriteTable.end())
+ break;
+ FalseReg = FRIt->second;
+ }
+ FalseBBRegRewriteTable[MI.getOperand(0).getReg()] = FalseReg;
+ continue;
+ }
+
+ // The condition must be the *opposite* of the one we've decided to branch
+ // on as the branch will go *around* the load and the load should happen
+ // when the CMOV condition is false.
+ assert(X86::getCondFromCMov(MI) == OppCC &&
+ "Can only handle memory-operand cmov instructions with a condition "
+ "opposite to the selected branch direction.");
+
+ // The goal is to rewrite the cmov from:
+ //
+ // MBB:
+ // %A = CMOVcc %B (tied), (mem)
+ //
+ // to
+ //
+ // MBB:
+ // %A = CMOVcc %B (tied), %C
+ // FalseMBB:
+ // %C = MOV (mem)
+ //
+ // Which will allow the next loop to rewrite the CMOV in terms of a PHI:
+ //
+ // MBB:
+ // JMP!cc SinkMBB
+ // FalseMBB:
+ // %C = MOV (mem)
+ // SinkMBB:
+ // %A = PHI [ %C, FalseMBB ], [ %B, MBB]
+
+ // Get a fresh register to use as the destination of the MOV.
+ const TargetRegisterClass *RC = MRI->getRegClass(MI.getOperand(0).getReg());
+ Register TmpReg = MRI->createVirtualRegister(RC);
+
+ SmallVector<MachineInstr *, 4> NewMIs;
+ bool Unfolded = TII->unfoldMemoryOperand(*MBB->getParent(), MI, TmpReg,
+ /*UnfoldLoad*/ true,
+ /*UnfoldStore*/ false, NewMIs);
+ (void)Unfolded;
+ assert(Unfolded && "Should never fail to unfold a loading cmov!");
+
+ // Move the new CMOV to just before the old one and reset any impacted
+ // iterator.
+ auto *NewCMOV = NewMIs.pop_back_val();
+ assert(X86::getCondFromCMov(*NewCMOV) == OppCC &&
+ "Last new instruction isn't the expected CMOV!");
+ LLVM_DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump());
+ MBB->insert(MachineBasicBlock::iterator(MI), NewCMOV);
+ if (&*MIItBegin == &MI)
+ MIItBegin = MachineBasicBlock::iterator(NewCMOV);
+
+ // Sink whatever instructions were needed to produce the unfolded operand
+ // into the false block.
+ for (auto *NewMI : NewMIs) {
+ LLVM_DEBUG(dbgs() << "\tRewritten load instr: "; NewMI->dump());
+ FalseMBB->insert(FalseInsertionPoint, NewMI);
+ // Re-map any operands that are from other cmovs to the inputs for this block.
+ for (auto &MOp : NewMI->uses()) {
+ if (!MOp.isReg())
+ continue;
+ auto It = FalseBBRegRewriteTable.find(MOp.getReg());
+ if (It == FalseBBRegRewriteTable.end())
+ continue;
+
+ MOp.setReg(It->second);
+ // This might have been a kill when it referenced the cmov result, but
+ // it won't necessarily be once rewritten.
+ // FIXME: We could potentially improve this by tracking whether the
+ // operand to the cmov was also a kill, and then skipping the PHI node
+ // construction below.
+ MOp.setIsKill(false);
+ }
+ }
+ MBB->erase(MachineBasicBlock::iterator(MI),
+ std::next(MachineBasicBlock::iterator(MI)));
+
+ // Add this PHI to the rewrite table.
+ FalseBBRegRewriteTable[NewCMOV->getOperand(0).getReg()] = TmpReg;
+ }
+
+ // As we are creating the PHIs, we have to be careful if there is more than
+ // one. Later CMOVs may reference the results of earlier CMOVs, but later
+ // PHIs have to reference the individual true/false inputs from earlier PHIs.
+  // That also means that PHI construction must work forward from earlier to
+  // later, and that the code must maintain a mapping from each earlier PHI's
+  // destination register to the registers that went into that PHI.
+ DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
+
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
+ Register DestReg = MIIt->getOperand(0).getReg();
+ Register Op1Reg = MIIt->getOperand(1).getReg();
+ Register Op2Reg = MIIt->getOperand(2).getReg();
+
+    // If the CMOV we are processing has the opposite condition from the jump we
+ // generated, then we have to swap the operands for the PHI that is going to
+ // be generated.
+ if (X86::getCondFromCMov(*MIIt) == OppCC)
+ std::swap(Op1Reg, Op2Reg);
+
+ auto Op1Itr = RegRewriteTable.find(Op1Reg);
+ if (Op1Itr != RegRewriteTable.end())
+ Op1Reg = Op1Itr->second.first;
+
+ auto Op2Itr = RegRewriteTable.find(Op2Reg);
+ if (Op2Itr != RegRewriteTable.end())
+ Op2Reg = Op2Itr->second.second;
+
+ // SinkMBB:
+ // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, MBB ]
+ // ...
+ MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
+ .addReg(Op1Reg)
+ .addMBB(FalseMBB)
+ .addReg(Op2Reg)
+ .addMBB(MBB);
+ (void)MIB;
+ LLVM_DEBUG(dbgs() << "\tFrom: "; MIIt->dump());
+ LLVM_DEBUG(dbgs() << "\tTo: "; MIB->dump());
+
+ // Add this PHI to the rewrite table.
+ RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
+ }
+
+ // Now remove the CMOV(s).
+ MBB->erase(MIItBegin, MIItEnd);
+}
+
+INITIALIZE_PASS_BEGIN(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
+ false, false)
+
+FunctionPass *llvm::createX86CmovConverterPass() {
+ return new X86CmovConverterPass();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp
new file mode 100644
index 000000000000..2ff8ee19561b
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp
@@ -0,0 +1,184 @@
+//===- X86DiscriminateMemOps.cpp - Unique IDs for Mem Ops -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// This pass aids profile-driven cache prefetch insertion by ensuring all
+/// instructions that have a memory operand are distinguishable from each other.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/ProfileData/SampleProf.h"
+#include "llvm/ProfileData/SampleProfReader.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/IPO/SampleProfile.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-discriminate-memops"
+
+static cl::opt<bool> EnableDiscriminateMemops(
+ DEBUG_TYPE, cl::init(false),
+ cl::desc("Generate unique debug info for each instruction with a memory "
+ "operand. Should be enabled for profile-driven cache prefetching, "
+ "both in the build of the binary being profiled, as well as in "
+ "the build of the binary consuming the profile."),
+ cl::Hidden);
+
+static cl::opt<bool> BypassPrefetchInstructions(
+ "x86-bypass-prefetch-instructions", cl::init(true),
+ cl::desc("When discriminating instructions with memory operands, ignore "
+ "prefetch instructions. This ensures the other memory operand "
+ "instructions have the same identifiers after inserting "
+ "prefetches, allowing for successive insertions."),
+ cl::Hidden);
+
+namespace {
+
+using Location = std::pair<StringRef, unsigned>;
+
+Location diToLocation(const DILocation *Loc) {
+ return std::make_pair(Loc->getFilename(), Loc->getLine());
+}
+
+/// Ensure each instruction having a memory operand has a distinct <LineNumber,
+/// Discriminator> pair.
+void updateDebugInfo(MachineInstr *MI, const DILocation *Loc) {
+ DebugLoc DL(Loc);
+ MI->setDebugLoc(DL);
+}
+
+class X86DiscriminateMemOps : public MachineFunctionPass {
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ StringRef getPassName() const override {
+ return "X86 Discriminate Memory Operands";
+ }
+
+public:
+ static char ID;
+
+ /// Default construct and initialize the pass.
+ X86DiscriminateMemOps();
+};
+
+bool IsPrefetchOpcode(unsigned Opcode) {
+ return Opcode == X86::PREFETCHNTA || Opcode == X86::PREFETCHT0 ||
+ Opcode == X86::PREFETCHT1 || Opcode == X86::PREFETCHT2;
+}
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// Implementation
+//===----------------------------------------------------------------------===//
+
+char X86DiscriminateMemOps::ID = 0;
+
+/// Default construct and initialize the pass.
+X86DiscriminateMemOps::X86DiscriminateMemOps() : MachineFunctionPass(ID) {}
+
+bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) {
+ if (!EnableDiscriminateMemops)
+ return false;
+
+ DISubprogram *FDI = MF.getFunction().getSubprogram();
+ if (!FDI || !FDI->getUnit()->getDebugInfoForProfiling())
+ return false;
+
+  // Have a default DILocation in case we find instructions with memops that
+  // don't have any debug info.
+ const DILocation *ReferenceDI =
+ DILocation::get(FDI->getContext(), FDI->getLine(), 0, FDI);
+ assert(ReferenceDI && "ReferenceDI should not be nullptr");
+ DenseMap<Location, unsigned> MemOpDiscriminators;
+ MemOpDiscriminators[diToLocation(ReferenceDI)] = 0;
+
+ // Figure out the largest discriminator issued for each Location. When we
+ // issue new discriminators, we can thus avoid issuing discriminators
+ // belonging to instructions that don't have memops. This isn't a requirement
+  // for the goals of this pass; however, it avoids unnecessary ambiguity.
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ const auto &DI = MI.getDebugLoc();
+ if (!DI)
+ continue;
+ if (BypassPrefetchInstructions && IsPrefetchOpcode(MI.getDesc().Opcode))
+ continue;
+ Location Loc = diToLocation(DI);
+ MemOpDiscriminators[Loc] =
+ std::max(MemOpDiscriminators[Loc], DI->getBaseDiscriminator());
+ }
+ }
+
+ // Keep track of the discriminators seen at each Location. If an instruction's
+ // DebugInfo has a Location and discriminator we've already seen, replace its
+ // discriminator with a new one, to guarantee uniqueness.
+ DenseMap<Location, DenseSet<unsigned>> Seen;
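+  // Example (for illustration): if two memory-operand instructions both carry
+  // the location <foo.c, 10> with base discriminator 0, the first keeps its
+  // DILocation, while the second gets a clone whose base discriminator is
+  // bumped to MemOpDiscriminators[L] + 1, making the two instructions
+  // distinguishable to the profile consumer.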
+
+ bool Changed = false;
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ if (X86II::getMemoryOperandNo(MI.getDesc().TSFlags) < 0)
+ continue;
+ if (BypassPrefetchInstructions && IsPrefetchOpcode(MI.getDesc().Opcode))
+ continue;
+ const DILocation *DI = MI.getDebugLoc();
+ bool HasDebug = DI;
+ if (!HasDebug) {
+ DI = ReferenceDI;
+ }
+ Location L = diToLocation(DI);
+ DenseSet<unsigned> &Set = Seen[L];
+ const std::pair<DenseSet<unsigned>::iterator, bool> TryInsert =
+ Set.insert(DI->getBaseDiscriminator());
+ if (!TryInsert.second || !HasDebug) {
+ unsigned BF, DF, CI = 0;
+ DILocation::decodeDiscriminator(DI->getDiscriminator(), BF, DF, CI);
+ Optional<unsigned> EncodedDiscriminator = DILocation::encodeDiscriminator(
+ MemOpDiscriminators[L] + 1, DF, CI);
+
+ if (!EncodedDiscriminator) {
+        // FIXME(mtrofin): The assumption is that this scenario is infrequent/OK
+        // not to support. If evidence points otherwise, we can explore
+        // synthesizing unique DIs by adding fake line numbers, or by
+        // constructing 64-bit discriminators.
+ LLVM_DEBUG(dbgs() << "Unable to create a unique discriminator "
+ "for instruction with memory operand in: "
+ << DI->getFilename() << " Line: " << DI->getLine()
+ << " Column: " << DI->getColumn()
+ << ". This is likely due to a large macro expansion. \n");
+ continue;
+ }
+ // Since we were able to encode, bump the MemOpDiscriminators.
+ ++MemOpDiscriminators[L];
+ DI = DI->cloneWithDiscriminator(EncodedDiscriminator.getValue());
+ assert(DI && "DI should not be nullptr");
+ updateDebugInfo(&MI, DI);
+ Changed = true;
+ std::pair<DenseSet<unsigned>::iterator, bool> MustInsert =
+ Set.insert(DI->getBaseDiscriminator());
+ (void)MustInsert; // Silence warning in release build.
+ assert(MustInsert.second && "New discriminator shouldn't be present in set");
+ }
+
+ // Bump the reference DI to avoid cramming discriminators on line 0.
+ // FIXME(mtrofin): pin ReferenceDI on blocks or first instruction with DI
+ // in a block. It's more consistent than just relying on the last memop
+ // instruction we happened to see.
+ ReferenceDI = DI;
+ }
+ }
+ return Changed;
+}
+
+FunctionPass *llvm::createX86DiscriminateMemOpsPass() {
+ return new X86DiscriminateMemOps();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp
new file mode 100644
index 000000000000..a2ae6345c006
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp
@@ -0,0 +1,793 @@
+//===--- X86DomainReassignment.cpp - Selectively switch register classes---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass attempts to find instruction chains (closures) in one domain,
+// and convert them to equivalent instructions in a different domain,
+// if profitable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Printable.h"
+#include <bitset>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-domain-reassignment"
+
+STATISTIC(NumClosuresConverted, "Number of closures converted by the pass");
+
+static cl::opt<bool> DisableX86DomainReassignment(
+ "disable-x86-domain-reassignment", cl::Hidden,
+ cl::desc("X86: Disable Virtual Register Reassignment."), cl::init(false));
+
+namespace {
+enum RegDomain { NoDomain = -1, GPRDomain, MaskDomain, OtherDomain, NumDomains };
+
+static bool isGPR(const TargetRegisterClass *RC) {
+ return X86::GR64RegClass.hasSubClassEq(RC) ||
+ X86::GR32RegClass.hasSubClassEq(RC) ||
+ X86::GR16RegClass.hasSubClassEq(RC) ||
+ X86::GR8RegClass.hasSubClassEq(RC);
+}
+
+static bool isMask(const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) {
+ return X86::VK16RegClass.hasSubClassEq(RC);
+}
+
+static RegDomain getDomain(const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) {
+ if (isGPR(RC))
+ return GPRDomain;
+ if (isMask(RC, TRI))
+ return MaskDomain;
+ return OtherDomain;
+}
+
+/// Return a register class equivalent to \p SrcRC, in \p Domain.
+static const TargetRegisterClass *getDstRC(const TargetRegisterClass *SrcRC,
+ RegDomain Domain) {
+ assert(Domain == MaskDomain && "add domain");
+ if (X86::GR8RegClass.hasSubClassEq(SrcRC))
+ return &X86::VK8RegClass;
+ if (X86::GR16RegClass.hasSubClassEq(SrcRC))
+ return &X86::VK16RegClass;
+ if (X86::GR32RegClass.hasSubClassEq(SrcRC))
+ return &X86::VK32RegClass;
+ if (X86::GR64RegClass.hasSubClassEq(SrcRC))
+ return &X86::VK64RegClass;
+ llvm_unreachable("add register class");
+ return nullptr;
+}
+
+/// Abstract Instruction Converter class.
+class InstrConverterBase {
+protected:
+ unsigned SrcOpcode;
+
+public:
+ InstrConverterBase(unsigned SrcOpcode) : SrcOpcode(SrcOpcode) {}
+
+ virtual ~InstrConverterBase() {}
+
+ /// \returns true if \p MI is legal to convert.
+ virtual bool isLegal(const MachineInstr *MI,
+ const TargetInstrInfo *TII) const {
+ assert(MI->getOpcode() == SrcOpcode &&
+ "Wrong instruction passed to converter");
+ return true;
+ }
+
+ /// Applies conversion to \p MI.
+ ///
+  /// \returns true if \p MI is no longer needed and can be deleted.
+ virtual bool convertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
+ MachineRegisterInfo *MRI) const = 0;
+
+ /// \returns the cost increment incurred by converting \p MI.
+ virtual double getExtraCost(const MachineInstr *MI,
+ MachineRegisterInfo *MRI) const = 0;
+};
+
+/// An Instruction Converter which ignores the given instruction.
+/// For example, PHI instructions can be safely ignored since only the registers
+/// need to change.
+class InstrIgnore : public InstrConverterBase {
+public:
+ InstrIgnore(unsigned SrcOpcode) : InstrConverterBase(SrcOpcode) {}
+
+ bool convertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
+ MachineRegisterInfo *MRI) const override {
+ assert(isLegal(MI, TII) && "Cannot convert instruction");
+ return false;
+ }
+
+ double getExtraCost(const MachineInstr *MI,
+ MachineRegisterInfo *MRI) const override {
+ return 0;
+ }
+};
+
+/// An Instruction Converter which replaces an instruction with another.
+class InstrReplacer : public InstrConverterBase {
+public:
+ /// Opcode of the destination instruction.
+ unsigned DstOpcode;
+
+ InstrReplacer(unsigned SrcOpcode, unsigned DstOpcode)
+ : InstrConverterBase(SrcOpcode), DstOpcode(DstOpcode) {}
+
+ bool isLegal(const MachineInstr *MI,
+ const TargetInstrInfo *TII) const override {
+ if (!InstrConverterBase::isLegal(MI, TII))
+ return false;
+    // It's illegal to replace an instruction that implicitly defines a register
+    // with an instruction that doesn't, unless that register is dead.
+ for (const auto &MO : MI->implicit_operands())
+ if (MO.isReg() && MO.isDef() && !MO.isDead() &&
+ !TII->get(DstOpcode).hasImplicitDefOfPhysReg(MO.getReg()))
+ return false;
+ return true;
+ }
+
+ bool convertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
+ MachineRegisterInfo *MRI) const override {
+ assert(isLegal(MI, TII) && "Cannot convert instruction");
+ MachineInstrBuilder Bld =
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(DstOpcode));
+ // Transfer explicit operands from original instruction. Implicit operands
+ // are handled by BuildMI.
+ for (auto &Op : MI->explicit_operands())
+ Bld.add(Op);
+ return true;
+ }
+
+ double getExtraCost(const MachineInstr *MI,
+ MachineRegisterInfo *MRI) const override {
+ // Assuming instructions have the same cost.
+ return 0;
+ }
+};
+
+/// An Instruction Converter which replaces an instruction with another, and
+/// adds a COPY from the new instruction's destination to the old one's.
+class InstrReplacerDstCOPY : public InstrConverterBase {
+public:
+ unsigned DstOpcode;
+
+ InstrReplacerDstCOPY(unsigned SrcOpcode, unsigned DstOpcode)
+ : InstrConverterBase(SrcOpcode), DstOpcode(DstOpcode) {}
+
+ bool convertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
+ MachineRegisterInfo *MRI) const override {
+ assert(isLegal(MI, TII) && "Cannot convert instruction");
+ MachineBasicBlock *MBB = MI->getParent();
+ const DebugLoc &DL = MI->getDebugLoc();
+
+ Register Reg = MRI->createVirtualRegister(
+ TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo(),
+ *MBB->getParent()));
+ MachineInstrBuilder Bld = BuildMI(*MBB, MI, DL, TII->get(DstOpcode), Reg);
+ for (unsigned Idx = 1, End = MI->getNumOperands(); Idx < End; ++Idx)
+ Bld.add(MI->getOperand(Idx));
+
+ BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY))
+ .add(MI->getOperand(0))
+ .addReg(Reg);
+
+ return true;
+ }
+
+ double getExtraCost(const MachineInstr *MI,
+ MachineRegisterInfo *MRI) const override {
+ // Assuming instructions have the same cost, and that COPY is in the same
+ // domain so it will be eliminated.
+ return 0;
+ }
+};
+
+/// An Instruction Converter for replacing COPY instructions.
+class InstrCOPYReplacer : public InstrReplacer {
+public:
+ RegDomain DstDomain;
+
+ InstrCOPYReplacer(unsigned SrcOpcode, RegDomain DstDomain, unsigned DstOpcode)
+ : InstrReplacer(SrcOpcode, DstOpcode), DstDomain(DstDomain) {}
+
+ bool isLegal(const MachineInstr *MI,
+ const TargetInstrInfo *TII) const override {
+ if (!InstrConverterBase::isLegal(MI, TII))
+ return false;
+
+    // Don't allow copies to/from GR8/GR16 physical registers.
+ // FIXME: Is there some better way to support this?
+ Register DstReg = MI->getOperand(0).getReg();
+ if (DstReg.isPhysical() && (X86::GR8RegClass.contains(DstReg) ||
+ X86::GR16RegClass.contains(DstReg)))
+ return false;
+ Register SrcReg = MI->getOperand(1).getReg();
+ if (SrcReg.isPhysical() && (X86::GR8RegClass.contains(SrcReg) ||
+ X86::GR16RegClass.contains(SrcReg)))
+ return false;
+
+ return true;
+ }
+
+ double getExtraCost(const MachineInstr *MI,
+ MachineRegisterInfo *MRI) const override {
+ assert(MI->getOpcode() == TargetOpcode::COPY && "Expected a COPY");
+
+ for (const auto &MO : MI->operands()) {
+ // Physical registers will not be converted. Assume that converting the
+      // COPY to the destination domain will eventually result in an actual
+      // instruction.
+ if (Register::isPhysicalRegister(MO.getReg()))
+ return 1;
+
+ RegDomain OpDomain = getDomain(MRI->getRegClass(MO.getReg()),
+ MRI->getTargetRegisterInfo());
+      // Converting a cross-domain COPY to a same-domain COPY should eliminate
+      // an instruction.
+ if (OpDomain == DstDomain)
+ return -1;
+ }
+ return 0;
+ }
+};
+
+/// An Instruction Converter which replaces an instruction with a COPY.
+class InstrReplaceWithCopy : public InstrConverterBase {
+public:
+ // Source instruction operand Index, to be used as the COPY source.
+ unsigned SrcOpIdx;
+
+ InstrReplaceWithCopy(unsigned SrcOpcode, unsigned SrcOpIdx)
+ : InstrConverterBase(SrcOpcode), SrcOpIdx(SrcOpIdx) {}
+
+ bool convertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
+ MachineRegisterInfo *MRI) const override {
+ assert(isLegal(MI, TII) && "Cannot convert instruction");
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(TargetOpcode::COPY))
+ .add({MI->getOperand(0), MI->getOperand(SrcOpIdx)});
+ return true;
+ }
+
+ double getExtraCost(const MachineInstr *MI,
+ MachineRegisterInfo *MRI) const override {
+ return 0;
+ }
+};
+
+// Key type to be used by the Instruction Converters map.
+// A converter is identified by <destination domain, source opcode>
+typedef std::pair<int, unsigned> InstrConverterBaseKeyTy;
+
+typedef DenseMap<InstrConverterBaseKeyTy, std::unique_ptr<InstrConverterBase>>
+ InstrConverterBaseMap;
+
+/// A closure is a set of virtual registers representing all of the edges in
+/// the closure, as well as all of the instructions connected by those edges.
+///
+/// A closure may encompass virtual registers in the same register bank that
+/// have different widths. For example, it may contain 32-bit GPRs as well as
+/// 64-bit GPRs.
+///
+/// A closure that computes an address (i.e. defines a virtual register that is
+/// used in a memory operand) excludes the instructions that contain memory
+/// operands using the address. Such an instruction will be included in a
+/// different closure that manipulates the loaded or stored value.
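+///
+/// Example (for illustration): a value that is copied out of a mask register,
+/// combined with other values using GPR instructions, and copied back into a
+/// mask register forms one closure whose registers all start in GPRDomain. If
+/// every instruction in it has a converter registered for MaskDomain, the
+/// closure can be reassigned to MaskDomain, turning the cross-domain COPYs
+/// into same-domain COPYs that can later be eliminated.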
+class Closure {
+private:
+ /// Virtual registers in the closure.
+ DenseSet<Register> Edges;
+
+ /// Instructions in the closure.
+ SmallVector<MachineInstr *, 8> Instrs;
+
+ /// Domains which this closure can legally be reassigned to.
+ std::bitset<NumDomains> LegalDstDomains;
+
+ /// An ID to uniquely identify this closure, even when it gets
+ /// moved around
+ unsigned ID;
+
+public:
+ Closure(unsigned ID, std::initializer_list<RegDomain> LegalDstDomainList) : ID(ID) {
+ for (RegDomain D : LegalDstDomainList)
+ LegalDstDomains.set(D);
+ }
+
+ /// Mark this closure as illegal for reassignment to all domains.
+ void setAllIllegal() { LegalDstDomains.reset(); }
+
+ /// \returns true if this closure has domains which are legal to reassign to.
+ bool hasLegalDstDomain() const { return LegalDstDomains.any(); }
+
+ /// \returns true if is legal to reassign this closure to domain \p RD.
+ bool isLegal(RegDomain RD) const { return LegalDstDomains[RD]; }
+
+ /// Mark this closure as illegal for reassignment to domain \p RD.
+ void setIllegal(RegDomain RD) { LegalDstDomains[RD] = false; }
+
+ bool empty() const { return Edges.empty(); }
+
+ bool insertEdge(Register Reg) { return Edges.insert(Reg).second; }
+
+ using const_edge_iterator = DenseSet<Register>::const_iterator;
+ iterator_range<const_edge_iterator> edges() const {
+ return iterator_range<const_edge_iterator>(Edges.begin(), Edges.end());
+ }
+
+ void addInstruction(MachineInstr *I) {
+ Instrs.push_back(I);
+ }
+
+ ArrayRef<MachineInstr *> instructions() const {
+ return Instrs;
+ }
+
+ LLVM_DUMP_METHOD void dump(const MachineRegisterInfo *MRI) const {
+ dbgs() << "Registers: ";
+ bool First = true;
+ for (Register Reg : Edges) {
+ if (!First)
+ dbgs() << ", ";
+ First = false;
+ dbgs() << printReg(Reg, MRI->getTargetRegisterInfo(), 0, MRI);
+ }
+ dbgs() << "\n" << "Instructions:";
+ for (MachineInstr *MI : Instrs) {
+ dbgs() << "\n ";
+ MI->print(dbgs());
+ }
+ dbgs() << "\n";
+ }
+
+ unsigned getID() const {
+ return ID;
+ }
+
+};
+
+class X86DomainReassignment : public MachineFunctionPass {
+ const X86Subtarget *STI = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ const X86InstrInfo *TII = nullptr;
+
+ /// All edges that are included in some closure
+ DenseSet<unsigned> EnclosedEdges;
+
+ /// All instructions that are included in some closure.
+ DenseMap<MachineInstr *, unsigned> EnclosedInstrs;
+
+public:
+ static char ID;
+
+ X86DomainReassignment() : MachineFunctionPass(ID) { }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override {
+ return "X86 Domain Reassignment Pass";
+ }
+
+private:
+ /// A map of available Instruction Converters.
+ InstrConverterBaseMap Converters;
+
+ /// Initialize Converters map.
+ void initConverters();
+
+ /// Starting from \p Reg, expand the closure as much as possible.
+ void buildClosure(Closure &, Register Reg);
+
+ /// Enqueue \p Reg to be considered for addition to the closure.
+ void visitRegister(Closure &, Register Reg, RegDomain &Domain,
+ SmallVectorImpl<unsigned> &Worklist);
+
+ /// Reassign the closure to \p Domain.
+ void reassign(const Closure &C, RegDomain Domain) const;
+
+ /// Add \p MI to the closure.
+ void encloseInstr(Closure &C, MachineInstr *MI);
+
+ /// \returns true if it is profitable to reassign the closure to \p Domain.
+ bool isReassignmentProfitable(const Closure &C, RegDomain Domain) const;
+
+ /// Calculate the total cost of reassigning the closure to \p Domain.
+ double calculateCost(const Closure &C, RegDomain Domain) const;
+};
+
+char X86DomainReassignment::ID = 0;
+
+} // End anonymous namespace.
+
+void X86DomainReassignment::visitRegister(Closure &C, Register Reg,
+ RegDomain &Domain,
+ SmallVectorImpl<unsigned> &Worklist) {
+ if (EnclosedEdges.count(Reg))
+ return;
+
+ if (!Reg.isVirtual())
+ return;
+
+ if (!MRI->hasOneDef(Reg))
+ return;
+
+ RegDomain RD = getDomain(MRI->getRegClass(Reg), MRI->getTargetRegisterInfo());
+ // First edge in closure sets the domain.
+ if (Domain == NoDomain)
+ Domain = RD;
+
+ if (Domain != RD)
+ return;
+
+ Worklist.push_back(Reg);
+}
+
+void X86DomainReassignment::encloseInstr(Closure &C, MachineInstr *MI) {
+ auto I = EnclosedInstrs.find(MI);
+ if (I != EnclosedInstrs.end()) {
+ if (I->second != C.getID())
+ // Instruction already belongs to another closure; to avoid conflicts
+ // between closures, mark this closure as illegal.
+ C.setAllIllegal();
+ return;
+ }
+
+ EnclosedInstrs[MI] = C.getID();
+ C.addInstruction(MI);
+
+ // Mark closure as illegal for reassignment to domains, if there is no
+ // converter for the instruction or if the converter cannot convert the
+ // instruction.
+ for (int i = 0; i != NumDomains; ++i) {
+ if (C.isLegal((RegDomain)i)) {
+ auto I = Converters.find({i, MI->getOpcode()});
+ if (I == Converters.end() || !I->second->isLegal(MI, TII))
+ C.setIllegal((RegDomain)i);
+ }
+ }
+}
+
+double X86DomainReassignment::calculateCost(const Closure &C,
+ RegDomain DstDomain) const {
+ assert(C.isLegal(DstDomain) && "Cannot calculate cost for illegal closure");
+
+ double Cost = 0.0;
+ for (auto *MI : C.instructions())
+ Cost += Converters.find({DstDomain, MI->getOpcode()})
+ ->second->getExtraCost(MI, MRI);
+ return Cost;
+}
+
+bool X86DomainReassignment::isReassignmentProfitable(const Closure &C,
+ RegDomain Domain) const {
+ return calculateCost(C, Domain) < 0.0;
+}
+
+void X86DomainReassignment::reassign(const Closure &C, RegDomain Domain) const {
+ assert(C.isLegal(Domain) && "Cannot convert illegal closure");
+
+ // Iterate all instructions in the closure, convert each one using the
+ // appropriate converter.
+ SmallVector<MachineInstr *, 8> ToErase;
+ for (auto *MI : C.instructions())
+ if (Converters.find({Domain, MI->getOpcode()})
+ ->second->convertInstr(MI, TII, MRI))
+ ToErase.push_back(MI);
+
+ // Iterate all registers in the closure, replace them with registers in the
+ // destination domain.
+ for (Register Reg : C.edges()) {
+ MRI->setRegClass(Reg, getDstRC(MRI->getRegClass(Reg), Domain));
+ for (auto &MO : MRI->use_operands(Reg)) {
+ if (MO.isReg())
+ // Remove all subregister references as they are not valid in the
+ // destination domain.
+ MO.setSubReg(0);
+ }
+ }
+
+ for (auto *MI : ToErase)
+ MI->eraseFromParent();
+}
+
+/// \returns true when \p Reg is used as part of an address calculation in \p
+/// MI.
+static bool usedAsAddr(const MachineInstr &MI, Register Reg,
+ const TargetInstrInfo *TII) {
+ if (!MI.mayLoadOrStore())
+ return false;
+
+ const MCInstrDesc &Desc = TII->get(MI.getOpcode());
+ int MemOpStart = X86II::getMemoryOperandNo(Desc.TSFlags);
+ if (MemOpStart == -1)
+ return false;
+
+ MemOpStart += X86II::getOperandBias(Desc);
+ for (unsigned MemOpIdx = MemOpStart,
+ MemOpEnd = MemOpStart + X86::AddrNumOperands;
+ MemOpIdx < MemOpEnd; ++MemOpIdx) {
+ const MachineOperand &Op = MI.getOperand(MemOpIdx);
+ if (Op.isReg() && Op.getReg() == Reg)
+ return true;
+ }
+ return false;
+}
+
+void X86DomainReassignment::buildClosure(Closure &C, Register Reg) {
+ SmallVector<unsigned, 4> Worklist;
+ RegDomain Domain = NoDomain;
+ visitRegister(C, Reg, Domain, Worklist);
+ while (!Worklist.empty()) {
+ unsigned CurReg = Worklist.pop_back_val();
+
+ // Register already in this closure.
+ if (!C.insertEdge(CurReg))
+ continue;
+ EnclosedEdges.insert(CurReg);
+
+ MachineInstr *DefMI = MRI->getVRegDef(CurReg);
+ encloseInstr(C, DefMI);
+
+ // Add register used by the defining MI to the worklist.
+ // Do not add registers which are used in address calculation, they will be
+ // added to a different closure.
+ int OpEnd = DefMI->getNumOperands();
+ const MCInstrDesc &Desc = DefMI->getDesc();
+ int MemOp = X86II::getMemoryOperandNo(Desc.TSFlags);
+ if (MemOp != -1)
+ MemOp += X86II::getOperandBias(Desc);
+ for (int OpIdx = 0; OpIdx < OpEnd; ++OpIdx) {
+ if (OpIdx == MemOp) {
+ // skip address calculation.
+ OpIdx += (X86::AddrNumOperands - 1);
+ continue;
+ }
+ auto &Op = DefMI->getOperand(OpIdx);
+ if (!Op.isReg() || !Op.isUse())
+ continue;
+ visitRegister(C, Op.getReg(), Domain, Worklist);
+ }
+
+ // Expand closure through register uses.
+ for (auto &UseMI : MRI->use_nodbg_instructions(CurReg)) {
+ // We would like to avoid converting closures which calculate addresses,
+ // as these should remain in GPRs.
+ if (usedAsAddr(UseMI, CurReg, TII)) {
+ C.setAllIllegal();
+ continue;
+ }
+ encloseInstr(C, &UseMI);
+
+ for (auto &DefOp : UseMI.defs()) {
+ if (!DefOp.isReg())
+ continue;
+
+ Register DefReg = DefOp.getReg();
+ if (!DefReg.isVirtual()) {
+ C.setAllIllegal();
+ continue;
+ }
+ visitRegister(C, DefReg, Domain, Worklist);
+ }
+ }
+ }
+}
+
+void X86DomainReassignment::initConverters() {
+ Converters[{MaskDomain, TargetOpcode::PHI}] =
+ std::make_unique<InstrIgnore>(TargetOpcode::PHI);
+
+ Converters[{MaskDomain, TargetOpcode::IMPLICIT_DEF}] =
+ std::make_unique<InstrIgnore>(TargetOpcode::IMPLICIT_DEF);
+
+ Converters[{MaskDomain, TargetOpcode::INSERT_SUBREG}] =
+ std::make_unique<InstrReplaceWithCopy>(TargetOpcode::INSERT_SUBREG, 2);
+
+ Converters[{MaskDomain, TargetOpcode::COPY}] =
+ std::make_unique<InstrCOPYReplacer>(TargetOpcode::COPY, MaskDomain,
+ TargetOpcode::COPY);
+
+ auto createReplacerDstCOPY = [&](unsigned From, unsigned To) {
+ Converters[{MaskDomain, From}] =
+ std::make_unique<InstrReplacerDstCOPY>(From, To);
+ };
+
+ createReplacerDstCOPY(X86::MOVZX32rm16, X86::KMOVWkm);
+ createReplacerDstCOPY(X86::MOVZX64rm16, X86::KMOVWkm);
+
+ createReplacerDstCOPY(X86::MOVZX32rr16, X86::KMOVWkk);
+ createReplacerDstCOPY(X86::MOVZX64rr16, X86::KMOVWkk);
+
+ if (STI->hasDQI()) {
+ createReplacerDstCOPY(X86::MOVZX16rm8, X86::KMOVBkm);
+ createReplacerDstCOPY(X86::MOVZX32rm8, X86::KMOVBkm);
+ createReplacerDstCOPY(X86::MOVZX64rm8, X86::KMOVBkm);
+
+ createReplacerDstCOPY(X86::MOVZX16rr8, X86::KMOVBkk);
+ createReplacerDstCOPY(X86::MOVZX32rr8, X86::KMOVBkk);
+ createReplacerDstCOPY(X86::MOVZX64rr8, X86::KMOVBkk);
+ }
+
+ auto createReplacer = [&](unsigned From, unsigned To) {
+ Converters[{MaskDomain, From}] = std::make_unique<InstrReplacer>(From, To);
+ };
+
+ createReplacer(X86::MOV16rm, X86::KMOVWkm);
+ createReplacer(X86::MOV16mr, X86::KMOVWmk);
+ createReplacer(X86::MOV16rr, X86::KMOVWkk);
+ createReplacer(X86::SHR16ri, X86::KSHIFTRWri);
+ createReplacer(X86::SHL16ri, X86::KSHIFTLWri);
+ createReplacer(X86::NOT16r, X86::KNOTWrr);
+ createReplacer(X86::OR16rr, X86::KORWrr);
+ createReplacer(X86::AND16rr, X86::KANDWrr);
+ createReplacer(X86::XOR16rr, X86::KXORWrr);
+
+ if (STI->hasBWI()) {
+ createReplacer(X86::MOV32rm, X86::KMOVDkm);
+ createReplacer(X86::MOV64rm, X86::KMOVQkm);
+
+ createReplacer(X86::MOV32mr, X86::KMOVDmk);
+ createReplacer(X86::MOV64mr, X86::KMOVQmk);
+
+ createReplacer(X86::MOV32rr, X86::KMOVDkk);
+ createReplacer(X86::MOV64rr, X86::KMOVQkk);
+
+ createReplacer(X86::SHR32ri, X86::KSHIFTRDri);
+ createReplacer(X86::SHR64ri, X86::KSHIFTRQri);
+
+ createReplacer(X86::SHL32ri, X86::KSHIFTLDri);
+ createReplacer(X86::SHL64ri, X86::KSHIFTLQri);
+
+ createReplacer(X86::ADD32rr, X86::KADDDrr);
+ createReplacer(X86::ADD64rr, X86::KADDQrr);
+
+ createReplacer(X86::NOT32r, X86::KNOTDrr);
+ createReplacer(X86::NOT64r, X86::KNOTQrr);
+
+ createReplacer(X86::OR32rr, X86::KORDrr);
+ createReplacer(X86::OR64rr, X86::KORQrr);
+
+ createReplacer(X86::AND32rr, X86::KANDDrr);
+ createReplacer(X86::AND64rr, X86::KANDQrr);
+
+ createReplacer(X86::ANDN32rr, X86::KANDNDrr);
+ createReplacer(X86::ANDN64rr, X86::KANDNQrr);
+
+ createReplacer(X86::XOR32rr, X86::KXORDrr);
+ createReplacer(X86::XOR64rr, X86::KXORQrr);
+
+ // TODO: KTEST is not a replacement for TEST due to flag differences. Need
+ // to prove only Z flag is used.
+ //createReplacer(X86::TEST32rr, X86::KTESTDrr);
+ //createReplacer(X86::TEST64rr, X86::KTESTQrr);
+ }
+
+ if (STI->hasDQI()) {
+ createReplacer(X86::ADD8rr, X86::KADDBrr);
+ createReplacer(X86::ADD16rr, X86::KADDWrr);
+
+ createReplacer(X86::AND8rr, X86::KANDBrr);
+
+ createReplacer(X86::MOV8rm, X86::KMOVBkm);
+ createReplacer(X86::MOV8mr, X86::KMOVBmk);
+ createReplacer(X86::MOV8rr, X86::KMOVBkk);
+
+ createReplacer(X86::NOT8r, X86::KNOTBrr);
+
+ createReplacer(X86::OR8rr, X86::KORBrr);
+
+ createReplacer(X86::SHR8ri, X86::KSHIFTRBri);
+ createReplacer(X86::SHL8ri, X86::KSHIFTLBri);
+
+ // TODO: KTEST is not a replacement for TEST due to flag differences. Need
+ // to prove only Z flag is used.
+ //createReplacer(X86::TEST8rr, X86::KTESTBrr);
+ //createReplacer(X86::TEST16rr, X86::KTESTWrr);
+
+ createReplacer(X86::XOR8rr, X86::KXORBrr);
+ }
+}
+
+bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+ if (DisableX86DomainReassignment)
+ return false;
+
+ LLVM_DEBUG(
+ dbgs() << "***** Machine Function before Domain Reassignment *****\n");
+ LLVM_DEBUG(MF.print(dbgs()));
+
+ STI = &MF.getSubtarget<X86Subtarget>();
+ // GPR->K is the only transformation currently supported; bail out early if
+ // there is no AVX512.
+ // TODO: We're also bailing if AVX512BW isn't supported since we use VK32 and
+ // VK64 for GR32/GR64, but those aren't legal classes on KNL. If the register
+ // coalescer doesn't clean it up and we generate a spill we will crash.
+ if (!STI->hasAVX512() || !STI->hasBWI())
+ return false;
+
+ MRI = &MF.getRegInfo();
+ assert(MRI->isSSA() && "Expected MIR to be in SSA form");
+
+ TII = STI->getInstrInfo();
+ initConverters();
+ bool Changed = false;
+
+ EnclosedEdges.clear();
+ EnclosedInstrs.clear();
+
+ std::vector<Closure> Closures;
+
+ // Go over all virtual registers and calculate a closure.
+ unsigned ClosureID = 0;
+ for (unsigned Idx = 0; Idx < MRI->getNumVirtRegs(); ++Idx) {
+ Register Reg = Register::index2VirtReg(Idx);
+
+ // GPR is the only source domain currently supported.
+ if (!isGPR(MRI->getRegClass(Reg)))
+ continue;
+
+ // Register already in closure.
+ if (EnclosedEdges.count(Reg))
+ continue;
+
+ // Calculate closure starting with Reg.
+ Closure C(ClosureID++, {MaskDomain});
+ buildClosure(C, Reg);
+
+ // Collect all closures that can potentially be converted.
+ if (!C.empty() && C.isLegal(MaskDomain))
+ Closures.push_back(std::move(C));
+ }
+
+ for (Closure &C : Closures) {
+ LLVM_DEBUG(C.dump(MRI));
+ if (isReassignmentProfitable(C, MaskDomain)) {
+ reassign(C, MaskDomain);
+ ++NumClosuresConverted;
+ Changed = true;
+ }
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "***** Machine Function after Domain Reassignment *****\n");
+ LLVM_DEBUG(MF.print(dbgs()));
+
+ return Changed;
+}
+
+INITIALIZE_PASS(X86DomainReassignment, "x86-domain-reassignment",
+ "X86 Domain Reassignment Pass", false, false)
+
+/// Returns an instance of the Domain Reassignment pass.
+FunctionPass *llvm::createX86DomainReassignmentPass() {
+ return new X86DomainReassignment();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp
new file mode 100644
index 000000000000..97f843fa24eb
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp
@@ -0,0 +1,295 @@
+//===- X86EvexToVex.cpp ---------------------------------------------------===//
+// Compress EVEX instructions to VEX encoding when possible to reduce code size
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file defines the pass that goes over all AVX-512 instructions which
+/// are encoded using the EVEX prefix and if possible replaces them by their
+/// corresponding VEX encoding which is usually shorter by 2 bytes.
+/// EVEX instructions may be encoded via the VEX prefix when the AVX-512
+/// instruction has a corresponding AVX/AVX2 opcode, when the vector length
+/// accessed by the instruction is less than 512 bits, and when the instruction
+/// does not use the mask registers or xmm/ymm registers with indexes higher
+/// than 15.
+/// The pass applies this code size reduction to the generated code for AVX-512
+/// instructions.
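+///
+/// For example (illustrative, not from this file): "vaddps %xmm2, %xmm1, %xmm1"
+/// has both an EVEX (AVX-512VL) and a VEX (AVX) encoding; as long as no mask,
+/// broadcast, zmm register or xmm16-xmm31/ymm16-ymm31 operand is involved, the
+/// shorter VEX form can be emitted instead.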
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86InstComments.h"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Pass.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+
+// Including the generated EVEX2VEX tables.
+struct X86EvexToVexCompressTableEntry {
+ uint16_t EvexOpcode;
+ uint16_t VexOpcode;
+
+ bool operator<(const X86EvexToVexCompressTableEntry &RHS) const {
+ return EvexOpcode < RHS.EvexOpcode;
+ }
+
+ friend bool operator<(const X86EvexToVexCompressTableEntry &TE,
+ unsigned Opc) {
+ return TE.EvexOpcode < Opc;
+ }
+};
+#include "X86GenEVEX2VEXTables.inc"
+
+#define EVEX2VEX_DESC "Compressing EVEX instrs to VEX encoding when possible"
+#define EVEX2VEX_NAME "x86-evex-to-vex-compress"
+
+#define DEBUG_TYPE EVEX2VEX_NAME
+
+namespace {
+
+class EvexToVexInstPass : public MachineFunctionPass {
+
+ /// For EVEX instructions that can be encoded using VEX encoding, replace
+ /// them by the VEX encoding in order to reduce size.
+ bool CompressEvexToVexImpl(MachineInstr &MI) const;
+
+public:
+ static char ID;
+
+ EvexToVexInstPass() : MachineFunctionPass(ID) { }
+
+ StringRef getPassName() const override { return EVEX2VEX_DESC; }
+
+ /// Loop over all of the basic blocks, replacing EVEX instructions
+ /// by equivalent VEX instructions when possible, to reduce code size.
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ // This pass runs after regalloc and doesn't support VReg operands.
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+private:
+ /// Machine instruction info used throughout the class.
+ const X86InstrInfo *TII = nullptr;
+
+ const X86Subtarget *ST = nullptr;
+};
+
+} // end anonymous namespace
+
+char EvexToVexInstPass::ID = 0;
+
+bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) {
+ TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+
+ ST = &MF.getSubtarget<X86Subtarget>();
+ if (!ST->hasAVX512())
+ return false;
+
+ bool Changed = false;
+
+ /// Go over all basic blocks in function and replace
+ /// EVEX encoded instrs by VEX encoding when possible.
+ for (MachineBasicBlock &MBB : MF) {
+
+ // Traverse the basic block.
+ for (MachineInstr &MI : MBB)
+ Changed |= CompressEvexToVexImpl(MI);
+ }
+
+ return Changed;
+}
+
+static bool usesExtendedRegister(const MachineInstr &MI) {
+ auto isHiRegIdx = [](unsigned Reg) {
+ // Check for XMM register with indexes between 16 - 31.
+ if (Reg >= X86::XMM16 && Reg <= X86::XMM31)
+ return true;
+
+ // Check for YMM register with indexes between 16 - 31.
+ if (Reg >= X86::YMM16 && Reg <= X86::YMM31)
+ return true;
+
+ return false;
+ };
+
+ // Check that operands are not ZMM regs or
+ // XMM/YMM regs with hi indexes between 16 - 31.
+ for (const MachineOperand &MO : MI.explicit_operands()) {
+ if (!MO.isReg())
+ continue;
+
+ Register Reg = MO.getReg();
+
+ assert(!(Reg >= X86::ZMM0 && Reg <= X86::ZMM31) &&
+ "ZMM instructions should not be in the EVEX->VEX tables");
+
+ if (isHiRegIdx(Reg))
+ return true;
+ }
+
+ return false;
+}
+
+// Do any custom cleanup needed to finalize the conversion.
+static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc,
+ const X86Subtarget *ST) {
+ (void)NewOpc;
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ case X86::VPDPBUSDSZ256m:
+ case X86::VPDPBUSDSZ256r:
+ case X86::VPDPBUSDSZ128m:
+ case X86::VPDPBUSDSZ128r:
+ case X86::VPDPBUSDZ256m:
+ case X86::VPDPBUSDZ256r:
+ case X86::VPDPBUSDZ128m:
+ case X86::VPDPBUSDZ128r:
+ case X86::VPDPWSSDSZ256m:
+ case X86::VPDPWSSDSZ256r:
+ case X86::VPDPWSSDSZ128m:
+ case X86::VPDPWSSDSZ128r:
+ case X86::VPDPWSSDZ256m:
+ case X86::VPDPWSSDZ256r:
+ case X86::VPDPWSSDZ128m:
+ case X86::VPDPWSSDZ128r:
+ // These can be converted to VEX only if AVXVNNI is enabled.
+ return ST->hasAVXVNNI();
+ case X86::VALIGNDZ128rri:
+ case X86::VALIGNDZ128rmi:
+ case X86::VALIGNQZ128rri:
+ case X86::VALIGNQZ128rmi: {
+ assert((NewOpc == X86::VPALIGNRrri || NewOpc == X86::VPALIGNRrmi) &&
+ "Unexpected new opcode!");
+ unsigned Scale = (Opc == X86::VALIGNQZ128rri ||
+ Opc == X86::VALIGNQZ128rmi) ? 8 : 4;
+ MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands()-1);
+ Imm.setImm(Imm.getImm() * Scale);
+ break;
+ }
+ case X86::VSHUFF32X4Z256rmi:
+ case X86::VSHUFF32X4Z256rri:
+ case X86::VSHUFF64X2Z256rmi:
+ case X86::VSHUFF64X2Z256rri:
+ case X86::VSHUFI32X4Z256rmi:
+ case X86::VSHUFI32X4Z256rri:
+ case X86::VSHUFI64X2Z256rmi:
+ case X86::VSHUFI64X2Z256rri: {
+ assert((NewOpc == X86::VPERM2F128rr || NewOpc == X86::VPERM2I128rr ||
+ NewOpc == X86::VPERM2F128rm || NewOpc == X86::VPERM2I128rm) &&
+ "Unexpected new opcode!");
+ MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands()-1);
+ int64_t ImmVal = Imm.getImm();
+ // Set bit 5, move bit 1 to bit 4, copy bit 0.
+ Imm.setImm(0x20 | ((ImmVal & 2) << 3) | (ImmVal & 1));
+ break;
+ }
+ case X86::VRNDSCALEPDZ128rri:
+ case X86::VRNDSCALEPDZ128rmi:
+ case X86::VRNDSCALEPSZ128rri:
+ case X86::VRNDSCALEPSZ128rmi:
+ case X86::VRNDSCALEPDZ256rri:
+ case X86::VRNDSCALEPDZ256rmi:
+ case X86::VRNDSCALEPSZ256rri:
+ case X86::VRNDSCALEPSZ256rmi:
+ case X86::VRNDSCALESDZr:
+ case X86::VRNDSCALESDZm:
+ case X86::VRNDSCALESSZr:
+ case X86::VRNDSCALESSZm:
+ case X86::VRNDSCALESDZr_Int:
+ case X86::VRNDSCALESDZm_Int:
+ case X86::VRNDSCALESSZr_Int:
+ case X86::VRNDSCALESSZm_Int:
+ const MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands()-1);
+ int64_t ImmVal = Imm.getImm();
+ // Ensure that only bits 3:0 of the immediate are used.
+ if ((ImmVal & 0xf) != ImmVal)
+ return false;
+ break;
+ }
+
+ return true;
+}
+
+
+// For EVEX instructions that can be encoded using VEX encoding
+// replace them by the VEX encoding in order to reduce size.
+bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
+ // VEX format.
+ // # of bytes: 0,2,3 1 1 0,1 0,1,2,4 0,1
+ // [Prefixes] [VEX] OPCODE ModR/M [SIB] [DISP] [IMM]
+ //
+ // EVEX format.
+ // # of bytes: 4 1 1 1 4 / 1 1
+ // [Prefixes] EVEX Opcode ModR/M [SIB] [Disp32] / [Disp8*N] [Immediate]
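+ //
+ // A successful compression therefore usually saves a couple of bytes per
+ // instruction: the EVEX prefix is always 4 bytes, while VEX is 2 or 3.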
+
+ const MCInstrDesc &Desc = MI.getDesc();
+
+ // Check for EVEX instructions only.
+ if ((Desc.TSFlags & X86II::EncodingMask) != X86II::EVEX)
+ return false;
+
+ // Check for EVEX instructions with mask or broadcast as in these cases
+ // the EVEX prefix is needed in order to carry this information
+ // thus preventing the transformation to VEX encoding.
+ if (Desc.TSFlags & (X86II::EVEX_K | X86II::EVEX_B))
+ return false;
+
+ // Check for EVEX instructions with L2 set. These instructions are 512-bits
+ // and can't be converted to VEX.
+ if (Desc.TSFlags & X86II::EVEX_L2)
+ return false;
+
+#ifndef NDEBUG
+ // Make sure the tables are sorted.
+ static std::atomic<bool> TableChecked(false);
+ if (!TableChecked.load(std::memory_order_relaxed)) {
+ assert(llvm::is_sorted(X86EvexToVex128CompressTable) &&
+ "X86EvexToVex128CompressTable is not sorted!");
+ assert(llvm::is_sorted(X86EvexToVex256CompressTable) &&
+ "X86EvexToVex256CompressTable is not sorted!");
+ TableChecked.store(true, std::memory_order_relaxed);
+ }
+#endif
+
+ // Use the VEX.L bit to select the 128 or 256-bit table.
+ ArrayRef<X86EvexToVexCompressTableEntry> Table =
+ (Desc.TSFlags & X86II::VEX_L) ? makeArrayRef(X86EvexToVex256CompressTable)
+ : makeArrayRef(X86EvexToVex128CompressTable);
+
+ const auto *I = llvm::lower_bound(Table, MI.getOpcode());
+ if (I == Table.end() || I->EvexOpcode != MI.getOpcode())
+ return false;
+
+ unsigned NewOpc = I->VexOpcode;
+
+ if (usesExtendedRegister(MI))
+ return false;
+
+ if (!performCustomAdjustments(MI, NewOpc, ST))
+ return false;
+
+ MI.setDesc(TII->get(NewOpc));
+ MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
+ return true;
+}
+
+INITIALIZE_PASS(EvexToVexInstPass, EVEX2VEX_NAME, EVEX2VEX_DESC, false, false)
+
+FunctionPass *llvm::createX86EvexToVexInsts() {
+ return new EvexToVexInstPass();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp
new file mode 100644
index 000000000000..15af0fb2e888
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -0,0 +1,539 @@
+//===------- X86ExpandPseudo.cpp - Expand pseudo instructions -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that expands pseudo instructions into target
+// instructions to allow proper scheduling, if-conversion, other late
+// optimizations, or simply the encoding of the instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86FrameLowering.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved.
+#include "llvm/IR/GlobalValue.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-pseudo"
+#define X86_EXPAND_PSEUDO_NAME "X86 pseudo instruction expansion pass"
+
+namespace {
+class X86ExpandPseudo : public MachineFunctionPass {
+public:
+ static char ID;
+ X86ExpandPseudo() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ const X86Subtarget *STI = nullptr;
+ const X86InstrInfo *TII = nullptr;
+ const X86RegisterInfo *TRI = nullptr;
+ const X86MachineFunctionInfo *X86FI = nullptr;
+ const X86FrameLowering *X86FL = nullptr;
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ StringRef getPassName() const override {
+ return "X86 pseudo instruction expansion pass";
+ }
+
+private:
+ void ExpandICallBranchFunnel(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator MBBI);
+
+ bool ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
+ bool ExpandMBB(MachineBasicBlock &MBB);
+};
+char X86ExpandPseudo::ID = 0;
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(X86ExpandPseudo, DEBUG_TYPE, X86_EXPAND_PSEUDO_NAME, false,
+ false)
+
+void X86ExpandPseudo::ExpandICallBranchFunnel(
+ MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBI) {
+ MachineBasicBlock *JTMBB = MBB;
+ MachineInstr *JTInst = &*MBBI;
+ MachineFunction *MF = MBB->getParent();
+ const BasicBlock *BB = MBB->getBasicBlock();
+ auto InsPt = MachineFunction::iterator(MBB);
+ ++InsPt;
+
+ std::vector<std::pair<MachineBasicBlock *, unsigned>> TargetMBBs;
+ DebugLoc DL = JTInst->getDebugLoc();
+ MachineOperand Selector = JTInst->getOperand(0);
+ const GlobalValue *CombinedGlobal = JTInst->getOperand(1).getGlobal();
+
+ auto CmpTarget = [&](unsigned Target) {
+ if (Selector.isReg())
+ MBB->addLiveIn(Selector.getReg());
+ BuildMI(*MBB, MBBI, DL, TII->get(X86::LEA64r), X86::R11)
+ .addReg(X86::RIP)
+ .addImm(1)
+ .addReg(0)
+ .addGlobalAddress(CombinedGlobal,
+ JTInst->getOperand(2 + 2 * Target).getImm())
+ .addReg(0);
+ BuildMI(*MBB, MBBI, DL, TII->get(X86::CMP64rr))
+ .add(Selector)
+ .addReg(X86::R11);
+ };
+
+ auto CreateMBB = [&]() {
+ auto *NewMBB = MF->CreateMachineBasicBlock(BB);
+ MBB->addSuccessor(NewMBB);
+ if (!MBB->isLiveIn(X86::EFLAGS))
+ MBB->addLiveIn(X86::EFLAGS);
+ return NewMBB;
+ };
+
+ auto EmitCondJump = [&](unsigned CC, MachineBasicBlock *ThenMBB) {
+ BuildMI(*MBB, MBBI, DL, TII->get(X86::JCC_1)).addMBB(ThenMBB).addImm(CC);
+
+ auto *ElseMBB = CreateMBB();
+ MF->insert(InsPt, ElseMBB);
+ MBB = ElseMBB;
+ MBBI = MBB->end();
+ };
+
+ auto EmitCondJumpTarget = [&](unsigned CC, unsigned Target) {
+ auto *ThenMBB = CreateMBB();
+ TargetMBBs.push_back({ThenMBB, Target});
+ EmitCondJump(CC, ThenMBB);
+ };
+
+ auto EmitTailCall = [&](unsigned Target) {
+ BuildMI(*MBB, MBBI, DL, TII->get(X86::TAILJMPd64))
+ .add(JTInst->getOperand(3 + 2 * Target));
+ };
+
+ std::function<void(unsigned, unsigned)> EmitBranchFunnel =
+ [&](unsigned FirstTarget, unsigned NumTargets) {
+ if (NumTargets == 1) {
+ EmitTailCall(FirstTarget);
+ return;
+ }
+
+ if (NumTargets == 2) {
+ CmpTarget(FirstTarget + 1);
+ EmitCondJumpTarget(X86::COND_B, FirstTarget);
+ EmitTailCall(FirstTarget + 1);
+ return;
+ }
+
+ if (NumTargets < 6) {
+ CmpTarget(FirstTarget + 1);
+ EmitCondJumpTarget(X86::COND_B, FirstTarget);
+ EmitCondJumpTarget(X86::COND_E, FirstTarget + 1);
+ EmitBranchFunnel(FirstTarget + 2, NumTargets - 2);
+ return;
+ }
+
+ auto *ThenMBB = CreateMBB();
+ CmpTarget(FirstTarget + (NumTargets / 2));
+ EmitCondJump(X86::COND_B, ThenMBB);
+ EmitCondJumpTarget(X86::COND_E, FirstTarget + (NumTargets / 2));
+ EmitBranchFunnel(FirstTarget + (NumTargets / 2) + 1,
+ NumTargets - (NumTargets / 2) - 1);
+
+ MF->insert(InsPt, ThenMBB);
+ MBB = ThenMBB;
+ MBBI = MBB->end();
+ EmitBranchFunnel(FirstTarget, NumTargets / 2);
+ };
+
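+ // The recursion above forms a balanced binary search over the targets: for
+ // six or more targets the selector is compared against the middle target,
+ // the lower half is handled in a fresh block, an exact match tail-calls
+ // directly, and the upper half continues in the fall-through block, so the
+ // longest path needs only O(log N) comparisons.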
+ EmitBranchFunnel(0, (JTInst->getNumOperands() - 2) / 2);
+ for (auto P : TargetMBBs) {
+ MF->insert(InsPt, P.first);
+ BuildMI(P.first, DL, TII->get(X86::TAILJMPd64))
+ .add(JTInst->getOperand(3 + 2 * P.second));
+ }
+ JTMBB->erase(JTInst);
+}
+
+/// If \p MBBI is a pseudo instruction, this method expands
+/// it to the corresponding (sequence of) actual instruction(s).
+/// \returns true if \p MBBI has been expanded.
+bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+ DebugLoc DL = MBBI->getDebugLoc();
+ switch (Opcode) {
+ default:
+ return false;
+ case X86::TCRETURNdi:
+ case X86::TCRETURNdicc:
+ case X86::TCRETURNri:
+ case X86::TCRETURNmi:
+ case X86::TCRETURNdi64:
+ case X86::TCRETURNdi64cc:
+ case X86::TCRETURNri64:
+ case X86::TCRETURNmi64: {
+ bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64;
+ MachineOperand &JumpTarget = MBBI->getOperand(0);
+ MachineOperand &StackAdjust = MBBI->getOperand(isMem ? X86::AddrNumOperands
+ : 1);
+ assert(StackAdjust.isImm() && "Expecting immediate value.");
+
+ // Adjust stack pointer.
+ int StackAdj = StackAdjust.getImm();
+ int MaxTCDelta = X86FI->getTCReturnAddrDelta();
+ int Offset = 0;
+ assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
+
+ // Incorporate the retaddr area.
+ Offset = StackAdj - MaxTCDelta;
+ assert(Offset >= 0 && "Offset should never be negative");
+
+ if (Opcode == X86::TCRETURNdicc || Opcode == X86::TCRETURNdi64cc) {
+ assert(Offset == 0 && "Conditional tail call cannot adjust the stack.");
+ }
+
+ if (Offset) {
+ // Check for possible merge with preceding ADD instruction.
+ Offset += X86FL->mergeSPUpdates(MBB, MBBI, true);
+ X86FL->emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue=*/true);
+ }
+
+ // Jump to label or value in register.
+ bool IsWin64 = STI->isTargetWin64();
+ if (Opcode == X86::TCRETURNdi || Opcode == X86::TCRETURNdicc ||
+ Opcode == X86::TCRETURNdi64 || Opcode == X86::TCRETURNdi64cc) {
+ unsigned Op;
+ switch (Opcode) {
+ case X86::TCRETURNdi:
+ Op = X86::TAILJMPd;
+ break;
+ case X86::TCRETURNdicc:
+ Op = X86::TAILJMPd_CC;
+ break;
+ case X86::TCRETURNdi64cc:
+ assert(!MBB.getParent()->hasWinCFI() &&
+ "Conditional tail calls confuse "
+ "the Win64 unwinder.");
+ Op = X86::TAILJMPd64_CC;
+ break;
+ default:
+ // Note: Win64 uses REX prefixes for indirect jumps out of functions,
+ // but not direct ones.
+ Op = X86::TAILJMPd64;
+ break;
+ }
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
+ if (JumpTarget.isGlobal()) {
+ MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
+ JumpTarget.getTargetFlags());
+ } else {
+ assert(JumpTarget.isSymbol());
+ MIB.addExternalSymbol(JumpTarget.getSymbolName(),
+ JumpTarget.getTargetFlags());
+ }
+ if (Op == X86::TAILJMPd_CC || Op == X86::TAILJMPd64_CC) {
+ MIB.addImm(MBBI->getOperand(2).getImm());
+ }
+
+ } else if (Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64) {
+ unsigned Op = (Opcode == X86::TCRETURNmi)
+ ? X86::TAILJMPm
+ : (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64);
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
+ for (unsigned i = 0; i != X86::AddrNumOperands; ++i)
+ MIB.add(MBBI->getOperand(i));
+ } else if (Opcode == X86::TCRETURNri64) {
+ JumpTarget.setIsKill();
+ BuildMI(MBB, MBBI, DL,
+ TII->get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64))
+ .add(JumpTarget);
+ } else {
+ JumpTarget.setIsKill();
+ BuildMI(MBB, MBBI, DL, TII->get(X86::TAILJMPr))
+ .add(JumpTarget);
+ }
+
+ MachineInstr &NewMI = *std::prev(MBBI);
+ NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI);
+
+ // Update the call site info.
+ if (MBBI->isCandidateForCallSiteEntry())
+ MBB.getParent()->moveCallSiteInfo(&*MBBI, &NewMI);
+
+ // Delete the pseudo instruction TCRETURN.
+ MBB.erase(MBBI);
+
+ return true;
+ }
+ case X86::EH_RETURN:
+ case X86::EH_RETURN64: {
+ MachineOperand &DestAddr = MBBI->getOperand(0);
+ assert(DestAddr.isReg() && "Offset should be in register!");
+ const bool Uses64BitFramePtr =
+ STI->isTarget64BitLP64() || STI->isTargetNaCl64();
+ Register StackPtr = TRI->getStackRegister();
+ BuildMI(MBB, MBBI, DL,
+ TII->get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), StackPtr)
+ .addReg(DestAddr.getReg());
+ // The EH_RETURN pseudo is really removed during the MC Lowering.
+ return true;
+ }
+ case X86::IRET: {
+ // Adjust stack to erase error code
+ int64_t StackAdj = MBBI->getOperand(0).getImm();
+ X86FL->emitSPUpdate(MBB, MBBI, DL, StackAdj, true);
+ // Replace pseudo with machine iret
+ BuildMI(MBB, MBBI, DL,
+ TII->get(STI->is64Bit() ? X86::IRET64 : X86::IRET32));
+ MBB.erase(MBBI);
+ return true;
+ }
+ case X86::RET: {
+ // Adjust stack to erase error code
+ int64_t StackAdj = MBBI->getOperand(0).getImm();
+ MachineInstrBuilder MIB;
+ if (StackAdj == 0) {
+ MIB = BuildMI(MBB, MBBI, DL,
+ TII->get(STI->is64Bit() ? X86::RETQ : X86::RETL));
+ } else if (isUInt<16>(StackAdj)) {
+ MIB = BuildMI(MBB, MBBI, DL,
+ TII->get(STI->is64Bit() ? X86::RETIQ : X86::RETIL))
+ .addImm(StackAdj);
+ } else {
+ assert(!STI->is64Bit() &&
+ "shouldn't need to do this for x86_64 targets!");
+ // A ret can only handle immediates as big as 2**16-1. If we need to pop
+ // off bytes before the return address, we must do it manually.
+ BuildMI(MBB, MBBI, DL, TII->get(X86::POP32r)).addReg(X86::ECX, RegState::Define);
+ X86FL->emitSPUpdate(MBB, MBBI, DL, StackAdj, /*InEpilogue=*/true);
+ BuildMI(MBB, MBBI, DL, TII->get(X86::PUSH32r)).addReg(X86::ECX);
+ MIB = BuildMI(MBB, MBBI, DL, TII->get(X86::RETL));
+ }
+ for (unsigned I = 1, E = MBBI->getNumOperands(); I != E; ++I)
+ MIB.add(MBBI->getOperand(I));
+ MBB.erase(MBBI);
+ return true;
+ }
+ case X86::LCMPXCHG16B_SAVE_RBX: {
+ // Perform the following transformation.
+ // SaveRbx = pseudocmpxchg Addr, <4 opds for the address>, InArg, SaveRbx
+ // =>
+ // RBX = InArg
+ // actualcmpxchg Addr
+ // RBX = SaveRbx
+ const MachineOperand &InArg = MBBI->getOperand(6);
+ Register SaveRbx = MBBI->getOperand(7).getReg();
+
+ // Copy the input argument of the pseudo into the argument of the
+ // actual instruction.
+ // NOTE: We don't copy the kill flag since the input might be the same reg
+ // as one of the other operands of LCMPXCHG16B.
+ TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, InArg.getReg(), false);
+ // Create the actual instruction.
+ MachineInstr *NewInstr = BuildMI(MBB, MBBI, DL, TII->get(X86::LCMPXCHG16B));
+ // Copy the operands related to the address.
+ for (unsigned Idx = 1; Idx < 6; ++Idx)
+ NewInstr->addOperand(MBBI->getOperand(Idx));
+ // Finally, restore the value of RBX.
+ TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, SaveRbx,
+ /*SrcIsKill*/ true);
+
+ // Delete the pseudo.
+ MBBI->eraseFromParent();
+ return true;
+ }
+ // Loading/storing mask pairs requires two kmov operations. The second one of
+ // these needs a 2 byte displacement relative to the specified address (with
+ // 32 bit spill size). Mask pairs from 1-bit up to 16-bit masks all use the
+ // same spill size; they are all stored using MASKPAIR16STORE and loaded
+ // using MASKPAIR16LOAD.
+ //
+ // The displacement value might wrap around in theory, thus the asserts in
+ // both cases.
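+ //
+ // Illustrative expansion (address operands simplified):
+ //   $k0_k1 = MASKPAIR16LOAD <addr>
+ //     ->
+ //   $k0 = KMOVWkm <addr>
+ //   $k1 = KMOVWkm <addr + 2>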
+ case X86::MASKPAIR16LOAD: {
+ int64_t Disp = MBBI->getOperand(1 + X86::AddrDisp).getImm();
+ assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
+ Register Reg = MBBI->getOperand(0).getReg();
+ bool DstIsDead = MBBI->getOperand(0).isDead();
+ Register Reg0 = TRI->getSubReg(Reg, X86::sub_mask_0);
+ Register Reg1 = TRI->getSubReg(Reg, X86::sub_mask_1);
+
+ auto MIBLo = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWkm))
+ .addReg(Reg0, RegState::Define | getDeadRegState(DstIsDead));
+ auto MIBHi = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWkm))
+ .addReg(Reg1, RegState::Define | getDeadRegState(DstIsDead));
+
+ for (int i = 0; i < X86::AddrNumOperands; ++i) {
+ MIBLo.add(MBBI->getOperand(1 + i));
+ if (i == X86::AddrDisp)
+ MIBHi.addImm(Disp + 2);
+ else
+ MIBHi.add(MBBI->getOperand(1 + i));
+ }
+
+ // Split the memory operand, adjusting the offset and size for the halves.
+ MachineMemOperand *OldMMO = MBBI->memoperands().front();
+ MachineFunction *MF = MBB.getParent();
+ MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, 2);
+ MachineMemOperand *MMOHi = MF->getMachineMemOperand(OldMMO, 2, 2);
+
+ MIBLo.setMemRefs(MMOLo);
+ MIBHi.setMemRefs(MMOHi);
+
+ // Delete the pseudo.
+ MBB.erase(MBBI);
+ return true;
+ }
+ case X86::MASKPAIR16STORE: {
+ int64_t Disp = MBBI->getOperand(X86::AddrDisp).getImm();
+ assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
+ Register Reg = MBBI->getOperand(X86::AddrNumOperands).getReg();
+ bool SrcIsKill = MBBI->getOperand(X86::AddrNumOperands).isKill();
+ Register Reg0 = TRI->getSubReg(Reg, X86::sub_mask_0);
+ Register Reg1 = TRI->getSubReg(Reg, X86::sub_mask_1);
+
+ auto MIBLo = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWmk));
+ auto MIBHi = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWmk));
+
+ for (int i = 0; i < X86::AddrNumOperands; ++i) {
+ MIBLo.add(MBBI->getOperand(i));
+ if (i == X86::AddrDisp)
+ MIBHi.addImm(Disp + 2);
+ else
+ MIBHi.add(MBBI->getOperand(i));
+ }
+ MIBLo.addReg(Reg0, getKillRegState(SrcIsKill));
+ MIBHi.addReg(Reg1, getKillRegState(SrcIsKill));
+
+ // Split the memory operand, adjusting the offset and size for the halves.
+ MachineMemOperand *OldMMO = MBBI->memoperands().front();
+ MachineFunction *MF = MBB.getParent();
+ MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, 2);
+ MachineMemOperand *MMOHi = MF->getMachineMemOperand(OldMMO, 2, 2);
+
+ MIBLo.setMemRefs(MMOLo);
+ MIBHi.setMemRefs(MMOHi);
+
+ // Delete the pseudo.
+ MBB.erase(MBBI);
+ return true;
+ }
+ case X86::MWAITX_SAVE_RBX: {
+ // Perform the following transformation.
+ // SaveRbx = pseudomwaitx InArg, SaveRbx
+ // =>
+ // [E|R]BX = InArg
+ // actualmwaitx
+ // [E|R]BX = SaveRbx
+ const MachineOperand &InArg = MBBI->getOperand(1);
+ // Copy the input argument of the pseudo into the argument of the
+ // actual instruction.
+ TII->copyPhysReg(MBB, MBBI, DL, X86::EBX, InArg.getReg(), InArg.isKill());
+ // Create the actual instruction.
+ BuildMI(MBB, MBBI, DL, TII->get(X86::MWAITXrrr));
+ // Finally, restore the value of RBX.
+ Register SaveRbx = MBBI->getOperand(2).getReg();
+ TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, SaveRbx, /*SrcIsKill*/ true);
+ // Delete the pseudo.
+ MBBI->eraseFromParent();
+ return true;
+ }
+ case TargetOpcode::ICALL_BRANCH_FUNNEL:
+ ExpandICallBranchFunnel(&MBB, MBBI);
+ return true;
+ case X86::PLDTILECFG: {
+ MI.RemoveOperand(0);
+ MI.setDesc(TII->get(X86::LDTILECFG));
+ return true;
+ }
+ case X86::PSTTILECFG: {
+ MI.RemoveOperand(MI.getNumOperands() - 1); // Remove $tmmcfg
+ MI.setDesc(TII->get(X86::STTILECFG));
+ return true;
+ }
+ case X86::PTILELOADDV: {
+ MI.RemoveOperand(8); // Remove $tmmcfg
+ for (unsigned i = 2; i > 0; --i)
+ MI.RemoveOperand(i);
+ MI.setDesc(TII->get(X86::TILELOADD));
+ return true;
+ }
+ case X86::PTDPBSSDV: {
+ MI.RemoveOperand(7); // Remove $tmmcfg
+ MI.untieRegOperand(4);
+ for (unsigned i = 3; i > 0; --i)
+ MI.RemoveOperand(i);
+ MI.setDesc(TII->get(X86::TDPBSSD));
+ MI.tieOperands(0, 1);
+ return true;
+ }
+ case X86::PTILESTOREDV: {
+ MI.RemoveOperand(8); // Remove $tmmcfg
+ for (int i = 1; i >= 0; --i)
+ MI.RemoveOperand(i);
+ MI.setDesc(TII->get(X86::TILESTORED));
+ return true;
+ }
+ case X86::PTILEZEROV: {
+ for (int i = 3; i > 0; --i) // Remove row, col, $tmmcfg
+ MI.RemoveOperand(i);
+ MI.setDesc(TII->get(X86::TILEZERO));
+ return true;
+ }
+ }
+ llvm_unreachable("Previous switch has a fallthrough?");
+}
+
+/// Expand all pseudo instructions contained in \p MBB.
+/// \returns true if any expansion occurred for \p MBB.
+bool X86ExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
+ bool Modified = false;
+
+ // MBBI may be invalidated by the expansion.
+ MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ while (MBBI != E) {
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+ Modified |= ExpandMI(MBB, MBBI);
+ MBBI = NMBBI;
+ }
+
+ return Modified;
+}
+
+bool X86ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
+ STI = &static_cast<const X86Subtarget &>(MF.getSubtarget());
+ TII = STI->getInstrInfo();
+ TRI = STI->getRegisterInfo();
+ X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ X86FL = STI->getFrameLowering();
+
+ bool Modified = false;
+ for (MachineBasicBlock &MBB : MF)
+ Modified |= ExpandMBB(MBB);
+ return Modified;
+}
+
+/// Returns an instance of the pseudo instruction expansion pass.
+FunctionPass *llvm::createX86ExpandPseudoPass() {
+ return new X86ExpandPseudo();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp
new file mode 100644
index 000000000000..caf158102230
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp
@@ -0,0 +1,4028 @@
+//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86-specific support for the FastISel class. Much
+// of the target-specific code is generated by tablegen in the file
+// X86GenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86CallingConv.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+namespace {
+
+class X86FastISel final : public FastISel {
+ /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget *Subtarget;
+
+ /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
+ /// floating point ops.
+ /// When SSE is available, use it for f32 operations.
+ /// When SSE2 is available, use it for f64 operations.
+ bool X86ScalarSSEf64;
+ bool X86ScalarSSEf32;
+
+public:
+ explicit X86FastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo)
+ : FastISel(funcInfo, libInfo) {
+ Subtarget = &funcInfo.MF->getSubtarget<X86Subtarget>();
+ X86ScalarSSEf64 = Subtarget->hasSSE2();
+ X86ScalarSSEf32 = Subtarget->hasSSE1();
+ }
+
+ bool fastSelectInstruction(const Instruction *I) override;
+
+ /// The specified machine instr operand is a vreg, and that
+ /// vreg is being provided by the specified load instruction. If possible,
+ /// try to fold the load as an operand to the instruction, returning true on
+ /// success.
+ bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+ const LoadInst *LI) override;
+
+ bool fastLowerArguments() override;
+ bool fastLowerCall(CallLoweringInfo &CLI) override;
+ bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;
+
+#include "X86GenFastISel.inc"
+
+private:
+ bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT,
+ const DebugLoc &DL);
+
+ bool X86FastEmitLoad(MVT VT, X86AddressMode &AM, MachineMemOperand *MMO,
+ unsigned &ResultReg, unsigned Alignment = 1);
+
+ bool X86FastEmitStore(EVT VT, const Value *Val, X86AddressMode &AM,
+ MachineMemOperand *MMO = nullptr, bool Aligned = false);
+ bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
+ X86AddressMode &AM,
+ MachineMemOperand *MMO = nullptr, bool Aligned = false);
+
+ bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
+ unsigned &ResultReg);
+
+ bool X86SelectAddress(const Value *V, X86AddressMode &AM);
+ bool X86SelectCallAddress(const Value *V, X86AddressMode &AM);
+
+ bool X86SelectLoad(const Instruction *I);
+
+ bool X86SelectStore(const Instruction *I);
+
+ bool X86SelectRet(const Instruction *I);
+
+ bool X86SelectCmp(const Instruction *I);
+
+ bool X86SelectZExt(const Instruction *I);
+
+ bool X86SelectSExt(const Instruction *I);
+
+ bool X86SelectBranch(const Instruction *I);
+
+ bool X86SelectShift(const Instruction *I);
+
+ bool X86SelectDivRem(const Instruction *I);
+
+ bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I);
+
+ bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I);
+
+ bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I);
+
+ bool X86SelectSelect(const Instruction *I);
+
+ bool X86SelectTrunc(const Instruction *I);
+
+ bool X86SelectFPExtOrFPTrunc(const Instruction *I, unsigned Opc,
+ const TargetRegisterClass *RC);
+
+ bool X86SelectFPExt(const Instruction *I);
+ bool X86SelectFPTrunc(const Instruction *I);
+ bool X86SelectSIToFP(const Instruction *I);
+ bool X86SelectUIToFP(const Instruction *I);
+ bool X86SelectIntToFP(const Instruction *I, bool IsSigned);
+
+ const X86InstrInfo *getInstrInfo() const {
+ return Subtarget->getInstrInfo();
+ }
+ const X86TargetMachine *getTargetMachine() const {
+ return static_cast<const X86TargetMachine *>(&TM);
+ }
+
+ bool handleConstantAddresses(const Value *V, X86AddressMode &AM);
+
+ unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT);
+ unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT);
+ unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT);
+ unsigned fastMaterializeConstant(const Constant *C) override;
+
+ unsigned fastMaterializeAlloca(const AllocaInst *C) override;
+
+ unsigned fastMaterializeFloatZero(const ConstantFP *CF) override;
+
+ /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
+ /// computed in an SSE register, not on the X87 floating point stack.
+ bool isScalarFPTypeInSSEReg(EVT VT) const {
+ return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
+ (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1
+ }
+
+ bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
+
+ bool IsMemcpySmall(uint64_t Len);
+
+ bool TryEmitSmallMemcpy(X86AddressMode DestAM,
+ X86AddressMode SrcAM, uint64_t Len);
+
+ bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
+ const Value *Cond);
+
+ const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB,
+ X86AddressMode &AM);
+
+ unsigned fastEmitInst_rrrr(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC, unsigned Op0,
+ bool Op0IsKill, unsigned Op1, bool Op1IsKill,
+ unsigned Op2, bool Op2IsKill, unsigned Op3,
+ bool Op3IsKill);
+};
+
+} // end anonymous namespace.
+
+static std::pair<unsigned, bool>
+getX86SSEConditionCode(CmpInst::Predicate Predicate) {
+ unsigned CC;
+ bool NeedSwap = false;
+
+ // SSE Condition code mapping:
+ // 0 - EQ
+ // 1 - LT
+ // 2 - LE
+ // 3 - UNORD
+ // 4 - NEQ
+ // 5 - NLT
+ // 6 - NLE
+ // 7 - ORD
+ switch (Predicate) {
+ default: llvm_unreachable("Unexpected predicate");
+ case CmpInst::FCMP_OEQ: CC = 0; break;
+ case CmpInst::FCMP_OGT: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_OLT: CC = 1; break;
+ case CmpInst::FCMP_OGE: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_OLE: CC = 2; break;
+ case CmpInst::FCMP_UNO: CC = 3; break;
+ case CmpInst::FCMP_UNE: CC = 4; break;
+ case CmpInst::FCMP_ULE: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_UGE: CC = 5; break;
+ case CmpInst::FCMP_ULT: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_UGT: CC = 6; break;
+ case CmpInst::FCMP_ORD: CC = 7; break;
+ case CmpInst::FCMP_UEQ: CC = 8; break;
+ case CmpInst::FCMP_ONE: CC = 12; break;
+ }
+
+ return std::make_pair(CC, NeedSwap);
+}
+
+/// Adds a complex addressing mode to the given machine instr builder.
+/// Note that this will constrain the index register. If it's not possible to
+/// constrain the given index register, then a new one will be created. The
+/// IndexReg field of the addressing mode will be updated to match in this case.
+const MachineInstrBuilder &
+X86FastISel::addFullAddress(const MachineInstrBuilder &MIB,
+ X86AddressMode &AM) {
+ // First constrain the index register. It needs to be a GR64_NOSP.
+ AM.IndexReg = constrainOperandRegClass(MIB->getDesc(), AM.IndexReg,
+ MIB->getNumOperands() +
+ X86::AddrIndexReg);
+ return ::addFullAddress(MIB, AM);
+}
+
+/// Check if it is possible to fold the condition from the XALU intrinsic
+/// into the user. The condition code will only be updated on success.
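+///
+/// Illustrative IR pattern (assumed, not from this file):
+///   %res = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
+///   %sum = extractvalue { i32, i1 } %res, 0
+///   %ovf = extractvalue { i32, i1 } %res, 1
+///   br i1 %ovf, label %overflow, label %normal
+/// Here the branch can consume the overflow flag directly as X86::COND_O.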
+bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
+ const Value *Cond) {
+ if (!isa<ExtractValueInst>(Cond))
+ return false;
+
+ const auto *EV = cast<ExtractValueInst>(Cond);
+ if (!isa<IntrinsicInst>(EV->getAggregateOperand()))
+ return false;
+
+ const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand());
+ MVT RetVT;
+ const Function *Callee = II->getCalledFunction();
+ Type *RetTy =
+ cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
+ if (!isTypeLegal(RetTy, RetVT))
+ return false;
+
+ if (RetVT != MVT::i32 && RetVT != MVT::i64)
+ return false;
+
+ X86::CondCode TmpCC;
+ switch (II->getIntrinsicID()) {
+ default: return false;
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break;
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break;
+ }
+
+ // Check if both instructions are in the same basic block.
+ if (II->getParent() != I->getParent())
+ return false;
+
+ // Make sure nothing is in the way
+ BasicBlock::const_iterator Start(I);
+ BasicBlock::const_iterator End(II);
+ for (auto Itr = std::prev(Start); Itr != End; --Itr) {
+ // We only expect extractvalue instructions between the intrinsic and the
+ // instruction to be selected.
+ if (!isa<ExtractValueInst>(Itr))
+ return false;
+
+ // Check that the extractvalue operand comes from the intrinsic.
+ const auto *EVI = cast<ExtractValueInst>(Itr);
+ if (EVI->getAggregateOperand() != II)
+ return false;
+ }
+
+ CC = TmpCC;
+ return true;
+}
+
+bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
+ EVT evt = TLI.getValueType(DL, Ty, /*AllowUnknown=*/true);
+ if (evt == MVT::Other || !evt.isSimple())
+ // Unhandled type. Halt "fast" selection and bail.
+ return false;
+
+ VT = evt.getSimpleVT();
+ // For now, require SSE/SSE2 for performing floating-point operations,
+ // since x87 requires additional work.
+ if (VT == MVT::f64 && !X86ScalarSSEf64)
+ return false;
+ if (VT == MVT::f32 && !X86ScalarSSEf32)
+ return false;
+ // Similarly, no f80 support yet.
+ if (VT == MVT::f80)
+ return false;
+ // We only handle legal types. For example, on x86-32 the instruction
+ // selector contains all of the 64-bit instructions from x86-64,
+ // under the assumption that i64 won't be used if the target doesn't
+ // support it.
+ return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT);
+}
+
+/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
+/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV.
+/// Return true and the result register by reference if it is possible.
+bool X86FastISel::X86FastEmitLoad(MVT VT, X86AddressMode &AM,
+ MachineMemOperand *MMO, unsigned &ResultReg,
+ unsigned Alignment) {
+ bool HasSSE41 = Subtarget->hasSSE41();
+ bool HasAVX = Subtarget->hasAVX();
+ bool HasAVX2 = Subtarget->hasAVX2();
+ bool HasAVX512 = Subtarget->hasAVX512();
+ bool HasVLX = Subtarget->hasVLX();
+ bool IsNonTemporal = MMO && MMO->isNonTemporal();
+
+ // Treat i1 loads the same as i8 loads. Masking will be done when storing.
+ if (VT == MVT::i1)
+ VT = MVT::i8;
+
+ // Get opcode and regclass of the output for the given load instruction.
+ unsigned Opc = 0;
+ switch (VT.SimpleTy) {
+ default: return false;
+ case MVT::i8:
+ Opc = X86::MOV8rm;
+ break;
+ case MVT::i16:
+ Opc = X86::MOV16rm;
+ break;
+ case MVT::i32:
+ Opc = X86::MOV32rm;
+ break;
+ case MVT::i64:
+ // Must be in x86-64 mode.
+ Opc = X86::MOV64rm;
+ break;
+ case MVT::f32:
+ if (X86ScalarSSEf32)
+ Opc = HasAVX512 ? X86::VMOVSSZrm_alt :
+ HasAVX ? X86::VMOVSSrm_alt :
+ X86::MOVSSrm_alt;
+ else
+ Opc = X86::LD_Fp32m;
+ break;
+ case MVT::f64:
+ if (X86ScalarSSEf64)
+ Opc = HasAVX512 ? X86::VMOVSDZrm_alt :
+ HasAVX ? X86::VMOVSDrm_alt :
+ X86::MOVSDrm_alt;
+ else
+ Opc = X86::LD_Fp64m;
+ break;
+ case MVT::f80:
+ // No f80 support yet.
+ return false;
+ case MVT::v4f32:
+ if (IsNonTemporal && Alignment >= 16 && HasSSE41)
+ Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
+ HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+ else if (Alignment >= 16)
+ Opc = HasVLX ? X86::VMOVAPSZ128rm :
+ HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm;
+ else
+ Opc = HasVLX ? X86::VMOVUPSZ128rm :
+ HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm;
+ break;
+ case MVT::v2f64:
+ if (IsNonTemporal && Alignment >= 16 && HasSSE41)
+ Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
+ HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+ else if (Alignment >= 16)
+ Opc = HasVLX ? X86::VMOVAPDZ128rm :
+ HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm;
+ else
+ Opc = HasVLX ? X86::VMOVUPDZ128rm :
+ HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm;
+ break;
+ case MVT::v4i32:
+ case MVT::v2i64:
+ case MVT::v8i16:
+ case MVT::v16i8:
+ if (IsNonTemporal && Alignment >= 16 && HasSSE41)
+ Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
+ HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+ else if (Alignment >= 16)
+ Opc = HasVLX ? X86::VMOVDQA64Z128rm :
+ HasAVX ? X86::VMOVDQArm : X86::MOVDQArm;
+ else
+ Opc = HasVLX ? X86::VMOVDQU64Z128rm :
+ HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm;
+ break;
+ case MVT::v8f32:
+ assert(HasAVX);
+ if (IsNonTemporal && Alignment >= 32 && HasAVX2)
+ Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
+ else if (IsNonTemporal && Alignment >= 16)
+ return false; // Force split for X86::VMOVNTDQArm
+ else if (Alignment >= 32)
+ Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm;
+ else
+ Opc = HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm;
+ break;
+ case MVT::v4f64:
+ assert(HasAVX);
+ if (IsNonTemporal && Alignment >= 32 && HasAVX2)
+ Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
+ else if (IsNonTemporal && Alignment >= 16)
+ return false; // Force split for X86::VMOVNTDQArm
+ else if (Alignment >= 32)
+ Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm;
+ else
+ Opc = HasVLX ? X86::VMOVUPDZ256rm : X86::VMOVUPDYrm;
+ break;
+ case MVT::v8i32:
+ case MVT::v4i64:
+ case MVT::v16i16:
+ case MVT::v32i8:
+ assert(HasAVX);
+ if (IsNonTemporal && Alignment >= 32 && HasAVX2)
+ Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
+ else if (IsNonTemporal && Alignment >= 16)
+ return false; // Force split for X86::VMOVNTDQArm
+ else if (Alignment >= 32)
+ Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm;
+ else
+ Opc = HasVLX ? X86::VMOVDQU64Z256rm : X86::VMOVDQUYrm;
+ break;
+ case MVT::v16f32:
+ assert(HasAVX512);
+ if (IsNonTemporal && Alignment >= 64)
+ Opc = X86::VMOVNTDQAZrm;
+ else
+ Opc = (Alignment >= 64) ? X86::VMOVAPSZrm : X86::VMOVUPSZrm;
+ break;
+ case MVT::v8f64:
+ assert(HasAVX512);
+ if (IsNonTemporal && Alignment >= 64)
+ Opc = X86::VMOVNTDQAZrm;
+ else
+ Opc = (Alignment >= 64) ? X86::VMOVAPDZrm : X86::VMOVUPDZrm;
+ break;
+ case MVT::v8i64:
+ case MVT::v16i32:
+ case MVT::v32i16:
+ case MVT::v64i8:
+ assert(HasAVX512);
+ // Note: There are a lot more choices based on type with AVX-512, but
+ // there's really no advantage when the load isn't masked.
+ if (IsNonTemporal && Alignment >= 64)
+ Opc = X86::VMOVNTDQAZrm;
+ else
+ Opc = (Alignment >= 64) ? X86::VMOVDQA64Zrm : X86::VMOVDQU64Zrm;
+ break;
+ }
+
+ const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
+
+ ResultReg = createResultReg(RC);
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
+ addFullAddress(MIB, AM);
+ if (MMO)
+ MIB->addMemOperand(*FuncInfo.MF, MMO);
+ return true;
+}
+
+/// X86FastEmitStore - Emit a machine instruction to store a value Val of
+/// type VT. The address is described by AM, either a pre-computed base
+/// pointer plus displacement or a GlobalAddress. Return true if it is
+/// possible.
+bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
+ X86AddressMode &AM,
+ MachineMemOperand *MMO, bool Aligned) {
+ bool HasSSE1 = Subtarget->hasSSE1();
+ bool HasSSE2 = Subtarget->hasSSE2();
+ bool HasSSE4A = Subtarget->hasSSE4A();
+ bool HasAVX = Subtarget->hasAVX();
+ bool HasAVX512 = Subtarget->hasAVX512();
+ bool HasVLX = Subtarget->hasVLX();
+ bool IsNonTemporal = MMO && MMO->isNonTemporal();
+
+ // Get opcode and regclass of the output for the given store instruction.
+ unsigned Opc = 0;
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::f80: // No f80 support yet.
+ default: return false;
+ case MVT::i1: {
+ // Mask out all but lowest bit.
+ Register AndResult = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(X86::AND8ri), AndResult)
+ .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1);
+ ValReg = AndResult;
+ LLVM_FALLTHROUGH; // handle i1 as i8.
+ }
+ case MVT::i8: Opc = X86::MOV8mr; break;
+ case MVT::i16: Opc = X86::MOV16mr; break;
+ case MVT::i32:
+ Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTImr : X86::MOV32mr;
+ break;
+ case MVT::i64:
+ // Must be in x86-64 mode.
+ Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTI_64mr : X86::MOV64mr;
+ break;
+ case MVT::f32:
+ if (X86ScalarSSEf32) {
+ if (IsNonTemporal && HasSSE4A)
+ Opc = X86::MOVNTSS;
+ else
+ Opc = HasAVX512 ? X86::VMOVSSZmr :
+ HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
+ } else
+ Opc = X86::ST_Fp32m;
+ break;
+ case MVT::f64:
+    if (X86ScalarSSEf64) {
+ if (IsNonTemporal && HasSSE4A)
+ Opc = X86::MOVNTSD;
+ else
+ Opc = HasAVX512 ? X86::VMOVSDZmr :
+ HasAVX ? X86::VMOVSDmr : X86::MOVSDmr;
+ } else
+ Opc = X86::ST_Fp64m;
+ break;
+ case MVT::x86mmx:
+ Opc = (IsNonTemporal && HasSSE1) ? X86::MMX_MOVNTQmr : X86::MMX_MOVQ64mr;
+ break;
+ case MVT::v4f32:
+ if (Aligned) {
+ if (IsNonTemporal)
+ Opc = HasVLX ? X86::VMOVNTPSZ128mr :
+ HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr;
+ else
+ Opc = HasVLX ? X86::VMOVAPSZ128mr :
+ HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr;
+ } else
+ Opc = HasVLX ? X86::VMOVUPSZ128mr :
+ HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr;
+ break;
+ case MVT::v2f64:
+ if (Aligned) {
+ if (IsNonTemporal)
+ Opc = HasVLX ? X86::VMOVNTPDZ128mr :
+ HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr;
+ else
+ Opc = HasVLX ? X86::VMOVAPDZ128mr :
+ HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr;
+ } else
+ Opc = HasVLX ? X86::VMOVUPDZ128mr :
+ HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr;
+ break;
+ case MVT::v4i32:
+ case MVT::v2i64:
+ case MVT::v8i16:
+ case MVT::v16i8:
+ if (Aligned) {
+ if (IsNonTemporal)
+ Opc = HasVLX ? X86::VMOVNTDQZ128mr :
+ HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr;
+ else
+ Opc = HasVLX ? X86::VMOVDQA64Z128mr :
+ HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr;
+ } else
+ Opc = HasVLX ? X86::VMOVDQU64Z128mr :
+ HasAVX ? X86::VMOVDQUmr : X86::MOVDQUmr;
+ break;
+ case MVT::v8f32:
+ assert(HasAVX);
+ if (Aligned) {
+ if (IsNonTemporal)
+ Opc = HasVLX ? X86::VMOVNTPSZ256mr : X86::VMOVNTPSYmr;
+ else
+ Opc = HasVLX ? X86::VMOVAPSZ256mr : X86::VMOVAPSYmr;
+ } else
+ Opc = HasVLX ? X86::VMOVUPSZ256mr : X86::VMOVUPSYmr;
+ break;
+ case MVT::v4f64:
+ assert(HasAVX);
+ if (Aligned) {
+ if (IsNonTemporal)
+ Opc = HasVLX ? X86::VMOVNTPDZ256mr : X86::VMOVNTPDYmr;
+ else
+ Opc = HasVLX ? X86::VMOVAPDZ256mr : X86::VMOVAPDYmr;
+ } else
+ Opc = HasVLX ? X86::VMOVUPDZ256mr : X86::VMOVUPDYmr;
+ break;
+ case MVT::v8i32:
+ case MVT::v4i64:
+ case MVT::v16i16:
+ case MVT::v32i8:
+ assert(HasAVX);
+ if (Aligned) {
+ if (IsNonTemporal)
+ Opc = HasVLX ? X86::VMOVNTDQZ256mr : X86::VMOVNTDQYmr;
+ else
+ Opc = HasVLX ? X86::VMOVDQA64Z256mr : X86::VMOVDQAYmr;
+ } else
+ Opc = HasVLX ? X86::VMOVDQU64Z256mr : X86::VMOVDQUYmr;
+ break;
+ case MVT::v16f32:
+ assert(HasAVX512);
+ if (Aligned)
+ Opc = IsNonTemporal ? X86::VMOVNTPSZmr : X86::VMOVAPSZmr;
+ else
+ Opc = X86::VMOVUPSZmr;
+ break;
+ case MVT::v8f64:
+ assert(HasAVX512);
+ if (Aligned) {
+ Opc = IsNonTemporal ? X86::VMOVNTPDZmr : X86::VMOVAPDZmr;
+ } else
+ Opc = X86::VMOVUPDZmr;
+ break;
+ case MVT::v8i64:
+ case MVT::v16i32:
+ case MVT::v32i16:
+ case MVT::v64i8:
+ assert(HasAVX512);
+ // Note: There are a lot more choices based on type with AVX-512, but
+ // there's really no advantage when the store isn't masked.
+ if (Aligned)
+ Opc = IsNonTemporal ? X86::VMOVNTDQZmr : X86::VMOVDQA64Zmr;
+ else
+ Opc = X86::VMOVDQU64Zmr;
+ break;
+ }
+
+ const MCInstrDesc &Desc = TII.get(Opc);
+ // Some of the instructions in the previous switch use FR128 instead
+ // of FR32 for ValReg. Make sure the register we feed the instruction
+ // matches its register class constraints.
+  // Note: It is fine to copy from FR32 to FR128; they alias the same
+  // physical registers behind the scenes, which is why this never triggered
+  // any bugs before.
+ ValReg = constrainOperandRegClass(Desc, ValReg, Desc.getNumOperands() - 1);
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc);
+ addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill));
+ if (MMO)
+ MIB->addMemOperand(*FuncInfo.MF, MMO);
+
+ return true;
+}
+
+bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
+ X86AddressMode &AM,
+ MachineMemOperand *MMO, bool Aligned) {
+ // Handle 'null' like i32/i64 0.
+ if (isa<ConstantPointerNull>(Val))
+ Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext()));
+
+ // If this is a store of a simple constant, fold the constant into the store.
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
+ unsigned Opc = 0;
+ bool Signed = true;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: break;
+ case MVT::i1:
+ Signed = false;
+ LLVM_FALLTHROUGH; // Handle as i8.
+ case MVT::i8: Opc = X86::MOV8mi; break;
+ case MVT::i16: Opc = X86::MOV16mi; break;
+ case MVT::i32: Opc = X86::MOV32mi; break;
+ case MVT::i64:
+ // Must be a 32-bit sign extended value.
+ if (isInt<32>(CI->getSExtValue()))
+ Opc = X86::MOV64mi32;
+ break;
+ }
+
+ if (Opc) {
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
+ addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue()
+ : CI->getZExtValue());
+ if (MMO)
+ MIB->addMemOperand(*FuncInfo.MF, MMO);
+ return true;
+ }
+ }
+
+ Register ValReg = getRegForValue(Val);
+ if (ValReg == 0)
+ return false;
+
+ bool ValKill = hasTrivialKill(Val);
+ return X86FastEmitStore(VT, ValReg, ValKill, AM, MMO, Aligned);
+}
+
+/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
+/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g.
+/// ISD::SIGN_EXTEND).
+bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
+ unsigned Src, EVT SrcVT,
+ unsigned &ResultReg) {
+ unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc,
+ Src, /*TODO: Kill=*/false);
+ if (RR == 0)
+ return false;
+
+ ResultReg = RR;
+ return true;
+}
+
+bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
+ // Handle constant address.
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ // Can't handle alternate code models yet.
+ if (TM.getCodeModel() != CodeModel::Small)
+ return false;
+
+ // Can't handle TLS yet.
+ if (GV->isThreadLocal())
+ return false;
+
+ // Can't handle !absolute_symbol references yet.
+ if (GV->isAbsoluteSymbolRef())
+ return false;
+
+ // RIP-relative addresses can't have additional register operands, so if
+ // we've already folded stuff into the addressing mode, just force the
+ // global value into its own register, which we can use as the basereg.
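+  // (RIP-relative addressing is [rip + disp32] only; it cannot encode an
+  // extra base or index register.)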
+ if (!Subtarget->isPICStyleRIPRel() ||
+ (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
+ // Okay, we've committed to selecting this global. Set up the address.
+ AM.GV = GV;
+
+ // Allow the subtarget to classify the global.
+ unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
+
+ // If this reference is relative to the pic base, set it now.
+ if (isGlobalRelativeToPICBase(GVFlags)) {
+ // FIXME: How do we know Base.Reg is free??
+ AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+ }
+
+ // Unless the ABI requires an extra load, return a direct reference to
+ // the global.
+ if (!isGlobalStubReference(GVFlags)) {
+ if (Subtarget->isPICStyleRIPRel()) {
+ // Use rip-relative addressing if we can. Above we verified that the
+ // base and index registers are unused.
+ assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+ AM.Base.Reg = X86::RIP;
+ }
+ AM.GVOpFlags = GVFlags;
+ return true;
+ }
+
+ // Ok, we need to do a load from a stub. If we've already loaded from
+ // this stub, reuse the loaded pointer, otherwise emit the load now.
+ DenseMap<const Value *, Register>::iterator I = LocalValueMap.find(V);
+ Register LoadReg;
+ if (I != LocalValueMap.end() && I->second) {
+ LoadReg = I->second;
+ } else {
+ // Issue load from stub.
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = nullptr;
+ X86AddressMode StubAM;
+ StubAM.Base.Reg = AM.Base.Reg;
+ StubAM.GV = GV;
+ StubAM.GVOpFlags = GVFlags;
+
+ // Prepare for inserting code in the local-value area.
+ SavePoint SaveInsertPt = enterLocalValueArea();
+
+ if (TLI.getPointerTy(DL) == MVT::i64) {
+ Opc = X86::MOV64rm;
+ RC = &X86::GR64RegClass;
+ } else {
+ Opc = X86::MOV32rm;
+ RC = &X86::GR32RegClass;
+ }
+
+ if (Subtarget->isPICStyleRIPRel() || GVFlags == X86II::MO_GOTPCREL)
+ StubAM.Base.Reg = X86::RIP;
+
+ LoadReg = createResultReg(RC);
+ MachineInstrBuilder LoadMI =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), LoadReg);
+ addFullAddress(LoadMI, StubAM);
+
+ // Ok, back to normal mode.
+ leaveLocalValueArea(SaveInsertPt);
+
+ // Prevent loading GV stub multiple times in same MBB.
+ LocalValueMap[V] = LoadReg;
+ }
+
+ // Now construct the final address. Note that the Disp, Scale,
+ // and Index values may already be set here.
+ AM.Base.Reg = LoadReg;
+ AM.GV = nullptr;
+ return true;
+ }
+ }
+
+ // If all else fails, try to materialize the value in a register.
+ if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
+ if (AM.Base.Reg == 0) {
+ AM.Base.Reg = getRegForValue(V);
+ return AM.Base.Reg != 0;
+ }
+ if (AM.IndexReg == 0) {
+ assert(AM.Scale == 1 && "Scale with no index!");
+ AM.IndexReg = getRegForValue(V);
+ return AM.IndexReg != 0;
+ }
+ }
+
+ return false;
+}
+
+/// X86SelectAddress - Attempt to fill in an address from the given value.
+///
+bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
+ SmallVector<const Value *, 32> GEPs;
+redo_gep:
+ const User *U = nullptr;
+ unsigned Opcode = Instruction::UserOp1;
+ if (const Instruction *I = dyn_cast<Instruction>(V)) {
+ // Don't walk into other basic blocks; it's possible we haven't
+ // visited them yet, so the instructions may not yet be assigned
+ // virtual registers.
+ if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) ||
+ FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+ Opcode = I->getOpcode();
+ U = I;
+ }
+ } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+
+ if (PointerType *Ty = dyn_cast<PointerType>(V->getType()))
+ if (Ty->getAddressSpace() > 255)
+ // Fast instruction selection doesn't support the special
+ // address spaces.
+ return false;
+
+ switch (Opcode) {
+ default: break;
+ case Instruction::BitCast:
+ // Look past bitcasts.
+ return X86SelectAddress(U->getOperand(0), AM);
+
+ case Instruction::IntToPtr:
+ // Look past no-op inttoptrs.
+ if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+ TLI.getPointerTy(DL))
+ return X86SelectAddress(U->getOperand(0), AM);
+ break;
+
+ case Instruction::PtrToInt:
+ // Look past no-op ptrtoints.
+ if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
+ return X86SelectAddress(U->getOperand(0), AM);
+ break;
+
+ case Instruction::Alloca: {
+ // Do static allocas.
+ const AllocaInst *A = cast<AllocaInst>(V);
+ DenseMap<const AllocaInst *, int>::iterator SI =
+ FuncInfo.StaticAllocaMap.find(A);
+ if (SI != FuncInfo.StaticAllocaMap.end()) {
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ AM.Base.FrameIndex = SI->second;
+ return true;
+ }
+ break;
+ }
+
+ case Instruction::Add: {
+ // Adds of constants are common and easy enough.
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
+ uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
+ // They have to fit in the 32-bit signed displacement field though.
+ if (isInt<32>(Disp)) {
+ AM.Disp = (uint32_t)Disp;
+ return X86SelectAddress(U->getOperand(0), AM);
+ }
+ }
+ break;
+ }
+
+ case Instruction::GetElementPtr: {
+ X86AddressMode SavedAM = AM;
+
+ // Pattern-match simple GEPs.
+ uint64_t Disp = (int32_t)AM.Disp;
+ unsigned IndexReg = AM.IndexReg;
+ unsigned Scale = AM.Scale;
+ gep_type_iterator GTI = gep_type_begin(U);
+ // Iterate through the indices, folding what we can. Constants can be
+ // folded, and one dynamic index can be handled, if the scale is supported.
+ for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
+ i != e; ++i, ++GTI) {
+ const Value *Op = *i;
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
+ continue;
+ }
+
+      // An array/variable index is always of the form i*S where S is the
+ // constant scale size. See if we can push the scale into immediates.
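+      // For example, "getelementptr i32, i32* %p, i64 4" contributes
+      // 4 * 4 == 16 to Disp, while a non-constant index with a supported
+      // scale (1, 2, 4, or 8) becomes the IndexReg/Scale pair below.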
+ uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+ for (;;) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ // Constant-offset addressing.
+ Disp += CI->getSExtValue() * S;
+ break;
+ }
+ if (canFoldAddIntoGEP(U, Op)) {
+ // A compatible add with a constant operand. Fold the constant.
+ ConstantInt *CI =
+ cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+ Disp += CI->getSExtValue() * S;
+ // Iterate on the other operand.
+ Op = cast<AddOperator>(Op)->getOperand(0);
+ continue;
+ }
+ if (IndexReg == 0 &&
+ (!AM.GV || !Subtarget->isPICStyleRIPRel()) &&
+ (S == 1 || S == 2 || S == 4 || S == 8)) {
+ // Scaled-index addressing.
+ Scale = S;
+ IndexReg = getRegForGEPIndex(Op).first;
+ if (IndexReg == 0)
+ return false;
+ break;
+ }
+ // Unsupported.
+ goto unsupported_gep;
+ }
+ }
+
+ // Check for displacement overflow.
+ if (!isInt<32>(Disp))
+ break;
+
+ AM.IndexReg = IndexReg;
+ AM.Scale = Scale;
+ AM.Disp = (uint32_t)Disp;
+ GEPs.push_back(V);
+
+ if (const GetElementPtrInst *GEP =
+ dyn_cast<GetElementPtrInst>(U->getOperand(0))) {
+ // Ok, the GEP indices were covered by constant-offset and scaled-index
+ // addressing. Update the address state and move on to examining the base.
+ V = GEP;
+ goto redo_gep;
+ } else if (X86SelectAddress(U->getOperand(0), AM)) {
+ return true;
+ }
+
+    // If we couldn't merge the gep value into this addr mode, revert to the
+    // saved address and just match the value instead of completely failing.
+ AM = SavedAM;
+
+ for (const Value *I : reverse(GEPs))
+ if (handleConstantAddresses(I, AM))
+ return true;
+
+ return false;
+ unsupported_gep:
+ // Ok, the GEP indices weren't all covered.
+ break;
+ }
+ }
+
+ return handleConstantAddresses(V, AM);
+}
+
+/// X86SelectCallAddress - Attempt to fill in an address from the given value.
+///
+bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
+ const User *U = nullptr;
+ unsigned Opcode = Instruction::UserOp1;
+ const Instruction *I = dyn_cast<Instruction>(V);
+ // Record if the value is defined in the same basic block.
+ //
+ // This information is crucial to know whether or not folding an
+ // operand is valid.
+ // Indeed, FastISel generates or reuses a virtual register for all
+ // operands of all instructions it selects. Obviously, the definition and
+  // its uses must use the same virtual register; otherwise the produced
+  // code is incorrect.
+  // Before instruction selection, FunctionLoweringInfo::set sets the virtual
+  // registers for values that are alive across basic blocks. This ensures
+  // that the values are consistently set across basic blocks, even if
+  // different instruction selection mechanisms are used (e.g., a mix of
+  // SDISel and FastISel).
+  // For values local to a basic block, the instruction selection process
+  // generates these virtual registers with whatever method is appropriate
+  // for its needs. In particular, FastISel and SDISel do not share the way
+  // local virtual registers are set.
+  // Therefore, it is impossible (or at least unsafe) to share values
+  // between basic blocks unless they use the same instruction selection
+  // method, which is not guaranteed for X86.
+  // Moreover, things like hasOneUse could not be used accurately if we
+  // allowed references to values across basic blocks, since those values
+  // are not initially alive across basic blocks.
+ bool InMBB = true;
+ if (I) {
+ Opcode = I->getOpcode();
+ U = I;
+ InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
+ } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+
+ switch (Opcode) {
+ default: break;
+ case Instruction::BitCast:
+    // Look past bitcasts if the operand is in the same BB.
+ if (InMBB)
+ return X86SelectCallAddress(U->getOperand(0), AM);
+ break;
+
+ case Instruction::IntToPtr:
+    // Look past no-op inttoptrs if the operand is in the same BB.
+ if (InMBB &&
+ TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+ TLI.getPointerTy(DL))
+ return X86SelectCallAddress(U->getOperand(0), AM);
+ break;
+
+ case Instruction::PtrToInt:
+    // Look past no-op ptrtoints if the operand is in the same BB.
+ if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
+ return X86SelectCallAddress(U->getOperand(0), AM);
+ break;
+ }
+
+ // Handle constant address.
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ // Can't handle alternate code models yet.
+ if (TM.getCodeModel() != CodeModel::Small)
+ return false;
+
+ // RIP-relative addresses can't have additional register operands.
+ if (Subtarget->isPICStyleRIPRel() &&
+ (AM.Base.Reg != 0 || AM.IndexReg != 0))
+ return false;
+
+ // Can't handle TLS.
+ if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+ if (GVar->isThreadLocal())
+ return false;
+
+ // Okay, we've committed to selecting this global. Set up the basic address.
+ AM.GV = GV;
+
+    // Return a direct reference to the global. FastISel can handle calls to
+ // functions that require loads, such as dllimport and nonlazybind
+ // functions.
+ if (Subtarget->isPICStyleRIPRel()) {
+ // Use rip-relative addressing if we can. Above we verified that the
+ // base and index registers are unused.
+ assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+ AM.Base.Reg = X86::RIP;
+ } else {
+ AM.GVOpFlags = Subtarget->classifyLocalReference(nullptr);
+ }
+
+ return true;
+ }
+
+ // If all else fails, try to materialize the value in a register.
+ if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
+ auto GetCallRegForValue = [this](const Value *V) {
+ Register Reg = getRegForValue(V);
+
+ // In 64-bit mode, we need a 64-bit register even if pointers are 32 bits.
+ if (Reg && Subtarget->isTarget64BitILP32()) {
+ Register CopyReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32rr),
+ CopyReg)
+ .addReg(Reg);
+
+ Register ExtReg = createResultReg(&X86::GR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::SUBREG_TO_REG), ExtReg)
+ .addImm(0)
+ .addReg(CopyReg)
+ .addImm(X86::sub_32bit);
+ Reg = ExtReg;
+ }
+
+ return Reg;
+ };
+
+ if (AM.Base.Reg == 0) {
+ AM.Base.Reg = GetCallRegForValue(V);
+ return AM.Base.Reg != 0;
+ }
+ if (AM.IndexReg == 0) {
+ assert(AM.Scale == 1 && "Scale with no index!");
+ AM.IndexReg = GetCallRegForValue(V);
+ return AM.IndexReg != 0;
+ }
+ }
+
+ return false;
+}
+
+
+/// X86SelectStore - Select and emit code to implement store instructions.
+bool X86FastISel::X86SelectStore(const Instruction *I) {
+ // Atomic stores need special handling.
+ const StoreInst *S = cast<StoreInst>(I);
+
+ if (S->isAtomic())
+ return false;
+
+ const Value *PtrV = I->getOperand(1);
+ if (TLI.supportSwiftError()) {
+ // Swifterror values can come from either a function parameter with
+ // swifterror attribute or an alloca with swifterror attribute.
+ if (const Argument *Arg = dyn_cast<Argument>(PtrV)) {
+ if (Arg->hasSwiftErrorAttr())
+ return false;
+ }
+
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) {
+ if (Alloca->isSwiftError())
+ return false;
+ }
+ }
+
+ const Value *Val = S->getValueOperand();
+ const Value *Ptr = S->getPointerOperand();
+
+ MVT VT;
+ if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true))
+ return false;
+
+ Align Alignment = S->getAlign();
+ Align ABIAlignment = DL.getABITypeAlign(Val->getType());
+ bool Aligned = Alignment >= ABIAlignment;
+
+ X86AddressMode AM;
+ if (!X86SelectAddress(Ptr, AM))
+ return false;
+
+ return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned);
+}
+
+/// X86SelectRet - Select and emit code to implement ret instructions.
+bool X86FastISel::X86SelectRet(const Instruction *I) {
+ const ReturnInst *Ret = cast<ReturnInst>(I);
+ const Function &F = *I->getParent()->getParent();
+ const X86MachineFunctionInfo *X86MFInfo =
+ FuncInfo.MF->getInfo<X86MachineFunctionInfo>();
+
+ if (!FuncInfo.CanLowerReturn)
+ return false;
+
+ if (TLI.supportSwiftError() &&
+ F.getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+ return false;
+
+ if (TLI.supportSplitCSR(FuncInfo.MF))
+ return false;
+
+ CallingConv::ID CC = F.getCallingConv();
+ if (CC != CallingConv::C &&
+ CC != CallingConv::Fast &&
+ CC != CallingConv::Tail &&
+ CC != CallingConv::X86_FastCall &&
+ CC != CallingConv::X86_StdCall &&
+ CC != CallingConv::X86_ThisCall &&
+ CC != CallingConv::X86_64_SysV &&
+ CC != CallingConv::Win64)
+ return false;
+
+ // Don't handle popping bytes if they don't fit the ret's immediate.
+ if (!isUInt<16>(X86MFInfo->getBytesToPopOnReturn()))
+ return false;
+
+ // fastcc with -tailcallopt is intended to provide a guaranteed
+  // tail call optimization. FastISel doesn't know how to do that.
+ if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) ||
+ CC == CallingConv::Tail)
+ return false;
+
+ // Let SDISel handle vararg functions.
+ if (F.isVarArg())
+ return false;
+
+ // Build a list of return value registers.
+ SmallVector<unsigned, 4> RetRegs;
+
+ if (Ret->getNumOperands() > 0) {
+ SmallVector<ISD::OutputArg, 4> Outs;
+ GetReturnInfo(CC, F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ValLocs;
+ CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
+ CCInfo.AnalyzeReturn(Outs, RetCC_X86);
+
+ const Value *RV = Ret->getOperand(0);
+ Register Reg = getRegForValue(RV);
+ if (Reg == 0)
+ return false;
+
+ // Only handle a single return value for now.
+ if (ValLocs.size() != 1)
+ return false;
+
+ CCValAssign &VA = ValLocs[0];
+
+ // Don't bother handling odd stuff for now.
+ if (VA.getLocInfo() != CCValAssign::Full)
+ return false;
+ // Only handle register returns for now.
+ if (!VA.isRegLoc())
+ return false;
+
+ // The calling-convention tables for x87 returns don't tell
+ // the whole story.
+ if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
+ return false;
+
+ unsigned SrcReg = Reg + VA.getValNo();
+ EVT SrcVT = TLI.getValueType(DL, RV->getType());
+ EVT DstVT = VA.getValVT();
+ // Special handling for extended integers.
+ if (SrcVT != DstVT) {
+ if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16)
+ return false;
+
+ if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
+ return false;
+
+ assert(DstVT == MVT::i32 && "X86 should always ext to i32");
+
+ if (SrcVT == MVT::i1) {
+ if (Outs[0].Flags.isSExt())
+ return false;
+ // TODO
+ SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*Op0IsKill=*/false);
+ SrcVT = MVT::i8;
+ }
+ unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND :
+ ISD::SIGN_EXTEND;
+ // TODO
+ SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, SrcReg,
+ /*Op0IsKill=*/false);
+ }
+
+ // Make the copy.
+ Register DstReg = VA.getLocReg();
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+ // Avoid a cross-class copy. This is very unlikely.
+ if (!SrcRC->contains(DstReg))
+ return false;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg);
+
+ // Add register to return instruction.
+ RetRegs.push_back(VA.getLocReg());
+ }
+
+ // Swift calling convention does not require we copy the sret argument
+ // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
+
+ // All x86 ABIs require that for returning structs by value we copy
+ // the sret argument into %rax/%eax (depending on ABI) for the return.
+ // We saved the argument into a virtual register in the entry block,
+ // so now we copy the value out and into %rax/%eax.
+ if (F.hasStructRetAttr() && CC != CallingConv::Swift) {
+ Register Reg = X86MFInfo->getSRetReturnReg();
+ assert(Reg &&
+ "SRetReturnReg should have been set in LowerFormalArguments()!");
+ unsigned RetReg = Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), RetReg).addReg(Reg);
+ RetRegs.push_back(RetReg);
+ }
+
+ // Now emit the RET.
+ MachineInstrBuilder MIB;
+ if (X86MFInfo->getBytesToPopOnReturn()) {
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Subtarget->is64Bit() ? X86::RETIQ : X86::RETIL))
+ .addImm(X86MFInfo->getBytesToPopOnReturn());
+ } else {
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL));
+ }
+ for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
+ MIB.addReg(RetRegs[i], RegState::Implicit);
+ return true;
+}
+
+/// X86SelectLoad - Select and emit code to implement load instructions.
+///
+bool X86FastISel::X86SelectLoad(const Instruction *I) {
+ const LoadInst *LI = cast<LoadInst>(I);
+
+ // Atomic loads need special handling.
+ if (LI->isAtomic())
+ return false;
+
+ const Value *SV = I->getOperand(0);
+ if (TLI.supportSwiftError()) {
+ // Swifterror values can come from either a function parameter with
+ // swifterror attribute or an alloca with swifterror attribute.
+ if (const Argument *Arg = dyn_cast<Argument>(SV)) {
+ if (Arg->hasSwiftErrorAttr())
+ return false;
+ }
+
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) {
+ if (Alloca->isSwiftError())
+ return false;
+ }
+ }
+
+ MVT VT;
+ if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true))
+ return false;
+
+ const Value *Ptr = LI->getPointerOperand();
+
+ X86AddressMode AM;
+ if (!X86SelectAddress(Ptr, AM))
+ return false;
+
+ unsigned ResultReg = 0;
+ if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg,
+ LI->getAlign().value()))
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
+ bool HasAVX512 = Subtarget->hasAVX512();
+ bool HasAVX = Subtarget->hasAVX();
+ bool X86ScalarSSEf32 = Subtarget->hasSSE1();
+ bool X86ScalarSSEf64 = Subtarget->hasSSE2();
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return 0;
+ case MVT::i8: return X86::CMP8rr;
+ case MVT::i16: return X86::CMP16rr;
+ case MVT::i32: return X86::CMP32rr;
+ case MVT::i64: return X86::CMP64rr;
+ case MVT::f32:
+ return X86ScalarSSEf32
+ ? (HasAVX512 ? X86::VUCOMISSZrr
+ : HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr)
+ : 0;
+ case MVT::f64:
+ return X86ScalarSSEf64
+ ? (HasAVX512 ? X86::VUCOMISDZrr
+ : HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr)
+ : 0;
+ }
+}
+
+/// If the RHS of the comparison is the constant RHSC, return an opcode that
+/// can fold the immediate into the compare (e.g. CMP32ri); otherwise return 0.
+static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
+ int64_t Val = RHSC->getSExtValue();
+ switch (VT.getSimpleVT().SimpleTy) {
+ // Otherwise, we can't fold the immediate into this comparison.
+ default:
+ return 0;
+ case MVT::i8:
+ return X86::CMP8ri;
+ case MVT::i16:
+ if (isInt<8>(Val))
+ return X86::CMP16ri8;
+ return X86::CMP16ri;
+ case MVT::i32:
+ if (isInt<8>(Val))
+ return X86::CMP32ri8;
+ return X86::CMP32ri;
+ case MVT::i64:
+ if (isInt<8>(Val))
+ return X86::CMP64ri8;
+ // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext
+ // field.
+ if (isInt<32>(Val))
+ return X86::CMP64ri32;
+ return 0;
+ }
+}
+
+bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, EVT VT,
+ const DebugLoc &CurDbgLoc) {
+ Register Op0Reg = getRegForValue(Op0);
+ if (Op0Reg == 0) return false;
+
+ // Handle 'null' like i32/i64 0.
+ if (isa<ConstantPointerNull>(Op1))
+ Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext()));
+
+ // We have two options: compare with register or immediate. If the RHS of
+ // the compare is an immediate that we can fold into this compare, use
+ // CMPri, otherwise use CMPrr.
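+  // For example, "icmp eq i32 %x, 42" becomes CMP32ri %x, 42, while a
+  // non-constant RHS falls through to the CMP32rr form below.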
+ if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
+ if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareImmOpc))
+ .addReg(Op0Reg)
+ .addImm(Op1C->getSExtValue());
+ return true;
+ }
+ }
+
+ unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget);
+ if (CompareOpc == 0) return false;
+
+ Register Op1Reg = getRegForValue(Op1);
+ if (Op1Reg == 0) return false;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareOpc))
+ .addReg(Op0Reg)
+ .addReg(Op1Reg);
+
+ return true;
+}
+
+bool X86FastISel::X86SelectCmp(const Instruction *I) {
+ const CmpInst *CI = cast<CmpInst>(I);
+
+ MVT VT;
+ if (!isTypeLegal(I->getOperand(0)->getType(), VT))
+ return false;
+
+ // Try to optimize or fold the cmp.
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+ unsigned ResultReg = 0;
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_FALSE: {
+ ResultReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0),
+ ResultReg);
+ ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg,
+ /*Op0IsKill=*/true, X86::sub_8bit);
+ if (!ResultReg)
+ return false;
+ break;
+ }
+ case CmpInst::FCMP_TRUE: {
+ ResultReg = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
+ ResultReg).addImm(1);
+ break;
+ }
+ }
+
+ if (ResultReg) {
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+
+ const Value *LHS = CI->getOperand(0);
+ const Value *RHS = CI->getOperand(1);
+
+ // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
+ // We don't have to materialize a zero constant for this case and can just use
+ // %x again on the RHS.
+ if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
+ const auto *RHSC = dyn_cast<ConstantFP>(RHS);
+ if (RHSC && RHSC->isNullValue())
+ RHS = LHS;
+ }
+
+ // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
+ static const uint16_t SETFOpcTable[2][3] = {
+ { X86::COND_E, X86::COND_NP, X86::AND8rr },
+ { X86::COND_NE, X86::COND_P, X86::OR8rr }
+ };
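+  // With UCOMISS/UCOMISD flags, "oeq" holds iff ZF is set and PF is clear
+  // (SETE & SETNP), and "une" holds iff ZF is clear or PF is set
+  // (SETNE | SETP), hence the AND/OR combining opcodes above.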
+ const uint16_t *SETFOpc = nullptr;
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break;
+ case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break;
+ }
+
+ ResultReg = createResultReg(&X86::GR8RegClass);
+ if (SETFOpc) {
+ if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
+ return false;
+
+ Register FlagReg1 = createResultReg(&X86::GR8RegClass);
+ Register FlagReg2 = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
+ FlagReg1).addImm(SETFOpc[0]);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
+ FlagReg2).addImm(SETFOpc[1]);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]),
+ ResultReg).addReg(FlagReg1).addReg(FlagReg2);
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+
+ X86::CondCode CC;
+ bool SwapArgs;
+ std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
+ assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
+
+ if (SwapArgs)
+ std::swap(LHS, RHS);
+
+ // Emit a compare of LHS/RHS.
+ if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
+ return false;
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
+ ResultReg).addImm(CC);
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectZExt(const Instruction *I) {
+ EVT DstVT = TLI.getValueType(DL, I->getType());
+ if (!TLI.isTypeLegal(DstVT))
+ return false;
+
+ Register ResultReg = getRegForValue(I->getOperand(0));
+ if (ResultReg == 0)
+ return false;
+
+ // Handle zero-extension from i1 to i8, which is common.
+ MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
+ if (SrcVT == MVT::i1) {
+ // Set the high bits to zero.
+ ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
+ SrcVT = MVT::i8;
+
+ if (ResultReg == 0)
+ return false;
+ }
+
+ if (DstVT == MVT::i64) {
+ // Handle extension to 64-bits via sub-register shenanigans.
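+    // A MOVZX32/MOV32 into Result32 implicitly zeroes bits 32-63 of the
+    // destination on x86-64, so the SUBREG_TO_REG below completes the i64
+    // zero-extension without any additional instruction.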
+ unsigned MovInst;
+
+ switch (SrcVT.SimpleTy) {
+ case MVT::i8: MovInst = X86::MOVZX32rr8; break;
+ case MVT::i16: MovInst = X86::MOVZX32rr16; break;
+ case MVT::i32: MovInst = X86::MOV32rr; break;
+ default: llvm_unreachable("Unexpected zext to i64 source type");
+ }
+
+ Register Result32 = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovInst), Result32)
+ .addReg(ResultReg);
+
+ ResultReg = createResultReg(&X86::GR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG),
+ ResultReg)
+ .addImm(0).addReg(Result32).addImm(X86::sub_32bit);
+ } else if (DstVT == MVT::i16) {
+ // i8->i16 doesn't exist in the autogenerated isel table. Need to zero
+ // extend to 32-bits and then extract down to 16-bits.
+ Register Result32 = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVZX32rr8),
+ Result32).addReg(ResultReg);
+
+ ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32,
+ /*Op0IsKill=*/true, X86::sub_16bit);
+ } else if (DstVT != MVT::i8) {
+ ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
+ ResultReg, /*Op0IsKill=*/true);
+ if (ResultReg == 0)
+ return false;
+ }
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectSExt(const Instruction *I) {
+ EVT DstVT = TLI.getValueType(DL, I->getType());
+ if (!TLI.isTypeLegal(DstVT))
+ return false;
+
+ Register ResultReg = getRegForValue(I->getOperand(0));
+ if (ResultReg == 0)
+ return false;
+
+ // Handle sign-extension from i1 to i8.
+ MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
+ if (SrcVT == MVT::i1) {
+ // Set the high bits to zero.
+ Register ZExtReg = fastEmitZExtFromI1(MVT::i8, ResultReg,
+ /*TODO: Kill=*/false);
+ if (ZExtReg == 0)
+ return false;
+
+ // Negate the result to make an 8-bit sign extended value.
+ ResultReg = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::NEG8r),
+ ResultReg).addReg(ZExtReg);
+
+ SrcVT = MVT::i8;
+ }
+
+ if (DstVT == MVT::i16) {
+ // i8->i16 doesn't exist in the autogenerated isel table. Need to sign
+ // extend to 32-bits and then extract down to 16-bits.
+ Register Result32 = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVSX32rr8),
+ Result32).addReg(ResultReg);
+
+ ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32,
+ /*Op0IsKill=*/true, X86::sub_16bit);
+ } else if (DstVT != MVT::i8) {
+ ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::SIGN_EXTEND,
+ ResultReg, /*Op0IsKill=*/true);
+ if (ResultReg == 0)
+ return false;
+ }
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectBranch(const Instruction *I) {
+ // Unconditional branches are selected by tablegen-generated code.
+ // Handle a conditional branch.
+ const BranchInst *BI = cast<BranchInst>(I);
+ MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+ MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+ // Fold the common case of a conditional branch with a comparison
+ // in the same block (values defined on other blocks may not have
+ // initialized registers).
+ X86::CondCode CC;
+ if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+ if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
+ EVT VT = TLI.getValueType(DL, CI->getOperand(0)->getType());
+
+ // Try to optimize or fold the cmp.
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, DbgLoc); return true;
+ case CmpInst::FCMP_TRUE: fastEmitBranch(TrueMBB, DbgLoc); return true;
+ }
+
+ const Value *CmpLHS = CI->getOperand(0);
+ const Value *CmpRHS = CI->getOperand(1);
+
+ // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x,
+ // 0.0.
+ // We don't have to materialize a zero constant for this case and can just
+ // use %x again on the RHS.
+ if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
+ const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
+ if (CmpRHSC && CmpRHSC->isNullValue())
+ CmpRHS = CmpLHS;
+ }
+
+ // Try to take advantage of fallthrough opportunities.
+ if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
+ std::swap(TrueMBB, FalseMBB);
+ Predicate = CmpInst::getInversePredicate(Predicate);
+ }
+
+ // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition
+ // code check. Instead two branch instructions are required to check all
+ // the flags. First we change the predicate to a supported condition code,
+      // which will be the first branch. Later on we will emit the second
+      // branch.
+ bool NeedExtraBranch = false;
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_OEQ:
+ std::swap(TrueMBB, FalseMBB);
+ LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_UNE:
+ NeedExtraBranch = true;
+ Predicate = CmpInst::FCMP_ONE;
+ break;
+ }
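+      // The code below thus emits up to two conditional jumps to TrueMBB:
+      // one on the primary condition code and, when NeedExtraBranch is set,
+      // a second one on COND_P to catch the unordered (NaN) case.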
+
+ bool SwapArgs;
+ std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
+ assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
+
+ if (SwapArgs)
+ std::swap(CmpLHS, CmpRHS);
+
+ // Emit a compare of the LHS and RHS, setting the flags.
+ if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc()))
+ return false;
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1))
+ .addMBB(TrueMBB).addImm(CC);
+
+ // X86 requires a second branch to handle UNE (and OEQ, which is mapped
+ // to UNE above).
+ if (NeedExtraBranch) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1))
+ .addMBB(TrueMBB).addImm(X86::COND_P);
+ }
+
+ finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
+ return true;
+ }
+ } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
+ // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which
+ // typically happen for _Bool and C++ bools.
+ MVT SourceVT;
+ if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
+ isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) {
+ unsigned TestOpc = 0;
+ switch (SourceVT.SimpleTy) {
+ default: break;
+ case MVT::i8: TestOpc = X86::TEST8ri; break;
+ case MVT::i16: TestOpc = X86::TEST16ri; break;
+ case MVT::i32: TestOpc = X86::TEST32ri; break;
+ case MVT::i64: TestOpc = X86::TEST64ri32; break;
+ }
+ if (TestOpc) {
+ Register OpReg = getRegForValue(TI->getOperand(0));
+ if (OpReg == 0) return false;
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc))
+ .addReg(OpReg).addImm(1);
+
+ unsigned JmpCond = X86::COND_NE;
+ if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
+ std::swap(TrueMBB, FalseMBB);
+ JmpCond = X86::COND_E;
+ }
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1))
+ .addMBB(TrueMBB).addImm(JmpCond);
+
+ finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
+ return true;
+ }
+ }
+ } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) {
+ // Fake request the condition, otherwise the intrinsic might be completely
+ // optimized away.
+ Register TmpReg = getRegForValue(BI->getCondition());
+ if (TmpReg == 0)
+ return false;
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1))
+ .addMBB(TrueMBB).addImm(CC);
+ finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
+ return true;
+ }
+
+ // Otherwise do a clumsy setcc and re-test it.
+ // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used
+ // in an explicit cast, so make sure to handle that correctly.
+ Register OpReg = getRegForValue(BI->getCondition());
+ if (OpReg == 0) return false;
+
+ // In case OpReg is a K register, COPY to a GPR
+ if (MRI.getRegClass(OpReg) == &X86::VK1RegClass) {
+ unsigned KOpReg = OpReg;
+ OpReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), OpReg)
+ .addReg(KOpReg);
+ OpReg = fastEmitInst_extractsubreg(MVT::i8, OpReg, /*Op0IsKill=*/true,
+ X86::sub_8bit);
+ }
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
+ .addReg(OpReg)
+ .addImm(1);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1))
+ .addMBB(TrueMBB).addImm(X86::COND_NE);
+ finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
+ return true;
+}
+
+bool X86FastISel::X86SelectShift(const Instruction *I) {
+ unsigned CReg = 0, OpReg = 0;
+ const TargetRegisterClass *RC = nullptr;
+ if (I->getType()->isIntegerTy(8)) {
+ CReg = X86::CL;
+ RC = &X86::GR8RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR8rCL; break;
+ case Instruction::AShr: OpReg = X86::SAR8rCL; break;
+ case Instruction::Shl: OpReg = X86::SHL8rCL; break;
+ default: return false;
+ }
+ } else if (I->getType()->isIntegerTy(16)) {
+ CReg = X86::CX;
+ RC = &X86::GR16RegClass;
+ switch (I->getOpcode()) {
+ default: llvm_unreachable("Unexpected shift opcode");
+ case Instruction::LShr: OpReg = X86::SHR16rCL; break;
+ case Instruction::AShr: OpReg = X86::SAR16rCL; break;
+ case Instruction::Shl: OpReg = X86::SHL16rCL; break;
+ }
+ } else if (I->getType()->isIntegerTy(32)) {
+ CReg = X86::ECX;
+ RC = &X86::GR32RegClass;
+ switch (I->getOpcode()) {
+ default: llvm_unreachable("Unexpected shift opcode");
+ case Instruction::LShr: OpReg = X86::SHR32rCL; break;
+ case Instruction::AShr: OpReg = X86::SAR32rCL; break;
+ case Instruction::Shl: OpReg = X86::SHL32rCL; break;
+ }
+ } else if (I->getType()->isIntegerTy(64)) {
+ CReg = X86::RCX;
+ RC = &X86::GR64RegClass;
+ switch (I->getOpcode()) {
+ default: llvm_unreachable("Unexpected shift opcode");
+ case Instruction::LShr: OpReg = X86::SHR64rCL; break;
+ case Instruction::AShr: OpReg = X86::SAR64rCL; break;
+ case Instruction::Shl: OpReg = X86::SHL64rCL; break;
+ }
+ } else {
+ return false;
+ }
+
+ MVT VT;
+ if (!isTypeLegal(I->getType(), VT))
+ return false;
+
+ Register Op0Reg = getRegForValue(I->getOperand(0));
+ if (Op0Reg == 0) return false;
+
+ Register Op1Reg = getRegForValue(I->getOperand(1));
+ if (Op1Reg == 0) return false;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
+ CReg).addReg(Op1Reg);
+
+ // The shift instruction uses X86::CL. If we defined a super-register
+ // of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
+ if (CReg != X86::CL)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::KILL), X86::CL)
+ .addReg(CReg, RegState::Kill);
+
+ Register ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg)
+ .addReg(Op0Reg);
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectDivRem(const Instruction *I) {
+ const static unsigned NumTypes = 4; // i8, i16, i32, i64
+ const static unsigned NumOps = 4; // SDiv, SRem, UDiv, URem
+ const static bool S = true; // IsSigned
+ const static bool U = false; // !IsSigned
+ const static unsigned Copy = TargetOpcode::COPY;
+ // For the X86 DIV/IDIV instruction, in most cases the dividend
+ // (numerator) must be in a specific register pair highreg:lowreg,
+ // producing the quotient in lowreg and the remainder in highreg.
+ // For most data types, to set up the instruction, the dividend is
+ // copied into lowreg, and lowreg is sign-extended or zero-extended
+ // into highreg. The exception is i8, where the dividend is defined
+ // as a single register rather than a register pair, and we
+ // therefore directly sign-extend or zero-extend the dividend into
+ // lowreg, instead of copying, and ignore the highreg.
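+  // For example, a 32-bit "sdiv" is lowered roughly as:
+  //   COPY EAX, Op0   ; dividend into the low register
+  //   CDQ             ; sign-extend EAX into EDX
+  //   IDIV32r Op1     ; quotient in EAX, remainder in EDX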
+ const static struct DivRemEntry {
+ // The following portion depends only on the data type.
+ const TargetRegisterClass *RC;
+ unsigned LowInReg; // low part of the register pair
+ unsigned HighInReg; // high part of the register pair
+ // The following portion depends on both the data type and the operation.
+ struct DivRemResult {
+ unsigned OpDivRem; // The specific DIV/IDIV opcode to use.
+ unsigned OpSignExtend; // Opcode for sign-extending lowreg into
+ // highreg, or copying a zero into highreg.
+ unsigned OpCopy; // Opcode for copying dividend into lowreg, or
+ // zero/sign-extending into lowreg for i8.
+ unsigned DivRemResultReg; // Register containing the desired result.
+ bool IsOpSigned; // Whether to use signed or unsigned form.
+ } ResultTable[NumOps];
+ } OpTable[NumTypes] = {
+ { &X86::GR8RegClass, X86::AX, 0, {
+ { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AL, S }, // SDiv
+ { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AH, S }, // SRem
+ { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AL, U }, // UDiv
+ { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AH, U }, // URem
+ }
+ }, // i8
+ { &X86::GR16RegClass, X86::AX, X86::DX, {
+ { X86::IDIV16r, X86::CWD, Copy, X86::AX, S }, // SDiv
+ { X86::IDIV16r, X86::CWD, Copy, X86::DX, S }, // SRem
+ { X86::DIV16r, X86::MOV32r0, Copy, X86::AX, U }, // UDiv
+ { X86::DIV16r, X86::MOV32r0, Copy, X86::DX, U }, // URem
+ }
+ }, // i16
+ { &X86::GR32RegClass, X86::EAX, X86::EDX, {
+ { X86::IDIV32r, X86::CDQ, Copy, X86::EAX, S }, // SDiv
+ { X86::IDIV32r, X86::CDQ, Copy, X86::EDX, S }, // SRem
+ { X86::DIV32r, X86::MOV32r0, Copy, X86::EAX, U }, // UDiv
+ { X86::DIV32r, X86::MOV32r0, Copy, X86::EDX, U }, // URem
+ }
+ }, // i32
+ { &X86::GR64RegClass, X86::RAX, X86::RDX, {
+ { X86::IDIV64r, X86::CQO, Copy, X86::RAX, S }, // SDiv
+ { X86::IDIV64r, X86::CQO, Copy, X86::RDX, S }, // SRem
+ { X86::DIV64r, X86::MOV32r0, Copy, X86::RAX, U }, // UDiv
+ { X86::DIV64r, X86::MOV32r0, Copy, X86::RDX, U }, // URem
+ }
+ }, // i64
+ };
+
+ MVT VT;
+ if (!isTypeLegal(I->getType(), VT))
+ return false;
+
+ unsigned TypeIndex, OpIndex;
+ switch (VT.SimpleTy) {
+ default: return false;
+ case MVT::i8: TypeIndex = 0; break;
+ case MVT::i16: TypeIndex = 1; break;
+ case MVT::i32: TypeIndex = 2; break;
+ case MVT::i64: TypeIndex = 3;
+ if (!Subtarget->is64Bit())
+ return false;
+ break;
+ }
+
+ switch (I->getOpcode()) {
+ default: llvm_unreachable("Unexpected div/rem opcode");
+ case Instruction::SDiv: OpIndex = 0; break;
+ case Instruction::SRem: OpIndex = 1; break;
+ case Instruction::UDiv: OpIndex = 2; break;
+ case Instruction::URem: OpIndex = 3; break;
+ }
+
+ const DivRemEntry &TypeEntry = OpTable[TypeIndex];
+ const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
+ Register Op0Reg = getRegForValue(I->getOperand(0));
+ if (Op0Reg == 0)
+ return false;
+ Register Op1Reg = getRegForValue(I->getOperand(1));
+ if (Op1Reg == 0)
+ return false;
+
+ // Move op0 into low-order input register.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg);
+ // Zero-extend or sign-extend into high-order input register.
+ if (OpEntry.OpSignExtend) {
+ if (OpEntry.IsOpSigned)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(OpEntry.OpSignExtend));
+ else {
+ Register Zero32 = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(X86::MOV32r0), Zero32);
+
+ // Copy the zero into the appropriate sub/super/identical physical
+ // register. Unfortunately the operations needed are not uniform enough
+ // to fit neatly into the table above.
+ if (VT == MVT::i16) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Copy), TypeEntry.HighInReg)
+ .addReg(Zero32, 0, X86::sub_16bit);
+ } else if (VT == MVT::i32) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Copy), TypeEntry.HighInReg)
+ .addReg(Zero32);
+ } else if (VT == MVT::i64) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
+ .addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
+ }
+ }
+ }
+ // Generate the DIV/IDIV instruction.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);
+  // For i8 remainder, we can't reference AH directly, as we'll end
+  // up with bogus copies like %r9b = COPY %ah. Reference AX
+  // instead to prevent AH references in a REX-prefixed instruction.
+ //
+ // The current assumption of the fast register allocator is that isel
+ // won't generate explicit references to the GR8_NOREX registers. If
+ // the allocator and/or the backend get enhanced to be more robust in
+ // that regard, this can be, and should be, removed.
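+  // Concretely, for an i8 remainder in 64-bit mode we copy AX into a GR16
+  // vreg, shift it right by 8, and extract the sub_8bit subregister instead
+  // of reading AH.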
+ unsigned ResultReg = 0;
+ if ((I->getOpcode() == Instruction::SRem ||
+ I->getOpcode() == Instruction::URem) &&
+ OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {
+ Register SourceSuperReg = createResultReg(&X86::GR16RegClass);
+ Register ResultSuperReg = createResultReg(&X86::GR16RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Copy), SourceSuperReg).addReg(X86::AX);
+
+ // Shift AX right by 8 bits instead of using AH.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SHR16ri),
+ ResultSuperReg).addReg(SourceSuperReg).addImm(8);
+
+ // Now reference the 8-bit subreg of the result.
+ ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
+ /*Op0IsKill=*/true, X86::sub_8bit);
+ }
+ // Copy the result out of the physreg if we haven't already.
+ if (!ResultReg) {
+ ResultReg = createResultReg(TypeEntry.RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), ResultReg)
+ .addReg(OpEntry.DivRemResultReg);
+ }
+ updateValueMap(I, ResultReg);
+
+ return true;
+}
+
+/// Emit a conditional move instruction (if the target supports them) to lower
+/// the select.
+bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
+ // Check if the subtarget supports these instructions.
+ if (!Subtarget->hasCMov())
+ return false;
+
+ // FIXME: Add support for i8.
+ if (RetVT < MVT::i16 || RetVT > MVT::i64)
+ return false;
+
+ const Value *Cond = I->getOperand(0);
+ const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+ bool NeedTest = true;
+ X86::CondCode CC = X86::COND_NE;
+
+ // Optimize conditions coming from a compare if both instructions are in the
+ // same basic block (values defined in other basic blocks may not have
+ // initialized registers).
+ const auto *CI = dyn_cast<CmpInst>(Cond);
+ if (CI && (CI->getParent() == I->getParent())) {
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+
+ // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
+ static const uint16_t SETFOpcTable[2][3] = {
+ { X86::COND_NP, X86::COND_E, X86::TEST8rr },
+ { X86::COND_P, X86::COND_NE, X86::OR8rr }
+ };
+ const uint16_t *SETFOpc = nullptr;
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_OEQ:
+ SETFOpc = &SETFOpcTable[0][0];
+ Predicate = CmpInst::ICMP_NE;
+ break;
+ case CmpInst::FCMP_UNE:
+ SETFOpc = &SETFOpcTable[1][0];
+ Predicate = CmpInst::ICMP_NE;
+ break;
+ }
+
+ bool NeedSwap;
+ std::tie(CC, NeedSwap) = X86::getX86ConditionCode(Predicate);
+ assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
+
+ const Value *CmpLHS = CI->getOperand(0);
+ const Value *CmpRHS = CI->getOperand(1);
+ if (NeedSwap)
+ std::swap(CmpLHS, CmpRHS);
+
+ EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());
+ // Emit a compare of the LHS and RHS, setting the flags.
+ if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
+ return false;
+
+ if (SETFOpc) {
+ Register FlagReg1 = createResultReg(&X86::GR8RegClass);
+ Register FlagReg2 = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
+ FlagReg1).addImm(SETFOpc[0]);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
+ FlagReg2).addImm(SETFOpc[1]);
+ auto const &II = TII.get(SETFOpc[2]);
+ if (II.getNumDefs()) {
+ Register TmpReg = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg)
+ .addReg(FlagReg2).addReg(FlagReg1);
+ } else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+ .addReg(FlagReg2).addReg(FlagReg1);
+ }
+ }
+ NeedTest = false;
+ } else if (foldX86XALUIntrinsic(CC, I, Cond)) {
+ // Fake request the condition, otherwise the intrinsic might be completely
+ // optimized away.
+ Register TmpReg = getRegForValue(Cond);
+ if (TmpReg == 0)
+ return false;
+
+ NeedTest = false;
+ }
+
+ if (NeedTest) {
+    // Selects operate on i1; however, CondReg is 8 bits wide and may contain
+    // garbage. Indeed, only the least significant bit is supposed to be
+    // accurate. If we read more than the lsb, we may see non-zero values
+    // where the lsb is zero. Therefore, we have to truncate CondReg to i1 for
+    // the select. This is achieved by performing a TEST against 1.
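+    // Concretely, we emit "TEST8ri CondReg, 1" below and select on COND_NE,
+    // so only bit 0 of the condition value decides the CMOV.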
+ Register CondReg = getRegForValue(Cond);
+ if (CondReg == 0)
+ return false;
+ bool CondIsKill = hasTrivialKill(Cond);
+
+ // In case OpReg is a K register, COPY to a GPR
+ if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
+ unsigned KCondReg = CondReg;
+ CondReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), CondReg)
+ .addReg(KCondReg, getKillRegState(CondIsKill));
+ CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Op0IsKill=*/true,
+ X86::sub_8bit);
+ }
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
+ .addReg(CondReg, getKillRegState(CondIsKill))
+ .addImm(1);
+ }
+
+ const Value *LHS = I->getOperand(1);
+ const Value *RHS = I->getOperand(2);
+
+ Register RHSReg = getRegForValue(RHS);
+ bool RHSIsKill = hasTrivialKill(RHS);
+
+ Register LHSReg = getRegForValue(LHS);
+ bool LHSIsKill = hasTrivialKill(LHS);
+
+ if (!LHSReg || !RHSReg)
+ return false;
+
+ const TargetRegisterInfo &TRI = *Subtarget->getRegisterInfo();
+ unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(*RC)/8);
+ Register ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill,
+ LHSReg, LHSIsKill, CC);
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+/// Emit SSE or AVX instructions to lower the select.
+///
+/// Try to use SSE1/SSE2 instructions to simulate a select without branches.
+/// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary
+/// SSE instructions are available. If AVX is available, try to use a VBLENDV.
+bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
+ // Optimize conditions coming from a compare if both instructions are in the
+ // same basic block (values defined in other basic blocks may not have
+ // initialized registers).
+ const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0));
+ if (!CI || (CI->getParent() != I->getParent()))
+ return false;
+
+ if (I->getType() != CI->getOperand(0)->getType() ||
+ !((Subtarget->hasSSE1() && RetVT == MVT::f32) ||
+ (Subtarget->hasSSE2() && RetVT == MVT::f64)))
+ return false;
+
+ const Value *CmpLHS = CI->getOperand(0);
+ const Value *CmpRHS = CI->getOperand(1);
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+
+ // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
+ // We don't have to materialize a zero constant for this case and can just use
+ // %x again on the RHS.
+ if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
+ const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
+ if (CmpRHSC && CmpRHSC->isNullValue())
+ CmpRHS = CmpLHS;
+ }
+
+ unsigned CC;
+ bool NeedSwap;
+ std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate);
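+ // Plain SSE compares only encode predicates 0-7 in the immediate; the
+ // extended predicates require the AVX VCMP encodings.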
+ if (CC > 7 && !Subtarget->hasAVX())
+ return false;
+
+ if (NeedSwap)
+ std::swap(CmpLHS, CmpRHS);
+
+ const Value *LHS = I->getOperand(1);
+ const Value *RHS = I->getOperand(2);
+
+ Register LHSReg = getRegForValue(LHS);
+ bool LHSIsKill = hasTrivialKill(LHS);
+
+ Register RHSReg = getRegForValue(RHS);
+ bool RHSIsKill = hasTrivialKill(RHS);
+
+ Register CmpLHSReg = getRegForValue(CmpLHS);
+ bool CmpLHSIsKill = hasTrivialKill(CmpLHS);
+
+ Register CmpRHSReg = getRegForValue(CmpRHS);
+ bool CmpRHSIsKill = hasTrivialKill(CmpRHS);
+
+ if (!LHSReg || !RHSReg || !CmpLHSReg || !CmpRHSReg)
+ return false;
+
+ const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+ unsigned ResultReg;
+
+ if (Subtarget->hasAVX512()) {
+ // If we have AVX512 we can use a mask compare and masked movss/sd.
+ const TargetRegisterClass *VR128X = &X86::VR128XRegClass;
+ const TargetRegisterClass *VK1 = &X86::VK1RegClass;
+
+ unsigned CmpOpcode =
+ (RetVT == MVT::f32) ? X86::VCMPSSZrr : X86::VCMPSDZrr;
+ Register CmpReg = fastEmitInst_rri(CmpOpcode, VK1, CmpLHSReg, CmpLHSIsKill,
+ CmpRHSReg, CmpRHSIsKill, CC);
+
+ // Need an IMPLICIT_DEF for the input that is used to generate the upper
+ // bits of the result register, since it's not based on any of the inputs.
+ Register ImplicitDefReg = createResultReg(VR128X);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
+
+ // Place RHSReg in the passthru operand of the masked movss/sd and LHSReg
+ // in the data input. The mask input comes from the compare.
+ unsigned MovOpcode =
+ (RetVT == MVT::f32) ? X86::VMOVSSZrrk : X86::VMOVSDZrrk;
+ unsigned MovReg = fastEmitInst_rrrr(MovOpcode, VR128X, RHSReg, RHSIsKill,
+ CmpReg, true, ImplicitDefReg, true,
+ LHSReg, LHSIsKill);
+
+ ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(MovReg);
+
+ } else if (Subtarget->hasAVX()) {
+ const TargetRegisterClass *VR128 = &X86::VR128RegClass;
+
+ // If we have AVX, create 1 blendv instead of 3 logic instructions.
+ // Blendv was introduced with SSE 4.1, but the 2 register form implicitly
+ // uses XMM0 as the selection register. That may need just as many
+ // instructions as the AND/ANDN/OR sequence due to register moves, so
+ // don't bother.
+ unsigned CmpOpcode =
+ (RetVT == MVT::f32) ? X86::VCMPSSrr : X86::VCMPSDrr;
+ unsigned BlendOpcode =
+ (RetVT == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr;
+
+ Register CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill,
+ CmpRHSReg, CmpRHSIsKill, CC);
+ Register VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill,
+ LHSReg, LHSIsKill, CmpReg, true);
+ ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg);
+ } else {
+ // Choose the SSE instruction sequence based on data type (float or double).
+ static const uint16_t OpcTable[2][4] = {
+ { X86::CMPSSrr, X86::ANDPSrr, X86::ANDNPSrr, X86::ORPSrr },
+ { X86::CMPSDrr, X86::ANDPDrr, X86::ANDNPDrr, X86::ORPDrr }
+ };
+
+ const uint16_t *Opc = nullptr;
+ switch (RetVT.SimpleTy) {
+ default: return false;
+ case MVT::f32: Opc = &OpcTable[0][0]; break;
+ case MVT::f64: Opc = &OpcTable[1][0]; break;
+ }
+
+ const TargetRegisterClass *VR128 = &X86::VR128RegClass;
+ Register CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
+ CmpRHSReg, CmpRHSIsKill, CC);
+ Register AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg,
+ /*Op0IsKill=*/false, LHSReg, LHSIsKill);
+ Register AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg,
+ /*Op0IsKill=*/true, RHSReg, RHSIsKill);
+ Register OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*Op0IsKill=*/true,
+ AndReg, /*Op1IsKill=*/true);
+ ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(OrReg);
+ }
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
+ // These are pseudo CMOV instructions that will later be expanded into
+ // control flow.
+ unsigned Opc;
+ switch (RetVT.SimpleTy) {
+ default: return false;
+ case MVT::i8: Opc = X86::CMOV_GR8; break;
+ case MVT::i16: Opc = X86::CMOV_GR16; break;
+ case MVT::i32: Opc = X86::CMOV_GR32; break;
+ case MVT::f32: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X
+ : X86::CMOV_FR32; break;
+ case MVT::f64: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X
+ : X86::CMOV_FR64; break;
+ }
+
+ const Value *Cond = I->getOperand(0);
+ X86::CondCode CC = X86::COND_NE;
+
+ // Optimize conditions coming from a compare if both instructions are in the
+ // same basic block (values defined in other basic blocks may not have
+ // initialized registers).
+ const auto *CI = dyn_cast<CmpInst>(Cond);
+ if (CI && (CI->getParent() == I->getParent())) {
+ bool NeedSwap;
+ std::tie(CC, NeedSwap) = X86::getX86ConditionCode(CI->getPredicate());
+ if (CC > X86::LAST_VALID_COND)
+ return false;
+
+ const Value *CmpLHS = CI->getOperand(0);
+ const Value *CmpRHS = CI->getOperand(1);
+
+ if (NeedSwap)
+ std::swap(CmpLHS, CmpRHS);
+
+ EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());
+ if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
+ return false;
+ } else {
+ Register CondReg = getRegForValue(Cond);
+ if (CondReg == 0)
+ return false;
+ bool CondIsKill = hasTrivialKill(Cond);
+
+ // If CondReg is a mask (K) register, COPY it to a GPR first.
+ if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
+ unsigned KCondReg = CondReg;
+ CondReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), CondReg)
+ .addReg(KCondReg, getKillRegState(CondIsKill));
+ CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Op0IsKill=*/true,
+ X86::sub_8bit);
+ }
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
+ .addReg(CondReg, getKillRegState(CondIsKill))
+ .addImm(1);
+ }
+
+ const Value *LHS = I->getOperand(1);
+ const Value *RHS = I->getOperand(2);
+
+ Register LHSReg = getRegForValue(LHS);
+ bool LHSIsKill = hasTrivialKill(LHS);
+
+ Register RHSReg = getRegForValue(RHS);
+ bool RHSIsKill = hasTrivialKill(RHS);
+
+ if (!LHSReg || !RHSReg)
+ return false;
+
+ const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+
+ Register ResultReg =
+ fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC);
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectSelect(const Instruction *I) {
+ MVT RetVT;
+ if (!isTypeLegal(I->getType(), RetVT))
+ return false;
+
+ // Check if we can fold the select.
+ if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) {
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+ const Value *Opnd = nullptr;
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break;
+ case CmpInst::FCMP_TRUE: Opnd = I->getOperand(1); break;
+ }
+ // No need for a select anymore - this is an unconditional move.
+ if (Opnd) {
+ Register OpReg = getRegForValue(Opnd);
+ if (OpReg == 0)
+ return false;
+ bool OpIsKill = hasTrivialKill(Opnd);
+ const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+ Register ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(OpReg, getKillRegState(OpIsKill));
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+ }
+
+ // First try to use real conditional move instructions.
+ if (X86FastEmitCMoveSelect(RetVT, I))
+ return true;
+
+ // Try to use a sequence of SSE instructions to simulate a conditional move.
+ if (X86FastEmitSSESelect(RetVT, I))
+ return true;
+
+ // Fall back to pseudo conditional move instructions, which will later be
+ // converted to control flow.
+ if (X86FastEmitPseudoSelect(RetVT, I))
+ return true;
+
+ return false;
+}
+
+// Common code for X86SelectSIToFP and X86SelectUIToFP.
+bool X86FastISel::X86SelectIntToFP(const Instruction *I, bool IsSigned) {
+ // The target-independent selection algorithm in FastISel already knows how
+ // to select a SINT_TO_FP if the target is SSE but not AVX.
+ // Early exit if the subtarget doesn't have AVX.
+ // Unsigned conversion requires AVX-512.
+ bool HasAVX512 = Subtarget->hasAVX512();
+ if (!Subtarget->hasAVX() || (!IsSigned && !HasAVX512))
+ return false;
+
+ // TODO: We could sign extend narrower types.
+ MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
+ if (SrcVT != MVT::i32 && SrcVT != MVT::i64)
+ return false;
+
+ // Select integer to float/double conversion.
+ Register OpReg = getRegForValue(I->getOperand(0));
+ if (OpReg == 0)
+ return false;
+
+ unsigned Opcode;
+
+ static const uint16_t SCvtOpc[2][2][2] = {
+ { { X86::VCVTSI2SSrr, X86::VCVTSI642SSrr },
+ { X86::VCVTSI2SDrr, X86::VCVTSI642SDrr } },
+ { { X86::VCVTSI2SSZrr, X86::VCVTSI642SSZrr },
+ { X86::VCVTSI2SDZrr, X86::VCVTSI642SDZrr } },
+ };
+ static const uint16_t UCvtOpc[2][2] = {
+ { X86::VCVTUSI2SSZrr, X86::VCVTUSI642SSZrr },
+ { X86::VCVTUSI2SDZrr, X86::VCVTUSI642SDZrr },
+ };
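+ // SCvtOpc is indexed by [HasAVX512][IsDouble][Is64Bit]; UCvtOpc (AVX-512
+ // only) is indexed by [IsDouble][Is64Bit].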
+ bool Is64Bit = SrcVT == MVT::i64;
+
+ if (I->getType()->isDoubleTy()) {
+ // s/uitofp int -> double
+ Opcode = IsSigned ? SCvtOpc[HasAVX512][1][Is64Bit] : UCvtOpc[1][Is64Bit];
+ } else if (I->getType()->isFloatTy()) {
+ // s/uitofp int -> float
+ Opcode = IsSigned ? SCvtOpc[HasAVX512][0][Is64Bit] : UCvtOpc[0][Is64Bit];
+ } else
+ return false;
+
+ MVT DstVT = TLI.getValueType(DL, I->getType()).getSimpleVT();
+ const TargetRegisterClass *RC = TLI.getRegClassFor(DstVT);
+ Register ImplicitDefReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
+ Register ResultReg =
+ fastEmitInst_rr(Opcode, RC, ImplicitDefReg, true, OpReg, false);
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectSIToFP(const Instruction *I) {
+ return X86SelectIntToFP(I, /*IsSigned*/true);
+}
+
+bool X86FastISel::X86SelectUIToFP(const Instruction *I) {
+ return X86SelectIntToFP(I, /*IsSigned*/false);
+}
+
+// Helper method used by X86SelectFPExt and X86SelectFPTrunc.
+bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
+ unsigned TargetOpc,
+ const TargetRegisterClass *RC) {
+ assert((I->getOpcode() == Instruction::FPExt ||
+ I->getOpcode() == Instruction::FPTrunc) &&
+ "Instruction must be an FPExt or FPTrunc!");
+ bool HasAVX = Subtarget->hasAVX();
+
+ Register OpReg = getRegForValue(I->getOperand(0));
+ if (OpReg == 0)
+ return false;
+
+ unsigned ImplicitDefReg;
+ if (HasAVX) {
+ ImplicitDefReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
+
+ }
+
+ Register ResultReg = createResultReg(RC);
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpc),
+ ResultReg);
+
+ if (HasAVX)
+ MIB.addReg(ImplicitDefReg);
+
+ MIB.addReg(OpReg);
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectFPExt(const Instruction *I) {
+ if (X86ScalarSSEf64 && I->getType()->isDoubleTy() &&
+ I->getOperand(0)->getType()->isFloatTy()) {
+ bool HasAVX512 = Subtarget->hasAVX512();
+ // fpext from float to double.
+ unsigned Opc =
+ HasAVX512 ? X86::VCVTSS2SDZrr
+ : Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr;
+ return X86SelectFPExtOrFPTrunc(I, Opc, TLI.getRegClassFor(MVT::f64));
+ }
+
+ return false;
+}
+
+bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
+ if (X86ScalarSSEf64 && I->getType()->isFloatTy() &&
+ I->getOperand(0)->getType()->isDoubleTy()) {
+ bool HasAVX512 = Subtarget->hasAVX512();
+ // fptrunc from double to float.
+ unsigned Opc =
+ HasAVX512 ? X86::VCVTSD2SSZrr
+ : Subtarget->hasAVX() ? X86::VCVTSD2SSrr : X86::CVTSD2SSrr;
+ return X86SelectFPExtOrFPTrunc(I, Opc, TLI.getRegClassFor(MVT::f32));
+ }
+
+ return false;
+}
+
+bool X86FastISel::X86SelectTrunc(const Instruction *I) {
+ EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(DL, I->getType());
+
+ // This code only handles truncation to byte.
+ if (DstVT != MVT::i8 && DstVT != MVT::i1)
+ return false;
+ if (!TLI.isTypeLegal(SrcVT))
+ return false;
+
+ Register InputReg = getRegForValue(I->getOperand(0));
+ if (!InputReg)
+ // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+
+ if (SrcVT == MVT::i8) {
+ // Truncate from i8 to i1; no code needed.
+ updateValueMap(I, InputReg);
+ return true;
+ }
+
+ // Issue an extract_subreg.
+ Register ResultReg = fastEmitInst_extractsubreg(MVT::i8,
+ InputReg, false,
+ X86::sub_8bit);
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
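+/// Return true if a memcpy of the given length is small enough to expand
+/// inline: at most 32 bytes in 64-bit mode, 16 bytes otherwise.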
+bool X86FastISel::IsMemcpySmall(uint64_t Len) {
+ return Len <= (Subtarget->is64Bit() ? 32 : 16);
+}
+
+bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
+ X86AddressMode SrcAM, uint64_t Len) {
+
+ // Make sure we don't bloat code by inlining very large memcpys.
+ if (!IsMemcpySmall(Len))
+ return false;
+
+ bool i64Legal = Subtarget->is64Bit();
+
+ // We don't care about alignment here since we just emit integer accesses.
+ while (Len) {
+ MVT VT;
+ if (Len >= 8 && i64Legal)
+ VT = MVT::i64;
+ else if (Len >= 4)
+ VT = MVT::i32;
+ else if (Len >= 2)
+ VT = MVT::i16;
+ else
+ VT = MVT::i8;
+
+ unsigned Reg;
+ bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg);
+ RV &= X86FastEmitStore(VT, Reg, /*ValIsKill=*/true, DestAM);
+ assert(RV && "Failed to emit load or store??");
+
+ unsigned Size = VT.getSizeInBits()/8;
+ Len -= Size;
+ DestAM.Disp += Size;
+ SrcAM.Disp += Size;
+ }
+
+ return true;
+}
+
+bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
+ // FIXME: Handle more intrinsics.
+ switch (II->getIntrinsicID()) {
+ default: return false;
+ case Intrinsic::convert_from_fp16:
+ case Intrinsic::convert_to_fp16: {
+ if (Subtarget->useSoftFloat() || !Subtarget->hasF16C())
+ return false;
+
+ const Value *Op = II->getArgOperand(0);
+ Register InputReg = getRegForValue(Op);
+ if (InputReg == 0)
+ return false;
+
+ // F16C only allows converting from float to half and from half to float.
+ bool IsFloatToHalf = II->getIntrinsicID() == Intrinsic::convert_to_fp16;
+ if (IsFloatToHalf) {
+ if (!Op->getType()->isFloatTy())
+ return false;
+ } else {
+ if (!II->getType()->isFloatTy())
+ return false;
+ }
+
+ unsigned ResultReg = 0;
+ const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::v8i16);
+ if (IsFloatToHalf) {
+ // 'InputReg' is implicitly promoted from register class FR32 to
+ // register class VR128 by method 'constrainOperandRegClass' which is
+ // directly called by 'fastEmitInst_ri'.
+ // Instruction VCVTPS2PHrr takes an extra immediate operand which is
+ // used to provide rounding control: use MXCSR.RC, encoded as 0b100.
+ // It's consistent with the other FP instructions, which are usually
+ // controlled by MXCSR.
+ unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPS2PHZ128rr
+ : X86::VCVTPS2PHrr;
+ InputReg = fastEmitInst_ri(Opc, RC, InputReg, false, 4);
+
+ // Move the lower 32-bits of ResultReg to another register of class GR32.
+ Opc = Subtarget->hasAVX512() ? X86::VMOVPDI2DIZrr
+ : X86::VMOVPDI2DIrr;
+ ResultReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(InputReg, RegState::Kill);
+
+ // The result value is in the lower 16-bits of ResultReg.
+ unsigned RegIdx = X86::sub_16bit;
+ ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, true, RegIdx);
+ } else {
+ assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!");
+ // Explicitly zero-extend the input to 32-bit.
+ InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::ZERO_EXTEND, InputReg,
+ /*Op0IsKill=*/false);
+
+ // The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr.
+ InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR,
+ InputReg, /*Op0IsKill=*/true);
+
+ unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPH2PSZ128rr
+ : X86::VCVTPH2PSrr;
+ InputReg = fastEmitInst_r(Opc, RC, InputReg, /*Op0IsKill=*/true);
+
+ // The result value is in the lower 32-bits of ResultReg.
+ // Emit an explicit copy from register class VR128 to register class FR32.
+ ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(InputReg, RegState::Kill);
+ }
+
+ updateValueMap(II, ResultReg);
+ return true;
+ }
+ case Intrinsic::frameaddress: {
+ MachineFunction *MF = FuncInfo.MF;
+ if (MF->getTarget().getMCAsmInfo()->usesWindowsCFI())
+ return false;
+
+ Type *RetTy = II->getCalledFunction()->getReturnType();
+
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ unsigned Opc;
+ const TargetRegisterClass *RC = nullptr;
+
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Invalid result type for frameaddress.");
+ case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break;
+ case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break;
+ }
+
+ // This needs to be set before we call getPtrSizedFrameRegister, otherwise
+ // we get the wrong frame register.
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ MFI.setFrameAddressIsTaken(true);
+
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*MF);
+ assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
+ (FrameReg == X86::EBP && VT == MVT::i32)) &&
+ "Invalid Frame Register!");
+
+ // Always make a copy of the frame register to a vreg first, so that we
+ // never directly reference the frame register (the TwoAddressInstruction-
+ // Pass doesn't like that).
+ Register SrcReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg);
+
+ // Now recursively load from the frame address.
+ // movq (%rbp), %rax
+ // movq (%rax), %rax
+ // movq (%rax), %rax
+ // ...
+ unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
+ while (Depth--) {
+ Register DestReg = createResultReg(RC);
+ addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), DestReg), SrcReg);
+ SrcReg = DestReg;
+ }
+
+ updateValueMap(II, SrcReg);
+ return true;
+ }
+ case Intrinsic::memcpy: {
+ const MemCpyInst *MCI = cast<MemCpyInst>(II);
+ // Don't handle volatile or variable length memcpys.
+ if (MCI->isVolatile())
+ return false;
+
+ if (isa<ConstantInt>(MCI->getLength())) {
+ // Small memcpys are common enough that we want to do them
+ // without a call if possible.
+ uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue();
+ if (IsMemcpySmall(Len)) {
+ X86AddressMode DestAM, SrcAM;
+ if (!X86SelectAddress(MCI->getRawDest(), DestAM) ||
+ !X86SelectAddress(MCI->getRawSource(), SrcAM))
+ return false;
+ TryEmitSmallMemcpy(DestAM, SrcAM, Len);
+ return true;
+ }
+ }
+
+ unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
+ if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth))
+ return false;
+
+ if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255)
+ return false;
+
+ return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 1);
+ }
+ case Intrinsic::memset: {
+ const MemSetInst *MSI = cast<MemSetInst>(II);
+
+ if (MSI->isVolatile())
+ return false;
+
+ unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
+ if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth))
+ return false;
+
+ if (MSI->getDestAddressSpace() > 255)
+ return false;
+
+ return lowerCallTo(II, "memset", II->getNumArgOperands() - 1);
+ }
+ case Intrinsic::stackprotector: {
+ // Emit code to store the stack guard onto the stack.
+ EVT PtrTy = TLI.getPointerTy(DL);
+
+ const Value *Op1 = II->getArgOperand(0); // The guard's value.
+ const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1));
+
+ MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]);
+
+ // Grab the frame index.
+ X86AddressMode AM;
+ if (!X86SelectAddress(Slot, AM)) return false;
+ if (!X86FastEmitStore(PtrTy, Op1, AM)) return false;
+ return true;
+ }
+ case Intrinsic::dbg_declare: {
+ const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
+ X86AddressMode AM;
+ assert(DI->getAddress() && "Null address should be checked earlier!");
+ if (!X86SelectAddress(DI->getAddress(), AM))
+ return false;
+ const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
+ // FIXME: may need to add RegState::Debug to any registers produced,
+ // although ESP/EBP should be the only ones at the moment.
+ assert(DI->getVariable()->isValidLocationForIntrinsic(DbgLoc) &&
+ "Expected inlined-at fields to agree");
+ addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM)
+ .addImm(0)
+ .addMetadata(DI->getVariable())
+ .addMetadata(DI->getExpression());
+ return true;
+ }
+ case Intrinsic::trap: {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP));
+ return true;
+ }
+ case Intrinsic::sqrt: {
+ if (!Subtarget->hasSSE1())
+ return false;
+
+ Type *RetTy = II->getCalledFunction()->getReturnType();
+
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ // Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT
+ // is not generated by FastISel yet.
+ // FIXME: Update this code once tablegen can handle it.
+ static const uint16_t SqrtOpc[3][2] = {
+ { X86::SQRTSSr, X86::SQRTSDr },
+ { X86::VSQRTSSr, X86::VSQRTSDr },
+ { X86::VSQRTSSZr, X86::VSQRTSDZr },
+ };
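+ // SqrtOpc is indexed by [AVXLevel][IsDouble].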
+ unsigned AVXLevel = Subtarget->hasAVX512() ? 2 :
+ Subtarget->hasAVX() ? 1 :
+ 0;
+ unsigned Opc;
+ switch (VT.SimpleTy) {
+ default: return false;
+ case MVT::f32: Opc = SqrtOpc[AVXLevel][0]; break;
+ case MVT::f64: Opc = SqrtOpc[AVXLevel][1]; break;
+ }
+
+ const Value *SrcVal = II->getArgOperand(0);
+ Register SrcReg = getRegForValue(SrcVal);
+
+ if (SrcReg == 0)
+ return false;
+
+ const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
+ unsigned ImplicitDefReg = 0;
+ if (AVXLevel > 0) {
+ ImplicitDefReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
+ }
+
+ Register ResultReg = createResultReg(RC);
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+ ResultReg);
+
+ if (ImplicitDefReg)
+ MIB.addReg(ImplicitDefReg);
+
+ MIB.addReg(SrcReg);
+
+ updateValueMap(II, ResultReg);
+ return true;
+ }
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow: {
+ // This implements the basic lowering of the xalu with overflow intrinsics
+ // into add/sub/mul followed by either seto or setb.
+ const Function *Callee = II->getCalledFunction();
+ auto *Ty = cast<StructType>(Callee->getReturnType());
+ Type *RetTy = Ty->getTypeAtIndex(0U);
+ assert(Ty->getTypeAtIndex(1)->isIntegerTy() &&
+ Ty->getTypeAtIndex(1)->getScalarSizeInBits() == 1 &&
+ "Overflow value expected to be an i1");
+
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ if (VT < MVT::i8 || VT > MVT::i64)
+ return false;
+
+ const Value *LHS = II->getArgOperand(0);
+ const Value *RHS = II->getArgOperand(1);
+
+ // Canonicalize immediate to the RHS.
+ if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && II->isCommutative())
+ std::swap(LHS, RHS);
+
+ unsigned BaseOpc, CondCode;
+ switch (II->getIntrinsicID()) {
+ default: llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::sadd_with_overflow:
+ BaseOpc = ISD::ADD; CondCode = X86::COND_O; break;
+ case Intrinsic::uadd_with_overflow:
+ BaseOpc = ISD::ADD; CondCode = X86::COND_B; break;
+ case Intrinsic::ssub_with_overflow:
+ BaseOpc = ISD::SUB; CondCode = X86::COND_O; break;
+ case Intrinsic::usub_with_overflow:
+ BaseOpc = ISD::SUB; CondCode = X86::COND_B; break;
+ case Intrinsic::smul_with_overflow:
+ BaseOpc = X86ISD::SMUL; CondCode = X86::COND_O; break;
+ case Intrinsic::umul_with_overflow:
+ BaseOpc = X86ISD::UMUL; CondCode = X86::COND_O; break;
+ }
+
+ Register LHSReg = getRegForValue(LHS);
+ if (LHSReg == 0)
+ return false;
+ bool LHSIsKill = hasTrivialKill(LHS);
+
+ unsigned ResultReg = 0;
+ // Check if we have an immediate version.
+ if (const auto *CI = dyn_cast<ConstantInt>(RHS)) {
+ static const uint16_t Opc[2][4] = {
+ { X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r },
+ { X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r }
+ };
+
+ if (CI->isOne() && (BaseOpc == ISD::ADD || BaseOpc == ISD::SUB) &&
+ CondCode == X86::COND_O) {
+ // We can use INC/DEC.
+ ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ bool IsDec = BaseOpc == ISD::SUB;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg)
+ .addReg(LHSReg, getKillRegState(LHSIsKill));
+ } else
+ ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill,
+ CI->getZExtValue());
+ }
+
+ unsigned RHSReg;
+ bool RHSIsKill;
+ if (!ResultReg) {
+ RHSReg = getRegForValue(RHS);
+ if (RHSReg == 0)
+ return false;
+ RHSIsKill = hasTrivialKill(RHS);
+ ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill);
+ }
+
+ // FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit
+ // it manually.
+ if (BaseOpc == X86ISD::UMUL && !ResultReg) {
+ static const uint16_t MULOpc[] =
+ { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r };
+ static const MCPhysReg Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };
+ // First copy the first operand into AL/AX/EAX/RAX (depending on the type),
+ // which is the implicit input to the X86::MUL*r instruction.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])
+ .addReg(LHSReg, getKillRegState(LHSIsKill));
+ ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
+ TLI.getRegClassFor(VT), RHSReg, RHSIsKill);
+ } else if (BaseOpc == X86ISD::SMUL && !ResultReg) {
+ static const uint16_t MULOpc[] =
+ { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr };
+ if (VT == MVT::i8) {
+ // Copy the first operand into AL, which is an implicit input to the
+ // X86::IMUL8r instruction.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), X86::AL)
+ .addReg(LHSReg, getKillRegState(LHSIsKill));
+ ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg,
+ RHSIsKill);
+ } else
+ ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8],
+ TLI.getRegClassFor(VT), LHSReg, LHSIsKill,
+ RHSReg, RHSIsKill);
+ }
+
+ if (!ResultReg)
+ return false;
+
+ // Assign to a GPR since the overflow return value is lowered to a SETcc.
+ Register ResultReg2 = createResultReg(&X86::GR8RegClass);
+ assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
+ ResultReg2).addImm(CondCode);
+
+ updateValueMap(II, ResultReg, 2);
+ return true;
+ }
+ case Intrinsic::x86_sse_cvttss2si:
+ case Intrinsic::x86_sse_cvttss2si64:
+ case Intrinsic::x86_sse2_cvttsd2si:
+ case Intrinsic::x86_sse2_cvttsd2si64: {
+ bool IsInputDouble;
+ switch (II->getIntrinsicID()) {
+ default: llvm_unreachable("Unexpected intrinsic.");
+ case Intrinsic::x86_sse_cvttss2si:
+ case Intrinsic::x86_sse_cvttss2si64:
+ if (!Subtarget->hasSSE1())
+ return false;
+ IsInputDouble = false;
+ break;
+ case Intrinsic::x86_sse2_cvttsd2si:
+ case Intrinsic::x86_sse2_cvttsd2si64:
+ if (!Subtarget->hasSSE2())
+ return false;
+ IsInputDouble = true;
+ break;
+ }
+
+ Type *RetTy = II->getCalledFunction()->getReturnType();
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ static const uint16_t CvtOpc[3][2][2] = {
+ { { X86::CVTTSS2SIrr, X86::CVTTSS2SI64rr },
+ { X86::CVTTSD2SIrr, X86::CVTTSD2SI64rr } },
+ { { X86::VCVTTSS2SIrr, X86::VCVTTSS2SI64rr },
+ { X86::VCVTTSD2SIrr, X86::VCVTTSD2SI64rr } },
+ { { X86::VCVTTSS2SIZrr, X86::VCVTTSS2SI64Zrr },
+ { X86::VCVTTSD2SIZrr, X86::VCVTTSD2SI64Zrr } },
+ };
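+ // CvtOpc is indexed by [AVXLevel][IsInputDouble][Is64BitResult].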
+ unsigned AVXLevel = Subtarget->hasAVX512() ? 2 :
+ Subtarget->hasAVX() ? 1 :
+ 0;
+ unsigned Opc;
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unexpected result type.");
+ case MVT::i32: Opc = CvtOpc[AVXLevel][IsInputDouble][0]; break;
+ case MVT::i64: Opc = CvtOpc[AVXLevel][IsInputDouble][1]; break;
+ }
+
+ // Check if we can fold insertelement instructions into the convert.
+ const Value *Op = II->getArgOperand(0);
+ while (auto *IE = dyn_cast<InsertElementInst>(Op)) {
+ const Value *Index = IE->getOperand(2);
+ if (!isa<ConstantInt>(Index))
+ break;
+ unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
+
+ if (Idx == 0) {
+ Op = IE->getOperand(1);
+ break;
+ }
+ Op = IE->getOperand(0);
+ }
+
+ Register Reg = getRegForValue(Op);
+ if (Reg == 0)
+ return false;
+
+ Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(Reg);
+
+ updateValueMap(II, ResultReg);
+ return true;
+ }
+ }
+}
+
+bool X86FastISel::fastLowerArguments() {
+ if (!FuncInfo.CanLowerReturn)
+ return false;
+
+ const Function *F = FuncInfo.Fn;
+ if (F->isVarArg())
+ return false;
+
+ CallingConv::ID CC = F->getCallingConv();
+ if (CC != CallingConv::C)
+ return false;
+
+ if (Subtarget->isCallingConvWin64(CC))
+ return false;
+
+ if (!Subtarget->is64Bit())
+ return false;
+
+ if (Subtarget->useSoftFloat())
+ return false;
+
+ // Only handle simple cases, i.e. up to 6 i32/i64 GPR arguments and up to
+ // 8 f32/f64 XMM arguments.
+ unsigned GPRCnt = 0;
+ unsigned FPRCnt = 0;
+ for (auto const &Arg : F->args()) {
+ if (Arg.hasAttribute(Attribute::ByVal) ||
+ Arg.hasAttribute(Attribute::InReg) ||
+ Arg.hasAttribute(Attribute::StructRet) ||
+ Arg.hasAttribute(Attribute::SwiftSelf) ||
+ Arg.hasAttribute(Attribute::SwiftError) ||
+ Arg.hasAttribute(Attribute::Nest))
+ return false;
+
+ Type *ArgTy = Arg.getType();
+ if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
+ return false;
+
+ EVT ArgVT = TLI.getValueType(DL, ArgTy);
+ if (!ArgVT.isSimple()) return false;
+ switch (ArgVT.getSimpleVT().SimpleTy) {
+ default: return false;
+ case MVT::i32:
+ case MVT::i64:
+ ++GPRCnt;
+ break;
+ case MVT::f32:
+ case MVT::f64:
+ if (!Subtarget->hasSSE1())
+ return false;
+ ++FPRCnt;
+ break;
+ }
+
+ if (GPRCnt > 6)
+ return false;
+
+ if (FPRCnt > 8)
+ return false;
+ }
+
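+ // Argument registers for the SysV AMD64 calling convention (the Win64
+ // convention was rejected above).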
+ static const MCPhysReg GPR32ArgRegs[] = {
+ X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D
+ };
+ static const MCPhysReg GPR64ArgRegs[] = {
+ X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9
+ };
+ static const MCPhysReg XMMArgRegs[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
+
+ unsigned GPRIdx = 0;
+ unsigned FPRIdx = 0;
+ for (auto const &Arg : F->args()) {
+ MVT VT = TLI.getSimpleValueType(DL, Arg.getType());
+ const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
+ unsigned SrcReg;
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unexpected value type.");
+ case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break;
+ case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break;
+ case MVT::f32: LLVM_FALLTHROUGH;
+ case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break;
+ }
+ Register DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
+ // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
+ // Without this, EmitLiveInCopies may eliminate the livein if its only
+ // use is a bitcast (which isn't turned into an instruction).
+ Register ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(DstReg, getKillRegState(true));
+ updateValueMap(&Arg, ResultReg);
+ }
+ return true;
+}
+
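+/// Return the number of bytes the callee pops for the hidden sret pointer:
+/// 4 on 32-bit targets where the callee pops the sret argument, 0 otherwise.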
+static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget,
+ CallingConv::ID CC,
+ const CallBase *CB) {
+ if (Subtarget->is64Bit())
+ return 0;
+ if (Subtarget->getTargetTriple().isOSMSVCRT())
+ return 0;
+ if (CC == CallingConv::Fast || CC == CallingConv::GHC ||
+ CC == CallingConv::HiPE || CC == CallingConv::Tail)
+ return 0;
+
+ if (CB)
+ if (CB->arg_empty() || !CB->paramHasAttr(0, Attribute::StructRet) ||
+ CB->paramHasAttr(0, Attribute::InReg) || Subtarget->isTargetMCU())
+ return 0;
+
+ return 4;
+}
+
+bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
+ auto &OutVals = CLI.OutVals;
+ auto &OutFlags = CLI.OutFlags;
+ auto &OutRegs = CLI.OutRegs;
+ auto &Ins = CLI.Ins;
+ auto &InRegs = CLI.InRegs;
+ CallingConv::ID CC = CLI.CallConv;
+ bool &IsTailCall = CLI.IsTailCall;
+ bool IsVarArg = CLI.IsVarArg;
+ const Value *Callee = CLI.Callee;
+ MCSymbol *Symbol = CLI.Symbol;
+
+ bool Is64Bit = Subtarget->is64Bit();
+ bool IsWin64 = Subtarget->isCallingConvWin64(CC);
+
+ const CallInst *CI = dyn_cast_or_null<CallInst>(CLI.CB);
+ const Function *CalledFn = CI ? CI->getCalledFunction() : nullptr;
+
+ // Call / invoke instructions with NoCfCheck attribute require special
+ // handling.
+ const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CB);
+ if ((CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck()))
+ return false;
+
+ // Functions with no_caller_saved_registers need special handling.
+ if ((CI && CI->hasFnAttr("no_caller_saved_registers")) ||
+ (CalledFn && CalledFn->hasFnAttribute("no_caller_saved_registers")))
+ return false;
+
+ // Functions using thunks for indirect calls need to use SDISel.
+ if (Subtarget->useIndirectThunkCalls())
+ return false;
+
+ // Handle only the calling conventions listed in the switch below for now.
+ switch (CC) {
+ default: return false;
+ case CallingConv::C:
+ case CallingConv::Fast:
+ case CallingConv::Tail:
+ case CallingConv::WebKit_JS:
+ case CallingConv::Swift:
+ case CallingConv::X86_FastCall:
+ case CallingConv::X86_StdCall:
+ case CallingConv::X86_ThisCall:
+ case CallingConv::Win64:
+ case CallingConv::X86_64_SysV:
+ case CallingConv::CFGuard_Check:
+ break;
+ }
+
+ // Allow SelectionDAG isel to handle tail calls.
+ if (IsTailCall)
+ return false;
+
+ // fastcc with -tailcallopt is intended to provide a guaranteed
+ // tail call optimization. FastISel doesn't know how to do that.
+ if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) ||
+ CC == CallingConv::Tail)
+ return false;
+
+ // Don't know how to handle Win64 varargs yet. Nothing special needed for
+ // x86-32. Special handling for x86-64 is implemented.
+ if (IsVarArg && IsWin64)
+ return false;
+
+ // Don't know about inalloca yet.
+ if (CLI.CB && CLI.CB->hasInAllocaArgument())
+ return false;
+
+ for (auto Flag : CLI.OutFlags)
+ if (Flag.isSwiftError() || Flag.isPreallocated())
+ return false;
+
+ SmallVector<MVT, 16> OutVTs;
+ SmallVector<unsigned, 16> ArgRegs;
+
+ // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra
+ // instruction. This is safe because it is common to all FastISel supported
+ // calling conventions on x86.
+ for (int i = 0, e = OutVals.size(); i != e; ++i) {
+ Value *&Val = OutVals[i];
+ ISD::ArgFlagsTy Flags = OutFlags[i];
+ if (auto *CI = dyn_cast<ConstantInt>(Val)) {
+ if (CI->getBitWidth() < 32) {
+ if (Flags.isSExt())
+ Val = ConstantExpr::getSExt(CI, Type::getInt32Ty(CI->getContext()));
+ else
+ Val = ConstantExpr::getZExt(CI, Type::getInt32Ty(CI->getContext()));
+ }
+ }
+
+ // Passing bools around ends up doing a trunc to i1 and passing it.
+ // Codegen this as an argument + "and 1".
+ MVT VT;
+ auto *TI = dyn_cast<TruncInst>(Val);
+ unsigned ResultReg;
+ if (TI && TI->getType()->isIntegerTy(1) && CLI.CB &&
+ (TI->getParent() == CLI.CB->getParent()) && TI->hasOneUse()) {
+ Value *PrevVal = TI->getOperand(0);
+ ResultReg = getRegForValue(PrevVal);
+
+ if (!ResultReg)
+ return false;
+
+ if (!isTypeLegal(PrevVal->getType(), VT))
+ return false;
+
+ ResultReg =
+ fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1);
+ } else {
+ if (!isTypeLegal(Val->getType(), VT) ||
+ (VT.isVector() && VT.getVectorElementType() == MVT::i1))
+ return false;
+ ResultReg = getRegForValue(Val);
+ }
+
+ if (!ResultReg)
+ return false;
+
+ ArgRegs.push_back(ResultReg);
+ OutVTs.push_back(VT);
+ }
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext());
+
+ // Allocate shadow area for Win64
+ if (IsWin64)
+ CCInfo.AllocateStack(32, Align(8));
+
+ CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
+
+ // Issue CALLSEQ_START
+ unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
+ .addImm(NumBytes).addImm(0).addImm(0);
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign const &VA = ArgLocs[i];
+ const Value *ArgVal = OutVals[VA.getValNo()];
+ MVT ArgVT = OutVTs[VA.getValNo()];
+
+ if (ArgVT == MVT::x86mmx)
+ return false;
+
+ unsigned ArgReg = ArgRegs[VA.getValNo()];
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt: {
+ assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+ "Unexpected extend");
+
+ if (ArgVT == MVT::i1)
+ return false;
+
+ bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
+ ArgVT, ArgReg);
+ assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ case CCValAssign::ZExt: {
+ assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+ "Unexpected extend");
+
+ // Handle zero-extension from i1 to i8, which is common.
+ if (ArgVT == MVT::i1) {
+ // Set the high bits to zero.
+ ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg, /*TODO: Kill=*/false);
+ ArgVT = MVT::i8;
+
+ if (ArgReg == 0)
+ return false;
+ }
+
+ bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
+ ArgVT, ArgReg);
+ assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ case CCValAssign::AExt: {
+ assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+ "Unexpected extend");
+ bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg,
+ ArgVT, ArgReg);
+ if (!Emitted)
+ Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
+ ArgVT, ArgReg);
+ if (!Emitted)
+ Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
+ ArgVT, ArgReg);
+
+ assert(Emitted && "Failed to emit an aext!"); (void)Emitted;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ case CCValAssign::BCvt: {
+ ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg,
+ /*TODO: Kill=*/false);
+ assert(ArgReg && "Failed to emit a bitcast!");
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ case CCValAssign::VExt:
+ // VExt has not been implemented, so this should be impossible to reach
+ // for now. However, fall back to SelectionDAG isel once implemented.
+ return false;
+ case CCValAssign::AExtUpper:
+ case CCValAssign::SExtUpper:
+ case CCValAssign::ZExtUpper:
+ case CCValAssign::FPExt:
+ case CCValAssign::Trunc:
+ llvm_unreachable("Unexpected loc info!");
+ case CCValAssign::Indirect:
+ // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
+ // support this.
+ return false;
+ }
+
+ if (VA.isRegLoc()) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
+ OutRegs.push_back(VA.getLocReg());
+ } else {
+ assert(VA.isMemLoc() && "Unknown value location!");
+
+ // Don't emit stores for undef values.
+ if (isa<UndefValue>(ArgVal))
+ continue;
+
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ X86AddressMode AM;
+ AM.Base.Reg = RegInfo->getStackRegister();
+ AM.Disp = LocMemOffset;
+ ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()];
+ Align Alignment = DL.getABITypeAlign(ArgVal->getType());
+ MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+ MachinePointerInfo::getStack(*FuncInfo.MF, LocMemOffset),
+ MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
+ if (Flags.isByVal()) {
+ X86AddressMode SrcAM;
+ SrcAM.Base.Reg = ArgReg;
+ if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()))
+ return false;
+ } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) {
+ // If this is a really simple value, emit this with the Value* version
+ // of X86FastEmitStore. If it isn't simple, we don't want to do this,
+ // as it can cause us to reevaluate the argument.
+ if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO))
+ return false;
+ } else {
+ bool ValIsKill = hasTrivialKill(ArgVal);
+ if (!X86FastEmitStore(ArgVT, ArgReg, ValIsKill, AM, MMO))
+ return false;
+ }
+ }
+ }
+
+ // ELF/PIC requires the GOT pointer to be in the EBX register before
+ // function calls made via the PLT.
+ if (Subtarget->isPICStyleGOT()) {
+ unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base);
+ }
+
+ if (Is64Bit && IsVarArg && !IsWin64) {
+ // From AMD64 ABI document:
+ // For calls that may call functions that use varargs or stdargs
+ // (prototype-less calls or calls to functions containing ellipsis (...) in
+ // the declaration) %al is used as hidden argument to specify the number
+ // of SSE registers used. The contents of %al do not need to match exactly
+ // the number of registers, but must be an upper bound on the number of SSE
+ // registers used and is in the range 0 - 8 inclusive.
+
+ // Count the number of XMM registers allocated.
+ static const MCPhysReg XMMArgRegs[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
+ assert((Subtarget->hasSSE1() || !NumXMMRegs)
+ && "SSE registers cannot be used when SSE is disabled");
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
+ X86::AL).addImm(NumXMMRegs);
+ }
+
+ // Materialize callee address in a register. FIXME: GV address can be
+ // handled with a CALLpcrel32 instead.
+ X86AddressMode CalleeAM;
+ if (!X86SelectCallAddress(Callee, CalleeAM))
+ return false;
+
+ unsigned CalleeOp = 0;
+ const GlobalValue *GV = nullptr;
+ if (CalleeAM.GV != nullptr) {
+ GV = CalleeAM.GV;
+ } else if (CalleeAM.Base.Reg != 0) {
+ CalleeOp = CalleeAM.Base.Reg;
+ } else
+ return false;
+
+ // Issue the call.
+ MachineInstrBuilder MIB;
+ if (CalleeOp) {
+ // Register-indirect call.
+ unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r;
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc))
+ .addReg(CalleeOp);
+ } else {
+ // Direct call.
+ assert(GV && "Not a direct call");
+ // See if we need any target-specific flags on the GV operand.
+ unsigned char OpFlags = Subtarget->classifyGlobalFunctionReference(GV);
+
+ // This will be a direct call, or an indirect call through memory for
+ // NonLazyBind calls or dllimport calls.
+ bool NeedLoad = OpFlags == X86II::MO_DLLIMPORT ||
+ OpFlags == X86II::MO_GOTPCREL ||
+ OpFlags == X86II::MO_COFFSTUB;
+ unsigned CallOpc = NeedLoad
+ ? (Is64Bit ? X86::CALL64m : X86::CALL32m)
+ : (Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32);
+
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc));
+ if (NeedLoad)
+ MIB.addReg(Is64Bit ? X86::RIP : 0).addImm(1).addReg(0);
+ if (Symbol)
+ MIB.addSym(Symbol, OpFlags);
+ else
+ MIB.addGlobalAddress(GV, 0, OpFlags);
+ if (NeedLoad)
+ MIB.addReg(0);
+ }
+
+ // Add a register mask operand representing the call-preserved registers.
+ // Proper defs for return values will be added by setPhysRegsDeadExcept().
+ MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
+
+ // Add the GOT pointer in EBX as an implicit use.
+ if (Subtarget->isPICStyleGOT())
+ MIB.addReg(X86::EBX, RegState::Implicit);
+
+ if (Is64Bit && IsVarArg && !IsWin64)
+ MIB.addReg(X86::AL, RegState::Implicit);
+
+ // Add implicit physical register uses to the call.
+ for (auto Reg : OutRegs)
+ MIB.addReg(Reg, RegState::Implicit);
+
+ // Issue CALLSEQ_END
+ unsigned NumBytesForCalleeToPop =
+ X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg,
+ TM.Options.GuaranteedTailCallOpt)
+ ? NumBytes // Callee pops everything.
+ : computeBytesPoppedByCalleeForSRet(Subtarget, CC, CLI.CB);
+ unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
+ .addImm(NumBytes).addImm(NumBytesForCalleeToPop);
+
+ // Now handle call return values.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs,
+ CLI.RetTy->getContext());
+ CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);
+
+ // Copy all of the result registers out of their specified physreg.
+ Register ResultReg = FuncInfo.CreateRegs(CLI.RetTy);
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ EVT CopyVT = VA.getValVT();
+ unsigned CopyReg = ResultReg + i;
+ Register SrcReg = VA.getLocReg();
+
+ // If this is x86-64 and we disabled SSE, we can't return FP values.
+ if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
+ ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
+ report_fatal_error("SSE register return with SSE disabled");
+ }
+
+ // If we prefer to use the value in xmm registers, copy it out as f80 and
+ // use a truncate to move it from fp stack reg to xmm reg.
+ if ((SrcReg == X86::FP0 || SrcReg == X86::FP1) &&
+ isScalarFPTypeInSSEReg(VA.getValVT())) {
+ CopyVT = MVT::f80;
+ CopyReg = createResultReg(&X86::RFP80RegClass);
+ }
+
+ // Copy out the result.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), CopyReg).addReg(SrcReg);
+ InRegs.push_back(VA.getLocReg());
+
+ // Round the f80 to the right size, which also moves it to the appropriate
+ // xmm register. This is accomplished by storing the f80 value in memory
+ // and then loading it back.
+ if (CopyVT != VA.getValVT()) {
+ EVT ResVT = VA.getValVT();
+ unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
+ unsigned MemSize = ResVT.getSizeInBits()/8;
+ int FI = MFI.CreateStackObject(MemSize, Align(MemSize), false);
+ addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc)), FI)
+ .addReg(CopyReg);
+ Opc = ResVT == MVT::f32 ? X86::MOVSSrm_alt : X86::MOVSDrm_alt;
+ addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), ResultReg + i), FI);
+ }
+ }
+
+ CLI.ResultReg = ResultReg;
+ CLI.NumResultRegs = RVLocs.size();
+ CLI.Call = MIB;
+
+ return true;
+}
+
+bool
+X86FastISel::fastSelectInstruction(const Instruction *I) {
+ switch (I->getOpcode()) {
+ default: break;
+ case Instruction::Load:
+ return X86SelectLoad(I);
+ case Instruction::Store:
+ return X86SelectStore(I);
+ case Instruction::Ret:
+ return X86SelectRet(I);
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ return X86SelectCmp(I);
+ case Instruction::ZExt:
+ return X86SelectZExt(I);
+ case Instruction::SExt:
+ return X86SelectSExt(I);
+ case Instruction::Br:
+ return X86SelectBranch(I);
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::Shl:
+ return X86SelectShift(I);
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ return X86SelectDivRem(I);
+ case Instruction::Select:
+ return X86SelectSelect(I);
+ case Instruction::Trunc:
+ return X86SelectTrunc(I);
+ case Instruction::FPExt:
+ return X86SelectFPExt(I);
+ case Instruction::FPTrunc:
+ return X86SelectFPTrunc(I);
+ case Instruction::SIToFP:
+ return X86SelectSIToFP(I);
+ case Instruction::UIToFP:
+ return X86SelectUIToFP(I);
+ case Instruction::IntToPtr: // Deliberate fall-through.
+ case Instruction::PtrToInt: {
+ EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(DL, I->getType());
+ if (DstVT.bitsGT(SrcVT))
+ return X86SelectZExt(I);
+ if (DstVT.bitsLT(SrcVT))
+ return X86SelectTrunc(I);
+ Register Reg = getRegForValue(I->getOperand(0));
+ if (Reg == 0) return false;
+ updateValueMap(I, Reg);
+ return true;
+ }
+ case Instruction::BitCast: {
+ // Select SSE2/AVX bitcasts between 128/256/512 bit vector types.
+ if (!Subtarget->hasSSE2())
+ return false;
+
+ MVT SrcVT, DstVT;
+ if (!isTypeLegal(I->getOperand(0)->getType(), SrcVT) ||
+ !isTypeLegal(I->getType(), DstVT))
+ return false;
+
+ // Only allow vectors that use xmm/ymm/zmm.
+ if (!SrcVT.isVector() || !DstVT.isVector() ||
+ SrcVT.getVectorElementType() == MVT::i1 ||
+ DstVT.getVectorElementType() == MVT::i1)
+ return false;
+
+ Register Reg = getRegForValue(I->getOperand(0));
+ if (!Reg)
+ return false;
+
+ // Emit a reg-reg copy so we don't propagate cached known bits information
+ // with the wrong VT if we fall out of fast isel after selecting this.
+ const TargetRegisterClass *DstClass = TLI.getRegClassFor(DstVT);
+ Register ResultReg = createResultReg(DstClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(Reg);
+
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
+ if (VT > MVT::i64)
+ return 0;
+
+ uint64_t Imm = CI->getZExtValue();
+ if (Imm == 0) {
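+ // Materialize zero with MOV32r0 (a 32-bit xor), then take the subregister
+ // of the requested width; for i64, widen the 32-bit zero with
+ // SUBREG_TO_REG.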
+ Register SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unexpected value type");
+ case MVT::i1:
+ case MVT::i8:
+ return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Op0IsKill=*/true,
+ X86::sub_8bit);
+ case MVT::i16:
+ return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Op0IsKill=*/true,
+ X86::sub_16bit);
+ case MVT::i32:
+ return SrcReg;
+ case MVT::i64: {
+ Register ResultReg = createResultReg(&X86::GR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
+ .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
+ return ResultReg;
+ }
+ }
+ }
+
+ unsigned Opc = 0;
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unexpected value type");
+ case MVT::i1:
+ VT = MVT::i8;
+ LLVM_FALLTHROUGH;
+ case MVT::i8: Opc = X86::MOV8ri; break;
+ case MVT::i16: Opc = X86::MOV16ri; break;
+ case MVT::i32: Opc = X86::MOV32ri; break;
+ case MVT::i64: {
+ if (isUInt<32>(Imm))
+ Opc = X86::MOV32ri64;
+ else if (isInt<32>(Imm))
+ Opc = X86::MOV64ri32;
+ else
+ Opc = X86::MOV64ri;
+ break;
+ }
+ }
+ return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm);
+}
+
+unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
+ if (CFP->isNullValue())
+ return fastMaterializeFloatZero(CFP);
+
+ // Can't handle alternate code models yet.
+ CodeModel::Model CM = TM.getCodeModel();
+ if (CM != CodeModel::Small && CM != CodeModel::Large)
+ return 0;
+
+ // Get opcode and regclass of the output for the given load instruction.
+ unsigned Opc = 0;
+ bool HasAVX = Subtarget->hasAVX();
+ bool HasAVX512 = Subtarget->hasAVX512();
+ switch (VT.SimpleTy) {
+ default: return 0;
+ case MVT::f32:
+ if (X86ScalarSSEf32)
+ Opc = HasAVX512 ? X86::VMOVSSZrm_alt :
+ HasAVX ? X86::VMOVSSrm_alt :
+ X86::MOVSSrm_alt;
+ else
+ Opc = X86::LD_Fp32m;
+ break;
+ case MVT::f64:
+ if (X86ScalarSSEf64)
+ Opc = HasAVX512 ? X86::VMOVSDZrm_alt :
+ HasAVX ? X86::VMOVSDrm_alt :
+ X86::MOVSDrm_alt;
+ else
+ Opc = X86::LD_Fp64m;
+ break;
+ case MVT::f80:
+ // No f80 support yet.
+ return 0;
+ }
+
+ // MachineConstantPool wants an explicit alignment.
+ Align Alignment = DL.getPrefTypeAlign(CFP->getType());
+
+ // x86-32 PIC requires a PIC base register for constant pools.
+ unsigned PICBase = 0;
+ unsigned char OpFlag = Subtarget->classifyLocalReference(nullptr);
+ if (OpFlag == X86II::MO_PIC_BASE_OFFSET)
+ PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+ else if (OpFlag == X86II::MO_GOTOFF)
+ PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+ else if (Subtarget->is64Bit() && TM.getCodeModel() == CodeModel::Small)
+ PICBase = X86::RIP;
+
+ // Create the load from the constant pool.
+ unsigned CPI = MCP.getConstantPoolIndex(CFP, Alignment);
+ Register ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy));
+
+ // Large code model only applies to 64-bit mode.
+ if (Subtarget->is64Bit() && CM == CodeModel::Large) {
+ Register AddrReg = createResultReg(&X86::GR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
+ AddrReg)
+ .addConstantPoolIndex(CPI, 0, OpFlag);
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), ResultReg);
+ addRegReg(MIB, AddrReg, false, PICBase, false);
+ MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+ MachinePointerInfo::getConstantPool(*FuncInfo.MF),
+ MachineMemOperand::MOLoad, DL.getPointerSize(), Alignment);
+ MIB->addMemOperand(*FuncInfo.MF, MMO);
+ return ResultReg;
+ }
+
+ addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), ResultReg),
+ CPI, PICBase, OpFlag);
+ return ResultReg;
+}
+
+unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) {
+ // Can't handle alternate code models yet.
+ if (TM.getCodeModel() != CodeModel::Small)
+ return 0;
+
+ // Materialize addresses with LEA/MOV instructions.
+ X86AddressMode AM;
+ if (X86SelectAddress(GV, AM)) {
+ // If the expression is just a basereg, then we're done; otherwise we need
+ // to emit an LEA.
+ if (AM.BaseType == X86AddressMode::RegBase &&
+ AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr)
+ return AM.Base.Reg;
+
+ Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ if (TM.getRelocationModel() == Reloc::Static &&
+ TLI.getPointerTy(DL) == MVT::i64) {
+ // The displacement could be more than 32 bits away, so we need to use
+ // an instruction with a 64-bit immediate.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
+ ResultReg)
+ .addGlobalAddress(GV);
+ } else {
+ unsigned Opc =
+ TLI.getPointerTy(DL) == MVT::i32
+ ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
+ : X86::LEA64r;
+ addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), ResultReg), AM);
+ }
+ return ResultReg;
+ }
+ return 0;
+}
+
+unsigned X86FastISel::fastMaterializeConstant(const Constant *C) {
+ EVT CEVT = TLI.getValueType(DL, C->getType(), true);
+
+ // Only handle simple types.
+ if (!CEVT.isSimple())
+ return 0;
+ MVT VT = CEVT.getSimpleVT();
+
+ if (const auto *CI = dyn_cast<ConstantInt>(C))
+ return X86MaterializeInt(CI, VT);
+ else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+ return X86MaterializeFP(CFP, VT);
+ else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+ return X86MaterializeGV(GV, VT);
+
+ return 0;
+}
+
+unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) {
+ // Fail on dynamic allocas. At this point, getRegForValue has already
+ // checked its CSE maps, so if we're here trying to handle a dynamic
+ // alloca, we're not going to succeed. X86SelectAddress has a
+ // check for dynamic allocas, because it's called directly from
+ // various places, but targetMaterializeAlloca also needs a check
+ // in order to avoid recursion between getRegForValue,
+ // X86SelectAddress, and targetMaterializeAlloca.
+ if (!FuncInfo.StaticAllocaMap.count(C))
+ return 0;
+ assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?");
+
+ X86AddressMode AM;
+ if (!X86SelectAddress(C, AM))
+ return 0;
+ unsigned Opc =
+ TLI.getPointerTy(DL) == MVT::i32
+ ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
+ : X86::LEA64r;
+ const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL));
+ Register ResultReg = createResultReg(RC);
+ addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), ResultReg), AM);
+ return ResultReg;
+}
+
+unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
+ MVT VT;
+ if (!isTypeLegal(CF->getType(), VT))
+ return 0;
+
+ // Get opcode and regclass for the given zero.
+ bool HasAVX512 = Subtarget->hasAVX512();
+ unsigned Opc = 0;
+ switch (VT.SimpleTy) {
+ default: return 0;
+ case MVT::f32:
+ if (X86ScalarSSEf32)
+ Opc = HasAVX512 ? X86::AVX512_FsFLD0SS : X86::FsFLD0SS;
+ else
+ Opc = X86::LD_Fp032;
+ break;
+ case MVT::f64:
+ if (X86ScalarSSEf64)
+ Opc = HasAVX512 ? X86::AVX512_FsFLD0SD : X86::FsFLD0SD;
+ else
+ Opc = X86::LD_Fp064;
+ break;
+ case MVT::f80:
+ // No f80 support yet.
+ return 0;
+ }
+
+ Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
+ return ResultReg;
+}
+
+
+bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+ const LoadInst *LI) {
+ const Value *Ptr = LI->getPointerOperand();
+ X86AddressMode AM;
+ if (!X86SelectAddress(Ptr, AM))
+ return false;
+
+ const X86InstrInfo &XII = (const X86InstrInfo &)TII;
+
+ unsigned Size = DL.getTypeAllocSize(LI->getType());
+
+ SmallVector<MachineOperand, 8> AddrOps;
+ AM.getFullAddress(AddrOps);
+
+ MachineInstr *Result = XII.foldMemoryOperandImpl(
+ *FuncInfo.MF, *MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, LI->getAlign(),
+ /*AllowCommute=*/true);
+ if (!Result)
+ return false;
+
+ // The index register could be in the wrong register class. Unfortunately,
+  // foldMemoryOperandImpl could have commuted the instruction, so it's not
+  // enough to just look at OpNo plus the offset to the index reg. We actually
+  // need to scan the instruction to find the index reg and see if it's in the
+  // correct reg class.
+ unsigned OperandNo = 0;
+ for (MachineInstr::mop_iterator I = Result->operands_begin(),
+ E = Result->operands_end(); I != E; ++I, ++OperandNo) {
+ MachineOperand &MO = *I;
+ if (!MO.isReg() || MO.isDef() || MO.getReg() != AM.IndexReg)
+ continue;
+ // Found the index reg, now try to rewrite it.
+ Register IndexReg = constrainOperandRegClass(Result->getDesc(),
+ MO.getReg(), OperandNo);
+ if (IndexReg == MO.getReg())
+ continue;
+ MO.setReg(IndexReg);
+ }
+
+ Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI));
+ Result->cloneInstrSymbols(*FuncInfo.MF, *MI);
+ MachineBasicBlock::iterator I(MI);
+ removeDeadCode(I, std::next(I));
+ return true;
+}
+
+unsigned X86FastISel::fastEmitInst_rrrr(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill,
+ unsigned Op2, bool Op2IsKill,
+ unsigned Op3, bool Op3IsKill) {
+ const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+ Register ResultReg = createResultReg(RC);
+ Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
+ Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
+ Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 2);
+ Op3 = constrainOperandRegClass(II, Op3, II.getNumDefs() + 3);
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+ .addReg(Op0, getKillRegState(Op0IsKill))
+ .addReg(Op1, getKillRegState(Op1IsKill))
+ .addReg(Op2, getKillRegState(Op2IsKill))
+ .addReg(Op3, getKillRegState(Op3IsKill));
+ else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+ .addReg(Op0, getKillRegState(Op0IsKill))
+ .addReg(Op1, getKillRegState(Op1IsKill))
+ .addReg(Op2, getKillRegState(Op2IsKill))
+ .addReg(Op3, getKillRegState(Op3IsKill));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
+ }
+ return ResultReg;
+}
+
+
+namespace llvm {
+ FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) {
+ return new X86FastISel(funcInfo, libInfo);
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupBWInsts.cpp
new file mode 100644
index 000000000000..f8d822aebc5b
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupBWInsts.cpp
@@ -0,0 +1,459 @@
+//===-- X86FixupBWInsts.cpp - Fixup Byte or Word instructions -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines the pass that looks through the machine instructions
+/// late in the compilation, and finds byte or word instructions that
+/// can be profitably replaced with 32 bit instructions that give equivalent
+/// results for the bits of the results that are used. There are two possible
+/// reasons to do this.
+///
+/// One reason is to avoid false-dependences on the upper portions
+/// of the registers. Only instructions that have a destination register
+/// which is not in any of the source registers can be affected by this.
+/// Any instruction where one of the source registers is also the destination
+/// register is unaffected, because it has a true dependence on the source
+/// register already. So, this consideration primarily affects load
+/// instructions and register-to-register moves. It would
+/// seem like cmov(s) would also be affected, but because most machines really
+/// implement cmov by reading both the destination and source registers and
+/// then "merging" the two based on a condition, it should already be
+/// considered as having a true dependence on the destination register as well.
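+///
+/// As an illustration (AT&T syntax; a sketch of the kind of rewrite this pass
+/// performs for loads), a 16-bit load such as
+///   movw (%rdi), %ax
+/// leaves the upper bits of %eax untouched and so carries a false dependence
+/// on the previous value of %eax, while rewriting it as
+///   movzwl (%rdi), %eax
+/// yields the same low 16 bits and breaks that dependence.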
+///
+/// The other reason to do this is for potential code size savings. Word
+/// operations need an extra operand-size override prefix byte compared to
+/// their 32 bit versions. So this can convert many word operations to their
+/// larger size, saving a byte in encoding. However, this could introduce
+/// partial register dependences where none existed. As an example take:
+///   orw  ax, $0x1000
+///   addw ax, $3
+/// now if this were to get transformed into
+///   orw  ax, $0x1000
+///   addl eax, $3
+/// because the addl encodes shorter than the addw, this would introduce
+/// a use of a register that was only partially written earlier. On older
+/// Intel processors this can be quite a performance penalty, so this should
+/// probably only be done when it can be proven that a new partial dependence
+/// wouldn't be created, or when you know a newer processor is being
+/// targeted, or when optimizing for minimum code size.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define FIXUPBW_DESC "X86 Byte/Word Instruction Fixup"
+#define FIXUPBW_NAME "x86-fixup-bw-insts"
+
+#define DEBUG_TYPE FIXUPBW_NAME
+
+// Option to allow this optimization pass to have fine-grained control.
+static cl::opt<bool>
+ FixupBWInsts("fixup-byte-word-insts",
+ cl::desc("Change byte and word instructions to larger sizes"),
+ cl::init(true), cl::Hidden);
+
+namespace {
+class FixupBWInstPass : public MachineFunctionPass {
+ /// Loop over all of the instructions in the basic block replacing applicable
+ /// byte or word instructions with better alternatives.
+ void processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+ /// This sets the \p SuperDestReg to the 32 bit super reg of the original
+ /// destination register of the MachineInstr passed in. It returns true if
+ /// that super register is dead just prior to \p OrigMI, and false if not.
+ bool getSuperRegDestIfDead(MachineInstr *OrigMI,
+ Register &SuperDestReg) const;
+
+  /// Change the MachineInstr \p MI into the equivalent zero-extending load to
+  /// a 32-bit register if it is safe to do so. Return the replacement
+  /// instruction if OK, otherwise return nullptr.
+ MachineInstr *tryReplaceLoad(unsigned New32BitOpcode, MachineInstr *MI) const;
+
+ /// Change the MachineInstr \p MI into the equivalent 32-bit copy if it is
+ /// safe to do so. Return the replacement instruction if OK, otherwise return
+ /// nullptr.
+ MachineInstr *tryReplaceCopy(MachineInstr *MI) const;
+
+  /// Change the MachineInstr \p MI into the equivalent extend to a 32-bit
+  /// register if it is safe to do so. Return the replacement instruction if
+ /// OK, otherwise return nullptr.
+ MachineInstr *tryReplaceExtend(unsigned New32BitOpcode,
+ MachineInstr *MI) const;
+
+  // Change the MachineInstr \p MI into an equivalent 32-bit instruction if
+ // possible. Return the replacement instruction if OK, return nullptr
+ // otherwise.
+ MachineInstr *tryReplaceInstr(MachineInstr *MI, MachineBasicBlock &MBB) const;
+
+public:
+ static char ID;
+
+ StringRef getPassName() const override { return FIXUPBW_DESC; }
+
+ FixupBWInstPass() : MachineFunctionPass(ID) { }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineLoopInfo>(); // Machine loop info is used to
+ // guide some heuristics.
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ /// Loop over all of the basic blocks, replacing byte and word instructions by
+ /// equivalent 32 bit instructions where performance or code size can be
+ /// improved.
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+private:
+ MachineFunction *MF = nullptr;
+
+ /// Machine instruction info used throughout the class.
+ const X86InstrInfo *TII = nullptr;
+
+ /// Local member for function's OptForSize attribute.
+ bool OptForSize = false;
+
+  /// Machine loop info used for guiding some heuristics.
+ MachineLoopInfo *MLI = nullptr;
+
+ /// Register Liveness information after the current instruction.
+ LivePhysRegs LiveRegs;
+
+ ProfileSummaryInfo *PSI;
+ MachineBlockFrequencyInfo *MBFI;
+};
+char FixupBWInstPass::ID = 0;
+}
+
+INITIALIZE_PASS(FixupBWInstPass, FIXUPBW_NAME, FIXUPBW_DESC, false, false)
+
+FunctionPass *llvm::createX86FixupBWInsts() { return new FixupBWInstPass(); }
+
+bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) {
+ if (!FixupBWInsts || skipFunction(MF.getFunction()))
+ return false;
+
+ this->MF = &MF;
+ TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+ MLI = &getAnalysis<MachineLoopInfo>();
+ PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ MBFI = (PSI && PSI->hasProfileSummary()) ?
+ &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+ nullptr;
+ LiveRegs.init(TII->getRegisterInfo());
+
+ LLVM_DEBUG(dbgs() << "Start X86FixupBWInsts\n";);
+
+ // Process all basic blocks.
+ for (auto &MBB : MF)
+ processBasicBlock(MF, MBB);
+
+ LLVM_DEBUG(dbgs() << "End X86FixupBWInsts\n";);
+
+ return true;
+}
+
+/// Check whether, after \p OrigMI, the only live portion of the super
+/// register of \p OrigMI's destination register is that destination
+/// register itself.
+///
+/// If so, return that super register in \p SuperDestReg.
+bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI,
+ Register &SuperDestReg) const {
+ const X86RegisterInfo *TRI = &TII->getRegisterInfo();
+ Register OrigDestReg = OrigMI->getOperand(0).getReg();
+ SuperDestReg = getX86SubSuperRegister(OrigDestReg, 32);
+
+ const auto SubRegIdx = TRI->getSubRegIndex(SuperDestReg, OrigDestReg);
+
+ // Make sure that the sub-register that this instruction has as its
+ // destination is the lowest order sub-register of the super-register.
+ // If it isn't, then the register isn't really dead even if the
+ // super-register is considered dead.
+ if (SubRegIdx == X86::sub_8bit_hi)
+ return false;
+
+ // If neither the destination-super register nor any applicable subregisters
+ // are live after this instruction, then the super register is safe to use.
+ if (!LiveRegs.contains(SuperDestReg)) {
+ // If the original destination register was not the low 8-bit subregister
+ // then the super register check is sufficient.
+ if (SubRegIdx != X86::sub_8bit)
+ return true;
+    // If the original destination register was the low 8-bit subregister,
+    // then we also need to check the 16-bit subregister and the high 8-bit
+    // subregister.
+ if (!LiveRegs.contains(getX86SubSuperRegister(OrigDestReg, 16)) &&
+ !LiveRegs.contains(getX86SubSuperRegister(SuperDestReg, 8,
+ /*High=*/true)))
+ return true;
+ // Otherwise, we have a little more checking to do.
+ }
+
+ // If we get here, the super-register destination (or some part of it) is
+ // marked as live after the original instruction.
+ //
+ // The X86 backend does not have subregister liveness tracking enabled,
+ // so liveness information might be overly conservative. Specifically, the
+ // super register might be marked as live because it is implicitly defined
+ // by the instruction we are examining.
+ //
+ // However, for some specific instructions (this pass only cares about MOVs)
+ // we can produce more precise results by analysing that MOV's operands.
+ //
+  // Indeed, if the super-register is not live before the mov, it means that
+  // it was originally <read-undef>, so we are free to modify those undef
+  // upper bits. That may happen when the use is in another MBB and the
+  // vreg/physreg corresponding to the move is wider than necessary
+  // (e.g. due to register coalescing with a "truncate" copy).
+ // So, we would like to handle patterns like this:
+ //
+ // %bb.2: derived from LLVM BB %if.then
+ // Live Ins: %rdi
+ // Predecessors according to CFG: %bb.0
+ // %ax<def> = MOV16rm killed %rdi, 1, %noreg, 0, %noreg, implicit-def %eax
+ // ; No implicit %eax
+ // Successors according to CFG: %bb.3(?%)
+ //
+ // %bb.3: derived from LLVM BB %if.end
+ // Live Ins: %eax Only %ax is actually live
+ // Predecessors according to CFG: %bb.2 %bb.1
+ // %ax = KILL %ax, implicit killed %eax
+ // RET 0, %ax
+ unsigned Opc = OrigMI->getOpcode(); (void)Opc;
+  // These are the opcodes currently known to work with the code below; if
+  // something else is added, we need to ensure that the new opcode has the
+  // same properties.
+ if (Opc != X86::MOV8rm && Opc != X86::MOV16rm && Opc != X86::MOV8rr &&
+ Opc != X86::MOV16rr)
+ return false;
+
+ bool IsDefined = false;
+ for (auto &MO: OrigMI->implicit_operands()) {
+ if (!MO.isReg())
+ continue;
+
+ assert((MO.isDef() || MO.isUse()) && "Expected Def or Use only!");
+
+ if (MO.isDef() && TRI->isSuperRegisterEq(OrigDestReg, MO.getReg()))
+ IsDefined = true;
+
+ // If MO is a use of any part of the destination register but is not equal
+ // to OrigDestReg or one of its subregisters, we cannot use SuperDestReg.
+ // For example, if OrigDestReg is %al then an implicit use of %ah, %ax,
+ // %eax, or %rax will prevent us from using the %eax register.
+ if (MO.isUse() && !TRI->isSubRegisterEq(OrigDestReg, MO.getReg()) &&
+ TRI->regsOverlap(SuperDestReg, MO.getReg()))
+ return false;
+ }
+ // Reg is not Imp-def'ed -> it's live both before/after the instruction.
+ if (!IsDefined)
+ return false;
+
+ // Otherwise, the Reg is not live before the MI and the MOV can't
+ // make it really live, so it's in fact dead even after the MI.
+ return true;
+}
+
+MachineInstr *FixupBWInstPass::tryReplaceLoad(unsigned New32BitOpcode,
+ MachineInstr *MI) const {
+ Register NewDestReg;
+
+ // We are going to try to rewrite this load to a larger zero-extending
+  // load. This is safe if all portions of the 32-bit super-register of the
+  // original destination register, except for the original destination
+  // register itself, are dead. getSuperRegDestIfDead checks that.
+ if (!getSuperRegDestIfDead(MI, NewDestReg))
+ return nullptr;
+
+ // Safe to change the instruction.
+ MachineInstrBuilder MIB =
+ BuildMI(*MF, MI->getDebugLoc(), TII->get(New32BitOpcode), NewDestReg);
+
+ unsigned NumArgs = MI->getNumOperands();
+ for (unsigned i = 1; i < NumArgs; ++i)
+ MIB.add(MI->getOperand(i));
+
+ MIB.setMemRefs(MI->memoperands());
+
+ return MIB;
+}
+
+MachineInstr *FixupBWInstPass::tryReplaceCopy(MachineInstr *MI) const {
+ assert(MI->getNumExplicitOperands() == 2);
+ auto &OldDest = MI->getOperand(0);
+ auto &OldSrc = MI->getOperand(1);
+
+ Register NewDestReg;
+ if (!getSuperRegDestIfDead(MI, NewDestReg))
+ return nullptr;
+
+ Register NewSrcReg = getX86SubSuperRegister(OldSrc.getReg(), 32);
+
+ // This is only correct if we access the same subregister index: otherwise,
+ // we could try to replace "movb %ah, %al" with "movl %eax, %eax".
+ const X86RegisterInfo *TRI = &TII->getRegisterInfo();
+ if (TRI->getSubRegIndex(NewSrcReg, OldSrc.getReg()) !=
+ TRI->getSubRegIndex(NewDestReg, OldDest.getReg()))
+ return nullptr;
+
+ // Safe to change the instruction.
+ // Don't set src flags, as we don't know if we're also killing the superreg.
+ // However, the superregister might not be defined; make it explicit that
+ // we don't care about the higher bits by reading it as Undef, and adding
+ // an imp-use on the original subregister.
+ MachineInstrBuilder MIB =
+ BuildMI(*MF, MI->getDebugLoc(), TII->get(X86::MOV32rr), NewDestReg)
+ .addReg(NewSrcReg, RegState::Undef)
+ .addReg(OldSrc.getReg(), RegState::Implicit);
+
+ // Drop imp-defs/uses that would be redundant with the new def/use.
+ for (auto &Op : MI->implicit_operands())
+ if (Op.getReg() != (Op.isDef() ? NewDestReg : NewSrcReg))
+ MIB.add(Op);
+
+ return MIB;
+}
+
+MachineInstr *FixupBWInstPass::tryReplaceExtend(unsigned New32BitOpcode,
+ MachineInstr *MI) const {
+ Register NewDestReg;
+ if (!getSuperRegDestIfDead(MI, NewDestReg))
+ return nullptr;
+
+  // Don't interfere with formation of CBW instructions, which have a
+  // shorter encoding than even MOVSX32rr8. CBW is also immune to
+  // partial-register merge issues on Intel CPUs.
+ if (MI->getOpcode() == X86::MOVSX16rr8 &&
+ MI->getOperand(0).getReg() == X86::AX &&
+ MI->getOperand(1).getReg() == X86::AL)
+ return nullptr;
+
+ // Safe to change the instruction.
+ MachineInstrBuilder MIB =
+ BuildMI(*MF, MI->getDebugLoc(), TII->get(New32BitOpcode), NewDestReg);
+
+ unsigned NumArgs = MI->getNumOperands();
+ for (unsigned i = 1; i < NumArgs; ++i)
+ MIB.add(MI->getOperand(i));
+
+ MIB.setMemRefs(MI->memoperands());
+
+ return MIB;
+}
+
+MachineInstr *FixupBWInstPass::tryReplaceInstr(MachineInstr *MI,
+ MachineBasicBlock &MBB) const {
+ // See if this is an instruction of the type we are currently looking for.
+ switch (MI->getOpcode()) {
+
+ case X86::MOV8rm:
+    // Only replace 8-bit loads with the zero-extending versions if
+    // in an innermost loop and not optimizing for size. This takes
+ // an extra byte to encode, and provides limited performance upside.
+ if (MachineLoop *ML = MLI->getLoopFor(&MBB))
+ if (ML->begin() == ML->end() && !OptForSize)
+ return tryReplaceLoad(X86::MOVZX32rm8, MI);
+ break;
+
+ case X86::MOV16rm:
+ // Always try to replace 16 bit load with 32 bit zero extending.
+ // Code size is the same, and there is sometimes a perf advantage
+ // from eliminating a false dependence on the upper portion of
+ // the register.
+ return tryReplaceLoad(X86::MOVZX32rm16, MI);
+
+ case X86::MOV8rr:
+ case X86::MOV16rr:
+ // Always try to replace 8/16 bit copies with a 32 bit copy.
+ // Code size is either less (16) or equal (8), and there is sometimes a
+ // perf advantage from eliminating a false dependence on the upper portion
+ // of the register.
+ return tryReplaceCopy(MI);
+
+ case X86::MOVSX16rr8:
+ return tryReplaceExtend(X86::MOVSX32rr8, MI);
+ case X86::MOVSX16rm8:
+ return tryReplaceExtend(X86::MOVSX32rm8, MI);
+ case X86::MOVZX16rr8:
+ return tryReplaceExtend(X86::MOVZX32rr8, MI);
+ case X86::MOVZX16rm8:
+ return tryReplaceExtend(X86::MOVZX32rm8, MI);
+
+ default:
+ // nothing to do here.
+ break;
+ }
+
+ return nullptr;
+}
+
+void FixupBWInstPass::processBasicBlock(MachineFunction &MF,
+ MachineBasicBlock &MBB) {
+
+ // This algorithm doesn't delete the instructions it is replacing
+ // right away. By leaving the existing instructions in place, the
+ // register liveness information doesn't change, and this makes the
+ // analysis that goes on be better than if the replaced instructions
+ // were immediately removed.
+ //
+ // This algorithm always creates a replacement instruction
+ // and notes that and the original in a data structure, until the
+ // whole BB has been analyzed. This keeps the replacement instructions
+ // from making it seem as if the larger register might be live.
+ SmallVector<std::pair<MachineInstr *, MachineInstr *>, 8> MIReplacements;
+
+ // Start computing liveness for this block. We iterate from the end to be able
+ // to update this for each instruction.
+ LiveRegs.clear();
+ // We run after PEI, so we need to AddPristinesAndCSRs.
+ LiveRegs.addLiveOuts(MBB);
+
+ OptForSize = MF.getFunction().hasOptSize() ||
+ llvm::shouldOptimizeForSize(&MBB, PSI, MBFI);
+
+ for (auto I = MBB.rbegin(); I != MBB.rend(); ++I) {
+ MachineInstr *MI = &*I;
+
+ if (MachineInstr *NewMI = tryReplaceInstr(MI, MBB))
+ MIReplacements.push_back(std::make_pair(MI, NewMI));
+
+ // We're done with this instruction, update liveness for the next one.
+ LiveRegs.stepBackward(*MI);
+ }
+
+ while (!MIReplacements.empty()) {
+ MachineInstr *MI = MIReplacements.back().first;
+ MachineInstr *NewMI = MIReplacements.back().second;
+ MIReplacements.pop_back();
+ MBB.insert(MI, NewMI);
+ MBB.erase(MI);
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp
new file mode 100644
index 000000000000..0054d5818a96
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp
@@ -0,0 +1,702 @@
+//===-- X86FixupLEAs.cpp - use or replace LEA instructions -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass that finds instructions that can be
+// re-written as LEA instructions in order to reduce pipeline delays.
+// It replaces LEAs with ADD/INC/DEC when that is better for size/speed.
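+//
+// For example (an illustrative sketch, not the exact output), on subtargets
+// where three-operand LEAs are slow, a form like
+//   leal 4(%rdi,%rsi), %eax
+// can be split into
+//   leal (%rdi,%rsi), %eax
+//   addl $4, %eax
+// and a two-address LEA such as
+//   leal 1(%rax), %eax
+// can be turned into
+//   incl %eax
+// (see optTwoAddrLEA and processInstrForSlow3OpLEA below).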
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define FIXUPLEA_DESC "X86 LEA Fixup"
+#define FIXUPLEA_NAME "x86-fixup-LEAs"
+
+#define DEBUG_TYPE FIXUPLEA_NAME
+
+STATISTIC(NumLEAs, "Number of LEA instructions created");
+
+namespace {
+class FixupLEAPass : public MachineFunctionPass {
+ enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
+
+ /// Given a machine register, look for the instruction
+ /// which writes it in the current basic block. If found,
+ /// try to replace it with an equivalent LEA instruction.
+ /// If replacement succeeds, then also process the newly created
+ /// instruction.
+ void seekLEAFixup(MachineOperand &p, MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB);
+
+ /// Given a memory access or LEA instruction
+ /// whose address mode uses a base and/or index register, look for
+ /// an opportunity to replace the instruction which sets the base or index
+ /// register with an equivalent LEA instruction.
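+  ///
+  /// For example (a sketch; the rewrite is done by postRAConvertToLEA), in
+  ///   movq %rsi, %rdi
+  ///   movl (%rdi), %eax
+  /// the register move that feeds the address can be rewritten as
+  ///   leaq (%rsi), %rdi
+  /// so that the address computation can issue on the address-generation
+  /// units of subtargets that set LEAUsesAG.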
+ void processInstruction(MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB);
+
+ /// Given a LEA instruction which is unprofitable
+ /// on SlowLEA targets try to replace it with an equivalent ADD instruction.
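+  ///
+  /// For instance (sketch), on such targets
+  ///   leal 8(%eax), %eax
+  /// can be replaced with
+  ///   addl $8, %eax
+  /// when EFLAGS are dead at that point.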
+ void processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB);
+
+ /// Given a LEA instruction which is unprofitable
+ /// on SNB+ try to replace it with other instructions.
+ /// According to Intel's Optimization Reference Manual:
+ /// " For LEA instructions with three source operands and some specific
+ /// situations, instruction latency has increased to 3 cycles, and must
+ /// dispatch via port 1:
+ /// - LEA that has all three source operands: base, index, and offset
+ /// - LEA that uses base and index registers where the base is EBP, RBP,
+ /// or R13
+ /// - LEA that uses RIP relative addressing mode
+ /// - LEA that uses 16-bit addressing mode "
+ /// This function currently handles the first 2 cases only.
+ void processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB, bool OptIncDec);
+
+ /// Look for LEAs that are really two address LEAs that we might be able to
+ /// turn into regular ADD instructions.
+ bool optTwoAddrLEA(MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB, bool OptIncDec,
+ bool UseLEAForSP) const;
+
+ /// Determine if an instruction references a machine register
+ /// and, if so, whether it reads or writes the register.
+ RegUsageState usesRegister(MachineOperand &p, MachineBasicBlock::iterator I);
+
+ /// Step backwards through a basic block, looking
+ /// for an instruction which writes a register within
+ /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles.
+ MachineBasicBlock::iterator searchBackwards(MachineOperand &p,
+ MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB);
+
+  /// If an instruction can be converted to an
+  /// equivalent LEA, insert the new instruction into the basic block
+  /// and return a pointer to it. Otherwise, return nullptr.
+ MachineInstr *postRAConvertToLEA(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI) const;
+
+public:
+ static char ID;
+
+ StringRef getPassName() const override { return FIXUPLEA_DESC; }
+
+ FixupLEAPass() : MachineFunctionPass(ID) { }
+
+ /// Loop over all of the basic blocks,
+ /// replacing instructions by equivalent LEA instructions
+ /// if needed and when possible.
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ // This pass runs after regalloc and doesn't support VReg operands.
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ TargetSchedModel TSM;
+ const X86InstrInfo *TII = nullptr;
+ const X86RegisterInfo *TRI = nullptr;
+};
+}
+
+char FixupLEAPass::ID = 0;
+
+INITIALIZE_PASS(FixupLEAPass, FIXUPLEA_NAME, FIXUPLEA_DESC, false, false)
+
+MachineInstr *
+FixupLEAPass::postRAConvertToLEA(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI) const {
+ MachineInstr &MI = *MBBI;
+ switch (MI.getOpcode()) {
+ case X86::MOV32rr:
+ case X86::MOV64rr: {
+ const MachineOperand &Src = MI.getOperand(1);
+ const MachineOperand &Dest = MI.getOperand(0);
+ MachineInstr *NewMI =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(MI.getOpcode() == X86::MOV32rr ? X86::LEA32r
+ : X86::LEA64r))
+ .add(Dest)
+ .add(Src)
+ .addImm(1)
+ .addReg(0)
+ .addImm(0)
+ .addReg(0);
+ return NewMI;
+ }
+ }
+
+ if (!MI.isConvertibleTo3Addr())
+ return nullptr;
+
+ switch (MI.getOpcode()) {
+ default:
+ // Only convert instructions that we've verified are safe.
+ return nullptr;
+ case X86::ADD64ri32:
+ case X86::ADD64ri8:
+ case X86::ADD64ri32_DB:
+ case X86::ADD64ri8_DB:
+ case X86::ADD32ri:
+ case X86::ADD32ri8:
+ case X86::ADD32ri_DB:
+ case X86::ADD32ri8_DB:
+ if (!MI.getOperand(2).isImm()) {
+ // convertToThreeAddress will call getImm()
+ // which requires isImm() to be true
+ return nullptr;
+ }
+ break;
+ case X86::SHL64ri:
+ case X86::SHL32ri:
+ case X86::INC64r:
+ case X86::INC32r:
+ case X86::DEC64r:
+ case X86::DEC32r:
+ case X86::ADD64rr:
+ case X86::ADD64rr_DB:
+ case X86::ADD32rr:
+ case X86::ADD32rr_DB:
+ // These instructions are all fine to convert.
+ break;
+ }
+ MachineFunction::iterator MFI = MBB.getIterator();
+ return TII->convertToThreeAddress(MFI, MI, nullptr);
+}
+
+FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); }
+
+static bool isLEA(unsigned Opcode) {
+ return Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
+ Opcode == X86::LEA64_32r;
+}
+
+bool FixupLEAPass::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+ bool IsSlowLEA = ST.slowLEA();
+ bool IsSlow3OpsLEA = ST.slow3OpsLEA();
+ bool LEAUsesAG = ST.LEAusesAG();
+
+ bool OptIncDec = !ST.slowIncDec() || MF.getFunction().hasOptSize();
+ bool UseLEAForSP = ST.useLeaForSP();
+
+ TSM.init(&ST);
+ TII = ST.getInstrInfo();
+ TRI = ST.getRegisterInfo();
+ auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ auto *MBFI = (PSI && PSI->hasProfileSummary())
+ ? &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI()
+ : nullptr;
+
+ LLVM_DEBUG(dbgs() << "Start X86FixupLEAs\n";);
+ for (MachineBasicBlock &MBB : MF) {
+ // First pass. Try to remove or optimize existing LEAs.
+ bool OptIncDecPerBB =
+ OptIncDec || llvm::shouldOptimizeForSize(&MBB, PSI, MBFI);
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
+ if (!isLEA(I->getOpcode()))
+ continue;
+
+ if (optTwoAddrLEA(I, MBB, OptIncDecPerBB, UseLEAForSP))
+ continue;
+
+ if (IsSlowLEA)
+ processInstructionForSlowLEA(I, MBB);
+ else if (IsSlow3OpsLEA)
+ processInstrForSlow3OpLEA(I, MBB, OptIncDecPerBB);
+ }
+
+ // Second pass for creating LEAs. This may reverse some of the
+ // transformations above.
+ if (LEAUsesAG) {
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I)
+ processInstruction(I, MBB);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "End X86FixupLEAs\n";);
+
+ return true;
+}
+
+FixupLEAPass::RegUsageState
+FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) {
+ RegUsageState RegUsage = RU_NotUsed;
+ MachineInstr &MI = *I;
+
+ for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
+ MachineOperand &opnd = MI.getOperand(i);
+ if (opnd.isReg() && opnd.getReg() == p.getReg()) {
+ if (opnd.isDef())
+ return RU_Write;
+ RegUsage = RU_Read;
+ }
+ }
+ return RegUsage;
+}
+
+/// getPreviousInstr - Given an iterator to an instruction in a basic
+/// block, update it to point at the previous instruction in the block,
+/// wrapping around to the last instruction of the block if the block
+/// branches to itself. Returns false if there is no previous instruction.
+static inline bool getPreviousInstr(MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB) {
+ if (I == MBB.begin()) {
+ if (MBB.isPredecessor(&MBB)) {
+ I = --MBB.end();
+ return true;
+ } else
+ return false;
+ }
+ --I;
+ return true;
+}
+
+MachineBasicBlock::iterator
+FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB) {
+ int InstrDistance = 1;
+ MachineBasicBlock::iterator CurInst;
+ static const int INSTR_DISTANCE_THRESHOLD = 5;
+
+ CurInst = I;
+ bool Found;
+ Found = getPreviousInstr(CurInst, MBB);
+ while (Found && I != CurInst) {
+ if (CurInst->isCall() || CurInst->isInlineAsm())
+ break;
+ if (InstrDistance > INSTR_DISTANCE_THRESHOLD)
+ break; // too far back to make a difference
+ if (usesRegister(p, CurInst) == RU_Write) {
+ return CurInst;
+ }
+ InstrDistance += TSM.computeInstrLatency(&*CurInst);
+ Found = getPreviousInstr(CurInst, MBB);
+ }
+ return MachineBasicBlock::iterator();
+}
+
+static inline bool isInefficientLEAReg(unsigned Reg) {
+ return Reg == X86::EBP || Reg == X86::RBP ||
+ Reg == X86::R13D || Reg == X86::R13;
+}
+
+/// Returns true if this LEA uses base and index registers, and the base register
+/// is known to be inefficient for the subtarget.
+// TODO: use a variant scheduling class to model the latency profile
+// of LEA instructions, and implement this logic as a scheduling predicate.
+static inline bool hasInefficientLEABaseReg(const MachineOperand &Base,
+ const MachineOperand &Index) {
+ return Base.isReg() && isInefficientLEAReg(Base.getReg()) && Index.isReg() &&
+ Index.getReg() != X86::NoRegister;
+}
+
+static inline bool hasLEAOffset(const MachineOperand &Offset) {
+ return (Offset.isImm() && Offset.getImm() != 0) || Offset.isGlobal();
+}
+
+static inline unsigned getADDrrFromLEA(unsigned LEAOpcode) {
+ switch (LEAOpcode) {
+ default:
+ llvm_unreachable("Unexpected LEA instruction");
+ case X86::LEA32r:
+ case X86::LEA64_32r:
+ return X86::ADD32rr;
+ case X86::LEA64r:
+ return X86::ADD64rr;
+ }
+}
+
+static inline unsigned getADDriFromLEA(unsigned LEAOpcode,
+ const MachineOperand &Offset) {
+ bool IsInt8 = Offset.isImm() && isInt<8>(Offset.getImm());
+ switch (LEAOpcode) {
+ default:
+ llvm_unreachable("Unexpected LEA instruction");
+ case X86::LEA32r:
+ case X86::LEA64_32r:
+ return IsInt8 ? X86::ADD32ri8 : X86::ADD32ri;
+ case X86::LEA64r:
+ return IsInt8 ? X86::ADD64ri8 : X86::ADD64ri32;
+ }
+}
+
+static inline unsigned getINCDECFromLEA(unsigned LEAOpcode, bool IsINC) {
+ switch (LEAOpcode) {
+ default:
+ llvm_unreachable("Unexpected LEA instruction");
+ case X86::LEA32r:
+ case X86::LEA64_32r:
+ return IsINC ? X86::INC32r : X86::DEC32r;
+ case X86::LEA64r:
+ return IsINC ? X86::INC64r : X86::DEC64r;
+ }
+}
+
+bool FixupLEAPass::optTwoAddrLEA(MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB, bool OptIncDec,
+ bool UseLEAForSP) const {
+ MachineInstr &MI = *I;
+
+ const MachineOperand &Base = MI.getOperand(1 + X86::AddrBaseReg);
+ const MachineOperand &Scale = MI.getOperand(1 + X86::AddrScaleAmt);
+ const MachineOperand &Index = MI.getOperand(1 + X86::AddrIndexReg);
+ const MachineOperand &Disp = MI.getOperand(1 + X86::AddrDisp);
+ const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg);
+
+ if (Segment.getReg() != 0 || !Disp.isImm() || Scale.getImm() > 1 ||
+ MBB.computeRegisterLiveness(TRI, X86::EFLAGS, I) !=
+ MachineBasicBlock::LQR_Dead)
+ return false;
+
+ Register DestReg = MI.getOperand(0).getReg();
+ Register BaseReg = Base.getReg();
+ Register IndexReg = Index.getReg();
+
+ // Don't change stack adjustment LEAs.
+ if (UseLEAForSP && (DestReg == X86::ESP || DestReg == X86::RSP))
+ return false;
+
+ // LEA64_32 has 64-bit operands but 32-bit result.
+ if (MI.getOpcode() == X86::LEA64_32r) {
+ if (BaseReg != 0)
+ BaseReg = TRI->getSubReg(BaseReg, X86::sub_32bit);
+ if (IndexReg != 0)
+ IndexReg = TRI->getSubReg(IndexReg, X86::sub_32bit);
+ }
+
+ MachineInstr *NewMI = nullptr;
+
+ // Look for lea(%reg1, %reg2), %reg1 or lea(%reg2, %reg1), %reg1
+ // which can be turned into add %reg2, %reg1
+ if (BaseReg != 0 && IndexReg != 0 && Disp.getImm() == 0 &&
+ (DestReg == BaseReg || DestReg == IndexReg)) {
+ unsigned NewOpcode = getADDrrFromLEA(MI.getOpcode());
+ if (DestReg != BaseReg)
+ std::swap(BaseReg, IndexReg);
+
+ if (MI.getOpcode() == X86::LEA64_32r) {
+ // TODO: Do we need the super register implicit use?
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg)
+ .addReg(BaseReg).addReg(IndexReg)
+ .addReg(Base.getReg(), RegState::Implicit)
+ .addReg(Index.getReg(), RegState::Implicit);
+ } else {
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg)
+ .addReg(BaseReg).addReg(IndexReg);
+ }
+ } else if (DestReg == BaseReg && IndexReg == 0) {
+    // This is an LEA with only a base register and a displacement,
+    // so we can use ADDri or INC/DEC.
+
+    // Does this LEA have one of these forms:
+ // lea %reg, 1(%reg)
+ // lea %reg, -1(%reg)
+ if (OptIncDec && (Disp.getImm() == 1 || Disp.getImm() == -1)) {
+ bool IsINC = Disp.getImm() == 1;
+ unsigned NewOpcode = getINCDECFromLEA(MI.getOpcode(), IsINC);
+
+ if (MI.getOpcode() == X86::LEA64_32r) {
+ // TODO: Do we need the super register implicit use?
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg)
+ .addReg(BaseReg).addReg(Base.getReg(), RegState::Implicit);
+ } else {
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg)
+ .addReg(BaseReg);
+ }
+ } else {
+ unsigned NewOpcode = getADDriFromLEA(MI.getOpcode(), Disp);
+ if (MI.getOpcode() == X86::LEA64_32r) {
+ // TODO: Do we need the super register implicit use?
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg)
+ .addReg(BaseReg).addImm(Disp.getImm())
+ .addReg(Base.getReg(), RegState::Implicit);
+ } else {
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg)
+ .addReg(BaseReg).addImm(Disp.getImm());
+ }
+ }
+ } else
+ return false;
+
+ MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1);
+ MBB.erase(I);
+ I = NewMI;
+ return true;
+}
+
+void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB) {
+ // Process a load, store, or LEA instruction.
+ MachineInstr &MI = *I;
+ const MCInstrDesc &Desc = MI.getDesc();
+ int AddrOffset = X86II::getMemoryOperandNo(Desc.TSFlags);
+ if (AddrOffset >= 0) {
+ AddrOffset += X86II::getOperandBias(Desc);
+ MachineOperand &p = MI.getOperand(AddrOffset + X86::AddrBaseReg);
+ if (p.isReg() && p.getReg() != X86::ESP) {
+ seekLEAFixup(p, I, MBB);
+ }
+ MachineOperand &q = MI.getOperand(AddrOffset + X86::AddrIndexReg);
+ if (q.isReg() && q.getReg() != X86::ESP) {
+ seekLEAFixup(q, I, MBB);
+ }
+ }
+}
+
+void FixupLEAPass::seekLEAFixup(MachineOperand &p,
+ MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB) {
+ MachineBasicBlock::iterator MBI = searchBackwards(p, I, MBB);
+ if (MBI != MachineBasicBlock::iterator()) {
+ MachineInstr *NewMI = postRAConvertToLEA(MBB, MBI);
+ if (NewMI) {
+ ++NumLEAs;
+ LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump(););
+ // now to replace with an equivalent LEA...
+ LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump(););
+ MBB.getParent()->substituteDebugValuesForInst(*MBI, *NewMI, 1);
+ MBB.erase(MBI);
+ MachineBasicBlock::iterator J =
+ static_cast<MachineBasicBlock::iterator>(NewMI);
+ processInstruction(J, MBB);
+ }
+ }
+}
+
+void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB) {
+ MachineInstr &MI = *I;
+ const unsigned Opcode = MI.getOpcode();
+
+ const MachineOperand &Dst = MI.getOperand(0);
+ const MachineOperand &Base = MI.getOperand(1 + X86::AddrBaseReg);
+ const MachineOperand &Scale = MI.getOperand(1 + X86::AddrScaleAmt);
+ const MachineOperand &Index = MI.getOperand(1 + X86::AddrIndexReg);
+ const MachineOperand &Offset = MI.getOperand(1 + X86::AddrDisp);
+ const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg);
+
+ if (Segment.getReg() != 0 || !Offset.isImm() ||
+ MBB.computeRegisterLiveness(TRI, X86::EFLAGS, I, 4) !=
+ MachineBasicBlock::LQR_Dead)
+ return;
+ const Register DstR = Dst.getReg();
+ const Register SrcR1 = Base.getReg();
+ const Register SrcR2 = Index.getReg();
+ if ((SrcR1 == 0 || SrcR1 != DstR) && (SrcR2 == 0 || SrcR2 != DstR))
+ return;
+ if (Scale.getImm() > 1)
+ return;
+ LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
+ LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: ";);
+ MachineInstr *NewMI = nullptr;
+ // Make ADD instruction for two registers writing to LEA's destination
+ if (SrcR1 != 0 && SrcR2 != 0) {
+ const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(Opcode));
+ const MachineOperand &Src = SrcR1 == DstR ? Index : Base;
+ NewMI =
+ BuildMI(MBB, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src);
+ LLVM_DEBUG(NewMI->dump(););
+ }
+ // Make ADD instruction for immediate
+ if (Offset.getImm() != 0) {
+ const MCInstrDesc &ADDri =
+ TII->get(getADDriFromLEA(Opcode, Offset));
+ const MachineOperand &SrcR = SrcR1 == DstR ? Base : Index;
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), ADDri, DstR)
+ .add(SrcR)
+ .addImm(Offset.getImm());
+ LLVM_DEBUG(NewMI->dump(););
+ }
+ if (NewMI) {
+ MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1);
+ MBB.erase(I);
+ I = NewMI;
+ }
+}
+
+void FixupLEAPass::processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB,
+ bool OptIncDec) {
+ MachineInstr &MI = *I;
+ const unsigned LEAOpcode = MI.getOpcode();
+
+ const MachineOperand &Dest = MI.getOperand(0);
+ const MachineOperand &Base = MI.getOperand(1 + X86::AddrBaseReg);
+ const MachineOperand &Scale = MI.getOperand(1 + X86::AddrScaleAmt);
+ const MachineOperand &Index = MI.getOperand(1 + X86::AddrIndexReg);
+ const MachineOperand &Offset = MI.getOperand(1 + X86::AddrDisp);
+ const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg);
+
+ if (!(TII->isThreeOperandsLEA(MI) || hasInefficientLEABaseReg(Base, Index)) ||
+ MBB.computeRegisterLiveness(TRI, X86::EFLAGS, I, 4) !=
+ MachineBasicBlock::LQR_Dead ||
+ Segment.getReg() != X86::NoRegister)
+ return;
+
+ Register DestReg = Dest.getReg();
+ Register BaseReg = Base.getReg();
+ Register IndexReg = Index.getReg();
+
+ if (MI.getOpcode() == X86::LEA64_32r) {
+ if (BaseReg != 0)
+ BaseReg = TRI->getSubReg(BaseReg, X86::sub_32bit);
+ if (IndexReg != 0)
+ IndexReg = TRI->getSubReg(IndexReg, X86::sub_32bit);
+ }
+
+ bool IsScale1 = Scale.getImm() == 1;
+ bool IsInefficientBase = isInefficientLEAReg(BaseReg);
+ bool IsInefficientIndex = isInefficientLEAReg(IndexReg);
+
+ // Skip these cases since it takes more than 2 instructions
+ // to replace the LEA instruction.
+ if (IsInefficientBase && DestReg == BaseReg && !IsScale1)
+ return;
+
+ LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump(););
+ LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: ";);
+
+ MachineInstr *NewMI = nullptr;
+
+ // First try to replace LEA with one or two (for the 3-op LEA case)
+ // add instructions:
+ // 1.lea (%base,%index,1), %base => add %index,%base
+ // 2.lea (%base,%index,1), %index => add %base,%index
+ if (IsScale1 && (DestReg == BaseReg || DestReg == IndexReg)) {
+ unsigned NewOpc = getADDrrFromLEA(MI.getOpcode());
+ if (DestReg != BaseReg)
+ std::swap(BaseReg, IndexReg);
+
+ if (MI.getOpcode() == X86::LEA64_32r) {
+ // TODO: Do we need the super register implicit use?
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+ .addReg(BaseReg)
+ .addReg(IndexReg)
+ .addReg(Base.getReg(), RegState::Implicit)
+ .addReg(Index.getReg(), RegState::Implicit);
+ } else {
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+ .addReg(BaseReg)
+ .addReg(IndexReg);
+ }
+ } else if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) {
+    // If the base is inefficient, try switching the index and base operands;
+    // otherwise just break the 3-op LEA into a 2-op LEA + ADD instruction:
+ // lea offset(%base,%index,scale),%dst =>
+ // lea (%base,%index,scale); add offset,%dst
+ NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(LEAOpcode))
+ .add(Dest)
+ .add(IsInefficientBase ? Index : Base)
+ .add(Scale)
+ .add(IsInefficientBase ? Base : Index)
+ .addImm(0)
+ .add(Segment);
+ LLVM_DEBUG(NewMI->dump(););
+ }
+
+ // If either replacement succeeded above, add the offset if needed, then
+ // replace the instruction.
+ if (NewMI) {
+ // Create ADD instruction for the Offset in case of 3-Ops LEA.
+ if (hasLEAOffset(Offset)) {
+ if (OptIncDec && Offset.isImm() &&
+ (Offset.getImm() == 1 || Offset.getImm() == -1)) {
+ unsigned NewOpc =
+ getINCDECFromLEA(MI.getOpcode(), Offset.getImm() == 1);
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+ .addReg(DestReg);
+ LLVM_DEBUG(NewMI->dump(););
+ } else {
+ unsigned NewOpc = getADDriFromLEA(MI.getOpcode(), Offset);
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+ .addReg(DestReg)
+ .add(Offset);
+ LLVM_DEBUG(NewMI->dump(););
+ }
+ }
+
+ MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1);
+ MBB.erase(I);
+ I = NewMI;
+ return;
+ }
+
+ // Handle the rest of the cases with inefficient base register:
+ assert(DestReg != BaseReg && "DestReg == BaseReg should be handled already!");
+ assert(IsInefficientBase && "efficient base should be handled already!");
+
+ // FIXME: Handle LEA64_32r.
+ if (LEAOpcode == X86::LEA64_32r)
+ return;
+
+ // lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst
+ if (IsScale1 && !hasLEAOffset(Offset)) {
+ bool BIK = Base.isKill() && BaseReg != IndexReg;
+ TII->copyPhysReg(MBB, MI, MI.getDebugLoc(), DestReg, BaseReg, BIK);
+ LLVM_DEBUG(MI.getPrevNode()->dump(););
+
+ unsigned NewOpc = getADDrrFromLEA(MI.getOpcode());
+ NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+ .addReg(DestReg)
+ .add(Index);
+ LLVM_DEBUG(NewMI->dump(););
+
+ MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1);
+ MBB.erase(I);
+ I = NewMI;
+ return;
+ }
+
+ // lea offset(%base,%index,scale), %dst =>
+ // lea offset( ,%index,scale), %dst; add %base,%dst
+ NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(LEAOpcode))
+ .add(Dest)
+ .addReg(0)
+ .add(Scale)
+ .add(Index)
+ .add(Offset)
+ .add(Segment);
+ LLVM_DEBUG(NewMI->dump(););
+
+ unsigned NewOpc = getADDrrFromLEA(MI.getOpcode());
+ NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+ .addReg(DestReg)
+ .add(Base);
+ LLVM_DEBUG(NewMI->dump(););
+
+ MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1);
+ MBB.erase(I);
+ I = NewMI;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupSetCC.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupSetCC.cpp
new file mode 100644
index 000000000000..269f8ce6bd7a
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupSetCC.cpp
@@ -0,0 +1,133 @@
+//===---- X86FixupSetCC.cpp - fix zero-extension of setcc results ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that fixes zero-extension of setcc patterns.
+// X86 setcc instructions are modeled to have no input arguments, and a single
+// GR8 output argument. This is consistent with other similar instructions
+// (e.g. movb), but means it is impossible to directly generate a setcc into
+// the lower GR8 of a specified GR32.
+// This means that ISel must select (zext (setcc)) into something like
+// seta %al; movzbl %al, %eax.
+// Unfortunately, this can cause a stall due to the partial register write
+// performed by the setcc. Instead, we can use:
+// xor %eax, %eax; seta %al
+// This both avoids the stall, and encodes shorter.
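+//
+// At the MIR level this is roughly (a sketch, not the exact output):
+//   %b:gr8  = SETCCr ...
+//   %d:gr32 = MOVZX32rr8 %b
+// becomes
+//   %z:gr32 = MOV32r0                        ; placed before the EFLAGS def
+//   %b:gr8  = SETCCr ...
+//   %d:gr32 = INSERT_SUBREG %z, %b, %subreg.sub_8bit
+// with the MOVZX erased.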
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-fixup-setcc"
+
+STATISTIC(NumSubstZexts, "Number of setcc + zext pairs substituted");
+
+namespace {
+class X86FixupSetCCPass : public MachineFunctionPass {
+public:
+ static char ID;
+
+ X86FixupSetCCPass() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override { return "X86 Fixup SetCC"; }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ MachineRegisterInfo *MRI = nullptr;
+ const X86InstrInfo *TII = nullptr;
+
+ enum { SearchBound = 16 };
+};
+} // end anonymous namespace
+
+char X86FixupSetCCPass::ID = 0;
+
+INITIALIZE_PASS(X86FixupSetCCPass, DEBUG_TYPE, DEBUG_TYPE, false, false)
+
+FunctionPass *llvm::createX86FixupSetCC() { return new X86FixupSetCCPass(); }
+
+bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
+ bool Changed = false;
+ MRI = &MF.getRegInfo();
+ TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+
+ SmallVector<MachineInstr*, 4> ToErase;
+
+ for (auto &MBB : MF) {
+ MachineInstr *FlagsDefMI = nullptr;
+ for (auto &MI : MBB) {
+ // Remember the most recent preceding eflags defining instruction.
+ if (MI.definesRegister(X86::EFLAGS))
+ FlagsDefMI = &MI;
+
+ // Find a setcc that is used by a zext.
+      // This doesn't have to be the only use; the transformation is safe
+ // regardless.
+ if (MI.getOpcode() != X86::SETCCr)
+ continue;
+
+ MachineInstr *ZExt = nullptr;
+ for (auto &Use : MRI->use_instructions(MI.getOperand(0).getReg()))
+ if (Use.getOpcode() == X86::MOVZX32rr8)
+ ZExt = &Use;
+
+ if (!ZExt)
+ continue;
+
+ if (!FlagsDefMI)
+ continue;
+
+ // We'd like to put something that clobbers eflags directly before
+ // FlagsDefMI. This can't hurt anything after FlagsDefMI, because
+ // it, itself, by definition, clobbers eflags. But it may happen that
+ // FlagsDefMI also *uses* eflags, in which case the transformation is
+ // invalid.
+ if (FlagsDefMI->readsRegister(X86::EFLAGS))
+ continue;
+
+ // On 32-bit, we need to be careful to force an ABCD register.
+ const TargetRegisterClass *RC = MF.getSubtarget<X86Subtarget>().is64Bit()
+ ? &X86::GR32RegClass
+ : &X86::GR32_ABCDRegClass;
+ if (!MRI->constrainRegClass(ZExt->getOperand(0).getReg(), RC)) {
+ // If we cannot constrain the register, we would need an additional copy
+ // and are better off keeping the MOVZX32rr8 we have now.
+ continue;
+ }
+
+ ++NumSubstZexts;
+ Changed = true;
+
+ // Initialize a register with 0. This must go before the eflags def
+ Register ZeroReg = MRI->createVirtualRegister(RC);
+ BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), TII->get(X86::MOV32r0),
+ ZeroReg);
+
+ // X86 setcc only takes an output GR8, so fake a GR32 input by inserting
+ // the setcc result into the low byte of the zeroed register.
+ BuildMI(*ZExt->getParent(), ZExt, ZExt->getDebugLoc(),
+ TII->get(X86::INSERT_SUBREG), ZExt->getOperand(0).getReg())
+ .addReg(ZeroReg)
+ .addReg(MI.getOperand(0).getReg())
+ .addImm(X86::sub_8bit);
+ ToErase.push_back(ZExt);
+ }
+ }
+
+ for (auto &I : ToErase)
+ I->eraseFromParent();
+
+ return Changed;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
new file mode 100644
index 000000000000..d43fd807a5a7
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -0,0 +1,984 @@
+//====- X86FlagsCopyLowering.cpp - Lowers COPY nodes of EFLAGS ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// Lowers COPY nodes of EFLAGS by directly extracting and preserving individual
+/// flag bits.
+///
+/// We have to do this by carefully analyzing and rewriting the usage of the
+/// copied EFLAGS register because there is no general way to rematerialize the
+/// entire EFLAGS register safely and efficiently. Using `popf` both forces
+/// dynamic stack adjustment and can create correctness issues due to IF, TF,
+/// and other non-status flags being overwritten. Sequences involving
+/// SAHF don't work on all x86 processors and are often quite slow compared to
+/// directly testing a single status flag preserved in its own GPR.
+///
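+/// As a sketch of the approach (not the exact output), for MIR like
+///   %f:gr64 = COPY $eflags
+///   ...
+///   $eflags = COPY %f
+///   JCC_1 %bb.1, 4, implicit $eflags
+/// the pass instead captures the conditions used by the copy's consumers with
+/// SETcc instructions into GPRs next to the flag-defining instruction, and
+/// rewrites the consumers (conditional jumps, cmovs, setccs, and carry-using
+/// arithmetic) to test those GPRs rather than restoring EFLAGS.
+///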
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAUpdater.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+#define PASS_KEY "x86-flags-copy-lowering"
+#define DEBUG_TYPE PASS_KEY
+
+STATISTIC(NumCopiesEliminated, "Number of copies of EFLAGS eliminated");
+STATISTIC(NumSetCCsInserted, "Number of setCC instructions inserted");
+STATISTIC(NumTestsInserted, "Number of test instructions inserted");
+STATISTIC(NumAddsInserted, "Number of adds instructions inserted");
+
+namespace {
+
+// Convenient array type for storing registers associated with each condition.
+using CondRegArray = std::array<unsigned, X86::LAST_VALID_COND + 1>;
+
+class X86FlagsCopyLoweringPass : public MachineFunctionPass {
+public:
+ X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) { }
+
+ StringRef getPassName() const override { return "X86 EFLAGS copy lowering"; }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ /// Pass identification, replacement for typeid.
+ static char ID;
+
+private:
+ MachineRegisterInfo *MRI = nullptr;
+ const X86Subtarget *Subtarget = nullptr;
+ const X86InstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ const TargetRegisterClass *PromoteRC = nullptr;
+ MachineDominatorTree *MDT = nullptr;
+
+ CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator CopyDefI);
+
+ Register promoteCondToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator TestPos,
+ DebugLoc TestLoc, X86::CondCode Cond);
+ std::pair<unsigned, bool>
+ getCondOrInverseInReg(MachineBasicBlock &TestMBB,
+ MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
+ X86::CondCode Cond, CondRegArray &CondRegs);
+ void insertTest(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos,
+ DebugLoc Loc, unsigned Reg);
+
+ void rewriteArithmetic(MachineBasicBlock &TestMBB,
+ MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
+ MachineInstr &MI, MachineOperand &FlagUse,
+ CondRegArray &CondRegs);
+ void rewriteCMov(MachineBasicBlock &TestMBB,
+ MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
+ MachineInstr &CMovI, MachineOperand &FlagUse,
+ CondRegArray &CondRegs);
+ void rewriteFCMov(MachineBasicBlock &TestMBB,
+ MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
+ MachineInstr &CMovI, MachineOperand &FlagUse,
+ CondRegArray &CondRegs);
+ void rewriteCondJmp(MachineBasicBlock &TestMBB,
+ MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
+ MachineInstr &JmpI, CondRegArray &CondRegs);
+ void rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse,
+ MachineInstr &CopyDefI);
+ void rewriteSetCC(MachineBasicBlock &TestMBB,
+ MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
+ MachineInstr &SetCCI, MachineOperand &FlagUse,
+ CondRegArray &CondRegs);
+};
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(X86FlagsCopyLoweringPass, DEBUG_TYPE,
+ "X86 EFLAGS copy lowering", false, false)
+INITIALIZE_PASS_END(X86FlagsCopyLoweringPass, DEBUG_TYPE,
+ "X86 EFLAGS copy lowering", false, false)
+
+FunctionPass *llvm::createX86FlagsCopyLoweringPass() {
+ return new X86FlagsCopyLoweringPass();
+}
+
+char X86FlagsCopyLoweringPass::ID = 0;
+
+void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+namespace {
+/// An enumeration of the arithmetic instruction mnemonics which have
+/// interesting flag semantics.
+///
+/// We can map instruction opcodes into these mnemonics to make it easy to
+/// dispatch with specific functionality.
+enum class FlagArithMnemonic {
+ ADC,
+ ADCX,
+ ADOX,
+ RCL,
+ RCR,
+ SBB,
+ SETB,
+};
+} // namespace
+
+static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ report_fatal_error("No support for lowering a copy into EFLAGS when used "
+ "by this instruction!");
+
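+// Helper macros that expand one mnemonic across all operand sizes and
+// addressing forms so the switch below stays readable.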
+#define LLVM_EXPAND_INSTR_SIZES(MNEMONIC, SUFFIX) \
+ case X86::MNEMONIC##8##SUFFIX: \
+ case X86::MNEMONIC##16##SUFFIX: \
+ case X86::MNEMONIC##32##SUFFIX: \
+ case X86::MNEMONIC##64##SUFFIX:
+
+#define LLVM_EXPAND_ADC_SBB_INSTR(MNEMONIC) \
+ LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr) \
+ LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr_REV) \
+ LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rm) \
+ LLVM_EXPAND_INSTR_SIZES(MNEMONIC, mr) \
+ case X86::MNEMONIC##8ri: \
+ case X86::MNEMONIC##16ri8: \
+ case X86::MNEMONIC##32ri8: \
+ case X86::MNEMONIC##64ri8: \
+ case X86::MNEMONIC##16ri: \
+ case X86::MNEMONIC##32ri: \
+ case X86::MNEMONIC##64ri32: \
+ case X86::MNEMONIC##8mi: \
+ case X86::MNEMONIC##16mi8: \
+ case X86::MNEMONIC##32mi8: \
+ case X86::MNEMONIC##64mi8: \
+ case X86::MNEMONIC##16mi: \
+ case X86::MNEMONIC##32mi: \
+ case X86::MNEMONIC##64mi32: \
+ case X86::MNEMONIC##8i8: \
+ case X86::MNEMONIC##16i16: \
+ case X86::MNEMONIC##32i32: \
+ case X86::MNEMONIC##64i32:
+
+ LLVM_EXPAND_ADC_SBB_INSTR(ADC)
+ return FlagArithMnemonic::ADC;
+
+ LLVM_EXPAND_ADC_SBB_INSTR(SBB)
+ return FlagArithMnemonic::SBB;
+
+#undef LLVM_EXPAND_ADC_SBB_INSTR
+
+ LLVM_EXPAND_INSTR_SIZES(RCL, rCL)
+ LLVM_EXPAND_INSTR_SIZES(RCL, r1)
+ LLVM_EXPAND_INSTR_SIZES(RCL, ri)
+ return FlagArithMnemonic::RCL;
+
+ LLVM_EXPAND_INSTR_SIZES(RCR, rCL)
+ LLVM_EXPAND_INSTR_SIZES(RCR, r1)
+ LLVM_EXPAND_INSTR_SIZES(RCR, ri)
+ return FlagArithMnemonic::RCR;
+
+#undef LLVM_EXPAND_INSTR_SIZES
+
+ case X86::ADCX32rr:
+ case X86::ADCX64rr:
+ case X86::ADCX32rm:
+ case X86::ADCX64rm:
+ return FlagArithMnemonic::ADCX;
+
+ case X86::ADOX32rr:
+ case X86::ADOX64rr:
+ case X86::ADOX32rm:
+ case X86::ADOX64rm:
+ return FlagArithMnemonic::ADOX;
+
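+  // The SETB_C pseudos materialize CF across an entire register, so they
+  // read the carry flag just like ADC/SBB.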
+ case X86::SETB_C32r:
+ case X86::SETB_C64r:
+ return FlagArithMnemonic::SETB;
+ }
+}
+
+static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB,
+ MachineInstr &SplitI,
+ const X86InstrInfo &TII) {
+ MachineFunction &MF = *MBB.getParent();
+
+ assert(SplitI.getParent() == &MBB &&
+ "Split instruction must be in the split block!");
+ assert(SplitI.isBranch() &&
+ "Only designed to split a tail of branch instructions!");
+ assert(X86::getCondFromBranch(SplitI) != X86::COND_INVALID &&
+ "Must split on an actual jCC instruction!");
+
+ // Dig out the previous instruction to the split point.
+ MachineInstr &PrevI = *std::prev(SplitI.getIterator());
+ assert(PrevI.isBranch() && "Must split after a branch!");
+ assert(X86::getCondFromBranch(PrevI) != X86::COND_INVALID &&
+ "Must split after an actual jCC instruction!");
+ assert(!std::prev(PrevI.getIterator())->isTerminator() &&
+ "Must only have this one terminator prior to the split!");
+
+ // Grab the one successor edge that will stay in `MBB`.
+ MachineBasicBlock &UnsplitSucc = *PrevI.getOperand(0).getMBB();
+
+ // Analyze the original block to see if we are actually splitting an edge
+ // into two edges. This can happen when we have multiple conditional jumps to
+ // the same successor.
+ bool IsEdgeSplit =
+ std::any_of(SplitI.getIterator(), MBB.instr_end(),
+ [&](MachineInstr &MI) {
+ assert(MI.isTerminator() &&
+ "Should only have spliced terminators!");
+ return llvm::any_of(
+ MI.operands(), [&](MachineOperand &MOp) {
+ return MOp.isMBB() && MOp.getMBB() == &UnsplitSucc;
+ });
+ }) ||
+ MBB.getFallThrough() == &UnsplitSucc;
+
+ MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
+
+ // Insert the new block immediately after the current one. Any existing
+ // fallthrough will be sunk into this new block anyway.
+ MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);
+
+ // Splice the tail of instructions into the new block.
+ NewMBB.splice(NewMBB.end(), &MBB, SplitI.getIterator(), MBB.end());
+
+ // Copy the necessary successors (and their probability info) into the new
+ // block.
+ for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI)
+ if (IsEdgeSplit || *SI != &UnsplitSucc)
+ NewMBB.copySuccessor(&MBB, SI);
+ // Normalize the probabilities if we didn't end up splitting the edge.
+ if (!IsEdgeSplit)
+ NewMBB.normalizeSuccProbs();
+
+ // Now replace all of the moved successors in the original block with the new
+ // block. This will merge their probabilities.
+ for (MachineBasicBlock *Succ : NewMBB.successors())
+ if (Succ != &UnsplitSucc)
+ MBB.replaceSuccessor(Succ, &NewMBB);
+
+ // We should always end up replacing at least one successor.
+ assert(MBB.isSuccessor(&NewMBB) &&
+ "Failed to make the new block a successor!");
+
+ // Now update all the PHIs.
+ for (MachineBasicBlock *Succ : NewMBB.successors()) {
+ for (MachineInstr &MI : *Succ) {
+ if (!MI.isPHI())
+ break;
+
+ for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
+ OpIdx += 2) {
+ MachineOperand &OpV = MI.getOperand(OpIdx);
+ MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
+ assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
+ if (OpMBB.getMBB() != &MBB)
+ continue;
+
+ // Replace the operand for unsplit successors
+ if (!IsEdgeSplit || Succ != &UnsplitSucc) {
+ OpMBB.setMBB(&NewMBB);
+
+ // We have to continue scanning as there may be multiple entries in
+ // the PHI.
+ continue;
+ }
+
+ // When we have split the edge, append a new incoming entry to the PHI for
+ // the new block.
+ MI.addOperand(MF, OpV);
+ MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
+ break;
+ }
+ }
+ }
+
+ return NewMBB;
+}
+
+static X86::CondCode getCondFromFCMOV(unsigned Opcode) {
+ switch (Opcode) {
+ default: return X86::COND_INVALID;
+ case X86::CMOVBE_Fp32: case X86::CMOVBE_Fp64: case X86::CMOVBE_Fp80:
+ return X86::COND_BE;
+ case X86::CMOVB_Fp32: case X86::CMOVB_Fp64: case X86::CMOVB_Fp80:
+ return X86::COND_B;
+ case X86::CMOVE_Fp32: case X86::CMOVE_Fp64: case X86::CMOVE_Fp80:
+ return X86::COND_E;
+ case X86::CMOVNBE_Fp32: case X86::CMOVNBE_Fp64: case X86::CMOVNBE_Fp80:
+ return X86::COND_A;
+ case X86::CMOVNB_Fp32: case X86::CMOVNB_Fp64: case X86::CMOVNB_Fp80:
+ return X86::COND_AE;
+ case X86::CMOVNE_Fp32: case X86::CMOVNE_Fp64: case X86::CMOVNE_Fp80:
+ return X86::COND_NE;
+ case X86::CMOVNP_Fp32: case X86::CMOVNP_Fp64: case X86::CMOVNP_Fp80:
+ return X86::COND_NP;
+ case X86::CMOVP_Fp32: case X86::CMOVP_Fp64: case X86::CMOVP_Fp80:
+ return X86::COND_P;
+ }
+}
+
+bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
+ << " **********\n");
+
+ Subtarget = &MF.getSubtarget<X86Subtarget>();
+ MRI = &MF.getRegInfo();
+ TII = Subtarget->getInstrInfo();
+ TRI = Subtarget->getRegisterInfo();
+ MDT = &getAnalysis<MachineDominatorTree>();
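+ // Conditions are materialized as 8-bit setCC results, so promote into GR8.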
+ PromoteRC = &X86::GR8RegClass;
+
+ if (MF.begin() == MF.end())
+ // Nothing to do for a degenerate empty function...
+ return false;
+
+ // Collect the copies in RPO so that when there are chains where a copy is in
+ // turn copied again we visit the first one first. This ensures we can find
+ // viable locations for testing the original EFLAGS that dominate all the
+ // uses across complex CFGs.
+ SmallVector<MachineInstr *, 4> Copies;
+ ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+ for (MachineBasicBlock *MBB : RPOT)
+ for (MachineInstr &MI : *MBB)
+ if (MI.getOpcode() == TargetOpcode::COPY &&
+ MI.getOperand(0).getReg() == X86::EFLAGS)
+ Copies.push_back(&MI);
+
+ for (MachineInstr *CopyI : Copies) {
+ MachineBasicBlock &MBB = *CopyI->getParent();
+
+ MachineOperand &VOp = CopyI->getOperand(1);
+ assert(VOp.isReg() &&
+ "The input to the copy for EFLAGS should always be a register!");
+ MachineInstr &CopyDefI = *MRI->getVRegDef(VOp.getReg());
+ if (CopyDefI.getOpcode() != TargetOpcode::COPY) {
+ // FIXME: The most likely candidates here are PHI nodes. We could in theory
+ // handle PHI nodes, but it gets really, really hard. Insanely hard. Hard
+ // enough that it is probably better to change every other part of LLVM
+ // to avoid creating them. The issue is that once we have PHIs we won't
+ // know which original EFLAGS value we need to capture with our setCCs
+ // below. The end result will be computing a complete set of setCCs that
+ // we *might* want, computing them in every place where we copy *out* of
+ // EFLAGS and then doing SSA formation on all of them to insert necessary
+ // PHI nodes and consume those here. Then hoping that somehow we DCE the
+ // unnecessary ones. This DCE seems very unlikely to be successful and so
+ // we will almost certainly end up with a glut of dead setCC
+ // instructions. Until we have a motivating test case and fail to avoid
+ // it by changing other parts of LLVM's lowering, we refuse to handle
+ // this complex case here.
+ LLVM_DEBUG(
+ dbgs() << "ERROR: Encountered unexpected def of an eflags copy: ";
+ CopyDefI.dump());
+ report_fatal_error(
+ "Cannot lower EFLAGS copy unless it is defined in turn by a copy!");
+ }
+
+ auto Cleanup = make_scope_exit([&] {
+ // All uses of the EFLAGS copy are now rewritten; erase the copy into
+ // EFLAGS and, if it is now dead, the copy from EFLAGS as well.
+ CopyI->eraseFromParent();
+ if (MRI->use_empty(CopyDefI.getOperand(0).getReg()))
+ CopyDefI.eraseFromParent();
+ ++NumCopiesEliminated;
+ });
+
+ MachineOperand &DOp = CopyI->getOperand(0);
+ assert(DOp.isDef() && "Expected register def!");
+ assert(DOp.getReg() == X86::EFLAGS && "Unexpected copy def register!");
+ if (DOp.isDead())
+ continue;
+
+ MachineBasicBlock *TestMBB = CopyDefI.getParent();
+ auto TestPos = CopyDefI.getIterator();
+ DebugLoc TestLoc = CopyDefI.getDebugLoc();
+
+ LLVM_DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump());
+
+ // Walk up across live-in EFLAGS to find where they were actually def'ed.
+ //
+ // This copy's def may just be part of a region of blocks covered by
+ // a single def of EFLAGS and we want to find the top of that region where
+ // possible.
+ //
+ // This is essentially a search for a *candidate* reaching definition
+ // location. We don't need to ever find the actual reaching definition here,
+ // but we want to walk up the dominator tree to find the highest point which
+ // would be viable for such a definition.
+ auto HasEFLAGSClobber = [&](MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End) {
+ // Scan backwards as we expect these to be relatively short and often find
+ // a clobber near the end.
+ return llvm::any_of(
+ llvm::reverse(llvm::make_range(Begin, End)), [&](MachineInstr &MI) {
+ // Flag any instruction (other than the copy we are
+ // currently rewriting) that defs EFLAGS.
+ return &MI != CopyI && MI.findRegisterDefOperand(X86::EFLAGS);
+ });
+ };
+ auto HasEFLAGSClobberPath = [&](MachineBasicBlock *BeginMBB,
+ MachineBasicBlock *EndMBB) {
+ assert(MDT->dominates(BeginMBB, EndMBB) &&
+ "Only support paths down the dominator tree!");
+ SmallPtrSet<MachineBasicBlock *, 4> Visited;
+ SmallVector<MachineBasicBlock *, 4> Worklist;
+ // We terminate at the beginning. No need to scan it.
+ Visited.insert(BeginMBB);
+ Worklist.push_back(EndMBB);
+ do {
+ auto *MBB = Worklist.pop_back_val();
+ for (auto *PredMBB : MBB->predecessors()) {
+ if (!Visited.insert(PredMBB).second)
+ continue;
+ if (HasEFLAGSClobber(PredMBB->begin(), PredMBB->end()))
+ return true;
+ // Enqueue this block to walk its predecessors.
+ Worklist.push_back(PredMBB);
+ }
+ } while (!Worklist.empty());
+ // No clobber found along a path from the begin to end.
+ return false;
+ };
+ while (TestMBB->isLiveIn(X86::EFLAGS) && !TestMBB->pred_empty() &&
+ !HasEFLAGSClobber(TestMBB->begin(), TestPos)) {
+ // Find the nearest common dominator of the predecessors, as
+ // that will be the best candidate to hoist into.
+ MachineBasicBlock *HoistMBB =
+ std::accumulate(std::next(TestMBB->pred_begin()), TestMBB->pred_end(),
+ *TestMBB->pred_begin(),
+ [&](MachineBasicBlock *LHS, MachineBasicBlock *RHS) {
+ return MDT->findNearestCommonDominator(LHS, RHS);
+ });
+
+ // Now we need to scan all predecessors that may be reached along paths to
+ // the hoist block. A clobber anywhere in any of these blocks prevents the
+ // hoist.
+ // Note that this even handles loops because we require *no* clobbers.
+ if (HasEFLAGSClobberPath(HoistMBB, TestMBB))
+ break;
+
+ // We also need the terminators to not sneakily clobber flags.
+ if (HasEFLAGSClobber(HoistMBB->getFirstTerminator()->getIterator(),
+ HoistMBB->instr_end()))
+ break;
+
+ // We found a viable location, hoist our test position to it.
+ TestMBB = HoistMBB;
+ TestPos = TestMBB->getFirstTerminator()->getIterator();
+ // Clear the debug location as it would just be confusing after hoisting.
+ TestLoc = DebugLoc();
+ }
+ LLVM_DEBUG({
+ auto DefIt = llvm::find_if(
+ llvm::reverse(llvm::make_range(TestMBB->instr_begin(), TestPos)),
+ [&](MachineInstr &MI) {
+ return MI.findRegisterDefOperand(X86::EFLAGS);
+ });
+ if (DefIt.base() != TestMBB->instr_begin()) {
+ dbgs() << " Using EFLAGS defined by: ";
+ DefIt->dump();
+ } else {
+ dbgs() << " Using live-in flags for BB:\n";
+ TestMBB->dump();
+ }
+ });
+
+ // While rewriting uses, we buffer jumps and rewrite them in a second pass
+ // because doing so will perturb the CFG that we are walking to find the
+ // uses in the first place.
+ SmallVector<MachineInstr *, 4> JmpIs;
+
+ // Gather the condition flags that have already been preserved in
+ // registers. We do this from scratch each time as we expect there to be
+ // very few of them and we expect to not revisit the same copy definition
+ // many times. If either of those change sufficiently we could build a map
+ // of these up front instead.
+ CondRegArray CondRegs = collectCondsInRegs(*TestMBB, TestPos);
+
+ // Collect the basic blocks we need to scan. Typically this will just be
+ // a single basic block but we may have to scan multiple blocks if the
+ // EFLAGS copy lives into successors.
+ SmallVector<MachineBasicBlock *, 2> Blocks;
+ SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks;
+ Blocks.push_back(&MBB);
+
+ do {
+ MachineBasicBlock &UseMBB = *Blocks.pop_back_val();
+
+ // Track if/when we find a kill of the flags in this block.
+ bool FlagsKilled = false;
+
+ // In most cases, we walk from the beginning to the end of the block. But
+ // when the block is the same block as the copy is from, we will visit it
+ // twice. The first time we start from the copy and go to the end. The
+ // second time we start from the beginning and go to the copy. This lets
+ // us handle copies inside of cycles.
+ // FIXME: This loop is *super* confusing. This is at least in part
+ // a symptom of all of this routine needing to be refactored into
+ // documentable components. Once done, there may be a better way to write
+ // this loop.
+ for (auto MII = (&UseMBB == &MBB && !VisitedBlocks.count(&UseMBB))
+ ? std::next(CopyI->getIterator())
+ : UseMBB.instr_begin(),
+ MIE = UseMBB.instr_end();
+ MII != MIE;) {
+ MachineInstr &MI = *MII++;
+ // If we are in the original copy block and encounter either the copy
+ // def or the copy itself, break so that we don't re-process any part of
+ // the block or process the instructions in the range that was copied
+ // over.
+ if (&MI == CopyI || &MI == &CopyDefI) {
+ assert(&UseMBB == &MBB && VisitedBlocks.count(&MBB) &&
+ "Should only encounter these on the second pass over the "
+ "original block.");
+ break;
+ }
+
+ MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
+ if (!FlagUse) {
+ if (MI.findRegisterDefOperand(X86::EFLAGS)) {
+ // If EFLAGS are defined, it's as-if they were killed. We can stop
+ // scanning here.
+ //
+ // NB!!! Many instructions only modify some flags. LLVM currently
+ // models this as clobbering all flags, but if that ever changes
+ // this will need to be carefully updated to handle that more
+ // complex logic.
+ FlagsKilled = true;
+ break;
+ }
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << " Rewriting use: "; MI.dump());
+
+ // Check the kill flag before we rewrite as that may change it.
+ if (FlagUse->isKill())
+ FlagsKilled = true;
+
+ // Once we encounter a branch, the rest of the instructions must also be
+ // branches. We can't rewrite in place here, so we handle them below.
+ //
+ // Note that we don't have to handle tail calls here, even conditional
+ // tail calls, as those are not introduced into the X86 MI until post-RA
+ // branch folding or block placement. As a consequence, we get to deal
+ // with the simpler formulation of conditional branches followed by tail
+ // calls.
+ if (X86::getCondFromBranch(MI) != X86::COND_INVALID) {
+ auto JmpIt = MI.getIterator();
+ do {
+ JmpIs.push_back(&*JmpIt);
+ ++JmpIt;
+ } while (JmpIt != UseMBB.instr_end() &&
+ X86::getCondFromBranch(*JmpIt) !=
+ X86::COND_INVALID);
+ break;
+ }
+
+ // Otherwise we can just rewrite in-place.
+ if (X86::getCondFromCMov(MI) != X86::COND_INVALID) {
+ rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+ } else if (getCondFromFCMOV(MI.getOpcode()) != X86::COND_INVALID) {
+ rewriteFCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+ } else if (X86::getCondFromSETCC(MI) != X86::COND_INVALID) {
+ rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+ } else if (MI.getOpcode() == TargetOpcode::COPY) {
+ rewriteCopy(MI, *FlagUse, CopyDefI);
+ } else {
+ // We assume all other instructions that use flags also def them.
+ assert(MI.findRegisterDefOperand(X86::EFLAGS) &&
+ "Expected a def of EFLAGS for this instruction!");
+
+ // NB!!! Several arithmetic instructions only *partially* update
+ // flags. Theoretically, we could generate MI code sequences that
+ // would rely on this fact and observe different flags independently.
+ // But currently LLVM models all of these instructions as clobbering
+ // all the flags in an undef way. We rely on that to simplify the
+ // logic.
+ FlagsKilled = true;
+
+ // Generically handle remaining uses as arithmetic instructions.
+ rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
+ CondRegs);
+ }
+
+ // If this was the last use of the flags, we're done.
+ if (FlagsKilled)
+ break;
+ }
+
+ // If the flags were killed, we're done with this block.
+ if (FlagsKilled)
+ continue;
+
+ // Otherwise we need to scan successors for ones where the flags live-in
+ // and queue those up for processing.
+ for (MachineBasicBlock *SuccMBB : UseMBB.successors())
+ if (SuccMBB->isLiveIn(X86::EFLAGS) &&
+ VisitedBlocks.insert(SuccMBB).second) {
+ // We currently don't do any PHI insertion and so we require that the
+ // test basic block dominates all of the use basic blocks. Further, we
+ // can't have a cycle from the test block back to itself as that would
+ // create a cycle requiring a PHI to break it.
+ //
+ // We could in theory do PHI insertion here if it becomes useful by
+ // just taking undef values in along every edge that we don't trace
+ // this EFLAGS copy along. This isn't as bad as fully general PHI
+ // insertion, but still seems like a great deal of complexity.
+ //
+ // Because it is theoretically possible that some earlier MI pass or
+ // other lowering transformation could induce this to happen, we do
+ // a hard check even in non-debug builds here.
+ if (SuccMBB == TestMBB || !MDT->dominates(TestMBB, SuccMBB)) {
+ LLVM_DEBUG({
+ dbgs()
+ << "ERROR: Encountered use that is not dominated by our test "
+ "basic block! Rewriting this would require inserting PHI "
+ "nodes to track the flag state across the CFG.\n\nTest "
+ "block:\n";
+ TestMBB->dump();
+ dbgs() << "Use block:\n";
+ SuccMBB->dump();
+ });
+ report_fatal_error(
+ "Cannot lower EFLAGS copy when original copy def "
+ "does not dominate all uses.");
+ }
+
+ Blocks.push_back(SuccMBB);
+
+ // After this, EFLAGS will be recreated before each use.
+ SuccMBB->removeLiveIn(X86::EFLAGS);
+ }
+ } while (!Blocks.empty());
+
+ // Now rewrite the jumps that use the flags. These we handle specially
+ // because if there are multiple jumps in a single basic block we'll have
+ // to do surgery on the CFG.
+ MachineBasicBlock *LastJmpMBB = nullptr;
+ for (MachineInstr *JmpI : JmpIs) {
+ // Past the first jump within a basic block we need to split the blocks
+ // apart.
+ if (JmpI->getParent() == LastJmpMBB)
+ splitBlock(*JmpI->getParent(), *JmpI, *TII);
+ else
+ LastJmpMBB = JmpI->getParent();
+
+ rewriteCondJmp(*TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
+ }
+
+ // FIXME: Mark the last use of EFLAGS before the copy's def as a kill if
+ // the copy's def operand is itself a kill.
+ }
+
+#ifndef NDEBUG
+ for (MachineBasicBlock &MBB : MF)
+ for (MachineInstr &MI : MBB)
+ if (MI.getOpcode() == TargetOpcode::COPY &&
+ (MI.getOperand(0).getReg() == X86::EFLAGS ||
+ MI.getOperand(1).getReg() == X86::EFLAGS)) {
+ LLVM_DEBUG(dbgs() << "ERROR: Found a COPY involving EFLAGS: ";
+ MI.dump());
+ llvm_unreachable("Unlowered EFLAGS copy!");
+ }
+#endif
+
+ return true;
+}
+
+/// Collect any conditions that have already been set in registers so that we
+/// can re-use them rather than adding duplicates.
+CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator TestPos) {
+ CondRegArray CondRegs = {};
+
+ // Scan backwards across the range of instructions with live EFLAGS.
+ for (MachineInstr &MI :
+ llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) {
+ X86::CondCode Cond = X86::getCondFromSETCC(MI);
+ if (Cond != X86::COND_INVALID && !MI.mayStore() &&
+ MI.getOperand(0).isReg() && MI.getOperand(0).getReg().isVirtual()) {
+ assert(MI.getOperand(0).isDef() &&
+ "A non-storing SETcc should always define a register!");
+ CondRegs[Cond] = MI.getOperand(0).getReg();
+ }
+
+ // Stop scanning when we see the first definition of the EFLAGS as prior to
+ // this we would potentially capture the wrong flag state.
+ if (MI.findRegisterDefOperand(X86::EFLAGS))
+ break;
+ }
+ return CondRegs;
+}
+
+Register X86FlagsCopyLoweringPass::promoteCondToReg(
+ MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
+ DebugLoc TestLoc, X86::CondCode Cond) {
+ Register Reg = MRI->createVirtualRegister(PromoteRC);
+ auto SetI = BuildMI(TestMBB, TestPos, TestLoc,
+ TII->get(X86::SETCCr), Reg).addImm(Cond);
+ (void)SetI;
+ LLVM_DEBUG(dbgs() << " save cond: "; SetI->dump());
+ ++NumSetCCsInserted;
+ return Reg;
+}
+
+std::pair<unsigned, bool> X86FlagsCopyLoweringPass::getCondOrInverseInReg(
+ MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
+ DebugLoc TestLoc, X86::CondCode Cond, CondRegArray &CondRegs) {
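+ // Prefer a register that already holds this condition or its inverse and
+ // only materialize a new setCC when neither is available. The returned
+ // flag reports whether the register holds the inverse of the requested
+ // condition.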
+ unsigned &CondReg = CondRegs[Cond];
+ unsigned &InvCondReg = CondRegs[X86::GetOppositeBranchCondition(Cond)];
+ if (!CondReg && !InvCondReg)
+ CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
+
+ if (CondReg)
+ return {CondReg, false};
+ else
+ return {InvCondReg, true};
+}
+
+void X86FlagsCopyLoweringPass::insertTest(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Pos,
+ DebugLoc Loc, unsigned Reg) {
+ auto TestI =
+ BuildMI(MBB, Pos, Loc, TII->get(X86::TEST8rr)).addReg(Reg).addReg(Reg);
+ (void)TestI;
+ LLVM_DEBUG(dbgs() << " test cond: "; TestI->dump());
+ ++NumTestsInserted;
+}
+
+void X86FlagsCopyLoweringPass::rewriteArithmetic(
+ MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
+ DebugLoc TestLoc, MachineInstr &MI, MachineOperand &FlagUse,
+ CondRegArray &CondRegs) {
+ // Arithmetic is either reading CF or OF. Figure out which condition we need
+ // to preserve in a register.
+ X86::CondCode Cond = X86::COND_INVALID;
+
+ // The addend to use to reset CF or OF when added to the flag value.
+ int Addend = 0;
+
+ switch (getMnemonicFromOpcode(MI.getOpcode())) {
+ case FlagArithMnemonic::ADC:
+ case FlagArithMnemonic::ADCX:
+ case FlagArithMnemonic::RCL:
+ case FlagArithMnemonic::RCR:
+ case FlagArithMnemonic::SBB:
+ case FlagArithMnemonic::SETB:
+ Cond = X86::COND_B; // CF == 1
+ // Set up an addend such that adding one to it carries out of the low
+ // byte: 255 + 1 overflows the 8-bit register and sets CF.
+ Addend = 255;
+ break;
+
+ case FlagArithMnemonic::ADOX:
+ Cond = X86::COND_O; // OF == 1
+ // Set up an addend such that adding one to it flips the value from
+ // positive to negative, overflowing in the signed domain and setting OF.
+ Addend = 127;
+ break;
+ }
+
+ // Now get a register that contains the value of the flag input to the
+ // arithmetic. We require exactly this flag to simplify the arithmetic
+ // required to materialize it back into the flag.
+ unsigned &CondReg = CondRegs[Cond];
+ if (!CondReg)
+ CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
+
+ MachineBasicBlock &MBB = *MI.getParent();
+
+ // Insert an instruction that will set the flag back to the desired value.
+ Register TmpReg = MRI->createVirtualRegister(PromoteRC);
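+ // The ADD's register result is dead; only its effect on CF/OF is used.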
+ auto AddI =
+ BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(), TII->get(X86::ADD8ri))
+ .addDef(TmpReg, RegState::Dead)
+ .addReg(CondReg)
+ .addImm(Addend);
+ (void)AddI;
+ LLVM_DEBUG(dbgs() << " add cond: "; AddI->dump());
+ ++NumAddsInserted;
+ FlagUse.setIsKill(true);
+}
+
+void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB,
+ MachineBasicBlock::iterator TestPos,
+ DebugLoc TestLoc,
+ MachineInstr &CMovI,
+ MachineOperand &FlagUse,
+ CondRegArray &CondRegs) {
+ // First get the register containing this specific condition.
+ X86::CondCode Cond = X86::getCondFromCMov(CMovI);
+ unsigned CondReg;
+ bool Inverted;
+ std::tie(CondReg, Inverted) =
+ getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
+
+ MachineBasicBlock &MBB = *CMovI.getParent();
+
+ // Insert a direct test of the saved register.
+ insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg);
+
+ // Rewrite the CMov to use the !ZF flag from the test, and then kill its use
+ // of the flags afterward.
+ CMovI.getOperand(CMovI.getDesc().getNumOperands() - 1)
+ .setImm(Inverted ? X86::COND_E : X86::COND_NE);
+ FlagUse.setIsKill(true);
+ LLVM_DEBUG(dbgs() << " fixed cmov: "; CMovI.dump());
+}
+
+void X86FlagsCopyLoweringPass::rewriteFCMov(MachineBasicBlock &TestMBB,
+ MachineBasicBlock::iterator TestPos,
+ DebugLoc TestLoc,
+ MachineInstr &CMovI,
+ MachineOperand &FlagUse,
+ CondRegArray &CondRegs) {
+ // First get the register containing this specific condition.
+ X86::CondCode Cond = getCondFromFCMOV(CMovI.getOpcode());
+ unsigned CondReg;
+ bool Inverted;
+ std::tie(CondReg, Inverted) =
+ getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
+
+ MachineBasicBlock &MBB = *CMovI.getParent();
+
+ // Insert a direct test of the saved register.
+ insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg);
+
+ auto getFCMOVOpcode = [](unsigned Opcode, bool Inverted) {
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case X86::CMOVBE_Fp32: case X86::CMOVNBE_Fp32:
+ case X86::CMOVB_Fp32: case X86::CMOVNB_Fp32:
+ case X86::CMOVE_Fp32: case X86::CMOVNE_Fp32:
+ case X86::CMOVP_Fp32: case X86::CMOVNP_Fp32:
+ return Inverted ? X86::CMOVE_Fp32 : X86::CMOVNE_Fp32;
+ case X86::CMOVBE_Fp64: case X86::CMOVNBE_Fp64:
+ case X86::CMOVB_Fp64: case X86::CMOVNB_Fp64:
+ case X86::CMOVE_Fp64: case X86::CMOVNE_Fp64:
+ case X86::CMOVP_Fp64: case X86::CMOVNP_Fp64:
+ return Inverted ? X86::CMOVE_Fp64 : X86::CMOVNE_Fp64;
+ case X86::CMOVBE_Fp80: case X86::CMOVNBE_Fp80:
+ case X86::CMOVB_Fp80: case X86::CMOVNB_Fp80:
+ case X86::CMOVE_Fp80: case X86::CMOVNE_Fp80:
+ case X86::CMOVP_Fp80: case X86::CMOVNP_Fp80:
+ return Inverted ? X86::CMOVE_Fp80 : X86::CMOVNE_Fp80;
+ }
+ };
+
+ // Rewrite the CMov to use the !ZF flag from the test.
+ CMovI.setDesc(TII->get(getFCMOVOpcode(CMovI.getOpcode(), Inverted)));
+ FlagUse.setIsKill(true);
+ LLVM_DEBUG(dbgs() << " fixed fcmov: "; CMovI.dump());
+}
+
+void X86FlagsCopyLoweringPass::rewriteCondJmp(
+ MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
+ DebugLoc TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) {
+ // First get the register containing this specific condition.
+ X86::CondCode Cond = X86::getCondFromBranch(JmpI);
+ unsigned CondReg;
+ bool Inverted;
+ std::tie(CondReg, Inverted) =
+ getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
+
+ MachineBasicBlock &JmpMBB = *JmpI.getParent();
+
+ // Insert a direct test of the saved register.
+ insertTest(JmpMBB, JmpI.getIterator(), JmpI.getDebugLoc(), CondReg);
+
+ // Rewrite the jump to use the !ZF flag from the test, and kill its use of
+ // flags afterward.
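+ // The condition code is carried as the immediate in operand 1 of the jCC.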
+ JmpI.getOperand(1).setImm(Inverted ? X86::COND_E : X86::COND_NE);
+ JmpI.findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
+ LLVM_DEBUG(dbgs() << " fixed jCC: "; JmpI.dump());
+}
+
+void X86FlagsCopyLoweringPass::rewriteCopy(MachineInstr &MI,
+ MachineOperand &FlagUse,
+ MachineInstr &CopyDefI) {
+ // Just replace this copy with the original copy def.
+ MRI->replaceRegWith(MI.getOperand(0).getReg(),
+ CopyDefI.getOperand(0).getReg());
+ MI.eraseFromParent();
+}
+
+void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB,
+ MachineBasicBlock::iterator TestPos,
+ DebugLoc TestLoc,
+ MachineInstr &SetCCI,
+ MachineOperand &FlagUse,
+ CondRegArray &CondRegs) {
+ X86::CondCode Cond = X86::getCondFromSETCC(SetCCI);
+ // Note that we can't usefully rewrite this to the inverse without complex
+ // analysis of the users of the setCC. Largely we rely on avoidable
+ // duplicates having already been avoided before we get here.
+ unsigned &CondReg = CondRegs[Cond];
+ if (!CondReg)
+ CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
+
+ // Rewriting a register def is trivial: we just replace the register and
+ // remove the setcc.
+ if (!SetCCI.mayStore()) {
+ assert(SetCCI.getOperand(0).isReg() &&
+ "Cannot have a non-register defined operand to SETcc!");
+ MRI->replaceRegWith(SetCCI.getOperand(0).getReg(), CondReg);
+ SetCCI.eraseFromParent();
+ return;
+ }
+
+ // Otherwise, we need to emit a store.
+ auto MIB = BuildMI(*SetCCI.getParent(), SetCCI.getIterator(),
+ SetCCI.getDebugLoc(), TII->get(X86::MOV8mr));
+ // Copy the address operands.
+ for (int i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.add(SetCCI.getOperand(i));
+
+ MIB.addReg(CondReg);
+
+ MIB.setMemRefs(SetCCI.memoperands());
+
+ SetCCI.eraseFromParent();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp
new file mode 100644
index 000000000000..e6ee46957500
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -0,0 +1,1730 @@
+//===-- X86FloatingPoint.cpp - Floating point Reg -> Stack converter ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which converts floating point instructions from
+// pseudo registers into register stack instructions. This pass uses live
+// variable information to indicate where the FPn registers are used and their
+// lifetimes.
+//
+// The x87 hardware tracks liveness of the stack registers, so it is necessary
+// to implement exact liveness tracking between basic blocks. The CFG edges are
+// partitioned into bundles where the same FP registers must be live in
+// identical stack positions. Instructions are inserted at the end of each basic
+// block to rearrange the live registers to match the outgoing bundle.
+//
+// This approach avoids splitting critical edges at the potential cost of more
+// live register shuffling instructions when critical edges are present.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/EdgeBundles.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+#include <bitset>
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-codegen"
+
+STATISTIC(NumFXCH, "Number of fxch instructions inserted");
+STATISTIC(NumFP , "Number of floating point instructions");
+
+namespace {
+ const unsigned ScratchFPReg = 7;
+
+ struct FPS : public MachineFunctionPass {
+ static char ID;
+ FPS() : MachineFunctionPass(ID) {
+ // This is really only to keep valgrind quiet.
+ // The logic in isLive() is too much for it.
+ memset(Stack, 0, sizeof(Stack));
+ memset(RegMap, 0, sizeof(RegMap));
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<EdgeBundles>();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ StringRef getPassName() const override { return "X86 FP Stackifier"; }
+
+ private:
+ const TargetInstrInfo *TII = nullptr; // Machine instruction info.
+
+ // Two CFG edges are related if they leave the same block, or enter the same
+ // block. The transitive closure of an edge under this relation is a
+ // LiveBundle. It represents a set of CFG edges where the live FP stack
+ // registers must be allocated identically in the x87 stack.
+ //
+ // A LiveBundle is usually all the edges leaving a block, or all the edges
+ // entering a block, but it can contain more edges if critical edges are
+ // present.
+ //
+ // The set of live FP registers in a LiveBundle is calculated by bundleCFG,
+ // but the exact mapping of FP registers to stack slots is fixed later.
+ struct LiveBundle {
+ // Bit mask of live FP registers. Bit 0 = FP0, bit 1 = FP1, &c.
+ unsigned Mask;
+
+ // Number of pre-assigned live registers in FixStack. This is 0 when the
+ // stack order has not yet been fixed.
+ unsigned FixCount;
+
+ // Assigned stack order for live-in registers.
+ // FixStack[i] == getStackEntry(i) for all i < FixCount.
+ unsigned char FixStack[8];
+
+ LiveBundle() : Mask(0), FixCount(0) {}
+
+ // Have the live registers been assigned a stack order yet?
+ bool isFixed() const { return !Mask || FixCount; }
+ };
+
+ // Numbered LiveBundle structs. LiveBundles[0] is used for all CFG edges
+ // with no live FP registers.
+ SmallVector<LiveBundle, 8> LiveBundles;
+
+ // The edge bundle analysis provides indices into the LiveBundles vector.
+ EdgeBundles *Bundles = nullptr;
+
+ // Return a bitmask of FP registers in block's live-in list.
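+ // When RemoveFPs is true, those FP registers are also stripped from the
+ // live-in list as a side effect.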
+ static unsigned calcLiveInMask(MachineBasicBlock *MBB, bool RemoveFPs) {
+ unsigned Mask = 0;
+ for (MachineBasicBlock::livein_iterator I = MBB->livein_begin();
+ I != MBB->livein_end(); ) {
+ MCPhysReg Reg = I->PhysReg;
+ static_assert(X86::FP6 - X86::FP0 == 6, "sequential regnums");
+ if (Reg >= X86::FP0 && Reg <= X86::FP6) {
+ Mask |= 1 << (Reg - X86::FP0);
+ if (RemoveFPs) {
+ I = MBB->removeLiveIn(I);
+ continue;
+ }
+ }
+ ++I;
+ }
+ return Mask;
+ }
+
+ // Partition all the CFG edges into LiveBundles.
+ void bundleCFGRecomputeKillFlags(MachineFunction &MF);
+
+ MachineBasicBlock *MBB = nullptr; // Current basic block
+
+ // The hardware keeps track of how many FP registers are live, so we have
+ // to model that exactly. Usually, each live register corresponds to an
+ // FP<n> register, but when dealing with calls, returns, and inline
+ // assembly, it is sometimes necessary to have live scratch registers.
+ unsigned Stack[8]; // FP<n> Registers in each stack slot...
+ unsigned StackTop = 0; // The current top of the FP stack.
+
+ enum {
+ NumFPRegs = 8 // Including scratch pseudo-registers.
+ };
+
+ // For each live FP<n> register, point to its Stack[] entry.
+ // The first entries correspond to FP0-FP6, the rest are scratch registers
+ // used when we need slightly different live registers than what the
+ // register allocator thinks.
+ unsigned RegMap[NumFPRegs];
+
+ // Set up our stack model to match the incoming registers to MBB.
+ void setupBlockStack();
+
+ // Shuffle live registers to match the expectations of successor blocks.
+ void finishBlockStack();
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dumpStack() const {
+ dbgs() << "Stack contents:";
+ for (unsigned i = 0; i != StackTop; ++i) {
+ dbgs() << " FP" << Stack[i];
+ assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!");
+ }
+ }
+#endif
+
+ /// getSlot - Return the stack slot number a particular register number is
+ /// in.
+ unsigned getSlot(unsigned RegNo) const {
+ assert(RegNo < NumFPRegs && "Regno out of range!");
+ return RegMap[RegNo];
+ }
+
+ /// isLive - Is RegNo currently live in the stack?
+ bool isLive(unsigned RegNo) const {
+ unsigned Slot = getSlot(RegNo);
+ return Slot < StackTop && Stack[Slot] == RegNo;
+ }
+
+ /// getStackEntry - Return the X86::FP<n> register in register ST(i).
+ unsigned getStackEntry(unsigned STi) const {
+ if (STi >= StackTop)
+ report_fatal_error("Access past stack top!");
+ return Stack[StackTop-1-STi];
+ }
+
+ /// getSTReg - Return the X86::ST(i) register which contains the specified
+ /// FP<RegNo> register.
+ unsigned getSTReg(unsigned RegNo) const {
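+ // ST(0) is the top of the stack, so the ST number is the slot's distance
+ // from the top.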
+ return StackTop - 1 - getSlot(RegNo) + X86::ST0;
+ }
+
+ // pushReg - Push the specified FP<n> register onto the stack.
+ void pushReg(unsigned Reg) {
+ assert(Reg < NumFPRegs && "Register number out of range!");
+ if (StackTop >= 8)
+ report_fatal_error("Stack overflow!");
+ Stack[StackTop] = Reg;
+ RegMap[Reg] = StackTop++;
+ }
+
+ // popReg - Pop a register from the stack.
+ void popReg() {
+ if (StackTop == 0)
+ report_fatal_error("Cannot pop empty stack!");
+ RegMap[Stack[--StackTop]] = ~0; // Update state
+ }
+
+ bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop-1; }
+ void moveToTop(unsigned RegNo, MachineBasicBlock::iterator I) {
+ DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc();
+ if (isAtTop(RegNo)) return;
+
+ unsigned STReg = getSTReg(RegNo);
+ unsigned RegOnTop = getStackEntry(0);
+
+ // Swap the slots the regs are in.
+ std::swap(RegMap[RegNo], RegMap[RegOnTop]);
+
+ // Swap stack slot contents.
+ if (RegMap[RegOnTop] >= StackTop)
+ report_fatal_error("Access past stack top!");
+ std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]);
+
+ // Emit an fxch to update the processor's runtime version of the state.
+ BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(STReg);
+ ++NumFXCH;
+ }
+
+ void duplicateToTop(unsigned RegNo, unsigned AsReg,
+ MachineBasicBlock::iterator I) {
+ DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc();
+ unsigned STReg = getSTReg(RegNo);
+ pushReg(AsReg); // New register on top of stack
+
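+ // Emit an fld of the source slot; the duplicate lands in ST(0).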
+ BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg);
+ }
+
+ /// popStackAfter - Pop the current value off of the top of the FP stack
+ /// after the specified instruction.
+ void popStackAfter(MachineBasicBlock::iterator &I);
+
+ /// freeStackSlotAfter - Free the specified register from the register
+ /// stack, so that it is no longer in a register. If the register is
+ /// currently at the top of the stack, we just pop the current instruction,
+ /// otherwise we store the current top-of-stack into the specified slot,
+ /// then pop the top of stack.
+ void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg);
+
+ /// freeStackSlotBefore - Just the pop, no folding. Return the inserted
+ /// instruction.
+ MachineBasicBlock::iterator
+ freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo);
+
+ /// Adjust the live registers to be the set in Mask.
+ void adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I);
+
+ /// Shuffle the top FixCount stack entries such that FP reg FixStack[0] is
+ /// st(0), FP reg FixStack[1] is st(1) etc.
+ void shuffleStackTop(const unsigned char *FixStack, unsigned FixCount,
+ MachineBasicBlock::iterator I);
+
+ bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+ void handleCall(MachineBasicBlock::iterator &I);
+ void handleReturn(MachineBasicBlock::iterator &I);
+ void handleZeroArgFP(MachineBasicBlock::iterator &I);
+ void handleOneArgFP(MachineBasicBlock::iterator &I);
+ void handleOneArgFPRW(MachineBasicBlock::iterator &I);
+ void handleTwoArgFP(MachineBasicBlock::iterator &I);
+ void handleCompareFP(MachineBasicBlock::iterator &I);
+ void handleCondMovFP(MachineBasicBlock::iterator &I);
+ void handleSpecialFP(MachineBasicBlock::iterator &I);
+
+ // Check if a COPY instruction is using FP registers.
+ static bool isFPCopy(MachineInstr &MI) {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+
+ return X86::RFP80RegClass.contains(DstReg) ||
+ X86::RFP80RegClass.contains(SrcReg);
+ }
+
+ void setKillFlags(MachineBasicBlock &MBB) const;
+ };
+}
+
+char FPS::ID = 0;
+
+INITIALIZE_PASS_BEGIN(FPS, DEBUG_TYPE, "X86 FP Stackifier",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(EdgeBundles)
+INITIALIZE_PASS_END(FPS, DEBUG_TYPE, "X86 FP Stackifier",
+ false, false)
+
+FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); }
+
+/// getFPReg - Return the X86::FPx register number for the specified operand.
+/// For example, this returns 3 for X86::FP3.
+static unsigned getFPReg(const MachineOperand &MO) {
+ assert(MO.isReg() && "Expected an FP register!");
+ Register Reg = MO.getReg();
+ assert(Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!");
+ return Reg - X86::FP0;
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, transforming FP
+/// register references into FP stack references.
+///
+bool FPS::runOnMachineFunction(MachineFunction &MF) {
+ // We only need to run this pass if there are any FP registers used in this
+ // function. If it is all integer, there is nothing for us to do!
+ bool FPIsUsed = false;
+
+ static_assert(X86::FP6 == X86::FP0+6, "Register enums aren't sorted right!");
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (unsigned i = 0; i <= 6; ++i)
+ if (!MRI.reg_nodbg_empty(X86::FP0 + i)) {
+ FPIsUsed = true;
+ break;
+ }
+
+ // Early exit.
+ if (!FPIsUsed) return false;
+
+ Bundles = &getAnalysis<EdgeBundles>();
+ TII = MF.getSubtarget().getInstrInfo();
+
+ // Prepare cross-MBB liveness.
+ bundleCFGRecomputeKillFlags(MF);
+
+ StackTop = 0;
+
+ // Process the function in depth first order so that we process at least one
+ // of the predecessors for every reachable block in the function.
+ df_iterator_default_set<MachineBasicBlock*> Processed;
+ MachineBasicBlock *Entry = &MF.front();
+
+ LiveBundle &Bundle =
+ LiveBundles[Bundles->getBundle(Entry->getNumber(), false)];
+
+ // In regcall convention, some FP registers may not be passed through
+ // the stack, so they will need to be assigned to the stack first
+ if ((Entry->getParent()->getFunction().getCallingConv() ==
+ CallingConv::X86_RegCall) && (Bundle.Mask && !Bundle.FixCount)) {
+ // In the register calling convention, up to one FP argument could be
+ // saved in the first FP register.
+ // If Bundle.Mask is non-zero and Bundle.FixCount is zero, it means
+ // that the FP registers contain arguments.
+ // The actual value is passed in FP0.
+ // Here we fix the stack and mark FP0 as pre-assigned register.
+ assert((Bundle.Mask & 0xFE) == 0 &&
+ "Only FP0 could be passed as an argument");
+ Bundle.FixCount = 1;
+ Bundle.FixStack[0] = 0;
+ }
+
+ bool Changed = false;
+ for (MachineBasicBlock *BB : depth_first_ext(Entry, Processed))
+ Changed |= processBasicBlock(MF, *BB);
+
+ // Process any unreachable blocks in arbitrary order now.
+ if (MF.size() != Processed.size())
+ for (MachineBasicBlock &BB : MF)
+ if (Processed.insert(&BB).second)
+ Changed |= processBasicBlock(MF, BB);
+
+ LiveBundles.clear();
+
+ return Changed;
+}
+
+/// bundleCFGRecomputeKillFlags - Scan all the basic blocks to determine
+/// consistent live-in and
+/// live-out sets for the FP registers. Consistent means that the set of
+/// registers live-out from a block is identical to the live-in set of all
+/// successors. This is not enforced by the normal live-in lists since
+/// registers may be implicitly defined, or not used by all successors.
+void FPS::bundleCFGRecomputeKillFlags(MachineFunction &MF) {
+ assert(LiveBundles.empty() && "Stale data in LiveBundles");
+ LiveBundles.resize(Bundles->getNumBundles());
+
+ // Gather the actual live-in masks for all MBBs.
+ for (MachineBasicBlock &MBB : MF) {
+ setKillFlags(MBB);
+
+ const unsigned Mask = calcLiveInMask(&MBB, false);
+ if (!Mask)
+ continue;
+ // Update MBB ingoing bundle mask.
+ LiveBundles[Bundles->getBundle(MBB.getNumber(), false)].Mask |= Mask;
+ }
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// transforming FP instructions into their stack form.
+///
+bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
+ bool Changed = false;
+ MBB = &BB;
+
+ setupBlockStack();
+
+ for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+ MachineInstr &MI = *I;
+ uint64_t Flags = MI.getDesc().TSFlags;
+
+ unsigned FPInstClass = Flags & X86II::FPTypeMask;
+ if (MI.isInlineAsm())
+ FPInstClass = X86II::SpecialFP;
+
+ if (MI.isCopy() && isFPCopy(MI))
+ FPInstClass = X86II::SpecialFP;
+
+ if (MI.isImplicitDef() &&
+ X86::RFP80RegClass.contains(MI.getOperand(0).getReg()))
+ FPInstClass = X86II::SpecialFP;
+
+ if (MI.isCall())
+ FPInstClass = X86II::SpecialFP;
+
+ if (FPInstClass == X86II::NotFP)
+ continue; // Efficiently ignore non-fp insts!
+
+ MachineInstr *PrevMI = nullptr;
+ if (I != BB.begin())
+ PrevMI = &*std::prev(I);
+
+ ++NumFP; // Keep track of # of pseudo instrs
+ LLVM_DEBUG(dbgs() << "\nFPInst:\t" << MI);
+
+ // Get dead variables list now because the MI pointer may be deleted as part
+ // of processing!
+ SmallVector<unsigned, 8> DeadRegs;
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (MO.isReg() && MO.isDead())
+ DeadRegs.push_back(MO.getReg());
+ }
+
+ switch (FPInstClass) {
+ case X86II::ZeroArgFP: handleZeroArgFP(I); break;
+ case X86II::OneArgFP: handleOneArgFP(I); break; // fstp ST(0)
+ case X86II::OneArgFPRW: handleOneArgFPRW(I); break; // ST(0) = fsqrt(ST(0))
+ case X86II::TwoArgFP: handleTwoArgFP(I); break;
+ case X86II::CompareFP: handleCompareFP(I); break;
+ case X86II::CondMovFP: handleCondMovFP(I); break;
+ case X86II::SpecialFP: handleSpecialFP(I); break;
+ default: llvm_unreachable("Unknown FP Type!");
+ }
+
+ // Check to see if any of the values defined by this instruction are dead
+ // after definition. If so, pop them.
+ for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) {
+ unsigned Reg = DeadRegs[i];
+ // Check if Reg is live on the stack. An inline-asm register operand that
+ // is in the clobber list and marked dead might not be live on the stack.
+ static_assert(X86::FP7 - X86::FP0 == 7, "sequential FP regnumbers");
+ if (Reg >= X86::FP0 && Reg <= X86::FP6 && isLive(Reg-X86::FP0)) {
+ LLVM_DEBUG(dbgs() << "Register FP#" << Reg - X86::FP0 << " is dead!\n");
+ freeStackSlotAfter(I, Reg-X86::FP0);
+ }
+ }
+
+ // Print out all of the instructions this expanded to, if -debug is enabled.
+ LLVM_DEBUG({
+ MachineBasicBlock::iterator PrevI = PrevMI;
+ if (I == PrevI) {
+ dbgs() << "Just deleted pseudo instruction\n";
+ } else {
+ MachineBasicBlock::iterator Start = I;
+ // Rewind to first instruction newly inserted.
+ while (Start != BB.begin() && std::prev(Start) != PrevI)
+ --Start;
+ dbgs() << "Inserted instructions:\n\t";
+ Start->print(dbgs());
+ while (++Start != std::next(I)) {
+ }
+ }
+ dumpStack();
+ });
+ (void)PrevMI;
+
+ Changed = true;
+ }
+
+ finishBlockStack();
+
+ return Changed;
+}
+
+/// setupBlockStack - Use the live bundles to set up our model of the stack
+/// to match predecessors' live out stack.
+void FPS::setupBlockStack() {
+ LLVM_DEBUG(dbgs() << "\nSetting up live-ins for " << printMBBReference(*MBB)
+ << " derived from " << MBB->getName() << ".\n");
+ StackTop = 0;
+ // Get the live-in bundle for MBB.
+ const LiveBundle &Bundle =
+ LiveBundles[Bundles->getBundle(MBB->getNumber(), false)];
+
+ if (!Bundle.Mask) {
+ LLVM_DEBUG(dbgs() << "Block has no FP live-ins.\n");
+ return;
+ }
+
+ // Depth-first iteration should ensure that we always have an assigned stack.
+ assert(Bundle.isFixed() && "Reached block before any predecessors");
+
+ // Push the fixed live-in registers.
+ for (unsigned i = Bundle.FixCount; i > 0; --i) {
+ LLVM_DEBUG(dbgs() << "Live-in st(" << (i - 1) << "): %fp"
+ << unsigned(Bundle.FixStack[i - 1]) << '\n');
+ pushReg(Bundle.FixStack[i-1]);
+ }
+
+ // Kill off unwanted live-ins. This can happen with a critical edge.
+ // FIXME: We could keep these live registers around as zombies. They may need
+ // to be revived at the end of a short block. It might save a few instrs.
+ unsigned Mask = calcLiveInMask(MBB, /*RemoveFPs=*/true);
+ adjustLiveRegs(Mask, MBB->begin());
+ LLVM_DEBUG(MBB->dump());
+}
+
+/// finishBlockStack - Revive live-outs that are implicitly defined out of
+/// MBB. Shuffle live registers to match the expected fixed stack of any
+/// predecessors, and ensure that all predecessors are expecting the same
+/// stack.
+void FPS::finishBlockStack() {
+ // The RET handling below takes care of return blocks for us.
+ if (MBB->succ_empty())
+ return;
+
+ LLVM_DEBUG(dbgs() << "Setting up live-outs for " << printMBBReference(*MBB)
+ << " derived from " << MBB->getName() << ".\n");
+
+ // Get MBB's live-out bundle.
+ unsigned BundleIdx = Bundles->getBundle(MBB->getNumber(), true);
+ LiveBundle &Bundle = LiveBundles[BundleIdx];
+
+ // We may need to kill and define some registers to match successors.
+ // FIXME: This can probably be combined with the shuffle below.
+ MachineBasicBlock::iterator Term = MBB->getFirstTerminator();
+ adjustLiveRegs(Bundle.Mask, Term);
+
+ if (!Bundle.Mask) {
+ LLVM_DEBUG(dbgs() << "No live-outs.\n");
+ return;
+ }
+
+ // Has the stack order been fixed yet?
+ LLVM_DEBUG(dbgs() << "LB#" << BundleIdx << ": ");
+ if (Bundle.isFixed()) {
+ LLVM_DEBUG(dbgs() << "Shuffling stack to match.\n");
+ shuffleStackTop(Bundle.FixStack, Bundle.FixCount, Term);
+ } else {
+ // Not fixed yet, we get to choose.
+ LLVM_DEBUG(dbgs() << "Fixing stack order now.\n");
+ Bundle.FixCount = StackTop;
+ for (unsigned i = 0; i < StackTop; ++i)
+ Bundle.FixStack[i] = getStackEntry(i);
+ }
+}
+
+
+//===----------------------------------------------------------------------===//
+// Efficient Lookup Table Support
+//===----------------------------------------------------------------------===//
+
+namespace {
+ struct TableEntry {
+ uint16_t from;
+ uint16_t to;
+ bool operator<(const TableEntry &TE) const { return from < TE.from; }
+ friend bool operator<(const TableEntry &TE, unsigned V) {
+ return TE.from < V;
+ }
+ friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned V,
+ const TableEntry &TE) {
+ return V < TE.from;
+ }
+ };
+}
+
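+// Binary search a sorted table for Opcode; return its mapping, or -1 if the
+// opcode is not present.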
+static int Lookup(ArrayRef<TableEntry> Table, unsigned Opcode) {
+ const TableEntry *I = llvm::lower_bound(Table, Opcode);
+ if (I != Table.end() && I->from == Opcode)
+ return I->to;
+ return -1;
+}
+
+#ifdef NDEBUG
+#define ASSERT_SORTED(TABLE)
+#else
+#define ASSERT_SORTED(TABLE) \
+ { \
+ static std::atomic<bool> TABLE##Checked(false); \
+ if (!TABLE##Checked.load(std::memory_order_relaxed)) { \
+ assert(std::is_sorted(std::begin(TABLE), std::end(TABLE)) && \
+ "All lookup tables must be sorted for efficient access!"); \
+ TABLE##Checked.store(true, std::memory_order_relaxed); \
+ } \
+ }
+#endif
+
+//===----------------------------------------------------------------------===//
+// Register File -> Register Stack Mapping Methods
+//===----------------------------------------------------------------------===//
+
+// OpcodeTable - Sorted map of register instructions to their stack version.
+// The first element is a register file pseudo instruction, the second is the
+// concrete X86 instruction which uses the register stack.
+//
+static const TableEntry OpcodeTable[] = {
+ { X86::ABS_Fp32 , X86::ABS_F },
+ { X86::ABS_Fp64 , X86::ABS_F },
+ { X86::ABS_Fp80 , X86::ABS_F },
+ { X86::ADD_Fp32m , X86::ADD_F32m },
+ { X86::ADD_Fp64m , X86::ADD_F64m },
+ { X86::ADD_Fp64m32 , X86::ADD_F32m },
+ { X86::ADD_Fp80m32 , X86::ADD_F32m },
+ { X86::ADD_Fp80m64 , X86::ADD_F64m },
+ { X86::ADD_FpI16m32 , X86::ADD_FI16m },
+ { X86::ADD_FpI16m64 , X86::ADD_FI16m },
+ { X86::ADD_FpI16m80 , X86::ADD_FI16m },
+ { X86::ADD_FpI32m32 , X86::ADD_FI32m },
+ { X86::ADD_FpI32m64 , X86::ADD_FI32m },
+ { X86::ADD_FpI32m80 , X86::ADD_FI32m },
+ { X86::CHS_Fp32 , X86::CHS_F },
+ { X86::CHS_Fp64 , X86::CHS_F },
+ { X86::CHS_Fp80 , X86::CHS_F },
+ { X86::CMOVBE_Fp32 , X86::CMOVBE_F },
+ { X86::CMOVBE_Fp64 , X86::CMOVBE_F },
+ { X86::CMOVBE_Fp80 , X86::CMOVBE_F },
+ { X86::CMOVB_Fp32 , X86::CMOVB_F },
+ { X86::CMOVB_Fp64 , X86::CMOVB_F },
+ { X86::CMOVB_Fp80 , X86::CMOVB_F },
+ { X86::CMOVE_Fp32 , X86::CMOVE_F },
+ { X86::CMOVE_Fp64 , X86::CMOVE_F },
+ { X86::CMOVE_Fp80 , X86::CMOVE_F },
+ { X86::CMOVNBE_Fp32 , X86::CMOVNBE_F },
+ { X86::CMOVNBE_Fp64 , X86::CMOVNBE_F },
+ { X86::CMOVNBE_Fp80 , X86::CMOVNBE_F },
+ { X86::CMOVNB_Fp32 , X86::CMOVNB_F },
+ { X86::CMOVNB_Fp64 , X86::CMOVNB_F },
+ { X86::CMOVNB_Fp80 , X86::CMOVNB_F },
+ { X86::CMOVNE_Fp32 , X86::CMOVNE_F },
+ { X86::CMOVNE_Fp64 , X86::CMOVNE_F },
+ { X86::CMOVNE_Fp80 , X86::CMOVNE_F },
+ { X86::CMOVNP_Fp32 , X86::CMOVNP_F },
+ { X86::CMOVNP_Fp64 , X86::CMOVNP_F },
+ { X86::CMOVNP_Fp80 , X86::CMOVNP_F },
+ { X86::CMOVP_Fp32 , X86::CMOVP_F },
+ { X86::CMOVP_Fp64 , X86::CMOVP_F },
+ { X86::CMOVP_Fp80 , X86::CMOVP_F },
+ { X86::COM_FpIr32 , X86::COM_FIr },
+ { X86::COM_FpIr64 , X86::COM_FIr },
+ { X86::COM_FpIr80 , X86::COM_FIr },
+ { X86::COM_Fpr32 , X86::COM_FST0r },
+ { X86::COM_Fpr64 , X86::COM_FST0r },
+ { X86::COM_Fpr80 , X86::COM_FST0r },
+ { X86::DIVR_Fp32m , X86::DIVR_F32m },
+ { X86::DIVR_Fp64m , X86::DIVR_F64m },
+ { X86::DIVR_Fp64m32 , X86::DIVR_F32m },
+ { X86::DIVR_Fp80m32 , X86::DIVR_F32m },
+ { X86::DIVR_Fp80m64 , X86::DIVR_F64m },
+ { X86::DIVR_FpI16m32, X86::DIVR_FI16m},
+ { X86::DIVR_FpI16m64, X86::DIVR_FI16m},
+ { X86::DIVR_FpI16m80, X86::DIVR_FI16m},
+ { X86::DIVR_FpI32m32, X86::DIVR_FI32m},
+ { X86::DIVR_FpI32m64, X86::DIVR_FI32m},
+ { X86::DIVR_FpI32m80, X86::DIVR_FI32m},
+ { X86::DIV_Fp32m , X86::DIV_F32m },
+ { X86::DIV_Fp64m , X86::DIV_F64m },
+ { X86::DIV_Fp64m32 , X86::DIV_F32m },
+ { X86::DIV_Fp80m32 , X86::DIV_F32m },
+ { X86::DIV_Fp80m64 , X86::DIV_F64m },
+ { X86::DIV_FpI16m32 , X86::DIV_FI16m },
+ { X86::DIV_FpI16m64 , X86::DIV_FI16m },
+ { X86::DIV_FpI16m80 , X86::DIV_FI16m },
+ { X86::DIV_FpI32m32 , X86::DIV_FI32m },
+ { X86::DIV_FpI32m64 , X86::DIV_FI32m },
+ { X86::DIV_FpI32m80 , X86::DIV_FI32m },
+ { X86::ILD_Fp16m32 , X86::ILD_F16m },
+ { X86::ILD_Fp16m64 , X86::ILD_F16m },
+ { X86::ILD_Fp16m80 , X86::ILD_F16m },
+ { X86::ILD_Fp32m32 , X86::ILD_F32m },
+ { X86::ILD_Fp32m64 , X86::ILD_F32m },
+ { X86::ILD_Fp32m80 , X86::ILD_F32m },
+ { X86::ILD_Fp64m32 , X86::ILD_F64m },
+ { X86::ILD_Fp64m64 , X86::ILD_F64m },
+ { X86::ILD_Fp64m80 , X86::ILD_F64m },
+ { X86::ISTT_Fp16m32 , X86::ISTT_FP16m},
+ { X86::ISTT_Fp16m64 , X86::ISTT_FP16m},
+ { X86::ISTT_Fp16m80 , X86::ISTT_FP16m},
+ { X86::ISTT_Fp32m32 , X86::ISTT_FP32m},
+ { X86::ISTT_Fp32m64 , X86::ISTT_FP32m},
+ { X86::ISTT_Fp32m80 , X86::ISTT_FP32m},
+ { X86::ISTT_Fp64m32 , X86::ISTT_FP64m},
+ { X86::ISTT_Fp64m64 , X86::ISTT_FP64m},
+ { X86::ISTT_Fp64m80 , X86::ISTT_FP64m},
+ { X86::IST_Fp16m32 , X86::IST_F16m },
+ { X86::IST_Fp16m64 , X86::IST_F16m },
+ { X86::IST_Fp16m80 , X86::IST_F16m },
+ { X86::IST_Fp32m32 , X86::IST_F32m },
+ { X86::IST_Fp32m64 , X86::IST_F32m },
+ { X86::IST_Fp32m80 , X86::IST_F32m },
+ { X86::IST_Fp64m32 , X86::IST_FP64m },
+ { X86::IST_Fp64m64 , X86::IST_FP64m },
+ { X86::IST_Fp64m80 , X86::IST_FP64m },
+ { X86::LD_Fp032 , X86::LD_F0 },
+ { X86::LD_Fp064 , X86::LD_F0 },
+ { X86::LD_Fp080 , X86::LD_F0 },
+ { X86::LD_Fp132 , X86::LD_F1 },
+ { X86::LD_Fp164 , X86::LD_F1 },
+ { X86::LD_Fp180 , X86::LD_F1 },
+ { X86::LD_Fp32m , X86::LD_F32m },
+ { X86::LD_Fp32m64 , X86::LD_F32m },
+ { X86::LD_Fp32m80 , X86::LD_F32m },
+ { X86::LD_Fp64m , X86::LD_F64m },
+ { X86::LD_Fp64m80 , X86::LD_F64m },
+ { X86::LD_Fp80m , X86::LD_F80m },
+ { X86::MUL_Fp32m , X86::MUL_F32m },
+ { X86::MUL_Fp64m , X86::MUL_F64m },
+ { X86::MUL_Fp64m32 , X86::MUL_F32m },
+ { X86::MUL_Fp80m32 , X86::MUL_F32m },
+ { X86::MUL_Fp80m64 , X86::MUL_F64m },
+ { X86::MUL_FpI16m32 , X86::MUL_FI16m },
+ { X86::MUL_FpI16m64 , X86::MUL_FI16m },
+ { X86::MUL_FpI16m80 , X86::MUL_FI16m },
+ { X86::MUL_FpI32m32 , X86::MUL_FI32m },
+ { X86::MUL_FpI32m64 , X86::MUL_FI32m },
+ { X86::MUL_FpI32m80 , X86::MUL_FI32m },
+ { X86::SQRT_Fp32 , X86::SQRT_F },
+ { X86::SQRT_Fp64 , X86::SQRT_F },
+ { X86::SQRT_Fp80 , X86::SQRT_F },
+ { X86::ST_Fp32m , X86::ST_F32m },
+ { X86::ST_Fp64m , X86::ST_F64m },
+ { X86::ST_Fp64m32 , X86::ST_F32m },
+ { X86::ST_Fp80m32 , X86::ST_F32m },
+ { X86::ST_Fp80m64 , X86::ST_F64m },
+ { X86::ST_FpP80m , X86::ST_FP80m },
+ { X86::SUBR_Fp32m , X86::SUBR_F32m },
+ { X86::SUBR_Fp64m , X86::SUBR_F64m },
+ { X86::SUBR_Fp64m32 , X86::SUBR_F32m },
+ { X86::SUBR_Fp80m32 , X86::SUBR_F32m },
+ { X86::SUBR_Fp80m64 , X86::SUBR_F64m },
+ { X86::SUBR_FpI16m32, X86::SUBR_FI16m},
+ { X86::SUBR_FpI16m64, X86::SUBR_FI16m},
+ { X86::SUBR_FpI16m80, X86::SUBR_FI16m},
+ { X86::SUBR_FpI32m32, X86::SUBR_FI32m},
+ { X86::SUBR_FpI32m64, X86::SUBR_FI32m},
+ { X86::SUBR_FpI32m80, X86::SUBR_FI32m},
+ { X86::SUB_Fp32m , X86::SUB_F32m },
+ { X86::SUB_Fp64m , X86::SUB_F64m },
+ { X86::SUB_Fp64m32 , X86::SUB_F32m },
+ { X86::SUB_Fp80m32 , X86::SUB_F32m },
+ { X86::SUB_Fp80m64 , X86::SUB_F64m },
+ { X86::SUB_FpI16m32 , X86::SUB_FI16m },
+ { X86::SUB_FpI16m64 , X86::SUB_FI16m },
+ { X86::SUB_FpI16m80 , X86::SUB_FI16m },
+ { X86::SUB_FpI32m32 , X86::SUB_FI32m },
+ { X86::SUB_FpI32m64 , X86::SUB_FI32m },
+ { X86::SUB_FpI32m80 , X86::SUB_FI32m },
+ { X86::TST_Fp32 , X86::TST_F },
+ { X86::TST_Fp64 , X86::TST_F },
+ { X86::TST_Fp80 , X86::TST_F },
+ { X86::UCOM_FpIr32 , X86::UCOM_FIr },
+ { X86::UCOM_FpIr64 , X86::UCOM_FIr },
+ { X86::UCOM_FpIr80 , X86::UCOM_FIr },
+ { X86::UCOM_Fpr32 , X86::UCOM_Fr },
+ { X86::UCOM_Fpr64 , X86::UCOM_Fr },
+ { X86::UCOM_Fpr80 , X86::UCOM_Fr },
+};
+
+static unsigned getConcreteOpcode(unsigned Opcode) {
+ ASSERT_SORTED(OpcodeTable);
+ int Opc = Lookup(OpcodeTable, Opcode);
+ assert(Opc != -1 && "FP Stack instruction not in OpcodeTable!");
+ return Opc;
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Methods
+//===----------------------------------------------------------------------===//
+
+// PopTable - Sorted map of instructions to their popping version. The first
+// element is an instruction, the second is the version which pops.
+//
+static const TableEntry PopTable[] = {
+ { X86::ADD_FrST0 , X86::ADD_FPrST0 },
+
+ { X86::COMP_FST0r, X86::FCOMPP },
+ { X86::COM_FIr , X86::COM_FIPr },
+ { X86::COM_FST0r , X86::COMP_FST0r },
+
+ { X86::DIVR_FrST0, X86::DIVR_FPrST0 },
+ { X86::DIV_FrST0 , X86::DIV_FPrST0 },
+
+ { X86::IST_F16m , X86::IST_FP16m },
+ { X86::IST_F32m , X86::IST_FP32m },
+
+ { X86::MUL_FrST0 , X86::MUL_FPrST0 },
+
+ { X86::ST_F32m , X86::ST_FP32m },
+ { X86::ST_F64m , X86::ST_FP64m },
+ { X86::ST_Frr , X86::ST_FPrr },
+
+ { X86::SUBR_FrST0, X86::SUBR_FPrST0 },
+ { X86::SUB_FrST0 , X86::SUB_FPrST0 },
+
+ { X86::UCOM_FIr , X86::UCOM_FIPr },
+
+ { X86::UCOM_FPr , X86::UCOM_FPPr },
+ { X86::UCOM_Fr , X86::UCOM_FPr },
+};
+
+/// popStackAfter - Pop the current value off of the top of the FP stack after
+/// the specified instruction. This attempts to be sneaky and combine the pop
+/// into the instruction itself if possible. The iterator is left pointing to
+/// the last instruction, be it a new pop instruction inserted, or the old
+/// instruction if it was modified in place.
+///
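+/// For example, ST_F32m (a plain "fst" to memory) is rewritten to its popping
+/// form ST_FP32m ("fstp"); when no popping form exists, an explicit
+/// "fstp st(0)" is inserted after the instruction instead.
+///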
+void FPS::popStackAfter(MachineBasicBlock::iterator &I) {
+ MachineInstr &MI = *I;
+ const DebugLoc &dl = MI.getDebugLoc();
+ ASSERT_SORTED(PopTable);
+
+ popReg();
+
+ // Check to see if there is a popping version of this instruction...
+ int Opcode = Lookup(PopTable, I->getOpcode());
+ if (Opcode != -1) {
+ I->setDesc(TII->get(Opcode));
+ if (Opcode == X86::FCOMPP || Opcode == X86::UCOM_FPPr)
+ I->RemoveOperand(0);
+ } else { // Insert an explicit pop
+ I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(X86::ST0);
+ }
+}
+
+/// freeStackSlotAfter - Free the specified register from the register stack, so
+/// that it is no longer in a register. If the register is currently at the top
+/// of the stack, we just pop it after the current instruction; otherwise we
+/// store the current top-of-stack into the specified slot, then pop the top of
+/// the stack.
+void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) {
+ if (getStackEntry(0) == FPRegNo) { // already at the top of stack? easy.
+ popStackAfter(I);
+ return;
+ }
+
+ // Otherwise, store the top of stack into the dead slot, killing the operand
+ // without having to add in an explicit xchg then pop.
+ //
+ I = freeStackSlotBefore(++I, FPRegNo);
+}
+
+/// freeStackSlotBefore - Free the specified register without trying any
+/// folding.
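+/// This emits "fstp st(i)", which overwrites the slot being freed with the old
+/// top-of-stack value and pops the stack; the bookkeeping is updated so that
+/// the old top register now lives in the freed slot.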
+MachineBasicBlock::iterator
+FPS::freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo) {
+ unsigned STReg = getSTReg(FPRegNo);
+ unsigned OldSlot = getSlot(FPRegNo);
+ unsigned TopReg = Stack[StackTop-1];
+ Stack[OldSlot] = TopReg;
+ RegMap[TopReg] = OldSlot;
+ RegMap[FPRegNo] = ~0;
+ Stack[--StackTop] = ~0;
+ return BuildMI(*MBB, I, DebugLoc(), TII->get(X86::ST_FPrr))
+ .addReg(STReg)
+ .getInstr();
+}
+
+/// adjustLiveRegs - Kill and revive registers such that exactly the FP
+/// registers with a bit in Mask are live.
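+/// Live registers that are not in Mask are killed (popped or freed), and
+/// registers in Mask that are not currently live are defined by loading a zero
+/// with LD_F0. Where possible, a killed register's stack slot is simply
+/// renamed to a needed register, producing an implicit def for free.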
+void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
+ unsigned Defs = Mask;
+ unsigned Kills = 0;
+ for (unsigned i = 0; i < StackTop; ++i) {
+ unsigned RegNo = Stack[i];
+ if (!(Defs & (1 << RegNo)))
+ // This register is live, but we don't want it.
+ Kills |= (1 << RegNo);
+ else
+ // We don't need to imp-def this live register.
+ Defs &= ~(1 << RegNo);
+ }
+ assert((Kills & Defs) == 0 && "Register needs killing and def'ing?");
+
+ // Produce implicit-defs for free by using killed registers.
+ while (Kills && Defs) {
+ unsigned KReg = countTrailingZeros(Kills);
+ unsigned DReg = countTrailingZeros(Defs);
+ LLVM_DEBUG(dbgs() << "Renaming %fp" << KReg << " as imp %fp" << DReg
+ << "\n");
+ std::swap(Stack[getSlot(KReg)], Stack[getSlot(DReg)]);
+ std::swap(RegMap[KReg], RegMap[DReg]);
+ Kills &= ~(1 << KReg);
+ Defs &= ~(1 << DReg);
+ }
+
+ // Kill registers by popping.
+ if (Kills && I != MBB->begin()) {
+ MachineBasicBlock::iterator I2 = std::prev(I);
+ while (StackTop) {
+ unsigned KReg = getStackEntry(0);
+ if (!(Kills & (1 << KReg)))
+ break;
+ LLVM_DEBUG(dbgs() << "Popping %fp" << KReg << "\n");
+ popStackAfter(I2);
+ Kills &= ~(1 << KReg);
+ }
+ }
+
+ // Manually kill the rest.
+ while (Kills) {
+ unsigned KReg = countTrailingZeros(Kills);
+ LLVM_DEBUG(dbgs() << "Killing %fp" << KReg << "\n");
+ freeStackSlotBefore(I, KReg);
+ Kills &= ~(1 << KReg);
+ }
+
+ // Load zeros for all the imp-defs.
+ while(Defs) {
+ unsigned DReg = countTrailingZeros(Defs);
+ LLVM_DEBUG(dbgs() << "Defining %fp" << DReg << " as 0\n");
+ BuildMI(*MBB, I, DebugLoc(), TII->get(X86::LD_F0));
+ pushReg(DReg);
+ Defs &= ~(1 << DReg);
+ }
+
+ // Now we should have the correct registers live.
+ LLVM_DEBUG(dumpStack());
+ assert(StackTop == countPopulation(Mask) && "Live count mismatch");
+}
+
+/// shuffleStackTop - emit fxch instructions before I to shuffle the top
+/// FixCount entries into the order given by FixStack.
+/// FIXME: Is there a better algorithm than insertion sort?
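+///
+/// Each iteration places the register FixStack[FixCount] at depth FixCount
+/// using at most two fxch instructions: first move the desired register to the
+/// top, then move the old occupant of that depth to the top, which swaps the
+/// desired register down into its final slot.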
+void FPS::shuffleStackTop(const unsigned char *FixStack,
+ unsigned FixCount,
+ MachineBasicBlock::iterator I) {
+ // Move items into place, starting from the desired stack bottom.
+ while (FixCount--) {
+ // Old register at position FixCount.
+ unsigned OldReg = getStackEntry(FixCount);
+ // Desired register at position FixCount.
+ unsigned Reg = FixStack[FixCount];
+ if (Reg == OldReg)
+ continue;
+ // (Reg st0) (OldReg st0) = (Reg OldReg st0)
+ moveToTop(Reg, I);
+ if (FixCount > 0)
+ moveToTop(OldReg, I);
+ }
+ LLVM_DEBUG(dumpStack());
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instruction transformation implementation
+//===----------------------------------------------------------------------===//
+
+void FPS::handleCall(MachineBasicBlock::iterator &I) {
+ MachineInstr &MI = *I;
+ unsigned STReturns = 0;
+
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI.getOperand(i);
+ if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+ continue;
+
+ assert(Op.isImplicit() && "Expected implicit def/use");
+
+ if (Op.isDef())
+ STReturns |= 1 << getFPReg(Op);
+
+ // Remove the operand so that later passes don't see it.
+ MI.RemoveOperand(i);
+ --i;
+ --e;
+ }
+
+ unsigned N = countTrailingOnes(STReturns);
+
+ // FP registers used for function return must be consecutive starting at
+ // FP0
+ assert(STReturns == 0 || (isMask_32(STReturns) && N <= 2));
+
+ // Reset the FP Stack - It is required because of possible leftovers from
+ // passed arguments. The caller should assume that the FP stack is
+ // returned empty (unless the callee returns values on FP stack).
+ while (StackTop > 0)
+ popReg();
+
+ for (unsigned I = 0; I < N; ++I)
+ pushReg(N - I - 1);
+}
+
+/// If RET has an FP register use operand, pass the first one in ST(0) and
+/// the second one in ST(1).
+void FPS::handleReturn(MachineBasicBlock::iterator &I) {
+ MachineInstr &MI = *I;
+
+ // Find the register operands.
+ unsigned FirstFPRegOp = ~0U, SecondFPRegOp = ~0U;
+ unsigned LiveMask = 0;
+
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI.getOperand(i);
+ if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+ continue;
+ // FP Register uses must be kills unless there are two uses of the same
+ // register, in which case only one will be a kill.
+ assert(Op.isUse() &&
+ (Op.isKill() || // Marked kill.
+ getFPReg(Op) == FirstFPRegOp || // Second instance.
+ MI.killsRegister(Op.getReg())) && // Later use is marked kill.
+ "Ret only defs operands, and values aren't live beyond it");
+
+ if (FirstFPRegOp == ~0U)
+ FirstFPRegOp = getFPReg(Op);
+ else {
+ assert(SecondFPRegOp == ~0U && "More than two fp operands!");
+ SecondFPRegOp = getFPReg(Op);
+ }
+ LiveMask |= (1 << getFPReg(Op));
+
+ // Remove the operand so that later passes don't see it.
+ MI.RemoveOperand(i);
+ --i;
+ --e;
+ }
+
+ // We may have been carrying spurious live-ins, so make sure only the
+ // returned registers are left live.
+ adjustLiveRegs(LiveMask, MI);
+ if (!LiveMask) return; // Quick check to see if any are possible.
+
+ // There are only four possibilities here:
+ // 1) we are returning a single FP value. In this case, it has to be in
+ // ST(0) already, so just declare success by removing the value from the
+ // FP Stack.
+ if (SecondFPRegOp == ~0U) {
+ // Assert that the top of stack contains the right FP register.
+ assert(StackTop == 1 && FirstFPRegOp == getStackEntry(0) &&
+ "Top of stack not the right register for RET!");
+
+ // Ok, everything is good, mark the value as not being on the stack
+ // anymore so that our assertion about the stack being empty at end of
+ // block doesn't fire.
+ StackTop = 0;
+ return;
+ }
+
+ // Otherwise, we are returning two values:
+ // 2) If returning the same value for both, we only have one thing in the FP
+ // stack. Consider: RET FP1, FP1
+ if (StackTop == 1) {
+ assert(FirstFPRegOp == SecondFPRegOp && FirstFPRegOp == getStackEntry(0)&&
+ "Stack misconfiguration for RET!");
+
+ // Duplicate the TOS so that we return it twice. Just pick some other FPx
+ // register to hold it.
+ unsigned NewReg = ScratchFPReg;
+ duplicateToTop(FirstFPRegOp, NewReg, MI);
+ FirstFPRegOp = NewReg;
+ }
+
+ /// Okay we know we have two different FPx operands now:
+ assert(StackTop == 2 && "Must have two values live!");
+
+ /// 3) If SecondFPRegOp is currently in ST(0) and FirstFPRegOp is currently
+ /// in ST(1), emit an fxch to swap them.
+ if (getStackEntry(0) == SecondFPRegOp) {
+ assert(getStackEntry(1) == FirstFPRegOp && "Unknown regs live");
+ moveToTop(FirstFPRegOp, MI);
+ }
+
+ /// 4) Finally, FirstFPRegOp must be in ST(0) and SecondFPRegOp must be in
+ /// ST(1). Just remove both from our understanding of the stack and return.
+ assert(getStackEntry(0) == FirstFPRegOp && "Unknown regs live");
+ assert(getStackEntry(1) == SecondFPRegOp && "Unknown regs live");
+ StackTop = 0;
+}
+
+/// handleZeroArgFP - ST(0) = fld0 ST(0) = flds <mem>
+///
+void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
+ MachineInstr &MI = *I;
+ unsigned DestReg = getFPReg(MI.getOperand(0));
+
+ // Change from the pseudo instruction to the concrete instruction.
+ MI.RemoveOperand(0); // Remove the explicit ST(0) operand
+ MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
+ MI.addOperand(
+ MachineOperand::CreateReg(X86::ST0, /*isDef*/ true, /*isImp*/ true));
+
+ // Result gets pushed on the stack.
+ pushReg(DestReg);
+}
+
+/// handleOneArgFP - fst <mem>, ST(0)
+///
+void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
+ MachineInstr &MI = *I;
+ unsigned NumOps = MI.getDesc().getNumOperands();
+ assert((NumOps == X86::AddrNumOperands + 1 || NumOps == 1) &&
+ "Can only handle fst* & ftst instructions!");
+
+ // Is this the last use of the source register?
+ unsigned Reg = getFPReg(MI.getOperand(NumOps - 1));
+ bool KillsSrc = MI.killsRegister(X86::FP0 + Reg);
+
+ // FISTP64m is strange because there isn't a non-popping version.
+ // If we have one _and_ we don't want to pop the operand, duplicate the value
+ // on the stack instead of moving it. This ensures that popping the value is
+ // always OK.
+ // Ditto FISTTP16m, FISTTP32m, FISTTP64m, ST_FpP80m.
+ //
+ if (!KillsSrc && (MI.getOpcode() == X86::IST_Fp64m32 ||
+ MI.getOpcode() == X86::ISTT_Fp16m32 ||
+ MI.getOpcode() == X86::ISTT_Fp32m32 ||
+ MI.getOpcode() == X86::ISTT_Fp64m32 ||
+ MI.getOpcode() == X86::IST_Fp64m64 ||
+ MI.getOpcode() == X86::ISTT_Fp16m64 ||
+ MI.getOpcode() == X86::ISTT_Fp32m64 ||
+ MI.getOpcode() == X86::ISTT_Fp64m64 ||
+ MI.getOpcode() == X86::IST_Fp64m80 ||
+ MI.getOpcode() == X86::ISTT_Fp16m80 ||
+ MI.getOpcode() == X86::ISTT_Fp32m80 ||
+ MI.getOpcode() == X86::ISTT_Fp64m80 ||
+ MI.getOpcode() == X86::ST_FpP80m)) {
+ duplicateToTop(Reg, ScratchFPReg, I);
+ } else {
+ moveToTop(Reg, I); // Move to the top of the stack...
+ }
+
+ // Convert from the pseudo instruction to the concrete instruction.
+ MI.RemoveOperand(NumOps - 1); // Remove explicit ST(0) operand
+ MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
+ MI.addOperand(
+ MachineOperand::CreateReg(X86::ST0, /*isDef*/ false, /*isImp*/ true));
+
+ if (MI.getOpcode() == X86::IST_FP64m || MI.getOpcode() == X86::ISTT_FP16m ||
+ MI.getOpcode() == X86::ISTT_FP32m || MI.getOpcode() == X86::ISTT_FP64m ||
+ MI.getOpcode() == X86::ST_FP80m) {
+ if (StackTop == 0)
+ report_fatal_error("Stack empty??");
+ --StackTop;
+ } else if (KillsSrc) { // Last use of operand?
+ popStackAfter(I);
+ }
+}
+
+
+/// handleOneArgFPRW: Handle instructions that read from the top of stack and
+/// replace the value with a newly computed value. These instructions may have
+/// non-fp operands after their FP operands.
+///
+/// Examples:
+/// R1 = fchs R2
+/// R1 = fadd R2, [mem]
+///
+void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) {
+ MachineInstr &MI = *I;
+#ifndef NDEBUG
+ unsigned NumOps = MI.getDesc().getNumOperands();
+ assert(NumOps >= 2 && "FPRW instructions must have 2 ops!!");
+#endif
+
+ // Is this the last use of the source register?
+ unsigned Reg = getFPReg(MI.getOperand(1));
+ bool KillsSrc = MI.killsRegister(X86::FP0 + Reg);
+
+ if (KillsSrc) {
+ // If this is the last use of the source register, just make sure it's on
+ // the top of the stack.
+ moveToTop(Reg, I);
+ if (StackTop == 0)
+ report_fatal_error("Stack cannot be empty!");
+ --StackTop;
+ pushReg(getFPReg(MI.getOperand(0)));
+ } else {
+ // If this is not the last use of the source register, _copy_ it to the top
+ // of the stack.
+ duplicateToTop(Reg, getFPReg(MI.getOperand(0)), I);
+ }
+
+ // Change from the pseudo instruction to the concrete instruction.
+ MI.RemoveOperand(1); // Drop the source operand.
+ MI.RemoveOperand(0); // Drop the destination operand.
+ MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define tables of various ways to map pseudo instructions
+//
+
+// ForwardST0Table - Map: A = B op C into: ST(0) = ST(0) op ST(i)
+static const TableEntry ForwardST0Table[] = {
+ { X86::ADD_Fp32 , X86::ADD_FST0r },
+ { X86::ADD_Fp64 , X86::ADD_FST0r },
+ { X86::ADD_Fp80 , X86::ADD_FST0r },
+ { X86::DIV_Fp32 , X86::DIV_FST0r },
+ { X86::DIV_Fp64 , X86::DIV_FST0r },
+ { X86::DIV_Fp80 , X86::DIV_FST0r },
+ { X86::MUL_Fp32 , X86::MUL_FST0r },
+ { X86::MUL_Fp64 , X86::MUL_FST0r },
+ { X86::MUL_Fp80 , X86::MUL_FST0r },
+ { X86::SUB_Fp32 , X86::SUB_FST0r },
+ { X86::SUB_Fp64 , X86::SUB_FST0r },
+ { X86::SUB_Fp80 , X86::SUB_FST0r },
+};
+
+// ReverseST0Table - Map: A = B op C into: ST(0) = ST(i) op ST(0)
+static const TableEntry ReverseST0Table[] = {
+ { X86::ADD_Fp32 , X86::ADD_FST0r }, // commutative
+ { X86::ADD_Fp64 , X86::ADD_FST0r }, // commutative
+ { X86::ADD_Fp80 , X86::ADD_FST0r }, // commutative
+ { X86::DIV_Fp32 , X86::DIVR_FST0r },
+ { X86::DIV_Fp64 , X86::DIVR_FST0r },
+ { X86::DIV_Fp80 , X86::DIVR_FST0r },
+ { X86::MUL_Fp32 , X86::MUL_FST0r }, // commutative
+ { X86::MUL_Fp64 , X86::MUL_FST0r }, // commutative
+ { X86::MUL_Fp80 , X86::MUL_FST0r }, // commutative
+ { X86::SUB_Fp32 , X86::SUBR_FST0r },
+ { X86::SUB_Fp64 , X86::SUBR_FST0r },
+ { X86::SUB_Fp80 , X86::SUBR_FST0r },
+};
+
+// ForwardSTiTable - Map: A = B op C into: ST(i) = ST(0) op ST(i)
+static const TableEntry ForwardSTiTable[] = {
+ { X86::ADD_Fp32 , X86::ADD_FrST0 }, // commutative
+ { X86::ADD_Fp64 , X86::ADD_FrST0 }, // commutative
+ { X86::ADD_Fp80 , X86::ADD_FrST0 }, // commutative
+ { X86::DIV_Fp32 , X86::DIVR_FrST0 },
+ { X86::DIV_Fp64 , X86::DIVR_FrST0 },
+ { X86::DIV_Fp80 , X86::DIVR_FrST0 },
+ { X86::MUL_Fp32 , X86::MUL_FrST0 }, // commutative
+ { X86::MUL_Fp64 , X86::MUL_FrST0 }, // commutative
+ { X86::MUL_Fp80 , X86::MUL_FrST0 }, // commutative
+ { X86::SUB_Fp32 , X86::SUBR_FrST0 },
+ { X86::SUB_Fp64 , X86::SUBR_FrST0 },
+ { X86::SUB_Fp80 , X86::SUBR_FrST0 },
+};
+
+// ReverseSTiTable - Map: A = B op C into: ST(i) = ST(i) op ST(0)
+static const TableEntry ReverseSTiTable[] = {
+ { X86::ADD_Fp32 , X86::ADD_FrST0 },
+ { X86::ADD_Fp64 , X86::ADD_FrST0 },
+ { X86::ADD_Fp80 , X86::ADD_FrST0 },
+ { X86::DIV_Fp32 , X86::DIV_FrST0 },
+ { X86::DIV_Fp64 , X86::DIV_FrST0 },
+ { X86::DIV_Fp80 , X86::DIV_FrST0 },
+ { X86::MUL_Fp32 , X86::MUL_FrST0 },
+ { X86::MUL_Fp64 , X86::MUL_FrST0 },
+ { X86::MUL_Fp80 , X86::MUL_FrST0 },
+ { X86::SUB_Fp32 , X86::SUB_FrST0 },
+ { X86::SUB_Fp64 , X86::SUB_FrST0 },
+ { X86::SUB_Fp80 , X86::SUB_FrST0 },
+};
+
+
+/// handleTwoArgFP - Handle instructions like FADD and friends which are virtual
+/// instructions which need to be simplified and possibly transformed.
+///
+/// Result: ST(0) = fsub ST(0), ST(i)
+/// ST(i) = fsub ST(0), ST(i)
+/// ST(0) = fsubr ST(0), ST(i)
+/// ST(i) = fsubr ST(0), ST(i)
+///
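+/// The table used below is selected by two properties: whether the operand on
+/// the top of the stack is the first or the second source (forward vs. reverse
+/// form), and whether the result overwrites ST(0) or ST(i).
+///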
+void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
+ ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+ ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+ MachineInstr &MI = *I;
+
+ unsigned NumOperands = MI.getDesc().getNumOperands();
+ assert(NumOperands == 3 && "Illegal TwoArgFP instruction!");
+ unsigned Dest = getFPReg(MI.getOperand(0));
+ unsigned Op0 = getFPReg(MI.getOperand(NumOperands - 2));
+ unsigned Op1 = getFPReg(MI.getOperand(NumOperands - 1));
+ bool KillsOp0 = MI.killsRegister(X86::FP0 + Op0);
+ bool KillsOp1 = MI.killsRegister(X86::FP0 + Op1);
+ DebugLoc dl = MI.getDebugLoc();
+
+ unsigned TOS = getStackEntry(0);
+
+ // One of our operands must be on the top of the stack. If neither is yet, we
+ // need to move one.
+ if (Op0 != TOS && Op1 != TOS) { // No operand at TOS?
+ // We can choose to move either operand to the top of the stack. If one of
+ // the operands is killed by this instruction, we want that one so that we
+ // can update right on top of the old version.
+ if (KillsOp0) {
+ moveToTop(Op0, I); // Move dead operand to TOS.
+ TOS = Op0;
+ } else if (KillsOp1) {
+ moveToTop(Op1, I);
+ TOS = Op1;
+ } else {
+ // All of the operands are live after this instruction executes, so we
+ // cannot update on top of any operand. Because of this, we must
+ // duplicate one of the stack elements to the top. It doesn't matter
+ // which one we pick.
+ //
+ duplicateToTop(Op0, Dest, I);
+ Op0 = TOS = Dest;
+ KillsOp0 = true;
+ }
+ } else if (!KillsOp0 && !KillsOp1) {
+ // If we DO have one of our operands at the top of the stack, but we don't
+ // have a dead operand, we must duplicate one of the operands to a new slot
+ // on the stack.
+ duplicateToTop(Op0, Dest, I);
+ Op0 = TOS = Dest;
+ KillsOp0 = true;
+ }
+
+ // Now we know that one of our operands is on the top of the stack, and at
+ // least one of our operands is killed by this instruction.
+ assert((TOS == Op0 || TOS == Op1) && (KillsOp0 || KillsOp1) &&
+ "Stack conditions not set up right!");
+
+ // We decide which form to use based on what is on the top of the stack, and
+ // which operand is killed by this instruction.
+ ArrayRef<TableEntry> InstTable;
+ bool isForward = TOS == Op0;
+ bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0);
+ if (updateST0) {
+ if (isForward)
+ InstTable = ForwardST0Table;
+ else
+ InstTable = ReverseST0Table;
+ } else {
+ if (isForward)
+ InstTable = ForwardSTiTable;
+ else
+ InstTable = ReverseSTiTable;
+ }
+
+ int Opcode = Lookup(InstTable, MI.getOpcode());
+ assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!");
+
+ // NotTOS - The register which is not on the top of stack...
+ unsigned NotTOS = (TOS == Op0) ? Op1 : Op0;
+
+ // Replace the old instruction with a new instruction
+ MBB->remove(&*I++);
+ I = BuildMI(*MBB, I, dl, TII->get(Opcode)).addReg(getSTReg(NotTOS));
+
+ if (!MI.mayRaiseFPException())
+ I->setFlag(MachineInstr::MIFlag::NoFPExcept);
+
+ // If both operands are killed, pop one off of the stack in addition to
+ // overwriting the other one.
+ if (KillsOp0 && KillsOp1 && Op0 != Op1) {
+ assert(!updateST0 && "Should have updated other operand!");
+ popStackAfter(I); // Pop the top of stack
+ }
+
+ // Update stack information so that we know the destination register is now on
+ // the stack.
+ unsigned UpdatedSlot = getSlot(updateST0 ? TOS : NotTOS);
+ assert(UpdatedSlot < StackTop && Dest < 7);
+ Stack[UpdatedSlot] = Dest;
+ RegMap[Dest] = UpdatedSlot;
+ MBB->getParent()->DeleteMachineInstr(&MI); // Remove the old instruction
+}
+
+/// handleCompareFP - Handle FUCOM and FUCOMI instructions, which have two FP
+/// register arguments and no explicit destinations.
+///
+void FPS::handleCompareFP(MachineBasicBlock::iterator &I) {
+ MachineInstr &MI = *I;
+
+ unsigned NumOperands = MI.getDesc().getNumOperands();
+ assert(NumOperands == 2 && "Illegal FUCOM* instruction!");
+ unsigned Op0 = getFPReg(MI.getOperand(NumOperands - 2));
+ unsigned Op1 = getFPReg(MI.getOperand(NumOperands - 1));
+ bool KillsOp0 = MI.killsRegister(X86::FP0 + Op0);
+ bool KillsOp1 = MI.killsRegister(X86::FP0 + Op1);
+
+ // Make sure the first operand is on the top of stack, the other one can be
+ // anywhere.
+ moveToTop(Op0, I);
+
+ // Change from the pseudo instruction to the concrete instruction.
+ MI.getOperand(0).setReg(getSTReg(Op1));
+ MI.RemoveOperand(1);
+ MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
+
+ // If any of the operands are killed by this instruction, free them.
+ if (KillsOp0) freeStackSlotAfter(I, Op0);
+ if (KillsOp1 && Op0 != Op1) freeStackSlotAfter(I, Op1);
+}
+
+/// handleCondMovFP - Handle two address conditional move instructions. These
+/// instructions move a st(i) register to st(0) iff a condition is true. These
+/// instructions require that the first operand is at the top of the stack, but
+/// otherwise don't modify the stack at all.
+void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) {
+ MachineInstr &MI = *I;
+
+ unsigned Op0 = getFPReg(MI.getOperand(0));
+ unsigned Op1 = getFPReg(MI.getOperand(2));
+ bool KillsOp1 = MI.killsRegister(X86::FP0 + Op1);
+
+ // The first operand *must* be on the top of the stack.
+ moveToTop(Op0, I);
+
+ // Change the second operand to the stack register that the operand is in.
+ // Change from the pseudo instruction to the concrete instruction.
+ MI.RemoveOperand(0);
+ MI.RemoveOperand(1);
+ MI.getOperand(0).setReg(getSTReg(Op1));
+ MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
+
+ // If we kill the second operand, make sure to pop it from the stack.
+ if (Op0 != Op1 && KillsOp1) {
+ // Get this value off of the register stack.
+ freeStackSlotAfter(I, Op1);
+ }
+}
+
+
+/// handleSpecialFP - Handle special instructions which behave unlike other
+/// floating point instructions. This is primarily intended for use by pseudo
+/// instructions.
+///
+void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
+ MachineInstr &MI = *Inst;
+
+ if (MI.isCall()) {
+ handleCall(Inst);
+ return;
+ }
+
+ if (MI.isReturn()) {
+ handleReturn(Inst);
+ return;
+ }
+
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unknown SpecialFP instruction!");
+ case TargetOpcode::COPY: {
+ // We handle three kinds of copies: FP <- FP, FP <- ST, and ST <- FP.
+ const MachineOperand &MO1 = MI.getOperand(1);
+ const MachineOperand &MO0 = MI.getOperand(0);
+ bool KillsSrc = MI.killsRegister(MO1.getReg());
+
+ // FP <- FP copy.
+ unsigned DstFP = getFPReg(MO0);
+ unsigned SrcFP = getFPReg(MO1);
+ assert(isLive(SrcFP) && "Cannot copy dead register");
+ if (KillsSrc) {
+ // If the input operand is killed, we can just change the owner of the
+ // incoming stack slot into the result.
+ unsigned Slot = getSlot(SrcFP);
+ Stack[Slot] = DstFP;
+ RegMap[DstFP] = Slot;
+ } else {
+ // For COPY we just duplicate the specified value to a new stack slot.
+ // This could be made better, but would require substantial changes.
+ duplicateToTop(SrcFP, DstFP, Inst);
+ }
+ break;
+ }
+
+ case TargetOpcode::IMPLICIT_DEF: {
+ // All FP registers must be explicitly defined, so load a 0 instead.
+ unsigned Reg = MI.getOperand(0).getReg() - X86::FP0;
+ LLVM_DEBUG(dbgs() << "Emitting LD_F0 for implicit FP" << Reg << '\n');
+ BuildMI(*MBB, Inst, MI.getDebugLoc(), TII->get(X86::LD_F0));
+ pushReg(Reg);
+ break;
+ }
+
+ case TargetOpcode::INLINEASM:
+ case TargetOpcode::INLINEASM_BR: {
+ // The inline asm MachineInstr currently only *uses* FP registers for the
+ // 'f' constraint. These should be turned into the current ST(x) register
+ // in the machine instr.
+ //
+ // There are special rules for x87 inline assembly. The compiler must know
+ // exactly how many registers are popped and pushed implicitly by the asm.
+ // Otherwise it is not possible to restore the stack state after the inline
+ // asm.
+ //
+ // There are 3 kinds of input operands:
+ //
+ // 1. Popped inputs. These must appear at the stack top in ST0-STn. A
+ // popped input operand must be in a fixed stack slot, and it is either
+ // tied to an output operand, or in the clobber list. The MI has ST use
+ // and def operands for these inputs.
+ //
+ // 2. Fixed inputs. These inputs appear in fixed stack slots, but are
+ // preserved by the inline asm. The fixed stack slots must be STn-STm
+ // following the popped inputs. A fixed input operand cannot be tied to
+ // an output or appear in the clobber list. The MI has ST use operands
+ // and no defs for these inputs.
+ //
+ // 3. Preserved inputs. These inputs use the "f" constraint which is
+ // represented as an FP register. The inline asm won't change these
+ // stack slots.
+ //
+ // Outputs must be in ST registers, FP outputs are not allowed. Clobbered
+ // registers do not count as output operands. The inline asm changes the
+ // stack as if it popped all the popped inputs and then pushed all the
+ // output operands.
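+ //
+ // For example, asm("fsincos" : "=t"(c), "=u"(s) : "0"(x)) has one popped
+ // input (x, tied to the output in ST(0)) and two outputs in ST(0) and ST(1),
+ // so its net effect on the stack is to pop one value and push two.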
+
+ // Scan the assembly for ST registers used, defined and clobbered. We can
+ // only tell clobbers from defs by looking at the asm descriptor.
+ unsigned STUses = 0, STDefs = 0, STClobbers = 0, STDeadDefs = 0;
+ unsigned NumOps = 0;
+ SmallSet<unsigned, 1> FRegIdx;
+ unsigned RCID;
+
+ for (unsigned i = InlineAsm::MIOp_FirstOperand, e = MI.getNumOperands();
+ i != e && MI.getOperand(i).isImm(); i += 1 + NumOps) {
+ unsigned Flags = MI.getOperand(i).getImm();
+
+ NumOps = InlineAsm::getNumOperandRegisters(Flags);
+ if (NumOps != 1)
+ continue;
+ const MachineOperand &MO = MI.getOperand(i + 1);
+ if (!MO.isReg())
+ continue;
+ unsigned STReg = MO.getReg() - X86::FP0;
+ if (STReg >= 8)
+ continue;
+
+ // If the flag has a register class constraint, this must be an operand
+ // with constraint "f". Record its index and continue.
+ if (InlineAsm::hasRegClassConstraint(Flags, RCID)) {
+ FRegIdx.insert(i + 1);
+ continue;
+ }
+
+ switch (InlineAsm::getKind(Flags)) {
+ case InlineAsm::Kind_RegUse:
+ STUses |= (1u << STReg);
+ break;
+ case InlineAsm::Kind_RegDef:
+ case InlineAsm::Kind_RegDefEarlyClobber:
+ STDefs |= (1u << STReg);
+ if (MO.isDead())
+ STDeadDefs |= (1u << STReg);
+ break;
+ case InlineAsm::Kind_Clobber:
+ STClobbers |= (1u << STReg);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (STUses && !isMask_32(STUses))
+ MI.emitError("fixed input regs must be last on the x87 stack");
+ unsigned NumSTUses = countTrailingOnes(STUses);
+
+ // Defs must be contiguous from the stack top. ST0-STn.
+ if (STDefs && !isMask_32(STDefs)) {
+ MI.emitError("output regs must be last on the x87 stack");
+ STDefs = NextPowerOf2(STDefs) - 1;
+ }
+ unsigned NumSTDefs = countTrailingOnes(STDefs);
+
+ // So must the clobbered stack slots. ST0-STm, m >= n.
+ if (STClobbers && !isMask_32(STDefs | STClobbers))
+ MI.emitError("clobbers must be last on the x87 stack");
+
+ // Popped inputs are the ones that are also clobbered or defined.
+ unsigned STPopped = STUses & (STDefs | STClobbers);
+ if (STPopped && !isMask_32(STPopped))
+ MI.emitError("implicitly popped regs must be last on the x87 stack");
+ unsigned NumSTPopped = countTrailingOnes(STPopped);
+
+ LLVM_DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops "
+ << NumSTPopped << ", and defines " << NumSTDefs
+ << " regs.\n");
+
+#ifndef NDEBUG
+ // If any input operand uses constraint "f", all output register
+ // constraints must be early-clobber defs.
+ for (unsigned I = 0, E = MI.getNumOperands(); I < E; ++I)
+ if (FRegIdx.count(I)) {
+ assert((1 << getFPReg(MI.getOperand(I)) & STDefs) == 0 &&
+ "Operands with constraint \"f\" cannot overlap with defs");
+ }
+#endif
+
+ // Collect all FP registers (register operands with constraints "t", "u",
+ // and "f") to kill afer the instruction.
+ unsigned FPKills = ((1u << NumFPRegs) - 1) & ~0xff;
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI.getOperand(i);
+ if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+ continue;
+ unsigned FPReg = getFPReg(Op);
+
+ // If we kill this operand, make sure to pop it from the stack after the
+ // asm. We just remember it for now, and pop them all off at the end in
+ // a batch.
+ if (Op.isUse() && Op.isKill())
+ FPKills |= 1U << FPReg;
+ }
+
+ // Do not include registers that are implicitly popped by defs/clobbers.
+ FPKills &= ~(STDefs | STClobbers);
+
+ // Now we can rearrange the live registers to match what was requested.
+ unsigned char STUsesArray[8];
+
+ for (unsigned I = 0; I < NumSTUses; ++I)
+ STUsesArray[I] = I;
+
+ shuffleStackTop(STUsesArray, NumSTUses, Inst);
+ LLVM_DEBUG({
+ dbgs() << "Before asm: ";
+ dumpStack();
+ });
+
+ // With the stack layout fixed, rewrite the FP registers.
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI.getOperand(i);
+ if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+ continue;
+
+ unsigned FPReg = getFPReg(Op);
+
+ if (FRegIdx.count(i))
+ // Operand with constraint "f".
+ Op.setReg(getSTReg(FPReg));
+ else
+ // Operand with a single register class constraint ("t" or "u").
+ Op.setReg(X86::ST0 + FPReg);
+ }
+
+ // Simulate the inline asm popping its inputs and pushing its outputs.
+ StackTop -= NumSTPopped;
+
+ for (unsigned i = 0; i < NumSTDefs; ++i)
+ pushReg(NumSTDefs - i - 1);
+
+ // If this asm kills any FP registers (is the last use of them) we must
+ // explicitly emit pop instructions for them. Do this now after the asm has
+ // executed so that the ST(x) numbers are not off (which would happen if we
+ // did this inline with operand rewriting).
+ //
+ // Note: this might be a non-optimal pop sequence. We might be able to do
+ // better by trying to pop in stack order or something.
+ while (FPKills) {
+ unsigned FPReg = countTrailingZeros(FPKills);
+ if (isLive(FPReg))
+ freeStackSlotAfter(Inst, FPReg);
+ FPKills &= ~(1U << FPReg);
+ }
+
+ // Don't delete the inline asm!
+ return;
+ }
+ }
+
+ Inst = MBB->erase(Inst); // Remove the pseudo instruction
+
+ // We want to leave Inst pointing to the previous instruction, but what if we
+ // just erased the first instruction?
+ if (Inst == MBB->begin()) {
+ LLVM_DEBUG(dbgs() << "Inserting dummy KILL\n");
+ Inst = BuildMI(*MBB, Inst, DebugLoc(), TII->get(TargetOpcode::KILL));
+ } else
+ --Inst;
+}
+
+void FPS::setKillFlags(MachineBasicBlock &MBB) const {
+ const TargetRegisterInfo &TRI =
+ *MBB.getParent()->getSubtarget().getRegisterInfo();
+ LivePhysRegs LPR(TRI);
+
+ LPR.addLiveOuts(MBB);
+
+ for (MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
+ I != E; ++I) {
+ if (I->isDebugInstr())
+ continue;
+
+ std::bitset<8> Defs;
+ SmallVector<MachineOperand *, 2> Uses;
+ MachineInstr &MI = *I;
+
+ for (auto &MO : I->operands()) {
+ if (!MO.isReg())
+ continue;
+
+ unsigned Reg = MO.getReg() - X86::FP0;
+
+ if (Reg >= 8)
+ continue;
+
+ if (MO.isDef()) {
+ Defs.set(Reg);
+ if (!LPR.contains(MO.getReg()))
+ MO.setIsDead();
+ } else
+ Uses.push_back(&MO);
+ }
+
+ for (auto *MO : Uses)
+ if (Defs.test(getFPReg(*MO)) || !LPR.contains(MO->getReg()))
+ MO->setIsKill();
+
+ LPR.stepBackward(MI);
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp
new file mode 100644
index 000000000000..866f11364004
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -0,0 +1,3597 @@
+//===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86FrameLowering.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetOptions.h"
+#include <cstdlib>
+
+#define DEBUG_TYPE "x86-fl"
+
+STATISTIC(NumFrameLoopProbe, "Number of loop stack probes used in prologue");
+STATISTIC(NumFrameExtraProbe,
+ "Number of extra stack probes generated in prologue");
+
+using namespace llvm;
+
+X86FrameLowering::X86FrameLowering(const X86Subtarget &STI,
+ MaybeAlign StackAlignOverride)
+ : TargetFrameLowering(StackGrowsDown, StackAlignOverride.valueOrOne(),
+ STI.is64Bit() ? -8 : -4),
+ STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) {
+ // Cache a bunch of frame-related predicates for this subtarget.
+ SlotSize = TRI->getSlotSize();
+ Is64Bit = STI.is64Bit();
+ IsLP64 = STI.isTarget64BitLP64();
+ // Standard x86_64 and NaCl use 64-bit frame/stack pointers; x32 uses 32-bit ones.
+ Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
+ StackPtr = TRI->getStackRegister();
+}
+
+bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+ return !MF.getFrameInfo().hasVarSizedObjects() &&
+ !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences() &&
+ !MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall();
+}
+
+/// canSimplifyCallFramePseudos - If there is a reserved call frame, the
+/// call frame pseudos can be simplified. Having a FP, as in the default
+/// implementation, is not sufficient here since we can't always use it.
+/// Use a more nuanced condition.
+bool
+X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
+ return hasReservedCallFrame(MF) ||
+ MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() ||
+ (hasFP(MF) && !TRI->needsStackRealignment(MF)) ||
+ TRI->hasBasePointer(MF);
+}
+
+// needsFrameIndexResolution - Do we need to perform FI resolution for
+// this function? Normally, this is required only when the function
+// has any stack objects. However, FI resolution actually has another job,
+// not apparent from the name - it resolves call frame setup/destroy
+// pseudos that were not simplified earlier.
+// So, this is required for x86 functions that have push sequences even
+// when there are no stack objects.
+bool
+X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const {
+ return MF.getFrameInfo().hasStackObjects() ||
+ MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
+}
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register. This is true if the function has variable sized allocas
+/// or if frame pointer elimination is disabled.
+bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
+ TRI->needsStackRealignment(MF) || MFI.hasVarSizedObjects() ||
+ MFI.isFrameAddressTaken() || MFI.hasOpaqueSPAdjustment() ||
+ MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
+ MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() ||
+ MF.callsUnwindInit() || MF.hasEHFunclets() || MF.callsEHReturn() ||
+ MFI.hasStackMap() || MFI.hasPatchPoint() ||
+ MFI.hasCopyImplyingStackAdjustment());
+}
+
+static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
+ if (IsLP64) {
+ if (isInt<8>(Imm))
+ return X86::SUB64ri8;
+ return X86::SUB64ri32;
+ } else {
+ if (isInt<8>(Imm))
+ return X86::SUB32ri8;
+ return X86::SUB32ri;
+ }
+}
+
+static unsigned getADDriOpcode(bool IsLP64, int64_t Imm) {
+ if (IsLP64) {
+ if (isInt<8>(Imm))
+ return X86::ADD64ri8;
+ return X86::ADD64ri32;
+ } else {
+ if (isInt<8>(Imm))
+ return X86::ADD32ri8;
+ return X86::ADD32ri;
+ }
+}
+
+static unsigned getSUBrrOpcode(bool IsLP64) {
+ return IsLP64 ? X86::SUB64rr : X86::SUB32rr;
+}
+
+static unsigned getADDrrOpcode(bool IsLP64) {
+ return IsLP64 ? X86::ADD64rr : X86::ADD32rr;
+}
+
+static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) {
+ if (IsLP64) {
+ if (isInt<8>(Imm))
+ return X86::AND64ri8;
+ return X86::AND64ri32;
+ }
+ if (isInt<8>(Imm))
+ return X86::AND32ri8;
+ return X86::AND32ri;
+}
+
+static unsigned getLEArOpcode(bool IsLP64) {
+ return IsLP64 ? X86::LEA64r : X86::LEA32r;
+}
+
+static bool isEAXLiveIn(MachineBasicBlock &MBB) {
+ for (MachineBasicBlock::RegisterMaskPair RegMask : MBB.liveins()) {
+ unsigned Reg = RegMask.PhysReg;
+
+ if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX ||
+ Reg == X86::AH || Reg == X86::AL)
+ return true;
+ }
+
+ return false;
+}
+
+/// Check if the flags need to be preserved before the terminators.
+/// This is the case if EFLAGS is live-in to the region composed of the
+/// terminators, or live-out of that region without being defined by a
+/// terminator.
+static bool
+flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {
+ for (const MachineInstr &MI : MBB.terminators()) {
+ bool BreakNext = false;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (Reg != X86::EFLAGS)
+ continue;
+
+ // This terminator needs an eflags value that is not defined
+ // by a previous terminator:
+ // EFLAGS is live-in of the region composed by the terminators.
+ if (!MO.isDef())
+ return true;
+ // This terminator defines the eflags, i.e., we don't need to preserve it.
+ // However, we still need to check this specific terminator does not
+ // read a live-in value.
+ BreakNext = true;
+ }
+ // We found a definition of the eflags, no need to preserve them.
+ if (BreakNext)
+ return false;
+ }
+
+ // None of the terminators use or define the eflags.
+ // Check if they are live-out, that would imply we need to preserve them.
+ for (const MachineBasicBlock *Succ : MBB.successors())
+ if (Succ->isLiveIn(X86::EFLAGS))
+ return true;
+
+ return false;
+}
+
+/// emitSPUpdate - Emit a series of instructions to increment / decrement the
+/// stack pointer by a constant value.
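+/// Large adjustments are split into chunks of at most 2GB - 1 bytes;
+/// slot-sized adjustments may use a push or pop as a size optimization, and
+/// very large offsets may first be materialized in a scratch register.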
+void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &DL,
+ int64_t NumBytes, bool InEpilogue) const {
+ bool isSub = NumBytes < 0;
+ uint64_t Offset = isSub ? -NumBytes : NumBytes;
+ MachineInstr::MIFlag Flag =
+ isSub ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy;
+
+ uint64_t Chunk = (1LL << 31) - 1;
+
+ MachineFunction &MF = *MBB.getParent();
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ const X86TargetLowering &TLI = *STI.getTargetLowering();
+ const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF);
+
+ // It's OK not to take large chunks into account when probing, as the
+ // allocation is split into smaller chunks anyway.
+ if (EmitInlineStackProbe && !InEpilogue) {
+
+ // This pseudo-instruction is going to be expanded, potentially using a
+ // loop, by inlineStackProbe().
+ BuildMI(MBB, MBBI, DL, TII.get(X86::STACKALLOC_W_PROBING)).addImm(Offset);
+ return;
+ } else if (Offset > Chunk) {
+ // Rather than emit a long series of instructions for large offsets,
+ // load the offset into a register and do one sub/add
+ unsigned Reg = 0;
+ unsigned Rax = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);
+
+ if (isSub && !isEAXLiveIn(MBB))
+ Reg = Rax;
+ else
+ Reg = TRI->findDeadCallerSavedReg(MBB, MBBI);
+
+ unsigned MovRIOpc = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
+ unsigned AddSubRROpc =
+ isSub ? getSUBrrOpcode(Is64Bit) : getADDrrOpcode(Is64Bit);
+ if (Reg) {
+ BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Reg)
+ .addImm(Offset)
+ .setMIFlag(Flag);
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AddSubRROpc), StackPtr)
+ .addReg(StackPtr)
+ .addReg(Reg);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+ return;
+ } else if (Offset > 8 * Chunk) {
+ // If we would need more than 8 add or sub instructions (a >16GB stack
+ // frame), it's worth spilling RAX to materialize this immediate.
+ // pushq %rax
+ // movabsq +-$Offset+-SlotSize, %rax
+ // addq %rsp, %rax
+ // xchg %rax, (%rsp)
+ // movq (%rsp), %rsp
+ assert(Is64Bit && "can't have 32-bit 16GB stack frame");
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
+ .addReg(Rax, RegState::Kill)
+ .setMIFlag(Flag);
+ // Subtract is not commutative, so negate the offset and always use add.
+ // Subtract 8 less and add 8 more to account for the PUSH we just did.
+ if (isSub)
+ Offset = -(Offset - SlotSize);
+ else
+ Offset = Offset + SlotSize;
+ BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Rax)
+ .addImm(Offset)
+ .setMIFlag(Flag);
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(X86::ADD64rr), Rax)
+ .addReg(Rax)
+ .addReg(StackPtr);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+ // Exchange the new SP in RAX with the top of the stack.
+ addRegOffset(
+ BuildMI(MBB, MBBI, DL, TII.get(X86::XCHG64rm), Rax).addReg(Rax),
+ StackPtr, false, 0);
+ // Load new SP from the top of the stack into RSP.
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), StackPtr),
+ StackPtr, false, 0);
+ return;
+ }
+ }
+
+ while (Offset) {
+ uint64_t ThisVal = std::min(Offset, Chunk);
+ if (ThisVal == SlotSize) {
+ // Use push / pop for slot sized adjustments as a size optimization. We
+ // need to find a dead register when using pop.
+ unsigned Reg = isSub
+ ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
+ : TRI->findDeadCallerSavedReg(MBB, MBBI);
+ if (Reg) {
+ unsigned Opc = isSub
+ ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
+ : (Is64Bit ? X86::POP64r : X86::POP32r);
+ BuildMI(MBB, MBBI, DL, TII.get(Opc))
+ .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub))
+ .setMIFlag(Flag);
+ Offset -= ThisVal;
+ continue;
+ }
+ }
+
+ BuildStackAdjustment(MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue)
+ .setMIFlag(Flag);
+
+ Offset -= ThisVal;
+ }
+}
+
+MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, int64_t Offset, bool InEpilogue) const {
+ assert(Offset != 0 && "zero offset stack adjustment requested");
+
+ // On Atom, using LEA to adjust SP is preferred, but using it in the epilogue
+ // is tricky.
+ bool UseLEA;
+ if (!InEpilogue) {
+ // Check if inserting the prologue at the beginning
+ // of MBB would require to use LEA operations.
+ // We need to use LEA operations if EFLAGS is live in, because
+ // it means an instruction will read it before it gets defined.
+ UseLEA = STI.useLeaForSP() || MBB.isLiveIn(X86::EFLAGS);
+ } else {
+ // If we can use LEA for SP but we shouldn't, check that none
+ // of the terminators uses the eflags. Otherwise we will insert
+ // an ADD that will redefine the eflags and break the condition.
+ // Alternatively, we could move the ADD, but this may not be possible
+ // and is an optimization anyway.
+ UseLEA = canUseLEAForSPInEpilogue(*MBB.getParent());
+ if (UseLEA && !STI.useLeaForSP())
+ UseLEA = flagsNeedToBePreservedBeforeTheTerminators(MBB);
+ // If that assert fires, it means we are not doing the right thing
+ // in canUseAsEpilogue.
+ assert((UseLEA || !flagsNeedToBePreservedBeforeTheTerminators(MBB)) &&
+ "We shouldn't have allowed this insertion point");
+ }
+
+ MachineInstrBuilder MI;
+ if (UseLEA) {
+ MI = addRegOffset(BuildMI(MBB, MBBI, DL,
+ TII.get(getLEArOpcode(Uses64BitFramePtr)),
+ StackPtr),
+ StackPtr, false, Offset);
+ } else {
+ bool IsSub = Offset < 0;
+ uint64_t AbsOffset = IsSub ? -Offset : Offset;
+ const unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr, AbsOffset)
+ : getADDriOpcode(Uses64BitFramePtr, AbsOffset);
+ MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr)
+ .addImm(AbsOffset);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+ }
+ return MI;
+}
+
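+/// mergeSPUpdates - Check the instruction before or after MBBI for a stack
+/// adjustment (an ADD, SUB, or LEA of an immediate applied to the stack
+/// pointer). If one is found, erase it (along with an adjacent CFI
+/// instruction, if any) and return its offset so the caller can fold it into
+/// another SP update; otherwise return 0.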
+int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ bool doMergeWithPrevious) const {
+ if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
+ (!doMergeWithPrevious && MBBI == MBB.end()))
+ return 0;
+
+ MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;
+
+ PI = skipDebugInstructionsBackward(PI, MBB.begin());
+ // It is assumed that the ADD/SUB/LEA instruction is succeeded by one CFI
+ // instruction, and that there are no DBG_VALUE or other instructions between
+ // ADD/SUB/LEA and its corresponding CFI instruction.
+ /* TODO: Add support for the case where there are multiple CFI instructions
+ below the ADD/SUB/LEA, e.g.:
+ ...
+ add
+ cfi_def_cfa_offset
+ cfi_offset
+ ...
+ */
+ if (doMergeWithPrevious && PI != MBB.begin() && PI->isCFIInstruction())
+ PI = std::prev(PI);
+
+ unsigned Opc = PI->getOpcode();
+ int Offset = 0;
+
+ if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+ Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+ PI->getOperand(0).getReg() == StackPtr){
+ assert(PI->getOperand(1).getReg() == StackPtr);
+ Offset = PI->getOperand(2).getImm();
+ } else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
+ PI->getOperand(0).getReg() == StackPtr &&
+ PI->getOperand(1).getReg() == StackPtr &&
+ PI->getOperand(2).getImm() == 1 &&
+ PI->getOperand(3).getReg() == X86::NoRegister &&
+ PI->getOperand(5).getReg() == X86::NoRegister) {
+ // For LEAs we have: def = lea SP, FI, noreg, Offset, noreg.
+ Offset = PI->getOperand(4).getImm();
+ } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+ Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+ PI->getOperand(0).getReg() == StackPtr) {
+ assert(PI->getOperand(1).getReg() == StackPtr);
+ Offset = -PI->getOperand(2).getImm();
+ } else
+ return 0;
+
+ PI = MBB.erase(PI);
+ if (PI != MBB.end() && PI->isCFIInstruction()) PI = MBB.erase(PI);
+ if (!doMergeWithPrevious)
+ MBBI = skipDebugInstructionsForward(PI, MBB.end());
+
+ return Offset;
+}
+
+void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL,
+ const MCCFIInstruction &CFIInst) const {
+ MachineFunction &MF = *MBB.getParent();
+ unsigned CFIIndex = MF.addFrameInst(CFIInst);
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+}
+
+/// Emits DWARF info specifying the offsets of callee-saved registers and the
+/// frame pointer. This is called only when basic block sections are enabled.
+void X86FrameLowering::emitCalleeSavedFrameMoves(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
+ MachineFunction &MF = *MBB.getParent();
+ if (!hasFP(MF)) {
+ emitCalleeSavedFrameMoves(MBB, MBBI, DebugLoc{}, true);
+ return;
+ }
+ const MachineModuleInfo &MMI = MF.getMMI();
+ const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+ const Register FramePtr = TRI->getFrameRegister(MF);
+ const Register MachineFramePtr =
+ STI.isTarget64BitILP32() ? Register(getX86SubSuperRegister(FramePtr, 64))
+ : FramePtr;
+ unsigned DwarfReg = MRI->getDwarfRegNum(MachineFramePtr, true);
+ // Offset = space for return address + size of the frame pointer itself.
+ unsigned Offset = (Is64Bit ? 8 : 4) + (Uses64BitFramePtr ? 8 : 4);
+ BuildCFI(MBB, MBBI, DebugLoc{},
+ MCCFIInstruction::createOffset(nullptr, DwarfReg, -Offset));
+ emitCalleeSavedFrameMoves(MBB, MBBI, DebugLoc{}, true);
+}
+
+void X86FrameLowering::emitCalleeSavedFrameMoves(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, bool IsPrologue) const {
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineModuleInfo &MMI = MF.getMMI();
+ const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+
+ // Add callee saved registers to move list.
+ const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+ if (CSI.empty()) return;
+
+ // Calculate offsets.
+ for (std::vector<CalleeSavedInfo>::const_iterator
+ I = CSI.begin(), E = CSI.end(); I != E; ++I) {
+ int64_t Offset = MFI.getObjectOffset(I->getFrameIdx());
+ unsigned Reg = I->getReg();
+ unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+
+ if (IsPrologue) {
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
+ } else {
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createRestore(nullptr, DwarfReg));
+ }
+ }
+}
+
+void X86FrameLowering::emitStackProbe(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, bool InProlog) const {
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ if (STI.isTargetWindowsCoreCLR()) {
+ if (InProlog) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::STACKALLOC_W_PROBING))
+ .addImm(0 /* no explicit stack size */);
+ } else {
+ emitStackProbeInline(MF, MBB, MBBI, DL, false);
+ }
+ } else {
+ emitStackProbeCall(MF, MBB, MBBI, DL, InProlog);
+ }
+}
+
+void X86FrameLowering::inlineStackProbe(MachineFunction &MF,
+ MachineBasicBlock &PrologMBB) const {
+ auto Where = llvm::find_if(PrologMBB, [](MachineInstr &MI) {
+ return MI.getOpcode() == X86::STACKALLOC_W_PROBING;
+ });
+ if (Where != PrologMBB.end()) {
+ DebugLoc DL = PrologMBB.findDebugLoc(Where);
+ emitStackProbeInline(MF, PrologMBB, Where, DL, true);
+ Where->eraseFromParent();
+ }
+}
+
+void X86FrameLowering::emitStackProbeInline(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL,
+ bool InProlog) const {
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ if (STI.isTargetWindowsCoreCLR() && STI.is64Bit())
+ emitStackProbeInlineWindowsCoreCLR64(MF, MBB, MBBI, DL, InProlog);
+ else
+ emitStackProbeInlineGeneric(MF, MBB, MBBI, DL, InProlog);
+}
+
+void X86FrameLowering::emitStackProbeInlineGeneric(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
+ MachineInstr &AllocWithProbe = *MBBI;
+ uint64_t Offset = AllocWithProbe.getOperand(0).getImm();
+
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ const X86TargetLowering &TLI = *STI.getTargetLowering();
+ assert(!(STI.is64Bit() && STI.isTargetWindowsCoreCLR()) &&
+ "different expansion expected for CoreCLR 64 bit");
+
+ const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
+ uint64_t ProbeChunk = StackProbeSize * 8;
+
+ uint64_t MaxAlign =
+ TRI->needsStackRealignment(MF) ? calculateMaxStackAlign(MF) : 0;
+
+ // Synthesize a loop or unroll it, depending on the number of iterations.
+ // BuildStackAlignAND ensures that only MaxAlign % StackProbeSize bytes are
+ // left between the unaligned rsp and the current rsp.
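+ // For example, with the usual 4 KiB probe size, allocations of up to 32 KiB
+ // are unrolled into individual sub-and-probe pairs, while larger allocations
+ // use the probing loop.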
+ if (Offset > ProbeChunk) {
+ emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset,
+ MaxAlign % StackProbeSize);
+ } else {
+ emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset,
+ MaxAlign % StackProbeSize);
+ }
+}
+
+void X86FrameLowering::emitStackProbeInlineGenericBlock(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset,
+ uint64_t AlignOffset) const {
+
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ const X86TargetLowering &TLI = *STI.getTargetLowering();
+ const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset);
+ const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
+ const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
+
+ uint64_t CurrentOffset = 0;
+
+ assert(AlignOffset < StackProbeSize);
+
+ // If the offset is so small it fits within a page, there's nothing to do.
+ if (StackProbeSize < Offset + AlignOffset) {
+
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr)
+ .addImm(StackProbeSize - AlignOffset)
+ .setMIFlag(MachineInstr::FrameSetup);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))
+ .setMIFlag(MachineInstr::FrameSetup),
+ StackPtr, false, 0)
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
+ NumFrameExtraProbe++;
+ CurrentOffset = StackProbeSize - AlignOffset;
+ }
+
+ // For the next N - 1 pages, just probe. I tried to take advantage of
+ // natural probes, but it implies much more logic and there were very few
+ // interesting natural probes to interleave.
+ while (CurrentOffset + StackProbeSize < Offset) {
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr)
+ .addImm(StackProbeSize)
+ .setMIFlag(MachineInstr::FrameSetup);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))
+ .setMIFlag(MachineInstr::FrameSetup),
+ StackPtr, false, 0)
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
+ NumFrameExtraProbe++;
+ CurrentOffset += StackProbeSize;
+ }
+
+ // No need to probe the tail; it is smaller than a page.
+ uint64_t ChunkSize = Offset - CurrentOffset;
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr)
+ .addImm(ChunkSize)
+ .setMIFlag(MachineInstr::FrameSetup);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+}
+
+void X86FrameLowering::emitStackProbeInlineGenericLoop(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset,
+ uint64_t AlignOffset) const {
+ assert(Offset && "null offset");
+
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ const X86TargetLowering &TLI = *STI.getTargetLowering();
+ const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
+ const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
+
+ if (AlignOffset) {
+ if (AlignOffset < StackProbeSize) {
+ // Perform a first smaller allocation followed by a probe.
+ const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, AlignOffset);
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), StackPtr)
+ .addReg(StackPtr)
+ .addImm(AlignOffset)
+ .setMIFlag(MachineInstr::FrameSetup);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))
+ .setMIFlag(MachineInstr::FrameSetup),
+ StackPtr, false, 0)
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
+ NumFrameExtraProbe++;
+ Offset -= AlignOffset;
+ }
+ }
+
+ // Synthesize a loop
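+ // For illustration, with a 4096-byte probe size and an Offset of 71680
+ // bytes (no alignment offset), the code built below looks roughly like
+ // (AT&T syntax, %r11 standing in for FinalStackProbed):
+ //   movq %rsp, %r11
+ //   subq $69632, %r11          # loop bound: Offset rounded down to a page
+ // .Lprobe:
+ //   subq $4096, %rsp
+ //   movq $0, (%rsp)
+ //   cmpq %r11, %rsp
+ //   jne  .Lprobe
+ //   subq $2048, %rsp           # tail, smaller than a page, left unprobed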
+ NumFrameLoopProbe++;
+ const BasicBlock *LLVM_BB = MBB.getBasicBlock();
+
+ MachineBasicBlock *testMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *tailMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+
+ MachineFunction::iterator MBBIter = ++MBB.getIterator();
+ MF.insert(MBBIter, testMBB);
+ MF.insert(MBBIter, tailMBB);
+
+ Register FinalStackProbed = Uses64BitFramePtr ? X86::R11 : X86::R11D;
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), FinalStackProbed)
+ .addReg(StackPtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // save loop bound
+ {
+ const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, Offset);
+ BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), FinalStackProbed)
+ .addReg(FinalStackProbed)
+ .addImm(Offset / StackProbeSize * StackProbeSize)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // allocate a page
+ {
+ const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize);
+ BuildMI(testMBB, DL, TII.get(SUBOpc), StackPtr)
+ .addReg(StackPtr)
+ .addImm(StackProbeSize)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // touch the page
+ addRegOffset(BuildMI(testMBB, DL, TII.get(MovMIOpc))
+ .setMIFlag(MachineInstr::FrameSetup),
+ StackPtr, false, 0)
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // cmp with stack pointer bound
+ BuildMI(testMBB, DL, TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
+ .addReg(StackPtr)
+ .addReg(FinalStackProbed)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // jump
+ BuildMI(testMBB, DL, TII.get(X86::JCC_1))
+ .addMBB(testMBB)
+ .addImm(X86::COND_NE)
+ .setMIFlag(MachineInstr::FrameSetup);
+ testMBB->addSuccessor(testMBB);
+ testMBB->addSuccessor(tailMBB);
+
+ // BB management
+ tailMBB->splice(tailMBB->end(), &MBB, MBBI, MBB.end());
+ tailMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+ MBB.addSuccessor(testMBB);
+
+ // handle tail
+ unsigned TailOffset = Offset % StackProbeSize;
+ if (TailOffset) {
+ const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, TailOffset);
+ BuildMI(*tailMBB, tailMBB->begin(), DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr)
+ .addImm(TailOffset)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // Update Live In information
+ recomputeLiveIns(*testMBB);
+ recomputeLiveIns(*tailMBB);
+}
+
+void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ assert(STI.is64Bit() && "different expansion needed for 32 bit");
+ assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR");
+ const TargetInstrInfo &TII = *STI.getInstrInfo();
+ const BasicBlock *LLVM_BB = MBB.getBasicBlock();
+
+ // RAX contains the number of bytes of desired stack adjustment.
+ // The handling here assumes this value has already been updated so as to
+ // maintain stack alignment.
+ //
+ // We need to exit with RSP modified by this amount and execute suitable
+ // page touches to notify the OS that we're growing the stack responsibly.
+ // All stack probing must be done without modifying RSP.
+ //
+ // MBB:
+ // SizeReg = RAX;
+ // ZeroReg = 0
+ // CopyReg = RSP
+ // Flags, TestReg = CopyReg - SizeReg
+ // FinalReg = !Flags.Ovf ? TestReg : ZeroReg
+ // LimitReg = gs magic thread env access
+ // if FinalReg >= LimitReg goto ContinueMBB
+ // RoundBB:
+ // RoundReg = page address of FinalReg
+ // LoopMBB:
+ // LoopReg = PHI(LimitReg,ProbeReg)
+ // ProbeReg = LoopReg - PageSize
+ // [ProbeReg] = 0
+ // if (ProbeReg > RoundReg) goto LoopMBB
+ // ContinueMBB:
+ // RSP = RSP - RAX
+ // [rest of original MBB]
+
+ // Set up the new basic blocks
+ MachineBasicBlock *RoundMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+
+ MachineFunction::iterator MBBIter = std::next(MBB.getIterator());
+ MF.insert(MBBIter, RoundMBB);
+ MF.insert(MBBIter, LoopMBB);
+ MF.insert(MBBIter, ContinueMBB);
+
+ // Split MBB and move the tail portion down to ContinueMBB.
+ MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI);
+ ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end());
+ ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+ // Some useful constants
+ const int64_t ThreadEnvironmentStackLimit = 0x10;
+ const int64_t PageSize = 0x1000;
+ const int64_t PageMask = ~(PageSize - 1);
+
+ // Registers we need. For the normal case we use virtual
+ // registers. For the prolog expansion we use RAX, RCX and RDX.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterClass *RegClass = &X86::GR64RegClass;
+ const Register SizeReg = InProlog ? X86::RAX
+ : MRI.createVirtualRegister(RegClass),
+ ZeroReg = InProlog ? X86::RCX
+ : MRI.createVirtualRegister(RegClass),
+ CopyReg = InProlog ? X86::RDX
+ : MRI.createVirtualRegister(RegClass),
+ TestReg = InProlog ? X86::RDX
+ : MRI.createVirtualRegister(RegClass),
+ FinalReg = InProlog ? X86::RDX
+ : MRI.createVirtualRegister(RegClass),
+ RoundedReg = InProlog ? X86::RDX
+ : MRI.createVirtualRegister(RegClass),
+ LimitReg = InProlog ? X86::RCX
+ : MRI.createVirtualRegister(RegClass),
+ JoinReg = InProlog ? X86::RCX
+ : MRI.createVirtualRegister(RegClass),
+ ProbeReg = InProlog ? X86::RCX
+ : MRI.createVirtualRegister(RegClass);
+
+ // SP-relative offsets where we can save RCX and RDX.
+ int64_t RCXShadowSlot = 0;
+ int64_t RDXShadowSlot = 0;
+
+ // If inlining in the prolog, save RCX and RDX.
+ if (InProlog) {
+ // Compute the offsets. We need to account for things already
+ // pushed onto the stack at this point: return address, frame
+ // pointer (if used), and callee saves.
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize();
+ const bool HasFP = hasFP(MF);
+
+ // Check if we need to spill RCX and/or RDX.
+ // Here we assume that no earlier prologue instruction changes RCX and/or
+ // RDX, so checking the block live-ins is enough.
+ const bool IsRCXLiveIn = MBB.isLiveIn(X86::RCX);
+ const bool IsRDXLiveIn = MBB.isLiveIn(X86::RDX);
+ int64_t InitSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0);
+ // Assign the initial slot to both registers, then change RDX's slot if both
+ // need to be spilled.
+ if (IsRCXLiveIn)
+ RCXShadowSlot = InitSlot;
+ if (IsRDXLiveIn)
+ RDXShadowSlot = InitSlot;
+ if (IsRDXLiveIn && IsRCXLiveIn)
+ RDXShadowSlot += 8;
+ // Emit the saves if needed.
+ if (IsRCXLiveIn)
+ addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
+ RCXShadowSlot)
+ .addReg(X86::RCX);
+ if (IsRDXLiveIn)
+ addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
+ RDXShadowSlot)
+ .addReg(X86::RDX);
+ } else {
+ // Not in the prolog. Copy RAX to a virtual reg.
+ BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX);
+ }
+
+ // Add code to MBB to check for overflow and set the new target stack pointer
+ // to zero if so.
+ BuildMI(&MBB, DL, TII.get(X86::XOR64rr), ZeroReg)
+ .addReg(ZeroReg, RegState::Undef)
+ .addReg(ZeroReg, RegState::Undef);
+ BuildMI(&MBB, DL, TII.get(X86::MOV64rr), CopyReg).addReg(X86::RSP);
+ BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg)
+ .addReg(CopyReg)
+ .addReg(SizeReg);
+ BuildMI(&MBB, DL, TII.get(X86::CMOV64rr), FinalReg)
+ .addReg(TestReg)
+ .addReg(ZeroReg)
+ .addImm(X86::COND_B);
+
+ // FinalReg now holds final stack pointer value, or zero if
+ // allocation would overflow. Compare against the current stack
+ // limit from the thread environment block. Note this limit is the
+ // lowest touched page on the stack, not the point at which the OS
+ // will cause an overflow exception, so this is just an optimization
+ // to avoid unnecessarily touching pages that are below the current
+ // SP but already committed to the stack by the OS.
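+ // For the in-prolog register assignment (LimitReg = RCX, FinalReg = RDX),
+ // the sequence emitted below is roughly (AT&T syntax, illustrative):
+ //   movq %gs:0x10, %rcx        # TEB StackLimit into LimitReg
+ //   cmpq %rcx, %rdx            # FinalReg - LimitReg
+ //   jae  ContinueMBB           # final RSP at or above the limit: no probing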
+ BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg)
+ .addReg(0)
+ .addImm(1)
+ .addReg(0)
+ .addImm(ThreadEnvironmentStackLimit)
+ .addReg(X86::GS);
+ BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg);
+ // Jump if the desired stack pointer is at or above the stack limit.
+ BuildMI(&MBB, DL, TII.get(X86::JCC_1)).addMBB(ContinueMBB).addImm(X86::COND_AE);
+
+ // Add code to roundMBB to round the final stack pointer to a page boundary.
+ RoundMBB->addLiveIn(FinalReg);
+ BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg)
+ .addReg(FinalReg)
+ .addImm(PageMask);
+ BuildMI(RoundMBB, DL, TII.get(X86::JMP_1)).addMBB(LoopMBB);
+
+ // LimitReg now holds the current stack limit, RoundedReg page-rounded
+ // final RSP value. Add code to loopMBB to decrement LimitReg page-by-page
+ // and probe until we reach RoundedReg.
+ if (!InProlog) {
+ BuildMI(LoopMBB, DL, TII.get(X86::PHI), JoinReg)
+ .addReg(LimitReg)
+ .addMBB(RoundMBB)
+ .addReg(ProbeReg)
+ .addMBB(LoopMBB);
+ }
+
+ LoopMBB->addLiveIn(JoinReg);
+ addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg,
+ false, -PageSize);
+
+ // Probe by storing a byte onto the stack.
+ BuildMI(LoopMBB, DL, TII.get(X86::MOV8mi))
+ .addReg(ProbeReg)
+ .addImm(1)
+ .addReg(0)
+ .addImm(0)
+ .addReg(0)
+ .addImm(0);
+
+ LoopMBB->addLiveIn(RoundedReg);
+ BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr))
+ .addReg(RoundedReg)
+ .addReg(ProbeReg);
+ BuildMI(LoopMBB, DL, TII.get(X86::JCC_1)).addMBB(LoopMBB).addImm(X86::COND_NE);
+
+ MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI();
+
+ // If in prolog, restore RDX and RCX.
+ if (InProlog) {
+ if (RCXShadowSlot) // It means we spilled RCX in the prologue.
+ addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL,
+ TII.get(X86::MOV64rm), X86::RCX),
+ X86::RSP, false, RCXShadowSlot);
+ if (RDXShadowSlot) // It means we spilled RDX in the prologue.
+ addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL,
+ TII.get(X86::MOV64rm), X86::RDX),
+ X86::RSP, false, RDXShadowSlot);
+ }
+
+ // Now that the probing is done, add code to continueMBB to update
+ // the stack pointer for real.
+ ContinueMBB->addLiveIn(SizeReg);
+ BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
+ .addReg(X86::RSP)
+ .addReg(SizeReg);
+
+ // Add the control flow edges we need.
+ MBB.addSuccessor(ContinueMBB);
+ MBB.addSuccessor(RoundMBB);
+ RoundMBB->addSuccessor(LoopMBB);
+ LoopMBB->addSuccessor(ContinueMBB);
+ LoopMBB->addSuccessor(LoopMBB);
+
+ // Mark all the instructions added to the prolog as frame setup.
+ if (InProlog) {
+ for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) {
+ BeforeMBBI->setFlag(MachineInstr::FrameSetup);
+ }
+ for (MachineInstr &MI : *RoundMBB) {
+ MI.setFlag(MachineInstr::FrameSetup);
+ }
+ for (MachineInstr &MI : *LoopMBB) {
+ MI.setFlag(MachineInstr::FrameSetup);
+ }
+ for (MachineBasicBlock::iterator CMBBI = ContinueMBB->begin();
+ CMBBI != ContinueMBBI; ++CMBBI) {
+ CMBBI->setFlag(MachineInstr::FrameSetup);
+ }
+ }
+}
+
+void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL,
+ bool InProlog) const {
+ bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;
+
+ // FIXME: Add indirect thunk support and remove this.
+ if (Is64Bit && IsLargeCodeModel && STI.useIndirectThunkCalls())
+ report_fatal_error("Emitting stack probe calls on 64-bit with the large "
+ "code model and indirect thunks not yet implemented.");
+
+ unsigned CallOp;
+ if (Is64Bit)
+ CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32;
+ else
+ CallOp = X86::CALLpcrel32;
+
+ StringRef Symbol = STI.getTargetLowering()->getStackProbeSymbolName(MF);
+
+ MachineInstrBuilder CI;
+ MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI);
+
+ // All current stack probes take AX and SP as input, clobber flags, and
+ // preserve all registers. x86_64 probes leave RSP unmodified.
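+ // For illustration, on 64-bit Windows with the small code model the
+ // expansion below, together with the RSP adjustment further down, looks
+ // roughly like:
+ //   movl $NNN, %eax            # requested size, set up by the caller
+ //   callq __chkstk             # e.g. __chkstk (MSVC) or ___chkstk_ms (MinGW);
+ //                              # probes the pages, preserves %rax and %rsp
+ //   subq %rax, %rsp            # the prologue performs the actual adjustment
+ // On 32-bit Windows the probe function (_chkstk/_alloca) adjusts %esp
+ // itself, so no explicit subtraction is emitted.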
+ if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
+ // For the large code model, we have to call through a register. Use R11,
+ // as it is scratch in all supported calling conventions.
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11)
+ .addExternalSymbol(MF.createExternalSymbolName(Symbol));
+ CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11);
+ } else {
+ CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp))
+ .addExternalSymbol(MF.createExternalSymbolName(Symbol));
+ }
+
+ unsigned AX = Uses64BitFramePtr ? X86::RAX : X86::EAX;
+ unsigned SP = Uses64BitFramePtr ? X86::RSP : X86::ESP;
+ CI.addReg(AX, RegState::Implicit)
+ .addReg(SP, RegState::Implicit)
+ .addReg(AX, RegState::Define | RegState::Implicit)
+ .addReg(SP, RegState::Define | RegState::Implicit)
+ .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+
+ if (STI.isTargetWin64() || !STI.isOSWindows()) {
+ // MSVC x32's _chkstk and cygwin/mingw's _alloca adjust %esp themselves.
+ // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
+ // themselves. They also do not clobber %rax, so we can reuse it when
+ // adjusting %rsp.
+ // All other platforms do not specify a particular ABI for the stack probe
+ // function, so we arbitrarily define it to not adjust %esp/%rsp itself.
+ BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Uses64BitFramePtr)), SP)
+ .addReg(SP)
+ .addReg(AX);
+ }
+
+ if (InProlog) {
+ // Apply the frame setup flag to all inserted instrs.
+ for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI)
+ ExpansionMBBI->setFlag(MachineInstr::FrameSetup);
+ }
+}
+
+static unsigned calculateSetFPREG(uint64_t SPAdjust) {
+ // The Win64 ABI allows a less restrictive limit of 240; 128 works equally well
+ // and might require smaller successive adjustments.
+ const uint64_t Win64MaxSEHOffset = 128;
+ uint64_t SEHFrameOffset = std::min(SPAdjust, Win64MaxSEHOffset);
+ // Win64 ABI requires 16-byte alignment for the UWOP_SET_FPREG opcode.
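+ // For illustration: an SPAdjust of 72 gives min(72, 128) = 72, rounded down
+ // to 64; an SPAdjust of 1000 gives min(1000, 128) = 128, which is already
+ // 16-byte aligned and is returned unchanged.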
+ return SEHFrameOffset & -16;
+}
+
+ // If we're forcing a stack realignment we can't rely on just the frame
+ // info; we need to know the ABI stack alignment as well in case we
+// have a call out. Otherwise just make sure we have some alignment - we'll
+// go with the minimum SlotSize.
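+ // For illustration: with the "stackrealign" attribute, a function whose
+ // objects only need 8-byte alignment but which makes calls gets MaxAlign
+ // raised to the ABI stack alignment (16 on x86-64), while a leaf function
+ // keeps at least SlotSize.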
+uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ Align MaxAlign = MFI.getMaxAlign(); // Desired stack alignment.
+ Align StackAlign = getStackAlign();
+ if (MF.getFunction().hasFnAttribute("stackrealign")) {
+ if (MFI.hasCalls())
+ MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
+ else if (MaxAlign < SlotSize)
+ MaxAlign = Align(SlotSize);
+ }
+ return MaxAlign.value();
+}
+
+void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, unsigned Reg,
+ uint64_t MaxAlign) const {
+ uint64_t Val = -MaxAlign;
+ unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val);
+
+ MachineFunction &MF = *MBB.getParent();
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ const X86TargetLowering &TLI = *STI.getTargetLowering();
+ const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
+ const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF);
+
+ // We want to make sure that (in the worst case) fewer than StackProbeSize
+ // bytes are left unprobed after the AND. This assumption is used in
+ // emitStackProbeInlineGeneric.
+ if (Reg == StackPtr && EmitInlineStackProbe && MaxAlign >= StackProbeSize) {
+ {
+ NumFrameLoopProbe++;
+ MachineBasicBlock *entryMBB =
+ MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+ MachineBasicBlock *headMBB =
+ MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+ MachineBasicBlock *bodyMBB =
+ MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+ MachineBasicBlock *footMBB =
+ MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+
+ MachineFunction::iterator MBBIter = MBB.getIterator();
+ MF.insert(MBBIter, entryMBB);
+ MF.insert(MBBIter, headMBB);
+ MF.insert(MBBIter, bodyMBB);
+ MF.insert(MBBIter, footMBB);
+ const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
+ Register FinalStackProbed = Uses64BitFramePtr ? X86::R11 : X86::R11D;
+
+ // Setup entry block
+ {
+
+ entryMBB->splice(entryMBB->end(), &MBB, MBB.begin(), MBBI);
+ BuildMI(entryMBB, DL, TII.get(TargetOpcode::COPY), FinalStackProbed)
+ .addReg(StackPtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+ MachineInstr *MI =
+ BuildMI(entryMBB, DL, TII.get(AndOp), FinalStackProbed)
+ .addReg(FinalStackProbed)
+ .addImm(Val)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // The EFLAGS implicit def is dead.
+ MI->getOperand(3).setIsDead();
+
+ BuildMI(entryMBB, DL,
+ TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
+ .addReg(FinalStackProbed)
+ .addReg(StackPtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+ BuildMI(entryMBB, DL, TII.get(X86::JCC_1))
+ .addMBB(&MBB)
+ .addImm(X86::COND_E)
+ .setMIFlag(MachineInstr::FrameSetup);
+ entryMBB->addSuccessor(headMBB);
+ entryMBB->addSuccessor(&MBB);
+ }
+
+ // Loop entry block
+
+ {
+ const unsigned SUBOpc =
+ getSUBriOpcode(Uses64BitFramePtr, StackProbeSize);
+ BuildMI(headMBB, DL, TII.get(SUBOpc), StackPtr)
+ .addReg(StackPtr)
+ .addImm(StackProbeSize)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ BuildMI(headMBB, DL,
+ TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
+ .addReg(FinalStackProbed)
+ .addReg(StackPtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // jump
+ BuildMI(headMBB, DL, TII.get(X86::JCC_1))
+ .addMBB(footMBB)
+ .addImm(X86::COND_B)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ headMBB->addSuccessor(bodyMBB);
+ headMBB->addSuccessor(footMBB);
+ }
+
+ // setup loop body
+ {
+ addRegOffset(BuildMI(bodyMBB, DL, TII.get(MovMIOpc))
+ .setMIFlag(MachineInstr::FrameSetup),
+ StackPtr, false, 0)
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ const unsigned SUBOpc =
+ getSUBriOpcode(Uses64BitFramePtr, StackProbeSize);
+ BuildMI(bodyMBB, DL, TII.get(SUBOpc), StackPtr)
+ .addReg(StackPtr)
+ .addImm(StackProbeSize)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // cmp with stack pointer bound
+ BuildMI(bodyMBB, DL,
+ TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
+ .addReg(FinalStackProbed)
+ .addReg(StackPtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // jump
+ BuildMI(bodyMBB, DL, TII.get(X86::JCC_1))
+ .addMBB(bodyMBB)
+ .addImm(X86::COND_B)
+ .setMIFlag(MachineInstr::FrameSetup);
+ bodyMBB->addSuccessor(bodyMBB);
+ bodyMBB->addSuccessor(footMBB);
+ }
+
+ // setup loop footer
+ {
+ BuildMI(footMBB, DL, TII.get(TargetOpcode::COPY), StackPtr)
+ .addReg(FinalStackProbed)
+ .setMIFlag(MachineInstr::FrameSetup);
+ addRegOffset(BuildMI(footMBB, DL, TII.get(MovMIOpc))
+ .setMIFlag(MachineInstr::FrameSetup),
+ StackPtr, false, 0)
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
+ footMBB->addSuccessor(&MBB);
+ }
+
+ recomputeLiveIns(*headMBB);
+ recomputeLiveIns(*bodyMBB);
+ recomputeLiveIns(*footMBB);
+ recomputeLiveIns(MBB);
+ }
+ } else {
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg)
+ .addReg(Reg)
+ .addImm(Val)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // The EFLAGS implicit def is dead.
+ MI->getOperand(3).setIsDead();
+ }
+}
+
+bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const {
+ // x86-64 (non Win64) has a 128 byte red zone which is guaranteed not to be
+ // clobbered by any interrupt handler.
+ assert(&STI == &MF.getSubtarget<X86Subtarget>() &&
+ "MF used frame lowering for wrong subtarget");
+ const Function &Fn = MF.getFunction();
+ const bool IsWin64CC = STI.isCallingConvWin64(Fn.getCallingConv());
+ return Is64Bit && !IsWin64CC && !Fn.hasFnAttribute(Attribute::NoRedZone);
+}
+
+/// emitPrologue - Push callee-saved registers onto the stack, which
+ /// automatically adjusts the stack pointer. Adjust the stack pointer to allocate
+/// space for local variables. Also emit labels used by the exception handler to
+/// generate the exception handling frames.
+
+/*
+ Here's a gist of what gets emitted:
+
+ ; Establish frame pointer, if needed
+ [if needs FP]
+ push %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ .seh_pushreg %rbp
+ mov %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+
+ ; Spill general-purpose registers
+ [for all callee-saved GPRs]
+ pushq %<reg>
+ [if not needs FP]
+ .cfi_def_cfa_offset (offset from RETADDR)
+ .seh_pushreg %<reg>
+
+ ; If the required stack alignment > default stack alignment
+ ; rsp needs to be re-aligned. This creates a "re-alignment gap"
+ ; of unknown size in the stack frame.
+ [if stack needs re-alignment]
+ and $MASK, %rsp
+
+ ; Allocate space for locals
+ [if target is Windows and allocated space > 4096 bytes]
+ ; Windows needs special care for allocations larger
+ ; than one page.
+ mov $NNN, %rax
+ call ___chkstk_ms/___chkstk
+ sub %rax, %rsp
+ [else]
+ sub $NNN, %rsp
+
+ [if needs FP]
+ .seh_stackalloc (size of XMM spill slots)
+ .seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots
+ [else]
+ .seh_stackalloc NNN
+
+ ; Spill XMMs
+ ; Note that while only the Windows 64 ABI specifies XMMs as callee-preserved,
+ ; they may get spilled on any platform if the current function
+ ; calls @llvm.eh.unwind.init
+ [if needs FP]
+ [for all callee-saved XMM registers]
+ movaps %<xmm reg>, -MMM(%rbp)
+ [for all callee-saved XMM registers]
+ .seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset)
+ ; i.e. the offset relative to (%rbp - SEHFrameOffset)
+ [else]
+ [for all callee-saved XMM registers]
+ movaps %<xmm reg>, KKK(%rsp)
+ [for all callee-saved XMM registers]
+ .seh_savexmm %<xmm reg>, KKK
+
+ .seh_endprologue
+
+ [if needs base pointer]
+ mov %rsp, %rbx
+ [if needs to restore base pointer]
+ mov %rsp, -MMM(%rbp)
+
+ ; Emit CFI info
+ [if needs FP]
+ [for all callee-saved registers]
+ .cfi_offset %<reg>, (offset from %rbp)
+ [else]
+ .cfi_def_cfa_offset (offset from RETADDR)
+ [for all callee-saved registers]
+ .cfi_offset %<reg>, (offset from %rsp)
+
+ Notes:
+ - .seh directives are emitted only for Windows 64 ABI
+ - .cv_fpo directives are emitted on win32 when emitting CodeView
+ - .cfi directives are emitted for all other ABIs
+ - for 32-bit code, substitute %e?? registers for %r??
+*/
+
+void X86FrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ assert(&STI == &MF.getSubtarget<X86Subtarget>() &&
+ "MF used frame lowering for wrong subtarget");
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const Function &Fn = MF.getFunction();
+ MachineModuleInfo &MMI = MF.getMMI();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment.
+ uint64_t StackSize = MFI.getStackSize(); // Number of bytes to allocate.
+ bool IsFunclet = MBB.isEHFuncletEntry();
+ EHPersonality Personality = EHPersonality::Unknown;
+ if (Fn.hasPersonalityFn())
+ Personality = classifyEHPersonality(Fn.getPersonalityFn());
+ bool FnHasClrFunclet =
+ MF.hasEHFunclets() && Personality == EHPersonality::CoreCLR;
+ bool IsClrFunclet = IsFunclet && FnHasClrFunclet;
+ bool HasFP = hasFP(MF);
+ bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+ bool NeedsWin64CFI = IsWin64Prologue && Fn.needsUnwindTableEntry();
+ // FIXME: Emit FPO data for EH funclets.
+ bool NeedsWinFPO =
+ !IsFunclet && STI.isTargetWin32() && MMI.getModule()->getCodeViewFlag();
+ bool NeedsWinCFI = NeedsWin64CFI || NeedsWinFPO;
+ bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves();
+ Register FramePtr = TRI->getFrameRegister(MF);
+ const Register MachineFramePtr =
+ STI.isTarget64BitILP32()
+ ? Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr;
+ Register BasePtr = TRI->getBaseRegister();
+ bool HasWinCFI = false;
+
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
+ DebugLoc DL;
+
+ // Add RETADDR move area to callee saved frame size.
+ int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+ if (TailCallReturnAddrDelta && IsWin64Prologue)
+ report_fatal_error("Can't handle guaranteed tail call under win64 yet");
+
+ if (TailCallReturnAddrDelta < 0)
+ X86FI->setCalleeSavedFrameSize(
+ X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
+
+ const bool EmitStackProbeCall =
+ STI.getTargetLowering()->hasStackProbeSymbol(MF);
+ unsigned StackProbeSize = STI.getTargetLowering()->getStackProbeSize(MF);
+
+ // Re-align the stack on 64-bit if the x86-interrupt calling convention is
+ // used and an error code was pushed, since the x86-64 ABI requires a 16-byte
+ // stack alignment.
+ if (Fn.getCallingConv() == CallingConv::X86_INTR && Is64Bit &&
+ Fn.arg_size() == 2) {
+ StackSize += 8;
+ MFI.setStackSize(StackSize);
+ emitSPUpdate(MBB, MBBI, DL, -8, /*InEpilogue=*/false);
+ }
+
+ // If this is x86-64, the Red Zone is not disabled, and we are a leaf
+ // function that uses at most 128 bytes of stack space, has no frame
+ // pointer, no calls, and no dynamic allocas, then we do not need to adjust
+ // the stack pointer (we fit in the Red Zone). We also check that we don't
+ // push and pop from the stack.
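+ // For illustration: a leaf function with no frame pointer, no saved CSRs,
+ // and 96 bytes of locals keeps StackSize at 0 here (everything lives in the
+ // Red Zone and no SP adjustment is emitted); with 200 bytes of locals only
+ // max(0, 200 - 128) = 72 bytes are actually allocated.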
+ if (has128ByteRedZone(MF) && !TRI->needsStackRealignment(MF) &&
+ !MFI.hasVarSizedObjects() && // No dynamic alloca.
+ !MFI.adjustsStack() && // No calls.
+ !EmitStackProbeCall && // No stack probes.
+ !MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop.
+ !MF.shouldSplitStack()) { // Regular stack
+ uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
+ if (HasFP) MinSize += SlotSize;
+ X86FI->setUsesRedZone(MinSize > 0 || StackSize > 0);
+ StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
+ MFI.setStackSize(StackSize);
+ }
+
+ // Insert stack pointer adjustment for later moving of return addr. Only
+ // applies to tail call optimized functions where the callee argument stack
+ // size is bigger than the caller's.
+ if (TailCallReturnAddrDelta < 0) {
+ BuildStackAdjustment(MBB, MBBI, DL, TailCallReturnAddrDelta,
+ /*InEpilogue=*/false)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // Mapping for machine moves:
+ //
+ // DST: VirtualFP AND
+ // SRC: VirtualFP => DW_CFA_def_cfa_offset
+ // ELSE => DW_CFA_def_cfa
+ //
+ // SRC: VirtualFP AND
+ // DST: Register => DW_CFA_def_cfa_register
+ //
+ // ELSE
+ // OFFSET < 0 => DW_CFA_offset_extended_sf
+ // REG < 64 => DW_CFA_offset + Reg
+ // ELSE => DW_CFA_offset_extended
+
+ uint64_t NumBytes = 0;
+ int stackGrowth = -SlotSize;
+
+ // Find the funclet establisher parameter
+ Register Establisher = X86::NoRegister;
+ if (IsClrFunclet)
+ Establisher = Uses64BitFramePtr ? X86::RCX : X86::ECX;
+ else if (IsFunclet)
+ Establisher = Uses64BitFramePtr ? X86::RDX : X86::EDX;
+
+ if (IsWin64Prologue && IsFunclet && !IsClrFunclet) {
+ // Immediately spill establisher into the home slot.
+ // The runtime cares about this.
+ // MOV64mr %rdx, 16(%rsp)
+ unsigned MOVmr = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MOVmr)), StackPtr, true, 16)
+ .addReg(Establisher)
+ .setMIFlag(MachineInstr::FrameSetup);
+ MBB.addLiveIn(Establisher);
+ }
+
+ if (HasFP) {
+ assert(MF.getRegInfo().isReserved(MachineFramePtr) && "FP reserved");
+
+ // Calculate required stack adjustment.
+ uint64_t FrameSize = StackSize - SlotSize;
+ // If required, include space for extra hidden slot for stashing base pointer.
+ if (X86FI->getRestoreBasePointer())
+ FrameSize += SlotSize;
+
+ NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
+
+ // Callee-saved registers are pushed on stack before the stack is realigned.
+ if (TRI->needsStackRealignment(MF) && !IsWin64Prologue)
+ NumBytes = alignTo(NumBytes, MaxAlign);
+
+ // Save EBP/RBP into the appropriate stack slot.
+ BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+ .addReg(MachineFramePtr, RegState::Kill)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ if (NeedsDwarfCFI) {
+ // Mark the place where EBP/RBP was saved.
+ // Define the current CFA rule to use the provided offset.
+ assert(StackSize);
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, -2 * stackGrowth));
+
+ // Change the rule for the FramePtr to be an "offset" rule.
+ unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
+ BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset(
+ nullptr, DwarfFramePtr, 2 * stackGrowth));
+ }
+
+ if (NeedsWinCFI) {
+ HasWinCFI = true;
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
+ .addImm(FramePtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ if (!IsWin64Prologue && !IsFunclet) {
+ // Update EBP with the new base value.
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr),
+ FramePtr)
+ .addReg(StackPtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ if (NeedsDwarfCFI) {
+ // Mark effective beginning of when frame pointer becomes valid.
+ // Define the current CFA to use the EBP/RBP register.
+ unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
+ BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaRegister(
+ nullptr, DwarfFramePtr));
+ }
+
+ if (NeedsWinFPO) {
+ // .cv_fpo_setframe $FramePtr
+ HasWinCFI = true;
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
+ .addImm(FramePtr)
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ }
+ } else {
+ assert(!IsFunclet && "funclets without FPs not yet implemented");
+ NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
+ }
+
+ // Update the offset adjustment, which is mainly used by codeview to translate
+ // from ESP to VFRAME relative local variable offsets.
+ if (!IsFunclet) {
+ if (HasFP && TRI->needsStackRealignment(MF))
+ MFI.setOffsetAdjustment(-NumBytes);
+ else
+ MFI.setOffsetAdjustment(-StackSize);
+ }
+
+ // For EH funclets, only allocate enough space for outgoing calls. Save the
+ // NumBytes value that we would've used for the parent frame.
+ unsigned ParentFrameNumBytes = NumBytes;
+ if (IsFunclet)
+ NumBytes = getWinEHFuncletFrameSize(MF);
+
+ // Skip the callee-saved push instructions.
+ bool PushedRegs = false;
+ int StackOffset = 2 * stackGrowth;
+
+ while (MBBI != MBB.end() &&
+ MBBI->getFlag(MachineInstr::FrameSetup) &&
+ (MBBI->getOpcode() == X86::PUSH32r ||
+ MBBI->getOpcode() == X86::PUSH64r)) {
+ PushedRegs = true;
+ Register Reg = MBBI->getOperand(0).getReg();
+ ++MBBI;
+
+ if (!HasFP && NeedsDwarfCFI) {
+ // Mark callee-saved push instruction.
+ // Define the current CFA rule to use the provided offset.
+ assert(StackSize);
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, -StackOffset));
+ StackOffset += stackGrowth;
+ }
+
+ if (NeedsWinCFI) {
+ HasWinCFI = true;
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
+ .addImm(Reg)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ }
+
+ // Realign stack after we pushed callee-saved registers (so that we'll be
+ // able to calculate their offsets from the frame pointer).
+ // Don't do this for Win64, it needs to realign the stack after the prologue.
+ if (!IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) {
+ assert(HasFP && "There should be a frame pointer if stack is realigned.");
+ BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign);
+
+ if (NeedsWinCFI) {
+ HasWinCFI = true;
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlign))
+ .addImm(MaxAlign)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ }
+
+ // If there is a SUB32ri of ESP immediately before this instruction, merge
+ // the two. This can be the case when tail call elimination is enabled and
+ // the callee has more arguments than the caller.
+ NumBytes -= mergeSPUpdates(MBB, MBBI, true);
+
+ // Adjust stack pointer: ESP -= numbytes.
+
+ // Windows and cygwin/mingw require a prologue helper routine when allocating
+ // more than 4K bytes on the stack. Windows uses __chkstk and cygwin/mingw
+ // uses __alloca. __alloca and the 32-bit version of __chkstk will probe the
+ // stack and adjust the stack pointer in one go. The 64-bit version of
+ // __chkstk is only responsible for probing the stack. The 64-bit prologue is
+ // responsible for adjusting the stack pointer. Touching the stack at 4K
+ // increments is necessary to ensure that the guard pages used by the OS
+ // virtual memory manager are allocated in the correct sequence.
+ uint64_t AlignedNumBytes = NumBytes;
+ if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF))
+ AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign);
+ if (AlignedNumBytes >= StackProbeSize && EmitStackProbeCall) {
+ assert(!X86FI->getUsesRedZone() &&
+ "The Red Zone is not accounted for in stack probes");
+
+ // Check whether EAX is livein for this block.
+ bool isEAXAlive = isEAXLiveIn(MBB);
+
+ if (isEAXAlive) {
+ if (Is64Bit) {
+ // Save RAX
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
+ .addReg(X86::RAX, RegState::Kill)
+ .setMIFlag(MachineInstr::FrameSetup);
+ } else {
+ // Save EAX
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
+ .addReg(X86::EAX, RegState::Kill)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ }
+
+ if (Is64Bit) {
+ // Handle the 64-bit Windows ABI case where we need to call __chkstk.
+ // Function prologue is responsible for adjusting the stack pointer.
+ int64_t Alloc = isEAXAlive ? NumBytes - 8 : NumBytes;
+ if (isUInt<32>(Alloc)) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
+ .addImm(Alloc)
+ .setMIFlag(MachineInstr::FrameSetup);
+ } else if (isInt<32>(Alloc)) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX)
+ .addImm(Alloc)
+ .setMIFlag(MachineInstr::FrameSetup);
+ } else {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
+ .addImm(Alloc)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ } else {
+ // Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
+ // We'll also use 4 already allocated bytes for EAX.
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
+ .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // Call __chkstk, __chkstk_ms, or __alloca.
+ emitStackProbe(MF, MBB, MBBI, DL, true);
+
+ if (isEAXAlive) {
+ // Restore RAX/EAX
+ MachineInstr *MI;
+ if (Is64Bit)
+ MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV64rm), X86::RAX),
+ StackPtr, false, NumBytes - 8);
+ else
+ MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX),
+ StackPtr, false, NumBytes - 4);
+ MI->setFlag(MachineInstr::FrameSetup);
+ MBB.insert(MBBI, MI);
+ }
+ } else if (NumBytes) {
+ emitSPUpdate(MBB, MBBI, DL, -(int64_t)NumBytes, /*InEpilogue=*/false);
+ }
+
+ if (NeedsWinCFI && NumBytes) {
+ HasWinCFI = true;
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
+ .addImm(NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ int SEHFrameOffset = 0;
+ unsigned SPOrEstablisher;
+ if (IsFunclet) {
+ if (IsClrFunclet) {
+ // The establisher parameter passed to a CLR funclet is actually a pointer
+ // to the (mostly empty) frame of its nearest enclosing funclet; we have
+ // to find the root function establisher frame by loading the PSPSym from
+ // the intermediate frame.
+ unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF);
+ MachinePointerInfo NoInfo;
+ MBB.addLiveIn(Establisher);
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), Establisher),
+ Establisher, false, PSPSlotOffset)
+ .addMemOperand(MF.getMachineMemOperand(
+ NoInfo, MachineMemOperand::MOLoad, SlotSize, Align(SlotSize)));
+ // Save the root establisher back into the current funclet's (mostly
+ // empty) frame, in case a sub-funclet or the GC needs it.
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr,
+ false, PSPSlotOffset)
+ .addReg(Establisher)
+ .addMemOperand(MF.getMachineMemOperand(
+ NoInfo,
+ MachineMemOperand::MOStore | MachineMemOperand::MOVolatile,
+ SlotSize, Align(SlotSize)));
+ }
+ SPOrEstablisher = Establisher;
+ } else {
+ SPOrEstablisher = StackPtr;
+ }
+
+ if (IsWin64Prologue && HasFP) {
+ // Set RBP to a small fixed offset from RSP. In the funclet case, we base
+ // this calculation on the incoming establisher, which holds the value of
+ // RSP from the parent frame at the end of the prologue.
+ SEHFrameOffset = calculateSetFPREG(ParentFrameNumBytes);
+ if (SEHFrameOffset)
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr),
+ SPOrEstablisher, false, SEHFrameOffset);
+ else
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr)
+ .addReg(SPOrEstablisher);
+
+ // If this is not a funclet, emit the CFI describing our frame pointer.
+ if (NeedsWinCFI && !IsFunclet) {
+ assert(!NeedsWinFPO && "this setframe incompatible with FPO data");
+ HasWinCFI = true;
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
+ .addImm(FramePtr)
+ .addImm(SEHFrameOffset)
+ .setMIFlag(MachineInstr::FrameSetup);
+ if (isAsynchronousEHPersonality(Personality))
+ MF.getWinEHFuncInfo()->SEHSetFrameOffset = SEHFrameOffset;
+ }
+ } else if (IsFunclet && STI.is32Bit()) {
+ // Reset EBP / ESI to something good for funclets.
+ MBBI = restoreWin32EHStackPointers(MBB, MBBI, DL);
+ // If we're a catch funclet, we can be returned to via catchret. Save ESP
+ // into the registration node so that the runtime will restore it for us.
+ if (!MBB.isCleanupFuncletEntry()) {
+ assert(Personality == EHPersonality::MSVC_CXX);
+ Register FrameReg;
+ int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex;
+ int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg).getFixed();
+ // ESP is the first field, so no extra displacement is needed.
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32mr)), FrameReg,
+ false, EHRegOffset)
+ .addReg(X86::ESP);
+ }
+ }
+
+ while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) {
+ const MachineInstr &FrameInstr = *MBBI;
+ ++MBBI;
+
+ if (NeedsWinCFI) {
+ int FI;
+ if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) {
+ if (X86::FR64RegClass.contains(Reg)) {
+ int Offset;
+ Register IgnoredFrameReg;
+ if (IsWin64Prologue && IsFunclet)
+ Offset = getWin64EHFrameIndexRef(MF, FI, IgnoredFrameReg);
+ else
+ Offset =
+ getFrameIndexReference(MF, FI, IgnoredFrameReg).getFixed() +
+ SEHFrameOffset;
+
+ HasWinCFI = true;
+ assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data");
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
+ .addImm(Reg)
+ .addImm(Offset)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ }
+ }
+ }
+
+ if (NeedsWinCFI && HasWinCFI)
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ if (FnHasClrFunclet && !IsFunclet) {
+ // Save the so-called Initial-SP (i.e. the value of the stack pointer
+ // immediately after the prolog) into the PSPSlot so that funclets
+ // and the GC can recover it.
+ unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF);
+ auto PSPInfo = MachinePointerInfo::getFixedStack(
+ MF, MF.getWinEHFuncInfo()->PSPSymFrameIdx);
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, false,
+ PSPSlotOffset)
+ .addReg(StackPtr)
+ .addMemOperand(MF.getMachineMemOperand(
+ PSPInfo, MachineMemOperand::MOStore | MachineMemOperand::MOVolatile,
+ SlotSize, Align(SlotSize)));
+ }
+
+ // Realign stack after we spilled callee-saved registers (so that we'll be
+ // able to calculate their offsets from the frame pointer).
+ // Win64 requires aligning the stack after the prologue.
+ if (IsWin64Prologue && TRI->needsStackRealignment(MF)) {
+ assert(HasFP && "There should be a frame pointer if stack is realigned.");
+ BuildStackAlignAND(MBB, MBBI, DL, SPOrEstablisher, MaxAlign);
+ }
+
+ // We already dealt with stack realignment and funclets above.
+ if (IsFunclet && STI.is32Bit())
+ return;
+
+ // If we need a base pointer, set it up here. It's whatever the value
+ // of the stack pointer is at this point. Any variable size objects
+ // will be allocated after this, so we can still use the base pointer
+ // to reference locals.
+ if (TRI->hasBasePointer(MF)) {
+ // Update the base pointer with the current stack pointer.
+ unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr;
+ BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
+ .addReg(SPOrEstablisher)
+ .setMIFlag(MachineInstr::FrameSetup);
+ if (X86FI->getRestoreBasePointer()) {
+ // Stash value of base pointer. Saving RSP instead of EBP shortens
+ // dependence chain. Used by SjLj EH.
+ unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)),
+ FramePtr, true, X86FI->getRestoreBasePointerOffset())
+ .addReg(SPOrEstablisher)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ if (X86FI->getHasSEHFramePtrSave() && !IsFunclet) {
+ // Stash the value of the frame pointer relative to the base pointer for
+ // Win32 EH. This supports Win32 EH, which does the inverse of the above:
+ // it recovers the frame pointer from the base pointer rather than the
+ // other way around.
+ unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
+ Register UsedReg;
+ int Offset =
+ getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg)
+ .getFixed();
+ assert(UsedReg == BasePtr);
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), UsedReg, true, Offset)
+ .addReg(FramePtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ }
+
+ if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) {
+ // Mark end of stack pointer adjustment.
+ if (!HasFP && NumBytes) {
+ // Define the current CFA rule to use the provided offset.
+ assert(StackSize);
+ BuildCFI(
+ MBB, MBBI, DL,
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize - stackGrowth));
+ }
+
+ // Emit DWARF info specifying the offsets of the callee-saved registers.
+ emitCalleeSavedFrameMoves(MBB, MBBI, DL, true);
+ }
+
+ // An x86 interrupt handling function cannot assume anything about the
+ // direction flag (DF in the EFLAGS register). Clear this flag by emitting a
+ // "cld" instruction in the prologue of each interrupt handler function.
+ //
+ // FIXME: Create "cld" instruction only in these cases:
+ // 1. The interrupt handling function uses any of the "rep" instructions.
+ // 2. Interrupt handling function calls another function.
+ //
+ if (Fn.getCallingConv() == CallingConv::X86_INTR)
+ BuildMI(MBB, MBBI, DL, TII.get(X86::CLD))
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // At this point we know if the function has WinCFI or not.
+ MF.setHasWinCFI(HasWinCFI);
+}
+
+bool X86FrameLowering::canUseLEAForSPInEpilogue(
+ const MachineFunction &MF) const {
+ // We can't use LEA instructions for adjusting the stack pointer if we don't
+ // have a frame pointer in the Win64 ABI. Only ADD instructions may be used
+ // to deallocate the stack.
+ // This means that we can use LEA for SP in two situations:
+ // 1. We *aren't* using the Win64 ABI which means we are free to use LEA.
+ // 2. We *have* a frame pointer which means we are permitted to use LEA.
+ return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() || hasFP(MF);
+}
+
+static bool isFuncletReturnInstr(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case X86::CATCHRET:
+ case X86::CLEANUPRET:
+ return true;
+ default:
+ return false;
+ }
+ llvm_unreachable("impossible");
+}
+
+// CLR funclets use a special "Previous Stack Pointer Symbol" slot on the
+// stack. It holds a pointer to the bottom of the root function frame. The
+// establisher frame pointer passed to a nested funclet may point to the
+// (mostly empty) frame of its parent funclet, but it will need to find
+// the frame of the root function to access locals. To facilitate this,
+// every funclet copies the pointer to the bottom of the root function
+// frame into a PSPSym slot in its own (mostly empty) stack frame. Using the
+// same offset for the PSPSym in the root function frame that's used in the
+// funclets' frames allows each funclet to dynamically accept any ancestor
+// frame as its establisher argument (the runtime doesn't guarantee the
+// immediate parent for some reason lost to history), and also allows the GC,
+// which uses the PSPSym for some bookkeeping, to find it in any funclet's
+// frame with only a single offset reported for the entire method.
+unsigned
+X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const {
+ const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo();
+ Register SPReg;
+ int Offset = getFrameIndexReferencePreferSP(MF, Info.PSPSymFrameIdx, SPReg,
+ /*IgnoreSPUpdates*/ true)
+ .getFixed();
+ assert(Offset >= 0 && SPReg == TRI->getStackRegister());
+ return static_cast<unsigned>(Offset);
+}
+
+unsigned
+X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
+ const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ // This is the size of the pushed CSRs.
+ unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+ // This is the size of callee saved XMMs.
+ const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
+ unsigned XMMSize = WinEHXMMSlotInfo.size() *
+ TRI->getSpillSize(X86::VR128RegClass);
+ // This is the amount of stack a funclet needs to allocate.
+ unsigned UsedSize;
+ EHPersonality Personality =
+ classifyEHPersonality(MF.getFunction().getPersonalityFn());
+ if (Personality == EHPersonality::CoreCLR) {
+ // CLR funclets need to hold enough space to include the PSPSym, at the
+ // same offset from the stack pointer (immediately after the prolog) as it
+ // resides at in the main function.
+ UsedSize = getPSPSlotOffsetFromSP(MF) + SlotSize;
+ } else {
+ // Other funclets just need enough stack for outgoing call arguments.
+ UsedSize = MF.getFrameInfo().getMaxCallFrameSize();
+ }
+ // RBP is not included in the callee saved register block. After pushing RBP,
+ // everything is 16 byte aligned. Everything we allocate before an outgoing
+ // call must also be 16 byte aligned.
+ unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlign());
+ // Subtract out the size of the callee saved registers. This is how much stack
+ // each funclet will allocate.
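+ // For illustration: with 40 bytes of pushed CSRs, 32 bytes of outgoing call
+ // arguments, and no XMM spill slots, this is alignTo(40 + 32, 16) + 0 - 40
+ // = 80 - 40 = 40 bytes per funclet.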
+ return FrameSizeMinusRBP + XMMSize - CSSize;
+}
+
+static bool isTailCallOpcode(unsigned Opc) {
+ return Opc == X86::TCRETURNri || Opc == X86::TCRETURNdi ||
+ Opc == X86::TCRETURNmi ||
+ Opc == X86::TCRETURNri64 || Opc == X86::TCRETURNdi64 ||
+ Opc == X86::TCRETURNmi64;
+}
+
+void X86FrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ MachineBasicBlock::iterator Terminator = MBB.getFirstTerminator();
+ MachineBasicBlock::iterator MBBI = Terminator;
+ DebugLoc DL;
+ if (MBBI != MBB.end())
+ DL = MBBI->getDebugLoc();
+ // Standard x86-64 and NaCl use 64-bit frame/stack pointers; x32 uses
+ // 32-bit ones.
+ const bool Is64BitILP32 = STI.isTarget64BitILP32();
+ Register FramePtr = TRI->getFrameRegister(MF);
+ Register MachineFramePtr =
+ Is64BitILP32 ? Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr;
+
+ bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+ bool NeedsWin64CFI =
+ IsWin64Prologue && MF.getFunction().needsUnwindTableEntry();
+ bool IsFunclet = MBBI == MBB.end() ? false : isFuncletReturnInstr(*MBBI);
+
+ // Get the number of bytes to allocate from the FrameInfo.
+ uint64_t StackSize = MFI.getStackSize();
+ uint64_t MaxAlign = calculateMaxStackAlign(MF);
+ unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+ bool HasFP = hasFP(MF);
+ uint64_t NumBytes = 0;
+
+ bool NeedsDwarfCFI = (!MF.getTarget().getTargetTriple().isOSDarwin() &&
+ !MF.getTarget().getTargetTriple().isOSWindows()) &&
+ MF.needsFrameMoves();
+
+ if (IsFunclet) {
+ assert(HasFP && "EH funclets without FP not yet implemented");
+ NumBytes = getWinEHFuncletFrameSize(MF);
+ } else if (HasFP) {
+ // Calculate required stack adjustment.
+ uint64_t FrameSize = StackSize - SlotSize;
+ NumBytes = FrameSize - CSSize;
+
+ // Callee-saved registers were pushed on stack before the stack was
+ // realigned.
+ if (TRI->needsStackRealignment(MF) && !IsWin64Prologue)
+ NumBytes = alignTo(FrameSize, MaxAlign);
+ } else {
+ NumBytes = StackSize - CSSize;
+ }
+ uint64_t SEHStackAllocAmt = NumBytes;
+
+ // AfterPop is the position to insert .cfi_restore.
+ MachineBasicBlock::iterator AfterPop = MBBI;
+ if (HasFP) {
+ // Pop EBP.
+ BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
+ MachineFramePtr)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ if (NeedsDwarfCFI) {
+ unsigned DwarfStackPtr =
+ TRI->getDwarfRegNum(Is64Bit ? X86::RSP : X86::ESP, true);
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::cfiDefCfa(nullptr, DwarfStackPtr, SlotSize));
+ if (!MBB.succ_empty() && !MBB.isReturnBlock()) {
+ unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
+ BuildCFI(MBB, AfterPop, DL,
+ MCCFIInstruction::createRestore(nullptr, DwarfFramePtr));
+ --MBBI;
+ --AfterPop;
+ }
+ --MBBI;
+ }
+ }
+
+ MachineBasicBlock::iterator FirstCSPop = MBBI;
+ // Skip the callee-saved pop instructions.
+ while (MBBI != MBB.begin()) {
+ MachineBasicBlock::iterator PI = std::prev(MBBI);
+ unsigned Opc = PI->getOpcode();
+
+ if (Opc != X86::DBG_VALUE && !PI->isTerminator()) {
+ if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
+ (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)))
+ break;
+ FirstCSPop = PI;
+ }
+
+ --MBBI;
+ }
+ MBBI = FirstCSPop;
+
+ if (IsFunclet && Terminator->getOpcode() == X86::CATCHRET)
+ emitCatchRetReturnValue(MBB, FirstCSPop, &*Terminator);
+
+ if (MBBI != MBB.end())
+ DL = MBBI->getDebugLoc();
+
+ // If there is an ADD32ri or SUB32ri of ESP immediately before this
+ // instruction, merge the two instructions.
+ if (NumBytes || MFI.hasVarSizedObjects())
+ NumBytes += mergeSPUpdates(MBB, MBBI, true);
+
+ // If dynamic allocas are used, then reset esp to point to the last
+ // callee-saved slot before popping them off. The same applies when the
+ // stack was realigned. Don't do this if this was a funclet epilogue, since
+ // the funclets will not do realignment or dynamic stack allocation.
+ if ((TRI->needsStackRealignment(MF) || MFI.hasVarSizedObjects()) &&
+ !IsFunclet) {
+ if (TRI->needsStackRealignment(MF))
+ MBBI = FirstCSPop;
+ unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt);
+ uint64_t LEAAmount =
+ IsWin64Prologue ? SEHStackAllocAmt - SEHFrameOffset : -CSSize;
+
+ // There are only two legal forms of epilogue:
+ // - add SEHAllocationSize, %rsp
+ // - lea SEHAllocationSize(%FramePtr), %rsp
+ //
+ // 'mov %FramePtr, %rsp' will not be recognized as an epilogue sequence.
+ // However, we may use this sequence if we have a frame pointer because the
+ // effects of the prologue can safely be undone.
+ if (LEAAmount != 0) {
+ unsigned Opc = getLEArOpcode(Uses64BitFramePtr);
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
+ FramePtr, false, LEAAmount);
+ --MBBI;
+ } else {
+ unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr);
+ BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+ .addReg(FramePtr);
+ --MBBI;
+ }
+ } else if (NumBytes) {
+ // Adjust stack pointer back: ESP += numbytes.
+ emitSPUpdate(MBB, MBBI, DL, NumBytes, /*InEpilogue=*/true);
+ if (!hasFP(MF) && NeedsDwarfCFI) {
+ // Define the current CFA rule to use the provided offset.
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, CSSize + SlotSize));
+ }
+ --MBBI;
+ }
+
+ // The Windows unwinder will not invoke a function's exception handler if the
+ // IP is either in the prologue or in the epilogue. This causes a problem when a
+ // call immediately precedes an epilogue, because the return address points
+ // into the epilogue. To cope with that, we insert an epilogue marker here,
+ // then replace it with a 'nop' if it ends up immediately after a CALL in the
+ // final emitted code.
+ if (NeedsWin64CFI && MF.hasWinCFI())
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));
+
+ if (!hasFP(MF) && NeedsDwarfCFI) {
+ MBBI = FirstCSPop;
+ int64_t Offset = -CSSize - SlotSize;
+ // Mark callee-saved pop instruction.
+ // Define the current CFA rule to use the provided offset.
+ while (MBBI != MBB.end()) {
+ MachineBasicBlock::iterator PI = MBBI;
+ unsigned Opc = PI->getOpcode();
+ ++MBBI;
+ if (Opc == X86::POP32r || Opc == X86::POP64r) {
+ Offset += SlotSize;
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset));
+ }
+ }
+ }
+
+ // Emit DWARF info specifying the restores of the callee-saved registers.
+ // For an epilogue that ends in a return, or for any other block without a
+ // successor, there is no need to generate .cfi_restore for callee-saved
+ // registers.
+ if (NeedsDwarfCFI && !MBB.succ_empty() && !MBB.isReturnBlock()) {
+ emitCalleeSavedFrameMoves(MBB, AfterPop, DL, false);
+ }
+
+ if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) {
+ // Add the return addr area delta back since we are not tail calling.
+ int Offset = -1 * X86FI->getTCReturnAddrDelta();
+ assert(Offset >= 0 && "TCDelta should never be positive");
+ if (Offset) {
+ // Check for possible merge with preceding ADD instruction.
+ Offset += mergeSPUpdates(MBB, Terminator, true);
+ emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true);
+ }
+ }
+
+ // Emit tilerelease for AMX kernel.
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (!MRI.reg_nodbg_empty(X86::TMMCFG))
+ BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
+}
+
+StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
+ int FI,
+ Register &FrameReg) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ bool IsFixed = MFI.isFixedObjectIndex(FI);
+ // We can't calculate offset from frame pointer if the stack is realigned,
+ // so enforce usage of stack/base pointer. The base pointer is used when we
+ // have dynamic allocas in addition to dynamic realignment.
+ if (TRI->hasBasePointer(MF))
+ FrameReg = IsFixed ? TRI->getFramePtr() : TRI->getBaseRegister();
+ else if (TRI->needsStackRealignment(MF))
+ FrameReg = IsFixed ? TRI->getFramePtr() : TRI->getStackRegister();
+ else
+ FrameReg = TRI->getFrameRegister(MF);
+
+ // Offset will hold the offset from the stack pointer at function entry to the
+ // object.
+ // We need to factor in additional offsets applied during the prologue to the
+ // frame, base, and stack pointer depending on which is used.
+ int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
+ const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+ uint64_t StackSize = MFI.getStackSize();
+ bool HasFP = hasFP(MF);
+ bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+ int64_t FPDelta = 0;
+
+ // In an x86 interrupt, remove the offset we added to account for the return
+ // address from any stack object allocated in the caller's frame. Interrupts
+ // do not have a standard return address. Fixed objects in the current frame,
+ // such as SSE register spills, should not get this treatment.
+ if (MF.getFunction().getCallingConv() == CallingConv::X86_INTR &&
+ Offset >= 0) {
+ Offset += getOffsetOfLocalArea();
+ }
+
+ if (IsWin64Prologue) {
+ assert(!MFI.hasCalls() || (StackSize % 16) == 8);
+
+ // Calculate required stack adjustment.
+ uint64_t FrameSize = StackSize - SlotSize;
+ // If required, include space for an extra hidden slot for stashing the base
+ // pointer.
+ if (X86FI->getRestoreBasePointer())
+ FrameSize += SlotSize;
+ uint64_t NumBytes = FrameSize - CSSize;
+
+ uint64_t SEHFrameOffset = calculateSetFPREG(NumBytes);
+ if (FI && FI == X86FI->getFAIndex())
+ return StackOffset::getFixed(-SEHFrameOffset);
+
+ // FPDelta is the offset between the "traditional" FP location (the old base
+ // pointer followed by the return address) and the location required by the
+ // restricted Win64 prologue.
+ // Add FPDelta to all offsets below that go through the frame pointer.
+ FPDelta = FrameSize - SEHFrameOffset;
+ assert((!MFI.hasCalls() || (FPDelta % 16) == 0) &&
+ "FPDelta isn't aligned per the Win64 ABI!");
+ }
+
+
+ if (TRI->hasBasePointer(MF)) {
+ assert(HasFP && "VLAs and dynamic stack realign, but no FP?!");
+ if (FI < 0) {
+ // Skip the saved EBP.
+ return StackOffset::getFixed(Offset + SlotSize + FPDelta);
+ } else {
+ assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize)));
+ return StackOffset::getFixed(Offset + StackSize);
+ }
+ } else if (TRI->needsStackRealignment(MF)) {
+ if (FI < 0) {
+ // Skip the saved EBP.
+ return StackOffset::getFixed(Offset + SlotSize + FPDelta);
+ } else {
+ assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize)));
+ return StackOffset::getFixed(Offset + StackSize);
+ }
+ // FIXME: Support tail calls
+ } else {
+ if (!HasFP)
+ return StackOffset::getFixed(Offset + StackSize);
+
+ // Skip the saved EBP.
+ Offset += SlotSize;
+
+ // Skip the RETADDR move area
+ int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+ if (TailCallReturnAddrDelta < 0)
+ Offset -= TailCallReturnAddrDelta;
+ }
+
+ return StackOffset::getFixed(Offset + FPDelta);
+}
+
+int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, int FI,
+ Register &FrameReg) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
+ const auto it = WinEHXMMSlotInfo.find(FI);
+
+ if (it == WinEHXMMSlotInfo.end())
+ return getFrameIndexReference(MF, FI, FrameReg).getFixed();
+
+ FrameReg = TRI->getStackRegister();
+ return alignDown(MFI.getMaxCallFrameSize(), getStackAlign().value()) +
+ it->second;
+}
+
+StackOffset
+X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF, int FI,
+ Register &FrameReg,
+ int Adjustment) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ FrameReg = TRI->getStackRegister();
+ return StackOffset::getFixed(MFI.getObjectOffset(FI) -
+ getOffsetOfLocalArea() + Adjustment);
+}
+
+StackOffset
+X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,
+ int FI, Register &FrameReg,
+ bool IgnoreSPUpdates) const {
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ // Does not include any dynamic realign.
+ const uint64_t StackSize = MFI.getStackSize();
+ // LLVM arranges the stack as follows:
+ // ...
+ // ARG2
+ // ARG1
+ // RETADDR
+ // PUSH RBP <-- RBP points here
+ // PUSH CSRs
+ // ~~~~~~~ <-- possible stack realignment (non-win64)
+ // ...
+ // STACK OBJECTS
+ // ... <-- RSP after prologue points here
+ // ~~~~~~~ <-- possible stack realignment (win64)
+ //
+ // if (hasVarSizedObjects()):
+ // ... <-- "base pointer" (ESI/RBX) points here
+ // DYNAMIC ALLOCAS
+ // ... <-- RSP points here
+ //
+ // Case 1: In the simple case of no stack realignment and no dynamic
+ // allocas, both "fixed" stack objects (arguments and CSRs) are addressable
+ // with fixed offsets from RSP.
+ //
+ // Case 2: In the case of stack realignment with no dynamic allocas, fixed
+ // stack objects are addressed with RBP and regular stack objects with RSP.
+ //
+ // Case 3: In the case of dynamic allocas and stack realignment, RSP is used
+ // to address stack arguments for outgoing calls and nothing else. The "base
+ // pointer" points to local variables, and RBP points to fixed objects.
+ //
+ // In cases 2 and 3, we can only answer for non-fixed stack objects, and the
+ // answer we give is relative to the SP after the prologue, and not the
+ // SP in the middle of the function.
+
+ if (MFI.isFixedObjectIndex(FI) && TRI->needsStackRealignment(MF) &&
+ !STI.isTargetWin64())
+ return getFrameIndexReference(MF, FI, FrameReg);
+
+ // If !hasReservedCallFrame the function might have SP adjustment in the
+ // body. So, even though the offset is statically known, it depends on where
+ // we are in the function.
+ if (!IgnoreSPUpdates && !hasReservedCallFrame(MF))
+ return getFrameIndexReference(MF, FI, FrameReg);
+
+ // We don't handle tail calls, and shouldn't be seeing them either.
+ assert(MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta() >= 0 &&
+ "we don't handle this case!");
+
+ // This is how the math works out:
+ //
+ // %rsp grows (i.e. gets lower) left to right. Each box below is
+ // one word (eight bytes). Obj0 is the stack slot we're trying to
+ // get to.
+ //
+ // ----------------------------------
+ // | BP | Obj0 | Obj1 | ... | ObjN |
+ // ----------------------------------
+ // ^ ^ ^ ^
+ // A B C E
+ //
+ // A is the incoming stack pointer.
+ // (B - A) is the local area offset (-8 for x86-64) [1]
+ // (C - A) is the Offset returned by MFI.getObjectOffset for Obj0 [2]
+ //
+ // |(E - B)| is the StackSize (absolute value, positive). For a
+ // stack that grows down, this works out to be (B - E). [3]
+ //
+ // E is also the value of %rsp after stack has been set up, and we
+ // want (C - E) -- the value we can add to %rsp to get to Obj0. Now
+ // (C - E) == (C - A) - (B - A) + (B - E)
+ // { Using [1], [2] and [3] above }
+ // == getObjectOffset - LocalAreaOffset + StackSize
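+ //
+ // For example, on x86-64 (LocalAreaOffset == -8): with getObjectOffset == -24
+ // and StackSize == 40, the slot is at -24 - (-8) + 40 == 24 bytes above the
+ // post-prologue %rsp.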
+
+ return getFrameIndexReferenceSP(MF, FI, FrameReg, StackSize);
+}
+
+bool X86FrameLowering::assignCalleeSavedSpillSlots(
+ MachineFunction &MF, const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+
+ unsigned CalleeSavedFrameSize = 0;
+ unsigned XMMCalleeSavedFrameSize = 0;
+ auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
+ int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta();
+
+ int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+
+ if (TailCallReturnAddrDelta < 0) {
+ // create RETURNADDR area
+ // arg
+ // arg
+ // RETADDR
+ // { ...
+ // RETADDR area
+ // ...
+ // }
+ // [EBP]
+ MFI.CreateFixedObject(-TailCallReturnAddrDelta,
+ TailCallReturnAddrDelta - SlotSize, true);
+ }
+
+ // Spill the BasePtr if it's used.
+ if (this->TRI->hasBasePointer(MF)) {
+ // Allocate a spill slot for EBP if we have a base pointer and EH funclets.
+ if (MF.hasEHFunclets()) {
+ int FI = MFI.CreateSpillStackObject(SlotSize, Align(SlotSize));
+ X86FI->setHasSEHFramePtrSave(true);
+ X86FI->setSEHFramePtrSaveIndex(FI);
+ }
+ }
+
+ if (hasFP(MF)) {
+ // emitPrologue always spills the frame register first.
+ SpillSlotOffset -= SlotSize;
+ MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
+
+ // Since emitPrologue and emitEpilogue will handle spilling and restoring of
+ // the frame register, we can delete it from the CSI list and not have to worry
+ // about avoiding it later.
+ Register FPReg = TRI->getFrameRegister(MF);
+ for (unsigned i = 0; i < CSI.size(); ++i) {
+ if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) {
+ CSI.erase(CSI.begin() + i);
+ break;
+ }
+ }
+ }
+
+ // Assign slots for GPRs. It increases frame size.
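+ // For illustration, on x86-64 (SlotSize == 8, local area offset -8, no
+ // tail-call delta) with a frame pointer: the FP save above takes slot -16,
+ // and the GPR callee saves assigned here land at -24, -32, and so on.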
+ for (unsigned i = CSI.size(); i != 0; --i) {
+ unsigned Reg = CSI[i - 1].getReg();
+
+ if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
+ continue;
+
+ SpillSlotOffset -= SlotSize;
+ CalleeSavedFrameSize += SlotSize;
+
+ int SlotIndex = MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
+ CSI[i - 1].setFrameIdx(SlotIndex);
+ }
+
+ X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize);
+ MFI.setCVBytesOfCalleeSavedRegisters(CalleeSavedFrameSize);
+
+ // Assign slots for XMMs.
+ for (unsigned i = CSI.size(); i != 0; --i) {
+ unsigned Reg = CSI[i - 1].getReg();
+ if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
+ continue;
+
+ // If this is k-register make sure we lookup via the largest legal type.
+ MVT VT = MVT::Other;
+ if (X86::VK16RegClass.contains(Reg))
+ VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;
+
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
+ unsigned Size = TRI->getSpillSize(*RC);
+ Align Alignment = TRI->getSpillAlign(*RC);
+ // ensure alignment
+ assert(SpillSlotOffset < 0 && "SpillSlotOffset should always be < 0 on X86");
+ SpillSlotOffset = -alignTo(-SpillSlotOffset, Alignment);
+
+ // spill into slot
+ SpillSlotOffset -= Size;
+ int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset);
+ CSI[i - 1].setFrameIdx(SlotIndex);
+ MFI.ensureMaxAlignment(Alignment);
+
+ // Save the start offset and size of XMM in stack frame for funclets.
+ if (X86::VR128RegClass.contains(Reg)) {
+ WinEHXMMSlotInfo[SlotIndex] = XMMCalleeSavedFrameSize;
+ XMMCalleeSavedFrameSize += Size;
+ }
+ }
+
+ return true;
+}
+
+bool X86FrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
+ DebugLoc DL = MBB.findDebugLoc(MI);
+
+ // Don't save CSRs in 32-bit EH funclets. The caller saves EBX, EBP, ESI, EDI
+ // for us, and there are no XMM CSRs on Win32.
+ if (MBB.isEHFuncletEntry() && STI.is32Bit() && STI.isOSWindows())
+ return true;
+
+ // Push GPRs. It increases frame size.
+ const MachineFunction &MF = *MBB.getParent();
+ unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
+ for (unsigned i = CSI.size(); i != 0; --i) {
+ unsigned Reg = CSI[i - 1].getReg();
+
+ if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
+ continue;
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ bool isLiveIn = MRI.isLiveIn(Reg);
+ if (!isLiveIn)
+ MBB.addLiveIn(Reg);
+
+ // Decide whether we can add a kill flag to the use.
+ bool CanKill = !isLiveIn;
+ // Check if any subregister is live-in
+ if (CanKill) {
+ for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg) {
+ if (MRI.isLiveIn(*AReg)) {
+ CanKill = false;
+ break;
+ }
+ }
+ }
+
+ // Do not set a kill flag on values that are also marked as live-in. This
+ // happens with the @llvm.returnaddress intrinsic and with arguments
+ // passed in callee saved registers.
+ // Omitting the kill flags is conservatively correct even if the live-in
+ // is not used after all.
+ BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, getKillRegState(CanKill))
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // Spill the XMM registers. X86 has no push/pop instructions for XMM
+ // registers, so they are spilled to stack slots instead.
+ for (unsigned i = CSI.size(); i != 0; --i) {
+ unsigned Reg = CSI[i-1].getReg();
+ if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
+ continue;
+
+ // If this is k-register make sure we lookup via the largest legal type.
+ MVT VT = MVT::Other;
+ if (X86::VK16RegClass.contains(Reg))
+ VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;
+
+ // Add the callee-saved register as live-in. It's killed at the spill.
+ MBB.addLiveIn(Reg);
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
+
+ TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC,
+ TRI);
+ --MI;
+ MI->setFlag(MachineInstr::FrameSetup);
+ ++MI;
+ }
+
+ return true;
+}
+
+void X86FrameLowering::emitCatchRetReturnValue(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineInstr *CatchRet) const {
+ // SEH shouldn't use catchret.
+ assert(!isAsynchronousEHPersonality(classifyEHPersonality(
+ MBB.getParent()->getFunction().getPersonalityFn())) &&
+ "SEH should not use CATCHRET");
+ DebugLoc DL = CatchRet->getDebugLoc();
+ MachineBasicBlock *CatchRetTarget = CatchRet->getOperand(0).getMBB();
+
+ // Fill EAX/RAX with the address of the target block.
+ if (STI.is64Bit()) {
+ // LEA64r CatchRetTarget(%rip), %rax
+ BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), X86::RAX)
+ .addReg(X86::RIP)
+ .addImm(0)
+ .addReg(0)
+ .addMBB(CatchRetTarget)
+ .addReg(0);
+ } else {
+ // MOV32ri $CatchRetTarget, %eax
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
+ .addMBB(CatchRetTarget);
+ }
+
+ // Record that we've taken the address of CatchRetTarget and no longer just
+ // reference it in a terminator.
+ CatchRetTarget->setHasAddressTaken();
+}
+
+bool X86FrameLowering::restoreCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
+ if (CSI.empty())
+ return false;
+
+ if (MI != MBB.end() && isFuncletReturnInstr(*MI) && STI.isOSWindows()) {
+ // Don't restore CSRs in 32-bit EH funclets. Matches
+ // spillCalleeSavedRegisters.
+ if (STI.is32Bit())
+ return true;
+ // Don't restore CSRs before an SEH catchret. SEH except blocks do not form
+ // funclets. emitEpilogue transforms these to normal jumps.
+ if (MI->getOpcode() == X86::CATCHRET) {
+ const Function &F = MBB.getParent()->getFunction();
+ bool IsSEH = isAsynchronousEHPersonality(
+ classifyEHPersonality(F.getPersonalityFn()));
+ if (IsSEH)
+ return true;
+ }
+ }
+
+ DebugLoc DL = MBB.findDebugLoc(MI);
+
+ // Reload XMMs from stack frame.
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ if (X86::GR64RegClass.contains(Reg) ||
+ X86::GR32RegClass.contains(Reg))
+ continue;
+
+ // If this is k-register make sure we lookup via the largest legal type.
+ MVT VT = MVT::Other;
+ if (X86::VK16RegClass.contains(Reg))
+ VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;
+
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
+ TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI);
+ }
+
+ // POP GPRs.
+ unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ if (!X86::GR64RegClass.contains(Reg) &&
+ !X86::GR32RegClass.contains(Reg))
+ continue;
+
+ BuildMI(MBB, MI, DL, TII.get(Opc), Reg)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ }
+ return true;
+}
+
+void X86FrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+ // Spill the BasePtr if it's used.
+ if (TRI->hasBasePointer(MF)){
+ Register BasePtr = TRI->getBaseRegister();
+ if (STI.isTarget64BitILP32())
+ BasePtr = getX86SubSuperRegister(BasePtr, 64);
+ SavedRegs.set(BasePtr);
+ }
+}
+
+static bool
+HasNestArgument(const MachineFunction *MF) {
+ const Function &F = MF->getFunction();
+ for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end();
+ I != E; I++) {
+ if (I->hasNestAttr() && !I->use_empty())
+ return true;
+ }
+ return false;
+}
+
+/// GetScratchRegister - Get a temp register for performing work in the
+/// segmented stack and the Erlang/HiPE stack prologue. Depending on platform
+/// and the properties of the function either one or two registers will be
+/// needed. Set primary to true for the first register, false for the second.
+static unsigned
+GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Primary) {
+ CallingConv::ID CallingConvention = MF.getFunction().getCallingConv();
+
+ // Erlang stuff.
+ if (CallingConvention == CallingConv::HiPE) {
+ if (Is64Bit)
+ return Primary ? X86::R14 : X86::R13;
+ else
+ return Primary ? X86::EBX : X86::EDI;
+ }
+
+ if (Is64Bit) {
+ if (IsLP64)
+ return Primary ? X86::R11 : X86::R12;
+ else
+ return Primary ? X86::R11D : X86::R12D;
+ }
+
+ bool IsNested = HasNestArgument(&MF);
+
+ if (CallingConvention == CallingConv::X86_FastCall ||
+ CallingConvention == CallingConv::Fast ||
+ CallingConvention == CallingConv::Tail) {
+ if (IsNested)
+ report_fatal_error("Segmented stacks does not support fastcall with "
+ "nested function.");
+ return Primary ? X86::EAX : X86::ECX;
+ }
+ if (IsNested)
+ return Primary ? X86::EDX : X86::EAX;
+ return Primary ? X86::ECX : X86::EAX;
+}
+
+// The stack limit in the TCB is set to this many bytes above the actual stack
+// limit.
+static const uint64_t kSplitStackAvailable = 256;
+
+void X86FrameLowering::adjustForSegmentedStacks(
+ MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ uint64_t StackSize;
+ unsigned TlsReg, TlsOffset;
+ DebugLoc DL;
+
+ // To support shrink-wrapping we would need to insert the new blocks
+ // at the right place and update the branches to PrologueMBB.
+ assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet");
+
+ unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
+ assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
+ "Scratch register is live-in");
+
+ if (MF.getFunction().isVarArg())
+ report_fatal_error("Segmented stacks do not support vararg functions.");
+ if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() &&
+ !STI.isTargetWin64() && !STI.isTargetFreeBSD() &&
+ !STI.isTargetDragonFly())
+ report_fatal_error("Segmented stacks not supported on this platform.");
+
+ // Eventually StackSize will be calculated by a link-time pass, which will
+ // also decide whether checking code needs to be injected into this particular
+ // prologue.
+ StackSize = MFI.getStackSize();
+
+ // Do not generate a prologue for leaf functions with a stack of size zero.
+ // For non-leaf functions we have to allow for the possibility that the
+ // call is to a non-split function, as in PR37807. This function could also
+ // take the address of a non-split function. When the linker tries to adjust
+ // its non-existent prologue, it would fail with an error. Mark the object
+ // file so that such failures are not errors. See this Go language bug-report
+ // https://go-review.googlesource.com/c/go/+/148819/
+ if (StackSize == 0 && !MFI.hasTailCall()) {
+ MF.getMMI().setHasNosplitStack(true);
+ return;
+ }
+
+ MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ bool IsNested = false;
+
+ // We need to know if the function has a nest argument only in 64 bit mode.
+ if (Is64Bit)
+ IsNested = HasNestArgument(&MF);
+
+ // The MOV R10, RAX needs to be in a different block, since the RET we emit in
+ // allocMBB needs to be the last (terminating) instruction.
+
+ for (const auto &LI : PrologueMBB.liveins()) {
+ allocMBB->addLiveIn(LI);
+ checkMBB->addLiveIn(LI);
+ }
+
+ if (IsNested)
+ allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D);
+
+ MF.push_front(allocMBB);
+ MF.push_front(checkMBB);
+
+ // When the frame size is less than 256 we just compare the stack
+ // boundary directly to the value of the stack pointer, per gcc.
+ bool CompareStackPointer = StackSize < kSplitStackAvailable;
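+ // For illustration, on x86-64 Linux (LP64) this emits roughly:
+ //   small frames (StackSize < 256): cmpq %fs:0x70, %rsp
+ //   larger frames:                  leaq -StackSize(%rsp), %r11
+ //                                   cmpq %fs:0x70, %r11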
+
+ // Read the limit of the current stacklet from the stack_guard location.
+ if (Is64Bit) {
+ if (STI.isTargetLinux()) {
+ TlsReg = X86::FS;
+ TlsOffset = IsLP64 ? 0x70 : 0x40;
+ } else if (STI.isTargetDarwin()) {
+ TlsReg = X86::GS;
+ TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90.
+ } else if (STI.isTargetWin64()) {
+ TlsReg = X86::GS;
+ TlsOffset = 0x28; // pvArbitrary, reserved for application use
+ } else if (STI.isTargetFreeBSD()) {
+ TlsReg = X86::FS;
+ TlsOffset = 0x18;
+ } else if (STI.isTargetDragonFly()) {
+ TlsReg = X86::FS;
+ TlsOffset = 0x20; // use tls_tcb.tcb_segstack
+ } else {
+ report_fatal_error("Segmented stacks not supported on this platform.");
+ }
+
+ if (CompareStackPointer)
+ ScratchReg = IsLP64 ? X86::RSP : X86::ESP;
+ else
+ BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r), ScratchReg).addReg(X86::RSP)
+ .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
+
+ BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm)).addReg(ScratchReg)
+ .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg);
+ } else {
+ if (STI.isTargetLinux()) {
+ TlsReg = X86::GS;
+ TlsOffset = 0x30;
+ } else if (STI.isTargetDarwin()) {
+ TlsReg = X86::GS;
+ TlsOffset = 0x48 + 90*4;
+ } else if (STI.isTargetWin32()) {
+ TlsReg = X86::FS;
+ TlsOffset = 0x14; // pvArbitrary, reserved for application use
+ } else if (STI.isTargetDragonFly()) {
+ TlsReg = X86::FS;
+ TlsOffset = 0x10; // use tls_tcb.tcb_segstack
+ } else if (STI.isTargetFreeBSD()) {
+ report_fatal_error("Segmented stacks not supported on FreeBSD i386.");
+ } else {
+ report_fatal_error("Segmented stacks not supported on this platform.");
+ }
+
+ if (CompareStackPointer)
+ ScratchReg = X86::ESP;
+ else
+ BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP)
+ .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
+
+ if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() ||
+ STI.isTargetDragonFly()) {
+ BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg)
+ .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
+ } else if (STI.isTargetDarwin()) {
+
+ // TlsOffset doesn't fit into a mod r/m byte so we need an extra register.
+ unsigned ScratchReg2;
+ bool SaveScratch2;
+ if (CompareStackPointer) {
+ // The primary scratch register is available for holding the TLS offset.
+ ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true);
+ SaveScratch2 = false;
+ } else {
+ // Need to use a second register to hold the TLS offset
+ ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false);
+
+ // Unfortunately, with fastcc the second scratch register may hold an
+ // argument.
+ SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2);
+ }
+
+ // If Scratch2 is live-in then it needs to be saved.
+ assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) &&
+ "Scratch register is live-in and not saved");
+
+ if (SaveScratch2)
+ BuildMI(checkMBB, DL, TII.get(X86::PUSH32r))
+ .addReg(ScratchReg2, RegState::Kill);
+
+ BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2)
+ .addImm(TlsOffset);
+ BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
+ .addReg(ScratchReg)
+ .addReg(ScratchReg2).addImm(1).addReg(0)
+ .addImm(0)
+ .addReg(TlsReg);
+
+ if (SaveScratch2)
+ BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2);
+ }
+ }
+
+ // This jump is taken if SP >= (Stacklet Limit + Stack Space required).
+ // It jumps to normal execution of the function body.
+ BuildMI(checkMBB, DL, TII.get(X86::JCC_1)).addMBB(&PrologueMBB).addImm(X86::COND_A);
+
+ // On 32 bit we first push the arguments size and then the frame size. On 64
+ // bit, we pass the stack frame size in r10 and the argument size in r11.
+ if (Is64Bit) {
+ // Functions with nested arguments use R10, so it needs to be saved across
+ // the call to _morestack
+
+ const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX;
+ const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D;
+ const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D;
+ const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr;
+ const unsigned MOVri = IsLP64 ? X86::MOV64ri : X86::MOV32ri;
+
+ if (IsNested)
+ BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10);
+
+ BuildMI(allocMBB, DL, TII.get(MOVri), Reg10)
+ .addImm(StackSize);
+ BuildMI(allocMBB, DL, TII.get(MOVri), Reg11)
+ .addImm(X86FI->getArgumentStackSize());
+ } else {
+ BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
+ .addImm(X86FI->getArgumentStackSize());
+ BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
+ .addImm(StackSize);
+ }
+
+ // __morestack is in libgcc
+ if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
+ // Under the large code model, we cannot assume that __morestack lives
+ // within 2^31 bytes of the call site, so we cannot use pc-relative
+ // addressing. We cannot perform the call via a temporary register,
+ // as the rax register may be used to store the static chain, and all
+ // other suitable registers may be either callee-save or used for
+ // parameter passing. We cannot use the stack at this point either
+ // because __morestack manipulates the stack directly.
+ //
+ // To avoid these issues, perform an indirect call via a read-only memory
+ // location containing the address.
+ //
+ // This solution is not perfect, as it assumes that the .rodata section
+ // is laid out within 2^31 bytes of each function body, but this seems
+ // to be sufficient for JIT.
+ // FIXME: Add retpoline support and remove the error here.
+ if (STI.useIndirectThunkCalls())
+ report_fatal_error("Emitting morestack calls on 64-bit with the large "
+ "code model and thunks not yet implemented.");
+ BuildMI(allocMBB, DL, TII.get(X86::CALL64m))
+ .addReg(X86::RIP)
+ .addImm(0)
+ .addReg(0)
+ .addExternalSymbol("__morestack_addr")
+ .addReg(0);
+ MF.getMMI().setUsesMorestackAddr(true);
+ } else {
+ if (Is64Bit)
+ BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32))
+ .addExternalSymbol("__morestack");
+ else
+ BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32))
+ .addExternalSymbol("__morestack");
+ }
+
+ if (IsNested)
+ BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10));
+ else
+ BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET));
+
+ allocMBB->addSuccessor(&PrologueMBB);
+
+ checkMBB->addSuccessor(allocMBB, BranchProbability::getZero());
+ checkMBB->addSuccessor(&PrologueMBB, BranchProbability::getOne());
+
+#ifdef EXPENSIVE_CHECKS
+ MF.verify();
+#endif
+}
+
+/// Lookup an ERTS parameter in the !hipe.literals named metadata node.
+/// HiPE provides Erlang Runtime System-internal parameters, such as PCB offsets
+/// to fields it needs, through a named metadata node "hipe.literals" containing
+/// name-value pairs.
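+/// For illustration (with hypothetical values), the module-level metadata is
+/// expected to look roughly like:
+///   !hipe.literals = !{!0, !1}
+///   !0 = !{!"P_NSP_LIMIT", i32 152}
+///   !1 = !{!"AMD64_LEAF_WORDS", i32 24}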
+static unsigned getHiPELiteral(
+ NamedMDNode *HiPELiteralsMD, const StringRef LiteralName) {
+ for (int i = 0, e = HiPELiteralsMD->getNumOperands(); i != e; ++i) {
+ MDNode *Node = HiPELiteralsMD->getOperand(i);
+ if (Node->getNumOperands() != 2) continue;
+ MDString *NodeName = dyn_cast<MDString>(Node->getOperand(0));
+ ValueAsMetadata *NodeVal = dyn_cast<ValueAsMetadata>(Node->getOperand(1));
+ if (!NodeName || !NodeVal) continue;
+ ConstantInt *ValConst = dyn_cast_or_null<ConstantInt>(NodeVal->getValue());
+ if (ValConst && NodeName->getString() == LiteralName) {
+ return ValConst->getZExtValue();
+ }
+ }
+
+ report_fatal_error("HiPE literal " + LiteralName
+ + " required but not provided");
+}
+
+// Return true if there are no non-ehpad successors to MBB and there are no
+// non-meta instructions between MBBI and MBB.end().
+static bool blockEndIsUnreachable(const MachineBasicBlock &MBB,
+ MachineBasicBlock::const_iterator MBBI) {
+ return llvm::all_of(
+ MBB.successors(),
+ [](const MachineBasicBlock *Succ) { return Succ->isEHPad(); }) &&
+ std::all_of(MBBI, MBB.end(), [](const MachineInstr &MI) {
+ return MI.isMetaInstruction();
+ });
+}
+
+/// Erlang programs may need a special prologue to handle the stack size they
+/// might need at runtime. That is because Erlang/OTP does not implement a C
+/// stack but uses a custom hybrid stack/heap architecture.
+/// (for more information see Eric Stenman's Ph.D. thesis:
+/// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
+///
+/// CheckStack:
+/// temp0 = sp - MaxStack
+/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+/// OldStart:
+/// ...
+/// IncStack:
+/// call inc_stack # doubles the stack space
+/// temp0 = sp - MaxStack
+/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+void X86FrameLowering::adjustForHiPEPrologue(
+ MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ DebugLoc DL;
+
+ // To support shrink-wrapping we would need to insert the new blocks
+ // at the right place and update the branches to PrologueMBB.
+ assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet");
+
+ // HiPE-specific values
+ NamedMDNode *HiPELiteralsMD = MF.getMMI().getModule()
+ ->getNamedMetadata("hipe.literals");
+ if (!HiPELiteralsMD)
+ report_fatal_error(
+ "Can't generate HiPE prologue without runtime parameters");
+ const unsigned HipeLeafWords
+ = getHiPELiteral(HiPELiteralsMD,
+ Is64Bit ? "AMD64_LEAF_WORDS" : "X86_LEAF_WORDS");
+ const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5;
+ const unsigned Guaranteed = HipeLeafWords * SlotSize;
+ unsigned CallerStkArity = MF.getFunction().arg_size() > CCRegisteredArgs ?
+ MF.getFunction().arg_size() - CCRegisteredArgs : 0;
+ unsigned MaxStack = MFI.getStackSize() + CallerStkArity*SlotSize + SlotSize;
+
+ assert(STI.isTargetLinux() &&
+ "HiPE prologue is only supported on Linux operating systems.");
+
+ // Compute the largest caller's frame that is needed to fit the callees'
+ // frames. This 'MaxStack' is computed from:
+ //
+ // a) the fixed frame size, which is the space needed for all spilled temps,
+ // b) outgoing on-stack parameter areas, and
+ // c) the minimum stack space this function needs to make available for the
+ // functions it calls (a tunable ABI property).
+ if (MFI.hasCalls()) {
+ unsigned MoreStackForCalls = 0;
+
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ if (!MI.isCall())
+ continue;
+
+ // Get callee operand.
+ const MachineOperand &MO = MI.getOperand(0);
+
+ // Only take account of global function calls (no closures etc.).
+ if (!MO.isGlobal())
+ continue;
+
+ const Function *F = dyn_cast<Function>(MO.getGlobal());
+ if (!F)
+ continue;
+
+ // Do not update 'MaxStack' for primitive and built-in functions
+ // (encoded with names either starting with "erlang."/"bif_" or not
+ // having a ".", such as a simple <Module>.<Function>.<Arity>, or an
+ // "_", such as the BIF "suspend_0") as they are executed on another
+ // stack.
+ if (F->getName().find("erlang.") != StringRef::npos ||
+ F->getName().find("bif_") != StringRef::npos ||
+ F->getName().find_first_of("._") == StringRef::npos)
+ continue;
+
+ unsigned CalleeStkArity =
+ F->arg_size() > CCRegisteredArgs ? F->arg_size()-CCRegisteredArgs : 0;
+ if (HipeLeafWords - 1 > CalleeStkArity)
+ MoreStackForCalls = std::max(MoreStackForCalls,
+ (HipeLeafWords - 1 - CalleeStkArity) * SlotSize);
+ }
+ }
+ MaxStack += MoreStackForCalls;
+ }
+
+ // If the stack frame needed is larger than the guaranteed size, then runtime
+ // checks and calls to the "inc_stack_0" BIF should be inserted in the
+ // assembly prologue.
+ if (MaxStack > Guaranteed) {
+ MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock();
+
+ for (const auto &LI : PrologueMBB.liveins()) {
+ stackCheckMBB->addLiveIn(LI);
+ incStackMBB->addLiveIn(LI);
+ }
+
+ MF.push_front(incStackMBB);
+ MF.push_front(stackCheckMBB);
+
+ unsigned ScratchReg, SPReg, PReg, SPLimitOffset;
+ unsigned LEAop, CMPop, CALLop;
+ SPLimitOffset = getHiPELiteral(HiPELiteralsMD, "P_NSP_LIMIT");
+ if (Is64Bit) {
+ SPReg = X86::RSP;
+ PReg = X86::RBP;
+ LEAop = X86::LEA64r;
+ CMPop = X86::CMP64rm;
+ CALLop = X86::CALL64pcrel32;
+ } else {
+ SPReg = X86::ESP;
+ PReg = X86::EBP;
+ LEAop = X86::LEA32r;
+ CMPop = X86::CMP32rm;
+ CALLop = X86::CALLpcrel32;
+ }
+
+ ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
+ assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
+ "HiPE prologue scratch register is live-in");
+
+ // Create new MBB for StackCheck:
+ addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg),
+ SPReg, false, -MaxStack);
+ // SPLimitOffset is in a fixed heap location (pointed to by BP).
+ addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop))
+ .addReg(ScratchReg), PReg, false, SPLimitOffset);
+ BuildMI(stackCheckMBB, DL, TII.get(X86::JCC_1)).addMBB(&PrologueMBB).addImm(X86::COND_AE);
+
+ // Create new MBB for IncStack:
+ BuildMI(incStackMBB, DL, TII.get(CALLop)).
+ addExternalSymbol("inc_stack_0");
+ addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg),
+ SPReg, false, -MaxStack);
+ addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop))
+ .addReg(ScratchReg), PReg, false, SPLimitOffset);
+ BuildMI(incStackMBB, DL, TII.get(X86::JCC_1)).addMBB(incStackMBB).addImm(X86::COND_LE);
+
+ stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100});
+ stackCheckMBB->addSuccessor(incStackMBB, {1, 100});
+ incStackMBB->addSuccessor(&PrologueMBB, {99, 100});
+ incStackMBB->addSuccessor(incStackMBB, {1, 100});
+ }
+#ifdef EXPENSIVE_CHECKS
+ MF.verify();
+#endif
+}
+
+bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL,
+ int Offset) const {
+ if (Offset <= 0)
+ return false;
+
+ if (Offset % SlotSize)
+ return false;
+
+ int NumPops = Offset / SlotSize;
+ // This is only worth it if we have at most 2 pops.
+ if (NumPops != 1 && NumPops != 2)
+ return false;
+
+ // Handle only the trivial case where the adjustment directly follows
+ // a call. This is the most common one, anyway.
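+ //
+ // For illustration, assuming %rcx is clobbered by the call and otherwise
+ // dead, the transformation is roughly:
+ //   call foo                    call foo
+ //   add $8, %rsp        ==>     pop %rcx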
+ if (MBBI == MBB.begin())
+ return false;
+ MachineBasicBlock::iterator Prev = std::prev(MBBI);
+ if (!Prev->isCall() || !Prev->getOperand(1).isRegMask())
+ return false;
+
+ unsigned Regs[2];
+ unsigned FoundRegs = 0;
+
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const MachineOperand &RegMask = Prev->getOperand(1);
+
+ auto &RegClass =
+ Is64Bit ? X86::GR64_NOREX_NOSPRegClass : X86::GR32_NOREX_NOSPRegClass;
+ // Try to find up to NumPops free registers.
+ for (auto Candidate : RegClass) {
+ // Poor man's liveness:
+ // Since we're immediately after a call, any register that is clobbered
+ // by the call and not defined by it can be considered dead.
+ if (!RegMask.clobbersPhysReg(Candidate))
+ continue;
+
+ // Don't clobber reserved registers
+ if (MRI.isReserved(Candidate))
+ continue;
+
+ bool IsDef = false;
+ for (const MachineOperand &MO : Prev->implicit_operands()) {
+ if (MO.isReg() && MO.isDef() &&
+ TRI->isSuperOrSubRegisterEq(MO.getReg(), Candidate)) {
+ IsDef = true;
+ break;
+ }
+ }
+
+ if (IsDef)
+ continue;
+
+ Regs[FoundRegs++] = Candidate;
+ if (FoundRegs == (unsigned)NumPops)
+ break;
+ }
+
+ if (FoundRegs == 0)
+ return false;
+
+ // If we found only one free register, but need two, reuse the same one twice.
+ while (FoundRegs < (unsigned)NumPops)
+ Regs[FoundRegs++] = Regs[0];
+
+ for (int i = 0; i < NumPops; ++i)
+ BuildMI(MBB, MBBI, DL,
+ TII.get(STI.is64Bit() ? X86::POP64r : X86::POP32r), Regs[i]);
+
+ return true;
+}
+
+MachineBasicBlock::iterator X86FrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ bool reserveCallFrame = hasReservedCallFrame(MF);
+ unsigned Opcode = I->getOpcode();
+ bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
+ DebugLoc DL = I->getDebugLoc();
+ uint64_t Amount = TII.getFrameSize(*I);
+ uint64_t InternalAmt = (isDestroy || Amount) ? TII.getFrameAdjustment(*I) : 0;
+ I = MBB.erase(I);
+ auto InsertPos = skipDebugInstructionsForward(I, MBB.end());
+
+ // Try to avoid emitting dead SP adjustments if the block end is unreachable,
+ // typically because the function is marked noreturn (abort, throw,
+ // assert_fail, etc).
+ if (isDestroy && blockEndIsUnreachable(MBB, I))
+ return I;
+
+ if (!reserveCallFrame) {
+ // If the stack pointer can be changed after prologue, turn the
+ // adjcallstackdown instruction into a 'sub ESP, <amt>' and the
+ // adjcallstackup instruction into 'add ESP, <amt>'.
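+ // For illustration, for a call that needs 16 bytes of outgoing argument
+ // space (and no callee-popped bytes), the frame-setup pseudo becomes roughly
+ // 'sub ESP, 16' before the call and the frame-destroy pseudo becomes
+ // 'add ESP, 16' after it.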
+
+ // We need to keep the stack aligned properly. To do this, we round the
+ // amount of space needed for the outgoing arguments up to the next
+ // alignment boundary.
+ Amount = alignTo(Amount, getStackAlign());
+
+ const Function &F = MF.getFunction();
+ bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+ bool DwarfCFI = !WindowsCFI && MF.needsFrameMoves();
+
+ // If we have any exception handlers in this function, and we adjust
+ // the SP before calls, we may need to indicate this to the unwinder
+ // using GNU_ARGS_SIZE. Note that this may be necessary even when
+ // Amount == 0, because the preceding function may have set a non-0
+ // GNU_ARGS_SIZE.
+ // TODO: We don't need to reset this between subsequent functions,
+ // if it didn't change.
+ bool HasDwarfEHHandlers = !WindowsCFI && !MF.getLandingPads().empty();
+
+ if (HasDwarfEHHandlers && !isDestroy &&
+ MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences())
+ BuildCFI(MBB, InsertPos, DL,
+ MCCFIInstruction::createGnuArgsSize(nullptr, Amount));
+
+ if (Amount == 0)
+ return I;
+
+ // Factor out the amount that gets handled inside the sequence
+ // (Pushes of argument for frame setup, callee pops for frame destroy)
+ Amount -= InternalAmt;
+
+ // TODO: This is needed only if we require precise CFA.
+ // If this is a callee-pop calling convention, emit a CFA adjust for
+ // the amount the callee popped.
+ if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF))
+ BuildCFI(MBB, InsertPos, DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt));
+
+ // Add Amount to SP to destroy a frame, or subtract to setup.
+ int64_t StackAdjustment = isDestroy ? Amount : -Amount;
+
+ if (StackAdjustment) {
+ // Merge with any previous or following adjustment instruction. Note: the
+ // instructions merged with here do not have CFI, so their stack
+ // adjustments do not feed into CfaAdjustment.
+ StackAdjustment += mergeSPUpdates(MBB, InsertPos, true);
+ StackAdjustment += mergeSPUpdates(MBB, InsertPos, false);
+
+ if (StackAdjustment) {
+ if (!(F.hasMinSize() &&
+ adjustStackWithPops(MBB, InsertPos, DL, StackAdjustment)))
+ BuildStackAdjustment(MBB, InsertPos, DL, StackAdjustment,
+ /*InEpilogue=*/false);
+ }
+ }
+
+ if (DwarfCFI && !hasFP(MF)) {
+ // If we don't have FP, but need to generate unwind information,
+ // we need to set the correct CFA offset after the stack adjustment.
+ // How much we adjust the CFA offset depends on whether we're emitting
+ // CFI only for EH purposes or for debugging. EH only requires the CFA
+ // offset to be correct at each call site, while for debugging we want
+ // it to be more precise.
+
+ int64_t CfaAdjustment = -StackAdjustment;
+ // TODO: When not using precise CFA, we also need to adjust for the
+ // InternalAmt here.
+ if (CfaAdjustment) {
+ BuildCFI(MBB, InsertPos, DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr,
+ CfaAdjustment));
+ }
+ }
+
+ return I;
+ }
+
+ if (InternalAmt) {
+ MachineBasicBlock::iterator CI = I;
+ MachineBasicBlock::iterator B = MBB.begin();
+ while (CI != B && !std::prev(CI)->isCall())
+ --CI;
+ BuildStackAdjustment(MBB, CI, DL, -InternalAmt, /*InEpilogue=*/false);
+ }
+
+ return I;
+}
+
+bool X86FrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const {
+ assert(MBB.getParent() && "Block is not attached to a function!");
+ const MachineFunction &MF = *MBB.getParent();
+ return !TRI->needsStackRealignment(MF) || !MBB.isLiveIn(X86::EFLAGS);
+}
+
+bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
+ assert(MBB.getParent() && "Block is not attached to a function!");
+
+ // Win64 has strict requirements in terms of epilogue and we are
+ // not taking a chance at messing with them.
+ // I.e., unless this block is already an exit block, we can't use
+ // it as an epilogue.
+ if (STI.isTargetWin64() && !MBB.succ_empty() && !MBB.isReturnBlock())
+ return false;
+
+ if (canUseLEAForSPInEpilogue(*MBB.getParent()))
+ return true;
+
+ // If we cannot use LEA to adjust SP, we may need to use ADD, which
+ // clobbers the EFLAGS. Check that we do not need to preserve it; otherwise,
+ // conservatively assume it is not safe to insert the epilogue here.
+ return !flagsNeedToBePreservedBeforeTheTerminators(MBB);
+}
+
+bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
+ // If we may need to emit frameless compact unwind information, give
+ // up as this is currently broken: PR25614.
+ bool CompactUnwind =
+ MF.getMMI().getContext().getObjectFileInfo()->getCompactUnwindSection() !=
+ nullptr;
+ return (MF.getFunction().hasFnAttribute(Attribute::NoUnwind) || hasFP(MF) ||
+ !CompactUnwind) &&
+ // The lowering of segmented stack and HiPE only support entry
+ // blocks as prologue blocks: PR26107. This limitation may be
+ // lifted if we fix:
+ // - adjustForSegmentedStacks
+ // - adjustForHiPEPrologue
+ MF.getFunction().getCallingConv() != CallingConv::HiPE &&
+ !MF.shouldSplitStack();
+}
+
+MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, bool RestoreSP) const {
+ assert(STI.isTargetWindowsMSVC() && "funclets only supported in MSVC env");
+ assert(STI.isTargetWin32() && "EBP/ESI restoration only required on win32");
+ assert(STI.is32Bit() && !Uses64BitFramePtr &&
+ "restoring EBP/ESI on non-32-bit target");
+
+ MachineFunction &MF = *MBB.getParent();
+ Register FramePtr = TRI->getFrameRegister(MF);
+ Register BasePtr = TRI->getBaseRegister();
+ WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // FIXME: Don't set FrameSetup flag in catchret case.
+
+ int FI = FuncInfo.EHRegNodeFrameIndex;
+ int EHRegSize = MFI.getObjectSize(FI);
+
+ if (RestoreSP) {
+ // MOV32rm -EHRegSize(%ebp), %esp
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), X86::ESP),
+ X86::EBP, true, -EHRegSize)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ Register UsedReg;
+ int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg).getFixed();
+ int EndOffset = -EHRegOffset - EHRegSize;
+ FuncInfo.EHRegNodeEndOffset = EndOffset;
+
+ if (UsedReg == FramePtr) {
+ // ADD $offset, %ebp
+ unsigned ADDri = getADDriOpcode(false, EndOffset);
+ BuildMI(MBB, MBBI, DL, TII.get(ADDri), FramePtr)
+ .addReg(FramePtr)
+ .addImm(EndOffset)
+ .setMIFlag(MachineInstr::FrameSetup)
+ ->getOperand(3)
+ .setIsDead();
+ assert(EndOffset >= 0 &&
+ "end of registration object above normal EBP position!");
+ } else if (UsedReg == BasePtr) {
+ // LEA offset(%ebp), %esi
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA32r), BasePtr),
+ FramePtr, false, EndOffset)
+ .setMIFlag(MachineInstr::FrameSetup);
+ // MOV32rm SavedEBPOffset(%esi), %ebp
+ assert(X86FI->getHasSEHFramePtrSave());
+ int Offset =
+ getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg)
+ .getFixed();
+ assert(UsedReg == BasePtr);
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), FramePtr),
+ UsedReg, true, Offset)
+ .setMIFlag(MachineInstr::FrameSetup);
+ } else {
+ llvm_unreachable("32-bit frames with WinEH must use FramePtr or BasePtr");
+ }
+ return MBBI;
+}
+
+int X86FrameLowering::getInitialCFAOffset(const MachineFunction &MF) const {
+ return TRI->getSlotSize();
+}
+
+Register
+X86FrameLowering::getInitialCFARegister(const MachineFunction &MF) const {
+ return TRI->getDwarfRegNum(StackPtr, true);
+}
+
+namespace {
+// Struct used by orderFrameObjects to help sort the stack objects.
+struct X86FrameSortingObject {
+ bool IsValid = false; // true if we care about this Object.
+ unsigned ObjectIndex = 0; // Index of Object into MFI list.
+ unsigned ObjectSize = 0; // Size of Object in bytes.
+ Align ObjectAlignment = Align(1); // Alignment of Object in bytes.
+ unsigned ObjectNumUses = 0; // Object static number of uses.
+};
+
+// The comparison function we use for std::sort to order our local
+// stack symbols. The current algorithm is to use an estimated
+// "density". This takes into consideration the size and number of
+// uses each object has in order to roughly minimize code size.
+// So, for example, an object of size 16B that is referenced 5 times
+// will get higher priority than 4 4B objects referenced 1 time each.
+// It's not perfect and we may be able to squeeze a few more bytes out of
+// it (for example : 0(esp) requires fewer bytes, symbols allocated at the
+// fringe end can have special consideration, given their size is less
+// important, etc.), but the algorithmic complexity grows too much to be
+// worth the extra gains we get. This gets us pretty close.
+// The final order leaves us with objects with highest priority going
+// at the end of our list.
+struct X86FrameSortingComparator {
+ inline bool operator()(const X86FrameSortingObject &A,
+ const X86FrameSortingObject &B) const {
+ uint64_t DensityAScaled, DensityBScaled;
+
+ // For consistency in our comparison, all invalid objects are placed
+ // at the end. This also allows us to stop walking when we hit the
+ // first invalid item after it's all sorted.
+ if (!A.IsValid)
+ return false;
+ if (!B.IsValid)
+ return true;
+
+ // The density is calculated by doing :
+ // (double)DensityA = A.ObjectNumUses / A.ObjectSize
+ // (double)DensityB = B.ObjectNumUses / B.ObjectSize
+ // Since this approach may cause inconsistencies in
+ // the floating point <, >, == comparisons, depending on the floating
+ // point model with which the compiler was built, we're going
+ // to scale both sides by multiplying with
+ // A.ObjectSize * B.ObjectSize. This ends up factoring away
+ // the division and, with it, the need for any floating point
+ // arithmetic.
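+ //
+ // For example, a 16-byte object with 5 uses scores 5 * 4 = 20 against a
+ // 4-byte object with 1 use scoring 1 * 16 = 16, so the denser 16-byte
+ // object compares greater and ends up later (higher priority) in the list.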
+ DensityAScaled = static_cast<uint64_t>(A.ObjectNumUses) *
+ static_cast<uint64_t>(B.ObjectSize);
+ DensityBScaled = static_cast<uint64_t>(B.ObjectNumUses) *
+ static_cast<uint64_t>(A.ObjectSize);
+
+ // If the two densities are equal, prioritize highest alignment
+ // objects. This allows for similar alignment objects
+ // to be packed together (given the same density).
+ // There's room for improvement here, also, since we can pack
+ // similar alignment (different density) objects next to each
+ // other to save padding. This will also require further
+ // complexity/iterations, and the overall gain isn't worth it,
+ // in general. Something to keep in mind, though.
+ if (DensityAScaled == DensityBScaled)
+ return A.ObjectAlignment < B.ObjectAlignment;
+
+ return DensityAScaled < DensityBScaled;
+ }
+};
+} // namespace
+
+// Order the symbols in the local stack.
+// We want to place the local stack objects in some sort of sensible order.
+// The heuristic we use is to try and pack them according to static number
+// of uses and size of object in order to minimize code size.
+void X86FrameLowering::orderFrameObjects(
+ const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // Don't waste time if there's nothing to do.
+ if (ObjectsToAllocate.empty())
+ return;
+
+ // Create an array of all MFI objects. We won't need all of these
+ // objects, but we're going to create a full array of them to make
+ // it easier to index into when we're counting "uses" down below.
+ // We want to be able to easily/cheaply access an object by simply
+ // indexing into it, instead of having to search for it every time.
+ std::vector<X86FrameSortingObject> SortingObjects(MFI.getObjectIndexEnd());
+
+ // Walk the objects we care about and mark them as such in our working
+ // struct.
+ for (auto &Obj : ObjectsToAllocate) {
+ SortingObjects[Obj].IsValid = true;
+ SortingObjects[Obj].ObjectIndex = Obj;
+ SortingObjects[Obj].ObjectAlignment = MFI.getObjectAlign(Obj);
+ // Set the size.
+ int ObjectSize = MFI.getObjectSize(Obj);
+ if (ObjectSize == 0)
+ // Variable size. Just use 4.
+ SortingObjects[Obj].ObjectSize = 4;
+ else
+ SortingObjects[Obj].ObjectSize = ObjectSize;
+ }
+
+ // Count the number of uses for each object.
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ if (MI.isDebugInstr())
+ continue;
+ for (const MachineOperand &MO : MI.operands()) {
+ // Check to see if it's a local stack symbol.
+ if (!MO.isFI())
+ continue;
+ int Index = MO.getIndex();
+ // Check to see if it falls within our range, and is tagged
+ // to require ordering.
+ if (Index >= 0 && Index < MFI.getObjectIndexEnd() &&
+ SortingObjects[Index].IsValid)
+ SortingObjects[Index].ObjectNumUses++;
+ }
+ }
+ }
+
+ // Sort the objects using X86FrameSortingComparator (see its comment for
+ // info).
+ llvm::stable_sort(SortingObjects, X86FrameSortingComparator());
+
+ // Now modify the original list to represent the final order that
+ // we want. The order will depend on whether we're going to access them
+ // from the stack pointer or the frame pointer. For SP, the objects we want
+ // at smaller offsets should end up at the end of the list.
+ // For FP, it should be flipped.
+ int i = 0;
+ for (auto &Obj : SortingObjects) {
+ // All invalid items are sorted at the end, so it's safe to stop.
+ if (!Obj.IsValid)
+ break;
+ ObjectsToAllocate[i++] = Obj.ObjectIndex;
+ }
+
+ // Flip it if we're accessing off of the FP.
+ if (!TRI->needsStackRealignment(MF) && hasFP(MF))
+ std::reverse(ObjectsToAllocate.begin(), ObjectsToAllocate.end());
+}
+
+
+unsigned X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const {
+ // RDX, the parent frame pointer, is homed into 16(%rsp) in the prologue.
+ unsigned Offset = 16;
+ // RBP is immediately pushed.
+ Offset += SlotSize;
+ // All callee-saved registers are then pushed.
+ Offset += MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize();
+ // Every funclet allocates enough stack space for the largest outgoing call.
+ Offset += getWinEHFuncletFrameSize(MF);
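+ // For illustration, with 24 bytes of additional pushed CSRs and a 40-byte
+ // funclet frame, the parent frame pointer would be read from
+ // 16 + 8 + 24 + 40 = 88 bytes above the funclet's RSP.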
+ return Offset;
+}
+
+void X86FrameLowering::processFunctionBeforeFrameFinalized(
+ MachineFunction &MF, RegScavenger *RS) const {
+ // Mark the function as not having WinCFI. We will set it back to true in
+ // emitPrologue if it gets called and emits CFI.
+ MF.setHasWinCFI(false);
+
+ // If we are using Windows x64 CFI, ensure that the stack is always 8 byte
+ // aligned. The format doesn't support misaligned stack adjustments.
+ if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
+ MF.getFrameInfo().ensureMaxAlignment(Align(SlotSize));
+
+ // If this function isn't doing Win64-style C++ EH, we don't need to do
+ // anything.
+ if (STI.is64Bit() && MF.hasEHFunclets() &&
+ classifyEHPersonality(MF.getFunction().getPersonalityFn()) ==
+ EHPersonality::MSVC_CXX) {
+ adjustFrameForMsvcCxxEh(MF);
+ }
+}
+
+void X86FrameLowering::adjustFrameForMsvcCxxEh(MachineFunction &MF) const {
+ // Win64 C++ EH needs to allocate the UnwindHelp object at some fixed offset
+ // relative to RSP after the prologue. Find the offset of the last fixed
+ // object, so that we can allocate a slot immediately following it. If there
+ // were no fixed objects, use offset -SlotSize, which is immediately after the
+ // return address. Fixed objects have negative frame indices.
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
+ int64_t MinFixedObjOffset = -SlotSize;
+ for (int I = MFI.getObjectIndexBegin(); I < 0; ++I)
+ MinFixedObjOffset = std::min(MinFixedObjOffset, MFI.getObjectOffset(I));
+
+ for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
+ for (WinEHHandlerType &H : TBME.HandlerArray) {
+ int FrameIndex = H.CatchObj.FrameIndex;
+ if (FrameIndex != INT_MAX) {
+ // Ensure alignment.
+ unsigned Align = MFI.getObjectAlign(FrameIndex).value();
+ MinFixedObjOffset -= std::abs(MinFixedObjOffset) % Align;
+ MinFixedObjOffset -= MFI.getObjectSize(FrameIndex);
+ MFI.setObjectOffset(FrameIndex, MinFixedObjOffset);
+ }
+ }
+ }
+
+ // Ensure alignment.
+ MinFixedObjOffset -= std::abs(MinFixedObjOffset) % 8;
+ int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize;
+ int UnwindHelpFI =
+ MFI.CreateFixedObject(SlotSize, UnwindHelpOffset, /*IsImmutable=*/false);
+ EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
+
+ // Store -2 into UnwindHelp on function entry. We have to scan forwards past
+ // other frame setup instructions.
+ MachineBasicBlock &MBB = MF.front();
+ auto MBBI = MBB.begin();
+ while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
+ ++MBBI;
+
+ DebugLoc DL = MBB.findDebugLoc(MBBI);
+ addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mi32)),
+ UnwindHelpFI)
+ .addImm(-2);
+}
+
+void X86FrameLowering::processFunctionBeforeFrameIndicesReplaced(
+ MachineFunction &MF, RegScavenger *RS) const {
+ if (STI.is32Bit() && MF.hasEHFunclets())
+ restoreWinEHStackPointersInParent(MF);
+}
+
+void X86FrameLowering::restoreWinEHStackPointersInParent(
+ MachineFunction &MF) const {
+ // 32-bit functions have to restore stack pointers when control is transferred
+ // back to the parent function. These blocks are identified as eh pads that
+ // are not funclet entries.
+ bool IsSEH = isAsynchronousEHPersonality(
+ classifyEHPersonality(MF.getFunction().getPersonalityFn()));
+ for (MachineBasicBlock &MBB : MF) {
+ bool NeedsRestore = MBB.isEHPad() && !MBB.isEHFuncletEntry();
+ if (NeedsRestore)
+ restoreWin32EHStackPointers(MBB, MBB.begin(), DebugLoc(),
+ /*RestoreSP=*/IsSEH);
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h
new file mode 100644
index 000000000000..26e80811af2e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h
@@ -0,0 +1,257 @@
+//===-- X86TargetFrameLowering.h - Define frame lowering for X86 -*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements the X86-specific bits of the TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
+#define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
+
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/Support/TypeSize.h"
+
+namespace llvm {
+
+class MachineInstrBuilder;
+class MCCFIInstruction;
+class X86InstrInfo;
+class X86Subtarget;
+class X86RegisterInfo;
+
+class X86FrameLowering : public TargetFrameLowering {
+public:
+ X86FrameLowering(const X86Subtarget &STI, MaybeAlign StackAlignOverride);
+
+ // Cached subtarget predicates.
+
+ const X86Subtarget &STI;
+ const X86InstrInfo &TII;
+ const X86RegisterInfo *TRI;
+
+ unsigned SlotSize;
+
+ /// Is64Bit implies that x86_64 instructions are available.
+ bool Is64Bit;
+
+ bool IsLP64;
+
+ /// True if the 64-bit frame or stack pointer should be used. True for most
+ /// 64-bit targets with the exception of x32. If this is false, 32-bit
+ /// instruction operands should be used to manipulate StackPtr and FramePtr.
+ bool Uses64BitFramePtr;
+
+ unsigned StackPtr;
+
+ /// Emit target stack probe code. This is required for all
+ /// large stack allocations on Windows. The caller is required to materialize
+ /// the number of bytes to probe in RAX/EAX.
+ void emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ bool InProlog) const;
+
+ /// Replace a StackProbe inline-stub with the actual probe code inline.
+ void inlineStackProbe(MachineFunction &MF,
+ MachineBasicBlock &PrologMBB) const override;
+
+ void
+ emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) const override;
+
+ void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL,
+ bool IsPrologue) const override;
+
+ /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+ /// the function.
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+ void adjustForSegmentedStacks(MachineFunction &MF,
+ MachineBasicBlock &PrologueMBB) const override;
+
+ void adjustForHiPEPrologue(MachineFunction &MF,
+ MachineBasicBlock &PrologueMBB) const override;
+
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS = nullptr) const override;
+
+ bool
+ assignCalleeSavedSpillSlots(MachineFunction &MF,
+ const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const override;
+
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ ArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool
+ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool hasFP(const MachineFunction &MF) const override;
+ bool hasReservedCallFrame(const MachineFunction &MF) const override;
+ bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
+ bool needsFrameIndexResolution(const MachineFunction &MF) const override;
+
+ StackOffset getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const override;
+
+ int getWin64EHFrameIndexRef(const MachineFunction &MF, int FI,
+ Register &SPReg) const;
+ StackOffset getFrameIndexReferenceSP(const MachineFunction &MF, int FI,
+ Register &SPReg, int Adjustment) const;
+ StackOffset
+ getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
+ Register &FrameReg,
+ bool IgnoreSPUpdates) const override;
+
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
+
+ unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override;
+
+ void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+ RegScavenger *RS) const override;
+
+ void
+ processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF,
+ RegScavenger *RS) const override;
+
+  /// Check the instruction before/after the passed instruction. If
+  /// it is an ADD/SUB/LEA instruction, it is deleted and the
+  /// stack adjustment is returned as a positive value for ADD/LEA and
+  /// a negative one for SUB.
+ int mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ bool doMergeWithPrevious) const;
+
+ /// Emit a series of instructions to increment / decrement the stack
+ /// pointer by a constant value.
+ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &DL, int64_t NumBytes, bool InEpilogue) const;
+
+ /// Check that LEA can be used on SP in an epilogue sequence for \p MF.
+ bool canUseLEAForSPInEpilogue(const MachineFunction &MF) const;
+
+ /// Check whether or not the given \p MBB can be used as a prologue
+ /// for the target.
+ /// The prologue will be inserted first in this basic block.
+ /// This method is used by the shrink-wrapping pass to decide if
+ /// \p MBB will be correctly handled by the target.
+  /// As soon as the target enables shrink-wrapping without overriding
+ /// this method, we assume that each basic block is a valid
+ /// prologue.
+ bool canUseAsPrologue(const MachineBasicBlock &MBB) const override;
+
+  /// Check whether or not the given \p MBB can be used as an epilogue
+ /// for the target.
+ /// The epilogue will be inserted before the first terminator of that block.
+ /// This method is used by the shrink-wrapping pass to decide if
+ /// \p MBB will be correctly handled by the target.
+ bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
+
+ /// Returns true if the target will correctly handle shrink wrapping.
+ bool enableShrinkWrapping(const MachineFunction &MF) const override;
+
+ /// Order the symbols in the local stack.
+ /// We want to place the local stack objects in some sort of sensible order.
+ /// The heuristic we use is to try and pack them according to static number
+ /// of uses and size in order to minimize code size.
+ void orderFrameObjects(const MachineFunction &MF,
+ SmallVectorImpl<int> &ObjectsToAllocate) const override;
+
+ /// Wraps up getting a CFI index and building a MachineInstr for it.
+ void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, const MCCFIInstruction &CFIInst) const;
+
+ /// Sets up EBP and optionally ESI based on the incoming EBP value. Only
+ /// needed for 32-bit. Used in funclet prologues and at catchret destinations.
+ MachineBasicBlock::iterator
+ restoreWin32EHStackPointers(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, bool RestoreSP = false) const;
+
+ void restoreWinEHStackPointersInParent(MachineFunction &MF) const;
+
+ int getInitialCFAOffset(const MachineFunction &MF) const override;
+
+ Register getInitialCFARegister(const MachineFunction &MF) const override;
+
+ /// Return true if the function has a redzone (accessible bytes past the
+  /// frame of the top of stack function) as part of its ABI.
+ bool has128ByteRedZone(const MachineFunction& MF) const;
+
+private:
+ uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
+
+ /// Emit target stack probe as a call to a helper function
+ void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ bool InProlog) const;
+
+ /// Emit target stack probe as an inline sequence.
+ void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, bool InProlog) const;
+ void emitStackProbeInlineWindowsCoreCLR64(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL,
+ bool InProlog) const;
+ void emitStackProbeInlineGeneric(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, bool InProlog) const;
+
+ void emitStackProbeInlineGenericBlock(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, uint64_t Offset,
+ uint64_t Align) const;
+
+ void emitStackProbeInlineGenericLoop(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, uint64_t Offset,
+ uint64_t Align) const;
+
+ void adjustFrameForMsvcCxxEh(MachineFunction &MF) const;
+
+ /// Aligns the stack pointer by ANDing it with -MaxAlign.
+ void BuildStackAlignAND(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ unsigned Reg, uint64_t MaxAlign) const;
+
+ /// Make small positive stack adjustments using POPs.
+ bool adjustStackWithPops(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ int Offset) const;
+
+ /// Adjusts the stack pointer using LEA, SUB, or ADD.
+ MachineInstrBuilder BuildStackAdjustment(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, int64_t Offset,
+ bool InEpilogue) const;
+
+ unsigned getPSPSlotOffsetFromSP(const MachineFunction &MF) const;
+
+ unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const;
+
+ /// Materialize the catchret target MBB in RAX.
+ void emitCatchRetReturnValue(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineInstr *CatchRet) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86GenRegisterBankInfo.def b/contrib/llvm-project/llvm/lib/Target/X86/X86GenRegisterBankInfo.def
new file mode 100644
index 000000000000..0fdea9071c29
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86GenRegisterBankInfo.def
@@ -0,0 +1,99 @@
+//===- X86GenRegisterBankInfo.def ----------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines all the static objects used by X86RegisterBankInfo.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifdef GET_TARGET_REGBANK_INFO_IMPL
+RegisterBankInfo::PartialMapping X86GenRegisterBankInfo::PartMappings[]{
+ /* StartIdx, Length, RegBank */
+ // GPR value
+ {0, 8, X86::GPRRegBank}, // :0
+ {0, 16, X86::GPRRegBank}, // :1
+ {0, 32, X86::GPRRegBank}, // :2
+ {0, 64, X86::GPRRegBank}, // :3
+  // FR32/64, xmm registers
+ {0, 32, X86::VECRRegBank}, // :4
+ {0, 64, X86::VECRRegBank}, // :5
+ // VR128/256/512
+ {0, 128, X86::VECRRegBank}, // :6
+ {0, 256, X86::VECRRegBank}, // :7
+ {0, 512, X86::VECRRegBank}, // :8
+};
+#endif // GET_TARGET_REGBANK_INFO_IMPL
+
+#ifdef GET_TARGET_REGBANK_INFO_CLASS
+enum PartialMappingIdx {
+ PMI_None = -1,
+ PMI_GPR8,
+ PMI_GPR16,
+ PMI_GPR32,
+ PMI_GPR64,
+ PMI_FP32,
+ PMI_FP64,
+ PMI_VEC128,
+ PMI_VEC256,
+ PMI_VEC512
+};
+#endif // GET_TARGET_REGBANK_INFO_CLASS
+
+#ifdef GET_TARGET_REGBANK_INFO_IMPL
+#define INSTR_3OP(INFO) INFO, INFO, INFO,
+#define BREAKDOWN(INDEX, NUM) \
+ { &X86GenRegisterBankInfo::PartMappings[INDEX], NUM }
+// ValueMappings.
+RegisterBankInfo::ValueMapping X86GenRegisterBankInfo::ValMappings[]{
+ /* BreakDown, NumBreakDowns */
+  // 3-operand instructions (all binary operations should end up with one of
+  // these mappings).
+ INSTR_3OP(BREAKDOWN(PMI_GPR8, 1)) // 0: GPR_8
+ INSTR_3OP(BREAKDOWN(PMI_GPR16, 1)) // 3: GPR_16
+ INSTR_3OP(BREAKDOWN(PMI_GPR32, 1)) // 6: GPR_32
+ INSTR_3OP(BREAKDOWN(PMI_GPR64, 1)) // 9: GPR_64
+ INSTR_3OP(BREAKDOWN(PMI_FP32, 1)) // 12: Fp32
+ INSTR_3OP(BREAKDOWN(PMI_FP64, 1)) // 15: Fp64
+ INSTR_3OP(BREAKDOWN(PMI_VEC128, 1)) // 18: Vec128
+ INSTR_3OP(BREAKDOWN(PMI_VEC256, 1)) // 21: Vec256
+ INSTR_3OP(BREAKDOWN(PMI_VEC512, 1)) // 24: Vec512
+};
+#undef INSTR_3OP
+#undef BREAKDOWN
+#endif // GET_TARGET_REGBANK_INFO_IMPL
+
+#ifdef GET_TARGET_REGBANK_INFO_CLASS
+enum ValueMappingIdx {
+ VMI_None = -1,
+ VMI_3OpsGpr8Idx = PMI_GPR8 * 3,
+ VMI_3OpsGpr16Idx = PMI_GPR16 * 3,
+ VMI_3OpsGpr32Idx = PMI_GPR32 * 3,
+ VMI_3OpsGpr64Idx = PMI_GPR64 * 3,
+ VMI_3OpsFp32Idx = PMI_FP32 * 3,
+ VMI_3OpsFp64Idx = PMI_FP64 * 3,
+ VMI_3OpsVec128Idx = PMI_VEC128 * 3,
+ VMI_3OpsVec256Idx = PMI_VEC256 * 3,
+ VMI_3OpsVec512Idx = PMI_VEC512 * 3,
+};
+#undef GET_TARGET_REGBANK_INFO_CLASS
+#endif // GET_TARGET_REGBANK_INFO_CLASS
+
+#ifdef GET_TARGET_REGBANK_INFO_IMPL
+#undef GET_TARGET_REGBANK_INFO_IMPL
+const RegisterBankInfo::ValueMapping *
+X86GenRegisterBankInfo::getValueMapping(PartialMappingIdx Idx,
+ unsigned NumOperands) {
+
+ // We can use VMI_3Ops Mapping for all the cases.
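+  // Each PartialMappingIdx owns three consecutive ValueMapping entries (one
+  // per operand of a 3-operand instruction), hence the Idx * 3 stride below.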
+ if (NumOperands <= 3 && (Idx >= PMI_GPR8 && Idx <= PMI_VEC512))
+ return &ValMappings[(unsigned)Idx * 3];
+
+ llvm_unreachable("Unsupported PartialMappingIdx.");
+}
+
+#endif // GET_TARGET_REGBANK_INFO_IMPL
+
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
new file mode 100644
index 000000000000..1df9a0d1700f
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -0,0 +1,6020 @@
+//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a DAG pattern matching instruction selector for X86,
+// converting from a legalized dag to a X86 dag.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
+#include <stdint.h>
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-isel"
+
+STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
+
+static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
+ cl::desc("Enable setting constant bits to reduce size of mask immediates"),
+ cl::Hidden);
+
+static cl::opt<bool> EnablePromoteAnyextLoad(
+ "x86-promote-anyext-load", cl::init(true),
+ cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
+
+extern cl::opt<bool> IndirectBranchTracking;
+
+//===----------------------------------------------------------------------===//
+// Pattern Matcher Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// This corresponds to X86AddressMode, but uses SDValues instead of register
+  /// numbers for the leaves of the matched tree.
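+  /// A fully matched mode describes an operand of the usual x86 form
+  /// Base + Index * Scale + Disp, with an optional segment override.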
+ struct X86ISelAddressMode {
+ enum {
+ RegBase,
+ FrameIndexBase
+ } BaseType;
+
+ // This is really a union, discriminated by BaseType!
+ SDValue Base_Reg;
+ int Base_FrameIndex;
+
+ unsigned Scale;
+ SDValue IndexReg;
+ int32_t Disp;
+ SDValue Segment;
+ const GlobalValue *GV;
+ const Constant *CP;
+ const BlockAddress *BlockAddr;
+ const char *ES;
+ MCSymbol *MCSym;
+ int JT;
+ Align Alignment; // CP alignment.
+ unsigned char SymbolFlags; // X86II::MO_*
+ bool NegateIndex = false;
+
+ X86ISelAddressMode()
+ : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
+ Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
+ MCSym(nullptr), JT(-1), SymbolFlags(X86II::MO_NO_FLAG) {}
+
+ bool hasSymbolicDisplacement() const {
+ return GV != nullptr || CP != nullptr || ES != nullptr ||
+ MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
+ }
+
+ bool hasBaseOrIndexReg() const {
+ return BaseType == FrameIndexBase ||
+ IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
+ }
+
+ /// Return true if this addressing mode is already RIP-relative.
+ bool isRIPRelative() const {
+ if (BaseType != RegBase) return false;
+ if (RegisterSDNode *RegNode =
+ dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
+ return RegNode->getReg() == X86::RIP;
+ return false;
+ }
+
+ void setBaseReg(SDValue Reg) {
+ BaseType = RegBase;
+ Base_Reg = Reg;
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dump(SelectionDAG *DAG = nullptr) {
+ dbgs() << "X86ISelAddressMode " << this << '\n';
+ dbgs() << "Base_Reg ";
+ if (Base_Reg.getNode())
+ Base_Reg.getNode()->dump(DAG);
+ else
+ dbgs() << "nul\n";
+ if (BaseType == FrameIndexBase)
+ dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
+ dbgs() << " Scale " << Scale << '\n'
+ << "IndexReg ";
+ if (NegateIndex)
+ dbgs() << "negate ";
+ if (IndexReg.getNode())
+ IndexReg.getNode()->dump(DAG);
+ else
+ dbgs() << "nul\n";
+ dbgs() << " Disp " << Disp << '\n'
+ << "GV ";
+ if (GV)
+ GV->dump();
+ else
+ dbgs() << "nul";
+ dbgs() << " CP ";
+ if (CP)
+ CP->dump();
+ else
+ dbgs() << "nul";
+ dbgs() << '\n'
+ << "ES ";
+ if (ES)
+ dbgs() << ES;
+ else
+ dbgs() << "nul";
+ dbgs() << " MCSym ";
+ if (MCSym)
+ dbgs() << MCSym;
+ else
+ dbgs() << "nul";
+ dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
+ }
+#endif
+ };
+}
+
+namespace {
+ //===--------------------------------------------------------------------===//
+ /// ISel - X86-specific code to select X86 machine instructions for
+ /// SelectionDAG operations.
+ ///
+ class X86DAGToDAGISel final : public SelectionDAGISel {
+ /// Keep a pointer to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget *Subtarget;
+
+ /// If true, selector should try to optimize for minimum code size.
+ bool OptForMinSize;
+
+ /// Disable direct TLS access through segment registers.
+ bool IndirectTlsSegRefs;
+
+ public:
+ explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
+ OptForMinSize(false), IndirectTlsSegRefs(false) {}
+
+ StringRef getPassName() const override {
+ return "X86 DAG->DAG Instruction Selection";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ // Reset the subtarget each time through.
+ Subtarget = &MF.getSubtarget<X86Subtarget>();
+ IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
+ "indirect-tls-seg-refs");
+
+ // OptFor[Min]Size are used in pattern predicates that isel is matching.
+ OptForMinSize = MF.getFunction().hasMinSize();
+ assert((!OptForMinSize || MF.getFunction().hasOptSize()) &&
+ "OptForMinSize implies OptForSize");
+
+ SelectionDAGISel::runOnMachineFunction(MF);
+ return true;
+ }
+
+ void emitFunctionEntryCode() override;
+
+ bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
+
+ void PreprocessISelDAG() override;
+ void PostprocessISelDAG() override;
+
+// Include the pieces autogenerated from the target description.
+#include "X86GenDAGISel.inc"
+
+ private:
+ void Select(SDNode *N) override;
+
+ bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
+ bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
+ bool AllowSegmentRegForX32 = false);
+ bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
+ bool matchAddress(SDValue N, X86ISelAddressMode &AM);
+ bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
+ bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
+ bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
+ unsigned Depth);
+ bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
+ bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+ bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
+ SDValue ScaleOp, SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp, SDValue &Segment);
+ bool selectMOV64Imm32(SDValue N, SDValue &Imm);
+ bool selectLEAAddr(SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+ bool selectLEA64_32Addr(SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+ bool selectTLSADDRAddr(SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+ bool selectRelocImm(SDValue N, SDValue &Op);
+
+ bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+
+ // Convenience method where P is also root.
+ bool tryFoldLoad(SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment) {
+ return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
+ }
+
+ bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+
+ bool isProfitableToFormMaskedOp(SDNode *N) const;
+
+ /// Implement addressing mode selection for inline asm expressions.
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ unsigned ConstraintID,
+ std::vector<SDValue> &OutOps) override;
+
+ void emitSpecialCodeForMain();
+
+ inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
+ MVT VT, SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment) {
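+      // Build the standard 5-operand x86 memory reference (Base, Scale, Index,
+      // Disp, Segment) from the matched addressing mode, using register 0 for
+      // any component that was not matched.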
+ if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+ Base = CurDAG->getTargetFrameIndex(
+ AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
+ else if (AM.Base_Reg.getNode())
+ Base = AM.Base_Reg;
+ else
+ Base = CurDAG->getRegister(0, VT);
+
+ Scale = getI8Imm(AM.Scale, DL);
+
+ // Negate the index if needed.
+ if (AM.NegateIndex) {
+ unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r;
+ SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
+ AM.IndexReg), 0);
+ AM.IndexReg = Neg;
+ }
+
+ if (AM.IndexReg.getNode())
+ Index = AM.IndexReg;
+ else
+ Index = CurDAG->getRegister(0, VT);
+
+ // These are 32-bit even in 64-bit mode since RIP-relative offset
+ // is 32-bit.
+ if (AM.GV)
+ Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
+ MVT::i32, AM.Disp,
+ AM.SymbolFlags);
+ else if (AM.CP)
+ Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
+ AM.Disp, AM.SymbolFlags);
+ else if (AM.ES) {
+ assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
+ Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
+ } else if (AM.MCSym) {
+ assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
+ assert(AM.SymbolFlags == 0 && "oo");
+ Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
+ } else if (AM.JT != -1) {
+ assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
+ Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
+ } else if (AM.BlockAddr)
+ Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
+ AM.SymbolFlags);
+ else
+ Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);
+
+ if (AM.Segment.getNode())
+ Segment = AM.Segment;
+ else
+ Segment = CurDAG->getRegister(0, MVT::i16);
+ }
+
+ // Utility function to determine whether we should avoid selecting
+ // immediate forms of instructions for better code size or not.
+ // At a high level, we'd like to avoid such instructions when
+ // we have similar constants used within the same basic block
+ // that can be kept in a register.
+ //
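+    // For example, a full 32-bit immediate costs 4 bytes each time it is
+    // encoded; if several instructions in the block use the same constant,
+    // materializing it once in a register can give a smaller encoding overall.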
+ bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
+ uint32_t UseCount = 0;
+
+ // Do not want to hoist if we're not optimizing for size.
+ // TODO: We'd like to remove this restriction.
+ // See the comment in X86InstrInfo.td for more info.
+ if (!CurDAG->shouldOptForSize())
+ return false;
+
+ // Walk all the users of the immediate.
+ for (SDNode::use_iterator UI = N->use_begin(),
+ UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {
+
+ SDNode *User = *UI;
+
+ // This user is already selected. Count it as a legitimate use and
+ // move on.
+ if (User->isMachineOpcode()) {
+ UseCount++;
+ continue;
+ }
+
+ // We want to count stores of immediates as real uses.
+ if (User->getOpcode() == ISD::STORE &&
+ User->getOperand(1).getNode() == N) {
+ UseCount++;
+ continue;
+ }
+
+      // We don't currently match users that have > 2 operands (except
+      // for stores, which are handled above).
+      // Those instructions won't match in isel, for now, and would
+      // be counted incorrectly.
+ // This may change in the future as we add additional instruction
+ // types.
+ if (User->getNumOperands() != 2)
+ continue;
+
+ // If this is a sign-extended 8-bit integer immediate used in an ALU
+ // instruction, there is probably an opcode encoding to save space.
+ auto *C = dyn_cast<ConstantSDNode>(N);
+ if (C && isInt<8>(C->getSExtValue()))
+ continue;
+
+ // Immediates that are used for offsets as part of stack
+ // manipulation should be left alone. These are typically
+ // used to indicate SP offsets for argument passing and
+ // will get pulled into stores/pushes (implicitly).
+ if (User->getOpcode() == X86ISD::ADD ||
+ User->getOpcode() == ISD::ADD ||
+ User->getOpcode() == X86ISD::SUB ||
+ User->getOpcode() == ISD::SUB) {
+
+ // Find the other operand of the add/sub.
+ SDValue OtherOp = User->getOperand(0);
+ if (OtherOp.getNode() == N)
+ OtherOp = User->getOperand(1);
+
+ // Don't count if the other operand is SP.
+ RegisterSDNode *RegNode;
+ if (OtherOp->getOpcode() == ISD::CopyFromReg &&
+ (RegNode = dyn_cast_or_null<RegisterSDNode>(
+ OtherOp->getOperand(1).getNode())))
+ if ((RegNode->getReg() == X86::ESP) ||
+ (RegNode->getReg() == X86::RSP))
+ continue;
+ }
+
+ // ... otherwise, count this and move on.
+ UseCount++;
+ }
+
+ // If we have more than 1 use, then recommend for hoisting.
+ return (UseCount > 1);
+ }
+
+ /// Return a target constant with the specified value of type i8.
+ inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
+ return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
+ }
+
+ /// Return a target constant with the specified value, of type i32.
+ inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
+ return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
+ }
+
+ /// Return a target constant with the specified value, of type i64.
+ inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
+ return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
+ }
+
+ SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
+ const SDLoc &DL) {
+ assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
+ uint64_t Index = N->getConstantOperandVal(1);
+ MVT VecVT = N->getOperand(0).getSimpleValueType();
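+      // For example, extracting from element index 4 of a v8i32 source with
+      // VecWidth == 128 yields (4 * 32) / 128 == 1, i.e. the upper 128-bit
+      // subvector.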
+ return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
+ }
+
+ SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
+ const SDLoc &DL) {
+ assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
+ uint64_t Index = N->getConstantOperandVal(2);
+ MVT VecVT = N->getSimpleValueType(0);
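+      // For example, inserting at element index 8 of a v16i32 result with
+      // VecWidth == 256 yields (8 * 32) / 256 == 1, i.e. the upper 256-bit
+      // half.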
+ return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
+ }
+
+    // Helper to detect unneeded AND instructions on shift amounts. Called
+ // from PatFrags in tablegen.
+ bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
+ assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
+ const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
+
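+      // For example, with Width == 5 (a 32-bit shift only uses the low 5 bits
+      // of its amount), a mask value of 0x1f has 5 trailing ones, so the AND
+      // contributes nothing.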
+ if (Val.countTrailingOnes() >= Width)
+ return true;
+
+ APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
+ return Mask.countTrailingOnes() >= Width;
+ }
+
+ /// Return an SDNode that returns the value of the global base register.
+ /// Output instructions required to initialize the global base register,
+ /// if necessary.
+ SDNode *getGlobalBaseReg();
+
+ /// Return a reference to the TargetMachine, casted to the target-specific
+ /// type.
+ const X86TargetMachine &getTargetMachine() const {
+ return static_cast<const X86TargetMachine &>(TM);
+ }
+
+ /// Return a reference to the TargetInstrInfo, casted to the target-specific
+ /// type.
+ const X86InstrInfo *getInstrInfo() const {
+ return Subtarget->getInstrInfo();
+ }
+
+ /// Address-mode matching performs shift-of-and to and-of-shift
+ /// reassociation in order to expose more scaled addressing
+ /// opportunities.
+ bool ComplexPatternFuncMutatesDAG() const override {
+ return true;
+ }
+
+ bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
+
+ // Indicates we should prefer to use a non-temporal load for this load.
+ bool useNonTemporalLoad(LoadSDNode *N) const {
+ if (!N->isNonTemporal())
+ return false;
+
+ unsigned StoreSize = N->getMemoryVT().getStoreSize();
+
+ if (N->getAlignment() < StoreSize)
+ return false;
+
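+      // Only return true when the matching (V)MOVNTDQA form is available:
+      // SSE4.1 for 128-bit, AVX2 for 256-bit and AVX512 for 512-bit loads.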
+ switch (StoreSize) {
+ default: llvm_unreachable("Unsupported store size");
+ case 4:
+ case 8:
+ return false;
+ case 16:
+ return Subtarget->hasSSE41();
+ case 32:
+ return Subtarget->hasAVX2();
+ case 64:
+ return Subtarget->hasAVX512();
+ }
+ }
+
+ bool foldLoadStoreIntoMemOperand(SDNode *Node);
+ MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
+ bool matchBitExtract(SDNode *Node);
+ bool shrinkAndImmediate(SDNode *N);
+ bool isMaskZeroExtended(SDNode *N) const;
+ bool tryShiftAmountMod(SDNode *N);
+ bool tryShrinkShlLogicImm(SDNode *N);
+ bool tryVPTERNLOG(SDNode *N);
+ bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentBC,
+ SDValue A, SDValue B, SDValue C, uint8_t Imm);
+ bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
+ bool tryMatchBitSelect(SDNode *N);
+
+ MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
+ const SDLoc &dl, MVT VT, SDNode *Node);
+ MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
+ const SDLoc &dl, MVT VT, SDNode *Node,
+ SDValue &InFlag);
+
+ bool tryOptimizeRem8Extend(SDNode *N);
+
+ bool onlyUsesZeroFlag(SDValue Flags) const;
+ bool hasNoSignFlagUses(SDValue Flags) const;
+ bool hasNoCarryFlagUses(SDValue Flags) const;
+ };
+}
+
+
+// Returns true if this masked compare can be implemented legally with this
+// type.
+static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
+ unsigned Opcode = N->getOpcode();
+ if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
+ Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
+ Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
+ // We can get 256-bit 8 element types here without VLX being enabled. When
+ // this happens we will use 512-bit operations and the mask will not be
+ // zero extended.
+ EVT OpVT = N->getOperand(0).getValueType();
+ // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
+ // second operand.
+ if (Opcode == X86ISD::STRICT_CMPM)
+ OpVT = N->getOperand(1).getValueType();
+ if (OpVT.is256BitVector() || OpVT.is128BitVector())
+ return Subtarget->hasVLX();
+
+ return true;
+ }
+ // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
+ if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
+ Opcode == X86ISD::FSETCCM_SAE)
+ return true;
+
+ return false;
+}
+
+// Returns true if we can assume the writer of the mask has zero extended it
+// for us.
+bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
+ // If this is an AND, check if we have a compare on either side. As long as
+ // one side guarantees the mask is zero extended, the AND will preserve those
+ // zeros.
+ if (N->getOpcode() == ISD::AND)
+ return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
+ isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
+
+ return isLegalMaskCompare(N, Subtarget);
+}
+
+bool
+X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
+ if (OptLevel == CodeGenOpt::None) return false;
+
+ if (!N.hasOneUse())
+ return false;
+
+ if (N.getOpcode() != ISD::LOAD)
+ return true;
+
+ // Don't fold non-temporal loads if we have an instruction for them.
+ if (useNonTemporalLoad(cast<LoadSDNode>(N)))
+ return false;
+
+ // If N is a load, do additional profitability checks.
+ if (U == Root) {
+ switch (U->getOpcode()) {
+ default: break;
+ case X86ISD::ADD:
+ case X86ISD::ADC:
+ case X86ISD::SUB:
+ case X86ISD::SBB:
+ case X86ISD::AND:
+ case X86ISD::XOR:
+ case X86ISD::OR:
+ case ISD::ADD:
+ case ISD::ADDCARRY:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ SDValue Op1 = U->getOperand(1);
+
+      // If the other operand is an 8-bit immediate we should fold the immediate
+ // instead. This reduces code size.
+ // e.g.
+ // movl 4(%esp), %eax
+ // addl $4, %eax
+ // vs.
+ // movl $4, %eax
+ // addl 4(%esp), %eax
+      // The former is 2 bytes shorter. In the case where the increment is 1,
+      // the saving can be 4 bytes (by using incl %eax).
+ if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) {
+ if (Imm->getAPIntValue().isSignedIntN(8))
+ return false;
+
+        // If this is a 64-bit AND with an immediate that fits in 32 bits,
+        // prefer using the smaller AND over folding the load. This is needed to
+ // make sure immediates created by shrinkAndImmediate are always folded.
+ // Ideally we would narrow the load during DAG combine and get the
+ // best of both worlds.
+ if (U->getOpcode() == ISD::AND &&
+ Imm->getAPIntValue().getBitWidth() == 64 &&
+ Imm->getAPIntValue().isIntN(32))
+ return false;
+
+        // If this is really a zext_inreg that can be represented with a movzx
+ // instruction, prefer that.
+ // TODO: We could shrink the load and fold if it is non-volatile.
+ if (U->getOpcode() == ISD::AND &&
+ (Imm->getAPIntValue() == UINT8_MAX ||
+ Imm->getAPIntValue() == UINT16_MAX ||
+ Imm->getAPIntValue() == UINT32_MAX))
+ return false;
+
+        // For ADD/SUB we can negate the immediate and use the opposite
+        // operation to fit 128 into a sign-extended 8-bit immediate.
+ if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
+ (-Imm->getAPIntValue()).isSignedIntN(8))
+ return false;
+
+ if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
+ (-Imm->getAPIntValue()).isSignedIntN(8) &&
+ hasNoCarryFlagUses(SDValue(U, 1)))
+ return false;
+ }
+
+ // If the other operand is a TLS address, we should fold it instead.
+ // This produces
+ // movl %gs:0, %eax
+ // leal i@NTPOFF(%eax), %eax
+ // instead of
+ // movl $i@NTPOFF, %eax
+ // addl %gs:0, %eax
+ // if the block also has an access to a second TLS address this will save
+ // a load.
+ // FIXME: This is probably also true for non-TLS addresses.
+ if (Op1.getOpcode() == X86ISD::Wrapper) {
+ SDValue Val = Op1.getOperand(0);
+ if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
+ return false;
+ }
+
+ // Don't fold load if this matches the BTS/BTR/BTC patterns.
+ // BTS: (or X, (shl 1, n))
+ // BTR: (and X, (rotl -2, n))
+ // BTC: (xor X, (shl 1, n))
+ if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
+ if (U->getOperand(0).getOpcode() == ISD::SHL &&
+ isOneConstant(U->getOperand(0).getOperand(0)))
+ return false;
+
+ if (U->getOperand(1).getOpcode() == ISD::SHL &&
+ isOneConstant(U->getOperand(1).getOperand(0)))
+ return false;
+ }
+ if (U->getOpcode() == ISD::AND) {
+ SDValue U0 = U->getOperand(0);
+ SDValue U1 = U->getOperand(1);
+ if (U0.getOpcode() == ISD::ROTL) {
+ auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
+ if (C && C->getSExtValue() == -2)
+ return false;
+ }
+
+ if (U1.getOpcode() == ISD::ROTL) {
+ auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
+ if (C && C->getSExtValue() == -2)
+ return false;
+ }
+ }
+
+ break;
+ }
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ // Don't fold a load into a shift by immediate. The BMI2 instructions
+ // support folding a load, but not an immediate. The legacy instructions
+ // support folding an immediate, but can't fold a load. Folding an
+ // immediate is preferable to folding a load.
+ if (isa<ConstantSDNode>(U->getOperand(1)))
+ return false;
+
+ break;
+ }
+ }
+
+  // Prevent folding a load if this can be implemented with an insert_subreg or
+  // a move that implicitly zeroes.
+ if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
+ isNullConstant(Root->getOperand(2)) &&
+ (Root->getOperand(0).isUndef() ||
+ ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
+ return false;
+
+ return true;
+}
+
+// Indicates it is profitable to form an AVX512 masked operation. Returning
+// false will favor a register-register masked move or vblendm and the
+// operation will be selected separately.
+bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
+ assert(
+ (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
+ "Unexpected opcode!");
+
+ // If the operation has additional users, the operation will be duplicated.
+ // Check the use count to prevent that.
+ // FIXME: Are there cheap opcodes we might want to duplicate?
+ return N->getOperand(1).hasOneUse();
+}
+
+/// Replace the original chain operand of the call with the
+/// load's chain operand and move the load below the call's chain operand.
+static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
+ SDValue Call, SDValue OrigChain) {
+ SmallVector<SDValue, 8> Ops;
+ SDValue Chain = OrigChain.getOperand(0);
+ if (Chain.getNode() == Load.getNode())
+ Ops.push_back(Load.getOperand(0));
+ else {
+ assert(Chain.getOpcode() == ISD::TokenFactor &&
+ "Unexpected chain operand");
+ for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
+ if (Chain.getOperand(i).getNode() == Load.getNode())
+ Ops.push_back(Load.getOperand(0));
+ else
+ Ops.push_back(Chain.getOperand(i));
+ SDValue NewChain =
+ CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
+ Ops.clear();
+ Ops.push_back(NewChain);
+ }
+ Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
+ CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
+ CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
+ Load.getOperand(1), Load.getOperand(2));
+
+ Ops.clear();
+ Ops.push_back(SDValue(Load.getNode(), 1));
+ Ops.append(Call->op_begin() + 1, Call->op_end());
+ CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
+}
+
+/// Return true if the call address is a load and it can be
+/// moved below CALLSEQ_START and the chains leading up to the call.
+/// Return the CALLSEQ_START by reference as a second output.
+/// In the case of a tail call, there isn't a callseq node between the call
+/// chain and the load.
+static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
+  // The transformation is somewhat dangerous if the call's chain was glued to
+  // the call. After moveBelowOrigChain the load is moved between the call and
+  // the chain; this can create a cycle if the load is not folded. So it is
+  // *really* important that we are sure the load will be folded.
+ if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
+ return false;
+ LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
+ if (!LD ||
+ !LD->isSimple() ||
+ LD->getAddressingMode() != ISD::UNINDEXED ||
+ LD->getExtensionType() != ISD::NON_EXTLOAD)
+ return false;
+
+ // Now let's find the callseq_start.
+ while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
+ if (!Chain.hasOneUse())
+ return false;
+ Chain = Chain.getOperand(0);
+ }
+
+ if (!Chain.getNumOperands())
+ return false;
+ // Since we are not checking for AA here, conservatively abort if the chain
+ // writes to memory. It's not safe to move the callee (a load) across a store.
+ if (isa<MemSDNode>(Chain.getNode()) &&
+ cast<MemSDNode>(Chain.getNode())->writeMem())
+ return false;
+ if (Chain.getOperand(0).getNode() == Callee.getNode())
+ return true;
+ if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
+ Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
+ Callee.getValue(1).hasOneUse())
+ return true;
+ return false;
+}
+
+static bool isEndbrImm64(uint64_t Imm) {
+// There may be some other prefix bytes between 0xF3 and 0x0F1EFA,
+// e.g. 0xF3660F1EFA, 0xF3670F1EFA.
+ if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
+ return false;
+
+ uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
+ 0x65, 0x66, 0x67, 0xf0, 0xf2};
+  int i = 24; // The low 24 bits (0x0F1EFA) have already matched.
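+  // Scan the remaining bytes one at a time: a recognized optional prefix is
+  // skipped, a 0xF3 byte completes a potential ENDBR64 pattern, and any other
+  // byte rules the immediate out.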
+ while (i < 64) {
+ uint8_t Byte = (Imm >> i) & 0xFF;
+ if (Byte == 0xF3)
+ return true;
+ if (!llvm::is_contained(OptionalPrefixBytes, Byte))
+ return false;
+ i += 8;
+ }
+
+ return false;
+}
+
+void X86DAGToDAGISel::PreprocessISelDAG() {
+ bool MadeChange = false;
+ for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+ E = CurDAG->allnodes_end(); I != E; ) {
+ SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
+
+ // This is for CET enhancement.
+ //
+ // ENDBR32 and ENDBR64 have specific opcodes:
+ // ENDBR32: F3 0F 1E FB
+ // ENDBR64: F3 0F 1E FA
+    // We want to ensure that attackers cannot find unintended ENDBR32/64
+    // opcode matches in the binary.
+    // Here is an example:
+    // If the compiler had to generate asm for the following code:
+    //   a = 0xF30F1EFA
+    // it could, for example, generate:
+    //   mov 0xF30F1EFA, dword ptr[a]
+    // In such a case, the binary would include a gadget that starts
+    // with a fake ENDBR64 opcode. Therefore, we split such generation
+    // into multiple operations so that the pattern does not show up in
+    // the binary.
+ if (N->getOpcode() == ISD::Constant) {
+ MVT VT = N->getSimpleValueType(0);
+ int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
+ int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
+ if (Imm == EndbrImm || isEndbrImm64(Imm)) {
+ // Check that the cf-protection-branch is enabled.
+ Metadata *CFProtectionBranch =
+ MF->getMMI().getModule()->getModuleFlag("cf-protection-branch");
+ if (CFProtectionBranch || IndirectBranchTracking) {
+ SDLoc dl(N);
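+          // Materialize the bitwise complement of the immediate and invert it
+          // again at run time, so the literal ENDBR byte pattern never appears
+          // in the emitted code.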
+ SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
+ Complement = CurDAG->getNOT(dl, Complement, VT);
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
+ ++I;
+ MadeChange = true;
+ continue;
+ }
+ }
+ }
+
+ // If this is a target specific AND node with no flag usages, turn it back
+ // into ISD::AND to enable test instruction matching.
+ if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
+ SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
+ N->getOperand(0), N->getOperand(1));
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+ ++I;
+ MadeChange = true;
+ continue;
+ }
+
+ /// Convert vector increment or decrement to sub/add with an all-ones
+ /// constant:
+ /// add X, <1, 1...> --> sub X, <-1, -1...>
+ /// sub X, <1, 1...> --> add X, <-1, -1...>
+ /// The all-ones vector constant can be materialized using a pcmpeq
+ /// instruction that is commonly recognized as an idiom (has no register
+ /// dependency), so that's better/smaller than loading a splat 1 constant.
+ if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
+ N->getSimpleValueType(0).isVector()) {
+
+ APInt SplatVal;
+ if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
+ SplatVal.isOneValue()) {
+ SDLoc DL(N);
+
+ MVT VT = N->getSimpleValueType(0);
+ unsigned NumElts = VT.getSizeInBits() / 32;
+ SDValue AllOnes =
+ CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
+ AllOnes = CurDAG->getBitcast(VT, AllOnes);
+
+ unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
+ SDValue Res =
+ CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
+ --I;
+ CurDAG->ReplaceAllUsesWith(N, Res.getNode());
+ ++I;
+ MadeChange = true;
+ continue;
+ }
+ }
+
+ switch (N->getOpcode()) {
+ case X86ISD::VBROADCAST: {
+ MVT VT = N->getSimpleValueType(0);
+ // Emulate v32i16/v64i8 broadcast without BWI.
+ if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) {
+ MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8;
+ SDLoc dl(N);
+ SDValue NarrowBCast =
+ CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
+ SDValue Res =
+ CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
+ NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
+ unsigned Index = VT == MVT::v32i16 ? 16 : 32;
+ Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
+ CurDAG->getIntPtrConstant(Index, dl));
+
+ --I;
+ CurDAG->ReplaceAllUsesWith(N, Res.getNode());
+ ++I;
+ MadeChange = true;
+ continue;
+ }
+
+ break;
+ }
+ case X86ISD::VBROADCAST_LOAD: {
+ MVT VT = N->getSimpleValueType(0);
+ // Emulate v32i16/v64i8 broadcast without BWI.
+ if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) {
+ MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8;
+ auto *MemNode = cast<MemSDNode>(N);
+ SDLoc dl(N);
+ SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
+ SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
+ SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
+ MemNode->getMemOperand());
+ SDValue Res =
+ CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
+ NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
+ unsigned Index = VT == MVT::v32i16 ? 16 : 32;
+ Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
+ CurDAG->getIntPtrConstant(Index, dl));
+
+ --I;
+ SDValue To[] = {Res, NarrowBCast.getValue(1)};
+ CurDAG->ReplaceAllUsesWith(N, To);
+ ++I;
+ MadeChange = true;
+ continue;
+ }
+
+ break;
+ }
+ case ISD::VSELECT: {
+      // Replace VSELECT with non-mask conditions with BLENDV.
+ if (N->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
+ break;
+
+ assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
+ SDValue Blendv =
+ CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
+ N->getOperand(0), N->getOperand(1), N->getOperand(2));
+ --I;
+ CurDAG->ReplaceAllUsesWith(N, Blendv.getNode());
+ ++I;
+ MadeChange = true;
+ continue;
+ }
+ case ISD::FP_ROUND:
+ case ISD::STRICT_FP_ROUND:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT: {
+ // Replace vector fp_to_s/uint with their X86 specific equivalent so we
+ // don't need 2 sets of patterns.
+ if (!N->getSimpleValueType(0).isVector())
+ break;
+
+ unsigned NewOpc;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break;
+ case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break;
+ case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
+ case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
+ case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
+ case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
+ }
+ SDValue Res;
+ if (N->isStrictFPOpcode())
+ Res =
+ CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
+ {N->getOperand(0), N->getOperand(1)});
+ else
+ Res =
+ CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
+ N->getOperand(0));
+ --I;
+ CurDAG->ReplaceAllUsesWith(N, Res.getNode());
+ ++I;
+ MadeChange = true;
+ continue;
+ }
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL: {
+ // Replace vector shifts with their X86 specific equivalent so we don't
+ // need 2 sets of patterns.
+ if (!N->getValueType(0).isVector())
+ break;
+
+ unsigned NewOpc;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
+ case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
+ case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
+ }
+ SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
+ N->getOperand(0), N->getOperand(1));
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+ ++I;
+ MadeChange = true;
+ continue;
+ }
+ case ISD::ANY_EXTEND:
+ case ISD::ANY_EXTEND_VECTOR_INREG: {
+ // Replace vector any extend with the zero extend equivalents so we don't
+ // need 2 sets of patterns. Ignore vXi1 extensions.
+ if (!N->getValueType(0).isVector())
+ break;
+
+ unsigned NewOpc;
+ if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
+ assert(N->getOpcode() == ISD::ANY_EXTEND &&
+ "Unexpected opcode for mask vector!");
+ NewOpc = ISD::SIGN_EXTEND;
+ } else {
+ NewOpc = N->getOpcode() == ISD::ANY_EXTEND
+ ? ISD::ZERO_EXTEND
+ : ISD::ZERO_EXTEND_VECTOR_INREG;
+ }
+
+ SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
+ N->getOperand(0));
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+ ++I;
+ MadeChange = true;
+ continue;
+ }
+ case ISD::FCEIL:
+ case ISD::STRICT_FCEIL:
+ case ISD::FFLOOR:
+ case ISD::STRICT_FFLOOR:
+ case ISD::FTRUNC:
+ case ISD::STRICT_FTRUNC:
+ case ISD::FROUNDEVEN:
+ case ISD::STRICT_FROUNDEVEN:
+ case ISD::FNEARBYINT:
+ case ISD::STRICT_FNEARBYINT:
+ case ISD::FRINT:
+ case ISD::STRICT_FRINT: {
+      // Replace fp rounding nodes with their X86-specific equivalents so we
+      // don't need 2 sets of patterns.
+ unsigned Imm;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::STRICT_FCEIL:
+ case ISD::FCEIL: Imm = 0xA; break;
+ case ISD::STRICT_FFLOOR:
+ case ISD::FFLOOR: Imm = 0x9; break;
+ case ISD::STRICT_FTRUNC:
+ case ISD::FTRUNC: Imm = 0xB; break;
+ case ISD::STRICT_FROUNDEVEN:
+ case ISD::FROUNDEVEN: Imm = 0x8; break;
+ case ISD::STRICT_FNEARBYINT:
+ case ISD::FNEARBYINT: Imm = 0xC; break;
+ case ISD::STRICT_FRINT:
+ case ISD::FRINT: Imm = 0x4; break;
+ }
+ SDLoc dl(N);
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue Res;
+ if (IsStrict)
+ Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
+ {N->getValueType(0), MVT::Other},
+ {N->getOperand(0), N->getOperand(1),
+ CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
+ else
+ Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
+ N->getOperand(0),
+ CurDAG->getTargetConstant(Imm, dl, MVT::i32));
+ --I;
+ CurDAG->ReplaceAllUsesWith(N, Res.getNode());
+ ++I;
+ MadeChange = true;
+ continue;
+ }
+ case X86ISD::FANDN:
+ case X86ISD::FAND:
+ case X86ISD::FOR:
+ case X86ISD::FXOR: {
+ // Widen scalar fp logic ops to vector to reduce isel patterns.
+ // FIXME: Can we do this during lowering/combine.
+ MVT VT = N->getSimpleValueType(0);
+ if (VT.isVector() || VT == MVT::f128)
+ break;
+
+ MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32;
+ SDLoc dl(N);
+ SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
+ N->getOperand(0));
+ SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
+ N->getOperand(1));
+
+ SDValue Res;
+ if (Subtarget->hasSSE2()) {
+ EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
+ Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
+ Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
+ unsigned Opc;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
+ case X86ISD::FAND: Opc = ISD::AND; break;
+ case X86ISD::FOR: Opc = ISD::OR; break;
+ case X86ISD::FXOR: Opc = ISD::XOR; break;
+ }
+ Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
+ Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
+ } else {
+ Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
+ }
+ Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
+ CurDAG->getIntPtrConstant(0, dl));
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+ ++I;
+ MadeChange = true;
+ continue;
+ }
+ }
+
+ if (OptLevel != CodeGenOpt::None &&
+ // Only do this when the target can fold the load into the call or
+ // jmp.
+ !Subtarget->useIndirectThunkCalls() &&
+ ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
+ (N->getOpcode() == X86ISD::TC_RETURN &&
+ (Subtarget->is64Bit() ||
+ !getTargetMachine().isPositionIndependent())))) {
+ /// Also try moving call address load from outside callseq_start to just
+ /// before the call to allow it to be folded.
+ ///
+ /// [Load chain]
+ /// ^
+ /// |
+ /// [Load]
+ /// ^ ^
+ /// | |
+ /// / \--
+ /// / |
+ ///[CALLSEQ_START] |
+ /// ^ |
+ /// | |
+ /// [LOAD/C2Reg] |
+ /// | |
+ /// \ /
+ /// \ /
+ /// [CALL]
+ bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
+ SDValue Chain = N->getOperand(0);
+ SDValue Load = N->getOperand(1);
+ if (!isCalleeLoad(Load, Chain, HasCallSeq))
+ continue;
+ moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
+ ++NumLoadMoved;
+ MadeChange = true;
+ continue;
+ }
+
+    // Lower fpround and fpextend nodes that target the FP stack to be a store and
+    // a load to/from the stack. This is a gross hack. We would like to simply mark
+ // these as being illegal, but when we do that, legalize produces these when
+ // it expands calls, then expands these in the same legalize pass. We would
+ // like dag combine to be able to hack on these between the call expansion
+ // and the node legalization. As such this pass basically does "really
+ // late" legalization of these inline with the X86 isel pass.
+ // FIXME: This should only happen when not compiled with -O0.
+ switch (N->getOpcode()) {
+ default: continue;
+ case ISD::FP_ROUND:
+ case ISD::FP_EXTEND:
+ {
+ MVT SrcVT = N->getOperand(0).getSimpleValueType();
+ MVT DstVT = N->getSimpleValueType(0);
+
+ // If any of the sources are vectors, no fp stack involved.
+ if (SrcVT.isVector() || DstVT.isVector())
+ continue;
+
+ // If the source and destination are SSE registers, then this is a legal
+ // conversion that should not be lowered.
+ const X86TargetLowering *X86Lowering =
+ static_cast<const X86TargetLowering *>(TLI);
+ bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
+ bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
+ if (SrcIsSSE && DstIsSSE)
+ continue;
+
+ if (!SrcIsSSE && !DstIsSSE) {
+ // If this is an FPStack extension, it is a noop.
+ if (N->getOpcode() == ISD::FP_EXTEND)
+ continue;
+ // If this is a value-preserving FPStack truncation, it is a noop.
+ if (N->getConstantOperandVal(1))
+ continue;
+ }
+
+ // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
+ // FPStack has extload and truncstore. SSE can fold direct loads into other
+ // operations. Based on this, decide what we want to do.
+ MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
+ SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
+ int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
+ SDLoc dl(N);
+
+ // FIXME: optimize the case where the src/dest is a load or store?
+
+ SDValue Store = CurDAG->getTruncStore(
+ CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
+ SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
+ MemTmp, MPI, MemVT);
+
+ // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
+      // extload we created. This will cause general havoc on the DAG because
+ // anything below the conversion could be folded into other existing nodes.
+ // To avoid invalidating 'I', back it up to the convert node.
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
+ break;
+ }
+
+      // The sequence of events for lowering STRICT_FP versions of these nodes
+      // requires dealing with the chain differently, as there is already a
+      // preexisting chain.
+ case ISD::STRICT_FP_ROUND:
+ case ISD::STRICT_FP_EXTEND:
+ {
+ MVT SrcVT = N->getOperand(1).getSimpleValueType();
+ MVT DstVT = N->getSimpleValueType(0);
+
+ // If any of the sources are vectors, no fp stack involved.
+ if (SrcVT.isVector() || DstVT.isVector())
+ continue;
+
+ // If the source and destination are SSE registers, then this is a legal
+ // conversion that should not be lowered.
+ const X86TargetLowering *X86Lowering =
+ static_cast<const X86TargetLowering *>(TLI);
+ bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
+ bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
+ if (SrcIsSSE && DstIsSSE)
+ continue;
+
+ if (!SrcIsSSE && !DstIsSSE) {
+ // If this is an FPStack extension, it is a noop.
+ if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
+ continue;
+ // If this is a value-preserving FPStack truncation, it is a noop.
+ if (N->getConstantOperandVal(2))
+ continue;
+ }
+
+ // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
+ // FPStack has extload and truncstore. SSE can fold direct loads into other
+ // operations. Based on this, decide what we want to do.
+ MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
+ SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
+ int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
+ SDLoc dl(N);
+
+ // FIXME: optimize the case where the src/dest is a load or store?
+
+      // Since the operation is StrictFP, use the preexisting chain.
+ SDValue Store, Result;
+ if (!SrcIsSSE) {
+ SDVTList VTs = CurDAG->getVTList(MVT::Other);
+ SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
+ Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
+ MPI, /*Align*/ None,
+ MachineMemOperand::MOStore);
+ if (N->getFlags().hasNoFPExcept()) {
+ SDNodeFlags Flags = Store->getFlags();
+ Flags.setNoFPExcept(true);
+ Store->setFlags(Flags);
+ }
+ } else {
+ assert(SrcVT == MemVT && "Unexpected VT!");
+ Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
+ MPI);
+ }
+
+ if (!DstIsSSE) {
+ SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
+ SDValue Ops[] = {Store, MemTmp};
+ Result = CurDAG->getMemIntrinsicNode(
+ X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
+ /*Align*/ None, MachineMemOperand::MOLoad);
+ if (N->getFlags().hasNoFPExcept()) {
+ SDNodeFlags Flags = Result->getFlags();
+ Flags.setNoFPExcept(true);
+ Result->setFlags(Flags);
+ }
+ } else {
+ assert(DstVT == MemVT && "Unexpected VT!");
+ Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
+ }
+
+ // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
+      // extload we created. This will cause general havoc on the DAG because
+ // anything below the conversion could be folded into other existing nodes.
+ // To avoid invalidating 'I', back it up to the convert node.
+ --I;
+ CurDAG->ReplaceAllUsesWith(N, Result.getNode());
+ break;
+ }
+ }
+
+
+ // Now that we did that, the node is dead. Increment the iterator to the
+ // next node to process, then delete N.
+ ++I;
+ MadeChange = true;
+ }
+
+ // Remove any dead nodes that may have been left behind.
+ if (MadeChange)
+ CurDAG->RemoveDeadNodes();
+}
+
+// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
+bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
+ unsigned Opc = N->getMachineOpcode();
+ if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
+ Opc != X86::MOVSX64rr8)
+ return false;
+
+ SDValue N0 = N->getOperand(0);
+
+ // We need to be extracting the lower bit of an extend.
+ if (!N0.isMachineOpcode() ||
+ N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
+ N0.getConstantOperandVal(1) != X86::sub_8bit)
+ return false;
+
+ // We're looking for either a movsx or movzx to match the original opcode.
+ unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
+ : X86::MOVSX32rr8_NOREX;
+ SDValue N00 = N0.getOperand(0);
+ if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
+ return false;
+
+ if (Opc == X86::MOVSX64rr8) {
+    // If we had a sign extend from 8 to 64 bits, we still need to go from 32
+    // to 64.
+ MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
+ MVT::i64, N00);
+ ReplaceUses(N, Extend);
+ } else {
+ // Ok we can drop this extend and just use the original extend.
+ ReplaceUses(N, N00.getNode());
+ }
+
+ return true;
+}
+
+void X86DAGToDAGISel::PostprocessISelDAG() {
+ // Skip peepholes at -O0.
+ if (TM.getOptLevel() == CodeGenOpt::None)
+ return;
+
+ SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
+
+ bool MadeChange = false;
+ while (Position != CurDAG->allnodes_begin()) {
+ SDNode *N = &*--Position;
+ // Skip dead nodes and any non-machine opcodes.
+ if (N->use_empty() || !N->isMachineOpcode())
+ continue;
+
+ if (tryOptimizeRem8Extend(N)) {
+ MadeChange = true;
+ continue;
+ }
+
+ // Look for a TESTrr+ANDrr pattern where both operands of the test are
+ // the same. Rewrite to remove the AND.
+ unsigned Opc = N->getMachineOpcode();
+ if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr ||
+ Opc == X86::TEST32rr || Opc == X86::TEST64rr) &&
+ N->getOperand(0) == N->getOperand(1) &&
+ N->isOnlyUserOf(N->getOperand(0).getNode()) &&
+ N->getOperand(0).isMachineOpcode()) {
+ SDValue And = N->getOperand(0);
+ unsigned N0Opc = And.getMachineOpcode();
+ if (N0Opc == X86::AND8rr || N0Opc == X86::AND16rr ||
+ N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) {
+ MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N),
+ MVT::i32,
+ And.getOperand(0),
+ And.getOperand(1));
+ ReplaceUses(N, Test);
+ MadeChange = true;
+ continue;
+ }
+ if (N0Opc == X86::AND8rm || N0Opc == X86::AND16rm ||
+ N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) {
+ unsigned NewOpc;
+ switch (N0Opc) {
+ case X86::AND8rm: NewOpc = X86::TEST8mr; break;
+ case X86::AND16rm: NewOpc = X86::TEST16mr; break;
+ case X86::AND32rm: NewOpc = X86::TEST32mr; break;
+ case X86::AND64rm: NewOpc = X86::TEST64mr; break;
+ }
+
+ // Need to swap the memory and register operand.
+ SDValue Ops[] = { And.getOperand(1),
+ And.getOperand(2),
+ And.getOperand(3),
+ And.getOperand(4),
+ And.getOperand(5),
+ And.getOperand(0),
+ And.getOperand(6) /* Chain */ };
+ MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
+ MVT::i32, MVT::Other, Ops);
+ CurDAG->setNodeMemRefs(
+ Test, cast<MachineSDNode>(And.getNode())->memoperands());
+ ReplaceUses(N, Test);
+ MadeChange = true;
+ continue;
+ }
+ }
+
+ // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
+ // used. We're doing this late so we can prefer to fold the AND into masked
+ // comparisons. Doing that can be better for the live range of the mask
+ // register.
+ if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr ||
+ Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) &&
+ N->getOperand(0) == N->getOperand(1) &&
+ N->isOnlyUserOf(N->getOperand(0).getNode()) &&
+ N->getOperand(0).isMachineOpcode() &&
+ onlyUsesZeroFlag(SDValue(N, 0))) {
+ SDValue And = N->getOperand(0);
+ unsigned N0Opc = And.getMachineOpcode();
+ // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
+ // KAND instructions and KTEST use the same ISA feature.
+ if (N0Opc == X86::KANDBrr ||
+ (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) ||
+ N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) {
+ unsigned NewOpc;
+ switch (Opc) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break;
+ case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break;
+ case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break;
+ case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break;
+ }
+ MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N),
+ MVT::i32,
+ And.getOperand(0),
+ And.getOperand(1));
+ ReplaceUses(N, KTest);
+ MadeChange = true;
+ continue;
+ }
+ }
+
+    // Attempt to remove vector moves that were inserted to zero the upper bits.
+ if (Opc != TargetOpcode::SUBREG_TO_REG)
+ continue;
+
+ unsigned SubRegIdx = N->getConstantOperandVal(2);
+ if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
+ continue;
+
+ SDValue Move = N->getOperand(1);
+ if (!Move.isMachineOpcode())
+ continue;
+
+    // Make sure it's one of the move opcodes we recognize.
+ switch (Move.getMachineOpcode()) {
+ default:
+ continue;
+ case X86::VMOVAPDrr: case X86::VMOVUPDrr:
+ case X86::VMOVAPSrr: case X86::VMOVUPSrr:
+ case X86::VMOVDQArr: case X86::VMOVDQUrr:
+ case X86::VMOVAPDYrr: case X86::VMOVUPDYrr:
+ case X86::VMOVAPSYrr: case X86::VMOVUPSYrr:
+ case X86::VMOVDQAYrr: case X86::VMOVDQUYrr:
+ case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr:
+ case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr:
+ case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
+ case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
+ case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr:
+ case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr:
+ case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
+ case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
+ break;
+ }
+
+ SDValue In = Move.getOperand(0);
+ if (!In.isMachineOpcode() ||
+ In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
+ continue;
+
+    // Make sure the instruction has a VEX, XOP, or EVEX prefix. This excludes
+    // the SHA instructions, which use a legacy encoding.
+ uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
+ if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
+ (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
+ (TSFlags & X86II::EncodingMask) != X86II::XOP)
+ continue;
+
+    // The producing instruction is another vector instruction. We can drop the
+ // move.
+ CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
+ MadeChange = true;
+ }
+
+ if (MadeChange)
+ CurDAG->RemoveDeadNodes();
+}
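+
+// The TESTrr+ANDrr rewrite above is safe because TEST only produces flags from
+// the AND of its operands: test(a&b, a&b) and test(a, b) both observe a&b.
+// A standalone sanity check (illustration only):
+//
+//   #include <cassert>
+//   #include <cstdint>
+//
+//   int main() {
+//     uint32_t A = 0x00ff00f0, B = 0x0f0f0f0f;
+//     uint32_t AndAB = A & B;
+//     // ZF/SF/PF are functions of the ANDed value, and (A&B)&(A&B) == A&B,
+//     // so the separate AND node can be folded away.
+//     assert((AndAB & AndAB) == (A & B));
+//   }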
+
+
+/// Emit any code that needs to be executed only in the main function.
+void X86DAGToDAGISel::emitSpecialCodeForMain() {
+ if (Subtarget->isTargetCygMing()) {
+ TargetLowering::ArgListTy Args;
+ auto &DL = CurDAG->getDataLayout();
+
+ TargetLowering::CallLoweringInfo CLI(*CurDAG);
+ CLI.setChain(CurDAG->getRoot())
+ .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
+ CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
+ std::move(Args));
+ const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
+ std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
+ CurDAG->setRoot(Result.second);
+ }
+}
+
+void X86DAGToDAGISel::emitFunctionEntryCode() {
+ // If this is main, emit special code for main.
+ const Function &F = MF->getFunction();
+ if (F.hasExternalLinkage() && F.getName() == "main")
+ emitSpecialCodeForMain();
+}
+
+static bool isDispSafeForFrameIndex(int64_t Val) {
+ // On 64-bit platforms, we can run into an issue where a frame index
+ // includes a displacement that, when added to the explicit displacement,
+ // will overflow the displacement field. Assuming that the frame index
+ // displacement fits into a 31-bit integer (which is only slightly more
+ // aggressive than the current fundamental assumption that it fits into
+ // a 32-bit integer), a 31-bit disp should always be safe.
+ return isInt<31>(Val);
+}
+
+bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
+ X86ISelAddressMode &AM) {
+ // We may have already matched a displacement and the caller just added the
+ // symbolic displacement. So we still need to do the checks even if Offset
+ // is zero.
+
+ int64_t Val = AM.Disp + Offset;
+
+ // Cannot combine ExternalSymbol displacements with integer offsets.
+ if (Val != 0 && (AM.ES || AM.MCSym))
+ return true;
+
+ CodeModel::Model M = TM.getCodeModel();
+ if (Subtarget->is64Bit()) {
+ if (Val != 0 &&
+ !X86::isOffsetSuitableForCodeModel(Val, M,
+ AM.hasSymbolicDisplacement()))
+ return true;
+ // In addition to the checks required for a register base, check that
+ // we do not try to use an unsafe Disp with a frame index.
+ if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
+ !isDispSafeForFrameIndex(Val))
+ return true;
+ }
+ AM.Disp = Val;
+ return false;
+}
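+
+// Both checks above boil down to "does the combined displacement still fit in
+// a signed field of N bits" (roughly 32 bits for a plain displacement, 31 bits
+// when a frame index may add its own offset). A minimal standalone sketch of
+// that predicate (illustration only; LLVM itself uses llvm::isInt<N>):
+//
+//   #include <cassert>
+//   #include <cstdint>
+//
+//   // Runtime version of llvm::isInt<Bits>(V): V fits a Bits-bit signed field.
+//   bool fitsSignedBits(int64_t V, unsigned Bits) {
+//     int64_t Lo = -(int64_t(1) << (Bits - 1));
+//     int64_t Hi = (int64_t(1) << (Bits - 1)) - 1;
+//     return V >= Lo && V <= Hi;
+//   }
+//
+//   int main() {
+//     assert(fitsSignedBits((1LL << 30) - 1, 31));  // safe for a frame index
+//     assert(!fitsSignedBits(1LL << 30, 31));       // would risk overflowing
+//   }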
+
+bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
+ bool AllowSegmentRegForX32) {
+ SDValue Address = N->getOperand(1);
+
+ // load gs:0 -> GS segment register.
+ // load fs:0 -> FS segment register.
+ //
+ // This optimization is generally valid because the GNU TLS model defines that
+ // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
+ // with 32-bit registers, as we get in ILP32 mode, those registers are first
+  // zero-extended to 64 bits and then added to the base address, which gives
+ // unwanted results when the register holds a negative value.
+ // For more information see http://people.redhat.com/drepper/tls.pdf
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address)) {
+ if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
+ !IndirectTlsSegRefs &&
+ (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
+ Subtarget->isTargetFuchsia())) {
+ if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
+ return true;
+ switch (N->getPointerInfo().getAddrSpace()) {
+ case X86AS::GS:
+ AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
+ return false;
+ case X86AS::FS:
+ AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
+ return false;
+ // Address space X86AS::SS is not handled here, because it is not used to
+ // address TLS areas.
+ }
+ }
+ }
+
+ return true;
+}
+
+/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
+/// mode. These wrap things that will resolve down into a symbol reference.
+/// If no match is possible, this returns true, otherwise it returns false.
+bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
+ // If the addressing mode already has a symbol as the displacement, we can
+ // never match another symbol.
+ if (AM.hasSymbolicDisplacement())
+ return true;
+
+ bool IsRIPRelTLS = false;
+ bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
+ if (IsRIPRel) {
+ SDValue Val = N.getOperand(0);
+ if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
+ IsRIPRelTLS = true;
+ }
+
+ // We can't use an addressing mode in the 64-bit large code model.
+ // Global TLS addressing is an exception. In the medium code model,
+  // we can use a mode when RIP wrappers are present.
+ // That signifies access to globals that are known to be "near",
+ // such as the GOT itself.
+ CodeModel::Model M = TM.getCodeModel();
+ if (Subtarget->is64Bit() &&
+ ((M == CodeModel::Large && !IsRIPRelTLS) ||
+ (M == CodeModel::Medium && !IsRIPRel)))
+ return true;
+
+ // Base and index reg must be 0 in order to use %rip as base.
+ if (IsRIPRel && AM.hasBaseOrIndexReg())
+ return true;
+
+ // Make a local copy in case we can't do this fold.
+ X86ISelAddressMode Backup = AM;
+
+ int64_t Offset = 0;
+ SDValue N0 = N.getOperand(0);
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+ AM.GV = G->getGlobal();
+ AM.SymbolFlags = G->getTargetFlags();
+ Offset = G->getOffset();
+ } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+ AM.CP = CP->getConstVal();
+ AM.Alignment = CP->getAlign();
+ AM.SymbolFlags = CP->getTargetFlags();
+ Offset = CP->getOffset();
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+ AM.ES = S->getSymbol();
+ AM.SymbolFlags = S->getTargetFlags();
+ } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
+ AM.MCSym = S->getMCSymbol();
+ } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+ AM.JT = J->getIndex();
+ AM.SymbolFlags = J->getTargetFlags();
+ } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+ AM.BlockAddr = BA->getBlockAddress();
+ AM.SymbolFlags = BA->getTargetFlags();
+ Offset = BA->getOffset();
+ } else
+ llvm_unreachable("Unhandled symbol reference node.");
+
+ if (foldOffsetIntoAddress(Offset, AM)) {
+ AM = Backup;
+ return true;
+ }
+
+ if (IsRIPRel)
+ AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
+
+ // Commit the changes now that we know this fold is safe.
+ return false;
+}
+
+/// Add the specified node to the specified addressing mode, returning true if
+/// it cannot be done. This just pattern matches for the addressing mode.
+bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
+ if (matchAddressRecursively(N, AM, 0))
+ return true;
+
+ // Post-processing: Make a second attempt to fold a load, if we now know
+ // that there will not be any other register. This is only performed for
+ // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
+ // any foldable load the first time.
+ if (Subtarget->isTarget64BitILP32() &&
+ AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
+ SDValue Save_Base_Reg = AM.Base_Reg;
+ if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
+ AM.Base_Reg = SDValue();
+ if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
+ AM.Base_Reg = Save_Base_Reg;
+ }
+ }
+
+ // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
+ // a smaller encoding and avoids a scaled-index.
+ if (AM.Scale == 2 &&
+ AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() == nullptr) {
+ AM.Base_Reg = AM.IndexReg;
+ AM.Scale = 1;
+ }
+
+ // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
+ // because it has a smaller encoding.
+ // TODO: Which other code models can use this?
+ switch (TM.getCodeModel()) {
+ default: break;
+ case CodeModel::Small:
+ case CodeModel::Kernel:
+ if (Subtarget->is64Bit() &&
+ AM.Scale == 1 &&
+ AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() == nullptr &&
+ AM.IndexReg.getNode() == nullptr &&
+ AM.SymbolFlags == X86II::MO_NO_FLAG &&
+ AM.hasSymbolicDisplacement())
+ AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
+ break;
+ }
+
+ return false;
+}
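+
+// The Scale == 2 post-processing above uses the identity reg*2 == reg + reg*1,
+// so "(,%reg,2)" can be re-expressed with a base register and scale 1.
+// A standalone sanity check (illustration only):
+//
+//   #include <cassert>
+//   #include <cstdint>
+//
+//   int main() {
+//     uint64_t Reg = 0x1234;
+//     assert(Reg * 2 == Reg + Reg * 1);  // lea(,%reg,2) == lea(%reg,%reg)
+//   }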
+
+bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
+ unsigned Depth) {
+ // Add an artificial use to this node so that we can keep track of
+ // it if it gets CSE'd with a different node.
+ HandleSDNode Handle(N);
+
+ X86ISelAddressMode Backup = AM;
+ if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
+ !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
+ return false;
+ AM = Backup;
+
+  // Try again after commuting the operands.
+ if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
+ Depth + 1) &&
+ !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
+ return false;
+ AM = Backup;
+
+ // If we couldn't fold both operands into the address at the same time,
+ // see if we can just put each operand into a register and fold at least
+ // the add.
+ if (AM.BaseType == X86ISelAddressMode::RegBase &&
+ !AM.Base_Reg.getNode() &&
+ !AM.IndexReg.getNode()) {
+ N = Handle.getValue();
+ AM.Base_Reg = N.getOperand(0);
+ AM.IndexReg = N.getOperand(1);
+ AM.Scale = 1;
+ return false;
+ }
+ N = Handle.getValue();
+ return true;
+}
+
+// Insert a node into the DAG at least before the Pos node's position. This
+// will reposition the node as needed, and will assign it a node ID that is <=
+// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
+// IDs! The selection DAG must no longer depend on their uniqueness when this
+// is used.
+static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
+ if (N->getNodeId() == -1 ||
+ (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
+ SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
+ DAG.RepositionNode(Pos->getIterator(), N.getNode());
+    // Mark Node as invalid for pruning, as after this it may be a successor to a
+    // selected node but otherwise be in the same position as Pos.
+ // Conservatively mark it with the same -abs(Id) to assure node id
+ // invariant is preserved.
+ N->setNodeId(Pos->getNodeId());
+ SelectionDAGISel::InvalidateNodeId(N.getNode());
+ }
+}
+
+// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
+// safe. This allows us to convert the shift and and into an h-register
+// extract and a scaled index. Returns false if the simplification is
+// performed.
+static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
+ uint64_t Mask,
+ SDValue Shift, SDValue X,
+ X86ISelAddressMode &AM) {
+ if (Shift.getOpcode() != ISD::SRL ||
+ !isa<ConstantSDNode>(Shift.getOperand(1)) ||
+ !Shift.hasOneUse())
+ return true;
+
+ int ScaleLog = 8 - Shift.getConstantOperandVal(1);
+ if (ScaleLog <= 0 || ScaleLog >= 4 ||
+ Mask != (0xffu << ScaleLog))
+ return true;
+
+ MVT VT = N.getSimpleValueType();
+ SDLoc DL(N);
+ SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
+ SDValue NewMask = DAG.getConstant(0xff, DL, VT);
+ SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
+ SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
+ SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
+ SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);
+
+ // Insert the new nodes into the topological ordering. We must do this in
+ // a valid topological ordering as nothing is going to go back and re-sort
+ // these nodes. We continually insert before 'N' in sequence as this is
+ // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+ // hierarchy left to express.
+ insertDAGNode(DAG, N, Eight);
+ insertDAGNode(DAG, N, Srl);
+ insertDAGNode(DAG, N, NewMask);
+ insertDAGNode(DAG, N, And);
+ insertDAGNode(DAG, N, ShlCount);
+ insertDAGNode(DAG, N, Shl);
+ DAG.ReplaceAllUsesWith(N, Shl);
+ DAG.RemoveDeadNode(N.getNode());
+ AM.IndexReg = And;
+ AM.Scale = (1 << ScaleLog);
+ return false;
+}
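+
+// A standalone sanity check of the identity used above,
+// (X >> (8-C1)) & (0xff << C1) == ((X >> 8) & 0xff) << C1, for the scale
+// amounts the addressing mode can encode (illustration only):
+//
+//   #include <cassert>
+//   #include <cstdint>
+//
+//   int main() {
+//     uint64_t X = 0x1122334455667788ULL;
+//     for (unsigned C1 = 1; C1 <= 3; ++C1)
+//       assert(((X >> (8 - C1)) & (0xffULL << C1)) ==
+//              (((X >> 8) & 0xffULL) << C1));
+//   }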
+
+// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
+// allows us to fold the shift into this addressing mode. Returns false if the
+// transform succeeded.
+static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
+ X86ISelAddressMode &AM) {
+ SDValue Shift = N.getOperand(0);
+
+ // Use a signed mask so that shifting right will insert sign bits. These
+ // bits will be removed when we shift the result left so it doesn't matter
+ // what we use. This might allow a smaller immediate encoding.
+ int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
+
+ // If we have an any_extend feeding the AND, look through it to see if there
+ // is a shift behind it. But only if the AND doesn't use the extended bits.
+ // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
+ bool FoundAnyExtend = false;
+ if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
+ Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
+ isUInt<32>(Mask)) {
+ FoundAnyExtend = true;
+ Shift = Shift.getOperand(0);
+ }
+
+ if (Shift.getOpcode() != ISD::SHL ||
+ !isa<ConstantSDNode>(Shift.getOperand(1)))
+ return true;
+
+ SDValue X = Shift.getOperand(0);
+
+ // Not likely to be profitable if either the AND or SHIFT node has more
+ // than one use (unless all uses are for address computation). Besides,
+ // isel mechanism requires their node ids to be reused.
+ if (!N.hasOneUse() || !Shift.hasOneUse())
+ return true;
+
+ // Verify that the shift amount is something we can fold.
+ unsigned ShiftAmt = Shift.getConstantOperandVal(1);
+ if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
+ return true;
+
+ MVT VT = N.getSimpleValueType();
+ SDLoc DL(N);
+ if (FoundAnyExtend) {
+ SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
+ insertDAGNode(DAG, N, NewX);
+ X = NewX;
+ }
+
+ SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
+ SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
+ SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
+
+ // Insert the new nodes into the topological ordering. We must do this in
+ // a valid topological ordering as nothing is going to go back and re-sort
+ // these nodes. We continually insert before 'N' in sequence as this is
+ // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+ // hierarchy left to express.
+ insertDAGNode(DAG, N, NewMask);
+ insertDAGNode(DAG, N, NewAnd);
+ insertDAGNode(DAG, N, NewShift);
+ DAG.ReplaceAllUsesWith(N, NewShift);
+ DAG.RemoveDeadNode(N.getNode());
+
+ AM.Scale = 1 << ShiftAmt;
+ AM.IndexReg = NewAnd;
+ return false;
+}
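+
+// The rewrite above relies on the identity
+// (X << C1) & C2 == (X & (C2 >> C1)) << C1 for fixed-width unsigned values,
+// which lets the SHL migrate outward and become the address scale.
+// A standalone sanity check (illustration only):
+//
+//   #include <cassert>
+//   #include <cstdint>
+//
+//   int main() {
+//     uint64_t X = 0xdeadbeefcafef00dULL, C2 = 0x0000fff0ULL;
+//     for (unsigned C1 = 1; C1 <= 3; ++C1)
+//       assert(((X << C1) & C2) == ((X & (C2 >> C1)) << C1));
+//   }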
+
+// Implement some heroics to detect shifts of masked values where the mask can
+// be replaced by extending the shift and undoing that in the addressing mode
+// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
+// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
+// the addressing mode. This results in code such as:
+//
+// int f(short *y, int *lookup_table) {
+// ...
+// return *y + lookup_table[*y >> 11];
+// }
+//
+// Turning into:
+// movzwl (%rdi), %eax
+// movl %eax, %ecx
+// shrl $11, %ecx
+// addl (%rsi,%rcx,4), %eax
+//
+// Instead of:
+// movzwl (%rdi), %eax
+// movl %eax, %ecx
+// shrl $9, %ecx
+// andl $124, %rcx
+// addl (%rsi,%rcx), %eax
+//
+// Note that this function assumes the mask is provided as a mask *after* the
+// value is shifted. The input chain may or may not match that, but computing
+// such a mask is trivial.
+static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
+ uint64_t Mask,
+ SDValue Shift, SDValue X,
+ X86ISelAddressMode &AM) {
+ if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
+ !isa<ConstantSDNode>(Shift.getOperand(1)))
+ return true;
+
+ unsigned ShiftAmt = Shift.getConstantOperandVal(1);
+ unsigned MaskLZ = countLeadingZeros(Mask);
+ unsigned MaskTZ = countTrailingZeros(Mask);
+
+ // The amount of shift we're trying to fit into the addressing mode is taken
+ // from the trailing zeros of the mask.
+ unsigned AMShiftAmt = MaskTZ;
+
+ // There is nothing we can do here unless the mask is removing some bits.
+ // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
+ if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
+
+  // We also need to ensure that the mask is a contiguous run of bits.
+ if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
+
+ // Scale the leading zero count down based on the actual size of the value.
+ // Also scale it down based on the size of the shift.
+ unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
+ if (MaskLZ < ScaleDown)
+ return true;
+ MaskLZ -= ScaleDown;
+
+ // The final check is to ensure that any masked out high bits of X are
+ // already known to be zero. Otherwise, the mask has a semantic impact
+ // other than masking out a couple of low bits. Unfortunately, because of
+ // the mask, zero extensions will be removed from operands in some cases.
+ // This code works extra hard to look through extensions because we can
+ // replace them with zero extensions cheaply if necessary.
+ bool ReplacingAnyExtend = false;
+ if (X.getOpcode() == ISD::ANY_EXTEND) {
+ unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
+ X.getOperand(0).getSimpleValueType().getSizeInBits();
+ // Assume that we'll replace the any-extend with a zero-extend, and
+ // narrow the search to the extended value.
+ X = X.getOperand(0);
+ MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
+ ReplacingAnyExtend = true;
+ }
+ APInt MaskedHighBits =
+ APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
+ KnownBits Known = DAG.computeKnownBits(X);
+ if (MaskedHighBits != Known.Zero) return true;
+
+ // We've identified a pattern that can be transformed into a single shift
+ // and an addressing mode. Make it so.
+ MVT VT = N.getSimpleValueType();
+ if (ReplacingAnyExtend) {
+ assert(X.getValueType() != VT);
+ // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
+ SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
+ insertDAGNode(DAG, N, NewX);
+ X = NewX;
+ }
+ SDLoc DL(N);
+ SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
+ SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
+ SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
+ SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
+
+ // Insert the new nodes into the topological ordering. We must do this in
+ // a valid topological ordering as nothing is going to go back and re-sort
+ // these nodes. We continually insert before 'N' in sequence as this is
+ // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+ // hierarchy left to express.
+ insertDAGNode(DAG, N, NewSRLAmt);
+ insertDAGNode(DAG, N, NewSRL);
+ insertDAGNode(DAG, N, NewSHLAmt);
+ insertDAGNode(DAG, N, NewSHL);
+ DAG.ReplaceAllUsesWith(N, NewSHL);
+ DAG.RemoveDeadNode(N.getNode());
+
+ AM.Scale = 1 << AMShiftAmt;
+ AM.IndexReg = NewSRL;
+ return false;
+}
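+
+// For the lookup_table example in the comment above, this transform recovers
+// (y >> 11) * 4 from the canonicalized (y >> 9) & 124 form. That is valid
+// because the mask is a contiguous run of bits and the masked-out high bits of
+// the shifted value are already zero for a zero-extended 16-bit y.
+// A standalone sanity check (illustration only):
+//
+//   #include <cassert>
+//   #include <cstdint>
+//
+//   int main() {
+//     for (uint32_t Y = 0; Y <= 0xffff; ++Y)     // zero-extended i16 values
+//       assert(((Y >> 9) & 124u) == ((Y >> 11) << 2));
+//   }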
+
+// Transform "(X >> SHIFT) & (MASK << C1)" to
+// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
+// matched to a BEXTR later. Returns false if the simplification is performed.
+static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
+ uint64_t Mask,
+ SDValue Shift, SDValue X,
+ X86ISelAddressMode &AM,
+ const X86Subtarget &Subtarget) {
+ if (Shift.getOpcode() != ISD::SRL ||
+ !isa<ConstantSDNode>(Shift.getOperand(1)) ||
+ !Shift.hasOneUse() || !N.hasOneUse())
+ return true;
+
+ // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
+ if (!Subtarget.hasTBM() &&
+ !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
+ return true;
+
+  // We need to ensure that the mask is a contiguous run of bits.
+ if (!isShiftedMask_64(Mask)) return true;
+
+ unsigned ShiftAmt = Shift.getConstantOperandVal(1);
+
+ // The amount of shift we're trying to fit into the addressing mode is taken
+ // from the trailing zeros of the mask.
+ unsigned AMShiftAmt = countTrailingZeros(Mask);
+
+ // There is nothing we can do here unless the mask is removing some bits.
+ // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
+ if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
+
+ MVT VT = N.getSimpleValueType();
+ SDLoc DL(N);
+ SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
+ SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
+ SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT);
+ SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask);
+ SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
+ SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt);
+
+ // Insert the new nodes into the topological ordering. We must do this in
+ // a valid topological ordering as nothing is going to go back and re-sort
+ // these nodes. We continually insert before 'N' in sequence as this is
+ // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+ // hierarchy left to express.
+ insertDAGNode(DAG, N, NewSRLAmt);
+ insertDAGNode(DAG, N, NewSRL);
+ insertDAGNode(DAG, N, NewMask);
+ insertDAGNode(DAG, N, NewAnd);
+ insertDAGNode(DAG, N, NewSHLAmt);
+ insertDAGNode(DAG, N, NewSHL);
+ DAG.ReplaceAllUsesWith(N, NewSHL);
+ DAG.RemoveDeadNode(N.getNode());
+
+ AM.Scale = 1 << AMShiftAmt;
+ AM.IndexReg = NewAnd;
+ return false;
+}
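+
+// The BEXTR-oriented rewrite above is an instance of the general identity
+// that, for a contiguous mask M with TZ trailing zero bits,
+// (X >> S) & M == ((X >> (S + TZ)) & (M >> TZ)) << TZ.
+// A standalone sanity check (illustration only):
+//
+//   #include <cassert>
+//   #include <cstdint>
+//
+//   int main() {
+//     uint64_t X = 0x0123456789abcdefULL;
+//     uint64_t M = 0x3f8;                 // contiguous mask, 3 trailing zeros
+//     unsigned S = 5, TZ = 3;
+//     assert(((X >> S) & M) == (((X >> (S + TZ)) & (M >> TZ)) << TZ));
+//   }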
+
+bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
+ unsigned Depth) {
+ SDLoc dl(N);
+ LLVM_DEBUG({
+ dbgs() << "MatchAddress: ";
+ AM.dump(CurDAG);
+ });
+ // Limit recursion.
+ if (Depth > 5)
+ return matchAddressBase(N, AM);
+
+ // If this is already a %rip relative address, we can only merge immediates
+ // into it. Instead of handling this in every case, we handle it here.
+ // RIP relative addressing: %rip + 32-bit displacement!
+ if (AM.isRIPRelative()) {
+    // FIXME: JumpTable and ExternalSymbol addresses currently don't like
+ // displacements. It isn't very important, but this should be fixed for
+ // consistency.
+ if (!(AM.ES || AM.MCSym) && AM.JT != -1)
+ return true;
+
+ if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
+ if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
+ return false;
+ return true;
+ }
+
+ switch (N.getOpcode()) {
+ default: break;
+ case ISD::LOCAL_RECOVER: {
+ if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
+ if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
+ // Use the symbol and don't prefix it.
+ AM.MCSym = ESNode->getMCSymbol();
+ return false;
+ }
+ break;
+ }
+ case ISD::Constant: {
+ uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+ if (!foldOffsetIntoAddress(Val, AM))
+ return false;
+ break;
+ }
+
+ case X86ISD::Wrapper:
+ case X86ISD::WrapperRIP:
+ if (!matchWrapper(N, AM))
+ return false;
+ break;
+
+ case ISD::LOAD:
+ if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
+ return false;
+ break;
+
+ case ISD::FrameIndex:
+ if (AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() == nullptr &&
+ (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
+ AM.BaseType = X86ISelAddressMode::FrameIndexBase;
+ AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
+ return false;
+ }
+ break;
+
+ case ISD::SHL:
+ if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
+ break;
+
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ unsigned Val = CN->getZExtValue();
+ // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
+ // that the base operand remains free for further matching. If
+ // the base doesn't end up getting used, a post-processing step
+ // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
+ if (Val == 1 || Val == 2 || Val == 3) {
+ AM.Scale = 1 << Val;
+ SDValue ShVal = N.getOperand(0);
+
+ // Okay, we know that we have a scale by now. However, if the scaled
+ // value is an add of something and a constant, we can fold the
+ // constant into the disp field here.
+ if (CurDAG->isBaseWithConstantOffset(ShVal)) {
+ AM.IndexReg = ShVal.getOperand(0);
+ ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
+ uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
+ if (!foldOffsetIntoAddress(Disp, AM))
+ return false;
+ }
+
+ AM.IndexReg = ShVal;
+ return false;
+ }
+ }
+ break;
+
+ case ISD::SRL: {
+ // Scale must not be used already.
+ if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
+
+ // We only handle up to 64-bit values here as those are what matter for
+ // addressing mode optimizations.
+ assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
+ "Unexpected value size!");
+
+ SDValue And = N.getOperand(0);
+ if (And.getOpcode() != ISD::AND) break;
+ SDValue X = And.getOperand(0);
+
+ // The mask used for the transform is expected to be post-shift, but we
+ // found the shift first so just apply the shift to the mask before passing
+ // it down.
+ if (!isa<ConstantSDNode>(N.getOperand(1)) ||
+ !isa<ConstantSDNode>(And.getOperand(1)))
+ break;
+ uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
+
+ // Try to fold the mask and shift into the scale, and return false if we
+ // succeed.
+ if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
+ return false;
+ break;
+ }
+
+ case ISD::SMUL_LOHI:
+ case ISD::UMUL_LOHI:
+ // A mul_lohi where we need the low part can be folded as a plain multiply.
+ if (N.getResNo() != 0) break;
+ LLVM_FALLTHROUGH;
+ case ISD::MUL:
+ case X86ISD::MUL_IMM:
+ // X*[3,5,9] -> X+X*[2,4,8]
+ if (AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() == nullptr &&
+ AM.IndexReg.getNode() == nullptr) {
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
+ if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
+ CN->getZExtValue() == 9) {
+ AM.Scale = unsigned(CN->getZExtValue())-1;
+
+ SDValue MulVal = N.getOperand(0);
+ SDValue Reg;
+
+ // Okay, we know that we have a scale by now. However, if the scaled
+ // value is an add of something and a constant, we can fold the
+ // constant into the disp field here.
+ if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
+ isa<ConstantSDNode>(MulVal.getOperand(1))) {
+ Reg = MulVal.getOperand(0);
+ ConstantSDNode *AddVal =
+ cast<ConstantSDNode>(MulVal.getOperand(1));
+ uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
+ if (foldOffsetIntoAddress(Disp, AM))
+ Reg = N.getOperand(0);
+ } else {
+ Reg = N.getOperand(0);
+ }
+
+ AM.IndexReg = AM.Base_Reg = Reg;
+ return false;
+ }
+ }
+ break;
+
+ case ISD::SUB: {
+    // Given A-B, if A can be completely folded into the address, leaving the
+    // index field unused, use -B as the index. This is a win if A has multiple
+    // parts that can be folded into the address. It also saves a mov if the
+    // base register has other uses, since it avoids a two-address sub
+    // instruction; however, it costs an additional mov if the index register
+    // has other uses.
+
+ // Add an artificial use to this node so that we can keep track of
+ // it if it gets CSE'd with a different node.
+ HandleSDNode Handle(N);
+
+ // Test if the LHS of the sub can be folded.
+ X86ISelAddressMode Backup = AM;
+ if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
+ N = Handle.getValue();
+ AM = Backup;
+ break;
+ }
+ N = Handle.getValue();
+ // Test if the index field is free for use.
+ if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
+ AM = Backup;
+ break;
+ }
+
+ int Cost = 0;
+ SDValue RHS = N.getOperand(1);
+ // If the RHS involves a register with multiple uses, this
+ // transformation incurs an extra mov, due to the neg instruction
+ // clobbering its operand.
+ if (!RHS.getNode()->hasOneUse() ||
+ RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
+ RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
+ RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
+ (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
+ RHS.getOperand(0).getValueType() == MVT::i32))
+ ++Cost;
+ // If the base is a register with multiple uses, this
+ // transformation may save a mov.
+ if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
+ !AM.Base_Reg.getNode()->hasOneUse()) ||
+ AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+ --Cost;
+ // If the folded LHS was interesting, this transformation saves
+ // address arithmetic.
+ if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
+ ((AM.Disp != 0) && (Backup.Disp == 0)) +
+ (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
+ --Cost;
+ // If it doesn't look like it may be an overall win, don't do it.
+ if (Cost >= 0) {
+ AM = Backup;
+ break;
+ }
+
+ // Ok, the transformation is legal and appears profitable. Go for it.
+ // Negation will be emitted later to avoid creating dangling nodes if this
+ // was an unprofitable LEA.
+ AM.IndexReg = RHS;
+ AM.NegateIndex = true;
+ AM.Scale = 1;
+ return false;
+ }
+
+ case ISD::ADD:
+ if (!matchAdd(N, AM, Depth))
+ return false;
+ break;
+
+ case ISD::OR:
+ // We want to look through a transform in InstCombine and DAGCombiner that
+ // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
+ // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
+ // An 'lea' can then be used to match the shift (multiply) and add:
+ // and $1, %esi
+ // lea (%rsi, %rdi, 8), %rax
+ if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
+ !matchAdd(N, AM, Depth))
+ return false;
+ break;
+
+ case ISD::AND: {
+ // Perform some heroic transforms on an and of a constant-count shift
+ // with a constant to enable use of the scaled offset field.
+
+ // Scale must not be used already.
+ if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
+
+ // We only handle up to 64-bit values here as those are what matter for
+ // addressing mode optimizations.
+ assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
+ "Unexpected value size!");
+
+ if (!isa<ConstantSDNode>(N.getOperand(1)))
+ break;
+
+ if (N.getOperand(0).getOpcode() == ISD::SRL) {
+ SDValue Shift = N.getOperand(0);
+ SDValue X = Shift.getOperand(0);
+
+ uint64_t Mask = N.getConstantOperandVal(1);
+
+ // Try to fold the mask and shift into an extract and scale.
+ if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
+ return false;
+
+ // Try to fold the mask and shift directly into the scale.
+ if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
+ return false;
+
+ // Try to fold the mask and shift into BEXTR and scale.
+ if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
+ return false;
+ }
+
+ // Try to swap the mask and shift to place shifts which can be done as
+ // a scale on the outside of the mask.
+ if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
+ return false;
+
+ break;
+ }
+ case ISD::ZERO_EXTEND: {
+ // Try to widen a zexted shift left to the same size as its use, so we can
+ // match the shift as a scale factor.
+ if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
+ break;
+ if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse())
+ break;
+
+ // Give up if the shift is not a valid scale factor [1,2,3].
+ SDValue Shl = N.getOperand(0);
+ auto *ShAmtC = dyn_cast<ConstantSDNode>(Shl.getOperand(1));
+ if (!ShAmtC || ShAmtC->getZExtValue() > 3)
+ break;
+
+ // The narrow shift must only shift out zero bits (it must be 'nuw').
+ // That makes it safe to widen to the destination type.
+ APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(),
+ ShAmtC->getZExtValue());
+ if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros))
+ break;
+
+ // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C)
+ MVT VT = N.getSimpleValueType();
+ SDLoc DL(N);
+ SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0));
+ SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1));
+
+ // Convert the shift to scale factor.
+ AM.Scale = 1 << ShAmtC->getZExtValue();
+ AM.IndexReg = Zext;
+
+ insertDAGNode(*CurDAG, N, Zext);
+ insertDAGNode(*CurDAG, N, NewShl);
+ CurDAG->ReplaceAllUsesWith(N, NewShl);
+ CurDAG->RemoveDeadNode(N.getNode());
+ return false;
+ }
+ }
+
+ return matchAddressBase(N, AM);
+}
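+
+// The MUL case above rewrites X*{3,5,9} as X + X*{2,4,8}, which is exactly the
+// base + index*scale shape a single LEA can compute.
+// A standalone sanity check (illustration only):
+//
+//   #include <cassert>
+//   #include <cstdint>
+//
+//   int main() {
+//     uint64_t X = 0x1000;
+//     assert(X * 3 == X + X * 2);  // lea (%rax,%rax,2)
+//     assert(X * 5 == X + X * 4);  // lea (%rax,%rax,4)
+//     assert(X * 9 == X + X * 8);  // lea (%rax,%rax,8)
+//   }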
+
+/// Helper for MatchAddress. Add the specified node to the
+/// specified addressing mode without any further recursion.
+bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
+ // Is the base register already occupied?
+ if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
+ // If so, check to see if the scale index register is set.
+ if (!AM.IndexReg.getNode()) {
+ AM.IndexReg = N;
+ AM.Scale = 1;
+ return false;
+ }
+
+ // Otherwise, we cannot select it.
+ return true;
+ }
+
+ // Default, generate it as a register.
+ AM.BaseType = X86ISelAddressMode::RegBase;
+ AM.Base_Reg = N;
+ return false;
+}
+
+/// Helper for selectVectorAddr. Handles things that can be folded into a
+/// gather/scatter address. The index register and scale should have already
+/// been handled.
+bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
+ // TODO: Support other operations.
+ switch (N.getOpcode()) {
+ case ISD::Constant: {
+ uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+ if (!foldOffsetIntoAddress(Val, AM))
+ return false;
+ break;
+ }
+ case X86ISD::Wrapper:
+ if (!matchWrapper(N, AM))
+ return false;
+ break;
+ }
+
+ return matchAddressBase(N, AM);
+}
+
+bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
+ SDValue IndexOp, SDValue ScaleOp,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment) {
+ X86ISelAddressMode AM;
+ AM.IndexReg = IndexOp;
+ AM.Scale = cast<ConstantSDNode>(ScaleOp)->getZExtValue();
+
+ unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
+ if (AddrSpace == X86AS::GS)
+ AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
+ if (AddrSpace == X86AS::FS)
+ AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
+ if (AddrSpace == X86AS::SS)
+ AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
+
+ SDLoc DL(BasePtr);
+ MVT VT = BasePtr.getSimpleValueType();
+
+ // Try to match into the base and displacement fields.
+ if (matchVectorAddress(BasePtr, AM))
+ return false;
+
+ getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
+ return true;
+}
+
+/// Returns true if it is able to pattern match an addressing mode.
+/// It returns the operands which make up the maximal addressing mode it can
+/// match by reference.
+///
+/// Parent is the parent node of the addr operand that is being matched. It
+/// is always a load, store, atomic node, or null. It is only null when
+/// checking memory operands for inline asm nodes.
+bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ X86ISelAddressMode AM;
+
+ if (Parent &&
+      // This list of opcodes covers all the nodes that have an "addr:$ptr" operand
+      // but are not MemSDNodes, and thus don't have proper addrspace info.
+ Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
+ Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
+ Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
+ Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
+ Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
+ Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
+ Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
+ unsigned AddrSpace =
+ cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
+ if (AddrSpace == X86AS::GS)
+ AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
+ if (AddrSpace == X86AS::FS)
+ AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
+ if (AddrSpace == X86AS::SS)
+ AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
+ }
+
+ // Save the DL and VT before calling matchAddress, it can invalidate N.
+ SDLoc DL(N);
+ MVT VT = N.getSimpleValueType();
+
+ if (matchAddress(N, AM))
+ return false;
+
+ getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
+ return true;
+}
+
+bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
+ // In static codegen with small code model, we can get the address of a label
+ // into a register with 'movl'
+ if (N->getOpcode() != X86ISD::Wrapper)
+ return false;
+
+ N = N.getOperand(0);
+
+ // At least GNU as does not accept 'movl' for TPOFF relocations.
+ // FIXME: We could use 'movl' when we know we are targeting MC.
+ if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
+ return false;
+
+ Imm = N;
+ if (N->getOpcode() != ISD::TargetGlobalAddress)
+ return TM.getCodeModel() == CodeModel::Small;
+
+ Optional<ConstantRange> CR =
+ cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange();
+ if (!CR)
+ return TM.getCodeModel() == CodeModel::Small;
+
+ return CR->getUnsignedMax().ult(1ull << 32);
+}
+
+bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
+ SDLoc DL(N);
+
+ if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
+ return false;
+
+ RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
+ if (RN && RN->getReg() == 0)
+ Base = CurDAG->getRegister(0, MVT::i64);
+ else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
+ // Base could already be %rip, particularly in the x32 ABI.
+ SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
+ MVT::i64), 0);
+ Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
+ Base);
+ }
+
+ RN = dyn_cast<RegisterSDNode>(Index);
+ if (RN && RN->getReg() == 0)
+ Index = CurDAG->getRegister(0, MVT::i64);
+ else {
+ assert(Index.getValueType() == MVT::i32 &&
+ "Expect to be extending 32-bit registers for use in LEA");
+ SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
+ MVT::i64), 0);
+ Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
+ Index);
+ }
+
+ return true;
+}
+
+/// Calls SelectAddr and determines if the maximal addressing
+/// mode it matches can be cost effectively emitted as an LEA instruction.
+bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment) {
+ X86ISelAddressMode AM;
+
+ // Save the DL and VT before calling matchAddress, it can invalidate N.
+ SDLoc DL(N);
+ MVT VT = N.getSimpleValueType();
+
+ // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
+ // segments.
+ SDValue Copy = AM.Segment;
+ SDValue T = CurDAG->getRegister(0, MVT::i32);
+ AM.Segment = T;
+ if (matchAddress(N, AM))
+ return false;
+ assert (T == AM.Segment);
+ AM.Segment = Copy;
+
+ unsigned Complexity = 0;
+ if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
+ Complexity = 1;
+ else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+ Complexity = 4;
+
+ if (AM.IndexReg.getNode())
+ Complexity++;
+
+ // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
+ // a simple shift.
+ if (AM.Scale > 1)
+ Complexity++;
+
+ // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
+ // to a LEA. This is determined with some experimentation but is by no means
+ // optimal (especially for code size consideration). LEA is nice because of
+ // its three-address nature. Tweak the cost function again when we can run
+ // convertToThreeAddress() at register allocation time.
+ if (AM.hasSymbolicDisplacement()) {
+ // For X86-64, always use LEA to materialize RIP-relative addresses.
+ if (Subtarget->is64Bit())
+ Complexity = 4;
+ else
+ Complexity += 2;
+ }
+
+ // Heuristic: try harder to form an LEA from ADD if the operands set flags.
+ // Unlike ADD, LEA does not affect flags, so we will be less likely to require
+ // duplicating flag-producing instructions later in the pipeline.
+ if (N.getOpcode() == ISD::ADD) {
+ auto isMathWithFlags = [](SDValue V) {
+ switch (V.getOpcode()) {
+ case X86ISD::ADD:
+ case X86ISD::SUB:
+ case X86ISD::ADC:
+ case X86ISD::SBB:
+ /* TODO: These opcodes can be added safely, but we may want to justify
+ their inclusion for different reasons (better for reg-alloc).
+ case X86ISD::SMUL:
+ case X86ISD::UMUL:
+ case X86ISD::OR:
+ case X86ISD::XOR:
+ case X86ISD::AND:
+ */
+ // Value 1 is the flag output of the node - verify it's not dead.
+ return !SDValue(V.getNode(), 1).use_empty();
+ default:
+ return false;
+ }
+ };
+ // TODO: This could be an 'or' rather than 'and' to make the transform more
+ // likely to happen. We might want to factor in whether there's a
+ // load folding opportunity for the math op that disappears with LEA.
+ if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1)))
+ Complexity++;
+ }
+
+ if (AM.Disp)
+ Complexity++;
+
+ // If it isn't worth using an LEA, reject it.
+ if (Complexity <= 2)
+ return false;
+
+ getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
+ return true;
+}
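+
+// The "complexity" heuristic above decides whether the matched address is rich
+// enough that a flag-preserving, three-address LEA beats plain ADD/SHL
+// arithmetic. The value an LEA itself produces is just the address expression
+// (a standalone sketch, illustration only):
+//
+//   #include <cassert>
+//   #include <cstdint>
+//
+//   // lea disp(base, index, scale): base + index*scale + disp, EFLAGS untouched.
+//   uint64_t leaValue(uint64_t Base, uint64_t Index, unsigned Scale,
+//                     int64_t Disp) {
+//     return Base + Index * Scale + static_cast<uint64_t>(Disp);
+//   }
+//
+//   int main() { assert(leaValue(0x1000, 4, 8, 16) == 0x1030); }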
+
+/// This is only run on TargetGlobalTLSAddress nodes.
+bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
+ const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
+
+ X86ISelAddressMode AM;
+ AM.GV = GA->getGlobal();
+ AM.Disp += GA->getOffset();
+ AM.SymbolFlags = GA->getTargetFlags();
+
+ if (Subtarget->is32Bit()) {
+ AM.Scale = 1;
+ AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
+ }
+
+ MVT VT = N.getSimpleValueType();
+ getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
+ return true;
+}
+
+bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
+ // Keep track of the original value type and whether this value was
+ // truncated. If we see a truncation from pointer type to VT that truncates
+ // bits that are known to be zero, we can use a narrow reference.
+ EVT VT = N.getValueType();
+ bool WasTruncated = false;
+ if (N.getOpcode() == ISD::TRUNCATE) {
+ WasTruncated = true;
+ N = N.getOperand(0);
+ }
+
+ if (N.getOpcode() != X86ISD::Wrapper)
+ return false;
+
+ // We can only use non-GlobalValues as immediates if they were not truncated,
+ // as we do not have any range information. If we have a GlobalValue and the
+ // address was not truncated, we can select it as an operand directly.
+ unsigned Opc = N.getOperand(0)->getOpcode();
+ if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
+ Op = N.getOperand(0);
+ // We can only select the operand directly if we didn't have to look past a
+ // truncate.
+ return !WasTruncated;
+ }
+
+ // Check that the global's range fits into VT.
+ auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
+ Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
+ if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
+ return false;
+
+ // Okay, we can use a narrow reference.
+ Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
+ GA->getOffset(), GA->getTargetFlags());
+ return true;
+}
+
+bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment) {
+ assert(Root && P && "Unknown root/parent nodes");
+ if (!ISD::isNON_EXTLoad(N.getNode()) ||
+ !IsProfitableToFold(N, P, Root) ||
+ !IsLegalToFold(N, P, Root, OptLevel))
+ return false;
+
+ return selectAddr(N.getNode(),
+ N.getOperand(1), Base, Scale, Index, Disp, Segment);
+}
+
+bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment) {
+ assert(Root && P && "Unknown root/parent nodes");
+ if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
+ !IsProfitableToFold(N, P, Root) ||
+ !IsLegalToFold(N, P, Root, OptLevel))
+ return false;
+
+ return selectAddr(N.getNode(),
+ N.getOperand(1), Base, Scale, Index, Disp, Segment);
+}
+
+/// Return an SDNode that returns the value of the global base register.
+/// Output instructions required to initialize the global base register,
+/// if necessary.
+SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
+ unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
+ auto &DL = MF->getDataLayout();
+ return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
+}
+
+bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
+ if (N->getOpcode() == ISD::TRUNCATE)
+ N = N->getOperand(0).getNode();
+ if (N->getOpcode() != X86ISD::Wrapper)
+ return false;
+
+ auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
+ if (!GA)
+ return false;
+
+ Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
+ if (!CR)
+ return Width == 32 && TM.getCodeModel() == CodeModel::Small;
+
+ return CR->getSignedMin().sge(-1ull << Width) &&
+ CR->getSignedMax().slt(1ull << Width);
+}
+
+static X86::CondCode getCondFromNode(SDNode *N) {
+ assert(N->isMachineOpcode() && "Unexpected node");
+ X86::CondCode CC = X86::COND_INVALID;
+ unsigned Opc = N->getMachineOpcode();
+ if (Opc == X86::JCC_1)
+ CC = static_cast<X86::CondCode>(N->getConstantOperandVal(1));
+ else if (Opc == X86::SETCCr)
+ CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0));
+ else if (Opc == X86::SETCCm)
+ CC = static_cast<X86::CondCode>(N->getConstantOperandVal(5));
+ else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr ||
+ Opc == X86::CMOV64rr)
+ CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2));
+ else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm ||
+ Opc == X86::CMOV64rm)
+ CC = static_cast<X86::CondCode>(N->getConstantOperandVal(6));
+
+ return CC;
+}
+
+/// Test whether the given X86ISD::CMP node has any users that use a flag
+/// other than ZF.
+bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
+ // Examine each user of the node.
+ for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
+ UI != UE; ++UI) {
+ // Only check things that use the flags.
+ if (UI.getUse().getResNo() != Flags.getResNo())
+ continue;
+ // Only examine CopyToReg uses that copy to EFLAGS.
+ if (UI->getOpcode() != ISD::CopyToReg ||
+ cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
+ return false;
+ // Examine each user of the CopyToReg use.
+ for (SDNode::use_iterator FlagUI = UI->use_begin(),
+ FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
+ // Only examine the Flag result.
+ if (FlagUI.getUse().getResNo() != 1) continue;
+ // Anything unusual: assume conservatively.
+ if (!FlagUI->isMachineOpcode()) return false;
+ // Examine the condition code of the user.
+ X86::CondCode CC = getCondFromNode(*FlagUI);
+
+ switch (CC) {
+ // Comparisons which only use the zero flag.
+ case X86::COND_E: case X86::COND_NE:
+ continue;
+ // Anything else: assume conservatively.
+ default:
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+/// Test whether the given X86ISD::CMP node has any uses which require the SF
+/// flag to be accurate.
+bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
+ // Examine each user of the node.
+ for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
+ UI != UE; ++UI) {
+ // Only check things that use the flags.
+ if (UI.getUse().getResNo() != Flags.getResNo())
+ continue;
+ // Only examine CopyToReg uses that copy to EFLAGS.
+ if (UI->getOpcode() != ISD::CopyToReg ||
+ cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
+ return false;
+ // Examine each user of the CopyToReg use.
+ for (SDNode::use_iterator FlagUI = UI->use_begin(),
+ FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
+ // Only examine the Flag result.
+ if (FlagUI.getUse().getResNo() != 1) continue;
+ // Anything unusual: assume conservatively.
+ if (!FlagUI->isMachineOpcode()) return false;
+ // Examine the condition code of the user.
+ X86::CondCode CC = getCondFromNode(*FlagUI);
+
+ switch (CC) {
+ // Comparisons which don't examine the SF flag.
+ case X86::COND_A: case X86::COND_AE:
+ case X86::COND_B: case X86::COND_BE:
+ case X86::COND_E: case X86::COND_NE:
+ case X86::COND_O: case X86::COND_NO:
+ case X86::COND_P: case X86::COND_NP:
+ continue;
+ // Anything else: assume conservatively.
+ default:
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+static bool mayUseCarryFlag(X86::CondCode CC) {
+ switch (CC) {
+ // Comparisons which don't examine the CF flag.
+ case X86::COND_O: case X86::COND_NO:
+ case X86::COND_E: case X86::COND_NE:
+ case X86::COND_S: case X86::COND_NS:
+ case X86::COND_P: case X86::COND_NP:
+ case X86::COND_L: case X86::COND_GE:
+ case X86::COND_G: case X86::COND_LE:
+ return false;
+ // Anything else: assume conservatively.
+ default:
+ return true;
+ }
+}
+
+/// Test whether the given node which sets flags has any uses which require the
+/// CF flag to be accurate.
+bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
+ // Examine each user of the node.
+ for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
+ UI != UE; ++UI) {
+ // Only check things that use the flags.
+ if (UI.getUse().getResNo() != Flags.getResNo())
+ continue;
+
+ unsigned UIOpc = UI->getOpcode();
+
+ if (UIOpc == ISD::CopyToReg) {
+ // Only examine CopyToReg uses that copy to EFLAGS.
+ if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
+ return false;
+ // Examine each user of the CopyToReg use.
+ for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
+ FlagUI != FlagUE; ++FlagUI) {
+ // Only examine the Flag result.
+ if (FlagUI.getUse().getResNo() != 1)
+ continue;
+ // Anything unusual: assume conservatively.
+ if (!FlagUI->isMachineOpcode())
+ return false;
+ // Examine the condition code of the user.
+ X86::CondCode CC = getCondFromNode(*FlagUI);
+
+ if (mayUseCarryFlag(CC))
+ return false;
+ }
+
+ // This CopyToReg is ok. Move on to the next user.
+ continue;
+ }
+
+ // This might be an unselected node. So look for the pre-isel opcodes that
+ // use flags.
+ unsigned CCOpNo;
+ switch (UIOpc) {
+ default:
+ // Something unusual. Be conservative.
+ return false;
+ case X86ISD::SETCC: CCOpNo = 0; break;
+ case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
+ case X86ISD::CMOV: CCOpNo = 2; break;
+ case X86ISD::BRCOND: CCOpNo = 2; break;
+ }
+
+ X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo);
+ if (mayUseCarryFlag(CC))
+ return false;
+ }
+ return true;
+}
+
+/// Check whether or not the chain ending in StoreNode is suitable for turning
+/// a {load; op; store} sequence into a single read-modify-write operation.
+static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
+ SDValue StoredVal, SelectionDAG *CurDAG,
+ unsigned LoadOpNo,
+ LoadSDNode *&LoadNode,
+ SDValue &InputChain) {
+ // Is the stored value result 0 of the operation?
+ if (StoredVal.getResNo() != 0) return false;
+
+ // Are there other uses of the operation other than the store?
+ if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
+
+ // Is the store non-extending and non-indexed?
+ if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
+ return false;
+
+ SDValue Load = StoredVal->getOperand(LoadOpNo);
+ // Is the stored value a non-extending and non-indexed load?
+ if (!ISD::isNormalLoad(Load.getNode())) return false;
+
+ // Return LoadNode by reference.
+ LoadNode = cast<LoadSDNode>(Load);
+
+ // Is store the only read of the loaded value?
+ if (!Load.hasOneUse())
+ return false;
+
+ // Is the address of the store the same as the load?
+ if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
+ LoadNode->getOffset() != StoreNode->getOffset())
+ return false;
+
+ bool FoundLoad = false;
+ SmallVector<SDValue, 4> ChainOps;
+ SmallVector<const SDNode *, 4> LoopWorklist;
+ SmallPtrSet<const SDNode *, 16> Visited;
+ const unsigned int Max = 1024;
+
+ // Visualization of Load-Op-Store fusion:
+ // -------------------------
+ // Legend:
+ // *-lines = Chain operand dependencies.
+ // |-lines = Normal operand dependencies.
+ // Dependencies flow down and right. n-suffix references multiple nodes.
+ //
+ // C Xn C
+ // * * *
+ // * * *
+ // Xn A-LD Yn TF Yn
+ // * * \ | * |
+ // * * \ | * |
+ // * * \ | => A--LD_OP_ST
+ // * * \| \
+ // TF OP \
+ // * | \ Zn
+ // * | \
+ // A-ST Zn
+ //
+
+ // This merge induces dependencies from: #1: Xn -> LD, OP, Zn
+ // #2: Yn -> LD
+ // #3: ST -> Zn
+
+ // Ensure the transform is safe by checking for the dual
+ // dependencies to make sure we do not induce a loop.
+
+ // As LD is a predecessor to both OP and ST we can do this by checking:
+ // a). if LD is a predecessor to a member of Xn or Yn.
+ // b). if a Zn is a predecessor to ST.
+
+ // However, (b) can only occur through being a chain predecessor to
+ // ST, which is the same as Zn being a member or predecessor of Xn,
+ // which is a subset of LD being a predecessor of Xn. So it's
+ // subsumed by check (a).
+
+ SDValue Chain = StoreNode->getChain();
+
+ // Gather X elements in ChainOps.
+ if (Chain == Load.getValue(1)) {
+ FoundLoad = true;
+ ChainOps.push_back(Load.getOperand(0));
+ } else if (Chain.getOpcode() == ISD::TokenFactor) {
+ for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
+ SDValue Op = Chain.getOperand(i);
+ if (Op == Load.getValue(1)) {
+ FoundLoad = true;
+ // Drop Load, but keep its chain. No cycle check necessary.
+ ChainOps.push_back(Load.getOperand(0));
+ continue;
+ }
+ LoopWorklist.push_back(Op.getNode());
+ ChainOps.push_back(Op);
+ }
+ }
+
+ if (!FoundLoad)
+ return false;
+
+ // Worklist is currently Xn. Add Yn to worklist.
+ for (SDValue Op : StoredVal->ops())
+ if (Op.getNode() != LoadNode)
+ LoopWorklist.push_back(Op.getNode());
+
+ // Check (a) if Load is a predecessor to Xn + Yn
+ if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
+ true))
+ return false;
+
+ InputChain =
+ CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
+ return true;
+}
+
+// Change a chain of {load; op; store} of the same value into a simple op
+// through memory of that value, if the uses of the modified value and its
+// address are suitable.
+//
+// The tablegen memory operand pattern is currently not able to match
+// the case where the EFLAGS on the original operation are used.
+//
+// To move this to tablegen, we'll need to improve tablegen to allow flags to
+// be transferred from a node in the pattern to the result node, probably with
+// a new keyword. For example, we have this
+// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+// (implicit EFLAGS)]>;
+// but maybe need something like this
+// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+// (transferrable EFLAGS)]>;
+//
+// Until then, we manually fold these and instruction select the operation
+// here.
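+//
+// For example, (store (add (load [mem]), 1), [mem]) whose flags result is
+// unused can be selected as a single INC32m (or ADD32mi8) that reads and
+// writes [mem] directly.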
+bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
+ StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
+ SDValue StoredVal = StoreNode->getOperand(1);
+ unsigned Opc = StoredVal->getOpcode();
+
+ // Before we try to select anything, make sure this is memory operand size
+ // and opcode we can handle. Note that this must match the code below that
+ // actually lowers the opcodes.
+ EVT MemVT = StoreNode->getMemoryVT();
+ if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
+ MemVT != MVT::i8)
+ return false;
+
+ bool IsCommutable = false;
+ bool IsNegate = false;
+ switch (Opc) {
+ default:
+ return false;
+ case X86ISD::SUB:
+ IsNegate = isNullConstant(StoredVal.getOperand(0));
+ break;
+ case X86ISD::SBB:
+ break;
+ case X86ISD::ADD:
+ case X86ISD::ADC:
+ case X86ISD::AND:
+ case X86ISD::OR:
+ case X86ISD::XOR:
+ IsCommutable = true;
+ break;
+ }
+
+ unsigned LoadOpNo = IsNegate ? 1 : 0;
+ LoadSDNode *LoadNode = nullptr;
+ SDValue InputChain;
+ if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
+ LoadNode, InputChain)) {
+ if (!IsCommutable)
+ return false;
+
+ // This operation is commutable, try the other operand.
+ LoadOpNo = 1;
+ if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
+ LoadNode, InputChain))
+ return false;
+ }
+
+ SDValue Base, Scale, Index, Disp, Segment;
+ if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
+ Segment))
+ return false;
+
+ auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
+ unsigned Opc8) {
+ switch (MemVT.getSimpleVT().SimpleTy) {
+ case MVT::i64:
+ return Opc64;
+ case MVT::i32:
+ return Opc32;
+ case MVT::i16:
+ return Opc16;
+ case MVT::i8:
+ return Opc8;
+ default:
+ llvm_unreachable("Invalid size!");
+ }
+ };
+
+ MachineSDNode *Result;
+ switch (Opc) {
+ case X86ISD::SUB:
+ // Handle negate.
+ if (IsNegate) {
+ unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
+ X86::NEG8m);
+ const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
+ Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
+ MVT::Other, Ops);
+ break;
+ }
+ LLVM_FALLTHROUGH;
+ case X86ISD::ADD:
+ // Try to match inc/dec.
+ if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
+ bool IsOne = isOneConstant(StoredVal.getOperand(1));
+ bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
+ // An ADD/SUB by 1/-1 whose carry flag result is unused can use INC/DEC.
+ if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
+ unsigned NewOpc =
+ ((Opc == X86ISD::ADD) == IsOne)
+ ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
+ : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
+ const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
+ Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
+ MVT::Other, Ops);
+ break;
+ }
+ }
+ LLVM_FALLTHROUGH;
+ case X86ISD::ADC:
+ case X86ISD::SBB:
+ case X86ISD::AND:
+ case X86ISD::OR:
+ case X86ISD::XOR: {
+ auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
+ switch (Opc) {
+ case X86ISD::ADD:
+ return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
+ X86::ADD8mr);
+ case X86ISD::ADC:
+ return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
+ X86::ADC8mr);
+ case X86ISD::SUB:
+ return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
+ X86::SUB8mr);
+ case X86ISD::SBB:
+ return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
+ X86::SBB8mr);
+ case X86ISD::AND:
+ return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
+ X86::AND8mr);
+ case X86ISD::OR:
+ return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
+ case X86ISD::XOR:
+ return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
+ X86::XOR8mr);
+ default:
+ llvm_unreachable("Invalid opcode!");
+ }
+ };
+ auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) {
+ switch (Opc) {
+ case X86ISD::ADD:
+ return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0);
+ case X86ISD::ADC:
+ return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0);
+ case X86ISD::SUB:
+ return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0);
+ case X86ISD::SBB:
+ return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0);
+ case X86ISD::AND:
+ return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0);
+ case X86ISD::OR:
+ return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0);
+ case X86ISD::XOR:
+ return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0);
+ default:
+ llvm_unreachable("Invalid opcode!");
+ }
+ };
+ auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
+ switch (Opc) {
+ case X86ISD::ADD:
+ return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
+ X86::ADD8mi);
+ case X86ISD::ADC:
+ return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
+ X86::ADC8mi);
+ case X86ISD::SUB:
+ return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
+ X86::SUB8mi);
+ case X86ISD::SBB:
+ return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
+ X86::SBB8mi);
+ case X86ISD::AND:
+ return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
+ X86::AND8mi);
+ case X86ISD::OR:
+ return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
+ X86::OR8mi);
+ case X86ISD::XOR:
+ return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
+ X86::XOR8mi);
+ default:
+ llvm_unreachable("Invalid opcode!");
+ }
+ };
+
+ unsigned NewOpc = SelectRegOpcode(Opc);
+ SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
+
+ // See if the operand is a constant that we can fold into an immediate
+ // operand.
+ if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
+ int64_t OperandV = OperandC->getSExtValue();
+
+ // Check if we can shrink the operand enough to fit in an immediate (or
+ // fit into a smaller immediate) by negating it and switching the
+ // operation.
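+ // e.g. an ADD of 128 does not fit in an imm8, but the equivalent
+ // SUB of -128 does.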
+ if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
+ ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
+ (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
+ isInt<32>(-OperandV))) &&
+ hasNoCarryFlagUses(StoredVal.getValue(1))) {
+ OperandV = -OperandV;
+ Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
+ }
+
+ // First try to fit this into an Imm8 operand. If it doesn't fit, then try
+ // the larger immediate operand.
+ if (MemVT != MVT::i8 && isInt<8>(OperandV)) {
+ Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
+ NewOpc = SelectImm8Opcode(Opc);
+ } else if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
+ Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
+ NewOpc = SelectImmOpcode(Opc);
+ }
+ }
+
+ if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
+ SDValue CopyTo =
+ CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
+ StoredVal.getOperand(2), SDValue());
+
+ const SDValue Ops[] = {Base, Scale, Index, Disp,
+ Segment, Operand, CopyTo, CopyTo.getValue(1)};
+ Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
+ Ops);
+ } else {
+ const SDValue Ops[] = {Base, Scale, Index, Disp,
+ Segment, Operand, InputChain};
+ Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
+ Ops);
+ }
+ break;
+ }
+ default:
+ llvm_unreachable("Invalid opcode!");
+ }
+
+ MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
+ LoadNode->getMemOperand()};
+ CurDAG->setNodeMemRefs(Result, MemOps);
+
+ // Update Load Chain uses as well.
+ ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
+ ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
+ ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
+ CurDAG->RemoveDeadNode(Node);
+ return true;
+}
+
+// See if this is an X & Mask that we can match to BEXTR/BZHI.
+// Where Mask is one of the following patterns:
+// a) x & (1 << nbits) - 1
+// b) x & ~(-1 << nbits)
+// c) x & (-1 >> (32 - y))
+// d) x << (32 - y) >> (32 - y)
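+// Each of these keeps only the low 'nbits' (or 'y') bits of x, which is what
+// a single BZHI (or a BEXTR with a zero start field) computes.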
+bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
+ assert(
+ (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) &&
+ "Should be either an and-mask, or right-shift after clearing high bits.");
+
+ // BEXTR is a BMI instruction, BZHI is a BMI2 instruction. We need at least one.
+ if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
+ return false;
+
+ MVT NVT = Node->getSimpleValueType(0);
+
+ // Only supported for 32 and 64 bits.
+ if (NVT != MVT::i32 && NVT != MVT::i64)
+ return false;
+
+ SDValue NBits;
+
+ // If we have BMI2's BZHI, we are ok with multi-use patterns.
+ // Else, if we only have BMI1's BEXTR, we require one-use.
+ const bool CanHaveExtraUses = Subtarget->hasBMI2();
+ auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) {
+ return CanHaveExtraUses ||
+ Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
+ };
+ auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); };
+ auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); };
+
+ auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
+ if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
+ assert(V.getSimpleValueType() == MVT::i32 &&
+ V.getOperand(0).getSimpleValueType() == MVT::i64 &&
+ "Expected i64 -> i32 truncation");
+ V = V.getOperand(0);
+ }
+ return V;
+ };
+
+ // a) x & ((1 << nbits) + (-1))
+ auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation,
+ &NBits](SDValue Mask) -> bool {
+ // Match `add`. Must only have one use!
+ if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
+ return false;
+ // We should be adding an all-ones constant (i.e. subtracting one).
+ if (!isAllOnesConstant(Mask->getOperand(1)))
+ return false;
+ // Match `1 << nbits`. Might be truncated. Must only have one use!
+ SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
+ if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
+ return false;
+ if (!isOneConstant(M0->getOperand(0)))
+ return false;
+ NBits = M0->getOperand(1);
+ return true;
+ };
+
+ auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
+ V = peekThroughOneUseTruncation(V);
+ return CurDAG->MaskedValueIsAllOnes(
+ V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
+ NVT.getSizeInBits()));
+ };
+
+ // b) x & ~(-1 << nbits)
+ auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
+ &NBits](SDValue Mask) -> bool {
+ // Match `~()`. Must only have one use!
+ if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
+ return false;
+ // The -1 only has to be all-ones for the final Node's NVT.
+ if (!isAllOnes(Mask->getOperand(1)))
+ return false;
+ // Match `-1 << nbits`. Might be truncated. Must only have one use!
+ SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
+ if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
+ return false;
+ // The -1 only has to be all-ones for the final Node's NVT.
+ if (!isAllOnes(M0->getOperand(0)))
+ return false;
+ NBits = M0->getOperand(1);
+ return true;
+ };
+
+ // Match potentially-truncated (bitwidth - y)
+ auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt,
+ unsigned Bitwidth) {
+ // Skip over a truncate of the shift amount.
+ if (ShiftAmt.getOpcode() == ISD::TRUNCATE) {
+ ShiftAmt = ShiftAmt.getOperand(0);
+ // The trunc should have been the only user of the real shift amount.
+ if (!checkOneUse(ShiftAmt))
+ return false;
+ }
+ // Match the shift amount as: (bitwidth - y). It should go away, too.
+ if (ShiftAmt.getOpcode() != ISD::SUB)
+ return false;
+ auto *V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
+ if (!V0 || V0->getZExtValue() != Bitwidth)
+ return false;
+ NBits = ShiftAmt.getOperand(1);
+ return true;
+ };
+
+ // c) x & (-1 >> (32 - y))
+ auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation,
+ matchShiftAmt](SDValue Mask) -> bool {
+ // The mask itself may be truncated.
+ Mask = peekThroughOneUseTruncation(Mask);
+ unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
+ // Match `l>>`. Must only have one use!
+ if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
+ return false;
+ // We should be shifting a truly all-ones constant.
+ if (!isAllOnesConstant(Mask.getOperand(0)))
+ return false;
+ SDValue M1 = Mask.getOperand(1);
+ // The shift amount should not be used externally.
+ if (!checkOneUse(M1))
+ return false;
+ return matchShiftAmt(M1, Bitwidth);
+ };
+
+ SDValue X;
+
+ // d) x << (32 - y) >> (32 - y)
+ auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt,
+ &X](SDNode *Node) -> bool {
+ if (Node->getOpcode() != ISD::SRL)
+ return false;
+ SDValue N0 = Node->getOperand(0);
+ if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0))
+ return false;
+ unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
+ SDValue N1 = Node->getOperand(1);
+ SDValue N01 = N0->getOperand(1);
+ // Both of the shifts must be by the exact same value.
+ // There should not be any uses of the shift amount outside of the pattern.
+ if (N1 != N01 || !checkTwoUse(N1))
+ return false;
+ if (!matchShiftAmt(N1, Bitwidth))
+ return false;
+ X = N0->getOperand(0);
+ return true;
+ };
+
+ auto matchLowBitMask = [matchPatternA, matchPatternB,
+ matchPatternC](SDValue Mask) -> bool {
+ return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
+ };
+
+ if (Node->getOpcode() == ISD::AND) {
+ X = Node->getOperand(0);
+ SDValue Mask = Node->getOperand(1);
+
+ if (matchLowBitMask(Mask)) {
+ // Great.
+ } else {
+ std::swap(X, Mask);
+ if (!matchLowBitMask(Mask))
+ return false;
+ }
+ } else if (!matchPatternD(Node))
+ return false;
+
+ SDLoc DL(Node);
+
+ // Truncate the shift amount.
+ NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
+
+ // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
+ // All the other bits are undefined, we do not care about them.
+ SDValue ImplDef = SDValue(
+ CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
+
+ SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
+ NBits = SDValue(
+ CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32, ImplDef,
+ NBits, SRIdxVal), 0);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
+
+ if (Subtarget->hasBMI2()) {
+ // Great, just emit the BZHI.
+ if (NVT != MVT::i32) {
+ // But have to place the bit count into the wide-enough register first.
+ NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
+ }
+
+ SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
+ ReplaceNode(Node, Extract.getNode());
+ SelectCode(Extract.getNode());
+ return true;
+ }
+
+ // Else, if we do *NOT* have BMI2, let's find out if 'X' is *logically*
+ // shifted (potentially with a one-use trunc in between), and whether the
+ // truncation was the only use of the shift; if so, look past the one-use
+ // truncation.
+ {
+ SDValue RealX = peekThroughOneUseTruncation(X);
+ // FIXME: only if the shift is one-use?
+ if (RealX != X && RealX.getOpcode() == ISD::SRL)
+ X = RealX;
+ }
+
+ MVT XVT = X.getSimpleValueType();
+
+ // Else, emitting BEXTR requires one more step.
+ // The 'control' of BEXTR has the pattern of:
+ // [15...8 bit][ 7...0 bit] location
+ // [ bit count][ shift] name
+ // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
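+ // (A control of 0x0804, i.e. bit count 8 and shift 4, extracts bits [11:4].)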
+
+ // Shift NBits left by 8 bits, thus producing 'control'.
+ // This makes the low 8 bits to be zero.
+ SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
+ SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
+
+ // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
+ // FIXME: only if the shift is one-use?
+ if (X.getOpcode() == ISD::SRL) {
+ SDValue ShiftAmt = X.getOperand(1);
+ X = X.getOperand(0);
+
+ assert(ShiftAmt.getValueType() == MVT::i8 &&
+ "Expected shift amount to be i8");
+
+ // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
+ // We could zext to i16 in some form, but we intentionally don't do that.
+ SDValue OrigShiftAmt = ShiftAmt;
+ ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
+ insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
+
+ // And now 'or' these low 8 bits of shift amount into the 'control'.
+ Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
+ }
+
+ // But have to place the 'control' into the wide-enough register first.
+ if (XVT != MVT::i32) {
+ Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
+ }
+
+ // And finally, form the BEXTR itself.
+ SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
+
+ // The 'X' was originally truncated. Do that now.
+ if (XVT != NVT) {
+ insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
+ Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
+ }
+
+ ReplaceNode(Node, Extract.getNode());
+ SelectCode(Extract.getNode());
+
+ return true;
+}
+
+// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
+MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
+ MVT NVT = Node->getSimpleValueType(0);
+ SDLoc dl(Node);
+
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ // If we have TBM we can use an immediate for the control. If we have BMI
+ // we should only do this if the BEXTR instruction is implemented well.
+ // Otherwise moving the control into a register makes this more costly.
+ // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
+ // hoisting the move immediate would make it worthwhile with a less optimal
+ // BEXTR?
+ bool PreferBEXTR =
+ Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
+ if (!PreferBEXTR && !Subtarget->hasBMI2())
+ return nullptr;
+
+ // Must have a shift right.
+ if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
+ return nullptr;
+
+ // Shift can't have additional users.
+ if (!N0->hasOneUse())
+ return nullptr;
+
+ // Only supported for 32 and 64 bits.
+ if (NVT != MVT::i32 && NVT != MVT::i64)
+ return nullptr;
+
+ // Shift amount and RHS of and must be constant.
+ ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
+ ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+ if (!MaskCst || !ShiftCst)
+ return nullptr;
+
+ // And RHS must be a mask.
+ uint64_t Mask = MaskCst->getZExtValue();
+ if (!isMask_64(Mask))
+ return nullptr;
+
+ uint64_t Shift = ShiftCst->getZExtValue();
+ uint64_t MaskSize = countPopulation(Mask);
+
+ // Don't interfere with something that can be handled by extracting AH.
+ // TODO: If we are able to fold a load, BEXTR might still be better than AH.
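+ // (A shift of 8 with an 8-bit mask corresponds to reading an AH-style
+ // high-byte subregister.)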
+ if (Shift == 8 && MaskSize == 8)
+ return nullptr;
+
+ // Make sure we are only using bits that were in the original value, not
+ // shifted in.
+ if (Shift + MaskSize > NVT.getSizeInBits())
+ return nullptr;
+
+ // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
+ // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
+ // does not fit into 32 bits. Load folding is not a sufficient reason.
+ if (!PreferBEXTR && MaskSize <= 32)
+ return nullptr;
+
+ SDValue Control;
+ unsigned ROpc, MOpc;
+
+ if (!PreferBEXTR) {
+ assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
+ // If we can't make use of BEXTR then we can't fuse shift+mask stages.
+ // Let's perform the mask first and apply the shift later. Note that we need
+ // to widen the mask to account for the shift we will apply afterwards!
+ Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
+ ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr;
+ MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm;
+ unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
+ Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
+ } else {
+ // The 'control' of BEXTR has the pattern of:
+ // [15...8 bit][ 7...0 bit] location
+ // [ bit count][ shift] name
+ // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
+ Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
+ if (Subtarget->hasTBM()) {
+ ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
+ MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
+ } else {
+ assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
+ // BMI requires the immediate to be placed in a register.
+ ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
+ MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
+ unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
+ Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
+ }
+ }
+
+ MachineSDNode *NewNode;
+ SDValue Input = N0->getOperand(0);
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+ SDValue Ops[] = {
+ Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
+ SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
+ NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ // Update the chain.
+ ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
+ // Record the mem-refs
+ CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
+ } else {
+ NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
+ }
+
+ if (!PreferBEXTR) {
+ // We still need to apply the shift.
+ SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
+ unsigned NewOpc = NVT == MVT::i64 ? X86::SHR64ri : X86::SHR32ri;
+ NewNode =
+ CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
+ }
+
+ return NewNode;
+}
+
+// Emit a PCMPISTR(I/M) instruction.
+MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
+ bool MayFoldLoad, const SDLoc &dl,
+ MVT VT, SDNode *Node) {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+ SDValue Imm = Node->getOperand(2);
+ const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
+ Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
+
+ // Try to fold a load. No need to check alignment.
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+ SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
+ N1.getOperand(0) };
+ SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
+ MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ // Update the chain.
+ ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
+ // Record the mem-refs
+ CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
+ return CNode;
+ }
+
+ SDValue Ops[] = { N0, N1, Imm };
+ SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
+ MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
+ return CNode;
+}
+
+// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
+// to emit a second instruction after this one. This is needed since we have two
+// copyToReg nodes glued before this and we need to continue that glue through.
+MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
+ bool MayFoldLoad, const SDLoc &dl,
+ MVT VT, SDNode *Node,
+ SDValue &InFlag) {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N2 = Node->getOperand(2);
+ SDValue Imm = Node->getOperand(4);
+ const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
+ Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
+
+ // Try to fold a load. No need to check alignment.
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+ SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
+ N2.getOperand(0), InFlag };
+ SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
+ MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ InFlag = SDValue(CNode, 3);
+ // Update the chain.
+ ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
+ // Record the mem-refs
+ CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
+ return CNode;
+ }
+
+ SDValue Ops[] = { N0, N2, Imm, InFlag };
+ SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
+ MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
+ InFlag = SDValue(CNode, 2);
+ return CNode;
+}
+
+bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
+ EVT VT = N->getValueType(0);
+
+ // Only handle scalar shifts.
+ if (VT.isVector())
+ return false;
+
+ // Narrower shifts only mask to 5 bits in hardware.
+ unsigned Size = VT == MVT::i64 ? 64 : 32;
+
+ SDValue OrigShiftAmt = N->getOperand(1);
+ SDValue ShiftAmt = OrigShiftAmt;
+ SDLoc DL(N);
+
+ // Skip over a truncate of the shift amount.
+ if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
+ ShiftAmt = ShiftAmt->getOperand(0);
+
+ // This function is called after X86DAGToDAGISel::matchBitExtract(),
+ // so we are not afraid that we might mess up BZHI/BEXTR pattern.
+
+ SDValue NewShiftAmt;
+ if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
+ SDValue Add0 = ShiftAmt->getOperand(0);
+ SDValue Add1 = ShiftAmt->getOperand(1);
+ // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
+ // to avoid the ADD/SUB.
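+ // e.g. an i32 shift by (y + 32) can simply shift by y, since the hardware
+ // masks the shift amount to 5 bits anyway.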
+ if (isa<ConstantSDNode>(Add1) &&
+ cast<ConstantSDNode>(Add1)->getZExtValue() % Size == 0) {
+ NewShiftAmt = Add0;
+ // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
+ // generate a NEG instead of a SUB of a constant.
+ } else if (ShiftAmt->getOpcode() == ISD::SUB &&
+ isa<ConstantSDNode>(Add0) &&
+ cast<ConstantSDNode>(Add0)->getZExtValue() != 0 &&
+ cast<ConstantSDNode>(Add0)->getZExtValue() % Size == 0) {
+ // Insert a negate op.
+ // TODO: This isn't guaranteed to replace the sub if there is a logic cone
+ // that uses it that's not a shift.
+ EVT SubVT = ShiftAmt.getValueType();
+ SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
+ SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, Add1);
+ NewShiftAmt = Neg;
+
+ // Insert these operands into a valid topological order so they can
+ // get selected independently.
+ insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
+ insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
+ } else
+ return false;
+ } else
+ return false;
+
+ if (NewShiftAmt.getValueType() != MVT::i8) {
+ // Need to truncate the shift amount.
+ NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
+ // Add to a correct topological ordering.
+ insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
+ }
+
+ // Insert a new mask to keep the shift amount legal. This should be removed
+ // by isel patterns.
+ NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
+ CurDAG->getConstant(Size - 1, DL, MVT::i8));
+ // Place in a correct topological ordering.
+ insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
+
+ SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
+ NewShiftAmt);
+ if (UpdatedNode != N) {
+ // If we found an existing node, we should replace ourselves with that node
+ // and wait for it to be selected after its other users.
+ ReplaceNode(N, UpdatedNode);
+ return true;
+ }
+
+ // If the original shift amount is now dead, delete it so that we don't run
+ // it through isel.
+ if (OrigShiftAmt.getNode()->use_empty())
+ CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
+
+ // Now that we've optimized the shift amount, defer to normal isel to get
+ // load folding and legacy vs BMI2 selection without repeating it here.
+ SelectCode(N);
+ return true;
+}
+
+bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
+ MVT NVT = N->getSimpleValueType(0);
+ unsigned Opcode = N->getOpcode();
+ SDLoc dl(N);
+
+ // For operations of the form (x << C1) op C2, check if we can use a smaller
+ // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
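+ // e.g. (x << 8) | 0x0F00 becomes ((x | 0x0F) << 8): 0x0F fits in a
+ // sign-extended imm8 while 0x0F00 needs a full 32-bit immediate.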
+ SDValue Shift = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
+ if (!Cst)
+ return false;
+
+ int64_t Val = Cst->getSExtValue();
+
+ // If we have an any_extend feeding the AND, look through it to see if there
+ // is a shift behind it. But only if the AND doesn't use the extended bits.
+ // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
+ bool FoundAnyExtend = false;
+ if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
+ Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
+ isUInt<32>(Val)) {
+ FoundAnyExtend = true;
+ Shift = Shift.getOperand(0);
+ }
+
+ if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
+ return false;
+
+ // i8 is unshrinkable, i16 should be promoted to i32.
+ if (NVT != MVT::i32 && NVT != MVT::i64)
+ return false;
+
+ ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+ if (!ShlCst)
+ return false;
+
+ uint64_t ShAmt = ShlCst->getZExtValue();
+
+ // Make sure that we don't change the operation by removing bits.
+ // This only matters for OR and XOR, AND is unaffected.
+ uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
+ if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
+ return false;
+
+ // Check the minimum bitwidth for the new constant.
+ // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
+ auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
+ if (Opcode == ISD::AND) {
+ // AND32ri is the same as AND64ri32 with zext imm.
+ // Try this before sign extended immediates below.
+ ShiftedVal = (uint64_t)Val >> ShAmt;
+ if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
+ return true;
+ // Also swap order when the AND can become MOVZX.
+ if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
+ return true;
+ }
+ ShiftedVal = Val >> ShAmt;
+ if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
+ (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
+ return true;
+ if (Opcode != ISD::AND) {
+ // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
+ ShiftedVal = (uint64_t)Val >> ShAmt;
+ if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
+ return true;
+ }
+ return false;
+ };
+
+ int64_t ShiftedVal;
+ if (!CanShrinkImmediate(ShiftedVal))
+ return false;
+
+ // Ok, we can reorder to get a smaller immediate.
+
+ // But it's possible the original immediate allowed an AND to become MOVZX.
+ // Do this check late so that the MaskedValueIsZero call happens as late as
+ // possible.
+ if (Opcode == ISD::AND) {
+ // Find the smallest zext this could possibly be.
+ unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
+ ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U));
+
+ // Figure out which bits need to be zero to achieve that mask.
+ APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
+ ZExtWidth);
+ NeededMask &= ~Cst->getAPIntValue();
+
+ if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
+ return false;
+ }
+
+ SDValue X = Shift.getOperand(0);
+ if (FoundAnyExtend) {
+ SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
+ insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
+ X = NewX;
+ }
+
+ SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
+ insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
+ SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
+ insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
+ SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
+ Shift.getOperand(1));
+ ReplaceNode(N, NewSHL.getNode());
+ SelectCode(NewSHL.getNode());
+ return true;
+}
+
+bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
+ SDNode *ParentBC, SDValue A, SDValue B,
+ SDValue C, uint8_t Imm) {
+ assert(A.isOperandOf(ParentA));
+ assert(B.isOperandOf(ParentBC));
+ assert(C.isOperandOf(ParentBC));
+
+ auto tryFoldLoadOrBCast =
+ [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp, SDValue &Segment) {
+ if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
+ return true;
+
+ // Not a load, check for broadcast which may be behind a bitcast.
+ if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
+ P = L.getNode();
+ L = L.getOperand(0);
+ }
+
+ if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
+ return false;
+
+ // Only 32 and 64 bit broadcasts are supported.
+ auto *MemIntr = cast<MemIntrinsicSDNode>(L);
+ unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
+ if (Size != 32 && Size != 64)
+ return false;
+
+ return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
+ };
+
+ bool FoldedLoad = false;
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (tryFoldLoadOrBCast(Root, ParentBC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+ FoldedLoad = true;
+ } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
+ Tmp4)) {
+ FoldedLoad = true;
+ std::swap(A, C);
+ // Swap bits 1/4 and 3/6.
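+ // (Bit i of the VPTERNLOG immediate is the result for A=i[2], B=i[1],
+ // C=i[0], so exchanging A and C swaps truth-table entries 1<->4 and 3<->6.)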
+ uint8_t OldImm = Imm;
+ Imm = OldImm & 0xa5;
+ if (OldImm & 0x02) Imm |= 0x10;
+ if (OldImm & 0x10) Imm |= 0x02;
+ if (OldImm & 0x08) Imm |= 0x40;
+ if (OldImm & 0x40) Imm |= 0x08;
+ } else if (tryFoldLoadOrBCast(Root, ParentBC, B, Tmp0, Tmp1, Tmp2, Tmp3,
+ Tmp4)) {
+ FoldedLoad = true;
+ std::swap(B, C);
+ // Swap bits 1/2 and 5/6.
+ uint8_t OldImm = Imm;
+ Imm = OldImm & 0x99;
+ if (OldImm & 0x02) Imm |= 0x04;
+ if (OldImm & 0x04) Imm |= 0x02;
+ if (OldImm & 0x20) Imm |= 0x40;
+ if (OldImm & 0x40) Imm |= 0x20;
+ }
+
+ SDLoc DL(Root);
+
+ SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
+
+ MVT NVT = Root->getSimpleValueType(0);
+
+ MachineSDNode *MNode;
+ if (FoldedLoad) {
+ SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
+
+ unsigned Opc;
+ if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(C);
+ unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
+ assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
+
+ bool UseD = EltSize == 32;
+ if (NVT.is128BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
+ else if (NVT.is256BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
+ else if (NVT.is512BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
+ else
+ llvm_unreachable("Unexpected vector size!");
+ } else {
+ bool UseD = NVT.getVectorElementType() == MVT::i32;
+ if (NVT.is128BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
+ else if (NVT.is256BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
+ else if (NVT.is512BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
+ else
+ llvm_unreachable("Unexpected vector size!");
+ }
+
+ SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
+ MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
+
+ // Update the chain.
+ ReplaceUses(C.getValue(1), SDValue(MNode, 1));
+ // Record the mem-refs
+ CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
+ } else {
+ bool UseD = NVT.getVectorElementType() == MVT::i32;
+ unsigned Opc;
+ if (NVT.is128BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
+ else if (NVT.is256BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
+ else if (NVT.is512BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
+ else
+ llvm_unreachable("Unexpected vector size!");
+
+ MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
+ }
+
+ ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
+ CurDAG->RemoveDeadNode(Root);
+ return true;
+}
+
+// Try to match two logic ops to a VPTERNLOG.
+// FIXME: Handle inverted inputs?
+// FIXME: Handle more complex patterns that use an operand more than once?
+bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
+ MVT NVT = N->getSimpleValueType(0);
+
+ // Make sure we support VPTERNLOG.
+ if (!NVT.isVector() || !Subtarget->hasAVX512() ||
+ NVT.getVectorElementType() == MVT::i1)
+ return false;
+
+ // We need VLX for 128/256-bit.
+ if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
+ return false;
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ auto getFoldableLogicOp = [](SDValue Op) {
+ // Peek through single use bitcast.
+ if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
+ Op = Op.getOperand(0);
+
+ if (!Op.hasOneUse())
+ return SDValue();
+
+ unsigned Opc = Op.getOpcode();
+ if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
+ Opc == X86ISD::ANDNP)
+ return Op;
+
+ return SDValue();
+ };
+
+ SDValue A, FoldableOp;
+ if ((FoldableOp = getFoldableLogicOp(N1))) {
+ A = N0;
+ } else if ((FoldableOp = getFoldableLogicOp(N0))) {
+ A = N1;
+ } else
+ return false;
+
+ SDValue B = FoldableOp.getOperand(0);
+ SDValue C = FoldableOp.getOperand(1);
+
+ // We can build the appropriate control immediate by performing the logic
+ // operation we're matching using these constants for A, B, and C.
+ const uint8_t TernlogMagicA = 0xf0;
+ const uint8_t TernlogMagicB = 0xcc;
+ const uint8_t TernlogMagicC = 0xaa;
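+ // e.g. for (and A, (or B, C)) the immediate becomes
+ // (TernlogMagicB | TernlogMagicC) & TernlogMagicA = 0xee & 0xf0 = 0xe0.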
+
+ uint8_t Imm;
+ switch (FoldableOp.getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
+ case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
+ case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
+ case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
+ }
+
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case X86ISD::ANDNP:
+ if (A == N0)
+ Imm &= ~TernlogMagicA;
+ else
+ Imm = ~(Imm) & TernlogMagicA;
+ break;
+ case ISD::AND: Imm &= TernlogMagicA; break;
+ case ISD::OR: Imm |= TernlogMagicA; break;
+ case ISD::XOR: Imm ^= TernlogMagicA; break;
+ }
+
+ return matchVPTERNLOG(N, N, FoldableOp.getNode(), A, B, C, Imm);
+}
+
+/// If the high bits of an 'and' operand are known zero, try setting the
+/// high bits of an 'and' constant operand to produce a smaller encoding by
+/// creating a small, sign-extended negative immediate rather than a large
+/// positive one. This reverses a transform in SimplifyDemandedBits that
+/// shrinks mask constants by clearing bits. There is also a possibility that
+/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
+/// case, just replace the 'and'. Return 'true' if the node is replaced.
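+///
+/// For example, when the sign bit of the other operand is known zero, a
+/// 32-bit 'and' with 0x7ffffff0 can instead use the mask 0xfffffff0 (-16),
+/// which encodes as a sign-extended imm8 rather than a full imm32.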
+bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
+ // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
+ // have immediate operands.
+ MVT VT = And->getSimpleValueType(0);
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return false;
+
+ auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
+ if (!And1C)
+ return false;
+
+ // Bail out if the mask constant is already negative. It can't shrink further.
+ // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
+ // patterns to use a 32-bit and instead of a 64-bit and by relying on the
+ // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
+ // are negative too.
+ APInt MaskVal = And1C->getAPIntValue();
+ unsigned MaskLZ = MaskVal.countLeadingZeros();
+ if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
+ return false;
+
+ // Don't extend into the upper 32 bits of a 64 bit mask.
+ if (VT == MVT::i64 && MaskLZ >= 32) {
+ MaskLZ -= 32;
+ MaskVal = MaskVal.trunc(32);
+ }
+
+ SDValue And0 = And->getOperand(0);
+ APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
+ APInt NegMaskVal = MaskVal | HighZeros;
+
+ // If a negative constant would not allow a smaller encoding, there's no need
+ // to continue. Only change the constant when we know it's a win.
+ unsigned MinWidth = NegMaskVal.getMinSignedBits();
+ if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32))
+ return false;
+
+ // Extend masks if we truncated above.
+ if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
+ NegMaskVal = NegMaskVal.zext(64);
+ HighZeros = HighZeros.zext(64);
+ }
+
+ // The variable operand must be all zeros in the top bits to allow using the
+ // new, negative constant as the mask.
+ if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
+ return false;
+
+ // Check if the mask is -1. In that case, this is an unnecessary instruction
+ // that escaped earlier analysis.
+ if (NegMaskVal.isAllOnesValue()) {
+ ReplaceNode(And, And0.getNode());
+ return true;
+ }
+
+ // A negative mask allows a smaller encoding. Create a new 'and' node.
+ SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
+ insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
+ SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
+ ReplaceNode(And, NewAnd.getNode());
+ SelectCode(NewAnd.getNode());
+ return true;
+}
+
+static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
+ bool FoldedBCast, bool Masked) {
+#define VPTESTM_CASE(VT, SUFFIX) \
+case MVT::VT: \
+ if (Masked) \
+ return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
+ return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
+
+
+#define VPTESTM_BROADCAST_CASES(SUFFIX) \
+default: llvm_unreachable("Unexpected VT!"); \
+VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
+VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
+VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
+VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
+VPTESTM_CASE(v16i32, DZ##SUFFIX) \
+VPTESTM_CASE(v8i64, QZ##SUFFIX)
+
+#define VPTESTM_FULL_CASES(SUFFIX) \
+VPTESTM_BROADCAST_CASES(SUFFIX) \
+VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
+VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
+VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
+VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
+VPTESTM_CASE(v64i8, BZ##SUFFIX) \
+VPTESTM_CASE(v32i16, WZ##SUFFIX)
+
+ if (FoldedBCast) {
+ switch (TestVT.SimpleTy) {
+ VPTESTM_BROADCAST_CASES(rmb)
+ }
+ }
+
+ if (FoldedLoad) {
+ switch (TestVT.SimpleTy) {
+ VPTESTM_FULL_CASES(rm)
+ }
+ }
+
+ switch (TestVT.SimpleTy) {
+ VPTESTM_FULL_CASES(rr)
+ }
+
+#undef VPTESTM_FULL_CASES
+#undef VPTESTM_BROADCAST_CASES
+#undef VPTESTM_CASE
+}
+
+// Try to create a VPTESTM instruction. If InMask is not null, it will be used
+// to form a masked operation.
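+// VPTESTM sets mask bit i when (Src0[i] & Src1[i]) != 0 and VPTESTNM when it
+// is zero, so an EQ/NE compare of an AND against zero maps directly onto them.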
+bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
+ SDValue InMask) {
+ assert(Subtarget->hasAVX512() && "Expected AVX512!");
+ assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
+ "Unexpected VT!");
+
+ // Look for equal and not equal compares.
+ ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
+ if (CC != ISD::SETEQ && CC != ISD::SETNE)
+ return false;
+
+ SDValue SetccOp0 = Setcc.getOperand(0);
+ SDValue SetccOp1 = Setcc.getOperand(1);
+
+ // Canonicalize the all zero vector to the RHS.
+ if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
+ std::swap(SetccOp0, SetccOp1);
+
+ // See if we're comparing against zero.
+ if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
+ return false;
+
+ SDValue N0 = SetccOp0;
+
+ MVT CmpVT = N0.getSimpleValueType();
+ MVT CmpSVT = CmpVT.getVectorElementType();
+
+ // Start with both operands the same. We'll try to refine this.
+ SDValue Src0 = N0;
+ SDValue Src1 = N0;
+
+ {
+ // Look through single use bitcasts.
+ SDValue N0Temp = N0;
+ if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
+ N0Temp = N0.getOperand(0);
+
+ // Look for single use AND.
+ if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
+ Src0 = N0Temp.getOperand(0);
+ Src1 = N0Temp.getOperand(1);
+ }
+ }
+
+ // Without VLX we need to widen the operation.
+ bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
+
+ auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
+ SDValue &Base, SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ // If we need to widen, we can't fold the load.
+ if (!Widen)
+ if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
+ return true;
+
+ // If we didn't fold a load, try to match broadcast. No widening limitation
+ // for this. But only 32 and 64 bit types are supported.
+ if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
+ return false;
+
+ // Look through single use bitcasts.
+ if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
+ P = L.getNode();
+ L = L.getOperand(0);
+ }
+
+ if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
+ return false;
+
+ auto *MemIntr = cast<MemIntrinsicSDNode>(L);
+ if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
+ return false;
+
+ return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
+ };
+
+ // We can only fold loads if the sources are unique.
+ bool CanFoldLoads = Src0 != Src1;
+
+ bool FoldedLoad = false;
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (CanFoldLoads) {
+ FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
+ Tmp3, Tmp4);
+ if (!FoldedLoad) {
+ // And is commutative.
+ FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
+ Tmp2, Tmp3, Tmp4);
+ if (FoldedLoad)
+ std::swap(Src0, Src1);
+ }
+ }
+
+ bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
+
+ bool IsMasked = InMask.getNode() != nullptr;
+
+ SDLoc dl(Root);
+
+ MVT ResVT = Setcc.getSimpleValueType();
+ MVT MaskVT = ResVT;
+ if (Widen) {
+ // Widen the inputs using insert_subreg or copy_to_regclass.
+ unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
+ unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
+ unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
+ CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
+ MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+ SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
+ CmpVT), 0);
+ Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
+
+ if (!FoldedBCast)
+ Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
+
+ if (IsMasked) {
+ // Widen the mask.
+ unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID();
+ SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
+ InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+ dl, MaskVT, InMask, RC), 0);
+ }
+ }
+
+ bool IsTestN = CC == ISD::SETEQ;
+ unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
+ IsMasked);
+
+ MachineSDNode *CNode;
+ if (FoldedLoad) {
+ SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
+
+ if (IsMasked) {
+ SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
+ Src1.getOperand(0) };
+ CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ } else {
+ SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
+ Src1.getOperand(0) };
+ CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ }
+
+ // Update the chain.
+ ReplaceUses(Src1.getValue(1), SDValue(CNode, 1));
+ // Record the mem-refs
+ CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()});
+ } else {
+ if (IsMasked)
+ CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
+ else
+ CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
+ }
+
+ // If we widened, we need to shrink the mask VT.
+ if (Widen) {
+ unsigned RegClass = TLI->getRegClassFor(ResVT)->getID();
+ SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
+ CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+ dl, ResVT, SDValue(CNode, 0), RC);
+ }
+
+ ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
+ CurDAG->RemoveDeadNode(Root);
+ return true;
+}
+
+// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
+// into vpternlog.
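+// The 0xCA immediate is the bit-select truth table: with the canonical operand
+// constants A=0xf0, B=0xcc, C=0xaa, (A & B) | (~A & C) evaluates to 0xca.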
+bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
+ assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
+
+ MVT NVT = N->getSimpleValueType(0);
+
+ // Make sure we support VPTERNLOG.
+ if (!NVT.isVector() || !Subtarget->hasAVX512())
+ return false;
+
+ // We need VLX for 128/256-bit.
+ if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
+ return false;
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Canonicalize AND to LHS.
+ if (N1.getOpcode() == ISD::AND)
+ std::swap(N0, N1);
+
+ if (N0.getOpcode() != ISD::AND ||
+ N1.getOpcode() != X86ISD::ANDNP ||
+ !N0.hasOneUse() || !N1.hasOneUse())
+ return false;
+
+ // ANDN is not commutable, use it to pin down A and C.
+ SDValue A = N1.getOperand(0);
+ SDValue C = N1.getOperand(1);
+
+ // AND is commutable, if one operand matches A, the other operand is B.
+ // Otherwise this isn't a match.
+ SDValue B;
+ if (N0.getOperand(0) == A)
+ B = N0.getOperand(1);
+ else if (N0.getOperand(1) == A)
+ B = N0.getOperand(0);
+ else
+ return false;
+
+ SDLoc dl(N);
+ SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
+ SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
+ ReplaceNode(N, Ternlog.getNode());
+
+ return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
+ A, B, C, 0xCA);
+}
+
+void X86DAGToDAGISel::Select(SDNode *Node) {
+ MVT NVT = Node->getSimpleValueType(0);
+ unsigned Opcode = Node->getOpcode();
+ SDLoc dl(Node);
+
+ if (Node->isMachineOpcode()) {
+ LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
+ Node->setNodeId(-1);
+ return; // Already selected.
+ }
+
+ switch (Opcode) {
+ default: break;
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = Node->getConstantOperandVal(1);
+ switch (IntNo) {
+ default: break;
+ case Intrinsic::x86_encodekey128:
+ case Intrinsic::x86_encodekey256: {
+ if (!Subtarget->hasKL())
+ break;
+
+ unsigned Opcode;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic");
+ case Intrinsic::x86_encodekey128: Opcode = X86::ENCODEKEY128; break;
+ case Intrinsic::x86_encodekey256: Opcode = X86::ENCODEKEY256; break;
+ }
+
+ SDValue Chain = Node->getOperand(0);
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
+ SDValue());
+ if (Opcode == X86::ENCODEKEY256)
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
+ Chain.getValue(1));
+
+ MachineSDNode *Res = CurDAG->getMachineNode(
+ Opcode, dl, Node->getVTList(),
+ {Node->getOperand(2), Chain, Chain.getValue(1)});
+ ReplaceNode(Node, Res);
+ return;
+ }
+ case Intrinsic::x86_tileloadd64_internal: {
+ if (!Subtarget->hasAMXTILE())
+ break;
+ unsigned Opc = X86::PTILELOADDV;
+ // _tile_loadd_internal(row, col, buf, STRIDE)
+ SDValue Base = Node->getOperand(4);
+ SDValue Scale = getI8Imm(1, dl);
+ SDValue Index = Node->getOperand(5);
+ SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = CurDAG->getRegister(0, MVT::i16);
+ SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
+ SDValue Chain = Node->getOperand(0);
+ MachineSDNode *CNode;
+ SDValue Ops[] = {Node->getOperand(2),
+ Node->getOperand(3),
+ Base,
+ Scale,
+ Index,
+ Disp,
+ Segment,
+ CFG,
+ Chain};
+ CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
+ ReplaceNode(Node, CNode);
+ return;
+ }
+ case Intrinsic::x86_tdpbssd_internal: {
+ if (!Subtarget->hasAMXTILE())
+ break;
+ SDValue Chain = Node->getOperand(0);
+ unsigned Opc = X86::PTDPBSSDV;
+ SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
+ SDValue Ops[] = {Node->getOperand(2),
+ Node->getOperand(3),
+ Node->getOperand(4),
+ Node->getOperand(5),
+ Node->getOperand(6),
+ Node->getOperand(7),
+ CFG,
+ Chain};
+ MachineSDNode *CNode =
+ CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
+ ReplaceNode(Node, CNode);
+ return;
+ }
+ case Intrinsic::x86_tilezero_internal: {
+ if (!Subtarget->hasAMXTILE())
+ break;
+ unsigned Opc = X86::PTILEZEROV;
+ SDValue Chain = Node->getOperand(0);
+ SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
+ SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), CFG, Chain};
+ MachineSDNode *CNode =
+ CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
+ ReplaceNode(Node, CNode);
+ return;
+ }
+ }
+ break;
+ }
+ case ISD::INTRINSIC_VOID: {
+ unsigned IntNo = Node->getConstantOperandVal(1);
+ switch (IntNo) {
+ default: break;
+ case Intrinsic::x86_sse3_monitor:
+ case Intrinsic::x86_monitorx:
+ case Intrinsic::x86_clzero: {
+ bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
+
+ unsigned Opc = 0;
+ switch (IntNo) {
+ default: llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::x86_sse3_monitor:
+ if (!Subtarget->hasSSE3())
+ break;
+ Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
+ break;
+ case Intrinsic::x86_monitorx:
+ if (!Subtarget->hasMWAITX())
+ break;
+ Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
+ break;
+ case Intrinsic::x86_clzero:
+ if (!Subtarget->hasCLZERO())
+ break;
+ Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
+ break;
+ }
+
+ if (Opc) {
+ unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
+ SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
+ Node->getOperand(2), SDValue());
+ SDValue InFlag = Chain.getValue(1);
+
+ if (IntNo == Intrinsic::x86_sse3_monitor ||
+ IntNo == Intrinsic::x86_monitorx) {
+ // Copy the other two operands to ECX and EDX.
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
+ InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
+ InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
+ { Chain, InFlag});
+ ReplaceNode(Node, CNode);
+ return;
+ }
+
+ break;
+ }
+ case Intrinsic::x86_tilestored64_internal: {
+ unsigned Opc = X86::PTILESTOREDV;
+ // _tile_stored_internal(row, col, buf, STRIDE, c)
+ SDValue Base = Node->getOperand(4);
+ SDValue Scale = getI8Imm(1, dl);
+ SDValue Index = Node->getOperand(5);
+ SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = CurDAG->getRegister(0, MVT::i16);
+ SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
+ SDValue Chain = Node->getOperand(0);
+ MachineSDNode *CNode;
+ SDValue Ops[] = {Node->getOperand(2),
+ Node->getOperand(3),
+ Base,
+ Scale,
+ Index,
+ Disp,
+ Segment,
+ Node->getOperand(6),
+ CFG,
+ Chain};
+ CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+ ReplaceNode(Node, CNode);
+ return;
+ }
+ case Intrinsic::x86_tileloadd64:
+ case Intrinsic::x86_tileloaddt164:
+ case Intrinsic::x86_tilestored64: {
+ if (!Subtarget->hasAMXTILE())
+ break;
+ unsigned Opc;
+ switch (IntNo) {
+ default: llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
+ case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
+ case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
+ }
+ // FIXME: Match displacement and scale.
+ unsigned TIndex = Node->getConstantOperandVal(2);
+ SDValue TReg = getI8Imm(TIndex, dl);
+ SDValue Base = Node->getOperand(3);
+ SDValue Scale = getI8Imm(1, dl);
+ SDValue Index = Node->getOperand(4);
+ SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = CurDAG->getRegister(0, MVT::i16);
+ SDValue Chain = Node->getOperand(0);
+ MachineSDNode *CNode;
+ if (Opc == X86::PTILESTORED) {
+ SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
+ CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+ } else {
+ SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
+ CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+ }
+ ReplaceNode(Node, CNode);
+ return;
+ }
+ }
+ break;
+ }
+ case ISD::BRIND: {
+ if (Subtarget->isTargetNaCl())
+ // NaCl has its own pass where jmp %r32 is converted to jmp %r64. We
+ // leave the instruction alone.
+ break;
+ if (Subtarget->isTarget64BitILP32()) {
+ // Converts a 32-bit register to a 64-bit, zero-extended version of
+ // it. This is needed because x86-64 can do many things, but jmp %r32
+ // ain't one of them.
+ SDValue Target = Node->getOperand(1);
+ assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
+ SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
+ SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
+ Node->getOperand(0), ZextTarget);
+ ReplaceNode(Node, Brind.getNode());
+ SelectCode(ZextTarget.getNode());
+ SelectCode(Brind.getNode());
+ return;
+ }
+ break;
+ }
+ case X86ISD::GlobalBaseReg:
+ ReplaceNode(Node, getGlobalBaseReg());
+ return;
+
+ case ISD::BITCAST:
+ // Just drop all 128/256/512-bit bitcasts.
+ if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
+ NVT == MVT::f128) {
+ ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ break;
+
+ case ISD::SRL:
+ if (matchBitExtract(Node))
+ return;
+ LLVM_FALLTHROUGH;
+ case ISD::SRA:
+ case ISD::SHL:
+ if (tryShiftAmountMod(Node))
+ return;
+ break;
+
+ case X86ISD::VPTERNLOG: {
+ uint8_t Imm = cast<ConstantSDNode>(Node->getOperand(3))->getZExtValue();
+ if (matchVPTERNLOG(Node, Node, Node, Node->getOperand(0),
+ Node->getOperand(1), Node->getOperand(2), Imm))
+ return;
+ break;
+ }
+
+ case X86ISD::ANDNP:
+ if (tryVPTERNLOG(Node))
+ return;
+ break;
+
+ case ISD::AND:
+ if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
+ // Try to form a masked VPTESTM. Operands can be in either order.
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+ if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
+ tryVPTESTM(Node, N0, N1))
+ return;
+ if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
+ tryVPTESTM(Node, N1, N0))
+ return;
+ }
+
+ if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
+ ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ if (matchBitExtract(Node))
+ return;
+ if (AndImmShrink && shrinkAndImmediate(Node))
+ return;
+
+ LLVM_FALLTHROUGH;
+ case ISD::OR:
+ case ISD::XOR:
+ if (tryShrinkShlLogicImm(Node))
+ return;
+ if (Opcode == ISD::OR && tryMatchBitSelect(Node))
+ return;
+ if (tryVPTERNLOG(Node))
+ return;
+
+ LLVM_FALLTHROUGH;
+ case ISD::ADD:
+ case ISD::SUB: {
+ // Try to avoid folding immediates with multiple uses for optsize.
+ // This code tries to select to register form directly to avoid going
+ // through the isel table, which might fold the immediate. We can't change
+ // the add/sub/and/or/xor with-immediate patterns in the tablegen files to
+ // check the immediate use count without making those patterns unavailable
+ // to the fast-isel table.
+ if (!CurDAG->shouldOptForSize())
+ break;
+
+ // Only handle i8/i16/i32/i64.
+ if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
+ break;
+
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
+ if (!Cst)
+ break;
+
+ int64_t Val = Cst->getSExtValue();
+
+ // Make sure it's an immediate that is considered foldable.
+ // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
+ if (!isInt<8>(Val) && !isInt<32>(Val))
+ break;
+
+ // If this can match to INC/DEC, let it go.
+ if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
+ break;
+
+ // Check if we should avoid folding this immediate.
+ if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
+ break;
+
+ // We should not fold the immediate. So we need a register form instead.
+ unsigned ROpc, MOpc;
+ switch (NVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::i8:
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break;
+ case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break;
+ case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break;
+ case ISD::OR: ROpc = X86::OR8rr; MOpc = X86::OR8rm; break;
+ case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break;
+ }
+ break;
+ case MVT::i16:
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break;
+ case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break;
+ case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break;
+ case ISD::OR: ROpc = X86::OR16rr; MOpc = X86::OR16rm; break;
+ case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break;
+ }
+ break;
+ case MVT::i32:
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break;
+ case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break;
+ case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break;
+ case ISD::OR: ROpc = X86::OR32rr; MOpc = X86::OR32rm; break;
+ case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break;
+ }
+ break;
+ case MVT::i64:
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break;
+ case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break;
+ case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break;
+ case ISD::OR: ROpc = X86::OR64rr; MOpc = X86::OR64rm; break;
+ case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break;
+ }
+ break;
+ }
+
+ // OK, this is an AND/OR/XOR/ADD/SUB with a constant.
+
+ // If this is not a subtract, we can still try to fold a load.
+ if (Opcode != ISD::SUB) {
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+ SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
+ SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
+ MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ // Update the chain.
+ ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
+ // Record the mem-refs
+ CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
+ ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ }
+
+ CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
+ return;
+ }
+
+ case X86ISD::SMUL:
+ // i16/i32/i64 are handled with isel patterns.
+ if (NVT != MVT::i8)
+ break;
+ LLVM_FALLTHROUGH;
+ case X86ISD::UMUL: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ unsigned LoReg, ROpc, MOpc;
+ switch (NVT.SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8:
+ LoReg = X86::AL;
+ ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
+ MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
+ break;
+ case MVT::i16:
+ LoReg = X86::AX;
+ ROpc = X86::MUL16r;
+ MOpc = X86::MUL16m;
+ break;
+ case MVT::i32:
+ LoReg = X86::EAX;
+ ROpc = X86::MUL32r;
+ MOpc = X86::MUL32m;
+ break;
+ case MVT::i64:
+ LoReg = X86::RAX;
+ ROpc = X86::MUL64r;
+ MOpc = X86::MUL64m;
+ break;
+ }
+
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ // Multiply is commutative.
+ if (!FoldedLoad) {
+ FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ if (FoldedLoad)
+ std::swap(N0, N1);
+ }
+
+ SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
+ N0, SDValue()).getValue(1);
+
+ MachineSDNode *CNode;
+ if (FoldedLoad) {
+ // i16/i32/i64 use an instruction that produces a low and high result even
+ // though only the low result is used.
+ SDVTList VTs;
+ if (NVT == MVT::i8)
+ VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
+ else
+ VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
+
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+ InFlag };
+ CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+
+ // Update the chain.
+ ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
+ // Record the mem-refs
+ CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
+ } else {
+ // i16/i32/i64 use an instruction that produces a low and high result even
+ // though only the low result is used.
+ SDVTList VTs;
+ if (NVT == MVT::i8)
+ VTs = CurDAG->getVTList(NVT, MVT::i32);
+ else
+ VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
+
+ CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag});
+ }
+
+ ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
+ ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+
+ case ISD::SMUL_LOHI:
+ case ISD::UMUL_LOHI: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ unsigned Opc, MOpc;
+ unsigned LoReg, HiReg;
+ bool IsSigned = Opcode == ISD::SMUL_LOHI;
+ bool UseMULX = !IsSigned && Subtarget->hasBMI2();
+ bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
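+ // With BMI2 we can use MULX, which takes the other source in EDX/RDX and
+ // writes both halves to explicit destinations without clobbering EFLAGS;
+ // the MULX*Hrr/Hrm forms produce only the high half when the low half is
+ // unused.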
+ switch (NVT.SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i32:
+ Opc = UseMULXHi ? X86::MULX32Hrr :
+ UseMULX ? X86::MULX32rr :
+ IsSigned ? X86::IMUL32r : X86::MUL32r;
+ MOpc = UseMULXHi ? X86::MULX32Hrm :
+ UseMULX ? X86::MULX32rm :
+ IsSigned ? X86::IMUL32m : X86::MUL32m;
+ LoReg = UseMULX ? X86::EDX : X86::EAX;
+ HiReg = X86::EDX;
+ break;
+ case MVT::i64:
+ Opc = UseMULXHi ? X86::MULX64Hrr :
+ UseMULX ? X86::MULX64rr :
+ IsSigned ? X86::IMUL64r : X86::MUL64r;
+ MOpc = UseMULXHi ? X86::MULX64Hrm :
+ UseMULX ? X86::MULX64rm :
+ IsSigned ? X86::IMUL64m : X86::MUL64m;
+ LoReg = UseMULX ? X86::RDX : X86::RAX;
+ HiReg = X86::RDX;
+ break;
+ }
+
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ // Multiply is commutative.
+ if (!foldedLoad) {
+ foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ if (foldedLoad)
+ std::swap(N0, N1);
+ }
+
+ SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
+ N0, SDValue()).getValue(1);
+ SDValue ResHi, ResLo;
+ if (foldedLoad) {
+ SDValue Chain;
+ MachineSDNode *CNode = nullptr;
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+ InFlag };
+ if (UseMULXHi) {
+ SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
+ CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ ResHi = SDValue(CNode, 0);
+ Chain = SDValue(CNode, 1);
+ } else if (UseMULX) {
+ SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
+ CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ ResHi = SDValue(CNode, 0);
+ ResLo = SDValue(CNode, 1);
+ Chain = SDValue(CNode, 2);
+ } else {
+ SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
+ CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ Chain = SDValue(CNode, 0);
+ InFlag = SDValue(CNode, 1);
+ }
+
+ // Update the chain.
+ ReplaceUses(N1.getValue(1), Chain);
+ // Record the mem-refs
+ CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
+ } else {
+ SDValue Ops[] = { N1, InFlag };
+ if (UseMULXHi) {
+ SDVTList VTs = CurDAG->getVTList(NVT);
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ ResHi = SDValue(CNode, 0);
+ } else if (UseMULX) {
+ SDVTList VTs = CurDAG->getVTList(NVT, NVT);
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ ResHi = SDValue(CNode, 0);
+ ResLo = SDValue(CNode, 1);
+ } else {
+ SDVTList VTs = CurDAG->getVTList(MVT::Glue);
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ InFlag = SDValue(CNode, 0);
+ }
+ }
+
+ // Copy the low half of the result, if it is needed.
+ if (!SDValue(Node, 0).use_empty()) {
+ if (!ResLo) {
+ assert(LoReg && "Register for low half is not defined!");
+ ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
+ NVT, InFlag);
+ InFlag = ResLo.getValue(2);
+ }
+ ReplaceUses(SDValue(Node, 0), ResLo);
+ LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
+ dbgs() << '\n');
+ }
+ // Copy the high half of the result, if it is needed.
+ if (!SDValue(Node, 1).use_empty()) {
+ if (!ResHi) {
+ assert(HiReg && "Register for high half is not defined!");
+ ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
+ NVT, InFlag);
+ InFlag = ResHi.getValue(2);
+ }
+ ReplaceUses(SDValue(Node, 1), ResHi);
+ LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
+ dbgs() << '\n');
+ }
+
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+
+ case ISD::SDIVREM:
+ case ISD::UDIVREM: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ unsigned ROpc, MOpc;
+ bool isSigned = Opcode == ISD::SDIVREM;
+ if (!isSigned) {
+ switch (NVT.SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break;
+ case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
+ case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
+ case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
+ }
+ } else {
+ switch (NVT.SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
+ case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
+ case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
+ case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
+ }
+ }
+
+ unsigned LoReg, HiReg, ClrReg;
+ unsigned SExtOpcode;
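+ // DIV/IDIV take the dividend in AX / DX:AX / EDX:EAX / RDX:RAX and return
+ // the quotient in the low register and the remainder in the high register
+ // (AH for the 8-bit forms).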
+ switch (NVT.SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8:
+ LoReg = X86::AL; ClrReg = HiReg = X86::AH;
+ SExtOpcode = 0; // Not used.
+ break;
+ case MVT::i16:
+ LoReg = X86::AX; HiReg = X86::DX;
+ ClrReg = X86::DX;
+ SExtOpcode = X86::CWD;
+ break;
+ case MVT::i32:
+ LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
+ SExtOpcode = X86::CDQ;
+ break;
+ case MVT::i64:
+ LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
+ SExtOpcode = X86::CQO;
+ break;
+ }
+
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ bool signBitIsZero = CurDAG->SignBitIsZero(N0);
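+ // If the sign bit of the dividend is known to be zero, zero-extending the
+ // low half into the high half is sufficient even for signed division.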
+
+ SDValue InFlag;
+ if (NVT == MVT::i8) {
+ // Special case for div8, just use a move with zero extension to AX to
+ // clear the upper 8 bits (AH).
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
+ MachineSDNode *Move;
+ if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
+ unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
+ : X86::MOVZX16rm8;
+ Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
+ Chain = SDValue(Move, 1);
+ ReplaceUses(N0.getValue(1), Chain);
+ // Record the mem-refs
+ CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
+ } else {
+ unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
+ : X86::MOVZX16rr8;
+ Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
+ Chain = CurDAG->getEntryNode();
+ }
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
+ SDValue());
+ InFlag = Chain.getValue(1);
+ } else {
+ InFlag =
+ CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
+ LoReg, N0, SDValue()).getValue(1);
+ if (isSigned && !signBitIsZero) {
+ // Sign extend the low part into the high part.
+ InFlag =
+ SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
+ } else {
+ // Zero out the high part, effectively zero extending the input.
+ SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
+ SDValue ClrNode =
+ SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0);
+ switch (NVT.SimpleTy) {
+ case MVT::i16:
+ ClrNode =
+ SDValue(CurDAG->getMachineNode(
+ TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
+ CurDAG->getTargetConstant(X86::sub_16bit, dl,
+ MVT::i32)),
+ 0);
+ break;
+ case MVT::i32:
+ break;
+ case MVT::i64:
+ ClrNode =
+ SDValue(CurDAG->getMachineNode(
+ TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
+ CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
+ CurDAG->getTargetConstant(X86::sub_32bit, dl,
+ MVT::i32)),
+ 0);
+ break;
+ default:
+ llvm_unreachable("Unexpected division source");
+ }
+
+ InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
+ ClrNode, InFlag).getValue(1);
+ }
+ }
+
+ if (foldedLoad) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+ InFlag };
+ MachineSDNode *CNode =
+ CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
+ InFlag = SDValue(CNode, 1);
+ // Update the chain.
+ ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
+ // Record the mem-refs
+ CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
+ } else {
+ InFlag =
+ SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InFlag), 0);
+ }
+
+ // Prevent use of AH in a REX instruction by explicitly copying it to
+ // an ABCD_L register.
+ //
+ // The current assumption of the register allocator is that isel
+ // won't generate explicit references to the GR8_ABCD_H registers. If
+ // the allocator and/or the backend get enhanced to be more robust in
+ // that regard, this can be, and should be, removed.
+ if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
+ SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
+ unsigned AHExtOpcode =
+ isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
+
+ SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
+ MVT::Glue, AHCopy, InFlag);
+ SDValue Result(RNode, 0);
+ InFlag = SDValue(RNode, 1);
+
+ Result =
+ CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
+
+ ReplaceUses(SDValue(Node, 1), Result);
+ LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
+ dbgs() << '\n');
+ }
+ // Copy the division (low) result, if it is needed.
+ if (!SDValue(Node, 0).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ LoReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ ReplaceUses(SDValue(Node, 0), Result);
+ LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
+ dbgs() << '\n');
+ }
+ // Copy the remainder (high) result, if it is needed.
+ if (!SDValue(Node, 1).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ HiReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ ReplaceUses(SDValue(Node, 1), Result);
+ LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
+ dbgs() << '\n');
+ }
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+
+ case X86ISD::FCMP:
+ case X86ISD::STRICT_FCMP:
+ case X86ISD::STRICT_FCMPS: {
+ bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
+ Node->getOpcode() == X86ISD::STRICT_FCMPS;
+ SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
+ SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);
+
+ // Save the original VT of the compare.
+ MVT CmpVT = N0.getSimpleValueType();
+
+ // Floating point needs special handling if we don't have FCOMI.
+ if (Subtarget->hasCMov())
+ break;
+
+ bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
+
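+ // Signaling compares use COM, which raises the floating-point invalid
+ // exception on quiet NaNs as well; ordinary compares use UCOM, which only
+ // signals on SNaNs.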
+ unsigned Opc;
+ switch (CmpVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected type!");
+ case MVT::f32:
+ Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
+ break;
+ case MVT::f64:
+ Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
+ break;
+ case MVT::f80:
+ Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
+ break;
+ }
+
+ SDValue Cmp;
+ SDValue Chain =
+ IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
+ if (IsStrictCmp) {
+ SDVTList VTs = CurDAG->getVTList(MVT::i16, MVT::Other);
+ Cmp = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
+ Chain = Cmp.getValue(1);
+ } else {
+ Cmp = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i16, N0, N1), 0);
+ }
+
+ // Move FPSW to AX.
+ SDValue FPSW = CurDAG->getCopyToReg(Chain, dl, X86::FPSW, Cmp, SDValue());
+ Chain = FPSW;
+ SDValue FNSTSW =
+ SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, FPSW,
+ FPSW.getValue(1)),
+ 0);
+
+ // Extract upper 8-bits of AX.
+ SDValue Extract =
+ CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);
+
+ // Move AH into flags.
+ // Some 64-bit targets lack SAHF support, but they do support FCOMI.
+ assert(Subtarget->hasLAHFSAHF() &&
+ "Target doesn't support SAHF or FCOMI?");
+ SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
+ Chain = AH;
+ SDValue SAHF = SDValue(
+ CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);
+
+ if (IsStrictCmp)
+ ReplaceUses(SDValue(Node, 1), Chain);
+
+ ReplaceUses(SDValue(Node, 0), SAHF);
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+
+ case X86ISD::CMP: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ // Optimizations for TEST compares.
+ if (!isNullConstant(N1))
+ break;
+
+ // Save the original VT of the compare.
+ MVT CmpVT = N0.getSimpleValueType();
+
+ // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
+ // by a test instruction. The test should be removed later by
+ // analyzeCompare if we are using only the zero flag.
+ // TODO: Should we check the users and use the BEXTR flags directly?
+ if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
+ if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
+ unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
+ : X86::TEST32rr;
+ SDValue BEXTR = SDValue(NewNode, 0);
+ NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
+ ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ }
+
+ // We can peek through truncates, but we need to be careful below.
+ if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
+ N0 = N0.getOperand(0);
+
+ // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
+ // use a smaller encoding.
+ // Look past the truncate if CMP is the only use of it.
+ if (N0.getOpcode() == ISD::AND &&
+ N0.getNode()->hasOneUse() &&
+ N0.getValueType() != MVT::i8) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (!C) break;
+ uint64_t Mask = C->getZExtValue();
+
+ // Check if we can replace AND+IMM64 with a shift. This is possible for
+ // masks like 0xFF000000 or 0x00FFFFFF and if we care only about the zero
+ // flag.
+ if (CmpVT == MVT::i64 && !isInt<32>(Mask) &&
+ onlyUsesZeroFlag(SDValue(Node, 0))) {
+ if (isMask_64(~Mask)) {
+ unsigned TrailingZeros = countTrailingZeros(Mask);
+ SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
+ SDValue Shift =
+ SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32,
+ N0.getOperand(0), Imm), 0);
+ MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
+ MVT::i32, Shift, Shift);
+ ReplaceNode(Node, Test);
+ return;
+ }
+ if (isMask_64(Mask)) {
+ unsigned LeadingZeros = countLeadingZeros(Mask);
+ SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
+ SDValue Shift =
+ SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32,
+ N0.getOperand(0), Imm), 0);
+ MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
+ MVT::i32, Shift, Shift);
+ ReplaceNode(Node, Test);
+ return;
+ }
+ }
+
+ MVT VT;
+ int SubRegOp;
+ unsigned ROpc, MOpc;
+
+ // For each of these checks we need to be careful if the sign flag is
+ // being used. It is only safe to use the sign flag under two conditions:
+ // either the sign bit of the shrunken mask is zero, or the final test
+ // size is equal to the original compare size.
+
+ if (isUInt<8>(Mask) &&
+ (!(Mask & 0x80) || CmpVT == MVT::i8 ||
+ hasNoSignFlagUses(SDValue(Node, 0)))) {
+ // For example, convert "testl %eax, $8" to "testb %al, $8"
+ VT = MVT::i8;
+ SubRegOp = X86::sub_8bit;
+ ROpc = X86::TEST8ri;
+ MOpc = X86::TEST8mi;
+ } else if (OptForMinSize && isUInt<16>(Mask) &&
+ (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
+ hasNoSignFlagUses(SDValue(Node, 0)))) {
+ // For example, "testl %eax, $32776" to "testw %ax, $32776".
+ // NOTE: We only want to form TESTW instructions if optimizing for
+ // min size. Otherwise we only save one byte and possibly get a length
+ // changing prefix penalty in the decoders.
+ VT = MVT::i16;
+ SubRegOp = X86::sub_16bit;
+ ROpc = X86::TEST16ri;
+ MOpc = X86::TEST16mi;
+ } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
+ ((!(Mask & 0x80000000) &&
+ // Without minsize, 16-bit compares can get here, so we need to
+ // be sure we calculate the correct sign flag if needed.
+ (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
+ CmpVT == MVT::i32 ||
+ hasNoSignFlagUses(SDValue(Node, 0)))) {
+ // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
+ // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
+ // Otherwize, we find ourselves in a position where we have to do
+ // promotion. If previous passes did not promote the and, we assume
+ // they had a good reason not to and do not promote here.
+ VT = MVT::i32;
+ SubRegOp = X86::sub_32bit;
+ ROpc = X86::TEST32ri;
+ MOpc = X86::TEST32mi;
+ } else {
+ // No eligible transformation was found.
+ break;
+ }
+
+ SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
+ SDValue Reg = N0.getOperand(0);
+
+ // Emit a testl or testw.
+ MachineSDNode *NewNode;
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+ if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
+ if (!LoadN->isSimple()) {
+ unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
+ if (MOpc == X86::TEST8mi && NumVolBits != 8)
+ break;
+ else if (MOpc == X86::TEST16mi && NumVolBits != 16)
+ break;
+ else if (MOpc == X86::TEST32mi && NumVolBits != 32)
+ break;
+ }
+ }
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
+ Reg.getOperand(0) };
+ NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
+ // Update the chain.
+ ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
+ // Record the mem-refs
+ CurDAG->setNodeMemRefs(NewNode,
+ {cast<LoadSDNode>(Reg)->getMemOperand()});
+ } else {
+ // Extract the subregister if necessary.
+ if (N0.getValueType() != VT)
+ Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
+
+ NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
+ }
+ // Replace CMP with TEST.
+ ReplaceNode(Node, NewNode);
+ return;
+ }
+ break;
+ }
+ case X86ISD::PCMPISTR: {
+ if (!Subtarget->hasSSE42())
+ break;
+
+ bool NeedIndex = !SDValue(Node, 0).use_empty();
+ bool NeedMask = !SDValue(Node, 1).use_empty();
+ // We can't fold a load if we are going to make two instructions.
+ bool MayFoldLoad = !NeedIndex || !NeedMask;
+
+ MachineSDNode *CNode;
+ if (NeedMask) {
+ unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
+ unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
+ CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
+ ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
+ }
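+ // Emit the index form if the index is needed, or if neither result is
+ // needed, so that the flags output is still defined.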
+ if (NeedIndex || !NeedMask) {
+ unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
+ unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
+ CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
+ ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
+ }
+
+ // Connect the flag usage to the last instruction created.
+ ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ case X86ISD::PCMPESTR: {
+ if (!Subtarget->hasSSE42())
+ break;
+
+ // Copy the two implicit register inputs.
+ SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
+ Node->getOperand(1),
+ SDValue()).getValue(1);
+ InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
+ Node->getOperand(3), InFlag).getValue(1);
+
+ bool NeedIndex = !SDValue(Node, 0).use_empty();
+ bool NeedMask = !SDValue(Node, 1).use_empty();
+ // We can't fold a load if we are going to make two instructions.
+ bool MayFoldLoad = !NeedIndex || !NeedMask;
+
+ MachineSDNode *CNode;
+ if (NeedMask) {
+ unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
+ unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
+ CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
+ InFlag);
+ ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
+ }
+ if (NeedIndex || !NeedMask) {
+ unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
+ unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
+ CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
+ ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
+ }
+ // Connect the flag usage to the last instruction created.
+ ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+
+ case ISD::SETCC: {
+ if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
+ return;
+
+ break;
+ }
+
+ case ISD::STORE:
+ if (foldLoadStoreIntoMemOperand(Node))
+ return;
+ break;
+
+ case X86ISD::SETCC_CARRY: {
+ // We have to do this manually because tblgen will put the eflags copy in
+ // the wrong place if we use an extract_subreg in the pattern.
+ MVT VT = Node->getSimpleValueType(0);
+
+ // Copy flags to the EFLAGS register and glue it to next node.
+ SDValue EFLAGS =
+ CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
+ Node->getOperand(1), SDValue());
+
+ // Create a 64-bit instruction if the result is 64 bits, otherwise use the
+ // 32-bit version.
+ unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
+ MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
+ SDValue Result = SDValue(
+ CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), 0);
+
+ // For less than 32-bits we need to extract from the 32-bit node.
+ if (VT == MVT::i8 || VT == MVT::i16) {
+ int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
+ Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
+ }
+
+ ReplaceUses(SDValue(Node, 0), Result);
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ case X86ISD::SBB: {
+ if (isNullConstant(Node->getOperand(0)) &&
+ isNullConstant(Node->getOperand(1))) {
+ MVT VT = Node->getSimpleValueType(0);
+
+ // Create zero.
+ SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
+ SDValue Zero =
+ SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0);
+ if (VT == MVT::i64) {
+ Zero = SDValue(
+ CurDAG->getMachineNode(
+ TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
+ CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
+ CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
+ 0);
+ }
+
+ // Copy flags to the EFLAGS register and glue it to next node.
+ SDValue EFLAGS =
+ CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
+ Node->getOperand(2), SDValue());
+
+ // Create a 64-bit instruction if the result is 64 bits, otherwise use the
+ // 32-bit version.
+ unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
+ MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
+ VTs = CurDAG->getVTList(SBBVT, MVT::i32);
+ SDValue Result =
+ SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {Zero, Zero, EFLAGS,
+ EFLAGS.getValue(1)}),
+ 0);
+
+ // Replace the flag use.
+ ReplaceUses(SDValue(Node, 1), Result.getValue(1));
+
+ // Replace the result use.
+ if (!SDValue(Node, 0).use_empty()) {
+ // For less than 32-bits we need to extract from the 32-bit node.
+ if (VT == MVT::i8 || VT == MVT::i16) {
+ int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
+ Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
+ }
+ ReplaceUses(SDValue(Node, 0), Result);
+ }
+
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ break;
+ }
+ case X86ISD::MGATHER: {
+ auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
+ SDValue IndexOp = Mgt->getIndex();
+ SDValue Mask = Mgt->getMask();
+ MVT IndexVT = IndexOp.getSimpleValueType();
+ MVT ValueVT = Node->getSimpleValueType(0);
+ MVT MaskVT = Mask.getSimpleValueType();
+
+ // This is just to prevent crashes if the nodes are malformed somehow. We're
+ // otherwise only doing loose type checking in here, based on what a type
+ // constraint would say, just like table-based isel.
+ if (!ValueVT.isVector() || !MaskVT.isVector())
+ break;
+
+ unsigned NumElts = ValueVT.getVectorNumElements();
+ MVT ValueSVT = ValueVT.getVectorElementType();
+
+ bool IsFP = ValueSVT.isFloatingPoint();
+ unsigned EltSize = ValueSVT.getSizeInBits();
+
+ unsigned Opc = 0;
+ bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
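+ // AVX-512 gathers use a k-register (vXi1) mask, while AVX2 gathers take the
+ // mask in a vector register at a different operand position; compare the
+ // two operand lists built below.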
+ if (AVX512Gather) {
+ if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
+ else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
+ else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
+ else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
+ else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
+ else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
+ else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
+ else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
+ else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
+ else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
+ else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
+ else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
+ } else {
+ assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
+ "Unexpected mask VT!");
+ if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
+ else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
+ else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
+ else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
+ else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
+ else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
+ else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
+ else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
+ }
+
+ if (!Opc)
+ break;
+
+ SDValue Base, Scale, Index, Disp, Segment;
+ if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
+ Base, Scale, Index, Disp, Segment))
+ break;
+
+ SDValue PassThru = Mgt->getPassThru();
+ SDValue Chain = Mgt->getChain();
+ // Gather instructions have a mask output not in the ISD node.
+ SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);
+
+ MachineSDNode *NewNode;
+ if (AVX512Gather) {
+ SDValue Ops[] = {PassThru, Mask, Base, Scale,
+ Index, Disp, Segment, Chain};
+ NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
+ } else {
+ SDValue Ops[] = {PassThru, Base, Scale, Index,
+ Disp, Segment, Mask, Chain};
+ NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
+ }
+ CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
+ ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
+ ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ case X86ISD::MSCATTER: {
+ auto *Sc = cast<X86MaskedScatterSDNode>(Node);
+ SDValue Value = Sc->getValue();
+ SDValue IndexOp = Sc->getIndex();
+ MVT IndexVT = IndexOp.getSimpleValueType();
+ MVT ValueVT = Value.getSimpleValueType();
+
+ // This is just to prevent crashes if the nodes are malformed somehow. We're
+ // otherwise only doing loose type checking in here, based on what a type
+ // constraint would say, just like table-based isel.
+ if (!ValueVT.isVector())
+ break;
+
+ unsigned NumElts = ValueVT.getVectorNumElements();
+ MVT ValueSVT = ValueVT.getVectorElementType();
+
+ bool IsFP = ValueSVT.isFloatingPoint();
+ unsigned EltSize = ValueSVT.getSizeInBits();
+
+ unsigned Opc;
+ if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
+ else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
+ Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
+ else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
+ Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
+ else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
+ Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
+ else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
+ Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
+ else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
+ Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
+ else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
+ else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
+ else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
+ Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
+ else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
+ Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
+ else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
+ Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
+ else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
+ Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
+ else
+ break;
+
+ SDValue Base, Scale, Index, Disp, Segment;
+ if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
+ Base, Scale, Index, Disp, Segment))
+ break;
+
+ SDValue Mask = Sc->getMask();
+ SDValue Chain = Sc->getChain();
+ // Scatter instructions have a mask output not in the ISD node.
+ SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
+ SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};
+
+ MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
+ CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
+ ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ case ISD::PREALLOCATED_SETUP: {
+ auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+ auto CallId = MFI->getPreallocatedIdForCallSite(
+ cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
+ SDValue Chain = Node->getOperand(0);
+ SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
+ MachineSDNode *New = CurDAG->getMachineNode(
+ TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
+ ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ case ISD::PREALLOCATED_ARG: {
+ auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+ auto CallId = MFI->getPreallocatedIdForCallSite(
+ cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
+ SDValue Chain = Node->getOperand(0);
+ SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
+ SDValue ArgIndex = Node->getOperand(2);
+ SDValue Ops[3];
+ Ops[0] = CallIdValue;
+ Ops[1] = ArgIndex;
+ Ops[2] = Chain;
+ MachineSDNode *New = CurDAG->getMachineNode(
+ TargetOpcode::PREALLOCATED_ARG, dl,
+ CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
+ MVT::Other),
+ Ops);
+ ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
+ ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ case X86ISD::AESENCWIDE128KL:
+ case X86ISD::AESDECWIDE128KL:
+ case X86ISD::AESENCWIDE256KL:
+ case X86ISD::AESDECWIDE256KL: {
+ if (!Subtarget->hasWIDEKL())
+ break;
+
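+ // The wide Key Locker instructions implicitly operate on XMM0..XMM7, so the
+ // eight data operands are copied into those registers before the node is
+ // created.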
+ unsigned Opcode;
+ switch (Node->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+ case X86ISD::AESENCWIDE128KL:
+ Opcode = X86::AESENCWIDE128KL;
+ break;
+ case X86ISD::AESDECWIDE128KL:
+ Opcode = X86::AESDECWIDE128KL;
+ break;
+ case X86ISD::AESENCWIDE256KL:
+ Opcode = X86::AESENCWIDE256KL;
+ break;
+ case X86ISD::AESDECWIDE256KL:
+ Opcode = X86::AESDECWIDE256KL;
+ break;
+ }
+
+ SDValue Chain = Node->getOperand(0);
+ SDValue Addr = Node->getOperand(1);
+
+ SDValue Base, Scale, Index, Disp, Segment;
+ if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
+ break;
+
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
+ SDValue());
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
+ Chain.getValue(1));
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
+ Chain.getValue(1));
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
+ Chain.getValue(1));
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
+ Chain.getValue(1));
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
+ Chain.getValue(1));
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
+ Chain.getValue(1));
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
+ Chain.getValue(1));
+
+ MachineSDNode *Res = CurDAG->getMachineNode(
+ Opcode, dl, Node->getVTList(),
+ {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
+ CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
+ ReplaceNode(Node, Res);
+ return;
+ }
+ }
+
+ SelectCode(Node);
+}
+
+bool X86DAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+ std::vector<SDValue> &OutOps) {
+ SDValue Op0, Op1, Op2, Op3, Op4;
+ switch (ConstraintID) {
+ default:
+ llvm_unreachable("Unexpected asm memory constraint");
+ case InlineAsm::Constraint_o: // offsetable ??
+ case InlineAsm::Constraint_v: // not offsetable ??
+ case InlineAsm::Constraint_m: // memory
+ case InlineAsm::Constraint_X:
+ if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
+ return true;
+ break;
+ }
+
+ OutOps.push_back(Op0);
+ OutOps.push_back(Op1);
+ OutOps.push_back(Op2);
+ OutOps.push_back(Op3);
+ OutOps.push_back(Op4);
+ return false;
+}
+
+ /// This pass converts a legalized DAG into an X86-specific DAG,
+/// ready for instruction scheduling.
+FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new X86DAGToDAGISel(TM, OptLevel);
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp
new file mode 100644
index 000000000000..0dd20235aa3c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -0,0 +1,51718 @@
+//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ISelLowering.h"
+#include "MCTargetDesc/X86ShuffleDecode.h"
+#include "X86.h"
+#include "X86CallingConv.h"
+#include "X86FrameLowering.h"
+#include "X86InstrBuilder.h"
+#include "X86IntrinsicsInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "X86TargetObjectFile.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetOptions.h"
+#include <algorithm>
+#include <bitset>
+#include <cctype>
+#include <numeric>
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-isel"
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+
+static cl::opt<int> ExperimentalPrefLoopAlignment(
+ "x86-experimental-pref-loop-alignment", cl::init(4),
+ cl::desc(
+ "Sets the preferable loop alignment for experiments (as log2 bytes)"
+ "(the last x86-experimental-pref-loop-alignment bits"
+ " of the loop header PC will be 0)."),
+ cl::Hidden);
+
+static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
+ "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
+ cl::desc(
+ "Sets the preferable loop alignment for experiments (as log2 bytes) "
+ "for innermost loops only. If specified, this option overrides "
+ "alignment set by x86-experimental-pref-loop-alignment."),
+ cl::Hidden);
+
+static cl::opt<bool> MulConstantOptimization(
+ "mul-constant-optimization", cl::init(true),
+ cl::desc("Replace 'mul x, Const' with more effective instructions like "
+ "SHIFT, LEA, etc."),
+ cl::Hidden);
+
+static cl::opt<bool> ExperimentalUnorderedISEL(
+ "x86-experimental-unordered-atomic-isel", cl::init(false),
+ cl::desc("Use LoadSDNode and StoreSDNode instead of "
+ "AtomicSDNode for unordered atomic loads and "
+ "stores respectively."),
+ cl::Hidden);
+
+/// Call this when the user attempts to do something unsupported, like
+/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
+/// report_fatal_error, so calling code should attempt to recover without
+/// crashing.
+static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
+ const char *Msg) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ DAG.getContext()->diagnose(
+ DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
+}
+
+X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
+ const X86Subtarget &STI)
+ : TargetLowering(TM), Subtarget(STI) {
+ bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
+ X86ScalarSSEf64 = Subtarget.hasSSE2();
+ X86ScalarSSEf32 = Subtarget.hasSSE1();
+ MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
+
+ // Set up the TargetLowering object.
+
+ // X86 is weird. It always uses i8 for shift amounts and setcc results.
+ setBooleanContents(ZeroOrOneBooleanContent);
+ // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+ // For 64-bit, since we have so many registers, use the ILP scheduler.
+ // For 32-bit, use the register pressure specific scheduling.
+ // For Atom, always use ILP scheduling.
+ if (Subtarget.isAtom())
+ setSchedulingPreference(Sched::ILP);
+ else if (Subtarget.is64Bit())
+ setSchedulingPreference(Sched::ILP);
+ else
+ setSchedulingPreference(Sched::RegPressure);
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
+
+ // Bypass expensive divides and use cheaper ones.
+ if (TM.getOptLevel() >= CodeGenOpt::Default) {
+ if (Subtarget.hasSlowDivide32())
+ addBypassSlowDiv(32, 8);
+ if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
+ addBypassSlowDiv(64, 32);
+ }
+
+ // Setup Windows compiler runtime calls.
+ if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
+ static const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ const CallingConv::ID CC;
+ } LibraryCalls[] = {
+ { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
+ { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
+ { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
+ { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
+ { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
+ };
+
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ setLibcallCallingConv(LC.Op, LC.CC);
+ }
+ }
+
+ if (Subtarget.getTargetTriple().isOSMSVCRT()) {
+ // MSVCRT doesn't have powi; fall back to pow
+ setLibcallName(RTLIB::POWI_F32, nullptr);
+ setLibcallName(RTLIB::POWI_F64, nullptr);
+ }
+
+ // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
+ // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
+ // FIXME: Should we be limiting the atomic size on other configs? Default is
+ // 1024.
+ if (!Subtarget.hasCmpxchg8b())
+ setMaxAtomicSizeInBitsSupported(32);
+
+ // Set up the register classes.
+ addRegisterClass(MVT::i8, &X86::GR8RegClass);
+ addRegisterClass(MVT::i16, &X86::GR16RegClass);
+ addRegisterClass(MVT::i32, &X86::GR32RegClass);
+ if (Subtarget.is64Bit())
+ addRegisterClass(MVT::i64, &X86::GR64RegClass);
+
+ for (MVT VT : MVT::integer_valuetypes())
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+
+ // We don't accept any truncstore of integer registers.
+ setTruncStoreAction(MVT::i64, MVT::i32, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
+ setTruncStoreAction(MVT::i32, MVT::i16, Expand);
+ setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
+ setTruncStoreAction(MVT::i16, MVT::i8, Expand);
+
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+ // SETOEQ and SETUNE require checking two conditions.
+ for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
+ setCondCodeAction(ISD::SETOEQ, VT, Expand);
+ setCondCodeAction(ISD::SETUNE, VT, Expand);
+ }
+
+ // Integer absolute.
+ if (Subtarget.hasCMov()) {
+ setOperationAction(ISD::ABS , MVT::i16 , Custom);
+ setOperationAction(ISD::ABS , MVT::i32 , Custom);
+ if (Subtarget.is64Bit())
+ setOperationAction(ISD::ABS , MVT::i64 , Custom);
+ }
+
+ // Funnel shifts.
+ for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
+ // For slow shld targets we only lower for code size.
+ LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
+
+ setOperationAction(ShiftOp , MVT::i8 , Custom);
+ setOperationAction(ShiftOp , MVT::i16 , Custom);
+ setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
+ if (Subtarget.is64Bit())
+ setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
+ }
+
+ if (!Subtarget.useSoftFloat()) {
+ // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
+ // operation.
+ setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
+ // We have an algorithm for SSE2, and we turn this into a 64-bit
+ // FILD or VCVTUSI2SS/SD for other targets.
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
+ // We have an algorithm for SSE2->double, and we turn this into a
+ // 64-bit FILD followed by conditional FADD for other targets.
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
+
+ // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
+ // this operation.
+ setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
+ // SSE has no i16 to fp conversion, only i32. We promote in the handler
+ // to allow f80 to use i16 and f64 to use i16 with SSE1 only.
+ setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
+ // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
+ // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
+ // are Legal, f80 is custom lowered.
+ setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
+
+ // Promote i8 FP_TO_SINT to a larger FP_TO_SINT, as X86 doesn't have
+ // this operation.
+ setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
+ // FIXME: This doesn't generate invalid exception when it should. PR44019.
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
+ // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
+ // are Legal, f80 is custom lowered.
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
+
+ // Handle FP_TO_UINT by promoting the destination to a larger signed
+ // conversion.
+ setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
+ // FIXME: This doesn't generate invalid exception when it should. PR44019.
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
+ // FIXME: This doesn't generate invalid exception when it should. PR44019.
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
+
+ setOperationAction(ISD::LRINT, MVT::f32, Custom);
+ setOperationAction(ISD::LRINT, MVT::f64, Custom);
+ setOperationAction(ISD::LLRINT, MVT::f32, Custom);
+ setOperationAction(ISD::LLRINT, MVT::f64, Custom);
+
+ if (!Subtarget.is64Bit()) {
+ setOperationAction(ISD::LRINT, MVT::i64, Custom);
+ setOperationAction(ISD::LLRINT, MVT::i64, Custom);
+ }
+ }
+
+ if (Subtarget.hasSSE2()) {
+ // Custom lowering for saturating float to int conversions.
+ // We handle promotion to larger result types manually.
+ for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
+ setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
+ setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
+ }
+ if (Subtarget.is64Bit()) {
+ setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
+ setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
+ }
+ }
+
+ // Handle address space casts between mixed sized pointers.
+ setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
+ setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
+
+ // TODO: when we have SSE, these could be more efficient, by using movd/movq.
+ if (!X86ScalarSSEf64) {
+ setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
+ setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
+ if (Subtarget.is64Bit()) {
+ setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
+ // Without SSE, i64->f64 goes through memory.
+ setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
+ }
+ } else if (!Subtarget.is64Bit())
+ setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
+
+ // Scalar integer divide and remainder are lowered to use operations that
+ // produce two results, to match the available instructions. This exposes
+ // the two-result form to trivial CSE, which is able to combine x/y and x%y
+ // into a single instruction.
+ //
+ // Scalar integer multiply-high is also lowered to use two-result
+ // operations, to match the available instructions. However, plain multiply
+ // (low) operations are left as Legal, as there are single-result
+ // instructions for this in x86. Using the two-result multiply instructions
+ // when both high and low results are needed must be arranged by dagcombine.
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ setOperationAction(ISD::MULHS, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+ }
+
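+ // Control flow on condition codes: BR_JT and the BR_CC/SELECT_CC forms are
+ // expanded, while BRCOND is custom lowered.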
+ setOperationAction(ISD::BR_JT , MVT::Other, Expand);
+ setOperationAction(ISD::BRCOND , MVT::Other, Custom);
+ for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
+ MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ setOperationAction(ISD::BR_CC, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ }
+ if (Subtarget.is64Bit())
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
+
+ setOperationAction(ISD::FREM , MVT::f32 , Expand);
+ setOperationAction(ISD::FREM , MVT::f64 , Expand);
+ setOperationAction(ISD::FREM , MVT::f80 , Expand);
+ setOperationAction(ISD::FREM , MVT::f128 , Expand);
+ setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
+
+ // Promote the i8 variants and force them up to i32, which has a shorter
+ // encoding.
+ setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
+ setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
+ if (!Subtarget.hasBMI()) {
+ setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
+ setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
+ if (Subtarget.is64Bit()) {
+ setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
+ }
+ }
+
+ if (Subtarget.hasLZCNT()) {
+ // When promoting the i8 variants, force them to i32 for a shorter
+ // encoding.
+ setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
+ setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
+ } else {
+ for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
+ continue;
+ setOperationAction(ISD::CTLZ , VT, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
+ }
+ }
+
+ for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
+ ISD::STRICT_FP_TO_FP16}) {
+ // Special handling for half-precision floating point conversions.
+ // If we don't have F16C support, then lower half float conversions
+ // into library calls.
+ setOperationAction(
+ Op, MVT::f32,
+ (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
+ // There's never any support for operations beyond MVT::f32.
+ setOperationAction(Op, MVT::f64, Expand);
+ setOperationAction(Op, MVT::f80, Expand);
+ setOperationAction(Op, MVT::f128, Expand);
+ }
+
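+ // No extending load from f16 or truncating store to f16 is legal; expand
+ // them for every wider floating-point type.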
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f80, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f16, Expand);
+
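+ // Parity of i8 is always custom lowered. With POPCNT the i8 popcount is
+ // promoted to i32; without it, CTPOP is expanded (custom for i64 on 32-bit
+ // targets) and the wider parity types are custom lowered as well.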
+ setOperationAction(ISD::PARITY, MVT::i8, Custom);
+ if (Subtarget.hasPOPCNT()) {
+ setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
+ } else {
+ setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
+ setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
+ setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
+ if (Subtarget.is64Bit())
+ setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
+ else
+ setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
+
+ setOperationAction(ISD::PARITY, MVT::i16, Custom);
+ setOperationAction(ISD::PARITY, MVT::i32, Custom);
+ if (Subtarget.is64Bit())
+ setOperationAction(ISD::PARITY, MVT::i64, Custom);
+ }
+
+ setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
+
+ if (!Subtarget.hasMOVBE())
+ setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
+
+ // X86 wants to expand cmov itself.
+ for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
+ }
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
+ continue;
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ }
+
+ // Custom action for SELECT MMX and expand action for SELECT_CC MMX
+ setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
+
+ setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
+ // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
+ // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
+ setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+ setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+ setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
+ if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
+ setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
+
+ // Darwin ABI issue.
+ for (auto VT : { MVT::i32, MVT::i64 }) {
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
+ continue;
+ setOperationAction(ISD::ConstantPool , VT, Custom);
+ setOperationAction(ISD::JumpTable , VT, Custom);
+ setOperationAction(ISD::GlobalAddress , VT, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
+ setOperationAction(ISD::ExternalSymbol , VT, Custom);
+ setOperationAction(ISD::BlockAddress , VT, Custom);
+ }
+
+ // 64-bit shl, sra, srl (iff 32-bit x86)
+ for (auto VT : { MVT::i32, MVT::i64 }) {
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
+ continue;
+ setOperationAction(ISD::SHL_PARTS, VT, Custom);
+ setOperationAction(ISD::SRA_PARTS, VT, Custom);
+ setOperationAction(ISD::SRL_PARTS, VT, Custom);
+ }
+
+ if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
+ setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
+
+ setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
+
+ // Expand certain atomics
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
+ }
+
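+ // 64-bit atomic loads need custom lowering on 32-bit targets, and
+ // cmpxchg16b enables a custom i128 compare-and-swap.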
+ if (!Subtarget.is64Bit())
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
+
+ if (Subtarget.hasCmpxchg16b()) {
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
+ }
+
+ // FIXME - use subtarget debug flags
+ if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
+ !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
+ TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
+ setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+ }
+
+ setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
+ setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
+
+ setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+ setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
+
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+ setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
+ setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
+
+ // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+ setOperationAction(ISD::VASTART , MVT::Other, Custom);
+ setOperationAction(ISD::VAEND , MVT::Other, Expand);
+ bool Is64Bit = Subtarget.is64Bit();
+ setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
+ setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
+
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
+
+ // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
+ setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
+ setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
+
+ if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
+ // f32 and f64 use SSE.
+ // Set up the FP register classes.
+ addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
+ : &X86::FR32RegClass);
+ addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
+ : &X86::FR64RegClass);
+
+ // Disable f32->f64 extload as we can only generate this in one instruction
+ // under optsize. So it's easier to pattern match (fpext (load)) for that
+ // case instead of needing to emit 2 instructions for extload in the
+ // non-optsize case.
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+
+ for (auto VT : { MVT::f32, MVT::f64 }) {
+ // Use ANDPD to simulate FABS.
+ setOperationAction(ISD::FABS, VT, Custom);
+
+ // Use XORP to simulate FNEG.
+ setOperationAction(ISD::FNEG, VT, Custom);
+
+ // Use ANDPD and ORPD to simulate FCOPYSIGN.
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+
+ // These might be better off as horizontal vector ops.
+ setOperationAction(ISD::FADD, VT, Custom);
+ setOperationAction(ISD::FSUB, VT, Custom);
+
+ // We don't support sin/cos/fmod
+ setOperationAction(ISD::FSIN , VT, Expand);
+ setOperationAction(ISD::FCOS , VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
+ }
+
+ // Lower this to MOVMSK plus an AND.
+ setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
+ setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
+
+ } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
+ (UseX87 || Is64Bit)) {
+ // Use SSE for f32, x87 for f64.
+ // Set up the FP register classes.
+ addRegisterClass(MVT::f32, &X86::FR32RegClass);
+ if (UseX87)
+ addRegisterClass(MVT::f64, &X86::RFP64RegClass);
+
+ // Use ANDPS to simulate FABS.
+ setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+ // Use XORP to simulate FNEG.
+ setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+ if (UseX87)
+ setOperationAction(ISD::UNDEF, MVT::f64, Expand);
+
+ // Use ANDPS and ORPS to simulate FCOPYSIGN.
+ if (UseX87)
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+ // We don't support sin/cos/fmod
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+
+ if (UseX87) {
+ // Always expand sin/cos functions even though x87 has an instruction.
+ setOperationAction(ISD::FSIN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ }
+ } else if (UseX87) {
+ // f32 and f64 in x87.
+ // Set up the FP register classes.
+ addRegisterClass(MVT::f64, &X86::RFP64RegClass);
+ addRegisterClass(MVT::f32, &X86::RFP32RegClass);
+
+ for (auto VT : { MVT::f32, MVT::f64 }) {
+ setOperationAction(ISD::UNDEF, VT, Expand);
+ setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+
+ // Always expand sin/cos functions even though x87 has an instruction.
+ setOperationAction(ISD::FSIN , VT, Expand);
+ setOperationAction(ISD::FCOS , VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
+ }
+ }
+
+ // Expand FP32 immediates into loads from the stack, save special cases.
+ if (isTypeLegal(MVT::f32)) {
+ if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
+ addLegalFPImmediate(APFloat(+0.0f)); // FLD0
+ addLegalFPImmediate(APFloat(+1.0f)); // FLD1
+ addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
+ addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
+ } else // SSE immediates.
+ addLegalFPImmediate(APFloat(+0.0f)); // xorps
+ }
+ // Expand FP64 immediates into loads from the stack, save special cases.
+ if (isTypeLegal(MVT::f64)) {
+ if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
+ addLegalFPImmediate(APFloat(+0.0)); // FLD0
+ addLegalFPImmediate(APFloat(+1.0)); // FLD1
+ addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
+ addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+ } else // SSE immediates.
+ addLegalFPImmediate(APFloat(+0.0)); // xorpd
+ }
+ // Handle constrained floating-point operations on scalars.
+ setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
+
+ // We don't support FMA.
+ setOperationAction(ISD::FMA, MVT::f64, Expand);
+ setOperationAction(ISD::FMA, MVT::f32, Expand);
+
+ // f80 always uses X87.
+ if (UseX87) {
+ addRegisterClass(MVT::f80, &X86::RFP80RegClass);
+ setOperationAction(ISD::UNDEF, MVT::f80, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
+ {
+ APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
+ addLegalFPImmediate(TmpFlt); // FLD0
+ TmpFlt.changeSign();
+ addLegalFPImmediate(TmpFlt); // FLD0/FCHS
+
+ bool ignored;
+ APFloat TmpFlt2(+1.0);
+ TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
+ &ignored);
+ addLegalFPImmediate(TmpFlt2); // FLD1
+ TmpFlt2.changeSign();
+ addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
+ }
+
+ // Always expand sin/cos functions even though x87 has an instruction.
+ setOperationAction(ISD::FSIN , MVT::f80, Expand);
+ setOperationAction(ISD::FCOS , MVT::f80, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
+
+ setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
+ setOperationAction(ISD::FCEIL, MVT::f80, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
+ setOperationAction(ISD::FRINT, MVT::f80, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
+ setOperationAction(ISD::FMA, MVT::f80, Expand);
+ setOperationAction(ISD::LROUND, MVT::f80, Expand);
+ setOperationAction(ISD::LLROUND, MVT::f80, Expand);
+ setOperationAction(ISD::LRINT, MVT::f80, Custom);
+ setOperationAction(ISD::LLRINT, MVT::f80, Custom);
+
+ // Handle constrained floating-point operations on scalars.
+ setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
+ // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
+ // as Custom.
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
+ }
+
+ // f128 uses xmm registers, but most operations require libcalls.
+ if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
+ addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
+
+ addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
+
+ setOperationAction(ISD::FADD, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
+ setOperationAction(ISD::FSUB, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
+ setOperationAction(ISD::FDIV, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
+ setOperationAction(ISD::FMUL, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
+ setOperationAction(ISD::FMA, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
+
+ setOperationAction(ISD::FABS, MVT::f128, Custom);
+ setOperationAction(ISD::FNEG, MVT::f128, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
+
+ setOperationAction(ISD::FSIN, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
+ setOperationAction(ISD::FCOS, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
+ setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
+ // No STRICT_FSINCOS
+ setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
+
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
+ // We need to custom handle any FP_ROUND with an f128 input, but
+ // LegalizeDAG uses the result type to know when to run a custom handler.
+ // So we have to list all legal floating point result types here.
+ if (isTypeLegal(MVT::f32)) {
+ setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
+ }
+ if (isTypeLegal(MVT::f64)) {
+ setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
+ }
+ if (isTypeLegal(MVT::f80)) {
+ setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
+ }
+
+ setOperationAction(ISD::SETCC, MVT::f128, Custom);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f80, Expand);
+ }
+
+ // Always use a library call for pow.
+ setOperationAction(ISD::FPOW , MVT::f32 , Expand);
+ setOperationAction(ISD::FPOW , MVT::f64 , Expand);
+ setOperationAction(ISD::FPOW , MVT::f80 , Expand);
+ setOperationAction(ISD::FPOW , MVT::f128 , Expand);
+
+ setOperationAction(ISD::FLOG, MVT::f80, Expand);
+ setOperationAction(ISD::FLOG2, MVT::f80, Expand);
+ setOperationAction(ISD::FLOG10, MVT::f80, Expand);
+ setOperationAction(ISD::FEXP, MVT::f80, Expand);
+ setOperationAction(ISD::FEXP2, MVT::f80, Expand);
+ setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
+ setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
+
+ // Some FP actions are always expanded for vector types.
+ for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
+ MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
+ setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
+ setOperationAction(ISD::FLOG, VT, Expand);
+ setOperationAction(ISD::FLOG2, VT, Expand);
+ setOperationAction(ISD::FLOG10, VT, Expand);
+ setOperationAction(ISD::FEXP, VT, Expand);
+ setOperationAction(ISD::FEXP2, VT, Expand);
+ }
+
+ // First set operation action for all vector types to either promote
+ // (for widening) or expand (for scalarization). Then we will selectively
+ // turn on ones that can be effectively codegen'd.
+ for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
+ setOperationAction(ISD::FMA, VT, Expand);
+ setOperationAction(ISD::FFLOOR, VT, Expand);
+ setOperationAction(ISD::FCEIL, VT, Expand);
+ setOperationAction(ISD::FTRUNC, VT, Expand);
+ setOperationAction(ISD::FRINT, VT, Expand);
+ setOperationAction(ISD::FNEARBYINT, VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::MULHS, VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ setOperationAction(ISD::SDIVREM, VT, Expand);
+ setOperationAction(ISD::UDIVREM, VT, Expand);
+ setOperationAction(ISD::CTPOP, VT, Expand);
+ setOperationAction(ISD::CTTZ, VT, Expand);
+ setOperationAction(ISD::CTLZ, VT, Expand);
+ setOperationAction(ISD::ROTL, VT, Expand);
+ setOperationAction(ISD::ROTR, VT, Expand);
+ setOperationAction(ISD::BSWAP, VT, Expand);
+ setOperationAction(ISD::SETCC, VT, Expand);
+ setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+ setOperationAction(ISD::FP_TO_SINT, VT, Expand);
+ setOperationAction(ISD::UINT_TO_FP, VT, Expand);
+ setOperationAction(ISD::SINT_TO_FP, VT, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
+ setOperationAction(ISD::TRUNCATE, VT, Expand);
+ setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
+ setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
+ setOperationAction(ISD::ANY_EXTEND, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
+ setTruncStoreAction(InnerVT, VT, Expand);
+
+ setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
+
+ // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
+ // types, we have to deal with them whether we ask for Expansion or not.
+ // Setting Expand causes its own optimisation problems though, so leave
+ // them legal.
+ if (VT.getVectorElementType() == MVT::i1)
+ setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
+
+ // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
+ // split/scalarized right now.
+ if (VT.getVectorElementType() == MVT::f16)
+ setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
+ }
+ }
+
+ // FIXME: In order to prevent SSE instructions being expanded to MMX ones
+ // with -msoft-float, disable use of MMX as well.
+ if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
+ addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
+ // No operations on x86mmx supported, everything uses intrinsics.
+ }
+
+ if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
+ addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
+
+ setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
+ setOperationAction(ISD::FABS, MVT::v4f32, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
+
+ setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
+ setOperationAction(ISD::STORE, MVT::v2f32, Custom);
+
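+ // The constrained (strict) forms of the basic v4f32 arithmetic are Legal
+ // with SSE1.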
+ setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
+ }
+
+ if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
+ addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
+
+ // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
+ // registers cannot be used even for integer operations.
+ addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
+ addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
+ addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
+ addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
+
+ for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
+ MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
+ setOperationAction(ISD::SDIV, VT, Custom);
+ setOperationAction(ISD::SREM, VT, Custom);
+ setOperationAction(ISD::UDIV, VT, Custom);
+ setOperationAction(ISD::UREM, VT, Custom);
+ }
+
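+ // Vector multiplies: only the v8i16 multiply (and its high halves) is Legal
+ // here; the other element widths are custom lowered.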
+ setOperationAction(ISD::MUL, MVT::v2i8, Custom);
+ setOperationAction(ISD::MUL, MVT::v4i8, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i8, Custom);
+
+ setOperationAction(ISD::MUL, MVT::v16i8, Custom);
+ setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+ setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+ setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
+ setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
+ setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
+ setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
+ setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
+ setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
+ setOperationAction(ISD::MUL, MVT::v8i16, Legal);
+ setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
+ setOperationAction(ISD::FABS, MVT::v2f64, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
+
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+ setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
+ setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
+ setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
+ setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
+ }
+
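+ // Byte and word saturating add/sub are Legal with SSE2; unsigned saturating
+ // subtract of the wider element types is custom lowered.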
+ setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
+ setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
+ setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
+ setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
+ setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
+ setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
+ setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
+ setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
+ setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ setOperationAction(ISD::ABS, VT, Custom);
+
+ // The condition codes aren't legal in SSE/AVX and under AVX512 we use
+ // setcc all the way to isel and prefer SETGT in some isel patterns.
+ setCondCodeAction(ISD::SETLT, VT, Custom);
+ setCondCodeAction(ISD::SETLE, VT, Custom);
+ }
+
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ }
+
+ for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
+
+ if (VT == MVT::v2i64 && !Subtarget.is64Bit())
+ continue;
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ }
+
+ // Custom lower v2i64 and v2f64 selects.
+ setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
+ setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
+
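+ // fp_to_sint producing v4i32 is Legal with SSE2; the v2i32 result case is
+ // custom lowered.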
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
+
+ // Custom legalize these to avoid over promotion or custom promotion.
+ for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
+ setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+ setOperationAction(ISD::FP_TO_UINT, VT, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
+ }
+
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
+
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
+
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
+
+ // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
+
+ setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
+
+ // We want to legalize this to an f64 load rather than an i64 load on
+ // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
+ // store.
+ setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
+ setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
+ setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v4i16, Custom);
+ setOperationAction(ISD::STORE, MVT::v8i8, Custom);
+
+ setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
+ setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
+ if (!Subtarget.hasAVX512())
+ setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
+
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
+
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
+
+ setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
+
+ // In the customized shift lowering, the legal v4i32/v2i64 cases
+ // in AVX2 will be recognized.
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ }
+
+ setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
+ setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
+
+ // With 512-bit registers or AVX512VL+BW, expanding (and promoting the
+ // shifts) is better.
+ if (!Subtarget.useAVX512Regs() &&
+ !(Subtarget.hasBWI() && Subtarget.hasVLX()))
+ setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
+
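+ // As with v4f32, the strict forms of the basic v2f64 arithmetic map to
+ // Legal SSE2 operations.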
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
+ }
+
+ if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
+ setOperationAction(ISD::ABS, MVT::v16i8, Legal);
+ setOperationAction(ISD::ABS, MVT::v8i16, Legal);
+ setOperationAction(ISD::ABS, MVT::v4i32, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
+
+ // These might be better off as horizontal vector ops.
+ setOperationAction(ISD::ADD, MVT::i16, Custom);
+ setOperationAction(ISD::ADD, MVT::i32, Custom);
+ setOperationAction(ISD::SUB, MVT::i16, Custom);
+ setOperationAction(ISD::SUB, MVT::i32, Custom);
+ }
+
+ if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
+ for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
+ setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
+ setOperationAction(ISD::FCEIL, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
+ setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
+ setOperationAction(ISD::FRINT, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
+ setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
+ setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
+
+ setOperationAction(ISD::FROUND, RoundedTy, Custom);
+ }
+
+ setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
+ setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
+ setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
+ setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
+ setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
+ setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
+ setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
+ setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
+
+ setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
+
+ // FIXME: Do we need to handle scalar-to-vector here?
+ setOperationAction(ISD::MUL, MVT::v4i32, Legal);
+
+ // We directly match byte blends in the backend as they match the VSELECT
+ // condition form.
+ setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
+
+ // SSE41 brings specific instructions for doing vector sign extend even in
+ // cases where we don't have SRA.
+ for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
+ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
+ }
+
+ // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
+ for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
+ setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
+ }
+
+ // i8 vectors are custom because the source register and source memory
+ // operand types are not the same width.
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
+
+ if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
+ // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
+ // do the pre and post work in the vector domain.
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
+ // We need to mark SINT_TO_FP as Custom even though we want to expand it
+ // so that DAG combine doesn't try to turn it into uint_to_fp.
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
+ }
+ }
+
+ if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
+ setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
+ }
+
+ if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
+ MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
+ setOperationAction(ISD::ROTL, VT, Custom);
+
+ // XOP can efficiently perform BITREVERSE with VPPERM.
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
+ setOperationAction(ISD::BITREVERSE, VT, Custom);
+
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
+ MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
+ setOperationAction(ISD::BITREVERSE, VT, Custom);
+ }
+
+ if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
+ bool HasInt256 = Subtarget.hasInt256();
+
+ addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
+ addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
+ addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
+ addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
+ addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
+ addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
+
+ for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FRINT, VT, Legal);
+ setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::FROUNDEVEN, VT, Legal);
+ setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
+
+ setOperationAction(ISD::FROUND, VT, Custom);
+
+ setOperationAction(ISD::FNEG, VT, Custom);
+ setOperationAction(ISD::FABS, VT, Custom);
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+ }
+
+ // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
+ // even though v8i16 is a legal type.
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
+
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);
+
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
+
+ if (!Subtarget.hasAVX512())
+ setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
+
+ // In the customized shift lowering, the legal v8i32/v4i64 cases
+ // in AVX2 will be recognized.
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ }
+
+ // These types need custom splitting if their input is a 128-bit vector.
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
+
+ setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
+ setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
+
+ // With BWI, expanding (and promoting the shifts) is better.
+ if (!Subtarget.useBWIRegs())
+ setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
+ setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
+ setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
+
+ for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
+ setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
+ setOperationAction(ISD::ANY_EXTEND, VT, Custom);
+ }
+
+ setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
+ setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
+
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ setOperationAction(ISD::CTLZ, VT, Custom);
+
+ // The condition codes aren't legal in SSE/AVX and under AVX512 we use
+ // setcc all the way to isel and prefer SETGT in some isel patterns.
+ setCondCodeAction(ISD::SETLT, VT, Custom);
+ setCondCodeAction(ISD::SETLE, VT, Custom);
+ }
+
+ if (Subtarget.hasAnyFMA()) {
+ for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
+ MVT::v2f64, MVT::v4f64 }) {
+ setOperationAction(ISD::FMA, VT, Legal);
+ setOperationAction(ISD::STRICT_FMA, VT, Legal);
+ }
+ }
+
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
+ setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
+ }
+
+ setOperationAction(ISD::MUL, MVT::v4i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::MUL, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
+ setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
+ setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
+ setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::ABS, MVT::v4i64, Custom);
+ setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
+ setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
+ setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
+ setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
+
+ setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
+ setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
+
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
+ setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
+ }
+
+ for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
+ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
+ }
+
+ if (HasInt256) {
+ // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
+ // when we have a 256-bit-wide blend with immediate.
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
+
+ // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
+ for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
+ setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
+ }
+ }
+
+ for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
+ MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
+ setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ }
+
+ // Extract subvector is special because the value type
+ // (result) is 128-bit but the source is 256-bit wide.
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
+ MVT::v4f32, MVT::v2f64 }) {
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
+ }
+
+ // Custom lower several nodes for 256-bit types.
+ for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
+ MVT::v8f32, MVT::v4f64 }) {
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::STORE, VT, Custom);
+ }
+
+ if (HasInt256) {
+ setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
+
+ // Custom legalize 2x32 to get a little better code.
+ setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
+ setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
+
+ for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
+ MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
+ setOperationAction(ISD::MGATHER, VT, Custom);
+ }
+ }
+
+ // This block controls legalization of the mask vector sizes that are
+ // available with AVX512. 512-bit vectors are in a separate block controlled
+ // by useAVX512Regs.
+ if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
+ addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
+ addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
+ addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
+ addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
+ addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
+
+ setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
+
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
+
+ // There is no byte sized k-register load or store without AVX512DQ.
+ if (!Subtarget.hasDQI()) {
+ setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
+ setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
+ setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
+ setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
+
+ setOperationAction(ISD::STORE, MVT::v1i1, Custom);
+ setOperationAction(ISD::STORE, MVT::v2i1, Custom);
+ setOperationAction(ISD::STORE, MVT::v4i1, Custom);
+ setOperationAction(ISD::STORE, MVT::v8i1, Custom);
+ }
+
+ // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+ setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
+ setOperationAction(ISD::ANY_EXTEND, VT, Custom);
+ }
+
+ for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
+ setOperationAction(ISD::ADD, VT, Custom);
+ setOperationAction(ISD::SUB, VT, Custom);
+ setOperationAction(ISD::MUL, VT, Custom);
+ setOperationAction(ISD::UADDSAT, VT, Custom);
+ setOperationAction(ISD::SADDSAT, VT, Custom);
+ setOperationAction(ISD::USUBSAT, VT, Custom);
+ setOperationAction(ISD::SSUBSAT, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+ }
+
+ for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ }
+
+ for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+ }
+
+ // This block controls legalization for 512-bit operations with 32/64 bit
+ // elements. 512-bits can be disabled based on prefer-vector-width and
+ // required-vector-width function attributes.
+ if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
+ bool HasBWI = Subtarget.hasBWI();
+
+ addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
+ addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
+ addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
+ addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
+ addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
+ addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
+
+ for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
+ setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
+ setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
+ setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
+ setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
+ setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
+ if (HasBWI)
+ setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
+ }
+
+ for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
+ setOperationAction(ISD::FNEG, VT, Custom);
+ setOperationAction(ISD::FABS, VT, Custom);
+ setOperationAction(ISD::FMA, VT, Legal);
+ setOperationAction(ISD::STRICT_FMA, VT, Legal);
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+ }
+
+ for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
+ setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
+ }
+ setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
+
+ setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
+
+ setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
+ setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
+ setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
+ if (HasBWI)
+ setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
+
+ // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
+ // to 512-bit rather than use the AVX2 instructions so that we can use
+ // k-masks.
+ if (!Subtarget.hasVLX()) {
+ for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
+ MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
+ setOperationAction(ISD::MLOAD, VT, Custom);
+ setOperationAction(ISD::MSTORE, VT, Custom);
+ }
+ }
+
+ setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
+ setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
+
+ if (HasBWI) {
+ // Extends from v64i1 masks to 512-bit vectors.
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
+ }
+
+ for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FRINT, VT, Legal);
+ setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::FROUNDEVEN, VT, Legal);
+ setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
+
+ setOperationAction(ISD::FROUND, VT, Custom);
+ }
+
+ for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
+ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
+ }
+
+ setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
+
+ setOperationAction(ISD::MUL, MVT::v8i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v16i32, Legal);
+ setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::MUL, MVT::v64i8, Custom);
+
+ setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
+ setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
+ setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
+ setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
+
+ setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
+
+ for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+
+      // These condition codes aren't legal in SSE/AVX; under AVX512 we use
+      // setcc all the way to isel and prefer SETGT in some isel patterns.
+ setCondCodeAction(ISD::SETLT, VT, Custom);
+ setCondCodeAction(ISD::SETLE, VT, Custom);
+ }
+ for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
+ setOperationAction(ISD::SMAX, VT, Legal);
+ setOperationAction(ISD::UMAX, VT, Legal);
+ setOperationAction(ISD::SMIN, VT, Legal);
+ setOperationAction(ISD::UMIN, VT, Legal);
+ setOperationAction(ISD::ABS, VT, Legal);
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ setOperationAction(ISD::ROTL, VT, Custom);
+ setOperationAction(ISD::ROTR, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
+ }
+
+ for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
+ setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
+ setOperationAction(ISD::CTLZ, VT, Custom);
+ setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
+ }
+
+ if (Subtarget.hasDQI()) {
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
+
+ setOperationAction(ISD::MUL, MVT::v8i64, Legal);
+ }
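+    // With DQI these map directly onto the 512-bit instructions, e.g. a v8i64
+    // sitofp roughly corresponds to VCVTQQ2PD and a v8i64 multiply to VPMULLQ.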
+
+ if (Subtarget.hasCDI()) {
+      // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit
+      // version.
+ for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
+ setOperationAction(ISD::CTLZ, VT, Legal);
+ }
+ } // Subtarget.hasCDI()
+
+ if (Subtarget.hasVPOPCNTDQ()) {
+ for (auto VT : { MVT::v16i32, MVT::v8i64 })
+ setOperationAction(ISD::CTPOP, VT, Legal);
+ }
+
+ // Extract subvector is special because the value type
+ // (result) is 256-bit but the source is 512-bit wide.
+ // 128-bit was made Legal under AVX1.
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
+ MVT::v8f32, MVT::v4f64 })
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
+
+ for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
+ MVT::v16f32, MVT::v8f64 }) {
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ }
+
+ for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ setOperationAction(ISD::MGATHER, VT, Custom);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
+ }
+ if (HasBWI) {
+ for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ }
+ } else {
+ setOperationAction(ISD::STORE, MVT::v32i16, Custom);
+ setOperationAction(ISD::STORE, MVT::v64i8, Custom);
+ }
+
+ if (Subtarget.hasVBMI2()) {
+ for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
+ MVT::v16i16, MVT::v8i32, MVT::v4i64,
+ MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
+ setOperationAction(ISD::FSHL, VT, Custom);
+ setOperationAction(ISD::FSHR, VT, Custom);
+ }
+
+ setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
+ setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
+ setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
+ setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
+ }
+  } // useAVX512Regs
+
+ // This block controls legalization for operations that don't have
+ // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
+ // narrower widths.
+ if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
+ // These operations are handled on non-VLX by artificially widening in
+ // isel patterns.
+
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+
+ if (Subtarget.hasDQI()) {
+ // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
+ // v2f32 UINT_TO_FP is already custom under SSE2.
+ assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
+ isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
+ "Unexpected operation action!");
+ // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
+ }
+
+ for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
+ setOperationAction(ISD::SMAX, VT, Legal);
+ setOperationAction(ISD::UMAX, VT, Legal);
+ setOperationAction(ISD::SMIN, VT, Legal);
+ setOperationAction(ISD::UMIN, VT, Legal);
+ setOperationAction(ISD::ABS, VT, Legal);
+ }
+
+ for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
+ setOperationAction(ISD::ROTL, VT, Custom);
+ setOperationAction(ISD::ROTR, VT, Custom);
+ }
+
+ // Custom legalize 2x32 to get a little better code.
+ setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
+ setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
+
+ for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
+ MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
+ setOperationAction(ISD::MSCATTER, VT, Custom);
+
+ if (Subtarget.hasDQI()) {
+ for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
+ setOperationAction(ISD::SINT_TO_FP, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::UINT_TO_FP, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::FP_TO_SINT, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::FP_TO_UINT, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::MUL, VT, Legal);
+ }
+ }
+
+ if (Subtarget.hasCDI()) {
+ for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
+ setOperationAction(ISD::CTLZ, VT, Legal);
+ }
+ } // Subtarget.hasCDI()
+
+ if (Subtarget.hasVPOPCNTDQ()) {
+ for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
+ setOperationAction(ISD::CTPOP, VT, Legal);
+ }
+ }
+
+  // This block controls legalization of v32i1/v64i1, which are available with
+ // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
+ // useBWIRegs.
+ if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
+ addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
+ addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
+
+ for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
+ setOperationAction(ISD::ADD, VT, Custom);
+ setOperationAction(ISD::SUB, VT, Custom);
+ setOperationAction(ISD::MUL, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+ setOperationAction(ISD::UADDSAT, VT, Custom);
+ setOperationAction(ISD::SADDSAT, VT, Custom);
+ setOperationAction(ISD::USUBSAT, VT, Custom);
+ setOperationAction(ISD::SSUBSAT, VT, Custom);
+
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ }
+
+ for (auto VT : { MVT::v16i1, MVT::v32i1 })
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
+ // Extends from v32i1 masks to 256-bit vectors.
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
+
+ for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
+ setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
+ }
+
+ // These operations are handled on non-VLX by artificially widening in
+ // isel patterns.
+ // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
+
+ if (Subtarget.hasBITALG()) {
+ for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
+ setOperationAction(ISD::CTPOP, VT, Legal);
+ }
+ }
+
+ if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
+ setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
+ setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
+ setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
+
+ setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
+
+ if (Subtarget.hasBWI()) {
+ setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
+ setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
+ }
+
+ setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
+ }
+
+ if (Subtarget.hasAMXTILE()) {
+ addRegisterClass(MVT::x86amx, &X86::TILERegClass);
+ }
+
+ // We want to custom lower some of our intrinsics.
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+ if (!Subtarget.is64Bit()) {
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
+ }
+
+ // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
+ // handle type legalization for these operations here.
+ //
+ // FIXME: We really should do custom legalization for addition and
+ // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
+ // than generic legalization for 64-bit multiplication-with-overflow, though.
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
+ continue;
+ // Add/Sub/Mul with overflow operations are custom lowered.
+ setOperationAction(ISD::SADDO, VT, Custom);
+ setOperationAction(ISD::UADDO, VT, Custom);
+ setOperationAction(ISD::SSUBO, VT, Custom);
+ setOperationAction(ISD::USUBO, VT, Custom);
+ setOperationAction(ISD::SMULO, VT, Custom);
+ setOperationAction(ISD::UMULO, VT, Custom);
+
+    // Support carry-in as a value rather than glue.
+ setOperationAction(ISD::ADDCARRY, VT, Custom);
+ setOperationAction(ISD::SUBCARRY, VT, Custom);
+ setOperationAction(ISD::SETCCCARRY, VT, Custom);
+ setOperationAction(ISD::SADDO_CARRY, VT, Custom);
+ setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
+ }
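+  // A rough sketch of IR that reaches the custom lowering above:
+  //   %s = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
+  // where the i1 overflow bit is ultimately produced from EFLAGS.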
+
+ if (!Subtarget.is64Bit()) {
+ // These libcalls are not available in 32-bit.
+ setLibcallName(RTLIB::SHL_I128, nullptr);
+ setLibcallName(RTLIB::SRL_I128, nullptr);
+ setLibcallName(RTLIB::SRA_I128, nullptr);
+ setLibcallName(RTLIB::MUL_I128, nullptr);
+ }
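+  // Clearing the names means legalization expands these operations instead of
+  // emitting the usual compiler-rt calls (__ashlti3, __lshrti3, __ashrti3 and
+  // __multi3 respectively), which 32-bit runtimes may not provide.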
+
+ // Combine sin / cos into _sincos_stret if it is available.
+ if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
+ getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
+ setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+ }
+
+ if (Subtarget.isTargetWin64()) {
+ setOperationAction(ISD::SDIV, MVT::i128, Custom);
+ setOperationAction(ISD::UDIV, MVT::i128, Custom);
+ setOperationAction(ISD::SREM, MVT::i128, Custom);
+ setOperationAction(ISD::UREM, MVT::i128, Custom);
+ }
+
+  // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` is. We
+  // should promote the value to 64 bits to solve this. This is what the CRT
+  // headers do - `fmodf` is an inline header function that casts to f64 and
+  // calls `fmod`.
+ if (Subtarget.is32Bit() &&
+ (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
+ for (ISD::NodeType Op :
+ {ISD::FCEIL, ISD::STRICT_FCEIL,
+ ISD::FCOS, ISD::STRICT_FCOS,
+ ISD::FEXP, ISD::STRICT_FEXP,
+ ISD::FFLOOR, ISD::STRICT_FFLOOR,
+ ISD::FREM, ISD::STRICT_FREM,
+ ISD::FLOG, ISD::STRICT_FLOG,
+ ISD::FLOG10, ISD::STRICT_FLOG10,
+ ISD::FPOW, ISD::STRICT_FPOW,
+ ISD::FSIN, ISD::STRICT_FSIN})
+ if (isOperationExpand(Op, MVT::f32))
+ setOperationAction(Op, MVT::f32, Promote);
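+  // e.g. on these targets fmodf(x, y) is effectively lowered as
+  // (float)fmod((double)x, (double)y), mirroring the CRT's inline fmodf
+  // wrapper.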
+
+ // We have target-specific dag combine patterns for the following nodes:
+ setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+ setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
+ setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::CONCAT_VECTORS);
+ setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
+ setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
+ setTargetDAGCombine(ISD::BITCAST);
+ setTargetDAGCombine(ISD::VSELECT);
+ setTargetDAGCombine(ISD::SELECT);
+ setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::OR);
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::FADD);
+ setTargetDAGCombine(ISD::FSUB);
+ setTargetDAGCombine(ISD::FNEG);
+ setTargetDAGCombine(ISD::FMA);
+ setTargetDAGCombine(ISD::STRICT_FMA);
+ setTargetDAGCombine(ISD::FMINNUM);
+ setTargetDAGCombine(ISD::FMAXNUM);
+ setTargetDAGCombine(ISD::SUB);
+ setTargetDAGCombine(ISD::LOAD);
+ setTargetDAGCombine(ISD::MLOAD);
+ setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::MSTORE);
+ setTargetDAGCombine(ISD::TRUNCATE);
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::ANY_EXTEND);
+ setTargetDAGCombine(ISD::SIGN_EXTEND);
+ setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
+ setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
+ setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
+ setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
+ setTargetDAGCombine(ISD::SINT_TO_FP);
+ setTargetDAGCombine(ISD::UINT_TO_FP);
+ setTargetDAGCombine(ISD::STRICT_SINT_TO_FP);
+ setTargetDAGCombine(ISD::STRICT_UINT_TO_FP);
+ setTargetDAGCombine(ISD::SETCC);
+ setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::XOR);
+ setTargetDAGCombine(ISD::MSCATTER);
+ setTargetDAGCombine(ISD::MGATHER);
+ setTargetDAGCombine(ISD::FP16_TO_FP);
+ setTargetDAGCombine(ISD::FP_EXTEND);
+ setTargetDAGCombine(ISD::STRICT_FP_EXTEND);
+ setTargetDAGCombine(ISD::FP_ROUND);
+
+ computeRegisterProperties(Subtarget.getRegisterInfo());
+
+ MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
+ MaxStoresPerMemsetOptSize = 8;
+ MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
+ MaxStoresPerMemcpyOptSize = 4;
+ MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
+ MaxStoresPerMemmoveOptSize = 4;
+
+  // TODO: These control memcmp expansion in CGP and could be raised higher,
+  // but that needs to be benchmarked and balanced with the potential use of
+  // vector load/store types (PR33329, PR33914).
+ MaxLoadsPerMemcmp = 2;
+ MaxLoadsPerMemcmpOptSize = 2;
+
+ // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
+ setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment));
+
+ // An out-of-order CPU can speculatively execute past a predictable branch,
+ // but a conditional move could be stalled by an expensive earlier operation.
+ PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
+ EnableExtLdPromotion = true;
+ setPrefFunctionAlignment(Align(16));
+
+ verifyIntrinsicTables();
+
+ // Default to having -disable-strictnode-mutation on
+ IsStrictFPEnabled = true;
+}
+
+// This has so far only been implemented for 64-bit MachO.
+bool X86TargetLowering::useLoadStackGuardNode() const {
+ return Subtarget.isTargetMachO() && Subtarget.is64Bit();
+}
+
+bool X86TargetLowering::useStackGuardXorFP() const {
+ // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
+ return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
+}
+
+SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
+ const SDLoc &DL) const {
+ EVT PtrTy = getPointerTy(DAG.getDataLayout());
+ unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
+ MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
+ return SDValue(Node, 0);
+}
+
+TargetLoweringBase::LegalizeTypeAction
+X86TargetLowering::getPreferredVectorAction(MVT VT) const {
+ if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
+ !Subtarget.hasBWI())
+ return TypeSplitVector;
+
+ if (VT.getVectorNumElements() != 1 &&
+ VT.getVectorElementType() != MVT::i1)
+ return TypeWidenVector;
+
+ return TargetLoweringBase::getPreferredVectorAction(VT);
+}
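+// e.g. with AVX512 but no BWI a v32i1 value is split into two v16i1 halves by
+// the hook above, while short non-i1 vectors such as v2i32 are widened.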
+
+static std::pair<MVT, unsigned>
+handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
+ const X86Subtarget &Subtarget) {
+ // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
+ // convention is one that uses k registers.
+ if (NumElts == 2)
+ return {MVT::v2i64, 1};
+ if (NumElts == 4)
+ return {MVT::v4i32, 1};
+ if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
+ CC != CallingConv::Intel_OCL_BI)
+ return {MVT::v8i16, 1};
+ if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
+ CC != CallingConv::Intel_OCL_BI)
+ return {MVT::v16i8, 1};
+ // v32i1 passes in ymm unless we have BWI and the calling convention is
+ // regcall.
+ if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
+ return {MVT::v32i8, 1};
+ // Split v64i1 vectors if we don't have v64i8 available.
+ if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
+ if (Subtarget.useAVX512Regs())
+ return {MVT::v64i8, 1};
+ return {MVT::v32i8, 2};
+ }
+
+ // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
+ if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
+ NumElts > 64)
+ return {MVT::i8, NumElts};
+
+ return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
+}
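+// For example, under the C calling convention a v8i1 argument is passed as a
+// single v8i16 in an XMM register, while a non-power-of-2 vXi1 (or v64i1
+// without BWI) is broken into one i8 per element, matching the AVX2 behavior
+// described above.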
+
+MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC,
+ EVT VT) const {
+ if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+ Subtarget.hasAVX512()) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ MVT RegisterVT;
+ unsigned NumRegisters;
+ std::tie(RegisterVT, NumRegisters) =
+ handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
+ if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
+ return RegisterVT;
+ }
+
+ return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+}
+
+unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC,
+ EVT VT) const {
+ if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+ Subtarget.hasAVX512()) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ MVT RegisterVT;
+ unsigned NumRegisters;
+ std::tie(RegisterVT, NumRegisters) =
+ handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
+ if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
+ return NumRegisters;
+ }
+
+ return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
+}
+
+unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
+ LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
+ unsigned &NumIntermediates, MVT &RegisterVT) const {
+ // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
+ if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+ Subtarget.hasAVX512() &&
+ (!isPowerOf2_32(VT.getVectorNumElements()) ||
+ (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
+ VT.getVectorNumElements() > 64)) {
+ RegisterVT = MVT::i8;
+ IntermediateVT = MVT::i1;
+ NumIntermediates = VT.getVectorNumElements();
+ return NumIntermediates;
+ }
+
+ // Split v64i1 vectors if we don't have v64i8 available.
+ if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
+ CC != CallingConv::X86_RegCall) {
+ RegisterVT = MVT::v32i8;
+ IntermediateVT = MVT::v32i1;
+ NumIntermediates = 2;
+ return 2;
+ }
+
+  return TargetLowering::getVectorTypeBreakdownForCallingConv(
+      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
+}
+
+EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
+ LLVMContext& Context,
+ EVT VT) const {
+ if (!VT.isVector())
+ return MVT::i8;
+
+ if (Subtarget.hasAVX512()) {
+ const unsigned NumElts = VT.getVectorNumElements();
+
+ // Figure out what this type will be legalized to.
+ EVT LegalVT = VT;
+ while (getTypeAction(Context, LegalVT) != TypeLegal)
+ LegalVT = getTypeToTransformTo(Context, LegalVT);
+
+ // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
+ if (LegalVT.getSimpleVT().is512BitVector())
+ return EVT::getVectorVT(Context, MVT::i1, NumElts);
+
+ if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
+ // If we legalized to less than a 512-bit vector, then we will use a vXi1
+ // compare for vXi32/vXi64 for sure. If we have BWI we will also support
+ // vXi16/vXi8.
+ MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
+ if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
+ return EVT::getVectorVT(Context, MVT::i1, NumElts);
+ }
+ }
+
+ return VT.changeVectorElementTypeToInteger();
+}
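+// e.g. on an AVX512 target a v16f32 compare yields a v16i1 mask, whereas
+// without AVX512 the same compare produces a v16i32 vector of all-ones /
+// all-zeros lanes.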
+
+/// Helper for getByValTypeAlignment to determine
+/// the desired ByVal argument alignment.
+static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
+ if (MaxAlign == 16)
+ return;
+ if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+ if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128)
+ MaxAlign = Align(16);
+ } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ Align EltAlign;
+ getMaxByValAlign(ATy->getElementType(), EltAlign);
+ if (EltAlign > MaxAlign)
+ MaxAlign = EltAlign;
+ } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ for (auto *EltTy : STy->elements()) {
+ Align EltAlign;
+ getMaxByValAlign(EltTy, EltAlign);
+ if (EltAlign > MaxAlign)
+ MaxAlign = EltAlign;
+ if (MaxAlign == 16)
+ break;
+ }
+ }
+}
+
+/// Return the desired alignment for ByVal aggregate
+/// function arguments in the caller parameter area. For X86, aggregates
+/// that contain SSE vectors are placed at 16-byte boundaries while the rest
+/// are at 4-byte boundaries.
+unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
+ const DataLayout &DL) const {
+ if (Subtarget.is64Bit()) {
+ // Max of 8 and alignment of type.
+ Align TyAlign = DL.getABITypeAlign(Ty);
+ if (TyAlign > 8)
+ return TyAlign.value();
+ return 8;
+ }
+
+ Align Alignment(4);
+ if (Subtarget.hasSSE1())
+ getMaxByValAlign(Ty, Alignment);
+ return Alignment.value();
+}
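+// For example, on a 32-bit SSE target a byval struct containing a 128-bit
+// vector member is placed at a 16-byte boundary by the logic above, while a
+// struct of two ints keeps the default 4-byte alignment.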
+
+/// It returns EVT::Other if the type should be determined using generic
+/// target-independent logic.
+/// For vector ops we check that the overall size isn't larger than our
+/// preferred vector width.
+EVT X86TargetLowering::getOptimalMemOpType(
+ const MemOp &Op, const AttributeList &FuncAttributes) const {
+ if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
+ if (Op.size() >= 16 &&
+ (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
+ // FIXME: Check if unaligned 64-byte accesses are slow.
+ if (Op.size() >= 64 && Subtarget.hasAVX512() &&
+ (Subtarget.getPreferVectorWidth() >= 512)) {
+ return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
+ }
+ // FIXME: Check if unaligned 32-byte accesses are slow.
+ if (Op.size() >= 32 && Subtarget.hasAVX() &&
+ (Subtarget.getPreferVectorWidth() >= 256)) {
+ // Although this isn't a well-supported type for AVX1, we'll let
+ // legalization and shuffle lowering produce the optimal codegen. If we
+ // choose an optimal type with a vector element larger than a byte,
+ // getMemsetStores() may create an intermediate splat (using an integer
+ // multiply) before we splat as a vector.
+ return MVT::v32i8;
+ }
+ if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
+ return MVT::v16i8;
+ // TODO: Can SSE1 handle a byte vector?
+ // If we have SSE1 registers we should be able to use them.
+ if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
+ (Subtarget.getPreferVectorWidth() >= 128))
+ return MVT::v4f32;
+ } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
+ Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
+ // Do not use f64 to lower memcpy if source is string constant. It's
+ // better to use i32 to avoid the loads.
+ // Also, do not use f64 to lower memset unless this is a memset of zeros.
+ // The gymnastics of splatting a byte value into an XMM register and then
+ // only using 8-byte stores (because this is a CPU with slow unaligned
+ // 16-byte accesses) makes that a loser.
+ return MVT::f64;
+ }
+ }
+ // This is a compromise. If we reach here, unaligned accesses may be slow on
+ // this target. However, creating smaller, aligned accesses could be even
+ // slower and would certainly be a lot more code.
+ if (Subtarget.is64Bit() && Op.size() >= 8)
+ return MVT::i64;
+ return MVT::i32;
+}
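+// Rough examples of the selection above: a 64-byte memcpy on an AVX-512
+// target preferring 512-bit vectors uses v64i8/v16i32 stores, a 32-byte copy
+// with AVX uses v32i8, and small copies on 64-bit targets fall back to
+// i64/i32 stores.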
+
+bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
+ if (VT == MVT::f32)
+ return X86ScalarSSEf32;
+ else if (VT == MVT::f64)
+ return X86ScalarSSEf64;
+ return true;
+}
+
+bool X86TargetLowering::allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags,
+ bool *Fast) const {
+ if (Fast) {
+ switch (VT.getSizeInBits()) {
+ default:
+ // 8-byte and under are always assumed to be fast.
+ *Fast = true;
+ break;
+ case 128:
+ *Fast = !Subtarget.isUnalignedMem16Slow();
+ break;
+ case 256:
+ *Fast = !Subtarget.isUnalignedMem32Slow();
+ break;
+ // TODO: What about AVX-512 (512-bit) accesses?
+ }
+ }
+ // NonTemporal vector memory ops must be aligned.
+ if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
+    // NT loads can only be vector aligned, so if it's less aligned than the
+ // minimum vector size (which we can split the vector down to), we might as
+ // well use a regular unaligned vector load.
+ // We don't have any NT loads pre-SSE41.
+ if (!!(Flags & MachineMemOperand::MOLoad))
+ return (Align < 16 || !Subtarget.hasSSE41());
+ return false;
+ }
+ // Misaligned accesses of any size are always allowed.
+ return true;
+}
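+// e.g. an under-aligned non-temporal vector load is still reported as allowed
+// above, so it is emitted as a regular unaligned vector load rather than
+// MOVNTDQA (which requires alignment); under-aligned non-temporal vector
+// stores are rejected and split up instead.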
+
+/// Return the entry encoding for a jump table in the
+/// current function. The returned value is a member of the
+/// MachineJumpTableInfo::JTEntryKind enum.
+unsigned X86TargetLowering::getJumpTableEncoding() const {
+ // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
+ // symbol.
+ if (isPositionIndependent() && Subtarget.isPICStyleGOT())
+ return MachineJumpTableInfo::EK_Custom32;
+
+ // Otherwise, use the normal jump table encoding heuristics.
+ return TargetLowering::getJumpTableEncoding();
+}
+
+bool X86TargetLowering::useSoftFloat() const {
+ return Subtarget.useSoftFloat();
+}
+
+void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
+ ArgListTy &Args) const {
+
+ // Only relabel X86-32 for C / Stdcall CCs.
+ if (Subtarget.is64Bit())
+ return;
+ if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
+ return;
+ unsigned ParamRegs = 0;
+ if (auto *M = MF->getFunction().getParent())
+ ParamRegs = M->getNumberRegisterParameters();
+
+  // Mark the first N integer arguments as being passed in registers.
+ for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
+ Type *T = Args[Idx].Ty;
+ if (T->isIntOrPtrTy())
+ if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
+ unsigned numRegs = 1;
+ if (MF->getDataLayout().getTypeAllocSize(T) > 4)
+ numRegs = 2;
+ if (ParamRegs < numRegs)
+ return;
+ ParamRegs -= numRegs;
+ Args[Idx].IsInReg = true;
+ }
+ }
+}
+
+const MCExpr *
+X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+ const MachineBasicBlock *MBB,
+                                             unsigned uid, MCContext &Ctx) const {
+ assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
+ // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
+ // entries.
+ return MCSymbolRefExpr::create(MBB->getSymbol(),
+ MCSymbolRefExpr::VK_GOTOFF, Ctx);
+}
+
+/// Returns relocation base for the given PIC jumptable.
+SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const {
+ if (!Subtarget.is64Bit())
+ // This doesn't have SDLoc associated with it, but is not really the
+ // same as a Register.
+ return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
+ getPointerTy(DAG.getDataLayout()));
+ return Table;
+}
+
+/// This returns the relocation base for the given PIC jumptable,
+/// the same as getPICJumpTableRelocBase, but as an MCExpr.
+const MCExpr *X86TargetLowering::
+getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
+ MCContext &Ctx) const {
+ // X86-64 uses RIP relative addressing based on the jump table label.
+ if (Subtarget.isPICStyleRIPRel())
+ return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
+
+ // Otherwise, the reference is relative to the PIC base.
+ return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
+}
+
+std::pair<const TargetRegisterClass *, uint8_t>
+X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
+ MVT VT) const {
+ const TargetRegisterClass *RRC = nullptr;
+ uint8_t Cost = 1;
+ switch (VT.SimpleTy) {
+ default:
+ return TargetLowering::findRepresentativeClass(TRI, VT);
+ case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
+ RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
+ break;
+ case MVT::x86mmx:
+ RRC = &X86::VR64RegClass;
+ break;
+ case MVT::f32: case MVT::f64:
+ case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
+ case MVT::v4f32: case MVT::v2f64:
+ case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
+ case MVT::v8f32: case MVT::v4f64:
+ case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
+ case MVT::v16f32: case MVT::v8f64:
+ RRC = &X86::VR128XRegClass;
+ break;
+ }
+ return std::make_pair(RRC, Cost);
+}
+
+unsigned X86TargetLowering::getAddressSpace() const {
+ if (Subtarget.is64Bit())
+ return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
+ return 256;
+}
+
+static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
+ return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
+ (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
+}
+
+static Constant* SegmentOffset(IRBuilder<> &IRB,
+ unsigned Offset, unsigned AddressSpace) {
+ return ConstantExpr::getIntToPtr(
+ ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
+ Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
+}
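+// A rough sketch of what this produces: SegmentOffset(IRB, 0x28, 257) is an
+// inttoptr constant pointing into address space 257 (%fs on x86-64) at offset
+// 0x28, i.e. the TLS stack-guard slot used below.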
+
+Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
+ // glibc, bionic, and Fuchsia have a special slot for the stack guard in
+ // tcbhead_t; use it instead of the usual global variable (see
+ // sysdeps/{i386,x86_64}/nptl/tls.h)
+ if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
+ if (Subtarget.isTargetFuchsia()) {
+ // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
+ return SegmentOffset(IRB, 0x10, getAddressSpace());
+ } else {
+ unsigned AddressSpace = getAddressSpace();
+      // In particular, some users may customize the base reg and offset.
+ unsigned Offset = getTargetMachine().Options.StackProtectorGuardOffset;
+      // If the -stack-protector-guard-offset value is not set, use %fs:0x28,
+      // unless we're using a Kernel code model, in which case it's %gs:0x28;
+      // %gs:0x14 on i386.
+ if (Offset == (unsigned)-1)
+ Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
+
+ const auto &GuardReg = getTargetMachine().Options.StackProtectorGuardReg;
+ if (GuardReg == "fs")
+ AddressSpace = X86AS::FS;
+ else if (GuardReg == "gs")
+ AddressSpace = X86AS::GS;
+ return SegmentOffset(IRB, Offset, AddressSpace);
+ }
+ }
+ return TargetLowering::getIRStackGuard(IRB);
+}
+
+void X86TargetLowering::insertSSPDeclarations(Module &M) const {
+  // The MSVC CRT provides functionality for stack protection.
+ if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
+ Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
+ // MSVC CRT has a global variable holding security cookie.
+ M.getOrInsertGlobal("__security_cookie",
+ Type::getInt8PtrTy(M.getContext()));
+
+ // MSVC CRT has a function to validate security cookie.
+ FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
+ "__security_check_cookie", Type::getVoidTy(M.getContext()),
+ Type::getInt8PtrTy(M.getContext()));
+ if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
+ F->setCallingConv(CallingConv::X86_FastCall);
+ F->addAttribute(1, Attribute::AttrKind::InReg);
+ }
+ return;
+ }
+
+ auto GuardMode = getTargetMachine().Options.StackProtectorGuard;
+
+ // glibc, bionic, and Fuchsia have a special slot for the stack guard.
+ if ((GuardMode == llvm::StackProtectorGuards::TLS ||
+ GuardMode == llvm::StackProtectorGuards::None)
+ && hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
+ return;
+ TargetLowering::insertSSPDeclarations(M);
+}
+
+Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
+ // MSVC CRT has a global variable holding security cookie.
+ if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
+ Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
+ return M.getGlobalVariable("__security_cookie");
+ }
+ return TargetLowering::getSDagStackGuard(M);
+}
+
+Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
+ // MSVC CRT has a function to validate security cookie.
+ if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
+ Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
+ return M.getFunction("__security_check_cookie");
+ }
+ return TargetLowering::getSSPStackGuardCheck(M);
+}
+
+Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
+ if (Subtarget.getTargetTriple().isOSContiki())
+ return getDefaultSafeStackPointerLocation(IRB, false);
+
+ // Android provides a fixed TLS slot for the SafeStack pointer. See the
+ // definition of TLS_SLOT_SAFESTACK in
+ // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+ if (Subtarget.isTargetAndroid()) {
+    // %fs:0x48, unless we're using a Kernel code model, in which case the %gs
+    // segment is used instead; %gs:0x24 on i386.
+ unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
+ return SegmentOffset(IRB, Offset, getAddressSpace());
+ }
+
+ // Fuchsia is similar.
+ if (Subtarget.isTargetFuchsia()) {
+ // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
+ return SegmentOffset(IRB, 0x18, getAddressSpace());
+ }
+
+ return TargetLowering::getSafeStackPointerLocation(IRB);
+}
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+bool X86TargetLowering::CanLowerReturn(
+ CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
+ return CCInfo.CheckReturn(Outs, RetCC_X86);
+}
+
+const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
+ static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
+ return ScratchRegs;
+}
+
+/// Lowers mask values (v*i1) to the local register values.
+/// \returns DAG node after lowering to register type
+static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
+ const SDLoc &Dl, SelectionDAG &DAG) {
+ EVT ValVT = ValArg.getValueType();
+
+ if (ValVT == MVT::v1i1)
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
+ DAG.getIntPtrConstant(0, Dl));
+
+ if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
+ (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
+    // Two-stage lowering might be required:
+ // bitcast: v8i1 -> i8 / v16i1 -> i16
+ // anyextend: i8 -> i32 / i16 -> i32
+ EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
+ SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
+ if (ValLoc == MVT::i32)
+ ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
+ return ValToCopy;
+ }
+
+ if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
+ (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
+    // One-stage lowering is required:
+ // bitcast: v32i1 -> i32 / v64i1 -> i64
+ return DAG.getBitcast(ValLoc, ValArg);
+ }
+
+ return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
+}
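+// For example, a v16i1 mask bound to an i32 location is bitcast to i16 and
+// then any-extended to i32 by the helper above, while v32i1/v64i1 masks are
+// simply bitcast to i32/i64.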
+
+/// Breaks a v64i1 value into two registers and adds the new nodes to the DAG.
+static void Passv64i1ArgInRegs(
+ const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
+ SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
+ CCValAssign &NextVA, const X86Subtarget &Subtarget) {
+ assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
+ assert(Subtarget.is32Bit() && "Expecting 32 bit target");
+ assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
+ assert(VA.isRegLoc() && NextVA.isRegLoc() &&
+ "The value should reside in two registers");
+
+ // Before splitting the value we cast it to i64
+ Arg = DAG.getBitcast(MVT::i64, Arg);
+
+  // Split the value into two i32 halves.
+ SDValue Lo, Hi;
+ Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
+ DAG.getConstant(0, Dl, MVT::i32));
+ Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
+ DAG.getConstant(1, Dl, MVT::i32));
+
+  // Attach the two i32 halves to the corresponding registers.
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
+ RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
+}
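+// e.g. on a 32-bit AVX512BW target a v64i1 mask is bitcast to i64 and passed
+// as two i32 halves: the low half in VA's register and the high half in
+// NextVA's register.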
+
+SDValue
+X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &dl, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+ // In some cases we need to disable registers from the default CSR list.
+ // For example, when they are used for argument passing.
+ bool ShouldDisableCalleeSavedRegister =
+ CallConv == CallingConv::X86_RegCall ||
+ MF.getFunction().hasFnAttribute("no_caller_saved_registers");
+
+ if (CallConv == CallingConv::X86_INTR && !Outs.empty())
+ report_fatal_error("X86 interrupts may not return any value");
+
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
+ CCInfo.AnalyzeReturn(Outs, RetCC_X86);
+
+ SmallVector<std::pair<Register, SDValue>, 4> RetVals;
+ for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
+ ++I, ++OutsIndex) {
+ CCValAssign &VA = RVLocs[I];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+
+ // Add the register to the CalleeSaveDisableRegs list.
+ if (ShouldDisableCalleeSavedRegister)
+ MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
+
+ SDValue ValToCopy = OutVals[OutsIndex];
+ EVT ValVT = ValToCopy.getValueType();
+
+ // Promote values to the appropriate types.
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
+ else if (VA.getLocInfo() == CCValAssign::AExt) {
+ if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
+ ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
+ else
+ ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
+    } else if (VA.getLocInfo() == CCValAssign::BCvt)
+ ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
+
+ assert(VA.getLocInfo() != CCValAssign::FPExt &&
+ "Unexpected FP-extend for return value.");
+
+ // Report an error if we have attempted to return a value via an XMM
+ // register and SSE was disabled.
+ if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
+ errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
+ } else if (!Subtarget.hasSSE2() &&
+ X86::FR64XRegClass.contains(VA.getLocReg()) &&
+ ValVT == MVT::f64) {
+ // When returning a double via an XMM register, report an error if SSE2 is
+ // not enabled.
+ errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
+ }
+
+ // Returns in ST0/ST1 are handled specially: these are pushed as operands to
+ // the RET instruction and handled by the FP Stackifier.
+    if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) {
+ // If this is a copy from an xmm register to ST(0), use an FPExtend to
+ // change the value to the FP stack register class.
+ if (isScalarFPTypeInSSEReg(VA.getValVT()))
+ ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
+ RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
+ // Don't emit a copytoreg.
+ continue;
+ }
+
+ // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
+ // which is returned in RAX / RDX.
+ if (Subtarget.is64Bit()) {
+ if (ValVT == MVT::x86mmx) {
+ if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
+ ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
+ ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+ ValToCopy);
+ // If we don't have SSE2 available, convert to v4f32 so the generated
+ // register is legal.
+ if (!Subtarget.hasSSE2())
+ ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
+ }
+ }
+ }
+
+ if (VA.needsCustom()) {
+ assert(VA.getValVT() == MVT::v64i1 &&
+ "Currently the only custom case is when we split v64i1 to 2 regs");
+
+ Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
+ Subtarget);
+
+ // Add the second register to the CalleeSaveDisableRegs list.
+ if (ShouldDisableCalleeSavedRegister)
+ MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
+ } else {
+ RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
+ }
+ }
+
+ SDValue Flag;
+ SmallVector<SDValue, 6> RetOps;
+ RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+ // Operand #1 = Bytes To Pop
+ RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
+ MVT::i32));
+
+ // Copy the result values into the output registers.
+ for (auto &RetVal : RetVals) {
+ if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
+ RetOps.push_back(RetVal.second);
+ continue; // Don't emit a copytoreg.
+ }
+
+ Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(
+ DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
+ }
+
+ // Swift calling convention does not require we copy the sret argument
+ // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
+
+ // All x86 ABIs require that for returning structs by value we copy
+ // the sret argument into %rax/%eax (depending on ABI) for the return.
+ // We saved the argument into a virtual register in the entry block,
+ // so now we copy the value out and into %rax/%eax.
+ //
+ // Checking Function.hasStructRetAttr() here is insufficient because the IR
+ // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
+ // false, then an sret argument may be implicitly inserted in the SelDAG. In
+ // either case FuncInfo->setSRetReturnReg() will have been called.
+ if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
+ // When we have both sret and another return value, we should use the
+ // original Chain stored in RetOps[0], instead of the current Chain updated
+    // in the above loop. If we only have sret, RetOps[0] equals Chain.
+
+ // For the case of sret and another return value, we have
+ // Chain_0 at the function entry
+ // Chain_1 = getCopyToReg(Chain_0) in the above loop
+ // If we use Chain_1 in getCopyFromReg, we will have
+ // Val = getCopyFromReg(Chain_1)
+ // Chain_2 = getCopyToReg(Chain_1, Val) from below
+
+ // getCopyToReg(Chain_0) will be glued together with
+ // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
+ // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
+ // Data dependency from Unit B to Unit A due to usage of Val in
+ // getCopyToReg(Chain_1, Val)
+ // Chain dependency from Unit A to Unit B
+
+ // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
+ SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
+ getPointerTy(MF.getDataLayout()));
+
+ Register RetValReg
+ = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
+ X86::RAX : X86::EAX;
+ Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
+ Flag = Chain.getValue(1);
+
+ // RAX/EAX now acts like a return value.
+ RetOps.push_back(
+ DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
+
+ // Add the returned register to the CalleeSaveDisableRegs list.
+ if (ShouldDisableCalleeSavedRegister)
+ MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
+ }
+
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const MCPhysReg *I =
+ TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
+ if (I) {
+ for (; *I; ++I) {
+ if (X86::GR64RegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::i64));
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+ }
+ }
+
+ RetOps[0] = Chain; // Update chain.
+
+ // Add the flag if we have it.
+ if (Flag.getNode())
+ RetOps.push_back(Flag);
+
+  X86ISD::NodeType Opcode = X86ISD::RET_FLAG;
+  if (CallConv == CallingConv::X86_INTR)
+    Opcode = X86ISD::IRET;
+  return DAG.getNode(Opcode, dl, MVT::Other, RetOps);
+}
+
+bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
+ if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
+ return false;
+
+ SDValue TCChain = Chain;
+ SDNode *Copy = *N->use_begin();
+ if (Copy->getOpcode() == ISD::CopyToReg) {
+ // If the copy has a glue operand, we conservatively assume it isn't safe to
+ // perform a tail call.
+ if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
+ return false;
+ TCChain = Copy->getOperand(0);
+ } else if (Copy->getOpcode() != ISD::FP_EXTEND)
+ return false;
+
+ bool HasRet = false;
+ for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
+ UI != UE; ++UI) {
+ if (UI->getOpcode() != X86ISD::RET_FLAG)
+ return false;
+    // If we are returning more than one value, we can definitely
+    // not make a tail call; see PR19530.
+ if (UI->getNumOperands() > 4)
+ return false;
+ if (UI->getNumOperands() == 4 &&
+ UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
+ return false;
+ HasRet = true;
+ }
+
+ if (!HasRet)
+ return false;
+
+ Chain = TCChain;
+ return true;
+}
+
+EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
+ ISD::NodeType ExtendKind) const {
+ MVT ReturnMVT = MVT::i32;
+
+ bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
+ if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
+ // The ABI does not require i1, i8 or i16 to be extended.
+ //
+ // On Darwin, there is code in the wild relying on Clang's old behaviour of
+ // always extending i8/i16 return values, so keep doing that for now.
+ // (PR26665).
+ ReturnMVT = MVT::i8;
+ }
+
+ EVT MinVT = getRegisterType(Context, ReturnMVT);
+ return VT.bitsLT(MinVT) ? MinVT : VT;
+}
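+// e.g. an i1 return value is only extended to i8, while on Darwin i8/i16
+// returns are still extended to i32 for compatibility (PR26665).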
+
+/// Reads two 32 bit registers and creates a 64 bit mask value.
+/// \param VA The current 32 bit value that needs to be assigned.
+/// \param NextVA The next 32 bit value that needs to be assigned.
+/// \param Root The parent DAG node.
+/// \param [in,out] InFlag Represents SDValue in the parent DAG node for
+///                        glue purposes. In case the DAG is already using a
+///                        physical register instead of a virtual one, we
+///                        should glue our new SDValue to the InFlag SDValue.
+/// \return a new SDValue of size 64 bit.
+static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
+ SDValue &Root, SelectionDAG &DAG,
+ const SDLoc &Dl, const X86Subtarget &Subtarget,
+ SDValue *InFlag = nullptr) {
+ assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
+ assert(Subtarget.is32Bit() && "Expecting 32 bit target");
+ assert(VA.getValVT() == MVT::v64i1 &&
+ "Expecting first location of 64 bit width type");
+ assert(NextVA.getValVT() == VA.getValVT() &&
+ "The locations should have the same type");
+ assert(VA.isRegLoc() && NextVA.isRegLoc() &&
+ "The values should reside in two registers");
+
+ SDValue Lo, Hi;
+ SDValue ArgValueLo, ArgValueHi;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetRegisterClass *RC = &X86::GR32RegClass;
+
+ // Read a 32 bit value from the registers.
+ if (nullptr == InFlag) {
+ // When no physical register is present,
+ // create an intermediate virtual register.
+ Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
+ Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
+ ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
+ } else {
+ // When a physical register is available read the value from it and glue
+ // the reads together.
+ ArgValueLo =
+ DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
+ *InFlag = ArgValueLo.getValue(2);
+ ArgValueHi =
+ DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
+ *InFlag = ArgValueHi.getValue(2);
+ }
+
+ // Convert the i32 type into v32i1 type.
+ Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
+
+ // Convert the i32 type into v32i1 type.
+ Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
+
+ // Concatenate the two values together.
+ return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
+}
+
+/// The function will lower a register of various sizes (8/16/32/64)
+/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
+/// \returns a DAG node containing the operand after lowering to mask type.
+static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
+ const EVT &ValLoc, const SDLoc &Dl,
+ SelectionDAG &DAG) {
+ SDValue ValReturned = ValArg;
+
+ if (ValVT == MVT::v1i1)
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
+
+ if (ValVT == MVT::v64i1) {
+    // On a 32 bit machine, this case is handled by getv64i1Argument.
+    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
+    // On a 64 bit machine, there is no need to truncate the value, only
+    // bitcast it.
+ } else {
+ MVT maskLen;
+ switch (ValVT.getSimpleVT().SimpleTy) {
+ case MVT::v8i1:
+ maskLen = MVT::i8;
+ break;
+ case MVT::v16i1:
+ maskLen = MVT::i16;
+ break;
+ case MVT::v32i1:
+ maskLen = MVT::i32;
+ break;
+ default:
+ llvm_unreachable("Expecting a vector of i1 types");
+ }
+
+ ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
+ }
+ return DAG.getBitcast(ValVT, ValReturned);
+}
+
+/// Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+///
+SDValue X86TargetLowering::LowerCallResult(
+ SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+ uint32_t *RegMask) const {
+
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
+ ++I, ++InsIndex) {
+ CCValAssign &VA = RVLocs[I];
+ EVT CopyVT = VA.getLocVT();
+
+ // In some calling conventions we need to remove the used registers
+ // from the register mask.
+ if (RegMask) {
+ for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
+ }
+
+ // Report an error if there was an attempt to return FP values via XMM
+ // registers.
+ if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
+ errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
+ if (VA.getLocReg() == X86::XMM1)
+ VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
+ else
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
+ } else if (!Subtarget.hasSSE2() &&
+ X86::FR64XRegClass.contains(VA.getLocReg()) &&
+ CopyVT == MVT::f64) {
+ errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
+ if (VA.getLocReg() == X86::XMM1)
+ VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
+ else
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
+ }
+
+ // If we prefer to use the value in xmm registers, copy it out as f80 and
+ // use a truncate to move it from fp stack reg to xmm reg.
+ bool RoundAfterCopy = false;
+ if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
+ isScalarFPTypeInSSEReg(VA.getValVT())) {
+ if (!Subtarget.hasX87())
+ report_fatal_error("X87 register return with X87 disabled");
+ CopyVT = MVT::f80;
+ RoundAfterCopy = (CopyVT != VA.getLocVT());
+ }
+
+ SDValue Val;
+ if (VA.needsCustom()) {
+ assert(VA.getValVT() == MVT::v64i1 &&
+ "Currently the only custom case is when we split v64i1 to 2 regs");
+ Val =
+ getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
+ } else {
+ Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
+ .getValue(1);
+ Val = Chain.getValue(0);
+ InFlag = Chain.getValue(2);
+ }
+
+ if (RoundAfterCopy)
+ Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
+ // This truncation won't change the value.
+ DAG.getIntPtrConstant(1, dl));
+
+ if (VA.isExtInLoc()) {
+ if (VA.getValVT().isVector() &&
+ VA.getValVT().getScalarType() == MVT::i1 &&
+ ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
+ (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
+ // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
+ Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
+ } else
+ Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+ }
+
+ if (VA.getLocInfo() == CCValAssign::BCvt)
+ Val = DAG.getBitcast(VA.getValVT(), Val);
+
+ InVals.push_back(Val);
+ }
+
+ return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+// C & StdCall & Fast Calling Convention implementation
+//===----------------------------------------------------------------------===//
+// The StdCall calling convention is the standard for many Windows API
+// routines. It differs from the C calling convention only slightly: the
+// callee cleans up the stack rather than the caller, and symbols are
+// decorated accordingly. It does not support any vector arguments.
+// For info on fast calling convention see Fast Calling Convention (tail call)
+// implementation LowerX86_32FastCCCallTo.
+
+/// Determines whether a call uses struct return semantics.
+enum StructReturnType {
+ NotStructReturn,
+ RegStructReturn,
+ StackStructReturn
+};
+static StructReturnType
+callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
+ if (Outs.empty())
+ return NotStructReturn;
+
+ const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
+ if (!Flags.isSRet())
+ return NotStructReturn;
+ if (Flags.isInReg() || IsMCU)
+ return RegStructReturn;
+ return StackStructReturn;
+}
+
+/// Determines whether a function uses struct return semantics.
+static StructReturnType
+argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
+ if (Ins.empty())
+ return NotStructReturn;
+
+ const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
+ if (!Flags.isSRet())
+ return NotStructReturn;
+ if (Flags.isInReg() || IsMCU)
+ return RegStructReturn;
+ return StackStructReturn;
+}
+
+/// Make a copy of an aggregate at address specified by "Src" to address
+/// "Dst" with size and alignment information specified by the specific
+/// parameter attribute. The copy will be passed as a byval function parameter.
+static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
+ SDValue Chain, ISD::ArgFlagsTy Flags,
+ SelectionDAG &DAG, const SDLoc &dl) {
+ SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
+
+ return DAG.getMemcpy(
+ Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
+ /*isVolatile*/ false, /*AlwaysInline=*/true,
+ /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
+}
+
+/// Return true if the calling convention is one that we can guarantee TCO for.
+static bool canGuaranteeTCO(CallingConv::ID CC) {
+ return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
+ CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
+ CC == CallingConv::HHVM || CC == CallingConv::Tail);
+}
+
+/// Return true if we might ever do TCO for calls with this calling convention.
+static bool mayTailCallThisCC(CallingConv::ID CC) {
+ switch (CC) {
+ // C calling conventions:
+ case CallingConv::C:
+ case CallingConv::Win64:
+ case CallingConv::X86_64_SysV:
+ // Callee pop conventions:
+ case CallingConv::X86_ThisCall:
+ case CallingConv::X86_StdCall:
+ case CallingConv::X86_VectorCall:
+ case CallingConv::X86_FastCall:
+ // Swift:
+ case CallingConv::Swift:
+ return true;
+ default:
+ return canGuaranteeTCO(CC);
+ }
+}
+
+/// Return true if the function is being made into a tailcall target by
+/// changing its ABI.
+static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
+ return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || CC == CallingConv::Tail;
+}
+
+bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
+ if (!CI->isTailCall())
+ return false;
+
+ CallingConv::ID CalleeCC = CI->getCallingConv();
+ if (!mayTailCallThisCC(CalleeCC))
+ return false;
+
+ return true;
+}
+
+SDValue
+X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ const CCValAssign &VA,
+ MachineFrameInfo &MFI, unsigned i) const {
+ // Create the nodes corresponding to a load from this parameter slot.
+ ISD::ArgFlagsTy Flags = Ins[i].Flags;
+ bool AlwaysUseMutable = shouldGuaranteeTCO(
+ CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
+ bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
+ EVT ValVT;
+ MVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  // If the value is passed by pointer, we have the address passed instead of
+  // the value itself. There is no need to extend if the mask value and its
+  // location have the same size.
+ bool ExtendedInMem =
+ VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
+ VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
+
+ if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
+ ValVT = VA.getLocVT();
+ else
+ ValVT = VA.getValVT();
+
+  // FIXME: For now, all byval parameter objects are marked mutable. This can
+  // be changed with more analysis.
+  // For tail call optimization, mark all arguments mutable, since they could
+  // be overwritten when the arguments of a tail call are lowered.
+ if (Flags.isByVal()) {
+ unsigned Bytes = Flags.getByValSize();
+ if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
+
+ // FIXME: For now, all byval parameter objects are marked as aliasing. This
+ // can be improved with deeper analysis.
+ int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
+ /*isAliased=*/true);
+ return DAG.getFrameIndex(FI, PtrVT);
+ }
+
+ EVT ArgVT = Ins[i].ArgVT;
+
+  // If this is a vector that has been split into multiple parts, and the
+  // scalar size of the parts doesn't match the vector element size, then we
+  // can't elide the copy. The parts will have padding between them instead of
+  // being packed like a vector.
+ bool ScalarizedAndExtendedVector =
+ ArgVT.isVector() && !VA.getLocVT().isVector() &&
+ VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
+
+ // This is an argument in memory. We might be able to perform copy elision.
+ // If the argument is passed directly in memory without any extension, then we
+ // can perform copy elision. Large vector types, for example, may be passed
+ // indirectly by pointer.
+ if (Flags.isCopyElisionCandidate() &&
+ VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
+ !ScalarizedAndExtendedVector) {
+ SDValue PartAddr;
+ if (Ins[i].PartOffset == 0) {
+ // If this is a one-part value or the first part of a multi-part value,
+ // create a stack object for the entire argument value type and return a
+ // load from our portion of it. This assumes that if the first part of an
+ // argument is in memory, the rest will also be in memory.
+ int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
+ /*IsImmutable=*/false);
+ PartAddr = DAG.getFrameIndex(FI, PtrVT);
+ return DAG.getLoad(
+ ValVT, dl, Chain, PartAddr,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ } else {
+ // This is not the first piece of an argument in memory. See if there is
+ // already a fixed stack object including this offset. If so, assume it
+ // was created by the PartOffset == 0 branch above and create a load from
+ // the appropriate offset into it.
+ int64_t PartBegin = VA.getLocMemOffset();
+ int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
+ int FI = MFI.getObjectIndexBegin();
+ for (; MFI.isFixedObjectIndex(FI); ++FI) {
+ int64_t ObjBegin = MFI.getObjectOffset(FI);
+ int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
+ if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
+ break;
+ }
+ if (MFI.isFixedObjectIndex(FI)) {
+ SDValue Addr =
+ DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
+ DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
+ return DAG.getLoad(
+ ValVT, dl, Chain, Addr,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
+ Ins[i].PartOffset));
+ }
+ }
+ }
+
+ int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
+ VA.getLocMemOffset(), isImmutable);
+
+ // Set SExt or ZExt flag.
+ if (VA.getLocInfo() == CCValAssign::ZExt) {
+ MFI.setObjectZExt(FI, true);
+ } else if (VA.getLocInfo() == CCValAssign::SExt) {
+ MFI.setObjectSExt(FI, true);
+ }
+
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ SDValue Val = DAG.getLoad(
+ ValVT, dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ return ExtendedInMem
+ ? (VA.getValVT().isVector()
+ ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
+ : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
+ : Val;
+}
+
+// FIXME: Get this from tablegen.
+static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
+ const X86Subtarget &Subtarget) {
+ assert(Subtarget.is64Bit());
+
+ if (Subtarget.isCallingConvWin64(CallConv)) {
+ static const MCPhysReg GPR64ArgRegsWin64[] = {
+ X86::RCX, X86::RDX, X86::R8, X86::R9
+ };
+ return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
+ }
+
+ static const MCPhysReg GPR64ArgRegs64Bit[] = {
+ X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
+ };
+ return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
+}
+
+// FIXME: Get this from tablegen.
+static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
+ CallingConv::ID CallConv,
+ const X86Subtarget &Subtarget) {
+ assert(Subtarget.is64Bit());
+ if (Subtarget.isCallingConvWin64(CallConv)) {
+    // The XMM registers which might contain var arg parameters are shadowed
+    // by their paired GPRs, so we only need to save the GPRs to their home
+    // slots.
+ // TODO: __vectorcall will change this.
+ return None;
+ }
+
+ const Function &F = MF.getFunction();
+ bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
+ bool isSoftFloat = Subtarget.useSoftFloat();
+ assert(!(isSoftFloat && NoImplicitFloatOps) &&
+ "SSE register cannot be used when SSE is disabled!");
+ if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
+ // Kernel mode asks for SSE to be disabled, so there are no XMM argument
+ // registers.
+ return None;
+
+ static const MCPhysReg XMMArgRegs64Bit[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
+ return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
+}
+
+#ifndef NDEBUG
+static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
+ return llvm::is_sorted(
+ ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
+ return A.getValNo() < B.getValNo();
+ });
+}
+#endif
+
+namespace {
+/// This is a helper class for lowering variable argument parameters.
+class VarArgsLoweringHelper {
+public:
+ VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget,
+ CallingConv::ID CallConv, CCState &CCInfo)
+ : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
+ TheMachineFunction(DAG.getMachineFunction()),
+ TheFunction(TheMachineFunction.getFunction()),
+ FrameInfo(TheMachineFunction.getFrameInfo()),
+ FrameLowering(*Subtarget.getFrameLowering()),
+ TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
+ CCInfo(CCInfo) {}
+
+  // Lower variable argument parameters.
+ void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
+
+private:
+ void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
+
+ void forwardMustTailParameters(SDValue &Chain);
+
+ bool is64Bit() const { return Subtarget.is64Bit(); }
+ bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
+
+ X86MachineFunctionInfo *FuncInfo;
+ const SDLoc &DL;
+ SelectionDAG &DAG;
+ const X86Subtarget &Subtarget;
+ MachineFunction &TheMachineFunction;
+ const Function &TheFunction;
+ MachineFrameInfo &FrameInfo;
+ const TargetFrameLowering &FrameLowering;
+ const TargetLowering &TargLowering;
+ CallingConv::ID CallConv;
+ CCState &CCInfo;
+};
+} // namespace
+
+void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
+ SDValue &Chain, unsigned StackSize) {
+  // If the function takes a variable number of arguments, make a frame index
+  // for the start of the first vararg value... for expansion of llvm.va_start.
+  // We can skip this if there are no va_start calls.
+ if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
+ CallConv != CallingConv::X86_ThisCall)) {
+ FuncInfo->setVarArgsFrameIndex(
+ FrameInfo.CreateFixedObject(1, StackSize, true));
+ }
+
+ // Figure out if XMM registers are in use.
+ assert(!(Subtarget.useSoftFloat() &&
+ TheFunction.hasFnAttribute(Attribute::NoImplicitFloat)) &&
+ "SSE register cannot be used when SSE is disabled!");
+
+ // 64-bit calling conventions support varargs and register parameters, so we
+ // have to do extra work to spill them in the prologue.
+ if (is64Bit()) {
+    // Find the first unallocated argument GPR and XMM register.
+ ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
+ ArrayRef<MCPhysReg> ArgXMMs =
+ get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
+ unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
+
+ assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
+ "SSE register cannot be used when SSE is disabled!");
+
+ if (isWin64()) {
+ // Get to the caller-allocated home save location. Add 8 to account
+ // for the return address.
+ int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
+ FuncInfo->setRegSaveFrameIndex(
+ FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
+      // Fix up the vararg frame index to point at the shadow area (4 x i64).
+ if (NumIntRegs < 4)
+ FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
+ } else {
+ // For X86-64, if there are vararg parameters that are passed via
+ // registers, then we must store them to their spots on the stack so
+ // they may be loaded by dereferencing the result of va_next.
+ FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
+ FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
+ FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
+ ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
+ }
+
+    // SDValues for the GPR registers keeping live input values.
+    SmallVector<SDValue, 6> LiveGPRs;
+    // SDValues for the XMM registers keeping live input values.
+    SmallVector<SDValue, 8> LiveXMMRegs;
+    // If applicable, keeps the SDValue for the %al register.
+    SDValue ALVal;
+
+ // Gather all the live in physical registers.
+ for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
+ Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
+ LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
+ }
+ const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
+ if (!AvailableXmms.empty()) {
+ Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
+ ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
+ for (MCPhysReg Reg : AvailableXmms) {
+ Register XMMReg = TheMachineFunction.addLiveIn(Reg, &X86::VR128RegClass);
+ LiveXMMRegs.push_back(
+ DAG.getCopyFromReg(Chain, DL, XMMReg, MVT::v4f32));
+ }
+ }
+
+ // Store the integer parameter registers.
+ SmallVector<SDValue, 8> MemOps;
+ SDValue RSFIN =
+ DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+ TargLowering.getPointerTy(DAG.getDataLayout()));
+ unsigned Offset = FuncInfo->getVarArgsGPOffset();
+ for (SDValue Val : LiveGPRs) {
+ SDValue FIN = DAG.getNode(ISD::ADD, DL,
+ TargLowering.getPointerTy(DAG.getDataLayout()),
+ RSFIN, DAG.getIntPtrConstant(Offset, DL));
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), DL, Val, FIN,
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(),
+ FuncInfo->getRegSaveFrameIndex(), Offset));
+ MemOps.push_back(Store);
+ Offset += 8;
+ }
+
+ // Now store the XMM (fp + vector) parameter registers.
+ if (!LiveXMMRegs.empty()) {
+ SmallVector<SDValue, 12> SaveXMMOps;
+ SaveXMMOps.push_back(Chain);
+ SaveXMMOps.push_back(ALVal);
+ SaveXMMOps.push_back(
+ DAG.getTargetConstant(FuncInfo->getRegSaveFrameIndex(), DL, MVT::i32));
+ SaveXMMOps.push_back(
+ DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
+ llvm::append_range(SaveXMMOps, LiveXMMRegs);
+ MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, DL,
+ MVT::Other, SaveXMMOps));
+ }
+
+ if (!MemOps.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
+ }
+}
+
+void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
+ // Find the largest legal vector type.
+ MVT VecVT = MVT::Other;
+ // FIXME: Only some x86_32 calling conventions support AVX512.
+ if (Subtarget.useAVX512Regs() &&
+ (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
+ CallConv == CallingConv::Intel_OCL_BI)))
+ VecVT = MVT::v16f32;
+ else if (Subtarget.hasAVX())
+ VecVT = MVT::v8f32;
+ else if (Subtarget.hasSSE2())
+ VecVT = MVT::v4f32;
+
+ // We forward some GPRs and some vector types.
+ SmallVector<MVT, 2> RegParmTypes;
+ MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
+ RegParmTypes.push_back(IntVT);
+ if (VecVT != MVT::Other)
+ RegParmTypes.push_back(VecVT);
+
+ // Compute the set of forwarded registers. The rest are scratch.
+ SmallVectorImpl<ForwardedRegister> &Forwards =
+ FuncInfo->getForwardedMustTailRegParms();
+ CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
+
+ // Forward AL for SysV x86_64 targets, since it is used for varargs.
+ if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
+ Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
+ Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
+ }
+
+ // Copy all forwards from physical to virtual registers.
+ for (ForwardedRegister &FR : Forwards) {
+ // FIXME: Can we use a less constrained schedule?
+ SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
+ FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
+ TargLowering.getRegClassFor(FR.VT));
+ Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
+ }
+}
+
+void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
+ unsigned StackSize) {
+  // Set the frame indexes to the 0xAAAAAAA value to mark them as unset.
+  // If necessary, they will be set to the correct values later.
+ FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
+ FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
+
+ if (FrameInfo.hasVAStart())
+ createVarArgAreaAndStoreRegisters(Chain, StackSize);
+
+ if (FrameInfo.hasMustTailInVarArgFunc())
+ forwardMustTailParameters(Chain);
+}
+
+SDValue X86TargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+ const Function &F = MF.getFunction();
+ if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
+ F.getName() == "main")
+ FuncInfo->setForceFramePointer(true);
+
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ bool Is64Bit = Subtarget.is64Bit();
+ bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
+
+ assert(
+ !(IsVarArg && canGuaranteeTCO(CallConv)) &&
+ "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+
+ // Allocate shadow area for Win64.
+ if (IsWin64)
+ CCInfo.AllocateStack(32, Align(8));
+
+ CCInfo.AnalyzeArguments(Ins, CC_X86);
+
+ // In vectorcall calling convention a second pass is required for the HVA
+ // types.
+ if (CallingConv::X86_VectorCall == CallConv) {
+ CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
+ }
+
+  // The next loop assumes that the locations are in the same order as the
+  // input arguments.
+ assert(isSortedByValueNo(ArgLocs) &&
+ "Argument Location list must be sorted before lowering");
+
+ SDValue ArgValue;
+ for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
+ ++I, ++InsIndex) {
+ assert(InsIndex < Ins.size() && "Invalid Ins index");
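+    // I indexes ArgLocs while InsIndex indexes Ins; they diverge only when
+    // one argument occupies two locations (the custom v64i1 case below).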
+ CCValAssign &VA = ArgLocs[I];
+
+ if (VA.isRegLoc()) {
+ EVT RegVT = VA.getLocVT();
+ if (VA.needsCustom()) {
+ assert(
+ VA.getValVT() == MVT::v64i1 &&
+ "Currently the only custom case is when we split v64i1 to 2 regs");
+
+        // In the regcall calling convention on 32-bit targets, v64i1 values
+        // are split up into two registers.
+ ArgValue =
+ getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
+ } else {
+ const TargetRegisterClass *RC;
+ if (RegVT == MVT::i8)
+ RC = &X86::GR8RegClass;
+ else if (RegVT == MVT::i16)
+ RC = &X86::GR16RegClass;
+ else if (RegVT == MVT::i32)
+ RC = &X86::GR32RegClass;
+ else if (Is64Bit && RegVT == MVT::i64)
+ RC = &X86::GR64RegClass;
+ else if (RegVT == MVT::f32)
+ RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
+ else if (RegVT == MVT::f64)
+ RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
+ else if (RegVT == MVT::f80)
+ RC = &X86::RFP80RegClass;
+ else if (RegVT == MVT::f128)
+ RC = &X86::VR128RegClass;
+ else if (RegVT.is512BitVector())
+ RC = &X86::VR512RegClass;
+ else if (RegVT.is256BitVector())
+ RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
+ else if (RegVT.is128BitVector())
+ RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
+ else if (RegVT == MVT::x86mmx)
+ RC = &X86::VR64RegClass;
+ else if (RegVT == MVT::v1i1)
+ RC = &X86::VK1RegClass;
+ else if (RegVT == MVT::v8i1)
+ RC = &X86::VK8RegClass;
+ else if (RegVT == MVT::v16i1)
+ RC = &X86::VK16RegClass;
+ else if (RegVT == MVT::v32i1)
+ RC = &X86::VK32RegClass;
+ else if (RegVT == MVT::v64i1)
+ RC = &X86::VK64RegClass;
+ else
+ llvm_unreachable("Unknown argument type!");
+
+ Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+ }
+
+ // If this is an 8 or 16-bit value, it is really passed promoted to 32
+ // bits. Insert an assert[sz]ext to capture this, then truncate to the
+ // right size.
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::BCvt)
+ ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
+
+ if (VA.isExtInLoc()) {
+ // Handle MMX values passed in XMM regs.
+ if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
+ ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
+ else if (VA.getValVT().isVector() &&
+ VA.getValVT().getScalarType() == MVT::i1 &&
+ ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
+ (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
+ // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
+ ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
+ } else
+ ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+ }
+ } else {
+ assert(VA.isMemLoc());
+ ArgValue =
+ LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
+ }
+
+ // If value is passed via pointer - do a load.
+ if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
+ ArgValue =
+ DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
+
+ InVals.push_back(ArgValue);
+ }
+
+ for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
+    // The Swift calling convention does not require us to copy the sret
+    // argument into %rax/%eax for the return. We don't set SRetReturnReg for
+    // Swift.
+ if (CallConv == CallingConv::Swift)
+ continue;
+
+ // All x86 ABIs require that for returning structs by value we copy the
+ // sret argument into %rax/%eax (depending on ABI) for the return. Save
+ // the argument into a virtual register so that we can access it from the
+ // return points.
+ if (Ins[I].Flags.isSRet()) {
+ Register Reg = FuncInfo->getSRetReturnReg();
+ if (!Reg) {
+ MVT PtrTy = getPointerTy(DAG.getDataLayout());
+ Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
+ FuncInfo->setSRetReturnReg(Reg);
+ }
+ SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
+ break;
+ }
+ }
+
+ unsigned StackSize = CCInfo.getNextStackOffset();
+ // Align stack specially for tail calls.
+ if (shouldGuaranteeTCO(CallConv,
+ MF.getTarget().Options.GuaranteedTailCallOpt))
+ StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
+
+ if (IsVarArg)
+ VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
+ .lowerVarArgsParameters(Chain, StackSize);
+
+ // Some CCs need callee pop.
+ if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
+ MF.getTarget().Options.GuaranteedTailCallOpt)) {
+ FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
+ } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
+ // X86 interrupts must pop the error code (and the alignment padding) if
+ // present.
+ FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
+ } else {
+ FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
+ // If this is an sret function, the return should pop the hidden pointer.
+ if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
+ !Subtarget.getTargetTriple().isOSMSVCRT() &&
+ argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
+ FuncInfo->setBytesToPopOnReturn(4);
+ }
+
+ if (!Is64Bit) {
+ // RegSaveFrameIndex is X86-64 only.
+ FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
+ }
+
+ FuncInfo->setArgumentStackSize(StackSize);
+
+ if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
+ EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
+ if (Personality == EHPersonality::CoreCLR) {
+ assert(Is64Bit);
+ // TODO: Add a mechanism to frame lowering that will allow us to indicate
+ // that we'd prefer this slot be allocated towards the bottom of the frame
+ // (i.e. near the stack pointer after allocating the frame). Every
+ // funclet needs a copy of this slot in its (mostly empty) frame, and the
+ // offset from the bottom of this and each funclet's frame must be the
+ // same, so the size of funclets' (mostly empty) frames is dictated by
+ // how far this slot is from the bottom (since they allocate just enough
+ // space to accommodate holding this slot at the correct offset).
+ int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
+ EHInfo->PSPSymFrameIdx = PSPSymFI;
+ }
+ }
+
+ if (CallConv == CallingConv::X86_RegCall ||
+ F.hasFnAttribute("no_caller_saved_registers")) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (std::pair<Register, Register> Pair : MRI.liveins())
+ MRI.disableCalleeSavedRegister(Pair.first);
+ }
+
+ return Chain;
+}
+
+SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
+ SDValue Arg, const SDLoc &dl,
+ SelectionDAG &DAG,
+ const CCValAssign &VA,
+ ISD::ArgFlagsTy Flags,
+ bool isByVal) const {
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
+ PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+ StackPtr, PtrOff);
+ if (isByVal)
+ return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
+
+ return DAG.getStore(
+ Chain, dl, Arg, PtrOff,
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
+}
+
+/// Emit a load of the return address if tail call
+/// optimization is performed and it is required.
+SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
+ SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
+ bool Is64Bit, int FPDiff, const SDLoc &dl) const {
+ // Adjust the Return address stack slot.
+ EVT VT = getPointerTy(DAG.getDataLayout());
+ OutRetAddr = getReturnAddressFrameIndex(DAG);
+
+ // Load the "old" Return address.
+ OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
+ return SDValue(OutRetAddr.getNode(), 1);
+}
+
+/// Emit a store of the return address if tail call
+/// optimization is performed and it is required (FPDiff!=0).
+static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
+ SDValue Chain, SDValue RetAddrFrIdx,
+ EVT PtrVT, unsigned SlotSize,
+ int FPDiff, const SDLoc &dl) {
+ // Store the return address to the appropriate stack slot.
+ if (!FPDiff) return Chain;
+ // Calculate the new stack slot for the return address.
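+  // The original return-address slot sits at offset -SlotSize, so the new
+  // slot ends up FPDiff bytes away from it.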
+ int NewReturnAddrFI =
+ MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
+ false);
+ SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
+ Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), NewReturnAddrFI));
+ return Chain;
+}
+
+/// Returns a vector_shuffle mask for a movs{s|d} or movd
+/// operation of the specified width.
+static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
+ SDValue V2) {
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 8> Mask;
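+  // Element 0 of the result comes from V2 (shuffle index NumElems); the
+  // remaining elements come from V1, matching the MOVSS/MOVSD semantics.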
+ Mask.push_back(NumElems);
+ for (unsigned i = 1; i != NumElems; ++i)
+ Mask.push_back(i);
+ return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
+}
+
+SDValue
+X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc &dl = CLI.DL;
+ SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool &isTailCall = CLI.IsTailCall;
+ bool isVarArg = CLI.IsVarArg;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool Is64Bit = Subtarget.is64Bit();
+ bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
+ StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
+ bool IsSibcall = false;
+ bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
+ CallConv == CallingConv::Tail;
+ X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
+ const auto *CI = dyn_cast_or_null<CallInst>(CLI.CB);
+ const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
+ bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
+ (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
+ const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CB);
+ bool HasNoCfCheck =
+ (CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
+ bool IsIndirectCall = (CI && CI->isIndirectCall());
+ const Module *M = MF.getMMI().getModule();
+ Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
+
+ MachineFunction::CallSiteInfo CSInfo;
+ if (CallConv == CallingConv::X86_INTR)
+ report_fatal_error("X86 interrupts may not be called directly");
+
+ if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) {
+ // If we are using a GOT, disable tail calls to external symbols with
+ // default visibility. Tail calling such a symbol requires using a GOT
+    // relocation, which forces early binding of the symbol. This breaks code
+    // that requires lazy function symbol resolution. Using musttail or
+ // GuaranteedTailCallOpt will override this.
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ if (!G || (!G->getGlobal()->hasLocalLinkage() &&
+ G->getGlobal()->hasDefaultVisibility()))
+ isTailCall = false;
+ }
+
+ bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
+ if (IsMustTail) {
+ // Force this to be a tail call. The verifier rules are enough to ensure
+ // that we can lower this successfully without moving the return address
+ // around.
+ isTailCall = true;
+ } else if (isTailCall) {
+ // Check if it's really possible to do a tail call.
+ isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
+ isVarArg, SR != NotStructReturn,
+ MF.getFunction().hasStructRetAttr(), CLI.RetTy,
+ Outs, OutVals, Ins, DAG);
+
+    // Sibcalls are automatically detected tail calls that do not require
+    // ABI changes.
+ if (!IsGuaranteeTCO && isTailCall)
+ IsSibcall = true;
+
+ if (isTailCall)
+ ++NumTailCalls;
+ }
+
+ assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
+ "Var args not supported with calling convention fastcc, ghc or hipe");
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+
+ // Allocate shadow area for Win64.
+ if (IsWin64)
+ CCInfo.AllocateStack(32, Align(8));
+
+ CCInfo.AnalyzeArguments(Outs, CC_X86);
+
+ // In vectorcall calling convention a second pass is required for the HVA
+ // types.
+ if (CallingConv::X86_VectorCall == CallConv) {
+ CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
+ }
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
+ if (IsSibcall)
+    // This is a sibcall. The memory operands are already available in the
+    // caller's incoming argument space.
+ NumBytes = 0;
+ else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
+ NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
+
+ int FPDiff = 0;
+ if (isTailCall && !IsSibcall && !IsMustTail) {
+ // Lower arguments at fp - stackoffset + fpdiff.
+ unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
+
+ FPDiff = NumBytesCallerPushed - NumBytes;
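+    // A negative FPDiff means the callee needs more argument stack space
+    // than the caller provides, so the return address must be relocated.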
+
+ // Set the delta of movement of the returnaddr stackslot.
+ // But only set if delta is greater than previous delta.
+ if (FPDiff < X86Info->getTCReturnAddrDelta())
+ X86Info->setTCReturnAddrDelta(FPDiff);
+ }
+
+ unsigned NumBytesToPush = NumBytes;
+ unsigned NumBytesToPop = NumBytes;
+
+  // If we have an inalloca argument, all stack space has already been
+  // allocated for us and is right at the top of the stack. We don't support
+  // multiple arguments passed in memory when using inalloca.
+ if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
+ NumBytesToPush = 0;
+ if (!ArgLocs.back().isMemLoc())
+ report_fatal_error("cannot use inalloca attribute on a register "
+ "parameter");
+ if (ArgLocs.back().getLocMemOffset() != 0)
+ report_fatal_error("any parameter with the inalloca attribute must be "
+ "the only memory argument");
+ } else if (CLI.IsPreallocated) {
+ assert(ArgLocs.back().isMemLoc() &&
+ "cannot use preallocated attribute on a register "
+ "parameter");
+ SmallVector<size_t, 4> PreallocatedOffsets;
+ for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
+ if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
+ PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
+ }
+ }
+ auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
+ size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
+ MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
+ MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
+ NumBytesToPush = 0;
+ }
+
+ if (!IsSibcall && !IsMustTail)
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
+ NumBytes - NumBytesToPush, dl);
+
+ SDValue RetAddrFrIdx;
+ // Load return address for tail calls.
+ if (isTailCall && FPDiff)
+ Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
+ Is64Bit, FPDiff, dl);
+
+ SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+ SDValue StackPtr;
+
+  // The next loop assumes that the locations are in the same order as the
+  // input arguments.
+ assert(isSortedByValueNo(ArgLocs) &&
+ "Argument Location list must be sorted before lowering");
+
+  // Walk the register/memloc assignments, inserting copies/loads. In the case
+  // of tail call optimization, arguments are handled later.
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
+ ++I, ++OutIndex) {
+ assert(OutIndex < Outs.size() && "Invalid Out index");
+ // Skip inalloca/preallocated arguments, they have already been written.
+ ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
+ if (Flags.isInAlloca() || Flags.isPreallocated())
+ continue;
+
+ CCValAssign &VA = ArgLocs[I];
+ EVT RegVT = VA.getLocVT();
+ SDValue Arg = OutVals[OutIndex];
+ bool isByVal = Flags.isByVal();
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
+ break;
+ case CCValAssign::AExt:
+ if (Arg.getValueType().isVector() &&
+ Arg.getValueType().getVectorElementType() == MVT::i1)
+ Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
+ else if (RegVT.is128BitVector()) {
+ // Special case: passing MMX values in XMM registers.
+ Arg = DAG.getBitcast(MVT::i64, Arg);
+ Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
+ Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
+ } else
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getBitcast(RegVT, Arg);
+ break;
+ case CCValAssign::Indirect: {
+ if (isByVal) {
+ // Memcpy the argument to a temporary stack slot to prevent
+ // the caller from seeing any modifications the callee may make
+ // as guaranteed by the `byval` attribute.
+ int FrameIdx = MF.getFrameInfo().CreateStackObject(
+ Flags.getByValSize(),
+ std::max(Align(16), Flags.getNonZeroByValAlign()), false);
+ SDValue StackSlot =
+ DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
+ Chain =
+ CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
+ // From now on treat this as a regular pointer
+ Arg = StackSlot;
+ isByVal = false;
+ } else {
+ // Store the argument.
+ SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
+ int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+ Chain = DAG.getStore(
+ Chain, dl, Arg, SpillSlot,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ Arg = SpillSlot;
+ }
+ break;
+ }
+ }
+
+ if (VA.needsCustom()) {
+ assert(VA.getValVT() == MVT::v64i1 &&
+ "Currently the only custom case is when we split v64i1 to 2 regs");
+ // Split v64i1 value into two registers
+ Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
+ } else if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ const TargetOptions &Options = DAG.getTarget().Options;
+ if (Options.EmitCallSiteInfo)
+ CSInfo.emplace_back(VA.getLocReg(), I);
+ if (isVarArg && IsWin64) {
+        // The Win64 ABI requires an argument XMM register to be copied to
+        // its corresponding shadow register if the callee is a varargs
+        // function.
+ Register ShadowReg;
+ switch (VA.getLocReg()) {
+ case X86::XMM0: ShadowReg = X86::RCX; break;
+ case X86::XMM1: ShadowReg = X86::RDX; break;
+ case X86::XMM2: ShadowReg = X86::R8; break;
+ case X86::XMM3: ShadowReg = X86::R9; break;
+ }
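+        // ShadowReg stays invalid for any other argument register, so no
+        // shadow copy is added in that case.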
+ if (ShadowReg)
+ RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
+ }
+ } else if (!IsSibcall && (!isTailCall || isByVal)) {
+ assert(VA.isMemLoc());
+ if (!StackPtr.getNode())
+ StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+ getPointerTy(DAG.getDataLayout()));
+ MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
+ dl, DAG, VA, Flags, isByVal));
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+ if (Subtarget.isPICStyleGOT()) {
+    // ELF / PIC requires the GOT pointer to be in the EBX register before
+    // function calls via the PLT (except for regcall).
+ if (!isTailCall) {
+      // An indirect call with the RegCall calling convention may use up all
+      // the general registers, so it is not suitable to bind EBX for the GOT
+      // address; just let the register allocator handle it.
+ if (CallConv != CallingConv::X86_RegCall)
+ RegsToPass.push_back(std::make_pair(
+ Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
+ getPointerTy(DAG.getDataLayout()))));
+ } else {
+ // If we are tail calling and generating PIC/GOT style code load the
+ // address of the callee into ECX. The value in ecx is used as target of
+ // the tail jump. This is done to circumvent the ebx/callee-saved problem
+ // for tail calls on PIC/GOT architectures. Normally we would just put the
+ // address of GOT into ebx and then call target@PLT. But for tail calls
+ // ebx would be restored (since ebx is callee saved) before jumping to the
+ // target@PLT.
+
+ // Note: The actual moving to ECX is done further down.
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ if (G && !G->getGlobal()->hasLocalLinkage() &&
+ G->getGlobal()->hasDefaultVisibility())
+ Callee = LowerGlobalAddress(Callee, DAG);
+ else if (isa<ExternalSymbolSDNode>(Callee))
+ Callee = LowerExternalSymbol(Callee, DAG);
+ }
+ }
+
+ if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
+ // From AMD64 ABI document:
+ // For calls that may call functions that use varargs or stdargs
+ // (prototype-less calls or calls to functions containing ellipsis (...) in
+    // the declaration), %al is used as a hidden argument to specify the
+    // number of SSE registers used. The contents of %al do not need to match
+    // the number of registers exactly, but must be an upper bound on the
+    // number of SSE registers used and must be in the range 0 - 8 inclusive.
+
+ // Count the number of XMM registers allocated.
+ static const MCPhysReg XMMArgRegs[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
+ assert((Subtarget.hasSSE1() || !NumXMMRegs)
+ && "SSE registers cannot be used when SSE is disabled");
+ RegsToPass.push_back(std::make_pair(Register(X86::AL),
+ DAG.getConstant(NumXMMRegs, dl,
+ MVT::i8)));
+ }
+
+ if (isVarArg && IsMustTail) {
+ const auto &Forwards = X86Info->getForwardedMustTailRegParms();
+ for (const auto &F : Forwards) {
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
+ RegsToPass.push_back(std::make_pair(F.PReg, Val));
+ }
+ }
+
+ // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
+ // don't need this because the eligibility check rejects calls that require
+ // shuffling arguments passed in memory.
+ if (!IsSibcall && isTailCall) {
+ // Force all the incoming stack arguments to be loaded from the stack
+ // before any new outgoing arguments are stored to the stack, because the
+ // outgoing stack slots may alias the incoming argument stack slots, and
+ // the alias isn't otherwise explicit. This is slightly more conservative
+ // than necessary, because it means that each store effectively depends
+ // on every argument instead of just those arguments it would clobber.
+ SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
+
+ SmallVector<SDValue, 8> MemOpChains2;
+ SDValue FIN;
+ int FI = 0;
+ for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
+ ++I, ++OutsIndex) {
+ CCValAssign &VA = ArgLocs[I];
+
+ if (VA.isRegLoc()) {
+ if (VA.needsCustom()) {
+ assert((CallConv == CallingConv::X86_RegCall) &&
+ "Expecting custom case only in regcall calling convention");
+          // This means that we are in the special case where one argument is
+          // passed through two register locations; skip the next location.
+ ++I;
+ }
+
+ continue;
+ }
+
+ assert(VA.isMemLoc());
+ SDValue Arg = OutVals[OutsIndex];
+ ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
+ // Skip inalloca/preallocated arguments. They don't require any work.
+ if (Flags.isInAlloca() || Flags.isPreallocated())
+ continue;
+ // Create frame index.
+ int32_t Offset = VA.getLocMemOffset()+FPDiff;
+ uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
+ FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
+ FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+
+ if (Flags.isByVal()) {
+ // Copy relative to framepointer.
+ SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
+ if (!StackPtr.getNode())
+ StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+ getPointerTy(DAG.getDataLayout()));
+ Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+ StackPtr, Source);
+
+ MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
+ ArgChain,
+ Flags, DAG, dl));
+ } else {
+ // Store relative to framepointer.
+ MemOpChains2.push_back(DAG.getStore(
+ ArgChain, dl, Arg, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
+ }
+ }
+
+ if (!MemOpChains2.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
+
+ // Store the return address to the appropriate stack slot.
+ Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
+ getPointerTy(DAG.getDataLayout()),
+ RegInfo->getSlotSize(), FPDiff, dl);
+ }
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into registers.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
+ assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
+ // In the 64-bit large code model, we have to make all calls
+ // through a register, since the call instruction's 32-bit
+ // pc-relative offset may not be large enough to hold the whole
+ // address.
+ } else if (Callee->getOpcode() == ISD::GlobalAddress ||
+ Callee->getOpcode() == ISD::ExternalSymbol) {
+ // Lower direct calls to global addresses and external symbols. Setting
+ // ForCall to true here has the effect of removing WrapperRIP when possible
+ // to allow direct calls to be selected without first materializing the
+ // address into a register.
+ Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
+ } else if (Subtarget.isTarget64BitILP32() &&
+ Callee->getValueType(0) == MVT::i32) {
+    // Zero-extend the 32-bit Callee address into 64 bits, as required by the
+    // x32 ABI.
+ Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
+ }
+
+ // Returns a chain & a flag for retval copy to use.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SmallVector<SDValue, 8> Ops;
+
+ if (!IsSibcall && isTailCall && !IsMustTail) {
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getIntPtrConstant(NumBytesToPop, dl, true),
+ DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
+ InFlag = Chain.getValue(1);
+ }
+
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ if (isTailCall)
+ Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ // Add a register mask operand representing the call-preserved registers.
+  // If HasNCSR is asserted (the NoCallerSavedRegisters attribute is present),
+  // we use the X86_INTR calling convention because it has the same CSR mask
+  // (the same preserved registers).
+ const uint32_t *Mask = RegInfo->getCallPreservedMask(
+ MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+
+ // If this is an invoke in a 32-bit function using a funclet-based
+ // personality, assume the function clobbers all registers. If an exception
+ // is thrown, the runtime will not restore CSRs.
+ // FIXME: Model this more precisely so that we can register allocate across
+ // the normal edge and spill and fill across the exceptional edge.
+ if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
+ const Function &CallerFn = MF.getFunction();
+ EHPersonality Pers =
+ CallerFn.hasPersonalityFn()
+ ? classifyEHPersonality(CallerFn.getPersonalityFn())
+ : EHPersonality::Unknown;
+ if (isFuncletEHPersonality(Pers))
+ Mask = RegInfo->getNoPreservedMask();
+ }
+
+ // Define a new register mask from the existing mask.
+ uint32_t *RegMask = nullptr;
+
+ // In some calling conventions we need to remove the used physical registers
+ // from the reg mask.
+ if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+
+ // Allocate a new Reg Mask and copy Mask.
+ RegMask = MF.allocateRegMask();
+ unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
+ memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
+
+ // Make sure all sub registers of the argument registers are reset
+ // in the RegMask.
+ for (auto const &RegPair : RegsToPass)
+ for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
+
+ // Create the RegMask Operand according to our updated mask.
+ Ops.push_back(DAG.getRegisterMask(RegMask));
+ } else {
+ // Create the RegMask Operand according to the static mask.
+ Ops.push_back(DAG.getRegisterMask(Mask));
+ }
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ if (isTailCall) {
+ // We used to do:
+ //// If this is the first return lowered for this function, add the regs
+ //// to the liveout set for the function.
+ // This isn't right, although it's probably harmless on x86; liveouts
+ // should be computed from returns not tail calls. Consider a void
+ // function making a tail call to a function returning int.
+ MF.getFrameInfo().setHasTailCall();
+ SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
+ DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
+ return Ret;
+ }
+
+ if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
+ Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
+ } else {
+ Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
+ }
+ InFlag = Chain.getValue(1);
+ DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
+ DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
+
+ // Save heapallocsite metadata.
+ if (CLI.CB)
+ if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
+ DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
+
+ // Create the CALLSEQ_END node.
+ unsigned NumBytesForCalleeToPop;
+ if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+ DAG.getTarget().Options.GuaranteedTailCallOpt))
+ NumBytesForCalleeToPop = NumBytes; // Callee pops everything
+ else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
+ !Subtarget.getTargetTriple().isOSMSVCRT() &&
+ SR == StackStructReturn)
+ // If this is a call to a struct-return function, the callee
+ // pops the hidden struct pointer, so we have to push it back.
+ // This is common for Darwin/X86, Linux & Mingw32 targets.
+ // For MSVC Win32 targets, the caller pops the hidden struct pointer.
+ NumBytesForCalleeToPop = 4;
+ else
+ NumBytesForCalleeToPop = 0; // Callee pops nothing.
+
+ // Returns a flag for retval copy to use.
+ if (!IsSibcall) {
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getIntPtrConstant(NumBytesToPop, dl, true),
+ DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
+ true),
+ InFlag, dl);
+ InFlag = Chain.getValue(1);
+ }
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
+ InVals, RegMask);
+}
+
+//===----------------------------------------------------------------------===//
+// Fast Calling Convention (tail call) implementation
+//===----------------------------------------------------------------------===//
+
+// Like stdcall, the callee cleans up the arguments, except that ECX is
+// reserved for storing the address of the tail-called function. Only two
+// registers are free for argument passing (inreg). Tail call optimization is
+// performed provided:
+// * tailcallopt is enabled
+// * caller/callee are fastcc
+// On X86_64 architecture with GOT-style position independent code only local
+// (within module) calls are supported at the moment.
+// To keep the stack aligned according to the platform ABI, the function
+// GetAlignedArgumentStackSize ensures that the argument delta is always a
+// multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld
+// is one example.)
+// If a tail-called callee has more arguments than the caller, the caller
+// needs to make sure that there is room to move the RETADDR to. This is
+// achieved by reserving an area the size of the argument delta right after the
+// original RETADDR, but before the saved framepointer or the spilled registers
+// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
+// stack layout:
+// arg1
+// arg2
+// RETADDR
+// [ new RETADDR
+// move area ]
+// (possible EBP)
+// ESI
+// EDI
+// local1 ..
+
+/// Align the stack size, e.g. to 16n + 12 for a 16-byte alignment
+/// requirement.
+unsigned
+X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
+ SelectionDAG &DAG) const {
+ const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
+ const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
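+  // Include the return-address slot pushed by the call so the stack stays
+  // aligned at the call site, then subtract that slot back out.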
+ assert(StackSize % SlotSize == 0 &&
+ "StackSize must be a multiple of SlotSize");
+ return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
+}
+
+/// Return true if the given stack call argument is already available in the
+/// same (relative) position of the caller's incoming argument stack.
+static
+bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
+ MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
+ const X86InstrInfo *TII, const CCValAssign &VA) {
+ unsigned Bytes = Arg.getValueSizeInBits() / 8;
+
+ for (;;) {
+ // Look through nodes that don't alter the bits of the incoming value.
+ unsigned Op = Arg.getOpcode();
+ if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
+ Arg = Arg.getOperand(0);
+ continue;
+ }
+ if (Op == ISD::TRUNCATE) {
+ const SDValue &TruncInput = Arg.getOperand(0);
+ if (TruncInput.getOpcode() == ISD::AssertZext &&
+ cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
+ Arg.getValueType()) {
+ Arg = TruncInput.getOperand(0);
+ continue;
+ }
+ }
+ break;
+ }
+
+ int FI = INT_MAX;
+ if (Arg.getOpcode() == ISD::CopyFromReg) {
+ Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+ if (!VR.isVirtual())
+ return false;
+ MachineInstr *Def = MRI->getVRegDef(VR);
+ if (!Def)
+ return false;
+ if (!Flags.isByVal()) {
+ if (!TII->isLoadFromStackSlot(*Def, FI))
+ return false;
+ } else {
+ unsigned Opcode = Def->getOpcode();
+ if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
+ Opcode == X86::LEA64_32r) &&
+ Def->getOperand(1).isFI()) {
+ FI = Def->getOperand(1).getIndex();
+ Bytes = Flags.getByValSize();
+ } else
+ return false;
+ }
+ } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
+ if (Flags.isByVal())
+ // ByVal argument is passed in as a pointer but it's now being
+ // dereferenced. e.g.
+ // define @foo(%struct.X* %A) {
+ // tail call @bar(%struct.X* byval %A)
+ // }
+ return false;
+ SDValue Ptr = Ld->getBasePtr();
+ FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
+ if (!FINode)
+ return false;
+ FI = FINode->getIndex();
+ } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
+ FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
+ FI = FINode->getIndex();
+ Bytes = Flags.getByValSize();
+ } else
+ return false;
+
+ assert(FI != INT_MAX);
+ if (!MFI.isFixedObjectIndex(FI))
+ return false;
+
+ if (Offset != MFI.getObjectOffset(FI))
+ return false;
+
+ // If this is not byval, check that the argument stack object is immutable.
+ // inalloca and argument copy elision can create mutable argument stack
+ // objects. Byval objects can be mutated, but a byval call intends to pass the
+ // mutated memory.
+ if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
+ return false;
+
+ if (VA.getLocVT().getFixedSizeInBits() >
+ Arg.getValueSizeInBits().getFixedSize()) {
+ // If the argument location is wider than the argument type, check that any
+ // extension flags match.
+ if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
+ Flags.isSExt() != MFI.isObjectSExt(FI)) {
+ return false;
+ }
+ }
+
+ return Bytes == MFI.getObjectSize(FI);
+}
+
+/// Check whether the call is eligible for tail call optimization. Targets
+/// that want to do tail call optimization should implement this function.
+bool X86TargetLowering::IsEligibleForTailCallOptimization(
+ SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+ bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+ if (!mayTailCallThisCC(CalleeCC))
+ return false;
+
+ // If -tailcallopt is specified, make fastcc functions tail-callable.
+ MachineFunction &MF = DAG.getMachineFunction();
+ const Function &CallerF = MF.getFunction();
+
+ // If the function return type is x86_fp80 and the callee return type is not,
+ // then the FP_EXTEND of the call result is not a nop. It's not safe to
+ // perform a tailcall optimization here.
+ if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
+ return false;
+
+ CallingConv::ID CallerCC = CallerF.getCallingConv();
+ bool CCMatch = CallerCC == CalleeCC;
+ bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
+ bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
+ bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
+ CalleeCC == CallingConv::Tail;
+
+ // Win64 functions have extra shadow space for argument homing. Don't do the
+ // sibcall if the caller and callee have mismatched expectations for this
+ // space.
+ if (IsCalleeWin64 != IsCallerWin64)
+ return false;
+
+ if (IsGuaranteeTCO) {
+ if (canGuaranteeTCO(CalleeCC) && CCMatch)
+ return true;
+ return false;
+ }
+
+ // Look for obvious safe cases to perform tail call optimization that do not
+ // require ABI changes. This is what gcc calls sibcall.
+
+ // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
+ // emit a special epilogue.
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ if (RegInfo->needsStackRealignment(MF))
+ return false;
+
+ // Also avoid sibcall optimization if either caller or callee uses struct
+ // return semantics.
+ if (isCalleeStructRet || isCallerStructRet)
+ return false;
+
+ // Do not sibcall optimize vararg calls unless all arguments are passed via
+ // registers.
+ LLVMContext &C = *DAG.getContext();
+ if (isVarArg && !Outs.empty()) {
+ // Optimizing for varargs on Win64 is unlikely to be safe without
+ // additional testing.
+ if (IsCalleeWin64 || IsCallerWin64)
+ return false;
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
+
+ CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
+ if (!ArgLocs[i].isRegLoc())
+ return false;
+ }
+
+ // If the call result is in ST0 / ST1, it needs to be popped off the x87
+ // stack. Therefore, if the result is not used by the caller it is not safe
+ // to optimize this into a sibcall.
+ bool Unused = false;
+ for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+ if (!Ins[i].Used) {
+ Unused = true;
+ break;
+ }
+ }
+ if (Unused) {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
+ CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+ for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+ CCValAssign &VA = RVLocs[i];
+ if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
+ return false;
+ }
+ }
+
+ // Check that the call results are passed in the same way.
+ if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
+ RetCC_X86, RetCC_X86))
+ return false;
+ // The callee has to preserve all registers the caller needs to preserve.
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+ if (!CCMatch) {
+ const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+ if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
+ return false;
+ }
+
+ unsigned StackArgsSize = 0;
+
+ // If the callee takes no arguments then go on to check the results of the
+ // call.
+ if (!Outs.empty()) {
+ // Check if stack adjustment is needed. For now, do not do this if any
+ // argument is passed on the stack.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
+
+ // Allocate shadow area for Win64
+ if (IsCalleeWin64)
+ CCInfo.AllocateStack(32, Align(8));
+
+ CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+ StackArgsSize = CCInfo.getNextStackOffset();
+
+ if (CCInfo.getNextStackOffset()) {
+ // Check if the arguments are already laid out in the right way as
+ // the caller's fixed stack objects.
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ if (VA.getLocInfo() == CCValAssign::Indirect)
+ return false;
+ if (!VA.isRegLoc()) {
+ if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
+ MFI, MRI, TII, VA))
+ return false;
+ }
+ }
+ }
+
+ bool PositionIndependent = isPositionIndependent();
+ // If the tailcall address may be in a register, then make sure it's
+ // possible to register allocate for it. In 32-bit, the call address can
+ // only target EAX, EDX, or ECX since the tail call must be scheduled after
+ // callee-saved registers are restored. These happen to be the same
+ // registers used to pass 'inreg' arguments so watch out for those.
+ if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
+ !isa<ExternalSymbolSDNode>(Callee)) ||
+ PositionIndependent)) {
+ unsigned NumInRegs = 0;
+ // In PIC we need an extra register to formulate the address computation
+ // for the callee.
+ unsigned MaxInRegs = PositionIndependent ? 2 : 3;
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (!VA.isRegLoc())
+ continue;
+ Register Reg = VA.getLocReg();
+ switch (Reg) {
+ default: break;
+ case X86::EAX: case X86::EDX: case X86::ECX:
+ if (++NumInRegs == MaxInRegs)
+ return false;
+ break;
+ }
+ }
+ }
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
+ return false;
+ }
+
+ bool CalleeWillPop =
+ X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
+ MF.getTarget().Options.GuaranteedTailCallOpt);
+
+ if (unsigned BytesToPop =
+ MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
+ // If we have bytes to pop, the callee must pop them.
+ bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
+ if (!CalleePopMatches)
+ return false;
+ } else if (CalleeWillPop && StackArgsSize > 0) {
+ // If we don't have bytes to pop, make sure the callee doesn't pop any.
+ return false;
+ }
+
+ return true;
+}
+
+FastISel *
+X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const {
+ return X86::createFastISel(funcInfo, libInfo);
+}
+
+//===----------------------------------------------------------------------===//
+// Other Lowering Hooks
+//===----------------------------------------------------------------------===//
+
+static bool MayFoldLoad(SDValue Op) {
+ return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
+}
+
+static bool MayFoldIntoStore(SDValue Op) {
+ return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
+}
+
+static bool MayFoldIntoZeroExtend(SDValue Op) {
+ if (Op.hasOneUse()) {
+ unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
+ return (ISD::ZERO_EXTEND == Opcode);
+ }
+ return false;
+}
+
+static bool isTargetShuffle(unsigned Opcode) {
+ switch(Opcode) {
+ default: return false;
+ case X86ISD::BLENDI:
+ case X86ISD::PSHUFB:
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFLW:
+ case X86ISD::SHUFP:
+ case X86ISD::INSERTPS:
+ case X86ISD::EXTRQI:
+ case X86ISD::INSERTQI:
+ case X86ISD::VALIGN:
+ case X86ISD::PALIGNR:
+ case X86ISD::VSHLDQ:
+ case X86ISD::VSRLDQ:
+ case X86ISD::MOVLHPS:
+ case X86ISD::MOVHLPS:
+ case X86ISD::MOVSHDUP:
+ case X86ISD::MOVSLDUP:
+ case X86ISD::MOVDDUP:
+ case X86ISD::MOVSS:
+ case X86ISD::MOVSD:
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+ case X86ISD::VBROADCAST:
+ case X86ISD::VPERMILPI:
+ case X86ISD::VPERMILPV:
+ case X86ISD::VPERM2X128:
+ case X86ISD::SHUF128:
+ case X86ISD::VPERMIL2:
+ case X86ISD::VPERMI:
+ case X86ISD::VPPERM:
+ case X86ISD::VPERMV:
+ case X86ISD::VPERMV3:
+ case X86ISD::VZEXT_MOVL:
+ return true;
+ }
+}
+
+static bool isTargetShuffleVariableMask(unsigned Opcode) {
+ switch (Opcode) {
+ default: return false;
+ // Target Shuffles.
+ case X86ISD::PSHUFB:
+ case X86ISD::VPERMILPV:
+ case X86ISD::VPERMIL2:
+ case X86ISD::VPPERM:
+ case X86ISD::VPERMV:
+ case X86ISD::VPERMV3:
+ return true;
+ // 'Faux' Target Shuffles.
+ case ISD::OR:
+ case ISD::AND:
+ case X86ISD::ANDNP:
+ return true;
+ }
+}
+
+static bool isTargetShuffleSplat(SDValue Op) {
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode == ISD::EXTRACT_SUBVECTOR)
+ return isTargetShuffleSplat(Op.getOperand(0));
+ return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD;
+}
+
+SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ int ReturnAddrIndex = FuncInfo->getRAIndex();
+
+ if (ReturnAddrIndex == 0) {
+ // Set up a frame object for the return address.
+ unsigned SlotSize = RegInfo->getSlotSize();
+ ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
+ -(int64_t)SlotSize,
+ false);
+ FuncInfo->setRAIndex(ReturnAddrIndex);
+ }
+
+ return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
+}
+
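+// For example, with a symbolic displacement the small code model accepts an
+// offset of 8*1024*1024 but rejects 64*1024*1024, while the kernel code model
+// accepts any non-negative 32-bit offset and rejects all negative ones.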
+bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
+ bool hasSymbolicDisplacement) {
+ // Offset should fit into 32 bit immediate field.
+ if (!isInt<32>(Offset))
+ return false;
+
+ // If we don't have a symbolic displacement, we don't have any extra
+ // restrictions.
+ if (!hasSymbolicDisplacement)
+ return true;
+
+ // FIXME: Some tweaks might be needed for medium code model.
+ if (M != CodeModel::Small && M != CodeModel::Kernel)
+ return false;
+
+ // For the small code model we assume that the last object lies within 16MB
+ // of the end of the 31-bit address range. We may also accept fairly large
+ // negative constants knowing that all objects are in the positive half of
+ // the address space.
+ if (M == CodeModel::Small && Offset < 16*1024*1024)
+ return true;
+
+ // For the kernel code model we know that all objects reside in the negative
+ // half of the 32-bit address space. We must not accept negative offsets,
+ // since they may fall outside that range, but we may accept fairly large
+ // positive ones.
+ if (M == CodeModel::Kernel && Offset >= 0)
+ return true;
+
+ return false;
+}
+
+/// Determines whether the callee is required to pop its own arguments.
+/// Callee pop is necessary to support tail calls.
+bool X86::isCalleePop(CallingConv::ID CallingConv,
+ bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
+ // If GuaranteeTCO is true, we force some calls to be callee pop so that we
+ // can guarantee TCO.
+ if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
+ return true;
+
+ switch (CallingConv) {
+ default:
+ return false;
+ case CallingConv::X86_StdCall:
+ case CallingConv::X86_FastCall:
+ case CallingConv::X86_ThisCall:
+ case CallingConv::X86_VectorCall:
+ return !is64Bit;
+ }
+}
+
+/// Return true if the condition is a signed comparison operation.
+static bool isX86CCSigned(unsigned X86CC) {
+ switch (X86CC) {
+ default:
+ llvm_unreachable("Invalid integer condition!");
+ case X86::COND_E:
+ case X86::COND_NE:
+ case X86::COND_B:
+ case X86::COND_A:
+ case X86::COND_BE:
+ case X86::COND_AE:
+ return false;
+ case X86::COND_G:
+ case X86::COND_GE:
+ case X86::COND_L:
+ case X86::COND_LE:
+ return true;
+ }
+}
+
+static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Invalid integer condition!");
+ case ISD::SETEQ: return X86::COND_E;
+ case ISD::SETGT: return X86::COND_G;
+ case ISD::SETGE: return X86::COND_GE;
+ case ISD::SETLT: return X86::COND_L;
+ case ISD::SETLE: return X86::COND_LE;
+ case ISD::SETNE: return X86::COND_NE;
+ case ISD::SETULT: return X86::COND_B;
+ case ISD::SETUGT: return X86::COND_A;
+ case ISD::SETULE: return X86::COND_BE;
+ case ISD::SETUGE: return X86::COND_AE;
+ }
+}
+
+/// Do a one-to-one translation of an ISD::CondCode to the X86-specific
+/// condition code, returning the condition code and the LHS/RHS of the
+/// comparison to make.
+static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
+ bool isFP, SDValue &LHS, SDValue &RHS,
+ SelectionDAG &DAG) {
+ if (!isFP) {
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+ if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
+ // X > -1 -> X == 0, jump !sign.
+ RHS = DAG.getConstant(0, DL, RHS.getValueType());
+ return X86::COND_NS;
+ }
+ if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
+ // X < 0 -> X == 0, jump on sign.
+ return X86::COND_S;
+ }
+ if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
+ // X >= 0 -> X == 0, jump on !sign.
+ return X86::COND_NS;
+ }
+ if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
+ // X < 1 -> X <= 0
+ RHS = DAG.getConstant(0, DL, RHS.getValueType());
+ return X86::COND_LE;
+ }
+ }
+
+ return TranslateIntegerX86CC(SetCCOpcode);
+ }
+
+ // First determine if it is required or is profitable to flip the operands.
+
+ // If LHS is a foldable load, but RHS is not, flip the condition.
+ if (ISD::isNON_EXTLoad(LHS.getNode()) &&
+ !ISD::isNON_EXTLoad(RHS.getNode())) {
+ SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
+ std::swap(LHS, RHS);
+ }
+
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETOLT:
+ case ISD::SETOLE:
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ std::swap(LHS, RHS);
+ break;
+ }
+
+ // On a floating point condition, the flags are set as follows:
+ // ZF PF CF op
+ // 0 | 0 | 0 | X > Y
+ // 0 | 0 | 1 | X < Y
+ // 1 | 0 | 0 | X == Y
+ // 1 | 1 | 1 | unordered
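+ // For example, (setolt X, Y) is handled as the swapped compare
+ // (setogt Y, X) and selects COND_A; unordered inputs set ZF/PF/CF (the last
+ // row above), so COND_A correctly evaluates to false for them.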
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Condcode should be pre-legalized away");
+ case ISD::SETUEQ:
+ case ISD::SETEQ: return X86::COND_E;
+ case ISD::SETOLT: // flipped
+ case ISD::SETOGT:
+ case ISD::SETGT: return X86::COND_A;
+ case ISD::SETOLE: // flipped
+ case ISD::SETOGE:
+ case ISD::SETGE: return X86::COND_AE;
+ case ISD::SETUGT: // flipped
+ case ISD::SETULT:
+ case ISD::SETLT: return X86::COND_B;
+ case ISD::SETUGE: // flipped
+ case ISD::SETULE:
+ case ISD::SETLE: return X86::COND_BE;
+ case ISD::SETONE:
+ case ISD::SETNE: return X86::COND_NE;
+ case ISD::SETUO: return X86::COND_P;
+ case ISD::SETO: return X86::COND_NP;
+ case ISD::SETOEQ:
+ case ISD::SETUNE: return X86::COND_INVALID;
+ }
+}
+
+/// Is there a floating point cmov for the specific X86 condition code?
+/// The current x86 ISA includes the following FP cmov instructions:
+/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
+static bool hasFPCMov(unsigned X86CC) {
+ switch (X86CC) {
+ default:
+ return false;
+ case X86::COND_B:
+ case X86::COND_BE:
+ case X86::COND_E:
+ case X86::COND_P:
+ case X86::COND_A:
+ case X86::COND_AE:
+ case X86::COND_NE:
+ case X86::COND_NP:
+ return true;
+ }
+}
+
+
+bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+ const CallInst &I,
+ MachineFunction &MF,
+ unsigned Intrinsic) const {
+ Info.flags = MachineMemOperand::MONone;
+ Info.offset = 0;
+
+ const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
+ if (!IntrData) {
+ switch (Intrinsic) {
+ case Intrinsic::x86_aesenc128kl:
+ case Intrinsic::x86_aesdec128kl:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.ptrVal = I.getArgOperand(1);
+ Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
+ Info.align = Align(1);
+ Info.flags |= MachineMemOperand::MOLoad;
+ return true;
+ case Intrinsic::x86_aesenc256kl:
+ case Intrinsic::x86_aesdec256kl:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.ptrVal = I.getArgOperand(1);
+ Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
+ Info.align = Align(1);
+ Info.flags |= MachineMemOperand::MOLoad;
+ return true;
+ case Intrinsic::x86_aesencwide128kl:
+ case Intrinsic::x86_aesdecwide128kl:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
+ Info.align = Align(1);
+ Info.flags |= MachineMemOperand::MOLoad;
+ return true;
+ case Intrinsic::x86_aesencwide256kl:
+ case Intrinsic::x86_aesdecwide256kl:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
+ Info.align = Align(1);
+ Info.flags |= MachineMemOperand::MOLoad;
+ return true;
+ }
+ return false;
+ }
+
+ switch (IntrData->Type) {
+ case TRUNCATE_TO_MEM_VI8:
+ case TRUNCATE_TO_MEM_VI16:
+ case TRUNCATE_TO_MEM_VI32: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.ptrVal = I.getArgOperand(0);
+ MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
+ MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+ if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
+ ScalarVT = MVT::i8;
+ else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
+ ScalarVT = MVT::i16;
+ else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
+ ScalarVT = MVT::i32;
+
+ Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
+ Info.align = Align(1);
+ Info.flags |= MachineMemOperand::MOStore;
+ break;
+ }
+ case GATHER:
+ case GATHER_AVX2: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.ptrVal = nullptr;
+ MVT DataVT = MVT::getVT(I.getType());
+ MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
+ unsigned NumElts = std::min(DataVT.getVectorNumElements(),
+ IndexVT.getVectorNumElements());
+ Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
+ Info.align = Align(1);
+ Info.flags |= MachineMemOperand::MOLoad;
+ break;
+ }
+ case SCATTER: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.ptrVal = nullptr;
+ MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
+ MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
+ unsigned NumElts = std::min(DataVT.getVectorNumElements(),
+ IndexVT.getVectorNumElements());
+ Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
+ Info.align = Align(1);
+ Info.flags |= MachineMemOperand::MOStore;
+ break;
+ }
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+/// Returns true if the target can instruction select the
+/// specified FP immediate natively. If false, the legalizer will
+/// materialize the FP immediate as a load from a constant pool.
+bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const {
+ for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
+ if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
+ return true;
+ }
+ return false;
+}
+
+bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
+ ISD::LoadExtType ExtTy,
+ EVT NewVT) const {
+ assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
+
+ // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
+ // relocations must target a movq or addq instruction: don't let the load
+ // shrink.
+ SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
+ if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
+ if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
+ return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
+
+ // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
+ // those uses are extracted directly into a store, then the extract + store
+ // can be store-folded. Therefore, it's probably not worth splitting the load.
+ EVT VT = Load->getValueType(0);
+ if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
+ for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
+ // Skip uses of the chain value. Result 0 of the node is the load value.
+ if (UI.getUse().getResNo() != 0)
+ continue;
+
+ // If this use is not an extract + store, it's probably worth splitting.
+ if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
+ UI->use_begin()->getOpcode() != ISD::STORE)
+ return true;
+ }
+ // All non-chain uses are extract + store.
+ return false;
+ }
+
+ return true;
+}
+
+/// Returns true if it is beneficial to convert a load of a constant
+/// to just the constant itself.
+bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ if (BitSize == 0 || BitSize > 64)
+ return false;
+ return true;
+}
+
+bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
+ // If we are using XMM registers in the ABI and the condition of the select is
+ // a floating-point compare and we have blendv or conditional move, then it is
+ // cheaper to select instead of doing a cross-register move and creating a
+ // load that depends on the compare result.
+ bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
+ return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
+}
+
+bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
+ // TODO: It might be a win to ease or lift this restriction, but the generic
+ // folds in DAGCombiner conflict with vector folds for an AVX512 target.
+ if (VT.isVector() && Subtarget.hasAVX512())
+ return false;
+
+ return true;
+}
+
+bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
+ SDValue C) const {
+ // TODO: We handle scalars using custom code, but generic combining could make
+ // that unnecessary.
+ APInt MulC;
+ if (!ISD::isConstantSplatVector(C.getNode(), MulC))
+ return false;
+
+ // Find the type this will be legalized to. Otherwise we might prematurely
+ // convert this to shl+add/sub and then still have to type legalize those ops.
+ // Another choice would be to defer the decision for illegal types until
+ // after type legalization. But constant splat vectors of i64 can't make it
+ // through type legalization on 32-bit targets so we would need to special
+ // case vXi64.
+ while (getTypeAction(Context, VT) != TypeLegal)
+ VT = getTypeToTransformTo(Context, VT);
+
+ // If vector multiply is legal, assume that's faster than shl + add/sub.
+ // TODO: Multiply is a complex op with higher latency and lower throughput in
+ // most implementations, so this check could be loosened based on type
+ // and/or a CPU attribute.
+ if (isOperationLegal(ISD::MUL, VT))
+ return false;
+
+ // shl+add, shl+sub, shl+add+neg
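+ // e.g. x*5  -> (x<<2)+x,  x*9  -> (x<<3)+x    (MulC - 1 is a power of 2)
+ //      x*7  -> (x<<3)-x,  x*15 -> (x<<4)-x    (MulC + 1 is a power of 2)
+ //      x*-3 -> x-(x<<2),  x*-5 -> -((x<<2)+x)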
+ return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
+ (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
+}
+
+bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
+ unsigned Index) const {
+ if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
+ return false;
+
+ // Mask vectors support all subregister combinations and operations that
+ // extract half of a vector.
+ if (ResVT.getVectorElementType() == MVT::i1)
+ return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
+ (Index == ResVT.getVectorNumElements()));
+
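+ // Other extracts are cheap only at whole-subvector offsets, e.g. taking a
+ // v4i32 from a v8i32 at index 4 (the upper half) but not at index 2.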
+ return (Index % ResVT.getVectorNumElements()) == 0;
+}
+
+bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
+ unsigned Opc = VecOp.getOpcode();
+
+ // Assume target opcodes can't be scalarized.
+ // TODO - do we have any exceptions?
+ if (Opc >= ISD::BUILTIN_OP_END)
+ return false;
+
+ // If the vector op is not supported, try to convert to scalar.
+ EVT VecVT = VecOp.getValueType();
+ if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
+ return true;
+
+ // If the vector op is supported, but the scalar op is not, the transform may
+ // not be worthwhile.
+ EVT ScalarVT = VecVT.getScalarType();
+ return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
+}
+
+bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
+ bool) const {
+ // TODO: Allow vectors?
+ if (VT.isVector())
+ return false;
+ return VT.isSimple() || !isOperationExpand(Opcode, VT);
+}
+
+bool X86TargetLowering::isCheapToSpeculateCttz() const {
+ // Speculate cttz only if we can directly use TZCNT.
+ return Subtarget.hasBMI();
+}
+
+bool X86TargetLowering::isCheapToSpeculateCtlz() const {
+ // Speculate ctlz only if we can directly use LZCNT.
+ return Subtarget.hasLZCNT();
+}
+
+bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+ const SelectionDAG &DAG,
+ const MachineMemOperand &MMO) const {
+ if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
+ BitcastVT.getVectorElementType() == MVT::i1)
+ return false;
+
+ if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
+ return false;
+
+ // If both types are legal vectors, it's always ok to convert them.
+ if (LoadVT.isVector() && BitcastVT.isVector() &&
+ isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
+ return true;
+
+ return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
+}
+
+bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
+ const SelectionDAG &DAG) const {
+ // Do not merge to float value size (128 bits) if no implicit
+ // float attribute is set.
+ bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat);
+
+ if (NoFloat) {
+ unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
+ return (MemVT.getSizeInBits() <= MaxIntSize);
+ }
+ // Make sure we don't merge greater than our preferred vector
+ // width.
+ if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
+ return false;
+
+ return true;
+}
+
+bool X86TargetLowering::isCtlzFast() const {
+ return Subtarget.hasFastLZCNT();
+}
+
+bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
+ const Instruction &AndI) const {
+ return true;
+}
+
+bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
+ EVT VT = Y.getValueType();
+
+ if (VT.isVector())
+ return false;
+
+ if (!Subtarget.hasBMI())
+ return false;
+
+ // There are only 32-bit and 64-bit forms for 'andn'.
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return false;
+
+ return !isa<ConstantSDNode>(Y);
+}
+
+bool X86TargetLowering::hasAndNot(SDValue Y) const {
+ EVT VT = Y.getValueType();
+
+ if (!VT.isVector())
+ return hasAndNotCompare(Y);
+
+ // Vector.
+
+ if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
+ return false;
+
+ if (VT == MVT::v4i32)
+ return true;
+
+ return Subtarget.hasSSE2();
+}
+
+bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
+ return X.getValueType().isScalarInteger(); // 'bt'
+}
+
+bool X86TargetLowering::
+ shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
+ SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
+ unsigned OldShiftOpcode, unsigned NewShiftOpcode,
+ SelectionDAG &DAG) const {
+ // Does the baseline recommend not performing the fold by default?
+ if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
+ X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
+ return false;
+ // For scalars this transform is always beneficial.
+ if (X.getValueType().isScalarInteger())
+ return true;
+ // If all the shift amounts are identical, then transform is beneficial even
+ // with rudimentary SSE2 shifts.
+ if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
+ return true;
+ // If we have AVX2 with its powerful shift operations, then it's also good.
+ if (Subtarget.hasAVX2())
+ return true;
+ // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
+ return NewShiftOpcode == ISD::SHL;
+}
+
+bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
+ const SDNode *N, CombineLevel Level) const {
+ assert(((N->getOpcode() == ISD::SHL &&
+ N->getOperand(0).getOpcode() == ISD::SRL) ||
+ (N->getOpcode() == ISD::SRL &&
+ N->getOperand(0).getOpcode() == ISD::SHL)) &&
+ "Expected shift-shift mask");
+ EVT VT = N->getValueType(0);
+ if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
+ (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
+ // Only fold if the shift values are equal - so it folds to AND.
+ // TODO - we should fold if either is a non-uniform vector but we don't do
+ // the fold for non-splats yet.
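+ // e.g. on i32, (srl (shl X, 8), 8) folds to (and X, 0x00ffffff).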
+ return N->getOperand(1) == N->getOperand(0).getOperand(1);
+ }
+ return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
+}
+
+bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
+ EVT VT = Y.getValueType();
+
+ // For vectors, we don't have a preference, but we probably want a mask.
+ if (VT.isVector())
+ return false;
+
+ // 64-bit shifts on 32-bit targets produce really bad bloated code.
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
+ return false;
+
+ return true;
+}
+
+bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
+ SDNode *N) const {
+ if (DAG.getMachineFunction().getFunction().hasMinSize() &&
+ !Subtarget.isOSWindows())
+ return false;
+ return true;
+}
+
+bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
+ // Any legal vector type can be splatted more efficiently than
+ // loading/spilling from memory.
+ return isTypeLegal(VT);
+}
+
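+// For a NumBits-wide equality compare this returns the vector type to use;
+// e.g. a 128-bit compare may be lowered as PCMPEQB + PMOVMSKB through
+// MVT::v16i8 (see combineVectorSizedSetCCEquality()).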
+MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
+ MVT VT = MVT::getIntegerVT(NumBits);
+ if (isTypeLegal(VT))
+ return VT;
+
+ // PMOVMSKB can handle this.
+ if (NumBits == 128 && isTypeLegal(MVT::v16i8))
+ return MVT::v16i8;
+
+ // VPMOVMSKB can handle this.
+ if (NumBits == 256 && isTypeLegal(MVT::v32i8))
+ return MVT::v32i8;
+
+ // TODO: Allow 64-bit type for 32-bit target.
+ // TODO: 512-bit types should be allowed, but make sure that those
+ // cases are handled in combineVectorSizedSetCCEquality().
+
+ return MVT::INVALID_SIMPLE_VALUE_TYPE;
+}
+
+/// Val is the undef sentinel value or equal to the specified value.
+static bool isUndefOrEqual(int Val, int CmpVal) {
+ return ((Val == SM_SentinelUndef) || (Val == CmpVal));
+}
+
+/// Return true if every element in Mask is the undef sentinel value or equal to
+/// the specified value.
+static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
+ return llvm::all_of(Mask, [CmpVal](int M) {
+ return (M == SM_SentinelUndef) || (M == CmpVal);
+ });
+}
+
+/// Val is either the undef or zero sentinel value.
+static bool isUndefOrZero(int Val) {
+ return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
+}
+
+/// Return true if every element in Mask, beginning from position Pos and ending
+/// in Pos+Size is the undef sentinel value.
+static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
+ return llvm::all_of(Mask.slice(Pos, Size),
+ [](int M) { return M == SM_SentinelUndef; });
+}
+
+/// Return true if the mask creates a vector whose lower half is undefined.
+static bool isUndefLowerHalf(ArrayRef<int> Mask) {
+ unsigned NumElts = Mask.size();
+ return isUndefInRange(Mask, 0, NumElts / 2);
+}
+
+/// Return true if the mask creates a vector whose upper half is undefined.
+static bool isUndefUpperHalf(ArrayRef<int> Mask) {
+ unsigned NumElts = Mask.size();
+ return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
+}
+
+/// Return true if Val falls within the specified range [Low, Hi).
+static bool isInRange(int Val, int Low, int Hi) {
+ return (Val >= Low && Val < Hi);
+}
+
+/// Return true if the value of any element in Mask falls within the specified
+/// range [Low, Hi).
+static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
+ return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
+}
+
+/// Return true if the value of any element in Mask is the zero sentinel value.
+static bool isAnyZero(ArrayRef<int> Mask) {
+ return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
+}
+
+/// Return true if the value of any element in Mask is the zero or undef
+/// sentinel values.
+static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
+ return llvm::any_of(Mask, [](int M) {
+ return M == SM_SentinelZero || M == SM_SentinelUndef;
+ });
+}
+
+/// Return true if Val is undef or if its value falls within the
+/// specified range [Low, Hi).
+static bool isUndefOrInRange(int Val, int Low, int Hi) {
+ return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
+}
+
+/// Return true if every element in Mask is undef or if its value
+/// falls within the specified range [Low, Hi).
+static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
+ return llvm::all_of(
+ Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
+}
+
+/// Return true if Val is undef, zero or if its value falls within the
+/// specified range [Low, Hi).
+static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
+ return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
+}
+
+/// Return true if every element in Mask is undef, zero or if its value
+/// falls within the specified range [Low, Hi).
+static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
+ return llvm::all_of(
+ Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
+}
+
+/// Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos + Size, falls within the specified
+/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
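+/// e.g. isSequentialOrUndefInRange({4, -1, 6, 7}, 0, 4, 4) is true, but
+/// {4, 6, -1, 7} is not, because position 1 expects 5 and finds 6.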
+static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
+ unsigned Size, int Low, int Step = 1) {
+ for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
+ if (!isUndefOrEqual(Mask[i], Low))
+ return false;
+ return true;
+}
+
+/// Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size, falls within the specified
+/// sequential range [Low, Low + Size), or is undef or is zero.
+static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
+ unsigned Size, int Low,
+ int Step = 1) {
+ for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
+ if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
+ return false;
+ return true;
+}
+
+/// Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size is undef or is zero.
+static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
+ unsigned Size) {
+ return llvm::all_of(Mask.slice(Pos, Size),
+ [](int M) { return isUndefOrZero(M); });
+}
+
+/// Helper function to test whether a shuffle mask could be
+/// simplified by widening the elements being shuffled.
+///
+/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
+/// leaves it in an unspecified state.
+///
+/// NOTE: This must handle normal vector shuffle masks and *target* vector
+/// shuffle masks. The latter have the special property of a '-2' representing
+/// a zero-ed lane of a vector.
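+///
+/// For example, <0,1,6,7> widens to <0,3>, <-1,1,2,3> widens to <0,1> and
+/// <SM_SentinelZero,SM_SentinelZero,4,5> widens to <SM_SentinelZero,2>, but
+/// <1,2,3,4> cannot be widened because its first pair straddles two elements.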
+static bool canWidenShuffleElements(ArrayRef<int> Mask,
+ SmallVectorImpl<int> &WidenedMask) {
+ WidenedMask.assign(Mask.size() / 2, 0);
+ for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
+ int M0 = Mask[i];
+ int M1 = Mask[i + 1];
+
+ // If both elements are undef, it's trivial.
+ if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
+ WidenedMask[i / 2] = SM_SentinelUndef;
+ continue;
+ }
+
+ // Check for an undef mask and a mask value properly aligned to fit with
+ // a pair of values. If we find such a case, use the non-undef mask's value.
+ if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
+ WidenedMask[i / 2] = M1 / 2;
+ continue;
+ }
+ if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
+ WidenedMask[i / 2] = M0 / 2;
+ continue;
+ }
+
+ // When zeroing, we need to spread the zeroing across both lanes to widen.
+ if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
+ if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
+ (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
+ WidenedMask[i / 2] = SM_SentinelZero;
+ continue;
+ }
+ return false;
+ }
+
+ // Finally check if the two mask values are adjacent and aligned with
+ // a pair.
+ if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
+ WidenedMask[i / 2] = M0 / 2;
+ continue;
+ }
+
+ // Otherwise we can't safely widen the elements used in this shuffle.
+ return false;
+ }
+ assert(WidenedMask.size() == Mask.size() / 2 &&
+ "Incorrect size of mask after widening the elements!");
+
+ return true;
+}
+
+static bool canWidenShuffleElements(ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ bool V2IsZero,
+ SmallVectorImpl<int> &WidenedMask) {
+ // Create an alternative mask with info about zeroable elements.
+ // Here we do not set undef elements as zeroable.
+ SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
+ if (V2IsZero) {
+ assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
+ for (int i = 0, Size = Mask.size(); i != Size; ++i)
+ if (Mask[i] != SM_SentinelUndef && Zeroable[i])
+ ZeroableMask[i] = SM_SentinelZero;
+ }
+ return canWidenShuffleElements(ZeroableMask, WidenedMask);
+}
+
+static bool canWidenShuffleElements(ArrayRef<int> Mask) {
+ SmallVector<int, 32> WidenedMask;
+ return canWidenShuffleElements(Mask, WidenedMask);
+}
+
+// Attempt to narrow/widen shuffle mask until it matches the target number of
+// elements.
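+// e.g. a source mask <0,1,2,3> scales to <0,1,2,3,4,5,6,7> for NumDstElts == 8
+// and to <0,1> for NumDstElts == 2; widening can fail for masks whose adjacent
+// elements do not pair up.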
+static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
+ SmallVectorImpl<int> &ScaledMask) {
+ unsigned NumSrcElts = Mask.size();
+ assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
+ "Illegal shuffle scale factor");
+
+ // Narrowing is guaranteed to work.
+ if (NumDstElts >= NumSrcElts) {
+ int Scale = NumDstElts / NumSrcElts;
+ llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
+ return true;
+ }
+
+ // We have to repeat the widening until we reach the target size, but we can
+ // split out the first widening as it sets up ScaledMask for us.
+ if (canWidenShuffleElements(Mask, ScaledMask)) {
+ while (ScaledMask.size() > NumDstElts) {
+ SmallVector<int, 16> WidenedMask;
+ if (!canWidenShuffleElements(ScaledMask, WidenedMask))
+ return false;
+ ScaledMask = std::move(WidenedMask);
+ }
+ return true;
+ }
+
+ return false;
+}
+
+/// Returns true if Elt is a constant zero or a floating point constant +0.0.
+bool X86::isZeroNode(SDValue Elt) {
+ return isNullConstant(Elt) || isNullFPConstant(Elt);
+}
+
+// Build a vector of constants.
+// Use an UNDEF node if MaskElt == -1.
+// Split 64-bit constants in 32-bit mode.
+static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
+ const SDLoc &dl, bool IsMask = false) {
+
+ SmallVector<SDValue, 32> Ops;
+ bool Split = false;
+
+ MVT ConstVecVT = VT;
+ unsigned NumElts = VT.getVectorNumElements();
+ bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
+ if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
+ ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
+ Split = true;
+ }
+
+ MVT EltVT = ConstVecVT.getVectorElementType();
+ for (unsigned i = 0; i < NumElts; ++i) {
+ bool IsUndef = Values[i] < 0 && IsMask;
+ SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
+ DAG.getConstant(Values[i], dl, EltVT);
+ Ops.push_back(OpNode);
+ if (Split)
+ Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
+ DAG.getConstant(0, dl, EltVT));
+ }
+ SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
+ if (Split)
+ ConstsNode = DAG.getBitcast(VT, ConstsNode);
+ return ConstsNode;
+}
+
+static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
+ MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
+ assert(Bits.size() == Undefs.getBitWidth() &&
+ "Unequal constant and undef arrays");
+ SmallVector<SDValue, 32> Ops;
+ bool Split = false;
+
+ MVT ConstVecVT = VT;
+ unsigned NumElts = VT.getVectorNumElements();
+ bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
+ if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
+ ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
+ Split = true;
+ }
+
+ MVT EltVT = ConstVecVT.getVectorElementType();
+ for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
+ if (Undefs[i]) {
+ Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
+ continue;
+ }
+ const APInt &V = Bits[i];
+ assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
+ if (Split) {
+ Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
+ Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
+ } else if (EltVT == MVT::f32) {
+ APFloat FV(APFloat::IEEEsingle(), V);
+ Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
+ } else if (EltVT == MVT::f64) {
+ APFloat FV(APFloat::IEEEdouble(), V);
+ Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
+ } else {
+ Ops.push_back(DAG.getConstant(V, dl, EltVT));
+ }
+ }
+
+ SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
+ return DAG.getBitcast(VT, ConstsNode);
+}
+
+/// Returns a vector of specified type with all zero elements.
+static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG, const SDLoc &dl) {
+ assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
+ VT.getVectorElementType() == MVT::i1) &&
+ "Unexpected vector type");
+
+ // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
+ // type. This ensures they get CSE'd. But if the integer type is not
+ // available, use a floating-point +0.0 instead.
+ SDValue Vec;
+ if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
+ Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
+ } else if (VT.isFloatingPoint()) {
+ Vec = DAG.getConstantFP(+0.0, dl, VT);
+ } else if (VT.getVectorElementType() == MVT::i1) {
+ assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
+ "Unexpected vector type");
+ Vec = DAG.getConstant(0, dl, VT);
+ } else {
+ unsigned Num32BitElts = VT.getSizeInBits() / 32;
+ Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
+ }
+ return DAG.getBitcast(VT, Vec);
+}
+
+static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
+ const SDLoc &dl, unsigned vectorWidth) {
+ EVT VT = Vec.getValueType();
+ EVT ElVT = VT.getVectorElementType();
+ unsigned Factor = VT.getSizeInBits()/vectorWidth;
+ EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
+ VT.getVectorNumElements()/Factor);
+
+ // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
+ unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
+ assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
+
+ // This is the index of the first element of the vectorWidth-bit chunk
+ // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
+ IdxVal &= ~(ElemsPerChunk - 1);
+
+ // If the input is a buildvector just emit a smaller one.
+ if (Vec.getOpcode() == ISD::BUILD_VECTOR)
+ return DAG.getBuildVector(ResultVT, dl,
+ Vec->ops().slice(IdxVal, ElemsPerChunk));
+
+ SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
+}
+
+/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
+/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
+/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
+/// instructions or a simple subregister reference. Idx is an index in the
+/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
+/// lowering EXTRACT_VECTOR_ELT operations easier.
+static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, const SDLoc &dl) {
+ assert((Vec.getValueType().is256BitVector() ||
+ Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
+ return extractSubVector(Vec, IdxVal, DAG, dl, 128);
+}
+
+/// Generate a DAG to grab 256-bits from a 512-bit vector.
+static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, const SDLoc &dl) {
+ assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
+ return extractSubVector(Vec, IdxVal, DAG, dl, 256);
+}
+
+static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, const SDLoc &dl,
+ unsigned vectorWidth) {
+ assert((vectorWidth == 128 || vectorWidth == 256) &&
+ "Unsupported vector width");
+ // Inserting an UNDEF subvector simply yields Result.
+ if (Vec.isUndef())
+ return Result;
+ EVT VT = Vec.getValueType();
+ EVT ElVT = VT.getVectorElementType();
+ EVT ResultVT = Result.getValueType();
+
+ // Insert the relevant vectorWidth bits.
+ unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
+ assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
+
+ // This is the index of the first element of the vectorWidth-bit chunk
+ // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
+ IdxVal &= ~(ElemsPerChunk - 1);
+
+ SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
+}
+
+/// Generate a DAG to put 128-bits into a vector > 128 bits. This
+/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
+/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
+/// simple superregister reference. Idx is an index in the 128 bits
+/// we want. It need not be aligned to a 128-bit boundary. That makes
+/// lowering INSERT_VECTOR_ELT operations easier.
+static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, const SDLoc &dl) {
+ assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
+ return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
+}
+
+/// Widen a vector to a larger size with the same scalar type, with the new
+/// elements either zero or undef.
+static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ const SDLoc &dl) {
+ assert(Vec.getValueSizeInBits().getFixedSize() < VT.getFixedSizeInBits() &&
+ Vec.getValueType().getScalarType() == VT.getScalarType() &&
+ "Unsupported vector widening type");
+ SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
+ : DAG.getUNDEF(VT);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
+ DAG.getIntPtrConstant(0, dl));
+}
+
+/// Widen a vector to a larger size with the same scalar type, with the new
+/// elements either zero or undef.
+static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ const SDLoc &dl, unsigned WideSizeInBits) {
+ assert(Vec.getValueSizeInBits() < WideSizeInBits &&
+ (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
+ "Unsupported vector widening type");
+ unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
+ MVT SVT = Vec.getSimpleValueType().getScalarType();
+ MVT VT = MVT::getVectorVT(SVT, WideNumElts);
+ return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
+}
+
+// Helper function to collect subvector ops that are concatenated together,
+// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
+// The subvectors in Ops are guaranteed to be the same type.
+static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
+ assert(Ops.empty() && "Expected an empty ops vector");
+
+ if (N->getOpcode() == ISD::CONCAT_VECTORS) {
+ Ops.append(N->op_begin(), N->op_end());
+ return true;
+ }
+
+ if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
+ SDValue Src = N->getOperand(0);
+ SDValue Sub = N->getOperand(1);
+ const APInt &Idx = N->getConstantOperandAPInt(2);
+ EVT VT = Src.getValueType();
+ EVT SubVT = Sub.getValueType();
+
+ // TODO - Handle more general insert_subvector chains.
+ if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
+ Idx == (VT.getVectorNumElements() / 2)) {
+ // insert_subvector(insert_subvector(undef, x, lo), y, hi)
+ if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Src.getOperand(1).getValueType() == SubVT &&
+ isNullConstant(Src.getOperand(2))) {
+ Ops.push_back(Src.getOperand(1));
+ Ops.push_back(Sub);
+ return true;
+ }
+ // insert_subvector(x, extract_subvector(x, lo), hi)
+ if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
+ Ops.append(2, Sub);
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
+ const SDLoc &dl) {
+ EVT VT = Op.getValueType();
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned SizeInBits = VT.getSizeInBits();
+ assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
+ "Can't split odd sized vector");
+
+ SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
+ SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
+ return std::make_pair(Lo, Hi);
+}
+
+// Split a unary integer op into 2 half sized ops.
+static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+
+ // Make sure we only try to split 256/512-bit types to avoid creating
+ // narrow vectors.
+ assert((Op.getOperand(0).getValueType().is256BitVector() ||
+ Op.getOperand(0).getValueType().is512BitVector()) &&
+ (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
+ assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
+ VT.getVectorNumElements() &&
+ "Unexpected VTs!");
+
+ SDLoc dl(Op);
+
+ // Extract the Lo/Hi vectors
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = splitVector(Op.getOperand(0), DAG, dl);
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, LoVT, Lo),
+ DAG.getNode(Op.getOpcode(), dl, HiVT, Hi));
+}
+
+/// Break a binary integer operation into 2 half sized ops and then
+/// concatenate the result back.
+static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+
+ // Sanity check that all the types match.
+ assert(Op.getOperand(0).getValueType() == VT &&
+ Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
+ assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
+
+ SDLoc dl(Op);
+
+ // Extract the LHS Lo/Hi vectors
+ SDValue LHS1, LHS2;
+ std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
+
+ // Extract the RHS Lo/Hi vectors
+ SDValue RHS1, RHS2;
+ std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, LoVT, LHS1, RHS1),
+ DAG.getNode(Op.getOpcode(), dl, HiVT, LHS2, RHS2));
+}
+
+// Helper for splitting operands of an operation to legal target size and
+// apply a function on each part.
+// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
+// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
+// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
+// The argument Builder is a function that will be applied on each split part:
+// SDValue Builder(SelectionDAG &G, const SDLoc &DL, ArrayRef<SDValue> Ops)
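+// For illustration only, a 512-bit integer add on an AVX2-only target could be
+// emitted in 256-bit halves roughly like this (assuming VT == MVT::v16i32):
+//   SplitOpsAndApply(DAG, Subtarget, DL, VT, {LHS, RHS},
+//                    [](SelectionDAG &DAG, const SDLoc &DL,
+//                       ArrayRef<SDValue> Ops) {
+//                      return DAG.getNode(ISD::ADD, DL, Ops[0].getValueType(),
+//                                         Ops[0], Ops[1]);
+//                    });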
+template <typename F>
+SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
+ const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
+ F Builder, bool CheckBWI = true) {
+ assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
+ unsigned NumSubs = 1;
+ if ((CheckBWI && Subtarget.useBWIRegs()) ||
+ (!CheckBWI && Subtarget.useAVX512Regs())) {
+ if (VT.getSizeInBits() > 512) {
+ NumSubs = VT.getSizeInBits() / 512;
+ assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
+ }
+ } else if (Subtarget.hasAVX2()) {
+ if (VT.getSizeInBits() > 256) {
+ NumSubs = VT.getSizeInBits() / 256;
+ assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
+ }
+ } else {
+ if (VT.getSizeInBits() > 128) {
+ NumSubs = VT.getSizeInBits() / 128;
+ assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
+ }
+ }
+
+ if (NumSubs == 1)
+ return Builder(DAG, DL, Ops);
+
+ SmallVector<SDValue, 4> Subs;
+ for (unsigned i = 0; i != NumSubs; ++i) {
+ SmallVector<SDValue, 2> SubOps;
+ for (SDValue Op : Ops) {
+ EVT OpVT = Op.getValueType();
+ unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
+ unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
+ SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
+ }
+ Subs.push_back(Builder(DAG, DL, SubOps));
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
+}
+
+/// Insert i1-subvector to i1-vector.
+static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+
+ SDLoc dl(Op);
+ SDValue Vec = Op.getOperand(0);
+ SDValue SubVec = Op.getOperand(1);
+ SDValue Idx = Op.getOperand(2);
+ unsigned IdxVal = Op.getConstantOperandVal(2);
+
+ // Inserting undef is a nop. We can just return the original vector.
+ if (SubVec.isUndef())
+ return Vec;
+
+ if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
+ return Op;
+
+ MVT OpVT = Op.getSimpleValueType();
+ unsigned NumElems = OpVT.getVectorNumElements();
+ SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
+
+ // Extend to natively supported kshift.
+ MVT WideOpVT = OpVT;
+ if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
+ WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
+
+ // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
+ // if necessary.
+ if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
+ // May need to promote to a legal type.
+ Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
+ DAG.getConstant(0, dl, WideOpVT),
+ SubVec, Idx);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
+ }
+
+ MVT SubVecVT = SubVec.getSimpleValueType();
+ unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
+ assert(IdxVal + SubVecNumElems <= NumElems &&
+ IdxVal % SubVecVT.getSizeInBits() == 0 &&
+ "Unexpected index value in INSERT_SUBVECTOR");
+
+ SDValue Undef = DAG.getUNDEF(WideOpVT);
+
+ if (IdxVal == 0) {
+ // Zero lower bits of the Vec
+ SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
+ ZeroIdx);
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
+ // Merge them together, SubVec should be zero extended.
+ SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
+ DAG.getConstant(0, dl, WideOpVT),
+ SubVec, ZeroIdx);
+ Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
+ }
+
+ SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
+ Undef, SubVec, ZeroIdx);
+
+ if (Vec.isUndef()) {
+ assert(IdxVal != 0 && "Unexpected index");
+ SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
+ DAG.getTargetConstant(IdxVal, dl, MVT::i8));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
+ }
+
+ if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
+ assert(IdxVal != 0 && "Unexpected index");
+ NumElems = WideOpVT.getVectorNumElements();
+ unsigned ShiftLeft = NumElems - SubVecNumElems;
+ unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
+ SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
+ DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
+ if (ShiftRight != 0)
+ SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
+ DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
+ }
+
+ // Simple case when we put the subvector in the upper part.
+ if (IdxVal + SubVecNumElems == NumElems) {
+ SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
+ DAG.getTargetConstant(IdxVal, dl, MVT::i8));
+ if (SubVecNumElems * 2 == NumElems) {
+ // Special case, use legal zero extending insert_subvector. This allows
+ // isel to optimize when bits are known zero.
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
+ DAG.getConstant(0, dl, WideOpVT),
+ Vec, ZeroIdx);
+ } else {
+ // Otherwise use explicit shifts to zero the bits.
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
+ Undef, Vec, ZeroIdx);
+ NumElems = WideOpVT.getVectorNumElements();
+ SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
+ }
+ Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
+ }
+
+ // Inserting into the middle is more complicated.
+
+ NumElems = WideOpVT.getVectorNumElements();
+
+ // Widen the vector if needed.
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
+
+ unsigned ShiftLeft = NumElems - SubVecNumElems;
+ unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
+
+  // Do an optimization for the most frequently used types.
+ if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
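+    // (v64i1 needs an i64 mask constant, which we only form on 64-bit
+    // targets.) Clear the destination slot in Vec with an AND mask, then OR in
+    // the subvector shifted into position.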
+ APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
+ Mask0.flipAllBits();
+ SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
+ SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
+ Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
+ SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
+ DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
+ SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
+ DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
+ Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
+
+ // Reduce to original width if needed.
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
+ }
+
+ // Clear the upper bits of the subvector and move it to its insert position.
+ SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
+ DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
+ SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
+ DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
+
+ // Isolate the bits below the insertion point.
+ unsigned LowShift = NumElems - IdxVal;
+ SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
+ DAG.getTargetConstant(LowShift, dl, MVT::i8));
+ Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
+ DAG.getTargetConstant(LowShift, dl, MVT::i8));
+
+ // Isolate the bits after the last inserted bit.
+ unsigned HighShift = IdxVal + SubVecNumElems;
+ SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
+ DAG.getTargetConstant(HighShift, dl, MVT::i8));
+ High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
+ DAG.getTargetConstant(HighShift, dl, MVT::i8));
+
+ // Now OR all 3 pieces together.
+ Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
+ SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
+
+ // Reduce to original width if needed.
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
+}
+
+static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
+ const SDLoc &dl) {
+ assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
+ EVT SubVT = V1.getValueType();
+ EVT SubSVT = SubVT.getScalarType();
+ unsigned SubNumElts = SubVT.getVectorNumElements();
+ unsigned SubVectorWidth = SubVT.getSizeInBits();
+ EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
+ SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
+ return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
+}
+
+/// Returns a vector of specified type with all bits set.
+/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
+/// Then bitcast to their original type, ensuring they get CSE'd.
+static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
+ assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
+ "Expected a 128/256/512-bit vector type");
+
+ APInt Ones = APInt::getAllOnesValue(32);
+ unsigned NumElts = VT.getSizeInBits() / 32;
+ SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
+ return DAG.getBitcast(VT, Vec);
+}
+
+// Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
+static unsigned getOpcode_EXTEND(unsigned Opcode) {
+ switch (Opcode) {
+ case ISD::ANY_EXTEND:
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ return ISD::ANY_EXTEND;
+ case ISD::ZERO_EXTEND:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ return ISD::ZERO_EXTEND;
+ case ISD::SIGN_EXTEND:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ return ISD::SIGN_EXTEND;
+ }
+ llvm_unreachable("Unknown opcode");
+}
+
+// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
+static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
+ switch (Opcode) {
+ case ISD::ANY_EXTEND:
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ return ISD::ANY_EXTEND_VECTOR_INREG;
+ case ISD::ZERO_EXTEND:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ return ISD::ZERO_EXTEND_VECTOR_INREG;
+ case ISD::SIGN_EXTEND:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ return ISD::SIGN_EXTEND_VECTOR_INREG;
+ }
+ llvm_unreachable("Unknown opcode");
+}
+
+static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
+ SDValue In, SelectionDAG &DAG) {
+ EVT InVT = In.getValueType();
+ assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
+ assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
+ ISD::ZERO_EXTEND == Opcode) &&
+ "Unknown extension opcode");
+
+ // For 256-bit vectors, we only need the lower (128-bit) input half.
+ // For 512-bit vectors, we only need the lower input half or quarter.
+ if (InVT.getSizeInBits() > 128) {
+ assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
+ "Expected VTs to be the same size!");
+ unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
+ In = extractSubVector(In, 0, DAG, DL,
+ std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
+ InVT = In.getValueType();
+ }
+
+ if (VT.getVectorNumElements() != InVT.getVectorNumElements())
+ Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
+
+ return DAG.getNode(Opcode, DL, VT, In);
+}
+
+// Match (xor X, -1) -> X.
+// Match extract_subvector(xor X, -1) -> extract_subvector(X).
+// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
+static SDValue IsNOT(SDValue V, SelectionDAG &DAG, bool OneUse = false) {
+ V = OneUse ? peekThroughOneUseBitcasts(V) : peekThroughBitcasts(V);
+ if (V.getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
+ return V.getOperand(0);
+ if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
+ if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
+ Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
+ Not, V.getOperand(1));
+ }
+ }
+ SmallVector<SDValue, 2> CatOps;
+ if (collectConcatOps(V.getNode(), CatOps)) {
+ for (SDValue &CatOp : CatOps) {
+ SDValue NotCat = IsNOT(CatOp, DAG);
+ if (!NotCat) return SDValue();
+ CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
+ }
+ return SDValue();
+}
+
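+/// Create a shuffle mask for an unpacklo/unpackhi operation, repeated per
+/// 128-bit lane, e.g. a binary v8i16 unpack gives:
+///   Lo --> <0, 8, 1, 9, 2, 10, 3, 11>
+///   Hi --> <4, 12, 5, 13, 6, 14, 7, 15>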
+void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
+ bool Lo, bool Unary) {
+ assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
+ "Illegal vector type to unpack");
+ assert(Mask.empty() && "Expected an empty shuffle mask vector");
+ int NumElts = VT.getVectorNumElements();
+ int NumEltsInLane = 128 / VT.getScalarSizeInBits();
+ for (int i = 0; i < NumElts; ++i) {
+ unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
+ int Pos = (i % NumEltsInLane) / 2 + LaneStart;
+ Pos += (Unary ? 0 : NumElts * (i % 2));
+ Pos += (Lo ? 0 : NumEltsInLane / 2);
+ Mask.push_back(Pos);
+ }
+}
+
+/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
+/// imposed by AVX and specific to the unary pattern. Example:
+/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
+/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
+void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
+ bool Lo) {
+ assert(Mask.empty() && "Expected an empty shuffle mask vector");
+ int NumElts = VT.getVectorNumElements();
+ for (int i = 0; i < NumElts; ++i) {
+ int Pos = i / 2;
+ Pos += (Lo ? 0 : NumElts / 2);
+ Mask.push_back(Pos);
+ }
+}
+
+/// Returns a vector_shuffle node for an unpackl operation.
+static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
+ SDValue V1, SDValue V2) {
+ SmallVector<int, 8> Mask;
+ createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
+ return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
+}
+
+/// Returns a vector_shuffle node for an unpackh operation.
+static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
+ SDValue V1, SDValue V2) {
+ SmallVector<int, 8> Mask;
+ createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
+ return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
+}
+
+/// Return a vector_shuffle of the specified vector and a zero or undef
+/// vector.
+/// This produces a shuffle where the low element of V2 is swizzled into the
+/// zero/undef vector, landing at element Idx.
+/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
+static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
+ bool IsZero,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = V2.getSimpleValueType();
+ SDValue V1 = IsZero
+ ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
+ int NumElems = VT.getVectorNumElements();
+ SmallVector<int, 16> MaskVec(NumElems);
+ for (int i = 0; i != NumElems; ++i)
+ // If this is the insertion idx, put the low elt of V2 here.
+ MaskVec[i] = (i == Idx) ? NumElems : i;
+ return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
+}
+
+static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
+ if (Ptr.getOpcode() == X86ISD::Wrapper ||
+ Ptr.getOpcode() == X86ISD::WrapperRIP)
+ Ptr = Ptr.getOperand(0);
+
+ auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
+ if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
+ return nullptr;
+
+ return CNode->getConstVal();
+}
+
+static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
+ if (!Load || !ISD::isNormalLoad(Load))
+ return nullptr;
+ return getTargetConstantFromBasePtr(Load->getBasePtr());
+}
+
+static const Constant *getTargetConstantFromNode(SDValue Op) {
+ Op = peekThroughBitcasts(Op);
+ return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
+}
+
+const Constant *
+X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
+ assert(LD && "Unexpected null LoadSDNode");
+ return getTargetConstantFromNode(LD);
+}
+
+// Extract raw constant bits from constant pools.
+static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
+ APInt &UndefElts,
+ SmallVectorImpl<APInt> &EltBits,
+ bool AllowWholeUndefs = true,
+ bool AllowPartialUndefs = true) {
+ assert(EltBits.empty() && "Expected an empty EltBits vector");
+
+ Op = peekThroughBitcasts(Op);
+
+ EVT VT = Op.getValueType();
+ unsigned SizeInBits = VT.getSizeInBits();
+ assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
+ unsigned NumElts = SizeInBits / EltSizeInBits;
+
+ // Bitcast a source array of element bits to the target size.
+ auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
+ unsigned NumSrcElts = UndefSrcElts.getBitWidth();
+ unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
+ assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
+ "Constant bit sizes don't match");
+
+ // Don't split if we don't allow undef bits.
+ bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
+ if (UndefSrcElts.getBoolValue() && !AllowUndefs)
+ return false;
+
+ // If we're already the right size, don't bother bitcasting.
+ if (NumSrcElts == NumElts) {
+ UndefElts = UndefSrcElts;
+ EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
+ return true;
+ }
+
+ // Extract all the undef/constant element data and pack into single bitsets.
+ APInt UndefBits(SizeInBits, 0);
+ APInt MaskBits(SizeInBits, 0);
+
+ for (unsigned i = 0; i != NumSrcElts; ++i) {
+ unsigned BitOffset = i * SrcEltSizeInBits;
+ if (UndefSrcElts[i])
+ UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
+ MaskBits.insertBits(SrcEltBits[i], BitOffset);
+ }
+
+ // Split the undef/constant single bitset data into the target elements.
+ UndefElts = APInt(NumElts, 0);
+ EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ unsigned BitOffset = i * EltSizeInBits;
+ APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
+
+ // Only treat an element as UNDEF if all bits are UNDEF.
+ if (UndefEltBits.isAllOnesValue()) {
+ if (!AllowWholeUndefs)
+ return false;
+ UndefElts.setBit(i);
+ continue;
+ }
+
+ // If only some bits are UNDEF then treat them as zero (or bail if not
+ // supported).
+ if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
+ return false;
+
+ EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
+ }
+ return true;
+ };
+
+ // Collect constant bits and insert into mask/undef bit masks.
+ auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
+ unsigned UndefBitIndex) {
+ if (!Cst)
+ return false;
+ if (isa<UndefValue>(Cst)) {
+ Undefs.setBit(UndefBitIndex);
+ return true;
+ }
+ if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
+ Mask = CInt->getValue();
+ return true;
+ }
+ if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
+ Mask = CFP->getValueAPF().bitcastToAPInt();
+ return true;
+ }
+ return false;
+ };
+
+ // Handle UNDEFs.
+ if (Op.isUndef()) {
+ APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
+ SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
+ return CastBitData(UndefSrcElts, SrcEltBits);
+ }
+
+ // Extract scalar constant bits.
+ if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
+ APInt UndefSrcElts = APInt::getNullValue(1);
+ SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
+ return CastBitData(UndefSrcElts, SrcEltBits);
+ }
+ if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
+ APInt UndefSrcElts = APInt::getNullValue(1);
+ APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
+ SmallVector<APInt, 64> SrcEltBits(1, RawBits);
+ return CastBitData(UndefSrcElts, SrcEltBits);
+ }
+
+ // Extract constant bits from build vector.
+ if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
+ unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
+ unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
+
+ APInt UndefSrcElts(NumSrcElts, 0);
+ SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
+ for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
+ const SDValue &Src = Op.getOperand(i);
+ if (Src.isUndef()) {
+ UndefSrcElts.setBit(i);
+ continue;
+ }
+ auto *Cst = cast<ConstantSDNode>(Src);
+ SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
+ }
+ return CastBitData(UndefSrcElts, SrcEltBits);
+ }
+ if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
+ unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
+ unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
+
+ APInt UndefSrcElts(NumSrcElts, 0);
+ SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
+ for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
+ const SDValue &Src = Op.getOperand(i);
+ if (Src.isUndef()) {
+ UndefSrcElts.setBit(i);
+ continue;
+ }
+ auto *Cst = cast<ConstantFPSDNode>(Src);
+ APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
+ SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
+ }
+ return CastBitData(UndefSrcElts, SrcEltBits);
+ }
+
+ // Extract constant bits from constant pool vector.
+ if (auto *Cst = getTargetConstantFromNode(Op)) {
+ Type *CstTy = Cst->getType();
+ unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
+ if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
+ return false;
+
+ unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
+ unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
+
+ APInt UndefSrcElts(NumSrcElts, 0);
+ SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
+ for (unsigned i = 0; i != NumSrcElts; ++i)
+ if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
+ UndefSrcElts, i))
+ return false;
+
+ return CastBitData(UndefSrcElts, SrcEltBits);
+ }
+
+ // Extract constant bits from a broadcasted constant pool scalar.
+ if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
+ EltSizeInBits <= VT.getScalarSizeInBits()) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
+ if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
+ return false;
+
+ SDValue Ptr = MemIntr->getBasePtr();
+ if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
+ unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
+ unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
+
+ APInt UndefSrcElts(NumSrcElts, 0);
+ SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
+ if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
+ if (UndefSrcElts[0])
+ UndefSrcElts.setBits(0, NumSrcElts);
+ SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
+ return CastBitData(UndefSrcElts, SrcEltBits);
+ }
+ }
+ }
+
+ // Extract constant bits from a subvector broadcast.
+ if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
+ SDValue Ptr = MemIntr->getBasePtr();
+ if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
+ Type *CstTy = Cst->getType();
+ unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
+ if (!CstTy->isVectorTy() || (SizeInBits % CstSizeInBits) != 0)
+ return false;
+ unsigned SubEltSizeInBits = CstTy->getScalarSizeInBits();
+ unsigned NumSubElts = CstSizeInBits / SubEltSizeInBits;
+ unsigned NumSubVecs = SizeInBits / CstSizeInBits;
+ APInt UndefSubElts(NumSubElts, 0);
+ SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
+ APInt(SubEltSizeInBits, 0));
+ for (unsigned i = 0; i != NumSubElts; ++i) {
+ if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
+ UndefSubElts, i))
+ return false;
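+        // Replicate this element's bits into every broadcasted copy of the
+        // subvector.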
+ for (unsigned j = 1; j != NumSubVecs; ++j)
+ SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
+ }
+ UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
+ UndefSubElts);
+ return CastBitData(UndefSubElts, SubEltBits);
+ }
+ }
+
+ // Extract a rematerialized scalar constant insertion.
+ if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
+ Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
+ unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
+ unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
+
+ APInt UndefSrcElts(NumSrcElts, 0);
+ SmallVector<APInt, 64> SrcEltBits;
+ auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
+ SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
+ SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
+ return CastBitData(UndefSrcElts, SrcEltBits);
+ }
+
+  // Insert constant bits from base and sub vector sources.
+ if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
+    // If we bitcast to larger elements we might lose track of undefs - don't
+    // allow any, to be safe.
+ unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
+ bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
+
+ APInt UndefSrcElts, UndefSubElts;
+ SmallVector<APInt, 32> EltSrcBits, EltSubBits;
+ if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
+ UndefSubElts, EltSubBits,
+ AllowWholeUndefs && AllowUndefs,
+ AllowPartialUndefs && AllowUndefs) &&
+ getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
+ UndefSrcElts, EltSrcBits,
+ AllowWholeUndefs && AllowUndefs,
+ AllowPartialUndefs && AllowUndefs)) {
+ unsigned BaseIdx = Op.getConstantOperandVal(2);
+ UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
+ for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
+ EltSrcBits[BaseIdx + i] = EltSubBits[i];
+ return CastBitData(UndefSrcElts, EltSrcBits);
+ }
+ }
+
+ // Extract constant bits from a subvector's source.
+ if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ // TODO - support extract_subvector through bitcasts.
+ if (EltSizeInBits != VT.getScalarSizeInBits())
+ return false;
+
+ if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
+ UndefElts, EltBits, AllowWholeUndefs,
+ AllowPartialUndefs)) {
+ EVT SrcVT = Op.getOperand(0).getValueType();
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ unsigned NumSubElts = VT.getVectorNumElements();
+ unsigned BaseIdx = Op.getConstantOperandVal(1);
+ UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
+ if ((BaseIdx + NumSubElts) != NumSrcElts)
+ EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
+ if (BaseIdx != 0)
+ EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
+ return true;
+ }
+ }
+
+ // Extract constant bits from shuffle node sources.
+ if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
+ // TODO - support shuffle through bitcasts.
+ if (EltSizeInBits != VT.getScalarSizeInBits())
+ return false;
+
+ ArrayRef<int> Mask = SVN->getMask();
+ if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
+ llvm::any_of(Mask, [](int M) { return M < 0; }))
+ return false;
+
+ APInt UndefElts0, UndefElts1;
+ SmallVector<APInt, 32> EltBits0, EltBits1;
+ if (isAnyInRange(Mask, 0, NumElts) &&
+ !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
+ UndefElts0, EltBits0, AllowWholeUndefs,
+ AllowPartialUndefs))
+ return false;
+ if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
+ !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
+ UndefElts1, EltBits1, AllowWholeUndefs,
+ AllowPartialUndefs))
+ return false;
+
+ UndefElts = APInt::getNullValue(NumElts);
+ for (int i = 0; i != (int)NumElts; ++i) {
+ int M = Mask[i];
+ if (M < 0) {
+ UndefElts.setBit(i);
+ EltBits.push_back(APInt::getNullValue(EltSizeInBits));
+ } else if (M < (int)NumElts) {
+ if (UndefElts0[M])
+ UndefElts.setBit(i);
+ EltBits.push_back(EltBits0[M]);
+ } else {
+ if (UndefElts1[M - NumElts])
+ UndefElts.setBit(i);
+ EltBits.push_back(EltBits1[M - NumElts]);
+ }
+ }
+ return true;
+ }
+
+ return false;
+}
+
+namespace llvm {
+namespace X86 {
+bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
+ APInt UndefElts;
+ SmallVector<APInt, 16> EltBits;
+ if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
+ UndefElts, EltBits, true,
+ AllowPartialUndefs)) {
+ int SplatIndex = -1;
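+    // Check that all defined elements have the same value; undef elements may
+    // differ.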
+ for (int i = 0, e = EltBits.size(); i != e; ++i) {
+ if (UndefElts[i])
+ continue;
+ if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
+ SplatIndex = -1;
+ break;
+ }
+ SplatIndex = i;
+ }
+ if (0 <= SplatIndex) {
+ SplatVal = EltBits[SplatIndex];
+ return true;
+ }
+ }
+
+ return false;
+}
+} // namespace X86
+} // namespace llvm
+
+static bool getTargetShuffleMaskIndices(SDValue MaskNode,
+ unsigned MaskEltSizeInBits,
+ SmallVectorImpl<uint64_t> &RawMask,
+ APInt &UndefElts) {
+ // Extract the raw target constant bits.
+ SmallVector<APInt, 64> EltBits;
+ if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
+ EltBits, /* AllowWholeUndefs */ true,
+ /* AllowPartialUndefs */ false))
+ return false;
+
+ // Insert the extracted elements into the mask.
+ for (const APInt &Elt : EltBits)
+ RawMask.push_back(Elt.getZExtValue());
+
+ return true;
+}
+
+/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
+/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
+/// Note: This ignores saturation, so inputs must be checked first.
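+/// e.g. a binary (Unary == false) v16i8 pack with NumStages == 1 gives
+///   <0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30>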
+static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
+ bool Unary, unsigned NumStages = 1) {
+ assert(Mask.empty() && "Expected an empty shuffle mask vector");
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
+ unsigned Offset = Unary ? 0 : NumElts;
+ unsigned Repetitions = 1u << (NumStages - 1);
+ unsigned Increment = 1u << NumStages;
+ assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
+
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
+ for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
+ Mask.push_back(Elt + (Lane * NumEltsPerLane));
+ for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
+ Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
+ }
+ }
+}
+
+// Split the demanded elts of a PACKSS/PACKUS node between its operands.
+static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
+ APInt &DemandedLHS, APInt &DemandedRHS) {
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumElts = DemandedElts.getBitWidth();
+ int NumInnerElts = NumElts / 2;
+ int NumEltsPerLane = NumElts / NumLanes;
+ int NumInnerEltsPerLane = NumInnerElts / NumLanes;
+
+ DemandedLHS = APInt::getNullValue(NumInnerElts);
+ DemandedRHS = APInt::getNullValue(NumInnerElts);
+
+ // Map DemandedElts to the packed operands.
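+  // Within each 128-bit lane, the low half of the result reads from the LHS
+  // lane and the high half from the RHS lane.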
+ for (int Lane = 0; Lane != NumLanes; ++Lane) {
+ for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
+ int OuterIdx = (Lane * NumEltsPerLane) + Elt;
+ int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
+ if (DemandedElts[OuterIdx])
+ DemandedLHS.setBit(InnerIdx);
+ if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
+ DemandedRHS.setBit(InnerIdx);
+ }
+ }
+}
+
+// Split the demanded elts of a HADD/HSUB node between its operands.
+static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
+ APInt &DemandedLHS, APInt &DemandedRHS) {
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumElts = DemandedElts.getBitWidth();
+ int NumEltsPerLane = NumElts / NumLanes;
+ int HalfEltsPerLane = NumEltsPerLane / 2;
+
+ DemandedLHS = APInt::getNullValue(NumElts);
+ DemandedRHS = APInt::getNullValue(NumElts);
+
+ // Map DemandedElts to the horizontal operands.
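+  // Each result element combines a pair of adjacent source elements: the low
+  // half of each lane reads from LHS, the high half from RHS.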
+ for (int Idx = 0; Idx != NumElts; ++Idx) {
+ if (!DemandedElts[Idx])
+ continue;
+ int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
+ int LocalIdx = Idx % NumEltsPerLane;
+ if (LocalIdx < HalfEltsPerLane) {
+ DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
+ DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
+ } else {
+ LocalIdx -= HalfEltsPerLane;
+ DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
+ DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
+ }
+ }
+}
+
+/// Calculates the shuffle mask corresponding to the target-specific opcode.
+/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
+/// operands in \p Ops, and returns true.
+/// Sets \p IsUnary to true if only one source is used. Note that this will set
+/// IsUnary for shuffles which use a single input multiple times, and in those
+/// cases it will adjust the mask to only have indices within that single input.
+/// It is an error to call this with non-empty Mask/Ops vectors.
+static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
+ SmallVectorImpl<SDValue> &Ops,
+ SmallVectorImpl<int> &Mask, bool &IsUnary) {
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned MaskEltSize = VT.getScalarSizeInBits();
+ SmallVector<uint64_t, 32> RawMask;
+ APInt RawUndefs;
+ uint64_t ImmN;
+
+ assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
+ assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
+
+ IsUnary = false;
+ bool IsFakeUnary = false;
+ switch (N->getOpcode()) {
+ case X86ISD::BLENDI:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeBLENDMask(NumElems, ImmN, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ break;
+ case X86ISD::SHUFP:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ break;
+ case X86ISD::INSERTPS:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeINSERTPSMask(ImmN, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ break;
+ case X86ISD::EXTRQI:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ if (isa<ConstantSDNode>(N->getOperand(1)) &&
+ isa<ConstantSDNode>(N->getOperand(2))) {
+ int BitLen = N->getConstantOperandVal(1);
+ int BitIdx = N->getConstantOperandVal(2);
+ DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
+ IsUnary = true;
+ }
+ break;
+ case X86ISD::INSERTQI:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
+ if (isa<ConstantSDNode>(N->getOperand(2)) &&
+ isa<ConstantSDNode>(N->getOperand(3))) {
+ int BitLen = N->getConstantOperandVal(2);
+ int BitIdx = N->getConstantOperandVal(3);
+ DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ }
+ break;
+ case X86ISD::UNPCKH:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
+ DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ break;
+ case X86ISD::UNPCKL:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
+ DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ break;
+ case X86ISD::MOVHLPS:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
+ DecodeMOVHLPSMask(NumElems, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ break;
+ case X86ISD::MOVLHPS:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
+ DecodeMOVLHPSMask(NumElems, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ break;
+ case X86ISD::VALIGN:
+ assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
+ "Only 32-bit and 64-bit elements are supported!");
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeVALIGNMask(NumElems, ImmN, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
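+    // The decoded indices treat operand 1 as the low half of the
+    // concatenation, so push the operands in reverse order.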
+ Ops.push_back(N->getOperand(1));
+ Ops.push_back(N->getOperand(0));
+ break;
+ case X86ISD::PALIGNR:
+ assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePALIGNRMask(NumElems, ImmN, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ Ops.push_back(N->getOperand(1));
+ Ops.push_back(N->getOperand(0));
+ break;
+ case X86ISD::VSHLDQ:
+ assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePSLLDQMask(NumElems, ImmN, Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::VSRLDQ:
+ assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePSRLDQMask(NumElems, ImmN, Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::PSHUFD:
+ case X86ISD::VPERMILPI:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::PSHUFHW:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePSHUFHWMask(NumElems, ImmN, Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::PSHUFLW:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePSHUFLWMask(NumElems, ImmN, Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::VZEXT_MOVL:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ DecodeZeroMoveLowMask(NumElems, Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::VBROADCAST:
+    // We only decode broadcasts of same-sized vectors; peeking through to
+    // extracted subvectors is likely to cause hasOneUse issues with
+    // SimplifyDemandedBits etc.
+ if (N->getOperand(0).getValueType() == VT) {
+ DecodeVectorBroadcast(NumElems, Mask);
+ IsUnary = true;
+ break;
+ }
+ return false;
+ case X86ISD::VPERMILPV: {
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ IsUnary = true;
+ SDValue MaskNode = N->getOperand(1);
+ if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
+ RawUndefs)) {
+ DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
+ break;
+ }
+ return false;
+ }
+ case X86ISD::PSHUFB: {
+ assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
+ IsUnary = true;
+ SDValue MaskNode = N->getOperand(1);
+ if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
+ DecodePSHUFBMask(RawMask, RawUndefs, Mask);
+ break;
+ }
+ return false;
+ }
+ case X86ISD::VPERMI:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeVPERMMask(NumElems, ImmN, Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::MOVSS:
+ case X86ISD::MOVSD:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
+ DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
+ break;
+ case X86ISD::VPERM2X128:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ break;
+ case X86ISD::SHUF128:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ break;
+ case X86ISD::MOVSLDUP:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ DecodeMOVSLDUPMask(NumElems, Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::MOVSHDUP:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ DecodeMOVSHDUPMask(NumElems, Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::MOVDDUP:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ DecodeMOVDDUPMask(NumElems, Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::VPERMIL2: {
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ SDValue MaskNode = N->getOperand(2);
+ SDValue CtrlNode = N->getOperand(3);
+ if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
+ unsigned CtrlImm = CtrlOp->getZExtValue();
+ if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
+ RawUndefs)) {
+ DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
+ Mask);
+ break;
+ }
+ }
+ return false;
+ }
+ case X86ISD::VPPERM: {
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ SDValue MaskNode = N->getOperand(2);
+ if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
+ DecodeVPPERMMask(RawMask, RawUndefs, Mask);
+ break;
+ }
+ return false;
+ }
+ case X86ISD::VPERMV: {
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
+ IsUnary = true;
+ // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
+ Ops.push_back(N->getOperand(1));
+ SDValue MaskNode = N->getOperand(0);
+ if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
+ RawUndefs)) {
+ DecodeVPERMVMask(RawMask, RawUndefs, Mask);
+ break;
+ }
+ return false;
+ }
+ case X86ISD::VPERMV3: {
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
+ // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
+ Ops.push_back(N->getOperand(0));
+ Ops.push_back(N->getOperand(2));
+ SDValue MaskNode = N->getOperand(1);
+ if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
+ RawUndefs)) {
+ DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
+ break;
+ }
+ return false;
+ }
+ default: llvm_unreachable("unknown target shuffle node");
+ }
+
+ // Empty mask indicates the decode failed.
+ if (Mask.empty())
+ return false;
+
+ // Check if we're getting a shuffle mask with zero'd elements.
+ if (!AllowSentinelZero && isAnyZero(Mask))
+ return false;
+
+ // If we have a fake unary shuffle, the shuffle mask is spread across two
+ // inputs that are actually the same node. Re-map the mask to always point
+ // into the first input.
+ if (IsFakeUnary)
+ for (int &M : Mask)
+ if (M >= (int)Mask.size())
+ M -= Mask.size();
+
+ // If we didn't already add operands in the opcode-specific code, default to
+ // adding 1 or 2 operands starting at 0.
+ if (Ops.empty()) {
+ Ops.push_back(N->getOperand(0));
+ if (!IsUnary || IsFakeUnary)
+ Ops.push_back(N->getOperand(1));
+ }
+
+ return true;
+}
+
+/// Compute whether each element of a shuffle is zeroable.
+///
+/// A "zeroable" vector shuffle element is one which can be lowered to zero.
+/// Either it is an undef element in the shuffle mask, the element of the input
+/// referenced is undef, or the element of the input referenced is known to be
+/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
+/// as many lanes with this technique as possible to simplify the remaining
+/// shuffle.
+static void computeZeroableShuffleElements(ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ APInt &KnownUndef, APInt &KnownZero) {
+ int Size = Mask.size();
+ KnownUndef = KnownZero = APInt::getNullValue(Size);
+
+ V1 = peekThroughBitcasts(V1);
+ V2 = peekThroughBitcasts(V2);
+
+ bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+ int VectorSizeInBits = V1.getValueSizeInBits();
+ int ScalarSizeInBits = VectorSizeInBits / Size;
+ assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
+
+ for (int i = 0; i < Size; ++i) {
+ int M = Mask[i];
+ // Handle the easy cases.
+ if (M < 0) {
+ KnownUndef.setBit(i);
+ continue;
+ }
+ if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
+ KnownZero.setBit(i);
+ continue;
+ }
+
+ // Determine shuffle input and normalize the mask.
+ SDValue V = M < Size ? V1 : V2;
+ M %= Size;
+
+ // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
+ if (V.getOpcode() != ISD::BUILD_VECTOR)
+ continue;
+
+    // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
+ // the (larger) source element must be UNDEF/ZERO.
+ if ((Size % V.getNumOperands()) == 0) {
+ int Scale = Size / V->getNumOperands();
+ SDValue Op = V.getOperand(M / Scale);
+ if (Op.isUndef())
+ KnownUndef.setBit(i);
+ if (X86::isZeroNode(Op))
+ KnownZero.setBit(i);
+ else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
+ APInt Val = Cst->getAPIntValue();
+ Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
+ if (Val == 0)
+ KnownZero.setBit(i);
+ } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
+ APInt Val = Cst->getValueAPF().bitcastToAPInt();
+ Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
+ if (Val == 0)
+ KnownZero.setBit(i);
+ }
+ continue;
+ }
+
+    // If the BUILD_VECTOR has more elements, then all the (smaller) source
+ // elements must be UNDEF or ZERO.
+ if ((V.getNumOperands() % Size) == 0) {
+ int Scale = V->getNumOperands() / Size;
+ bool AllUndef = true;
+ bool AllZero = true;
+ for (int j = 0; j < Scale; ++j) {
+ SDValue Op = V.getOperand((M * Scale) + j);
+ AllUndef &= Op.isUndef();
+ AllZero &= X86::isZeroNode(Op);
+ }
+ if (AllUndef)
+ KnownUndef.setBit(i);
+ if (AllZero)
+ KnownZero.setBit(i);
+ continue;
+ }
+ }
+}
+
+/// Decode a target shuffle mask and inputs and see if any values are
+/// known to be undef or zero from their inputs.
+/// Returns true if the target shuffle mask was decoded.
+/// FIXME: Merge this with computeZeroableShuffleElements?
+static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<SDValue> &Ops,
+ APInt &KnownUndef, APInt &KnownZero) {
+ bool IsUnary;
+ if (!isTargetShuffle(N.getOpcode()))
+ return false;
+
+ MVT VT = N.getSimpleValueType();
+ if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
+ return false;
+
+ int Size = Mask.size();
+ SDValue V1 = Ops[0];
+ SDValue V2 = IsUnary ? V1 : Ops[1];
+ KnownUndef = KnownZero = APInt::getNullValue(Size);
+
+ V1 = peekThroughBitcasts(V1);
+ V2 = peekThroughBitcasts(V2);
+
+ assert((VT.getSizeInBits() % Size) == 0 &&
+ "Illegal split of shuffle value type");
+ unsigned EltSizeInBits = VT.getSizeInBits() / Size;
+
+ // Extract known constant input data.
+ APInt UndefSrcElts[2];
+ SmallVector<APInt, 32> SrcEltBits[2];
+ bool IsSrcConstant[2] = {
+ getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
+ SrcEltBits[0], true, false),
+ getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
+ SrcEltBits[1], true, false)};
+
+ for (int i = 0; i < Size; ++i) {
+ int M = Mask[i];
+
+ // Already decoded as SM_SentinelZero / SM_SentinelUndef.
+ if (M < 0) {
+ assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
+ if (SM_SentinelUndef == M)
+ KnownUndef.setBit(i);
+ if (SM_SentinelZero == M)
+ KnownZero.setBit(i);
+ continue;
+ }
+
+ // Determine shuffle input and normalize the mask.
+ unsigned SrcIdx = M / Size;
+ SDValue V = M < Size ? V1 : V2;
+ M %= Size;
+
+ // We are referencing an UNDEF input.
+ if (V.isUndef()) {
+ KnownUndef.setBit(i);
+ continue;
+ }
+
+ // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
+ // TODO: We currently only set UNDEF for integer types - floats use the same
+ // registers as vectors and many of the scalar folded loads rely on the
+ // SCALAR_TO_VECTOR pattern.
+ if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ (Size % V.getValueType().getVectorNumElements()) == 0) {
+ int Scale = Size / V.getValueType().getVectorNumElements();
+ int Idx = M / Scale;
+ if (Idx != 0 && !VT.isFloatingPoint())
+ KnownUndef.setBit(i);
+ else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
+ KnownZero.setBit(i);
+ continue;
+ }
+
+ // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
+ // base vectors.
+ if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
+ SDValue Vec = V.getOperand(0);
+ int NumVecElts = Vec.getValueType().getVectorNumElements();
+ if (Vec.isUndef() && Size == NumVecElts) {
+ int Idx = V.getConstantOperandVal(2);
+ int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
+ if (M < Idx || (Idx + NumSubElts) <= M)
+ KnownUndef.setBit(i);
+ }
+ continue;
+ }
+
+ // Attempt to extract from the source's constant bits.
+ if (IsSrcConstant[SrcIdx]) {
+ if (UndefSrcElts[SrcIdx][M])
+ KnownUndef.setBit(i);
+ else if (SrcEltBits[SrcIdx][M] == 0)
+ KnownZero.setBit(i);
+ }
+ }
+
+ assert(VT.getVectorNumElements() == (unsigned)Size &&
+ "Different mask size from vector size!");
+ return true;
+}
+
+// Replace target shuffle mask elements with known undef/zero sentinels.
+static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
+ const APInt &KnownUndef,
+ const APInt &KnownZero,
+                                              bool ResolveKnownZeros = true) {
+ unsigned NumElts = Mask.size();
+ assert(KnownUndef.getBitWidth() == NumElts &&
+ KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (KnownUndef[i])
+ Mask[i] = SM_SentinelUndef;
+ else if (ResolveKnownZeros && KnownZero[i])
+ Mask[i] = SM_SentinelZero;
+ }
+}
+
+// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
+static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
+ APInt &KnownUndef,
+ APInt &KnownZero) {
+ unsigned NumElts = Mask.size();
+ KnownUndef = KnownZero = APInt::getNullValue(NumElts);
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int M = Mask[i];
+ if (SM_SentinelUndef == M)
+ KnownUndef.setBit(i);
+ if (SM_SentinelZero == M)
+ KnownZero.setBit(i);
+ }
+}
+
+// Forward declaration (for getFauxShuffleMask recursive check).
+// TODO: Use DemandedElts variant.
+static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
+ SmallVectorImpl<int> &Mask,
+ const SelectionDAG &DAG, unsigned Depth,
+ bool ResolveKnownElts);
+
+// Attempt to decode ops that could be represented as a shuffle mask.
+// The decoded shuffle mask may contain a different number of elements than
+// the destination value type.
+static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
+ SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<SDValue> &Ops,
+ const SelectionDAG &DAG, unsigned Depth,
+ bool ResolveKnownElts) {
+ Mask.clear();
+ Ops.clear();
+
+ MVT VT = N.getSimpleValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumSizeInBits = VT.getSizeInBits();
+ unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+ if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
+ return false;
+ assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
+ unsigned NumSizeInBytes = NumSizeInBits / 8;
+ unsigned NumBytesPerElt = NumBitsPerElt / 8;
+
+ unsigned Opcode = N.getOpcode();
+ switch (Opcode) {
+ case ISD::VECTOR_SHUFFLE: {
+ // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
+ ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
+ if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
+ Mask.append(ShuffleMask.begin(), ShuffleMask.end());
+ Ops.push_back(N.getOperand(0));
+ Ops.push_back(N.getOperand(1));
+ return true;
+ }
+ return false;
+ }
+ case ISD::AND:
+ case X86ISD::ANDNP: {
+ // Attempt to decode as a per-byte mask.
+ APInt UndefElts;
+ SmallVector<APInt, 32> EltBits;
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+ bool IsAndN = (X86ISD::ANDNP == Opcode);
+ uint64_t ZeroMask = IsAndN ? 255 : 0;
+ if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
+ return false;
+ for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
+ if (UndefElts[i]) {
+ Mask.push_back(SM_SentinelUndef);
+ continue;
+ }
+ const APInt &ByteBits = EltBits[i];
+ if (ByteBits != 0 && ByteBits != 255)
+ return false;
+ Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
+ }
+ Ops.push_back(IsAndN ? N1 : N0);
+ return true;
+ }
+ case ISD::OR: {
+ // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
+ // is a valid shuffle index.
+ SDValue N0 = peekThroughBitcasts(N.getOperand(0));
+ SDValue N1 = peekThroughBitcasts(N.getOperand(1));
+ if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
+ return false;
+ SmallVector<int, 64> SrcMask0, SrcMask1;
+ SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
+ if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
+ true) ||
+ !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
+ true))
+ return false;
+
+ size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
+ SmallVector<int, 64> Mask0, Mask1;
+ narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
+ narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
+ for (int i = 0; i != (int)MaskSize; ++i) {
+ if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
+ Mask.push_back(SM_SentinelUndef);
+ else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
+ Mask.push_back(SM_SentinelZero);
+ else if (Mask1[i] == SM_SentinelZero)
+ Mask.push_back(i);
+ else if (Mask0[i] == SM_SentinelZero)
+ Mask.push_back(i + MaskSize);
+ else
+ return false;
+ }
+ Ops.push_back(N0);
+ Ops.push_back(N1);
+ return true;
+ }
+ case ISD::INSERT_SUBVECTOR: {
+ SDValue Src = N.getOperand(0);
+ SDValue Sub = N.getOperand(1);
+ EVT SubVT = Sub.getValueType();
+ unsigned NumSubElts = SubVT.getVectorNumElements();
+ if (!N->isOnlyUserOf(Sub.getNode()))
+ return false;
+ uint64_t InsertIdx = N.getConstantOperandVal(2);
+ // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
+ if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Sub.getOperand(0).getValueType() == VT) {
+ uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
+ for (int i = 0; i != (int)NumElts; ++i)
+ Mask.push_back(i);
+ for (int i = 0; i != (int)NumSubElts; ++i)
+ Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
+ Ops.push_back(Src);
+ Ops.push_back(Sub.getOperand(0));
+ return true;
+ }
+ // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
+ SmallVector<int, 64> SubMask;
+ SmallVector<SDValue, 2> SubInputs;
+ if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
+ SubMask, DAG, Depth + 1, ResolveKnownElts))
+ return false;
+
+ // Subvector shuffle inputs must not be larger than the subvector.
+ if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
+ return SubVT.getFixedSizeInBits() <
+ SubInput.getValueSizeInBits().getFixedSize();
+ }))
+ return false;
+
+ if (SubMask.size() != NumSubElts) {
+ assert(((SubMask.size() % NumSubElts) == 0 ||
+ (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
+ if ((NumSubElts % SubMask.size()) == 0) {
+ int Scale = NumSubElts / SubMask.size();
+ SmallVector<int,64> ScaledSubMask;
+ narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
+ SubMask = ScaledSubMask;
+ } else {
+ int Scale = SubMask.size() / NumSubElts;
+ NumSubElts = SubMask.size();
+ NumElts *= Scale;
+ InsertIdx *= Scale;
+ }
+ }
+ Ops.push_back(Src);
+ Ops.append(SubInputs.begin(), SubInputs.end());
+ if (ISD::isBuildVectorAllZeros(Src.getNode()))
+ Mask.append(NumElts, SM_SentinelZero);
+ else
+ for (int i = 0; i != (int)NumElts; ++i)
+ Mask.push_back(i);
+ for (int i = 0; i != (int)NumSubElts; ++i) {
+ int M = SubMask[i];
+ if (0 <= M) {
+ int InputIdx = M / NumSubElts;
+ M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
+ }
+ Mask[i + InsertIdx] = M;
+ }
+ return true;
+ }
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW:
+ case ISD::SCALAR_TO_VECTOR:
+ case ISD::INSERT_VECTOR_ELT: {
+    // Match against an insert_vector_elt/scalar_to_vector of an extract from
+    // a vector, for matching src/dst vector types.
+ SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
+
+ unsigned DstIdx = 0;
+ if (Opcode != ISD::SCALAR_TO_VECTOR) {
+ // Check we have an in-range constant insertion index.
+ if (!isa<ConstantSDNode>(N.getOperand(2)) ||
+ N.getConstantOperandAPInt(2).uge(NumElts))
+ return false;
+ DstIdx = N.getConstantOperandVal(2);
+
+ // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
+ if (X86::isZeroNode(Scl)) {
+ Ops.push_back(N.getOperand(0));
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
+ return true;
+ }
+ }
+
+ // Peek through trunc/aext/zext.
+ // TODO: aext shouldn't require SM_SentinelZero padding.
+ // TODO: handle shift of scalars.
+ unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
+ while (Scl.getOpcode() == ISD::TRUNCATE ||
+ Scl.getOpcode() == ISD::ANY_EXTEND ||
+ Scl.getOpcode() == ISD::ZERO_EXTEND) {
+ Scl = Scl.getOperand(0);
+ MinBitsPerElt =
+ std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
+ }
+ if ((MinBitsPerElt % 8) != 0)
+ return false;
+
+ // Attempt to find the source vector the scalar was extracted from.
+ SDValue SrcExtract;
+ if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ Scl.getOpcode() == X86ISD::PEXTRW ||
+ Scl.getOpcode() == X86ISD::PEXTRB) &&
+ Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
+ SrcExtract = Scl;
+ }
+ if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
+ return false;
+
+ SDValue SrcVec = SrcExtract.getOperand(0);
+ EVT SrcVT = SrcVec.getValueType();
+ if (!SrcVT.getScalarType().isByteSized())
+ return false;
+ unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
+ unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
+ unsigned DstByte = DstIdx * NumBytesPerElt;
+ MinBitsPerElt =
+ std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
+
+ // Create 'identity' byte level shuffle mask and then add inserted bytes.
+ if (Opcode == ISD::SCALAR_TO_VECTOR) {
+ Ops.push_back(SrcVec);
+ Mask.append(NumSizeInBytes, SM_SentinelUndef);
+ } else {
+ Ops.push_back(SrcVec);
+ Ops.push_back(N.getOperand(0));
+ for (int i = 0; i != (int)NumSizeInBytes; ++i)
+ Mask.push_back(NumSizeInBytes + i);
+ }
+
+ unsigned MinBytesPerElts = MinBitsPerElt / 8;
+ MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
+ for (unsigned i = 0; i != MinBytesPerElts; ++i)
+ Mask[DstByte + i] = SrcByte + i;
+ for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
+ Mask[DstByte + i] = SM_SentinelZero;
+ return true;
+ }
+ case X86ISD::PACKSS:
+ case X86ISD::PACKUS: {
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+ assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
+ N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
+ "Unexpected input value type");
+
+ APInt EltsLHS, EltsRHS;
+ getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
+
+    // If we know input saturation won't happen (or we don't care about
+    // particular lanes), we can treat this as a truncation shuffle.
+ bool Offset0 = false, Offset1 = false;
+ if (Opcode == X86ISD::PACKSS) {
+ if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
+ DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
+ (!(N1.isUndef() || EltsRHS.isNullValue()) &&
+ DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
+ return false;
+ // We can't easily fold ASHR into a shuffle, but if it was feeding a
+ // PACKSS then it was likely being used for sign-extension for a
+ // truncation, so just peek through and adjust the mask accordingly.
+ if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
+ N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
+ Offset0 = true;
+ N0 = N0.getOperand(0);
+ }
+ if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
+ N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
+ Offset1 = true;
+ N1 = N1.getOperand(0);
+ }
+ } else {
+ APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
+ if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
+ !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
+ (!(N1.isUndef() || EltsRHS.isNullValue()) &&
+ !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
+ return false;
+ }
+
+ bool IsUnary = (N0 == N1);
+
+ Ops.push_back(N0);
+ if (!IsUnary)
+ Ops.push_back(N1);
+
+ createPackShuffleMask(VT, Mask, IsUnary);
+
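+    // Having peeked through a VSRAI by NumBitsPerElt, select the high (odd)
+    // half of each source element by bumping the mask indices.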
+ if (Offset0 || Offset1) {
+ for (int &M : Mask)
+ if ((Offset0 && isInRange(M, 0, NumElts)) ||
+ (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
+ ++M;
+ }
+ return true;
+ }
+ case X86ISD::VTRUNC: {
+ SDValue Src = N.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ // Truncated source must be a simple vector.
+ if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
+ (SrcVT.getScalarSizeInBits() % 8) != 0)
+ return false;
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
+ unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
+ assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
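+    // Each truncated result element takes the low sub-element of its wider
+    // source element; the remaining upper result elements are zero.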
+ for (unsigned i = 0; i != NumSrcElts; ++i)
+ Mask.push_back(i * Scale);
+ Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
+ Ops.push_back(Src);
+ return true;
+ }
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI: {
+ uint64_t ShiftVal = N.getConstantOperandVal(1);
+ // Out of range bit shifts are guaranteed to be zero.
+ if (NumBitsPerElt <= ShiftVal) {
+ Mask.append(NumElts, SM_SentinelZero);
+ return true;
+ }
+
+ // We can only decode 'whole byte' bit shifts as shuffles.
+ if ((ShiftVal % 8) != 0)
+ break;
+
+ uint64_t ByteShift = ShiftVal / 8;
+ Ops.push_back(N.getOperand(0));
+
+ // Clear mask to all zeros and insert the shifted byte indices.
+ Mask.append(NumSizeInBytes, SM_SentinelZero);
+
+ if (X86ISD::VSHLI == Opcode) {
+ for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
+ for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
+ Mask[i + j] = i + j - ByteShift;
+ } else {
+ for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
+ for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
+ Mask[i + j - ByteShift] = i + j;
+ }
+ return true;
+ }
+ case X86ISD::VROTLI:
+ case X86ISD::VROTRI: {
+ // We can only decode 'whole byte' bit rotates as shuffles.
+ uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
+ if ((RotateVal % 8) != 0)
+ return false;
+ Ops.push_back(N.getOperand(0));
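+    // Build a per-element byte rotation mask. A left rotate by R bytes is
+    // the same byte permutation as a right rotate by (NumBytesPerElt - R).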
+ int Offset = RotateVal / 8;
+ Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
+ for (int i = 0; i != (int)NumElts; ++i) {
+ int BaseIdx = i * NumBytesPerElt;
+ for (int j = 0; j != (int)NumBytesPerElt; ++j) {
+ Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
+ }
+ }
+ return true;
+ }
+ case X86ISD::VBROADCAST: {
+ SDValue Src = N.getOperand(0);
+ if (!Src.getSimpleValueType().isVector())
+ return false;
+ Ops.push_back(Src);
+ Mask.append(NumElts, 0);
+ return true;
+ }
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ case ISD::ANY_EXTEND_VECTOR_INREG: {
+ SDValue Src = N.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ // Extended source must be a simple vector.
+ if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
+ (SrcVT.getScalarSizeInBits() % 8) != 0)
+ return false;
+
+ bool IsAnyExtend =
+ (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
+ DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
+ IsAnyExtend, Mask);
+ Ops.push_back(Src);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
+static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
+ SmallVectorImpl<int> &Mask) {
+ int MaskWidth = Mask.size();
+ SmallVector<SDValue, 16> UsedInputs;
+ for (int i = 0, e = Inputs.size(); i < e; ++i) {
+ int lo = UsedInputs.size() * MaskWidth;
+ int hi = lo + MaskWidth;
+
+ // Strip UNDEF input usage.
+ if (Inputs[i].isUndef())
+ for (int &M : Mask)
+ if ((lo <= M) && (M < hi))
+ M = SM_SentinelUndef;
+
+ // Check for unused inputs.
+ if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
+ for (int &M : Mask)
+ if (lo <= M)
+ M -= MaskWidth;
+ continue;
+ }
+
+ // Check for repeated inputs.
+ bool IsRepeat = false;
+ for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
+ if (UsedInputs[j] != Inputs[i])
+ continue;
+ for (int &M : Mask)
+ if (lo <= M)
+ M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
+ IsRepeat = true;
+ break;
+ }
+ if (IsRepeat)
+ continue;
+
+ UsedInputs.push_back(Inputs[i]);
+ }
+ Inputs = UsedInputs;
+}
+
+/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
+/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
+/// Returns true if the target shuffle mask was decoded.
+static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
+ SmallVectorImpl<SDValue> &Inputs,
+ SmallVectorImpl<int> &Mask,
+ APInt &KnownUndef, APInt &KnownZero,
+ const SelectionDAG &DAG, unsigned Depth,
+ bool ResolveKnownElts) {
+ EVT VT = Op.getValueType();
+ if (!VT.isSimple() || !VT.isVector())
+ return false;
+
+ if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
+ if (ResolveKnownElts)
+ resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
+ return true;
+ }
+ if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
+ ResolveKnownElts)) {
+ resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
+ return true;
+ }
+ return false;
+}
+
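+// As above, but demanding all elements and discarding the computed
+// known-undef/known-zero masks.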
+static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
+ SmallVectorImpl<int> &Mask,
+ const SelectionDAG &DAG, unsigned Depth = 0,
+ bool ResolveKnownElts = true) {
+ EVT VT = Op.getValueType();
+ if (!VT.isSimple() || !VT.isVector())
+ return false;
+
+ APInt KnownUndef, KnownZero;
+ unsigned NumElts = Op.getValueType().getVectorNumElements();
+ APInt DemandedElts = APInt::getAllOnesValue(NumElts);
+ return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
+ KnownZero, DAG, Depth, ResolveKnownElts);
+}
+
+/// Returns the scalar element that will make up the i'th
+/// element of the result of the vector shuffle.
+static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
+ SelectionDAG &DAG, unsigned Depth) {
+ if (Depth >= SelectionDAG::MaxRecursionDepth)
+ return SDValue(); // Limit search depth.
+
+ EVT VT = Op.getValueType();
+ unsigned Opcode = Op.getOpcode();
+ unsigned NumElems = VT.getVectorNumElements();
+
+ // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
+ if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
+ int Elt = SV->getMaskElt(Index);
+
+ if (Elt < 0)
+ return DAG.getUNDEF(VT.getVectorElementType());
+
+ SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
+ return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
+ }
+
+ // Recurse into target specific vector shuffles to find scalars.
+ if (isTargetShuffle(Opcode)) {
+ MVT ShufVT = VT.getSimpleVT();
+ MVT ShufSVT = ShufVT.getVectorElementType();
+ int NumElems = (int)ShufVT.getVectorNumElements();
+ SmallVector<int, 16> ShuffleMask;
+ SmallVector<SDValue, 16> ShuffleOps;
+ bool IsUnary;
+
+ if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
+ ShuffleMask, IsUnary))
+ return SDValue();
+
+ int Elt = ShuffleMask[Index];
+ if (Elt == SM_SentinelZero)
+ return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
+ : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
+ if (Elt == SM_SentinelUndef)
+ return DAG.getUNDEF(ShufSVT);
+
+ assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
+ SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
+ return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
+ }
+
+ // Recurse into insert_subvector base/sub vector to find scalars.
+ if (Opcode == ISD::INSERT_SUBVECTOR) {
+ SDValue Vec = Op.getOperand(0);
+ SDValue Sub = Op.getOperand(1);
+ uint64_t SubIdx = Op.getConstantOperandVal(2);
+ unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
+
+ if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
+ return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
+ return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
+ }
+
+ // Recurse into concat_vectors sub vector to find scalars.
+ if (Opcode == ISD::CONCAT_VECTORS) {
+ EVT SubVT = Op.getOperand(0).getValueType();
+ unsigned NumSubElts = SubVT.getVectorNumElements();
+ uint64_t SubIdx = Index / NumSubElts;
+ uint64_t SubElt = Index % NumSubElts;
+ return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
+ }
+
+ // Recurse into extract_subvector src vector to find scalars.
+ if (Opcode == ISD::EXTRACT_SUBVECTOR) {
+ SDValue Src = Op.getOperand(0);
+ uint64_t SrcIdx = Op.getConstantOperandVal(1);
+ return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
+ }
+
+ // We only peek through bitcasts of the same vector width.
+ if (Opcode == ISD::BITCAST) {
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
+ return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
+ return SDValue();
+ }
+
+ // Actual nodes that may contain scalar elements
+
+ // For insert_vector_elt - either return the index matching scalar or recurse
+ // into the base vector.
+ if (Opcode == ISD::INSERT_VECTOR_ELT &&
+ isa<ConstantSDNode>(Op.getOperand(2))) {
+ if (Op.getConstantOperandAPInt(2) == Index)
+ return Op.getOperand(1);
+ return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
+ }
+
+ if (Opcode == ISD::SCALAR_TO_VECTOR)
+ return (Index == 0) ? Op.getOperand(0)
+ : DAG.getUNDEF(VT.getVectorElementType());
+
+ if (Opcode == ISD::BUILD_VECTOR)
+ return Op.getOperand(Index);
+
+ return SDValue();
+}
+
+// Use PINSRB/PINSRW/PINSRD to create a build vector.
+static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
+ unsigned NumNonZero, unsigned NumZero,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT VT = Op.getSimpleValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+ assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
+ ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
+ "Illegal vector insertion");
+
+ SDLoc dl(Op);
+ SDValue V;
+ bool First = true;
+
+ for (unsigned i = 0; i < NumElts; ++i) {
+ bool IsNonZero = NonZeroMask[i];
+ if (!IsNonZero)
+ continue;
+
+    // If the build vector contains zeros or our first insertion is not the
+    // first index, then insert into a zero vector to break any register
+    // dependency; else use SCALAR_TO_VECTOR.
+ if (First) {
+ First = false;
+ if (NumZero || 0 != i)
+ V = getZeroVector(VT, Subtarget, DAG, dl);
+ else {
+ assert(0 == i && "Expected insertion into zero-index");
+ V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
+ V = DAG.getBitcast(VT, V);
+ continue;
+ }
+ }
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
+ DAG.getIntPtrConstant(i, dl));
+ }
+
+ return V;
+}
+
+/// Custom lower build_vector of v16i8.
+static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
+ unsigned NumNonZero, unsigned NumZero,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (NumNonZero > 8 && !Subtarget.hasSSE41())
+ return SDValue();
+
+ // SSE4.1 - use PINSRB to insert each byte directly.
+ if (Subtarget.hasSSE41())
+ return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
+ Subtarget);
+
+ SDLoc dl(Op);
+ SDValue V;
+
+ // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
+ for (unsigned i = 0; i < 16; i += 2) {
+ bool ThisIsNonZero = NonZeroMask[i];
+ bool NextIsNonZero = NonZeroMask[i + 1];
+ if (!ThisIsNonZero && !NextIsNonZero)
+ continue;
+
+    // FIXME: Investigate combining the first 4 bytes as an i32 instead.
+ SDValue Elt;
+ if (ThisIsNonZero) {
+ if (NumZero || NextIsNonZero)
+ Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
+ else
+ Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
+ }
+
+ if (NextIsNonZero) {
+ SDValue NextElt = Op.getOperand(i + 1);
+ if (i == 0 && NumZero)
+ NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
+ else
+ NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
+ NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
+ DAG.getConstant(8, dl, MVT::i8));
+ if (ThisIsNonZero)
+ Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
+ else
+ Elt = NextElt;
+ }
+
+ // If our first insertion is not the first index or zeros are needed, then
+ // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
+ // elements undefined).
+ if (!V) {
+ if (i != 0 || NumZero)
+ V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+ else {
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
+ V = DAG.getBitcast(MVT::v8i16, V);
+ continue;
+ }
+ }
+ Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
+ DAG.getIntPtrConstant(i / 2, dl));
+ }
+
+ return DAG.getBitcast(MVT::v16i8, V);
+}
+
+/// Custom lower build_vector of v8i16.
+static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
+ unsigned NumNonZero, unsigned NumZero,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (NumNonZero > 4 && !Subtarget.hasSSE41())
+ return SDValue();
+
+  // Use PINSRW to insert each element directly.
+ return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
+ Subtarget);
+}
+
+/// Custom lower build_vector of v4i32 or v4f32.
+static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // If this is a splat of a pair of elements, use MOVDDUP (unless the target
+ // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
+ // Because we're creating a less complicated build vector here, we may enable
+ // further folding of the MOVDDUP via shuffle transforms.
+ if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
+ Op.getOperand(0) == Op.getOperand(2) &&
+ Op.getOperand(1) == Op.getOperand(3) &&
+ Op.getOperand(0) != Op.getOperand(1)) {
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ // Create a new build vector with the first 2 elements followed by undef
+ // padding, bitcast to v2f64, duplicate, and bitcast back.
+ SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
+ DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
+ SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
+ SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
+ return DAG.getBitcast(VT, Dup);
+ }
+
+ // Find all zeroable elements.
+ std::bitset<4> Zeroable, Undefs;
+ for (int i = 0; i < 4; ++i) {
+ SDValue Elt = Op.getOperand(i);
+ Undefs[i] = Elt.isUndef();
+ Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
+ }
+ assert(Zeroable.size() - Zeroable.count() > 1 &&
+ "We expect at least two non-zero elements!");
+
+ // We only know how to deal with build_vector nodes where elements are either
+ // zeroable or extract_vector_elt with constant index.
+ SDValue FirstNonZero;
+ unsigned FirstNonZeroIdx;
+ for (unsigned i = 0; i < 4; ++i) {
+ if (Zeroable[i])
+ continue;
+ SDValue Elt = Op.getOperand(i);
+ if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(Elt.getOperand(1)))
+ return SDValue();
+ // Make sure that this node is extracting from a 128-bit vector.
+ MVT VT = Elt.getOperand(0).getSimpleValueType();
+ if (!VT.is128BitVector())
+ return SDValue();
+ if (!FirstNonZero.getNode()) {
+ FirstNonZero = Elt;
+ FirstNonZeroIdx = i;
+ }
+ }
+
+ assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
+ SDValue V1 = FirstNonZero.getOperand(0);
+ MVT VT = V1.getSimpleValueType();
+
+ // See if this build_vector can be lowered as a blend with zero.
+ SDValue Elt;
+ unsigned EltMaskIdx, EltIdx;
+ int Mask[4];
+ for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
+ if (Zeroable[EltIdx]) {
+ // The zero vector will be on the right hand side.
+ Mask[EltIdx] = EltIdx+4;
+ continue;
+ }
+
+ Elt = Op->getOperand(EltIdx);
+    // By construction, Elt is an EXTRACT_VECTOR_ELT with constant index.
+ EltMaskIdx = Elt.getConstantOperandVal(1);
+ if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
+ break;
+ Mask[EltIdx] = EltIdx;
+ }
+
+ if (EltIdx == 4) {
+ // Let the shuffle legalizer deal with blend operations.
+ SDValue VZeroOrUndef = (Zeroable == Undefs)
+ ? DAG.getUNDEF(VT)
+ : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
+ if (V1.getSimpleValueType() != VT)
+ V1 = DAG.getBitcast(VT, V1);
+ return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
+ }
+
+  // See if we can lower this build_vector to an INSERTPS.
+ if (!Subtarget.hasSSE41())
+ return SDValue();
+
+ SDValue V2 = Elt.getOperand(0);
+ if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
+ V1 = SDValue();
+
+ bool CanFold = true;
+ for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
+ if (Zeroable[i])
+ continue;
+
+ SDValue Current = Op->getOperand(i);
+ SDValue SrcVector = Current->getOperand(0);
+ if (!V1.getNode())
+ V1 = SrcVector;
+ CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
+ }
+
+ if (!CanFold)
+ return SDValue();
+
+ assert(V1.getNode() && "Expected at least two non-zero elements!");
+ if (V1.getSimpleValueType() != MVT::v4f32)
+ V1 = DAG.getBitcast(MVT::v4f32, V1);
+ if (V2.getSimpleValueType() != MVT::v4f32)
+ V2 = DAG.getBitcast(MVT::v4f32, V2);
+
+ // Ok, we can emit an INSERTPS instruction.
+ unsigned ZMask = Zeroable.to_ulong();
+
+ unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
+ assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+ SDLoc DL(Op);
+ SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+ DAG.getIntPtrConstant(InsertPSMask, DL, true));
+ return DAG.getBitcast(VT, Result);
+}
+
+/// Return a vector logical shift node.
+static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
+ SelectionDAG &DAG, const TargetLowering &TLI,
+ const SDLoc &dl) {
+ assert(VT.is128BitVector() && "Unknown type for VShift");
+ MVT ShVT = MVT::v16i8;
+ unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
+ SrcOp = DAG.getBitcast(ShVT, SrcOp);
+ assert(NumBits % 8 == 0 && "Only support byte sized shifts");
+ SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
+ return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
+}
+
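+// If SrcOp is a scalar load from a (sufficiently aligned) stack slot, widen it
+// into a load of the whole vector-sized slot and splat the originally loaded
+// element with a shuffle.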
+static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
+ SelectionDAG &DAG) {
+
+ // Check if the scalar load can be widened into a vector load. And if
+ // the address is "base + cst" see if the cst can be "absorbed" into
+ // the shuffle mask.
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
+ SDValue Ptr = LD->getBasePtr();
+ if (!ISD::isNormalLoad(LD) || !LD->isSimple())
+ return SDValue();
+ EVT PVT = LD->getValueType(0);
+ if (PVT != MVT::i32 && PVT != MVT::f32)
+ return SDValue();
+
+ int FI = -1;
+ int64_t Offset = 0;
+ if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
+ FI = FINode->getIndex();
+ Offset = 0;
+ } else if (DAG.isBaseWithConstantOffset(Ptr) &&
+ isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
+ FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+ Offset = Ptr.getConstantOperandVal(1);
+ Ptr = Ptr.getOperand(0);
+ } else {
+ return SDValue();
+ }
+
+    // FIXME: 256-bit vector instructions don't require strict alignment;
+    // improve this code to support it better.
+ Align RequiredAlign(VT.getSizeInBits() / 8);
+ SDValue Chain = LD->getChain();
+ // Make sure the stack object alignment is at least 16 or 32.
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
+ if (!InferredAlign || *InferredAlign < RequiredAlign) {
+ if (MFI.isFixedObjectIndex(FI)) {
+        // Can't change the alignment. FIXME: It's possible to compute the
+        // exact stack offset and reference FI + adjusted offset instead,
+        // if someone *really* cares about this.
+ return SDValue();
+ } else {
+ MFI.setObjectAlignment(FI, RequiredAlign);
+ }
+ }
+
+    // (Offset % 16 or 32) must be a multiple of 4. The address is then
+    // Ptr + (Offset & ~15).
+ if (Offset < 0)
+ return SDValue();
+ if ((Offset % RequiredAlign.value()) & 3)
+ return SDValue();
+ int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
+ if (StartOffset) {
+ SDLoc DL(Ptr);
+ Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+ DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
+ }
+
+ int EltNo = (Offset - StartOffset) >> 2;
+ unsigned NumElems = VT.getVectorNumElements();
+
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
+ SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
+ LD->getPointerInfo().getWithOffset(StartOffset));
+
+ SmallVector<int, 8> Mask(NumElems, EltNo);
+
+ return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
+ }
+
+ return SDValue();
+}
+
+// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
+static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
+ if (ISD::isNON_EXTLoad(Elt.getNode())) {
+ auto *BaseLd = cast<LoadSDNode>(Elt);
+ if (!BaseLd->isSimple())
+ return false;
+ Ld = BaseLd;
+ ByteOffset = 0;
+ return true;
+ }
+
+ switch (Elt.getOpcode()) {
+ case ISD::BITCAST:
+ case ISD::TRUNCATE:
+ case ISD::SCALAR_TO_VECTOR:
+ return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
+ case ISD::SRL:
+ if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
+ uint64_t Idx = IdxC->getZExtValue();
+ if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
+ ByteOffset += Idx / 8;
+ return true;
+ }
+ }
+ break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
+ SDValue Src = Elt.getOperand(0);
+ unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
+ unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
+ if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
+ findEltLoadSrc(Src, Ld, ByteOffset)) {
+ uint64_t Idx = IdxC->getZExtValue();
+ ByteOffset += Idx * (SrcSizeInBits / 8);
+ return true;
+ }
+ }
+ break;
+ }
+
+ return false;
+}
+
+/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
+/// elements can be replaced by a single large load which has the same value as
+/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
+///
+/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
+static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
+ const SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ bool isAfterLegalize) {
+ if ((VT.getScalarSizeInBits() % 8) != 0)
+ return SDValue();
+
+ unsigned NumElems = Elts.size();
+
+ int LastLoadedElt = -1;
+ APInt LoadMask = APInt::getNullValue(NumElems);
+ APInt ZeroMask = APInt::getNullValue(NumElems);
+ APInt UndefMask = APInt::getNullValue(NumElems);
+
+ SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
+ SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
+
+ // For each element in the initializer, see if we've found a load, zero or an
+ // undef.
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SDValue Elt = peekThroughBitcasts(Elts[i]);
+ if (!Elt.getNode())
+ return SDValue();
+ if (Elt.isUndef()) {
+ UndefMask.setBit(i);
+ continue;
+ }
+ if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
+ ZeroMask.setBit(i);
+ continue;
+ }
+
+ // Each loaded element must be the correct fractional portion of the
+ // requested vector load.
+ unsigned EltSizeInBits = Elt.getValueSizeInBits();
+ if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
+ return SDValue();
+
+ if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
+ return SDValue();
+ unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
+ if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
+ return SDValue();
+
+ LoadMask.setBit(i);
+ LastLoadedElt = i;
+ }
+ assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
+ LoadMask.countPopulation()) == NumElems &&
+ "Incomplete element masks");
+
+ // Handle Special Cases - all undef or undef/zero.
+ if (UndefMask.countPopulation() == NumElems)
+ return DAG.getUNDEF(VT);
+ if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
+ return VT.isInteger() ? DAG.getConstant(0, DL, VT)
+ : DAG.getConstantFP(0.0, DL, VT);
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ int FirstLoadedElt = LoadMask.countTrailingZeros();
+ SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
+ EVT EltBaseVT = EltBase.getValueType();
+ assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
+ "Register/Memory size mismatch");
+ LoadSDNode *LDBase = Loads[FirstLoadedElt];
+ assert(LDBase && "Did not find base load for merging consecutive loads");
+ unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
+ unsigned BaseSizeInBytes = BaseSizeInBits / 8;
+ int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
+ int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
+ assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
+
+ // TODO: Support offsetting the base load.
+ if (ByteOffsets[FirstLoadedElt] != 0)
+ return SDValue();
+
+ // Check to see if the element's load is consecutive to the base load
+ // or offset from a previous (already checked) load.
+ auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
+ LoadSDNode *Ld = Loads[EltIdx];
+ int64_t ByteOffset = ByteOffsets[EltIdx];
+ if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
+ int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
+ return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
+ Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
+ }
+ return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
+ EltIdx - FirstLoadedElt);
+ };
+
+  // Consecutive loads can contain UNDEFs but not ZERO elements.
+  // Consecutive loads with UNDEF and ZERO elements require an additional
+  // shuffle stage to clear the ZERO elements.
+ bool IsConsecutiveLoad = true;
+ bool IsConsecutiveLoadWithZeros = true;
+ for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
+ if (LoadMask[i]) {
+ if (!CheckConsecutiveLoad(LDBase, i)) {
+ IsConsecutiveLoad = false;
+ IsConsecutiveLoadWithZeros = false;
+ break;
+ }
+ } else if (ZeroMask[i]) {
+ IsConsecutiveLoad = false;
+ }
+ }
+
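+  // Helper to create a single full-width load from LDBase and transfer the
+  // memory dependencies of the original element loads onto it.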
+ auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
+ auto MMOFlags = LDBase->getMemOperand()->getFlags();
+ assert(LDBase->isSimple() &&
+ "Cannot merge volatile or atomic loads.");
+ SDValue NewLd =
+ DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
+ LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
+ MMOFlags);
+ for (auto *LD : Loads)
+ if (LD)
+ DAG.makeEquivalentMemoryOrdering(LD, NewLd);
+ return NewLd;
+ };
+
+ // Check if the base load is entirely dereferenceable.
+ bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
+ VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
+
+ // LOAD - all consecutive load/undefs (must start/end with a load or be
+ // entirely dereferenceable). If we have found an entire vector of loads and
+ // undefs, then return a large load of the entire vector width starting at the
+ // base pointer. If the vector contains zeros, then attempt to shuffle those
+ // elements.
+ if (FirstLoadedElt == 0 &&
+ (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
+ (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
+ if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
+ return SDValue();
+
+ // Don't create 256-bit non-temporal aligned loads without AVX2 as these
+ // will lower to regular temporal loads and use the cache.
+ if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
+ VT.is256BitVector() && !Subtarget.hasInt256())
+ return SDValue();
+
+ if (NumElems == 1)
+ return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
+
+ if (!ZeroMask)
+ return CreateLoad(VT, LDBase);
+
+ // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
+ // vector and a zero vector to clear out the zero elements.
+ if (!isAfterLegalize && VT.isVector()) {
+ unsigned NumMaskElts = VT.getVectorNumElements();
+ if ((NumMaskElts % NumElems) == 0) {
+ unsigned Scale = NumMaskElts / NumElems;
+ SmallVector<int, 4> ClearMask(NumMaskElts, -1);
+ for (unsigned i = 0; i < NumElems; ++i) {
+ if (UndefMask[i])
+ continue;
+ int Offset = ZeroMask[i] ? NumMaskElts : 0;
+ for (unsigned j = 0; j != Scale; ++j)
+ ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
+ }
+ SDValue V = CreateLoad(VT, LDBase);
+ SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
+ : DAG.getConstantFP(0.0, DL, VT);
+ return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
+ }
+ }
+ }
+
+ // If the upper half of a ymm/zmm load is undef then just load the lower half.
+ if (VT.is256BitVector() || VT.is512BitVector()) {
+ unsigned HalfNumElems = NumElems / 2;
+ if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
+ EVT HalfVT =
+ EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
+ SDValue HalfLD =
+ EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
+ DAG, Subtarget, isAfterLegalize);
+ if (HalfLD)
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
+ HalfLD, DAG.getIntPtrConstant(0, DL));
+ }
+ }
+
+ // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
+ if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
+ (LoadSizeInBits == 32 || LoadSizeInBits == 64) &&
+ ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
+ MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
+ : MVT::getIntegerVT(LoadSizeInBits);
+ MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
+ // Allow v4f32 on SSE1 only targets.
+ // FIXME: Add more isel patterns so we can just use VT directly.
+ if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
+ VecVT = MVT::v4f32;
+ if (TLI.isTypeLegal(VecVT)) {
+ SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
+ SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
+ SDValue ResNode = DAG.getMemIntrinsicNode(
+ X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
+ LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
+ for (auto *LD : Loads)
+ if (LD)
+ DAG.makeEquivalentMemoryOrdering(LD, ResNode);
+ return DAG.getBitcast(VT, ResNode);
+ }
+ }
+
+ // BROADCAST - match the smallest possible repetition pattern, load that
+ // scalar/subvector element and then broadcast to the entire vector.
+ if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
+ (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
+ for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
+ unsigned RepeatSize = SubElems * BaseSizeInBits;
+ unsigned ScalarSize = std::min(RepeatSize, 64u);
+ if (!Subtarget.hasAVX2() && ScalarSize < 32)
+ continue;
+
+      // Don't attempt a 1:N subvector broadcast - it should be caught by
+      // combineConcatVectorOps, else it will cause infinite loops.
+ if (RepeatSize > ScalarSize && SubElems == 1)
+ continue;
+
+ bool Match = true;
+ SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
+ for (unsigned i = 0; i != NumElems && Match; ++i) {
+ if (!LoadMask[i])
+ continue;
+ SDValue Elt = peekThroughBitcasts(Elts[i]);
+ if (RepeatedLoads[i % SubElems].isUndef())
+ RepeatedLoads[i % SubElems] = Elt;
+ else
+ Match &= (RepeatedLoads[i % SubElems] == Elt);
+ }
+
+ // We must have loads at both ends of the repetition.
+ Match &= !RepeatedLoads.front().isUndef();
+ Match &= !RepeatedLoads.back().isUndef();
+ if (!Match)
+ continue;
+
+ EVT RepeatVT =
+ VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
+ ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
+ : EVT::getFloatingPointVT(ScalarSize);
+ if (RepeatSize > ScalarSize)
+ RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
+ RepeatSize / ScalarSize);
+ EVT BroadcastVT =
+ EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
+ VT.getSizeInBits() / ScalarSize);
+ if (TLI.isTypeLegal(BroadcastVT)) {
+ if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
+ RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
+ SDValue Broadcast = RepeatLoad;
+ if (RepeatSize > ScalarSize) {
+ while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
+ Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
+ } else {
+ Broadcast =
+ DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
+ }
+ return DAG.getBitcast(VT, Broadcast);
+ }
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+// Combine a vector op (shuffle etc.) that is equal to build_vector load1,
+// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
+// are consecutive, non-overlapping, and in the right order.
+static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ bool isAfterLegalize) {
+ SmallVector<SDValue, 64> Elts;
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+ if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
+ Elts.push_back(Elt);
+ continue;
+ }
+ return SDValue();
+ }
+ assert(Elts.size() == VT.getVectorNumElements());
+ return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
+ isAfterLegalize);
+}
+
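+// Split the SplatBitSize-wide SplatValue into scalar constants of VT's
+// element type and return them as a ConstantVector.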
+static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
+ unsigned SplatBitSize, LLVMContext &C) {
+ unsigned ScalarSize = VT.getScalarSizeInBits();
+ unsigned NumElm = SplatBitSize / ScalarSize;
+
+ SmallVector<Constant *, 32> ConstantVec;
+ for (unsigned i = 0; i < NumElm; i++) {
+ APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
+ Constant *Const;
+ if (VT.isFloatingPoint()) {
+ if (ScalarSize == 32) {
+ Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
+ } else {
+ assert(ScalarSize == 64 && "Unsupported floating point scalar size");
+ Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
+ }
+ } else
+ Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
+ ConstantVec.push_back(Const);
+ }
+ return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
+}
+
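+// Check whether N's uses (looking through bitcasts) allow it to be folded by
+// a target shuffle; the index operands of VPERMV/VPERMV3 can never be folded.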
+static bool isFoldableUseOfShuffle(SDNode *N) {
+ for (auto *U : N->uses()) {
+ unsigned Opc = U->getOpcode();
+ // VPERMV/VPERMV3 shuffles can never fold their index operands.
+ if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
+ return false;
+ if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
+ return false;
+ if (isTargetShuffle(Opc))
+ return true;
+ if (Opc == ISD::BITCAST) // Ignore bitcasts
+ return isFoldableUseOfShuffle(U);
+ if (N->hasOneUse())
+ return true;
+ }
+ return false;
+}
+
+/// Attempt to use the vbroadcast instruction to generate a splat value
+/// from a splat BUILD_VECTOR which uses:
+/// a. A single scalar load, or a constant.
+/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
+///
+/// The VBROADCAST node is returned when a pattern is found,
+/// or SDValue() otherwise.
+static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ // VBROADCAST requires AVX.
+ // TODO: Splats could be generated for non-AVX CPUs using SSE
+ // instructions, but there's less potential gain for only 128-bit vectors.
+ if (!Subtarget.hasAVX())
+ return SDValue();
+
+ MVT VT = BVOp->getSimpleValueType(0);
+ unsigned NumElts = VT.getVectorNumElements();
+ SDLoc dl(BVOp);
+
+ assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
+ "Unsupported vector type for broadcast.");
+
+ // See if the build vector is a repeating sequence of scalars (inc. splat).
+ SDValue Ld;
+ BitVector UndefElements;
+ SmallVector<SDValue, 16> Sequence;
+ if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
+ assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
+ if (Sequence.size() == 1)
+ Ld = Sequence[0];
+ }
+
+ // Attempt to use VBROADCASTM
+ // From this pattern:
+ // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
+ // b. t1 = (build_vector t0 t0)
+ //
+ // Create (VBROADCASTM v2i1 X)
+ if (!Sequence.empty() && Subtarget.hasCDI()) {
+ // If not a splat, are the upper sequence values zeroable?
+ unsigned SeqLen = Sequence.size();
+ bool UpperZeroOrUndef =
+ SeqLen == 1 ||
+ llvm::all_of(makeArrayRef(Sequence).drop_front(), [](SDValue V) {
+ return !V || V.isUndef() || isNullConstant(V);
+ });
+ SDValue Op0 = Sequence[0];
+ if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
+ (Op0.getOpcode() == ISD::ZERO_EXTEND &&
+ Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
+ SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
+ ? Op0.getOperand(0)
+ : Op0.getOperand(0).getOperand(0);
+ MVT MaskVT = BOperand.getSimpleValueType();
+ MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
+ if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
+ (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
+ MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
+ if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
+ unsigned Scale = 512 / VT.getSizeInBits();
+ BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
+ }
+ SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
+ if (BcstVT.getSizeInBits() != VT.getSizeInBits())
+ Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
+ return DAG.getBitcast(VT, Bcst);
+ }
+ }
+ }
+
+ unsigned NumUndefElts = UndefElements.count();
+ if (!Ld || (NumElts - NumUndefElts) <= 1) {
+ APInt SplatValue, Undef;
+ unsigned SplatBitSize;
+ bool HasUndef;
+ // Check if this is a repeated constant pattern suitable for broadcasting.
+ if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
+ SplatBitSize > VT.getScalarSizeInBits() &&
+ SplatBitSize < VT.getSizeInBits()) {
+ // Avoid replacing with broadcast when it's a use of a shuffle
+ // instruction to preserve the present custom lowering of shuffles.
+ if (isFoldableUseOfShuffle(BVOp))
+ return SDValue();
+      // Replace BUILD_VECTOR with a broadcast of the repeated constants.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ LLVMContext *Ctx = DAG.getContext();
+ MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
+ if (Subtarget.hasAVX()) {
+ if (SplatBitSize == 32 || SplatBitSize == 64 ||
+ (SplatBitSize < 32 && Subtarget.hasAVX2())) {
+ // Splatted value can fit in one INTEGER constant in constant pool.
+ // Load the constant and broadcast it.
+ MVT CVT = MVT::getIntegerVT(SplatBitSize);
+ Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
+ Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
+ SDValue CP = DAG.getConstantPool(C, PVT);
+ unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
+
+ Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
+ SDVTList Tys =
+ DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
+ SDValue Ops[] = {DAG.getEntryNode(), CP};
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
+ SDValue Brdcst = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
+ MachineMemOperand::MOLoad);
+ return DAG.getBitcast(VT, Brdcst);
+ }
+ if (SplatBitSize > 64) {
+ // Load the vector of constants and broadcast it.
+ Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
+ *Ctx);
+ SDValue VCP = DAG.getConstantPool(VecC, PVT);
+ unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
+ MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
+ Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {DAG.getEntryNode(), VCP};
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
+ return DAG.getMemIntrinsicNode(
+ X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
+ MachineMemOperand::MOLoad);
+ }
+ }
+ }
+
+ // If we are moving a scalar into a vector (Ld must be set and all elements
+ // but 1 are undef) and that operation is not obviously supported by
+ // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
+ // That's better than general shuffling and may eliminate a load to GPR and
+ // move from scalar to vector register.
+ if (!Ld || NumElts - NumUndefElts != 1)
+ return SDValue();
+ unsigned ScalarSize = Ld.getValueSizeInBits();
+ if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
+ return SDValue();
+ }
+
+ bool ConstSplatVal =
+ (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
+ bool IsLoad = ISD::isNormalLoad(Ld.getNode());
+
+ // TODO: Handle broadcasts of non-constant sequences.
+
+ // Make sure that all of the users of a non-constant load are from the
+ // BUILD_VECTOR node.
+ // FIXME: Is the use count needed for non-constant, non-load case?
+ if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
+ return SDValue();
+
+ unsigned ScalarSize = Ld.getValueSizeInBits();
+ bool IsGE256 = (VT.getSizeInBits() >= 256);
+
+ // When optimizing for size, generate up to 5 extra bytes for a broadcast
+ // instruction to save 8 or more bytes of constant pool data.
+ // TODO: If multiple splats are generated to load the same constant,
+ // it may be detrimental to overall size. There needs to be a way to detect
+ // that condition to know if this is truly a size win.
+ bool OptForSize = DAG.shouldOptForSize();
+
+ // Handle broadcasting a single constant scalar from the constant pool
+ // into a vector.
+ // On Sandybridge (no AVX2), it is still better to load a constant vector
+ // from the constant pool and not to broadcast it from a scalar.
+ // But override that restriction when optimizing for size.
+ // TODO: Check if splatting is recommended for other AVX-capable CPUs.
+ if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
+ EVT CVT = Ld.getValueType();
+ assert(!CVT.isVector() && "Must not broadcast a vector type");
+
+ // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
+ // For size optimization, also splat v2f64 and v2i64, and for size opt
+ // with AVX2, also splat i8 and i16.
+ // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
+ if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
+ (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
+ const Constant *C = nullptr;
+ if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
+ C = CI->getConstantIntValue();
+ else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
+ C = CF->getConstantFPValue();
+
+ assert(C && "Invalid constant type");
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue CP =
+ DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
+ Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
+
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {DAG.getEntryNode(), CP};
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
+ return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
+ MPI, Alignment, MachineMemOperand::MOLoad);
+ }
+ }
+
+ // Handle AVX2 in-register broadcasts.
+ if (!IsLoad && Subtarget.hasInt256() &&
+ (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+
+ // The scalar source must be a normal load.
+ if (!IsLoad)
+ return SDValue();
+
+ // Make sure the non-chain result is only used by this build vector.
+ if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
+ return SDValue();
+
+ if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
+ (Subtarget.hasVLX() && ScalarSize == 64)) {
+ auto *LN = cast<LoadSDNode>(Ld);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
+ SDValue BCast =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
+ return BCast;
+ }
+
+  // The integer check is needed for the 64-bit into 128-bit case so it
+  // doesn't match f64, since there is no vbroadcastsd xmm.
+ if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
+ (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
+ auto *LN = cast<LoadSDNode>(Ld);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
+ SDValue BCast =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
+ return BCast;
+ }
+
+ // Unsupported broadcast.
+ return SDValue();
+}
+
+/// For an EXTRACT_VECTOR_ELT with a constant index return the real
+/// underlying vector and index.
+///
+/// Modifies \p ExtractedFromVec to the real vector and returns the real
+/// index.
+static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
+ SDValue ExtIdx) {
+ int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
+ if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
+ return Idx;
+
+ // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
+ // lowered this:
+ // (extract_vector_elt (v8f32 %1), Constant<6>)
+ // to:
+ // (extract_vector_elt (vector_shuffle<2,u,u,u>
+ // (extract_subvector (v8f32 %0), Constant<4>),
+ // undef)
+ // Constant<0>)
+ // In this case the vector is the extract_subvector expression and the index
+ // is 2, as specified by the shuffle.
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
+ SDValue ShuffleVec = SVOp->getOperand(0);
+ MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
+ assert(ShuffleVecVT.getVectorElementType() ==
+ ExtractedFromVec.getSimpleValueType().getVectorElementType());
+
+ int ShuffleIdx = SVOp->getMaskElt(Idx);
+ if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
+ ExtractedFromVec = ShuffleVec;
+ return ShuffleIdx;
+ }
+ return Idx;
+}
+
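+// Attempt to lower a BUILD_VECTOR built mostly from EXTRACT_VECTOR_ELTs of at
+// most two source vectors as a vector shuffle, then insert the few remaining
+// elements with INSERT_VECTOR_ELT.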
+static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+
+ // Skip if insert_vec_elt is not supported.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
+ return SDValue();
+
+ SDLoc DL(Op);
+ unsigned NumElems = Op.getNumOperands();
+
+ SDValue VecIn1;
+ SDValue VecIn2;
+ SmallVector<unsigned, 4> InsertIndices;
+ SmallVector<int, 8> Mask(NumElems, -1);
+
+ for (unsigned i = 0; i != NumElems; ++i) {
+ unsigned Opc = Op.getOperand(i).getOpcode();
+
+ if (Opc == ISD::UNDEF)
+ continue;
+
+ if (Opc != ISD::EXTRACT_VECTOR_ELT) {
+      // Quit if more than 1 element needs inserting.
+ if (InsertIndices.size() > 1)
+ return SDValue();
+
+ InsertIndices.push_back(i);
+ continue;
+ }
+
+ SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
+ SDValue ExtIdx = Op.getOperand(i).getOperand(1);
+
+ // Quit if non-constant index.
+ if (!isa<ConstantSDNode>(ExtIdx))
+ return SDValue();
+ int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
+
+ // Quit if extracted from vector of different type.
+ if (ExtractedFromVec.getValueType() != VT)
+ return SDValue();
+
+ if (!VecIn1.getNode())
+ VecIn1 = ExtractedFromVec;
+ else if (VecIn1 != ExtractedFromVec) {
+ if (!VecIn2.getNode())
+ VecIn2 = ExtractedFromVec;
+ else if (VecIn2 != ExtractedFromVec)
+ // Quit if more than 2 vectors to shuffle
+ return SDValue();
+ }
+
+ if (ExtractedFromVec == VecIn1)
+ Mask[i] = Idx;
+ else if (ExtractedFromVec == VecIn2)
+ Mask[i] = Idx + NumElems;
+ }
+
+ if (!VecIn1.getNode())
+ return SDValue();
+
+ VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
+ SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
+
+ for (unsigned Idx : InsertIndices)
+ NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
+ DAG.getIntPtrConstant(Idx, DL));
+
+ return NV;
+}
+
+// Lower BUILD_VECTOR operation for vXi1 types.
+static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+
+ MVT VT = Op.getSimpleValueType();
+ assert((VT.getVectorElementType() == MVT::i1) &&
+ "Unexpected type in LowerBUILD_VECTORvXi1!");
+
+ SDLoc dl(Op);
+ if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
+ ISD::isBuildVectorAllOnes(Op.getNode()))
+ return Op;
+
+ uint64_t Immediate = 0;
+ SmallVector<unsigned, 16> NonConstIdx;
+ bool IsSplat = true;
+ bool HasConstElts = false;
+ int SplatIdx = -1;
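+  // Collect the constant elements into an immediate bitmask, record the
+  // indices of non-constant elements, and track whether all defined elements
+  // are identical (a splat).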
+ for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
+ SDValue In = Op.getOperand(idx);
+ if (In.isUndef())
+ continue;
+ if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
+ Immediate |= (InC->getZExtValue() & 0x1) << idx;
+ HasConstElts = true;
+ } else {
+ NonConstIdx.push_back(idx);
+ }
+ if (SplatIdx < 0)
+ SplatIdx = idx;
+ else if (In != Op.getOperand(SplatIdx))
+ IsSplat = false;
+ }
+
+  // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
+ if (IsSplat) {
+ // The build_vector allows the scalar element to be larger than the vector
+ // element type. We need to mask it to use as a condition unless we know
+ // the upper bits are zero.
+ // FIXME: Use computeKnownBits instead of checking specific opcode?
+ SDValue Cond = Op.getOperand(SplatIdx);
+ assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
+ if (Cond.getOpcode() != ISD::SETCC)
+ Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
+ DAG.getConstant(1, dl, MVT::i8));
+
+ // Perform the select in the scalar domain so we can use cmov.
+ if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
+ SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
+ DAG.getAllOnesConstant(dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32));
+ Select = DAG.getBitcast(MVT::v32i1, Select);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
+ } else {
+ MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
+ SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
+ DAG.getAllOnesConstant(dl, ImmVT),
+ DAG.getConstant(0, dl, ImmVT));
+ MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
+ Select = DAG.getBitcast(VecVT, Select);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
+ DAG.getIntPtrConstant(0, dl));
+ }
+ }
+
+  // Insert the non-constant elements one by one.
+ SDValue DstVec;
+ if (HasConstElts) {
+ if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
+ SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
+ SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
+ ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
+ ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
+ DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
+ } else {
+ MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
+ SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
+ MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
+ DstVec = DAG.getBitcast(VecVT, Imm);
+ DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+ } else
+ DstVec = DAG.getUNDEF(VT);
+
+ for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
+ unsigned InsertIdx = NonConstIdx[i];
+ DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
+ Op.getOperand(InsertIdx),
+ DAG.getIntPtrConstant(InsertIdx, dl));
+ }
+ return DstVec;
+}
+
+/// This is a helper function of LowerToHorizontalOp().
+/// This function checks that the build_vector \p N in input implements a
+/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
+/// may not match the layout of an x86 256-bit horizontal instruction.
+/// In other words, if this returns true, then some extraction/insertion will
+/// be required to produce a valid horizontal instruction.
+///
+/// Parameter \p Opcode defines the kind of horizontal operation to match.
+/// For example, if \p Opcode is equal to ISD::ADD, then this function
+/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
+/// is equal to ISD::SUB, then this function checks if this is a horizontal
+/// arithmetic sub.
+///
+/// This function only analyzes elements of \p N whose indices are
+/// in range [BaseIdx, LastIdx).
+///
+/// TODO: This function was originally used to match both real and fake partial
+/// horizontal operations, but the index-matching logic is incorrect for that.
+/// See the corrected implementation in isHopBuildVector(). Can we reduce this
+/// code because it is only used for partial h-op matching now?
+static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
+ SelectionDAG &DAG,
+ unsigned BaseIdx, unsigned LastIdx,
+ SDValue &V0, SDValue &V1) {
+ EVT VT = N->getValueType(0);
+ assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
+ assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
+ assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
+ "Invalid Vector in input!");
+
+ bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
+ bool CanFold = true;
+ unsigned ExpectedVExtractIdx = BaseIdx;
+ unsigned NumElts = LastIdx - BaseIdx;
+ V0 = DAG.getUNDEF(VT);
+ V1 = DAG.getUNDEF(VT);
+
+ // Check if N implements a horizontal binop.
+ for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
+ SDValue Op = N->getOperand(i + BaseIdx);
+
+ // Skip UNDEFs.
+ if (Op->isUndef()) {
+ // Update the expected vector extract index.
+ if (i * 2 == NumElts)
+ ExpectedVExtractIdx = BaseIdx;
+ ExpectedVExtractIdx += 2;
+ continue;
+ }
+
+ CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
+
+ if (!CanFold)
+ break;
+
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ // Try to match the following pattern:
+ // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
+ CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Op0.getOperand(0) == Op1.getOperand(0) &&
+ isa<ConstantSDNode>(Op0.getOperand(1)) &&
+ isa<ConstantSDNode>(Op1.getOperand(1)));
+ if (!CanFold)
+ break;
+
+ unsigned I0 = Op0.getConstantOperandVal(1);
+ unsigned I1 = Op1.getConstantOperandVal(1);
+
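+    // The first half of the elements must extract from V0 and the second
+    // half from V1; reset the expected extract index when crossing into V1.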
+ if (i * 2 < NumElts) {
+ if (V0.isUndef()) {
+ V0 = Op0.getOperand(0);
+ if (V0.getValueType() != VT)
+ return false;
+ }
+ } else {
+ if (V1.isUndef()) {
+ V1 = Op0.getOperand(0);
+ if (V1.getValueType() != VT)
+ return false;
+ }
+ if (i * 2 == NumElts)
+ ExpectedVExtractIdx = BaseIdx;
+ }
+
+ SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
+ if (I0 == ExpectedVExtractIdx)
+ CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
+ else if (IsCommutable && I1 == ExpectedVExtractIdx) {
+ // Try to match the following dag sequence:
+ // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
+ CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
+ } else
+ CanFold = false;
+
+ ExpectedVExtractIdx += 2;
+ }
+
+ return CanFold;
+}
+
+/// Emit a sequence of two 128-bit horizontal add/sub followed by
+/// a concat_vector.
+///
+/// This is a helper function of LowerToHorizontalOp().
+/// This function expects two 256-bit vectors called V0 and V1.
+/// At first, each vector is split into two separate 128-bit vectors.
+/// Then, the resulting 128-bit vectors are used to implement two
+/// horizontal binary operations.
+///
+/// The kind of horizontal binary operation is defined by \p X86Opcode.
+///
+/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
+/// the two new horizontal binops.
+/// When Mode is set, the first horizontal binop dag node takes as input the
+/// lower 128 bits of V0 and the upper 128 bits of V0. The second horizontal
+/// binop dag node takes as input the lower 128 bits of V1 and the upper
+/// 128 bits of V1.
+/// Example:
+/// HADD V0_LO, V0_HI
+/// HADD V1_LO, V1_HI
+///
+/// Otherwise, the first horizontal binop dag node takes as input the lower
+/// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal
+/// binop dag node takes the upper 128 bits of V0 and the upper 128 bits of V1.
+/// Example:
+/// HADD V0_LO, V1_LO
+/// HADD V0_HI, V1_HI
+///
+/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
+/// 128 bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
+/// the upper 128 bits of the result.
+static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
+ const SDLoc &DL, SelectionDAG &DAG,
+ unsigned X86Opcode, bool Mode,
+ bool isUndefLO, bool isUndefHI) {
+ MVT VT = V0.getSimpleValueType();
+ assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
+ "Invalid nodes in input!");
+
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
+ SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
+ SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
+ SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
+ MVT NewVT = V0_LO.getSimpleValueType();
+
+ SDValue LO = DAG.getUNDEF(NewVT);
+ SDValue HI = DAG.getUNDEF(NewVT);
+
+ if (Mode) {
+ // Don't emit a horizontal binop if the result is expected to be UNDEF.
+ if (!isUndefLO && !V0->isUndef())
+ LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
+ if (!isUndefHI && !V1->isUndef())
+ HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
+ } else {
+ // Don't emit a horizontal binop if the result is expected to be UNDEF.
+ if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
+ LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
+
+ if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
+ HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
+ }
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
+}
+
+/// Returns true iff \p BV builds a vector with the result equivalent to
+/// the result of an ADDSUB/SUBADD operation.
+/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
+/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
+/// \p Opnd0 and \p Opnd1.
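+///
+/// As an illustration of the pattern this recognizes (a hypothetical v4f32
+/// case, not taken from a specific test), the following build_vector is
+/// reported as ADDSUB(A, B):
+///   (build_vector (fsub (extract_elt A, 0), (extract_elt B, 0)),
+///                 (fadd (extract_elt A, 1), (extract_elt B, 1)),
+///                 (fsub (extract_elt A, 2), (extract_elt B, 2)),
+///                 (fadd (extract_elt A, 3), (extract_elt B, 3)))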
+static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ SDValue &Opnd0, SDValue &Opnd1,
+ unsigned &NumExtracts,
+ bool &IsSubAdd) {
+
+ MVT VT = BV->getSimpleValueType(0);
+ if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
+ return false;
+
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue InVec0 = DAG.getUNDEF(VT);
+ SDValue InVec1 = DAG.getUNDEF(VT);
+
+ NumExtracts = 0;
+
+ // Odd-numbered elements in the input build vector are obtained from
+ // adding/subtracting two integer/float elements.
+ // Even-numbered elements in the input build vector are obtained from
+ // subtracting/adding two integer/float elements.
+ unsigned Opc[2] = {0, 0};
+ for (unsigned i = 0, e = NumElts; i != e; ++i) {
+ SDValue Op = BV->getOperand(i);
+
+ // Skip 'undef' values.
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode == ISD::UNDEF)
+ continue;
+
+ // Early exit if we found an unexpected opcode.
+ if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
+ return false;
+
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ // Try to match the following pattern:
+ // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
+ // Early exit if we cannot match that sequence.
+ if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(Op0.getOperand(1)) ||
+ Op0.getOperand(1) != Op1.getOperand(1))
+ return false;
+
+ unsigned I0 = Op0.getConstantOperandVal(1);
+ if (I0 != i)
+ return false;
+
+ // We found a valid add/sub node; make sure it's the same opcode as the
+ // previous elements for this parity.
+ if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
+ return false;
+ Opc[i % 2] = Opcode;
+
+ // Update InVec0 and InVec1.
+ if (InVec0.isUndef()) {
+ InVec0 = Op0.getOperand(0);
+ if (InVec0.getSimpleValueType() != VT)
+ return false;
+ }
+ if (InVec1.isUndef()) {
+ InVec1 = Op1.getOperand(0);
+ if (InVec1.getSimpleValueType() != VT)
+ return false;
+ }
+
+ // Make sure that the operands of each add/sub node always
+ // come from the same pair of vectors.
+ if (InVec0 != Op0.getOperand(0)) {
+ if (Opcode == ISD::FSUB)
+ return false;
+
+ // FADD is commutable. Try to commute the operands
+ // and then test again.
+ std::swap(Op0, Op1);
+ if (InVec0 != Op0.getOperand(0))
+ return false;
+ }
+
+ if (InVec1 != Op1.getOperand(0))
+ return false;
+
+ // Increment the number of extractions done.
+ ++NumExtracts;
+ }
+
+ // Ensure we have found an opcode for both parities and that they are
+ // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
+ // inputs are undef.
+ if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
+ InVec0.isUndef() || InVec1.isUndef())
+ return false;
+
+ IsSubAdd = Opc[0] == ISD::FADD;
+
+ Opnd0 = InVec0;
+ Opnd1 = InVec1;
+ return true;
+}
+
+/// Returns true if it is possible to fold MUL and an idiom that has already been
+/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
+/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
+/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
+///
+/// Prior to calling this function it should be known that there is some
+/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
+/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
+/// before replacement of such SDNode with ADDSUB operation. Thus the number
+/// of \p Opnd0 uses is expected to be equal to 2.
+/// For example, this function may be called for the following IR:
+/// %AB = fmul fast <2 x double> %A, %B
+/// %Sub = fsub fast <2 x double> %AB, %C
+/// %Add = fadd fast <2 x double> %AB, %C
+/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
+/// <2 x i32> <i32 0, i32 3>
+/// There is a def for %Addsub here, which potentially can be replaced by
+/// X86ISD::ADDSUB operation:
+/// %Addsub = X86ISD::ADDSUB %AB, %C
+/// and such ADDSUB can further be replaced with FMADDSUB:
+/// %Addsub = FMADDSUB %A, %B, %C.
+///
+/// The main reason why this method is called before the replacement of the
+/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
+/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
+/// FMADDSUB is.
+static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
+ SelectionDAG &DAG,
+ SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
+ unsigned ExpectedUses) {
+ if (Opnd0.getOpcode() != ISD::FMUL ||
+ !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
+ return false;
+
+ // FIXME: These checks must match the similar ones in
+ // DAGCombiner::visitFADDForFMACombine. It would be good to have one
+ // function that would answer if it is Ok to fuse MUL + ADD to FMADD
+ // or MUL + ADDSUB to FMADDSUB.
+ const TargetOptions &Options = DAG.getTarget().Options;
+ bool AllowFusion =
+ (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
+ if (!AllowFusion)
+ return false;
+
+ Opnd2 = Opnd1;
+ Opnd1 = Opnd0.getOperand(1);
+ Opnd0 = Opnd0.getOperand(0);
+
+ return true;
+}
+
+/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
+/// 'fmsubadd' operation into the corresponding X86ISD::ADDSUB,
+/// X86ISD::FMADDSUB or X86ISD::FMSUBADD node.
+static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue Opnd0, Opnd1;
+ unsigned NumExtracts;
+ bool IsSubAdd;
+ if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
+ IsSubAdd))
+ return SDValue();
+
+ MVT VT = BV->getSimpleValueType(0);
+ SDLoc DL(BV);
+
+ // Try to generate X86ISD::FMADDSUB node here.
+ SDValue Opnd2;
+ if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
+ unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
+ return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
+ }
+
+ // We only support ADDSUB.
+ if (IsSubAdd)
+ return SDValue();
+
+ // Do not generate X86ISD::ADDSUB node for 512-bit types even though
+ // the ADDSUB idiom has been successfully recognized. There are no known
+ // X86 targets with 512-bit ADDSUB instructions!
+ // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
+ // recognition.
+ if (VT.is512BitVector())
+ return SDValue();
+
+ return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
+}
+
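+// Match a build_vector whose elements compute a horizontal add/sub so it can
+// be lowered to a single X86ISD::H(ADD|SUB)/FH(ADD|SUB) node. As an
+// illustration (a hypothetical v8i32 pattern mirroring AVX2 VPHADDD
+// semantics), the following is matched with HOpcode = X86ISD::HADD,
+// V0 = A and V1 = B:
+//   (build_vector (add (extract A, 0), (extract A, 1)),
+//                 (add (extract A, 2), (extract A, 3)),
+//                 (add (extract B, 0), (extract B, 1)),
+//                 (add (extract B, 2), (extract B, 3)),
+//                 (add (extract A, 4), (extract A, 5)),
+//                 (add (extract A, 6), (extract A, 7)),
+//                 (add (extract B, 4), (extract B, 5)),
+//                 (add (extract B, 6), (extract B, 7)))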
+static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
+ unsigned &HOpcode, SDValue &V0, SDValue &V1) {
+ // Initialize outputs to known values.
+ MVT VT = BV->getSimpleValueType(0);
+ HOpcode = ISD::DELETED_NODE;
+ V0 = DAG.getUNDEF(VT);
+ V1 = DAG.getUNDEF(VT);
+
+ // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
+ // half of the result is calculated independently from the 128-bit halves of
+ // the inputs, so that makes the index-checking logic below more complicated.
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned GenericOpcode = ISD::DELETED_NODE;
+ unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
+ unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
+ unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
+ for (unsigned i = 0; i != Num128BitChunks; ++i) {
+ for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
+ // Ignore undef elements.
+ SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
+ if (Op.isUndef())
+ continue;
+
+ // If there's an opcode mismatch, we're done.
+ if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
+ return false;
+
+ // Initialize horizontal opcode.
+ if (HOpcode == ISD::DELETED_NODE) {
+ GenericOpcode = Op.getOpcode();
+ switch (GenericOpcode) {
+ case ISD::ADD: HOpcode = X86ISD::HADD; break;
+ case ISD::SUB: HOpcode = X86ISD::HSUB; break;
+ case ISD::FADD: HOpcode = X86ISD::FHADD; break;
+ case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
+ default: return false;
+ }
+ }
+
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Op0.getOperand(0) != Op1.getOperand(0) ||
+ !isa<ConstantSDNode>(Op0.getOperand(1)) ||
+ !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
+ return false;
+
+ // The source vector is chosen based on which 64-bit half of the
+ // destination vector is being calculated.
+ if (j < NumEltsIn64Bits) {
+ if (V0.isUndef())
+ V0 = Op0.getOperand(0);
+ } else {
+ if (V1.isUndef())
+ V1 = Op0.getOperand(0);
+ }
+
+ SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
+ if (SourceVec != Op0.getOperand(0))
+ return false;
+
+ // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
+ unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
+ unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
+ unsigned ExpectedIndex = i * NumEltsIn128Bits +
+ (j % NumEltsIn64Bits) * 2;
+ if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
+ continue;
+
+ // If this is not a commutative op, this does not match.
+ if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
+ return false;
+
+ // Addition is commutative, so try swapping the extract indexes.
+ // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
+ if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
+ continue;
+
+ // Extract indexes do not match horizontal requirement.
+ return false;
+ }
+ }
+ // We matched. Opcode and operands are returned by reference as arguments.
+ return true;
+}
+
+static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
+ SelectionDAG &DAG, unsigned HOpcode,
+ SDValue V0, SDValue V1) {
+ // If either input vector is not the same size as the build vector,
+ // extract/insert the low bits to the correct size.
+ // This is free (examples: zmm --> xmm, xmm --> ymm).
+ MVT VT = BV->getSimpleValueType(0);
+ unsigned Width = VT.getSizeInBits();
+ if (V0.getValueSizeInBits() > Width)
+ V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
+ else if (V0.getValueSizeInBits() < Width)
+ V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
+
+ if (V1.getValueSizeInBits() > Width)
+ V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
+ else if (V1.getValueSizeInBits() < Width)
+ V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
+
+ unsigned NumElts = VT.getVectorNumElements();
+ APInt DemandedElts = APInt::getAllOnesValue(NumElts);
+ for (unsigned i = 0; i != NumElts; ++i)
+ if (BV->getOperand(i).isUndef())
+ DemandedElts.clearBit(i);
+
+ // If we don't need the upper xmm, then perform as an xmm hop.
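+ // For example (illustrative): a 256-bit hop whose demanded elements all sit
+ // in the lower 128 bits can be emitted as a 128-bit hop on the low halves of
+ // V0 and V1, widened back to 256 bits with an undef upper half.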
+ unsigned HalfNumElts = NumElts / 2;
+ if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
+ V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
+ V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
+ SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
+ return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
+ }
+
+ return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
+}
+
+/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
+static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ // We need at least 2 non-undef elements to make this worthwhile by default.
+ unsigned NumNonUndefs =
+ count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
+ if (NumNonUndefs < 2)
+ return SDValue();
+
+ // There are 4 sets of horizontal math operations distinguished by type:
+ // int/FP at 128-bit/256-bit. Each type was introduced with a different
+ // subtarget feature. Try to match those "native" patterns first.
+ MVT VT = BV->getSimpleValueType(0);
+ if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
+ ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
+ ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
+ ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
+ unsigned HOpcode;
+ SDValue V0, V1;
+ if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
+ return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
+ }
+
+ // Try harder to match 256-bit ops by using extract/concat.
+ if (!Subtarget.hasAVX() || !VT.is256BitVector())
+ return SDValue();
+
+ // Count the number of UNDEF operands in the input build_vector.
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned Half = NumElts / 2;
+ unsigned NumUndefsLO = 0;
+ unsigned NumUndefsHI = 0;
+ for (unsigned i = 0, e = Half; i != e; ++i)
+ if (BV->getOperand(i)->isUndef())
+ NumUndefsLO++;
+
+ for (unsigned i = Half, e = NumElts; i != e; ++i)
+ if (BV->getOperand(i)->isUndef())
+ NumUndefsHI++;
+
+ SDLoc DL(BV);
+ SDValue InVec0, InVec1;
+ if (VT == MVT::v8i32 || VT == MVT::v16i16) {
+ SDValue InVec2, InVec3;
+ unsigned X86Opcode;
+ bool CanFold = true;
+
+ if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
+ isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
+ InVec3) &&
+ ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
+ ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
+ X86Opcode = X86ISD::HADD;
+ else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
+ InVec1) &&
+ isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
+ InVec3) &&
+ ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
+ ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
+ X86Opcode = X86ISD::HSUB;
+ else
+ CanFold = false;
+
+ if (CanFold) {
+ // Do not try to expand this build_vector into a pair of horizontal
+ // add/sub if we can emit a pair of scalar add/sub.
+ if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
+ return SDValue();
+
+ // Convert this build_vector into a pair of horizontal binops followed by
+ // a concat vector. We must adjust the outputs from the partial horizontal
+ // matching calls above to account for undefined vector halves.
+ SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
+ SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
+ assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
+ bool isUndefLO = NumUndefsLO == Half;
+ bool isUndefHI = NumUndefsHI == Half;
+ return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
+ isUndefHI);
+ }
+ }
+
+ if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
+ VT == MVT::v16i16) {
+ unsigned X86Opcode;
+ if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
+ X86Opcode = X86ISD::HADD;
+ else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
+ InVec1))
+ X86Opcode = X86ISD::HSUB;
+ else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
+ InVec1))
+ X86Opcode = X86ISD::FHADD;
+ else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
+ InVec1))
+ X86Opcode = X86ISD::FHSUB;
+ else
+ return SDValue();
+
+ // Don't try to expand this build_vector into a pair of horizontal add/sub
+ // if we can simply emit a pair of scalar add/sub.
+ if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
+ return SDValue();
+
+ // Convert this build_vector into two horizontal add/sub followed by
+ // a concat vector.
+ bool isUndefLO = NumUndefsLO == Half;
+ bool isUndefHI = NumUndefsHI == Half;
+ return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
+ isUndefLO, isUndefHI);
+ }
+
+ return SDValue();
+}
+
+static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG);
+
+/// If a BUILD_VECTOR's source elements all apply the same bit operation and
+/// one of their operands is constant, lower to a pair of BUILD_VECTORs and
+/// just apply the bit op to the vectors.
+/// NOTE: It's not in our interest to start making a general-purpose vectorizer
+/// from this, but enough scalar bit operations are created by the later
+/// legalization + scalarization stages to need basic support.
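+///
+/// For example (an illustrative v4i32 case), the build_vector
+///   (build_vector (shl A, 2), (shl B, 2), (shl C, 2), (shl D, 2))
+/// can be rewritten as (shl (build_vector A, B, C, D), (splat 2)), with the
+/// resulting vector shift then lowered directly via LowerShift.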
+static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ MVT VT = Op->getSimpleValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // Check that all elements have the same opcode.
+ // TODO: Should we allow UNDEFS and if so how many?
+ unsigned Opcode = Op->getOperand(0).getOpcode();
+ for (unsigned i = 1; i < NumElems; ++i)
+ if (Opcode != Op->getOperand(i).getOpcode())
+ return SDValue();
+
+ // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
+ bool IsShift = false;
+ switch (Opcode) {
+ default:
+ return SDValue();
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA:
+ IsShift = true;
+ break;
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR:
+ // Don't do this if the buildvector is a splat - we'd replace one
+ // constant with an entire vector.
+ if (Op->getSplatValue())
+ return SDValue();
+ if (!TLI.isOperationLegalOrPromote(Opcode, VT))
+ return SDValue();
+ break;
+ }
+
+ SmallVector<SDValue, 4> LHSElts, RHSElts;
+ for (SDValue Elt : Op->ops()) {
+ SDValue LHS = Elt.getOperand(0);
+ SDValue RHS = Elt.getOperand(1);
+
+ // We expect the canonicalized RHS operand to be the constant.
+ if (!isa<ConstantSDNode>(RHS))
+ return SDValue();
+
+ // Extend shift amounts.
+ if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
+ if (!IsShift)
+ return SDValue();
+ RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
+ }
+
+ LHSElts.push_back(LHS);
+ RHSElts.push_back(RHS);
+ }
+
+ // Limit to shifts by uniform immediates.
+ // TODO: Only accept vXi8/vXi64 special cases?
+ // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
+ if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
+ return SDValue();
+
+ SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
+ SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
+ SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
+
+ if (!IsShift)
+ return Res;
+
+ // Immediately lower the shift to ensure the constant build vector doesn't
+ // get converted to a constant pool before the shift is lowered.
+ return LowerShift(Res, Subtarget, DAG);
+}
+
+/// Create a vector constant without a load. SSE/AVX provide the bare minimum
+/// functionality to do this, so it's all zeros, all ones, or some derivation
+/// that is cheap to calculate.
+static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+
+ // Vectors containing all zeros can be matched by pxor and xorps.
+ if (ISD::isBuildVectorAllZeros(Op.getNode()))
+ return Op;
+
+ // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
+ // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
+ // vpcmpeqd on 256-bit vectors.
+ if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
+ if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
+ return Op;
+
+ return getOnesVector(VT, DAG, DL);
+ }
+
+ return SDValue();
+}
+
+/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
+/// from a vector of source values and a vector of extraction indices.
+/// The vectors might be manipulated to match the type of the permute op.
+static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
+ SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT ShuffleVT = VT;
+ EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned SizeInBits = VT.getSizeInBits();
+
+ // Adjust IndicesVec to match VT size.
+ assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
+ "Illegal variable permute mask size");
+ if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
+ IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
+ NumElts * VT.getScalarSizeInBits());
+ IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
+
+ // Handle a SrcVec that doesn't match the size of VT.
+ if (SrcVec.getValueSizeInBits() != SizeInBits) {
+ if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
+ // Handle larger SrcVec by treating it as a larger permute.
+ unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
+ VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
+ IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
+ IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
+ Subtarget, DAG, SDLoc(IndicesVec));
+ SDValue NewSrcVec =
+ createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
+ if (NewSrcVec)
+ return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
+ return SDValue();
+ } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
+ // Widen smaller SrcVec to match VT.
+ SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
+ } else
+ return SDValue();
+ }
+
+ auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
+ assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
+ EVT SrcVT = Idx.getValueType();
+ unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
+ uint64_t IndexScale = 0;
+ uint64_t IndexOffset = 0;
+
+ // If we're scaling a smaller permute op, then we need to repeat the
+ // indices, scaling and offsetting them as well.
+ // e.g. v4i32 -> v16i8 (Scale = 4)
+ // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
+ // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
+ for (uint64_t i = 0; i != Scale; ++i) {
+ IndexScale |= Scale << (i * NumDstBits);
+ IndexOffset |= i << (i * NumDstBits);
+ }
+
+ Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
+ DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
+ Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
+ DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
+ return Idx;
+ };
+
+ unsigned Opcode = 0;
+ switch (VT.SimpleTy) {
+ default:
+ break;
+ case MVT::v16i8:
+ if (Subtarget.hasSSSE3())
+ Opcode = X86ISD::PSHUFB;
+ break;
+ case MVT::v8i16:
+ if (Subtarget.hasVLX() && Subtarget.hasBWI())
+ Opcode = X86ISD::VPERMV;
+ else if (Subtarget.hasSSSE3()) {
+ Opcode = X86ISD::PSHUFB;
+ ShuffleVT = MVT::v16i8;
+ }
+ break;
+ case MVT::v4f32:
+ case MVT::v4i32:
+ if (Subtarget.hasAVX()) {
+ Opcode = X86ISD::VPERMILPV;
+ ShuffleVT = MVT::v4f32;
+ } else if (Subtarget.hasSSSE3()) {
+ Opcode = X86ISD::PSHUFB;
+ ShuffleVT = MVT::v16i8;
+ }
+ break;
+ case MVT::v2f64:
+ case MVT::v2i64:
+ if (Subtarget.hasAVX()) {
+ // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
+ IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
+ Opcode = X86ISD::VPERMILPV;
+ ShuffleVT = MVT::v2f64;
+ } else if (Subtarget.hasSSE41()) {
+ // SSE41 can compare v2i64 - select between indices 0 and 1.
+ return DAG.getSelectCC(
+ DL, IndicesVec,
+ getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
+ DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
+ DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
+ ISD::CondCode::SETEQ);
+ }
+ break;
+ case MVT::v32i8:
+ if (Subtarget.hasVLX() && Subtarget.hasVBMI())
+ Opcode = X86ISD::VPERMV;
+ else if (Subtarget.hasXOP()) {
+ SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
+ SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
+ SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
+ SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
+ return DAG.getNode(
+ ISD::CONCAT_VECTORS, DL, VT,
+ DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
+ DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
+ } else if (Subtarget.hasAVX()) {
+ SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
+ SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
+ SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
+ SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
+ auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ // Permute Lo and Hi and then select based on index range.
+ // This works as PSHUFB uses bits[3:0] to permute elements and we don't
+ // care about bit[7] as it's just an index vector.
+ SDValue Idx = Ops[2];
+ EVT VT = Idx.getValueType();
+ return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
+ DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
+ DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
+ ISD::CondCode::SETGT);
+ };
+ SDValue Ops[] = {LoLo, HiHi, IndicesVec};
+ return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
+ PSHUFBBuilder);
+ }
+ break;
+ case MVT::v16i16:
+ if (Subtarget.hasVLX() && Subtarget.hasBWI())
+ Opcode = X86ISD::VPERMV;
+ else if (Subtarget.hasAVX()) {
+ // Scale to v32i8 and perform as v32i8.
+ IndicesVec = ScaleIndices(IndicesVec, 2);
+ return DAG.getBitcast(
+ VT, createVariablePermute(
+ MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
+ DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
+ }
+ break;
+ case MVT::v8f32:
+ case MVT::v8i32:
+ if (Subtarget.hasAVX2())
+ Opcode = X86ISD::VPERMV;
+ else if (Subtarget.hasAVX()) {
+ SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
+ SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
+ {0, 1, 2, 3, 0, 1, 2, 3});
+ SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
+ {4, 5, 6, 7, 4, 5, 6, 7});
+ if (Subtarget.hasXOP())
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
+ IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
+ // Permute Lo and Hi and then select based on index range.
+ // This works as VPERMILPS only uses index bits[0:1] to permute elements.
+ SDValue Res = DAG.getSelectCC(
+ DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
+ DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
+ DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
+ ISD::CondCode::SETGT);
+ return DAG.getBitcast(VT, Res);
+ }
+ break;
+ case MVT::v4i64:
+ case MVT::v4f64:
+ if (Subtarget.hasAVX512()) {
+ if (!Subtarget.hasVLX()) {
+ MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
+ SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
+ SDLoc(SrcVec));
+ IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
+ DAG, SDLoc(IndicesVec));
+ SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
+ DAG, Subtarget);
+ return extract256BitVector(Res, 0, DAG, DL);
+ }
+ Opcode = X86ISD::VPERMV;
+ } else if (Subtarget.hasAVX()) {
+ SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
+ SDValue LoLo =
+ DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
+ SDValue HiHi =
+ DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
+ // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
+ IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
+ if (Subtarget.hasXOP())
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
+ IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
+ // Permute Lo and Hi and then select based on index range.
+ // This works as VPERMILPD only uses index bit[1] to permute elements.
+ SDValue Res = DAG.getSelectCC(
+ DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
+ DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
+ DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
+ ISD::CondCode::SETGT);
+ return DAG.getBitcast(VT, Res);
+ }
+ break;
+ case MVT::v64i8:
+ if (Subtarget.hasVBMI())
+ Opcode = X86ISD::VPERMV;
+ break;
+ case MVT::v32i16:
+ if (Subtarget.hasBWI())
+ Opcode = X86ISD::VPERMV;
+ break;
+ case MVT::v16f32:
+ case MVT::v16i32:
+ case MVT::v8f64:
+ case MVT::v8i64:
+ if (Subtarget.hasAVX512())
+ Opcode = X86ISD::VPERMV;
+ break;
+ }
+ if (!Opcode)
+ return SDValue();
+
+ assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
+ (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
+ "Illegal variable permute shuffle type");
+
+ uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
+ if (Scale > 1)
+ IndicesVec = ScaleIndices(IndicesVec, Scale);
+
+ EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
+ IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
+
+ SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
+ SDValue Res = Opcode == X86ISD::VPERMV
+ ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
+ : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
+ return DAG.getBitcast(VT, Res);
+}
+
+// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
+// reasoned to be a permutation of a vector by indices in a non-constant vector.
+// (build_vector (extract_elt V, (extract_elt I, 0)),
+// (extract_elt V, (extract_elt I, 1)),
+// ...
+// ->
+// (vpermv I, V)
+//
+// TODO: Handle undefs
+// TODO: Utilize pshufb and zero mask blending to support more efficient
+// construction of vectors with constant-0 elements.
+static SDValue
+LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue SrcVec, IndicesVec;
+ // Check for a match of the permute source vector and permute index elements.
+ // This is done by checking that the i-th build_vector operand is of the form:
+ // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
+ for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
+ SDValue Op = V.getOperand(Idx);
+ if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ // If this is the first extract encountered in V, set the source vector,
+ // otherwise verify the extract is from the previously defined source
+ // vector.
+ if (!SrcVec)
+ SrcVec = Op.getOperand(0);
+ else if (SrcVec != Op.getOperand(0))
+ return SDValue();
+ SDValue ExtractedIndex = Op->getOperand(1);
+ // Peek through extends.
+ if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
+ ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
+ ExtractedIndex = ExtractedIndex.getOperand(0);
+ if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ // If this is the first extract from the index vector candidate, set the
+ // indices vector, otherwise verify the extract is from the previously
+ // defined indices vector.
+ if (!IndicesVec)
+ IndicesVec = ExtractedIndex.getOperand(0);
+ else if (IndicesVec != ExtractedIndex.getOperand(0))
+ return SDValue();
+
+ auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
+ if (!PermIdx || PermIdx->getAPIntValue() != Idx)
+ return SDValue();
+ }
+
+ SDLoc DL(V);
+ MVT VT = V.getSimpleValueType();
+ return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
+}
+
+SDValue
+X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+
+ MVT VT = Op.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ unsigned NumElems = Op.getNumOperands();
+
+ // Generate vectors for predicate vectors.
+ if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
+ return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
+
+ if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
+ return VectorConstant;
+
+ unsigned EVTBits = EltVT.getSizeInBits();
+ APInt UndefMask = APInt::getNullValue(NumElems);
+ APInt ZeroMask = APInt::getNullValue(NumElems);
+ APInt NonZeroMask = APInt::getNullValue(NumElems);
+ bool IsAllConstants = true;
+ SmallSet<SDValue, 8> Values;
+ unsigned NumConstants = NumElems;
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SDValue Elt = Op.getOperand(i);
+ if (Elt.isUndef()) {
+ UndefMask.setBit(i);
+ continue;
+ }
+ Values.insert(Elt);
+ if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
+ IsAllConstants = false;
+ NumConstants--;
+ }
+ if (X86::isZeroNode(Elt)) {
+ ZeroMask.setBit(i);
+ } else {
+ NonZeroMask.setBit(i);
+ }
+ }
+
+ // All-undef vector. Return an UNDEF. All-zero vectors were handled above.
+ if (NonZeroMask == 0) {
+ assert(UndefMask.isAllOnesValue() && "Fully undef mask expected");
+ return DAG.getUNDEF(VT);
+ }
+
+ BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
+
+ // If the upper elts of a ymm/zmm are undef/zero then we might be better off
+ // lowering to a smaller build vector and padding with undef/zero.
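+ // For example (illustrative): a v8i32 build_vector whose upper four elements
+ // are all zero can be lowered as a v4i32 build_vector that is then widened
+ // back to v8i32 with a zero upper half.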
+ if ((VT.is256BitVector() || VT.is512BitVector()) &&
+ !isFoldableUseOfShuffle(BV)) {
+ unsigned UpperElems = NumElems / 2;
+ APInt UndefOrZeroMask = UndefMask | ZeroMask;
+ unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countLeadingOnes();
+ if (NumUpperUndefsOrZeros >= UpperElems) {
+ if (VT.is512BitVector() &&
+ NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
+ UpperElems = NumElems - (NumElems / 4);
+ bool UndefUpper = UndefMask.countLeadingOnes() >= UpperElems;
+ MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
+ SDValue NewBV =
+ DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
+ return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
+ }
+ }
+
+ if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
+ return AddSub;
+ if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
+ return HorizontalOp;
+ if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
+ return Broadcast;
+ if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
+ return BitOp;
+
+ unsigned NumZero = ZeroMask.countPopulation();
+ unsigned NumNonZero = NonZeroMask.countPopulation();
+
+ // If we are inserting one variable into a vector of non-zero constants, try
+ // to avoid loading each constant element as a scalar. Load the constants as a
+ // vector and then insert the variable scalar element. If insertion is not
+ // supported, fall back to a shuffle to get the scalar blended with the
+ // constants. Insertion into a zero vector is handled as a special-case
+ // somewhere below here.
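+ // For example (illustrative): a v4f32 (build_vector 1.0, 2.0, X, 4.0)
+ // becomes a constant-pool load of <1.0, 2.0, undef, 4.0> followed by
+ // (insert_vector_elt Ld, X, 2); insertions into the high half of a wider
+ // vector use the shuffle fallback below instead.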
+ if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
+ (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
+ isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
+ // Create an all-constant vector. The variable element in the old
+ // build vector is replaced by undef in the constant vector. Save the
+ // variable scalar element and its index for use in the insertelement.
+ LLVMContext &Context = *DAG.getContext();
+ Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
+ SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
+ SDValue VarElt;
+ SDValue InsIndex;
+ for (unsigned i = 0; i != NumElems; ++i) {
+ SDValue Elt = Op.getOperand(i);
+ if (auto *C = dyn_cast<ConstantSDNode>(Elt))
+ ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
+ else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
+ ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
+ else if (!Elt.isUndef()) {
+ assert(!VarElt.getNode() && !InsIndex.getNode() &&
+ "Expected one variable element in this vector");
+ VarElt = Elt;
+ InsIndex = DAG.getVectorIdxConstant(i, dl);
+ }
+ }
+ Constant *CV = ConstantVector::get(ConstVecOps);
+ SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
+
+ // The constants we just created may not be legal (eg, floating point). We
+ // must lower the vector right here because we can not guarantee that we'll
+ // legalize it before loading it. This is also why we could not just create
+ // a new build vector here. If the build vector contains illegal constants,
+ // it could get split back up into a series of insert elements.
+ // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
+ SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
+ SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
+ unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
+ unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
+ if (InsertC < NumEltsInLow128Bits)
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
+
+ // There's no good way to insert into the high elements of a >128-bit
+ // vector, so use shuffles to avoid an extract/insert sequence.
+ assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
+ assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
+ SmallVector<int, 8> ShuffleMask;
+ unsigned NumElts = VT.getVectorNumElements();
+ for (unsigned i = 0; i != NumElts; ++i)
+ ShuffleMask.push_back(i == InsertC ? NumElts : i);
+ SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
+ return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
+ }
+
+ // Special case for single non-zero, non-undef, element.
+ if (NumNonZero == 1) {
+ unsigned Idx = NonZeroMask.countTrailingZeros();
+ SDValue Item = Op.getOperand(Idx);
+
+ // If we have a constant or non-constant insertion into the low element of
+ // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
+ // the rest of the elements. This will be matched as movd/movq/movss/movsd
+ // depending on what the source datatype is.
+ if (Idx == 0) {
+ if (NumZero == 0)
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+
+ if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
+ (EltVT == MVT::i64 && Subtarget.is64Bit())) {
+ assert((VT.is128BitVector() || VT.is256BitVector() ||
+ VT.is512BitVector()) &&
+ "Expected an SSE value type!");
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+ // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
+ return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+ }
+
+ // We can't directly insert an i8 or i16 into a vector, so zero extend
+ // it to i32 first.
+ if (EltVT == MVT::i16 || EltVT == MVT::i8) {
+ Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
+ MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
+ Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+ return DAG.getBitcast(VT, Item);
+ }
+ }
+
+ // Is it a vector logical left shift?
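+ // For example (illustrative): a v2i64 (build_vector 0, X) can be emitted as
+ // a whole-vector logical left shift by 64 bits of (scalar_to_vector X).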
+ if (NumElems == 2 && Idx == 1 &&
+ X86::isZeroNode(Op.getOperand(0)) &&
+ !X86::isZeroNode(Op.getOperand(1))) {
+ unsigned NumBits = VT.getSizeInBits();
+ return getVShift(true, VT,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ VT, Op.getOperand(1)),
+ NumBits/2, DAG, *this, dl);
+ }
+
+ if (IsAllConstants) // Otherwise, it's better to do a constpool load.
+ return SDValue();
+
+ // Otherwise, if this is a vector with i32 or f32 elements, and the element
+ // is a non-constant being inserted into an element other than the low one,
+ // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
+ // movd/movss) to move this into the low element, then shuffle it into
+ // place.
+ if (EVTBits == 32) {
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+ return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
+ }
+ }
+
+ // Splat is obviously ok. Let legalizer expand it to a shuffle.
+ if (Values.size() == 1) {
+ if (EVTBits == 32) {
+ // Instead of a shuffle like this:
+ // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
+ // Check if it's possible to issue this instead.
+ // shuffle (vload ptr), undef, <1, 1, 1, 1>
+ unsigned Idx = NonZeroMask.countTrailingZeros();
+ SDValue Item = Op.getOperand(Idx);
+ if (Op.getNode()->isOnlyUserOf(Item.getNode()))
+ return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
+ }
+ return SDValue();
+ }
+
+ // A vector full of immediates; various special cases are already
+ // handled, so this is best done with a single constant-pool load.
+ if (IsAllConstants)
+ return SDValue();
+
+ if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
+ return V;
+
+ // See if we can use a vector load to get all of the elements.
+ {
+ SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
+ if (SDValue LD =
+ EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
+ return LD;
+ }
+
+ // If this is a splat of pairs of 32-bit elements, we can use a narrower
+ // build_vector and broadcast it.
+ // TODO: We could probably generalize this more.
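+ // For example (illustrative): a v8i32 (build_vector a, b, a, b, a, b, a, b)
+ // becomes a v4i32 (build_vector a, b, undef, undef), bitcast to v2i64,
+ // broadcast to v4i64 and bitcast back to v8i32.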
+ if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
+ SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
+ DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
+ auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
+ // Make sure all the even/odd operands match.
+ for (unsigned i = 2; i != NumElems; ++i)
+ if (Ops[i % 2] != Op.getOperand(i))
+ return false;
+ return true;
+ };
+ if (CanSplat(Op, NumElems, Ops)) {
+ MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
+ MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
+ // Create a new build vector and cast to v2i64/v2f64.
+ SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
+ DAG.getBuildVector(NarrowVT, dl, Ops));
+ // Broadcast from v2i64/v2f64 and cast to final VT.
+ MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
+ return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
+ NewBV));
+ }
+ }
+
+ // For AVX-length vectors, build the individual 128-bit pieces and use
+ // shuffles to put them in place.
+ if (VT.getSizeInBits() > 128) {
+ MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ // Build both the lower and upper subvector.
+ SDValue Lower =
+ DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
+ SDValue Upper = DAG.getBuildVector(
+ HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
+
+ // Recreate the wider vector with the lower and upper part.
+ return concatSubVectors(Lower, Upper, DAG, dl);
+ }
+
+ // Let legalizer expand 2-wide build_vectors.
+ if (EVTBits == 64) {
+ if (NumNonZero == 1) {
+ // One half is zero or undef.
+ unsigned Idx = NonZeroMask.countTrailingZeros();
+ SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
+ Op.getOperand(Idx));
+ return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
+ }
+ return SDValue();
+ }
+
+ // If element VT is < 32 bits, convert it to inserts into a zero vector.
+ if (EVTBits == 8 && NumElems == 16)
+ if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
+ DAG, Subtarget))
+ return V;
+
+ if (EVTBits == 16 && NumElems == 8)
+ if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
+ DAG, Subtarget))
+ return V;
+
+ // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
+ if (EVTBits == 32 && NumElems == 4)
+ if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
+ return V;
+
+ // If element VT is == 32 bits, turn it into a number of shuffles.
+ if (NumElems == 4 && NumZero > 0) {
+ SmallVector<SDValue, 8> Ops(NumElems);
+ for (unsigned i = 0; i < 4; ++i) {
+ bool isZero = !NonZeroMask[i];
+ if (isZero)
+ Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
+ else
+ Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+ }
+
+ for (unsigned i = 0; i < 2; ++i) {
+ switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
+ default: llvm_unreachable("Unexpected NonZero count");
+ case 0:
+ Ops[i] = Ops[i*2]; // Must be a zero vector.
+ break;
+ case 1:
+ Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
+ break;
+ case 2:
+ Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
+ break;
+ case 3:
+ Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
+ break;
+ }
+ }
+
+ bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
+ bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
+ int MaskVec[] = {
+ Reverse1 ? 1 : 0,
+ Reverse1 ? 0 : 1,
+ static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
+ static_cast<int>(Reverse2 ? NumElems : NumElems+1)
+ };
+ return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
+ }
+
+ assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
+
+ // Check for a build vector from mostly shuffle plus few inserting.
+ if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
+ return Sh;
+
+ // For SSE 4.1, use insertps to put the high elements into the low element.
+ if (Subtarget.hasSSE41()) {
+ SDValue Result;
+ if (!Op.getOperand(0).isUndef())
+ Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
+ else
+ Result = DAG.getUNDEF(VT);
+
+ for (unsigned i = 1; i < NumElems; ++i) {
+ if (Op.getOperand(i).isUndef()) continue;
+ Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
+ Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
+ }
+ return Result;
+ }
+
+ // Otherwise, expand into a number of unpckl*s, starting by extending each of
+ // our (non-undef) elements to the full vector width with the element in the
+ // bottom slot of the vector (which generates no code for SSE).
+ SmallVector<SDValue, 8> Ops(NumElems);
+ for (unsigned i = 0; i < NumElems; ++i) {
+ if (!Op.getOperand(i).isUndef())
+ Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+ else
+ Ops[i] = DAG.getUNDEF(VT);
+ }
+
+ // Next, we iteratively mix elements, e.g. for v4f32:
+ // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
+ // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
+ // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
+ for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
+ // Generate scaled UNPCKL shuffle mask.
+ SmallVector<int, 16> Mask;
+ for(unsigned i = 0; i != Scale; ++i)
+ Mask.push_back(i);
+ for (unsigned i = 0; i != Scale; ++i)
+ Mask.push_back(NumElems+i);
+ Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
+
+ for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
+ Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
+ }
+ return Ops[0];
+}
+
+// 256-bit AVX can use the vinsertf128 instruction
+// to create 256-bit vectors from two other 128-bit ones.
+// TODO: Detect subvector broadcast here instead of DAG combine?
+static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDLoc dl(Op);
+ MVT ResVT = Op.getSimpleValueType();
+
+ assert((ResVT.is256BitVector() ||
+ ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
+
+ unsigned NumOperands = Op.getNumOperands();
+ unsigned NumZero = 0;
+ unsigned NumNonZero = 0;
+ unsigned NonZeros = 0;
+ for (unsigned i = 0; i != NumOperands; ++i) {
+ SDValue SubVec = Op.getOperand(i);
+ if (SubVec.isUndef())
+ continue;
+ if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
+ ++NumZero;
+ else {
+ assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
+ NonZeros |= 1 << i;
+ ++NumNonZero;
+ }
+ }
+
+ // If we have more than 2 non-zeros, build each half separately.
+ if (NumNonZero > 2) {
+ MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
+ ArrayRef<SDUse> Ops = Op->ops();
+ SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
+ Ops.slice(0, NumOperands/2));
+ SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
+ Ops.slice(NumOperands/2));
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
+ }
+
+ // Otherwise, build it up through insert_subvectors.
+ SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
+ : DAG.getUNDEF(ResVT);
+
+ MVT SubVT = Op.getOperand(0).getSimpleValueType();
+ unsigned NumSubElems = SubVT.getVectorNumElements();
+ for (unsigned i = 0; i != NumOperands; ++i) {
+ if ((NonZeros & (1 << i)) == 0)
+ continue;
+
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
+ Op.getOperand(i),
+ DAG.getIntPtrConstant(i * NumSubElems, dl));
+ }
+
+ return Vec;
+}
+
+// Lower a CONCAT_VECTORS of vXi1 subvectors, preferring a single KSHIFTL when
+// one non-zero subvector sits above all-zero subvectors, and falling back to
+// insert_subvector/KUNPCK based lowering otherwise.
+// TODO: Merge this with LowerAVXCONCAT_VECTORS?
+static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
+ const X86Subtarget &Subtarget,
+ SelectionDAG & DAG) {
+ SDLoc dl(Op);
+ MVT ResVT = Op.getSimpleValueType();
+ unsigned NumOperands = Op.getNumOperands();
+
+ assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
+ "Unexpected number of operands in CONCAT_VECTORS");
+
+ uint64_t Zeros = 0;
+ uint64_t NonZeros = 0;
+ for (unsigned i = 0; i != NumOperands; ++i) {
+ SDValue SubVec = Op.getOperand(i);
+ if (SubVec.isUndef())
+ continue;
+ assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
+ if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
+ Zeros |= (uint64_t)1 << i;
+ else
+ NonZeros |= (uint64_t)1 << i;
+ }
+
+ unsigned NumElems = ResVT.getVectorNumElements();
+
+ // If we are inserting a non-zero vector and there are zeros in the LSBs and
+ // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
+ // insert_subvector will give us two kshifts.
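+ // Illustrative example (assumed operand layout): for
+ //   (concat_vectors v2i1 zeroinitializer, v2i1 X, v2i1 undef, v2i1 undef)
+ // the single non-zero subvector X lands at element offset 2, so we widen X,
+ // KSHIFTL it by 2 and extract the low v8i1 result.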
+ if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
+ Log2_64(NonZeros) != NumOperands - 1) {
+ MVT ShiftVT = ResVT;
+ if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
+ ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
+ unsigned Idx = Log2_64(NonZeros);
+ SDValue SubVec = Op.getOperand(Idx);
+ unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
+ SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
+ DAG.getUNDEF(ShiftVT), SubVec,
+ DAG.getIntPtrConstant(0, dl));
+ Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
+ DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ // If there are zero or one non-zeros we can handle this very simply.
+ if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
+ SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
+ if (!NonZeros)
+ return Vec;
+ unsigned Idx = Log2_64(NonZeros);
+ SDValue SubVec = Op.getOperand(Idx);
+ unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
+ DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
+ }
+
+ if (NumOperands > 2) {
+ MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
+ ArrayRef<SDUse> Ops = Op->ops();
+ SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
+ Ops.slice(0, NumOperands/2));
+ SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
+ Ops.slice(NumOperands/2));
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
+ }
+
+ assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");
+
+ if (ResVT.getVectorNumElements() >= 16)
+ return Op; // The operation is legal with KUNPCK
+
+ SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
+ DAG.getUNDEF(ResVT), Op.getOperand(0),
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
+ DAG.getIntPtrConstant(NumElems/2, dl));
+}
+
+static SDValue LowerCONCAT_VECTORS(SDValue Op,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ if (VT.getVectorElementType() == MVT::i1)
+ return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
+
+ assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
+ (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
+ Op.getNumOperands() == 4)));
+
+ // AVX can use the vinsertf128 instruction to create 256-bit vectors
+ // from two other 128-bit ones.
+
+ // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
+ return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
+}
+
+//===----------------------------------------------------------------------===//
+// Vector shuffle lowering
+//
+// This is an experimental code path for lowering vector shuffles on x86. It is
+// designed to handle arbitrary vector shuffles and blends, gracefully
+// degrading performance as necessary. It works hard to recognize idiomatic
+// shuffles and lower them to optimal instruction patterns without leaving
+// a framework that allows reasonably efficient handling of all vector shuffle
+// patterns.
+//===----------------------------------------------------------------------===//
+
+/// Tiny helper function to identify a no-op mask.
+///
+/// This is a somewhat boring predicate function. It checks whether the mask
+/// array input, which is assumed to be a single-input shuffle mask of the kind
+/// used by the X86 shuffle instructions (not a fully general
+/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
+/// in-place shuffle are no-ops.
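+///
+/// For example (illustrative 4-element masks): <-1, 1, -1, 3> is a no-op,
+/// while <1, 0, 2, 3> is not.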
+static bool isNoopShuffleMask(ArrayRef<int> Mask) {
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ assert(Mask[i] >= -1 && "Out of bound mask element!");
+ if (Mask[i] >= 0 && Mask[i] != i)
+ return false;
+ }
+ return true;
+}
+
+/// Test whether there are elements crossing LaneSizeInBits lanes in this
+/// shuffle mask.
+///
+/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
+/// and we routinely test for these.
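+///
+/// For example (illustrative v4f64 masks with 128-bit lanes): <2, 3, 0, 1>
+/// crosses lanes, while <1, 0, 3, 2> stays within each lane.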
+static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
+ unsigned ScalarSizeInBits,
+ ArrayRef<int> Mask) {
+ assert(LaneSizeInBits && ScalarSizeInBits &&
+ (LaneSizeInBits % ScalarSizeInBits) == 0 &&
+ "Illegal shuffle lane size");
+ int LaneSize = LaneSizeInBits / ScalarSizeInBits;
+ int Size = Mask.size();
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
+ return true;
+ return false;
+}
+
+/// Test whether there are elements crossing 128-bit lanes in this
+/// shuffle mask.
+static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
+ return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
+}
+
+/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
+/// from multiple lanes - this differs from isLaneCrossingShuffleMask to
+/// better support 'repeated mask + lane permute' style shuffles.
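+///
+/// For example (illustrative v8i32 masks with 128-bit lanes):
+/// <4, 5, 6, 7, 0, 1, 2, 3> is lane-crossing but not multi-lane (each
+/// destination lane reads from a single source lane), whereas
+/// <0, 4, 1, 5, 2, 6, 3, 7> mixes two source lanes within each destination
+/// lane.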
+static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
+ unsigned ScalarSizeInBits,
+ ArrayRef<int> Mask) {
+ assert(LaneSizeInBits && ScalarSizeInBits &&
+ (LaneSizeInBits % ScalarSizeInBits) == 0 &&
+ "Illegal shuffle lane size");
+ int NumElts = Mask.size();
+ int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
+ int NumLanes = NumElts / NumEltsPerLane;
+ if (NumLanes > 1) {
+ for (int i = 0; i != NumLanes; ++i) {
+ int SrcLane = -1;
+ for (int j = 0; j != NumEltsPerLane; ++j) {
+ int M = Mask[(i * NumEltsPerLane) + j];
+ if (M < 0)
+ continue;
+ int Lane = (M % NumElts) / NumEltsPerLane;
+ if (SrcLane >= 0 && SrcLane != Lane)
+ return true;
+ SrcLane = Lane;
+ }
+ }
+ }
+ return false;
+}
+
+/// Test whether a shuffle mask is equivalent within each sub-lane.
+///
+/// This checks a shuffle mask to see if it is performing the same
+/// lane-relative shuffle in each sub-lane. This trivially implies
+/// that it is also not lane-crossing. It may however involve a blend from the
+/// same lane of a second vector.
+///
+/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
+/// non-trivial to compute in the face of undef lanes. The representation is
+/// suitable for use with existing 128-bit shuffles as entries from the second
+/// vector have been remapped to [LaneSize, 2*LaneSize).
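+///
+/// For example (an illustrative v8i32 case), the mask
+///   <0, 9, 2, 11, 4, 13, 6, 15>
+/// repeats within each 128-bit lane and yields RepeatedMask = <0, 5, 2, 7>,
+/// with the second-vector entries remapped into [4, 8).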
+static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
+ ArrayRef<int> Mask,
+ SmallVectorImpl<int> &RepeatedMask) {
+ auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
+ RepeatedMask.assign(LaneSize, -1);
+ int Size = Mask.size();
+ for (int i = 0; i < Size; ++i) {
+ assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
+ if (Mask[i] < 0)
+ continue;
+ if ((Mask[i] % Size) / LaneSize != i / LaneSize)
+ // This entry crosses lanes, so there is no way to model this shuffle.
+ return false;
+
+ // Ok, handle the in-lane shuffles by detecting if and when they repeat.
+ // Adjust second vector indices to start at LaneSize instead of Size.
+ int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
+ : Mask[i] % LaneSize + LaneSize;
+ if (RepeatedMask[i % LaneSize] < 0)
+ // This is the first non-undef entry in this slot of a 128-bit lane.
+ RepeatedMask[i % LaneSize] = LocalM;
+ else if (RepeatedMask[i % LaneSize] != LocalM)
+ // Found a mismatch with the repeated mask.
+ return false;
+ }
+ return true;
+}
+
+/// Test whether a shuffle mask is equivalent within each 128-bit lane.
+static bool
+is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
+ SmallVectorImpl<int> &RepeatedMask) {
+ return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
+}
+
+static bool
+is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
+ SmallVector<int, 32> RepeatedMask;
+ return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
+}
+
+/// Test whether a shuffle mask is equivalent within each 256-bit lane.
+static bool
+is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
+ SmallVectorImpl<int> &RepeatedMask) {
+ return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
+}
+
+/// Test whether a target shuffle mask is equivalent within each sub-lane.
+/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
+static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
+ unsigned EltSizeInBits,
+ ArrayRef<int> Mask,
+ SmallVectorImpl<int> &RepeatedMask) {
+ int LaneSize = LaneSizeInBits / EltSizeInBits;
+ RepeatedMask.assign(LaneSize, SM_SentinelUndef);
+ int Size = Mask.size();
+ for (int i = 0; i < Size; ++i) {
+ assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
+ if (Mask[i] == SM_SentinelUndef)
+ continue;
+ if (Mask[i] == SM_SentinelZero) {
+ if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
+ return false;
+ RepeatedMask[i % LaneSize] = SM_SentinelZero;
+ continue;
+ }
+ if ((Mask[i] % Size) / LaneSize != i / LaneSize)
+ // This entry crosses lanes, so there is no way to model this shuffle.
+ return false;
+
+ // Ok, handle the in-lane shuffles by detecting if and when they repeat.
+ // Adjust second vector indices to start at LaneSize instead of Size.
+ int LocalM =
+ Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
+ if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
+ // This is the first non-undef entry in this slot of a 128-bit lane.
+ RepeatedMask[i % LaneSize] = LocalM;
+ else if (RepeatedMask[i % LaneSize] != LocalM)
+ // Found a mismatch with the repeated mask.
+ return false;
+ }
+ return true;
+}
+
+/// Test whether a target shuffle mask is equivalent within each sub-lane.
+/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
+static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
+ ArrayRef<int> Mask,
+ SmallVectorImpl<int> &RepeatedMask) {
+ return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
+ Mask, RepeatedMask);
+}
+
+/// Checks whether the vector elements referenced by two shuffle masks are
+/// equivalent.
+static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
+ int Idx, int ExpectedIdx) {
+ assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
+ ExpectedIdx < MaskSize && "Out of range element index");
+ if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
+ return false;
+
+ switch (Op.getOpcode()) {
+ case ISD::BUILD_VECTOR:
+ // If the values are build vectors, we can look through them to find
+ // equivalent inputs that make the shuffles equivalent.
+ // TODO: Handle MaskSize != Op.getNumOperands()?
+ if (MaskSize == (int)Op.getNumOperands() &&
+ MaskSize == (int)ExpectedOp.getNumOperands())
+ return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
+ break;
+ case X86ISD::VBROADCAST:
+ case X86ISD::VBROADCAST_LOAD:
+ // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
+ return (Op == ExpectedOp &&
+ (int)Op.getValueType().getVectorNumElements() == MaskSize);
+ case X86ISD::HADD:
+ case X86ISD::HSUB:
+ case X86ISD::FHADD:
+ case X86ISD::FHSUB:
+ case X86ISD::PACKSS:
+ case X86ISD::PACKUS:
+ // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
+ // TODO: Handle MaskSize != NumElts?
+ // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
+ if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
+ MVT VT = Op.getSimpleValueType();
+ int NumElts = VT.getVectorNumElements();
+ if (MaskSize == NumElts) {
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumEltsPerLane = NumElts / NumLanes;
+ int NumHalfEltsPerLane = NumEltsPerLane / 2;
+ bool SameLane =
+ (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
+ bool SameElt =
+ (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
+ return SameLane && SameElt;
+ }
+ }
+ break;
+ }
+
+ return false;
+}
+
+/// Checks whether a shuffle mask is equivalent to an explicit list of
+/// arguments.
+///
+/// This is a fast way to test a shuffle mask against a fixed pattern:
+///
+/// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
+///
+/// It returns true if the mask is exactly as wide as ExpectedMask, and each
+/// element of the mask is either -1 (signifying undef) or equal to the
+/// corresponding element of ExpectedMask.
+static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
+ SDValue V1 = SDValue(),
+ SDValue V2 = SDValue()) {
+ int Size = Mask.size();
+ if (Size != (int)ExpectedMask.size())
+ return false;
+
+ for (int i = 0; i < Size; ++i) {
+ assert(Mask[i] >= -1 && "Out of bound mask element!");
+ int MaskIdx = Mask[i];
+ int ExpectedIdx = ExpectedMask[i];
+ if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
+ SDValue MaskV = MaskIdx < Size ? V1 : V2;
+ SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
+ MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
+ ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
+ if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
+ return false;
+ }
+ }
+ return true;
+}
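+
+// For example, isShuffleEquivalent({-1, 4, 1, 5}, {0, 4, 1, 5}) is true, since
+// the undef element matches anything. A mismatched non-undef index is only
+// accepted when IsElementEquivalent proves the referenced elements are
+// identical (e.g. matching operands of the V1/V2 build vectors).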
+
+/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
+///
+/// The masks must be exactly the same width.
+///
+/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
+/// value in ExpectedMask is always accepted. Otherwise the indices must match.
+///
+/// SM_SentinelZero is accepted as a valid negative index but must match in
+/// both.
+static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
+ ArrayRef<int> ExpectedMask,
+ SDValue V1 = SDValue(),
+ SDValue V2 = SDValue()) {
+ int Size = Mask.size();
+ if (Size != (int)ExpectedMask.size())
+ return false;
+ assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
+ "Illegal target shuffle mask");
+
+ // Check for out-of-range target shuffle mask indices.
+ if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
+ return false;
+
+ // Don't use V1/V2 if they're not the same size as the shuffle mask type.
+ if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
+ V1 = SDValue();
+ if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
+ V2 = SDValue();
+
+ for (int i = 0; i < Size; ++i) {
+ int MaskIdx = Mask[i];
+ int ExpectedIdx = ExpectedMask[i];
+ if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
+ continue;
+ if (0 <= MaskIdx && 0 <= ExpectedIdx) {
+ SDValue MaskV = MaskIdx < Size ? V1 : V2;
+ SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
+ MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
+ ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
+ if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
+ continue;
+ }
+ // TODO - handle SM_Sentinel equivalences.
+ return false;
+ }
+ return true;
+}
+
+// Attempt to create a shuffle mask from a VSELECT condition mask.
+static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
+ SDValue Cond) {
+ EVT CondVT = Cond.getValueType();
+ unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
+ unsigned NumElts = CondVT.getVectorNumElements();
+
+ APInt UndefElts;
+ SmallVector<APInt, 32> EltBits;
+ if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
+ true, false))
+ return false;
+
+ Mask.resize(NumElts, SM_SentinelUndef);
+
+ for (int i = 0; i != (int)NumElts; ++i) {
+ Mask[i] = i;
+ // Arbitrarily choose from the 2nd operand if the select condition element
+ // is undef.
+ // TODO: Can we do better by matching patterns such as even/odd?
+ if (UndefElts[i] || EltBits[i].isNullValue())
+ Mask[i] += NumElts;
+ }
+
+ return true;
+}
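+
+// For example, a v4i32 VSELECT whose condition is the constant vector
+// <-1, 0, -1, 0> produces the shuffle mask <0, 5, 2, 7>: true lanes select
+// from the first operand (indices 0..3) and false or undef lanes from the
+// second operand (indices 4..7).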
+
+// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
+// instructions.
+static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
+ if (VT != MVT::v8i32 && VT != MVT::v8f32)
+ return false;
+
+ SmallVector<int, 8> Unpcklwd;
+ createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
+ /* Unary = */ false);
+ SmallVector<int, 8> Unpckhwd;
+ createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
+ /* Unary = */ false);
+ bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd) ||
+ isTargetShuffleEquivalent(VT, Mask, Unpckhwd));
+ return IsUnpackwdMask;
+}
+
+static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
+ // Create 128-bit vector type based on mask size.
+ MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
+ MVT VT = MVT::getVectorVT(EltVT, Mask.size());
+
+ // We can't assume a canonical shuffle mask, so try the commuted version too.
+ SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
+ ShuffleVectorSDNode::commuteMask(CommutedMask);
+
+ // Match any of unary/binary or low/high.
+ for (unsigned i = 0; i != 4; ++i) {
+ SmallVector<int, 16> UnpackMask;
+ createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
+ if (isTargetShuffleEquivalent(VT, Mask, UnpackMask) ||
+ isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask))
+ return true;
+ }
+ return false;
+}
+
+/// Return true if a shuffle mask chooses elements identically in its top and
+/// bottom halves. For example, any splat mask has the same top and bottom
+/// halves. If an element is undefined in only one half of the mask, the halves
+/// are not considered identical.
+static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
+ assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
+ unsigned HalfSize = Mask.size() / 2;
+ for (unsigned i = 0; i != HalfSize; ++i) {
+ if (Mask[i] != Mask[i + HalfSize])
+ return false;
+ }
+ return true;
+}
+
+/// Get a 4-lane 8-bit shuffle immediate for a mask.
+///
+/// This helper function produces an 8-bit shuffle immediate corresponding to
+/// the ubiquitous shuffle encoding scheme used in x86 instructions for
+/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
+/// example.
+///
+/// NB: We rely heavily on "undef" masks preserving the input lane.
+static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
+ assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
+ assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
+ assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
+ assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
+ assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
+
+ // If the mask only uses one non-undef element, then fully 'splat' it to
+ // improve later broadcast matching.
+ int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
+ assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
+
+ int FirstElt = Mask[FirstIndex];
+ if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
+ return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
+
+ unsigned Imm = 0;
+ Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
+ Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
+ Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
+ Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
+ return Imm;
+}
+
+static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
+ SelectionDAG &DAG) {
+ return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
+}
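+
+// For example, getV4X86ShuffleImm({3, 1, 2, 0}) returns 0x27 (0b00100111):
+// bits [1:0] select element 0, bits [3:2] element 1, and so on, matching the
+// PSHUFD/SHUFPS immediate layout. A mask with a single non-undef element such
+// as {-1, 2, -1, -1} is splatted and returns 0xAA.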
+
+// The shuffle result has the form:
+//   0*a[0] 0*a[1] ... 0*a[n], n >= 0,
+// i.e. the non-zero elements a[] appear in ascending order, possibly
+// interleaved with zero elements. Each element of Zeroable corresponds to a
+// particular element of Mask, as described in the
+// computeZeroableShuffleElements function.
+//
+// The function looks for a sub-mask whose non-zero elements are in increasing
+// order. If such a sub-mask exists, the function returns true.
+static bool isNonZeroElementsInOrder(const APInt &Zeroable,
+ ArrayRef<int> Mask, const EVT &VectorType,
+ bool &IsZeroSideLeft) {
+ int NextElement = -1;
+ // Check if the Mask's nonzero elements are in increasing order.
+ for (int i = 0, e = Mask.size(); i < e; i++) {
+ // Check that the mask's zero elements are built from only zeros.
+ assert(Mask[i] >= -1 && "Out of bound mask element!");
+ if (Mask[i] < 0)
+ return false;
+ if (Zeroable[i])
+ continue;
+ // Find the lowest non-zero element.
+ if (NextElement < 0) {
+ NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
+ IsZeroSideLeft = NextElement != 0;
+ }
+ // Exit if the mask's non-zero elements are not in increasing order.
+ if (NextElement != Mask[i])
+ return false;
+ NextElement++;
+ }
+ return true;
+}
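+
+// For example, with Mask <8, 0, 8, 1, 8, 2, 8, 3> on a v8i32 shuffle where the
+// even positions are Zeroable, the non-zero mask elements 0, 1, 2, 3 are in
+// increasing order, so this returns true with IsZeroSideLeft == false and the
+// shuffle is a candidate for the VEXPAND lowering below (expand mask
+// 0b10101010).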
+
+/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
+static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ int Size = Mask.size();
+ int LaneSize = 128 / VT.getScalarSizeInBits();
+ const int NumBytes = VT.getSizeInBits() / 8;
+ const int NumEltBytes = VT.getScalarSizeInBits() / 8;
+
+ assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
+ (Subtarget.hasAVX2() && VT.is256BitVector()) ||
+ (Subtarget.hasBWI() && VT.is512BitVector()));
+
+ SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
+ // Sign bit set in i8 mask means zero element.
+ SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
+
+ SDValue V;
+ for (int i = 0; i < NumBytes; ++i) {
+ int M = Mask[i / NumEltBytes];
+ if (M < 0) {
+ PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
+ continue;
+ }
+ if (Zeroable[i / NumEltBytes]) {
+ PSHUFBMask[i] = ZeroMask;
+ continue;
+ }
+
+ // We can only use a single input of V1 or V2.
+ SDValue SrcV = (M >= Size ? V2 : V1);
+ if (V && V != SrcV)
+ return SDValue();
+ V = SrcV;
+ M %= Size;
+
+ // PSHUFB can't cross lanes, ensure this doesn't happen.
+ if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
+ return SDValue();
+
+ M = M % LaneSize;
+ M = M * NumEltBytes + (i % NumEltBytes);
+ PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
+ }
+ assert(V && "Failed to find a source input");
+
+ MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
+ DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
+}
+
+static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ const SDLoc &dl);
+
+// X86 has a dedicated shuffle pattern that can be lowered to VEXPAND
+static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
+ const APInt &Zeroable,
+ ArrayRef<int> Mask, SDValue &V1,
+ SDValue &V2, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ bool IsLeftZeroSide = true;
+ if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
+ IsLeftZeroSide))
+ return SDValue();
+ unsigned VEXPANDMask = (~Zeroable).getZExtValue();
+ MVT IntegerType =
+ MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+ SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
+ unsigned NumElts = VT.getVectorNumElements();
+ assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
+ "Unexpected number of vector elements");
+ SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
+ Subtarget, DAG, DL);
+ SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
+ SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
+ return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
+}
+
+static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
+ unsigned &UnpackOpcode, bool IsUnary,
+ ArrayRef<int> TargetMask, const SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ int NumElts = VT.getVectorNumElements();
+
+ bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
+ for (int i = 0; i != NumElts; i += 2) {
+ int M1 = TargetMask[i + 0];
+ int M2 = TargetMask[i + 1];
+ Undef1 &= (SM_SentinelUndef == M1);
+ Undef2 &= (SM_SentinelUndef == M2);
+ Zero1 &= isUndefOrZero(M1);
+ Zero2 &= isUndefOrZero(M2);
+ }
+ assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
+ "Zeroable shuffle detected");
+
+ // Attempt to match the target mask against the unpack lo/hi mask patterns.
+ SmallVector<int, 64> Unpckl, Unpckh;
+ createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
+ if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, V1,
+ (IsUnary ? V1 : V2))) {
+ UnpackOpcode = X86ISD::UNPCKL;
+ V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
+ V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
+ return true;
+ }
+
+ createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
+ if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, V1,
+ (IsUnary ? V1 : V2))) {
+ UnpackOpcode = X86ISD::UNPCKH;
+ V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
+ V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
+ return true;
+ }
+
+ // If a unary shuffle, attempt to match as an unpack lo/hi with zero.
+ if (IsUnary && (Zero1 || Zero2)) {
+ // Don't bother if we can blend instead.
+ if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
+ isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
+ return false;
+
+ bool MatchLo = true, MatchHi = true;
+ for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
+ int M = TargetMask[i];
+
+ // Ignore if the input is known to be zero or the index is undef.
+ if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
+ (M == SM_SentinelUndef))
+ continue;
+
+ MatchLo &= (M == Unpckl[i]);
+ MatchHi &= (M == Unpckh[i]);
+ }
+
+ if (MatchLo || MatchHi) {
+ UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
+ V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
+ V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
+ return true;
+ }
+ }
+
+ // If a binary shuffle, commute and try again.
+ if (!IsUnary) {
+ ShuffleVectorSDNode::commuteMask(Unpckl);
+ if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) {
+ UnpackOpcode = X86ISD::UNPCKL;
+ std::swap(V1, V2);
+ return true;
+ }
+
+ ShuffleVectorSDNode::commuteMask(Unpckh);
+ if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) {
+ UnpackOpcode = X86ISD::UNPCKH;
+ std::swap(V1, V2);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+// X86 has dedicated unpack instructions that can handle specific blend
+// operations: UNPCKH and UNPCKL.
+static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1, SDValue V2,
+ SelectionDAG &DAG) {
+ SmallVector<int, 8> Unpckl;
+ createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
+ if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
+ return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
+
+ SmallVector<int, 8> Unpckh;
+ createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
+ if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
+ return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
+
+ // Commute and try again.
+ ShuffleVectorSDNode::commuteMask(Unpckl);
+ if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
+ return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
+
+ ShuffleVectorSDNode::commuteMask(Unpckh);
+ if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
+ return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
+
+ return SDValue();
+}
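+
+// For example, on v4i32 the binary unpack-low mask is <0, 4, 1, 5> and the
+// unpack-high mask is <2, 6, 3, 7>. A shuffle mask of <4, 0, 5, 1> matches
+// neither directly but does match the commuted unpack-low mask, so it is
+// lowered as UNPCKL(V2, V1).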
+
+/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
+/// followed by unpack 256-bit.
+static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
+ SmallVector<int, 32> Unpckl, Unpckh;
+ createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
+ createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
+
+ unsigned UnpackOpcode;
+ if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
+ UnpackOpcode = X86ISD::UNPCKL;
+ else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
+ UnpackOpcode = X86ISD::UNPCKH;
+ else
+ return SDValue();
+
+ // This is a "natural" unpack operation (rather than the 128-bit sectored
+ // operation implemented by AVX). We need to rearrange 64-bit chunks of the
+ // input in order to use the x86 instruction.
+ V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
+ DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
+ V1 = DAG.getBitcast(VT, V1);
+ return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
+}
+
+// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
+// source into the lower elements and zeroing the upper elements.
+static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
+ ArrayRef<int> Mask, const APInt &Zeroable,
+ const X86Subtarget &Subtarget) {
+ if (!VT.is512BitVector() && !Subtarget.hasVLX())
+ return false;
+
+ unsigned NumElts = Mask.size();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ unsigned MaxScale = 64 / EltSizeInBits;
+
+ for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
+ unsigned SrcEltBits = EltSizeInBits * Scale;
+ if (SrcEltBits < 32 && !Subtarget.hasBWI())
+ continue;
+ unsigned NumSrcElts = NumElts / Scale;
+ if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
+ continue;
+ unsigned UpperElts = NumElts - NumSrcElts;
+ if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
+ continue;
+ SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
+ SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
+ DstVT = MVT::getIntegerVT(EltSizeInBits);
+ if ((NumSrcElts * EltSizeInBits) >= 128) {
+ // ISD::TRUNCATE
+ DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
+ } else {
+ // X86ISD::VTRUNC
+ DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
+ }
+ return true;
+ }
+
+ return false;
+}
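+
+// For example, on an AVX512VL+BW target a v16i8 shuffle whose mask starts
+// <0, 2, 4, 6, 8, 10, 12, 14> with the upper eight elements zeroable matches
+// at Scale == 2: SrcVT becomes v8i16 and, since only 64 bits of data remain,
+// DstVT becomes the 128-bit v16i8 type for an X86ISD::VTRUNC node.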
+
+// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
+// element padding to the final DstVT.
+static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG, bool ZeroUppers) {
+ MVT SrcVT = Src.getSimpleValueType();
+ MVT DstSVT = DstVT.getScalarType();
+ unsigned NumDstElts = DstVT.getVectorNumElements();
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
+
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
+ return SDValue();
+
+ // Perform a direct ISD::TRUNCATE if possible.
+ if (NumSrcElts == NumDstElts)
+ return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
+
+ if (NumSrcElts > NumDstElts) {
+ MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
+ return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
+ }
+
+ if ((NumSrcElts * DstEltSizeInBits) >= 128) {
+ MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
+ return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
+ DstVT.getSizeInBits());
+ }
+
+ // Non-VLX targets must truncate from a 512-bit type, so we need to
+ // widen, truncate and then possibly extract the original subvector.
+ if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
+ SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
+ return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
+ }
+
+ // Fallback to a X86ISD::VTRUNC, padding if necessary.
+ MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
+ SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
+ if (DstVT != TruncVT)
+ Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
+ DstVT.getSizeInBits());
+ return Trunc;
+}
+
+// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
+//
+// An example is the following:
+//
+// t0: ch = EntryToken
+// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
+// t25: v4i32 = truncate t2
+// t41: v8i16 = bitcast t25
+// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
+// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
+// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
+// t18: v2i64 = bitcast t51
+//
+// One can just use a single vpmovdw instruction; without avx512vl we need to
+// use the zmm variant and extract the lower subvector, padding with zeroes.
+// TODO: Merge with lowerShuffleAsVTRUNC.
+static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
+ if (!Subtarget.hasAVX512())
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ unsigned MaxScale = 64 / EltSizeInBits;
+ for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
+ unsigned NumSrcElts = NumElts / Scale;
+ unsigned UpperElts = NumElts - NumSrcElts;
+ if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
+ !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
+ continue;
+
+ SDValue Src = V1;
+ if (!Src.hasOneUse())
+ return SDValue();
+
+ Src = peekThroughOneUseBitcasts(Src);
+ if (Src.getOpcode() != ISD::TRUNCATE ||
+ Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
+ return SDValue();
+ Src = Src.getOperand(0);
+
+ // VPMOVWB is only available with avx512bw.
+ MVT SrcVT = Src.getSimpleValueType();
+ if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
+ !Subtarget.hasBWI())
+ return SDValue();
+
+ bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
+ return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
+ }
+
+ return SDValue();
+}
+
+// Attempt to match binary shuffle patterns as a truncate.
+static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unexpected VTRUNC type");
+ if (!Subtarget.hasAVX512())
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ unsigned MaxScale = 64 / EltSizeInBits;
+ for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
+ // TODO: Support non-BWI VPMOVWB truncations?
+ unsigned SrcEltBits = EltSizeInBits * Scale;
+ if (SrcEltBits < 32 && !Subtarget.hasBWI())
+ continue;
+
+ // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...>
+ // Bail if the V2 elements are undef.
+ unsigned NumHalfSrcElts = NumElts / Scale;
+ unsigned NumSrcElts = 2 * NumHalfSrcElts;
+ if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
+ isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
+ continue;
+
+ // The elements beyond the truncation must be undef/zero.
+ unsigned UpperElts = NumElts - NumSrcElts;
+ if (UpperElts > 0 &&
+ !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
+ continue;
+ bool UndefUppers =
+ UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
+
+ // As we're using both sources then we need to concat them together
+ // and truncate from the double-sized src.
+ MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
+ SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
+
+ MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
+ MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
+ Src = DAG.getBitcast(SrcVT, Src);
+ return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
+ }
+
+ return SDValue();
+}
+
+/// Check whether a compaction lowering can be done by dropping even
+/// elements and compute how many times even elements must be dropped.
+///
+/// This handles shuffles which take every Nth element where N is a power of
+/// two. Example shuffle masks:
+///
+/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
+/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
+/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
+/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
+/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
+///
+/// Any of these lanes can of course be undef.
+///
+/// This routine only supports N <= 3.
+/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
+/// for larger N.
+///
+/// \returns N above, or the number of times even elements must be dropped if
+/// there is such a number. Otherwise returns zero.
+static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
+ bool IsSingleInput) {
+ // The modulus for the shuffle vector entries is based on whether this is
+ // a single input or not.
+ int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
+ assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
+ "We should only be called with masks with a power-of-2 size!");
+
+ uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
+
+ // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
+ // and 2^3 simultaneously. This is because we may have ambiguity with
+ // partially undef inputs.
+ bool ViableForN[3] = {true, true, true};
+
+ for (int i = 0, e = Mask.size(); i < e; ++i) {
+ // Ignore undef lanes; we'll optimistically collapse them to the pattern we
+ // want.
+ if (Mask[i] < 0)
+ continue;
+
+ bool IsAnyViable = false;
+ for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
+ if (ViableForN[j]) {
+ uint64_t N = j + 1;
+
+ // The shuffle mask must be equal to (i * 2^N) % M.
+ if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
+ IsAnyViable = true;
+ else
+ ViableForN[j] = false;
+ }
+ // Early exit if we exhaust the possible powers of two.
+ if (!IsAnyViable)
+ break;
+ }
+
+ for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
+ if (ViableForN[j])
+ return j + 1;
+
+ // Return 0 as there is no viable power of two.
+ return 0;
+}
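+
+// For example, the single-input v16i8 mask <0, 2, 4, 6, 8, 10, 12, 14, 0, 2,
+// 4, 6, 8, 10, 12, 14> returns N == 1 (every other element is dropped once),
+// while repeating <0, 4, 8, 12> four times returns N == 2. Masks that are not
+// a power-of-2 stride of the input return 0.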
+
+// X86 has dedicated pack instructions that can handle specific truncation
+// operations: PACKSS and PACKUS.
+// Checks for compaction shuffle masks if MaxStages > 1.
+// TODO: Add support for matching multiple PACKSS/PACKUS stages.
+static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
+ unsigned &PackOpcode, ArrayRef<int> TargetMask,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ unsigned MaxStages = 1) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned BitSize = VT.getScalarSizeInBits();
+ assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
+ "Illegal maximum compaction");
+
+ auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
+ unsigned NumSrcBits = PackVT.getScalarSizeInBits();
+ unsigned NumPackedBits = NumSrcBits - BitSize;
+ SDValue VV1 = DAG.getBitcast(PackVT, N1);
+ SDValue VV2 = DAG.getBitcast(PackVT, N2);
+ if (Subtarget.hasSSE41() || BitSize == 8) {
+ APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
+ if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
+ (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
+ V1 = VV1;
+ V2 = VV2;
+ SrcVT = PackVT;
+ PackOpcode = X86ISD::PACKUS;
+ return true;
+ }
+ }
+ if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > NumPackedBits) &&
+ (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > NumPackedBits)) {
+ V1 = VV1;
+ V2 = VV2;
+ SrcVT = PackVT;
+ PackOpcode = X86ISD::PACKSS;
+ return true;
+ }
+ return false;
+ };
+
+ // Attempt to match against wider and wider compaction patterns.
+ for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
+ MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
+ MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
+
+ // Try binary shuffle.
+ SmallVector<int, 32> BinaryMask;
+ createPackShuffleMask(VT, BinaryMask, false, NumStages);
+ if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, V1, V2))
+ if (MatchPACK(V1, V2, PackVT))
+ return true;
+
+ // Try unary shuffle.
+ SmallVector<int, 32> UnaryMask;
+ createPackShuffleMask(VT, UnaryMask, true, NumStages);
+ if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, V1))
+ if (MatchPACK(V1, V1, PackVT))
+ return true;
+ }
+
+ return false;
+}
+
+static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT PackVT;
+ unsigned PackOpcode;
+ unsigned SizeBits = VT.getSizeInBits();
+ unsigned EltBits = VT.getScalarSizeInBits();
+ unsigned MaxStages = Log2_32(64 / EltBits);
+ if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
+ Subtarget, MaxStages))
+ return SDValue();
+
+ unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
+ unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
+
+ // Don't lower multi-stage packs on AVX512, truncation is better.
+ if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
+ return SDValue();
+
+ // Pack to the largest type possible:
+ // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
+ unsigned MaxPackBits = 16;
+ if (CurrentEltBits > 16 &&
+ (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
+ MaxPackBits = 32;
+
+ // Repeatedly pack down to the target size.
+ SDValue Res;
+ for (unsigned i = 0; i != NumStages; ++i) {
+ unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
+ unsigned NumSrcElts = SizeBits / SrcEltBits;
+ MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
+ MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
+ MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
+ MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
+ Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
+ DAG.getBitcast(SrcVT, V2));
+ V1 = V2 = Res;
+ CurrentEltBits /= 2;
+ }
+ assert(Res && Res.getValueType() == VT &&
+ "Failed to lower compaction shuffle");
+ return Res;
+}
+
+/// Try to emit a bitmask instruction for a shuffle.
+///
+/// This handles cases where we can model a blend exactly as a bitmask due to
+/// one of the inputs being zeroable.
+static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT MaskVT = VT;
+ MVT EltVT = VT.getVectorElementType();
+ SDValue Zero, AllOnes;
+ // Use f64 if i64 isn't legal.
+ if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
+ EltVT = MVT::f64;
+ MaskVT = MVT::getVectorVT(EltVT, Mask.size());
+ }
+
+ MVT LogicVT = VT;
+ if (EltVT == MVT::f32 || EltVT == MVT::f64) {
+ Zero = DAG.getConstantFP(0.0, DL, EltVT);
+ APFloat AllOnesValue = APFloat::getAllOnesValue(
+ SelectionDAG::EVTToAPFloatSemantics(EltVT), EltVT.getSizeInBits());
+ AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
+ LogicVT =
+ MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
+ } else {
+ Zero = DAG.getConstant(0, DL, EltVT);
+ AllOnes = DAG.getAllOnesConstant(DL, EltVT);
+ }
+
+ SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
+ SDValue V;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ if (Zeroable[i])
+ continue;
+ if (Mask[i] % Size != i)
+ return SDValue(); // Not a blend.
+ if (!V)
+ V = Mask[i] < Size ? V1 : V2;
+ else if (V != (Mask[i] < Size ? V1 : V2))
+ return SDValue(); // Can only let one input through the mask.
+
+ VMaskOps[i] = AllOnes;
+ }
+ if (!V)
+ return SDValue(); // No non-zeroable elements!
+
+ SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
+ VMask = DAG.getBitcast(LogicVT, VMask);
+ V = DAG.getBitcast(LogicVT, V);
+ SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
+ return DAG.getBitcast(VT, And);
+}
+
+/// Try to emit a blend instruction for a shuffle using bit math.
+///
+/// This is used as a fallback approach when first class blend instructions are
+/// unavailable. Currently it is only suitable for integer vectors, but could
+/// be generalized for floating point vectors if desirable.
+static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(VT.isInteger() && "Only supports integer vector types!");
+ MVT EltVT = VT.getVectorElementType();
+ SDValue Zero = DAG.getConstant(0, DL, EltVT);
+ SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
+ SmallVector<SDValue, 16> MaskOps;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
+ return SDValue(); // Shuffled input!
+ MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
+ }
+
+ SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
+ V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
+ V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
+ return DAG.getNode(ISD::OR, DL, VT, V1, V2);
+}
+
+static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
+ SDValue PreservedSrc,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG);
+
+static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
+ MutableArrayRef<int> Mask,
+ const APInt &Zeroable, bool &ForceV1Zero,
+ bool &ForceV2Zero, uint64_t &BlendMask) {
+ bool V1IsZeroOrUndef =
+ V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
+ bool V2IsZeroOrUndef =
+ V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
+
+ BlendMask = 0;
+ ForceV1Zero = false, ForceV2Zero = false;
+ assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
+
+ // Attempt to generate the binary blend mask. If an input is zero then
+ // we can use any lane.
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef)
+ continue;
+ if (M == i)
+ continue;
+ if (M == i + Size) {
+ BlendMask |= 1ull << i;
+ continue;
+ }
+ if (Zeroable[i]) {
+ if (V1IsZeroOrUndef) {
+ ForceV1Zero = true;
+ Mask[i] = i;
+ continue;
+ }
+ if (V2IsZeroOrUndef) {
+ ForceV2Zero = true;
+ BlendMask |= 1ull << i;
+ Mask[i] = i + Size;
+ continue;
+ }
+ }
+ return false;
+ }
+ return true;
+}
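+
+// For example, the v4i32 mask <0, 5, 2, 7> is a pure blend of V1 and V2 with
+// BlendMask == 0b1010. When a lane is zeroable but its mask index selects
+// neither i nor i + Size, the match only succeeds if one of the inputs is
+// all-zeros or undef; that input is then forced to a real zero vector and the
+// lane is rewritten to read from it.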
+
+static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
+ int Scale) {
+ uint64_t ScaledMask = 0;
+ for (int i = 0; i != Size; ++i)
+ if (BlendMask & (1ull << i))
+ ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
+ return ScaledMask;
+}
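+
+// For example, scaleVectorShuffleBlendMask(0b0101, 4, 2) widens each selected
+// element to two adjacent positions and returns 0b00110011.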
+
+/// Try to emit a blend instruction for a shuffle.
+///
+/// This doesn't do any checks for the availability of instructions for blending
+/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
+/// be matched in the backend with the type given. What it does check for is
+/// that the shuffle mask is a blend, or convertible into a blend with zero.
+static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Original,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ uint64_t BlendMask = 0;
+ bool ForceV1Zero = false, ForceV2Zero = false;
+ SmallVector<int, 64> Mask(Original.begin(), Original.end());
+ if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
+ BlendMask))
+ return SDValue();
+
+ // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
+ if (ForceV1Zero)
+ V1 = getZeroVector(VT, Subtarget, DAG, DL);
+ if (ForceV2Zero)
+ V2 = getZeroVector(VT, Subtarget, DAG, DL);
+
+ switch (VT.SimpleTy) {
+ case MVT::v4i64:
+ case MVT::v8i32:
+ assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
+ LLVM_FALLTHROUGH;
+ case MVT::v4f64:
+ case MVT::v8f32:
+ assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
+ LLVM_FALLTHROUGH;
+ case MVT::v2f64:
+ case MVT::v2i64:
+ case MVT::v4f32:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
+ return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
+ DAG.getTargetConstant(BlendMask, DL, MVT::i8));
+ case MVT::v16i16: {
+ assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
+ SmallVector<int, 8> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
+ // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
+ assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
+ BlendMask = 0;
+ for (int i = 0; i < 8; ++i)
+ if (RepeatedMask[i] >= 8)
+ BlendMask |= 1ull << i;
+ return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
+ DAG.getTargetConstant(BlendMask, DL, MVT::i8));
+ }
+ // Use PBLENDW for lower/upper lanes and then blend lanes.
+ // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
+ // merge to VSELECT where useful.
+ uint64_t LoMask = BlendMask & 0xFF;
+ uint64_t HiMask = (BlendMask >> 8) & 0xFF;
+ if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
+ SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
+ DAG.getTargetConstant(LoMask, DL, MVT::i8));
+ SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
+ DAG.getTargetConstant(HiMask, DL, MVT::i8));
+ return DAG.getVectorShuffle(
+ MVT::v16i16, DL, Lo, Hi,
+ {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
+ }
+ LLVM_FALLTHROUGH;
+ }
+ case MVT::v32i8:
+ assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
+ LLVM_FALLTHROUGH;
+ case MVT::v16i8: {
+ assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
+
+ // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return Masked;
+
+ if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
+ MVT IntegerType =
+ MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+ SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
+ return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
+ }
+
+ // If we have VPTERNLOG, we can use that as a bit blend.
+ if (Subtarget.hasVLX())
+ if (SDValue BitBlend =
+ lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
+ return BitBlend;
+
+ // Scale the blend by the number of bytes per element.
+ int Scale = VT.getScalarSizeInBits() / 8;
+
+ // This form of blend is always done on bytes. Compute the byte vector
+ // type.
+ MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
+
+ // x86 allows load folding with blendvb from the 2nd source operand. But
+ // we are still using LLVM select here (see comment below), so that's V1.
+ // If V2 can be load-folded and V1 cannot be load-folded, then commute to
+ // allow that load-folding possibility.
+ if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
+ ShuffleVectorSDNode::commuteMask(Mask);
+ std::swap(V1, V2);
+ }
+
+ // Compute the VSELECT mask. Note that VSELECT is really confusing in the
+ // mix of LLVM's code generator and the x86 backend. We tell the code
+ // generator that boolean values in the elements of an x86 vector register
+ // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
+ // mapping a select to operand #1, and 'false' mapping to operand #2. The
+ // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
+ // of the element (the remaining are ignored) and 0 in that high bit would
+ // mean operand #1 while 1 in the high bit would mean operand #2. So while
+ // the LLVM model for boolean values in vector elements gets the relevant
+ // bit set, it is set backwards and over constrained relative to x86's
+ // actual model.
+ SmallVector<SDValue, 32> VSELECTMask;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ for (int j = 0; j < Scale; ++j)
+ VSELECTMask.push_back(
+ Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
+ : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
+ MVT::i8));
+
+ V1 = DAG.getBitcast(BlendVT, V1);
+ V2 = DAG.getBitcast(BlendVT, V2);
+ return DAG.getBitcast(
+ VT,
+ DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
+ V1, V2));
+ }
+ case MVT::v16f32:
+ case MVT::v8f64:
+ case MVT::v8i64:
+ case MVT::v16i32:
+ case MVT::v32i16:
+ case MVT::v64i8: {
+ // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
+ bool OptForSize = DAG.shouldOptForSize();
+ if (!OptForSize) {
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return Masked;
+ }
+
+ // Otherwise load an immediate into a GPR, cast to k-register, and use a
+ // masked move.
+ MVT IntegerType =
+ MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+ SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
+ return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
+ }
+ default:
+ llvm_unreachable("Not a supported integer vector type!");
+ }
+}
+
+/// Try to lower as a blend of elements from two inputs followed by
+/// a single-input permutation.
+///
+/// This matches the pattern where we can blend elements from two inputs and
+/// then reduce the shuffle to a single-input permutation.
+static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG,
+ bool ImmBlends = false) {
+ // We build up the blend mask while checking whether a blend is a viable way
+ // to reduce the shuffle.
+ SmallVector<int, 32> BlendMask(Mask.size(), -1);
+ SmallVector<int, 32> PermuteMask(Mask.size(), -1);
+
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
+
+ if (BlendMask[Mask[i] % Size] < 0)
+ BlendMask[Mask[i] % Size] = Mask[i];
+ else if (BlendMask[Mask[i] % Size] != Mask[i])
+ return SDValue(); // Can't blend in the needed input!
+
+ PermuteMask[i] = Mask[i] % Size;
+ }
+
+ // If only immediate blends, then bail if the blend mask can't be widened to
+ // i16.
+ unsigned EltSize = VT.getScalarSizeInBits();
+ if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
+ return SDValue();
+
+ SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
+ return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
+}
+
+/// Try to lower as an unpack of elements from two inputs followed by
+/// a single-input permutation.
+///
+/// This matches the pattern where we can unpack elements from two inputs and
+/// then reduce the shuffle to a single-input (wider) permutation.
+static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ int NumElts = Mask.size();
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumLaneElts = NumElts / NumLanes;
+ int NumHalfLaneElts = NumLaneElts / 2;
+
+ bool MatchLo = true, MatchHi = true;
+ SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
+
+ // Determine UNPCKL/UNPCKH type and operand order.
+ for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
+ for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
+ int M = Mask[Lane + Elt];
+ if (M < 0)
+ continue;
+
+ SDValue &Op = Ops[Elt & 1];
+ if (M < NumElts && (Op.isUndef() || Op == V1))
+ Op = V1;
+ else if (NumElts <= M && (Op.isUndef() || Op == V2))
+ Op = V2;
+ else
+ return SDValue();
+
+ int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
+ MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
+ isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
+ MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
+ isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
+ if (!MatchLo && !MatchHi)
+ return SDValue();
+ }
+ }
+ assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
+
+ // Now check that each pair of elts come from the same unpack pair
+ // and set the permute mask based on each pair.
+ // TODO - Investigate cases where we permute individual elements.
+ SmallVector<int, 32> PermuteMask(NumElts, -1);
+ for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
+ for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
+ int M0 = Mask[Lane + Elt + 0];
+ int M1 = Mask[Lane + Elt + 1];
+ if (0 <= M0 && 0 <= M1 &&
+ (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
+ return SDValue();
+ if (0 <= M0)
+ PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
+ if (0 <= M1)
+ PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
+ }
+ }
+
+ unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
+ SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
+ return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
+}
+
+/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
+/// permuting the elements of the result in place.
+static SDValue lowerShuffleAsByteRotateAndPermute(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+ if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
+ (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
+ (VT.is512BitVector() && !Subtarget.hasBWI()))
+ return SDValue();
+
+ // We don't currently support lane crossing permutes.
+ if (is128BitLaneCrossingShuffleMask(VT, Mask))
+ return SDValue();
+
+ int Scale = VT.getScalarSizeInBits() / 8;
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumElts = VT.getVectorNumElements();
+ int NumEltsPerLane = NumElts / NumLanes;
+
+ // Determine range of mask elts.
+ bool Blend1 = true;
+ bool Blend2 = true;
+ std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
+ std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
+ for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
+ for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
+ int M = Mask[Lane + Elt];
+ if (M < 0)
+ continue;
+ if (M < NumElts) {
+ Blend1 &= (M == (Lane + Elt));
+ assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
+ M = M % NumEltsPerLane;
+ Range1.first = std::min(Range1.first, M);
+ Range1.second = std::max(Range1.second, M);
+ } else {
+ M -= NumElts;
+ Blend2 &= (M == (Lane + Elt));
+ assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
+ M = M % NumEltsPerLane;
+ Range2.first = std::min(Range2.first, M);
+ Range2.second = std::max(Range2.second, M);
+ }
+ }
+ }
+
+ // Bail if we don't need both elements.
+ // TODO - it might be worth doing this for unary shuffles if the permute
+ // can be widened.
+ if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
+ !(0 <= Range2.first && Range2.second < NumEltsPerLane))
+ return SDValue();
+
+ if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
+ return SDValue();
+
+ // Rotate the 2 ops so we can access both ranges, then permute the result.
+ auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
+ MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
+ SDValue Rotate = DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
+ DAG.getBitcast(ByteVT, Lo),
+ DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
+ SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
+ for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
+ for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
+ int M = Mask[Lane + Elt];
+ if (M < 0)
+ continue;
+ if (M < NumElts)
+ PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
+ else
+ PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
+ }
+ }
+ return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
+ };
+
+ // Check if the ranges are small enough to rotate from either direction.
+ if (Range2.second < Range1.first)
+ return RotateAndPermute(V1, V2, Range1.first, 0);
+ if (Range1.second < Range2.first)
+ return RotateAndPermute(V2, V1, Range2.first, NumElts);
+ return SDValue();
+}
+
+/// Generic routine to decompose a shuffle and blend into independent
+/// blends and permutes.
+///
+/// This matches the extremely common pattern for handling combined
+/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
+/// operations. It will try to pick the best arrangement of shuffles and
+/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
+static SDValue lowerShuffleAsDecomposedShuffleMerge(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+ int NumElts = Mask.size();
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumEltsPerLane = NumElts / NumLanes;
+
+ // Shuffle the input elements into the desired positions in V1 and V2 and
+ // unpack/blend them together.
+ bool IsAlternating = true;
+ SmallVector<int, 32> V1Mask(NumElts, -1);
+ SmallVector<int, 32> V2Mask(NumElts, -1);
+ SmallVector<int, 32> FinalMask(NumElts, -1);
+ for (int i = 0; i < NumElts; ++i) {
+ int M = Mask[i];
+ if (M >= 0 && M < NumElts) {
+ V1Mask[i] = M;
+ FinalMask[i] = i;
+ IsAlternating &= (i & 1) == 0;
+ } else if (M >= NumElts) {
+ V2Mask[i] = M - NumElts;
+ FinalMask[i] = i + NumElts;
+ IsAlternating &= (i & 1) == 1;
+ }
+ }
+
+ // Try to lower with the simpler initial blend/unpack/rotate strategies unless
+ // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
+ // the shuffle may be able to fold with a load or other benefit. However, when
+ // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
+ // pre-shuffle first is a better strategy.
+ if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
+ // Only prefer immediate blends to unpack/rotate.
+ if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
+ DAG, true))
+ return BlendPerm;
+ if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
+ DAG))
+ return UnpackPerm;
+ if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
+ DL, VT, V1, V2, Mask, Subtarget, DAG))
+ return RotatePerm;
+ // Unpack/rotate failed - try again with variable blends.
+ if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
+ DAG))
+ return BlendPerm;
+ }
+
+ // If the final mask is an alternating blend of vXi8/vXi16, convert to an
+ // UNPCKL(SHUFFLE, SHUFFLE) pattern.
+ // TODO: It doesn't have to be alternating - but each lane mustn't have more
+ // than half the elements coming from each source.
+ if (IsAlternating && VT.getScalarSizeInBits() < 32) {
+ V1Mask.assign(NumElts, -1);
+ V2Mask.assign(NumElts, -1);
+ FinalMask.assign(NumElts, -1);
+ for (int i = 0; i != NumElts; i += NumEltsPerLane)
+ for (int j = 0; j != NumEltsPerLane; ++j) {
+ int M = Mask[i + j];
+ if (M >= 0 && M < NumElts) {
+ V1Mask[i + (j / 2)] = M;
+ FinalMask[i + j] = i + (j / 2);
+ } else if (M >= NumElts) {
+ V2Mask[i + (j / 2)] = M - NumElts;
+ FinalMask[i + j] = i + (j / 2) + NumElts;
+ }
+ }
+ }
+
+ V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
+ V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
+ return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
+}
+
+/// Try to lower a vector shuffle as a bit rotation.
+///
+/// Look for a repeated rotation pattern in each sub group.
+/// Returns an ISD::ROTL element rotation amount or -1 on failure.
+static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
+ int NumElts = Mask.size();
+ assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
+
+ int RotateAmt = -1;
+ for (int i = 0; i != NumElts; i += NumSubElts) {
+ for (int j = 0; j != NumSubElts; ++j) {
+ int M = Mask[i + j];
+ if (M < 0)
+ continue;
+ if (!isInRange(M, i, i + NumSubElts))
+ return -1;
+ int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
+ if (0 <= RotateAmt && Offset != RotateAmt)
+ return -1;
+ RotateAmt = Offset;
+ }
+ }
+ return RotateAmt;
+}
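+
+// For example, with NumSubElts == 2 the v16i8 mask <1, 0, 3, 2, ..., 15, 14>
+// rotates every 2-element group by one element, so this returns 1; the MVT
+// overload below scales that to an 8-bit rotation of v8i16 elements when a
+// 2-element sub-group is allowed for the target.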
+
+static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
+ const X86Subtarget &Subtarget,
+ ArrayRef<int> Mask) {
+ assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+ assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
+
+ // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
+ int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
+ int MaxSubElts = 64 / EltSizeInBits;
+ for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
+ int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
+ if (RotateAmt < 0)
+ continue;
+
+ int NumElts = Mask.size();
+ MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
+ RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
+ return RotateAmt * EltSizeInBits;
+ }
+
+ return -1;
+}
+
+/// Lower shuffle using X86ISD::VROTLI rotations.
+static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
+ ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ // Only XOP + AVX512 targets have bit rotation instructions.
+ // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
+ bool IsLegal =
+ (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
+ if (!IsLegal && Subtarget.hasSSE3())
+ return SDValue();
+
+ MVT RotateVT;
+ int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
+ Subtarget, Mask);
+ if (RotateAmt < 0)
+ return SDValue();
+
+ // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
+ // expanded to OR(SRL,SHL), will be more efficient, but if they can
+ // widen to vXi16 or more then the existing lowering will be better.
+ if (!IsLegal) {
+ if ((RotateAmt % 16) == 0)
+ return SDValue();
+ // TODO: Use getTargetVShiftByConstNode.
+ unsigned ShlAmt = RotateAmt;
+ unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
+ V1 = DAG.getBitcast(RotateVT, V1);
+ SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
+ DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
+ SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
+ DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
+ SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
+ return DAG.getBitcast(VT, Rot);
+ }
+
+ SDValue Rot =
+ DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
+ DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
+ return DAG.getBitcast(VT, Rot);
+}
+
+/// Try to match a vector shuffle as an element rotation.
+///
+/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
+static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask) {
+ int NumElts = Mask.size();
+
+ // We need to detect various ways of spelling a rotation:
+ // [11, 12, 13, 14, 15, 0, 1, 2]
+ // [-1, 12, 13, 14, -1, -1, 1, -1]
+ // [-1, -1, -1, -1, -1, -1, 1, 2]
+ // [ 3, 4, 5, 6, 7, 8, 9, 10]
+ // [-1, 4, 5, 6, -1, -1, 9, -1]
+ // [-1, 4, 5, 6, -1, -1, -1, -1]
+ int Rotation = 0;
+ SDValue Lo, Hi;
+ for (int i = 0; i < NumElts; ++i) {
+ int M = Mask[i];
+ assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
+ "Unexpected mask index.");
+ if (M < 0)
+ continue;
+
+ // Determine where a rotated vector would have started.
+ int StartIdx = i - (M % NumElts);
+ if (StartIdx == 0)
+ // The identity rotation isn't interesting, stop.
+ return -1;
+
+    // If we found the tail of a vector, the rotation must be the number of
+    // missing front elements. If we found the head of a vector, the rotation
+    // is how many of its leading elements appear.
+ int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
+
+ if (Rotation == 0)
+ Rotation = CandidateRotation;
+ else if (Rotation != CandidateRotation)
+ // The rotations don't match, so we can't match this mask.
+ return -1;
+
+ // Compute which value this mask is pointing at.
+ SDValue MaskV = M < NumElts ? V1 : V2;
+
+ // Compute which of the two target values this index should be assigned
+ // to. This reflects whether the high elements are remaining or the low
+ // elements are remaining.
+ SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
+
+ // Either set up this value if we've not encountered it before, or check
+ // that it remains consistent.
+ if (!TargetV)
+ TargetV = MaskV;
+ else if (TargetV != MaskV)
+ // This may be a rotation, but it pulls from the inputs in some
+ // unsupported interleaving.
+ return -1;
+ }
+
+ // Check that we successfully analyzed the mask, and normalize the results.
+ assert(Rotation != 0 && "Failed to locate a viable rotation!");
+ assert((Lo || Hi) && "Failed to find a rotated input vector!");
+ if (!Lo)
+ Lo = Hi;
+ else if (!Hi)
+ Hi = Lo;
+
+ V1 = Lo;
+ V2 = Hi;
+
+ return Rotation;
+}
+
+/// Try to lower a vector shuffle as a byte rotation.
+///
+/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
+/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
+/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
+/// try to generically lower a vector shuffle through such a pattern. It
+/// does not check for the profitability of lowering either as PALIGNR or
+/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
+/// This matches shuffle vectors that look like:
+///
+/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
+///
+/// Essentially it concatenates V1 and V2, shifts right by some number of
+/// elements, and takes the low elements as the result. Note that while this is
+/// specified as a *right shift* because x86 is little-endian, it is a *left
+/// rotate* of the vector lanes.
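+///
+/// For the v8i16 example above the element rotation is 3, which the byte
+/// matcher below scales by 16/8 == 2 into a 6-byte PALIGNR immediate.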
+static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask) {
+ // Don't accept any shuffles with zero elements.
+ if (isAnyZero(Mask))
+ return -1;
+
+ // PALIGNR works on 128-bit lanes.
+ SmallVector<int, 16> RepeatedMask;
+ if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
+ return -1;
+
+ int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
+ if (Rotation <= 0)
+ return -1;
+
+ // PALIGNR rotates bytes, so we need to scale the
+ // rotation based on how many bytes are in the vector lane.
+ int NumElts = RepeatedMask.size();
+ int Scale = 16 / NumElts;
+ return Rotation * Scale;
+}
+
+static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+
+ SDValue Lo = V1, Hi = V2;
+ int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
+ if (ByteRotation <= 0)
+ return SDValue();
+
+ // Cast the inputs to i8 vector of correct length to match PALIGNR or
+ // PSLLDQ/PSRLDQ.
+ MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
+ Lo = DAG.getBitcast(ByteVT, Lo);
+ Hi = DAG.getBitcast(ByteVT, Hi);
+
+ // SSSE3 targets can use the palignr instruction.
+ if (Subtarget.hasSSSE3()) {
+ assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
+ "512-bit PALIGNR requires BWI instructions");
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
+ DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
+ }
+
+ assert(VT.is128BitVector() &&
+ "Rotate-based lowering only supports 128-bit lowering!");
+ assert(Mask.size() <= 16 &&
+ "Can shuffle at most 16 bytes in a 128-bit vector!");
+ assert(ByteVT == MVT::v16i8 &&
+ "SSE2 rotate lowering only needed for v16i8!");
+
+ // Default SSE2 implementation
+ int LoByteShift = 16 - ByteRotation;
+ int HiByteShift = ByteRotation;
+
+ SDValue LoShift =
+ DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
+ DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
+ SDValue HiShift =
+ DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
+ DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
+ return DAG.getBitcast(VT,
+ DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
+}
+
+/// Try to lower a vector shuffle as a dword/qword rotation.
+///
+/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
+/// rotation of the concatenation of two vectors; this routine will
+/// try to generically lower a vector shuffle through such a pattern.
+///
+/// Essentially it concatenates V1 and V2, shifts right by some number of
+/// elements, and takes the low elements as the result. Note that while this is
+/// specified as a *right shift* because x86 is little-endian, it is a *left
+/// rotate* of the vector lanes.
+static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
+ "Only 32-bit and 64-bit elements are supported!");
+
+ // 128/256-bit vectors are only supported with VLX.
+ assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
+ && "VLX required for 128/256-bit vectors");
+
+ SDValue Lo = V1, Hi = V2;
+ int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
+ if (Rotation <= 0)
+ return SDValue();
+
+ return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
+ DAG.getTargetConstant(Rotation, DL, MVT::i8));
+}
+
+/// Try to lower a vector shuffle as a byte shift sequence.
+static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+ assert(VT.is128BitVector() && "Only 128-bit vectors supported");
+
+ // We need a shuffle that has zeros at one/both ends and a sequential
+ // shuffle from one source within.
+ unsigned ZeroLo = Zeroable.countTrailingOnes();
+ unsigned ZeroHi = Zeroable.countLeadingOnes();
+ if (!ZeroLo && !ZeroHi)
+ return SDValue();
+
+ unsigned NumElts = Mask.size();
+ unsigned Len = NumElts - (ZeroLo + ZeroHi);
+ if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
+ return SDValue();
+
+ unsigned Scale = VT.getScalarSizeInBits() / 8;
+ ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
+ if (!isUndefOrInRange(StubMask, 0, NumElts) &&
+ !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
+ return SDValue();
+
+ SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
+ Res = DAG.getBitcast(MVT::v16i8, Res);
+
+ // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
+ // inner sequential set of elements, possibly offset:
+ // 01234567 --> zzzzzz01 --> 1zzzzzzz
+ // 01234567 --> 4567zzzz --> zzzzz456
+ // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
+ if (ZeroLo == 0) {
+ unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
+ Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+ DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
+ Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
+ DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
+ } else if (ZeroHi == 0) {
+ unsigned Shift = Mask[ZeroLo] % NumElts;
+ Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
+ DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
+ Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+ DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
+ } else if (!Subtarget.hasSSSE3()) {
+    // If we don't have PSHUFB then it's worth avoiding an AND constant mask
+ // by performing 3 byte shifts. Shuffle combining can kick in above that.
+ // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
+ unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
+ Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+ DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
+ Shift += Mask[ZeroLo] % NumElts;
+ Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
+ DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
+ Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+ DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
+ } else
+ return SDValue();
+
+ return DAG.getBitcast(VT, Res);
+}
+
+/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
+///
+/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
+/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
+/// matches elements from one of the input vectors shuffled to the left or
+/// right with zeroable elements 'shifted in'. It handles both the strictly
+/// bit-wise element shifts and the byte shift across an entire 128-bit double
+/// quad word lane.
+///
+/// PSHL : (little-endian) left bit shift.
+/// [ zz, 0, zz, 2 ]
+/// [ -1, 4, zz, -1 ]
+/// PSRL : (little-endian) right bit shift.
+/// [ 1, zz, 3, zz]
+/// [ -1, -1, 7, zz]
+/// PSLLDQ : (little-endian) left byte shift
+/// [ zz, 0, 1, 2, 3, 4, 5, 6]
+/// [ zz, zz, -1, -1, 2, 3, 4, -1]
+/// [ zz, zz, zz, zz, zz, zz, -1, 1]
+/// PSRLDQ : (little-endian) right byte shift
+/// [ 5, 6, 7, zz, zz, zz, zz, zz]
+/// [ -1, 5, 6, 7, zz, zz, zz, zz]
+/// [ 1, 2, -1, -1, -1, -1, zz, zz]
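+///
+/// For example, the v8i16 mask [ zz, 0, 1, 2, 3, 4, 5, 6] (the first PSLLDQ
+/// pattern above) is matched as a whole-lane byte shift: Opcode becomes
+/// X86ISD::VSHLDQ, ShiftVT becomes v16i8 and the returned amount is 2 bytes.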
+static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
+ unsigned ScalarSizeInBits, ArrayRef<int> Mask,
+ int MaskOffset, const APInt &Zeroable,
+ const X86Subtarget &Subtarget) {
+ int Size = Mask.size();
+ unsigned SizeInBits = Size * ScalarSizeInBits;
+
+ auto CheckZeros = [&](int Shift, int Scale, bool Left) {
+ for (int i = 0; i < Size; i += Scale)
+ for (int j = 0; j < Shift; ++j)
+ if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
+ return false;
+
+ return true;
+ };
+
+ auto MatchShift = [&](int Shift, int Scale, bool Left) {
+ for (int i = 0; i != Size; i += Scale) {
+ unsigned Pos = Left ? i + Shift : i;
+ unsigned Low = Left ? i : i + Shift;
+ unsigned Len = Scale - Shift;
+ if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
+ return -1;
+ }
+
+ int ShiftEltBits = ScalarSizeInBits * Scale;
+ bool ByteShift = ShiftEltBits > 64;
+ Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
+ : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
+ int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
+
+ // Normalize the scale for byte shifts to still produce an i64 element
+ // type.
+ Scale = ByteShift ? Scale / 2 : Scale;
+
+ // We need to round trip through the appropriate type for the shift.
+ MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
+ ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
+ : MVT::getVectorVT(ShiftSVT, Size / Scale);
+ return (int)ShiftAmt;
+ };
+
+ // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
+ // keep doubling the size of the integer elements up to that. We can
+ // then shift the elements of the integer vector by whole multiples of
+ // their width within the elements of the larger integer vector. Test each
+ // multiple to see if we can find a match with the moved element indices
+ // and that the shifted in elements are all zeroable.
+ unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
+ for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
+ for (int Shift = 1; Shift != Scale; ++Shift)
+ for (bool Left : {true, false})
+ if (CheckZeros(Shift, Scale, Left)) {
+ int ShiftAmt = MatchShift(Shift, Scale, Left);
+ if (0 < ShiftAmt)
+ return ShiftAmt;
+ }
+
+ // no match
+ return -1;
+}
+
+static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ int Size = Mask.size();
+ assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+ MVT ShiftVT;
+ SDValue V = V1;
+ unsigned Opcode;
+
+ // Try to match shuffle against V1 shift.
+ int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
+ Mask, 0, Zeroable, Subtarget);
+
+ // If V1 failed, try to match shuffle against V2 shift.
+ if (ShiftAmt < 0) {
+ ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
+ Mask, Size, Zeroable, Subtarget);
+ V = V2;
+ }
+
+ if (ShiftAmt < 0)
+ return SDValue();
+
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
+ "Illegal integer vector type");
+ V = DAG.getBitcast(ShiftVT, V);
+ V = DAG.getNode(Opcode, DL, ShiftVT, V,
+ DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
+ return DAG.getBitcast(VT, V);
+}
+
+// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
+// Remainder of lower half result is zero and upper half is all undef.
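+// For example, a v8i16 mask [ 2, 3, zz, zz, -1, -1, -1, -1] (zz = zeroable)
+// matches with Len == 2 and Idx == 2, i.e. BitLen == 32 and BitIdx == 32.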
+static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask, uint64_t &BitLen,
+ uint64_t &BitIdx, const APInt &Zeroable) {
+ int Size = Mask.size();
+ int HalfSize = Size / 2;
+ assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+ assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
+
+ // Upper half must be undefined.
+ if (!isUndefUpperHalf(Mask))
+ return false;
+
+ // Determine the extraction length from the part of the
+ // lower half that isn't zeroable.
+ int Len = HalfSize;
+ for (; Len > 0; --Len)
+ if (!Zeroable[Len - 1])
+ break;
+ assert(Len > 0 && "Zeroable shuffle mask");
+
+ // Attempt to match first Len sequential elements from the lower half.
+ SDValue Src;
+ int Idx = -1;
+ for (int i = 0; i != Len; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef)
+ continue;
+ SDValue &V = (M < Size ? V1 : V2);
+ M = M % Size;
+
+ // The extracted elements must start at a valid index and all mask
+ // elements must be in the lower half.
+ if (i > M || M >= HalfSize)
+ return false;
+
+ if (Idx < 0 || (Src == V && Idx == (M - i))) {
+ Src = V;
+ Idx = M - i;
+ continue;
+ }
+ return false;
+ }
+
+ if (!Src || Idx < 0)
+ return false;
+
+ assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
+ BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+ BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+ V1 = Src;
+ return true;
+}
+
+// INSERTQ: Extract lowest Len elements from lower half of second source and
+// insert over first source, starting at Idx.
+// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
+static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask, uint64_t &BitLen,
+ uint64_t &BitIdx) {
+ int Size = Mask.size();
+ int HalfSize = Size / 2;
+ assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+ // Upper half must be undefined.
+ if (!isUndefUpperHalf(Mask))
+ return false;
+
+ for (int Idx = 0; Idx != HalfSize; ++Idx) {
+ SDValue Base;
+
+ // Attempt to match first source from mask before insertion point.
+ if (isUndefInRange(Mask, 0, Idx)) {
+ /* EMPTY */
+ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
+ Base = V1;
+ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
+ Base = V2;
+ } else {
+ continue;
+ }
+
+ // Extend the extraction length looking to match both the insertion of
+ // the second source and the remaining elements of the first.
+ for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
+ SDValue Insert;
+ int Len = Hi - Idx;
+
+ // Match insertion.
+ if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
+ Insert = V1;
+ } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
+ Insert = V2;
+ } else {
+ continue;
+ }
+
+ // Match the remaining elements of the lower half.
+ if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
+ /* EMPTY */
+ } else if ((!Base || (Base == V1)) &&
+ isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
+ Base = V1;
+ } else if ((!Base || (Base == V2)) &&
+ isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
+ Size + Hi)) {
+ Base = V2;
+ } else {
+ continue;
+ }
+
+ BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+ BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+ V1 = Base;
+ V2 = Insert;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
+static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable, SelectionDAG &DAG) {
+ uint64_t BitLen, BitIdx;
+ if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
+ return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
+ DAG.getTargetConstant(BitLen, DL, MVT::i8),
+ DAG.getTargetConstant(BitIdx, DL, MVT::i8));
+
+ if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
+ return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
+ V2 ? V2 : DAG.getUNDEF(VT),
+ DAG.getTargetConstant(BitLen, DL, MVT::i8),
+ DAG.getTargetConstant(BitIdx, DL, MVT::i8));
+
+ return SDValue();
+}
+
+/// Lower a vector shuffle as a zero or any extension.
+///
+/// Given a specific number of elements, element bit width, and extension
+/// stride, produce either a zero or any extension based on the available
+/// features of the subtarget. The extended elements are consecutive and
+/// can start at a non-zero element offset in the input; to avoid excess
+/// shuffling the offset must either be in the bottom lane or at the start
+/// of a higher lane. All extended elements must be from
+/// the same lane.
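+///
+/// For example, with a zero offset and a scale of 2, zero extending v16i8
+/// elements typically selects PMOVZXBW on SSE4.1 targets, while pre-SSE4.1
+/// targets fall back to interleaving with a zero vector via PUNPCKLBW.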
+static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
+ const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
+ ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+ assert(Scale > 1 && "Need a scale to extend.");
+ int EltBits = VT.getScalarSizeInBits();
+ int NumElements = VT.getVectorNumElements();
+ int NumEltsPerLane = 128 / EltBits;
+ int OffsetLane = Offset / NumEltsPerLane;
+ assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
+ "Only 8, 16, and 32 bit elements can be extended.");
+ assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
+ assert(0 <= Offset && "Extension offset must be positive.");
+ assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
+ "Extension offset must be in the first lane or start an upper lane.");
+
+ // Check that an index is in same lane as the base offset.
+ auto SafeOffset = [&](int Idx) {
+ return OffsetLane == (Idx / NumEltsPerLane);
+ };
+
+ // Shift along an input so that the offset base moves to the first element.
+ auto ShuffleOffset = [&](SDValue V) {
+ if (!Offset)
+ return V;
+
+ SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
+ for (int i = 0; i * Scale < NumElements; ++i) {
+ int SrcIdx = i + Offset;
+ ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
+ }
+ return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
+ };
+
+ // Found a valid a/zext mask! Try various lowering strategies based on the
+ // input type and available ISA extensions.
+ if (Subtarget.hasSSE41()) {
+ // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
+ // PUNPCK will catch this in a later shuffle match.
+ if (Offset && Scale == 2 && VT.is128BitVector())
+ return SDValue();
+ MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
+ NumElements / Scale);
+ InputV = ShuffleOffset(InputV);
+ InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
+ DL, ExtVT, InputV, DAG);
+ return DAG.getBitcast(VT, InputV);
+ }
+
+ assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
+
+ // For any extends we can cheat for larger element sizes and use shuffle
+ // instructions that can fold with a load and/or copy.
+ if (AnyExt && EltBits == 32) {
+ int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
+ -1};
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+ DAG.getBitcast(MVT::v4i32, InputV),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+ }
+ if (AnyExt && EltBits == 16 && Scale > 2) {
+ int PSHUFDMask[4] = {Offset / 2, -1,
+ SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
+ InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+ DAG.getBitcast(MVT::v4i32, InputV),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
+ int PSHUFWMask[4] = {1, -1, -1, -1};
+ unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
+ return DAG.getBitcast(
+ VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
+ DAG.getBitcast(MVT::v8i16, InputV),
+ getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
+ }
+
+ // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
+ // to 64-bits.
+ if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
+ assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
+ assert(VT.is128BitVector() && "Unexpected vector width!");
+
+ int LoIdx = Offset * EltBits;
+ SDValue Lo = DAG.getBitcast(
+ MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+ DAG.getTargetConstant(EltBits, DL, MVT::i8),
+ DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
+
+ if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
+ return DAG.getBitcast(VT, Lo);
+
+ int HiIdx = (Offset + 1) * EltBits;
+ SDValue Hi = DAG.getBitcast(
+ MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+ DAG.getTargetConstant(EltBits, DL, MVT::i8),
+ DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
+ return DAG.getBitcast(VT,
+ DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
+ }
+
+ // If this would require more than 2 unpack instructions to expand, use
+ // pshufb when available. We can only use more than 2 unpack instructions
+ // when zero extending i8 elements which also makes it easier to use pshufb.
+ if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
+ assert(NumElements == 16 && "Unexpected byte vector width!");
+ SDValue PSHUFBMask[16];
+ for (int i = 0; i < 16; ++i) {
+ int Idx = Offset + (i / Scale);
+ if ((i % Scale == 0 && SafeOffset(Idx))) {
+ PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
+ continue;
+ }
+ PSHUFBMask[i] =
+ AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
+ }
+ InputV = DAG.getBitcast(MVT::v16i8, InputV);
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
+ DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
+ }
+
+ // If we are extending from an offset, ensure we start on a boundary that
+ // we can unpack from.
+ int AlignToUnpack = Offset % (NumElements / Scale);
+ if (AlignToUnpack) {
+ SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
+ for (int i = AlignToUnpack; i < NumElements; ++i)
+ ShMask[i - AlignToUnpack] = i;
+ InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
+ Offset -= AlignToUnpack;
+ }
+
+ // Otherwise emit a sequence of unpacks.
+ do {
+ unsigned UnpackLoHi = X86ISD::UNPCKL;
+ if (Offset >= (NumElements / 2)) {
+ UnpackLoHi = X86ISD::UNPCKH;
+ Offset -= (NumElements / 2);
+ }
+
+ MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
+ SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
+ : getZeroVector(InputVT, Subtarget, DAG, DL);
+ InputV = DAG.getBitcast(InputVT, InputV);
+ InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
+ Scale /= 2;
+ EltBits *= 2;
+ NumElements /= 2;
+ } while (Scale > 1);
+ return DAG.getBitcast(VT, InputV);
+}
+
+/// Try to lower a vector shuffle as a zero extension on any microarch.
+///
+/// This routine will try to do everything in its power to cleverly lower
+/// a shuffle which happens to match the pattern of a zero extend. It doesn't
+/// check for the profitability of this lowering, it tries to aggressively
+/// match this pattern. It will use all of the micro-architectural details it
+/// can to emit an efficient lowering. It handles both blends with all-zero
+/// inputs (to explicitly zero-extend) and undef lanes (sometimes undef due to
+/// being masked out later).
+///
+/// The reason we have dedicated lowering for zext-style shuffles is that they
+/// are both incredibly common and often quite performance sensitive.
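+///
+/// For example, a v4i32 mask <0, zz, 1, zz> (zz = known zeroable) matches
+/// with a scale of 2 and is lowered as a zero extension of the two low i32
+/// elements into a v2i64.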
+static SDValue lowerShuffleAsZeroOrAnyExtend(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ int Bits = VT.getSizeInBits();
+ int NumLanes = Bits / 128;
+ int NumElements = VT.getVectorNumElements();
+ int NumEltsPerLane = NumElements / NumLanes;
+ assert(VT.getScalarSizeInBits() <= 32 &&
+ "Exceeds 32-bit integer zero extension limit");
+ assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
+
+ // Define a helper function to check a particular ext-scale and lower to it if
+ // valid.
+ auto Lower = [&](int Scale) -> SDValue {
+ SDValue InputV;
+ bool AnyExt = true;
+ int Offset = 0;
+ int Matches = 0;
+ for (int i = 0; i < NumElements; ++i) {
+ int M = Mask[i];
+ if (M < 0)
+ continue; // Valid anywhere but doesn't tell us anything.
+ if (i % Scale != 0) {
+        // Each of the extended elements needs to be zeroable.
+ if (!Zeroable[i])
+ return SDValue();
+
+ // We no longer are in the anyext case.
+ AnyExt = false;
+ continue;
+ }
+
+      // The base elements need to be consecutive indices into the
+ // same input vector.
+ SDValue V = M < NumElements ? V1 : V2;
+ M = M % NumElements;
+ if (!InputV) {
+ InputV = V;
+ Offset = M - (i / Scale);
+ } else if (InputV != V)
+ return SDValue(); // Flip-flopping inputs.
+
+ // Offset must start in the lowest 128-bit lane or at the start of an
+ // upper lane.
+ // FIXME: Is it ever worth allowing a negative base offset?
+ if (!((0 <= Offset && Offset < NumEltsPerLane) ||
+ (Offset % NumEltsPerLane) == 0))
+ return SDValue();
+
+ // If we are offsetting, all referenced entries must come from the same
+ // lane.
+ if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
+ return SDValue();
+
+ if ((M % NumElements) != (Offset + (i / Scale)))
+ return SDValue(); // Non-consecutive strided elements.
+ Matches++;
+ }
+
+ // If we fail to find an input, we have a zero-shuffle which should always
+ // have already been handled.
+ // FIXME: Maybe handle this here in case during blending we end up with one?
+ if (!InputV)
+ return SDValue();
+
+    // If we are offsetting, don't extend if we only match a single input; we
+ // can always do better by using a basic PSHUF or PUNPCK.
+ if (Offset != 0 && Matches < 2)
+ return SDValue();
+
+ return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
+ InputV, Mask, Subtarget, DAG);
+ };
+
+ // The widest scale possible for extending is to a 64-bit integer.
+ assert(Bits % 64 == 0 &&
+ "The number of bits in a vector must be divisible by 64 on x86!");
+ int NumExtElements = Bits / 64;
+
+ // Each iteration, try extending the elements half as much, but into twice as
+ // many elements.
+ for (; NumExtElements < NumElements; NumExtElements *= 2) {
+ assert(NumElements % NumExtElements == 0 &&
+ "The input vector size must be divisible by the extended size.");
+ if (SDValue V = Lower(NumElements / NumExtElements))
+ return V;
+ }
+
+ // General extends failed, but 128-bit vectors may be able to use MOVQ.
+ if (Bits != 128)
+ return SDValue();
+
+ // Returns one of the source operands if the shuffle can be reduced to a
+ // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
+ auto CanZExtLowHalf = [&]() {
+ for (int i = NumElements / 2; i != NumElements; ++i)
+ if (!Zeroable[i])
+ return SDValue();
+ if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
+ return V1;
+ if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
+ return V2;
+ return SDValue();
+ };
+
+ if (SDValue V = CanZExtLowHalf()) {
+ V = DAG.getBitcast(MVT::v2i64, V);
+ V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
+ return DAG.getBitcast(VT, V);
+ }
+
+ // No viable ext lowering found.
+ return SDValue();
+}
+
+/// Try to get a scalar value for a specific element of a vector.
+///
+/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
+static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
+ SelectionDAG &DAG) {
+ MVT VT = V.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ V = peekThroughBitcasts(V);
+
+ // If the bitcasts shift the element size, we can't extract an equivalent
+ // element from it.
+ MVT NewVT = V.getSimpleValueType();
+ if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
+ return SDValue();
+
+ if (V.getOpcode() == ISD::BUILD_VECTOR ||
+ (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
+ // Ensure the scalar operand is the same size as the destination.
+ // FIXME: Add support for scalar truncation where possible.
+ SDValue S = V.getOperand(Idx);
+ if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
+ return DAG.getBitcast(EltVT, S);
+ }
+
+ return SDValue();
+}
+
+/// Helper to test for a load that can be folded with x86 shuffles.
+///
+/// This is particularly important because the set of instructions varies
+/// significantly based on whether the operand is a load or not.
+static bool isShuffleFoldableLoad(SDValue V) {
+ V = peekThroughBitcasts(V);
+ return ISD::isNON_EXTLoad(V.getNode());
+}
+
+/// Try to lower insertion of a single element into a zero vector.
+///
+/// This is a common pattern for which we have especially efficient lowerings
+/// across all subtarget feature sets.
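+///
+/// For example, the v4f32 mask <4, 1, 2, 3> inserts the low element of V2
+/// over the low element of a non-zeroable V1 and is lowered to X86ISD::MOVSS.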
+static SDValue lowerShuffleAsElementInsertion(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT ExtVT = VT;
+ MVT EltVT = VT.getVectorElementType();
+
+ int V2Index =
+ find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
+ Mask.begin();
+ bool IsV1Zeroable = true;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (i != V2Index && !Zeroable[i]) {
+ IsV1Zeroable = false;
+ break;
+ }
+
+ // Check for a single input from a SCALAR_TO_VECTOR node.
+ // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
+ // all the smarts here sunk into that routine. However, the current
+ // lowering of BUILD_VECTOR makes that nearly impossible until the old
+ // vector shuffle lowering is dead.
+ SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
+ DAG);
+ if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
+ // We need to zext the scalar if it is smaller than an i32.
+ V2S = DAG.getBitcast(EltVT, V2S);
+ if (EltVT == MVT::i8 || EltVT == MVT::i16) {
+ // Using zext to expand a narrow element won't work for non-zero
+ // insertions.
+ if (!IsV1Zeroable)
+ return SDValue();
+
+ // Zero-extend directly to i32.
+ ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
+ V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
+ }
+ V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
+ } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
+ EltVT == MVT::i16) {
+ // Either not inserting from the low element of the input or the input
+ // element size is too small to use VZEXT_MOVL to clear the high bits.
+ return SDValue();
+ }
+
+ if (!IsV1Zeroable) {
+ // If V1 can't be treated as a zero vector we have fewer options to lower
+ // this. We can't support integer vectors or non-zero targets cheaply, and
+ // the V1 elements can't be permuted in any way.
+ assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
+ if (!VT.isFloatingPoint() || V2Index != 0)
+ return SDValue();
+ SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
+ V1Mask[V2Index] = -1;
+ if (!isNoopShuffleMask(V1Mask))
+ return SDValue();
+ if (!VT.is128BitVector())
+ return SDValue();
+
+ // Otherwise, use MOVSD or MOVSS.
+ assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
+ "Only two types of floating point element types to handle!");
+ return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
+ ExtVT, V1, V2);
+ }
+
+ // This lowering only works for the low element with floating point vectors.
+ if (VT.isFloatingPoint() && V2Index != 0)
+ return SDValue();
+
+ V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
+ if (ExtVT != VT)
+ V2 = DAG.getBitcast(VT, V2);
+
+ if (V2Index != 0) {
+ // If we have 4 or fewer lanes we can cheaply shuffle the element into
+ // the desired position. Otherwise it is more efficient to do a vector
+ // shift left. We know that we can do a vector shift left because all
+ // the inputs are zero.
+ if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
+ SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
+ V2Shuffle[V2Index] = 0;
+ V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
+ } else {
+ V2 = DAG.getBitcast(MVT::v16i8, V2);
+ V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
+ DAG.getTargetConstant(
+ V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
+ V2 = DAG.getBitcast(VT, V2);
+ }
+ }
+ return V2;
+}
+
+/// Try to lower a broadcast of a single (truncated) integer element,
+/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
+///
+/// This assumes we have AVX2.
+static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
+ int BroadcastIdx,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(Subtarget.hasAVX2() &&
+ "We can only lower integer broadcasts with AVX2!");
+
+ MVT EltVT = VT.getVectorElementType();
+ MVT V0VT = V0.getSimpleValueType();
+
+ assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
+ assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
+
+ MVT V0EltVT = V0VT.getVectorElementType();
+ if (!V0EltVT.isInteger())
+ return SDValue();
+
+ const unsigned EltSize = EltVT.getSizeInBits();
+ const unsigned V0EltSize = V0EltVT.getSizeInBits();
+
+ // This is only a truncation if the original element type is larger.
+ if (V0EltSize <= EltSize)
+ return SDValue();
+
+ assert(((V0EltSize % EltSize) == 0) &&
+ "Scalar type sizes must all be powers of 2 on x86!");
+
+ const unsigned V0Opc = V0.getOpcode();
+ const unsigned Scale = V0EltSize / EltSize;
+ const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
+
+ if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
+ V0Opc != ISD::BUILD_VECTOR)
+ return SDValue();
+
+ SDValue Scalar = V0.getOperand(V0BroadcastIdx);
+
+ // If we're extracting non-least-significant bits, shift so we can truncate.
+ // Hopefully, we can fold away the trunc/srl/load into the broadcast.
+ // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
+ // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
+ if (const int OffsetIdx = BroadcastIdx % Scale)
+ Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
+ DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
+
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
+ DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
+}
+
+/// Test whether this can be lowered with a single SHUFPS instruction.
+///
+/// This is used to disable more specialized lowerings when the shufps lowering
+/// will happen to be efficient.
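+///
+/// For example, <0, 1, 4, 5> keeps each half to a single input and can be
+/// done with one SHUFPS, while <0, 4, 1, 5> mixes both inputs in the low half
+/// and cannot.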
+static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
+ // This routine only handles 128-bit shufps.
+ assert(Mask.size() == 4 && "Unsupported mask size!");
+ assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
+ assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
+ assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
+ assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
+
+ // To lower with a single SHUFPS we need to have the low half and high half
+ // each requiring a single input.
+ if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
+ return false;
+ if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
+ return false;
+
+ return true;
+}
+
+/// If we are extracting two 128-bit halves of a vector and shuffling the
+/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
+/// multi-shuffle lowering.
+static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
+ SDValue N1, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ MVT VT = N0.getSimpleValueType();
+ assert((VT.is128BitVector() &&
+ (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
+ "VPERM* family of shuffles requires 32-bit or 64-bit elements");
+
+ // Check that both sources are extracts of the same source vector.
+ if (!N0.hasOneUse() || !N1.hasOneUse() ||
+ N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ N0.getOperand(0) != N1.getOperand(0))
+ return SDValue();
+
+ SDValue WideVec = N0.getOperand(0);
+ MVT WideVT = WideVec.getSimpleValueType();
+ if (!WideVT.is256BitVector())
+ return SDValue();
+
+ // Match extracts of each half of the wide source vector. Commute the shuffle
+ // if the extract of the low half is N1.
+ unsigned NumElts = VT.getVectorNumElements();
+ SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
+ const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
+ const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
+ if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
+ ShuffleVectorSDNode::commuteMask(NewMask);
+ else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
+ return SDValue();
+
+ // Final bailout: if the mask is simple, we are better off using an extract
+ // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
+ // because that avoids a constant load from memory.
+ if (NumElts == 4 &&
+ (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
+ return SDValue();
+
+ // Extend the shuffle mask with undef elements.
+ NewMask.append(NumElts, -1);
+
+ // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
+ SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
+ NewMask);
+ // This is free: ymm -> xmm.
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
+ DAG.getIntPtrConstant(0, DL));
+}
+
+/// Try to lower broadcast of a single element.
+///
+/// For convenience, this code also bundles all of the subtarget feature set
+/// filtering. While a little annoying to re-dispatch on type here, there isn't
+/// a convenient way to factor it out.
+static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
+ (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
+ (Subtarget.hasAVX2() && VT.isInteger())))
+ return SDValue();
+
+ // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
+ // we can only broadcast from a register with AVX2.
+ unsigned NumEltBits = VT.getScalarSizeInBits();
+ unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
+ ? X86ISD::MOVDDUP
+ : X86ISD::VBROADCAST;
+ bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
+
+ // Check that the mask is a broadcast.
+ int BroadcastIdx = getSplatIndex(Mask);
+ if (BroadcastIdx < 0)
+ return SDValue();
+ assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
+ "a sorted mask where the broadcast "
+ "comes from V1.");
+
+ // Go up the chain of (vector) values to find a scalar load that we can
+ // combine with the broadcast.
+ // TODO: Combine this logic with findEltLoadSrc() used by
+ // EltsFromConsecutiveLoads().
+ int BitOffset = BroadcastIdx * NumEltBits;
+ SDValue V = V1;
+ for (;;) {
+ switch (V.getOpcode()) {
+ case ISD::BITCAST: {
+ V = V.getOperand(0);
+ continue;
+ }
+ case ISD::CONCAT_VECTORS: {
+ int OpBitWidth = V.getOperand(0).getValueSizeInBits();
+ int OpIdx = BitOffset / OpBitWidth;
+ V = V.getOperand(OpIdx);
+ BitOffset %= OpBitWidth;
+ continue;
+ }
+ case ISD::EXTRACT_SUBVECTOR: {
+ // The extraction index adds to the existing offset.
+ unsigned EltBitWidth = V.getScalarValueSizeInBits();
+ unsigned Idx = V.getConstantOperandVal(1);
+ unsigned BeginOffset = Idx * EltBitWidth;
+ BitOffset += BeginOffset;
+ V = V.getOperand(0);
+ continue;
+ }
+ case ISD::INSERT_SUBVECTOR: {
+ SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
+ int EltBitWidth = VOuter.getScalarValueSizeInBits();
+ int Idx = (int)V.getConstantOperandVal(2);
+ int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
+ int BeginOffset = Idx * EltBitWidth;
+ int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
+ if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
+ BitOffset -= BeginOffset;
+ V = VInner;
+ } else {
+ V = VOuter;
+ }
+ continue;
+ }
+ }
+ break;
+ }
+ assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
+ BroadcastIdx = BitOffset / NumEltBits;
+
+ // Do we need to bitcast the source to retrieve the original broadcast index?
+ bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
+
+ // Check if this is a broadcast of a scalar. We special case lowering
+ // for scalars so that we can more effectively fold with loads.
+ // If the original value has a larger element type than the shuffle, the
+ // broadcast element is in essence truncated. Make that explicit to ease
+ // folding.
+ if (BitCastSrc && VT.isInteger())
+ if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
+ DL, VT, V, BroadcastIdx, Subtarget, DAG))
+ return TruncBroadcast;
+
+ // Also check the simpler case, where we can directly reuse the scalar.
+ if (!BitCastSrc &&
+ ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
+ (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
+ V = V.getOperand(BroadcastIdx);
+
+ // If we can't broadcast from a register, check that the input is a load.
+ if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
+ return SDValue();
+ } else if (ISD::isNormalLoad(V.getNode()) &&
+ cast<LoadSDNode>(V)->isSimple()) {
+ // We do not check for one-use of the vector load because a broadcast load
+ // is expected to be a win for code size, register pressure, and possibly
+ // uops even if the original vector load is not eliminated.
+
+ // Reduce the vector load and shuffle to a broadcasted scalar load.
+ LoadSDNode *Ld = cast<LoadSDNode>(V);
+ SDValue BaseAddr = Ld->getOperand(1);
+ MVT SVT = VT.getScalarType();
+ unsigned Offset = BroadcastIdx * SVT.getStoreSize();
+ assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
+ SDValue NewAddr =
+ DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
+
+ // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
+ // than MOVDDUP.
+ // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
+ if (Opcode == X86ISD::VBROADCAST) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {Ld->getChain(), NewAddr};
+ V = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
+ DAG.getMachineFunction().getMachineMemOperand(
+ Ld->getMemOperand(), Offset, SVT.getStoreSize()));
+ DAG.makeEquivalentMemoryOrdering(Ld, V);
+ return DAG.getBitcast(VT, V);
+ }
+ assert(SVT == MVT::f64 && "Unexpected VT!");
+ V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
+ DAG.getMachineFunction().getMachineMemOperand(
+ Ld->getMemOperand(), Offset, SVT.getStoreSize()));
+ DAG.makeEquivalentMemoryOrdering(Ld, V);
+ } else if (!BroadcastFromReg) {
+ // We can't broadcast from a vector register.
+ return SDValue();
+ } else if (BitOffset != 0) {
+ // We can only broadcast from the zero-element of a vector register,
+ // but it can be advantageous to broadcast from the zero-element of a
+ // subvector.
+ if (!VT.is256BitVector() && !VT.is512BitVector())
+ return SDValue();
+
+ // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
+ if (VT == MVT::v4f64 || VT == MVT::v4i64)
+ return SDValue();
+
+ // Only broadcast the zero-element of a 128-bit subvector.
+ if ((BitOffset % 128) != 0)
+ return SDValue();
+
+ assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
+ "Unexpected bit-offset");
+ assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
+ "Unexpected vector size");
+ unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
+ V = extract128BitVector(V, ExtractIdx, DAG, DL);
+ }
+
+ if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
+ DAG.getBitcast(MVT::f64, V));
+
+ // If this is a scalar, do the broadcast on this type and bitcast.
+ if (!V.getValueType().isVector()) {
+ assert(V.getScalarValueSizeInBits() == NumEltBits &&
+ "Unexpected scalar size");
+ MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
+ VT.getVectorNumElements());
+ return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
+ }
+
+ // We only support broadcasting from 128-bit vectors to minimize the
+ // number of patterns we need to deal with in isel. So extract down to
+ // 128-bits, removing as many bitcasts as possible.
+ if (V.getValueSizeInBits() > 128)
+ V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
+
+ // Otherwise cast V to a vector with the same element type as VT, but
+ // possibly narrower than VT. Then perform the broadcast.
+ unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
+ MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
+ return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
+}
+
+// Check for whether we can use INSERTPS to perform the shuffle. We only use
+// INSERTPS when the V1 elements are already in the correct locations
+// because otherwise we can just always use two SHUFPS instructions which
+// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
+// perform INSERTPS if a single V1 element is out of place and all V2
+// elements are zeroable.
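+// For example, with nothing zeroable, the v4f32 mask <0, 5, 2, 3> inserts
+// element 1 of V2 into lane 1, giving an InsertPSMask of 0x50.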
+static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
+ unsigned &InsertPSMask,
+ const APInt &Zeroable,
+ ArrayRef<int> Mask, SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
+ assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ // Attempt to match INSERTPS with one element from VA or VB being
+ // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
+ // are updated.
+ auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
+ ArrayRef<int> CandidateMask) {
+ unsigned ZMask = 0;
+ int VADstIndex = -1;
+ int VBDstIndex = -1;
+ bool VAUsedInPlace = false;
+
+ for (int i = 0; i < 4; ++i) {
+ // Synthesize a zero mask from the zeroable elements (includes undefs).
+ if (Zeroable[i]) {
+ ZMask |= 1 << i;
+ continue;
+ }
+
+ // Flag if we use any VA inputs in place.
+ if (i == CandidateMask[i]) {
+ VAUsedInPlace = true;
+ continue;
+ }
+
+ // We can only insert a single non-zeroable element.
+ if (VADstIndex >= 0 || VBDstIndex >= 0)
+ return false;
+
+ if (CandidateMask[i] < 4) {
+ // VA input out of place for insertion.
+ VADstIndex = i;
+ } else {
+ // VB input for insertion.
+ VBDstIndex = i;
+ }
+ }
+
+ // Don't bother if we have no (non-zeroable) element for insertion.
+ if (VADstIndex < 0 && VBDstIndex < 0)
+ return false;
+
+ // Determine element insertion src/dst indices. The src index is from the
+ // start of the inserted vector, not the start of the concatenated vector.
+ unsigned VBSrcIndex = 0;
+ if (VADstIndex >= 0) {
+ // If we have a VA input out of place, we use VA as the V2 element
+ // insertion and don't use the original V2 at all.
+ VBSrcIndex = CandidateMask[VADstIndex];
+ VBDstIndex = VADstIndex;
+ VB = VA;
+ } else {
+ VBSrcIndex = CandidateMask[VBDstIndex] - 4;
+ }
+
+ // If no V1 inputs are used in place, then the result is created only from
+ // the zero mask and the V2 insertion - so remove V1 dependency.
+ if (!VAUsedInPlace)
+ VA = DAG.getUNDEF(MVT::v4f32);
+
+ // Update V1, V2 and InsertPSMask accordingly.
+ V1 = VA;
+ V2 = VB;
+
+ // Insert the V2 element into the desired position.
+ InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
+ assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+ return true;
+ };
+
+ if (matchAsInsertPS(V1, V2, Mask))
+ return true;
+
+ // Commute and try again.
+ SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
+ ShuffleVectorSDNode::commuteMask(CommutedMask);
+ if (matchAsInsertPS(V2, V1, CommutedMask))
+ return true;
+
+ return false;
+}
+
+static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
+ ArrayRef<int> Mask, const APInt &Zeroable,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+
+ // Attempt to match the insertps pattern.
+ unsigned InsertPSMask = 0;
+ if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
+ return SDValue();
+
+ // Insert the V2 element into the desired position.
+ return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+ DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
+}
+
+/// Try to lower a shuffle as a permute of the inputs followed by an
+/// UNPCK instruction.
+///
+/// This specifically targets cases where we end up with alternating between
+/// the two inputs, and so can permute them into something that feeds a single
+/// UNPCK instruction. Note that this routine only targets integer vectors
+/// because for floating point vectors we have a generalized SHUFPS lowering
+/// strategy that handles everything that doesn't *exactly* match an unpack,
+/// making this clever lowering unnecessary.
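+///
+/// For example, the v8i16 mask <1, 8, 3, 10, 5, 12, 7, 14> is handled by
+/// shuffling V1 to <1, 3, 5, 7, u, u, u, u>, V2 to <0, 2, 4, 6, u, u, u, u>
+/// and then emitting a single PUNPCKLWD of the two results.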
+static SDValue lowerShuffleAsPermuteAndUnpack(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+ assert(!VT.isFloatingPoint() &&
+ "This routine only supports integer vectors.");
+ assert(VT.is128BitVector() &&
+ "This routine only works on 128-bit vectors.");
+ assert(!V2.isUndef() &&
+ "This routine should only be used when blending two inputs.");
+ assert(Mask.size() >= 2 && "Single element masks are invalid.");
+
+ int Size = Mask.size();
+
+ int NumLoInputs =
+ count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
+ int NumHiInputs =
+ count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
+
+ bool UnpackLo = NumLoInputs >= NumHiInputs;
+
+ auto TryUnpack = [&](int ScalarSize, int Scale) {
+ SmallVector<int, 16> V1Mask((unsigned)Size, -1);
+ SmallVector<int, 16> V2Mask((unsigned)Size, -1);
+
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ // Each element of the unpack contains Scale elements from this mask.
+ int UnpackIdx = i / Scale;
+
+ // We only handle the case where V1 feeds the first slots of the unpack.
+ // We rely on canonicalization to ensure this is the case.
+ if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
+ return SDValue();
+
+ // Setup the mask for this input. The indexing is tricky as we have to
+ // handle the unpack stride.
+ SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
+ VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
+ Mask[i] % Size;
+ }
+
+ // If we will have to shuffle both inputs to use the unpack, check whether
+ // we can just unpack first and shuffle the result. If so, skip this unpack.
+ if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
+ !isNoopShuffleMask(V2Mask))
+ return SDValue();
+
+ // Shuffle the inputs into place.
+ V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
+ V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
+
+ // Cast the inputs to the type we will use to unpack them.
+ MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
+ V1 = DAG.getBitcast(UnpackVT, V1);
+ V2 = DAG.getBitcast(UnpackVT, V2);
+
+ // Unpack the inputs and cast the result back to the desired type.
+ return DAG.getBitcast(
+ VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
+ UnpackVT, V1, V2));
+ };
+
+ // We try each unpack from the largest to the smallest to try and find one
+ // that fits this mask.
+ int OrigScalarSize = VT.getScalarSizeInBits();
+ for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
+ if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
+ return Unpack;
+
+ // If we're shuffling with a zero vector then we're better off not doing
+ // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
+ if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
+ ISD::isBuildVectorAllZeros(V2.getNode()))
+ return SDValue();
+
+ // If none of the unpack-rooted lowerings worked (or were profitable) try an
+ // initial unpack.
+ if (NumLoInputs == 0 || NumHiInputs == 0) {
+ assert((NumLoInputs > 0 || NumHiInputs > 0) &&
+ "We have to have *some* inputs!");
+ int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
+
+ // FIXME: We could consider the total complexity of the permute of each
+ // possible unpacking. Or at the least we should consider how many
+ // half-crossings are created.
+ // FIXME: We could consider commuting the unpacks.
+
+ SmallVector<int, 32> PermMask((unsigned)Size, -1);
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
+
+ PermMask[i] =
+ 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
+ }
+ return DAG.getVectorShuffle(
+ VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
+ DL, VT, V1, V2),
+ DAG.getUNDEF(VT), PermMask);
+ }
+
+ return SDValue();
+}
+
+/// Handle lowering of 2-lane 64-bit floating point shuffles.
+///
+/// This is the basis function for the 2-lane 64-bit shuffles as we have full
+/// support for floating point shuffles but not integer shuffles. These
+/// instructions will incur a domain crossing penalty on some chips though so
+/// it is better to avoid lowering through this for integer vectors where
+/// possible.
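+///
+/// For example, the single-input mask <1, 0> is lowered to a VPERMILPD (or a
+/// SHUFPD of the input with itself) with immediate 1.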
+static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
+ assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
+
+ if (V2.isUndef()) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // Straight shuffle of a single input vector. Simulate this by using the
+    // single input as both of the "inputs" to this instruction.
+ unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
+
+ if (Subtarget.hasAVX()) {
+ // If we have AVX, we can use VPERMILPS which will allow folding a load
+ // into the shuffle.
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
+ DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
+ }
+
+ return DAG.getNode(
+ X86ISD::SHUFP, DL, MVT::v2f64,
+ Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
+ Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
+ DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
+ }
+ assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
+ assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
+ assert(Mask[0] < 2 && "We sort V1 to be the first input.");
+ assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
+
+ if (Subtarget.hasAVX2())
+ if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
+ return Extract;
+
+ // When loading a scalar and then shuffling it into a vector we can often do
+ // the insertion cheaply.
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
+ DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return Insertion;
+ // Try inverting the insertion since for v2 masks it is easy to do and we
+ // can't reliably sort the mask one way or the other.
+ int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
+ Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
+ DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
+ return Insertion;
+
+ // Try to use one of the special instruction patterns to handle two common
+ // blend patterns if a zero-blend above didn't work.
+ if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
+ isShuffleEquivalent(Mask, {1, 3}, V1, V2))
+ if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
+ // We can either use a special instruction to load over the low double or
+ // to move just the low double.
+ return DAG.getNode(
+ X86ISD::MOVSD, DL, MVT::v2f64, V2,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
+
+ if (Subtarget.hasSSE41())
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
+ return V;
+
+ unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
+ return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
+ DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
+}
+
+/// Handle lowering of 2-lane 64-bit integer shuffles.
+///
+/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
+/// the integer unit to minimize domain crossing penalties. However, for blends
+/// it falls back to the floating point shuffle operation with appropriate bit
+/// casting.
+static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
+ assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
+
+ if (V2.isUndef()) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // Straight shuffle of a single input vector. For everything from SSE2
+ // onward this has a single fast instruction with no scary immediates.
+ // We have to map the mask as it is actually a v4i32 shuffle instruction.
+ V1 = DAG.getBitcast(MVT::v4i32, V1);
+ int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
+ Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
+ Mask[1] < 0 ? -1 : (Mask[1] * 2),
+ Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
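+ // Illustration: the qword-swap mask <1, 0> widens to <2, 3, 0, 1>, which
+ // getV4X86ShuffleImm8ForMask encodes as 0x4E, i.e. a single PSHUFD $0x4E.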
+ return DAG.getBitcast(
+ MVT::v2i64,
+ DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
+ getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
+ }
+ assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
+ assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
+ assert(Mask[0] < 2 && "We sort V1 to be the first input.");
+ assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
+
+ if (Subtarget.hasAVX2())
+ if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
+ return Extract;
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // When loading a scalar and then shuffling it into a vector we can often do
+ // the insertion cheaply.
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
+ DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return Insertion;
+ // Try inverting the insertion since for v2 masks it is easy to do and we
+ // can't reliably sort the mask one way or the other.
+ int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
+ DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
+ return Insertion;
+
+ // We have different paths for blend lowering, but they all must use the
+ // *exact* same predicate.
+ bool IsBlendSupported = Subtarget.hasSSE41();
+ if (IsBlendSupported)
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to use byte rotation instructions.
+ // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
+ if (Subtarget.hasSSSE3()) {
+ if (Subtarget.hasVLX())
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
+ Subtarget, DAG))
+ return Rotate;
+ }
+
+ // If we have direct support for blends, we should lower by decomposing into
+ // a permute. That will be faster than the domain cross.
+ if (IsBlendSupported)
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
+ Subtarget, DAG);
+
+ // We implement this with SHUFPD which is pretty lame because it will likely
+ // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
+ // However, all the alternatives are still more cycles and newer chips don't
+ // have this problem. It would be really nice if x86 had better shuffles here.
+ V1 = DAG.getBitcast(MVT::v2f64, V1);
+ V2 = DAG.getBitcast(MVT::v2f64, V2);
+ return DAG.getBitcast(MVT::v2i64,
+ DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
+}
+
+/// Lower a vector shuffle using the SHUFPS instruction.
+///
+/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
+/// It makes no assumptions about whether this is the *best* lowering, it simply
+/// uses it.
+static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
+ SDValue LowV = V1, HighV = V2;
+ SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
+ int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
+
+ if (NumV2Elements == 1) {
+ int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
+
+ // Compute the index adjacent to V2Index and in the same half by toggling
+ // the low bit.
+ int V2AdjIndex = V2Index ^ 1;
+
+ if (Mask[V2AdjIndex] < 0) {
+ // Handles all the cases where we have a single V2 element and an undef.
+ // This will only ever happen in the high lanes because we commute the
+ // vector otherwise.
+ if (V2Index < 2)
+ std::swap(LowV, HighV);
+ NewMask[V2Index] -= 4;
+ } else {
+ // Handle the case where the V2 element ends up adjacent to a V1 element.
+ // To make this work, blend them together as the first step.
+ int V1Index = V2AdjIndex;
+ int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
+ V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
+ getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
+
+ // Now proceed to reconstruct the final blend as we have the necessary
+ // high or low half formed.
+ if (V2Index < 2) {
+ LowV = V2;
+ HighV = V1;
+ } else {
+ HighV = V2;
+ }
+ NewMask[V1Index] = 2; // We put the V1 element in V2[2].
+ NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
+ }
+ } else if (NumV2Elements == 2) {
+ if (Mask[0] < 4 && Mask[1] < 4) {
+ // Handle the easy case where we have V1 in the low lanes and V2 in the
+ // high lanes.
+ NewMask[2] -= 4;
+ NewMask[3] -= 4;
+ } else if (Mask[2] < 4 && Mask[3] < 4) {
+ // We also handle the reversed case because this utility may get called
+ // when we detect a SHUFPS pattern but can't easily commute the shuffle to
+ // arrange things in the right direction.
+ NewMask[0] -= 4;
+ NewMask[1] -= 4;
+ HighV = V1;
+ LowV = V2;
+ } else {
+ // We have a mixture of V1 and V2 in both low and high lanes. Rather than
+ // trying to place elements directly, just blend them and set up the final
+ // shuffle to place them.
+
+ // The first two blend mask elements are for V1, the second two are for
+ // V2.
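+ // Worked illustration: if the mask <0, 4, 1, 5> reaches this helper, this
+ // path forms BlendMask <0, 1, 0, 1>, so the SHUFPS below yields
+ // [V1[0], V1[1], V2[0], V2[1]], and the final NewMask <0, 2, 1, 3> then
+ // produces [V1[0], V2[0], V1[1], V2[1]] as required.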
+ int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
+ Mask[2] < 4 ? Mask[2] : Mask[3],
+ (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
+ (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
+ V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
+ getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
+
+ // Now we do a normal shuffle of V1 by giving V1 as both operands to
+ // a blend.
+ LowV = HighV = V1;
+ NewMask[0] = Mask[0] < 4 ? 0 : 2;
+ NewMask[1] = Mask[0] < 4 ? 2 : 0;
+ NewMask[2] = Mask[2] < 4 ? 1 : 3;
+ NewMask[3] = Mask[2] < 4 ? 3 : 1;
+ }
+ } else if (NumV2Elements == 3) {
+ // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
+ // we can get here via other paths (e.g. repeated mask matching) where we
+ // don't want to do another round of lowerVECTOR_SHUFFLE.
+ ShuffleVectorSDNode::commuteMask(NewMask);
+ return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
+ }
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
+ getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
+}
+
+/// Lower 4-lane 32-bit floating point shuffles.
+///
+/// Uses instructions exclusively from the floating point unit to minimize
+/// domain crossing penalties, as these are sufficient to implement all v4f32
+/// shuffles.
+static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
+
+ if (NumV2Elements == 0) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // Use even/odd duplicate instructions for masks that match their pattern.
+ if (Subtarget.hasSSE3()) {
+ if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
+ return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
+ if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
+ return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
+ }
+
+ if (Subtarget.hasAVX()) {
+ // If we have AVX, we can use VPERMILPS which will allow folding a load
+ // into the shuffle.
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+ }
+
+ // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
+ // in SSE1 because otherwise they are widened to v2f64 and never get here.
+ if (!Subtarget.hasSSE2()) {
+ if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
+ return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
+ if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
+ return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
+ }
+
+ // Otherwise, use a straight shuffle of a single input vector. We pass the
+ // input vector to both operands to simulate this with a SHUFPS.
+ return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+ }
+
+ if (Subtarget.hasAVX2())
+ if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
+ return Extract;
+
+ // There are special ways we can lower some single-element blends. However, we
+ // have custom ways we can lower more complex single-element blends below that
+ // we defer to if both this and BLENDPS fail to match, so restrict this to
+ // when the V2 input is targeting element 0 of the mask -- that is the fast
+ // case here.
+ if (NumV2Elements == 1 && Mask[0] >= 4)
+ if (SDValue V = lowerShuffleAsElementInsertion(
+ DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return V;
+
+ if (Subtarget.hasSSE41()) {
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ // Use INSERTPS if we can complete the shuffle efficiently.
+ if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
+ return V;
+
+ if (!isSingleSHUFPSMask(Mask))
+ if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
+ V2, Mask, DAG))
+ return BlendPerm;
+ }
+
+ // Use low/high mov instructions. These are only valid in SSE1 because
+ // otherwise they are widened to v2f64 and never get here.
+ if (!Subtarget.hasSSE2()) {
+ if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
+ return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
+ if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
+ return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
+ }
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
+ return V;
+
+ // Otherwise fall back to a SHUFPS lowering strategy.
+ return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
+}
+
+/// Lower 4-lane i32 vector shuffles.
+///
+/// We try to handle these with integer-domain shuffles where we can, but for
+/// blends we use the floating point domain blend instructions.
+static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return ZExt;
+
+ int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
+
+ if (NumV2Elements == 0) {
+ // Try to use broadcast unless the mask only has one non-undef element.
+ if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+ }
+
+ // Straight shuffle of a single input vector. For everything from SSE2
+ // onward this has a single fast instruction with no scary immediates.
+ // We coerce the shuffle pattern to be compatible with UNPCK instructions
+ // but we aren't actually going to use the UNPCK instruction because doing
+ // so prevents folding a load into this instruction or making a copy.
+ const int UnpackLoMask[] = {0, 0, 1, 1};
+ const int UnpackHiMask[] = {2, 2, 3, 3};
+ if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
+ Mask = UnpackLoMask;
+ else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
+ Mask = UnpackHiMask;
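+ // For example, the mask <0, 0, 1, 1> encodes as the PSHUFD immediate 0x50,
+ // which computes the same result as PUNPCKLDQ of V1 with itself, but
+ // keeping it as PSHUFD preserves the load-folding opportunity noted above.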
+
+ return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+ }
+
+ if (Subtarget.hasAVX2())
+ if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
+ return Extract;
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // There are special ways we can lower some single-element blends.
+ if (NumV2Elements == 1)
+ if (SDValue V = lowerShuffleAsElementInsertion(
+ DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return V;
+
+ // We have different paths for blend lowering, but they all must use the
+ // *exact* same predicate.
+ bool IsBlendSupported = Subtarget.hasSSE41();
+ if (IsBlendSupported)
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Masked;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to use byte rotation instructions.
+ // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
+ if (Subtarget.hasSSSE3()) {
+ if (Subtarget.hasVLX())
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
+ Subtarget, DAG))
+ return Rotate;
+ }
+
+ // Assume that a single SHUFPS is faster than an alternative sequence of
+ // multiple instructions (even if the CPU has a domain penalty).
+ // If some CPU is harmed by the domain switch, we can fix it in a later pass.
+ if (!isSingleSHUFPSMask(Mask)) {
+ // If we have direct support for blends, we should lower by decomposing into
+ // a permute. That will be faster than the domain cross.
+ if (IsBlendSupported)
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
+ Subtarget, DAG);
+
+ // Try to lower by permuting the inputs into an unpack instruction.
+ if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
+ Mask, Subtarget, DAG))
+ return Unpack;
+ }
+
+ // We implement this with SHUFPS because it can blend from two vectors.
+ // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
+ // up the inputs, bypassing domain shift penalties that we would incur if we
+ // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
+ // relevant.
+ SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
+ SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
+ SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
+ return DAG.getBitcast(MVT::v4i32, ShufPS);
+}
+
+/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
+/// shuffle lowering, and the most complex part.
+///
+/// The lowering strategy is to try to form pairs of input lanes which are
+/// targeted at the same half of the final vector, and then use a dword shuffle
+/// to place them onto the right half, and finally unpack the paired lanes into
+/// their final position.
+///
+/// The exact breakdown of how to form these dword pairs and align them on the
+/// correct sides is really tricky. See the comments within the function for
+/// more of the details.
+///
+/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
+/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
+/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
+/// vector, form the analogous 128-bit 8-element Mask.
+static SDValue lowerV8I16GeneralSingleInputShuffle(
+ const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+ assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
+ MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+
+ assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
+ MutableArrayRef<int> LoMask = Mask.slice(0, 4);
+ MutableArrayRef<int> HiMask = Mask.slice(4, 4);
+
+ // Attempt to directly match PSHUFLW or PSHUFHW.
+ if (isUndefOrInRange(LoMask, 0, 4) &&
+ isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
+ return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
+ }
+ if (isUndefOrInRange(HiMask, 4, 8) &&
+ isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
+ for (int i = 0; i != 4; ++i)
+ HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
+ return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
+ }
+
+ SmallVector<int, 4> LoInputs;
+ copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
+ array_pod_sort(LoInputs.begin(), LoInputs.end());
+ LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
+ SmallVector<int, 4> HiInputs;
+ copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
+ array_pod_sort(HiInputs.begin(), HiInputs.end());
+ HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
+ int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
+ int NumHToL = LoInputs.size() - NumLToL;
+ int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
+ int NumHToH = HiInputs.size() - NumLToH;
+ MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
+ MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
+ MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
+ MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
+
+ // If we are shuffling values from one half, check how many different DWORD
+ // pairs we need to create. If only 1 or 2 are needed, we can perform this as
+ // a PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain
+ // below.
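+ // For instance, the single-half mask <0, 1, 0, 1, 2, 3, 2, 3> needs only
+ // two distinct DWORD pairs ((0,1) and (2,3)), so it can be lowered as an
+ // identity PSHUFLW followed by PSHUFD <0, 0, 1, 1>.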
+ auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
+ ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
+ V = DAG.getNode(ShufWOp, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
+ V = DAG.getBitcast(PSHUFDVT, V);
+ V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
+ return DAG.getBitcast(VT, V);
+ };
+
+ if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
+ int PSHUFDMask[4] = { -1, -1, -1, -1 };
+ SmallVector<std::pair<int, int>, 4> DWordPairs;
+ int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
+
+ // Collect the different DWORD pairs.
+ for (int DWord = 0; DWord != 4; ++DWord) {
+ int M0 = Mask[2 * DWord + 0];
+ int M1 = Mask[2 * DWord + 1];
+ M0 = (M0 >= 0 ? M0 % 4 : M0);
+ M1 = (M1 >= 0 ? M1 % 4 : M1);
+ if (M0 < 0 && M1 < 0)
+ continue;
+
+ bool Match = false;
+ for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
+ auto &DWordPair = DWordPairs[j];
+ if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
+ (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
+ DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
+ DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
+ PSHUFDMask[DWord] = DOffset + j;
+ Match = true;
+ break;
+ }
+ }
+ if (!Match) {
+ PSHUFDMask[DWord] = DOffset + DWordPairs.size();
+ DWordPairs.push_back(std::make_pair(M0, M1));
+ }
+ }
+
+ if (DWordPairs.size() <= 2) {
+ DWordPairs.resize(2, std::make_pair(-1, -1));
+ int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
+ DWordPairs[1].first, DWordPairs[1].second};
+ if ((NumHToL + NumHToH) == 0)
+ return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
+ if ((NumLToL + NumLToH) == 0)
+ return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
+ }
+ }
+
+ // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
+ // such inputs we can swap two of the dwords across the half mark and end up
+ // with <=2 inputs to each half in each half. Once there, we can fall through
+ // to the generic code below. For example:
+ //
+ // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
+ // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
+ //
+ // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
+ // and an existing 2-into-2 on the other half. In this case we may have to
+ // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
+ // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
+ // Fortunately, we don't have to handle anything but a 2-into-2 pattern
+ // because any other situation (including a 3-into-1 or 1-into-3 in the other
+ // half than the one we target for fixing) will be fixed when we re-enter this
+ // path. We will also combine away any sequence of PSHUFD instructions that
+ // result into a single instruction. Here is an example of the tricky case:
+ //
+ // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
+ // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
+ //
+ // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
+ //
+ // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
+ // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
+ //
+ // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
+ // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
+ //
+ // The result is fine to be handled by the generic logic.
+ auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
+ ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
+ int AOffset, int BOffset) {
+ assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
+ "Must call this with A having 3 or 1 inputs from the A half.");
+ assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
+ "Must call this with B having 1 or 3 inputs from the B half.");
+ assert(AToAInputs.size() + BToAInputs.size() == 4 &&
+ "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
+
+ bool ThreeAInputs = AToAInputs.size() == 3;
+
+ // Compute the index of the dword that contains only one of the three
+ // inputs in a half: take the sum of all word indices in the half with
+ // three inputs and subtract the sum of the actual three inputs. The
+ // difference is the remaining (non-input) slot.
+ int ADWord = 0, BDWord = 0;
+ int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
+ int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
+ int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
+ ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
+ int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
+ int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
+ int TripleNonInputIdx =
+ TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
+ TripleDWord = TripleNonInputIdx / 2;
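+ // E.g. if the half with three inputs (offset 0) uses words {0, 1, 3},
+ // TripleInputSum is 6 and the inputs sum to 4, so the remaining slot is
+ // word 2 and TripleDWord is 1.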
+
+ // We use xor with one to compute the adjacent DWord to whichever one the
+ // OneInput is in.
+ OneInputDWord = (OneInput / 2) ^ 1;
+
+ // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
+ // and BToA inputs. If there is also such a problem with the BToB and AToB
+ // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
+ // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
+ // is essential that we don't *create* a 3<-1 as then we might oscillate.
+ if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
+ // Compute how many inputs will be flipped by swapping these DWords. We need
+ // to balance this to ensure we don't form a 3-1 shuffle in the other half.
+ int NumFlippedAToBInputs =
+ std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
+ std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
+ int NumFlippedBToBInputs =
+ std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
+ std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
+ if ((NumFlippedAToBInputs == 1 &&
+ (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
+ (NumFlippedBToBInputs == 1 &&
+ (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
+ // We choose whether to fix the A half or B half based on whether that
+ // half has zero flipped inputs. At zero, we may not be able to fix it
+ // with that half. We also bias towards fixing the B half because that
+ // will more commonly be the high half, and we have to bias one way.
+ auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
+ ArrayRef<int> Inputs) {
+ int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
+ bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
+ // Determine whether the free index is in the flipped dword or the
+ // unflipped dword based on where the pinned index is. We use this bit
+ // in an xor to conditionally select the adjacent dword.
+ int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
+ bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
+ if (IsFixIdxInput == IsFixFreeIdxInput)
+ FixFreeIdx += 1;
+ IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
+ assert(IsFixIdxInput != IsFixFreeIdxInput &&
+ "We need to be changing the number of flipped inputs!");
+ int PSHUFHalfMask[] = {0, 1, 2, 3};
+ std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
+ V = DAG.getNode(
+ FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
+ MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
+ getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
+
+ for (int &M : Mask)
+ if (M >= 0 && M == FixIdx)
+ M = FixFreeIdx;
+ else if (M >= 0 && M == FixFreeIdx)
+ M = FixIdx;
+ };
+ if (NumFlippedBToBInputs != 0) {
+ int BPinnedIdx =
+ BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
+ FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
+ } else {
+ assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
+ int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
+ FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
+ }
+ }
+ }
+
+ int PSHUFDMask[] = {0, 1, 2, 3};
+ PSHUFDMask[ADWord] = BDWord;
+ PSHUFDMask[BDWord] = ADWord;
+ V = DAG.getBitcast(
+ VT,
+ DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+
+ // Adjust the mask to match the new locations of A and B.
+ for (int &M : Mask)
+ if (M >= 0 && M/2 == ADWord)
+ M = 2 * BDWord + M % 2;
+ else if (M >= 0 && M/2 == BDWord)
+ M = 2 * ADWord + M % 2;
+
+ // Recurse back into this routine to re-compute state now that this isn't
+ // a 3 and 1 problem.
+ return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
+ };
+ if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
+ return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
+ if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
+ return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
+
+ // At this point there are at most two inputs to the low and high halves from
+ // each half. That means the inputs can always be grouped into dwords and
+ // those dwords can then be moved to the correct half with a dword shuffle.
+ // We use at most one low and one high word shuffle to collect these paired
+ // inputs into dwords, and finally a dword shuffle to place them.
+ int PSHUFLMask[4] = {-1, -1, -1, -1};
+ int PSHUFHMask[4] = {-1, -1, -1, -1};
+ int PSHUFDMask[4] = {-1, -1, -1, -1};
+
+ // First fix the masks for all the inputs that are staying in their
+ // original halves. This will then dictate the targets of the cross-half
+ // shuffles.
+ auto fixInPlaceInputs =
+ [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
+ MutableArrayRef<int> SourceHalfMask,
+ MutableArrayRef<int> HalfMask, int HalfOffset) {
+ if (InPlaceInputs.empty())
+ return;
+ if (InPlaceInputs.size() == 1) {
+ SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
+ InPlaceInputs[0] - HalfOffset;
+ PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
+ return;
+ }
+ if (IncomingInputs.empty()) {
+ // Just fix all of the in place inputs.
+ for (int Input : InPlaceInputs) {
+ SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
+ PSHUFDMask[Input / 2] = Input / 2;
+ }
+ return;
+ }
+
+ assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
+ SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
+ InPlaceInputs[0] - HalfOffset;
+ // Put the second input next to the first so that they are packed into
+ // a dword. We find the adjacent index by toggling the low bit.
+ int AdjIndex = InPlaceInputs[0] ^ 1;
+ SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
+ std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
+ PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
+ };
+ fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
+ fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
+
+ // Now gather the cross-half inputs and place them into a free dword of
+ // their target half.
+ // FIXME: This operation could almost certainly be simplified dramatically to
+ // look more like the 3-1 fixing operation.
+ auto moveInputsToRightHalf = [&PSHUFDMask](
+ MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
+ MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
+ MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
+ int DestOffset) {
+ auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
+ return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
+ };
+ auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
+ int Word) {
+ int LowWord = Word & ~1;
+ int HighWord = Word | 1;
+ return isWordClobbered(SourceHalfMask, LowWord) ||
+ isWordClobbered(SourceHalfMask, HighWord);
+ };
+
+ if (IncomingInputs.empty())
+ return;
+
+ if (ExistingInputs.empty()) {
+ // Map any dwords with inputs from them into the right half.
+ for (int Input : IncomingInputs) {
+ // If the source half mask maps over the inputs, turn those into
+ // swaps and use the swapped lane.
+ if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
+ if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
+ SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
+ Input - SourceOffset;
+ // We have to swap the uses in our half mask in one sweep.
+ for (int &M : HalfMask)
+ if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
+ M = Input;
+ else if (M == Input)
+ M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
+ } else {
+ assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
+ Input - SourceOffset &&
+ "Previous placement doesn't match!");
+ }
+ // Note that this correctly re-maps both when we do a swap and when
+ // we observe the other side of the swap above. We rely on that to
+ // avoid swapping the members of the input list directly.
+ Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
+ }
+
+ // Map the input's dword into the correct half.
+ if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
+ PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
+ else
+ assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
+ Input / 2 &&
+ "Previous placement doesn't match!");
+ }
+
+ // And just directly shift any other-half mask elements to be same-half
+ // as we will have mirrored the dword containing the element into the
+ // same position within that half.
+ for (int &M : HalfMask)
+ if (M >= SourceOffset && M < SourceOffset + 4) {
+ M = M - SourceOffset + DestOffset;
+ assert(M >= 0 && "This should never wrap below zero!");
+ }
+ return;
+ }
+
+ // Ensure we have the input in a viable dword of its current half. This
+ // is particularly tricky because the original position may be clobbered
+ // by inputs being moved and *staying* in that half.
+ if (IncomingInputs.size() == 1) {
+ if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
+ int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
+ SourceOffset;
+ SourceHalfMask[InputFixed - SourceOffset] =
+ IncomingInputs[0] - SourceOffset;
+ std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
+ InputFixed);
+ IncomingInputs[0] = InputFixed;
+ }
+ } else if (IncomingInputs.size() == 2) {
+ if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
+ isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
+ // We have two non-adjacent or clobbered inputs we need to extract from
+ // the source half. To do this, we need to map them into some adjacent
+ // dword slot in the source mask.
+ int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
+ IncomingInputs[1] - SourceOffset};
+
+ // If there is a free slot in the source half mask adjacent to one of
+ // the inputs, place the other input in it. We use (Index XOR 1) to
+ // compute an adjacent index.
+ if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
+ SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
+ SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
+ SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
+ InputsFixed[1] = InputsFixed[0] ^ 1;
+ } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
+ SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
+ SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
+ SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
+ InputsFixed[0] = InputsFixed[1] ^ 1;
+ } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
+ SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
+ // The two inputs are in the same DWord but it is clobbered and the
+ // adjacent DWord isn't used at all. Move both inputs to the free
+ // slot.
+ SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
+ SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
+ InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
+ InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
+ } else {
+ // The only way we hit this point is if there is no clobbering
+ // (because there are no off-half inputs to this half) and there is no
+ // free slot adjacent to one of the inputs. In this case, we have to
+ // swap an input with a non-input.
+ for (int i = 0; i < 4; ++i)
+ assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
+ "We can't handle any clobbers here!");
+ assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
+ "Cannot have adjacent inputs here!");
+
+ SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
+ SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
+
+ // We also have to update the final source mask in this case because
+ // it may need to undo the above swap.
+ for (int &M : FinalSourceHalfMask)
+ if (M == (InputsFixed[0] ^ 1) + SourceOffset)
+ M = InputsFixed[1] + SourceOffset;
+ else if (M == InputsFixed[1] + SourceOffset)
+ M = (InputsFixed[0] ^ 1) + SourceOffset;
+
+ InputsFixed[1] = InputsFixed[0] ^ 1;
+ }
+
+ // Point everything at the fixed inputs.
+ for (int &M : HalfMask)
+ if (M == IncomingInputs[0])
+ M = InputsFixed[0] + SourceOffset;
+ else if (M == IncomingInputs[1])
+ M = InputsFixed[1] + SourceOffset;
+
+ IncomingInputs[0] = InputsFixed[0] + SourceOffset;
+ IncomingInputs[1] = InputsFixed[1] + SourceOffset;
+ }
+ } else {
+ llvm_unreachable("Unhandled input size!");
+ }
+
+ // Now hoist the DWord down to the right half.
+ int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
+ assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
+ PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
+ for (int &M : HalfMask)
+ for (int Input : IncomingInputs)
+ if (M == Input)
+ M = FreeDWord * 2 + Input % 2;
+ };
+ moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
+ /*SourceOffset*/ 4, /*DestOffset*/ 0);
+ moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
+ /*SourceOffset*/ 0, /*DestOffset*/ 4);
+
+ // Now enact all the shuffles we've computed to move the inputs into their
+ // target half.
+ if (!isNoopShuffleMask(PSHUFLMask))
+ V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
+ if (!isNoopShuffleMask(PSHUFHMask))
+ V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
+ if (!isNoopShuffleMask(PSHUFDMask))
+ V = DAG.getBitcast(
+ VT,
+ DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+
+ // At this point, each half should contain all its inputs, and we can then
+ // just shuffle them into their final position.
+ assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
+ "Failed to lift all the high half inputs to the low mask!");
+ assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
+ "Failed to lift all the low half inputs to the high mask!");
+
+ // Do a half shuffle for the low mask.
+ if (!isNoopShuffleMask(LoMask))
+ V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
+
+ // Do a half shuffle with the high mask after shifting its values down.
+ for (int &M : HiMask)
+ if (M >= 0)
+ M -= 4;
+ if (!isNoopShuffleMask(HiMask))
+ V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
+
+ return V;
+}
+
+/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
+/// blend if only one input is used.
+static SDValue lowerShuffleAsBlendOfPSHUFBs(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
+ assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
+ "Lane crossing shuffle masks not supported");
+
+ int NumBytes = VT.getSizeInBits() / 8;
+ int Size = Mask.size();
+ int Scale = NumBytes / Size;
+
+ SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
+ SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
+ V1InUse = false;
+ V2InUse = false;
+
+ for (int i = 0; i < NumBytes; ++i) {
+ int M = Mask[i / Scale];
+ if (M < 0)
+ continue;
+
+ const int ZeroMask = 0x80;
+ int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
+ int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
+ if (Zeroable[i / Scale])
+ V1Idx = V2Idx = ZeroMask;
+
+ V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
+ V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
+ V1InUse |= (ZeroMask != V1Idx);
+ V2InUse |= (ZeroMask != V2Idx);
+ }
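+ // Illustration for a v8i16 mask (Scale == 2): a mask element of 9 (word 1
+ // of V2) makes the two bytes it covers use V2Mask control bytes 2 and 3,
+ // while the matching V1Mask bytes get 0x80 so PSHUFB zeroes them before
+ // the OR blend below.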
+
+ MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
+ if (V1InUse)
+ V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
+ DAG.getBuildVector(ShufVT, DL, V1Mask));
+ if (V2InUse)
+ V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
+ DAG.getBuildVector(ShufVT, DL, V2Mask));
+
+ // If we need shuffled inputs from both, blend the two.
+ SDValue V;
+ if (V1InUse && V2InUse)
+ V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
+ else
+ V = V1InUse ? V1 : V2;
+
+ // Cast the result back to the correct type.
+ return DAG.getBitcast(VT, V);
+}
+
+/// Generic lowering of 8-lane i16 shuffles.
+///
+/// This handles both single-input shuffles and combined shuffle/blends with
+/// two inputs. The single input shuffles are immediately delegated to
+/// a dedicated lowering routine.
+///
+/// The blends are lowered in one of three fundamental ways. If there are few
+/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
+/// of the input is significantly cheaper when lowered as an interleaving of
+/// the two inputs, try to interleave them. Otherwise, blend the low and high
+/// halves of the inputs separately (making them have relatively few inputs)
+/// and then concatenate them.
+static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
+ assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative.
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return ZExt;
+
+ // Try to lower using a truncation.
+ if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+
+ int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
+
+ if (NumV2Inputs == 0) {
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // Try to use bit rotation instructions.
+ if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
+ return V;
+
+ // Use dedicated pack instructions for masks that match their pattern.
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
+ Subtarget))
+ return V;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
+ // Make a copy of the mask so it can be modified.
+ SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
+ return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
+ Subtarget, DAG);
+ }
+
+ assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
+ "All single-input shuffles should be canonicalized to be V1-input "
+ "shuffles.");
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // See if we can use SSE4A Extraction / Insertion.
+ if (Subtarget.hasSSE4A())
+ if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, DAG))
+ return V;
+
+ // There are special ways we can lower some single-element blends.
+ if (NumV2Inputs == 1)
+ if (SDValue V = lowerShuffleAsElementInsertion(
+ DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return V;
+
+ // We have different paths for blend lowering, but they all must use the
+ // *exact* same predicate.
+ bool IsBlendSupported = Subtarget.hasSSE41();
+ if (IsBlendSupported)
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Masked;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
+ return V;
+
+ // Use dedicated pack instructions for masks that match their pattern.
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
+ Subtarget))
+ return V;
+
+ // Try to lower using a truncation.
+ if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
+ if (SDValue BitBlend =
+ lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return BitBlend;
+
+ // Try to use byte shift instructions to mask.
+ if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return V;
+
+ // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
+ // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
+ // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
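+ // Sketch of the idea: for the two-input even-element mask
+ // <0, 2, 4, 6, 8, 10, 12, 14> (NumEvenDrops == 1), every dword of both
+ // inputs is masked down to 0x0000FFFF so the PACKUSDW cannot saturate, and
+ // the pack itself then performs the compaction into one v8i16 result.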
+ int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false);
+ if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
+ !Subtarget.hasVLX()) {
+ SmallVector<SDValue, 8> DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32));
+ for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
+ DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
+ SDValue DWordClearMask = DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
+ V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
+ DWordClearMask);
+ V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
+ DWordClearMask);
+ // Now pack things back together.
+ SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
+ if (NumEvenDrops == 2) {
+ Result = DAG.getBitcast(MVT::v4i32, Result);
+ Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
+ }
+ return Result;
+ }
+
+ // Try to lower by permuting the inputs into an unpack instruction.
+ if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
+ Mask, Subtarget, DAG))
+ return Unpack;
+
+ // If we can't directly blend but can use PSHUFB, that will be better as it
+ // can both shuffle and set up the inefficient blend.
+ if (!IsBlendSupported && Subtarget.hasSSSE3()) {
+ bool V1InUse, V2InUse;
+ return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, DAG, V1InUse, V2InUse);
+ }
+
+ // We can always bit-blend if we have to so the fallback strategy is to
+ // decompose into single-input permutes and blends/unpacks.
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
+ Mask, Subtarget, DAG);
+}
+
+// Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
+// sub-512-bit shuffles are padded to 512 bits for the shuffle and then
+// the active subvector is extracted.
+static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT MaskVT = VT.changeTypeToInteger();
+ SDValue MaskNode;
+ MVT ShuffleVT = VT;
+ if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
+ V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
+ V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
+ ShuffleVT = V1.getSimpleValueType();
+
+ // Adjust mask to correct indices for the second input.
+ int NumElts = VT.getVectorNumElements();
+ unsigned Scale = 512 / VT.getSizeInBits();
+ SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end());
+ for (int &M : AdjustedMask)
+ if (NumElts <= M)
+ M += (Scale - 1) * NumElts;
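+ // E.g. when widening a v16i8 shuffle to v64i8 (Scale == 4), a mask element
+ // of 20 (byte 4 of V2) becomes 20 + 3 * 16 == 68, which selects byte 4 of
+ // the widened V2 operand in the VPERMV3 index space.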
+ MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
+ MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
+ } else {
+ MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
+ }
+
+ SDValue Result;
+ if (V2.isUndef())
+ Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
+ else
+ Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
+
+ if (VT != ShuffleVT)
+ Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
+
+ return Result;
+}
+
+/// Generic lowering of v16i8 shuffles.
+///
+/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
+/// detect any complexity-reducing interleaving. If that doesn't help, it uses
+/// UNPCK to spread the i8 elements across two i16-element vectors, applies
+/// the existing lowering for v8i16 blends to each half, and finally PACKs
+/// them back together.
+static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
+ assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
+ // Use dedicated pack instructions for masks that match their pattern.
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
+ Subtarget))
+ return V;
+
+ // Try to use a zext lowering.
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return ZExt;
+
+ // Try to lower using a truncation.
+ if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+
+ if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+
+ // See if we can use SSE4A Extraction / Insertion.
+ if (Subtarget.hasSSE4A())
+ if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, DAG))
+ return V;
+
+ int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
+
+ // For single-input shuffles, there are some nicer lowering tricks we can use.
+ if (NumV2Elements == 0) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // Try to use bit rotation instructions.
+ if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
+ return V;
+
+ // Check whether we can widen this to an i16 shuffle by duplicating bytes.
+ // Notably, this handles splat and partial-splat shuffles more efficiently.
+ // However, it only makes sense if the pre-duplication shuffle simplifies
+ // things significantly. Currently, this means we need to be able to
+ // express the pre-duplication shuffle as an i16 shuffle.
+ //
+ // FIXME: We should check for other patterns which can be widened into an
+ // i16 shuffle as well.
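+ // Illustration: a byte splat such as <3, 3, ..., 3> satisfies the check
+ // below; the pre-duplication i16 shuffle leaves word 1 in place, UNPCKL of
+ // V1 with itself then puts byte 3 into both bytes of word 3, and a final
+ // v8i16 splat of word 3 broadcasts it across the whole vector.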
+ auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
+ for (int i = 0; i < 16; i += 2)
+ if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
+ return false;
+
+ return true;
+ };
+ auto tryToWidenViaDuplication = [&]() -> SDValue {
+ if (!canWidenViaDuplication(Mask))
+ return SDValue();
+ SmallVector<int, 4> LoInputs;
+ copy_if(Mask, std::back_inserter(LoInputs),
+ [](int M) { return M >= 0 && M < 8; });
+ array_pod_sort(LoInputs.begin(), LoInputs.end());
+ LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
+ LoInputs.end());
+ SmallVector<int, 4> HiInputs;
+ copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
+ array_pod_sort(HiInputs.begin(), HiInputs.end());
+ HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
+ HiInputs.end());
+
+ bool TargetLo = LoInputs.size() >= HiInputs.size();
+ ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
+ ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
+
+ int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ SmallDenseMap<int, int, 8> LaneMap;
+ for (int I : InPlaceInputs) {
+ PreDupI16Shuffle[I/2] = I/2;
+ LaneMap[I] = I;
+ }
+ int j = TargetLo ? 0 : 4, je = j + 4;
+ for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
+ // Check if j is already a shuffle of this input. This happens when
+ // there are two adjacent bytes after we move the low one.
+ if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
+ // If we haven't yet mapped the input, search for a slot into which
+ // we can map it.
+ while (j < je && PreDupI16Shuffle[j] >= 0)
+ ++j;
+
+ if (j == je)
+ // We can't place the inputs into a single half with a simple i16
+ // shuffle, so bail.
+ return SDValue();
+
+ // Map this input with the i16 shuffle.
+ PreDupI16Shuffle[j] = MovingInputs[i] / 2;
+ }
+
+ // Update the lane map based on the mapping we ended up with.
+ LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
+ }
+ V1 = DAG.getBitcast(
+ MVT::v16i8,
+ DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
+ DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
+
+ // Unpack the bytes to form the i16s that will be shuffled into place.
+ bool EvenInUse = false, OddInUse = false;
+ for (int i = 0; i < 16; i += 2) {
+ EvenInUse |= (Mask[i + 0] >= 0);
+ OddInUse |= (Mask[i + 1] >= 0);
+ if (EvenInUse && OddInUse)
+ break;
+ }
+ V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
+ MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
+ OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
+
+ int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ for (int i = 0; i < 16; ++i)
+ if (Mask[i] >= 0) {
+ int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
+ assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
+ if (PostDupI16Shuffle[i / 2] < 0)
+ PostDupI16Shuffle[i / 2] = MappedMask;
+ else
+ assert(PostDupI16Shuffle[i / 2] == MappedMask &&
+ "Conflicting entries in the original shuffle!");
+ }
+ return DAG.getBitcast(
+ MVT::v16i8,
+ DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
+ DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
+ };
+ if (SDValue V = tryToWidenViaDuplication())
+ return V;
+ }
+
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Masked;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to use byte shift instructions to mask.
+ if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return V;
+
+ // Check for compaction patterns.
+ bool IsSingleInput = V2.isUndef();
+ int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput);
+
+ // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
+ // with PSHUFB. It is important to do this before we attempt to generate any
+ // blends but after all of the single-input lowerings. If the single input
+ // lowerings can find an instruction sequence that is faster than a PSHUFB, we
+ // want to preserve that and we can DAG combine any longer sequences into
+ // a PSHUFB in the end. But once we start blending from multiple inputs,
+ // the complexity of DAG combining bad patterns back into PSHUFB is too high,
+ // and there are *very* few patterns that would actually be faster than the
+ // PSHUFB approach because of its ability to zero lanes.
+ //
+ // If the mask is a binary compaction, we can more efficiently perform this
+ // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
+ //
+ // FIXME: The only exceptions to the above are blends which are exact
+ // interleavings with direct instructions supporting them. We currently don't
+ // handle those well here.
+ if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
+ bool V1InUse = false;
+ bool V2InUse = false;
+
+ SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
+ DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
+
+ // If both V1 and V2 are in use and we can use a direct blend or an unpack,
+ // do so. This avoids using them to handle blends-with-zero which is
+ // important as a single pshufb is significantly faster for that.
+ if (V1InUse && V2InUse) {
+ if (Subtarget.hasSSE41())
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ // We can use an unpack to do the blending rather than an or in some
+ // cases. Even though the or may be (very slightly) more efficient, we
+ // prefer this lowering because there are common cases where part of
+ // the complexity of the shuffles goes away when we do the final blend as
+ // an unpack.
+ // FIXME: It might be worth trying to detect if the unpack-feeding
+ // shuffles will both be pshufb, in which case we shouldn't bother with
+ // this.
+ if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
+ DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+ return Unpack;
+
+ // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
+ if (Subtarget.hasVBMI())
+ return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
+ DAG);
+
+ // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
+ if (Subtarget.hasXOP()) {
+ SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
+ return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
+ }
+
+ // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
+ // PALIGNR will be cheaper than the second PSHUFB+OR.
+ if (SDValue V = lowerShuffleAsByteRotateAndPermute(
+ DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+ return V;
+ }
+
+ return PSHUFB;
+ }
+
+ // There are special ways we can lower some single-element blends.
+ if (NumV2Elements == 1)
+ if (SDValue V = lowerShuffleAsElementInsertion(
+ DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return V;
+
+ if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
+ return Blend;
+
+ // Check whether a compaction lowering can be done. This handles shuffles
+ // which take every Nth element for some even N. See the helper function for
+ // details.
+ //
+ // We special case these as they can be particularly efficiently handled with
+ // the PACKUSWB instruction on x86 and they show up in common patterns of
+ // rearranging bytes to truncate wide elements.
+ if (NumEvenDrops) {
+ // NumEvenDrops is the power of two stride of the elements. Another way of
+ // thinking about it is that we need to drop the even elements this many
+ // times to get the original input.
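+ //
+ // For example (illustrative), with NumEvenDrops == 1 the mask gathers
+ // every 2nd byte (<0, 2, 4, ..., 14, 16, 18, ..., 30>): the word-clear
+ // mask below keeps only the low byte of each i16 and a single PACKUS then
+ // packs those bytes back together.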
+
+ // First we need to zero all the dropped bytes.
+ assert(NumEvenDrops <= 3 &&
+ "No support for dropping even elements more than 3 times.");
+ SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
+ for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
+ WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
+ SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
+ V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
+ WordClearMask);
+ if (!IsSingleInput)
+ V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
+ WordClearMask);
+
+ // Now pack things back together.
+ SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
+ IsSingleInput ? V1 : V2);
+ for (int i = 1; i < NumEvenDrops; ++i) {
+ Result = DAG.getBitcast(MVT::v8i16, Result);
+ Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
+ }
+ return Result;
+ }
+
+ // Handle multi-input cases by blending/unpacking single-input shuffles.
+ if (NumV2Elements > 0)
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
+ Subtarget, DAG);
+
+ // The fallback path for single-input shuffles widens this into two v8i16
+ // vectors with unpacks, shuffles those, and then pulls them back together
+ // with a pack.
+ SDValue V = V1;
+
+ std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
+ std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
+ for (int i = 0; i < 16; ++i)
+ if (Mask[i] >= 0)
+ (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
+
+ SDValue VLoHalf, VHiHalf;
+ // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
+ // them out and avoid using UNPCK{L,H} to extract the elements of V as
+ // i16s.
+ if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
+ none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
+ // Use a mask to drop the high bytes.
+ VLoHalf = DAG.getBitcast(MVT::v8i16, V);
+ VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
+ DAG.getConstant(0x00FF, DL, MVT::v8i16));
+
+ // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
+ VHiHalf = DAG.getUNDEF(MVT::v8i16);
+
+ // Squash the masks to point directly into VLoHalf.
+ for (int &M : LoBlendMask)
+ if (M >= 0)
+ M /= 2;
+ for (int &M : HiBlendMask)
+ if (M >= 0)
+ M /= 2;
+ } else {
+ // Otherwise just unpack the low half of V into VLoHalf and the high half
+ // into VHiHalf so that we can blend them as i16s.
+ SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
+
+ VLoHalf = DAG.getBitcast(
+ MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
+ VHiHalf = DAG.getBitcast(
+ MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
+ }
+
+ SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
+ SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
+
+ return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
+}
+
+/// Dispatching routine to lower various 128-bit x86 vector shuffles.
+///
+/// This routine breaks down the specific type of 128-bit shuffle and
+/// dispatches to the lowering routines accordingly.
+static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ switch (VT.SimpleTy) {
+ case MVT::v2i64:
+ return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v2f64:
+ return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v4i32:
+ return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v4f32:
+ return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v8i16:
+ return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v16i8:
+ return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+
+ default:
+ llvm_unreachable("Unimplemented!");
+ }
+}
+
+/// Generic routine to split vector shuffle into half-sized shuffles.
+///
+/// This routine just extracts two subvectors, shuffles them independently, and
+/// then concatenates them back together. This should work effectively with all
+/// AVX vector shuffle types.
+static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(VT.getSizeInBits() >= 256 &&
+ "Only for 256-bit or wider vector shuffles!");
+ assert(V1.getSimpleValueType() == VT && "Bad operand type!");
+ assert(V2.getSimpleValueType() == VT && "Bad operand type!");
+
+ ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
+ ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
+
+ int NumElements = VT.getVectorNumElements();
+ int SplitNumElements = NumElements / 2;
+ MVT ScalarVT = VT.getVectorElementType();
+ MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
+
+ // Use splitVector/extractSubVector so that split build-vectors just build two
+ // narrower build vectors. This helps shuffling with splats and zeros.
+ auto SplitVector = [&](SDValue V) {
+ SDValue LoV, HiV;
+ std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
+ return std::make_pair(DAG.getBitcast(SplitVT, LoV),
+ DAG.getBitcast(SplitVT, HiV));
+ };
+
+ SDValue LoV1, HiV1, LoV2, HiV2;
+ std::tie(LoV1, HiV1) = SplitVector(V1);
+ std::tie(LoV2, HiV2) = SplitVector(V2);
+
+ // Now create two 4-way blends of these half-width vectors.
+ auto HalfBlend = [&](ArrayRef<int> HalfMask) {
+ bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
+ SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
+ SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
+ SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
+ for (int i = 0; i < SplitNumElements; ++i) {
+ int M = HalfMask[i];
+ if (M >= NumElements) {
+ if (M >= NumElements + SplitNumElements)
+ UseHiV2 = true;
+ else
+ UseLoV2 = true;
+ V2BlendMask[i] = M - NumElements;
+ BlendMask[i] = SplitNumElements + i;
+ } else if (M >= 0) {
+ if (M >= SplitNumElements)
+ UseHiV1 = true;
+ else
+ UseLoV1 = true;
+ V1BlendMask[i] = M;
+ BlendMask[i] = i;
+ }
+ }
+
+ // Because the lowering happens after all combining takes place, we need to
+ // manually combine these blend masks as much as possible so that we create
+ // a minimal number of high-level vector shuffle nodes.
+
+ // First try just blending the halves of V1 or V2.
+ if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
+ return DAG.getUNDEF(SplitVT);
+ if (!UseLoV2 && !UseHiV2)
+ return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
+ if (!UseLoV1 && !UseHiV1)
+ return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
+
+ SDValue V1Blend, V2Blend;
+ if (UseLoV1 && UseHiV1) {
+ V1Blend =
+ DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
+ } else {
+ // We only use half of V1 so map the usage down into the final blend mask.
+ V1Blend = UseLoV1 ? LoV1 : HiV1;
+ for (int i = 0; i < SplitNumElements; ++i)
+ if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
+ BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
+ }
+ if (UseLoV2 && UseHiV2) {
+ V2Blend =
+ DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
+ } else {
+ // We only use half of V2 so map the usage down into the final blend mask.
+ V2Blend = UseLoV2 ? LoV2 : HiV2;
+ for (int i = 0; i < SplitNumElements; ++i)
+ if (BlendMask[i] >= SplitNumElements)
+ BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
+ }
+ return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
+ };
+ SDValue Lo = HalfBlend(LoMask);
+ SDValue Hi = HalfBlend(HiMask);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+}
+
+/// Either split a vector in halves or decompose the shuffles and the
+/// blend/unpack.
+///
+/// This is provided as a good fallback for many lowerings of non-single-input
+/// shuffles with more than one 128-bit lane. In those cases, we want to select
+/// between splitting the shuffle into 128-bit components and stitching those
+/// back together vs. extracting the single-input shuffles and blending those
+/// results.
+static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(!V2.isUndef() && "This routine must not be used to lower single-input "
+ "shuffles as it could then recurse on itself.");
+ int Size = Mask.size();
+
+ // If this can be modeled as a broadcast of two elements followed by a blend,
+ // prefer that lowering. This is especially important because broadcasts can
+ // often fold with memory operands.
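+ // For example, a v8f32 mask <3, 11, 3, 11, 3, 11, 3, 11> broadcasts
+ // element 3 of V1 and element 3 of V2 (index 11) and then blends them.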
+ auto DoBothBroadcast = [&] {
+ int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
+ for (int M : Mask)
+ if (M >= Size) {
+ if (V2BroadcastIdx < 0)
+ V2BroadcastIdx = M - Size;
+ else if (M - Size != V2BroadcastIdx)
+ return false;
+ } else if (M >= 0) {
+ if (V1BroadcastIdx < 0)
+ V1BroadcastIdx = M;
+ else if (M != V1BroadcastIdx)
+ return false;
+ }
+ return true;
+ };
+ if (DoBothBroadcast())
+ return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
+ DAG);
+
+ // If the inputs all stem from a single 128-bit lane of each input, then we
+ // split them rather than blending because the split will decompose to
+ // unusually few instructions.
+ int LaneCount = VT.getSizeInBits() / 128;
+ int LaneSize = Size / LaneCount;
+ SmallBitVector LaneInputs[2];
+ LaneInputs[0].resize(LaneCount, false);
+ LaneInputs[1].resize(LaneCount, false);
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0)
+ LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
+ if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+
+ // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
+ // requires that the decomposed single-input shuffles don't end up here.
+ return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
+ DAG);
+}
+
+// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
+// TODO: Extend to support v8f32 (+ 512-bit shuffles).
+static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
+
+ int LHSMask[4] = {-1, -1, -1, -1};
+ int RHSMask[4] = {-1, -1, -1, -1};
+ unsigned SHUFPMask = 0;
+
+ // As SHUFPD uses a single LHS/RHS element per lane, we can always
+ // perform the shuffle once the lanes have been shuffled in place.
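+ //
+ // For example, Mask <2, 4, 1, 7> gives LHSMask <2, -1, -1, 1>,
+ // RHSMask <4, -1, -1, 7> and SHUFPMask 0b1100, so the final SHUFPD picks
+ // LHS[0], RHS[0], LHS[3] and RHS[3], i.e. elements 2, 4, 1 and 7.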
+ for (int i = 0; i != 4; ++i) {
+ int M = Mask[i];
+ if (M < 0)
+ continue;
+ int LaneBase = i & ~1;
+ auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
+ LaneMask[LaneBase + (M & 1)] = M;
+ SHUFPMask |= (M & 1) << i;
+ }
+
+ SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
+ SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
+ DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
+}
+
+/// Lower a vector shuffle crossing multiple 128-bit lanes as
+/// a lane permutation followed by a per-lane permutation.
+///
+/// This is mainly for cases where we can have non-repeating permutes
+/// in each lane.
+///
+/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
+/// we should investigate merging them.
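+///
+/// For example (single input), a v8f32 reverse <7, 6, 5, 4, 3, 2, 1, 0> can
+/// be built as a cross-lane shuffle <4, 5, 6, 7, 0, 1, 2, 3> followed by the
+/// in-lane permute <3, 2, 1, 0, 7, 6, 5, 4>.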
+static SDValue lowerShuffleAsLanePermuteAndPermute(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+ int NumElts = VT.getVectorNumElements();
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumEltsPerLane = NumElts / NumLanes;
+ bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
+
+ /// Attempts to find a sublane permute with the given size
+ /// that gets all elements into their target lanes.
+ ///
+ /// If successful, fills CrossLaneMask and InLaneMask and returns the result.
+ /// If unsuccessful, returns SDValue() and may overwrite InLaneMask.
+ auto getSublanePermute = [&](int NumSublanes) -> SDValue {
+ int NumSublanesPerLane = NumSublanes / NumLanes;
+ int NumEltsPerSublane = NumElts / NumSublanes;
+
+ SmallVector<int, 16> CrossLaneMask;
+ SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
+ // CrossLaneMask but one entry == one sublane.
+ SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
+
+ for (int i = 0; i != NumElts; ++i) {
+ int M = Mask[i];
+ if (M < 0)
+ continue;
+
+ int SrcSublane = M / NumEltsPerSublane;
+ int DstLane = i / NumEltsPerLane;
+
+ // We only need to get the elements into the right lane, not sublane.
+ // So search all sublanes that make up the destination lane.
+ bool Found = false;
+ int DstSubStart = DstLane * NumSublanesPerLane;
+ int DstSubEnd = DstSubStart + NumSublanesPerLane;
+ for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
+ if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
+ continue;
+
+ Found = true;
+ CrossLaneMaskLarge[DstSublane] = SrcSublane;
+ int DstSublaneOffset = DstSublane * NumEltsPerSublane;
+ InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
+ break;
+ }
+ if (!Found)
+ return SDValue();
+ }
+
+ // Fill CrossLaneMask using CrossLaneMaskLarge.
+ narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
+
+ if (!CanUseSublanes) {
+ // If we're only shuffling a single lowest lane and the rest are identity
+ // then don't bother.
+ // TODO - isShuffleMaskInputInPlace could be extended to something like
+ // this.
+ int NumIdentityLanes = 0;
+ bool OnlyShuffleLowestLane = true;
+ for (int i = 0; i != NumLanes; ++i) {
+ int LaneOffset = i * NumEltsPerLane;
+ if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
+ i * NumEltsPerLane))
+ NumIdentityLanes++;
+ else if (CrossLaneMask[LaneOffset] != 0)
+ OnlyShuffleLowestLane = false;
+ }
+ if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
+ return SDValue();
+ }
+
+ SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
+ return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
+ InLaneMask);
+ };
+
+ // First attempt a solution with full lanes.
+ if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
+ return V;
+
+ // The rest of the solutions use sublanes.
+ if (!CanUseSublanes)
+ return SDValue();
+
+ // Then attempt a solution with 64-bit sublanes (vpermq).
+ if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
+ return V;
+
+ // If that doesn't work and we have fast variable shuffle,
+ // attempt 32-bit sublanes (vpermd).
+ if (!Subtarget.hasFastVariableShuffle())
+ return SDValue();
+
+ return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
+}
+
+/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
+/// source with a lane permutation.
+///
+/// This lowering strategy results in four instructions in the worst case for a
+/// single-input cross lane shuffle which is lower than any other fully general
+/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
+/// shuffle pattern should be handled prior to trying this lowering.
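+///
+/// For example (illustrative), a single-input v8f32 interleave
+/// <0, 4, 1, 5, 2, 6, 3, 7> can be lowered by flipping the 128-bit lanes of
+/// V1 (giving <4, 5, 6, 7, 0, 1, 2, 3>) and then shuffling V1 with the
+/// flipped copy using the in-lane mask <0, 8, 1, 9, 14, 6, 15, 7>.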
+static SDValue lowerShuffleAsLanePermuteAndShuffle(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+ // FIXME: This should probably be generalized for 512-bit vectors as well.
+ assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
+ int Size = Mask.size();
+ int LaneSize = Size / 2;
+
+ // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
+ // Only do this if the elements aren't all from the lower lane,
+ // otherwise we're (probably) better off doing a split.
+ if (VT == MVT::v4f64 &&
+ !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
+ if (SDValue V =
+ lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
+ return V;
+
+ // If there are only inputs from one 128-bit lane, splitting will in fact be
+ // less expensive. The flags track whether the given lane contains an element
+ // that crosses to another lane.
+ if (!Subtarget.hasAVX2()) {
+ bool LaneCrossing[2] = {false, false};
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
+ LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
+ if (!LaneCrossing[0] || !LaneCrossing[1])
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ } else {
+ bool LaneUsed[2] = {false, false};
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0)
+ LaneUsed[(Mask[i] % Size) / LaneSize] = true;
+ if (!LaneUsed[0] || !LaneUsed[1])
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ }
+
+ // TODO - we could support shuffling V2 in the Flipped input.
+ assert(V2.isUndef() &&
+ "This last part of this routine only works on single input shuffles");
+
+ SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
+ for (int i = 0; i < Size; ++i) {
+ int &M = InLaneMask[i];
+ if (M < 0)
+ continue;
+ if (((M % Size) / LaneSize) != (i / LaneSize))
+ M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
+ }
+ assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
+ "In-lane shuffle mask expected");
+
+ // Flip the lanes, and shuffle the results which should now be in-lane.
+ MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
+ SDValue Flipped = DAG.getBitcast(PVT, V1);
+ Flipped =
+ DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
+ Flipped = DAG.getBitcast(VT, Flipped);
+ return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
+}
+
+/// Handle lowering 2-lane 128-bit shuffles.
+static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
+ if (Subtarget.hasAVX2() && V2.isUndef())
+ return SDValue();
+
+ bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
+
+ SmallVector<int, 4> WidenedMask;
+ if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
+ return SDValue();
+
+ bool IsLowZero = (Zeroable & 0x3) == 0x3;
+ bool IsHighZero = (Zeroable & 0xc) == 0xc;
+
+ // Try to use an insert into a zero vector.
+ if (WidenedMask[0] == 0 && IsHighZero) {
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
+ SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ getZeroVector(VT, Subtarget, DAG, DL), LoV,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ // TODO: If minimizing size and one of the inputs is a zero vector and the
+ // zero vector has only one use, we could use a VPERM2X128 to save the
+ // instruction bytes needed to explicitly generate the zero vector.
+
+ // Blends are faster and handle all the non-lane-crossing cases.
+ if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return Blend;
+
+ // If either input operand is a zero vector, use VPERM2X128 because its mask
+ // allows us to replace the zero input with an implicit zero.
+ if (!IsLowZero && !IsHighZero) {
+ // Check for patterns which can be matched with a single insert of a 128-bit
+ // subvector.
+ bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
+ if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
+
+ // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
+ // this will likely become vinsertf128 which can't fold a 256-bit memop.
+ if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
+ SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+ OnlyUsesV1 ? V1 : V2,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
+ DAG.getIntPtrConstant(2, DL));
+ }
+ }
+
+ // Try to use SHUF128 if possible.
+ if (Subtarget.hasVLX()) {
+ if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
+ unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
+ ((WidenedMask[1] % 2) << 1);
+ return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
+ }
+ }
+ }
+
+ // Otherwise form a 128-bit permutation. After accounting for undefs,
+ // convert the 64-bit shuffle mask selection values into 128-bit
+ // selection bits by dividing the indexes by 2 and shifting into positions
+ // defined by a vperm2*128 instruction's immediate control byte.
+
+ // The immediate permute control byte looks like this:
+ // [1:0] - select 128 bits from sources for low half of destination
+ // [2] - ignore
+ // [3] - zero low half of destination
+ // [5:4] - select 128 bits from sources for high half of destination
+ // [6] - ignore
+ // [7] - zero high half of destination
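+ //
+ // For example, a widened mask of <1, 2> (high half of V1, low half of V2)
+ // yields PermMask = 1 | (2 << 4) = 0x21.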
+
+ assert((WidenedMask[0] >= 0 || IsLowZero) &&
+ (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
+
+ unsigned PermMask = 0;
+ PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
+ PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
+
+ // Check the immediate mask and replace unused sources with undef.
+ if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
+ V1 = DAG.getUNDEF(VT);
+ if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
+ V2 = DAG.getUNDEF(VT);
+
+ return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
+}
+
+/// Lower a vector shuffle by first fixing the 128-bit lanes and then
+/// shuffling each lane.
+///
+/// This attempts to create a repeated lane shuffle where each lane uses one
+/// or two of the lanes of the inputs. The lanes of the input vectors are
+/// shuffled in one or two independent shuffles to get the lanes into the
+/// position needed by the final shuffle.
+static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+ assert(!V2.isUndef() && "This is only useful with multiple inputs.");
+
+ if (is128BitLaneRepeatedShuffleMask(VT, Mask))
+ return SDValue();
+
+ int NumElts = Mask.size();
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumLaneElts = 128 / VT.getScalarSizeInBits();
+ SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
+ SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
+
+ // First pass will try to fill in the RepeatMask from lanes that need two
+ // sources.
+ for (int Lane = 0; Lane != NumLanes; ++Lane) {
+ int Srcs[2] = {-1, -1};
+ SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
+ for (int i = 0; i != NumLaneElts; ++i) {
+ int M = Mask[(Lane * NumLaneElts) + i];
+ if (M < 0)
+ continue;
+ // Determine which of the possible input lanes (NumLanes from each source)
+ // this element comes from. Assign that as one of the sources for this
+ // lane. We can assign up to 2 sources for this lane. If we run out
+ // of sources we can't do anything.
+ int LaneSrc = M / NumLaneElts;
+ int Src;
+ if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
+ Src = 0;
+ else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
+ Src = 1;
+ else
+ return SDValue();
+
+ Srcs[Src] = LaneSrc;
+ InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
+ }
+
+ // If this lane has two sources, see if it fits with the repeat mask so far.
+ if (Srcs[1] < 0)
+ continue;
+
+ LaneSrcs[Lane][0] = Srcs[0];
+ LaneSrcs[Lane][1] = Srcs[1];
+
+ auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
+ assert(M1.size() == M2.size() && "Unexpected mask size");
+ for (int i = 0, e = M1.size(); i != e; ++i)
+ if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
+ return false;
+ return true;
+ };
+
+ auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
+ assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
+ for (int i = 0, e = MergedMask.size(); i != e; ++i) {
+ int M = Mask[i];
+ if (M < 0)
+ continue;
+ assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
+ "Unexpected mask element");
+ MergedMask[i] = M;
+ }
+ };
+
+ if (MatchMasks(InLaneMask, RepeatMask)) {
+ // Merge this lane mask into the final repeat mask.
+ MergeMasks(InLaneMask, RepeatMask);
+ continue;
+ }
+
+ // Didn't find a match. Swap the operands and try again.
+ std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
+ ShuffleVectorSDNode::commuteMask(InLaneMask);
+
+ if (MatchMasks(InLaneMask, RepeatMask)) {
+ // Merge this lane mask into the final repeat mask.
+ MergeMasks(InLaneMask, RepeatMask);
+ continue;
+ }
+
+ // Couldn't find a match with the operands in either order.
+ return SDValue();
+ }
+
+ // Now handle any lanes with only one source.
+ for (int Lane = 0; Lane != NumLanes; ++Lane) {
+ // If this lane has already been processed, skip it.
+ if (LaneSrcs[Lane][0] >= 0)
+ continue;
+
+ for (int i = 0; i != NumLaneElts; ++i) {
+ int M = Mask[(Lane * NumLaneElts) + i];
+ if (M < 0)
+ continue;
+
+ // If RepeatMask isn't defined yet we can define it ourselves.
+ if (RepeatMask[i] < 0)
+ RepeatMask[i] = M % NumLaneElts;
+
+ if (RepeatMask[i] < NumElts) {
+ if (RepeatMask[i] != M % NumLaneElts)
+ return SDValue();
+ LaneSrcs[Lane][0] = M / NumLaneElts;
+ } else {
+ if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
+ return SDValue();
+ LaneSrcs[Lane][1] = M / NumLaneElts;
+ }
+ }
+
+ if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
+ return SDValue();
+ }
+
+ SmallVector<int, 16> NewMask(NumElts, -1);
+ for (int Lane = 0; Lane != NumLanes; ++Lane) {
+ int Src = LaneSrcs[Lane][0];
+ for (int i = 0; i != NumLaneElts; ++i) {
+ int M = -1;
+ if (Src >= 0)
+ M = Src * NumLaneElts + i;
+ NewMask[Lane * NumLaneElts + i] = M;
+ }
+ }
+ SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
+ // Ensure we didn't get back the shuffle we started with.
+ // FIXME: This is a hack to make up for some splat handling code in
+ // getVectorShuffle.
+ if (isa<ShuffleVectorSDNode>(NewV1) &&
+ cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
+ return SDValue();
+
+ for (int Lane = 0; Lane != NumLanes; ++Lane) {
+ int Src = LaneSrcs[Lane][1];
+ for (int i = 0; i != NumLaneElts; ++i) {
+ int M = -1;
+ if (Src >= 0)
+ M = Src * NumLaneElts + i;
+ NewMask[Lane * NumLaneElts + i] = M;
+ }
+ }
+ SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
+ // Ensure we didn't get back the shuffle we started with.
+ // FIXME: This is a hack to make up for some splat handling code in
+ // getVectorShuffle.
+ if (isa<ShuffleVectorSDNode>(NewV2) &&
+ cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
+ return SDValue();
+
+ for (int i = 0; i != NumElts; ++i) {
+ NewMask[i] = RepeatMask[i % NumLaneElts];
+ if (NewMask[i] < 0)
+ continue;
+
+ NewMask[i] += (i / NumLaneElts) * NumLaneElts;
+ }
+ return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
+}
+
+/// If the input shuffle mask results in a vector that is undefined in all upper
+/// or lower half elements and that mask accesses only 2 halves of the
+/// shuffle's operands, return true. A mask of half the width with mask indexes
+/// adjusted to access the extracted halves of the original shuffle operands is
+/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
+/// lower half of each input operand is accessed.
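+///
+/// For example, for a v8 mask <u, u, u, u, 12, 13, 6, 7> the lower half is
+/// undef, HalfIdx1 = 3 (upper half of V2), HalfIdx2 = 1 (upper half of V1)
+/// and HalfMask = <0, 1, 6, 7>.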
+static bool
+getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
+ int &HalfIdx1, int &HalfIdx2) {
+ assert((Mask.size() == HalfMask.size() * 2) &&
+ "Expected input mask to be twice as long as output");
+
+ // Exactly one half of the result must be undef to allow narrowing.
+ bool UndefLower = isUndefLowerHalf(Mask);
+ bool UndefUpper = isUndefUpperHalf(Mask);
+ if (UndefLower == UndefUpper)
+ return false;
+
+ unsigned HalfNumElts = HalfMask.size();
+ unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
+ HalfIdx1 = -1;
+ HalfIdx2 = -1;
+ for (unsigned i = 0; i != HalfNumElts; ++i) {
+ int M = Mask[i + MaskIndexOffset];
+ if (M < 0) {
+ HalfMask[i] = M;
+ continue;
+ }
+
+ // Determine which of the 4 half vectors this element is from.
+ // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
+ int HalfIdx = M / HalfNumElts;
+
+ // Determine the element index into its half vector source.
+ int HalfElt = M % HalfNumElts;
+
+ // We can shuffle with up to 2 half vectors, set the new 'half'
+ // shuffle mask accordingly.
+ if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
+ HalfMask[i] = HalfElt;
+ HalfIdx1 = HalfIdx;
+ continue;
+ }
+ if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
+ HalfMask[i] = HalfElt + HalfNumElts;
+ HalfIdx2 = HalfIdx;
+ continue;
+ }
+
+ // Too many half vectors referenced.
+ return false;
+ }
+
+ return true;
+}
+
+/// Given the output values from getHalfShuffleMask(), create a half width
+/// shuffle of extracted vectors followed by an insert back to full width.
+static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
+ ArrayRef<int> HalfMask, int HalfIdx1,
+ int HalfIdx2, bool UndefLower,
+ SelectionDAG &DAG, bool UseConcat = false) {
+ assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
+ assert(V1.getValueType().isSimple() && "Expecting only simple types");
+
+ MVT VT = V1.getSimpleValueType();
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
+ unsigned HalfNumElts = HalfVT.getVectorNumElements();
+
+ auto getHalfVector = [&](int HalfIdx) {
+ if (HalfIdx < 0)
+ return DAG.getUNDEF(HalfVT);
+ SDValue V = (HalfIdx < 2 ? V1 : V2);
+ HalfIdx = (HalfIdx % 2) * HalfNumElts;
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
+ DAG.getIntPtrConstant(HalfIdx, DL));
+ };
+
+ // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
+ SDValue Half1 = getHalfVector(HalfIdx1);
+ SDValue Half2 = getHalfVector(HalfIdx2);
+ SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
+ if (UseConcat) {
+ SDValue Op0 = V;
+ SDValue Op1 = DAG.getUNDEF(HalfVT);
+ if (UndefLower)
+ std::swap(Op0, Op1);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
+ }
+
+ unsigned Offset = UndefLower ? HalfNumElts : 0;
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
+ DAG.getIntPtrConstant(Offset, DL));
+}
+
+/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
+/// This allows for fast cases such as subvector extraction/insertion
+/// or shuffling smaller vector types which can lower more efficiently.
+static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert((VT.is256BitVector() || VT.is512BitVector()) &&
+ "Expected 256-bit or 512-bit vector");
+
+ bool UndefLower = isUndefLowerHalf(Mask);
+ if (!UndefLower && !isUndefUpperHalf(Mask))
+ return SDValue();
+
+ assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
+ "Completely undef shuffle mask should have been simplified already");
+
+ // Upper half is undef and lower half is whole upper subvector.
+ // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
+ unsigned HalfNumElts = HalfVT.getVectorNumElements();
+ if (!UndefLower &&
+ isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+ DAG.getIntPtrConstant(HalfNumElts, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ // Lower half is undef and upper half is whole lower subvector.
+ // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+ if (UndefLower &&
+ isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+ DAG.getIntPtrConstant(HalfNumElts, DL));
+ }
+
+ int HalfIdx1, HalfIdx2;
+ SmallVector<int, 8> HalfMask(HalfNumElts);
+ if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
+ return SDValue();
+
+ assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
+
+ // Only shuffle the halves of the inputs when useful.
+ unsigned NumLowerHalves =
+ (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
+ unsigned NumUpperHalves =
+ (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
+ assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
+
+ // Determine the larger pattern of undef/halves, then decide if it's worth
+ // splitting the shuffle based on subtarget capabilities and types.
+ unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
+ if (!UndefLower) {
+ // XXXXuuuu: no insert is needed.
+ // Always extract lowers when setting lower - these are all free subreg ops.
+ if (NumUpperHalves == 0)
+ return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
+ UndefLower, DAG);
+
+ if (NumUpperHalves == 1) {
+ // AVX2 has efficient 32/64-bit element cross-lane shuffles.
+ if (Subtarget.hasAVX2()) {
+ // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
+ if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
+ !is128BitUnpackShuffleMask(HalfMask) &&
+ (!isSingleSHUFPSMask(HalfMask) ||
+ Subtarget.hasFastVariableShuffle()))
+ return SDValue();
+ // If this is a unary shuffle (assume that the 2nd operand is
+ // canonicalized to undef), then we can use vpermpd. Otherwise, we
+ // are better off extracting the upper half of 1 operand and using a
+ // narrow shuffle.
+ if (EltWidth == 64 && V2.isUndef())
+ return SDValue();
+ }
+ // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
+ if (Subtarget.hasAVX512() && VT.is512BitVector())
+ return SDValue();
+ // Extract + narrow shuffle is better than the wide alternative.
+ return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
+ UndefLower, DAG);
+ }
+
+ // Don't extract both uppers, instead shuffle and then extract.
+ assert(NumUpperHalves == 2 && "Half vector count went wrong");
+ return SDValue();
+ }
+
+ // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
+ if (NumUpperHalves == 0) {
+ // AVX2 has efficient 64-bit element cross-lane shuffles.
+ // TODO: Refine to account for unary shuffle, splat, and other masks?
+ if (Subtarget.hasAVX2() && EltWidth == 64)
+ return SDValue();
+ // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
+ if (Subtarget.hasAVX512() && VT.is512BitVector())
+ return SDValue();
+ // Narrow shuffle + insert is better than the wide alternative.
+ return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
+ UndefLower, DAG);
+ }
+
+ // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
+ return SDValue();
+}
+
+/// Test whether the specified input (0 or 1) is in-place blended by the
+/// given mask.
+///
+/// This returns true if the elements from a particular input are already in the
+/// slot required by the given mask and require no permutation.
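+///
+/// For example, with a v4 mask <0, 7, 2, 5> input 0 is in place (elements 0
+/// and 2 stay in their slots) but input 1 is not (element 7 would have to
+/// move to slot 3).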
+static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
+ assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
+ int Size = Mask.size();
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
+ return false;
+
+ return true;
+}
+
+/// Handle case where shuffle sources are coming from the same 128-bit lane and
+/// every lane can be represented as the same repeating mask - allowing us to
+/// shuffle the sources with the repeating shuffle and then permute the result
+/// to the destination lanes.
+static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+ int NumElts = VT.getVectorNumElements();
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumLaneElts = NumElts / NumLanes;
+
+ // On AVX2 we may be able to just shuffle the lowest elements and then
+ // broadcast the result.
+ if (Subtarget.hasAVX2()) {
+ for (unsigned BroadcastSize : {16, 32, 64}) {
+ if (BroadcastSize <= VT.getScalarSizeInBits())
+ continue;
+ int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
+
+ // Attempt to match a repeating pattern every NumBroadcastElts,
+ // accounting for UNDEFs, but only referencing the lowest 128-bit
+ // lane of the inputs.
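+ //
+ // For example, a v8i32 mask <1, 0, 1, 0, 1, 0, 1, 0> repeats <1, 0>
+ // every 64 bits, so we shuffle <1, 0> into the low elements and then
+ // broadcast them with the mask <0, 1, 0, 1, 0, 1, 0, 1>.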
+ auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
+ for (int i = 0; i != NumElts; i += NumBroadcastElts)
+ for (int j = 0; j != NumBroadcastElts; ++j) {
+ int M = Mask[i + j];
+ if (M < 0)
+ continue;
+ int &R = RepeatMask[j];
+ if (0 != ((M % NumElts) / NumLaneElts))
+ return false;
+ if (0 <= R && R != M)
+ return false;
+ R = M;
+ }
+ return true;
+ };
+
+ SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
+ if (!FindRepeatingBroadcastMask(RepeatMask))
+ continue;
+
+ // Shuffle the (lowest) repeated elements in place for broadcast.
+ SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
+
+ // Shuffle the actual broadcast.
+ SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
+ for (int i = 0; i != NumElts; i += NumBroadcastElts)
+ for (int j = 0; j != NumBroadcastElts; ++j)
+ BroadcastMask[i + j] = j;
+ return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
+ BroadcastMask);
+ }
+ }
+
+ // Bail if the shuffle mask doesn't cross 128-bit lanes.
+ if (!is128BitLaneCrossingShuffleMask(VT, Mask))
+ return SDValue();
+
+ // Bail if we already have a repeated lane shuffle mask.
+ SmallVector<int, 8> RepeatedShuffleMask;
+ if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
+ return SDValue();
+
+ // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
+ // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
+ int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
+ int NumSubLanes = NumLanes * SubLaneScale;
+ int NumSubLaneElts = NumLaneElts / SubLaneScale;
+
+ // Check that all the sources are coming from the same lane and see if we can
+ // form a repeating shuffle mask (local to each sub-lane). At the same time,
+ // determine the source sub-lane for each destination sub-lane.
+ int TopSrcSubLane = -1;
+ SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
+ SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
+ SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
+ SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
+
+ for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
+ // Extract the sub-lane mask, check that it all comes from the same lane
+ // and normalize the mask entries to come from the first lane.
+ int SrcLane = -1;
+ SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
+ for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
+ int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
+ if (M < 0)
+ continue;
+ int Lane = (M % NumElts) / NumLaneElts;
+ if ((0 <= SrcLane) && (SrcLane != Lane))
+ return SDValue();
+ SrcLane = Lane;
+ int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
+ SubLaneMask[Elt] = LocalM;
+ }
+
+ // Whole sub-lane is UNDEF.
+ if (SrcLane < 0)
+ continue;
+
+ // Attempt to match against the candidate repeated sub-lane masks.
+ for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
+ auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
+ for (int i = 0; i != NumSubLaneElts; ++i) {
+ if (M1[i] < 0 || M2[i] < 0)
+ continue;
+ if (M1[i] != M2[i])
+ return false;
+ }
+ return true;
+ };
+
+ auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
+ if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
+ continue;
+
+ // Merge the sub-lane mask into the matching repeated sub-lane mask.
+ for (int i = 0; i != NumSubLaneElts; ++i) {
+ int M = SubLaneMask[i];
+ if (M < 0)
+ continue;
+ assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
+ "Unexpected mask element");
+ RepeatedSubLaneMask[i] = M;
+ }
+
+ // Track the top most source sub-lane - by setting the remaining to UNDEF
+ // we can greatly simplify shuffle matching.
+ int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
+ TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
+ Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
+ break;
+ }
+
+ // Bail if we failed to find a matching repeated sub-lane mask.
+ if (Dst2SrcSubLanes[DstSubLane] < 0)
+ return SDValue();
+ }
+ assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
+ "Unexpected source lane");
+
+ // Create a repeating shuffle mask for the entire vector.
+ SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
+ for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
+ int Lane = SubLane / SubLaneScale;
+ auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
+ for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
+ int M = RepeatedSubLaneMask[Elt];
+ if (M < 0)
+ continue;
+ int Idx = (SubLane * NumSubLaneElts) + Elt;
+ RepeatedMask[Idx] = M + (Lane * NumLaneElts);
+ }
+ }
+ SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
+
+ // Shuffle each source sub-lane to its destination.
+ SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
+ for (int i = 0; i != NumElts; i += NumSubLaneElts) {
+ int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
+ if (SrcSubLane < 0)
+ continue;
+ for (int j = 0; j != NumSubLaneElts; ++j)
+ SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
+ }
+
+ return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
+ SubLaneMask);
+}
+
+static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
+ bool &ForceV1Zero, bool &ForceV2Zero,
+ unsigned &ShuffleImm, ArrayRef<int> Mask,
+ const APInt &Zeroable) {
+ int NumElts = VT.getVectorNumElements();
+ assert(VT.getScalarSizeInBits() == 64 &&
+ (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
+ "Unexpected data type for VSHUFPD");
+ assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
+ "Illegal shuffle mask");
+
+ bool ZeroLane[2] = { true, true };
+ for (int i = 0; i < NumElts; ++i)
+ ZeroLane[i & 1] &= Zeroable[i];
+
+ // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
+ // Mask for V4F64: 0/1, 4/5, 2/3, 6/7.
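+ //
+ // For example, a v4f64 mask <0, 5, 2, 7> fits the pattern directly and
+ // produces ShuffleImm = 0b1010.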
+ ShuffleImm = 0;
+ bool ShufpdMask = true;
+ bool CommutableMask = true;
+ for (int i = 0; i < NumElts; ++i) {
+ if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
+ continue;
+ if (Mask[i] < 0)
+ return false;
+ int Val = (i & 6) + NumElts * (i & 1);
+ int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
+ if (Mask[i] < Val || Mask[i] > Val + 1)
+ ShufpdMask = false;
+ if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
+ CommutableMask = false;
+ ShuffleImm |= (Mask[i] % 2) << i;
+ }
+
+ if (!ShufpdMask && !CommutableMask)
+ return false;
+
+ if (!ShufpdMask && CommutableMask)
+ std::swap(V1, V2);
+
+ ForceV1Zero = ZeroLane[0];
+ ForceV2Zero = ZeroLane[1];
+ return true;
+}
+
+static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
+ "Unexpected data type for VSHUFPD");
+
+ unsigned Immediate = 0;
+ bool ForceV1Zero = false, ForceV2Zero = false;
+ if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
+ Mask, Zeroable))
+ return SDValue();
+
+ // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
+ if (ForceV1Zero)
+ V1 = getZeroVector(VT, Subtarget, DAG, DL);
+ if (ForceV2Zero)
+ V2 = getZeroVector(VT, Subtarget, DAG, DL);
+
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
+ DAG.getTargetConstant(Immediate, DL, MVT::i8));
+}
+
+// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
+// by zeroable elements in the remaining 24 elements. Turn this into two
+// vmovqb instructions shuffled together.
+static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ SelectionDAG &DAG) {
+ assert(VT == MVT::v32i8 && "Unexpected type!");
+
+ // The first 8 indices should be every 8th element.
+ if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
+ return SDValue();
+
+ // Remaining elements need to be zeroable.
+ if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
+ return SDValue();
+
+ V1 = DAG.getBitcast(MVT::v4i64, V1);
+ V2 = DAG.getBitcast(MVT::v4i64, V2);
+
+ V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
+ V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
+
+ // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
+ // the upper bits of the result using an unpckldq.
+ SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
+ { 0, 1, 2, 3, 16, 17, 18, 19,
+ 4, 5, 6, 7, 20, 21, 22, 23 });
+ // Insert the unpckldq into a zero vector to widen to v32i8.
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
+ DAG.getConstant(0, DL, MVT::v32i8), Unpack,
+ DAG.getIntPtrConstant(0, DL));
+}
+
+/// Handle lowering of 4-lane 64-bit floating point shuffles.
+///
+/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
+/// isn't available.
+static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+
+ if (V2.isUndef()) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // Use low duplicate instructions for masks that match their pattern.
+ if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
+ return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
+
+ if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
+ // Non-half-crossing single input shuffles can be lowered with an
+ // interleaved permutation.
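+ // For example, <1, 0, 3, 2> yields VPERMILPMask = 0b0101, swapping the
+ // two elements within each 128-bit lane.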
+ unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
+ ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
+ DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
+ }
+
+ // With AVX2 we have direct support for this permutation.
+ if (Subtarget.hasAVX2())
+ return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ // Try to permute the lanes and then use a per-lane permute.
+ if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
+ Mask, DAG, Subtarget))
+ return V;
+
+ // Otherwise, fall back.
+ return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
+ DAG, Subtarget);
+ }
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
+ return V;
+
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ // Check if the blend happens to exactly fit that of SHUFPD.
+ if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Op;
+
+ // If we have lane crossing shuffles AND they don't all come from the lower
+ // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
+ // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
+ // canonicalize to a blend of splat which isn't necessary for this combine.
+ if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
+ !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
+ (V1.getOpcode() != ISD::BUILD_VECTOR) &&
+ (V2.getOpcode() != ISD::BUILD_VECTOR))
+ if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
+ Mask, DAG))
+ return Op;
+
+ // If we have one input in place, then we can permute the other input and
+ // blend the result.
+ if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
+ Subtarget, DAG);
+
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle. However, if we have AVX2 and either input is already in place,
+ // we will be able to shuffle the other input even across lanes in a single
+ // instruction, so skip this pattern.
+ if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
+ isShuffleMaskInputInPlace(1, Mask))))
+ if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
+ DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ // If we have VLX support, we can use VEXPAND.
+ if (Subtarget.hasVLX())
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
+ return V;
+
+ // If we have AVX2 then we always want to lower with a blend because with
+ // AVX2 we can fully permute the v4f64 elements.
+ if (Subtarget.hasAVX2())
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
+ Subtarget, DAG);
+
+ // Otherwise fall back on generic lowering.
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
+ Subtarget, DAG);
+}
+
+/// Handle lowering of 4-lane 64-bit integer shuffles.
+///
+/// This routine is only called when we have AVX2 and thus a reasonable
+/// instruction set for v4i64 shuffling.
+static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+ assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
+
+ if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG))
+ return Broadcast;
+
+ if (V2.isUndef()) {
+ // When the shuffle is mirrored between the 128-bit lanes of the vector, we
+ // can use lower latency instructions that will operate on both lanes.
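+ //
+ // For example, a v4i64 mask <1, 0, 3, 2> repeats <1, 0> in both lanes and
+ // becomes a v8i32 PSHUFD with the mask <2, 3, 0, 1> (immediate 0x4E).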
+ SmallVector<int, 2> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
+ SmallVector<int, 4> PSHUFDMask;
+ narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
+ return DAG.getBitcast(
+ MVT::v4i64,
+ DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
+ DAG.getBitcast(MVT::v8i32, V1),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+ }
+
+ // AVX2 provides a direct instruction for permuting a single input across
+ // lanes.
+ return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+ }
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // If we have VLX support, we can use VALIGN or VEXPAND.
+ if (Subtarget.hasVLX()) {
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
+ return V;
+ }
+
+ // Try to use PALIGNR.
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
+ return V;
+
+ // If we have one input in place, then we can permute the other input and
+ // blend the result.
+ if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG);
+
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle. However, if we have AVX2 and either input is already in place, we
+ // can shuffle the other input across lanes in a single instruction, so skip
+ // this pattern.
+ if (!isShuffleMaskInputInPlace(0, Mask) &&
+ !isShuffleMaskInputInPlace(1, Mask))
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
+ DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // Otherwise fall back on generic blend lowering.
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG);
+}
+
+/// Handle lowering of 8-lane 32-bit floating point shuffles.
+///
+/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
+/// isn't available.
+static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
+ assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
+ Subtarget, DAG))
+ return Broadcast;
+
+ // If the shuffle mask is repeated in each 128-bit lane, we have many more
+ // options to efficiently lower the shuffle.
+ SmallVector<int, 4> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
+ assert(RepeatedMask.size() == 4 &&
+ "Repeated masks must be half the mask width!");
+
+ // Use even/odd duplicate instructions for masks that match their pattern.
+ if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
+ return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
+ if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
+ return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
+
+ if (V2.isUndef())
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
+ getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
+ return V;
+
+ // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
+ // have already handled any direct blends.
+ return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
+ }
+
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ // If we have a single-input shuffle with different shuffle patterns in the
+ // two 128-bit lanes, use a variable mask with VPERMILPS.
+ if (V2.isUndef()) {
+ if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
+ SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
+ return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
+ }
+ if (Subtarget.hasAVX2()) {
+ SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
+ return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
+ }
+ // Otherwise, fall back.
+ return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
+ DAG, Subtarget);
+ }
+
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
+ DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // If we have VLX support, we can use VEXPAND.
+ if (Subtarget.hasVLX())
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
+ return V;
+
+ // For non-AVX512 targets, if the mask matches a 16-bit in-lane unpack
+ // pattern, try to split, since after the split we get more efficient code
+ // using vpunpcklwd and vpunpckhwd than with vblend.
+ if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
+ DAG);
+
+ // If we have AVX2 then we always want to lower with a blend because at v8 we
+ // can fully permute the elements.
+ if (Subtarget.hasAVX2())
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
+ Subtarget, DAG);
+
+ // Otherwise fall back on generic lowering.
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
+ Subtarget, DAG);
+}
+
+/// Handle lowering of 8-lane 32-bit integer shuffles.
+///
+/// This routine is only called when we have AVX2 and thus a reasonable
+ /// instruction set for v8i32 shuffling.
+static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
+ assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+ assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
+
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return ZExt;
+
+ // For non-AVX512 targets, if the mask matches a 16-bit in-lane unpack
+ // pattern, try to split, since after the split we get more efficient code
+ // than vblend by using vpunpcklwd and vpunpckhwd.
+ if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
+ !Subtarget.hasAVX512())
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
+ DAG);
+
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
+ Subtarget, DAG))
+ return Broadcast;
+
+ // If the shuffle mask is repeated in each 128-bit lane we can use more
+ // efficient instructions that mirror the shuffles across the two 128-bit
+ // lanes.
+ SmallVector<int, 4> RepeatedMask;
+ bool Is128BitLaneRepeatedShuffle =
+ is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
+ if (Is128BitLaneRepeatedShuffle) {
+ assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
+ if (V2.isUndef())
+ return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
+ getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
+ return V;
+ }
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // If we have VLX support, we can use VALIGN or EXPAND.
+ if (Subtarget.hasVLX()) {
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
+ return V;
+ }
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ if (V2.isUndef()) {
+ // Try to produce a fixed cross-128-bit lane permute followed by unpack
+ // because that should be faster than the variable permute alternatives.
+ if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
+ return V;
+
+ // If the shuffle patterns aren't repeated but it's a single input, directly
+ // generate a cross-lane VPERMD instruction.
+ SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
+ return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
+ }
+
+ // Assume that a single SHUFPS is faster than an alternative sequence of
+ // multiple instructions (even if the CPU has a domain penalty).
+ // If some CPU is harmed by the domain switch, we can fix it in a later pass.
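+ // E.g. (illustrative), a lane-repeated mask of {0, 1, 4, 5} takes its low
+ // half from V1 and its high half from V2, so one SHUFPS on v8f32 bitcasts
+ // is enough.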
+ if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
+ SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
+ SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
+ SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
+ CastV1, CastV2, DAG);
+ return DAG.getBitcast(MVT::v8i32, ShufPS);
+ }
+
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
+ DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // Otherwise fall back on generic blend lowering.
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
+ Subtarget, DAG);
+}
+
+/// Handle lowering of 16-lane 16-bit integer shuffles.
+///
+/// This routine is only called when we have AVX2 and thus a reasonable
+ /// instruction set for v16i16 shuffling.
+static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
+ assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+ assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
+
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
+ DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return ZExt;
+
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
+ Subtarget, DAG))
+ return Broadcast;
+
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
+ return V;
+
+ // Use dedicated pack instructions for masks that match their pattern.
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
+ Subtarget))
+ return V;
+
+ // Try to lower using a truncation.
+ if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ if (V2.isUndef()) {
+ // Try to use bit rotation instructions.
+ if (SDValue Rotate =
+ lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
+ return Rotate;
+
+ // Try to produce a fixed cross-128-bit lane permute followed by unpack
+ // because that should be faster than the variable permute alternatives.
+ if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
+ return V;
+
+ // There are no generalized cross-lane shuffle operations available on i16
+ // element types.
+ if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
+ if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
+ DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
+ return V;
+
+ return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
+ DAG, Subtarget);
+ }
+
+ SmallVector<int, 8> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
+ // As this is a single-input shuffle, the repeated mask should be
+ // a strictly valid v8i16 mask that we can pass through to the v8i16
+ // lowering to handle even the v16 case.
+ return lowerV8I16GeneralSingleInputShuffle(
+ DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
+ }
+ }
+
+ if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
+ Zeroable, Subtarget, DAG))
+ return PSHUFB;
+
+ // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
+ if (Subtarget.hasBWI())
+ return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
+
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
+ DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // Try to permute the lanes and then use a per-lane permute.
+ if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
+ DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
+ return V;
+
+ // Otherwise fall back on generic lowering.
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
+ Subtarget, DAG);
+}
+
+/// Handle lowering of 32-lane 8-bit integer shuffles.
+///
+/// This routine is only called when we have AVX2 and thus a reasonable
+ /// instruction set for v32i8 shuffling.
+static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
+ assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
+ assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
+
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return ZExt;
+
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
+ Subtarget, DAG))
+ return Broadcast;
+
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
+ return V;
+
+ // Use dedicated pack instructions for masks that match their pattern.
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
+ Subtarget))
+ return V;
+
+ // Try to lower using a truncation.
+ if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
+ // Try to use bit rotation instructions.
+ if (V2.isUndef())
+ if (SDValue Rotate =
+ lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
+ return Rotate;
+
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ // There are no generalized cross-lane shuffle operations available on i8
+ // element types.
+ if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
+ // Try to produce a fixed cross-128-bit lane permute followed by unpack
+ // because that should be faster than the variable permute alternatives.
+ if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
+ return V;
+
+ if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
+ DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
+ return V;
+
+ return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
+ DAG, Subtarget);
+ }
+
+ if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
+ Zeroable, Subtarget, DAG))
+ return PSHUFB;
+
+ // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
+ if (Subtarget.hasVBMI())
+ return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
+
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
+ DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // Try to permute the lanes and then use a per-lane permute.
+ if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
+ DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
+ return V;
+
+ // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
+ // by zeroable elements in the remaining 24 elements, and turn this into two
+ // vmovqb instructions shuffled together.
+ if (Subtarget.hasVLX())
+ if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
+ Mask, Zeroable, DAG))
+ return V;
+
+ // Otherwise fall back on generic lowering.
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
+ Subtarget, DAG);
+}
+
+/// High-level routine to lower various 256-bit x86 vector shuffles.
+///
+/// This routine either breaks down the specific type of a 256-bit x86 vector
+/// shuffle or splits it into two 128-bit shuffles and fuses the results back
+/// together based on the available instructions.
+static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+ SDValue V1, SDValue V2, const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ // If we have a single input to the zero element, insert that into V1 if we
+ // can do so cheaply.
+ int NumElts = VT.getVectorNumElements();
+ int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
+
+ if (NumV2Elements == 1 && Mask[0] >= NumElts)
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
+ DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return Insertion;
+
+ // Handle special cases where the lower or upper half is UNDEF.
+ if (SDValue V =
+ lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ // There is a really nice hard cut-over between AVX1 and AVX2 that means we
+ // can check for those subtargets here and avoid much of the subtarget
+ // querying in the per-vector-type lowering routines. With AVX1 we have
+ // essentially *zero* ability to manipulate a 256-bit vector with integer
+ // types. Since we'll use floating point types there eventually, just
+ // immediately cast everything to a float and operate entirely in that domain.
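+ // E.g. (illustrative), on AVX1 a v8i32 shuffle is redone as a v8f32 shuffle,
+ // while v16i16 and v32i8 shuffles fall back to bit ops or a 128-bit split.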
+ if (VT.isInteger() && !Subtarget.hasAVX2()) {
+ int ElementBits = VT.getScalarSizeInBits();
+ if (ElementBits < 32) {
+ // No floating point type available; if we can't use the bit operations
+ // for masking/blending then decompose into 128-bit vectors.
+ if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+ if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
+ return V;
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ }
+
+ MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
+ VT.getVectorNumElements());
+ V1 = DAG.getBitcast(FpVT, V1);
+ V2 = DAG.getBitcast(FpVT, V2);
+ return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
+ }
+
+ switch (VT.SimpleTy) {
+ case MVT::v4f64:
+ return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v4i64:
+ return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v8f32:
+ return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v8i32:
+ return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v16i16:
+ return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v32i8:
+ return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+
+ default:
+ llvm_unreachable("Not a valid 256-bit x86 vector type!");
+ }
+}
+
+ /// Try to lower a vector shuffle as a series of 128-bit shuffles.
+static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(VT.getScalarSizeInBits() == 64 &&
+ "Unexpected element type size for 128bit shuffle.");
+
+ // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle() is
+ // most probably the better solution for that case.
+ assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
+
+ // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
+ SmallVector<int, 4> Widened128Mask;
+ if (!canWidenShuffleElements(Mask, Widened128Mask))
+ return SDValue();
+ assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
+
+ // Try to use an insert into a zero vector.
+ if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
+ (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
+ unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
+ SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ getZeroVector(VT, Subtarget, DAG, DL), LoV,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ // Check for patterns which can be matched with a single insert of a 256-bit
+ // subvector.
+ bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
+ if (OnlyUsesV1 ||
+ isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
+ SDValue SubVec =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
+ DAG.getIntPtrConstant(4, DL));
+ }
+
+ // See if this is an insertion of the lower 128-bits of V2 into V1.
+ bool IsInsert = true;
+ int V2Index = -1;
+ for (int i = 0; i < 4; ++i) {
+ assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
+ if (Widened128Mask[i] < 0)
+ continue;
+
+ // Make sure all V1 subvectors are in place.
+ if (Widened128Mask[i] < 4) {
+ if (Widened128Mask[i] != i) {
+ IsInsert = false;
+ break;
+ }
+ } else {
+ // Make sure we only have a single V2 index and it's the lowest 128 bits.
+ if (V2Index >= 0 || Widened128Mask[i] != 4) {
+ IsInsert = false;
+ break;
+ }
+ V2Index = i;
+ }
+ }
+ if (IsInsert && V2Index >= 0) {
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
+ SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
+ DAG.getIntPtrConstant(0, DL));
+ return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
+ }
+
+ // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-bit
+ // lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
+ // possible we at least ensure the lanes stay sequential to help later
+ // combines.
+ SmallVector<int, 2> Widened256Mask;
+ if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
+ Widened128Mask.clear();
+ narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
+ }
+
+ // Try to lower to vshuf64x2/vshuf32x4.
+ SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
+ unsigned PermMask = 0;
+ // Ensure elements came from the same Op.
+ for (int i = 0; i < 4; ++i) {
+ assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
+ if (Widened128Mask[i] < 0)
+ continue;
+
+ SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
+ unsigned OpIndex = i / 2;
+ if (Ops[OpIndex].isUndef())
+ Ops[OpIndex] = Op;
+ else if (Ops[OpIndex] != Op)
+ return SDValue();
+
+ // Convert the 128-bit shuffle mask selection values into 128-bit selection
+ // bits defined by a vshuf64x2 instruction's immediate control byte.
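+ // E.g. (illustrative), Widened128Mask = {0, 1, 4, 5} selects the low 256
+ // bits of V1 followed by the low 256 bits of V2 and encodes to
+ // PermMask = 0x44.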
+ PermMask |= (Widened128Mask[i] % 4) << (i * 2);
+ }
+
+ return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
+}
+
+/// Handle lowering of 8-lane 64-bit floating point shuffles.
+static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
+ assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+ if (V2.isUndef()) {
+ // Use low duplicate instructions for masks that match their pattern.
+ if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
+ return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
+
+ if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
+ // Non-half-crossing single input shuffles can be lowered with an
+ // interleaved permutation.
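+ // E.g. (illustrative), Mask = {1, 0, 3, 2, 5, 4, 7, 6} swaps the elements
+ // within each 128-bit lane and encodes to VPERMILPMask = 0x55.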
+ unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
+ ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
+ ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
+ ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
+ DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
+ }
+
+ SmallVector<int, 4> RepeatedMask;
+ if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
+ return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
+ getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+ }
+
+ if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
+ V2, Subtarget, DAG))
+ return Shuf128;
+
+ if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
+ return Unpck;
+
+ // Check if the blend happens to exactly fit that of SHUFPD.
+ if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Op;
+
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
+ return V;
+
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
+}
+
+/// Handle lowering of 16-lane 32-bit floating point shuffles.
+static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
+ assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+
+ // If the shuffle mask is repeated in each 128-bit lane, we have many more
+ // options to efficiently lower the shuffle.
+ SmallVector<int, 4> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
+ assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
+
+ // Use even/odd duplicate instructions for masks that match their pattern.
+ if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
+ return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
+ if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
+ return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
+
+ if (V2.isUndef())
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
+ getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
+ return V;
+
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ // Otherwise, fall back to a SHUFPS sequence.
+ return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
+ }
+
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ // If we have a single-input shuffle with different shuffle patterns in the
+ // 128-bit lanes that doesn't cross lanes, use a variable-mask VPERMILPS.
+ if (V2.isUndef() &&
+ !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
+ SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
+ return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
+ }
+
+ // If we have AVX512F support, we can use VEXPAND.
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
+ V1, V2, DAG, Subtarget))
+ return V;
+
+ return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
+}
+
+/// Handle lowering of 8-lane 64-bit integer shuffles.
+static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
+ assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+ if (V2.isUndef()) {
+ // When the shuffle is mirrored between the 128-bit lanes, we can use
+ // lower-latency instructions that operate on all four 128-bit lanes.
+ SmallVector<int, 2> Repeated128Mask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
+ SmallVector<int, 4> PSHUFDMask;
+ narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
+ return DAG.getBitcast(
+ MVT::v8i64,
+ DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
+ DAG.getBitcast(MVT::v16i32, V1),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+ }
+
+ SmallVector<int, 4> Repeated256Mask;
+ if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
+ return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
+ getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
+ }
+
+ if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
+ V2, Subtarget, DAG))
+ return Shuf128;
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // Try to use VALIGN.
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
+ // Try to use PALIGNR.
+ if (Subtarget.hasBWI())
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
+ if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
+ return Unpck;
+
+ // If we have AVX512F support, we can use VEXPAND.
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
+ return V;
+
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
+}
+
+/// Handle lowering of 16-lane 32-bit integer shuffles.
+static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
+ assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
+ DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return ZExt;
+
+ // If the shuffle mask is repeated in each 128-bit lane we can use more
+ // efficient instructions that mirror the shuffles across the four 128-bit
+ // lanes.
+ SmallVector<int, 4> RepeatedMask;
+ bool Is128BitLaneRepeatedShuffle =
+ is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
+ if (Is128BitLaneRepeatedShuffle) {
+ assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
+ if (V2.isUndef())
+ return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
+ getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
+ return V;
+ }
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // Try to use VALIGN.
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
+ // Try to use byte rotation instructions.
+ if (Subtarget.hasBWI())
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
+ // Assume that a single SHUFPS is faster than using a permv shuffle.
+ // If some CPU is harmed by the domain switch, we can fix it in a later pass.
+ if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
+ SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
+ SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
+ SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
+ CastV1, CastV2, DAG);
+ return DAG.getBitcast(MVT::v16i32, ShufPS);
+ }
+
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ // If we have AVX512F support, we can use VEXPAND.
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
+ return V;
+
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
+}
+
+/// Handle lowering of 32-lane 16-bit integer shuffles.
+static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
+ assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
+ assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
+
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
+ DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return ZExt;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
+ return V;
+
+ // Use dedicated pack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
+ return V;
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
+ if (V2.isUndef()) {
+ // Try to use bit rotation instructions.
+ if (SDValue Rotate =
+ lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
+ return Rotate;
+
+ SmallVector<int, 8> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
+ // As this is a single-input shuffle, the repeated mask should be
+ // a strictly valid v8i16 mask that we can pass through to the v8i16
+ // lowering to handle even the v32 case.
+ return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
+ RepeatedMask, Subtarget, DAG);
+ }
+ }
+
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
+ Zeroable, Subtarget, DAG))
+ return PSHUFB;
+
+ return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
+}
+
+/// Handle lowering of 64-lane 8-bit integer shuffles.
+static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
+ assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
+ assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
+
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
+ DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return ZExt;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
+ return V;
+
+ // Use dedicated pack instructions for masks that match their pattern.
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
+ Subtarget))
+ return V;
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
+ // Try to use bit rotation instructions.
+ if (V2.isUndef())
+ if (SDValue Rotate =
+ lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
+ return Rotate;
+
+ // Lower as AND if possible.
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Masked;
+
+ if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
+ Zeroable, Subtarget, DAG))
+ return PSHUFB;
+
+ // VBMI can use VPERMV/VPERMV3 byte shuffles.
+ if (Subtarget.hasVBMI())
+ return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
+
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (!V2.isUndef())
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
+ DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // FIXME: Implement direct support for this type!
+ return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
+}
+
+/// High-level routine to lower various 512-bit x86 vector shuffles.
+///
+/// This routine either breaks down the specific type of a 512-bit x86 vector
+/// shuffle or splits it into two 256-bit shuffles and fuses the results back
+/// together based on the available instructions.
+static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(Subtarget.hasAVX512() &&
+ "Cannot lower 512-bit vectors w/ basic ISA!");
+
+ // If we have a single input to the zero element, insert that into V1 if we
+ // can do so cheaply.
+ int NumElts = Mask.size();
+ int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
+
+ if (NumV2Elements == 1 && Mask[0] >= NumElts)
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
+ DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return Insertion;
+
+ // Handle special cases where the lower or upper half is UNDEF.
+ if (SDValue V =
+ lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
+ Subtarget, DAG))
+ return Broadcast;
+
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
+ // Try using bit ops for masking and blending before falling back to
+ // splitting.
+ if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+ if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
+ return V;
+
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ }
+
+ // Dispatch to each element type for lowering. If we don't have support for
+ // specific element type shuffles at 512 bits, immediately split them and
+ // lower them. Each lowering routine of a given type is allowed to assume that
+ // the requisite ISA extensions for that element type are available.
+ switch (VT.SimpleTy) {
+ case MVT::v8f64:
+ return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v16f32:
+ return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v8i64:
+ return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v16i32:
+ return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v32i16:
+ return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v64i8:
+ return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+
+ default:
+ llvm_unreachable("Not a valid 512-bit x86 vector type!");
+ }
+}
+
+static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ // Shuffle should be unary.
+ if (!V2.isUndef())
+ return SDValue();
+
+ int ShiftAmt = -1;
+ int NumElts = Mask.size();
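+ // E.g. (illustrative), for a v8i1 mask {2, 3, 4, 5, 6, 7, -1, -1} every
+ // defined element has M - i == 2, so this matches a KSHIFTR by 2.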
+ for (int i = 0; i != NumElts; ++i) {
+ int M = Mask[i];
+ assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
+ "Unexpected mask index.");
+ if (M < 0)
+ continue;
+
+ // The first non-undef element determines our shift amount.
+ if (ShiftAmt < 0) {
+ ShiftAmt = M - i;
+ // Need to be shifting right.
+ if (ShiftAmt <= 0)
+ return SDValue();
+ }
+ // All non-undef elements must shift by the same amount.
+ if (ShiftAmt != M - i)
+ return SDValue();
+ }
+ assert(ShiftAmt >= 0 && "All undef?");
+
+ // Great we found a shift right.
+ MVT WideVT = VT;
+ if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
+ WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
+ SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
+ DAG.getUNDEF(WideVT), V1,
+ DAG.getIntPtrConstant(0, DL));
+ Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
+ DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+}
+
+// Determine if this shuffle can be implemented with a KSHIFT instruction.
+// Returns the shift amount if possible or -1 if not. This is a simplified
+// version of matchShuffleAsShift.
+static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
+ int MaskOffset, const APInt &Zeroable) {
+ int Size = Mask.size();
+
+ auto CheckZeros = [&](int Shift, bool Left) {
+ for (int j = 0; j < Shift; ++j)
+ if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
+ return false;
+
+ return true;
+ };
+
+ auto MatchShift = [&](int Shift, bool Left) {
+ unsigned Pos = Left ? Shift : 0;
+ unsigned Low = Left ? 0 : Shift;
+ unsigned Len = Size - Shift;
+ return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
+ };
+
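+ // E.g. (illustrative), with Size == 8 and MaskOffset == 0, a mask whose
+ // first two elements are zeroable and whose remaining elements are
+ // {0, 1, 2, 3, 4, 5} matches KSHIFTL with Shift == 2.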
+ for (int Shift = 1; Shift != Size; ++Shift)
+ for (bool Left : {true, false})
+ if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
+ Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
+ return Shift;
+ }
+
+ return -1;
+}
+
+ // Lower vXi1 vector shuffles.
+ // There is no dedicated instruction on AVX-512 that shuffles the masks.
+ // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
+ // vector, shuffle it, and then truncate it back.
+static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(Subtarget.hasAVX512() &&
+ "Cannot lower 512-bit vectors w/o basic ISA!");
+
+ int NumElts = Mask.size();
+
+ // Try to recognize shuffles that are just padding a subvector with zeros.
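+ // E.g. (illustrative), a v8i1 mask {0, 1, 2, 3, 8, 8, 8, 8} with an all-zero
+ // V2 keeps the low four elements of V1, so it becomes an extract of a v4i1
+ // subvector inserted into a zero vector.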
+ int SubvecElts = 0;
+ int Src = -1;
+ for (int i = 0; i != NumElts; ++i) {
+ if (Mask[i] >= 0) {
+ // Grab the source from the first valid mask element. All subsequent
+ // elements need to use this same source.
+ if (Src < 0)
+ Src = Mask[i] / NumElts;
+ if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
+ break;
+ }
+
+ ++SubvecElts;
+ }
+ assert(SubvecElts != NumElts && "Identity shuffle?");
+
+ // Clip to a power of 2.
+ SubvecElts = PowerOf2Floor(SubvecElts);
+
+ // Make sure the number of zeroable bits in the top at least covers the bits
+ // not covered by the subvector.
+ if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
+ assert(Src >= 0 && "Expected a source!");
+ MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
+ Src == 0 ? V1 : V2,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ DAG.getConstant(0, DL, VT),
+ Extract, DAG.getIntPtrConstant(0, DL));
+ }
+
+ // Try a simple shift right with undef elements. Later we'll try with zeros.
+ if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
+ DAG))
+ return Shift;
+
+ // Try to match KSHIFTs.
+ unsigned Offset = 0;
+ for (SDValue V : { V1, V2 }) {
+ unsigned Opcode;
+ int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
+ if (ShiftAmt >= 0) {
+ MVT WideVT = VT;
+ if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
+ WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
+ SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
+ DAG.getUNDEF(WideVT), V,
+ DAG.getIntPtrConstant(0, DL));
+ // Widened right shifts need two shifts to ensure we shift in zeroes.
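+ // E.g. (illustrative), a v4i1 right shift widened to v16i1 (no DQI) first
+ // shifts left by 12 so the original bits sit in the MSBs, then shifts right
+ // by ShiftAmt + 12.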
+ if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
+ int WideElts = WideVT.getVectorNumElements();
+ // Shift left to put the original vector in the MSBs of the new size.
+ Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
+ DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
+ // Increase the shift amount to account for the left shift.
+ ShiftAmt += WideElts - NumElts;
+ }
+
+ Res = DAG.getNode(Opcode, DL, WideVT, Res,
+ DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+ }
+ Offset += NumElts; // Increment for next iteration.
+ }
+
+ MVT ExtVT;
+ switch (VT.SimpleTy) {
+ default:
+ llvm_unreachable("Expected a vector of i1 elements");
+ case MVT::v2i1:
+ ExtVT = MVT::v2i64;
+ break;
+ case MVT::v4i1:
+ ExtVT = MVT::v4i32;
+ break;
+ case MVT::v8i1:
+ // Take a 512-bit type so there are more shuffle options on KNL. If we have
+ // VLX, use a 256-bit shuffle instead.
+ ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
+ break;
+ case MVT::v16i1:
+ // Take 512-bit type, unless we are avoiding 512-bit types and have the
+ // 256-bit operation available.
+ ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
+ break;
+ case MVT::v32i1:
+ // Take 512-bit type, unless we are avoiding 512-bit types and have the
+ // 256-bit operation available.
+ assert(Subtarget.hasBWI() && "Expected AVX512BW support");
+ ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
+ break;
+ case MVT::v64i1:
+ // Fall back to scalarization. FIXME: We can do better if the shuffle
+ // can be partitioned cleanly.
+ if (!Subtarget.useBWIRegs())
+ return SDValue();
+ ExtVT = MVT::v64i8;
+ break;
+ }
+
+ V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
+ V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
+
+ SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
+ // As i1 was sign-extended, we can use X86ISD::CVT2MASK.
+ int NumElems = VT.getVectorNumElements();
+ if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
+ (Subtarget.hasDQI() && (NumElems < 32)))
+ return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
+ Shuffle, ISD::SETGT);
+
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
+}
+
+/// Helper function that returns true if the shuffle mask should be
+/// commuted to improve canonicalization.
+static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
+ int NumElements = Mask.size();
+
+ int NumV1Elements = 0, NumV2Elements = 0;
+ for (int M : Mask)
+ if (M < 0)
+ continue;
+ else if (M < NumElements)
+ ++NumV1Elements;
+ else
+ ++NumV2Elements;
+
+ // Commute the shuffle as needed such that more elements come from V1 than
+ // V2. This allows us to match the shuffle pattern strictly on how many
+ // elements come from V1 without handling the symmetric cases.
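+ // E.g. (illustrative), a 4-element mask {4, 5, 6, 1} uses three V2 elements
+ // and only one V1 element, so it is commuted to {0, 1, 2, 5}.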
+ if (NumV2Elements > NumV1Elements)
+ return true;
+
+ assert(NumV1Elements > 0 && "No V1 indices");
+
+ if (NumV2Elements == 0)
+ return false;
+
+ // When the number of V1 and V2 elements is the same, try to minimize the
+ // number of uses of V2 in the low half of the vector. When that is tied,
+ // ensure that the sum of indices for V1 is equal to or lower than the sum of
+ // indices for V2. When those are equal, try to ensure that the number of odd
+ // indices for V1 is lower than the number of odd indices for V2.
+ if (NumV1Elements == NumV2Elements) {
+ int LowV1Elements = 0, LowV2Elements = 0;
+ for (int M : Mask.slice(0, NumElements / 2))
+ if (M >= NumElements)
+ ++LowV2Elements;
+ else if (M >= 0)
+ ++LowV1Elements;
+ if (LowV2Elements > LowV1Elements)
+ return true;
+ if (LowV2Elements == LowV1Elements) {
+ int SumV1Indices = 0, SumV2Indices = 0;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] >= NumElements)
+ SumV2Indices += i;
+ else if (Mask[i] >= 0)
+ SumV1Indices += i;
+ if (SumV2Indices < SumV1Indices)
+ return true;
+ if (SumV2Indices == SumV1Indices) {
+ int NumV1OddIndices = 0, NumV2OddIndices = 0;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] >= NumElements)
+ NumV2OddIndices += i % 2;
+ else if (Mask[i] >= 0)
+ NumV1OddIndices += i % 2;
+ if (NumV2OddIndices < NumV1OddIndices)
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/// Top-level lowering for x86 vector shuffles.
+///
+/// This handles decomposition, canonicalization, and lowering of all x86
+/// vector shuffles. Most of the specific lowering strategies are encapsulated
+/// above in helper routines. The canonicalization attempts to widen shuffles
+/// to involve fewer lanes of wider elements, consolidate symmetric patterns
+/// s.t. only one of the two inputs needs to be tested, etc.
+static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> OrigMask = SVOp->getMask();
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ MVT VT = Op.getSimpleValueType();
+ int NumElements = VT.getVectorNumElements();
+ SDLoc DL(Op);
+ bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
+
+ assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
+ "Can't lower MMX shuffles");
+
+ bool V1IsUndef = V1.isUndef();
+ bool V2IsUndef = V2.isUndef();
+ if (V1IsUndef && V2IsUndef)
+ return DAG.getUNDEF(VT);
+
+ // When we create a shuffle node we put the UNDEF node in the second operand,
+ // but in some cases the first operand may be transformed to UNDEF.
+ // In this case we should just commute the node.
+ if (V1IsUndef)
+ return DAG.getCommutedVectorShuffle(*SVOp);
+
+ // Check for non-undef masks pointing at an undef vector and make the masks
+ // undef as well. This makes it easier to match the shuffle based solely on
+ // the mask.
+ if (V2IsUndef &&
+ any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
+ SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
+ for (int &M : NewMask)
+ if (M >= NumElements)
+ M = -1;
+ return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
+ }
+
+ // Check for illegal shuffle mask element index values.
+ int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
+ (void)MaskUpperLimit;
+ assert(llvm::all_of(OrigMask,
+ [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
+ "Out of bounds shuffle index");
+
+ // We actually see shuffles that are entirely re-arrangements of a set of
+ // zero inputs. This mostly happens while decomposing complex shuffles into
+ // simple ones. Directly lower these as a buildvector of zeros.
+ APInt KnownUndef, KnownZero;
+ computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
+
+ APInt Zeroable = KnownUndef | KnownZero;
+ if (Zeroable.isAllOnesValue())
+ return getZeroVector(VT, Subtarget, DAG, DL);
+
+ bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
+
+ // Try to collapse shuffles into using a vector type with fewer elements but
+ // wider element types. We cap this to not form integers or floating point
+ // elements wider than 64 bits, but it might be interesting to form i128
+ // integers to handle flipping the low and high halves of AVX 256-bit vectors.
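+ // E.g. (illustrative), a v8i32 mask {0, 1, 2, 3, 8, 9, 10, 11} widens to the
+ // v4i64 mask {0, 1, 4, 5}, halving the number of elements to match.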
+ SmallVector<int, 16> WidenedMask;
+ if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
+ canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
+ // Shuffle mask widening should not interfere with a broadcast opportunity
+ // by obfuscating the operands with bitcasts.
+ // TODO: Avoid lowering directly from this top-level function: make this
+ // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
+ Subtarget, DAG))
+ return Broadcast;
+
+ MVT NewEltVT = VT.isFloatingPoint()
+ ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
+ : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
+ int NewNumElts = NumElements / 2;
+ MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
+ // Make sure that the new vector type is legal. For example, v2f64 isn't
+ // legal on SSE1.
+ if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
+ if (V2IsZero) {
+ // Modify the new Mask to take all zeros from the all-zero vector.
+ // Choose indices that are blend-friendly.
+ bool UsedZeroVector = false;
+ assert(is_contained(WidenedMask, SM_SentinelZero) &&
+ "V2's non-undef elements are used?!");
+ for (int i = 0; i != NewNumElts; ++i)
+ if (WidenedMask[i] == SM_SentinelZero) {
+ WidenedMask[i] = i + NewNumElts;
+ UsedZeroVector = true;
+ }
+ // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
+ // some elements to be undef.
+ if (UsedZeroVector)
+ V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
+ }
+ V1 = DAG.getBitcast(NewVT, V1);
+ V2 = DAG.getBitcast(NewVT, V2);
+ return DAG.getBitcast(
+ VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
+ }
+ }
+
+ // Commute the shuffle if it will improve canonicalization.
+ SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end());
+ if (canonicalizeShuffleMaskWithCommute(Mask)) {
+ ShuffleVectorSDNode::commuteMask(Mask);
+ std::swap(V1, V2);
+ }
+
+ // For each vector width, delegate to a specialized lowering routine.
+ if (VT.is128BitVector())
+ return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
+
+ if (VT.is256BitVector())
+ return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
+
+ if (VT.is512BitVector())
+ return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
+
+ if (Is1BitVector)
+ return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
+
+ llvm_unreachable("Unimplemented!");
+}
+
+/// Try to lower a VSELECT instruction to a vector shuffle.
+static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue Cond = Op.getOperand(0);
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ MVT VT = Op.getSimpleValueType();
+
+ // Only non-legal VSELECTs reach this lowering; convert those into generic
+ // shuffles and reuse the shuffle lowering path for blends.
+ if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
+ SmallVector<int, 32> Mask;
+ if (createShuffleMaskFromVSELECT(Mask, Cond))
+ return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
+ }
+
+ return SDValue();
+}
+
+SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Cond = Op.getOperand(0);
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+
+ // A vselect where all conditions and data are constants can be optimized into
+ // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
+ if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
+ ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
+ ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
+ return SDValue();
+
+ // Try to lower this to a blend-style vector shuffle. This can handle all
+ // constant condition cases.
+ if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
+ return BlendOp;
+
+ // If this VSELECT has a vector of i1 as a mask, it will be directly matched
+ // with patterns on the mask registers on AVX-512.
+ MVT CondVT = Cond.getSimpleValueType();
+ unsigned CondEltSize = Cond.getScalarValueSizeInBits();
+ if (CondEltSize == 1)
+ return Op;
+
+ // Variable blends are only legal from SSE4.1 onward.
+ if (!Subtarget.hasSSE41())
+ return SDValue();
+
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+ unsigned EltSize = VT.getScalarSizeInBits();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // Expand v32i16/v64i8 without BWI.
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+ return SDValue();
+
+ // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
+ // into an i1 condition so that we can use the mask-based 512-bit blend
+ // instructions.
+ if (VT.getSizeInBits() == 512) {
+ // Build a mask by testing the condition against zero.
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+ SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
+ DAG.getConstant(0, dl, CondVT),
+ ISD::SETNE);
+ // Now return a new VSELECT using the mask.
+ return DAG.getSelect(dl, VT, Mask, LHS, RHS);
+ }
+
+ // SEXT/TRUNC cases where the mask doesn't match the destination size.
+ if (CondEltSize != EltSize) {
+ // If we don't have a sign splat, rely on the expansion.
+ if (CondEltSize != DAG.ComputeNumSignBits(Cond))
+ return SDValue();
+
+ MVT NewCondSVT = MVT::getIntegerVT(EltSize);
+ MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
+ Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
+ return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
+ }
+
+ // Only some types will be legal on some subtargets. If we can emit a legal
+ // VSELECT-matching blend, return Op; if we need to expand instead, return
+ // a null value.
+ switch (VT.SimpleTy) {
+ default:
+ // Most of the vector types have blends past SSE4.1.
+ return Op;
+
+ case MVT::v32i8:
+ // The byte blends for AVX vectors were introduced only in AVX2.
+ if (Subtarget.hasAVX2())
+ return Op;
+
+ return SDValue();
+
+ case MVT::v8i16:
+ case MVT::v16i16: {
+ // Bitcast everything to the vXi8 type and use a vXi8 vselect.
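+ // This relies on X86 using ZeroOrNegativeOneBooleanContent for vector
+ // booleans: each i16 condition element is all-ones or all-zeros, so a
+ // per-byte select (e.g. PBLENDVB) produces the same result.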
+ MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
+ Cond = DAG.getBitcast(CastVT, Cond);
+ LHS = DAG.getBitcast(CastVT, LHS);
+ RHS = DAG.getBitcast(CastVT, RHS);
+ SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
+ return DAG.getBitcast(VT, Select);
+ }
+ }
+}
+
+static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ SDValue Vec = Op.getOperand(0);
+ SDValue Idx = Op.getOperand(1);
+ assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
+ SDLoc dl(Op);
+
+ if (!Vec.getSimpleValueType().is128BitVector())
+ return SDValue();
+
+ if (VT.getSizeInBits() == 8) {
+ // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
+ // we're going to zero extend the register or fold the store.
+ if (llvm::isNullConstant(Idx) && !MayFoldIntoZeroExtend(Op) &&
+ !MayFoldIntoStore(Op))
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getBitcast(MVT::v4i32, Vec), Idx));
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
+ DAG.getTargetConstant(IdxVal, dl, MVT::i8));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
+ }
+
+ if (VT == MVT::f32) {
+ // EXTRACTPS outputs to a GPR32 register, which will require a movd to copy
+ // the result back to an FR32 register. It's only worth matching if the
+ // result has a single use which is a store or a bitcast to i32. And in
+ // the case of a store, it's not worth it if the index is a constant 0,
+ // because a MOVSSmr can be used instead, which is smaller and faster.
+ if (!Op.hasOneUse())
+ return SDValue();
+ SDNode *User = *Op.getNode()->use_begin();
+ if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
+ (User->getOpcode() != ISD::BITCAST ||
+ User->getValueType(0) != MVT::i32))
+ return SDValue();
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getBitcast(MVT::v4i32, Vec), Idx);
+ return DAG.getBitcast(MVT::f32, Extract);
+ }
+
+ if (VT == MVT::i32 || VT == MVT::i64)
+ return Op;
+
+ return SDValue();
+}
+
+ /// Extract one bit from a mask vector, like v16i1 or v8i1.
+ /// AVX-512 feature.
+static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue Vec = Op.getOperand(0);
+ SDLoc dl(Vec);
+ MVT VecVT = Vec.getSimpleValueType();
+ SDValue Idx = Op.getOperand(1);
+ auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
+ MVT EltVT = Op.getSimpleValueType();
+
+ assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
+ "Unexpected vector type in ExtractBitFromMaskVector");
+
+ // A variable index can't be handled in mask registers;
+ // extend the vector to VR512/VR128.
+ if (!IdxC) {
+ unsigned NumElts = VecVT.getVectorNumElements();
+ // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
+ // than extending to 128/256-bit.
+ MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
+ MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
+ SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
+ return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
+ }
+
+ unsigned IdxVal = IdxC->getZExtValue();
+ if (IdxVal == 0) // the operation is legal
+ return Op;
+
+ // Extend to natively supported kshift.
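+ // KSHIFT is only legal on v16i1 (or v8i1 with DQI) and wider mask types, so
+ // narrower masks are first widened by inserting into an undef vector.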
+ unsigned NumElems = VecVT.getVectorNumElements();
+ MVT WideVecVT = VecVT;
+ if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
+ WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
+ DAG.getUNDEF(WideVecVT), Vec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ // Use kshiftr instruction to move to the lower element.
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
+ DAG.getTargetConstant(IdxVal, dl, MVT::i8));
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
+ DAG.getIntPtrConstant(0, dl));
+}
+
+SDValue
+X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ SDValue Vec = Op.getOperand(0);
+ MVT VecVT = Vec.getSimpleValueType();
+ SDValue Idx = Op.getOperand(1);
+ auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
+
+ if (VecVT.getVectorElementType() == MVT::i1)
+ return ExtractBitFromMaskVector(Op, DAG, Subtarget);
+
+ if (!IdxC) {
+ // It's more profitable to go through memory (1 cycle throughput)
+ // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
+ // The IACA tool was used to get the performance estimate
+ // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer).
+ //
+ // example : extractelement <16 x i8> %a, i32 %i
+ //
+ // Block Throughput: 3.00 Cycles
+ // Throughput Bottleneck: Port5
+ //
+ // | Num Of | Ports pressure in cycles | |
+ // | Uops | 0 - DV | 5 | 6 | 7 | |
+ // ---------------------------------------------
+ // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
+ // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
+ // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
+ // Total Num Of Uops: 4
+ //
+ //
+ // Block Throughput: 1.00 Cycles
+ // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
+ //
+ // | | Ports pressure in cycles | |
+ // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
+ // ---------------------------------------------------------
+ // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
+ // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
+ // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
+ // Total Num Of Uops: 4
+
+ return SDValue();
+ }
+
+ unsigned IdxVal = IdxC->getZExtValue();
+
+ // If this is a 256-bit vector result, first extract the 128-bit vector and
+ // then extract the element from the 128-bit vector.
+ if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
+ // Get the 128-bit vector.
+ Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
+ MVT EltVT = VecVT.getVectorElementType();
+
+ unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
+ assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
+
+ // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
+ // this can be done with a mask.
+ IdxVal &= ElemsPerChunk - 1;
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
+ DAG.getIntPtrConstant(IdxVal, dl));
+ }
+
+ assert(VecVT.is128BitVector() && "Unexpected vector length");
+
+ MVT VT = Op.getSimpleValueType();
+
+ if (VT.getSizeInBits() == 16) {
+ // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
+ // we're going to zero extend the register or fold the store (SSE41 only).
+ if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
+ !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getBitcast(MVT::v4i32, Vec), Idx));
+
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
+ DAG.getTargetConstant(IdxVal, dl, MVT::i8));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
+ }
+
+ if (Subtarget.hasSSE41())
+ if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
+ return Res;
+
+ // TODO: We only extract a single element from v16i8; we can probably afford
+ // to be more aggressive here before using the default approach of spilling
+ // to the stack.
+ if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
+ // Extract either the lowest i32 or any i16, and extract the sub-byte.
+ int DWordIdx = IdxVal / 4;
+ if (DWordIdx == 0) {
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getBitcast(MVT::v4i32, Vec),
+ DAG.getIntPtrConstant(DWordIdx, dl));
+ int ShiftVal = (IdxVal % 4) * 8;
+ if (ShiftVal != 0)
+ Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
+ DAG.getConstant(ShiftVal, dl, MVT::i8));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ }
+
+ int WordIdx = IdxVal / 2;
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
+ DAG.getBitcast(MVT::v8i16, Vec),
+ DAG.getIntPtrConstant(WordIdx, dl));
+ int ShiftVal = (IdxVal % 2) * 8;
+ if (ShiftVal != 0)
+ Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
+ DAG.getConstant(ShiftVal, dl, MVT::i8));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ }
+
+ if (VT.getSizeInBits() == 32) {
+ if (IdxVal == 0)
+ return Op;
+
+ // SHUFPS the element to the lowest double word, then movss.
+ int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
+ Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ if (VT.getSizeInBits() == 64) {
+ // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
+ // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
+ // to match extract_elt for f64.
+ if (IdxVal == 0)
+ return Op;
+
+ // UNPCKHPD the element to the lowest double word, then movsd.
+ // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
+ // to a f64mem, the whole operation is folded into a single MOVHPDmr.
+ int Mask[2] = { 1, -1 };
+ Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ return SDValue();
+}
+
+ /// Insert one bit into a mask vector, like v16i1 or v8i1.
+ /// AVX-512 feature.
+static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDLoc dl(Op);
+ SDValue Vec = Op.getOperand(0);
+ SDValue Elt = Op.getOperand(1);
+ SDValue Idx = Op.getOperand(2);
+ MVT VecVT = Vec.getSimpleValueType();
+
+ if (!isa<ConstantSDNode>(Idx)) {
+ // Non constant index. Extend source and destination,
+ // insert element and then truncate the result.
+ unsigned NumElts = VecVT.getVectorNumElements();
+ MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
+ MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
+ SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
+ DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
+ DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
+ return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
+ }
+
+ // Copy into a k-register, extract to v1i1 and insert_subvector.
+ SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
+}
+
+SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ MVT VT = Op.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ if (EltVT == MVT::i1)
+ return InsertBitToMaskVector(Op, DAG, Subtarget);
+
+ SDLoc dl(Op);
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ SDValue N2 = Op.getOperand(2);
+
+ auto *N2C = dyn_cast<ConstantSDNode>(N2);
+ if (!N2C || N2C->getAPIntValue().uge(NumElts))
+ return SDValue();
+ uint64_t IdxVal = N2C->getZExtValue();
+
+ bool IsZeroElt = X86::isZeroNode(N1);
+ bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
+
+ // If we are inserting an element, see if we can do this more efficiently
+ // with a blend shuffle of a rematerializable vector rather than a costly
+ // integer insertion.
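+ // The blend mask below keeps every lane of N0 except lane IdxVal, which is
+ // taken from the constant all-zeros/all-ones vector (mask indices >= NumElts
+ // select from the second shuffle operand).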
+ if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
+ 16 <= EltVT.getSizeInBits()) {
+ SmallVector<int, 8> BlendMask;
+ for (unsigned i = 0; i != NumElts; ++i)
+ BlendMask.push_back(i == IdxVal ? i + NumElts : i);
+ SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
+ : getOnesVector(VT, DAG, dl);
+ return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
+ }
+
+ // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
+ // into that, and then insert the subvector back into the result.
+ if (VT.is256BitVector() || VT.is512BitVector()) {
+ // With a 256-bit vector, we can insert into the zero element efficiently
+ // using a blend if we have AVX or AVX2 and the right data type.
+ if (VT.is256BitVector() && IdxVal == 0) {
+ // TODO: It is worthwhile to cast integer to floating point and back
+ // and incur a domain crossing penalty if that's what we'll end up
+ // doing anyway after extracting to a 128-bit vector.
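+ // A blend immediate of 1 takes lane 0 from the scalar-to-vector value below
+ // and all remaining lanes from N0, so only element 0 is replaced.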
+ if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
+ (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
+ SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
+ return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
+ DAG.getTargetConstant(1, dl, MVT::i8));
+ }
+ }
+
+ // Get the desired 128-bit vector chunk.
+ SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
+
+ // Insert the element into the desired chunk.
+ unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
+ assert(isPowerOf2_32(NumEltsIn128));
+ // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
+ unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
+
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
+ DAG.getIntPtrConstant(IdxIn128, dl));
+
+ // Insert the changed part back into the bigger vector
+ return insert128BitVector(N0, V, IdxVal, DAG, dl);
+ }
+ assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
+
+ // This will be just movd/movq/movss/movsd.
+ if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
+ if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
+ EltVT == MVT::i64) {
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
+ return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
+ }
+
+ // We can't directly insert an i8 or i16 into a vector, so zero extend
+ // it to i32 first.
+ if (EltVT == MVT::i16 || EltVT == MVT::i8) {
+ N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
+ MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
+ N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
+ return DAG.getBitcast(VT, N1);
+ }
+ }
+
+ // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
+ // argument. SSE41 is required for pinsrb.
+ if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
+ unsigned Opc;
+ if (VT == MVT::v8i16) {
+ assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
+ Opc = X86ISD::PINSRW;
+ } else {
+ assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
+ assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
+ Opc = X86ISD::PINSRB;
+ }
+
+ assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
+ N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+ N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
+ return DAG.getNode(Opc, dl, VT, N0, N1, N2);
+ }
+
+ if (Subtarget.hasSSE41()) {
+ if (EltVT == MVT::f32) {
+ // Bits [7:6] of the constant are the source select. This will always be
+ // zero here. The DAG Combiner may combine an extract_elt index into
+ // these bits. For example (insert (extract, 3), 2) could be matched by
+ // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
+ // Bits [5:4] of the constant are the destination select. This is the
+ // value of the incoming immediate.
+ // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
+ // combine either bitwise AND or insert of float 0.0 to set these bits.
+
+ bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
+ if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
+ // If this is an insertion of 32-bits into the low 32-bits of
+ // a vector, we prefer to generate a blend with immediate rather
+ // than an insertps. Blends are simpler operations in hardware and so
+ // will always have equal or better performance than insertps.
+ // But if optimizing for size and there's a load folding opportunity,
+ // generate insertps because blendps does not have a 32-bit memory
+ // operand form.
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
+ return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
+ DAG.getTargetConstant(1, dl, MVT::i8));
+ }
+ // Create this as a scalar-to-vector.
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
+ DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
+ }
+
+ // PINSR* works with constant index.
+ if (EltVT == MVT::i32 || EltVT == MVT::i64)
+ return Op;
+ }
+
+ return SDValue();
+}
+
+static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ MVT OpVT = Op.getSimpleValueType();
+
+ // It's always cheaper to replace an xor+movd with xorps, and doing so
+ // simplifies further combines.
+ if (X86::isZeroNode(Op.getOperand(0)))
+ return getZeroVector(OpVT, Subtarget, DAG, dl);
+
+ // If this is a wider vector result (e.g. 256-bit), first insert into a
+ // 128-bit vector and then insert into the full-width vector.
+ if (!OpVT.is128BitVector()) {
+ // Insert into a 128-bit vector.
+ unsigned SizeFactor = OpVT.getSizeInBits() / 128;
+ MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
+ OpVT.getVectorNumElements() / SizeFactor);
+
+ Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
+
+ // Insert the 128-bit vector.
+ return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
+ }
+ assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
+ "Expected an SSE type!");
+
+ // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
+ if (OpVT == MVT::v4i32)
+ return Op;
+
+ SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
+ return DAG.getBitcast(
+ OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
+}
+
+// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
+// simple superregister reference or explicit instructions to insert
+// the upper bits of a vector.
+static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
+
+ return insert1BitVector(Op, DAG, Subtarget);
+}
+
+static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
+ "Only vXi1 extract_subvectors need custom lowering");
+
+ SDLoc dl(Op);
+ SDValue Vec = Op.getOperand(0);
+ uint64_t IdxVal = Op.getConstantOperandVal(1);
+
+ if (IdxVal == 0) // the operation is legal
+ return Op;
+
+ MVT VecVT = Vec.getSimpleValueType();
+ unsigned NumElems = VecVT.getVectorNumElements();
+
+ // Extend to natively supported kshift.
+ MVT WideVecVT = VecVT;
+ if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
+ WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
+ DAG.getUNDEF(WideVecVT), Vec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ // Shift to the LSB.
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
+ DAG.getTargetConstant(IdxVal, dl, MVT::i8));
+
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
+ DAG.getIntPtrConstant(0, dl));
+}
+
+// Returns the appropriate wrapper opcode for a global reference.
+unsigned X86TargetLowering::getGlobalWrapperKind(
+ const GlobalValue *GV, const unsigned char OpFlags) const {
+ // References to absolute symbols are never PC-relative.
+ if (GV && GV->isAbsoluteSymbolRef())
+ return X86ISD::Wrapper;
+
+ CodeModel::Model M = getTargetMachine().getCodeModel();
+ if (Subtarget.isPICStyleRIPRel() &&
+ (M == CodeModel::Small || M == CodeModel::Kernel))
+ return X86ISD::WrapperRIP;
+
+ // GOTPCREL references must always use RIP.
+ if (OpFlags == X86II::MO_GOTPCREL)
+ return X86ISD::WrapperRIP;
+
+ return X86ISD::Wrapper;
+}
+
+// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
+// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
+ // one of the above-mentioned nodes. It has to be wrapped because otherwise
+// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
+ // be used to form an addressing mode. These wrapped nodes will be selected
+// into MOV32ri.
+SDValue
+X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
+ ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+
+ // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+ // global base reg.
+ unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
+
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetConstantPool(
+ CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
+ SDLoc DL(CP);
+ Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
+ // With PIC, the address is actually $g + Offset.
+ if (OpFlag) {
+ Result =
+ DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
+ }
+
+ return Result;
+}
+
+SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+ JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+
+ // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+ // global base reg.
+ unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
+
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
+ SDLoc DL(JT);
+ Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
+
+ // With PIC, the address is actually $g + Offset.
+ if (OpFlag)
+ Result =
+ DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
+
+ return Result;
+}
+
+SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
+ SelectionDAG &DAG) const {
+ return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
+}
+
+SDValue
+X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
+ // Create the TargetBlockAddressAddress node.
+ unsigned char OpFlags =
+ Subtarget.classifyBlockAddressReference();
+ const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+ int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
+ SDLoc dl(Op);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
+ Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
+
+ // With PIC, the address is actually $g + Offset.
+ if (isGlobalRelativeToPICBase(OpFlags)) {
+ Result = DAG.getNode(ISD::ADD, dl, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
+ }
+
+ return Result;
+}
+
+/// Creates target global address or external symbol nodes for calls or
+/// other uses.
+SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
+ bool ForCall) const {
+ // Unpack the global address or external symbol.
+ const SDLoc &dl = SDLoc(Op);
+ const GlobalValue *GV = nullptr;
+ int64_t Offset = 0;
+ const char *ExternalSym = nullptr;
+ if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
+ GV = G->getGlobal();
+ Offset = G->getOffset();
+ } else {
+ const auto *ES = cast<ExternalSymbolSDNode>(Op);
+ ExternalSym = ES->getSymbol();
+ }
+
+ // Calculate some flags for address lowering.
+ const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
+ unsigned char OpFlags;
+ if (ForCall)
+ OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
+ else
+ OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
+ bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
+ bool NeedsLoad = isGlobalStubReference(OpFlags);
+
+ CodeModel::Model M = DAG.getTarget().getCodeModel();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result;
+
+ if (GV) {
+ // Create a target global address if this is a global. If possible, fold the
+ // offset into the global address reference. Otherwise, ADD it on later.
+ // Suppress the folding if Offset is negative: movl foo-1, %eax is not
+ // allowed because if the address of foo is 0, the ELF R_X86_64_32
+ // relocation will compute to a negative value, which is invalid.
+ int64_t GlobalOffset = 0;
+ if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
+ X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
+ std::swap(GlobalOffset, Offset);
+ }
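+ // When the fold is possible, the swap leaves Offset == 0, so the explicit
+ // ADD emitted further down is skipped and the constant rides inside the
+ // TargetGlobalAddress node.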
+ Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
+ } else {
+ // If this is not a global address, this must be an external symbol.
+ Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
+ }
+
+ // If this is a direct call, avoid the wrapper if we don't need to do any
+ // loads or adds. This allows SDAG ISel to match direct calls.
+ if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
+ return Result;
+
+ Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
+
+ // With PIC, the address is actually $g + Offset.
+ if (HasPICReg) {
+ Result = DAG.getNode(ISD::ADD, dl, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
+ }
+
+ // For globals that require a load from a stub to get the address, emit the
+ // load.
+ if (NeedsLoad)
+ Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+
+ // If there was a non-zero offset that we didn't fold, create an explicit
+ // addition for it.
+ if (Offset != 0)
+ Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
+ DAG.getConstant(Offset, dl, PtrVT));
+
+ return Result;
+}
+
+SDValue
+X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
+ return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
+}
+
+static SDValue
+GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
+ SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
+ unsigned char OperandFlags, bool LocalDynamic = false) {
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDLoc dl(GA);
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+ GA->getValueType(0),
+ GA->getOffset(),
+ OperandFlags);
+
+ X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
+ : X86ISD::TLSADDR;
+
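+ // In 32-bit mode the caller glues this node to the CopyToReg that loads the
+ // PIC base into EBX; InFlag carries that glue.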
+ if (InFlag) {
+ SDValue Ops[] = { Chain, TGA, *InFlag };
+ Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
+ } else {
+ SDValue Ops[] = { Chain, TGA };
+ Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
+ }
+
+ // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
+ MFI.setAdjustsStack(true);
+ MFI.setHasCalls(true);
+
+ SDValue Flag = Chain.getValue(1);
+ return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
+static SDValue
+LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+ const EVT PtrVT) {
+ SDValue InFlag;
+ SDLoc dl(GA); // ? function entry point might be better
+ SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ SDLoc(), PtrVT), InFlag);
+ InFlag = Chain.getValue(1);
+
+ return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
+static SDValue
+LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+ const EVT PtrVT) {
+ return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
+ X86::RAX, X86II::MO_TLSGD);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
+static SDValue
+LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+ const EVT PtrVT) {
+ return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
+ X86::EAX, X86II::MO_TLSGD);
+}
+
+static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG, const EVT PtrVT,
+ bool Is64Bit, bool Is64BitLP64) {
+ SDLoc dl(GA);
+
+ // Get the start address of the TLS block for this module.
+ X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
+ .getInfo<X86MachineFunctionInfo>();
+ MFI->incNumLocalDynamicTLSAccesses();
+
+ SDValue Base;
+ if (Is64Bit) {
+ unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
+ Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
+ X86II::MO_TLSLD, /*LocalDynamic=*/true);
+ } else {
+ SDValue InFlag;
+ SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
+ InFlag = Chain.getValue(1);
+ Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
+ X86II::MO_TLSLDM, /*LocalDynamic=*/true);
+ }
+
+ // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
+ // of Base.
+
+ // Build x@dtpoff.
+ unsigned char OperandFlags = X86II::MO_DTPOFF;
+ unsigned WrapperKind = X86ISD::Wrapper;
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+ GA->getValueType(0),
+ GA->getOffset(), OperandFlags);
+ SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
+
+ // Add x@dtpoff with the base.
+ return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
+}
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
+static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+ const EVT PtrVT, TLSModel::Model model,
+ bool is64Bit, bool isPIC) {
+ SDLoc dl(GA);
+
+ // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
+ Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
+ is64Bit ? 257 : 256));
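+ // In the X86 backend, address space 256 is %gs and 257 is %fs, so a load
+ // from offset 0 in that address space reads the thread pointer.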
+
+ SDValue ThreadPointer =
+ DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
+ MachinePointerInfo(Ptr));
+
+ unsigned char OperandFlags = 0;
+ // Most TLS accesses are not RIP-relative, even on x86-64. One exception is
+ // initial exec.
+ unsigned WrapperKind = X86ISD::Wrapper;
+ if (model == TLSModel::LocalExec) {
+ OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
+ } else if (model == TLSModel::InitialExec) {
+ if (is64Bit) {
+ OperandFlags = X86II::MO_GOTTPOFF;
+ WrapperKind = X86ISD::WrapperRIP;
+ } else {
+ OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
+ }
+ } else {
+ llvm_unreachable("Unexpected model");
+ }
+
+ // emit "addl x@ntpoff,%eax" (local exec)
+ // or "addl x@indntpoff,%eax" (initial exec)
+ // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
+ SDValue TGA =
+ DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
+ GA->getOffset(), OperandFlags);
+ SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
+
+ if (model == TLSModel::InitialExec) {
+ if (isPIC && !is64Bit) {
+ Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
+ Offset);
+ }
+
+ Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+ }
+
+ // The address of the thread local variable is the add of the thread
+ // pointer with the offset of the variable.
+ return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
+}
+
+SDValue
+X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
+
+ GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+
+ if (DAG.getTarget().useEmulatedTLS())
+ return LowerToTLSEmulatedModel(GA, DAG);
+
+ const GlobalValue *GV = GA->getGlobal();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ bool PositionIndependent = isPositionIndependent();
+
+ if (Subtarget.isTargetELF()) {
+ TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
+ switch (model) {
+ case TLSModel::GeneralDynamic:
+ if (Subtarget.is64Bit()) {
+ if (Subtarget.isTarget64BitLP64())
+ return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
+ return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
+ }
+ return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
+ case TLSModel::LocalDynamic:
+ return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
+ Subtarget.isTarget64BitLP64());
+ case TLSModel::InitialExec:
+ case TLSModel::LocalExec:
+ return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
+ PositionIndependent);
+ }
+ llvm_unreachable("Unknown TLS model.");
+ }
+
+ if (Subtarget.isTargetDarwin()) {
+ // Darwin only has one model of TLS. Lower to that.
+ unsigned char OpFlag = 0;
+ unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
+ X86ISD::WrapperRIP : X86ISD::Wrapper;
+
+ // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+ // global base reg.
+ bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
+ if (PIC32)
+ OpFlag = X86II::MO_TLVP_PIC_BASE;
+ else
+ OpFlag = X86II::MO_TLVP;
+ SDLoc DL(Op);
+ SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
+ GA->getValueType(0),
+ GA->getOffset(), OpFlag);
+ SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
+
+ // With PIC32, the address is actually $g + Offset.
+ if (PIC32)
+ Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
+ Offset);
+
+ // Lowering the machine ISD node will make sure everything ends up in the
+ // right location.
+ SDValue Chain = DAG.getEntryNode();
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
+ SDValue Args[] = { Chain, Offset };
+ Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
+ DAG.getIntPtrConstant(0, DL, true),
+ Chain.getValue(1), DL);
+
+ // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI.setAdjustsStack(true);
+
+ // And our return value (tls address) is in the standard call return value
+ // location.
+ unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
+ return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
+ }
+
+ if (Subtarget.isOSWindows()) {
+ // Just use the implicit TLS architecture
+ // Need to generate something similar to:
+ // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
+ // ; from TEB
+ // mov ecx, dword [rel _tls_index] ; Load index (from C runtime)
+ // mov rcx, qword [rdx+rcx*8]
+ // mov eax, .tls$:tlsvar
+ // [rax+rcx] contains the address
+ // Windows 64bit: gs:0x58
+ // Windows 32bit: fs:__tls_array
+
+ SDLoc dl(GA);
+ SDValue Chain = DAG.getEntryNode();
+
+ // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
+ // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
+ // use its literal value of 0x2C.
+ Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
+ ? Type::getInt8PtrTy(*DAG.getContext(),
+ 256)
+ : Type::getInt32PtrTy(*DAG.getContext(),
+ 257));
+
+ SDValue TlsArray = Subtarget.is64Bit()
+ ? DAG.getIntPtrConstant(0x58, dl)
+ : (Subtarget.isTargetWindowsGNU()
+ ? DAG.getIntPtrConstant(0x2C, dl)
+ : DAG.getExternalSymbol("_tls_array", PtrVT));
+
+ SDValue ThreadPointer =
+ DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
+
+ SDValue res;
+ if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
+ res = ThreadPointer;
+ } else {
+ // Load the _tls_index variable
+ SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
+ if (Subtarget.is64Bit())
+ IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
+ MachinePointerInfo(), MVT::i32);
+ else
+ IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
+
+ const DataLayout &DL = DAG.getDataLayout();
+ SDValue Scale =
+ DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
+ IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
+
+ res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
+ }
+
+ res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
+
+ // Get the offset of the start of the .tls section.
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+ GA->getValueType(0),
+ GA->getOffset(), X86II::MO_SECREL);
+ SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
+
+ // The address of the thread local variable is the add of the thread
+ // pointer with the offset of the variable.
+ return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
+ }
+
+ llvm_unreachable("TLS not implemented for this target.");
+}
+
+/// Lower SRA_PARTS and friends, which return two i32 values
+/// and take a 2 x i32 value to shift plus a shift amount.
+/// TODO: Can this be moved to general expansion code?
+static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
+ assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+ MVT VT = Op.getSimpleValueType();
+ unsigned VTBits = VT.getSizeInBits();
+ SDLoc dl(Op);
+ bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
+ SDValue ShOpLo = Op.getOperand(0);
+ SDValue ShOpHi = Op.getOperand(1);
+ SDValue ShAmt = Op.getOperand(2);
+ // ISD::FSHL and ISD::FSHR have defined overflow behavior, but ISD::SHL and
+ // ISD::SRA/SRL nodes don't. Insert an AND to be safe; it's optimized away
+ // during isel.
+ SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
+ DAG.getConstant(VTBits - 1, dl, MVT::i8));
+ SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
+ DAG.getConstant(VTBits - 1, dl, MVT::i8))
+ : DAG.getConstant(0, dl, VT);
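+ // Tmp1 fills the half that is shifted away entirely when the amount is
+ // >= VTBits: the sign bits of the high part for SRA_PARTS, zero otherwise.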
+
+ SDValue Tmp2, Tmp3;
+ if (Op.getOpcode() == ISD::SHL_PARTS) {
+ Tmp2 = DAG.getNode(ISD::FSHL, dl, VT, ShOpHi, ShOpLo, ShAmt);
+ Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
+ } else {
+ Tmp2 = DAG.getNode(ISD::FSHR, dl, VT, ShOpHi, ShOpLo, ShAmt);
+ Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
+ }
+
+ // If the shift amount is larger than or equal to the width of a part, we
+ // can't rely on the results of shld/shrd. Insert a test and select the
+ // appropriate values for large shift amounts.
+ SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
+ DAG.getConstant(VTBits, dl, MVT::i8));
+ SDValue Cond = DAG.getSetCC(dl, MVT::i8, AndNode,
+ DAG.getConstant(0, dl, MVT::i8), ISD::SETNE);
+
+ SDValue Hi, Lo;
+ if (Op.getOpcode() == ISD::SHL_PARTS) {
+ Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
+ Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
+ } else {
+ Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
+ Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
+ }
+
+ return DAG.getMergeValues({ Lo, Hi }, dl);
+}
+
+static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
+ "Unexpected funnel shift opcode!");
+
+ SDLoc DL(Op);
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Amt = Op.getOperand(2);
+
+ bool IsFSHR = Op.getOpcode() == ISD::FSHR;
+
+ if (VT.isVector()) {
+ assert(Subtarget.hasVBMI2() && "Expected VBMI2");
+
+ if (IsFSHR)
+ std::swap(Op0, Op1);
+
+ // With AVX512, but not VLX we need to widen to get a 512-bit result type.
+ if (!Subtarget.hasVLX() && !VT.is512BitVector()) {
+ Op0 = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
+ Op1 = widenSubVector(Op1, false, Subtarget, DAG, DL, 512);
+ }
+
+ SDValue Funnel;
+ APInt APIntShiftAmt;
+ MVT ResultVT = Op0.getSimpleValueType();
+ if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
+ uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
+ Funnel =
+ DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, ResultVT, Op0,
+ Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
+ } else {
+ if (!Subtarget.hasVLX() && !VT.is512BitVector())
+ Amt = widenSubVector(Amt, false, Subtarget, DAG, DL, 512);
+ Funnel = DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL,
+ ResultVT, Op0, Op1, Amt);
+ }
+ if (!Subtarget.hasVLX() && !VT.is512BitVector())
+ Funnel = extractSubVector(Funnel, 0, DAG, DL, VT.getSizeInBits());
+ return Funnel;
+ }
+ assert(
+ (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
+ "Unexpected funnel shift type!");
+
+ // Expand slow SHLD/SHRD cases if we are not optimizing for size.
+ bool OptForSize = DAG.shouldOptForSize();
+ bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
+
+ // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
+ // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
+ if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
+ !isa<ConstantSDNode>(Amt)) {
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
+ SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
+ Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
+ Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
+ Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
+ SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
+ Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
+ if (IsFSHR) {
+ Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
+ } else {
+ Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
+ Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
+ }
+ return DAG.getZExtOrTrunc(Res, DL, VT);
+ }
+
+ if (VT == MVT::i8 || ExpandFunnel)
+ return SDValue();
+
+ // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
+ if (VT == MVT::i16) {
+ Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
+ DAG.getConstant(15, DL, Amt.getValueType()));
+ unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
+ return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
+ }
+
+ return Op;
+}
+
+// Try to use a packed vector operation to handle i64 on 32-bit targets when
+// AVX512DQ is enabled.
+static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert((Op.getOpcode() == ISD::SINT_TO_FP ||
+ Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
+ Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
+ Op.getOpcode() == ISD::UINT_TO_FP) &&
+ "Unexpected opcode!");
+ bool IsStrict = Op->isStrictFPOpcode();
+ unsigned OpNo = IsStrict ? 1 : 0;
+ SDValue Src = Op.getOperand(OpNo);
+ MVT SrcVT = Src.getSimpleValueType();
+ MVT VT = Op.getSimpleValueType();
+
+ if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
+ (VT != MVT::f32 && VT != MVT::f64))
+ return SDValue();
+
+ // Pack the i64 into a vector, do the operation and extract.
+
+ // Use at least four elements (a 256-bit input) so the f32 result is a full
+ // 128-bit v4f32.
+ unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
+ MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
+ MVT VecVT = MVT::getVectorVT(VT, NumElts);
+
+ SDLoc dl(Op);
+ SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
+ if (IsStrict) {
+ SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
+ {Op.getOperand(0), InVec});
+ SDValue Chain = CvtVec.getValue(1);
+ SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getMergeValues({Value, Chain}, dl);
+ }
+
+ SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
+ DAG.getIntPtrConstant(0, dl));
+}
+
+static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
+ const X86Subtarget &Subtarget) {
+ switch (Opcode) {
+ case ISD::SINT_TO_FP:
+ // TODO: Handle wider types with AVX/AVX512.
+ if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
+ return false;
+ // CVTDQ2PS or (V)CVTDQ2PD
+ return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
+
+ case ISD::UINT_TO_FP:
+ // TODO: Handle wider types and i64 elements.
+ if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
+ return false;
+ // VCVTUDQ2PS or VCVTUDQ2PD
+ return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
+
+ default:
+ return false;
+ }
+}
+
+/// Given a scalar cast operation that is extracted from a vector, try to
+/// vectorize the cast op followed by extraction. This will avoid an expensive
+/// round-trip between XMM and GPR.
+static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // TODO: This could be enhanced to handle smaller integer types by peeking
+ // through an extend.
+ SDValue Extract = Cast.getOperand(0);
+ MVT DestVT = Cast.getSimpleValueType();
+ if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(Extract.getOperand(1)))
+ return SDValue();
+
+ // See if we have a 128-bit vector cast op for this type of cast.
+ SDValue VecOp = Extract.getOperand(0);
+ MVT FromVT = VecOp.getSimpleValueType();
+ unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
+ MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
+ MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
+ if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
+ return SDValue();
+
+ // If we are extracting from a non-zero element, first shuffle the source
+ // vector to allow extracting from element zero.
+ SDLoc DL(Cast);
+ if (!isNullConstant(Extract.getOperand(1))) {
+ SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
+ Mask[0] = Extract.getConstantOperandVal(1);
+ VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
+ }
+ // If the source vector is wider than 128 bits, extract the low part. Do not
+ // create an unnecessarily wide vector cast op.
+ if (FromVT != Vec128VT)
+ VecOp = extract128BitVector(VecOp, 0, DAG, DL);
+
+ // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
+ // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
+ SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
+ DAG.getIntPtrConstant(0, DL));
+}
+
+/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
+/// try to vectorize the cast ops. This will avoid an expensive round-trip
+/// between XMM and GPR.
+static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // TODO: Allow FP_TO_UINT.
+ SDValue CastToInt = CastToFP.getOperand(0);
+ MVT VT = CastToFP.getSimpleValueType();
+ if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
+ return SDValue();
+
+ MVT IntVT = CastToInt.getSimpleValueType();
+ SDValue X = CastToInt.getOperand(0);
+ MVT SrcVT = X.getSimpleValueType();
+ if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
+ return SDValue();
+
+ // See if we have 128-bit vector cast instructions for this type of cast.
+ // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
+ if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
+ IntVT != MVT::i32)
+ return SDValue();
+
+ unsigned SrcSize = SrcVT.getSizeInBits();
+ unsigned IntSize = IntVT.getSizeInBits();
+ unsigned VTSize = VT.getSizeInBits();
+ MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
+ MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
+ MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
+
+ // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
+ unsigned ToIntOpcode =
+ SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
+ unsigned ToFPOpcode =
+ IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
+
+ // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
+ //
+ // We do not define the high elements (by zeroing them, for example) because
+ // that could nullify any performance advantage that we hoped to gain from
+ // this vector op hack. We do not expect any adverse effects (like denorm
+ // penalties) with cast ops.
+ SDLoc DL(CastToFP);
+ SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
+ SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
+ SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
+ SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
+}
+
+static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDLoc DL(Op);
+ bool IsStrict = Op->isStrictFPOpcode();
+ MVT VT = Op->getSimpleValueType(0);
+ SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
+
+ if (Subtarget.hasDQI()) {
+ assert(!Subtarget.hasVLX() && "Unexpected features");
+
+ assert((Src.getSimpleValueType() == MVT::v2i64 ||
+ Src.getSimpleValueType() == MVT::v4i64) &&
+ "Unsupported custom type");
+
+ // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
+ assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
+ "Unexpected VT!");
+ MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
+
+ // Need to concat with zero vector for strict fp to avoid spurious
+ // exceptions.
+ SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
+ : DAG.getUNDEF(MVT::v8i64);
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
+ {Op->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, DL);
+ return Res;
+ }
+
+ bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
+ Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
+ if (VT != MVT::v4f32 || IsSigned)
+ return SDValue();
+
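+ // For unsigned inputs with the sign bit set, halve the value with a
+ // round-to-odd shift ((Src >> 1) | (Src & 1)), convert that as a signed
+ // value, and double the result with an FADD; smaller inputs are converted
+ // directly, and a final select picks the right answer per element.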
+ SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
+ SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
+ SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
+ DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
+ DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
+ SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
+ SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
+ SmallVector<SDValue, 4> SignCvts(4);
+ SmallVector<SDValue, 4> Chains(4);
+ for (int i = 0; i != 4; ++i) {
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
+ DAG.getIntPtrConstant(i, DL));
+ if (IsStrict) {
+ SignCvts[i] =
+ DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
+ {Op.getOperand(0), Elt});
+ Chains[i] = SignCvts[i].getValue(1);
+ } else {
+ SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
+ }
+ }
+ SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
+
+ SDValue Slow, Chain;
+ if (IsStrict) {
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+ Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
+ {Chain, SignCvt, SignCvt});
+ Chain = Slow.getValue(1);
+ } else {
+ Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
+ }
+
+ IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
+ SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
+
+ if (IsStrict)
+ return DAG.getMergeValues({Cvt, Chain}, DL);
+
+ return Cvt;
+}
+
+SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
+ SelectionDAG &DAG) const {
+ bool IsStrict = Op->isStrictFPOpcode();
+ unsigned OpNo = IsStrict ? 1 : 0;
+ SDValue Src = Op.getOperand(OpNo);
+ SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
+ MVT SrcVT = Src.getSimpleValueType();
+ MVT VT = Op.getSimpleValueType();
+ SDLoc dl(Op);
+
+ if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
+ return Extract;
+
+ if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
+ return R;
+
+ if (SrcVT.isVector()) {
+ if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
+ // Note: since v2f64 is a legal type, we don't need to zero-extend the
+ // source for strict FP.
+ if (IsStrict)
+ return DAG.getNode(
+ X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
+ {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
+ DAG.getUNDEF(SrcVT))});
+ return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
+ DAG.getUNDEF(SrcVT)));
+ }
+ if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
+ return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
+
+ return SDValue();
+ }
+
+ assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
+ "Unknown SINT_TO_FP to lower!");
+
+ bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
+
+ // These are really Legal; return the operand so the caller accepts it as
+ // Legal.
+ if (SrcVT == MVT::i32 && UseSSEReg)
+ return Op;
+ if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
+ return Op;
+
+ if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
+ return V;
+
+ // SSE doesn't have an i16 conversion so we need to promote.
+ if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
+ SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
+ {Chain, Ext});
+
+ return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
+ }
+
+ if (VT == MVT::f128)
+ return SDValue();
+
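+ // No direct register path is left at this point: spill the integer to a
+ // stack slot and let BuildFILD convert it with an x87 FILD load.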
+ SDValue ValueToStore = Src;
+ if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
+ // Bitcasting to f64 here allows us to do a single 64-bit store from
+ // an SSE register, avoiding the store forwarding penalty that would come
+ // with two 32-bit stores.
+ ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
+
+ unsigned Size = SrcVT.getStoreSize();
+ Align Alignment(Size);
+ MachineFunction &MF = DAG.getMachineFunction();
+ auto PtrVT = getPointerTy(MF.getDataLayout());
+ int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
+ Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
+ std::pair<SDValue, SDValue> Tmp =
+ BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
+
+ if (IsStrict)
+ return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
+
+ return Tmp.first;
+}
+
+std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
+ EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
+ MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
+ // Build the FILD
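+ // The x87 FILD leaves its result in an x87 register, so when the destination
+ // is an SSE type the code below loads as f80 and round-trips through a stack
+ // FST/load to move the value into an SSE register.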
+ SDVTList Tys;
+ bool useSSE = isScalarFPTypeInSSEReg(DstVT);
+ if (useSSE)
+ Tys = DAG.getVTList(MVT::f80, MVT::Other);
+ else
+ Tys = DAG.getVTList(DstVT, MVT::Other);
+
+ SDValue FILDOps[] = {Chain, Pointer};
+ SDValue Result =
+ DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
+ Alignment, MachineMemOperand::MOLoad);
+ Chain = Result.getValue(1);
+
+ if (useSSE) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned SSFISize = DstVT.getStoreSize();
+ int SSFI =
+ MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
+ auto PtrVT = getPointerTy(MF.getDataLayout());
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
+ Tys = DAG.getVTList(MVT::Other);
+ SDValue FSTOps[] = {Chain, Result, StackSlot};
+ MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
+ MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
+
+ Chain =
+ DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
+ Result = DAG.getLoad(
+ DstVT, DL, Chain, StackSlot,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
+ Chain = Result.getValue(1);
+ }
+
+ return { Result, Chain };
+}
+
+/// Horizontal vector math instructions may be slower than normal math with
+/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
+/// implementation, and likely shuffle complexity of the alternate sequence.
+static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ bool IsOptimizingSize = DAG.shouldOptForSize();
+ bool HasFastHOps = Subtarget.hasFastHorizontalOps();
+ return !IsSingleSource || IsOptimizingSize || HasFastHOps;
+}
+
+/// 64-bit unsigned integer to double expansion.
+static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
+ // when converting 0 while rounding toward negative infinity. The caller will
+ // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
+ assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
+ // This algorithm is not obvious. Here is what we're trying to output:
+ /*
+ movq %rax, %xmm0
+ punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
+ subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
+ #ifdef __SSE3__
+ haddpd %xmm0, %xmm0
+ #else
+ pshufd $0x4e, %xmm0, %xmm1
+ addpd %xmm1, %xmm0
+ #endif
+ */
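+ // In other words: punpckldq pairs the low/high 32-bit halves of the input
+ // with the exponent words of 2^52 and 2^84, producing the doubles
+ // (2^52 + lo) and (2^84 + hi * 2^32). Subtracting {2^52, 2^84} leaves
+ // exactly lo and hi * 2^32, and the final (horizontal) add rounds
+ // lo + hi * 2^32 -- the original u64 -- to double precision.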
+
+ SDLoc dl(Op);
+ LLVMContext *Context = DAG.getContext();
+
+ // Build some magic constants.
+ static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
+ Constant *C0 = ConstantDataVector::get(*Context, CV0);
+ auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
+
+ SmallVector<Constant*,2> CV1;
+ CV1.push_back(
+ ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
+ APInt(64, 0x4330000000000000ULL))));
+ CV1.push_back(
+ ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
+ APInt(64, 0x4530000000000000ULL))));
+ Constant *C1 = ConstantVector::get(CV1);
+ SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
+
+ // Load the 64-bit value into an XMM register.
+ SDValue XR1 =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
+ SDValue CLod0 = DAG.getLoad(
+ MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
+ SDValue Unpck1 =
+ getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
+
+ SDValue CLod1 = DAG.getLoad(
+ MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
+ SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
+ // TODO: Are there any fast-math-flags to propagate here?
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
+ SDValue Result;
+
+ if (Subtarget.hasSSE3() &&
+ shouldUseHorizontalOp(true, DAG, Subtarget)) {
+ Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
+ } else {
+ SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
+ Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
+ }
+ Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
+ DAG.getIntPtrConstant(0, dl));
+ return Result;
+}
+
+/// 32-bit unsigned integer to float expansion.
+static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
+ SDLoc dl(Op);
+ // FP constant to bias correct the final result.
+ SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
+ MVT::f64);
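+ // 0x4330000000000000 is the bit pattern of the double 2^52. ORing the u32
+ // into the low mantissa bits below yields the double 2^52 + x exactly, so
+ // subtracting the bias recovers x with no rounding.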
+
+ // Load the 32-bit value into an XMM register.
+ SDValue Load =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
+
+ // Zero out the upper parts of the register.
+ Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
+
+ // Or the load with the bias.
+ SDValue Or = DAG.getNode(
+ ISD::OR, dl, MVT::v2i64,
+ DAG.getBitcast(MVT::v2i64, Load),
+ DAG.getBitcast(MVT::v2i64,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
+ Or =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+ DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
+
+ if (Op.getNode()->isStrictFPOpcode()) {
+ // Subtract the bias.
+ // TODO: Are there any fast-math-flags to propagate here?
+ SDValue Chain = Op.getOperand(0);
+ SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
+ {Chain, Or, Bias});
+
+ if (Op.getValueType() == Sub.getValueType())
+ return Sub;
+
+ // Handle final rounding.
+ std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
+ Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
+
+ return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
+ }
+
+ // Subtract the bias.
+ // TODO: Are there any fast-math-flags to propagate here?
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
+
+ // Handle final rounding.
+ return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
+}
+
+static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ const SDLoc &DL) {
+ if (Op.getSimpleValueType() != MVT::v2f64)
+ return SDValue();
+
+ bool IsStrict = Op->isStrictFPOpcode();
+
+ SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
+ assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
+
+ if (Subtarget.hasAVX512()) {
+ if (!Subtarget.hasVLX()) {
+ // Let generic type legalization widen this.
+ if (!IsStrict)
+ return SDValue();
+ // Otherwise pad the integer input with 0s and widen the operation.
+ N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
+ DAG.getConstant(0, DL, MVT::v2i32));
+ SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
+ {Op.getOperand(0), N0});
+ SDValue Chain = Res.getValue(1);
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getMergeValues({Res, Chain}, DL);
+ }
+
+ // Legalize to v4i32 type.
+ N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
+ DAG.getUNDEF(MVT::v2i32));
+ if (IsStrict)
+ return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
+ {Op.getOperand(0), N0});
+ return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
+ }
+
+ // Zero extend to 2i64, OR with the floating point representation of 2^52.
+ // This gives us the floating point equivalent of 2^52 + the i32 integer
+ // since double has 52 bits of mantissa. Then subtract 2^52 in floating
+ // point leaving just our i32 integers in double format.
+ SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
+ SDValue VBias =
+ DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
+ SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
+ DAG.getBitcast(MVT::v2i64, VBias));
+ Or = DAG.getBitcast(MVT::v2f64, Or);
+
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
+ {Op.getOperand(0), Or, VBias});
+ return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
+}
+
+static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDLoc DL(Op);
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue V = Op->getOperand(IsStrict ? 1 : 0);
+ MVT VecIntVT = V.getSimpleValueType();
+ assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
+ "Unsupported custom type");
+
+ if (Subtarget.hasAVX512()) {
+ // With AVX512, but not VLX we need to widen to get a 512-bit result type.
+ assert(!Subtarget.hasVLX() && "Unexpected features");
+ MVT VT = Op->getSimpleValueType(0);
+
+ // v8i32->v8f64 is legal with AVX512 so just return it.
+ if (VT == MVT::v8f64)
+ return Op;
+
+ assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
+ "Unexpected VT!");
+ MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
+ MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
+ // Need to concat with zero vector for strict fp to avoid spurious
+ // exceptions.
+ SDValue Tmp =
+ IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
+ V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
+ {Op->getOperand(0), V});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, DL);
+ return Res;
+ }
+
+ if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
+ Op->getSimpleValueType(0) == MVT::v4f64) {
+ SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
+ Constant *Bias = ConstantFP::get(
+ *DAG.getContext(),
+ APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
+ auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
+ SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
+ SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
+ SDValue VBias = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
+ MachineMemOperand::MOLoad);
+
+ SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
+ DAG.getBitcast(MVT::v4i64, VBias));
+ Or = DAG.getBitcast(MVT::v4f64, Or);
+
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
+ {Op.getOperand(0), Or, VBias});
+ return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
+ }
+
+ // The algorithm is the following:
+ // #ifdef __SSE4_1__
+ // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
+ // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
+ // (uint4) 0x53000000, 0xaa);
+ // #else
+ // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
+ // uint4 hi = (v >> 16) | (uint4) 0x53000000;
+ // #endif
+ // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
+ // return (float4) lo + fhi;
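+ // Here 0x4b000000 and 0x53000000 are the float bit patterns of 2^23 and
+ // 2^39, so lo evaluates to 2^23 + (v & 0xffff) and hi to
+ // 2^39 + (v >> 16) * 2^16. The subtracted constant below (0x53000080) is
+ // exactly 2^39 + 2^23, so fhi == (v >> 16) * 2^16 - 2^23 and the final add
+ // reconstructs v, with only that last add rounding to float precision.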
+
+ bool Is128 = VecIntVT == MVT::v4i32;
+ MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
+ // If we are converting to something other than the supported type, e.g. to
+ // v4f64, abort early.
+ if (VecFloatVT != Op->getSimpleValueType(0))
+ return SDValue();
+
+ // In the #ifdef/#else code above, we have in common:
+ // - The vector of constants:
+ // -- 0x4b000000
+ // -- 0x53000000
+ // - A shift:
+ // -- v >> 16
+
+ // Create the splat vector for 0x4b000000.
+ SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
+ // Create the splat vector for 0x53000000.
+ SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
+
+ // Create the right shift.
+ SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
+ SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
+
+ SDValue Low, High;
+ if (Subtarget.hasSSE41()) {
+ MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
+ // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
+ SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
+ SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
+ // Low will be bitcasted right away, so do not bother bitcasting back to its
+ // original type.
+ Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
+ VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
+ // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
+ // (uint4) 0x53000000, 0xaa);
+ SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
+ SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
+ // High will be bitcasted right away, so do not bother bitcasting back to
+ // its original type.
+ High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
+ VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
+ } else {
+ SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
+ // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
+ SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
+ Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
+
+ // uint4 hi = (v >> 16) | (uint4) 0x53000000;
+ High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
+ }
+
+ // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
+ SDValue VecCstFSub = DAG.getConstantFP(
+ APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
+
+ // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
+ // NOTE: By using fsub of a positive constant instead of fadd of a negative
+ // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
+ // enabled. See PR24512.
+ SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
+ // TODO: Are there any fast-math-flags to propagate here?
+ // (float4) lo;
+ SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
+ // return (float4) lo + fhi;
+ if (IsStrict) {
+ SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
+ {Op.getOperand(0), HighBitcast, VecCstFSub});
+ return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
+ {FHigh.getValue(1), LowBitcast, FHigh});
+ }
+
+ SDValue FHigh =
+ DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
+ return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
+}
+
+static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
+ SDValue N0 = Op.getOperand(OpNo);
+ MVT SrcVT = N0.getSimpleValueType();
+ SDLoc dl(Op);
+
+ switch (SrcVT.SimpleTy) {
+ default:
+ llvm_unreachable("Custom UINT_TO_FP is not supported!");
+ case MVT::v2i32:
+ return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
+ case MVT::v4i32:
+ case MVT::v8i32:
+ return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
+ case MVT::v2i64:
+ case MVT::v4i64:
+ return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
+ }
+}
+
+SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
+ SelectionDAG &DAG) const {
+ bool IsStrict = Op->isStrictFPOpcode();
+ unsigned OpNo = IsStrict ? 1 : 0;
+ SDValue Src = Op.getOperand(OpNo);
+ SDLoc dl(Op);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ MVT SrcVT = Src.getSimpleValueType();
+ MVT DstVT = Op->getSimpleValueType(0);
+ SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
+
+ if (DstVT == MVT::f128)
+ return SDValue();
+
+ if (DstVT.isVector())
+ return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
+
+ if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
+ return Extract;
+
+ if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
+ (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
+ // Conversions from unsigned i32 to f32/f64 are legal,
+ // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
+ return Op;
+ }
+
+ // Promote i32 to i64 and use a signed conversion on 64-bit targets.
+ if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
+ Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
+ {Chain, Src});
+ return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
+ }
+
+ if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
+ return V;
+
+ // The transform for i64->f64 isn't correct for 0 when rounding to negative
+ // infinity. It produces -0.0, so disable under strictfp.
+ if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict)
+ return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
+ if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
+ return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
+ if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
+ (DstVT == MVT::f32 || DstVT == MVT::f64))
+ return SDValue();
+
+ // Make a 64-bit buffer, and use it to build an FILD.
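+ // For an i32 source we store the value with a zero upper word, so the 64-bit
+ // slot holds a non-negative i64 and a plain signed FILD already produces the
+ // unsigned result. The i64 path below instead needs the sign-based
+ // fudge-factor fixup that follows.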
+ SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
+ int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
+ Align SlotAlign(8);
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
+ if (SrcVT == MVT::i32) {
+ SDValue OffsetSlot =
+ DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
+ SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
+ SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
+ OffsetSlot, MPI.getWithOffset(4), SlotAlign);
+ std::pair<SDValue, SDValue> Tmp =
+ BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
+ if (IsStrict)
+ return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
+
+ return Tmp.first;
+ }
+
+ assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
+ SDValue ValueToStore = Src;
+ if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
+ // Bitcasting to f64 here allows us to do a single 64-bit store from
+ // an SSE register, avoiding the store forwarding penalty that would come
+ // with two 32-bit stores.
+ ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
+ }
+ SDValue Store =
+ DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
+ // For i64 source, we need to add the appropriate power of 2 if the input
+ // was negative. We must be careful to do the computation in x87 extended
+ // precision, not in SSE.
+ SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
+ SDValue Ops[] = { Store, StackSlot };
+ SDValue Fild =
+ DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
+ SlotAlign, MachineMemOperand::MOLoad);
+ Chain = Fild.getValue(1);
+
+ // Check whether the sign bit is set.
+ SDValue SignSet = DAG.getSetCC(
+ dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
+ Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
+
+ // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
+ APInt FF(64, 0x5F80000000000000ULL);
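+ // 0x5F800000 is the f32 bit pattern of 2^64; the low word is +0.0f. If the
+ // input's sign bit was set, FILD interpreted it as a negative signed value,
+ // so adding 2^64 below compensates for the 2^64 difference.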
+ SDValue FudgePtr = DAG.getConstantPool(
+ ConstantInt::get(*DAG.getContext(), FF), PtrVT);
+ Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
+
+ // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
+ SDValue Zero = DAG.getIntPtrConstant(0, dl);
+ SDValue Four = DAG.getIntPtrConstant(4, dl);
+ SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
+ FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
+
+ // Load the value out, extending it from f32 to f80.
+ SDValue Fudge = DAG.getExtLoad(
+ ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
+ CPAlignment);
+ Chain = Fudge.getValue(1);
+ // Extend everything to 80 bits to force it to be done on x87.
+ // TODO: Are there any fast-math-flags to propagate here?
+ if (IsStrict) {
+ SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
+ {Chain, Fild, Fudge});
+ // STRICT_FP_ROUND can't handle equal types.
+ if (DstVT == MVT::f80)
+ return Add;
+ return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
+ {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
+ }
+ SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
+ return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
+ DAG.getIntPtrConstant(0, dl));
+}
+
+// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
+// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
+// just return an SDValue().
+// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
+// to i16, i32 or i64, and we lower it to a legal sequence and return the
+// result.
+SDValue
+X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
+ bool IsSigned, SDValue &Chain) const {
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDLoc DL(Op);
+
+ EVT DstTy = Op.getValueType();
+ SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
+ EVT TheVT = Value.getValueType();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+ if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
+ // f16 must be promoted before using the lowering in this routine.
+ // fp128 does not use this lowering.
+ return SDValue();
+ }
+
+ // If using FIST to compute an unsigned i64, we'll need some fixup
+ // to handle values above the maximum signed i64. A FIST is always
+ // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
+ bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
+
+ // FIXME: This does not generate an invalid exception if the input does not
+ // fit in i32. PR44019
+ if (!IsSigned && DstTy != MVT::i64) {
+ // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
+ // The low 32 bits of the fist result will have the correct uint32 result.
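+ // (Inputs in [2^31, 2^32) still fit in a signed i64, and the low 32 bits of
+ // that i64 are exactly the desired uint32 value.)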
+ assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
+ DstTy = MVT::i64;
+ }
+
+ assert(DstTy.getSimpleVT() <= MVT::i64 &&
+ DstTy.getSimpleVT() >= MVT::i16 &&
+ "Unknown FP_TO_INT to lower!");
+
+ // We lower FP->int64 into FISTP64 followed by a load from a temporary
+ // stack slot.
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned MemSize = DstTy.getStoreSize();
+ int SSFI =
+ MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
+
+ Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
+
+ SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
+
+ if (UnsignedFixup) {
+ //
+ // Conversion to unsigned i64 is implemented with a select,
+ // depending on whether the source value fits in the range
+ // of a signed i64. Let Thresh be the FP equivalent of
+ // 0x8000000000000000ULL.
+ //
+ // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
+ // FltOfs = (Value >= Thresh) ? Thresh : 0.0;
+ // FistSrc = (Value - FltOfs);
+ // Fist-to-mem64 FistSrc
+ // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
+ // to XOR'ing the high 32 bits with Adjust.
+ //
+ // Being a power of 2, Thresh is exactly representable in all FP formats.
+ // For X87 we'd like to use the smallest FP type for this constant, but
+ // for DAG type consistency we have to match the FP operand type.
+
+ APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
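+ // 0x5f000000 is the f32 bit pattern of 2^63, the FP equivalent of
+ // 0x8000000000000000ULL mentioned above.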
+ LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
+ bool LosesInfo = false;
+ if (TheVT == MVT::f64)
+ // The rounding mode is irrelevant as the conversion should be exact.
+ Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
+ &LosesInfo);
+ else if (TheVT == MVT::f80)
+ Status = Thresh.convert(APFloat::x87DoubleExtended(),
+ APFloat::rmNearestTiesToEven, &LosesInfo);
+
+ assert(Status == APFloat::opOK && !LosesInfo &&
+ "FP conversion should have been exact");
+
+ SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
+
+ EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), TheVT);
+ SDValue Cmp;
+ if (IsStrict) {
+ Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
+ /*IsSignaling*/ true);
+ Chain = Cmp.getValue(1);
+ } else {
+ Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
+ }
+
+ // Our preferred lowering of
+ //
+ // (Value >= Thresh) ? 0x8000000000000000ULL : 0
+ //
+ // is
+ //
+ // (Value >= Thresh) << 63
+ //
+ // but since we can get here after LegalOperations, DAGCombine might do the
+ // wrong thing if we create a select. So, directly create the preferred
+ // version.
+ SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
+ SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
+ Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
+
+ SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
+ DAG.getConstantFP(0.0, DL, TheVT));
+
+ if (IsStrict) {
+ Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
+ { Chain, Value, FltOfs });
+ Chain = Value.getValue(1);
+ } else
+ Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
+ }
+
+ MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
+
+ // FIXME: This causes a redundant load/store if the SSE-class value is already
+ // in memory, such as when it is on the call stack.
+ if (isScalarFPTypeInSSEReg(TheVT)) {
+ assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
+ Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
+ SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
+ SDValue Ops[] = { Chain, StackSlot };
+
+ unsigned FLDSize = TheVT.getStoreSize();
+ assert(FLDSize <= MemSize && "Stack slot not big enough");
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
+ Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
+ Chain = Value.getValue(1);
+ }
+
+ // Build the FP_TO_INT*_IN_MEM
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
+ SDValue Ops[] = { Chain, Value, StackSlot };
+ SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
+ DAG.getVTList(MVT::Other),
+ Ops, DstTy, MMO);
+
+ SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
+ Chain = Res.getValue(1);
+
+ // If we need an unsigned fixup, XOR the result with adjust.
+ if (UnsignedFixup)
+ Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
+
+ return Res;
+}
+
+static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+ SDLoc dl(Op);
+ unsigned Opc = Op.getOpcode();
+
+ assert(VT.isVector() && InVT.isVector() && "Expected vector type");
+ assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
+ "Unexpected extension opcode");
+ assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
+ "Expected same number of elements");
+ assert((VT.getVectorElementType() == MVT::i16 ||
+ VT.getVectorElementType() == MVT::i32 ||
+ VT.getVectorElementType() == MVT::i64) &&
+ "Unexpected element type");
+ assert((InVT.getVectorElementType() == MVT::i8 ||
+ InVT.getVectorElementType() == MVT::i16 ||
+ InVT.getVectorElementType() == MVT::i32) &&
+ "Unexpected element type");
+
+ unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
+
+ if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
+ assert(InVT == MVT::v32i8 && "Unexpected VT!");
+ return splitVectorIntUnary(Op, DAG);
+ }
+
+ if (Subtarget.hasInt256())
+ return Op;
+
+ // Optimize vectors in AVX mode:
+ //
+ // v8i16 -> v8i32
+ // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
+ // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
+ // Concat upper and lower parts.
+ //
+ // v4i32 -> v4i64
+ // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
+ // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
+ // Concat upper and lower parts.
+ //
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
+ SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
+
+ // Short-circuit if we can determine that each 128-bit half is the same value.
+ // Otherwise, this is difficult to match and optimize.
+ if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
+ if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
+
+ SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
+ SDValue Undef = DAG.getUNDEF(InVT);
+ bool NeedZero = Opc == ISD::ZERO_EXTEND;
+ SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
+ OpHi = DAG.getBitcast(HalfVT, OpHi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+}
+
+// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
+static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
+ const SDLoc &dl, SelectionDAG &DAG) {
+ assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
+ DAG.getIntPtrConstant(0, dl));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
+ DAG.getIntPtrConstant(8, dl));
+ Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
+ Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+}
+
+static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op->getSimpleValueType(0);
+ SDValue In = Op->getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+ assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
+ SDLoc DL(Op);
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // For all vectors except vXi8, we can just emit a sign_extend and a shift.
+ // This avoids a constant pool load.
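+ // (A sign_extend of the i1 mask gives 0 or all-ones, and the logical shift
+ // right by BitWidth-1 turns that into 0 or 1, i.e. the zero-extended value.)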
+ if (VT.getVectorElementType() != MVT::i8) {
+ SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
+ return DAG.getNode(ISD::SRL, DL, VT, Extend,
+ DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
+ }
+
+ // Extend VT if BWI is not supported.
+ MVT ExtVT = VT;
+ if (!Subtarget.hasBWI()) {
+ // If v16i32 is to be avoided, we'll need to split and concatenate.
+ if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
+ return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
+
+ ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
+ }
+
+ // Widen to 512-bits if VLX is not supported.
+ MVT WideVT = ExtVT;
+ if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
+ NumElts *= 512 / ExtVT.getSizeInBits();
+ InVT = MVT::getVectorVT(MVT::i1, NumElts);
+ In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
+ In, DAG.getIntPtrConstant(0, DL));
+ WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
+ NumElts);
+ }
+
+ SDValue One = DAG.getConstant(1, DL, WideVT);
+ SDValue Zero = DAG.getConstant(0, DL, WideVT);
+
+ SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
+
+ // Truncate if we had to extend above.
+ if (VT != ExtVT) {
+ WideVT = MVT::getVectorVT(MVT::i8, NumElts);
+ SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
+ }
+
+ // Extract back to 128/256-bit if we widened.
+ if (WideVT != VT)
+ SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
+ DAG.getIntPtrConstant(0, DL));
+
+ return SelectedVal;
+}
+
+static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue In = Op.getOperand(0);
+ MVT SVT = In.getSimpleValueType();
+
+ if (SVT.getVectorElementType() == MVT::i1)
+ return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
+
+ assert(Subtarget.hasAVX() && "Expected AVX support");
+ return LowerAVXExtend(Op, DAG, Subtarget);
+}
+
+/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
+/// It makes use of the fact that vectors with enough leading sign/zero bits
+/// prevent the PACKSS/PACKUS from saturating the results.
+/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
+/// within each 128-bit lane.
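+ /// For example, PACKSSDW clamps each i32 to [-32768, 32767] and PACKUSWB
+ /// clamps each i16 to [0, 255], so they only implement a plain truncate when
+ /// the source already sign/zero-extends from the narrower width.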
+static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
+ const SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
+ "Unexpected PACK opcode");
+ assert(DstVT.isVector() && "VT not a vector?");
+
+ // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ EVT SrcVT = In.getValueType();
+
+ // No truncation required, we might get here due to recursive calls.
+ if (SrcVT == DstVT)
+ return In;
+
+ // We only support vector truncation to 64 bits or greater from a
+ // source of 128 bits or greater.
+ unsigned DstSizeInBits = DstVT.getSizeInBits();
+ unsigned SrcSizeInBits = SrcVT.getSizeInBits();
+ if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
+ return SDValue();
+
+ unsigned NumElems = SrcVT.getVectorNumElements();
+ if (!isPowerOf2_32(NumElems))
+ return SDValue();
+
+ LLVMContext &Ctx = *DAG.getContext();
+ assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
+ assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
+
+ EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
+
+ // Pack to the largest type possible:
+ // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
+ EVT InVT = MVT::i16, OutVT = MVT::i8;
+ if (SrcVT.getScalarSizeInBits() > 16 &&
+ (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
+ InVT = MVT::i32;
+ OutVT = MVT::i16;
+ }
+
+ // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
+ if (SrcVT.is128BitVector()) {
+ InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
+ OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
+ In = DAG.getBitcast(InVT, In);
+ SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
+ Res = extractSubVector(Res, 0, DAG, DL, 64);
+ return DAG.getBitcast(DstVT, Res);
+ }
+
+ // Split lower/upper subvectors.
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = splitVector(In, DAG, DL);
+
+ unsigned SubSizeInBits = SrcSizeInBits / 2;
+ InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
+ OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
+
+ // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
+ if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
+ Lo = DAG.getBitcast(InVT, Lo);
+ Hi = DAG.getBitcast(InVT, Hi);
+ SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
+ return DAG.getBitcast(DstVT, Res);
+ }
+
+ // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
+ // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
+ if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
+ Lo = DAG.getBitcast(InVT, Lo);
+ Hi = DAG.getBitcast(InVT, Hi);
+ SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
+
+ // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
+ // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
+ // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
+ SmallVector<int, 64> Mask;
+ int Scale = 64 / OutVT.getScalarSizeInBits();
+ narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
+ Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
+
+ if (DstVT.is256BitVector())
+ return DAG.getBitcast(DstVT, Res);
+
+ // If 512bit -> 128bit truncate another stage.
+ EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
+ Res = DAG.getBitcast(PackedVT, Res);
+ return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
+ }
+
+ // Recursively pack lower/upper subvectors, concat result and pack again.
+ assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
+ EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
+ Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
+ Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
+
+ PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
+ return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
+}
+
+static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+
+ assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
+
+ // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
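+ // VPMOVB2M/VPMOVW2M copy the sign bit of each element into the mask
+ // register, which is why the truncation bit has to be moved into the sign
+ // position first.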
+ unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
+ if (InVT.getScalarSizeInBits() <= 16) {
+ if (Subtarget.hasBWI()) {
+ // legal, will go to VPMOVB2M, VPMOVW2M
+ if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
+ // We need to shift to get the lsb into the sign position.
+ // Shifting packed bytes is not supported natively, so bitcast to words.
+ MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
+ In = DAG.getNode(ISD::SHL, DL, ExtVT,
+ DAG.getBitcast(ExtVT, In),
+ DAG.getConstant(ShiftInx, DL, ExtVT));
+ In = DAG.getBitcast(InVT, In);
+ }
+ return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
+ In, ISD::SETGT);
+ }
+ // Use TESTD/Q, extending the vector to packed dword/qword.
+ assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
+ "Unexpected vector type.");
+ unsigned NumElts = InVT.getVectorNumElements();
+ assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
+ // We need to change to a wider element type that we have support for.
+ // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
+ // For 16 element vectors we extend to v16i32 unless we are explicitly
+ // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
+ // we need to split into two 8 element vectors which we can extend to v8i32,
+ // truncate and concat the results. There's an additional complication if
+ // the original type is v16i8. In that case we can't split the v16i8
+ // directly, so we need to shuffle high elements to low and use
+ // sign_extend_vector_inreg.
+ if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
+ SDValue Lo, Hi;
+ if (InVT == MVT::v16i8) {
+ Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
+ Hi = DAG.getVectorShuffle(
+ InVT, DL, In, In,
+ {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
+ Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
+ } else {
+ assert(InVT == MVT::v16i16 && "Unexpected VT!");
+ Lo = extract128BitVector(In, 0, DAG, DL);
+ Hi = extract128BitVector(In, 8, DAG, DL);
+ }
+ // We're split now, just emit two truncates and a concat. The two
+ // truncates will trigger legalization to come back to this function.
+ Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
+ Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+ }
+ // We either have 8 elements or we're allowed to use 512-bit vectors.
+ // If we have VLX, we want to use the narrowest vector that can get the
+ // job done so we use vXi32.
+ MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
+ MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
+ In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
+ InVT = ExtVT;
+ ShiftInx = InVT.getScalarSizeInBits() - 1;
+ }
+
+ if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
+ // We need to shift to get the lsb into sign position.
+ In = DAG.getNode(ISD::SHL, DL, InVT, In,
+ DAG.getConstant(ShiftInx, DL, InVT));
+ }
+ // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
+ if (Subtarget.hasDQI())
+ return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
+ return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
+}
+
+SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+ unsigned InNumEltBits = InVT.getScalarSizeInBits();
+
+ assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
+ "Invalid TRUNCATE operation");
+
+ // If we're called by the type legalizer, handle a few cases.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isTypeLegal(InVT)) {
+ if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
+ VT.is128BitVector()) {
+ assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
+ "Unexpected subtarget!");
+ // The default behavior is to truncate one step, concatenate, and then
+ // truncate the remainder. We'd rather produce two 64-bit results and
+ // concatenate those.
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+
+ Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
+ Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+ }
+
+ // Otherwise let default legalization handle it.
+ return SDValue();
+ }
+
+ if (VT.getVectorElementType() == MVT::i1)
+ return LowerTruncateVecI1(Op, DAG, Subtarget);
+
+ // vpmovqb/w/d, vpmovdb/w, vpmovwb
+ if (Subtarget.hasAVX512()) {
+ if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
+ assert(VT == MVT::v32i8 && "Unexpected VT!");
+ return splitVectorIntUnary(Op, DAG);
+ }
+
+ // Word to byte truncation is only legal with BWI. Otherwise we have to
+ // promote to v16i32 and then truncate that. But we should only do that if we
+ // haven't been asked to avoid 512-bit vectors. The actual promotion to
+ // v16i32 will be handled by isel patterns.
+ if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
+ Subtarget.canExtendTo512DQ())
+ return Op;
+ }
+
+ unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
+ unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
+
+ // Truncate with PACKUS if we are truncating a vector with leading zero bits
+ // that extend all the way to the packed/truncated value.
+ // Pre-SSE41 we can only use PACKUSWB.
+ KnownBits Known = DAG.computeKnownBits(In);
+ if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
+ if (SDValue V =
+ truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
+ return V;
+
+ // Truncate with PACKSS if we are truncating a vector with sign-bits that
+ // extend all the way to the packed/truncated value.
+ if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
+ if (SDValue V =
+ truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
+ return V;
+
+ // Handle truncation of V256 to V128 using shuffles.
+ assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
+
+ if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
+ In = DAG.getBitcast(MVT::v8i32, In);
+
+ // On AVX2, v4i64 -> v4i32 becomes VPERMD.
+ if (Subtarget.hasInt256()) {
+ static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
+ In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+ DAG.getIntPtrConstant(4, DL));
+ static const int ShufMask[] = {0, 2, 4, 6};
+ return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
+ }
+
+ if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
+ In = DAG.getBitcast(MVT::v32i8, In);
+
+ // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
+ if (Subtarget.hasInt256()) {
+ // The PSHUFB mask:
+ static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ 16, 17, 20, 21, 24, 25, 28, 29,
+ -1, -1, -1, -1, -1, -1, -1, -1 };
+ In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
+ In = DAG.getBitcast(MVT::v4i64, In);
+
+ static const int ShufMask2[] = {0, 2, -1, -1};
+ In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16,
+ DAG.getBitcast(MVT::v16i16, In),
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
+ DAG.getIntPtrConstant(16, DL));
+
+ // The PSHUFB mask:
+ static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
+ -1, -1, -1, -1, -1, -1, -1, -1};
+
+ OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
+ OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
+
+ OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
+ OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
+
+ // The MOVLHPS Mask:
+ static const int ShufMask2[] = {0, 1, 4, 5};
+ SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
+ return DAG.getBitcast(MVT::v8i16, res);
+ }
+
+ if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
+ // Use an AND to zero the upper bits for PACKUS.
+ In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
+
+ SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
+ DAG.getIntPtrConstant(8, DL));
+ return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
+ }
+
+ llvm_unreachable("All 256->128 cases should have been handled above!");
+}
+
+SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
+ bool IsStrict = Op->isStrictFPOpcode();
+ bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
+ Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
+ MVT VT = Op->getSimpleValueType(0);
+ SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
+ MVT SrcVT = Src.getSimpleValueType();
+ SDLoc dl(Op);
+
+ if (VT.isVector()) {
+ if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
+ MVT ResVT = MVT::v4i32;
+ MVT TruncVT = MVT::v4i1;
+ unsigned Opc;
+ if (IsStrict)
+ Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
+ else
+ Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
+
+ if (!IsSigned && !Subtarget.hasVLX()) {
+ assert(Subtarget.useAVX512Regs() && "Unexpected features!");
+ // Widen to 512-bits.
+ ResVT = MVT::v8i32;
+ TruncVT = MVT::v8i1;
+ Opc = Op.getOpcode();
+ // Need to concat with zero vector for strict fp to avoid spurious
+ // exceptions.
+ // TODO: Should we just do this for non-strict as well?
+ SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
+ : DAG.getUNDEF(MVT::v8f64);
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
+ DAG.getIntPtrConstant(0, dl));
+ }
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res =
+ DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(Opc, dl, ResVT, Src);
+ }
+
+ Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
+ DAG.getIntPtrConstant(0, dl));
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+ return Res;
+ }
+
+ // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
+ if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
+ assert(!IsSigned && "Expected unsigned conversion!");
+ assert(Subtarget.useAVX512Regs() && "Requires avx512f");
+ return Op;
+ }
+
+ // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
+ if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
+ (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32)) {
+ assert(!IsSigned && "Expected unsigned conversion!");
+ assert(Subtarget.useAVX512Regs() && !Subtarget.hasVLX() &&
+ "Unexpected features!");
+ MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
+ MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
+ // Need to concat with zero vector for strict fp to avoid spurious
+ // exceptions.
+ // TODO: Should we just do this for non-strict as well?
+ SDValue Tmp =
+ IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
+ DAG.getIntPtrConstant(0, dl));
+
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
+ {Op->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
+ DAG.getIntPtrConstant(0, dl));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+ return Res;
+ }
+
+ // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
+ if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
+ (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32)) {
+ assert(Subtarget.useAVX512Regs() && Subtarget.hasDQI() &&
+ !Subtarget.hasVLX() && "Unexpected features!");
+ MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
+ // Need to concat with zero vector for strict fp to avoid spurious
+ // exceptions.
+ // TODO: Should we just do this for non-strict as well?
+ SDValue Tmp =
+ IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
+ DAG.getIntPtrConstant(0, dl));
+
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
+ {Op->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
+ DAG.getIntPtrConstant(0, dl));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+ return Res;
+ }
+
+ if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
+ if (!Subtarget.hasVLX()) {
+ // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
+ // legalizer and then widened again by vector op legalization.
+ if (!IsStrict)
+ return SDValue();
+
+ SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
+ SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
+ {Src, Zero, Zero, Zero});
+ Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
+ {Op->getOperand(0), Tmp});
+ SDValue Chain = Tmp.getValue(1);
+ Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
+ DAG.getIntPtrConstant(0, dl));
+ if (IsStrict)
+ return DAG.getMergeValues({Tmp, Chain}, dl);
+ return Tmp;
+ }
+
+ assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
+ SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
+ DAG.getUNDEF(MVT::v2f32));
+ if (IsStrict) {
+ unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
+ : X86ISD::STRICT_CVTTP2UI;
+ return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
+ }
+ unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
+ return DAG.getNode(Opc, dl, VT, Tmp);
+ }
+
+ return SDValue();
+ }
+
+ assert(!VT.isVector());
+
+ bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
+
+ if (!IsSigned && UseSSEReg) {
+ // Conversions from f32/f64 with AVX512 should be legal.
+ if (Subtarget.hasAVX512())
+ return Op;
+
+ // Use default expansion for i64.
+ if (VT == MVT::i64)
+ return SDValue();
+
+ assert(VT == MVT::i32 && "Unexpected VT!");
+
+ // Promote i32 to i64 and use a signed operation on 64-bit targets.
+ // FIXME: This does not generate an invalid exception if the input does not
+ // fit in i32. PR44019
+ if (Subtarget.is64Bit()) {
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other},
+ { Op.getOperand(0), Src });
+ Chain = Res.getValue(1);
+ } else
+ Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
+
+ Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ if (IsStrict)
+ return DAG.getMergeValues({ Res, Chain }, dl);
+ return Res;
+ }
+
+ // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
+ // use fisttp which will be handled later.
+ if (!Subtarget.hasSSE3())
+ return SDValue();
+ }
+
+ // Promote i16 to i32 if we can use a SSE operation or the type is f128.
+ // FIXME: This does not generate an invalid exception if the input does not
+ // fit in i16. PR44019
+ if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
+ assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other},
+ { Op.getOperand(0), Src });
+ Chain = Res.getValue(1);
+ } else
+ Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
+
+ Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ if (IsStrict)
+ return DAG.getMergeValues({ Res, Chain }, dl);
+ return Res;
+ }
+
+ // If this is a FP_TO_SINT using SSEReg we're done.
+ if (UseSSEReg && IsSigned)
+ return Op;
+
+ // fp128 needs to use a libcall.
+ if (SrcVT == MVT::f128) {
+ RTLIB::Libcall LC;
+ if (IsSigned)
+ LC = RTLIB::getFPTOSINT(SrcVT, VT);
+ else
+ LC = RTLIB::getFPTOUINT(SrcVT, VT);
+
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ MakeLibCallOptions CallOptions;
+ std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
+ SDLoc(Op), Chain);
+
+ if (IsStrict)
+ return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
+
+ return Tmp.first;
+ }
+
+ // Fall back to X87.
+ SDValue Chain;
+ if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
+ if (IsStrict)
+ return DAG.getMergeValues({V, Chain}, dl);
+ return V;
+ }
+
+ llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
+}
+
+SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+
+ // If the source is in an SSE register, the node is Legal.
+ if (isScalarFPTypeInSSEReg(SrcVT))
+ return Op;
+
+ return LRINT_LLRINTHelper(Op.getNode(), DAG);
+}
+
+SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
+ SelectionDAG &DAG) const {
+ EVT DstVT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
+ // f16 must be promoted before using the lowering in this routine.
+ // fp128 does not use this lowering.
+ return SDValue();
+ }
+
+ SDLoc DL(N);
+ SDValue Chain = DAG.getEntryNode();
+
+ bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
+
+ // If we're converting from SSE, the stack slot needs to hold both types.
+ // Otherwise it only needs to hold the DstVT.
+ EVT OtherVT = UseSSE ? SrcVT : DstVT;
+ SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+
+ if (UseSSE) {
+ assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
+ Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
+ SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
+ SDValue Ops[] = { Chain, StackPtr };
+
+ Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
+ /*Align*/ None, MachineMemOperand::MOLoad);
+ Chain = Src.getValue(1);
+ }
+
+ SDValue StoreOps[] = { Chain, Src, StackPtr };
+ Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
+ StoreOps, DstVT, MPI, /*Align*/ None,
+ MachineMemOperand::MOStore);
+
+ return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
+}
+
+SDValue
+X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
+ // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
+ // but making use of X86 specifics to produce better instruction sequences.
+ SDNode *Node = Op.getNode();
+ bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
+ unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
+ SDLoc dl(SDValue(Node, 0));
+ SDValue Src = Node->getOperand(0);
+
+ // There are three types involved here: SrcVT is the source floating point
+ // type, DstVT is the type of the result, and TmpVT is the result of the
+ // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
+ // DstVT).
+ EVT SrcVT = Src.getValueType();
+ EVT DstVT = Node->getValueType(0);
+ EVT TmpVT = DstVT;
+
+ // This code is only for floats and doubles. Fall back to generic code for
+ // anything else.
+ if (!isScalarFPTypeInSSEReg(SrcVT))
+ return SDValue();
+
+ unsigned SatWidth = Node->getConstantOperandVal(1);
+ unsigned DstWidth = DstVT.getScalarSizeInBits();
+ unsigned TmpWidth = TmpVT.getScalarSizeInBits();
+ assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
+ "Expected saturation width smaller than result width");
+
+ // Promote result of FP_TO_*INT to at least 32 bits.
+ if (TmpWidth < 32) {
+ TmpVT = MVT::i32;
+ TmpWidth = 32;
+ }
+
+ // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
+ // us to use a native signed conversion instead.
+ if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
+ TmpVT = MVT::i64;
+ TmpWidth = 64;
+ }
+
+ // If the saturation width is smaller than the size of the temporary result,
+ // we can always use signed conversion, which is native.
+ if (SatWidth < TmpWidth)
+ FpToIntOpcode = ISD::FP_TO_SINT;
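+ // For example (illustrative): a saturating unsigned f32 -> i16 conversion has
+ // SatWidth == 16 and a TmpVT promoted to i32, so the native signed conversion
+ // (cvttss2si) can be used; the clamping below keeps the result in range.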
+
+ // Determine minimum and maximum integer values and their corresponding
+ // floating-point values.
+ APInt MinInt, MaxInt;
+ if (IsSigned) {
+ MinInt = APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth);
+ MaxInt = APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth);
+ } else {
+ MinInt = APInt::getMinValue(SatWidth).zextOrSelf(DstWidth);
+ MaxInt = APInt::getMaxValue(SatWidth).zextOrSelf(DstWidth);
+ }
+
+ APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
+ APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
+
+ APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
+ MinInt, IsSigned, APFloat::rmTowardZero);
+ APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
+ MaxInt, IsSigned, APFloat::rmTowardZero);
+ bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
+ && !(MaxStatus & APFloat::opStatus::opInexact);
+
+ SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
+ SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
+
+ // If the integer bounds are exactly representable as floats, emit a
+ // min+max+fptoi sequence. Otherwise use comparisons and selects.
+ if (AreExactFloatBounds) {
+ if (DstVT != TmpVT) {
+ // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
+ SDValue MinClamped = DAG.getNode(
+ X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
+ // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
+ SDValue BothClamped = DAG.getNode(
+ X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
+ // Convert clamped value to integer.
+ SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
+
+ // NaN will become INDVAL, with the top bit set and the rest zero.
+ // Truncation will discard the top bit, resulting in zero.
+ return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
+ }
+
+ // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
+ SDValue MinClamped = DAG.getNode(
+ X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
+ // Clamp by MaxFloat from above. NaN cannot occur.
+ SDValue BothClamped = DAG.getNode(
+ X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
+ // Convert clamped value to integer.
+ SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
+
+ if (!IsSigned) {
+ // In the unsigned case we're done, because we mapped NaN to MinFloat,
+ // which is zero.
+ return FpToInt;
+ }
+
+ // Otherwise, select zero if Src is NaN.
+ SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
+ return DAG.getSelectCC(
+ dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
+ }
+
+ SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
+ SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
+
+ // Result of direct conversion, which may be selected away.
+ SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
+
+ if (DstVT != TmpVT) {
+ // NaN will become INDVAL, with the top bit set and the rest zero.
+ // Truncation will discard the top bit, resulting in zero.
+ FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
+ }
+
+ SDValue Select = FpToInt;
+ // For signed conversions where we saturate to the same size as the
+ // result type of the fptoi instructions, INDVAL coincides with integer
+ // minimum, so we don't need to explicitly check it.
+ if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
+ // If Src ULT MinFloat, select MinInt. In particular, this also selects
+ // MinInt if Src is NaN.
+ Select = DAG.getSelectCC(
+ dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
+ }
+
+ // If Src OGT MaxFloat, select MaxInt.
+ Select = DAG.getSelectCC(
+ dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
+
+ // In the unsigned case we are done, because we mapped NaN to MinInt, which
+ // is already zero. The promoted case was already handled above.
+ if (!IsSigned || DstVT != TmpVT) {
+ return Select;
+ }
+
+ // Otherwise, select 0 if Src is NaN.
+ SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
+ return DAG.getSelectCC(
+ dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
+}
+
+SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
+ bool IsStrict = Op->isStrictFPOpcode();
+
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(IsStrict ? 1 : 0);
+ MVT SVT = In.getSimpleValueType();
+
+ if (VT == MVT::f128)
+ return SDValue();
+
+ assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
+
+ SDValue Res =
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
+ if (IsStrict)
+ return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
+ {Op->getOperand(0), Res});
+ return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
+}
+
+SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue In = Op.getOperand(IsStrict ? 1 : 0);
+ // It's legal except when f128 is involved
+ if (In.getSimpleValueType() != MVT::f128)
+ return Op;
+
+ return SDValue();
+}
+
+static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
+ assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
+ "Unexpected VT!");
+
+ SDLoc dl(Op);
+ SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
+ DAG.getConstant(0, dl, MVT::v8i16), Src,
+ DAG.getIntPtrConstant(0, dl));
+
+ SDValue Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
+ {Op.getOperand(0), Res});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
+ DAG.getIntPtrConstant(0, dl));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+
+ return Res;
+}
+
+static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
+ assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
+ "Unexpected VT!");
+
+ SDLoc dl(Op);
+ SDValue Res, Chain;
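+ // Note: the immediate 4 passed to (STRICT_)CVTPS2PH below sets bit 2 of the
+ // rounding-control field, selecting rounding according to MXCSR.RC rather
+ // than a fixed rounding mode.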
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
+ DAG.getConstantFP(0, dl, MVT::v4f32), Src,
+ DAG.getIntPtrConstant(0, dl));
+ Res = DAG.getNode(
+ X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
+ {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
+ Chain = Res.getValue(1);
+ } else {
+ // FIXME: Should we use zeros for upper elements for non-strict?
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
+ Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
+ DAG.getTargetConstant(4, dl, MVT::i32));
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
+ DAG.getIntPtrConstant(0, dl));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+
+ return Res;
+}
+
+/// Depending on uarch and/or optimizing for size, we might prefer to use a
+/// vector operation in place of the typical scalar operation.
+static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // If both operands have other uses, this is probably not profitable.
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ if (!LHS.hasOneUse() && !RHS.hasOneUse())
+ return Op;
+
+ // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
+ bool IsFP = Op.getSimpleValueType().isFloatingPoint();
+ if (IsFP && !Subtarget.hasSSE3())
+ return Op;
+ if (!IsFP && !Subtarget.hasSSSE3())
+ return Op;
+
+ // Extract from a common vector.
+ if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ LHS.getOperand(0) != RHS.getOperand(0) ||
+ !isa<ConstantSDNode>(LHS.getOperand(1)) ||
+ !isa<ConstantSDNode>(RHS.getOperand(1)) ||
+ !shouldUseHorizontalOp(true, DAG, Subtarget))
+ return Op;
+
+ // Allow commuted 'hadd' ops.
+ // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
+ unsigned HOpcode;
+ switch (Op.getOpcode()) {
+ case ISD::ADD: HOpcode = X86ISD::HADD; break;
+ case ISD::SUB: HOpcode = X86ISD::HSUB; break;
+ case ISD::FADD: HOpcode = X86ISD::FHADD; break;
+ case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
+ default:
+ llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
+ }
+ unsigned LExtIndex = LHS.getConstantOperandVal(1);
+ unsigned RExtIndex = RHS.getConstantOperandVal(1);
+ if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
+ (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
+ std::swap(LExtIndex, RExtIndex);
+
+ if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
+ return Op;
+
+ SDValue X = LHS.getOperand(0);
+ EVT VecVT = X.getValueType();
+ unsigned BitWidth = VecVT.getSizeInBits();
+ unsigned NumLanes = BitWidth / 128;
+ unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
+ assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
+ "Not expecting illegal vector widths here");
+
+ // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
+ // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
+ SDLoc DL(Op);
+ if (BitWidth == 256 || BitWidth == 512) {
+ unsigned LaneIdx = LExtIndex / NumEltsPerLane;
+ X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
+ LExtIndex %= NumEltsPerLane;
+ }
+
+ // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
+ // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
+ // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
+ // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
+ SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
+ DAG.getIntPtrConstant(LExtIndex / 2, DL));
+}
+
+/// Depending on uarch and/or optimizing for size, we might prefer to use a
+/// vector operation in place of the typical scalar operation.
+SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
+ assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
+ "Only expecting float/double");
+ return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
+}
+
+/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
+/// This mode isn't supported in hardware on X86. But as long as we aren't
+/// compiling with trapping math, we can emulate this with
+/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
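+/// For example, FROUND(2.5) == 3.0 and FROUND(-2.5) == -3.0; adding a value
+/// just below 0.5 with the sign of X and then truncating toward zero
+/// reproduces that behaviour (illustrative, assuming default rounding).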
+static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
+ SDValue N0 = Op.getOperand(0);
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+
+ // N0 += copysign(nextafter(0.5, 0.0), N0)
+ const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
+ bool Ignored;
+ APFloat Point5Pred = APFloat(0.5f);
+ Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
+ Point5Pred.next(/*nextDown*/true);
+
+ SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
+ DAG.getConstantFP(Point5Pred, dl, VT), N0);
+ N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
+
+ // Truncate the result to remove fraction.
+ return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
+}
+
+/// The only differences between FABS and FNEG are the mask and the logic op.
+/// FNEG also has a folding opportunity for FNEG(FABS(x)).
+static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
+ assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
+ "Wrong opcode for lowering FABS or FNEG.");
+
+ bool IsFABS = (Op.getOpcode() == ISD::FABS);
+
+ // If this is a FABS and it has an FNEG user, bail out to fold the combination
+ // into an FNABS. We'll lower the FABS after that if it is still in use.
+ if (IsFABS)
+ for (SDNode *User : Op->uses())
+ if (User->getOpcode() == ISD::FNEG)
+ return Op;
+
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+
+ bool IsF128 = (VT == MVT::f128);
+ assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
+ VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
+ VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
+ "Unexpected type in LowerFABSorFNEG");
+
+ // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
+ // decide if we should generate a 16-byte constant mask when we only need 4 or
+ // 8 bytes for the scalar case.
+
+ // There are no scalar bitwise logical SSE/AVX instructions, so we
+ // generate a 16-byte vector constant and logic op even for the scalar case.
+ // Using a 16-byte mask allows folding the load of the mask with
+ // the logic op, so it can save (~4 bytes) on code size.
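+ // e.g. a scalar f32 fabs typically becomes an 'andps' against a splatted
+ // 0x7fffffff constant-pool mask, and fneg an 'xorps' against a splatted
+ // 0x80000000 mask.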
+ bool IsFakeVector = !VT.isVector() && !IsF128;
+ MVT LogicVT = VT;
+ if (IsFakeVector)
+ LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
+
+ unsigned EltBits = VT.getScalarSizeInBits();
+ // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
+ APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
+ APInt::getSignMask(EltBits);
+ const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
+ SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
+
+ SDValue Op0 = Op.getOperand(0);
+ bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
+ unsigned LogicOp = IsFABS ? X86ISD::FAND :
+ IsFNABS ? X86ISD::FOR :
+ X86ISD::FXOR;
+ SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
+
+ if (VT.isVector() || IsF128)
+ return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
+
+ // For the scalar case extend to a 128-bit vector, perform the logic op,
+ // and extract the scalar result back out.
+ Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
+ SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
+ DAG.getIntPtrConstant(0, dl));
+}
+
+static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
+ SDValue Mag = Op.getOperand(0);
+ SDValue Sign = Op.getOperand(1);
+ SDLoc dl(Op);
+
+ // If the sign operand is smaller, extend it first.
+ MVT VT = Op.getSimpleValueType();
+ if (Sign.getSimpleValueType().bitsLT(VT))
+ Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
+
+ // And if it is bigger, shrink it first.
+ if (Sign.getSimpleValueType().bitsGT(VT))
+ Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
+
+ // At this point the operands and the result should have the same
+ // type, and that won't be f80 since that is not custom lowered.
+ bool IsF128 = (VT == MVT::f128);
+ assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
+ VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
+ VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
+ "Unexpected type in LowerFCOPYSIGN");
+
+ const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
+
+ // Perform all scalar logic operations as 16-byte vectors because there are no
+ // scalar FP logic instructions in SSE.
+ // TODO: This isn't necessary. If we used scalar types, we might avoid some
+ // unnecessary splats, but we might miss load folding opportunities. Should
+ // this decision be based on OptimizeForSize?
+ bool IsFakeVector = !VT.isVector() && !IsF128;
+ MVT LogicVT = VT;
+ if (IsFakeVector)
+ LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
+
+ // The mask constants are automatically splatted for vector types.
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ SDValue SignMask = DAG.getConstantFP(
+ APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
+ SDValue MagMask = DAG.getConstantFP(
+ APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
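+ // Overall this computes (Mag & MagMask) | (Sign & SignMask), i.e. clear the
+ // sign bit of the magnitude and OR in the sign bit of the sign operand.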
+
+ // First, clear all bits but the sign bit from the second operand (sign).
+ if (IsFakeVector)
+ Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
+ SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
+
+ // Next, clear the sign bit from the first operand (magnitude).
+ // TODO: If we had general constant folding for FP logic ops, this check
+ // wouldn't be necessary.
+ SDValue MagBits;
+ if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
+ APFloat APF = Op0CN->getValueAPF();
+ APF.clearSign();
+ MagBits = DAG.getConstantFP(APF, dl, LogicVT);
+ } else {
+ // If the magnitude operand wasn't a constant, we need to AND out the sign.
+ if (IsFakeVector)
+ Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
+ MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
+ }
+
+ // OR the magnitude value with the sign bit.
+ SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
+ return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
+ DAG.getIntPtrConstant(0, dl));
+}
+
+static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
+ SDValue N0 = Op.getOperand(0);
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+
+ MVT OpVT = N0.getSimpleValueType();
+ assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
+ "Unexpected type for FGETSIGN");
+
+ // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
+ MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
+ SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
+ Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
+ Res = DAG.getZExtOrTrunc(Res, dl, VT);
+ Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
+ return Res;
+}
+
+/// Helper for creating a X86ISD::SETCC node.
+static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
+ SelectionDAG &DAG) {
+ return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
+}
+
+/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
+/// style scalarized (associative) reduction patterns. Partial reductions
+/// are supported when the pointer SrcMask is non-null.
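+/// For example, or (extractelt X, 0), (or (extractelt X, 1),
+/// (or (extractelt X, 2), (extractelt X, 3))) over a 4-element vector X
+/// matches with SrcOps == { X } once every element has been used.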
+/// TODO - move this to SelectionDAG?
+static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
+ SmallVectorImpl<SDValue> &SrcOps,
+ SmallVectorImpl<APInt> *SrcMask = nullptr) {
+ SmallVector<SDValue, 8> Opnds;
+ DenseMap<SDValue, APInt> SrcOpMap;
+ EVT VT = MVT::Other;
+
+ // Recognize a special case where a vector is cast into a wide integer to
+ // test all 0s.
+ assert(Op.getOpcode() == unsigned(BinOp) &&
+ "Unexpected bit reduction opcode");
+ Opnds.push_back(Op.getOperand(0));
+ Opnds.push_back(Op.getOperand(1));
+
+ for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
+ SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
+ // BFS traverse all BinOp operands.
+ if (I->getOpcode() == unsigned(BinOp)) {
+ Opnds.push_back(I->getOperand(0));
+ Opnds.push_back(I->getOperand(1));
+ // Re-evaluate the number of nodes to be traversed.
+ e += 2; // 2 more nodes (LHS and RHS) are pushed.
+ continue;
+ }
+
+ // Quit if this is not an EXTRACT_VECTOR_ELT.
+ if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return false;
+
+ // Quit if the index is not a constant.
+ auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
+ if (!Idx)
+ return false;
+
+ SDValue Src = I->getOperand(0);
+ DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
+ if (M == SrcOpMap.end()) {
+ VT = Src.getValueType();
+ // Quit if not the same type.
+ if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
+ return false;
+ unsigned NumElts = VT.getVectorNumElements();
+ APInt EltCount = APInt::getNullValue(NumElts);
+ M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
+ SrcOps.push_back(Src);
+ }
+
+ // Quit if element already used.
+ unsigned CIdx = Idx->getZExtValue();
+ if (M->second[CIdx])
+ return false;
+ M->second.setBit(CIdx);
+ }
+
+ if (SrcMask) {
+ // Collect the source partial masks.
+ for (SDValue &SrcOp : SrcOps)
+ SrcMask->push_back(SrcOpMap[SrcOp]);
+ } else {
+ // Quit if not all elements are used.
+ for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(),
+ E = SrcOpMap.end();
+ I != E; ++I) {
+ if (!I->second.isAllOnesValue())
+ return false;
+ }
+ }
+
+ return true;
+}
+
+// Helper function for comparing all bits of a vector against zero.
+static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
+ const APInt &Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG, X86::CondCode &X86CC) {
+ EVT VT = V.getValueType();
+ unsigned ScalarSize = VT.getScalarSizeInBits();
+ if (Mask.getBitWidth() != ScalarSize) {
+ assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
+ return SDValue();
+ }
+
+ assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
+ X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
+
+ auto MaskBits = [&](SDValue Src) {
+ if (Mask.isAllOnesValue())
+ return Src;
+ EVT SrcVT = Src.getValueType();
+ SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
+ return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
+ };
+
+ // For sub-128-bit vector, cast to (legal) integer and compare with zero.
+ if (VT.getSizeInBits() < 128) {
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
+ return SDValue();
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
+ DAG.getBitcast(IntVT, MaskBits(V)),
+ DAG.getConstant(0, DL, IntVT));
+ }
+
+ // Quit if not splittable to 128/256-bit vector.
+ if (!isPowerOf2_32(VT.getSizeInBits()))
+ return SDValue();
+
+ // Split down to 128/256-bit vector.
+ unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
+ while (VT.getSizeInBits() > TestSize) {
+ auto Split = DAG.SplitVector(V, DL);
+ VT = Split.first.getValueType();
+ V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
+ }
+
+ bool UsePTEST = Subtarget.hasSSE41();
+ if (UsePTEST) {
+ MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+ V = DAG.getBitcast(TestVT, MaskBits(V));
+ return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
+ }
+
+ // Without PTEST, a masked v2i64 or-reduction is not faster than
+ // scalarization.
+ if (!Mask.isAllOnesValue() && VT.getScalarSizeInBits() > 32)
+ return SDValue();
+
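+ // Otherwise compare every byte against zero with PCMPEQB, gather the per-byte
+ // results into a 16-bit mask with MOVMSK (pmovmskb), and check that the mask
+ // is 0xFFFF, i.e. that every byte compared equal to zero.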
+ V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
+ V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
+ getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
+ V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
+ DAG.getConstant(0xFFFF, DL, MVT::i32));
+}
+
+// Check whether an OR'd reduction tree is PTEST-able, or if we can fall back to
+// CMP(MOVMSK(PCMPEQB(X,0))).
+static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
+ const SDLoc &DL,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG, SDValue &X86CC) {
+ assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
+
+ if (!Subtarget.hasSSE2() || !Op->hasOneUse())
+ return SDValue();
+
+ // Check whether we're masking/truncating an OR-reduction result, in which
+ // case track the masked bits.
+ APInt Mask = APInt::getAllOnesValue(Op.getScalarValueSizeInBits());
+ switch (Op.getOpcode()) {
+ case ISD::TRUNCATE: {
+ SDValue Src = Op.getOperand(0);
+ Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
+ Op.getScalarValueSizeInBits());
+ Op = Src;
+ break;
+ }
+ case ISD::AND: {
+ if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ Mask = Cst->getAPIntValue();
+ Op = Op.getOperand(0);
+ }
+ break;
+ }
+ }
+
+ SmallVector<SDValue, 8> VecIns;
+ if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
+ EVT VT = VecIns[0].getValueType();
+ assert(llvm::all_of(VecIns,
+ [VT](SDValue V) { return VT == V.getValueType(); }) &&
+ "Reduction source vector mismatch");
+
+ // Quit if less than 128-bits or not splittable to 128/256-bit vector.
+ if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
+ return SDValue();
+
+ // If more than one full vector is evaluated, OR them first before PTEST.
+ for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
+ Slot += 2, e += 1) {
+ // Each iteration will OR 2 nodes and append the result until there is
+ // only 1 node left, i.e. the final OR'd value of all vectors.
+ SDValue LHS = VecIns[Slot];
+ SDValue RHS = VecIns[Slot + 1];
+ VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
+ }
+
+ X86::CondCode CCode;
+ if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
+ DAG, CCode)) {
+ X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
+ return V;
+ }
+ }
+
+ if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ ISD::NodeType BinOp;
+ if (SDValue Match =
+ DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
+ X86::CondCode CCode;
+ if (SDValue V =
+ LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
+ X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
+ return V;
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+/// Return true if \c Op has a use that doesn't just read flags.
+static bool hasNonFlagsUse(SDValue Op) {
+ for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
+ ++UI) {
+ SDNode *User = *UI;
+ unsigned UOpNo = UI.getOperandNo();
+ if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
+ // Look past the truncate.
+ UOpNo = User->use_begin().getOperandNo();
+ User = *User->use_begin();
+ }
+
+ if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
+ !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
+ return true;
+ }
+ return false;
+}
+
+// Transform to an x86-specific ALU node with flags if there is a chance of
+// using an RMW op or only the flags are used. Otherwise, leave
+// the node alone and emit a 'cmp' or 'test' instruction.
+static bool isProfitableToUseFlagOp(SDValue Op) {
+ for (SDNode *U : Op->uses())
+ if (U->getOpcode() != ISD::CopyToReg &&
+ U->getOpcode() != ISD::SETCC &&
+ U->getOpcode() != ISD::STORE)
+ return false;
+
+ return true;
+}
+
+/// Emit nodes that will be selected as "test Op0,Op0", or something
+/// equivalent.
+static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+ // CF and OF aren't always set the way we want. Determine which
+ // of these we need.
+ bool NeedCF = false;
+ bool NeedOF = false;
+ switch (X86CC) {
+ default: break;
+ case X86::COND_A: case X86::COND_AE:
+ case X86::COND_B: case X86::COND_BE:
+ NeedCF = true;
+ break;
+ case X86::COND_G: case X86::COND_GE:
+ case X86::COND_L: case X86::COND_LE:
+ case X86::COND_O: case X86::COND_NO: {
+ // Check if we really need to set the Overflow flag. If NoSignedWrap is
+ // present, it is not actually needed.
+ switch (Op->getOpcode()) {
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::SHL:
+ if (Op.getNode()->getFlags().hasNoSignedWrap())
+ break;
+ LLVM_FALLTHROUGH;
+ default:
+ NeedOF = true;
+ break;
+ }
+ break;
+ }
+ }
+ // See if we can use the EFLAGS value from the operand instead of
+ // doing a separate TEST. TEST always sets OF and CF to 0, so unless
+ // we prove that the arithmetic won't overflow, we can't use OF or CF.
+ if (Op.getResNo() != 0 || NeedOF || NeedCF) {
+ // Emit a CMP with 0, which is the TEST pattern.
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+ DAG.getConstant(0, dl, Op.getValueType()));
+ }
+ unsigned Opcode = 0;
+ unsigned NumOperands = 0;
+
+ SDValue ArithOp = Op;
+
+ // NOTICE: In the code below we use ArithOp to hold the arithmetic operation,
+ // which may be the result of a CAST. We use the variable 'Op', the non-cast
+ // value, when we check for possible users.
+ switch (ArithOp.getOpcode()) {
+ case ISD::AND:
+ // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
+ // because a TEST instruction will be better.
+ if (!hasNonFlagsUse(Op))
+ break;
+
+ LLVM_FALLTHROUGH;
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::OR:
+ case ISD::XOR:
+ if (!isProfitableToUseFlagOp(Op))
+ break;
+
+ // Otherwise use a regular EFLAGS-setting instruction.
+ switch (ArithOp.getOpcode()) {
+ default: llvm_unreachable("unexpected operator!");
+ case ISD::ADD: Opcode = X86ISD::ADD; break;
+ case ISD::SUB: Opcode = X86ISD::SUB; break;
+ case ISD::XOR: Opcode = X86ISD::XOR; break;
+ case ISD::AND: Opcode = X86ISD::AND; break;
+ case ISD::OR: Opcode = X86ISD::OR; break;
+ }
+
+ NumOperands = 2;
+ break;
+ case X86ISD::ADD:
+ case X86ISD::SUB:
+ case X86ISD::OR:
+ case X86ISD::XOR:
+ case X86ISD::AND:
+ return SDValue(Op.getNode(), 1);
+ case ISD::SSUBO:
+ case ISD::USUBO: {
+ // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
+ Op->getOperand(1)).getValue(1);
+ }
+ default:
+ break;
+ }
+
+ if (Opcode == 0) {
+ // Emit a CMP with 0, which is the TEST pattern.
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+ DAG.getConstant(0, dl, Op.getValueType()));
+ }
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
+
+ SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
+ return SDValue(New.getNode(), 1);
+}
+
+/// Emit nodes that will be selected as "cmp Op0,Op1", or something
+/// equivalent.
+static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+ const SDLoc &dl, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (isNullConstant(Op1))
+ return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
+
+ EVT CmpVT = Op0.getValueType();
+
+ assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
+ CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
+
+ // Only promote the compare up to i32 if it is a 16-bit operation
+ // with an immediate. 16-bit immediates are to be avoided.
+ if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
+ !DAG.getMachineFunction().getFunction().hasMinSize()) {
+ ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
+ ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
+ // Don't do this if the immediate can fit in 8-bits.
+ if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
+ (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
+ unsigned ExtendOp =
+ isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
+ // For equality comparisons try to use SIGN_EXTEND if the input was
+ // truncated from something with enough sign bits.
+ if (Op0.getOpcode() == ISD::TRUNCATE) {
+ SDValue In = Op0.getOperand(0);
+ unsigned EffBits =
+ In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
+ if (EffBits <= 16)
+ ExtendOp = ISD::SIGN_EXTEND;
+ } else if (Op1.getOpcode() == ISD::TRUNCATE) {
+ SDValue In = Op1.getOperand(0);
+ unsigned EffBits =
+ In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
+ if (EffBits <= 16)
+ ExtendOp = ISD::SIGN_EXTEND;
+ }
+ }
+
+ CmpVT = MVT::i32;
+ Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
+ Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
+ }
+ }
+
+ // Try to shrink i64 compares if the input has enough zero bits.
+ // FIXME: Do this for non-constant compares for constant on LHS?
+ if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
+ Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
+ cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
+ DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
+ CmpVT = MVT::i32;
+ Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
+ Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
+ }
+
+ // 0-x == y --> x+y == 0
+ // 0-x != y --> x+y != 0
+ if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
+ Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+ SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
+ SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
+ return Add.getValue(1);
+ }
+
+ // x == 0-y --> x+y == 0
+ // x != 0-y --> x+y != 0
+ if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
+ Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+ SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
+ SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
+ return Add.getValue(1);
+ }
+
+ // Use SUB instead of CMP to enable CSE between SUB and CMP.
+ SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
+ SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
+ return Sub.getValue(1);
+}
+
+/// Check if replacement of SQRT with RSQRT should be disabled.
+bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ // We never want to use both SQRT and RSQRT instructions for the same input.
+ if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
+ return false;
+
+ if (VT.isVector())
+ return Subtarget.hasFastVectorFSQRT();
+ return Subtarget.hasFastScalarFSQRT();
+}
+
+/// The minimum architected relative accuracy is 2^-12. We need one
+/// Newton-Raphson step to have a good float result (24 bits of precision).
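+/// The refinement itself is applied by the generic DAG combiner and is the
+/// standard Newton-Raphson iteration for reciprocal square root, roughly:
+/// Est' = Est * (1.5 - 0.5 * Op * Est * Est).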
+SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
+ SelectionDAG &DAG, int Enabled,
+ int &RefinementSteps,
+ bool &UseOneConstNR,
+ bool Reciprocal) const {
+ EVT VT = Op.getValueType();
+
+ // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
+ // It is likely not profitable to do this for f64 because a double-precision
+ // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
+ // instructions: convert to single, rsqrtss, convert back to double, refine
+ // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
+ // along with FMA, this could be a throughput win.
+ // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
+ // after legalize types.
+ if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
+ (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
+ (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
+ (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
+ (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
+ if (RefinementSteps == ReciprocalEstimate::Unspecified)
+ RefinementSteps = 1;
+
+ UseOneConstNR = false;
+ // There is no 512-bit FRSQRT, but there is RSQRT14.
+ unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
+ return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
+ }
+ return SDValue();
+}
+
+/// The minimum architected relative accuracy is 2^-12. We need one
+/// Newton-Raphson step to have a good float result (24 bits of precision).
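+/// The refinement applied by the generic DAG combiner is the standard
+/// Newton-Raphson iteration for reciprocals, roughly:
+/// Est' = Est * (2.0 - Op * Est).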
+SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
+ int Enabled,
+ int &RefinementSteps) const {
+ EVT VT = Op.getValueType();
+
+ // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
+ // It is likely not profitable to do this for f64 because a double-precision
+ // reciprocal estimate with refinement on x86 prior to FMA requires
+ // 15 instructions: convert to single, rcpss, convert back to double, refine
+ // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
+ // along with FMA, this could be a throughput win.
+
+ if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
+ (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
+ (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
+ (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
+ // Enable estimate codegen with 1 refinement step for vector division.
+ // Scalar division estimates are disabled because they break too much
+ // real-world code. These defaults are intended to match GCC behavior.
+ if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
+ return SDValue();
+
+ if (RefinementSteps == ReciprocalEstimate::Unspecified)
+ RefinementSteps = 1;
+
+ // There is no 512-bit FRCP, but there is RCP14.
+ unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
+ return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
+ }
+ return SDValue();
+}
+
+/// If we have at least two divisions that use the same divisor, convert to
+/// multiplication by a reciprocal. This may need to be adjusted for a given
+/// CPU if a division's cost is not at least twice the cost of a multiplication.
+/// This is because we still need one division to calculate the reciprocal and
+/// then we need two multiplies by that reciprocal as replacements for the
+/// original divisions.
+unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
+ return 2;
+}
+
+SDValue
+X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDNode *> &Created) const {
+ AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+ if (isIntDivCheap(N->getValueType(0), Attr))
+ return SDValue(N,0); // Lower SDIV as SDIV
+
+ assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) &&
+ "Unexpected divisor!");
+
+ // Only perform this transform if CMOV is supported otherwise the select
+ // below will become a branch.
+ if (!Subtarget.hasCMov())
+ return SDValue();
+
+ // fold (sdiv X, pow2)
+ EVT VT = N->getValueType(0);
+ // FIXME: Support i8.
+ if (VT != MVT::i16 && VT != MVT::i32 &&
+ !(Subtarget.is64Bit() && VT == MVT::i64))
+ return SDValue();
+
+ unsigned Lg2 = Divisor.countTrailingZeros();
+
+ // If the divisor is 2 or -2, the default expansion is better.
+ if (Lg2 == 1)
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue N0 = N->getOperand(0);
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
+ SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
+
+ // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
+ SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
+ SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
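+ // For example (illustrative): with a divisor of 4 (Lg2 == 2) and N0 == -7,
+ // the select yields -7 + 3 == -4, and the arithmetic shift below gives
+ // -4 >> 2 == -1, matching C-style truncating division.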
+
+ Created.push_back(Cmp.getNode());
+ Created.push_back(Add.getNode());
+ Created.push_back(CMov.getNode());
+
+ // Divide by pow2.
+ SDValue SRA =
+ DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
+
+ // If we're dividing by a positive value, we're done. Otherwise, we must
+ // negate the result.
+ if (Divisor.isNonNegative())
+ return SRA;
+
+ Created.push_back(SRA.getNode());
+ return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
+}
+
+/// Result of 'and' is compared against zero. Change to a BT node if possible.
+/// Returns the BT node and the condition code needed to use it.
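+/// For example, '(X & (1 << N)) != 0' or '((X >> N) & 1) != 0' can be selected
+/// as 'bt X, N', with the result read from the carry flag (setb for the
+/// not-equal form, setae for the equality form).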
+static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SDValue &X86CC) {
+ assert(And.getOpcode() == ISD::AND && "Expected AND node!");
+ SDValue Op0 = And.getOperand(0);
+ SDValue Op1 = And.getOperand(1);
+ if (Op0.getOpcode() == ISD::TRUNCATE)
+ Op0 = Op0.getOperand(0);
+ if (Op1.getOpcode() == ISD::TRUNCATE)
+ Op1 = Op1.getOperand(0);
+
+ SDValue Src, BitNo;
+ if (Op1.getOpcode() == ISD::SHL)
+ std::swap(Op0, Op1);
+ if (Op0.getOpcode() == ISD::SHL) {
+ if (isOneConstant(Op0.getOperand(0))) {
+ // If we looked past a truncate, check that it's only truncating away
+ // known zeros.
+ unsigned BitWidth = Op0.getValueSizeInBits();
+ unsigned AndBitWidth = And.getValueSizeInBits();
+ if (BitWidth > AndBitWidth) {
+ KnownBits Known = DAG.computeKnownBits(Op0);
+ if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
+ return SDValue();
+ }
+ Src = Op1;
+ BitNo = Op0.getOperand(1);
+ }
+ } else if (Op1.getOpcode() == ISD::Constant) {
+ ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
+ uint64_t AndRHSVal = AndRHS->getZExtValue();
+ SDValue AndLHS = Op0;
+
+ if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
+ Src = AndLHS.getOperand(0);
+ BitNo = AndLHS.getOperand(1);
+ } else {
+ // Use BT if the immediate can't be encoded in a TEST instruction or we
+ // are optimizing for size and the immediate won't fit in a byte.
+ bool OptForSize = DAG.shouldOptForSize();
+ if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
+ isPowerOf2_64(AndRHSVal)) {
+ Src = AndLHS;
+ BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
+ Src.getValueType());
+ }
+ }
+ }
+
+ // No patterns found, give up.
+ if (!Src.getNode())
+ return SDValue();
+
+ // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
+ // instruction. Since the shift amount is in-range-or-undefined, we know
+ // that doing a bittest on the i32 value is ok. We extend to i32 because
+ // the encoding for the i16 version is larger than the i32 version.
+ // Also promote i16 to i32 for performance / code size reasons.
+ if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
+ Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
+
+ // See if we can use the 32-bit instruction instead of the 64-bit one for a
+ // shorter encoding. Since the former takes the modulo 32 of BitNo and the
+ // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
+ // known to be zero.
+ if (Src.getValueType() == MVT::i64 &&
+ DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
+ Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
+
+ // If the operand types disagree, extend the shift amount to match. Since
+ // BT ignores high bits (like shifts) we can use anyextend.
+ if (Src.getValueType() != BitNo.getValueType())
+ BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
+
+ X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
+ dl, MVT::i8);
+ return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
+}
+
+/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
+/// CMPs.
+static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
+ SDValue &Op1, bool &IsAlwaysSignaling) {
+ unsigned SSECC;
+ bool Swap = false;
+
+ // SSE Condition code mapping:
+ // 0 - EQ
+ // 1 - LT
+ // 2 - LE
+ // 3 - UNORD
+ // 4 - NEQ
+ // 5 - NLT
+ // 6 - NLE
+ // 7 - ORD
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Unexpected SETCC condition");
+ case ISD::SETOEQ:
+ case ISD::SETEQ: SSECC = 0; break;
+ case ISD::SETOGT:
+ case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
+ case ISD::SETLT:
+ case ISD::SETOLT: SSECC = 1; break;
+ case ISD::SETOGE:
+ case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
+ case ISD::SETLE:
+ case ISD::SETOLE: SSECC = 2; break;
+ case ISD::SETUO: SSECC = 3; break;
+ case ISD::SETUNE:
+ case ISD::SETNE: SSECC = 4; break;
+ case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
+ case ISD::SETUGE: SSECC = 5; break;
+ case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
+ case ISD::SETUGT: SSECC = 6; break;
+ case ISD::SETO: SSECC = 7; break;
+ case ISD::SETUEQ: SSECC = 8; break;
+ case ISD::SETONE: SSECC = 12; break;
+ }
+ if (Swap)
+ std::swap(Op0, Op1);
+
+ switch (SetCCOpcode) {
+ default:
+ IsAlwaysSignaling = true;
+ break;
+ case ISD::SETEQ:
+ case ISD::SETOEQ:
+ case ISD::SETUEQ:
+ case ISD::SETNE:
+ case ISD::SETONE:
+ case ISD::SETUNE:
+ case ISD::SETO:
+ case ISD::SETUO:
+ IsAlwaysSignaling = false;
+ break;
+ }
+
+ return SSECC;
+}
+
+/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
+/// concatenate the results back.
+static SDValue splitIntVSETCC(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+
+ assert(Op.getOpcode() == ISD::SETCC && "Unsupported operation");
+ assert(Op.getOperand(0).getValueType().isInteger() &&
+ VT == Op.getOperand(0).getValueType() && "Unsupported VTs!");
+
+ SDLoc dl(Op);
+ SDValue CC = Op.getOperand(2);
+
+ // Extract the LHS Lo/Hi vectors
+ SDValue LHS1, LHS2;
+ std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
+
+ // Extract the RHS Lo/Hi vectors
+ SDValue RHS1, RHS2;
+ std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
+
+ // Issue the operation on the smaller types and concatenate the result back
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
+ DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
+}
+
+static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
+
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue CC = Op.getOperand(2);
+ MVT VT = Op.getSimpleValueType();
+ SDLoc dl(Op);
+
+ assert(VT.getVectorElementType() == MVT::i1 &&
+ "Cannot set masked compare for this operation");
+
+ ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+
+ // Prefer SETGT over SETLT.
+ if (SetCCOpcode == ISD::SETLT) {
+ SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
+ std::swap(Op0, Op1);
+ }
+
+ return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
+}
+
+/// Given a buildvector constant, return a new vector constant with each element
+/// incremented or decremented. If incrementing or decrementing would result in
+/// unsigned overflow or underflow or this is not a simple vector constant,
+/// return an empty value.
+static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
+ auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
+ if (!BV)
+ return SDValue();
+
+ MVT VT = V.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+ SmallVector<SDValue, 8> NewVecC;
+ SDLoc DL(V);
+ for (unsigned i = 0; i < NumElts; ++i) {
+ auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
+ if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
+ return SDValue();
+
+ // Avoid overflow/underflow.
+ const APInt &EltC = Elt->getAPIntValue();
+ if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
+ return SDValue();
+
+ NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
+ }
+
+ return DAG.getBuildVector(VT, DL, NewVecC);
+}
+
+/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
+/// Op0 u<= Op1:
+/// t = psubus Op0, Op1
+/// pcmpeq t, <0..0>
+static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
+ ISD::CondCode Cond, const SDLoc &dl,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ MVT VET = VT.getVectorElementType();
+ if (VET != MVT::i8 && VET != MVT::i16)
+ return SDValue();
+
+ switch (Cond) {
+ default:
+ return SDValue();
+ case ISD::SETULT: {
+ // If the comparison is against a constant, we can turn this into a
+ // setule. With psubus, setule does not require a swap. This is
+ // beneficial because the constant in the register is no longer
+ // clobbered as the destination operand, so it can be hoisted out of a loop.
+ // Only do this pre-AVX, since the AVX vpcmp* forms are non-destructive.
+ if (Subtarget.hasAVX())
+ return SDValue();
+ SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
+ if (!ULEOp1)
+ return SDValue();
+ Op1 = ULEOp1;
+ break;
+ }
+ case ISD::SETUGT: {
+ // If the comparison is against a constant, we can turn this into a setuge.
+ // This is beneficial because materializing a constant 0 for the PCMPEQ is
+ // probably cheaper than XOR+PCMPGT using 2 different vector constants:
+ // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
+ SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
+ if (!UGEOp1)
+ return SDValue();
+ Op1 = Op0;
+ Op0 = UGEOp1;
+ break;
+ }
+ // Psubus is better than flip-sign because it requires no inversion.
+ case ISD::SETUGE:
+ std::swap(Op0, Op1);
+ break;
+ case ISD::SETULE:
+ break;
+ }
+
+ SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
+ return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
+ DAG.getConstant(0, dl, VT));
+}
+
+static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
+ Op.getOpcode() == ISD::STRICT_FSETCCS;
+ SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
+ SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
+ SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
+ MVT VT = Op->getSimpleValueType(0);
+ ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
+ bool isFP = Op1.getSimpleValueType().isFloatingPoint();
+ SDLoc dl(Op);
+
+ if (isFP) {
+#ifndef NDEBUG
+ MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
+ assert(EltVT == MVT::f32 || EltVT == MVT::f64);
+#endif
+
+ bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+
+ // If we have a strict compare with a vXi1 result and the input is 128/256
+ // bits we can't use a masked compare unless we have VLX. If we use a wider
+ // compare like we do for non-strict, we might trigger spurious exceptions
+ // from the upper elements. Instead emit an AVX compare and convert to a mask.
+ unsigned Opc;
+ if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
+ (!IsStrict || Subtarget.hasVLX() ||
+ Op0.getSimpleValueType().is512BitVector())) {
+ assert(VT.getVectorNumElements() <= 16);
+ Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
+ } else {
+ Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
+ // The SSE/AVX packed FP comparison nodes are defined with a
+ // floating-point vector result that matches the operand type. This allows
+ // them to work with an SSE1 target (integer vector types are not legal).
+ VT = Op0.getSimpleValueType();
+ }
+
+ SDValue Cmp;
+ bool IsAlwaysSignaling;
+ unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
+ if (!Subtarget.hasAVX()) {
+ // TODO: We could use following steps to handle a quiet compare with
+ // signaling encodings.
+ // 1. Get ordered masks from a quiet ISD::SETO
+ // 2. Use the masks to mask potential unordered elements in operand A, B
+ // 3. Get the compare results of masked A, B
+ // 4. Calculating final result using the mask and result from 3
+ // But currently, we just fall back to scalar operations.
+ if (IsStrict && IsAlwaysSignaling && !IsSignaling)
+ return SDValue();
+
+ // Insert an extra signaling instruction to raise exception.
+ if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
+ SDValue SignalCmp = DAG.getNode(
+ Opc, dl, {VT, MVT::Other},
+ {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
+ // FIXME: It seems we need to update the flags of all new strict nodes.
+ // Otherwise, mayRaiseFPException in MI will return false due to
+ // NoFPExcept = false by default. However, I didn't find it in other
+ // patches.
+ SignalCmp->setFlags(Op->getFlags());
+ Chain = SignalCmp.getValue(1);
+ }
+
+ // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
+ // emit two comparisons and a logic op to tie them together.
+ if (SSECC >= 8) {
+ // LLVM predicate is SETUEQ or SETONE.
+ unsigned CC0, CC1;
+ unsigned CombineOpc;
+ if (Cond == ISD::SETUEQ) {
+ CC0 = 3; // UNORD
+ CC1 = 0; // EQ
+ CombineOpc = X86ISD::FOR;
+ } else {
+ assert(Cond == ISD::SETONE);
+ CC0 = 7; // ORD
+ CC1 = 4; // NEQ
+ CombineOpc = X86ISD::FAND;
+ }
+
+ SDValue Cmp0, Cmp1;
+ if (IsStrict) {
+ Cmp0 = DAG.getNode(
+ Opc, dl, {VT, MVT::Other},
+ {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
+ Cmp1 = DAG.getNode(
+ Opc, dl, {VT, MVT::Other},
+ {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
+ Cmp1.getValue(1));
+ } else {
+ Cmp0 = DAG.getNode(
+ Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
+ Cmp1 = DAG.getNode(
+ Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
+ }
+ Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
+ } else {
+ if (IsStrict) {
+ Cmp = DAG.getNode(
+ Opc, dl, {VT, MVT::Other},
+ {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
+ Chain = Cmp.getValue(1);
+ } else
+ Cmp = DAG.getNode(
+ Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
+ }
+ } else {
+ // Handle all other FP comparisons here.
+ if (IsStrict) {
+ // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
+ SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
+ Cmp = DAG.getNode(
+ Opc, dl, {VT, MVT::Other},
+ {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
+ Chain = Cmp.getValue(1);
+ } else
+ Cmp = DAG.getNode(
+ Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
+ }
+
+ if (VT.getFixedSizeInBits() >
+ Op.getSimpleValueType().getFixedSizeInBits()) {
+ // We emitted a compare with an XMM/YMM result. Finish converting to a
+ // mask register using a vptestm.
+ EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
+ Cmp = DAG.getBitcast(CastVT, Cmp);
+ Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
+ DAG.getConstant(0, dl, CastVT), ISD::SETNE);
+ } else {
+ // If this is SSE/AVX CMPP, bitcast the result back to integer to match
+ // the result type of SETCC. The bitcast is expected to be optimized
+ // away during combining/isel.
+ Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
+ }
+
+ if (IsStrict)
+ return DAG.getMergeValues({Cmp, Chain}, dl);
+
+ return Cmp;
+ }
+
+ assert(!IsStrict && "Strict SETCC only handles FP operands.");
+
+ MVT VTOp0 = Op0.getSimpleValueType();
+ (void)VTOp0;
+ assert(VTOp0 == Op1.getSimpleValueType() &&
+ "Expected operands with same type!");
+ assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
+ "Invalid number of packed elements for source and destination!");
+
+ // The non-AVX512 code below works under the assumption that source and
+ // destination types are the same.
+ assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
+ "Value types for source and destination must be the same!");
+
+ // The result is boolean, but operands are int/float
+ if (VT.getVectorElementType() == MVT::i1) {
+ // In the AVX-512 architecture, setcc returns a mask with i1 elements,
+ // but there is no compare instruction for i8 and i16 elements in KNL.
+ assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
+ "Unexpected operand type");
+ return LowerIntVSETCC_AVX512(Op, DAG);
+ }
+
+ // Lower using XOP integer comparisons.
+ if (VT.is128BitVector() && Subtarget.hasXOP()) {
+ // Translate compare code to XOP PCOM compare mode.
+ unsigned CmpMode = 0;
+ switch (Cond) {
+ default: llvm_unreachable("Unexpected SETCC condition");
+ case ISD::SETULT:
+ case ISD::SETLT: CmpMode = 0x00; break;
+ case ISD::SETULE:
+ case ISD::SETLE: CmpMode = 0x01; break;
+ case ISD::SETUGT:
+ case ISD::SETGT: CmpMode = 0x02; break;
+ case ISD::SETUGE:
+ case ISD::SETGE: CmpMode = 0x03; break;
+ case ISD::SETEQ: CmpMode = 0x04; break;
+ case ISD::SETNE: CmpMode = 0x05; break;
+ }
+
+ // Are we comparing unsigned or signed integers?
+ unsigned Opc =
+ ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
+
+ return DAG.getNode(Opc, dl, VT, Op0, Op1,
+ DAG.getTargetConstant(CmpMode, dl, MVT::i8));
+ }
+
+ // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
+ // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
+ if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
+ SDValue BC0 = peekThroughBitcasts(Op0);
+ if (BC0.getOpcode() == ISD::AND) {
+ APInt UndefElts;
+ SmallVector<APInt, 64> EltBits;
+ if (getTargetConstantBitsFromNode(BC0.getOperand(1),
+ VT.getScalarSizeInBits(), UndefElts,
+ EltBits, false, false)) {
+ if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
+ Cond = ISD::SETEQ;
+ Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
+ }
+ }
+ }
+ }
+
+ // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
+ if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
+ Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
+ ConstantSDNode *C1 = isConstOrConstSplat(Op1);
+ if (C1 && C1->getAPIntValue().isPowerOf2()) {
+ unsigned BitWidth = VT.getScalarSizeInBits();
+ unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
+
+ SDValue Result = Op0.getOperand(0);
+ Result = DAG.getNode(ISD::SHL, dl, VT, Result,
+ DAG.getConstant(ShiftAmt, dl, VT));
+ Result = DAG.getNode(ISD::SRA, dl, VT, Result,
+ DAG.getConstant(BitWidth - 1, dl, VT));
+ return Result;
+ }
+ }
+
+ // Break 256-bit integer vector compare into smaller ones.
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
+ return splitIntVSETCC(Op, DAG);
+
+ if (VT == MVT::v32i16 || VT == MVT::v64i8) {
+ assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!");
+ return splitIntVSETCC(Op, DAG);
+ }
+
+ // If this is a SETNE against the signed minimum value, change it to SETGT.
+ // If this is a SETNE against the signed maximum value, change it to SETLT,
+ // which will then be swapped to SETGT.
+ // Otherwise we use PCMPEQ+invert.
+ APInt ConstValue;
+ if (Cond == ISD::SETNE &&
+ ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
+ if (ConstValue.isMinSignedValue())
+ Cond = ISD::SETGT;
+ else if (ConstValue.isMaxSignedValue())
+ Cond = ISD::SETLT;
+ }
+
+ // If both operands are known non-negative, then an unsigned compare is the
+ // same as a signed compare and there's no need to flip signbits.
+ // TODO: We could check for more general simplifications here since we're
+ // computing known bits.
+ bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
+ !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
+
+ // Special case: Use min/max operations for unsigned compares.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (ISD::isUnsignedIntSetCC(Cond) &&
+ (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
+ TLI.isOperationLegal(ISD::UMIN, VT)) {
+ // If we have a constant operand, increment/decrement it and change the
+ // condition to avoid an invert.
+ if (Cond == ISD::SETUGT) {
+ // X > C --> X >= (C+1) --> X == umax(X, C+1)
+ if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
+ Op1 = UGTOp1;
+ Cond = ISD::SETUGE;
+ }
+ }
+ if (Cond == ISD::SETULT) {
+ // X < C --> X <= (C-1) --> X == umin(X, C-1)
+ if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
+ Op1 = ULTOp1;
+ Cond = ISD::SETULE;
+ }
+ }
+ bool Invert = false;
+ unsigned Opc;
+ switch (Cond) {
+ default: llvm_unreachable("Unexpected condition code");
+ case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
+ case ISD::SETULE: Opc = ISD::UMIN; break;
+ case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
+ case ISD::SETUGE: Opc = ISD::UMAX; break;
+ }
+
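+    // x <=u y iff umin(x, y) == x and x >=u y iff umax(x, y) == x, so the
+    // compare reduces to a min/max followed by a PCMPEQ against Op0.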
+ SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+ Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
+
+ // If the logical-not of the result is required, perform that now.
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, VT);
+
+ return Result;
+ }
+
+ // Try to use SUBUS and PCMPEQ.
+ if (FlipSigns)
+ if (SDValue V =
+ LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
+ return V;
+
+ // We are handling one of the integer comparisons here. Since SSE only has
+  // GT and EQ comparisons for integers, swapping operands and multiple
+ // operations may be required for some comparisons.
+ unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
+ : X86ISD::PCMPGT;
+ bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
+ Cond == ISD::SETGE || Cond == ISD::SETUGE;
+ bool Invert = Cond == ISD::SETNE ||
+ (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
+
+ if (Swap)
+ std::swap(Op0, Op1);
+
+ // Check that the operation in question is available (most are plain SSE2,
+ // but PCMPGTQ and PCMPEQQ have different requirements).
+ if (VT == MVT::v2i64) {
+ if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
+ assert(Subtarget.hasSSE2() && "Don't know how to lower!");
+
+ // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
+ // the odd elements over the even elements.
+ if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
+ Op0 = DAG.getConstant(0, dl, MVT::v4i32);
+ Op1 = DAG.getBitcast(MVT::v4i32, Op1);
+
+ SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
+ static const int MaskHi[] = { 1, 1, 3, 3 };
+ SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
+
+ return DAG.getBitcast(VT, Result);
+ }
+
+ if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
+ Op0 = DAG.getBitcast(MVT::v4i32, Op0);
+ Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
+
+ SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
+ static const int MaskHi[] = { 1, 1, 3, 3 };
+ SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
+
+ return DAG.getBitcast(VT, Result);
+ }
+
+ // Since SSE has no unsigned integer comparisons, we need to flip the sign
+ // bits of the inputs before performing those operations. The lower
+ // compare is always unsigned.
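+      // Bit 31 of the constant flips the sign of each low 32-bit half so the
+      // low compare becomes unsigned; bit 63 also flips the high halves when
+      // the original 64-bit compare was unsigned.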
+ SDValue SB;
+ if (FlipSigns) {
+ SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
+ } else {
+ SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
+ }
+ Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
+ Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
+
+ // Cast everything to the right type.
+ Op0 = DAG.getBitcast(MVT::v4i32, Op0);
+ Op1 = DAG.getBitcast(MVT::v4i32, Op1);
+
+ // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
+ SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
+ SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
+
+ // Create masks for only the low parts/high parts of the 64 bit integers.
+ static const int MaskHi[] = { 1, 1, 3, 3 };
+ static const int MaskLo[] = { 0, 0, 2, 2 };
+ SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
+ SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
+ SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
+
+ SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
+ Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
+
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, MVT::v4i32);
+
+ return DAG.getBitcast(VT, Result);
+ }
+
+ if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
+      // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq
+      // with pcmpeqd + pshufd + pand.
+ assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
+
+ // First cast everything to the right type.
+ Op0 = DAG.getBitcast(MVT::v4i32, Op0);
+ Op1 = DAG.getBitcast(MVT::v4i32, Op1);
+
+ // Do the compare.
+ SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
+
+ // Make sure the lower and upper halves are both all-ones.
+ static const int Mask[] = { 1, 0, 3, 2 };
+ SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
+ Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
+
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, MVT::v4i32);
+
+ return DAG.getBitcast(VT, Result);
+ }
+ }
+
+ // Since SSE has no unsigned integer comparisons, we need to flip the sign
+ // bits of the inputs before performing those operations.
+ if (FlipSigns) {
+ MVT EltVT = VT.getVectorElementType();
+ SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
+ VT);
+ Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
+ Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
+ }
+
+ SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+
+ // If the logical-not of the result is required, perform that now.
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, VT);
+
+ return Result;
+}
+
+// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
+static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
+ const SDLoc &dl, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ SDValue &X86CC) {
+ // Only support equality comparisons.
+ if (CC != ISD::SETEQ && CC != ISD::SETNE)
+ return SDValue();
+
+ // Must be a bitcast from vXi1.
+ if (Op0.getOpcode() != ISD::BITCAST)
+ return SDValue();
+
+ Op0 = Op0.getOperand(0);
+ MVT VT = Op0.getSimpleValueType();
+ if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
+ !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
+ !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
+ return SDValue();
+
+ X86::CondCode X86Cond;
+ if (isNullConstant(Op1)) {
+ X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
+ } else if (isAllOnesConstant(Op1)) {
+    // The carry flag is set when the mask is all ones.
+ X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
+ } else
+ return SDValue();
+
+  // If the input is an AND, we can combine its operands into the KTEST.
+ bool KTestable = false;
+ if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
+ KTestable = true;
+ if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
+ KTestable = true;
+ if (!isNullConstant(Op1))
+ KTestable = false;
+ if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
+ SDValue LHS = Op0.getOperand(0);
+ SDValue RHS = Op0.getOperand(1);
+ X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
+ return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
+ }
+
+  // If the input is an OR, we can combine its operands into the KORTEST.
+ SDValue LHS = Op0;
+ SDValue RHS = Op0;
+ if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
+ LHS = Op0.getOperand(0);
+ RHS = Op0.getOperand(1);
+ }
+
+ X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
+ return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
+}
+
+/// Emit flags for the given setcc condition and operands. Also returns the
+/// corresponding X86 condition code constant in X86CC.
+SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
+ ISD::CondCode CC, const SDLoc &dl,
+ SelectionDAG &DAG,
+ SDValue &X86CC) const {
+ // Optimize to BT if possible.
+ // Lower (X & (1 << N)) == 0 to BT(X, N).
+ // Lower ((X >>u N) & 1) != 0 to BT(X, N).
+ // Lower ((X >>s N) & 1) != 0 to BT(X, N).
+ if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
+ return BT;
+ }
+
+  // Try to use PTEST/PMOVMSKB for a tree of ORs equality-compared with 0.
+ // TODO: We could do AND tree with all 1s as well by using the C flag.
+ if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
+ if (SDValue CmpZ =
+ MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
+ return CmpZ;
+
+ // Try to lower using KORTEST or KTEST.
+ if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
+ return Test;
+
+ // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
+ // these.
+ if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ // If the input is a setcc, then reuse the input setcc or use a new one with
+ // the inverted condition.
+ if (Op0.getOpcode() == X86ISD::SETCC) {
+ bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
+
+ X86CC = Op0.getOperand(0);
+ if (Invert) {
+ X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
+ }
+
+ return Op0.getOperand(1);
+ }
+ }
+
+  // Try to use the carry flag from the add in place of a separate CMP for:
+ // (seteq (add X, -1), -1). Similar for setne.
+ if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
+ Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ if (isProfitableToUseFlagOp(Op0)) {
+ SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
+
+ SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
+ Op0.getOperand(1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
+ X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
+ X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
+ return SDValue(New.getNode(), 1);
+ }
+ }
+
+ X86::CondCode CondCode =
+ TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
+ assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
+
+ SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
+ X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
+ return EFLAGS;
+}
+
+SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+
+ bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
+ Op.getOpcode() == ISD::STRICT_FSETCCS;
+ MVT VT = Op->getSimpleValueType(0);
+
+ if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
+
+ assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
+ SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
+ SDLoc dl(Op);
+ ISD::CondCode CC =
+ cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
+
+ // Handle f128 first, since one possible outcome is a normal integer
+ // comparison which gets handled by emitFlagsForSetcc.
+ if (Op0.getValueType() == MVT::f128) {
+ softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
+ Op.getOpcode() == ISD::STRICT_FSETCCS);
+
+ // If softenSetCCOperands returned a scalar, use it.
+ if (!Op1.getNode()) {
+ assert(Op0.getValueType() == Op.getValueType() &&
+ "Unexpected setcc expansion!");
+ if (IsStrict)
+ return DAG.getMergeValues({Op0, Chain}, dl);
+ return Op0;
+ }
+ }
+
+ if (Op0.getSimpleValueType().isInteger()) {
+ SDValue X86CC;
+ SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
+ SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
+ return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
+ }
+
+ // Handle floating point.
+ X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
+ if (CondCode == X86::COND_INVALID)
+ return SDValue();
+
+ SDValue EFLAGS;
+ if (IsStrict) {
+ bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
+ EFLAGS =
+ DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
+ dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
+ Chain = EFLAGS.getValue(1);
+ } else {
+ EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
+ }
+
+ SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
+ SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
+ return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
+}
+
+SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDValue Carry = Op.getOperand(2);
+ SDValue Cond = Op.getOperand(3);
+ SDLoc DL(Op);
+
+ assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
+ X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
+
+ // Recreate the carry if needed.
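+  // Adding all ones to the incoming carry value produces a hardware carry out
+  // exactly when that value is nonzero, rematerializing CF for the SBB below.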
+ EVT CarryVT = Carry.getValueType();
+ Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
+ Carry, DAG.getAllOnesConstant(DL, CarryVT));
+
+ SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+ SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
+ return getSETCC(CC, Cmp.getValue(1), DL, DAG);
+}
+
+// This function returns three things: the arithmetic computation itself
+// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
+// flag and the condition code define the case in which the arithmetic
+// computation overflows.
+static std::pair<SDValue, SDValue>
+getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
+ assert(Op.getResNo() == 0 && "Unexpected result number!");
+ SDValue Value, Overflow;
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ unsigned BaseOp = 0;
+ SDLoc DL(Op);
+ switch (Op.getOpcode()) {
+ default: llvm_unreachable("Unknown ovf instruction!");
+ case ISD::SADDO:
+ BaseOp = X86ISD::ADD;
+ Cond = X86::COND_O;
+ break;
+ case ISD::UADDO:
+ BaseOp = X86ISD::ADD;
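+    // An unsigned add of 1 overflows exactly when the result wraps to zero, so
+    // ZF can stand in for CF here (useful when the add is later selected as an
+    // INC, which does not update CF).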
+ Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
+ break;
+ case ISD::SSUBO:
+ BaseOp = X86ISD::SUB;
+ Cond = X86::COND_O;
+ break;
+ case ISD::USUBO:
+ BaseOp = X86ISD::SUB;
+ Cond = X86::COND_B;
+ break;
+ case ISD::SMULO:
+ BaseOp = X86ISD::SMUL;
+ Cond = X86::COND_O;
+ break;
+ case ISD::UMULO:
+ BaseOp = X86ISD::UMUL;
+ Cond = X86::COND_O;
+ break;
+ }
+
+ if (BaseOp) {
+ // Also sets EFLAGS.
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
+ Overflow = Value.getValue(1);
+ }
+
+ return std::make_pair(Value, Overflow);
+}
+
+static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
+ // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
+ // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
+ // looks for this combo and may remove the "setcc" instruction if the "setcc"
+ // has only one use.
+ SDLoc DL(Op);
+ X86::CondCode Cond;
+ SDValue Value, Overflow;
+ std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
+
+ SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
+ assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
+ return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
+}
+
+/// Return true if the opcode is an X86 logical comparison.
+static bool isX86LogicalCmp(SDValue Op) {
+ unsigned Opc = Op.getOpcode();
+ if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
+ Opc == X86ISD::FCMP)
+ return true;
+ if (Op.getResNo() == 1 &&
+ (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
+ Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
+ Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
+ return true;
+
+ return false;
+}
+
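+/// Return true if V is a truncate whose discarded high bits are known to be
+/// zero, so callers can look through the truncate when testing the value.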
+static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
+ if (V.getOpcode() != ISD::TRUNCATE)
+ return false;
+
+ SDValue VOp0 = V.getOperand(0);
+ unsigned InBits = VOp0.getValueSizeInBits();
+ unsigned Bits = V.getValueSizeInBits();
+ return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
+}
+
+SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+ bool AddTest = true;
+ SDValue Cond = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op2 = Op.getOperand(2);
+ SDLoc DL(Op);
+ MVT VT = Op1.getSimpleValueType();
+ SDValue CC;
+
+ // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
+ // are available or VBLENDV if AVX is available.
+ // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
+ if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
+ VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
+ SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
+ bool IsAlwaysSignaling;
+ unsigned SSECC =
+ translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
+ CondOp0, CondOp1, IsAlwaysSignaling);
+
+ if (Subtarget.hasAVX512()) {
+ SDValue Cmp =
+ DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
+ DAG.getTargetConstant(SSECC, DL, MVT::i8));
+ assert(!VT.isVector() && "Not a scalar type?");
+ return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
+ }
+
+ if (SSECC < 8 || Subtarget.hasAVX()) {
+ SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
+ DAG.getTargetConstant(SSECC, DL, MVT::i8));
+
+ // If we have AVX, we can use a variable vector select (VBLENDV) instead
+ // of 3 logic instructions for size savings and potentially speed.
+ // Unfortunately, there is no scalar form of VBLENDV.
+
+ // If either operand is a +0.0 constant, don't try this. We can expect to
+ // optimize away at least one of the logic instructions later in that
+ // case, so that sequence would be faster than a variable blend.
+
+ // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
+ // uses XMM0 as the selection register. That may need just as many
+ // instructions as the AND/ANDN/OR sequence due to register moves, so
+ // don't bother.
+ if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
+ !isNullFPConstant(Op2)) {
+ // Convert to vectors, do a VSELECT, and convert back to scalar.
+ // All of the conversions should be optimized away.
+ MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
+ SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
+ SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
+ SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
+
+ MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
+ VCmp = DAG.getBitcast(VCmpVT, VCmp);
+
+ SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+ VSel, DAG.getIntPtrConstant(0, DL));
+ }
+ SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
+ SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
+ return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
+ }
+ }
+
+ // AVX512 fallback is to lower selects of scalar floats to masked moves.
+ if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
+ SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
+ return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
+ }
+
+ if (Cond.getOpcode() == ISD::SETCC) {
+ if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
+ Cond = NewCond;
+ // If the condition was updated, it's possible that the operands of the
+ // select were also updated (for example, EmitTest has a RAUW). Refresh
+ // the local references to the select operands in case they got stale.
+ Op1 = Op.getOperand(1);
+ Op2 = Op.getOperand(2);
+ }
+ }
+
+ // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
+ // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
+ // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
+ // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
+ // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
+ // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
+ if (Cond.getOpcode() == X86ISD::SETCC &&
+ Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
+ isNullConstant(Cond.getOperand(1).getOperand(1))) {
+ SDValue Cmp = Cond.getOperand(1);
+ SDValue CmpOp0 = Cmp.getOperand(0);
+ unsigned CondCode = Cond.getConstantOperandVal(0);
+
+ // Special handling for __builtin_ffs(X) - 1 pattern which looks like
+    // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
+    // handling to keep the CMP with 0. This should be removed by
+ // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
+ // cttz_zero_undef.
+ auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
+ return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
+ Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
+ };
+ if (Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) &&
+ ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
+ (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
+ // Keep Cmp.
+ } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
+ (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
+ SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
+
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
+
+ // Apply further optimizations for special cases
+ // (select (x != 0), -1, 0) -> neg & sbb
+ // (select (x == 0), 0, -1) -> neg & sbb
+ if (isNullConstant(Y) &&
+ (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
+ SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
+ Zero = DAG.getConstant(0, DL, Op.getValueType());
+ return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Neg.getValue(1));
+ }
+
+ Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs,
+ CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
+
+ SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
+ SDValue Res = // Res = 0 or -1.
+ DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1));
+
+ if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
+ Res = DAG.getNOT(DL, Res, Res.getValueType());
+
+ return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
+ } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
+ Cmp.getOperand(0).getOpcode() == ISD::AND &&
+ isOneConstant(Cmp.getOperand(0).getOperand(1))) {
+ SDValue Src1, Src2;
+      // True if Op2 is an XOR or OR operator and one of its operands
+      // is equal to Op1:
+ // ( a , a op b) || ( b , a op b)
+ auto isOrXorPattern = [&]() {
+ if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
+ (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
+ Src1 =
+ Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
+ Src2 = Op1;
+ return true;
+ }
+ return false;
+ };
+
+ if (isOrXorPattern()) {
+ SDValue Neg;
+ unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
+        // We need a mask of all zeros or all ones with the same size as the
+        // other operands.
+ if (CmpSz > VT.getSizeInBits())
+ Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
+ else if (CmpSz < VT.getSizeInBits())
+ Neg = DAG.getNode(ISD::AND, DL, VT,
+ DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
+ DAG.getConstant(1, DL, VT));
+ else
+ Neg = CmpOp0;
+ SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
+ Neg); // -(and (x, 0x1))
+ SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
+ return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
+ }
+ }
+ }
+
+ // Look past (and (setcc_carry (cmp ...)), 1).
+ if (Cond.getOpcode() == ISD::AND &&
+ Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
+ isOneConstant(Cond.getOperand(1)))
+ Cond = Cond.getOperand(0);
+
+ // If condition flag is set by a X86ISD::CMP, then use it as the condition
+ // setting operand in place of the X86ISD::SETCC.
+ unsigned CondOpcode = Cond.getOpcode();
+ if (CondOpcode == X86ISD::SETCC ||
+ CondOpcode == X86ISD::SETCC_CARRY) {
+ CC = Cond.getOperand(0);
+
+ SDValue Cmp = Cond.getOperand(1);
+ bool IllegalFPCMov = false;
+ if (VT.isFloatingPoint() && !VT.isVector() &&
+ !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov()) // FPStack?
+ IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
+
+ if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
+ Cmp.getOpcode() == X86ISD::BT) { // FIXME
+ Cond = Cmp;
+ AddTest = false;
+ }
+ } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+ CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+ CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
+ SDValue Value;
+ X86::CondCode X86Cond;
+ std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
+
+ CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
+ AddTest = false;
+ }
+
+ if (AddTest) {
+ // Look past the truncate if the high bits are known zero.
+ if (isTruncWithZeroHighBitsInput(Cond, DAG))
+ Cond = Cond.getOperand(0);
+
+ // We know the result of AND is compared against zero. Try to match
+ // it to BT.
+ if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
+ SDValue BTCC;
+ if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
+ CC = BTCC;
+ Cond = BT;
+ AddTest = false;
+ }
+ }
+ }
+
+ if (AddTest) {
+ CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
+ Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
+ }
+
+ // a < b ? -1 : 0 -> RES = ~setcc_carry
+ // a < b ? 0 : -1 -> RES = setcc_carry
+ // a >= b ? -1 : 0 -> RES = setcc_carry
+ // a >= b ? 0 : -1 -> RES = ~setcc_carry
+ if (Cond.getOpcode() == X86ISD::SUB) {
+ unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
+
+ if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
+ (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
+ (isNullConstant(Op1) || isNullConstant(Op2))) {
+ SDValue Res =
+ DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
+ if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
+ return DAG.getNOT(DL, Res, Res.getValueType());
+ return Res;
+ }
+ }
+
+  // X86 doesn't have an i8 cmov. If both operands are the result of a
+  // truncate, widen the cmov and push the truncate through. This avoids
+  // introducing a new branch during isel and doesn't add any extensions.
+ if (Op.getValueType() == MVT::i8 &&
+ Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
+ SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
+ if (T1.getValueType() == T2.getValueType() &&
+ // Exclude CopyFromReg to avoid partial register stalls.
+ T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
+ SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
+ CC, Cond);
+ return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
+ }
+ }
+
+ // Or finally, promote i8 cmovs if we have CMOV,
+ // or i16 cmovs if it won't prevent folding a load.
+  // FIXME: we should not limit promotion of the i8 case to only when the CMOV
+  // is legal, but EmitLoweredSelect() cannot deal with these extensions being
+  // inserted between two CMOVs. (in the i16 case too TBN)
+ // https://bugs.llvm.org/show_bug.cgi?id=40974
+ if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
+ (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
+ !MayFoldLoad(Op2))) {
+ Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
+ Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
+ SDValue Ops[] = { Op2, Op1, CC, Cond };
+ SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
+ return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
+ }
+
+ // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
+ // condition is true.
+ SDValue Ops[] = { Op2, Op1, CC, Cond };
+ return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
+}
+
+static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op->getSimpleValueType(0);
+ SDValue In = Op->getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+ assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
+ MVT VTElt = VT.getVectorElementType();
+ SDLoc dl(Op);
+
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // Extend VT if the scalar type is i8/i16 and BWI is not supported.
+ MVT ExtVT = VT;
+ if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
+ // If v16i32 is to be avoided, we'll need to split and concatenate.
+ if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
+ return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
+
+ ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
+ }
+
+ // Widen to 512-bits if VLX is not supported.
+ MVT WideVT = ExtVT;
+ if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
+ NumElts *= 512 / ExtVT.getSizeInBits();
+ InVT = MVT::getVectorVT(MVT::i1, NumElts);
+ In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
+ In, DAG.getIntPtrConstant(0, dl));
+ WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
+ }
+
+ SDValue V;
+ MVT WideEltVT = WideVT.getVectorElementType();
+ if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
+ (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
+ V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
+ } else {
+ SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
+ SDValue Zero = DAG.getConstant(0, dl, WideVT);
+ V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
+ }
+
+ // Truncate if we had to extend i16/i8 above.
+ if (VT != ExtVT) {
+ WideVT = MVT::getVectorVT(VTElt, NumElts);
+ V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
+ }
+
+ // Extract back to 128/256-bit if we widened.
+ if (WideVT != VT)
+ V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
+ DAG.getIntPtrConstant(0, dl));
+
+ return V;
+}
+
+static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue In = Op->getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+
+ if (InVT.getVectorElementType() == MVT::i1)
+ return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
+
+ assert(Subtarget.hasAVX() && "Expected AVX support");
+ return LowerAVXExtend(Op, DAG, Subtarget);
+}
+
+// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
+// For sign extend this needs to handle all vector sizes and SSE4.1 and
+// non-SSE4.1 targets. For zero extend this should only handle inputs of
+// MVT::v64i8 when BWI is not supported, but AVX512 is.
+static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue In = Op->getOperand(0);
+ MVT VT = Op->getSimpleValueType(0);
+ MVT InVT = In.getSimpleValueType();
+
+ MVT SVT = VT.getVectorElementType();
+ MVT InSVT = InVT.getVectorElementType();
+ assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
+
+ if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
+ return SDValue();
+ if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
+ return SDValue();
+ if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
+ !(VT.is256BitVector() && Subtarget.hasAVX()) &&
+ !(VT.is512BitVector() && Subtarget.hasAVX512()))
+ return SDValue();
+
+ SDLoc dl(Op);
+ unsigned Opc = Op.getOpcode();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // For 256-bit vectors, we only need the lower (128-bit) half of the input.
+ // For 512-bit vectors, we need 128-bits or 256-bits.
+ if (InVT.getSizeInBits() > 128) {
+    // The input needs at least as many elements as the output, and must be at
+    // least 128 bits wide.
+ int InSize = InSVT.getSizeInBits() * NumElts;
+ In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
+ InVT = In.getSimpleValueType();
+ }
+
+  // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit
+  // results, so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx*
+  // instructions still need to be handled here for 256/512-bit results.
+ if (Subtarget.hasInt256()) {
+ assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
+
+ if (InVT.getVectorNumElements() != NumElts)
+ return DAG.getNode(Op.getOpcode(), dl, VT, In);
+
+ // FIXME: Apparently we create inreg operations that could be regular
+ // extends.
+ unsigned ExtOpc =
+ Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
+ : ISD::ZERO_EXTEND;
+ return DAG.getNode(ExtOpc, dl, VT, In);
+ }
+
+ // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
+ if (Subtarget.hasAVX()) {
+ assert(VT.is256BitVector() && "256-bit vector expected");
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
+ int HalfNumElts = HalfVT.getVectorNumElements();
+
+ unsigned NumSrcElts = InVT.getVectorNumElements();
+ SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
+ for (int i = 0; i != HalfNumElts; ++i)
+ HiMask[i] = HalfNumElts + i;
+
+ SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
+ SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
+ Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+ }
+
+ // We should only get here for sign extend.
+ assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
+ assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
+
+ // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
+ SDValue Curr = In;
+ SDValue SignExt = Curr;
+
+ // As SRAI is only available on i16/i32 types, we expand only up to i32
+ // and handle i64 separately.
+ if (InVT != MVT::v4i32) {
+ MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
+
+ unsigned DestWidth = DestVT.getScalarSizeInBits();
+ unsigned Scale = DestWidth / InSVT.getSizeInBits();
+
+ unsigned InNumElts = InVT.getVectorNumElements();
+ unsigned DestElts = DestVT.getVectorNumElements();
+
+ // Build a shuffle mask that takes each input element and places it in the
+ // MSBs of the new element size.
+ SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
+ for (unsigned i = 0; i != DestElts; ++i)
+ Mask[i * Scale + (Scale - 1)] = i;
+
+ Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
+ Curr = DAG.getBitcast(DestVT, Curr);
+
+ unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
+ SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
+ DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
+ }
+
+ if (VT == MVT::v2i64) {
+ assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
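+    // (0 > x) yields all ones exactly when x is negative, producing the upper
+    // 32 bits of each sign-extended i64; interleave them above the low halves
+    // already in SignExt.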
+ SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
+ SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
+ SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
+ SignExt = DAG.getBitcast(VT, SignExt);
+ }
+
+ return SignExt;
+}
+
+static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op->getSimpleValueType(0);
+ SDValue In = Op->getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+ SDLoc dl(Op);
+
+ if (InVT.getVectorElementType() == MVT::i1)
+ return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
+
+ assert(VT.isVector() && InVT.isVector() && "Expected vector type");
+ assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
+ "Expected same number of elements");
+ assert((VT.getVectorElementType() == MVT::i16 ||
+ VT.getVectorElementType() == MVT::i32 ||
+ VT.getVectorElementType() == MVT::i64) &&
+ "Unexpected element type");
+ assert((InVT.getVectorElementType() == MVT::i8 ||
+ InVT.getVectorElementType() == MVT::i16 ||
+ InVT.getVectorElementType() == MVT::i32) &&
+ "Unexpected element type");
+
+ if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
+ assert(InVT == MVT::v32i8 && "Unexpected VT!");
+ return splitVectorIntUnary(Op, DAG);
+ }
+
+ if (Subtarget.hasInt256())
+ return Op;
+
+  // Optimize vectors in AVX mode:
+  // sign extend v8i16 to v8i32 and v4i32 to v4i64.
+  //
+  // Divide the input vector into two parts;
+  // for v4i32 the high shuffle mask will be {2, 3, -1, -1}.
+  // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
+  // then concat the vectors back to the original VT.
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
+ SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
+
+ unsigned NumElems = InVT.getVectorNumElements();
+ SmallVector<int,8> ShufMask(NumElems, -1);
+ for (unsigned i = 0; i != NumElems/2; ++i)
+ ShufMask[i] = i + NumElems/2;
+
+ SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
+ OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+}
+
+/// Change a vector store into a pair of half-size vector stores.
+static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
+ SDValue StoredVal = Store->getValue();
+ assert((StoredVal.getValueType().is256BitVector() ||
+ StoredVal.getValueType().is512BitVector()) &&
+ "Expecting 256/512-bit op");
+
+ // Splitting volatile memory ops is not allowed unless the operation was not
+ // legal to begin with. Assume the input store is legal (this transform is
+ // only used for targets with AVX). Note: It is possible that we have an
+ // illegal type like v2i128, and so we could allow splitting a volatile store
+ // in that case if that is important.
+ if (!Store->isSimple())
+ return SDValue();
+
+ SDLoc DL(Store);
+ SDValue Value0, Value1;
+ std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
+ unsigned HalfOffset = Value0.getValueType().getStoreSize();
+ SDValue Ptr0 = Store->getBasePtr();
+ SDValue Ptr1 =
+ DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
+ SDValue Ch0 =
+ DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
+ Store->getOriginalAlign(),
+ Store->getMemOperand()->getFlags());
+ SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
+ Store->getPointerInfo().getWithOffset(HalfOffset),
+ Store->getOriginalAlign(),
+ Store->getMemOperand()->getFlags());
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
+}
+
+/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
+/// type.
+static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
+ SelectionDAG &DAG) {
+ SDValue StoredVal = Store->getValue();
+ assert(StoreVT.is128BitVector() &&
+ StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
+ StoredVal = DAG.getBitcast(StoreVT, StoredVal);
+
+ // Splitting volatile memory ops is not allowed unless the operation was not
+ // legal to begin with. We are assuming the input op is legal (this transform
+ // is only used for targets with AVX).
+ if (!Store->isSimple())
+ return SDValue();
+
+ MVT StoreSVT = StoreVT.getScalarType();
+ unsigned NumElems = StoreVT.getVectorNumElements();
+ unsigned ScalarSize = StoreSVT.getStoreSize();
+
+ SDLoc DL(Store);
+ SmallVector<SDValue, 4> Stores;
+ for (unsigned i = 0; i != NumElems; ++i) {
+ unsigned Offset = i * ScalarSize;
+ SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
+ TypeSize::Fixed(Offset), DL);
+ SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
+ DAG.getIntPtrConstant(i, DL));
+ SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
+ Store->getPointerInfo().getWithOffset(Offset),
+ Store->getOriginalAlign(),
+ Store->getMemOperand()->getFlags());
+ Stores.push_back(Ch);
+ }
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
+}
+
+static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
+ SDLoc dl(St);
+ SDValue StoredVal = St->getValue();
+
+ // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
+ if (StoredVal.getValueType().isVector() &&
+ StoredVal.getValueType().getVectorElementType() == MVT::i1) {
+ unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
+ assert(NumElts <= 8 && "Unexpected VT");
+ assert(!St->isTruncatingStore() && "Expected non-truncating store");
+ assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
+ "Expected AVX512F without AVX512DQI");
+
+ // We must pad with zeros to ensure we store zeroes to any unused bits.
+ StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
+ DAG.getUNDEF(MVT::v16i1), StoredVal,
+ DAG.getIntPtrConstant(0, dl));
+ StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
+ StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
+ // Make sure we store zeros in the extra bits.
+ if (NumElts < 8)
+ StoredVal = DAG.getZeroExtendInReg(
+ StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
+
+ return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+ St->getPointerInfo(), St->getOriginalAlign(),
+ St->getMemOperand()->getFlags());
+ }
+
+ if (St->isTruncatingStore())
+ return SDValue();
+
+ // If this is a 256-bit store of concatenated ops, we are better off splitting
+ // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
+ // and each half can execute independently. Some cores would split the op into
+ // halves anyway, so the concat (vinsertf128) is purely an extra op.
+ MVT StoreVT = StoredVal.getSimpleValueType();
+ if (StoreVT.is256BitVector() ||
+ ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
+ !Subtarget.hasBWI())) {
+ SmallVector<SDValue, 4> CatOps;
+ if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
+ return splitVectorStore(St, DAG);
+ return SDValue();
+ }
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
+ "Unexpected VT");
+ assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
+ TargetLowering::TypeWidenVector && "Unexpected type action!");
+
+ EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
+ StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
+ DAG.getUNDEF(StoreVT));
+
+ if (Subtarget.hasSSE2()) {
+ // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
+ // and store it.
+ MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
+ MVT CastVT = MVT::getVectorVT(StVT, 2);
+ StoredVal = DAG.getBitcast(CastVT, StoredVal);
+ StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
+ DAG.getIntPtrConstant(0, dl));
+
+ return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+ St->getPointerInfo(), St->getOriginalAlign(),
+ St->getMemOperand()->getFlags());
+ }
+ assert(Subtarget.hasSSE1() && "Expected SSE");
+ SDVTList Tys = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
+ return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
+ St->getMemOperand());
+}
+
+// Lower vector extended loads using a shuffle. If SSSE3 is not available, we
+// may emit an illegal shuffle, but the expansion is still better than scalar
+// code. We generate sext/sext_invec for SEXTLOADs if it's available; otherwise
+// we'll emit a shuffle and an arithmetic shift.
+// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
+// TODO: It is possible to support ZExt by zeroing the undef values during
+// the shuffle phase or after the shuffle.
+static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT RegVT = Op.getSimpleValueType();
+ assert(RegVT.isVector() && "We only custom lower vector loads.");
+ assert(RegVT.isInteger() &&
+ "We only custom lower integer vector loads.");
+
+ LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
+ SDLoc dl(Ld);
+
+ // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
+ if (RegVT.getVectorElementType() == MVT::i1) {
+ assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
+ assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
+ assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
+ "Expected AVX512F without AVX512DQI");
+
+ SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), Ld->getOriginalAlign(),
+ Ld->getMemOperand()->getFlags());
+
+ // Replace chain users with the new chain.
+ assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
+
+ SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
+ Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
+ DAG.getBitcast(MVT::v16i1, Val),
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
+ }
+
+ return SDValue();
+}
+
+/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
+/// each of which has no other use apart from the AND / OR.
+static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
+ Opc = Op.getOpcode();
+ if (Opc != ISD::OR && Opc != ISD::AND)
+ return false;
+ return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
+ Op.getOperand(0).hasOneUse() &&
+ Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
+ Op.getOperand(1).hasOneUse());
+}
+
+SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Chain = Op.getOperand(0);
+ SDValue Cond = Op.getOperand(1);
+ SDValue Dest = Op.getOperand(2);
+ SDLoc dl(Op);
+
+ if (Cond.getOpcode() == ISD::SETCC &&
+ Cond.getOperand(0).getValueType() != MVT::f128) {
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+ // Special case for
+ // setcc([su]{add,sub,mul}o == 0)
+ // setcc([su]{add,sub,mul}o != 1)
+ if (ISD::isOverflowIntrOpRes(LHS) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE) &&
+ (isNullConstant(RHS) || isOneConstant(RHS))) {
+ SDValue Value, Overflow;
+ X86::CondCode X86Cond;
+ std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
+
+ if ((CC == ISD::SETEQ) == isNullConstant(RHS))
+ X86Cond = X86::GetOppositeBranchCondition(X86Cond);
+
+ SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Overflow);
+ }
+
+ if (LHS.getSimpleValueType().isInteger()) {
+ SDValue CCVal;
+ SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ EFLAGS);
+ }
+
+ if (CC == ISD::SETOEQ) {
+ // For FCMP_OEQ, we can emit
+ // two branches instead of an explicit AND instruction with a
+ // separate test. However, we only do this if this block doesn't
+ // have a fall-through edge, because this requires an explicit
+ // jmp when the condition is false.
+ if (Op.getNode()->hasOneUse()) {
+ SDNode *User = *Op.getNode()->use_begin();
+ // Look for an unconditional branch following this conditional branch.
+ // We need this because we need to reverse the successors in order
+ // to implement FCMP_OEQ.
+ if (User->getOpcode() == ISD::BR) {
+ SDValue FalseBB = User->getOperand(1);
+ SDNode *NewBR =
+ DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+ assert(NewBR == User);
+ (void)NewBR;
+ Dest = FalseBB;
+
+ SDValue Cmp =
+ DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
+ SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
+ CCVal, Cmp);
+ CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Cmp);
+ }
+ }
+ } else if (CC == ISD::SETUNE) {
+ // For FCMP_UNE, we can emit
+ // two branches instead of an explicit OR instruction with a
+ // separate test.
+ SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
+ SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
+ Chain =
+ DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
+ CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Cmp);
+ } else {
+ X86::CondCode X86Cond =
+ TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
+ SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
+ SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Cmp);
+ }
+ }
+
+ if (ISD::isOverflowIntrOpRes(Cond)) {
+ SDValue Value, Overflow;
+ X86::CondCode X86Cond;
+ std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
+
+ SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Overflow);
+ }
+
+ // Look past the truncate if the high bits are known zero.
+ if (isTruncWithZeroHighBitsInput(Cond, DAG))
+ Cond = Cond.getOperand(0);
+
+ EVT CondVT = Cond.getValueType();
+
+ // Add an AND with 1 if we don't already have one.
+ if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
+ Cond =
+ DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
+
+ SDValue LHS = Cond;
+ SDValue RHS = DAG.getConstant(0, dl, CondVT);
+
+ SDValue CCVal;
+ SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ EFLAGS);
+}
+
+// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
+// Calls to _alloca are needed to probe the stack when allocating more than 4k
+// bytes in one go. Touching the stack at 4K increments is necessary to ensure
+// that the guard pages used by the OS virtual memory manager are allocated in
+// correct sequence.
+SDValue
+X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool SplitStack = MF.shouldSplitStack();
+ bool EmitStackProbeCall = hasStackProbeSymbol(MF);
+ bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
+ SplitStack || EmitStackProbeCall;
+ SDLoc dl(Op);
+
+ // Get the inputs.
+ SDNode *Node = Op.getNode();
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ MaybeAlign Alignment(Op.getConstantOperandVal(2));
+ EVT VT = Node->getValueType(0);
+
+ // Chain the dynamic stack allocation so that it doesn't modify the stack
+ // pointer when other instructions are using the stack.
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
+
+ bool Is64Bit = Subtarget.is64Bit();
+ MVT SPTy = getPointerTy(DAG.getDataLayout());
+
+ SDValue Result;
+ if (!Lower) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
+ assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
+ " not tell us which reg is the stack pointer!");
+
+ const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
+ const Align StackAlign = TFI.getStackAlign();
+ if (hasInlineStackProbe(MF)) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
+ Register Vreg = MRI.createVirtualRegister(AddrRegClass);
+ Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
+ Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
+ DAG.getRegister(Vreg, SPTy));
+ } else {
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+ Chain = SP.getValue(1);
+ Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
+ }
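+    // Round the new stack pointer down to the requested alignment by clearing
+    // its low bits; the stack grows down, so this only over-allocates.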
+ if (Alignment && *Alignment > StackAlign)
+ Result =
+ DAG.getNode(ISD::AND, dl, VT, Result,
+ DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
+ Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
+ } else if (SplitStack) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ if (Is64Bit) {
+      // The 64-bit implementation of segmented stacks needs to clobber both
+      // r10 and r11, which makes it impossible to use segmented stacks along
+      // with nested parameters.
+ const Function &F = MF.getFunction();
+ for (const auto &A : F.args()) {
+ if (A.hasNestAttr())
+ report_fatal_error("Cannot use segmented stacks with functions that "
+ "have nested arguments.");
+ }
+ }
+
+ const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
+ Register Vreg = MRI.createVirtualRegister(AddrRegClass);
+ Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
+ Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
+ DAG.getRegister(Vreg, SPTy));
+ } else {
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
+ MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
+
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ Register SPReg = RegInfo->getStackRegister();
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
+ Chain = SP.getValue(1);
+
+ if (Alignment) {
+ SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+ DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
+ Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
+ }
+
+ Result = SP;
+ }
+
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
+ DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
+
+ SDValue Ops[2] = {Result, Chain};
+ return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ auto PtrVT = getPointerTy(MF.getDataLayout());
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ SDLoc DL(Op);
+
+ if (!Subtarget.is64Bit() ||
+ Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
+ // vastart just stores the address of the VarArgsFrameIndex slot into the
+ // memory location argument.
+ SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+ return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
+ MachinePointerInfo(SV));
+ }
+
+ // __va_list_tag:
+ // gp_offset (0 - 6 * 8)
+ // fp_offset (48 - 48 + 8 * 16)
+ // overflow_arg_area (point to parameters coming in memory).
+ // reg_save_area
+ SmallVector<SDValue, 8> MemOps;
+ SDValue FIN = Op.getOperand(1);
+ // Store gp_offset
+ SDValue Store = DAG.getStore(
+ Op.getOperand(0), DL,
+ DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
+ MachinePointerInfo(SV));
+ MemOps.push_back(Store);
+
+ // Store fp_offset
+ FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
+ Store = DAG.getStore(
+ Op.getOperand(0), DL,
+ DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
+ MachinePointerInfo(SV, 4));
+ MemOps.push_back(Store);
+
+ // Store ptr to overflow_arg_area
+ FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
+ SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+ Store =
+ DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
+ MemOps.push_back(Store);
+
+ // Store ptr to reg_save_area.
+ FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
+ Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
+ SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
+ Store = DAG.getStore(
+ Op.getOperand(0), DL, RSFIN, FIN,
+ MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
+ MemOps.push_back(Store);
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
+}
+
+SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
+ assert(Subtarget.is64Bit() &&
+ "LowerVAARG only handles 64-bit va_arg!");
+ assert(Op.getNumOperands() == 4);
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
+ // The Win64 ABI uses char* instead of a structure.
+ return DAG.expandVAArg(Op.getNode());
+
+ SDValue Chain = Op.getOperand(0);
+ SDValue SrcPtr = Op.getOperand(1);
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ unsigned Align = Op.getConstantOperandVal(3);
+ SDLoc dl(Op);
+
+ EVT ArgVT = Op.getNode()->getValueType(0);
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
+ uint8_t ArgMode;
+
+ // Decide which area this value should be read from.
+ // TODO: Implement the AMD64 ABI in its entirety. This simple
+ // selection mechanism works only for the basic types.
+ assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
+ if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
+ ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
+ } else {
+ assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
+ "Unhandled argument type in LowerVAARG");
+ ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
+ }
+
+ if (ArgMode == 2) {
+ // Sanity Check: Make sure using fp_offset makes sense.
+ assert(!Subtarget.useSoftFloat() &&
+ !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
+ Subtarget.hasSSE1());
+ }
+
+ // Insert VAARG node into the DAG
+ // VAARG returns two values: Variable Argument Address, Chain
+ SDValue InstOps[] = {Chain, SrcPtr,
+ DAG.getTargetConstant(ArgSize, dl, MVT::i32),
+ DAG.getTargetConstant(ArgMode, dl, MVT::i8),
+ DAG.getTargetConstant(Align, dl, MVT::i32)};
+ SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
+ SDValue VAARG = DAG.getMemIntrinsicNode(
+ Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
+ VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
+ /*Alignment=*/None,
+ MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
+ Chain = VAARG.getValue(1);
+
+ // Load the next argument and return it
+ return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
+}
+
+static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
+ // where a va_list is still an i8*.
+ assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
+ if (Subtarget.isCallingConvWin64(
+ DAG.getMachineFunction().getFunction().getCallingConv()))
+ // Probably a Win64 va_copy.
+ return DAG.expandVACopy(Op.getNode());
+
+ SDValue Chain = Op.getOperand(0);
+ SDValue DstPtr = Op.getOperand(1);
+ SDValue SrcPtr = Op.getOperand(2);
+ const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+ const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+ SDLoc DL(Op);
+
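+ // Copy the whole { i32, i32, i8*, i8* } va_list struct: 24 bytes under
+ // LP64, 16 bytes under ILP32.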
+ return DAG.getMemcpy(
+ Chain, DL, DstPtr, SrcPtr,
+ DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
+ Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
+ false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
+}
+
+// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
+static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
+ switch (Opc) {
+ case ISD::SHL:
+ case X86ISD::VSHL:
+ case X86ISD::VSHLI:
+ return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
+ case ISD::SRL:
+ case X86ISD::VSRL:
+ case X86ISD::VSRLI:
+ return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
+ case ISD::SRA:
+ case X86ISD::VSRA:
+ case X86ISD::VSRAI:
+ return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
+ }
+ llvm_unreachable("Unknown target vector shift node");
+}
+
+/// Handle vector element shifts where the shift amount is a constant.
+/// Takes immediate version of shift as input.
+static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
+ SDValue SrcOp, uint64_t ShiftAmt,
+ SelectionDAG &DAG) {
+ MVT ElementType = VT.getVectorElementType();
+
+ // Bitcast the source vector to the output type; this is mainly necessary
+ // for vXi8/vXi64 shifts.
+ if (VT != SrcOp.getSimpleValueType())
+ SrcOp = DAG.getBitcast(VT, SrcOp);
+
+ // Fold this packed shift into its first operand if ShiftAmt is 0.
+ if (ShiftAmt == 0)
+ return SrcOp;
+
+ // Check for ShiftAmt >= element width
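+ // Logical shifts by at least the element width produce zero; arithmetic
+ // shifts saturate to a shift by the element width minus one.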
+ if (ShiftAmt >= ElementType.getSizeInBits()) {
+ if (Opc == X86ISD::VSRAI)
+ ShiftAmt = ElementType.getSizeInBits() - 1;
+ else
+ return DAG.getConstant(0, dl, VT);
+ }
+
+ assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
+ && "Unknown target vector shift-by-constant node");
+
+ // Fold this packed vector shift into a build vector if SrcOp is a
+ // vector of Constants or UNDEFs.
+ if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
+ SmallVector<SDValue, 8> Elts;
+ unsigned NumElts = SrcOp->getNumOperands();
+
+ switch (Opc) {
+ default: llvm_unreachable("Unknown opcode!");
+ case X86ISD::VSHLI:
+ for (unsigned i = 0; i != NumElts; ++i) {
+ SDValue CurrentOp = SrcOp->getOperand(i);
+ if (CurrentOp->isUndef()) {
+ // Must produce 0s in the correct bits.
+ Elts.push_back(DAG.getConstant(0, dl, ElementType));
+ continue;
+ }
+ auto *ND = cast<ConstantSDNode>(CurrentOp);
+ const APInt &C = ND->getAPIntValue();
+ Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
+ }
+ break;
+ case X86ISD::VSRLI:
+ for (unsigned i = 0; i != NumElts; ++i) {
+ SDValue CurrentOp = SrcOp->getOperand(i);
+ if (CurrentOp->isUndef()) {
+ // Must produce 0s in the correct bits.
+ Elts.push_back(DAG.getConstant(0, dl, ElementType));
+ continue;
+ }
+ auto *ND = cast<ConstantSDNode>(CurrentOp);
+ const APInt &C = ND->getAPIntValue();
+ Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
+ }
+ break;
+ case X86ISD::VSRAI:
+ for (unsigned i = 0; i != NumElts; ++i) {
+ SDValue CurrentOp = SrcOp->getOperand(i);
+ if (CurrentOp->isUndef()) {
+ // All shifted in bits must be the same so use 0.
+ Elts.push_back(DAG.getConstant(0, dl, ElementType));
+ continue;
+ }
+ auto *ND = cast<ConstantSDNode>(CurrentOp);
+ const APInt &C = ND->getAPIntValue();
+ Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
+ }
+ break;
+ }
+
+ return DAG.getBuildVector(VT, dl, Elts);
+ }
+
+ return DAG.getNode(Opc, dl, VT, SrcOp,
+ DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
+}
+
+/// Handle vector element shifts where the shift amount may or may not be a
+/// constant. Takes immediate version of shift as input.
+static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
+ SDValue SrcOp, SDValue ShAmt,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT SVT = ShAmt.getSimpleValueType();
+ assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
+
+ // Catch shift-by-constant.
+ if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
+ return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
+ CShAmt->getZExtValue(), DAG);
+
+ // Change opcode to non-immediate version.
+ Opc = getTargetVShiftUniformOpcode(Opc, true);
+
+ // Need to build a vector containing shift amount.
+ // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
+ // +====================+============+=======================================+
+ // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
+ // +====================+============+=======================================+
+ // | i64 | Yes, No | Use ShAmt as lowest elt |
+ // | i32 | Yes | zero-extend in-reg |
+ // | (i32 zext(i16/i8)) | Yes | zero-extend in-reg |
+ // | (i32 zext(i16/i8)) | No | byte-shift-in-reg |
+ // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
+ // +====================+============+=======================================+
+
+ if (SVT == MVT::i64)
+ ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
+ else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
+ ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ (ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
+ ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
+ ShAmt = ShAmt.getOperand(0);
+ MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
+ ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
+ if (Subtarget.hasSSE41())
+ ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
+ MVT::v2i64, ShAmt);
+ else {
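+ // Without SSE4.1, zero-extend the shift amount in-reg by byte-shifting it
+ // to the top of the 128-bit vector and then logically back down.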
+ SDValue ByteShift = DAG.getTargetConstant(
+ (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
+ ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
+ ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
+ ByteShift);
+ ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
+ ByteShift);
+ }
+ } else if (Subtarget.hasSSE41() &&
+ ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
+ ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
+ MVT::v2i64, ShAmt);
+ } else {
+ SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
+ DAG.getUNDEF(SVT)};
+ ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
+ }
+
+ // The return type has to be a 128-bit type with the same element
+ // type as the input type.
+ MVT EltVT = VT.getVectorElementType();
+ MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
+
+ ShAmt = DAG.getBitcast(ShVT, ShAmt);
+ return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
+}
+
+/// Return Mask with the necessary casting or extending
+/// for \p Mask according to \p MaskVT when lowering masking intrinsics
+static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ const SDLoc &dl) {
+
+ if (isAllOnesConstant(Mask))
+ return DAG.getConstant(1, dl, MaskVT);
+ if (X86::isZeroNode(Mask))
+ return DAG.getConstant(0, dl, MaskVT);
+
+ assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
+
+ if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
+ assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
+ assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
+ // In 32-bit mode, bitcasting an i64 mask is illegal, so split it into two
+ // i32 halves and concatenate the resulting v32i1 vectors.
+ SDValue Lo, Hi;
+ Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
+ DAG.getConstant(0, dl, MVT::i32));
+ Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
+ DAG.getConstant(1, dl, MVT::i32));
+
+ Lo = DAG.getBitcast(MVT::v32i1, Lo);
+ Hi = DAG.getBitcast(MVT::v32i1, Hi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
+ } else {
+ MVT BitcastVT = MVT::getVectorVT(MVT::i1,
+ Mask.getSimpleValueType().getSizeInBits());
+ // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements are
+ // extracted by the EXTRACT_SUBVECTOR below.
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+ DAG.getBitcast(BitcastVT, Mask),
+ DAG.getIntPtrConstant(0, dl));
+ }
+}
+
+/// Return (and \p Op, \p Mask) for compare instructions or
+/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
+/// necessary casting or extending for \p Mask when lowering masking intrinsics
+static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
+ SDValue PreservedSrc,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ unsigned OpcodeSelect = ISD::VSELECT;
+ SDLoc dl(Op);
+
+ if (isAllOnesConstant(Mask))
+ return Op;
+
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+
+ if (PreservedSrc.isUndef())
+ PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+ return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
+}
+
+/// Creates an SDNode for a predicated scalar operation.
+/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
+/// The mask comes in as MVT::i8 and should be transformed
+/// to MVT::v1i1 while lowering masking intrinsics.
+/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
+/// "X86select" instead of "vselect". We just can't create the "vselect" node
+/// for a scalar instruction.
+static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
+ SDValue PreservedSrc,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+
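+ // If the low mask bit is known to be set, the result element is taken
+ // unconditionally and no masking is needed.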
+ if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
+ if (MaskConst->getZExtValue() & 0x1)
+ return Op;
+
+ MVT VT = Op.getSimpleValueType();
+ SDLoc dl(Op);
+
+ assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
+ SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
+ DAG.getBitcast(MVT::v8i1, Mask),
+ DAG.getIntPtrConstant(0, dl));
+ if (Op.getOpcode() == X86ISD::FSETCCM ||
+ Op.getOpcode() == X86ISD::FSETCCM_SAE ||
+ Op.getOpcode() == X86ISD::VFPCLASSS)
+ return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
+
+ if (PreservedSrc.isUndef())
+ PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+ return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
+}
+
+static int getSEHRegistrationNodeSize(const Function *Fn) {
+ if (!Fn->hasPersonalityFn())
+ report_fatal_error(
+ "querying registration node size for function without personality");
+ // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
+ // WinEHStatePass for the full struct definition.
+ switch (classifyEHPersonality(Fn->getPersonalityFn())) {
+ case EHPersonality::MSVC_X86SEH: return 24;
+ case EHPersonality::MSVC_CXX: return 16;
+ default: break;
+ }
+ report_fatal_error(
+ "can only recover FP for 32-bit MSVC EH personality functions");
+}
+
+/// When the MSVC runtime transfers control to us, either to an outlined
+/// function or when returning to a parent frame after catching an exception, we
+/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
+/// Here's the math:
+/// RegNodeBase = EntryEBP - RegNodeSize
+/// ParentFP = RegNodeBase - ParentFrameOffset
+/// Subtracting RegNodeSize takes us to the offset of the registration node, and
+/// subtracting the offset (negative on x86) takes us back to the parent FP.
+static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
+ SDValue EntryEBP) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ SDLoc dl;
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+
+ // It's possible that the parent function no longer has a personality function
+ // if the exceptional code was optimized away, in which case we just return
+ // the incoming EBP.
+ if (!Fn->hasPersonalityFn())
+ return EntryEBP;
+
+ // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
+ // registration, or the .set_setframe offset.
+ MCSymbol *OffsetSym =
+ MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
+ GlobalValue::dropLLVMManglingEscape(Fn->getName()));
+ SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
+ SDValue ParentFrameOffset =
+ DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
+
+ // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
+ // prologue to RBP in the parent function.
+ const X86Subtarget &Subtarget =
+ static_cast<const X86Subtarget &>(DAG.getSubtarget());
+ if (Subtarget.is64Bit())
+ return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
+
+ int RegNodeSize = getSEHRegistrationNodeSize(Fn);
+ // RegNodeBase = EntryEBP - RegNodeSize
+ // ParentFP = RegNodeBase - ParentFrameOffset
+ SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
+ DAG.getConstant(RegNodeSize, dl, PtrVT));
+ return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
+}
+
+SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+ SelectionDAG &DAG) const {
+ // Helper to detect if the operand is CUR_DIRECTION rounding mode.
+ auto isRoundModeCurDirection = [](SDValue Rnd) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
+ return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
+
+ return false;
+ };
+ auto isRoundModeSAE = [](SDValue Rnd) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
+ unsigned RC = C->getZExtValue();
+ if (RC & X86::STATIC_ROUNDING::NO_EXC) {
+ // Clear the NO_EXC bit and check remaining bits.
+ RC ^= X86::STATIC_ROUNDING::NO_EXC;
+ // As a convenience, allow either no other bits set or an explicit
+ // current-direction rounding mode.
+ return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
+ }
+ }
+
+ return false;
+ };
+ auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
+ RC = C->getZExtValue();
+ if (RC & X86::STATIC_ROUNDING::NO_EXC) {
+ // Clear the NO_EXC bit and check remaining bits.
+ RC ^= X86::STATIC_ROUNDING::NO_EXC;
+ return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
+ RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
+ RC == X86::STATIC_ROUNDING::TO_POS_INF ||
+ RC == X86::STATIC_ROUNDING::TO_ZERO;
+ }
+ }
+
+ return false;
+ };
+
+ SDLoc dl(Op);
+ unsigned IntNo = Op.getConstantOperandVal(0);
+ MVT VT = Op.getSimpleValueType();
+ const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
+
+ if (IntrData) {
+ switch(IntrData->Type) {
+ case INTR_TYPE_1OP: {
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(2);
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
+ return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
+ Op.getOperand(1),
+ DAG.getTargetConstant(RC, dl, MVT::i32));
+ if (!isRoundModeCurDirection(Rnd))
+ return SDValue();
+ }
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Op.getOperand(1));
+ }
+ case INTR_TYPE_1OP_SAE: {
+ SDValue Sae = Op.getOperand(2);
+
+ unsigned Opc;
+ if (isRoundModeCurDirection(Sae))
+ Opc = IntrData->Opc0;
+ else if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else
+ return SDValue();
+
+ return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
+ }
+ case INTR_TYPE_2OP: {
+ SDValue Src2 = Op.getOperand(2);
+
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(3);
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
+ return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
+ Op.getOperand(1), Src2,
+ DAG.getTargetConstant(RC, dl, MVT::i32));
+ if (!isRoundModeCurDirection(Rnd))
+ return SDValue();
+ }
+
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Op.getOperand(1), Src2);
+ }
+ case INTR_TYPE_2OP_SAE: {
+ SDValue Sae = Op.getOperand(3);
+
+ unsigned Opc;
+ if (isRoundModeCurDirection(Sae))
+ Opc = IntrData->Opc0;
+ else if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else
+ return SDValue();
+
+ return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2));
+ }
+ case INTR_TYPE_3OP:
+ case INTR_TYPE_3OP_IMM8: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+
+ if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
+ Src3.getValueType() != MVT::i8) {
+ Src3 = DAG.getTargetConstant(
+ cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
+ }
+
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(4);
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
+ return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
+ Src1, Src2, Src3,
+ DAG.getTargetConstant(RC, dl, MVT::i32));
+ if (!isRoundModeCurDirection(Rnd))
+ return SDValue();
+ }
+
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ {Src1, Src2, Src3});
+ }
+ case INTR_TYPE_4OP_IMM8: {
+ assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
+ SDValue Src4 = Op.getOperand(4);
+ if (Src4.getValueType() != MVT::i8) {
+ Src4 = DAG.getTargetConstant(
+ cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
+ }
+
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+ Src4);
+ }
+ case INTR_TYPE_1OP_MASK: {
+ SDValue Src = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ // We add rounding mode to the Node when
+ // - RC Opcode is specified and
+ // - RC is not "current direction".
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(4);
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
+ return getVectorMaskingNode(
+ DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
+ Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
+ Mask, PassThru, Subtarget, DAG);
+ if (!isRoundModeCurDirection(Rnd))
+ return SDValue();
+ }
+ return getVectorMaskingNode(
+ DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
+ Subtarget, DAG);
+ }
+ case INTR_TYPE_1OP_MASK_SAE: {
+ SDValue Src = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ SDValue Rnd = Op.getOperand(4);
+
+ unsigned Opc;
+ if (isRoundModeCurDirection(Rnd))
+ Opc = IntrData->Opc0;
+ else if (isRoundModeSAE(Rnd))
+ Opc = IntrData->Opc1;
+ else
+ return SDValue();
+
+ return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
+ Subtarget, DAG);
+ }
+ case INTR_TYPE_SCALAR_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue passThru = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ // There are 2 kinds of intrinsics in this group:
+ // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
+ // (2) With rounding mode and sae - 7 operands.
+ bool HasRounding = IntrWithRoundingModeOpcode != 0;
+ if (Op.getNumOperands() == (5U + HasRounding)) {
+ if (HasRounding) {
+ SDValue Rnd = Op.getOperand(5);
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
+ return getScalarMaskingNode(
+ DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
+ DAG.getTargetConstant(RC, dl, MVT::i32)),
+ Mask, passThru, Subtarget, DAG);
+ if (!isRoundModeCurDirection(Rnd))
+ return SDValue();
+ }
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
+ Src2),
+ Mask, passThru, Subtarget, DAG);
+ }
+
+ assert(Op.getNumOperands() == (6U + HasRounding) &&
+ "Unexpected intrinsic form");
+ SDValue RoundingMode = Op.getOperand(5);
+ unsigned Opc = IntrData->Opc0;
+ if (HasRounding) {
+ SDValue Sae = Op.getOperand(6);
+ if (isRoundModeSAE(Sae))
+ Opc = IntrWithRoundingModeOpcode;
+ else if (!isRoundModeCurDirection(Sae))
+ return SDValue();
+ }
+ return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
+ Src2, RoundingMode),
+ Mask, passThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_SCALAR_MASK_RND: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue passThru = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ SDValue Rnd = Op.getOperand(5);
+
+ SDValue NewOp;
+ unsigned RC = 0;
+ if (isRoundModeCurDirection(Rnd))
+ NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
+ else if (isRoundModeSAEToX(Rnd, RC))
+ NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
+ DAG.getTargetConstant(RC, dl, MVT::i32));
+ else
+ return SDValue();
+
+ return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_SCALAR_MASK_SAE: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue passThru = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ SDValue Sae = Op.getOperand(5);
+ unsigned Opc;
+ if (isRoundModeCurDirection(Sae))
+ Opc = IntrData->Opc0;
+ else if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else
+ return SDValue();
+
+ return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
+ Mask, passThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_2OP_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue PassThru = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ SDValue NewOp;
+ if (IntrData->Opc1 != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
+ NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
+ DAG.getTargetConstant(RC, dl, MVT::i32));
+ else if (!isRoundModeCurDirection(Rnd))
+ return SDValue();
+ }
+ if (!NewOp)
+ NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
+ return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_2OP_MASK_SAE: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue PassThru = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+
+ unsigned Opc = IntrData->Opc0;
+ if (IntrData->Opc1 != 0) {
+ SDValue Sae = Op.getOperand(5);
+ if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else if (!isRoundModeCurDirection(Sae))
+ return SDValue();
+ }
+
+ return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue PassThru = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+ SDValue Sae = Op.getOperand(6);
+ unsigned Opc;
+ if (isRoundModeCurDirection(Sae))
+ Opc = IntrData->Opc0;
+ else if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else
+ return SDValue();
+
+ return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_3OP_MASK_SAE: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue PassThru = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+
+ unsigned Opc = IntrData->Opc0;
+ if (IntrData->Opc1 != 0) {
+ SDValue Sae = Op.getOperand(6);
+ if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else if (!isRoundModeCurDirection(Sae))
+ return SDValue();
+ }
+ return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case BLENDV: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+
+ EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
+ Src3 = DAG.getBitcast(MaskVT, Src3);
+
+ // Reverse the operands to match VSELECT order.
+ return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
+ }
+ case VPERM_2OP : {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+
+ // Swap Src1 and Src2 in the node creation
+ return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
+ }
+ case IFMA_OP:
+ // NOTE: We need to swizzle the operands to pass the multiply operands
+ // first.
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
+ case FPCLASSS: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Imm = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
+ SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
+ Subtarget, DAG);
+ // Need to fill with zeros to ensure the bitcast will produce zeroes
+ // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
+ SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+ DAG.getConstant(0, dl, MVT::v8i1),
+ FPclassMask, DAG.getIntPtrConstant(0, dl));
+ return DAG.getBitcast(MVT::i8, Ins);
+ }
+
+ case CMP_MASK_CC: {
+ MVT MaskVT = Op.getSimpleValueType();
+ SDValue CC = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ if (IntrData->Opc1 != 0) {
+ SDValue Sae = Op.getOperand(5);
+ if (isRoundModeSAE(Sae))
+ return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
+ Op.getOperand(2), CC, Mask, Sae);
+ if (!isRoundModeCurDirection(Sae))
+ return SDValue();
+ }
+ // Default rounding mode.
+ return DAG.getNode(IntrData->Opc0, dl, MaskVT,
+ {Op.getOperand(1), Op.getOperand(2), CC, Mask});
+ }
+ case CMP_MASK_SCALAR_CC: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue CC = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+
+ SDValue Cmp;
+ if (IntrData->Opc1 != 0) {
+ SDValue Sae = Op.getOperand(5);
+ if (isRoundModeSAE(Sae))
+ Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
+ else if (!isRoundModeCurDirection(Sae))
+ return SDValue();
+ }
+ // Default rounding mode.
+ if (!Cmp.getNode())
+ Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
+
+ SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
+ Subtarget, DAG);
+ // Need to fill with zeros to ensure the bitcast will produce zeroes
+ // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
+ SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+ DAG.getConstant(0, dl, MVT::v8i1),
+ CmpMask, DAG.getIntPtrConstant(0, dl));
+ return DAG.getBitcast(MVT::i8, Ins);
+ }
+ case COMI: { // Comparison intrinsics
+ ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ // Some conditions require the operands to be swapped.
+ if (CC == ISD::SETLT || CC == ISD::SETLE)
+ std::swap(LHS, RHS);
+
+ SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
+ SDValue SetCC;
+ switch (CC) {
+ case ISD::SETEQ: { // (ZF = 0 and PF = 0)
+ SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
+ SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
+ SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
+ break;
+ }
+ case ISD::SETNE: { // (ZF = 1 or PF = 1)
+ SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
+ SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
+ SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
+ break;
+ }
+ case ISD::SETGT: // (CF = 0 and ZF = 0)
+ case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
+ SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
+ break;
+ }
+ case ISD::SETGE: // CF = 0
+ case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
+ SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
+ break;
+ default:
+ llvm_unreachable("Unexpected illegal condition!");
+ }
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
+ case COMI_RM: { // Comparison intrinsics with Sae
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ unsigned CondVal = Op.getConstantOperandVal(3);
+ SDValue Sae = Op.getOperand(4);
+
+ SDValue FCmp;
+ if (isRoundModeCurDirection(Sae))
+ FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
+ DAG.getTargetConstant(CondVal, dl, MVT::i8));
+ else if (isRoundModeSAE(Sae))
+ FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
+ DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
+ else
+ return SDValue();
+ // Need to fill with zeros to ensure the bitcast will produce zeroes
+ // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
+ SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
+ DAG.getConstant(0, dl, MVT::v16i1),
+ FCmp, DAG.getIntPtrConstant(0, dl));
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
+ DAG.getBitcast(MVT::i16, Ins));
+ }
+ case VSHIFT:
+ return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
+ Op.getOperand(1), Op.getOperand(2), Subtarget,
+ DAG);
+ case COMPRESS_EXPAND_IN_REG: {
+ SDValue Mask = Op.getOperand(3);
+ SDValue DataToCompress = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
+ return Op.getOperand(1);
+
+ // Avoid false dependency.
+ if (PassThru.isUndef())
+ PassThru = DAG.getConstant(0, dl, VT);
+
+ return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
+ Mask);
+ }
+ case FIXUPIMM:
+ case FIXUPIMM_MASKZ: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Imm = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+ SDValue Passthru = (IntrData->Type == FIXUPIMM)
+ ? Src1
+ : getZeroVector(VT, Subtarget, DAG, dl);
+
+ unsigned Opc = IntrData->Opc0;
+ if (IntrData->Opc1 != 0) {
+ SDValue Sae = Op.getOperand(6);
+ if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else if (!isRoundModeCurDirection(Sae))
+ return SDValue();
+ }
+
+ SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
+
+ if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
+ return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
+
+ return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
+ }
+ case ROUNDP: {
+ assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
+ // Clear the upper bits of the rounding immediate so that the legacy
+ // intrinsic can't trigger the scaling behavior of VRNDSCALE.
+ auto Round = cast<ConstantSDNode>(Op.getOperand(2));
+ SDValue RoundingMode =
+ DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Op.getOperand(1), RoundingMode);
+ }
+ case ROUNDS: {
+ assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
+ // Clear the upper bits of the rounding immediate so that the legacy
+ // intrinsic can't trigger the scaling behavior of VRNDSCALE.
+ auto Round = cast<ConstantSDNode>(Op.getOperand(3));
+ SDValue RoundingMode =
+ DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), RoundingMode);
+ }
+ case BEXTRI: {
+ assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
+
+ uint64_t Imm = Op.getConstantOperandVal(2);
+ SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
+ Op.getValueType());
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Op.getOperand(1), Control);
+ }
+ // ADC/ADCX/SBB
+ case ADX: {
+ SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
+ SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
+
+ SDValue Res;
+ // If the carry in is zero, then we should just use ADD/SUB instead of
+ // ADC/SBB.
+ if (isNullConstant(Op.getOperand(1))) {
+ Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
+ Op.getOperand(3));
+ } else {
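+ // Adding -1 to a non-zero carry-in produces a carry-out, regenerating CF
+ // for the ADC/SBB node below.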
+ SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
+ DAG.getConstant(-1, dl, MVT::i8));
+ Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
+ Op.getOperand(3), GenCF.getValue(1));
+ }
+ SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
+ SDValue Results[] = { SetCC, Res };
+ return DAG.getMergeValues(Results, dl);
+ }
+ case CVTPD2PS_MASK:
+ case CVTPD2DQ_MASK:
+ case CVTQQ2PS_MASK:
+ case TRUNCATE_TO_REG: {
+ SDValue Src = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+
+ if (isAllOnesConstant(Mask))
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
+
+ MVT SrcVT = Src.getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
+ Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+ return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
+ {Src, PassThru, Mask});
+ }
+ case CVTPS2PH_MASK: {
+ SDValue Src = Op.getOperand(1);
+ SDValue Rnd = Op.getOperand(2);
+ SDValue PassThru = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+
+ if (isAllOnesConstant(Mask))
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
+
+ MVT SrcVT = Src.getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
+ Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+ return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
+ PassThru, Mask);
+
+ }
+ case CVTNEPS2BF16_MASK: {
+ SDValue Src = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+
+ if (ISD::isBuildVectorAllOnes(Mask.getNode()))
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
+
+ // Break false dependency.
+ if (PassThru.isUndef())
+ PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
+
+ return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
+ Mask);
+ }
+ default:
+ break;
+ }
+ }
+
+ switch (IntNo) {
+ default: return SDValue(); // Don't custom lower most intrinsics.
+
+ // ptest and testp intrinsics. The intrinsics these come from are designed to
+ // return an integer value rather than just an instruction, so lower them to
+ // the ptest or testp pattern and a setcc for the result.
+ case Intrinsic::x86_avx512_ktestc_b:
+ case Intrinsic::x86_avx512_ktestc_w:
+ case Intrinsic::x86_avx512_ktestc_d:
+ case Intrinsic::x86_avx512_ktestc_q:
+ case Intrinsic::x86_avx512_ktestz_b:
+ case Intrinsic::x86_avx512_ktestz_w:
+ case Intrinsic::x86_avx512_ktestz_d:
+ case Intrinsic::x86_avx512_ktestz_q:
+ case Intrinsic::x86_sse41_ptestz:
+ case Intrinsic::x86_sse41_ptestc:
+ case Intrinsic::x86_sse41_ptestnzc:
+ case Intrinsic::x86_avx_ptestz_256:
+ case Intrinsic::x86_avx_ptestc_256:
+ case Intrinsic::x86_avx_ptestnzc_256:
+ case Intrinsic::x86_avx_vtestz_ps:
+ case Intrinsic::x86_avx_vtestc_ps:
+ case Intrinsic::x86_avx_vtestnzc_ps:
+ case Intrinsic::x86_avx_vtestz_pd:
+ case Intrinsic::x86_avx_vtestc_pd:
+ case Intrinsic::x86_avx_vtestnzc_pd:
+ case Intrinsic::x86_avx_vtestz_ps_256:
+ case Intrinsic::x86_avx_vtestc_ps_256:
+ case Intrinsic::x86_avx_vtestnzc_ps_256:
+ case Intrinsic::x86_avx_vtestz_pd_256:
+ case Intrinsic::x86_avx_vtestc_pd_256:
+ case Intrinsic::x86_avx_vtestnzc_pd_256: {
+ unsigned TestOpc = X86ISD::PTEST;
+ X86::CondCode X86CC;
+ switch (IntNo) {
+ default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
+ case Intrinsic::x86_avx512_ktestc_b:
+ case Intrinsic::x86_avx512_ktestc_w:
+ case Intrinsic::x86_avx512_ktestc_d:
+ case Intrinsic::x86_avx512_ktestc_q:
+ // CF = 1
+ TestOpc = X86ISD::KTEST;
+ X86CC = X86::COND_B;
+ break;
+ case Intrinsic::x86_avx512_ktestz_b:
+ case Intrinsic::x86_avx512_ktestz_w:
+ case Intrinsic::x86_avx512_ktestz_d:
+ case Intrinsic::x86_avx512_ktestz_q:
+ TestOpc = X86ISD::KTEST;
+ X86CC = X86::COND_E;
+ break;
+ case Intrinsic::x86_avx_vtestz_ps:
+ case Intrinsic::x86_avx_vtestz_pd:
+ case Intrinsic::x86_avx_vtestz_ps_256:
+ case Intrinsic::x86_avx_vtestz_pd_256:
+ TestOpc = X86ISD::TESTP;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::x86_sse41_ptestz:
+ case Intrinsic::x86_avx_ptestz_256:
+ // ZF = 1
+ X86CC = X86::COND_E;
+ break;
+ case Intrinsic::x86_avx_vtestc_ps:
+ case Intrinsic::x86_avx_vtestc_pd:
+ case Intrinsic::x86_avx_vtestc_ps_256:
+ case Intrinsic::x86_avx_vtestc_pd_256:
+ TestOpc = X86ISD::TESTP;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::x86_sse41_ptestc:
+ case Intrinsic::x86_avx_ptestc_256:
+ // CF = 1
+ X86CC = X86::COND_B;
+ break;
+ case Intrinsic::x86_avx_vtestnzc_ps:
+ case Intrinsic::x86_avx_vtestnzc_pd:
+ case Intrinsic::x86_avx_vtestnzc_ps_256:
+ case Intrinsic::x86_avx_vtestnzc_pd_256:
+ TestOpc = X86ISD::TESTP;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::x86_sse41_ptestnzc:
+ case Intrinsic::x86_avx_ptestnzc_256:
+ // ZF and CF = 0
+ X86CC = X86::COND_A;
+ break;
+ }
+
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
+ SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
+
+ case Intrinsic::x86_sse42_pcmpistria128:
+ case Intrinsic::x86_sse42_pcmpestria128:
+ case Intrinsic::x86_sse42_pcmpistric128:
+ case Intrinsic::x86_sse42_pcmpestric128:
+ case Intrinsic::x86_sse42_pcmpistrio128:
+ case Intrinsic::x86_sse42_pcmpestrio128:
+ case Intrinsic::x86_sse42_pcmpistris128:
+ case Intrinsic::x86_sse42_pcmpestris128:
+ case Intrinsic::x86_sse42_pcmpistriz128:
+ case Intrinsic::x86_sse42_pcmpestriz128: {
+ unsigned Opcode;
+ X86::CondCode X86CC;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_sse42_pcmpistria128:
+ Opcode = X86ISD::PCMPISTR;
+ X86CC = X86::COND_A;
+ break;
+ case Intrinsic::x86_sse42_pcmpestria128:
+ Opcode = X86ISD::PCMPESTR;
+ X86CC = X86::COND_A;
+ break;
+ case Intrinsic::x86_sse42_pcmpistric128:
+ Opcode = X86ISD::PCMPISTR;
+ X86CC = X86::COND_B;
+ break;
+ case Intrinsic::x86_sse42_pcmpestric128:
+ Opcode = X86ISD::PCMPESTR;
+ X86CC = X86::COND_B;
+ break;
+ case Intrinsic::x86_sse42_pcmpistrio128:
+ Opcode = X86ISD::PCMPISTR;
+ X86CC = X86::COND_O;
+ break;
+ case Intrinsic::x86_sse42_pcmpestrio128:
+ Opcode = X86ISD::PCMPESTR;
+ X86CC = X86::COND_O;
+ break;
+ case Intrinsic::x86_sse42_pcmpistris128:
+ Opcode = X86ISD::PCMPISTR;
+ X86CC = X86::COND_S;
+ break;
+ case Intrinsic::x86_sse42_pcmpestris128:
+ Opcode = X86ISD::PCMPESTR;
+ X86CC = X86::COND_S;
+ break;
+ case Intrinsic::x86_sse42_pcmpistriz128:
+ Opcode = X86ISD::PCMPISTR;
+ X86CC = X86::COND_E;
+ break;
+ case Intrinsic::x86_sse42_pcmpestriz128:
+ Opcode = X86ISD::PCMPESTR;
+ X86CC = X86::COND_E;
+ break;
+ }
+ SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
+ SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
+ SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
+
+ case Intrinsic::x86_sse42_pcmpistri128:
+ case Intrinsic::x86_sse42_pcmpestri128: {
+ unsigned Opcode;
+ if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
+ Opcode = X86ISD::PCMPISTR;
+ else
+ Opcode = X86ISD::PCMPESTR;
+
+ SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
+ return DAG.getNode(Opcode, dl, VTs, NewOps);
+ }
+
+ case Intrinsic::x86_sse42_pcmpistrm128:
+ case Intrinsic::x86_sse42_pcmpestrm128: {
+ unsigned Opcode;
+ if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
+ Opcode = X86ISD::PCMPISTR;
+ else
+ Opcode = X86ISD::PCMPESTR;
+
+ SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
+ return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
+ }
+
+ case Intrinsic::eh_sjlj_lsda: {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ auto &Context = MF.getMMI().getContext();
+ MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
+ Twine(MF.getFunctionNumber()));
+ return DAG.getNode(getGlobalWrapperKind(), dl, VT,
+ DAG.getMCSymbol(S, PtrVT));
+ }
+
+ case Intrinsic::x86_seh_lsda: {
+ // Compute the symbol for the LSDA. We know it'll get emitted later.
+ MachineFunction &MF = DAG.getMachineFunction();
+ SDValue Op1 = Op.getOperand(1);
+ auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
+ MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
+ GlobalValue::dropLLVMManglingEscape(Fn->getName()));
+
+ // Generate a simple absolute symbol reference. This intrinsic is only
+ // supported on 32-bit Windows, which isn't PIC.
+ SDValue Result = DAG.getMCSymbol(LSDASym, VT);
+ return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
+ }
+
+ case Intrinsic::eh_recoverfp: {
+ SDValue FnOp = Op.getOperand(1);
+ SDValue IncomingFPOp = Op.getOperand(2);
+ GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
+ auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
+ if (!Fn)
+ report_fatal_error(
+ "llvm.eh.recoverfp must take a function as the first argument");
+ return recoverFramePointer(DAG, Fn, IncomingFPOp);
+ }
+
+ case Intrinsic::localaddress: {
+ // Returns one of the stack, base, or frame pointer registers, depending on
+ // which is used to reference local variables.
+ MachineFunction &MF = DAG.getMachineFunction();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ unsigned Reg;
+ if (RegInfo->hasBasePointer(MF))
+ Reg = RegInfo->getBaseRegister();
+ else { // Handles the SP or FP case.
+ bool CantUseFP = RegInfo->needsStackRealignment(MF);
+ if (CantUseFP)
+ Reg = RegInfo->getPtrSizedStackRegister(MF);
+ else
+ Reg = RegInfo->getPtrSizedFrameRegister(MF);
+ }
+ return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
+ }
+
+ case Intrinsic::x86_avx512_vp2intersect_q_512:
+ case Intrinsic::x86_avx512_vp2intersect_q_256:
+ case Intrinsic::x86_avx512_vp2intersect_q_128:
+ case Intrinsic::x86_avx512_vp2intersect_d_512:
+ case Intrinsic::x86_avx512_vp2intersect_d_256:
+ case Intrinsic::x86_avx512_vp2intersect_d_128: {
+ MVT MaskVT = Op.getSimpleValueType();
+
+ SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
+ SDLoc DL(Op);
+
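+ // VP2INTERSECT produces a pair of mask registers; the two results are
+ // extracted below via the sub_mask_0/sub_mask_1 subregister indices.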
+ SDValue Operation =
+ DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
+ Op->getOperand(1), Op->getOperand(2));
+
+ SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
+ MaskVT, Operation);
+ SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
+ MaskVT, Operation);
+ return DAG.getMergeValues({Result0, Result1}, DL);
+ }
+ case Intrinsic::x86_mmx_pslli_w:
+ case Intrinsic::x86_mmx_pslli_d:
+ case Intrinsic::x86_mmx_pslli_q:
+ case Intrinsic::x86_mmx_psrli_w:
+ case Intrinsic::x86_mmx_psrli_d:
+ case Intrinsic::x86_mmx_psrli_q:
+ case Intrinsic::x86_mmx_psrai_w:
+ case Intrinsic::x86_mmx_psrai_d: {
+ SDLoc DL(Op);
+ SDValue ShAmt = Op.getOperand(2);
+ // If the argument is a constant, convert it to a target constant.
+ if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
+ // Clamp out-of-bounds shift amounts since they will otherwise be masked
+ // to 8 bits, which may make them no longer out of bounds.
+ unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
+ if (ShiftAmount == 0)
+ return Op.getOperand(1);
+
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
+ Op.getOperand(0), Op.getOperand(1),
+ DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
+ }
+
+ unsigned NewIntrinsic;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_mmx_pslli_w:
+ NewIntrinsic = Intrinsic::x86_mmx_psll_w;
+ break;
+ case Intrinsic::x86_mmx_pslli_d:
+ NewIntrinsic = Intrinsic::x86_mmx_psll_d;
+ break;
+ case Intrinsic::x86_mmx_pslli_q:
+ NewIntrinsic = Intrinsic::x86_mmx_psll_q;
+ break;
+ case Intrinsic::x86_mmx_psrli_w:
+ NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
+ break;
+ case Intrinsic::x86_mmx_psrli_d:
+ NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
+ break;
+ case Intrinsic::x86_mmx_psrli_q:
+ NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
+ break;
+ case Intrinsic::x86_mmx_psrai_w:
+ NewIntrinsic = Intrinsic::x86_mmx_psra_w;
+ break;
+ case Intrinsic::x86_mmx_psrai_d:
+ NewIntrinsic = Intrinsic::x86_mmx_psra_d;
+ break;
+ }
+
+ // The vector shift intrinsics with scalar shift amounts use 32-bit values,
+ // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
+ // MMX register.
+ ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
+ DAG.getTargetConstant(NewIntrinsic, DL,
+ getPointerTy(DAG.getDataLayout())),
+ Op.getOperand(1), ShAmt);
+ }
+ }
+}
+
+static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+ SDValue Src, SDValue Mask, SDValue Base,
+ SDValue Index, SDValue ScaleOp, SDValue Chain,
+ const X86Subtarget &Subtarget) {
+ SDLoc dl(Op);
+ auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
+ // Scale must be constant.
+ if (!C)
+ return SDValue();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
+ EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
+ // If source is undef or we know it won't be used, use a zero vector
+ // to break register dependency.
+ // TODO: use undef instead and let BreakFalseDeps deal with it?
+ if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
+ Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
+
+ // Cast mask to an integer type.
+ Mask = DAG.getBitcast(MaskVT, Mask);
+
+ MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
+
+ SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
+ SDValue Res =
+ DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
+ MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+ return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
+}
+
+static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
+ SDValue Src, SDValue Mask, SDValue Base,
+ SDValue Index, SDValue ScaleOp, SDValue Chain,
+ const X86Subtarget &Subtarget) {
+ MVT VT = Op.getSimpleValueType();
+ SDLoc dl(Op);
+ auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
+ // Scale must be constant.
+ if (!C)
+ return SDValue();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
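+ // The mask needs one bit per gathered element, i.e. the narrower of the
+ // index and result vector widths.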
+ unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
+ VT.getVectorNumElements());
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
+
+ // We support two versions of the gather intrinsics. One with scalar mask and
+ // one with vXi1 mask. Convert scalar to vXi1 if necessary.
+ if (Mask.getValueType() != MaskVT)
+ Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
+ // If source is undef or we know it won't be used, use a zero vector
+ // to break register dependency.
+ // TODO: use undef instead and let BreakFalseDeps deal with it?
+ if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
+ Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
+
+ MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
+
+ SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
+ SDValue Res =
+ DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
+ MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+ return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
+}
+
+static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+ SDValue Src, SDValue Mask, SDValue Base,
+ SDValue Index, SDValue ScaleOp, SDValue Chain,
+ const X86Subtarget &Subtarget) {
+ SDLoc dl(Op);
+ auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
+ // Scale must be constant.
+ if (!C)
+ return SDValue();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
+ unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
+ Src.getSimpleValueType().getVectorNumElements());
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
+
+ // We support two versions of the scatter intrinsics. One with scalar mask and
+ // one with vXi1 mask. Convert scalar to vXi1 if necessary.
+ if (Mask.getValueType() != MaskVT)
+ Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+
+ MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
+
+ SDVTList VTs = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
+ SDValue Res =
+ DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
+ MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+ return Res;
+}
+
+static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+ SDValue Mask, SDValue Base, SDValue Index,
+ SDValue ScaleOp, SDValue Chain,
+ const X86Subtarget &Subtarget) {
+ SDLoc dl(Op);
+ auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
+ // Scale must be constant.
+ if (!C)
+ return SDValue();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
+ SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = DAG.getRegister(0, MVT::i32);
+ MVT MaskVT =
+ MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+ SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
+ SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
+ return SDValue(Res, 0);
+}
+
+/// Handles the lowering of builtin intrinsics with chain that return their
+/// value into registers EDX:EAX.
+/// If operand SrcReg is a valid register identifier, then operand 2 of N is
+/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
+/// TargetOpcode.
+/// Returns a Glue value which can be used to add extra copy-from-reg if the
+/// expanded intrinsic implicitly defines extra registers (i.e. not just
+/// EDX:EAX).
+static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
+ SelectionDAG &DAG,
+ unsigned TargetOpcode,
+ unsigned SrcReg,
+ const X86Subtarget &Subtarget,
+ SmallVectorImpl<SDValue> &Results) {
+ SDValue Chain = N->getOperand(0);
+ SDValue Glue;
+
+ if (SrcReg) {
+ assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+ Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
+ Glue = Chain.getValue(1);
+ }
+
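+ // Emit the target opcode as a machine node, threading the glue from the
+ // CopyToReg above so that SrcReg is live into it when present.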
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue N1Ops[] = {Chain, Glue};
+ SDNode *N1 = DAG.getMachineNode(
+ TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
+ Chain = SDValue(N1, 0);
+
+ // The expanded instruction returns its result in EDX:EAX; read it back
+ // (via RAX/RDX on 64-bit targets) below.
+ SDValue LO, HI;
+ if (Subtarget.is64Bit()) {
+ LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
+ HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
+ LO.getValue(2));
+ } else {
+ LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
+ HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
+ LO.getValue(2));
+ }
+ Chain = HI.getValue(1);
+ Glue = HI.getValue(2);
+
+ if (Subtarget.is64Bit()) {
+ // Merge the two 32-bit values into a 64-bit one.
+ SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
+ DAG.getConstant(32, DL, MVT::i8));
+ Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
+ Results.push_back(Chain);
+ return Glue;
+ }
+
+ // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+ SDValue Ops[] = { LO, HI };
+ SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
+ Results.push_back(Pair);
+ Results.push_back(Chain);
+ return Glue;
+}
+
+/// Handles the lowering of builtin intrinsics that read the time stamp counter
+/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
+/// READCYCLECOUNTER nodes.
+static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ SmallVectorImpl<SDValue> &Results) {
+ // The processor's time-stamp counter (a 64-bit MSR) is stored into the
+ // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
+ // and the EAX register is loaded with the low-order 32 bits.
+ SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
+ /* NoRegister */0, Subtarget,
+ Results);
+ if (Opcode != X86::RDTSCP)
+ return;
+
+ SDValue Chain = Results[1];
+ // The RDTSCP instruction also loads the IA32_TSC_AUX MSR (address
+ // C000_0103H) into the ECX register. Add 'ecx' explicitly to the chain.
+ SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
+ Results[1] = ecx;
+ Results.push_back(ecx.getValue(1));
+}
+
+static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SmallVector<SDValue, 3> Results;
+ SDLoc DL(Op);
+ getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
+ Results);
+ return DAG.getMergeValues(Results, DL);
+}
+
+static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ SDValue Chain = Op.getOperand(0);
+ SDValue RegNode = Op.getOperand(2);
+ WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
+ if (!EHInfo)
+ report_fatal_error("EH registrations only live in functions using WinEH");
+
+ // Cast the operand to an alloca, and remember the frame index.
+ auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
+ if (!FINode)
+ report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
+ EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
+
+ // Return the chain operand without making any DAG nodes.
+ return Chain;
+}
+
+static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ SDValue Chain = Op.getOperand(0);
+ SDValue EHGuard = Op.getOperand(2);
+ WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
+ if (!EHInfo)
+ report_fatal_error("EHGuard only live in functions using WinEH");
+
+ // Cast the operand to an alloca, and remember the frame index.
+ auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
+ if (!FINode)
+ report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
+ EHInfo->EHGuardFrameIndex = FINode->getIndex();
+
+ // Return the chain operand without making any DAG nodes.
+ return Chain;
+}
+
+/// Emit Truncating Store with signed or unsigned saturation.
+static SDValue
+EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
+ SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
+ SelectionDAG &DAG) {
+ SDVTList VTs = DAG.getVTList(MVT::Other);
+ SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
+ SDValue Ops[] = { Chain, Val, Ptr, Undef };
+ unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
+ return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
+}
+
+/// Emit Masked Truncating Store with signed or unsigned saturation.
+static SDValue
+EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
+ SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
+ MachineMemOperand *MMO, SelectionDAG &DAG) {
+ SDVTList VTs = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = { Chain, Val, Ptr, Mask };
+ unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
+ return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
+}
+
+static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ unsigned IntNo = Op.getConstantOperandVal(1);
+ const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
+ if (!IntrData) {
+ switch (IntNo) {
+ case llvm::Intrinsic::x86_seh_ehregnode:
+ return MarkEHRegistrationNode(Op, DAG);
+ case llvm::Intrinsic::x86_seh_ehguard:
+ return MarkEHGuard(Op, DAG);
+ case llvm::Intrinsic::x86_rdpkru: {
+ SDLoc dl(Op);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+ // Create a RDPKRU node and pass 0 to the ECX parameter.
+ return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
+ DAG.getConstant(0, dl, MVT::i32));
+ }
+ case llvm::Intrinsic::x86_wrpkru: {
+ SDLoc dl(Op);
+ // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
+ // to the EDX and ECX parameters.
+ return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
+ Op.getOperand(0), Op.getOperand(2),
+ DAG.getConstant(0, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32));
+ }
+ case llvm::Intrinsic::x86_flags_read_u32:
+ case llvm::Intrinsic::x86_flags_read_u64:
+ case llvm::Intrinsic::x86_flags_write_u32:
+ case llvm::Intrinsic::x86_flags_write_u64: {
+ // We need a frame pointer because this will get lowered to a PUSH/POP
+ // sequence.
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI.setHasCopyImplyingStackAdjustment(true);
+ // Don't do anything here, we will expand these intrinsics out later
+ // during FinalizeISel in EmitInstrWithCustomInserter.
+ return Op;
+ }
+ case Intrinsic::x86_lwpins32:
+ case Intrinsic::x86_lwpins64:
+ case Intrinsic::x86_umwait:
+ case Intrinsic::x86_tpause: {
+ SDLoc dl(Op);
+ SDValue Chain = Op->getOperand(0);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+ unsigned Opcode;
+
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic");
+ case Intrinsic::x86_umwait:
+ Opcode = X86ISD::UMWAIT;
+ break;
+ case Intrinsic::x86_tpause:
+ Opcode = X86ISD::TPAUSE;
+ break;
+ case Intrinsic::x86_lwpins32:
+ case Intrinsic::x86_lwpins64:
+ Opcode = X86ISD::LWPINS;
+ break;
+ }
+
+ SDValue Operation =
+ DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
+ Op->getOperand(3), Op->getOperand(4));
+ SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
+ Operation.getValue(1));
+ }
+ case Intrinsic::x86_enqcmd:
+ case Intrinsic::x86_enqcmds: {
+ SDLoc dl(Op);
+ SDValue Chain = Op.getOperand(0);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+ unsigned Opcode;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic!");
+ case Intrinsic::x86_enqcmd:
+ Opcode = X86ISD::ENQCMD;
+ break;
+ case Intrinsic::x86_enqcmds:
+ Opcode = X86ISD::ENQCMDS;
+ break;
+ }
+ SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
+ Op.getOperand(3));
+ SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
+ Operation.getValue(1));
+ }
+ case Intrinsic::x86_aesenc128kl:
+ case Intrinsic::x86_aesdec128kl:
+ case Intrinsic::x86_aesenc256kl:
+ case Intrinsic::x86_aesdec256kl: {
+ SDLoc DL(Op);
+ SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
+ SDValue Chain = Op.getOperand(0);
+ unsigned Opcode;
+
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic");
+ case Intrinsic::x86_aesenc128kl:
+ Opcode = X86ISD::AESENC128KL;
+ break;
+ case Intrinsic::x86_aesdec128kl:
+ Opcode = X86ISD::AESDEC128KL;
+ break;
+ case Intrinsic::x86_aesenc256kl:
+ Opcode = X86ISD::AESENC256KL;
+ break;
+ case Intrinsic::x86_aesdec256kl:
+ Opcode = X86ISD::AESDEC256KL;
+ break;
+ }
+
+ MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
+ MachineMemOperand *MMO = MemIntr->getMemOperand();
+ EVT MemVT = MemIntr->getMemoryVT();
+ SDValue Operation = DAG.getMemIntrinsicNode(
+ Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
+ MMO);
+ SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
+ {ZF, Operation.getValue(0), Operation.getValue(2)});
+ }
+ case Intrinsic::x86_aesencwide128kl:
+ case Intrinsic::x86_aesdecwide128kl:
+ case Intrinsic::x86_aesencwide256kl:
+ case Intrinsic::x86_aesdecwide256kl: {
+ SDLoc DL(Op);
+ SDVTList VTs = DAG.getVTList(
+ {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
+ MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
+ SDValue Chain = Op.getOperand(0);
+ unsigned Opcode;
+
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic");
+ case Intrinsic::x86_aesencwide128kl:
+ Opcode = X86ISD::AESENCWIDE128KL;
+ break;
+ case Intrinsic::x86_aesdecwide128kl:
+ Opcode = X86ISD::AESDECWIDE128KL;
+ break;
+ case Intrinsic::x86_aesencwide256kl:
+ Opcode = X86ISD::AESENCWIDE256KL;
+ break;
+ case Intrinsic::x86_aesdecwide256kl:
+ Opcode = X86ISD::AESDECWIDE256KL;
+ break;
+ }
+
+ MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
+ MachineMemOperand *MMO = MemIntr->getMemOperand();
+ EVT MemVT = MemIntr->getMemoryVT();
+ SDValue Operation = DAG.getMemIntrinsicNode(
+ Opcode, DL, VTs,
+ {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
+ Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
+ Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
+ MemVT, MMO);
+ SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
+ {ZF, Operation.getValue(1), Operation.getValue(2),
+ Operation.getValue(3), Operation.getValue(4),
+ Operation.getValue(5), Operation.getValue(6),
+ Operation.getValue(7), Operation.getValue(8),
+ Operation.getValue(9)});
+ }
+ case Intrinsic::x86_testui: {
+ SDLoc dl(Op);
+ SDValue Chain = Op.getOperand(0);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+ SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
+ SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
+ Operation.getValue(1));
+ }
+ }
+ return SDValue();
+ }
+
+ SDLoc dl(Op);
+ switch(IntrData->Type) {
+ default: llvm_unreachable("Unknown Intrinsic Type");
+ case RDSEED:
+ case RDRAND: {
+ // Emit the node with the right value type.
+ SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
+ SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
+
+ // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
+ // Otherwise return the value from Rand, which is always 0, cast to i32.
+ SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
+ DAG.getConstant(1, dl, Op->getValueType(1)),
+ DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
+ SDValue(Result.getNode(), 1)};
+ SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
+
+ // Return { result, isValid, chain }.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
+ SDValue(Result.getNode(), 2));
+ }
+ case GATHER_AVX2: {
+ SDValue Chain = Op.getOperand(0);
+ SDValue Src = Op.getOperand(2);
+ SDValue Base = Op.getOperand(3);
+ SDValue Index = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+ SDValue Scale = Op.getOperand(6);
+ return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
+ Scale, Chain, Subtarget);
+ }
+ case GATHER: {
+ // gather(v1, mask, index, base, scale);
+ SDValue Chain = Op.getOperand(0);
+ SDValue Src = Op.getOperand(2);
+ SDValue Base = Op.getOperand(3);
+ SDValue Index = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+ SDValue Scale = Op.getOperand(6);
+ return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
+ Chain, Subtarget);
+ }
+ case SCATTER: {
+ // scatter(base, mask, index, v1, scale);
+ SDValue Chain = Op.getOperand(0);
+ SDValue Base = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ SDValue Index = Op.getOperand(4);
+ SDValue Src = Op.getOperand(5);
+ SDValue Scale = Op.getOperand(6);
+ return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
+ Scale, Chain, Subtarget);
+ }
+ case PREFETCH: {
+ const APInt &HintVal = Op.getConstantOperandAPInt(6);
+ assert((HintVal == 2 || HintVal == 3) &&
+ "Wrong prefetch hint in intrinsic: should be 2 or 3");
+ unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
+ SDValue Chain = Op.getOperand(0);
+ SDValue Mask = Op.getOperand(2);
+ SDValue Index = Op.getOperand(3);
+ SDValue Base = Op.getOperand(4);
+ SDValue Scale = Op.getOperand(5);
+ return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
+ Subtarget);
+ }
+ // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
+ case RDTSC: {
+ SmallVector<SDValue, 2> Results;
+ getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
+ Results);
+ return DAG.getMergeValues(Results, dl);
+ }
+ // Read Performance Monitoring Counters.
+ case RDPMC:
+ // Get Extended Control Register.
+ case XGETBV: {
+ SmallVector<SDValue, 2> Results;
+
+ // RDPMC uses ECX to select the index of the performance counter to read.
+ // XGETBV uses ECX to select the index of the XCR register to return.
+ // The result is stored into registers EDX:EAX.
+ expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
+ Subtarget, Results);
+ return DAG.getMergeValues(Results, dl);
+ }
+ // XTEST intrinsics.
+ case XTEST: {
+ SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
+ SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
+
+ SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
+ SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
+ Ret, SDValue(InTrans.getNode(), 1));
+ }
+ case TRUNCATE_TO_MEM_VI8:
+ case TRUNCATE_TO_MEM_VI16:
+ case TRUNCATE_TO_MEM_VI32: {
+ SDValue Mask = Op.getOperand(4);
+ SDValue DataToTruncate = Op.getOperand(3);
+ SDValue Addr = Op.getOperand(2);
+ SDValue Chain = Op.getOperand(0);
+
+ MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
+ assert(MemIntr && "Expected MemIntrinsicSDNode!");
+
+ EVT MemVT = MemIntr->getMemoryVT();
+
+ uint16_t TruncationOp = IntrData->Opc0;
+ switch (TruncationOp) {
+ case X86ISD::VTRUNC: {
+ if (isAllOnesConstant(Mask)) // return just a truncate store
+ return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
+ MemIntr->getMemOperand());
+
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+ SDValue Offset = DAG.getUNDEF(VMask.getValueType());
+
+ return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
+ MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
+ true /* truncating */);
+ }
+ case X86ISD::VTRUNCUS:
+ case X86ISD::VTRUNCS: {
+ bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
+ if (isAllOnesConstant(Mask))
+ return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
+ MemIntr->getMemOperand(), DAG);
+
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+
+ return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
+ VMask, MemVT, MemIntr->getMemOperand(), DAG);
+ }
+ default:
+ llvm_unreachable("Unsupported truncstore intrinsic");
+ }
+ }
+ }
+}
+
+SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI.setReturnAddressIsTaken(true);
+
+ if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+ return SDValue();
+
+ unsigned Depth = Op.getConstantOperandVal(0);
+ SDLoc dl(Op);
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+ if (Depth > 0) {
+ SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
+ return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+ DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
+ MachinePointerInfo());
+ }
+
+ // Just load the return address.
+ SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
+ return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
+ MachinePointerInfo());
+}
+
+SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
+ return getReturnAddressFrameIndex(DAG);
+}
+
+SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ EVT VT = Op.getValueType();
+
+ MFI.setFrameAddressIsTaken(true);
+
+ if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
+ // Depth > 0 makes no sense on targets which use Windows unwind codes. It
+ // is not possible to crawl up the stack without looking at the unwind codes
+ // simultaneously.
+ int FrameAddrIndex = FuncInfo->getFAIndex();
+ if (!FrameAddrIndex) {
+ // Set up a frame object for the return address.
+ unsigned SlotSize = RegInfo->getSlotSize();
+ FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
+ SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
+ FuncInfo->setFAIndex(FrameAddrIndex);
+ }
+ return DAG.getFrameIndex(FrameAddrIndex, VT);
+ }
+
+ unsigned FrameReg =
+ RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
+ SDLoc dl(Op); // FIXME probably not meaningful
+ unsigned Depth = Op.getConstantOperandVal(0);
+ assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
+ (FrameReg == X86::EBP && VT == MVT::i32)) &&
+ "Invalid Frame Register!");
+ SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
+ while (Depth--)
+ FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
+ MachinePointerInfo());
+ return FrameAddr;
+}
+
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
+ const MachineFunction &MF) const {
+ const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
+
+ Register Reg = StringSwitch<unsigned>(RegName)
+ .Case("esp", X86::ESP)
+ .Case("rsp", X86::RSP)
+ .Case("ebp", X86::EBP)
+ .Case("rbp", X86::RBP)
+ .Default(0);
+
+ if (Reg == X86::EBP || Reg == X86::RBP) {
+ if (!TFI.hasFP(MF))
+ report_fatal_error("register " + StringRef(RegName) +
+ " is allocatable: function has no frame pointer");
+#ifndef NDEBUG
+ else {
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
+ assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
+ "Invalid Frame Register!");
+ }
+#endif
+ }
+
+ if (Reg)
+ return Reg;
+
+ report_fatal_error("Invalid register name global variable");
+}
+
+SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
+ SelectionDAG &DAG) const {
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
+}
+
+Register X86TargetLowering::getExceptionPointerRegister(
+ const Constant *PersonalityFn) const {
+ if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
+ return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
+
+ return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
+}
+
+Register X86TargetLowering::getExceptionSelectorRegister(
+ const Constant *PersonalityFn) const {
+ // Funclet personalities don't use selectors (the runtime does the selection).
+ assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
+ return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
+}
+
+bool X86TargetLowering::needsFixedCatchObjects() const {
+ return Subtarget.isTargetWin64();
+}
+
+SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Chain = Op.getOperand(0);
+ SDValue Offset = Op.getOperand(1);
+ SDValue Handler = Op.getOperand(2);
+ SDLoc dl (Op);
+
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
+ assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
+ (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
+ "Invalid Frame Register!");
+ SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
+ Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
+
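+ // The return address slot lives at FrameReg + SlotSize; add the extra
+ // Offset, store the new Handler there, and hand the slot address to
+ // EH_RETURN in StoreAddrReg (RCX/ECX).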
+ SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
+ DAG.getIntPtrConstant(RegInfo->getSlotSize(),
+ dl));
+ StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
+ Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
+ Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
+
+ return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
+ DAG.getRegister(StoreAddrReg, PtrVT));
+}
+
+SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ // If the subtarget is not 64bit, we may need the global base reg
+ // after isel expand pseudo, i.e., after CGBR pass ran.
+ // Therefore, ask for the GlobalBaseReg now, so that the pass
+ // inserts the code for us in case we need it.
+ // Otherwise, we will end up in a situation where we will
+ // reference a virtual register that is not defined!
+ if (!Subtarget.is64Bit()) {
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
+ (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
+ }
+ return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
+ DAG.getVTList(MVT::i32, MVT::Other),
+ Op.getOperand(0), Op.getOperand(1));
+}
+
+SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
+ Op.getOperand(0), Op.getOperand(1));
+}
+
+SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
+ Op.getOperand(0));
+}
+
+static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
+ return Op.getOperand(0);
+}
+
+SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Root = Op.getOperand(0);
+ SDValue Trmp = Op.getOperand(1); // trampoline
+ SDValue FPtr = Op.getOperand(2); // nested function
+ SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+ SDLoc dl (Op);
+
+ const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+
+ if (Subtarget.is64Bit()) {
+ SDValue OutChains[6];
+
+ // Large code-model.
+ const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
+ const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
+
+ const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
+ const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
+
+ const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
+
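+ // The six stores below assemble a 23-byte trampoline:
+ //   0:  49 BB <FPtr:8>   movabsq $FPtr, %r11
+ //   10: 49 BA <Nest:8>   movabsq $Nest, %r10
+ //   20: 49 FF E3         jmpq    *%r11   (REX.W is redundant but harmless)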
+ // Load the pointer to the nested function into R11.
+ unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
+ SDValue Addr = Trmp;
+ OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
+ Addr, MachinePointerInfo(TrmpAddr));
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(2, dl, MVT::i64));
+ OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
+ MachinePointerInfo(TrmpAddr, 2), Align(2));
+
+ // Load the 'nest' parameter value into R10.
+ // R10 is specified in X86CallingConv.td
+ OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(10, dl, MVT::i64));
+ OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
+ Addr, MachinePointerInfo(TrmpAddr, 10));
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(12, dl, MVT::i64));
+ OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
+ MachinePointerInfo(TrmpAddr, 12), Align(2));
+
+ // Jump to the nested function.
+ OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(20, dl, MVT::i64));
+ OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
+ Addr, MachinePointerInfo(TrmpAddr, 20));
+
+ unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(22, dl, MVT::i64));
+ OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
+ Addr, MachinePointerInfo(TrmpAddr, 22));
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+ } else {
+ const Function *Func =
+ cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
+ CallingConv::ID CC = Func->getCallingConv();
+ unsigned NestReg;
+
+ switch (CC) {
+ default:
+ llvm_unreachable("Unsupported calling convention");
+ case CallingConv::C:
+ case CallingConv::X86_StdCall: {
+ // Pass 'nest' parameter in ECX.
+ // Must be kept in sync with X86CallingConv.td
+ NestReg = X86::ECX;
+
+ // Check that ECX wasn't needed by an 'inreg' parameter.
+ FunctionType *FTy = Func->getFunctionType();
+ const AttributeList &Attrs = Func->getAttributes();
+
+ if (!Attrs.isEmpty() && !Func->isVarArg()) {
+ unsigned InRegCount = 0;
+ unsigned Idx = 1;
+
+ for (FunctionType::param_iterator I = FTy->param_begin(),
+ E = FTy->param_end(); I != E; ++I, ++Idx)
+ if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
+ const DataLayout &DL = DAG.getDataLayout();
+ // FIXME: should only count parameters that are lowered to integers.
+ InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
+ }
+
+ if (InRegCount > 2) {
+ report_fatal_error("Nest register in use - reduce number of inreg"
+ " parameters!");
+ }
+ }
+ break;
+ }
+ case CallingConv::X86_FastCall:
+ case CallingConv::X86_ThisCall:
+ case CallingConv::Fast:
+ case CallingConv::Tail:
+ // Pass 'nest' parameter in EAX.
+ // Must be kept in sync with X86CallingConv.td
+ NestReg = X86::EAX;
+ break;
+ }
+
+ SDValue OutChains[4];
+ SDValue Addr, Disp;
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(10, dl, MVT::i32));
+ Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
+
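+ // The four stores below assemble a 10-byte trampoline:
+ //   0: B8+r <Nest:4>    movl $Nest, %NestReg
+ //   5: E9   <rel32:4>   jmp  FPtr          (rel32 = FPtr - (Trmp + 10))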
+ // This is storing the opcode for MOV32ri.
+ const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
+ const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
+ OutChains[0] =
+ DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
+ Trmp, MachinePointerInfo(TrmpAddr));
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(1, dl, MVT::i32));
+ OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
+ MachinePointerInfo(TrmpAddr, 1), Align(1));
+
+ const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(5, dl, MVT::i32));
+ OutChains[2] =
+ DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
+ MachinePointerInfo(TrmpAddr, 5), Align(1));
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(6, dl, MVT::i32));
+ OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
+ MachinePointerInfo(TrmpAddr, 6), Align(1));
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+ }
+}
+
+SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
+ SelectionDAG &DAG) const {
+ /*
+ The rounding mode is in bits 11:10 of the x87 FP control word (FPCW), and
+ has the following settings:
+ 00 Round to nearest
+ 01 Round to -inf
+ 10 Round to +inf
+ 11 Round to 0
+
+ FLT_ROUNDS, on the other hand, expects the following:
+ -1 Undefined
+ 0 Round to 0
+ 1 Round to nearest
+ 2 Round to +inf
+ 3 Round to -inf
+
+ To perform the conversion, we use a packed lookup table of the four 2-bit
+ values that we can index by FPCW[11:10]
+ 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPCW[11:10]
+
+ (0x2d >> ((FPCW & 0xc00) >> 9)) & 3
+ */
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MVT VT = Op.getSimpleValueType();
+ SDLoc DL(Op);
+
+ // Save FP Control Word to stack slot
+ int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
+ SDValue StackSlot =
+ DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
+
+ MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
+
+ SDValue Chain = Op.getOperand(0);
+ SDValue Ops[] = {Chain, StackSlot};
+ Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
+ DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
+ Align(2), MachineMemOperand::MOStore);
+
+ // Load FP Control Word from stack slot
+ SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
+ Chain = CWD.getValue(1);
+
+ // Mask and turn the control bits into a shift for the lookup table.
+ SDValue Shift =
+ DAG.getNode(ISD::SRL, DL, MVT::i16,
+ DAG.getNode(ISD::AND, DL, MVT::i16,
+ CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
+ DAG.getConstant(9, DL, MVT::i8));
+ Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
+
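+ // Shift now holds 2 * RC (the 2-bit rounding-control field), so shifting the
+ // packed LUT below right by it leaves the FLT_ROUNDS value in bits [1:0].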
+ SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
+ SDValue RetVal =
+ DAG.getNode(ISD::AND, DL, MVT::i32,
+ DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
+ DAG.getConstant(3, DL, MVT::i32));
+
+ RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
+
+ return DAG.getMergeValues({RetVal, Chain}, DL);
+}
+
+/// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
+//
+// i8/i16 vectors are implemented using the dword LZCNT vector instruction
+// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
+// split the vector, perform the operation on its Lo and Hi parts and
+// concatenate the results.
+static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(Op.getOpcode() == ISD::CTLZ);
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ unsigned NumElems = VT.getVectorNumElements();
+
+ assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
+ "Unsupported element type");
+
+ // Split the vector; its Lo and Hi parts will be handled in the next iteration.
+ if (NumElems > 16 ||
+ (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
+ return splitVectorIntUnary(Op, DAG);
+
+ MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
+ assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
+ "Unsupported value type for operation");
+
+ // Use the natively supported vector instruction vplzcntd.
+ Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
+ SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
+ SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
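+ // CTLZ of the zero-extended value over-counts by (32 - EltBits) leading
+ // zeros, so subtract that delta from the truncated result.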
+ SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
+
+ return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
+}
+
+// Lower CTLZ using a PSHUFB lookup table implementation.
+static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ int NumElts = VT.getVectorNumElements();
+ int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
+ MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
+
+ // Per-nibble leading zero PSHUFB lookup table.
+ const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
+ /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
+ /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
+ /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
+
+ SmallVector<SDValue, 64> LUTVec;
+ for (int i = 0; i < NumBytes; ++i)
+ LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
+ SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
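+ // PSHUFB with InRegLUT as the source returns LUT[b & 0xf] for each index
+ // byte b (or 0 if b's top bit is set), i.e. the leading-zero count of a
+ // single nibble.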
+
+ // Begin by bitcasting the input to a byte vector, then split those bytes
+ // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
+ // If the hi input nibble is zero then we add both results together, otherwise
+ // we just take the hi result (by masking the lo result to zero before the
+ // add).
+ SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
+ SDValue Zero = DAG.getConstant(0, DL, CurrVT);
+
+ SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
+ SDValue Lo = Op0;
+ SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
+ SDValue HiZ;
+ if (CurrVT.is512BitVector()) {
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
+ HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
+ HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
+ } else {
+ HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
+ }
+
+ Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
+ Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
+ Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
+ SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
+
+ // Merge the result back from vXi8 to VT, working on the lo/hi halves
+ // of the current vector width in the same way we did for the nibbles.
+ // If the upper half of the input element is zero then add the halves'
+ // leading zero counts together, otherwise just use the upper half's.
+ // Double the width of the result until we are at target width.
+ while (CurrVT != VT) {
+ int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
+ int CurrNumElts = CurrVT.getVectorNumElements();
+ MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
+ MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
+ SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
+
+ // Check if the upper half of the input element is zero.
+ if (CurrVT.is512BitVector()) {
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
+ HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
+ DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
+ HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
+ } else {
+ HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
+ DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
+ }
+ HiZ = DAG.getBitcast(NextVT, HiZ);
+
+ // Move the upper/lower halves to the lower bits as we'll be extending to
+ // NextVT. Mask the lower result to zero if HiZ is true and add the results
+ // together.
+ SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
+ SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
+ SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
+ R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
+ Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
+ CurrVT = NextVT;
+ }
+
+ return Res;
+}
+
+static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+
+ if (Subtarget.hasCDI() &&
+ // vXi8 vectors need to be promoted to 512-bits for vXi32.
+ (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
+ return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
+
+ // Decompose 256-bit ops into smaller 128-bit ops.
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
+ return splitVectorIntUnary(Op, DAG);
+
+ // Decompose 512-bit ops into smaller 256-bit ops.
+ if (VT.is512BitVector() && !Subtarget.hasBWI())
+ return splitVectorIntUnary(Op, DAG);
+
+ assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
+ return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
+}
+
+static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ MVT OpVT = VT;
+ unsigned NumBits = VT.getSizeInBits();
+ SDLoc dl(Op);
+ unsigned Opc = Op.getOpcode();
+
+ if (VT.isVector())
+ return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
+
+ Op = Op.getOperand(0);
+ if (VT == MVT::i8) {
+ // Zero extend to i32 since there is not an i8 bsr.
+ OpVT = MVT::i32;
+ Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
+ }
+
+ // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
+ SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+ Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
+
+ if (Opc == ISD::CTLZ) {
+ // If src is zero (i.e. bsr sets ZF), returns NumBits.
+ SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
+ DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
+ Op.getValue(1)};
+ Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
+ }
+
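+ // BSR returns the bit index of the most significant set bit. Since NumBits
+ // is a power of two, (NumBits - 1) - Index == (NumBits - 1) ^ Index, and for
+ // ISD::CTLZ the CMOV above yields 2*NumBits - 1 on zero input, which the xor
+ // turns into NumBits.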
+ // Finally xor with NumBits-1.
+ Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
+ DAG.getConstant(NumBits - 1, dl, OpVT));
+
+ if (VT == MVT::i8)
+ Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
+ return Op;
+}
+
+static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ unsigned NumBits = VT.getScalarSizeInBits();
+ SDValue N0 = Op.getOperand(0);
+ SDLoc dl(Op);
+
+ assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
+ "Only scalar CTTZ requires custom lowering");
+
+ // Issue a bsf (scan bits forward) which also sets EFLAGS.
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
+
+ // If src is zero (i.e. bsf sets ZF), returns NumBits.
+ SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
+ DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
+ Op.getValue(1)};
+ return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
+}
+
+static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT VT = Op.getSimpleValueType();
+ if (VT == MVT::i16 || VT == MVT::i32)
+ return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
+
+ if (VT.getScalarType() == MVT::i1)
+ return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
+ Op.getOperand(0), Op.getOperand(1));
+
+ if (VT == MVT::v32i16 || VT == MVT::v64i8)
+ return splitVectorIntBinary(Op, DAG);
+
+ assert(Op.getSimpleValueType().is256BitVector() &&
+ Op.getSimpleValueType().isInteger() &&
+ "Only handle AVX 256-bit vector integer operation");
+ return splitVectorIntBinary(Op, DAG);
+}
+
+static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT VT = Op.getSimpleValueType();
+ SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
+ unsigned Opcode = Op.getOpcode();
+ SDLoc DL(Op);
+
+ if (VT.getScalarType() == MVT::i1) {
+ switch (Opcode) {
+ default: llvm_unreachable("Expected saturated arithmetic opcode");
+ case ISD::UADDSAT:
+ case ISD::SADDSAT:
+ // *addsat i1 X, Y --> X | Y
+ return DAG.getNode(ISD::OR, DL, VT, X, Y);
+ case ISD::USUBSAT:
+ case ISD::SSUBSAT:
+ // *subsat i1 X, Y --> X & ~Y
+ return DAG.getNode(ISD::AND, DL, VT, X, DAG.getNOT(DL, Y, VT));
+ }
+ }
+
+ if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
+ (VT.is256BitVector() && !Subtarget.hasInt256())) {
+ assert(Op.getSimpleValueType().isInteger() &&
+ "Only handle AVX vector integer operation");
+ return splitVectorIntBinary(Op, DAG);
+ }
+
+ // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT SetCCResultType =
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+
+ if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
+ // usubsat X, Y --> (X >u Y) ? X - Y : 0
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
+ SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
+ // TODO: Move this to DAGCombiner?
+ if (SetCCResultType == VT &&
+ DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
+ return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
+ return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
+ }
+
+ // Use default expansion.
+ return SDValue();
+}
+
+static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
+ // Since X86 does not have CMOV for 8-bit integers, we don't convert
+ // 8-bit integer abs to NEG and CMOV.
+ SDLoc DL(Op);
+ SDValue N0 = Op.getOperand(0);
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
+ DAG.getConstant(0, DL, VT), N0);
+ SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8),
+ SDValue(Neg.getNode(), 1)};
+ return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
+ }
+
+ // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
+ if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
+ SDLoc DL(Op);
+ SDValue Src = Op.getOperand(0);
+ SDValue Sub =
+ DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
+ return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
+ }
+
+ if (VT.is256BitVector() && !Subtarget.hasInt256()) {
+ assert(VT.isInteger() &&
+ "Only handle AVX 256-bit vector integer operation");
+ return splitVectorIntUnary(Op, DAG);
+ }
+
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+ return splitVectorIntUnary(Op, DAG);
+
+ // Default to expand.
+ return SDValue();
+}
+
+static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+
+ // For AVX1 cases, split to use legal ops (everything but v4i64).
+ if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
+ return splitVectorIntBinary(Op, DAG);
+
+ if (VT == MVT::v32i16 || VT == MVT::v64i8)
+ return splitVectorIntBinary(Op, DAG);
+
+ // Default to expand.
+ return SDValue();
+}
+
+static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+
+ if (VT.getScalarType() == MVT::i1)
+ return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
+
+ // Decompose 256-bit ops into 128-bit ops.
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
+ return splitVectorIntBinary(Op, DAG);
+
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+ return splitVectorIntBinary(Op, DAG);
+
+ SDValue A = Op.getOperand(0);
+ SDValue B = Op.getOperand(1);
+
+ // Lower v16i8/v32i8/v64i8 mul as extension to v8i16/v16i16/v32i16
+ // vector pairs, multiply and truncate.
+ if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
+ (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
+ MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
+ return DAG.getNode(
+ ISD::TRUNCATE, dl, VT,
+ DAG.getNode(ISD::MUL, dl, ExVT,
+ DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
+ DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
+ }
+
+ MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
+
+ // Extract the lo/hi parts and any-extend them to i16.
+ // We're going to mask off the low byte of each result element of the
+ // pmullw, so it doesn't matter what's in the high byte of each 16-bit
+ // element.
+ SDValue Undef = DAG.getUNDEF(VT);
+ SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
+ SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
+
+ SDValue BLo, BHi;
+ if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
+ // If the RHS is a constant, manually unpackl/unpackh.
+ SmallVector<SDValue, 16> LoOps, HiOps;
+ for (unsigned i = 0; i != NumElts; i += 16) {
+ for (unsigned j = 0; j != 8; ++j) {
+ LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
+ MVT::i16));
+ HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
+ MVT::i16));
+ }
+ }
+
+ BLo = DAG.getBuildVector(ExVT, dl, LoOps);
+ BHi = DAG.getBuildVector(ExVT, dl, HiOps);
+ } else {
+ BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
+ BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
+ }
+
+ // Multiply, mask the lower 8 bits of the lo/hi results and pack.
+ SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
+ SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
+ RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
+ RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+ }
+
+ // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
+ if (VT == MVT::v4i32) {
+ assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
+ "Should not custom lower when pmulld is available!");
+
+ // Extract the odd parts.
+ static const int UnpackMask[] = { 1, -1, 3, -1 };
+ SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
+ SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
+
+ // Multiply the even parts.
+ SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
+ DAG.getBitcast(MVT::v2i64, A),
+ DAG.getBitcast(MVT::v2i64, B));
+ // Now multiply odd parts.
+ SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
+ DAG.getBitcast(MVT::v2i64, Aodds),
+ DAG.getBitcast(MVT::v2i64, Bodds));
+
+ Evens = DAG.getBitcast(VT, Evens);
+ Odds = DAG.getBitcast(VT, Odds);
+
+ // Merge the two vectors back together with a shuffle. This expands into 2
+ // shuffles.
+ static const int ShufMask[] = { 0, 4, 2, 6 };
+ return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
+ }
+
+ assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
+ "Only know how to lower V2I64/V4I64/V8I64 multiply");
+ assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
+
+ // Ahi = psrlqi(a, 32);
+ // Bhi = psrlqi(b, 32);
+ //
+ // AloBlo = pmuludq(a, b);
+ // AloBhi = pmuludq(a, Bhi);
+ // AhiBlo = pmuludq(Ahi, b);
+ //
+ // Hi = psllqi(AloBhi + AhiBlo, 32);
+ // return AloBlo + Hi;
+ KnownBits AKnown = DAG.computeKnownBits(A);
+ KnownBits BKnown = DAG.computeKnownBits(B);
+
+ APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
+ bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
+ bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
+
+ APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
+ bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
+ bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
+
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+
+ // Only multiply lo/hi halves that aren't known to be zero.
+ SDValue AloBlo = Zero;
+ if (!ALoIsZero && !BLoIsZero)
+ AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
+
+ SDValue AloBhi = Zero;
+ if (!ALoIsZero && !BHiIsZero) {
+ SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
+ AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
+ }
+
+ SDValue AhiBlo = Zero;
+ if (!AHiIsZero && !BLoIsZero) {
+ SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
+ AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
+ }
+
+ SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
+ Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
+
+ return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
+}
+
+static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+ bool IsSigned = Op->getOpcode() == ISD::MULHS;
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue A = Op.getOperand(0);
+ SDValue B = Op.getOperand(1);
+
+ // Decompose 256-bit ops into 128-bit ops.
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
+ return splitVectorIntBinary(Op, DAG);
+
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+ return splitVectorIntBinary(Op, DAG);
+
+ if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
+ assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
+ (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
+ (VT == MVT::v16i32 && Subtarget.hasAVX512()));
+
+ // PMULxD operations multiply each even value (starting at 0) of LHS with
+ // the related value of RHS and produce a widened result.
+ // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+ // => <2 x i64> <ae|cg>
+ //
+ // In other words, to have all the results, we need to perform two PMULxD:
+ // 1. one with the even values.
+ // 2. one with the odd values.
+ // To achieve #2, we need to place the odd values at an even position.
+ //
+ // Place the odd value at an even position (basically, shift all values 1
+ // step to the left):
+ const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
+ 9, -1, 11, -1, 13, -1, 15, -1};
+ // <a|b|c|d> => <b|undef|d|undef>
+ SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
+ makeArrayRef(&Mask[0], NumElts));
+ // <e|f|g|h> => <f|undef|h|undef>
+ SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
+ makeArrayRef(&Mask[0], NumElts));
+
+ // Emit two multiplies, one for the lower 2 ints and one for the higher 2
+ // ints.
+ MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
+ unsigned Opcode =
+ (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
+ // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+ // => <2 x i64> <ae|cg>
+ SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
+ DAG.getBitcast(MulVT, A),
+ DAG.getBitcast(MulVT, B)));
+ // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
+ // => <2 x i64> <bf|dh>
+ SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
+ DAG.getBitcast(MulVT, Odd0),
+ DAG.getBitcast(MulVT, Odd1)));
+
+ // Shuffle it back into the right order.
+ SmallVector<int, 16> ShufMask(NumElts);
+ for (int i = 0; i != (int)NumElts; ++i)
+ ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
+
+ SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
+
+ // If we have a signed multiply but no PMULDQ, fix up the result of an
+ // unsigned multiply.
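+ // mulhs(A, B) == mulhu(A, B) - (A < 0 ? B : 0) - (B < 0 ? A : 0), so
+ // subtract the two correction terms built from setgt masks.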
+ if (IsSigned && !Subtarget.hasSSE41()) {
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+ SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
+ SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
+
+ SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
+ Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
+ }
+
+ return Res;
+ }
+
+ // Only i8 vectors should need custom lowering after this.
+ assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
+ (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
+ "Unsupported vector type");
+
+ // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
+ // logical shift down the upper half and pack back to i8.
+
+ // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
+ // and then ashr/lshr the upper bits down to the lower bits before multiply.
+ unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+
+ if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
+ (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
+ MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
+ SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
+ SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
+ SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
+ Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
+ }
+
+ // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
+ // to a vXi16 type. Do the multiplies, shift the results and pack the half
+ // lane results back together.
+
+ MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
+
+ static const int PSHUFDMask[] = { 8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1};
+
+ // Extract the lo parts and zero/sign extend to i16.
+ // Only use SSE4.1 instructions for signed v16i8 where using unpack requires
+ // shifts to sign extend. Using unpack for unsigned only requires an xor to
+ // create zeros and a copy due to tied register constraints pre-AVX. But using
+ // zero_extend_vector_inreg would require an additional pshufd for the high
+ // part.
+
+ SDValue ALo, AHi;
+ if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
+ ALo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, A);
+
+ AHi = DAG.getVectorShuffle(VT, dl, A, A, PSHUFDMask);
+ AHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, AHi);
+ } else if (IsSigned) {
+ ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), A));
+ AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), A));
+
+ ALo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, ALo, 8, DAG);
+ AHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, AHi, 8, DAG);
+ } else {
+ ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A,
+ DAG.getConstant(0, dl, VT)));
+ AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A,
+ DAG.getConstant(0, dl, VT)));
+ }
+
+ SDValue BLo, BHi;
+ if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
+ // If the RHS is a constant, manually unpackl/unpackh and extend.
+ SmallVector<SDValue, 16> LoOps, HiOps;
+ for (unsigned i = 0; i != NumElts; i += 16) {
+ for (unsigned j = 0; j != 8; ++j) {
+ SDValue LoOp = B.getOperand(i + j);
+ SDValue HiOp = B.getOperand(i + j + 8);
+
+ if (IsSigned) {
+ LoOp = DAG.getSExtOrTrunc(LoOp, dl, MVT::i16);
+ HiOp = DAG.getSExtOrTrunc(HiOp, dl, MVT::i16);
+ } else {
+ LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
+ HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
+ }
+
+ LoOps.push_back(LoOp);
+ HiOps.push_back(HiOp);
+ }
+ }
+
+ BLo = DAG.getBuildVector(ExVT, dl, LoOps);
+ BHi = DAG.getBuildVector(ExVT, dl, HiOps);
+ } else if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
+ BLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, B);
+
+ BHi = DAG.getVectorShuffle(VT, dl, B, B, PSHUFDMask);
+ BHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, BHi);
+ } else if (IsSigned) {
+ BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), B));
+ BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), B));
+
+ BLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BLo, 8, DAG);
+ BHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BHi, 8, DAG);
+ } else {
+ BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B,
+ DAG.getConstant(0, dl, VT)));
+ BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B,
+ DAG.getConstant(0, dl, VT)));
+ }
+
+ // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
+ // pack back to vXi8.
+ SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
+ SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
+ RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
+ RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
+
+ // Bitcast back to VT and then pack all the even elements from Lo and Hi.
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+}
+
+SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
+ assert(Subtarget.isTargetWin64() && "Unexpected target");
+ EVT VT = Op.getValueType();
+ assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
+ "Unexpected return type for lowering");
+
+ RTLIB::Libcall LC;
+ bool isSigned;
+ switch (Op->getOpcode()) {
+ default: llvm_unreachable("Unexpected request for libcall!");
+ case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
+ case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
+ case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
+ case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
+ }
+
+ SDLoc dl(Op);
+ SDValue InChain = DAG.getEntryNode();
+
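+ // The i128 division/remainder libcall takes its arguments indirectly: each
+ // operand is spilled to a 16-byte-aligned stack slot and passed by pointer.
+ // The result is modelled as a v2i64 return value and bitcast back to VT.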
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
+ EVT ArgVT = Op->getOperand(i).getValueType();
+ assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
+ "Unexpected argument type for lowering");
+ SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+ Entry.Node = StackPtr;
+ InChain =
+ DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ Entry.Ty = PointerType::get(ArgTy,0);
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
+ Args.push_back(Entry);
+ }
+
+ SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+ getPointerTy(DAG.getDataLayout()));
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl)
+ .setChain(InChain)
+ .setLibCallee(
+ getLibcallCallingConv(LC),
+ static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
+ std::move(Args))
+ .setInRegister()
+ .setSExtResult(isSigned)
+ .setZExtResult(!isSigned);
+
+ std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+ return DAG.getBitcast(VT, CallInfo.first);
+}
+
+// Return true if the required (according to Opcode) shift-imm form is natively
+// supported by the Subtarget
+static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
+ unsigned Opcode) {
+ if (VT.getScalarSizeInBits() < 16)
+ return false;
+
+ if (VT.is512BitVector() && Subtarget.hasAVX512() &&
+ (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
+ return true;
+
+ bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (VT.is256BitVector() && Subtarget.hasInt256());
+
+ bool AShift = LShift && (Subtarget.hasAVX512() ||
+ (VT != MVT::v2i64 && VT != MVT::v4i64));
+ return (Opcode == ISD::SRA) ? AShift : LShift;
+}
+
+// The shift amount is a variable, but it is the same for all vector lanes.
+// These instructions are defined together with shift-immediate.
+static
+bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
+ unsigned Opcode) {
+ return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
+}
+
+// Return true if the required (according to Opcode) variable-shift form is
+// natively supported by the Subtarget
+static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
+ unsigned Opcode) {
+
+ if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
+ return false;
+
+ // vXi16 supported only on AVX-512, BWI
+ if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
+ return false;
+
+ if (Subtarget.hasAVX512())
+ return true;
+
+ bool LShift = VT.is128BitVector() || VT.is256BitVector();
+ bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
+ return (Opcode == ISD::SRA) ? AShift : LShift;
+}
+
+static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT VT = Op.getSimpleValueType();
+ SDLoc dl(Op);
+ SDValue R = Op.getOperand(0);
+ SDValue Amt = Op.getOperand(1);
+ unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
+
+ auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
+ assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
+ MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
+ SDValue Ex = DAG.getBitcast(ExVT, R);
+
+ // ashr(R, 63) === cmp_slt(R, 0)
+ if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
+ assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
+ "Unsupported PCMPGT op");
+ return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
+ }
+
+ if (ShiftAmt >= 32) {
+ // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
+ SDValue Upper =
+ getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
+ SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
+ ShiftAmt - 32, DAG);
+ if (VT == MVT::v2i64)
+ Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
+ if (VT == MVT::v4i64)
+ Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
+ {9, 1, 11, 3, 13, 5, 15, 7});
+ } else {
+ // SRA upper i32, SRL whole i64 and select lower i32.
+ SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
+ ShiftAmt, DAG);
+ SDValue Lower =
+ getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
+ Lower = DAG.getBitcast(ExVT, Lower);
+ if (VT == MVT::v2i64)
+ Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
+ if (VT == MVT::v4i64)
+ Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
+ {8, 1, 10, 3, 12, 5, 14, 7});
+ }
+ return DAG.getBitcast(VT, Ex);
+ };
+
+ // Optimize shl/srl/sra with constant shift amount.
+ APInt APIntShiftAmt;
+ if (!X86::isConstantSplat(Amt, APIntShiftAmt))
+ return SDValue();
+
+ // If the shift amount is out of range, return undef.
+ if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
+ return DAG.getUNDEF(VT);
+
+ uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
+
+ if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
+ return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
+
+ // i64 SRA needs to be performed as partial shifts.
+ if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
+ (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
+ Op.getOpcode() == ISD::SRA)
+ return ArithmeticShiftRight64(ShiftAmt);
+
+ if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
+ (Subtarget.hasBWI() && VT == MVT::v64i8)) {
+ unsigned NumElts = VT.getVectorNumElements();
+ MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
+
+ // Simple i8 add case
+ if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
+ return DAG.getNode(ISD::ADD, dl, VT, R, R);
+
+ // ashr(R, 7) === cmp_slt(R, 0)
+ if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
+ SDValue Zeros = DAG.getConstant(0, dl, VT);
+ if (VT.is512BitVector()) {
+ assert(VT == MVT::v64i8 && "Unexpected element type!");
+ SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
+ return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
+ }
+ return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
+ }
+
+ // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
+ if (VT == MVT::v16i8 && Subtarget.hasXOP())
+ return SDValue();
+
+ if (Op.getOpcode() == ISD::SHL) {
+ // Make a large shift.
+ SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
+ ShiftAmt, DAG);
+ SHL = DAG.getBitcast(VT, SHL);
+ // Zero out the rightmost bits.
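+ // E.g. a vXi8 shl by 3 is performed as a vXi16 shl by 3 followed by an AND
+ // with 0xF8, clearing the low 3 bits of every byte and discarding the bits
+ // that spilled in from the neighboring byte.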
+ APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
+ return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
+ }
+ if (Op.getOpcode() == ISD::SRL) {
+ // Make a large shift.
+ SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
+ ShiftAmt, DAG);
+ SRL = DAG.getBitcast(VT, SRL);
+ // Zero out the leftmost bits.
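+ // E.g. a vXi8 srl by 3 masks with 0xFF >> 3 == 0x1F, clearing the high 3
+ // bits of every byte (which may hold bits from the neighboring byte).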
+ return DAG.getNode(ISD::AND, dl, VT, SRL,
+ DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
+ }
+ if (Op.getOpcode() == ISD::SRA) {
+ // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
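+ // E.g. for ShiftAmt == 4 and an input byte of 0xF0 (-16): lshr gives 0x0F,
+ // xor with Mask (0x08) gives 0x07, and subtracting 0x08 gives 0xFF (-1),
+ // matching an arithmetic shift.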
+ SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
+
+ SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
+ Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
+ Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
+ return Res;
+ }
+ llvm_unreachable("Unknown shift opcode.");
+ }
+
+ return SDValue();
+}
+
+static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT VT = Op.getSimpleValueType();
+ SDLoc dl(Op);
+ SDValue R = Op.getOperand(0);
+ SDValue Amt = Op.getOperand(1);
+ unsigned Opcode = Op.getOpcode();
+ unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
+ unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
+
+ if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
+ if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
+ MVT EltVT = VT.getVectorElementType();
+ assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
+ if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
+ BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
+ else if (EltVT.bitsLT(MVT::i32))
+ BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
+
+ return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
+ }
+
+ // vXi8 shifts - shift as v8i16 + mask result.
+ if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
+ (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
+ VT == MVT::v64i8) &&
+ !Subtarget.hasXOP()) {
+ unsigned NumElts = VT.getVectorNumElements();
+ MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
+ if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
+ unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
+ unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
+ BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
+
+ // Create the mask using vXi16 shifts. For shift-rights we need to move
+ // the upper byte down before splatting the vXi8 mask.
+ SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
+ BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
+ BaseShAmt, Subtarget, DAG);
+ if (Opcode != ISD::SHL)
+ BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
+ 8, DAG);
+ BitMask = DAG.getBitcast(VT, BitMask);
+ BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
+ SmallVector<int, 64>(NumElts, 0));
+
+ SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
+ DAG.getBitcast(ExtVT, R), BaseShAmt,
+ Subtarget, DAG);
+ Res = DAG.getBitcast(VT, Res);
+ Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
+
+ if (Opcode == ISD::SRA) {
+ // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
+ // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
+ SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
+ SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
+ BaseShAmt, Subtarget, DAG);
+ SignMask = DAG.getBitcast(VT, SignMask);
+ Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
+ Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
+ }
+ return Res;
+ }
+ }
+ }
+
+ // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
+ if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
+ Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
+ Amt = Amt.getOperand(0);
+ unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
+ std::vector<SDValue> Vals(Ratio);
+ for (unsigned i = 0; i != Ratio; ++i)
+ Vals[i] = Amt.getOperand(i);
+ for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
+ for (unsigned j = 0; j != Ratio; ++j)
+ if (Vals[j] != Amt.getOperand(i + j))
+ return SDValue();
+ }
+
+ if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
+ return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
+ }
+ return SDValue();
+}
+
+// Convert a shift/rotate left amount to a multiplication scale factor.
+static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Amt.getSimpleValueType();
+ if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
+ (Subtarget.hasInt256() && VT == MVT::v16i16) ||
+ (Subtarget.hasVBMI2() && VT == MVT::v32i16) ||
+ (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
+ return SDValue();
+
+ if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+ SmallVector<SDValue, 8> Elts;
+ MVT SVT = VT.getVectorElementType();
+ unsigned SVTBits = SVT.getSizeInBits();
+ APInt One(SVTBits, 1);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ for (unsigned i = 0; i != NumElems; ++i) {
+ SDValue Op = Amt->getOperand(i);
+ if (Op->isUndef()) {
+ Elts.push_back(Op);
+ continue;
+ }
+
+ ConstantSDNode *ND = cast<ConstantSDNode>(Op);
+ APInt C(SVTBits, ND->getZExtValue());
+ uint64_t ShAmt = C.getZExtValue();
+ if (ShAmt >= SVTBits) {
+ Elts.push_back(DAG.getUNDEF(SVT));
+ continue;
+ }
+ Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
+ }
+ return DAG.getBuildVector(VT, dl, Elts);
+ }
+
+ // If the target doesn't support variable shifts, use either FP conversion
+ // or integer multiplication to avoid shifting each element individually.
+ if (VT == MVT::v4i32) {
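+ // Build 2^Amt directly as an IEEE-754 float: placing Amt in the exponent
+ // field (Amt << 23) and adding the bias (0x3f800000 == 1.0f) yields the
+ // float 2^Amt, which FP_TO_SINT converts back to an integer scale.
+ // E.g. Amt == 5 gives 0x42000000 == 32.0f, i.e. a scale of 32.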
+ Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
+ Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
+ DAG.getConstant(0x3f800000U, dl, VT));
+ Amt = DAG.getBitcast(MVT::v4f32, Amt);
+ return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
+ }
+
+ // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
+ if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
+ SDValue Z = DAG.getConstant(0, dl, VT);
+ SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
+ SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
+ Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
+ Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
+ if (Subtarget.hasSSE41())
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
+
+ return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
+ DAG.getBitcast(VT, Hi),
+ {0, 2, 4, 6, 8, 10, 12, 14});
+ }
+
+ return SDValue();
+}
+
+static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ SDLoc dl(Op);
+ SDValue R = Op.getOperand(0);
+ SDValue Amt = Op.getOperand(1);
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
+
+ unsigned Opc = Op.getOpcode();
+ unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
+ unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
+
+ assert(VT.isVector() && "Custom lowering only for vector shifts!");
+ assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
+
+ if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
+ return V;
+
+ if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
+ return V;
+
+ if (SupportedVectorVarShift(VT, Subtarget, Opc))
+ return Op;
+
+ // XOP has 128-bit variable logical/arithmetic shifts.
+ // +ve/-ve Amt = shift left/right.
+ if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
+ VT == MVT::v8i16 || VT == MVT::v16i8)) {
+ if (Opc == ISD::SRL || Opc == ISD::SRA) {
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+ Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
+ }
+ if (Opc == ISD::SHL || Opc == ISD::SRL)
+ return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
+ if (Opc == ISD::SRA)
+ return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
+ }
+
+ // v2i64 vector logical shifts can efficiently avoid scalarization - do the
+ // shifts per-lane and then shuffle the partial results back together.
+ if (VT == MVT::v2i64 && Opc != ISD::SRA) {
+ // Splat the shift amounts so the scalar shifts above will catch it.
+ SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
+ SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
+ SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
+ SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
+ return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
+ }
+
+ // i64 vector arithmetic shift can be emulated with the transform:
+ // M = lshr(SIGN_MASK, Amt)
+ // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
+ if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
+ Opc == ISD::SRA) {
+ SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
+ SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
+ R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
+ R = DAG.getNode(ISD::XOR, dl, VT, R, M);
+ R = DAG.getNode(ISD::SUB, dl, VT, R, M);
+ return R;
+ }
+
+ // If possible, lower this shift as a sequence of two shifts by
+ // constant plus a BLENDing shuffle instead of scalarizing it.
+ // Example:
+ // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
+ //
+ // Could be rewritten as:
+ // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
+ //
+ // The advantage is that the two shifts from the example would be
+ // lowered as X86ISD::VSRLI nodes in parallel before blending.
+ if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
+ (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
+ SDValue Amt1, Amt2;
+ unsigned NumElts = VT.getVectorNumElements();
+ SmallVector<int, 8> ShuffleMask;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ SDValue A = Amt->getOperand(i);
+ if (A.isUndef()) {
+ ShuffleMask.push_back(SM_SentinelUndef);
+ continue;
+ }
+ if (!Amt1 || Amt1 == A) {
+ ShuffleMask.push_back(i);
+ Amt1 = A;
+ continue;
+ }
+ if (!Amt2 || Amt2 == A) {
+ ShuffleMask.push_back(i + NumElts);
+ Amt2 = A;
+ continue;
+ }
+ break;
+ }
+
+ // Only perform this blend if we can perform it without loading a mask.
+ if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
+ (VT != MVT::v16i16 ||
+ is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
+ (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
+ canWidenShuffleElements(ShuffleMask))) {
+ auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
+ auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
+ if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
+ Cst2->getAPIntValue().ult(EltSizeInBits)) {
+ SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
+ Cst1->getZExtValue(), DAG);
+ SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
+ Cst2->getZExtValue(), DAG);
+ return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
+ }
+ }
+ }
+
+ // If possible, lower this packed shift into a vector multiply instead of
+ // expanding it into a sequence of scalar shifts.
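+ // Per element, (shl X, C) == (mul X, 1 << C), so the whole vector shift can
+ // be lowered as a single vector multiply by the scale vector.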
+ if (Opc == ISD::SHL)
+ if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
+ return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
+
+ // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
+ // can replace it with ISD::MULHU, creating a scale factor from (NumEltBits - Amt).
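+ // For 0 < C < 16, srl(X, C) == mulhu(X, 1 << (16 - C)); e.g. C == 3 gives a
+ // scale of 1 << 13, and (X * 8192) >> 16 == X >> 3. The C == 0 case would
+ // wrap the scale, so it is handled by the select below.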
+ if (Opc == ISD::SRL && ConstantAmt &&
+ (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
+ SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
+ SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
+ if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+ SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
+ SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
+ return DAG.getSelect(dl, VT, ZAmt, R, Res);
+ }
+ }
+
+ // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
+ // can replace it with ISD::MULHS, creating a scale factor from (NumEltBits - Amt).
+ // TODO: Special case handling for shift by 0/1, really we can afford either
+ // of these cases in pre-SSE41/XOP/AVX512 but not both.
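+ // Likewise, for 1 < C < 16, sra(X, C) == mulhs(X, 1 << (16 - C)); the C == 0
+ // and C == 1 cases are picked off with selects below, since the C == 1 scale
+ // (1 << 15) is negative when interpreted as an i16.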
+ if (Opc == ISD::SRA && ConstantAmt &&
+ (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
+ ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
+ !Subtarget.hasAVX512()) ||
+ DAG.isKnownNeverZero(Amt))) {
+ SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
+ SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
+ if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
+ SDValue Amt0 =
+ DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
+ SDValue Amt1 =
+ DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
+ SDValue Sra1 =
+ getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
+ SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
+ Res = DAG.getSelect(dl, VT, Amt0, R, Res);
+ return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
+ }
+ }
+
+ // v4i32 Non Uniform Shifts.
+ // If the shift amount is constant we can shift each lane using the SSE2
+ // immediate shifts, else we need to zero-extend each lane to the lower i64
+ // and shift using the SSE2 variable shifts.
+ // The separate results can then be blended together.
+ if (VT == MVT::v4i32) {
+ SDValue Amt0, Amt1, Amt2, Amt3;
+ if (ConstantAmt) {
+ Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
+ Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
+ Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
+ Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
+ } else {
+ // The SSE2 shifts use the lower i64 as the same shift amount for
+ // all lanes and the upper i64 is ignored. On AVX we're better off
+ // just zero-extending, but for SSE just duplicating the top 16-bits is
+ // cheaper and has the same effect for out of range values.
+ if (Subtarget.hasAVX()) {
+ SDValue Z = DAG.getConstant(0, dl, VT);
+ Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
+ Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
+ Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
+ Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
+ } else {
+ SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
+ SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
+ {4, 5, 6, 7, -1, -1, -1, -1});
+ Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
+ {0, 1, 1, 1, -1, -1, -1, -1});
+ Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
+ {2, 3, 3, 3, -1, -1, -1, -1});
+ Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
+ {0, 1, 1, 1, -1, -1, -1, -1});
+ Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
+ {2, 3, 3, 3, -1, -1, -1, -1});
+ }
+ }
+
+ unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
+ SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
+ SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
+ SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
+ SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
+
+ // Merge the shifted lane results optimally with/without PBLENDW.
+ // TODO - ideally shuffle combining would handle this.
+ if (Subtarget.hasSSE41()) {
+ SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
+ SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
+ return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
+ }
+ SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
+ SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
+ return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
+ }
+
+ // It's worth extending once and using the vXi16/vXi32 shifts for smaller
+ // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
+ // make the existing SSE solution better.
+ // NOTE: We honor the preferred vector width before promoting to 512 bits.
+ if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
+ (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
+ (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
+ (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
+ (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
+ assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
+ "Unexpected vector type");
+ MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
+ MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
+ unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ R = DAG.getNode(ExtOpc, dl, ExtVT, R);
+ Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT,
+ DAG.getNode(Opc, dl, ExtVT, R, Amt));
+ }
+
+ // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
+ // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
+ if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
+ (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
+ (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
+ !Subtarget.hasXOP()) {
+ int NumElts = VT.getVectorNumElements();
+ SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
+
+ // Extend constant shift amount to vXi16 (it doesn't matter if the type
+ // isn't legal).
+ MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
+ Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
+ Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
+ Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
+ assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
+ "Constant build vector expected");
+
+ if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
+ R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
+ : DAG.getZExtOrTrunc(R, dl, ExVT);
+ R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
+ R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
+ return DAG.getZExtOrTrunc(R, dl, VT);
+ }
+
+ SmallVector<SDValue, 16> LoAmt, HiAmt;
+ for (int i = 0; i != NumElts; i += 16) {
+ for (int j = 0; j != 8; ++j) {
+ LoAmt.push_back(Amt.getOperand(i + j));
+ HiAmt.push_back(Amt.getOperand(i + j + 8));
+ }
+ }
+
+ MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
+ SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
+ SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
+
+ SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
+ SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
+ LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
+ HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
+ LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
+ HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
+ LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
+ HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
+ }
+
+ if (VT == MVT::v16i8 ||
+ (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
+ (VT == MVT::v64i8 && Subtarget.hasBWI())) {
+ MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
+
+ auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
+ if (VT.is512BitVector()) {
+ // On AVX512BW targets we make use of the fact that VSELECT lowers
+ // to a masked blend which selects bytes based just on the sign bit
+ // extracted to a mask.
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ V0 = DAG.getBitcast(VT, V0);
+ V1 = DAG.getBitcast(VT, V1);
+ Sel = DAG.getBitcast(VT, Sel);
+ Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
+ ISD::SETGT);
+ return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
+ } else if (Subtarget.hasSSE41()) {
+ // On SSE41 targets we can use PBLENDVB which selects bytes based just
+ // on the sign bit.
+ V0 = DAG.getBitcast(VT, V0);
+ V1 = DAG.getBitcast(VT, V1);
+ Sel = DAG.getBitcast(VT, Sel);
+ return DAG.getBitcast(SelVT,
+ DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
+ }
+ // On pre-SSE41 targets we test for the sign bit by comparing to
+ // zero - a negative value will set all bits of the lanes to true
+ // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
+ SDValue Z = DAG.getConstant(0, dl, SelVT);
+ SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
+ return DAG.getSelect(dl, SelVT, C, V0, V1);
+ };
+
+ // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
+ // We can safely do this using i16 shifts as we're only interested in
+ // the 3 lower bits of each byte.
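+ // After the shift by 5, bit 2 of the original amount sits in each byte's
+ // sign bit, so the first blend selects the shift-by-4 result; each 'a += a'
+ // then exposes bit 1 and bit 0 for the shift-by-2 and shift-by-1 stages.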
+ Amt = DAG.getBitcast(ExtVT, Amt);
+ Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
+ Amt = DAG.getBitcast(VT, Amt);
+
+ if (Opc == ISD::SHL || Opc == ISD::SRL) {
+ // r = VSELECT(r, shift(r, 4), a);
+ SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
+ R = SignBitSelect(VT, Amt, M, R);
+
+ // a += a
+ Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+ // r = VSELECT(r, shift(r, 2), a);
+ M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
+ R = SignBitSelect(VT, Amt, M, R);
+
+ // a += a
+ Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+ // return VSELECT(r, shift(r, 1), a);
+ M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
+ R = SignBitSelect(VT, Amt, M, R);
+ return R;
+ }
+
+ if (Opc == ISD::SRA) {
+ // For SRA we need to unpack each byte to the higher byte of an i16 vector
+ // so we can correctly sign extend. We don't care what happens to the
+ // lower byte.
+ SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
+ SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
+ SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
+ SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
+ ALo = DAG.getBitcast(ExtVT, ALo);
+ AHi = DAG.getBitcast(ExtVT, AHi);
+ RLo = DAG.getBitcast(ExtVT, RLo);
+ RHi = DAG.getBitcast(ExtVT, RHi);
+
+ // r = VSELECT(r, shift(r, 4), a);
+ SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
+ SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
+ RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
+ RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
+
+ // a += a
+ ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
+ AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
+
+ // r = VSELECT(r, shift(r, 2), a);
+ MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
+ MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
+ RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
+ RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
+
+ // a += a
+ ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
+ AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
+
+ // r = VSELECT(r, shift(r, 1), a);
+ MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
+ MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
+ RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
+ RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
+
+ // Logical shift the result back to the lower byte, leaving a zero upper
+ // byte meaning that we can safely pack with PACKUSWB.
+ RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
+ RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+ }
+ }
+
+ if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
+ MVT ExtVT = MVT::v8i32;
+ SDValue Z = DAG.getConstant(0, dl, VT);
+ SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
+ SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
+ SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
+ SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
+ ALo = DAG.getBitcast(ExtVT, ALo);
+ AHi = DAG.getBitcast(ExtVT, AHi);
+ RLo = DAG.getBitcast(ExtVT, RLo);
+ RHi = DAG.getBitcast(ExtVT, RHi);
+ SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
+ SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
+ Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
+ Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
+ }
+
+ if (VT == MVT::v8i16) {
+ // If we have a constant shift amount, the non-SSE41 path is best as
+ // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
+ bool UseSSE41 = Subtarget.hasSSE41() &&
+ !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
+
+ auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
+ // On SSE41 targets we can use PBLENDVB which selects bytes based just on
+ // the sign bit.
+ if (UseSSE41) {
+ MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
+ V0 = DAG.getBitcast(ExtVT, V0);
+ V1 = DAG.getBitcast(ExtVT, V1);
+ Sel = DAG.getBitcast(ExtVT, Sel);
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
+ }
+ // On pre-SSE41 targets we splat the sign bit - a negative value will
+ // set all bits of the lanes to true and VSELECT uses that in
+ // its OR(AND(V0,C),AND(V1,~C)) lowering.
+ SDValue C =
+ getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
+ return DAG.getSelect(dl, VT, C, V0, V1);
+ };
+
+ // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
+ if (UseSSE41) {
+ // On SSE41 targets we need to replicate the shift mask in both
+ // bytes for PBLENDVB.
+ Amt = DAG.getNode(
+ ISD::OR, dl, VT,
+ getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
+ getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
+ } else {
+ Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
+ }
+
+ // r = VSELECT(r, shift(r, 8), a);
+ SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
+ R = SignBitSelect(Amt, M, R);
+
+ // a += a
+ Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+ // r = VSELECT(r, shift(r, 4), a);
+ M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
+ R = SignBitSelect(Amt, M, R);
+
+ // a += a
+ Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+ // r = VSELECT(r, shift(r, 2), a);
+ M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
+ R = SignBitSelect(Amt, M, R);
+
+ // a += a
+ Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+ // return VSELECT(r, shift(r, 1), a);
+ M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
+ R = SignBitSelect(Amt, M, R);
+ return R;
+ }
+
+ // Decompose 256-bit shifts into 128-bit shifts.
+ if (VT.is256BitVector())
+ return splitVectorIntBinary(Op, DAG);
+
+ if (VT == MVT::v32i16 || VT == MVT::v64i8)
+ return splitVectorIntBinary(Op, DAG);
+
+ return SDValue();
+}
+
+static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ assert(VT.isVector() && "Custom lowering only for vector rotates!");
+
+ SDLoc DL(Op);
+ SDValue R = Op.getOperand(0);
+ SDValue Amt = Op.getOperand(1);
+ unsigned Opcode = Op.getOpcode();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ int NumElts = VT.getVectorNumElements();
+
+ // Check for constant splat rotation amount.
+ APInt CstSplatValue;
+ bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
+
+ // Check for splat rotate by zero.
+ if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
+ return R;
+
+ // AVX512 implicitly uses modulo rotation amounts.
+ if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
+ // Attempt to rotate by immediate.
+ if (IsCstSplat) {
+ unsigned RotOpc = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
+ uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
+ return DAG.getNode(RotOpc, DL, VT, R,
+ DAG.getTargetConstant(RotAmt, DL, MVT::i8));
+ }
+
+ // Else, fall-back on VPROLV/VPRORV.
+ return Op;
+ }
+
+ // AVX512 VBMI2 vXi16 - lower to funnel shifts.
+ if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
+ unsigned FunnelOpc = (Opcode == ISD::ROTL ? ISD::FSHL : ISD::FSHR);
+ return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
+ }
+
+ assert((Opcode == ISD::ROTL) && "Only ROTL supported");
+
+ // XOP has 128-bit vector variable + immediate rotates.
+ // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
+ // XOP implicitly uses modulo rotation amounts.
+ if (Subtarget.hasXOP()) {
+ if (VT.is256BitVector())
+ return splitVectorIntBinary(Op, DAG);
+ assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
+
+ // Attempt to rotate by immediate.
+ if (IsCstSplat) {
+ uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
+ return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
+ DAG.getTargetConstant(RotAmt, DL, MVT::i8));
+ }
+
+ // Use general rotate by variable (per-element).
+ return Op;
+ }
+
+ // Split 256-bit integers on pre-AVX2 targets.
+ if (VT.is256BitVector() && !Subtarget.hasAVX2())
+ return splitVectorIntBinary(Op, DAG);
+
+ assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
+ ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8 ||
+ VT == MVT::v32i16) &&
+ Subtarget.hasAVX2())) &&
+ "Only vXi32/vXi16/vXi8 vector rotates supported");
+
+ // Rotate by a uniform constant - expand back to shifts.
+ if (IsCstSplat)
+ return SDValue();
+
+ bool IsSplatAmt = DAG.isSplatValue(Amt);
+
+ // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
+ // the amount bit.
+ if (EltSizeInBits == 8 && !IsSplatAmt) {
+ if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
+ return SDValue();
+
+ // We don't need ModuloAmt here as we just peek at individual bits.
+ MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
+
+ auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
+ if (Subtarget.hasSSE41()) {
+ // On SSE41 targets we can use PBLENDVB which selects bytes based just
+ // on the sign bit.
+ V0 = DAG.getBitcast(VT, V0);
+ V1 = DAG.getBitcast(VT, V1);
+ Sel = DAG.getBitcast(VT, Sel);
+ return DAG.getBitcast(SelVT,
+ DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
+ }
+ // On pre-SSE41 targets we test for the sign bit by comparing to
+ // zero - a negative value will set all bits of the lanes to true
+ // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
+ SDValue Z = DAG.getConstant(0, DL, SelVT);
+ SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
+ return DAG.getSelect(DL, SelVT, C, V0, V1);
+ };
+
+ // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
+ // We can safely do this using i16 shifts as we're only interested in
+ // the 3 lower bits of each byte.
+ Amt = DAG.getBitcast(ExtVT, Amt);
+ Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
+ Amt = DAG.getBitcast(VT, Amt);
+
+ // r = VSELECT(r, rot(r, 4), a);
+ SDValue M;
+ M = DAG.getNode(
+ ISD::OR, DL, VT,
+ DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
+ DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
+ R = SignBitSelect(VT, Amt, M, R);
+
+ // a += a
+ Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
+
+ // r = VSELECT(r, rot(r, 2), a);
+ M = DAG.getNode(
+ ISD::OR, DL, VT,
+ DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
+ DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
+ R = SignBitSelect(VT, Amt, M, R);
+
+ // a += a
+ Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
+
+ // return VSELECT(r, rot(r, 1), a);
+ M = DAG.getNode(
+ ISD::OR, DL, VT,
+ DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
+ DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
+ return SignBitSelect(VT, Amt, M, R);
+ }
+
+ // ISD::ROT* uses modulo rotate amounts.
+ Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
+ DAG.getConstant(EltSizeInBits - 1, DL, VT));
+
+ bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
+ bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
+ SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
+
+ // Fallback for splats + all supported variable shifts.
+ // Fallback for non-constants AVX2 vXi16 as well.
+ if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
+ SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
+ AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
+ SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
+ SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
+ return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
+ }
+
+ // As with shifts, convert the rotation amount to a multiplication factor.
+ SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
+ assert(Scale && "Failed to convert ROTL amount to scale");
+
+ // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
+ if (EltSizeInBits == 16) {
+ SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
+ SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
+ return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
+ }
+
+ // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
+ // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
+ // that can then be OR'd with the lower 32-bits.
+ assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
+ static const int OddMask[] = {1, -1, 3, -1};
+ SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
+ SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
+
+ SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
+ DAG.getBitcast(MVT::v2i64, R),
+ DAG.getBitcast(MVT::v2i64, Scale));
+ SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
+ DAG.getBitcast(MVT::v2i64, R13),
+ DAG.getBitcast(MVT::v2i64, Scale13));
+ Res02 = DAG.getBitcast(VT, Res02);
+ Res13 = DAG.getBitcast(VT, Res13);
+
+ return DAG.getNode(ISD::OR, DL, VT,
+ DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
+ DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
+}
+
+/// Returns true if the operand type is exactly twice the native width, and
+/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
+/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
+/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
+bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
+ unsigned OpWidth = MemType->getPrimitiveSizeInBits();
+
+ if (OpWidth == 64)
+ return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
+ if (OpWidth == 128)
+ return Subtarget.hasCmpxchg16b();
+
+ return false;
+}
+
+bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
+ Type *MemType = SI->getValueOperand()->getType();
+
+ bool NoImplicitFloatOps =
+ SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
+ if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+ !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
+ (Subtarget.hasSSE1() || Subtarget.hasX87()))
+ return false;
+
+ return needsCmpXchgNb(MemType);
+}
+
+// Note: this turns large loads into lock cmpxchg8b/16b.
+// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
+TargetLowering::AtomicExpansionKind
+X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+ Type *MemType = LI->getType();
+
+ // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
+ // can use movq to do the load. If we have X87 we can load into an 80-bit
+ // X87 register and store it to a stack temporary.
+ bool NoImplicitFloatOps =
+ LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
+ if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+ !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
+ (Subtarget.hasSSE1() || Subtarget.hasX87()))
+ return AtomicExpansionKind::None;
+
+ return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
+ : AtomicExpansionKind::None;
+}
+
+TargetLowering::AtomicExpansionKind
+X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+ unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+ Type *MemType = AI->getType();
+
+ // If the operand is too big, we must see if cmpxchg8/16b is available
+ // and default to library calls otherwise.
+ if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
+ return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
+ : AtomicExpansionKind::None;
+ }
+
+ AtomicRMWInst::BinOp Op = AI->getOperation();
+ switch (Op) {
+ default:
+ llvm_unreachable("Unknown atomic operation");
+ case AtomicRMWInst::Xchg:
+ case AtomicRMWInst::Add:
+ case AtomicRMWInst::Sub:
+ // It's better to use xadd, xsub or xchg for these in all cases.
+ return AtomicExpansionKind::None;
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::And:
+ case AtomicRMWInst::Xor:
+ // If the atomicrmw's result isn't actually used, we can just add a "lock"
+ // prefix to a normal instruction for these operations.
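+ // E.g. an 'atomicrmw or' whose result is unused becomes a single
+ // 'lock or' on the memory operand.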
+ return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
+ : AtomicExpansionKind::None;
+ case AtomicRMWInst::Nand:
+ case AtomicRMWInst::Max:
+ case AtomicRMWInst::Min:
+ case AtomicRMWInst::UMax:
+ case AtomicRMWInst::UMin:
+ case AtomicRMWInst::FAdd:
+ case AtomicRMWInst::FSub:
+ // These always require a non-trivial set of data operations on x86. We must
+ // use a cmpxchg loop.
+ return AtomicExpansionKind::CmpXChg;
+ }
+}
+
+LoadInst *
+X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
+ unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+ Type *MemType = AI->getType();
+ // Accesses larger than the native width are turned into cmpxchg/libcalls, so
+ // there is no benefit in turning such RMWs into loads, and it is actually
+ // harmful as it introduces an mfence.
+ if (MemType->getPrimitiveSizeInBits() > NativeWidth)
+ return nullptr;
+
+ // If this is a canonical idempotent atomicrmw w/no uses, we have a better
+ // lowering available in lowerAtomicArith.
+ // TODO: push more cases through this path.
+ if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
+ if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
+ AI->use_empty())
+ return nullptr;
+
+ IRBuilder<> Builder(AI);
+ Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+ auto SSID = AI->getSyncScopeID();
+ // We must restrict the ordering to avoid generating loads with Release or
+ // ReleaseAcquire orderings.
+ auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
+
+ // Before the load we need a fence. Here is an example lifted from
+ // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
+ // is required:
+ // Thread 0:
+ // x.store(1, relaxed);
+ // r1 = y.fetch_add(0, release);
+ // Thread 1:
+ // y.fetch_add(42, acquire);
+ // r2 = x.load(relaxed);
+ // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
+ // lowered to just a load without a fence. An mfence flushes the store buffer,
+ // making the optimization clearly correct.
+ // FIXME: the fence is required if isReleaseOrStronger(Order), but it is not
+ // clear whether it is needed otherwise; we might be able to be more
+ // aggressive on relaxed idempotent rmw. In practice, they do not look
+ // useful, so we don't try to be especially clever.
+ if (SSID == SyncScope::SingleThread)
+ // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
+ // the IR level, so we must wrap it in an intrinsic.
+ return nullptr;
+
+ if (!Subtarget.hasMFence())
+ // FIXME: it might make sense to use a locked operation here but on a
+ // different cache-line to prevent cache-line bouncing. In practice it
+ // is probably a small win, and x86 processors without mfence are rare
+ // enough that we do not bother.
+ return nullptr;
+
+ Function *MFence =
+ llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
+ Builder.CreateCall(MFence, {});
+
+ // Finally we can emit the atomic load.
+ LoadInst *Loaded =
+ Builder.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(),
+ Align(AI->getType()->getPrimitiveSizeInBits()));
+ Loaded->setAtomic(Order, SSID);
+ AI->replaceAllUsesWith(Loaded);
+ AI->eraseFromParent();
+ return Loaded;
+}
+
+bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
+ if (!SI.isUnordered())
+ return false;
+ return ExperimentalUnorderedISEL;
+}
+bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
+ if (!LI.isUnordered())
+ return false;
+ return ExperimentalUnorderedISEL;
+}
+
+/// Emit a locked operation on a stack location which does not change any
+/// memory location, but does involve a lock prefix. Location is chosen to be
+/// a) very likely accessed only by a single thread to minimize cache traffic,
+/// and b) definitely dereferenceable. Returns the new Chain result.
+static SDValue emitLockedStackOp(SelectionDAG &DAG,
+ const X86Subtarget &Subtarget, SDValue Chain,
+ const SDLoc &DL) {
+ // Implementation notes:
+ // 1) LOCK prefix creates a full read/write reordering barrier for memory
+ // operations issued by the current processor. As such, the location
+ // referenced is not relevant for the ordering properties of the instruction.
+ // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
+ // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
+ // 2) Using an immediate operand appears to be the best encoding choice
+ // here since it doesn't require an extra register.
+ // 3) OR appears to be very slightly faster than ADD. (Though, the difference
+ // is small enough it might just be measurement noise.)
+ // 4) When choosing offsets, there are several contributing factors:
+ // a) If there's no redzone, we default to TOS. (We could allocate a cache
+ // line aligned stack object to improve this case.)
+ // b) To minimize our chances of introducing a false dependence, we prefer
+ // to offset the stack usage from TOS slightly.
+ // c) To minimize concerns about cross thread stack usage - in particular,
+ // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
+ // captures state in the TOS frame and accesses it from many threads -
+ // we want to use an offset such that the offset is in a distinct cache
+ // line from the TOS frame.
+ //
+ // For a general discussion of the tradeoffs and benchmark results, see:
+ // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+
+ auto &MF = DAG.getMachineFunction();
+ auto &TFL = *Subtarget.getFrameLowering();
+ const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
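+ // The node built below is effectively 'lock orl $0, (%rsp)' (or at
+ // -64(%rsp) when a 128-byte red zone is present; ESP in 32-bit mode),
+ // which acts as a full memory barrier while leaving the stored value
+ // unchanged.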
+
+ if (Subtarget.is64Bit()) {
+ SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
+ SDValue Ops[] = {
+ DAG.getRegister(X86::RSP, MVT::i64), // Base
+ DAG.getTargetConstant(1, DL, MVT::i8), // Scale
+ DAG.getRegister(0, MVT::i64), // Index
+ DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
+ DAG.getRegister(0, MVT::i16), // Segment.
+ Zero,
+ Chain};
+ SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
+ MVT::Other, Ops);
+ return SDValue(Res, 1);
+ }
+
+ SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
+ SDValue Ops[] = {
+ DAG.getRegister(X86::ESP, MVT::i32), // Base
+ DAG.getTargetConstant(1, DL, MVT::i8), // Scale
+ DAG.getRegister(0, MVT::i32), // Index
+ DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
+ DAG.getRegister(0, MVT::i16), // Segment.
+ Zero,
+ Chain
+ };
+ SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
+ MVT::Other, Ops);
+ return SDValue(Res, 1);
+}
+
+static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ AtomicOrdering FenceOrdering =
+ static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
+ SyncScope::ID FenceSSID =
+ static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
+
+ // The only fence that needs an instruction is a sequentially-consistent
+ // cross-thread fence.
+ if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
+ FenceSSID == SyncScope::System) {
+ if (Subtarget.hasMFence())
+ return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
+
+ SDValue Chain = Op.getOperand(0);
+ return emitLockedStackOp(DAG, Subtarget, Chain, dl);
+ }
+
+ // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+ return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
+}
+
+static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT T = Op.getSimpleValueType();
+ SDLoc DL(Op);
+ unsigned Reg = 0;
+ unsigned size = 0;
+ switch(T.SimpleTy) {
+ default: llvm_unreachable("Invalid value type!");
+ case MVT::i8: Reg = X86::AL; size = 1; break;
+ case MVT::i16: Reg = X86::AX; size = 2; break;
+ case MVT::i32: Reg = X86::EAX; size = 4; break;
+ case MVT::i64:
+ assert(Subtarget.is64Bit() && "Node not type legal!");
+ Reg = X86::RAX; size = 8;
+ break;
+ }
+ SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
+ Op.getOperand(2), SDValue());
+ SDValue Ops[] = { cpIn.getValue(0),
+ Op.getOperand(1),
+ Op.getOperand(3),
+ DAG.getTargetConstant(size, DL, MVT::i8),
+ cpIn.getValue(1) };
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
+ SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
+ Ops, T, MMO);
+
+ SDValue cpOut =
+ DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
+ SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
+ MVT::i32, cpOut.getValue(2));
+ SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
+ cpOut, Success, EFLAGS.getValue(1));
+}
+
+// Create MOVMSKB, taking into account whether we need to split for AVX1.
+static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT InVT = V.getSimpleValueType();
+
+ if (InVT == MVT::v64i8) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
+ Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
+ Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
+ Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
+ Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
+ DAG.getConstant(32, DL, MVT::i8));
+ return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
+ }
+ if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
+ Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
+ Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
+ Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
+ DAG.getConstant(16, DL, MVT::i8));
+ return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
+ }
+
+ return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
+}
+
+static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+ MVT DstVT = Op.getSimpleValueType();
+
+ // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
+ // half to v32i1 and concatenating the result.
+ if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
+ assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
+ assert(Subtarget.hasBWI() && "Expected BWI target");
+ SDLoc dl(Op);
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
+ DAG.getIntPtrConstant(0, dl));
+ Lo = DAG.getBitcast(MVT::v32i1, Lo);
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
+ DAG.getIntPtrConstant(1, dl));
+ Hi = DAG.getBitcast(MVT::v32i1, Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
+ }
+
+ // Use MOVMSK for vector to scalar conversion to prevent scalarization.
+ if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
+ assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
+ MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
+ SDLoc DL(Op);
+ SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
+ V = getPMOVMSKB(DL, V, DAG, Subtarget);
+ return DAG.getZExtOrTrunc(V, DL, DstVT);
+ }
+
+ assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
+ SrcVT == MVT::i64) && "Unexpected VT!");
+
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
+ if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
+ !(DstVT == MVT::x86mmx && SrcVT.isVector()))
+ // This conversion needs to be expanded.
+ return SDValue();
+
+ SDLoc dl(Op);
+ if (SrcVT.isVector()) {
+ // Widen the input vector in the case of MVT::v2i32.
+ // Example: from MVT::v2i32 to MVT::v4i32.
+ MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
+ SrcVT.getVectorNumElements() * 2);
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
+ DAG.getUNDEF(SrcVT));
+ } else {
+ assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
+ "Unexpected source type in LowerBITCAST");
+ Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
+ }
+
+ MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
+ Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
+
+ if (DstVT == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
+ DAG.getIntPtrConstant(0, dl));
+}
+
+/// Compute the horizontal sum of bytes in V for the elements of VT.
+///
+/// Requires V to be a byte vector and VT to be an integer vector type with
+/// wider elements than V's type. The width of the elements of VT determines
+/// how many bytes of V are summed horizontally to produce each element of the
+/// result.
+static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(V);
+ MVT ByteVecVT = V.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
+ "Expected value to have byte element type.");
+ assert(EltVT != MVT::i8 &&
+ "Horizontal byte sum only makes sense for wider elements!");
+ unsigned VecSize = VT.getSizeInBits();
+ assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
+
+ // The PSADBW instruction horizontally adds all bytes and leaves the result
+ // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
+ if (EltVT == MVT::i64) {
+ SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
+ MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
+ V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
+ return DAG.getBitcast(VT, V);
+ }
+
+ if (EltVT == MVT::i32) {
+ // We unpack the low half and high half into i32s interleaved with zeros so
+ // that we can use PSADBW to horizontally sum them. The most useful part of
+ // this is that it lines up the results of two PSADBW instructions to be
+ // two v2i64 vectors which concatenated are the 4 population counts. We can
+ // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
+ SDValue Zeros = DAG.getConstant(0, DL, VT);
+ SDValue V32 = DAG.getBitcast(VT, V);
+ SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
+ SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
+
+ // Do the horizontal sums into two v2i64s.
+ Zeros = DAG.getConstant(0, DL, ByteVecVT);
+ MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
+ Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
+ DAG.getBitcast(ByteVecVT, Low), Zeros);
+ High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
+ DAG.getBitcast(ByteVecVT, High), Zeros);
+
+ // Merge them together.
+ MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
+ V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
+ DAG.getBitcast(ShortVecVT, Low),
+ DAG.getBitcast(ShortVecVT, High));
+
+ return DAG.getBitcast(VT, V);
+ }
+
+ // The only element type left is i16.
+ assert(EltVT == MVT::i16 && "Unknown how to handle type");
+
+ // To obtain pop count for each i16 element starting from the pop count for
+ // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
+ // right by 8. It is important to shift as i16s, since i8 vector shifts aren't
+ // directly supported.
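+ // E.g. for an i16 lane holding byte counts [hi | lo]: shl by 8 gives
+ // [lo | 0], the i8 add gives [hi + lo | lo], and the final srl by 8 leaves
+ // [0 | hi + lo].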
+ SDValue ShifterV = DAG.getConstant(8, DL, VT);
+ SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
+ V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
+ DAG.getBitcast(ByteVecVT, V));
+ return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
+}
+
+static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ int NumElts = VT.getVectorNumElements();
+ (void)EltVT;
+ assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
+
+ // Implement a lookup table in register by using an algorithm based on:
+ // http://wm.ite.pl/articles/sse-popcount.html
+ //
+ // The general idea is that every nibble of each byte in the input vector is
+ // an index into an in-register pre-computed pop count table. We then split
+ // the input vector into two new ones: (1) a vector with only the
+ // shifted-right higher nibbles for each byte and (2) a vector with the lower
+ // nibbles (and masked-out higher ones) for each byte. PSHUFB is used
+ // separately with both to index the in-register table. Next, both are added
+ // and the result is an i8 vector where each element contains the pop count
+ // for its input byte.
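+ // E.g. for an input byte 0xB7 the high nibble 0xB and the low nibble 0x7
+ // each look up a LUT value of 3, giving the correct pop count of 6.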
+ const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
+ /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
+ /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
+ /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
+
+ SmallVector<SDValue, 64> LUTVec;
+ for (int i = 0; i < NumElts; ++i)
+ LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
+ SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
+ SDValue M0F = DAG.getConstant(0x0F, DL, VT);
+
+ // High nibbles
+ SDValue FourV = DAG.getConstant(4, DL, VT);
+ SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
+
+ // Low nibbles
+ SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
+
+ // The input vector is used as the shuffle mask that index elements into the
+ // LUT. After counting low and high nibbles, add the vector to obtain the
+ // final pop count per i8 element.
+ SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
+ SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
+ return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
+}
+
+// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
+// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
+static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
+ "Unknown CTPOP type to handle");
+ SDLoc DL(Op.getNode());
+ SDValue Op0 = Op.getOperand(0);
+
+ // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
+ if (Subtarget.hasVPOPCNTDQ()) {
+ unsigned NumElems = VT.getVectorNumElements();
+ assert((VT.getVectorElementType() == MVT::i8 ||
+ VT.getVectorElementType() == MVT::i16) && "Unexpected type");
+ if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
+ MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
+ Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
+ Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
+ }
+ }
+
+ // Decompose 256-bit ops into smaller 128-bit ops.
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
+ return splitVectorIntUnary(Op, DAG);
+
+ // Decompose 512-bit ops into smaller 256-bit ops.
+ if (VT.is512BitVector() && !Subtarget.hasBWI())
+ return splitVectorIntUnary(Op, DAG);
+
+ // For element types greater than i8, do vXi8 pop counts and a bytesum.
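+ // For example, a v8i16 CTPOP is computed as a v16i8 CTPOP of the bitcast
+ // input, after which the two per-byte counts inside each i16 lane are summed
+ // back into that lane.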
+ if (VT.getScalarType() != MVT::i8) {
+ MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
+ SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
+ SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
+ return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
+ }
+
+ // We can't use the fast LUT approach, so fall back on LegalizeDAG.
+ if (!Subtarget.hasSSSE3())
+ return SDValue();
+
+ return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
+}
+
+static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(Op.getSimpleValueType().isVector() &&
+ "We only do custom lowering for vector population count.");
+ return LowerVectorCTPOP(Op, Subtarget, DAG);
+}
+
+static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(0);
+ SDLoc DL(Op);
+
+ // For scalars, it's still beneficial to transfer to/from the SIMD unit to
+ // perform the BITREVERSE.
+ if (!VT.isVector()) {
+ MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
+ SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
+ Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ int NumElts = VT.getVectorNumElements();
+ int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
+
+ // Decompose 256-bit ops into smaller 128-bit ops.
+ if (VT.is256BitVector())
+ return splitVectorIntUnary(Op, DAG);
+
+ assert(VT.is128BitVector() &&
+ "Only 128-bit vector bitreverse lowering supported.");
+
+ // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
+ // perform the BSWAP in the shuffle.
+ // It's best to shuffle using the second operand as this will implicitly
+ // allow memory folding for multiple vectors.
+ SmallVector<SDValue, 16> MaskElts;
+ for (int i = 0; i != NumElts; ++i) {
+ for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
+ int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
+ int PermuteByte = SourceByte | (2 << 5);
+ MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
+ }
+ }
+
+ SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
+ SDValue Res = DAG.getBitcast(MVT::v16i8, In);
+ Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
+ Res, Mask);
+ return DAG.getBitcast(VT, Res);
+}
+
+static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+
+ if (Subtarget.hasXOP() && !VT.is512BitVector())
+ return LowerBITREVERSE_XOP(Op, DAG);
+
+ assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
+
+ SDValue In = Op.getOperand(0);
+ SDLoc DL(Op);
+
+ assert(VT.getScalarType() == MVT::i8 &&
+ "Only byte vector BITREVERSE supported");
+
+ // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
+ if (VT == MVT::v64i8 && !Subtarget.hasBWI())
+ return splitVectorIntUnary(Op, DAG);
+
+ // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
+ if (VT == MVT::v32i8 && !Subtarget.hasInt256())
+ return splitVectorIntUnary(Op, DAG);
+
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
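+ // The 0x8040201008040201 constant encodes an 8x8 GF(2) bit matrix with a
+ // single set bit per matrix byte; together with a zero xor constant the
+ // affine transform maps bit i of every input byte to bit 7-i.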
+ if (Subtarget.hasGFNI()) {
+ MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
+ SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
+ Matrix = DAG.getBitcast(VT, Matrix);
+ return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
+ DAG.getTargetConstant(0, DL, MVT::i8));
+ }
+
+ // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
+ // two nibbles, and a PSHUFB lookup finds the bit-reverse of each
+ // 0-15 value (moved to the other nibble).
+ SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
+ SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
+ SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
+
+ const int LoLUT[16] = {
+ /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
+ /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
+ /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
+ /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
+ const int HiLUT[16] = {
+ /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
+ /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
+ /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
+ /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
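+
+ // For example, byte 0x2C (0b0010'1100): LoLUT[0xC] = 0x30 and
+ // HiLUT[0x2] = 0x04, so OR'ing the two PSHUFB lookups yields 0x34, the
+ // bit-reversed value of 0x2C.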
+
+ SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
+ HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
+ }
+
+ SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
+ SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
+ Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
+ Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
+ return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
+}
+
+static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ SDValue X = Op.getOperand(0);
+ MVT VT = Op.getSimpleValueType();
+
+ // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
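+ // (On x86 the PF flag is set when the low byte of a result contains an even
+ // number of one bits, so the *inverse* of PF is the xor-parity we want.)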
+ if (VT == MVT::i8 ||
+ DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
+ X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
+ SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
+ DAG.getConstant(0, DL, MVT::i8));
+ // Copy the inverse of the parity flag into a register with setcc.
+ SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
+ // Extend to the original type.
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
+ }
+
+ if (VT == MVT::i64) {
+ // Xor the high and low 32-bit halves together using a 32-bit operation.
+ SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
+ DAG.getNode(ISD::SRL, DL, MVT::i64, X,
+ DAG.getConstant(32, DL, MVT::i8)));
+ SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
+ X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
+ }
+
+ if (VT != MVT::i16) {
+ // Xor the high and low 16-bits together using a 32-bit operation.
+ SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
+ DAG.getConstant(16, DL, MVT::i8));
+ X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
+ } else {
+ // If the input is 16-bits, we need to extend to use an i32 shift below.
+ X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
+ }
+
+ // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
+ // This should allow an h-reg to be used to save a shift.
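+ // (Reading bits 15:8 through an h-register such as AH lets the final 8-bit
+ // XOR use the high byte directly instead of materializing a shift.)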
+ SDValue Hi = DAG.getNode(
+ ISD::TRUNCATE, DL, MVT::i8,
+ DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
+ SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
+ SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
+ SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
+
+ // Copy the inverse of the parity flag into a register with setcc.
+ SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
+ // Extend to the original type.
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
+}
+
+static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ unsigned NewOpc = 0;
+ switch (N->getOpcode()) {
+ case ISD::ATOMIC_LOAD_ADD:
+ NewOpc = X86ISD::LADD;
+ break;
+ case ISD::ATOMIC_LOAD_SUB:
+ NewOpc = X86ISD::LSUB;
+ break;
+ case ISD::ATOMIC_LOAD_OR:
+ NewOpc = X86ISD::LOR;
+ break;
+ case ISD::ATOMIC_LOAD_XOR:
+ NewOpc = X86ISD::LXOR;
+ break;
+ case ISD::ATOMIC_LOAD_AND:
+ NewOpc = X86ISD::LAND;
+ break;
+ default:
+ llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
+ }
+
+ MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
+
+ return DAG.getMemIntrinsicNode(
+ NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
+ {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
+ /*MemVT=*/N->getSimpleValueType(0), MMO);
+}
+
+/// Lower atomic_load_ops into LOCK-prefixed operations.
+static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
+ SDValue Chain = N->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ unsigned Opc = N->getOpcode();
+ MVT VT = N->getSimpleValueType(0);
+ SDLoc DL(N);
+
+ // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
+ // can only be lowered when the result is unused. They should have already
+ // been transformed into a cmpxchg loop in AtomicExpand.
+ if (N->hasAnyUseOfValue(0)) {
+ // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
+ // select LXADD if LOCK_SUB can't be selected.
+ if (Opc == ISD::ATOMIC_LOAD_SUB) {
+ RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
+ return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
+ RHS, AN->getMemOperand());
+ }
+ assert(Opc == ISD::ATOMIC_LOAD_ADD &&
+ "Used AtomicRMW ops other than Add should have been expanded!");
+ return N;
+ }
+
+ // Specialized lowering for the canonical form of an idempotent atomicrmw.
+ // The core idea here is that since the memory location isn't actually
+ // changing, all we need is a lowering for the *ordering* impacts of the
+ // atomicrmw. As such, we can choose a different operation and memory
+ // location to minimize impact on other code.
+ if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
+ // On X86, the only ordering which actually requires an instruction is
+ // seq_cst that isn't SingleThread; everything else just needs to be
+ // preserved during codegen and then dropped. Note that we expect (but
+ // don't assume) that orderings other than seq_cst and acq_rel have been
+ // canonicalized to a store or load.
+ if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
+ AN->getSyncScopeID() == SyncScope::System) {
+ // Prefer a locked operation against a stack location to minimize cache
+ // traffic. This assumes that stack locations are very likely to be
+ // accessed only by the owning thread.
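+ // (Any LOCK-prefixed read-modify-write is a full memory barrier on x86, and
+ // a locked op on a stack slot is typically cheaper than an MFENCE.)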
+ SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
+ assert(!N->hasAnyUseOfValue(0));
+ // NOTE: The getUNDEF is needed to give something for the unused result 0.
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
+ DAG.getUNDEF(VT), NewChain);
+ }
+ // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+ SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
+ assert(!N->hasAnyUseOfValue(0));
+ // NOTE: The getUNDEF is needed to give something for the unused result 0.
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
+ DAG.getUNDEF(VT), NewChain);
+ }
+
+ SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
+ // RAUW the chain, but don't worry about the result, as it's unused.
+ assert(!N->hasAnyUseOfValue(0));
+ // NOTE: The getUNDEF is needed to give something for the unused result 0.
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
+ DAG.getUNDEF(VT), LockOp.getValue(1));
+}
+
+static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ auto *Node = cast<AtomicSDNode>(Op.getNode());
+ SDLoc dl(Node);
+ EVT VT = Node->getMemoryVT();
+
+ bool IsSeqCst = Node->getOrdering() == AtomicOrdering::SequentiallyConsistent;
+ bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
+
+ // If this store is not sequentially consistent and the type is legal
+ // we can just keep it.
+ if (!IsSeqCst && IsTypeLegal)
+ return Op;
+
+ if (VT == MVT::i64 && !IsTypeLegal) {
+ // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
+ // is enabled.
+ bool NoImplicitFloatOps =
+ DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat);
+ if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
+ SDValue Chain;
+ if (Subtarget.hasSSE1()) {
+ SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+ Node->getOperand(2));
+ MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
+ SclToVec = DAG.getBitcast(StVT, SclToVec);
+ SDVTList Tys = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
+ Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
+ MVT::i64, Node->getMemOperand());
+ } else if (Subtarget.hasX87()) {
+ // First load this into an 80-bit X87 register using a stack temporary.
+ // This will put the whole integer into the significand.
+ SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+ Chain =
+ DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
+ MPI, MaybeAlign(), MachineMemOperand::MOStore);
+ SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
+ SDValue LdOps[] = {Chain, StackPtr};
+ SDValue Value =
+ DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
+ /*Align*/ None, MachineMemOperand::MOLoad);
+ Chain = Value.getValue(1);
+
+ // Now use an FIST to do the atomic store.
+ SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
+ Chain =
+ DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
+ StoreOps, MVT::i64, Node->getMemOperand());
+ }
+
+ if (Chain) {
+ // If this is a sequentially consistent store, also emit an appropriate
+ // barrier.
+ if (IsSeqCst)
+ Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
+
+ return Chain;
+ }
+ }
+ }
+
+ // Convert seq_cst store -> xchg
+ // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
+ // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
+ SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+ Node->getMemoryVT(),
+ Node->getOperand(0),
+ Node->getOperand(1), Node->getOperand(2),
+ Node->getMemOperand());
+ return Swap.getValue(1);
+}
+
+static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
+ SDNode *N = Op.getNode();
+ MVT VT = N->getSimpleValueType(0);
+ unsigned Opc = Op.getOpcode();
+
+ // Let legalize expand this if it isn't a legal type yet.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ SDLoc DL(N);
+
+ // Set the carry flag.
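+ // Adding all-ones to the incoming carry value overflows exactly when that
+ // value is nonzero, so the X86ISD::ADD below materializes the boolean carry
+ // into EFLAGS.CF for the ADC/SBB to consume.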
+ SDValue Carry = Op.getOperand(2);
+ EVT CarryVT = Carry.getValueType();
+ Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
+ Carry, DAG.getAllOnesConstant(DL, CarryVT));
+
+ bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
+ SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
+ Op.getOperand(0), Op.getOperand(1),
+ Carry.getValue(1));
+
+ bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
+ SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
+ Sum.getValue(1), DL, DAG);
+ if (N->getValueType(1) == MVT::i1)
+ SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
+}
+
+static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
+
+ // For MacOSX, we want to call an alternative entry point: __sincos_stret,
+ // which returns the values as { float, float } (in XMM0) or
+ // { double, double } (which is returned in XMM0, XMM1).
+ SDLoc dl(Op);
+ SDValue Arg = Op.getOperand(0);
+ EVT ArgVT = Arg.getValueType();
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+
+ Entry.Node = Arg;
+ Entry.Ty = ArgTy;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
+ Args.push_back(Entry);
+
+ bool isF64 = ArgVT == MVT::f64;
+ // Only optimize x86_64 for now. i386 is a bit messy. For f32,
+ // the small struct {f32, f32} is returned in (eax, edx). For f64,
+ // the results are returned via SRet in memory.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
+ const char *LibcallName = TLI.getLibcallName(LC);
+ SDValue Callee =
+ DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
+
+ Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
+ : (Type *)FixedVectorType::get(ArgTy, 4);
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl)
+ .setChain(DAG.getEntryNode())
+ .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
+
+ std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+
+ if (isF64)
+ // Returned in xmm0 and xmm1.
+ return CallResult.first;
+
+ // Returned in bits 0:31 and 32:63 of xmm0.
+ SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
+ CallResult.first, DAG.getIntPtrConstant(0, dl));
+ SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
+ CallResult.first, DAG.getIntPtrConstant(1, dl));
+ SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
+}
+
+/// Widen a vector input to a vector of NVT. The
+/// input vector must have the same element type as NVT.
+static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
+ bool FillWithZeroes = false) {
+ // Check if InOp already has the right width.
+ MVT InVT = InOp.getSimpleValueType();
+ if (InVT == NVT)
+ return InOp;
+
+ if (InOp.isUndef())
+ return DAG.getUNDEF(NVT);
+
+ assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
+ "input and widen element type must match");
+
+ unsigned InNumElts = InVT.getVectorNumElements();
+ unsigned WidenNumElts = NVT.getVectorNumElements();
+ assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
+ "Unexpected request for vector widening");
+
+ SDLoc dl(InOp);
+ if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
+ InOp.getNumOperands() == 2) {
+ SDValue N1 = InOp.getOperand(1);
+ if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
+ N1.isUndef()) {
+ InOp = InOp.getOperand(0);
+ InVT = InOp.getSimpleValueType();
+ InNumElts = InVT.getVectorNumElements();
+ }
+ }
+ if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
+ ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
+ SmallVector<SDValue, 16> Ops;
+ for (unsigned i = 0; i < InNumElts; ++i)
+ Ops.push_back(InOp.getOperand(i));
+
+ EVT EltVT = InOp.getOperand(0).getValueType();
+
+ SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
+ DAG.getUNDEF(EltVT);
+ for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
+ Ops.push_back(FillVal);
+ return DAG.getBuildVector(NVT, dl, Ops);
+ }
+ SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
+ DAG.getUNDEF(NVT);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
+ InOp, DAG.getIntPtrConstant(0, dl));
+}
+
+static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(Subtarget.hasAVX512() &&
+ "MGATHER/MSCATTER are supported on AVX-512 arch only");
+
+ MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
+ SDValue Src = N->getValue();
+ MVT VT = Src.getSimpleValueType();
+ assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
+ SDLoc dl(Op);
+
+ SDValue Scale = N->getScale();
+ SDValue Index = N->getIndex();
+ SDValue Mask = N->getMask();
+ SDValue Chain = N->getChain();
+ SDValue BasePtr = N->getBasePtr();
+
+ if (VT == MVT::v2f32 || VT == MVT::v2i32) {
+ assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
+ // If the index is v2i64 and we have VLX we can use xmm for data and index.
+ if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
+ SDVTList VTs = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
+ return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
+ N->getMemoryVT(), N->getMemOperand());
+ }
+ return SDValue();
+ }
+
+ MVT IndexVT = Index.getSimpleValueType();
+
+ // If the index is v2i32, we're being called by type legalization and we
+ // should just let the default handling take care of it.
+ if (IndexVT == MVT::v2i32)
+ return SDValue();
+
+ // If we don't have VLX and neither the source data nor the index is 512
+ // bits, we need to widen until one is.
+ if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
+ !Index.getSimpleValueType().is512BitVector()) {
+ // Determine how much we need to widen by to get a 512-bit type.
+ unsigned Factor = std::min(512/VT.getSizeInBits(),
+ 512/IndexVT.getSizeInBits());
+ unsigned NumElts = VT.getVectorNumElements() * Factor;
+
+ VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
+ IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+
+ Src = ExtendToType(Src, VT, DAG);
+ Index = ExtendToType(Index, IndexVT, DAG);
+ Mask = ExtendToType(Mask, MaskVT, DAG, true);
+ }
+
+ SDVTList VTs = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
+ return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
+ N->getMemoryVT(), N->getMemOperand());
+}
+
+static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+
+ MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
+ MVT VT = Op.getSimpleValueType();
+ MVT ScalarVT = VT.getScalarType();
+ SDValue Mask = N->getMask();
+ MVT MaskVT = Mask.getSimpleValueType();
+ SDValue PassThru = N->getPassThru();
+ SDLoc dl(Op);
+
+ // Handle AVX masked loads which don't support passthru other than 0.
+ if (MaskVT.getVectorElementType() != MVT::i1) {
+ // We also allow undef in the isel pattern.
+ if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
+ return Op;
+
+ SDValue NewLoad = DAG.getMaskedLoad(
+ VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
+ getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
+ N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
+ N->isExpandingLoad());
+ // Emit a blend.
+ SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
+ return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
+ }
+
+ assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
+ "Expanding masked load is supported on AVX-512 target only!");
+
+ assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
+ "Expanding masked load is supported for 32 and 64-bit types only!");
+
+ assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
+ "Cannot lower masked load op.");
+
+ assert((ScalarVT.getSizeInBits() >= 32 ||
+ (Subtarget.hasBWI() &&
+ (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
+ "Unsupported masked load op.");
+
+ // This operation is legal for targets with VLX, but without
+ // VLX the vector should be widened to 512 bits.
+ unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
+ MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
+ PassThru = ExtendToType(PassThru, WideDataVT, DAG);
+
+ // Mask element has to be i1.
+ assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
+ "Unexpected mask type");
+
+ MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
+
+ Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+ SDValue NewLoad = DAG.getMaskedLoad(
+ WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
+ PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
+ N->getExtensionType(), N->isExpandingLoad());
+
+ SDValue Extract =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
+ DAG.getIntPtrConstant(0, dl));
+ SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
+ return DAG.getMergeValues(RetOps, dl);
+}
+
+static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
+ SDValue DataToStore = N->getValue();
+ MVT VT = DataToStore.getSimpleValueType();
+ MVT ScalarVT = VT.getScalarType();
+ SDValue Mask = N->getMask();
+ SDLoc dl(Op);
+
+ assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
+ "Expanding masked load is supported on AVX-512 target only!");
+
+ assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
+ "Expanding masked load is supported for 32 and 64-bit types only!");
+
+ assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
+ "Cannot lower masked store op.");
+
+ assert((ScalarVT.getSizeInBits() >= 32 ||
+ (Subtarget.hasBWI() &&
+ (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
+ "Unsupported masked store op.");
+
+ // This operation is legal for targets with VLX, but without
+ // VLX the vector should be widened to 512 bits.
+ unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
+ MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
+
+ // Mask element has to be i1.
+ assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
+ "Unexpected mask type");
+
+ MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
+
+ DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
+ Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+ return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
+ N->getOffset(), Mask, N->getMemoryVT(),
+ N->getMemOperand(), N->getAddressingMode(),
+ N->isTruncatingStore(), N->isCompressingStore());
+}
+
+static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(Subtarget.hasAVX2() &&
+ "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
+
+ MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue Index = N->getIndex();
+ SDValue Mask = N->getMask();
+ SDValue PassThru = N->getPassThru();
+ MVT IndexVT = Index.getSimpleValueType();
+
+ assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
+
+ // If the index is v2i32, we're being called by type legalization.
+ if (IndexVT == MVT::v2i32)
+ return SDValue();
+
+ // If we don't have VLX and neither the passthru nor the index is 512 bits,
+ // we need to widen until one is.
+ MVT OrigVT = VT;
+ if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
+ !IndexVT.is512BitVector()) {
+ // Determine how much we need to widen by to get a 512-bit type.
+ unsigned Factor = std::min(512/VT.getSizeInBits(),
+ 512/IndexVT.getSizeInBits());
+
+ unsigned NumElts = VT.getVectorNumElements() * Factor;
+
+ VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
+ IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+
+ PassThru = ExtendToType(PassThru, VT, DAG);
+ Index = ExtendToType(Index, IndexVT, DAG);
+ Mask = ExtendToType(Mask, MaskVT, DAG, true);
+ }
+
+ SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
+ N->getScale() };
+ SDValue NewGather = DAG.getMemIntrinsicNode(
+ X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
+ N->getMemOperand());
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
+ NewGather, DAG.getIntPtrConstant(0, dl));
+ return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
+}
+
+static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ SDValue Src = Op.getOperand(0);
+ MVT DstVT = Op.getSimpleValueType();
+
+ AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
+ unsigned SrcAS = N->getSrcAddressSpace();
+
+ assert(SrcAS != N->getDestAddressSpace() &&
+ "addrspacecast must be between different address spaces");
+
+ if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
+ Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
+ } else if (DstVT == MVT::i64) {
+ Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
+ } else if (DstVT == MVT::i32) {
+ Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
+ } else {
+ report_fatal_error("Bad address space in addrspacecast");
+ }
+ return Op;
+}
+
+SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
+ SelectionDAG &DAG) const {
+ // TODO: Eventually, the lowering of these nodes should be informed by or
+ // deferred to the GC strategy for the function in which they appear. For
+ // now, however, they must be lowered to something. Since they are logically
+ // no-ops in the case of a null GC strategy (or a GC strategy which does not
+ // require special handling for these nodes), lower them as literal NOOPs for
+ // the time being.
+ SmallVector<SDValue, 2> Ops;
+
+ Ops.push_back(Op.getOperand(0));
+ if (Op->getGluedNode())
+ Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
+
+ SDLoc OpDL(Op);
+ SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
+
+ return NOOP;
+}
+
+// Custom split CVTPS2PH with wide types.
+static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ SDValue RC = Op.getOperand(1);
+ Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
+ Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+}
+
+/// Provide custom lowering hooks for some operations.
+SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+ switch (Op.getOpcode()) {
+ default: llvm_unreachable("Should not custom lower this!");
+ case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
+ return LowerCMP_SWAP(Op, Subtarget, DAG);
+ case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
+ case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_SUB:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
+ case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
+ case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
+ case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
+ case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
+ case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
+ case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
+ case ISD::VSELECT: return LowerVSELECT(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
+ case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
+ case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
+ case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
+ case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
+ case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
+ case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
+ case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
+ case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
+ case ISD::SHL_PARTS:
+ case ISD::SRA_PARTS:
+ case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
+ case ISD::FSHL:
+ case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
+ case ISD::STRICT_SINT_TO_FP:
+ case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
+ case ISD::STRICT_UINT_TO_FP:
+ case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
+ case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
+ case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
+ case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
+ case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
+ case ISD::FP_TO_SINT:
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
+ case ISD::FP_EXTEND:
+ case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
+ case ISD::FP_ROUND:
+ case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
+ case ISD::FP16_TO_FP:
+ case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
+ case ISD::FP_TO_FP16:
+ case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
+ case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
+ case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
+ case ISD::FADD:
+ case ISD::FSUB: return lowerFaddFsub(Op, DAG);
+ case ISD::FROUND: return LowerFROUND(Op, DAG);
+ case ISD::FABS:
+ case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
+ case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
+ case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
+ case ISD::LRINT:
+ case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
+ case ISD::SETCC:
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
+ case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
+ case ISD::SELECT: return LowerSELECT(Op, DAG);
+ case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+ case ISD::JumpTable: return LowerJumpTable(Op, DAG);
+ case ISD::VASTART: return LowerVASTART(Op, DAG);
+ case ISD::VAARG: return LowerVAARG(Op, DAG);
+ case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
+ case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::INTRINSIC_VOID:
+ case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
+ case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
+ case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
+ case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
+ case ISD::FRAME_TO_ARGS_OFFSET:
+ return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
+ case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
+ case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
+ case ISD::EH_SJLJ_SETUP_DISPATCH:
+ return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
+ case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
+ case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
+ case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
+ case ISD::CTLZ:
+ case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
+ case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
+ case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
+ case ISD::MULHS:
+ case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
+ case ISD::ROTL:
+ case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
+ case ISD::SADDO:
+ case ISD::UADDO:
+ case ISD::SSUBO:
+ case ISD::USUBO:
+ case ISD::SMULO:
+ case ISD::UMULO: return LowerXALUO(Op, DAG);
+ case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
+ case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
+ case ISD::SADDO_CARRY:
+ case ISD::SSUBO_CARRY:
+ case ISD::ADDCARRY:
+ case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
+ case ISD::ADD:
+ case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
+ case ISD::UADDSAT:
+ case ISD::SADDSAT:
+ case ISD::USUBSAT:
+ case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
+ case ISD::SMAX:
+ case ISD::SMIN:
+ case ISD::UMAX:
+ case ISD::UMIN: return LowerMINMAX(Op, DAG);
+ case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
+ case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
+ case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
+ case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
+ case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
+ case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
+ case ISD::GC_TRANSITION_START:
+ case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
+ case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
+ case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
+ }
+}
+
+/// Replace a node with an illegal result type with a new node built out of
+/// custom code.
+void X86TargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) const {
+ SDLoc dl(N);
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "ReplaceNodeResults: ";
+ N->dump(&DAG);
+#endif
+ llvm_unreachable("Do not know how to custom type legalize this operation!");
+ case X86ISD::CVTPH2PS: {
+ EVT VT = N->getValueType(0);
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
+ Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+ Results.push_back(Res);
+ return;
+ }
+ case X86ISD::STRICT_CVTPH2PS: {
+ EVT VT = N->getValueType(0);
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
+ {N->getOperand(0), Lo});
+ Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
+ {N->getOperand(0), Hi});
+ SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Lo.getValue(1), Hi.getValue(1));
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+ Results.push_back(Res);
+ Results.push_back(Chain);
+ return;
+ }
+ case X86ISD::CVTPS2PH:
+ Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
+ return;
+ case ISD::CTPOP: {
+ assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
+ // Use a v2i64 if possible.
+ bool NoImplicitFloatOps =
+ DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat);
+ if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
+ SDValue Wide =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
+ Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
+ // Bit count should fit in 32-bits, extract it as that and then zero
+ // extend to i64. Otherwise we end up extracting bits 63:32 separately.
+ Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
+ Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
+ DAG.getIntPtrConstant(0, dl));
+ Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
+ Results.push_back(Wide);
+ }
+ return;
+ }
+ case ISD::MUL: {
+ EVT VT = N->getValueType(0);
+ assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+ VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
+ // Pre-promote these to vXi16 to avoid op legalization thinking all 16
+ // elements are needed.
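+ // For example, a v2i8 multiply is performed as a v2i16 multiply, truncated
+ // back to v2i8 and then padded out with undef elements to the legal v16i8
+ // type.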
+ MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
+ SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
+ SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
+ SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
+ Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ unsigned NumConcats = 16 / VT.getVectorNumElements();
+ SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
+ ConcatOps[0] = Res;
+ Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
+ Results.push_back(Res);
+ return;
+ }
+ case X86ISD::VPMADDWD:
+ case X86ISD::AVG: {
+ // Legalize types for X86ISD::AVG/VPMADDWD by widening.
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
+
+ EVT VT = N->getValueType(0);
+ EVT InVT = N->getOperand(0).getValueType();
+ assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
+ "Expected a VT that divides into 128 bits.");
+ assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+ "Unexpected type action!");
+ unsigned NumConcat = 128 / InVT.getSizeInBits();
+
+ EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
+ InVT.getVectorElementType(),
+ NumConcat * InVT.getVectorNumElements());
+ EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
+ VT.getVectorElementType(),
+ NumConcat * VT.getVectorNumElements());
+
+ SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
+ Ops[0] = N->getOperand(0);
+ SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
+ Ops[0] = N->getOperand(1);
+ SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
+
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
+ Results.push_back(Res);
+ return;
+ }
+ // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
+ case X86ISD::FMINC:
+ case X86ISD::FMIN:
+ case X86ISD::FMAXC:
+ case X86ISD::FMAX: {
+ EVT VT = N->getValueType(0);
+ assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
+ SDValue UNDEF = DAG.getUNDEF(VT);
+ SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+ N->getOperand(0), UNDEF);
+ SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+ N->getOperand(1), UNDEF);
+ Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
+ return;
+ }
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SREM:
+ case ISD::UREM: {
+ EVT VT = N->getValueType(0);
+ if (VT.isVector()) {
+ assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+ "Unexpected type action!");
+ // If this RHS is a constant splat vector we can widen this and let
+ // division/remainder by constant optimize it.
+ // TODO: Can we do something for non-splat?
+ APInt SplatVal;
+ if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
+ unsigned NumConcats = 128 / VT.getSizeInBits();
+ SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
+ Ops0[0] = N->getOperand(0);
+ EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
+ SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
+ Results.push_back(Res);
+ }
+ return;
+ }
+
+ SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
+ Results.push_back(V);
+ return;
+ }
+ case ISD::TRUNCATE: {
+ MVT VT = N->getSimpleValueType(0);
+ if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
+ return;
+
+ // The generic legalizer will try to widen the input type to the same
+ // number of elements as the widened result type. But this isn't always
+ // the best thing, so do some custom legalization to avoid some cases.
+ MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
+ SDValue In = N->getOperand(0);
+ EVT InVT = In.getValueType();
+
+ unsigned InBits = InVT.getSizeInBits();
+ if (128 % InBits == 0) {
+ // 128 bit and smaller inputs should avoid truncate altogether and
+ // just use a build_vector that will become a shuffle.
+ // TODO: Widen and use a shuffle directly?
+ MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
+ EVT EltVT = VT.getVectorElementType();
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+ SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
+ // Use the original element count so we don't do more scalar opts than
+ // necessary.
+ unsigned MinElts = VT.getVectorNumElements();
+ for (unsigned i=0; i < MinElts; ++i) {
+ SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
+ DAG.getIntPtrConstant(i, dl));
+ Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
+ }
+ Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
+ return;
+ }
+ // With AVX512 there are some cases that can use a target specific
+ // truncate node to go from 256/512 to less than 128 with zeros in the
+ // upper elements of the 128 bit result.
+ if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
+ // We can use VTRUNC directly for 256-bit inputs with VLX or for any 512-bit input.
+ if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
+ Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
+ return;
+ }
+ // There's one case we can widen to 512 bits and use VTRUNC.
+ if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
+ In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
+ DAG.getUNDEF(MVT::v4i64));
+ Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
+ return;
+ }
+ }
+ if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
+ getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
+ isTypeLegal(MVT::v4i64)) {
+ // Input needs to be split and output needs to be widened. Let's use two
+ // VTRUNCs, and shuffle their results together into the wider type.
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
+
+ Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
+ Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
+ SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
+ { 0, 1, 2, 3, 16, 17, 18, 19,
+ -1, -1, -1, -1, -1, -1, -1, -1 });
+ Results.push_back(Res);
+ return;
+ }
+
+ return;
+ }
+ case ISD::ANY_EXTEND:
+ // Right now, only MVT::v8i8 has Custom action for an illegal type.
+ // It's intended to custom handle the input type.
+ assert(N->getValueType(0) == MVT::v8i8 &&
+ "Do not know how to legalize this Node");
+ return;
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND: {
+ EVT VT = N->getValueType(0);
+ SDValue In = N->getOperand(0);
+ EVT InVT = In.getValueType();
+ if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
+ (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
+ assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
+ "Unexpected type action!");
+ assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
+ // Custom split this so we can extend i8/i16->i32 invec. This is better
+ // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
+ // sra, followed by extending from i32 to i64 using pcmpgt. By custom
+ // splitting we allow the sra from the extend to i32 to be shared by the
+ // split.
+ In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
+
+ // Fill a vector with sign bits for each element.
+ SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
+ SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
+
+ // Create an unpackl and unpackh to interleave the sign bits then bitcast
+ // to v2i64.
+ SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
+ {0, 4, 1, 5});
+ Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
+ SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
+ {2, 6, 3, 7});
+ Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
+
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+ Results.push_back(Res);
+ return;
+ }
+
+ if (VT == MVT::v16i32 || VT == MVT::v8i64) {
+ if (!InVT.is128BitVector()) {
+ // Not a 128 bit vector, but maybe type legalization will promote
+ // it to 128 bits.
+ if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
+ return;
+ InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
+ if (!InVT.is128BitVector())
+ return;
+
+ // Promote the input to 128 bits. Type legalization will turn this into
+ // zext_inreg/sext_inreg.
+ In = DAG.getNode(N->getOpcode(), dl, InVT, In);
+ }
+
+ // Perform custom splitting instead of the two stage extend we would get
+ // by default.
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+ assert(isTypeLegal(LoVT) && "Split VT not legal?");
+
+ SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
+
+ // We need to shift the input over by half the number of elements.
+ unsigned NumElts = InVT.getVectorNumElements();
+ unsigned HalfNumElts = NumElts / 2;
+ SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
+ for (unsigned i = 0; i != HalfNumElts; ++i)
+ ShufMask[i] = i + HalfNumElts;
+
+ SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
+ Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
+
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+ Results.push_back(Res);
+ }
+ return;
+ }
+ case ISD::FP_TO_SINT:
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::STRICT_FP_TO_UINT: {
+ bool IsStrict = N->isStrictFPOpcode();
+ bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
+ N->getOpcode() == ISD::STRICT_FP_TO_SINT;
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
+ EVT SrcVT = Src.getValueType();
+
+ if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
+ assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+ "Unexpected type action!");
+
+ // Try to create a 128 bit vector, but don't exceed a 32 bit element.
+ unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
+ MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
+ VT.getVectorNumElements());
+ SDValue Res;
+ SDValue Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
+ {N->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else
+ Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
+
+ // Preserve what we know about the size of the original result. Except
+ // when the result is v2i32 since we can't widen the assert.
+ if (PromoteVT != MVT::v2i32)
+ Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext,
+ dl, PromoteVT, Res,
+ DAG.getValueType(VT.getVectorElementType()));
+
+ // Truncate back to the original width.
+ Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+
+ // Now widen to 128 bits.
+ unsigned NumConcats = 128 / VT.getSizeInBits();
+ MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
+ VT.getVectorNumElements() * NumConcats);
+ SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
+ ConcatOps[0] = Res;
+ Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
+ Results.push_back(Res);
+ if (IsStrict)
+ Results.push_back(Chain);
+ return;
+ }
+
+
+ if (VT == MVT::v2i32) {
+ assert((IsSigned || Subtarget.hasAVX512()) &&
+ "Can only handle signed conversion without AVX512");
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
+ assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+ "Unexpected type action!");
+ if (Src.getValueType() == MVT::v2f64) {
+ unsigned Opc;
+ if (IsStrict)
+ Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
+ else
+ Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
+
+ // If we have VLX we can emit a target specific FP_TO_UINT node.
+ if (!IsSigned && !Subtarget.hasVLX()) {
+ // Otherwise we can defer to the generic legalizer which will widen
+ // the input as well. This will be further widened during op
+ // legalization to v8i32<-v8f64.
+ // For strict nodes we'll need to widen ourselves.
+ // FIXME: Fix the type legalizer to safely widen strict nodes?
+ if (!IsStrict)
+ return;
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
+ DAG.getConstantFP(0.0, dl, MVT::v2f64));
+ Opc = N->getOpcode();
+ }
+ SDValue Res;
+ SDValue Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
+ {N->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
+ }
+ Results.push_back(Res);
+ if (IsStrict)
+ Results.push_back(Chain);
+ return;
+ }
+
+ // Custom widen strict v2f32->v2i32 by padding with zeros.
+ // FIXME: Should generic type legalizer do this?
+ if (Src.getValueType() == MVT::v2f32 && IsStrict) {
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
+ DAG.getConstantFP(0.0, dl, MVT::v2f32));
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
+ {N->getOperand(0), Src});
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ return;
+ }
+
+ // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
+ // so early out here.
+ return;
+ }
+
+ assert(!VT.isVector() && "Vectors should have been handled above!");
+
+ if (Subtarget.hasDQI() && VT == MVT::i64 &&
+ (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
+ assert(!Subtarget.is64Bit() && "i64 should be legal");
+ unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
+ // If we use a 128-bit result we might need to use a target specific node.
+ unsigned SrcElts =
+ std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
+ MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
+ MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
+ unsigned Opc = N->getOpcode();
+ if (NumElts != SrcElts) {
+ if (IsStrict)
+ Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
+ else
+ Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
+ }
+
+ SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
+ SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
+ DAG.getConstantFP(0.0, dl, VecInVT), Src,
+ ZeroIdx);
+ SDValue Chain;
+ if (IsStrict) {
+ SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
+ Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
+ Chain = Res.getValue(1);
+ } else
+ Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
+ Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
+ Results.push_back(Res);
+ if (IsStrict)
+ Results.push_back(Chain);
+ return;
+ }
+
+ SDValue Chain;
+ if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
+ Results.push_back(V);
+ if (IsStrict)
+ Results.push_back(Chain);
+ }
+ return;
+ }
+ case ISD::LRINT:
+ case ISD::LLRINT: {
+ if (SDValue V = LRINT_LLRINTHelper(N, DAG))
+ Results.push_back(V);
+ return;
+ }
+
+ case ISD::SINT_TO_FP:
+ case ISD::STRICT_SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP: {
+ bool IsStrict = N->isStrictFPOpcode();
+ bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
+ N->getOpcode() == ISD::STRICT_SINT_TO_FP;
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::v2f32)
+ return;
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
+ EVT SrcVT = Src.getValueType();
+ if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
+ if (IsStrict) {
+ unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
+ : X86ISD::STRICT_CVTUI2P;
+ SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
+ {N->getOperand(0), Src});
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ } else {
+ unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
+ Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
+ }
+ return;
+ }
+ if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
+ Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
+ SDValue Zero = DAG.getConstant(0, dl, SrcVT);
+ SDValue One = DAG.getConstant(1, dl, SrcVT);
+ SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
+ DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
+ DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
+ SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
+ SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
+ SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
+ for (int i = 0; i != 2; ++i) {
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
+ SignSrc, DAG.getIntPtrConstant(i, dl));
+ if (IsStrict)
+ SignCvts[i] =
+ DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
+ {N->getOperand(0), Elt});
+ else
+ SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
+ }
+ SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
+ SDValue Slow, Chain;
+ if (IsStrict) {
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ SignCvts[0].getValue(1), SignCvts[1].getValue(1));
+ Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
+ {Chain, SignCvt, SignCvt});
+ Chain = Slow.getValue(1);
+ } else {
+ Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
+ }
+ IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
+ IsNeg =
+ DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
+ SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
+ Results.push_back(Cvt);
+ if (IsStrict)
+ Results.push_back(Chain);
+ return;
+ }
+
+ if (SrcVT != MVT::v2i32)
+ return;
+
+ if (IsSigned || Subtarget.hasAVX512()) {
+ if (!IsStrict)
+ return;
+
+ // Custom widen strict v2i32->v2f32 to avoid scalarization.
+ // FIXME: Should generic type legalizer do this?
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
+ DAG.getConstant(0, dl, MVT::v2i32));
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
+ {N->getOperand(0), Src});
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ return;
+ }
+
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
+ SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
+ SDValue VBias =
+ DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
+ SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
+ DAG.getBitcast(MVT::v2i64, VBias));
+ Or = DAG.getBitcast(MVT::v2f64, Or);
+ if (IsStrict) {
+ SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
+ {N->getOperand(0), Or, VBias});
+ SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
+ {MVT::v4f32, MVT::Other},
+ {Sub.getValue(1), Sub});
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ } else {
+ // TODO: Are there any fast-math-flags to propagate here?
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
+ Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
+ }
+ return;
+ }
+ case ISD::STRICT_FP_ROUND:
+ case ISD::FP_ROUND: {
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
+ if (!isTypeLegal(Src.getValueType()))
+ return;
+ SDValue V;
+ if (IsStrict)
+ V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other},
+ {N->getOperand(0), N->getOperand(1)});
+ else
+ V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
+ Results.push_back(V);
+ if (IsStrict)
+ Results.push_back(V.getValue(1));
+ return;
+ }
+ case ISD::FP_EXTEND:
+ case ISD::STRICT_FP_EXTEND: {
+ // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
+ // No other ValueType for FP_EXTEND should reach this point.
+ assert(N->getValueType(0) == MVT::v2f32 &&
+ "Do not know how to legalize this Node");
+ return;
+ }
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = N->getConstantOperandVal(1);
+ switch (IntNo) {
+ default : llvm_unreachable("Do not know how to custom type "
+ "legalize this intrinsic operation!");
+ case Intrinsic::x86_rdtsc:
+ return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
+ Results);
+ case Intrinsic::x86_rdtscp:
+ return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
+ Results);
+ case Intrinsic::x86_rdpmc:
+ expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
+ Results);
+ return;
+ case Intrinsic::x86_xgetbv:
+ expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
+ Results);
+ return;
+ }
+ }
+ case ISD::READCYCLECOUNTER: {
+ return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
+ }
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
+ EVT T = N->getValueType(0);
+ assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
+ bool Regs64bit = T == MVT::i128;
+ assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&
+ "128-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
+ MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
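+ // CMPXCHG8B/16B compares the memory operand against EDX:EAX (RDX:RAX)
+ // and, on a match, stores ECX:EBX (RCX:RBX), so split the expected and
+ // replacement values into halves and pin them to those register pairs.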
+ SDValue cpInL, cpInH;
+ cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
+ DAG.getConstant(0, dl, HalfT));
+ cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
+ DAG.getConstant(1, dl, HalfT));
+ cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
+ Regs64bit ? X86::RAX : X86::EAX,
+ cpInL, SDValue());
+ cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
+ Regs64bit ? X86::RDX : X86::EDX,
+ cpInH, cpInL.getValue(1));
+ SDValue swapInL, swapInH;
+ swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
+ DAG.getConstant(0, dl, HalfT));
+ swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
+ DAG.getConstant(1, dl, HalfT));
+ swapInH =
+ DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
+ swapInH, cpInH.getValue(1));
+
+ // In 64-bit mode we might need the base pointer in RBX, but we can't know
+ // until later. So we keep the RBX input in a vreg and use a custom
+ // inserter.
+ // Since RBX will be a reserved register, the register allocator will not
+ // make sure its value is properly saved and restored around this
+ // live range.
+ SDValue Result;
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
+ if (Regs64bit) {
+ SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
+ swapInH.getValue(1)};
+ Result =
+ DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
+ } else {
+ swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
+ swapInH.getValue(1));
+ SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
+ swapInL.getValue(1)};
+ Result =
+ DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
+ }
+
+ SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
+ Regs64bit ? X86::RAX : X86::EAX,
+ HalfT, Result.getValue(1));
+ SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
+ Regs64bit ? X86::RDX : X86::EDX,
+ HalfT, cpOutL.getValue(2));
+ SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
+
+ SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
+ MVT::i32, cpOutH.getValue(2));
+ SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
+ Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
+
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
+ Results.push_back(Success);
+ Results.push_back(EFLAGS.getValue(1));
+ return;
+ }
+ case ISD::ATOMIC_LOAD: {
+ assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
+ bool NoImplicitFloatOps =
+ DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat);
+ if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
+ auto *Node = cast<AtomicSDNode>(N);
+ if (Subtarget.hasSSE1()) {
+ // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
+ // Then extract the lower 64-bits.
+ MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
+ SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
+ SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
+ SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
+ MVT::i64, Node->getMemOperand());
+ if (Subtarget.hasSSE2()) {
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+ DAG.getIntPtrConstant(0, dl));
+ Results.push_back(Res);
+ Results.push_back(Ld.getValue(1));
+ return;
+ }
+ // We use an alternative sequence for SSE1 that extracts as v2f32 and
+ // then casts to i64. This avoids a 128-bit stack temporary being
+ // created by type legalization if we were to cast v4f32->v2i64.
+ SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
+ DAG.getIntPtrConstant(0, dl));
+ Res = DAG.getBitcast(MVT::i64, Res);
+ Results.push_back(Res);
+ Results.push_back(Ld.getValue(1));
+ return;
+ }
+ if (Subtarget.hasX87()) {
+ // First load this into an 80-bit X87 register. This will put the whole
+ // integer into the significand.
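+ // An f80 significand has 64 bits, so the i64 value is represented
+ // exactly, and FILD performs the whole load as a single 8-byte access,
+ // preserving atomicity.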
+ SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
+ SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
+ SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
+ dl, Tys, Ops, MVT::i64,
+ Node->getMemOperand());
+ SDValue Chain = Result.getValue(1);
+
+ // Now store the X87 register to a stack temporary and convert to i64.
+ // This store is not atomic and doesn't need to be.
+ // FIXME: We don't need a stack temporary if the result of the load
+ // is already being stored. We could just directly store there.
+ SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+ SDValue StoreOps[] = { Chain, Result, StackPtr };
+ Chain = DAG.getMemIntrinsicNode(
+ X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
+ MPI, None /*Align*/, MachineMemOperand::MOStore);
+
+ // Finally load the value back from the stack temporary and return it.
+ // This load is not atomic and doesn't need to be.
+ // This load will be further type legalized.
+ Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
+ Results.push_back(Result);
+ Results.push_back(Result.getValue(1));
+ return;
+ }
+ }
+ // TODO: Use MOVLPS when SSE1 is available?
+ // Delegate to generic TypeLegalization. Situations we can really handle
+ // should have already been dealt with by AtomicExpandPass.cpp.
+ break;
+ }
+ case ISD::ATOMIC_SWAP:
+ case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_SUB:
+ case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_NAND:
+ case ISD::ATOMIC_LOAD_MIN:
+ case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_UMIN:
+ case ISD::ATOMIC_LOAD_UMAX:
+ // Delegate to generic TypeLegalization. Situations we can really handle
+ // should have already been dealt with by AtomicExpandPass.cpp.
+ break;
+
+ case ISD::BITCAST: {
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
+ EVT DstVT = N->getValueType(0);
+ EVT SrcVT = N->getOperand(0).getValueType();
+
+ // If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target,
+ // we can split using the k-register rather than memory.
+ if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
+ assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+ Lo = DAG.getBitcast(MVT::i32, Lo);
+ Hi = DAG.getBitcast(MVT::i32, Hi);
+ SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+ Results.push_back(Res);
+ return;
+ }
+
+ if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
+ // FIXME: Use v4f32 for SSE1?
+ assert(Subtarget.hasSSE2() && "Requires SSE2");
+ assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
+ "Unexpected type action!");
+ EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
+ SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
+ N->getOperand(0));
+ Res = DAG.getBitcast(WideVT, Res);
+ Results.push_back(Res);
+ return;
+ }
+
+ return;
+ }
+ case ISD::MGATHER: {
+ EVT VT = N->getValueType(0);
+ if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
+ (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
+ auto *Gather = cast<MaskedGatherSDNode>(N);
+ SDValue Index = Gather->getIndex();
+ if (Index.getValueType() != MVT::v2i64)
+ return;
+ assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+ "Unexpected type action!");
+ EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue Mask = Gather->getMask();
+ assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
+ SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
+ Gather->getPassThru(),
+ DAG.getUNDEF(VT));
+ if (!Subtarget.hasVLX()) {
+ // We need to widen the mask, but the instruction will only use 2
+ // of its elements. So we can use undef.
+ Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
+ DAG.getUNDEF(MVT::v2i1));
+ Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
+ }
+ SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
+ Gather->getBasePtr(), Index, Gather->getScale() };
+ SDValue Res = DAG.getMemIntrinsicNode(
+ X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
+ Gather->getMemoryVT(), Gather->getMemOperand());
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ return;
+ }
+ return;
+ }
+ case ISD::LOAD: {
+ // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
+ // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
+ // cast since type legalization will try to use an i64 load.
+ MVT VT = N->getSimpleValueType(0);
+ assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
+ assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+ "Unexpected type action!");
+ if (!ISD::isNON_EXTLoad(N))
+ return;
+ auto *Ld = cast<LoadSDNode>(N);
+ if (Subtarget.hasSSE2()) {
+ MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
+ SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), Ld->getOriginalAlign(),
+ Ld->getMemOperand()->getFlags());
+ SDValue Chain = Res.getValue(1);
+ MVT VecVT = MVT::getVectorVT(LdVT, 2);
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
+ EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
+ Res = DAG.getBitcast(WideVT, Res);
+ Results.push_back(Res);
+ Results.push_back(Chain);
+ return;
+ }
+ assert(Subtarget.hasSSE1() && "Expected SSE");
+ SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
+ SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
+ SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
+ MVT::i64, Ld->getMemOperand());
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ return;
+ }
+ case ISD::ADDRSPACECAST: {
+ SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
+ Results.push_back(V);
+ return;
+ }
+ case ISD::BITREVERSE:
+ assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
+ assert(Subtarget.hasXOP() && "Expected XOP");
+ // We can use VPPERM by copying to a vector register and back. We'll need
+ // to move the scalar in two i32 pieces.
+ Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
+ return;
+ }
+}
+
+const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch ((X86ISD::NodeType)Opcode) {
+ case X86ISD::FIRST_NUMBER: break;
+#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
+ NODE_NAME_CASE(BSF)
+ NODE_NAME_CASE(BSR)
+ NODE_NAME_CASE(FSHL)
+ NODE_NAME_CASE(FSHR)
+ NODE_NAME_CASE(FAND)
+ NODE_NAME_CASE(FANDN)
+ NODE_NAME_CASE(FOR)
+ NODE_NAME_CASE(FXOR)
+ NODE_NAME_CASE(FILD)
+ NODE_NAME_CASE(FIST)
+ NODE_NAME_CASE(FP_TO_INT_IN_MEM)
+ NODE_NAME_CASE(FLD)
+ NODE_NAME_CASE(FST)
+ NODE_NAME_CASE(CALL)
+ NODE_NAME_CASE(BT)
+ NODE_NAME_CASE(CMP)
+ NODE_NAME_CASE(FCMP)
+ NODE_NAME_CASE(STRICT_FCMP)
+ NODE_NAME_CASE(STRICT_FCMPS)
+ NODE_NAME_CASE(COMI)
+ NODE_NAME_CASE(UCOMI)
+ NODE_NAME_CASE(CMPM)
+ NODE_NAME_CASE(CMPMM)
+ NODE_NAME_CASE(STRICT_CMPM)
+ NODE_NAME_CASE(CMPMM_SAE)
+ NODE_NAME_CASE(SETCC)
+ NODE_NAME_CASE(SETCC_CARRY)
+ NODE_NAME_CASE(FSETCC)
+ NODE_NAME_CASE(FSETCCM)
+ NODE_NAME_CASE(FSETCCM_SAE)
+ NODE_NAME_CASE(CMOV)
+ NODE_NAME_CASE(BRCOND)
+ NODE_NAME_CASE(RET_FLAG)
+ NODE_NAME_CASE(IRET)
+ NODE_NAME_CASE(REP_STOS)
+ NODE_NAME_CASE(REP_MOVS)
+ NODE_NAME_CASE(GlobalBaseReg)
+ NODE_NAME_CASE(Wrapper)
+ NODE_NAME_CASE(WrapperRIP)
+ NODE_NAME_CASE(MOVQ2DQ)
+ NODE_NAME_CASE(MOVDQ2Q)
+ NODE_NAME_CASE(MMX_MOVD2W)
+ NODE_NAME_CASE(MMX_MOVW2D)
+ NODE_NAME_CASE(PEXTRB)
+ NODE_NAME_CASE(PEXTRW)
+ NODE_NAME_CASE(INSERTPS)
+ NODE_NAME_CASE(PINSRB)
+ NODE_NAME_CASE(PINSRW)
+ NODE_NAME_CASE(PSHUFB)
+ NODE_NAME_CASE(ANDNP)
+ NODE_NAME_CASE(BLENDI)
+ NODE_NAME_CASE(BLENDV)
+ NODE_NAME_CASE(HADD)
+ NODE_NAME_CASE(HSUB)
+ NODE_NAME_CASE(FHADD)
+ NODE_NAME_CASE(FHSUB)
+ NODE_NAME_CASE(CONFLICT)
+ NODE_NAME_CASE(FMAX)
+ NODE_NAME_CASE(FMAXS)
+ NODE_NAME_CASE(FMAX_SAE)
+ NODE_NAME_CASE(FMAXS_SAE)
+ NODE_NAME_CASE(FMIN)
+ NODE_NAME_CASE(FMINS)
+ NODE_NAME_CASE(FMIN_SAE)
+ NODE_NAME_CASE(FMINS_SAE)
+ NODE_NAME_CASE(FMAXC)
+ NODE_NAME_CASE(FMINC)
+ NODE_NAME_CASE(FRSQRT)
+ NODE_NAME_CASE(FRCP)
+ NODE_NAME_CASE(EXTRQI)
+ NODE_NAME_CASE(INSERTQI)
+ NODE_NAME_CASE(TLSADDR)
+ NODE_NAME_CASE(TLSBASEADDR)
+ NODE_NAME_CASE(TLSCALL)
+ NODE_NAME_CASE(EH_SJLJ_SETJMP)
+ NODE_NAME_CASE(EH_SJLJ_LONGJMP)
+ NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
+ NODE_NAME_CASE(EH_RETURN)
+ NODE_NAME_CASE(TC_RETURN)
+ NODE_NAME_CASE(FNSTCW16m)
+ NODE_NAME_CASE(LCMPXCHG_DAG)
+ NODE_NAME_CASE(LCMPXCHG8_DAG)
+ NODE_NAME_CASE(LCMPXCHG16_DAG)
+ NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
+ NODE_NAME_CASE(LADD)
+ NODE_NAME_CASE(LSUB)
+ NODE_NAME_CASE(LOR)
+ NODE_NAME_CASE(LXOR)
+ NODE_NAME_CASE(LAND)
+ NODE_NAME_CASE(VZEXT_MOVL)
+ NODE_NAME_CASE(VZEXT_LOAD)
+ NODE_NAME_CASE(VEXTRACT_STORE)
+ NODE_NAME_CASE(VTRUNC)
+ NODE_NAME_CASE(VTRUNCS)
+ NODE_NAME_CASE(VTRUNCUS)
+ NODE_NAME_CASE(VMTRUNC)
+ NODE_NAME_CASE(VMTRUNCS)
+ NODE_NAME_CASE(VMTRUNCUS)
+ NODE_NAME_CASE(VTRUNCSTORES)
+ NODE_NAME_CASE(VTRUNCSTOREUS)
+ NODE_NAME_CASE(VMTRUNCSTORES)
+ NODE_NAME_CASE(VMTRUNCSTOREUS)
+ NODE_NAME_CASE(VFPEXT)
+ NODE_NAME_CASE(STRICT_VFPEXT)
+ NODE_NAME_CASE(VFPEXT_SAE)
+ NODE_NAME_CASE(VFPEXTS)
+ NODE_NAME_CASE(VFPEXTS_SAE)
+ NODE_NAME_CASE(VFPROUND)
+ NODE_NAME_CASE(STRICT_VFPROUND)
+ NODE_NAME_CASE(VMFPROUND)
+ NODE_NAME_CASE(VFPROUND_RND)
+ NODE_NAME_CASE(VFPROUNDS)
+ NODE_NAME_CASE(VFPROUNDS_RND)
+ NODE_NAME_CASE(VSHLDQ)
+ NODE_NAME_CASE(VSRLDQ)
+ NODE_NAME_CASE(VSHL)
+ NODE_NAME_CASE(VSRL)
+ NODE_NAME_CASE(VSRA)
+ NODE_NAME_CASE(VSHLI)
+ NODE_NAME_CASE(VSRLI)
+ NODE_NAME_CASE(VSRAI)
+ NODE_NAME_CASE(VSHLV)
+ NODE_NAME_CASE(VSRLV)
+ NODE_NAME_CASE(VSRAV)
+ NODE_NAME_CASE(VROTLI)
+ NODE_NAME_CASE(VROTRI)
+ NODE_NAME_CASE(VPPERM)
+ NODE_NAME_CASE(CMPP)
+ NODE_NAME_CASE(STRICT_CMPP)
+ NODE_NAME_CASE(PCMPEQ)
+ NODE_NAME_CASE(PCMPGT)
+ NODE_NAME_CASE(PHMINPOS)
+ NODE_NAME_CASE(ADD)
+ NODE_NAME_CASE(SUB)
+ NODE_NAME_CASE(ADC)
+ NODE_NAME_CASE(SBB)
+ NODE_NAME_CASE(SMUL)
+ NODE_NAME_CASE(UMUL)
+ NODE_NAME_CASE(OR)
+ NODE_NAME_CASE(XOR)
+ NODE_NAME_CASE(AND)
+ NODE_NAME_CASE(BEXTR)
+ NODE_NAME_CASE(BEXTRI)
+ NODE_NAME_CASE(BZHI)
+ NODE_NAME_CASE(PDEP)
+ NODE_NAME_CASE(PEXT)
+ NODE_NAME_CASE(MUL_IMM)
+ NODE_NAME_CASE(MOVMSK)
+ NODE_NAME_CASE(PTEST)
+ NODE_NAME_CASE(TESTP)
+ NODE_NAME_CASE(KORTEST)
+ NODE_NAME_CASE(KTEST)
+ NODE_NAME_CASE(KADD)
+ NODE_NAME_CASE(KSHIFTL)
+ NODE_NAME_CASE(KSHIFTR)
+ NODE_NAME_CASE(PACKSS)
+ NODE_NAME_CASE(PACKUS)
+ NODE_NAME_CASE(PALIGNR)
+ NODE_NAME_CASE(VALIGN)
+ NODE_NAME_CASE(VSHLD)
+ NODE_NAME_CASE(VSHRD)
+ NODE_NAME_CASE(VSHLDV)
+ NODE_NAME_CASE(VSHRDV)
+ NODE_NAME_CASE(PSHUFD)
+ NODE_NAME_CASE(PSHUFHW)
+ NODE_NAME_CASE(PSHUFLW)
+ NODE_NAME_CASE(SHUFP)
+ NODE_NAME_CASE(SHUF128)
+ NODE_NAME_CASE(MOVLHPS)
+ NODE_NAME_CASE(MOVHLPS)
+ NODE_NAME_CASE(MOVDDUP)
+ NODE_NAME_CASE(MOVSHDUP)
+ NODE_NAME_CASE(MOVSLDUP)
+ NODE_NAME_CASE(MOVSD)
+ NODE_NAME_CASE(MOVSS)
+ NODE_NAME_CASE(UNPCKL)
+ NODE_NAME_CASE(UNPCKH)
+ NODE_NAME_CASE(VBROADCAST)
+ NODE_NAME_CASE(VBROADCAST_LOAD)
+ NODE_NAME_CASE(VBROADCASTM)
+ NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
+ NODE_NAME_CASE(VPERMILPV)
+ NODE_NAME_CASE(VPERMILPI)
+ NODE_NAME_CASE(VPERM2X128)
+ NODE_NAME_CASE(VPERMV)
+ NODE_NAME_CASE(VPERMV3)
+ NODE_NAME_CASE(VPERMI)
+ NODE_NAME_CASE(VPTERNLOG)
+ NODE_NAME_CASE(VFIXUPIMM)
+ NODE_NAME_CASE(VFIXUPIMM_SAE)
+ NODE_NAME_CASE(VFIXUPIMMS)
+ NODE_NAME_CASE(VFIXUPIMMS_SAE)
+ NODE_NAME_CASE(VRANGE)
+ NODE_NAME_CASE(VRANGE_SAE)
+ NODE_NAME_CASE(VRANGES)
+ NODE_NAME_CASE(VRANGES_SAE)
+ NODE_NAME_CASE(PMULUDQ)
+ NODE_NAME_CASE(PMULDQ)
+ NODE_NAME_CASE(PSADBW)
+ NODE_NAME_CASE(DBPSADBW)
+ NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
+ NODE_NAME_CASE(VAARG_64)
+ NODE_NAME_CASE(VAARG_X32)
+ NODE_NAME_CASE(WIN_ALLOCA)
+ NODE_NAME_CASE(MEMBARRIER)
+ NODE_NAME_CASE(MFENCE)
+ NODE_NAME_CASE(SEG_ALLOCA)
+ NODE_NAME_CASE(PROBED_ALLOCA)
+ NODE_NAME_CASE(RDRAND)
+ NODE_NAME_CASE(RDSEED)
+ NODE_NAME_CASE(RDPKRU)
+ NODE_NAME_CASE(WRPKRU)
+ NODE_NAME_CASE(VPMADDUBSW)
+ NODE_NAME_CASE(VPMADDWD)
+ NODE_NAME_CASE(VPSHA)
+ NODE_NAME_CASE(VPSHL)
+ NODE_NAME_CASE(VPCOM)
+ NODE_NAME_CASE(VPCOMU)
+ NODE_NAME_CASE(VPERMIL2)
+ NODE_NAME_CASE(FMSUB)
+ NODE_NAME_CASE(STRICT_FMSUB)
+ NODE_NAME_CASE(FNMADD)
+ NODE_NAME_CASE(STRICT_FNMADD)
+ NODE_NAME_CASE(FNMSUB)
+ NODE_NAME_CASE(STRICT_FNMSUB)
+ NODE_NAME_CASE(FMADDSUB)
+ NODE_NAME_CASE(FMSUBADD)
+ NODE_NAME_CASE(FMADD_RND)
+ NODE_NAME_CASE(FNMADD_RND)
+ NODE_NAME_CASE(FMSUB_RND)
+ NODE_NAME_CASE(FNMSUB_RND)
+ NODE_NAME_CASE(FMADDSUB_RND)
+ NODE_NAME_CASE(FMSUBADD_RND)
+ NODE_NAME_CASE(VPMADD52H)
+ NODE_NAME_CASE(VPMADD52L)
+ NODE_NAME_CASE(VRNDSCALE)
+ NODE_NAME_CASE(STRICT_VRNDSCALE)
+ NODE_NAME_CASE(VRNDSCALE_SAE)
+ NODE_NAME_CASE(VRNDSCALES)
+ NODE_NAME_CASE(VRNDSCALES_SAE)
+ NODE_NAME_CASE(VREDUCE)
+ NODE_NAME_CASE(VREDUCE_SAE)
+ NODE_NAME_CASE(VREDUCES)
+ NODE_NAME_CASE(VREDUCES_SAE)
+ NODE_NAME_CASE(VGETMANT)
+ NODE_NAME_CASE(VGETMANT_SAE)
+ NODE_NAME_CASE(VGETMANTS)
+ NODE_NAME_CASE(VGETMANTS_SAE)
+ NODE_NAME_CASE(PCMPESTR)
+ NODE_NAME_CASE(PCMPISTR)
+ NODE_NAME_CASE(XTEST)
+ NODE_NAME_CASE(COMPRESS)
+ NODE_NAME_CASE(EXPAND)
+ NODE_NAME_CASE(SELECTS)
+ NODE_NAME_CASE(ADDSUB)
+ NODE_NAME_CASE(RCP14)
+ NODE_NAME_CASE(RCP14S)
+ NODE_NAME_CASE(RCP28)
+ NODE_NAME_CASE(RCP28_SAE)
+ NODE_NAME_CASE(RCP28S)
+ NODE_NAME_CASE(RCP28S_SAE)
+ NODE_NAME_CASE(EXP2)
+ NODE_NAME_CASE(EXP2_SAE)
+ NODE_NAME_CASE(RSQRT14)
+ NODE_NAME_CASE(RSQRT14S)
+ NODE_NAME_CASE(RSQRT28)
+ NODE_NAME_CASE(RSQRT28_SAE)
+ NODE_NAME_CASE(RSQRT28S)
+ NODE_NAME_CASE(RSQRT28S_SAE)
+ NODE_NAME_CASE(FADD_RND)
+ NODE_NAME_CASE(FADDS)
+ NODE_NAME_CASE(FADDS_RND)
+ NODE_NAME_CASE(FSUB_RND)
+ NODE_NAME_CASE(FSUBS)
+ NODE_NAME_CASE(FSUBS_RND)
+ NODE_NAME_CASE(FMUL_RND)
+ NODE_NAME_CASE(FMULS)
+ NODE_NAME_CASE(FMULS_RND)
+ NODE_NAME_CASE(FDIV_RND)
+ NODE_NAME_CASE(FDIVS)
+ NODE_NAME_CASE(FDIVS_RND)
+ NODE_NAME_CASE(FSQRT_RND)
+ NODE_NAME_CASE(FSQRTS)
+ NODE_NAME_CASE(FSQRTS_RND)
+ NODE_NAME_CASE(FGETEXP)
+ NODE_NAME_CASE(FGETEXP_SAE)
+ NODE_NAME_CASE(FGETEXPS)
+ NODE_NAME_CASE(FGETEXPS_SAE)
+ NODE_NAME_CASE(SCALEF)
+ NODE_NAME_CASE(SCALEF_RND)
+ NODE_NAME_CASE(SCALEFS)
+ NODE_NAME_CASE(SCALEFS_RND)
+ NODE_NAME_CASE(AVG)
+ NODE_NAME_CASE(MULHRS)
+ NODE_NAME_CASE(SINT_TO_FP_RND)
+ NODE_NAME_CASE(UINT_TO_FP_RND)
+ NODE_NAME_CASE(CVTTP2SI)
+ NODE_NAME_CASE(CVTTP2UI)
+ NODE_NAME_CASE(STRICT_CVTTP2SI)
+ NODE_NAME_CASE(STRICT_CVTTP2UI)
+ NODE_NAME_CASE(MCVTTP2SI)
+ NODE_NAME_CASE(MCVTTP2UI)
+ NODE_NAME_CASE(CVTTP2SI_SAE)
+ NODE_NAME_CASE(CVTTP2UI_SAE)
+ NODE_NAME_CASE(CVTTS2SI)
+ NODE_NAME_CASE(CVTTS2UI)
+ NODE_NAME_CASE(CVTTS2SI_SAE)
+ NODE_NAME_CASE(CVTTS2UI_SAE)
+ NODE_NAME_CASE(CVTSI2P)
+ NODE_NAME_CASE(CVTUI2P)
+ NODE_NAME_CASE(STRICT_CVTSI2P)
+ NODE_NAME_CASE(STRICT_CVTUI2P)
+ NODE_NAME_CASE(MCVTSI2P)
+ NODE_NAME_CASE(MCVTUI2P)
+ NODE_NAME_CASE(VFPCLASS)
+ NODE_NAME_CASE(VFPCLASSS)
+ NODE_NAME_CASE(MULTISHIFT)
+ NODE_NAME_CASE(SCALAR_SINT_TO_FP)
+ NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
+ NODE_NAME_CASE(SCALAR_UINT_TO_FP)
+ NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
+ NODE_NAME_CASE(CVTPS2PH)
+ NODE_NAME_CASE(STRICT_CVTPS2PH)
+ NODE_NAME_CASE(MCVTPS2PH)
+ NODE_NAME_CASE(CVTPH2PS)
+ NODE_NAME_CASE(STRICT_CVTPH2PS)
+ NODE_NAME_CASE(CVTPH2PS_SAE)
+ NODE_NAME_CASE(CVTP2SI)
+ NODE_NAME_CASE(CVTP2UI)
+ NODE_NAME_CASE(MCVTP2SI)
+ NODE_NAME_CASE(MCVTP2UI)
+ NODE_NAME_CASE(CVTP2SI_RND)
+ NODE_NAME_CASE(CVTP2UI_RND)
+ NODE_NAME_CASE(CVTS2SI)
+ NODE_NAME_CASE(CVTS2UI)
+ NODE_NAME_CASE(CVTS2SI_RND)
+ NODE_NAME_CASE(CVTS2UI_RND)
+ NODE_NAME_CASE(CVTNE2PS2BF16)
+ NODE_NAME_CASE(CVTNEPS2BF16)
+ NODE_NAME_CASE(MCVTNEPS2BF16)
+ NODE_NAME_CASE(DPBF16PS)
+ NODE_NAME_CASE(LWPINS)
+ NODE_NAME_CASE(MGATHER)
+ NODE_NAME_CASE(MSCATTER)
+ NODE_NAME_CASE(VPDPBUSD)
+ NODE_NAME_CASE(VPDPBUSDS)
+ NODE_NAME_CASE(VPDPWSSD)
+ NODE_NAME_CASE(VPDPWSSDS)
+ NODE_NAME_CASE(VPSHUFBITQMB)
+ NODE_NAME_CASE(GF2P8MULB)
+ NODE_NAME_CASE(GF2P8AFFINEQB)
+ NODE_NAME_CASE(GF2P8AFFINEINVQB)
+ NODE_NAME_CASE(NT_CALL)
+ NODE_NAME_CASE(NT_BRIND)
+ NODE_NAME_CASE(UMWAIT)
+ NODE_NAME_CASE(TPAUSE)
+ NODE_NAME_CASE(ENQCMD)
+ NODE_NAME_CASE(ENQCMDS)
+ NODE_NAME_CASE(VP2INTERSECT)
+ NODE_NAME_CASE(AESENC128KL)
+ NODE_NAME_CASE(AESDEC128KL)
+ NODE_NAME_CASE(AESENC256KL)
+ NODE_NAME_CASE(AESDEC256KL)
+ NODE_NAME_CASE(AESENCWIDE128KL)
+ NODE_NAME_CASE(AESDECWIDE128KL)
+ NODE_NAME_CASE(AESENCWIDE256KL)
+ NODE_NAME_CASE(AESDECWIDE256KL)
+ NODE_NAME_CASE(TESTUI)
+ }
+ return nullptr;
+#undef NODE_NAME_CASE
+}
+
+/// Return true if the addressing mode represented by AM is legal for this
+/// target, for a load/store of the specified type.
+bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
+ unsigned AS,
+ Instruction *I) const {
+ // X86 supports extremely general addressing modes.
+ CodeModel::Model M = getTargetMachine().getCodeModel();
+
+ // X86 allows a sign-extended 32-bit immediate field as a displacement.
+ if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
+ return false;
+
+ if (AM.BaseGV) {
+ unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
+
+ // If a reference to this global requires an extra load, we can't fold it.
+ if (isGlobalStubReference(GVFlags))
+ return false;
+
+ // If BaseGV requires a register for the PIC base, we cannot also have a
+ // BaseReg specified.
+ if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
+ return false;
+
+ // If lower 4G is not available, then we must use rip-relative addressing.
+ if ((M != CodeModel::Small || isPositionIndependent()) &&
+ Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
+ return false;
+ }
+
+ switch (AM.Scale) {
+ case 0:
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ // These scales always work.
+ break;
+ case 3:
+ case 5:
+ case 9:
+ // These scales are formed with basereg+scalereg. Only accept if there is
+ // no basereg yet.
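+ // (e.g. a scale of 3 is matched as Reg + Reg*2, so the scaled register
+ // also occupies the base register slot.)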
+ if (AM.HasBaseReg)
+ return false;
+ break;
+ default: // Other stuff never works.
+ return false;
+ }
+
+ return true;
+}
+
+bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
+ unsigned Bits = Ty->getScalarSizeInBits();
+
+ // 8-bit shifts are always expensive, but versions with a scalar amount aren't
+ // particularly cheaper than those without.
+ if (Bits == 8)
+ return false;
+
+ // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
+ // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
+ if (Subtarget.hasXOP() &&
+ (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
+ return false;
+
+ // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
+ // shifts just as cheap as scalar ones.
+ if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
+ return false;
+
+ // AVX512BW has shifts such as vpsllvw.
+ if (Subtarget.hasBWI() && Bits == 16)
+ return false;
+
+ // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
+ // fully general vector.
+ return true;
+}
+
+bool X86TargetLowering::isBinOp(unsigned Opcode) const {
+ switch (Opcode) {
+ // These are non-commutative binops.
+ // TODO: Add more X86ISD opcodes once we have test coverage.
+ case X86ISD::ANDNP:
+ case X86ISD::PCMPGT:
+ case X86ISD::FMAX:
+ case X86ISD::FMIN:
+ case X86ISD::FANDN:
+ return true;
+ }
+
+ return TargetLoweringBase::isBinOp(Opcode);
+}
+
+bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
+ switch (Opcode) {
+ // TODO: Add more X86ISD opcodes once we have test coverage.
+ case X86ISD::PCMPEQ:
+ case X86ISD::PMULDQ:
+ case X86ISD::PMULUDQ:
+ case X86ISD::FMAXC:
+ case X86ISD::FMINC:
+ case X86ISD::FAND:
+ case X86ISD::FOR:
+ case X86ISD::FXOR:
+ return true;
+ }
+
+ return TargetLoweringBase::isCommutativeBinOp(Opcode);
+}
+
+bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
+ if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+ return false;
+ unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+ unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+ return NumBits1 > NumBits2;
+}
+
+bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
+ if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+ return false;
+
+ if (!isTypeLegal(EVT::getEVT(Ty1)))
+ return false;
+
+ assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
+
+ // Assuming the caller doesn't have a zeroext or signext return parameter,
+ // truncation all the way down to i1 is valid.
+ return true;
+}
+
+bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+ return isInt<32>(Imm);
+}
+
+bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
+ // Can also use sub to handle negated immediates.
+ return isInt<32>(Imm);
+}
+
+bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
+ return isInt<32>(Imm);
+}
+
+bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+ if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
+ return false;
+ unsigned NumBits1 = VT1.getSizeInBits();
+ unsigned NumBits2 = VT2.getSizeInBits();
+ return NumBits1 > NumBits2;
+}
+
+bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
+ // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
+ return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
+}
+
+bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
+ // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
+ return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
+}
+
+bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+ EVT VT1 = Val.getValueType();
+ if (isZExtFree(VT1, VT2))
+ return true;
+
+ if (Val.getOpcode() != ISD::LOAD)
+ return false;
+
+ if (!VT1.isSimple() || !VT1.isInteger() ||
+ !VT2.isSimple() || !VT2.isInteger())
+ return false;
+
+ switch (VT1.getSimpleVT().SimpleTy) {
+ default: break;
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ // X86 has 8, 16, and 32-bit zero-extending loads.
+ return true;
+ }
+
+ return false;
+}
+
+bool X86TargetLowering::shouldSinkOperands(Instruction *I,
+ SmallVectorImpl<Use *> &Ops) const {
+ // A uniform shift amount in a vector shift or funnel shift may be much
+ // cheaper than a generic variable vector shift, so make that pattern visible
+ // to SDAG by sinking the shuffle instruction next to the shift.
+ int ShiftAmountOpNum = -1;
+ if (I->isShift())
+ ShiftAmountOpNum = 1;
+ else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == Intrinsic::fshl ||
+ II->getIntrinsicID() == Intrinsic::fshr)
+ ShiftAmountOpNum = 2;
+ }
+
+ if (ShiftAmountOpNum == -1)
+ return false;
+
+ auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
+ if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
+ isVectorShiftByScalarCheap(I->getType())) {
+ Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
+ return true;
+ }
+
+ return false;
+}
+
+bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
+ if (!Subtarget.is64Bit())
+ return false;
+ return TargetLowering::shouldConvertPhiType(From, To);
+}
+
+bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
+ if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
+ return false;
+
+ EVT SrcVT = ExtVal.getOperand(0).getValueType();
+
+ // There is no extending load for vXi1.
+ if (SrcVT.getScalarType() == MVT::i1)
+ return false;
+
+ return true;
+}
+
+bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+ EVT VT) const {
+ if (!Subtarget.hasAnyFMA())
+ return false;
+
+ VT = VT.getScalarType();
+
+ if (!VT.isSimple())
+ return false;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::f32:
+ case MVT::f64:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
+ // i16 instructions are longer (0x66 prefix) and potentially slower.
+ return !(VT1 == MVT::i32 && VT2 == MVT::i16);
+}
+
+/// Targets can use this to indicate that they only support *some*
+/// VECTOR_SHUFFLE operations, those with specific masks.
+/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
+/// are assumed to be legal.
+bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
+ if (!VT.isSimple())
+ return false;
+
+ // Not for i1 vectors
+ if (VT.getSimpleVT().getScalarType() == MVT::i1)
+ return false;
+
+ // Very little shuffling can be done for 64-bit vectors right now.
+ if (VT.getSimpleVT().getSizeInBits() == 64)
+ return false;
+
+ // We only care that the types being shuffled are legal. The lowering can
+ // handle any possible shuffle mask that results.
+ return isTypeLegal(VT.getSimpleVT());
+}
+
+bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
+ EVT VT) const {
+ // Don't convert an 'and' into a shuffle that we don't directly support.
+ // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
+ if (!Subtarget.hasAVX2())
+ if (VT == MVT::v32i8 || VT == MVT::v16i16)
+ return false;
+
+ // Just delegate to the generic legality, clear masks aren't special.
+ return isShuffleMaskLegal(Mask, VT);
+}
+
+bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
+ // If the subtarget is using thunks, we need to not generate jump tables.
+ if (Subtarget.useIndirectThunkBranches())
+ return false;
+
+ // Otherwise, fallback on the generic logic.
+ return TargetLowering::areJTsAllowed(Fn);
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Scheduler Hooks
+//===----------------------------------------------------------------------===//
+
+ // Returns true if EFLAGS is consumed after this iterator in the rest of the
+// basic block or any successors of the basic block.
+static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
+ MachineBasicBlock *BB) {
+ // Scan forward through BB for a use/def of EFLAGS.
+ for (MachineBasicBlock::iterator miI = std::next(Itr), miE = BB->end();
+ miI != miE; ++miI) {
+ const MachineInstr& mi = *miI;
+ if (mi.readsRegister(X86::EFLAGS))
+ return true;
+ // If we found a def, we can stop searching.
+ if (mi.definesRegister(X86::EFLAGS))
+ return false;
+ }
+
+ // If we hit the end of the block, check whether EFLAGS is live into a
+ // successor.
+ for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
+ sEnd = BB->succ_end();
+ sItr != sEnd; ++sItr) {
+ MachineBasicBlock* succ = *sItr;
+ if (succ->isLiveIn(X86::EFLAGS))
+ return true;
+ }
+
+ return false;
+}
+
+/// Utility function to emit xbegin specifying the start of an RTM region.
+static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
+ const TargetInstrInfo *TII) {
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator I = ++MBB->getIterator();
+
+ // For the v = xbegin(), we generate
+ //
+ // thisMBB:
+ // xbegin sinkMBB
+ //
+ // mainMBB:
+ // s0 = -1
+ //
+ // fallBB:
+ // eax = # XABORT_DEF
+ // s1 = eax
+ //
+ // sinkMBB:
+ // v = phi(s0/mainBB, s1/fallBB)
+
+ MachineBasicBlock *thisMBB = MBB;
+ MachineFunction *MF = MBB->getParent();
+ MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, mainMBB);
+ MF->insert(I, fallMBB);
+ MF->insert(I, sinkMBB);
+
+ if (isEFLAGSLiveAfter(MI, MBB)) {
+ mainMBB->addLiveIn(X86::EFLAGS);
+ fallMBB->addLiveIn(X86::EFLAGS);
+ sinkMBB->addLiveIn(X86::EFLAGS);
+ }
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB,
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ Register DstReg = MI.getOperand(0).getReg();
+ const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+ Register mainDstReg = MRI.createVirtualRegister(RC);
+ Register fallDstReg = MRI.createVirtualRegister(RC);
+
+ // thisMBB:
+ // xbegin fallMBB
+ // # fallthrough to mainMBB
+ // # on abort, branch to fallMBB
+ BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
+ thisMBB->addSuccessor(mainMBB);
+ thisMBB->addSuccessor(fallMBB);
+
+ // mainMBB:
+ // mainDstReg := -1
+ BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
+ BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
+ mainMBB->addSuccessor(sinkMBB);
+
+ // fallMBB:
+ // ; pseudo instruction to model hardware's definition from XABORT
+ // EAX := XABORT_DEF
+ // fallDstReg := EAX
+ BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
+ BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
+ .addReg(X86::EAX);
+ fallMBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
+ .addReg(mainDstReg).addMBB(mainMBB)
+ .addReg(fallDstReg).addMBB(fallMBB);
+
+ MI.eraseFromParent();
+ return sinkMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ // Emit va_arg instruction on X86-64.
+
+ // Operands to this pseudo-instruction:
+ // 0 ) Output : destination address (reg)
+ // 1-5) Input : va_list address (addr, i64mem)
+ // 6 ) ArgSize : Size (in bytes) of vararg type
+ // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
+ // 8 ) Align : Alignment of type
+ // 9 ) EFLAGS (implicit-def)
+
+ assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
+ static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
+
+ Register DestReg = MI.getOperand(0).getReg();
+ MachineOperand &Base = MI.getOperand(1);
+ MachineOperand &Scale = MI.getOperand(2);
+ MachineOperand &Index = MI.getOperand(3);
+ MachineOperand &Disp = MI.getOperand(4);
+ MachineOperand &Segment = MI.getOperand(5);
+ unsigned ArgSize = MI.getOperand(6).getImm();
+ unsigned ArgMode = MI.getOperand(7).getImm();
+ Align Alignment = Align(MI.getOperand(8).getImm());
+
+ MachineFunction *MF = MBB->getParent();
+
+ // Memory Reference
+ assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
+
+ MachineMemOperand *OldMMO = MI.memoperands().front();
+
+ // Clone the MMO into two separate MMOs for loading and storing
+ MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
+ OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
+ MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
+ OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
+
+ // Machine Information
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ const TargetRegisterClass *AddrRegClass =
+ getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
+ const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ // struct va_list {
+ // i32 gp_offset
+ // i32 fp_offset
+ // i64 overflow_area (address)
+ // i64 reg_save_area (address)
+ // }
+ // sizeof(va_list) = 24
+ // alignment(va_list) = 8
+
+ unsigned TotalNumIntRegs = 6;
+ unsigned TotalNumXMMRegs = 8;
+ bool UseGPOffset = (ArgMode == 1);
+ bool UseFPOffset = (ArgMode == 2);
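+ // In the SysV x86-64 reg_save_area, the 6 integer registers (6 * 8 = 48
+ // bytes) are followed by the 8 XMM registers (8 * 16 = 128 bytes), so
+ // gp_offset ranges up to 48 and fp_offset up to 48 + 128 = 176.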
+ unsigned MaxOffset = TotalNumIntRegs * 8 +
+ (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
+
+ // Align ArgSize to a multiple of 8.
+ unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
+ bool NeedsAlign = (Alignment > 8);
+
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *overflowMBB;
+ MachineBasicBlock *offsetMBB;
+ MachineBasicBlock *endMBB;
+
+ unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
+ unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
+ unsigned OffsetReg = 0;
+
+ if (!UseGPOffset && !UseFPOffset) {
+ // If we only pull from the overflow region, we don't create a branch.
+ // We don't need to alter control flow.
+ OffsetDestReg = 0; // unused
+ OverflowDestReg = DestReg;
+
+ offsetMBB = nullptr;
+ overflowMBB = thisMBB;
+ endMBB = thisMBB;
+ } else {
+ // First emit code to check if gp_offset (or fp_offset) is below the bound.
+ // If so, pull the argument from reg_save_area. (branch to offsetMBB)
+ // If not, pull from overflow_area. (branch to overflowMBB)
+ //
+ // thisMBB
+ // | .
+ // | .
+ // offsetMBB overflowMBB
+ // | .
+ // | .
+ // endMBB
+
+ // Registers for the PHI in endMBB
+ OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
+ OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
+
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+ MachineFunction::iterator MBBIter = ++MBB->getIterator();
+
+ // Insert the new basic blocks
+ MF->insert(MBBIter, offsetMBB);
+ MF->insert(MBBIter, overflowMBB);
+ MF->insert(MBBIter, endMBB);
+
+ // Transfer the remainder of MBB and its successor edges to endMBB.
+ endMBB->splice(endMBB->begin(), thisMBB,
+ std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
+ endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
+
+ // Make offsetMBB and overflowMBB successors of thisMBB
+ thisMBB->addSuccessor(offsetMBB);
+ thisMBB->addSuccessor(overflowMBB);
+
+ // endMBB is a successor of both offsetMBB and overflowMBB
+ offsetMBB->addSuccessor(endMBB);
+ overflowMBB->addSuccessor(endMBB);
+
+ // Load the offset value into a register
+ OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
+ BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, UseFPOffset ? 4 : 0)
+ .add(Segment)
+ .setMemRefs(LoadOnlyMMO);
+
+ // Check if there is enough room left to pull this argument.
+ BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
+ .addReg(OffsetReg)
+ .addImm(MaxOffset + 8 - ArgSizeA8);
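+ // (The fall-through register path is taken only when
+ // OffsetReg + ArgSizeA8 < MaxOffset + 8, i.e. the argument still fits in
+ // the register save area.)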
+
+ // Branch to "overflowMBB" if offset >= max
+ // Fall through to "offsetMBB" otherwise
+ BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
+ .addMBB(overflowMBB).addImm(X86::COND_AE);
+ }
+
+ // In offsetMBB, emit code to use the reg_save_area.
+ if (offsetMBB) {
+ assert(OffsetReg != 0);
+
+ // Read the reg_save_area address.
+ Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
+ BuildMI(
+ offsetMBB, DL,
+ TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
+ RegSaveReg)
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
+ .add(Segment)
+ .setMemRefs(LoadOnlyMMO);
+
+ if (Subtarget.isTarget64BitLP64()) {
+ // Zero-extend the offset
+ Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
+ BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
+ .addImm(0)
+ .addReg(OffsetReg)
+ .addImm(X86::sub_32bit);
+
+ // Add the offset to the reg_save_area to get the final address.
+ BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
+ .addReg(OffsetReg64)
+ .addReg(RegSaveReg);
+ } else {
+ // Add the offset to the reg_save_area to get the final address.
+ BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
+ .addReg(OffsetReg)
+ .addReg(RegSaveReg);
+ }
+
+ // Compute the offset for the next argument
+ Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
+ BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
+ .addReg(OffsetReg)
+ .addImm(UseFPOffset ? 16 : 8);
+
+ // Store it back into the va_list.
+ BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, UseFPOffset ? 4 : 0)
+ .add(Segment)
+ .addReg(NextOffsetReg)
+ .setMemRefs(StoreOnlyMMO);
+
+ // Jump to endMBB
+ BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
+ .addMBB(endMBB);
+ }
+
+ //
+ // Emit code to use overflow area
+ //
+
+ // Load the overflow_area address into a register.
+ Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
+ BuildMI(overflowMBB, DL,
+ TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
+ OverflowAddrReg)
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, 8)
+ .add(Segment)
+ .setMemRefs(LoadOnlyMMO);
+
+ // If we need to align it, do so. Otherwise, just copy the address
+ // to OverflowDestReg.
+ if (NeedsAlign) {
+ // Align the overflow address
+ Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
+
+ // aligned_addr = (addr + (align-1)) & ~(align-1)
+ BuildMI(
+ overflowMBB, DL,
+ TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
+ TmpReg)
+ .addReg(OverflowAddrReg)
+ .addImm(Alignment.value() - 1);
+
+ BuildMI(
+ overflowMBB, DL,
+ TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
+ OverflowDestReg)
+ .addReg(TmpReg)
+ .addImm(~(uint64_t)(Alignment.value() - 1));
+ } else {
+ BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
+ .addReg(OverflowAddrReg);
+ }
+
+ // Compute the next overflow address after this argument.
+ // (the overflow address should be kept 8-byte aligned)
+ Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
+ BuildMI(
+ overflowMBB, DL,
+ TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
+ NextAddrReg)
+ .addReg(OverflowDestReg)
+ .addImm(ArgSizeA8);
+
+ // Store the new overflow address.
+ BuildMI(overflowMBB, DL,
+ TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, 8)
+ .add(Segment)
+ .addReg(NextAddrReg)
+ .setMemRefs(StoreOnlyMMO);
+
+ // If we branched, emit the PHI to the front of endMBB.
+ if (offsetMBB) {
+ BuildMI(*endMBB, endMBB->begin(), DL,
+ TII->get(X86::PHI), DestReg)
+ .addReg(OffsetDestReg).addMBB(offsetMBB)
+ .addReg(OverflowDestReg).addMBB(overflowMBB);
+ }
+
+ // Erase the pseudo instruction
+ MI.eraseFromParent();
+
+ return endMBB;
+}
+
+MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
+ MachineInstr &MI, MachineBasicBlock *MBB) const {
+ // Emit code to save XMM registers to the stack. The ABI says that the
+ // number of registers to save is given in %al, so it's theoretically
+ // possible to do an indirect jump trick to avoid saving all of them;
+ // however, this code takes a simpler approach and just executes all
+ // of the stores if %al is non-zero. It's less code, and it's probably
+ // easier on the hardware branch predictor, and stores aren't all that
+ // expensive anyway.
+
+ // Create the new basic blocks. One block contains all the XMM stores,
+ // and one block is the final destination regardless of whether any
+ // stores were performed.
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ MachineFunction *F = MBB->getParent();
+ MachineFunction::iterator MBBIter = ++MBB->getIterator();
+ MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(MBBIter, XMMSaveMBB);
+ F->insert(MBBIter, EndMBB);
+
+ // Transfer the remainder of MBB and its successor edges to EndMBB.
+ EndMBB->splice(EndMBB->begin(), MBB,
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // The original block will now fall through to the XMM save block.
+ MBB->addSuccessor(XMMSaveMBB);
+ // The XMMSaveMBB will fall through to the end block.
+ XMMSaveMBB->addSuccessor(EndMBB);
+
+ // Now add the instructions.
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ Register CountReg = MI.getOperand(0).getReg();
+ int RegSaveFrameIndex = MI.getOperand(1).getImm();
+ int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
+
+ if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
+ // If %al is 0, branch around the XMM save block.
+ BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
+ BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E);
+ MBB->addSuccessor(EndMBB);
+ }
+
+ // Make sure the last operand is EFLAGS, which gets clobbered by the branch
+ // that was just emitted, but clearly shouldn't be "saved".
+ assert((MI.getNumOperands() <= 3 ||
+ !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
+ MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
+ "Expected last argument to be EFLAGS");
+ unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
+ // In the XMM save block, save all the XMM argument registers.
+ for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
+ int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
+ MachineMemOperand *MMO = F->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
+ MachineMemOperand::MOStore,
+ /*Size=*/16, Align(16));
+ BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
+ .addFrameIndex(RegSaveFrameIndex)
+ .addImm(/*Scale=*/1)
+ .addReg(/*IndexReg=*/0)
+ .addImm(/*Disp=*/Offset)
+ .addReg(/*Segment=*/0)
+ .addReg(MI.getOperand(i).getReg())
+ .addMemOperand(MMO);
+ }
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+
+ return EndMBB;
+}
+
+// The EFLAGS operand of SelectItr might be missing a kill marker
+// because there were multiple uses of EFLAGS, and ISel didn't know
+// which to mark. Figure out whether SelectItr should have had a
+// kill marker, and set it if it should. Returns the correct kill
+// marker value.
+static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
+ MachineBasicBlock* BB,
+ const TargetRegisterInfo* TRI) {
+ if (isEFLAGSLiveAfter(SelectItr, BB))
+ return false;
+
+ // We found a def, or hit the end of the basic block and EFLAGS wasn't live
+ // out. SelectMI should have a kill flag on EFLAGS.
+ SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
+ return true;
+}
+
+// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
+ // together with other CMOV pseudo-opcodes into a single basic block with a
+ // conditional jump around it.
+static bool isCMOVPseudo(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case X86::CMOV_FR32:
+ case X86::CMOV_FR32X:
+ case X86::CMOV_FR64:
+ case X86::CMOV_FR64X:
+ case X86::CMOV_GR8:
+ case X86::CMOV_GR16:
+ case X86::CMOV_GR32:
+ case X86::CMOV_RFP32:
+ case X86::CMOV_RFP64:
+ case X86::CMOV_RFP80:
+ case X86::CMOV_VR64:
+ case X86::CMOV_VR128:
+ case X86::CMOV_VR128X:
+ case X86::CMOV_VR256:
+ case X86::CMOV_VR256X:
+ case X86::CMOV_VR512:
+ case X86::CMOV_VK1:
+ case X86::CMOV_VK2:
+ case X86::CMOV_VK4:
+ case X86::CMOV_VK8:
+ case X86::CMOV_VK16:
+ case X86::CMOV_VK32:
+ case X86::CMOV_VK64:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+// Helper function, which inserts PHI functions into SinkMBB:
+// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
+// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
+ // in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
+ // the last PHI inserted.
+static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
+ MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
+ MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
+ MachineBasicBlock *SinkMBB) {
+ MachineFunction *MF = TrueMBB->getParent();
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ DebugLoc DL = MIItBegin->getDebugLoc();
+
+ X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
+ X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+
+ MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
+
+ // As we are creating the PHIs, we have to be careful if there is more than
+ // one. Later CMOVs may reference the results of earlier CMOVs, but later
+ // PHIs have to reference the individual true/false inputs from earlier PHIs.
+ // That also means that PHI construction must work forward from earlier to
+ // later, and that the code must maintain a mapping from each earlier PHI's
+ // destination register to the registers that went into that PHI.
+ DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
+ MachineInstrBuilder MIB;
+
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
+ Register DestReg = MIIt->getOperand(0).getReg();
+ Register Op1Reg = MIIt->getOperand(1).getReg();
+ Register Op2Reg = MIIt->getOperand(2).getReg();
+
+ // If this CMOV we are generating is the opposite condition from
+ // the jump we generated, then we have to swap the operands for the
+ // PHI that is going to be generated.
+ if (MIIt->getOperand(3).getImm() == OppCC)
+ std::swap(Op1Reg, Op2Reg);
+
+ if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
+ Op1Reg = RegRewriteTable[Op1Reg].first;
+
+ if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
+ Op2Reg = RegRewriteTable[Op2Reg].second;
+
+ MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
+ .addReg(Op1Reg)
+ .addMBB(FalseMBB)
+ .addReg(Op2Reg)
+ .addMBB(TrueMBB);
+
+ // Add this PHI to the rewrite table.
+ RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
+ }
+
+ return MIB;
+}
+
+// Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
+ MachineInstr &SecondCascadedCMOV,
+ MachineBasicBlock *ThisMBB) const {
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = FirstCMOV.getDebugLoc();
+
+ // We lower cascaded CMOVs such as
+ //
+ // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
+ //
+ // to two successive branches.
+ //
+ // Without this, we would add a PHI between the two jumps, which ends up
+ // creating a few copies all around. For instance, for
+ //
+ // (sitofp (zext (fcmp une)))
+ //
+ // we would generate:
+ //
+ // ucomiss %xmm1, %xmm0
+ // movss <1.0f>, %xmm0
+ // movaps %xmm0, %xmm1
+ // jne .LBB5_2
+ // xorps %xmm1, %xmm1
+ // .LBB5_2:
+ // jp .LBB5_4
+ // movaps %xmm1, %xmm0
+ // .LBB5_4:
+ // retq
+ //
+ // because this custom-inserter would have generated:
+ //
+ // A
+ // | \
+ // | B
+ // | /
+ // C
+ // | \
+ // | D
+ // | /
+ // E
+ //
+ // A: X = ...; Y = ...
+ // B: empty
+ // C: Z = PHI [X, A], [Y, B]
+ // D: empty
+ // E: PHI [X, C], [Z, D]
+ //
+ // If we lower both CMOVs in a single step, we can instead generate:
+ //
+ // A
+ // | \
+ // | C
+ // | /|
+ // |/ |
+ // | |
+ // | D
+ // | /
+ // E
+ //
+ // A: X = ...; Y = ...
+ // D: empty
+ // E: PHI [X, A], [X, C], [Y, D]
+ //
+ // Which, in our sitofp/fcmp example, gives us something like:
+ //
+ // ucomiss %xmm1, %xmm0
+ // movss <1.0f>, %xmm0
+ // jne .LBB5_4
+ // jp .LBB5_4
+ // xorps %xmm0, %xmm0
+ // .LBB5_4:
+ // retq
+ //
+
+ // We lower cascaded CMOV into two successive branches to the same block.
+ // EFLAGS is used by both, so mark it as live in the second.
+ const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
+ MachineFunction *F = ThisMBB->getParent();
+ MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+
+ MachineFunction::iterator It = ++ThisMBB->getIterator();
+ F->insert(It, FirstInsertedMBB);
+ F->insert(It, SecondInsertedMBB);
+ F->insert(It, SinkMBB);
+
+ // For a cascaded CMOV, we lower it to two successive branches to
+ // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
+ // the FirstInsertedMBB.
+ FirstInsertedMBB->addLiveIn(X86::EFLAGS);
+
+ // If the EFLAGS register isn't dead in the terminator, then claim that it's
+ // live into the sink and copy blocks.
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
+ !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
+ SecondInsertedMBB->addLiveIn(X86::EFLAGS);
+ SinkMBB->addLiveIn(X86::EFLAGS);
+ }
+
+ // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
+ SinkMBB->splice(SinkMBB->begin(), ThisMBB,
+ std::next(MachineBasicBlock::iterator(FirstCMOV)),
+ ThisMBB->end());
+ SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
+
+ // Fallthrough block for ThisMBB.
+ ThisMBB->addSuccessor(FirstInsertedMBB);
+ // The true block target of the first branch is always SinkMBB.
+ ThisMBB->addSuccessor(SinkMBB);
+ // Fallthrough block for FirstInsertedMBB.
+ FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
+ // The true block for the branch of FirstInsertedMBB.
+ FirstInsertedMBB->addSuccessor(SinkMBB);
+ // This is fallthrough.
+ SecondInsertedMBB->addSuccessor(SinkMBB);
+
+ // Create the conditional branch instructions.
+ X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
+ BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
+
+ X86::CondCode SecondCC =
+ X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
+ BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
+
+ // SinkMBB:
+ // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
+ Register DestReg = FirstCMOV.getOperand(0).getReg();
+ Register Op1Reg = FirstCMOV.getOperand(1).getReg();
+ Register Op2Reg = FirstCMOV.getOperand(2).getReg();
+ MachineInstrBuilder MIB =
+ BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
+ .addReg(Op1Reg)
+ .addMBB(SecondInsertedMBB)
+ .addReg(Op2Reg)
+ .addMBB(ThisMBB);
+
+ // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
+ // (the True operand of the SELECT_CC/CMOV nodes).
+ MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
+ // Copy the PHI result to the register defined by the second CMOV.
+ BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
+ TII->get(TargetOpcode::COPY),
+ SecondCascadedCMOV.getOperand(0).getReg())
+ .addReg(FirstCMOV.getOperand(0).getReg());
+
+ // Now remove the CMOVs.
+ FirstCMOV.eraseFromParent();
+ SecondCascadedCMOV.eraseFromParent();
+
+ return SinkMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
+ MachineBasicBlock *ThisMBB) const {
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ // To "insert" a SELECT_CC instruction, we actually have to insert the
+ // diamond control-flow pattern. The incoming instruction knows the
+ // destination vreg to set, the condition code register to branch on, the
+ // true/false values to select between and a branch opcode to use.
+
+ // ThisMBB:
+ // ...
+ // TrueVal = ...
+ // cmpTY ccX, r1, r2
+ // bCC copy1MBB
+ // fallthrough --> FalseMBB
+
+ // This code lowers all pseudo-CMOV instructions. Generally it lowers these
+ // as described above, by inserting a BB, and then making a PHI at the join
+ // point to select the true and false operands of the CMOV in the PHI.
+ //
+ // The code also handles two different cases of multiple CMOV opcodes
+ // in a row.
+ //
+ // Case 1:
+ // In this case, there are multiple CMOVs in a row, all of which are based on
+ // the same condition setting (or the exact opposite condition setting).
+ // In this case we can lower all the CMOVs using a single inserted BB, and
+ // then make a number of PHIs at the join point to model the CMOVs. The only
+ // trickiness here is that in a case like:
+ //
+ // t2 = CMOV cond1 t1, f1
+ // t3 = CMOV cond1 t2, f2
+ //
+ // when rewriting this into PHIs, we have to perform some renaming on the
+ // temps since you cannot have a PHI operand refer to a PHI result earlier
+ // in the same block. The "simple" but wrong lowering would be:
+ //
+ // t2 = PHI t1(BB1), f1(BB2)
+ // t3 = PHI t2(BB1), f2(BB2)
+ //
+ // but clearly t2 is not defined in BB1, so that is incorrect. The proper
+ // renaming is to note that on the path through BB1, t2 is really just a
+ // copy of t1, and do that renaming, properly generating:
+ //
+ // t2 = PHI t1(BB1), f1(BB2)
+ // t3 = PHI t1(BB1), f2(BB2)
+ //
+ // Case 2:
+ // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
+ // function - EmitLoweredCascadedSelect.
+
+ X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
+ X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+ MachineInstr *LastCMOV = &MI;
+ MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
+
+ // Check for case 1, where there are multiple CMOVs with the same condition
+ // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
+ // number of jumps the most.
+
+ if (isCMOVPseudo(MI)) {
+ // See if we have a string of CMOVS with the same condition. Skip over
+ // intervening debug insts.
+ while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
+ (NextMIIt->getOperand(3).getImm() == CC ||
+ NextMIIt->getOperand(3).getImm() == OppCC)) {
+ LastCMOV = &*NextMIIt;
+ NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
+ }
+ }
+
+ // This checks for case 2, but we only do so if we didn't already find
+ // case 1, as indicated by LastCMOV == &MI.
+ if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
+ NextMIIt->getOpcode() == MI.getOpcode() &&
+ NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
+ NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
+ NextMIIt->getOperand(1).isKill()) {
+ return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
+ }
+
+ const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
+ MachineFunction *F = ThisMBB->getParent();
+ MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+
+ MachineFunction::iterator It = ++ThisMBB->getIterator();
+ F->insert(It, FalseMBB);
+ F->insert(It, SinkMBB);
+
+ // If the EFLAGS register isn't dead in the terminator, then claim that it's
+ // live into the sink and copy blocks.
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ if (!LastCMOV->killsRegister(X86::EFLAGS) &&
+ !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
+ FalseMBB->addLiveIn(X86::EFLAGS);
+ SinkMBB->addLiveIn(X86::EFLAGS);
+ }
+
+ // Transfer any debug instructions inside the CMOV sequence to the sunk block.
+ auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
+ auto DbgIt = MachineBasicBlock::iterator(MI);
+ while (DbgIt != DbgEnd) {
+ auto Next = std::next(DbgIt);
+ if (DbgIt->isDebugInstr())
+ SinkMBB->push_back(DbgIt->removeFromParent());
+ DbgIt = Next;
+ }
+
+ // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
+ SinkMBB->splice(SinkMBB->end(), ThisMBB,
+ std::next(MachineBasicBlock::iterator(LastCMOV)),
+ ThisMBB->end());
+ SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
+
+ // Fallthrough block for ThisMBB.
+ ThisMBB->addSuccessor(FalseMBB);
+ // The true block target of the first (or only) branch is always SinkMBB.
+ ThisMBB->addSuccessor(SinkMBB);
+ // Fallthrough block for FalseMBB.
+ FalseMBB->addSuccessor(SinkMBB);
+
+ // Create the conditional branch instruction.
+ BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
+
+ // SinkMBB:
+ // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
+ // ...
+ MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
+ MachineBasicBlock::iterator MIItEnd =
+ std::next(MachineBasicBlock::iterator(LastCMOV));
+ createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
+
+ // Now remove the CMOV(s).
+ ThisMBB->erase(MIItBegin, MIItEnd);
+
+ return SinkMBB;
+}
+
+static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
+ if (IsLP64) {
+ if (isInt<8>(Imm))
+ return X86::SUB64ri8;
+ return X86::SUB64ri32;
+ } else {
+ if (isInt<8>(Imm))
+ return X86::SUB32ri8;
+ return X86::SUB32ri;
+ }
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
+ const DebugLoc &DL = MI.getDebugLoc();
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+
+ const unsigned ProbeSize = getStackProbeSize(*MF);
+
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+ MachineFunction::iterator MBBIter = ++MBB->getIterator();
+ MF->insert(MBBIter, testMBB);
+ MF->insert(MBBIter, blockMBB);
+ MF->insert(MBBIter, tailMBB);
+
+ Register sizeVReg = MI.getOperand(1).getReg();
+
+ Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
+
+ Register TmpStackPtr = MRI.createVirtualRegister(
+ TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
+ Register FinalStackPtr = MRI.createVirtualRegister(
+ TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
+
+ BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
+ .addReg(physSPReg);
+ {
+ const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
+ BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
+ .addReg(TmpStackPtr)
+ .addReg(sizeVReg);
+ }
+
+ // test rsp size
+
+ BuildMI(testMBB, DL,
+ TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
+ .addReg(FinalStackPtr)
+ .addReg(physSPReg);
+
+ BuildMI(testMBB, DL, TII->get(X86::JCC_1))
+ .addMBB(tailMBB)
+ .addImm(X86::COND_GE);
+ testMBB->addSuccessor(blockMBB);
+ testMBB->addSuccessor(tailMBB);
+
+ // Touch the block, then extend it. This is the opposite order from a static
+ // probe, where we allocate and then touch; it avoids the need to probe the
+ // tail of the static alloca. Possible scenarios are:
+ //
+ // + ---- <- ------------ <- ------------- <- ------------ +
+ // | |
+ // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
+ // | |
+ // + <- ----------- <- ------------ <- ----------- <- ------------ +
+ //
+ // The property we want to enforce is to never have more than [page alloc] between two probes.
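+ //
+ // As an illustrative sketch (Intel-ish pseudo-asm; FinalStackPtr and
+ // ProbeSize refer to the values computed above, not literal operands),
+ // the loop built below looks roughly like:
+ //   testMBB:  cmp FinalStackPtr, rsp
+ //             jge tailMBB            ; stop once rsp has reached the target
+ //   blockMBB: xor [rsp], 0           ; touch the current page
+ //             sub rsp, ProbeSize     ; extend by one probe interval
+ //             jmp testMBB
+ //   tailMBB:  result = FinalStackPtr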
+
+ const unsigned XORMIOpc =
+ TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
+ addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
+ .addImm(0);
+
+ BuildMI(blockMBB, DL,
+ TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
+ .addReg(physSPReg)
+ .addImm(ProbeSize);
+
+ BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
+ blockMBB->addSuccessor(testMBB);
+
+ // Replace original instruction by the expected stack ptr
+ BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
+ .addReg(FinalStackPtr);
+
+ tailMBB->splice(tailMBB->end(), MBB,
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
+ MBB->addSuccessor(testMBB);
+
+ // Delete the original pseudo instruction.
+ MI.eraseFromParent();
+
+ // And we're done.
+ return tailMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+
+ assert(MF->shouldSplitStack());
+
+ const bool Is64Bit = Subtarget.is64Bit();
+ const bool IsLP64 = Subtarget.isTarget64BitLP64();
+
+ const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
+ const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
+
+ // BB:
+ // ... [Till the alloca]
+ // If stacklet is not large enough, jump to mallocMBB
+ //
+ // bumpMBB:
+ // Allocate by subtracting from RSP
+ // Jump to continueMBB
+ //
+ // mallocMBB:
+ // Allocate by call to runtime
+ //
+ // continueMBB:
+ // ...
+ // [rest of original BB]
+ //
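+ //
+ // Illustratively, the limit check added to BB below ends up roughly as
+ // (64-bit LP64 case, Intel-ish pseudo-asm; SPLimitVReg is the vreg holding
+ // rsp - size):
+ //   cmp qword ptr fs:[0x70], SPLimitVReg
+ //   jg  mallocMBB       ; stacklet limit above the new SP: call the runtime
+ //   ...                 ; otherwise fall through to bumpMBB
+ //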
+
+ MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetRegisterClass *AddrRegClass =
+ getRegClassFor(getPointerTy(MF->getDataLayout()));
+
+ Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+ bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+ tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
+ SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
+ sizeVReg = MI.getOperand(1).getReg(),
+ physSPReg =
+ IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
+
+ MachineFunction::iterator MBBIter = ++BB->getIterator();
+
+ MF->insert(MBBIter, bumpMBB);
+ MF->insert(MBBIter, mallocMBB);
+ MF->insert(MBBIter, continueMBB);
+
+ continueMBB->splice(continueMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ continueMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Add code to the main basic block to check if the stack limit has been hit,
+ // and if so, jump to mallocMBB otherwise to bumpMBB.
+ BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
+ BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
+ .addReg(tmpSPVReg).addReg(sizeVReg);
+ BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
+ .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
+ .addReg(SPLimitVReg);
+ BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
+
+ // bumpMBB simply decreases the stack pointer, since we know the current
+ // stacklet has enough space.
+ BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
+ .addReg(SPLimitVReg);
+ BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
+ .addReg(SPLimitVReg);
+ BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
+
+ // Calls into a routine in libgcc to allocate more space from the heap.
+ const uint32_t *RegMask =
+ Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
+ if (IsLP64) {
+ BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
+ .addReg(sizeVReg);
+ BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
+ .addExternalSymbol("__morestack_allocate_stack_space")
+ .addRegMask(RegMask)
+ .addReg(X86::RDI, RegState::Implicit)
+ .addReg(X86::RAX, RegState::ImplicitDefine);
+ } else if (Is64Bit) {
+ BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
+ .addReg(sizeVReg);
+ BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
+ .addExternalSymbol("__morestack_allocate_stack_space")
+ .addRegMask(RegMask)
+ .addReg(X86::EDI, RegState::Implicit)
+ .addReg(X86::EAX, RegState::ImplicitDefine);
+ } else {
+ BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
+ .addImm(12);
+ BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
+ BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
+ .addExternalSymbol("__morestack_allocate_stack_space")
+ .addRegMask(RegMask)
+ .addReg(X86::EAX, RegState::ImplicitDefine);
+ }
+
+ if (!Is64Bit)
+ BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
+ .addImm(16);
+
+ BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
+ .addReg(IsLP64 ? X86::RAX : X86::EAX);
+ BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
+
+ // Set up the CFG correctly.
+ BB->addSuccessor(bumpMBB);
+ BB->addSuccessor(mallocMBB);
+ mallocMBB->addSuccessor(continueMBB);
+ bumpMBB->addSuccessor(continueMBB);
+
+ // Take care of the PHI nodes.
+ BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
+ MI.getOperand(0).getReg())
+ .addReg(mallocPtrVReg)
+ .addMBB(mallocMBB)
+ .addReg(bumpSPPtrVReg)
+ .addMBB(bumpMBB);
+
+ // Delete the original pseudo instruction.
+ MI.eraseFromParent();
+
+ // And we're done.
+ return continueMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ assert(!isAsynchronousEHPersonality(
+ classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
+ "SEH does not use catchret!");
+
+ // Only 32-bit EH needs to worry about manually restoring stack pointers.
+ if (!Subtarget.is32Bit())
+ return BB;
+
+ // C++ EH creates a new target block to hold the restore code, and wires up
+ // the new block to the return destination with a normal JMP_4.
+ MachineBasicBlock *RestoreMBB =
+ MF->CreateMachineBasicBlock(BB->getBasicBlock());
+ assert(BB->succ_size() == 1);
+ MF->insert(std::next(BB->getIterator()), RestoreMBB);
+ RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
+ BB->addSuccessor(RestoreMBB);
+ MI.getOperand(0).setMBB(RestoreMBB);
+
+ // Marking this as an EH pad but not a funclet entry block causes PEI to
+ // restore stack pointers in the block.
+ RestoreMBB->setIsEHPad(true);
+
+ auto RestoreMBBI = RestoreMBB->begin();
+ BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
+ return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ // Here we replace TLSADDR with the sequence:
+ // adjust_stackdown -> TLSADDR -> adjust_stackup.
+ // We need this because TLSADDR is lowered into a call
+ // inside MC; without the two markers, shrink-wrapping
+ // may push the prologue/epilogue past them.
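+ //
+ // A rough MIR-level sketch of the result (illustrative; on ELF the TLSADDR
+ // pseudo is typically turned into a call to __tls_get_addr during MC
+ // lowering):
+ //   ADJCALLSTACKDOWN 0, 0, 0
+ //   TLSADDR ...
+ //   ADJCALLSTACKUP 0, 0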
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineFunction &MF = *BB->getParent();
+
+ // Emit CALLSEQ_START right before the instruction.
+ unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+ MachineInstrBuilder CallseqStart =
+ BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
+ BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
+
+ // Emit CALLSEQ_END right after the instruction.
+ // We don't call erase from parent because we want to keep the
+ // original instruction around.
+ unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+ MachineInstrBuilder CallseqEnd =
+ BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
+ BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
+
+ return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ // This is pretty easy. We're taking the value that we received from
+ // our load from the relocation, sticking it in either RDI (x86-64)
+ // or EAX and doing an indirect call. The return value will then
+ // be in the normal return register.
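+ //
+ // For the 64-bit Darwin case this roughly becomes (AT&T-ish, illustrative;
+ // "_var" stands for whatever global operand 3 refers to):
+ //   movq  _var@TLVP(%rip), %rdi
+ //   callq *(%rdi)
+ // with the thread-local address returned in %rax.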
+ MachineFunction *F = BB->getParent();
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
+ assert(MI.getOperand(3).isGlobal() && "This should be a global");
+
+ // Get a register mask for the lowered call.
+ // FIXME: The 32-bit calls have non-standard calling conventions. Use a
+ // proper register mask.
+ const uint32_t *RegMask =
+ Subtarget.is64Bit() ?
+ Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
+ Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
+ if (Subtarget.is64Bit()) {
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
+ .addReg(X86::RIP)
+ .addImm(0)
+ .addReg(0)
+ .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
+ MI.getOperand(3).getTargetFlags())
+ .addReg(0);
+ MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
+ addDirectMem(MIB, X86::RDI);
+ MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
+ } else if (!isPositionIndependent()) {
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
+ .addReg(0)
+ .addImm(0)
+ .addReg(0)
+ .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
+ MI.getOperand(3).getTargetFlags())
+ .addReg(0);
+ MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
+ addDirectMem(MIB, X86::EAX);
+ MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
+ } else {
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
+ .addReg(TII->getGlobalBaseReg(F))
+ .addImm(0)
+ .addReg(0)
+ .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
+ MI.getOperand(3).getTargetFlags())
+ .addReg(0);
+ MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
+ addDirectMem(MIB, X86::EAX);
+ MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
+ }
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
+ switch (RPOpc) {
+ case X86::INDIRECT_THUNK_CALL32:
+ return X86::CALLpcrel32;
+ case X86::INDIRECT_THUNK_CALL64:
+ return X86::CALL64pcrel32;
+ case X86::INDIRECT_THUNK_TCRETURN32:
+ return X86::TCRETURNdi;
+ case X86::INDIRECT_THUNK_TCRETURN64:
+ return X86::TCRETURNdi64;
+ }
+ llvm_unreachable("not indirect thunk opcode");
+}
+
+static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
+ unsigned Reg) {
+ if (Subtarget.useRetpolineExternalThunk()) {
+ // When using an external thunk for retpolines, we pick names that match the
+ // names GCC happens to use as well. This helps simplify the implementation
+ // of the thunks for kernels where they have no easy ability to create
+ // aliases and are doing non-trivial configuration of the thunk's body. For
+ // example, the Linux kernel will do boot-time hot patching of the thunk
+ // bodies and cannot easily export aliases of these to loaded modules.
+ //
+ // Note that at any point in the future, we may need to change the semantics
+ // of how we implement retpolines and at that time will likely change the
+ // name of the called thunk. Essentially, there is no hard guarantee that
+ // LLVM will generate calls to specific thunks; we merely make a best-effort
+ // attempt to help out kernels and other systems where duplicating the
+ // thunks is costly.
+ switch (Reg) {
+ case X86::EAX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__x86_indirect_thunk_eax";
+ case X86::ECX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__x86_indirect_thunk_ecx";
+ case X86::EDX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__x86_indirect_thunk_edx";
+ case X86::EDI:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__x86_indirect_thunk_edi";
+ case X86::R11:
+ assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
+ return "__x86_indirect_thunk_r11";
+ }
+ llvm_unreachable("unexpected reg for external indirect thunk");
+ }
+
+ if (Subtarget.useRetpolineIndirectCalls() ||
+ Subtarget.useRetpolineIndirectBranches()) {
+ // When targeting an internal COMDAT thunk use an LLVM-specific name.
+ switch (Reg) {
+ case X86::EAX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_eax";
+ case X86::ECX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_ecx";
+ case X86::EDX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_edx";
+ case X86::EDI:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_edi";
+ case X86::R11:
+ assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
+ return "__llvm_retpoline_r11";
+ }
+ llvm_unreachable("unexpected reg for retpoline");
+ }
+
+ if (Subtarget.useLVIControlFlowIntegrity()) {
+ assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
+ return "__llvm_lvi_thunk_r11";
+ }
+ llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ // Copy the virtual register into the R11 physical register and
+ // call the retpoline thunk.
+ const DebugLoc &DL = MI.getDebugLoc();
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
+ Register CalleeVReg = MI.getOperand(0).getReg();
+ unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
+
+ // Find an available scratch register to hold the callee. On 64-bit, we can
+ // just use R11, but we scan for uses anyway to ensure we don't generate
+ // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
+ // already a register use operand to the call to hold the callee. If none
+ // are available, use EDI instead. EDI is chosen because EBX is the PIC base
+ // register and ESI is the base pointer to realigned stack frames with VLAs.
+ SmallVector<unsigned, 3> AvailableRegs;
+ if (Subtarget.is64Bit())
+ AvailableRegs.push_back(X86::R11);
+ else
+ AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
+
+ // Zero out any registers that are already used.
+ for (const auto &MO : MI.operands()) {
+ if (MO.isReg() && MO.isUse())
+ for (unsigned &Reg : AvailableRegs)
+ if (Reg == MO.getReg())
+ Reg = 0;
+ }
+
+ // Choose the first remaining non-zero available register.
+ unsigned AvailableReg = 0;
+ for (unsigned MaybeReg : AvailableRegs) {
+ if (MaybeReg) {
+ AvailableReg = MaybeReg;
+ break;
+ }
+ }
+ if (!AvailableReg)
+ report_fatal_error("calling convention incompatible with retpoline, no "
+ "available registers");
+
+ const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
+
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
+ .addReg(CalleeVReg);
+ MI.getOperand(0).ChangeToES(Symbol);
+ MI.setDesc(TII->get(Opc));
+ MachineInstrBuilder(*BB->getParent(), &MI)
+ .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
+ return BB;
+}
+
+/// SetJmp implies future control flow change upon calling the corresponding
+/// LongJmp.
+/// Instead of using the 'return' instruction, the long jump fixes the stack and
+/// performs an indirect branch. To do so it uses the registers that were stored
+/// in the jump buffer (when calling SetJmp).
+/// In case the shadow stack is enabled we need to fix it as well, because some
+/// return addresses will be skipped.
+/// The function will save the SSP for future fixing in the function
+/// emitLongJmpShadowStackFix.
+/// \sa emitLongJmpShadowStackFix
+/// \param [in] MI The temporary Machine Instruction for the builtin.
+/// \param [in] MBB The Machine Basic Block that will be modified.
+void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineInstrBuilder MIB;
+
+ // Memory Reference.
+ SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
+ MI.memoperands_end());
+
+ // Initialize a register with zero.
+ MVT PVT = getPointerTy(MF->getDataLayout());
+ const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
+ Register ZReg = MRI.createVirtualRegister(PtrRC);
+ unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
+ BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
+ .addDef(ZReg)
+ .addReg(ZReg, RegState::Undef)
+ .addReg(ZReg, RegState::Undef);
+
+ // Read the current SSP Register value to the zeroed register.
+ Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
+ unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
+ BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
+
+ // Write the SSP register value to offset 3 in input memory buffer.
+ unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
+ MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
+ const int64_t SSPOffset = 3 * PVT.getStoreSize();
+ const unsigned MemOpndSlot = 1;
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
+ else
+ MIB.add(MI.getOperand(MemOpndSlot + i));
+ }
+ MIB.addReg(SSPCopyReg);
+ MIB.setMemRefs(MMOs);
+}
+
+MachineBasicBlock *
+X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator I = ++MBB->getIterator();
+
+ // Memory Reference
+ SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
+ MI.memoperands_end());
+
+ unsigned DstReg;
+ unsigned MemOpndSlot = 0;
+
+ unsigned CurOp = 0;
+
+ DstReg = MI.getOperand(CurOp++).getReg();
+ const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+ assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
+ (void)TRI;
+ Register mainDstReg = MRI.createVirtualRegister(RC);
+ Register restoreDstReg = MRI.createVirtualRegister(RC);
+
+ MemOpndSlot = CurOp;
+
+ MVT PVT = getPointerTy(MF->getDataLayout());
+ assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+ "Invalid Pointer Size!");
+
+ // For v = setjmp(buf), we generate
+ //
+ // thisMBB:
+ // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
+ // SjLjSetup restoreMBB
+ //
+ // mainMBB:
+ // v_main = 0
+ //
+ // sinkMBB:
+ // v = phi(main, restore)
+ //
+ // restoreMBB:
+ // if base pointer being used, load it from frame
+ // v_restore = 1
+
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, mainMBB);
+ MF->insert(I, sinkMBB);
+ MF->push_back(restoreMBB);
+ restoreMBB->setHasAddressTaken();
+
+ MachineInstrBuilder MIB;
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB,
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // thisMBB:
+ unsigned PtrStoreOpc = 0;
+ unsigned LabelReg = 0;
+ const int64_t LabelOffset = 1 * PVT.getStoreSize();
+ bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
+ !isPositionIndependent();
+
+ // Prepare IP either in reg or imm.
+ if (!UseImmLabel) {
+ PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
+ const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
+ LabelReg = MRI.createVirtualRegister(PtrRC);
+ if (Subtarget.is64Bit()) {
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
+ .addReg(X86::RIP)
+ .addImm(0)
+ .addReg(0)
+ .addMBB(restoreMBB)
+ .addReg(0);
+ } else {
+ const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
+ .addReg(XII->getGlobalBaseReg(MF))
+ .addImm(0)
+ .addReg(0)
+ .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
+ .addReg(0);
+ }
+ } else
+ PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
+ // Store IP
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
+ else
+ MIB.add(MI.getOperand(MemOpndSlot + i));
+ }
+ if (!UseImmLabel)
+ MIB.addReg(LabelReg);
+ else
+ MIB.addMBB(restoreMBB);
+ MIB.setMemRefs(MMOs);
+
+ if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
+ emitSetJmpShadowStackFix(MI, thisMBB);
+ }
+
+ // Setup
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
+ .addMBB(restoreMBB);
+
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ MIB.addRegMask(RegInfo->getNoPreservedMask());
+ thisMBB->addSuccessor(mainMBB);
+ thisMBB->addSuccessor(restoreMBB);
+
+ // mainMBB:
+ // EAX = 0
+ BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
+ mainMBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(X86::PHI), DstReg)
+ .addReg(mainDstReg).addMBB(mainMBB)
+ .addReg(restoreDstReg).addMBB(restoreMBB);
+
+ // restoreMBB:
+ if (RegInfo->hasBasePointer(*MF)) {
+ const bool Uses64BitFramePtr =
+ Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
+ X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+ X86FI->setRestoreBasePointer(MF);
+ Register FramePtr = RegInfo->getFrameRegister(*MF);
+ Register BasePtr = RegInfo->getBaseRegister();
+ unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
+ addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
+ FramePtr, true, X86FI->getRestoreBasePointerOffset())
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
+ BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
+ restoreMBB->addSuccessor(sinkMBB);
+
+ MI.eraseFromParent();
+ return sinkMBB;
+}
+
+/// Fix the shadow stack using the previously saved SSP pointer.
+/// \sa emitSetJmpShadowStackFix
+/// \param [in] MI The temporary Machine Instruction for the builtin.
+/// \param [in] MBB The Machine Basic Block that will be modified.
+/// \return The sink MBB that will perform the future indirect branch.
+MachineBasicBlock *
+X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ // Memory Reference
+ SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
+ MI.memoperands_end());
+
+ MVT PVT = getPointerTy(MF->getDataLayout());
+ const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
+
+ // checkSspMBB:
+ // xor vreg1, vreg1
+ // rdssp vreg1
+ // test vreg1, vreg1
+ // je sinkMBB # Jump if Shadow Stack is not supported
+ // fallMBB:
+ // mov buf+24/12(%rip), vreg2
+ // sub vreg1, vreg2
+ // jbe sinkMBB # No need to fix the Shadow Stack
+ // fixShadowMBB:
+ // shr 3/2, vreg2
+ // incssp vreg2 # fix the SSP according to the lower 8 bits
+ // shr 8, vreg2
+ // je sinkMBB
+ // fixShadowLoopPrepareMBB:
+ // shl vreg2
+ // mov 128, vreg3
+ // fixShadowLoopMBB:
+ // incssp vreg3
+ // dec vreg2
+ // jne fixShadowLoopMBB # Iterate until you finish fixing
+ // # the Shadow Stack
+ // sinkMBB:
+
+ MachineFunction::iterator I = ++MBB->getIterator();
+ const BasicBlock *BB = MBB->getBasicBlock();
+
+ MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, checkSspMBB);
+ MF->insert(I, fallMBB);
+ MF->insert(I, fixShadowMBB);
+ MF->insert(I, fixShadowLoopPrepareMBB);
+ MF->insert(I, fixShadowLoopMBB);
+ MF->insert(I, sinkMBB);
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
+ MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ MBB->addSuccessor(checkSspMBB);
+
+ // Initialize a register with zero.
+ Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
+
+ if (PVT == MVT::i64) {
+ Register TmpZReg = MRI.createVirtualRegister(PtrRC);
+ BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
+ .addImm(0)
+ .addReg(ZReg)
+ .addImm(X86::sub_32bit);
+ ZReg = TmpZReg;
+ }
+
+ // Read the current SSP Register value to the zeroed register.
+ Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
+ unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
+ BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
+
+ // Check whether the result of the SSP register is zero and jump directly
+ // to the sink.
+ unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
+ BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
+ .addReg(SSPCopyReg)
+ .addReg(SSPCopyReg);
+ BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
+ checkSspMBB->addSuccessor(sinkMBB);
+ checkSspMBB->addSuccessor(fallMBB);
+
+ // Reload the previously saved SSP register value.
+ Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
+ unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
+ const int64_t SPPOffset = 3 * PVT.getStoreSize();
+ MachineInstrBuilder MIB =
+ BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MO, SPPOffset);
+ else if (MO.isReg()) // Don't add the whole operand, we don't want to
+ // preserve kill flags.
+ MIB.addReg(MO.getReg());
+ else
+ MIB.add(MO);
+ }
+ MIB.setMemRefs(MMOs);
+
+ // Subtract the current SSP from the previous SSP.
+ Register SspSubReg = MRI.createVirtualRegister(PtrRC);
+ unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
+ BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
+ .addReg(PrevSSPReg)
+ .addReg(SSPCopyReg);
+
+ // Jump to sink in case PrevSSPReg <= SSPCopyReg.
+ BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
+ fallMBB->addSuccessor(sinkMBB);
+ fallMBB->addSuccessor(fixShadowMBB);
+
+ // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
+ unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
+ unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
+ Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
+ BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
+ .addReg(SspSubReg)
+ .addImm(Offset);
+
+ // Increase SSP when looking only on the lower 8 bits of the delta.
+ unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
+ BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
+
+ // Reset the lower 8 bits.
+ Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
+ BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
+ .addReg(SspFirstShrReg)
+ .addImm(8);
+
+ // Jump if the result of the shift is zero.
+ BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
+ fixShadowMBB->addSuccessor(sinkMBB);
+ fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
+
+ // Do a single shift left.
+ unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
+ Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
+ BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
+ .addReg(SspSecondShrReg);
+
+ // Save the value 128 to a register (will be used next with incssp).
+ Register Value128InReg = MRI.createVirtualRegister(PtrRC);
+ unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
+ BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
+ .addImm(128);
+ fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
+
+ // Since incssp only looks at the lower 8 bits, we might need to do several
+ // iterations of incssp until we finish fixing the shadow stack.
+ Register DecReg = MRI.createVirtualRegister(PtrRC);
+ Register CounterReg = MRI.createVirtualRegister(PtrRC);
+ BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
+ .addReg(SspAfterShlReg)
+ .addMBB(fixShadowLoopPrepareMBB)
+ .addReg(DecReg)
+ .addMBB(fixShadowLoopMBB);
+
+ // Every iteration we increase the SSP by 128.
+ BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
+
+ // Every iteration we decrement the counter by 1.
+ unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
+ BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
+
+ // Jump if the counter is not zero yet.
+ BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
+ fixShadowLoopMBB->addSuccessor(sinkMBB);
+ fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
+
+ return sinkMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ // Memory Reference
+ SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
+ MI.memoperands_end());
+
+ MVT PVT = getPointerTy(MF->getDataLayout());
+ assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+ "Invalid Pointer Size!");
+
+ const TargetRegisterClass *RC =
+ (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
+ Register Tmp = MRI.createVirtualRegister(RC);
+ // Since FP is only updated here but NOT referenced, it's treated as GPR.
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
+ Register SP = RegInfo->getStackRegister();
+
+ MachineInstrBuilder MIB;
+
+ const int64_t LabelOffset = 1 * PVT.getStoreSize();
+ const int64_t SPOffset = 2 * PVT.getStoreSize();
+
+ unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
+ unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
+
+ MachineBasicBlock *thisMBB = MBB;
+
+ // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
+ if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
+ thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
+ }
+
+ // Reload FP
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (MO.isReg()) // Don't add the whole operand, we don't want to
+ // preserve kill flags.
+ MIB.addReg(MO.getReg());
+ else
+ MIB.add(MO);
+ }
+ MIB.setMemRefs(MMOs);
+
+ // Reload IP
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MO, LabelOffset);
+ else if (MO.isReg()) // Don't add the whole operand, we don't want to
+ // preserve kill flags.
+ MIB.addReg(MO.getReg());
+ else
+ MIB.add(MO);
+ }
+ MIB.setMemRefs(MMOs);
+
+ // Reload SP
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MI.getOperand(i), SPOffset);
+ else
+ MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
+ // the last instruction of the expansion.
+ }
+ MIB.setMemRefs(MMOs);
+
+ // Jump
+ BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
+
+ MI.eraseFromParent();
+ return thisMBB;
+}
+
+void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock *DispatchBB,
+ int FI) const {
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
+
+ MVT PVT = getPointerTy(MF->getDataLayout());
+ assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
+
+ unsigned Op = 0;
+ unsigned VR = 0;
+
+ bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
+ !isPositionIndependent();
+
+ if (UseImmLabel) {
+ Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
+ } else {
+ const TargetRegisterClass *TRC =
+ (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
+ VR = MRI->createVirtualRegister(TRC);
+ Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
+
+ if (Subtarget.is64Bit())
+ BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
+ .addReg(X86::RIP)
+ .addImm(1)
+ .addReg(0)
+ .addMBB(DispatchBB)
+ .addReg(0);
+ else
+ BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
+ .addReg(0) /* TII->getGlobalBaseReg(MF) */
+ .addImm(1)
+ .addReg(0)
+ .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
+ .addReg(0);
+ }
+
+ MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
+ addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
+ if (UseImmLabel)
+ MIB.addMBB(DispatchBB);
+ else
+ MIB.addReg(VR);
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
+ int FI = MF->getFrameInfo().getFunctionContextIndex();
+
+ // Get a mapping of the call site numbers to all of the landing pads they're
+ // associated with.
+ DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
+ unsigned MaxCSNum = 0;
+ for (auto &MBB : *MF) {
+ if (!MBB.isEHPad())
+ continue;
+
+ MCSymbol *Sym = nullptr;
+ for (const auto &MI : MBB) {
+ if (MI.isDebugInstr())
+ continue;
+
+ assert(MI.isEHLabel() && "expected EH_LABEL");
+ Sym = MI.getOperand(0).getMCSymbol();
+ break;
+ }
+
+ if (!MF->hasCallSiteLandingPad(Sym))
+ continue;
+
+ for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
+ CallSiteNumToLPad[CSI].push_back(&MBB);
+ MaxCSNum = std::max(MaxCSNum, CSI);
+ }
+ }
+
+ // Get an ordered list of the machine basic blocks for the jump table.
+ std::vector<MachineBasicBlock *> LPadList;
+ SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
+ LPadList.reserve(CallSiteNumToLPad.size());
+
+ for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
+ for (auto &LP : CallSiteNumToLPad[CSI]) {
+ LPadList.push_back(LP);
+ InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
+ }
+ }
+
+ assert(!LPadList.empty() &&
+ "No landing pad destinations for the dispatch jump table!");
+
+ // Create the MBBs for the dispatch code.
+
+ // Shove the dispatch's address into the return slot in the function context.
+ MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
+ DispatchBB->setIsEHPad(true);
+
+ MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
+ BuildMI(TrapBB, DL, TII->get(X86::TRAP));
+ DispatchBB->addSuccessor(TrapBB);
+
+ MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
+ DispatchBB->addSuccessor(DispContBB);
+
+ // Insert MBBs.
+ MF->push_back(DispatchBB);
+ MF->push_back(DispContBB);
+ MF->push_back(TrapBB);
+
+ // Insert code into the entry block that creates and registers the function
+ // context.
+ SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
+
+ // Create the jump table and associated information
+ unsigned JTE = getJumpTableEncoding();
+ MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
+ unsigned MJTI = JTI->createJumpTableIndex(LPadList);
+
+ const X86RegisterInfo &RI = TII->getRegisterInfo();
+ // Add a register mask with no preserved registers. This results in all
+ // registers being marked as clobbered.
+ if (RI.hasBasePointer(*MF)) {
+ const bool FPIs64Bit =
+ Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
+ X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
+ MFI->setRestoreBasePointer(MF);
+
+ Register FP = RI.getFrameRegister(*MF);
+ Register BP = RI.getBaseRegister();
+ unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
+ addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
+ MFI->getRestoreBasePointerOffset())
+ .addRegMask(RI.getNoPreservedMask());
+ } else {
+ BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
+ .addRegMask(RI.getNoPreservedMask());
+ }
+
+ // IReg is used as an index in a memory operand and therefore can't be SP
+ Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
+ addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
+ Subtarget.is64Bit() ? 8 : 4);
+ BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
+ .addReg(IReg)
+ .addImm(LPadList.size());
+ BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
+
+ if (Subtarget.is64Bit()) {
+ Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
+
+ // leaq .LJTI0_0(%rip), BReg
+ BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
+ .addReg(X86::RIP)
+ .addImm(1)
+ .addReg(0)
+ .addJumpTableIndex(MJTI)
+ .addReg(0);
+ // movzx IReg64, IReg
+ BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
+ .addImm(0)
+ .addReg(IReg)
+ .addImm(X86::sub_32bit);
+
+ switch (JTE) {
+ case MachineJumpTableInfo::EK_BlockAddress:
+ // jmpq *(BReg,IReg64,8)
+ BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
+ .addReg(BReg)
+ .addImm(8)
+ .addReg(IReg64)
+ .addImm(0)
+ .addReg(0);
+ break;
+ case MachineJumpTableInfo::EK_LabelDifference32: {
+ Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+ Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
+ Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+
+ // movl (BReg,IReg64,4), OReg
+ BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
+ .addReg(BReg)
+ .addImm(4)
+ .addReg(IReg64)
+ .addImm(0)
+ .addReg(0);
+ // movsx OReg64, OReg
+ BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
+ // addq BReg, OReg64, TReg
+ BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
+ .addReg(OReg64)
+ .addReg(BReg);
+ // jmpq *TReg
+ BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
+ break;
+ }
+ default:
+ llvm_unreachable("Unexpected jump table encoding");
+ }
+ } else {
+ // jmpl *.LJTI0_0(,IReg,4)
+ BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
+ .addReg(0)
+ .addImm(4)
+ .addReg(IReg)
+ .addJumpTableIndex(MJTI)
+ .addReg(0);
+ }
+
+ // Add the jump table entries as successors to the MBB.
+ SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
+ for (auto &LP : LPadList)
+ if (SeenMBBs.insert(LP).second)
+ DispContBB->addSuccessor(LP);
+
+ // N.B. the order the invoke BBs are processed in doesn't matter here.
+ SmallVector<MachineBasicBlock *, 64> MBBLPads;
+ const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
+ for (MachineBasicBlock *MBB : InvokeBBs) {
+ // Remove the landing pad successor from the invoke block and replace it
+ // with the new dispatch block.
+ // Keep a copy of Successors since it's modified inside the loop.
+ SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
+ MBB->succ_rend());
+ // FIXME: Avoid quadratic complexity.
+ for (auto MBBS : Successors) {
+ if (MBBS->isEHPad()) {
+ MBB->removeSuccessor(MBBS);
+ MBBLPads.push_back(MBBS);
+ }
+ }
+
+ MBB->addSuccessor(DispatchBB);
+
+ // Find the invoke call and mark all of the callee-saved registers as
+ // 'implicit defined' so that they're spilled. This prevents instructions
+ // from being moved to before the EH block, where they would never be
+ // executed.
+ for (auto &II : reverse(*MBB)) {
+ if (!II.isCall())
+ continue;
+
+ DenseMap<unsigned, bool> DefRegs;
+ for (auto &MOp : II.operands())
+ if (MOp.isReg())
+ DefRegs[MOp.getReg()] = true;
+
+ MachineInstrBuilder MIB(*MF, &II);
+ for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
+ unsigned Reg = SavedRegs[RegIdx];
+ if (!DefRegs[Reg])
+ MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
+ }
+
+ break;
+ }
+ }
+
+ // Mark all former landing pads as non-landing pads. The dispatch is the only
+ // landing pad now.
+ for (auto &LP : MBBLPads)
+ LP->setIsEHPad(false);
+
+ // The instruction is gone now.
+ MI.eraseFromParent();
+ return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ auto TMMImmToTMMReg = [](unsigned Imm) {
+ assert (Imm < 8 && "Illegal tmm index");
+ return X86::TMM0 + Imm;
+ };
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unexpected instr type to insert");
+ case X86::TLS_addr32:
+ case X86::TLS_addr64:
+ case X86::TLS_addrX32:
+ case X86::TLS_base_addr32:
+ case X86::TLS_base_addr64:
+ case X86::TLS_base_addrX32:
+ return EmitLoweredTLSAddr(MI, BB);
+ case X86::INDIRECT_THUNK_CALL32:
+ case X86::INDIRECT_THUNK_CALL64:
+ case X86::INDIRECT_THUNK_TCRETURN32:
+ case X86::INDIRECT_THUNK_TCRETURN64:
+ return EmitLoweredIndirectThunk(MI, BB);
+ case X86::CATCHRET:
+ return EmitLoweredCatchRet(MI, BB);
+ case X86::SEG_ALLOCA_32:
+ case X86::SEG_ALLOCA_64:
+ return EmitLoweredSegAlloca(MI, BB);
+ case X86::PROBED_ALLOCA_32:
+ case X86::PROBED_ALLOCA_64:
+ return EmitLoweredProbedAlloca(MI, BB);
+ case X86::TLSCall_32:
+ case X86::TLSCall_64:
+ return EmitLoweredTLSCall(MI, BB);
+ case X86::CMOV_FR32:
+ case X86::CMOV_FR32X:
+ case X86::CMOV_FR64:
+ case X86::CMOV_FR64X:
+ case X86::CMOV_GR8:
+ case X86::CMOV_GR16:
+ case X86::CMOV_GR32:
+ case X86::CMOV_RFP32:
+ case X86::CMOV_RFP64:
+ case X86::CMOV_RFP80:
+ case X86::CMOV_VR64:
+ case X86::CMOV_VR128:
+ case X86::CMOV_VR128X:
+ case X86::CMOV_VR256:
+ case X86::CMOV_VR256X:
+ case X86::CMOV_VR512:
+ case X86::CMOV_VK1:
+ case X86::CMOV_VK2:
+ case X86::CMOV_VK4:
+ case X86::CMOV_VK8:
+ case X86::CMOV_VK16:
+ case X86::CMOV_VK32:
+ case X86::CMOV_VK64:
+ return EmitLoweredSelect(MI, BB);
+
+ case X86::RDFLAGS32:
+ case X86::RDFLAGS64: {
+ unsigned PushF =
+ MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
+ unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
+ MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
+ // Permit reads of the EFLAGS and DF registers without them being defined.
+ // This intrinsic exists to read external processor state in flags, such as
+ // the trap flag, interrupt flag, and direction flag, none of which are
+ // modeled by the backend.
+ assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
+ "Unexpected register in operand!");
+ Push->getOperand(2).setIsUndef();
+ assert(Push->getOperand(3).getReg() == X86::DF &&
+ "Unexpected register in operand!");
+ Push->getOperand(3).setIsUndef();
+ BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
+
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
+
+ case X86::WRFLAGS32:
+ case X86::WRFLAGS64: {
+ unsigned Push =
+ MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
+ unsigned PopF =
+ MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
+ BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
+ BuildMI(*BB, MI, DL, TII->get(PopF));
+
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
+
+ case X86::FP32_TO_INT16_IN_MEM:
+ case X86::FP32_TO_INT32_IN_MEM:
+ case X86::FP32_TO_INT64_IN_MEM:
+ case X86::FP64_TO_INT16_IN_MEM:
+ case X86::FP64_TO_INT32_IN_MEM:
+ case X86::FP64_TO_INT64_IN_MEM:
+ case X86::FP80_TO_INT16_IN_MEM:
+ case X86::FP80_TO_INT32_IN_MEM:
+ case X86::FP80_TO_INT64_IN_MEM: {
+ // Change the floating point control register to use "round towards zero"
+ // mode when truncating to an integer value.
+ int OrigCWFrameIdx =
+ MF->getFrameInfo().CreateStackObject(2, Align(2), false);
+ addFrameReference(BuildMI(*BB, MI, DL,
+ TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
+
+ // Load the old value of the control word...
+ Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
+ addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
+ OrigCWFrameIdx);
+
+ // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
+ Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
+ .addReg(OldCW, RegState::Kill).addImm(0xC00);
+
+ // Extract to 16 bits.
+ Register NewCW16 =
+ MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
+ .addReg(NewCW, RegState::Kill, X86::sub_16bit);
+
+ // Prepare memory for FLDCW.
+ int NewCWFrameIdx =
+ MF->getFrameInfo().CreateStackObject(2, Align(2), false);
+ addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
+ NewCWFrameIdx)
+ .addReg(NewCW16, RegState::Kill);
+
+ // Reload the modified control word now...
+ addFrameReference(BuildMI(*BB, MI, DL,
+ TII->get(X86::FLDCW16m)), NewCWFrameIdx);
+
+ // Get the X86 opcode to use.
+ unsigned Opc;
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("illegal opcode!");
+ case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
+ case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
+ case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
+ case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
+ case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
+ case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
+ case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
+ case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
+ case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
+ }
+
+ X86AddressMode AM = getAddressFromInstr(&MI, 0);
+ addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
+ .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
+
+ // Reload the original control word now.
+ addFrameReference(BuildMI(*BB, MI, DL,
+ TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+ }
+
+ // xbegin
+ case X86::XBEGIN:
+ return emitXBegin(MI, BB, Subtarget.getInstrInfo());
+
+ case X86::VASTART_SAVE_XMM_REGS:
+ return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
+
+ case X86::VAARG_64:
+ case X86::VAARG_X32:
+ return EmitVAARGWithCustomInserter(MI, BB);
+
+ case X86::EH_SjLj_SetJmp32:
+ case X86::EH_SjLj_SetJmp64:
+ return emitEHSjLjSetJmp(MI, BB);
+
+ case X86::EH_SjLj_LongJmp32:
+ case X86::EH_SjLj_LongJmp64:
+ return emitEHSjLjLongJmp(MI, BB);
+
+ case X86::Int_eh_sjlj_setup_dispatch:
+ return EmitSjLjDispatchBlock(MI, BB);
+
+ case TargetOpcode::STATEPOINT:
+ // As an implementation detail, STATEPOINT shares the STACKMAP format at
+ // this point in the process. We diverge later.
+ return emitPatchPoint(MI, BB);
+
+ case TargetOpcode::STACKMAP:
+ case TargetOpcode::PATCHPOINT:
+ return emitPatchPoint(MI, BB);
+
+ case TargetOpcode::PATCHABLE_EVENT_CALL:
+ case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
+ return BB;
+
+ case X86::LCMPXCHG8B: {
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ // In addition to the four E[ABCD] registers implied by the encoding,
+ // CMPXCHG8B requires a memory operand. If the current architecture is
+ // i686 and the current function needs a base pointer
+ // - which is ESI on i686 - the register allocator would not be able to
+ // allocate registers for an address of the form X(%reg, %reg, Y):
+ // there would never be enough unreserved registers during regalloc
+ // (without the need for a base pointer the only option would be
+ // X(%edi, %esi, Y)). We give the register allocator a hand by precomputing
+ // the address in a new vreg using LEA.
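+ //
+ // Illustratively (AT&T-ish), instead of
+ //   cmpxchg8b disp(%esi,%idx,scale)
+ // we end up with
+ //   leal disp(%esi,%idx,scale), %vreg
+ //   cmpxchg8b (%vreg)
+ // so regalloc only has to find a single register for the address.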
+
+ // If it is not i686 or there is no base pointer - nothing to do here.
+ if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
+ return BB;
+
+ // Even though this code does not necessarily need the base pointer to
+ // be ESI, we check for that. The reason: if this assert fails, some
+ // changes have happened in the compiler's base pointer handling, which
+ // most probably have to be addressed somehow here.
+ assert(TRI->getBaseRegister() == X86::ESI &&
+ "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
+ "base pointer in mind");
+
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MVT SPTy = getPointerTy(MF->getDataLayout());
+ const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
+ Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
+
+ X86AddressMode AM = getAddressFromInstr(&MI, 0);
+ // Regalloc does not need any help when the memory operand of CMPXCHG8B
+ // does not use an index register.
+ if (AM.IndexReg == X86::NoRegister)
+ return BB;
+
+ // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
+ // four operand definitions that are E[ABCD] registers. We skip them and
+ // then insert the LEA.
+ MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
+ while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
+ RMBBI->definesRegister(X86::EBX) ||
+ RMBBI->definesRegister(X86::ECX) ||
+ RMBBI->definesRegister(X86::EDX))) {
+ ++RMBBI;
+ }
+ MachineBasicBlock::iterator MBBI(RMBBI);
+ addFullAddress(
+ BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
+
+ setDirectAddressInInstr(&MI, 0, computedAddrVReg);
+
+ return BB;
+ }
+ case X86::LCMPXCHG16B_NO_RBX: {
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ Register BasePtr = TRI->getBaseRegister();
+ if (TRI->hasBasePointer(*MF) &&
+ (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
+ if (!BB->isLiveIn(BasePtr))
+ BB->addLiveIn(BasePtr);
+ // Save RBX into a virtual register.
+ Register SaveRBX =
+ MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
+ .addReg(X86::RBX);
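+ // Emit the SAVE_RBX pseudo: the memory operands, the value destined for
+ // RBX, and the saved copy used to restore the base pointer afterwards.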
+ Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
+ for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
+ MIB.add(MI.getOperand(Idx));
+ MIB.add(MI.getOperand(X86::AddrNumOperands));
+ MIB.addReg(SaveRBX);
+ } else {
+ // Simple case, just copy the virtual register to RBX.
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
+ .add(MI.getOperand(X86::AddrNumOperands));
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
+ for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
+ MIB.add(MI.getOperand(Idx));
+ }
+ MI.eraseFromParent();
+ return BB;
+ }
+ case X86::MWAITX: {
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ Register BasePtr = TRI->getBaseRegister();
+ bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
+ // If there is no need to save the base pointer, we generate MWAITXrrr;
+ // otherwise we generate the pseudo MWAITX_SAVE_RBX.
+ if (!IsRBX || !TRI->hasBasePointer(*MF)) {
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
+ .addReg(MI.getOperand(0).getReg());
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
+ .addReg(MI.getOperand(1).getReg());
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
+ .addReg(MI.getOperand(2).getReg());
+ BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
+ MI.eraseFromParent();
+ } else {
+ if (!BB->isLiveIn(BasePtr)) {
+ BB->addLiveIn(BasePtr);
+ }
+ // Parameters can be copied into ECX and EAX but not EBX yet.
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
+ .addReg(MI.getOperand(0).getReg());
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
+ .addReg(MI.getOperand(1).getReg());
+ assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
+ // Save RBX into a virtual register.
+ Register SaveRBX =
+ MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
+ .addReg(X86::RBX);
+ // Generate mwaitx pseudo.
+ Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
+ BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
+ .addDef(Dst) // Destination tied in with SaveRBX.
+ .addReg(MI.getOperand(2).getReg()) // input value of EBX.
+ .addUse(SaveRBX); // Save of base pointer.
+ MI.eraseFromParent();
+ }
+ return BB;
+ }
+ case TargetOpcode::PREALLOCATED_SETUP: {
+ assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
+ auto MFI = MF->getInfo<X86MachineFunctionInfo>();
+ MFI->setHasPreallocatedCall(true);
+ int64_t PreallocatedId = MI.getOperand(0).getImm();
+ size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
+ assert(StackAdjustment != 0 && "0 stack adjustment");
+ LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
+ << StackAdjustment << "\n");
+ BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
+ .addReg(X86::ESP)
+ .addImm(StackAdjustment);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case TargetOpcode::PREALLOCATED_ARG: {
+ assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
+ int64_t PreallocatedId = MI.getOperand(1).getImm();
+ int64_t ArgIdx = MI.getOperand(2).getImm();
+ auto MFI = MF->getInfo<X86MachineFunctionInfo>();
+ size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
+ LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
+ << ", arg offset " << ArgOffset << "\n");
+ // stack pointer + offset
+ addRegOffset(
+ BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
+ X86::ESP, false, ArgOffset);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case X86::PTDPBSSD:
+ case X86::PTDPBSUD:
+ case X86::PTDPBUSD:
+ case X86::PTDPBUUD:
+ case X86::PTDPBF16PS: {
+ unsigned Opc;
+ switch (MI.getOpcode()) {
+ case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
+ case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
+ case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
+ case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
+ case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
+ }
+
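+ // Translate the pseudo's immediate tile indices into physical TMM registers.
+ // The first tile is both the destination and the accumulator source.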
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
+
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
+ case X86::PTILEZERO: {
+ unsigned Imm = MI.getOperand(0).getImm();
+ BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
+ case X86::PTILELOADD:
+ case X86::PTILELOADDT1:
+ case X86::PTILESTORED: {
+ unsigned Opc;
+ switch (MI.getOpcode()) {
+ case X86::PTILELOADD: Opc = X86::TILELOADD; break;
+ case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
+ case X86::PTILESTORED: Opc = X86::TILESTORED; break;
+ }
+
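+ // For loads the leading tile immediate becomes the TMM destination; for
+ // stores the trailing tile immediate becomes the TMM source. The memory
+ // operands are forwarded unchanged.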
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
+ unsigned CurOp = 0;
+ if (Opc != X86::TILESTORED)
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
+ RegState::Define);
+
+ MIB.add(MI.getOperand(CurOp++)); // base
+ MIB.add(MI.getOperand(CurOp++)); // scale
+ MIB.add(MI.getOperand(CurOp++)); // index -- stride
+ MIB.add(MI.getOperand(CurOp++)); // displacement
+ MIB.add(MI.getOperand(CurOp++)); // segment
+
+ if (Opc == X86::TILESTORED)
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
+ RegState::Undef);
+
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+bool
+X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
+ const APInt &DemandedBits,
+ const APInt &DemandedElts,
+ TargetLoweringOpt &TLO) const {
+ EVT VT = Op.getValueType();
+ unsigned Opcode = Op.getOpcode();
+ unsigned EltSize = VT.getScalarSizeInBits();
+
+ if (VT.isVector()) {
+ // If the constant is all sign bits within the active bits, then we should
+ // sign-extend it to the entire constant to allow it to act as a boolean
+ // constant vector.
+ auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
+ if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
+ return false;
+ for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
+ if (!DemandedElts[i] || V.getOperand(i).isUndef())
+ continue;
+ const APInt &Val = V.getConstantOperandAPInt(i);
+ if (Val.getBitWidth() > Val.getNumSignBits() &&
+ Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
+ return true;
+ }
+ return false;
+ };
+ // For vectors - if we have a constant, then try to sign extend.
+ // TODO: Handle AND/ANDN cases.
+ unsigned ActiveBits = DemandedBits.getActiveBits();
+ if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
+ (Opcode == ISD::OR || Opcode == ISD::XOR) &&
+ NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
+ EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
+ EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
+ VT.getVectorNumElements());
+ SDValue NewC =
+ TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
+ Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
+ SDValue NewOp =
+ TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ return false;
+ }
+
+ // Only optimize Ands to prevent shrinking a constant that could be
+ // matched by movzx.
+ if (Opcode != ISD::AND)
+ return false;
+
+ // Make sure the RHS really is a constant.
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (!C)
+ return false;
+
+ const APInt &Mask = C->getAPIntValue();
+
+ // Clear all non-demanded bits initially.
+ APInt ShrunkMask = Mask & DemandedBits;
+
+ // Find the width of the shrunk mask.
+ unsigned Width = ShrunkMask.getActiveBits();
+
+ // If the mask is all 0s there's nothing to do here.
+ if (Width == 0)
+ return false;
+
+ // Find the next power of 2 width, rounding up to a byte.
+ Width = PowerOf2Ceil(std::max(Width, 8U));
+ // Truncate the width to size to handle illegal types.
+ Width = std::min(Width, EltSize);
+
+ // Calculate a possible zero extend mask for this constant.
+ APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
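+ // E.g. with DemandedBits = 0xFF, an AND mask of 0x1FF shrinks to the
+ // zero-extend mask 0xFF, which can still be matched by movzx.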
+
+ // If we aren't changing the mask, just return true to keep it and prevent
+ // the caller from optimizing.
+ if (ZeroExtendMask == Mask)
+ return true;
+
+ // Make sure the new mask can be represented by a combination of mask bits
+ // and non-demanded bits.
+ if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
+ return false;
+
+ // Replace the constant with the zero extend mask.
+ SDLoc DL(Op);
+ SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
+ SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
+ return TLO.CombineTo(Op, NewOp);
+}
+
+void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+ KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ unsigned BitWidth = Known.getBitWidth();
+ unsigned NumElts = DemandedElts.getBitWidth();
+ unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
+ assert((Opc >= ISD::BUILTIN_OP_END ||
+ Opc == ISD::INTRINSIC_WO_CHAIN ||
+ Opc == ISD::INTRINSIC_W_CHAIN ||
+ Opc == ISD::INTRINSIC_VOID) &&
+ "Should use MaskedValueIsZero if you don't know whether Op"
+ " is a target node!");
+
+ Known.resetAll();
+ switch (Opc) {
+ default: break;
+ case X86ISD::SETCC:
+ Known.Zero.setBitsFrom(1);
+ break;
+ case X86ISD::MOVMSK: {
+ unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
+ Known.Zero.setBitsFrom(NumLoBits);
+ break;
+ }
+ case X86ISD::PEXTRB:
+ case X86ISD::PEXTRW: {
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
+ Op.getConstantOperandVal(1));
+ Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
+ Known = Known.anyextOrTrunc(BitWidth);
+ Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
+ break;
+ }
+ case X86ISD::VSRAI:
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI: {
+ unsigned ShAmt = Op.getConstantOperandVal(1);
+ if (ShAmt >= VT.getScalarSizeInBits()) {
+ Known.setAllZero();
+ break;
+ }
+
+ Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ if (Opc == X86ISD::VSHLI) {
+ Known.Zero <<= ShAmt;
+ Known.One <<= ShAmt;
+ // Low bits are known zero.
+ Known.Zero.setLowBits(ShAmt);
+ } else if (Opc == X86ISD::VSRLI) {
+ Known.Zero.lshrInPlace(ShAmt);
+ Known.One.lshrInPlace(ShAmt);
+ // High bits are known zero.
+ Known.Zero.setHighBits(ShAmt);
+ } else {
+ Known.Zero.ashrInPlace(ShAmt);
+ Known.One.ashrInPlace(ShAmt);
+ }
+ break;
+ }
+ case X86ISD::PACKUS: {
+ // PACKUS is just a truncation if the upper half is zero.
+ APInt DemandedLHS, DemandedRHS;
+ getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
+
+ Known.One = APInt::getAllOnesValue(BitWidth * 2);
+ Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
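+ // Seed with an all-known state over the wider source elements; the
+ // intersections below narrow it to the bits common to all demanded inputs.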
+
+ KnownBits Known2;
+ if (!!DemandedLHS) {
+ Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
+ Known = KnownBits::commonBits(Known, Known2);
+ }
+ if (!!DemandedRHS) {
+ Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
+ Known = KnownBits::commonBits(Known, Known2);
+ }
+
+ if (Known.countMinLeadingZeros() < BitWidth)
+ Known.resetAll();
+ Known = Known.trunc(BitWidth);
+ break;
+ }
+ case X86ISD::ANDNP: {
+ KnownBits Known2;
+ Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+
+ // ANDNP = (~X & Y);
+ Known.One &= Known2.Zero;
+ Known.Zero |= Known2.One;
+ break;
+ }
+ case X86ISD::FOR: {
+ KnownBits Known2;
+ Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+
+ Known |= Known2;
+ break;
+ }
+ case X86ISD::PSADBW: {
+ assert(VT.getScalarType() == MVT::i64 &&
+ Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
+ "Unexpected PSADBW types");
+
+ // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
+ Known.Zero.setBitsFrom(16);
+ break;
+ }
+ case X86ISD::CMOV: {
+ Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
+ // If we don't know any bits, early out.
+ if (Known.isUnknown())
+ break;
+ KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
+
+ // Only known if known in both the LHS and RHS.
+ Known = KnownBits::commonBits(Known, Known2);
+ break;
+ }
+ case X86ISD::BEXTR:
+ case X86ISD::BEXTRI: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
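+ // The control operand encodes the start bit in bits [7:0] and the extract
+ // length in bits [15:8].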
+ if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
+ unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
+ unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
+
+ // If the length is 0, the result is 0.
+ if (Length == 0) {
+ Known.setAllZero();
+ break;
+ }
+
+ if ((Shift + Length) <= BitWidth) {
+ Known = DAG.computeKnownBits(Op0, Depth + 1);
+ Known = Known.extractBits(Length, Shift);
+ Known = Known.zextOrTrunc(BitWidth);
+ }
+ }
+ break;
+ }
+ case X86ISD::PDEP: {
+ KnownBits Known2;
+ Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ // Zeros are retained from the mask operand, but ones are not.
+ Known.One.clearAllBits();
+ // The result will have at least as many trailing zeros as the non-mask
+ // operand since bits can only map to the same or higher bit position.
+ Known.Zero.setLowBits(Known2.countMinTrailingZeros());
+ break;
+ }
+ case X86ISD::PEXT: {
+ Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ // The result has at least as many leading zeros as the number of known
+ // zero bits in the mask.
+ unsigned Count = Known.Zero.countPopulation();
+ Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
+ Known.One.clearAllBits();
+ break;
+ }
+ case X86ISD::VTRUNC:
+ case X86ISD::VTRUNCS:
+ case X86ISD::VTRUNCUS:
+ case X86ISD::CVTSI2P:
+ case X86ISD::CVTUI2P:
+ case X86ISD::CVTP2SI:
+ case X86ISD::CVTP2UI:
+ case X86ISD::MCVTP2SI:
+ case X86ISD::MCVTP2UI:
+ case X86ISD::CVTTP2SI:
+ case X86ISD::CVTTP2UI:
+ case X86ISD::MCVTTP2SI:
+ case X86ISD::MCVTTP2UI:
+ case X86ISD::MCVTSI2P:
+ case X86ISD::MCVTUI2P:
+ case X86ISD::VFPROUND:
+ case X86ISD::VMFPROUND:
+ case X86ISD::CVTPS2PH:
+ case X86ISD::MCVTPS2PH: {
+ // Truncations/Conversions - upper elements are known zero.
+ EVT SrcVT = Op.getOperand(0).getValueType();
+ if (SrcVT.isVector()) {
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ if (NumElts > NumSrcElts &&
+ DemandedElts.countTrailingZeros() >= NumSrcElts)
+ Known.setAllZero();
+ }
+ break;
+ }
+ case X86ISD::STRICT_CVTTP2SI:
+ case X86ISD::STRICT_CVTTP2UI:
+ case X86ISD::STRICT_CVTSI2P:
+ case X86ISD::STRICT_CVTUI2P:
+ case X86ISD::STRICT_VFPROUND:
+ case X86ISD::STRICT_CVTPS2PH: {
+ // Strict Conversions - upper elements are known zero.
+ EVT SrcVT = Op.getOperand(1).getValueType();
+ if (SrcVT.isVector()) {
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ if (NumElts > NumSrcElts &&
+ DemandedElts.countTrailingZeros() >= NumSrcElts)
+ Known.setAllZero();
+ }
+ break;
+ }
+ case X86ISD::MOVQ2DQ: {
+ // Move from MMX to XMM. Upper half of XMM should be 0.
+ if (DemandedElts.countTrailingZeros() >= (NumElts / 2))
+ Known.setAllZero();
+ break;
+ }
+ }
+
+ // Handle target shuffles.
+ // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
+ if (isTargetShuffle(Opc)) {
+ bool IsUnary;
+ SmallVector<int, 64> Mask;
+ SmallVector<SDValue, 2> Ops;
+ if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
+ IsUnary)) {
+ unsigned NumOps = Ops.size();
+ unsigned NumElts = VT.getVectorNumElements();
+ if (Mask.size() == NumElts) {
+ SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
+ Known.Zero.setAllBits(); Known.One.setAllBits();
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (!DemandedElts[i])
+ continue;
+ int M = Mask[i];
+ if (M == SM_SentinelUndef) {
+ // For UNDEF elements, we don't know anything about the common state
+ // of the shuffle result.
+ Known.resetAll();
+ break;
+ } else if (M == SM_SentinelZero) {
+ Known.One.clearAllBits();
+ continue;
+ }
+ assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
+ "Shuffle index out of range");
+
+ unsigned OpIdx = (unsigned)M / NumElts;
+ unsigned EltIdx = (unsigned)M % NumElts;
+ if (Ops[OpIdx].getValueType() != VT) {
+ // TODO - handle target shuffle ops with different value types.
+ Known.resetAll();
+ break;
+ }
+ DemandedOps[OpIdx].setBit(EltIdx);
+ }
+ // Known bits are the values that are shared by every demanded element.
+ for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
+ if (!DemandedOps[i])
+ continue;
+ KnownBits Known2 =
+ DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
+ Known = KnownBits::commonBits(Known, Known2);
+ }
+ }
+ }
+ }
+}
+
+unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
+ SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+ unsigned Depth) const {
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getScalarSizeInBits();
+ unsigned Opcode = Op.getOpcode();
+ switch (Opcode) {
+ case X86ISD::SETCC_CARRY:
+ // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
+ return VTBits;
+
+ case X86ISD::VTRUNC: {
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+ unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
+ assert(VTBits < NumSrcBits && "Illegal truncation input type");
+ APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
+ unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
+ if (Tmp > (NumSrcBits - VTBits))
+ return Tmp - (NumSrcBits - VTBits);
+ return 1;
+ }
+
+ case X86ISD::PACKSS: {
+ // PACKSS is just a truncation if the sign bits extend to the packed size.
+ APInt DemandedLHS, DemandedRHS;
+ getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
+ DemandedRHS);
+
+ unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
+ unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
+ if (!!DemandedLHS)
+ Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
+ if (!!DemandedRHS)
+ Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
+ unsigned Tmp = std::min(Tmp0, Tmp1);
+ if (Tmp > (SrcBits - VTBits))
+ return Tmp - (SrcBits - VTBits);
+ return 1;
+ }
+
+ case X86ISD::VSHLI: {
+ SDValue Src = Op.getOperand(0);
+ const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
+ if (ShiftVal.uge(VTBits))
+ return VTBits; // Shifted all bits out --> zero.
+ unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
+ if (ShiftVal.uge(Tmp))
+ return 1; // Shifted all sign bits out --> unknown.
+ return Tmp - ShiftVal.getZExtValue();
+ }
+
+ case X86ISD::VSRAI: {
+ SDValue Src = Op.getOperand(0);
+ APInt ShiftVal = Op.getConstantOperandAPInt(1);
+ if (ShiftVal.uge(VTBits - 1))
+ return VTBits; // Sign splat.
+ unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
+ ShiftVal += Tmp;
+ return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
+ }
+
+ case X86ISD::PCMPGT:
+ case X86ISD::PCMPEQ:
+ case X86ISD::CMPP:
+ case X86ISD::VPCOM:
+ case X86ISD::VPCOMU:
+ // Vector compares return zero/all-bits result values.
+ return VTBits;
+
+ case X86ISD::ANDNP: {
+ unsigned Tmp0 =
+ DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ if (Tmp0 == 1) return 1; // Early out.
+ unsigned Tmp1 =
+ DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ return std::min(Tmp0, Tmp1);
+ }
+
+ case X86ISD::CMOV: {
+ unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
+ if (Tmp0 == 1) return 1; // Early out.
+ unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
+ return std::min(Tmp0, Tmp1);
+ }
+ }
+
+ // Handle target shuffles.
+ // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
+ if (isTargetShuffle(Opcode)) {
+ bool IsUnary;
+ SmallVector<int, 64> Mask;
+ SmallVector<SDValue, 2> Ops;
+ if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
+ IsUnary)) {
+ unsigned NumOps = Ops.size();
+ unsigned NumElts = VT.getVectorNumElements();
+ if (Mask.size() == NumElts) {
+ SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (!DemandedElts[i])
+ continue;
+ int M = Mask[i];
+ if (M == SM_SentinelUndef) {
+ // For UNDEF elements, we don't know anything about the common state
+ // of the shuffle result.
+ return 1;
+ } else if (M == SM_SentinelZero) {
+ // Zero = all sign bits.
+ continue;
+ }
+ assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
+ "Shuffle index out of range");
+
+ unsigned OpIdx = (unsigned)M / NumElts;
+ unsigned EltIdx = (unsigned)M % NumElts;
+ if (Ops[OpIdx].getValueType() != VT) {
+ // TODO - handle target shuffle ops with different value types.
+ return 1;
+ }
+ DemandedOps[OpIdx].setBit(EltIdx);
+ }
+ unsigned Tmp0 = VTBits;
+ for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
+ if (!DemandedOps[i])
+ continue;
+ unsigned Tmp1 =
+ DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
+ Tmp0 = std::min(Tmp0, Tmp1);
+ }
+ return Tmp0;
+ }
+ }
+ }
+
+ // Fallback case.
+ return 1;
+}
+
+SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
+ if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
+ return N->getOperand(0);
+ return N;
+}
+
+// Helper to look for a normal load that can be narrowed into a vzload with the
+// specified VT and memory VT. Returns SDValue() on failure.
+static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
+ SelectionDAG &DAG) {
+ // Can't if the load is volatile or atomic.
+ if (!LN->isSimple())
+ return SDValue();
+
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
+ return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
+ LN->getPointerInfo(), LN->getOriginalAlign(),
+ LN->getMemOperand()->getFlags());
+}
+
+// Attempt to match a combined shuffle mask against supported unary shuffle
+// instructions.
+// TODO: Investigate sharing more of this with shuffle lowering.
+static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
+ bool AllowFloatDomain, bool AllowIntDomain,
+ SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget, unsigned &Shuffle,
+ MVT &SrcVT, MVT &DstVT) {
+ unsigned NumMaskElts = Mask.size();
+ unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
+
+ // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
+ if (MaskEltSize == 32 && Mask[0] == 0) {
+ if (isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
+ Shuffle = X86ISD::VZEXT_MOVL;
+ SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+ return true;
+ }
+ if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
+ Shuffle = X86ISD::VZEXT_MOVL;
+ SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+ return true;
+ }
+ }
+
+ // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
+ // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
+ if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
+ unsigned MaxScale = 64 / MaskEltSize;
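+ // Try each power-of-2 extension scale up to 64-bit destination elements.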
+ for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
+ bool MatchAny = true;
+ bool MatchZero = true;
+ unsigned NumDstElts = NumMaskElts / Scale;
+ for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
+ if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
+ MatchAny = MatchZero = false;
+ break;
+ }
+ MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
+ MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
+ }
+ if (MatchAny || MatchZero) {
+ assert(MatchZero && "Failed to match zext but matched aext?");
+ unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
+ MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
+ MVT::getIntegerVT(MaskEltSize);
+ SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
+
+ if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
+ V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
+
+ Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
+ if (SrcVT.getVectorNumElements() != NumDstElts)
+ Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
+
+ DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
+ DstVT = MVT::getVectorVT(DstVT, NumDstElts);
+ return true;
+ }
+ }
+ }
+
+ // Match against a VZEXT_MOVL instruction; SSE1 only supports 32 bits (MOVSS).
+ if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
+ isUndefOrEqual(Mask[0], 0) &&
+ isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
+ Shuffle = X86ISD::VZEXT_MOVL;
+ SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+ return true;
+ }
+
+ // Check if we have SSE3, which will let us use MOVDDUP etc. These
+ // instructions are no slower than UNPCKLPD but have the option to
+ // fold the input operand into even an unaligned memory load.
+ if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) {
+ Shuffle = X86ISD::MOVDDUP;
+ SrcVT = DstVT = MVT::v2f64;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
+ Shuffle = X86ISD::MOVSLDUP;
+ SrcVT = DstVT = MVT::v4f32;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, V1)) {
+ Shuffle = X86ISD::MOVSHDUP;
+ SrcVT = DstVT = MVT::v4f32;
+ return true;
+ }
+ }
+
+ if (MaskVT.is256BitVector() && AllowFloatDomain) {
+ assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
+ Shuffle = X86ISD::MOVDDUP;
+ SrcVT = DstVT = MVT::v4f64;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
+ Shuffle = X86ISD::MOVSLDUP;
+ SrcVT = DstVT = MVT::v8f32;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) {
+ Shuffle = X86ISD::MOVSHDUP;
+ SrcVT = DstVT = MVT::v8f32;
+ return true;
+ }
+ }
+
+ if (MaskVT.is512BitVector() && AllowFloatDomain) {
+ assert(Subtarget.hasAVX512() &&
+ "AVX512 required for 512-bit vector shuffles");
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
+ Shuffle = X86ISD::MOVDDUP;
+ SrcVT = DstVT = MVT::v8f64;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(
+ MaskVT, Mask,
+ {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) {
+ Shuffle = X86ISD::MOVSLDUP;
+ SrcVT = DstVT = MVT::v16f32;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(
+ MaskVT, Mask,
+ {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) {
+ Shuffle = X86ISD::MOVSHDUP;
+ SrcVT = DstVT = MVT::v16f32;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+// Attempt to match a combined shuffle mask against supported unary immediate
+// permute instructions.
+// TODO: Investigate sharing more of this with shuffle lowering.
+static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ bool AllowFloatDomain, bool AllowIntDomain,
+ const X86Subtarget &Subtarget,
+ unsigned &Shuffle, MVT &ShuffleVT,
+ unsigned &PermuteImm) {
+ unsigned NumMaskElts = Mask.size();
+ unsigned InputSizeInBits = MaskVT.getSizeInBits();
+ unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
+ MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
+ bool ContainsZeros = isAnyZero(Mask);
+
+ // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
+ if (!ContainsZeros && MaskScalarSizeInBits == 64) {
+ // Check for lane crossing permutes.
+ if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
+ // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
+ if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
+ Shuffle = X86ISD::VPERMI;
+ ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
+ PermuteImm = getV4X86ShuffleImm(Mask);
+ return true;
+ }
+ if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
+ SmallVector<int, 4> RepeatedMask;
+ if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
+ Shuffle = X86ISD::VPERMI;
+ ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
+ PermuteImm = getV4X86ShuffleImm(RepeatedMask);
+ return true;
+ }
+ }
+ } else if (AllowFloatDomain && Subtarget.hasAVX()) {
+ // VPERMILPD can permute with a non-repeating shuffle.
+ Shuffle = X86ISD::VPERMILPI;
+ ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
+ PermuteImm = 0;
+ for (int i = 0, e = Mask.size(); i != e; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef)
+ continue;
+ assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
+ PermuteImm |= (M & 1) << i;
+ }
+ return true;
+ }
+ }
+
+ // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
+ // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
+ // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
+ if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
+ !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
+ SmallVector<int, 4> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
+ // Narrow the repeated mask to create 32-bit element permutes.
+ SmallVector<int, 4> WordMask = RepeatedMask;
+ if (MaskScalarSizeInBits == 64)
+ narrowShuffleMaskElts(2, RepeatedMask, WordMask);
+
+ Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
+ ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
+ ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
+ PermuteImm = getV4X86ShuffleImm(WordMask);
+ return true;
+ }
+ }
+
+ // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
+ if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
+ ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
+ SmallVector<int, 4> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
+ ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
+ ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
+
+ // PSHUFLW: permute lower 4 elements only.
+ if (isUndefOrInRange(LoMask, 0, 4) &&
+ isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
+ Shuffle = X86ISD::PSHUFLW;
+ ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
+ PermuteImm = getV4X86ShuffleImm(LoMask);
+ return true;
+ }
+
+ // PSHUFHW: permute upper 4 elements only.
+ if (isUndefOrInRange(HiMask, 4, 8) &&
+ isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
+ // Offset the HiMask so that we can create the shuffle immediate.
+ int OffsetHiMask[4];
+ for (int i = 0; i != 4; ++i)
+ OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
+
+ Shuffle = X86ISD::PSHUFHW;
+ ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
+ PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
+ return true;
+ }
+ }
+ }
+
+ // Attempt to match against byte/bit shifts.
+ if (AllowIntDomain &&
+ ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
+ int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
+ Mask, 0, Zeroable, Subtarget);
+ if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
+ 32 <= ShuffleVT.getScalarSizeInBits())) {
+ PermuteImm = (unsigned)ShiftAmt;
+ return true;
+ }
+ }
+
+ // Attempt to match against bit rotates.
+ if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
+ ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
+ Subtarget.hasAVX512())) {
+ int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
+ Subtarget, Mask);
+ if (0 < RotateAmt) {
+ Shuffle = X86ISD::VROTLI;
+ PermuteImm = (unsigned)RotateAmt;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+// Attempt to match a combined unary shuffle mask against supported binary
+// shuffle instructions.
+// TODO: Investigate sharing more of this with shuffle lowering.
+static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
+ bool AllowFloatDomain, bool AllowIntDomain,
+ SDValue &V1, SDValue &V2, const SDLoc &DL,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget,
+ unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
+ bool IsUnary) {
+ unsigned NumMaskElts = Mask.size();
+ unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
+
+ if (MaskVT.is128BitVector()) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}) && AllowFloatDomain) {
+ V2 = V1;
+ V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
+ Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
+ SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}) && AllowFloatDomain) {
+ V2 = V1;
+ Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
+ SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}) &&
+ Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
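+ // X86ISD::MOVSD takes its low element from the second operand, so swap
+ // V1/V2 to realize the {0,3} mask (elt0 from V1, elt1 from V2).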
+ std::swap(V1, V2);
+ Shuffle = X86ISD::MOVSD;
+ SrcVT = DstVT = MVT::v2f64;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}) &&
+ (AllowFloatDomain || !Subtarget.hasSSE41())) {
+ Shuffle = X86ISD::MOVSS;
+ SrcVT = DstVT = MVT::v4f32;
+ return true;
+ }
+ }
+
+ // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
+ if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
+ ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
+ ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
+ if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
+ Subtarget)) {
+ DstVT = MaskVT;
+ return true;
+ }
+ }
+
+ // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
+ if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
+ (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
+ if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
+ Subtarget)) {
+ SrcVT = DstVT = MaskVT;
+ if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
+ SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
+ return true;
+ }
+ }
+
+ // Attempt to match against an OR if we're performing a blend shuffle and the
+ // non-blended source element is zero in each case.
+ if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
+ (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
+ bool IsBlend = true;
+ unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
+ unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
+ unsigned Scale1 = NumV1Elts / NumMaskElts;
+ unsigned Scale2 = NumV2Elts / NumMaskElts;
+ APInt DemandedZeroV1 = APInt::getNullValue(NumV1Elts);
+ APInt DemandedZeroV2 = APInt::getNullValue(NumV2Elts);
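+ // For each mask element, record which source elements must be known zero
+ // for an OR of the two sources to produce this blend.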
+ for (unsigned i = 0; i != NumMaskElts; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef)
+ continue;
+ if (M == SM_SentinelZero) {
+ DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
+ DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
+ continue;
+ }
+ if (M == (int)i) {
+ DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
+ continue;
+ }
+ if (M == (int)(i + NumMaskElts)) {
+ DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
+ continue;
+ }
+ IsBlend = false;
+ break;
+ }
+ if (IsBlend &&
+ DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
+ DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
+ Shuffle = ISD::OR;
+ SrcVT = DstVT = MaskVT.changeTypeToInteger();
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool matchBinaryPermuteShuffle(
+ MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
+ bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
+ const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
+ unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
+ unsigned NumMaskElts = Mask.size();
+ unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
+
+ // Attempt to match against VALIGND/VALIGNQ rotate.
+ if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
+ ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
+ if (!isAnyZero(Mask)) {
+ int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
+ if (0 < Rotation) {
+ Shuffle = X86ISD::VALIGN;
+ if (EltSizeInBits == 64)
+ ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
+ else
+ ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
+ PermuteImm = Rotation;
+ return true;
+ }
+ }
+ }
+
+ // Attempt to match against PALIGNR byte rotate.
+ if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
+ int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
+ if (0 < ByteRotation) {
+ Shuffle = X86ISD::PALIGNR;
+ ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
+ PermuteImm = ByteRotation;
+ return true;
+ }
+ }
+
+ // Attempt to combine to X86ISD::BLENDI.
+ if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
+ (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
+ (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
+ uint64_t BlendMask = 0;
+ bool ForceV1Zero = false, ForceV2Zero = false;
+ SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
+ if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
+ ForceV2Zero, BlendMask)) {
+ if (MaskVT == MVT::v16i16) {
+ // We can only use v16i16 PBLENDW if the lanes are repeated.
+ SmallVector<int, 8> RepeatedMask;
+ if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
+ RepeatedMask)) {
+ assert(RepeatedMask.size() == 8 &&
+ "Repeated mask size doesn't match!");
+ PermuteImm = 0;
+ for (int i = 0; i < 8; ++i)
+ if (RepeatedMask[i] >= 8)
+ PermuteImm |= 1 << i;
+ V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
+ V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
+ Shuffle = X86ISD::BLENDI;
+ ShuffleVT = MaskVT;
+ return true;
+ }
+ } else {
+ V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
+ V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
+ PermuteImm = (unsigned)BlendMask;
+ Shuffle = X86ISD::BLENDI;
+ ShuffleVT = MaskVT;
+ return true;
+ }
+ }
+ }
+
+ // Attempt to combine to INSERTPS, but only if it has elements that need to
+ // be set to zero.
+ if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
+ MaskVT.is128BitVector() && isAnyZero(Mask) &&
+ matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
+ Shuffle = X86ISD::INSERTPS;
+ ShuffleVT = MVT::v4f32;
+ return true;
+ }
+
+ // Attempt to combine to SHUFPD.
+ if (AllowFloatDomain && EltSizeInBits == 64 &&
+ ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
+ bool ForceV1Zero = false, ForceV2Zero = false;
+ if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
+ PermuteImm, Mask, Zeroable)) {
+ V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
+ V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
+ Shuffle = X86ISD::SHUFP;
+ ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
+ return true;
+ }
+ }
+
+ // Attempt to combine to SHUFPS.
+ if (AllowFloatDomain && EltSizeInBits == 32 &&
+ ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
+ SmallVector<int, 4> RepeatedMask;
+ if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
+ // Match each half of the repeated mask to determine if it's just
+ // referencing one of the vectors, is zeroable, or is entirely undef.
+ auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
+ int M0 = RepeatedMask[Offset];
+ int M1 = RepeatedMask[Offset + 1];
+
+ if (isUndefInRange(RepeatedMask, Offset, 2)) {
+ return DAG.getUNDEF(MaskVT);
+ } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
+ S0 = (SM_SentinelUndef == M0 ? -1 : 0);
+ S1 = (SM_SentinelUndef == M1 ? -1 : 1);
+ return getZeroVector(MaskVT, Subtarget, DAG, DL);
+ } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
+ S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
+ S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
+ return V1;
+ } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
+ S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
+ S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
+ return V2;
+ }
+
+ return SDValue();
+ };
+
+ int ShufMask[4] = {-1, -1, -1, -1};
+ SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
+ SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
+
+ if (Lo && Hi) {
+ V1 = Lo;
+ V2 = Hi;
+ Shuffle = X86ISD::SHUFP;
+ ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
+ PermuteImm = getV4X86ShuffleImm(ShufMask);
+ return true;
+ }
+ }
+ }
+
+ // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
+ if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
+ MaskVT.is128BitVector() &&
+ matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
+ Shuffle = X86ISD::INSERTPS;
+ ShuffleVT = MVT::v4f32;
+ return true;
+ }
+
+ return false;
+}
+
+static SDValue combineX86ShuffleChainWithExtract(
+ ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
+ bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget);
+
+/// Combine an arbitrary chain of shuffles into a single instruction if
+/// possible.
+///
+/// This is the leaf of the recursive combine below. When we have found some
+/// chain of single-use x86 shuffle instructions and accumulated the combined
+/// shuffle mask represented by them, this will try to pattern match that mask
+/// into either a single instruction if there is a special purpose instruction
+/// for this operation, or into a PSHUFB instruction which is a fully general
+/// instruction but should only be used to replace chains over a certain depth.
+static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
+ ArrayRef<int> BaseMask, int Depth,
+ bool HasVariableMask,
+ bool AllowVariableMask, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
+ assert((Inputs.size() == 1 || Inputs.size() == 2) &&
+ "Unexpected number of shuffle inputs!");
+
+ MVT RootVT = Root.getSimpleValueType();
+ unsigned RootSizeInBits = RootVT.getSizeInBits();
+ unsigned NumRootElts = RootVT.getVectorNumElements();
+
+ // Canonicalize shuffle input op to the requested type.
+ // TODO: Support cases where Op is smaller than VT.
+ auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
+ return DAG.getBitcast(VT, Op);
+ };
+
+ // Find the inputs that enter the chain. Note that multiple uses are OK
+ // here; we're not going to remove the operands we find.
+ bool UnaryShuffle = (Inputs.size() == 1);
+ SDValue V1 = peekThroughBitcasts(Inputs[0]);
+ SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
+ : peekThroughBitcasts(Inputs[1]));
+
+ MVT VT1 = V1.getSimpleValueType();
+ MVT VT2 = V2.getSimpleValueType();
+ assert(VT1.getSizeInBits() == RootSizeInBits &&
+ VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch");
+
+ SDLoc DL(Root);
+ SDValue Res;
+
+ unsigned NumBaseMaskElts = BaseMask.size();
+ if (NumBaseMaskElts == 1) {
+ assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
+ return CanonicalizeShuffleInput(RootVT, V1);
+ }
+
+ bool OptForSize = DAG.shouldOptForSize();
+ unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
+ bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
+ (RootVT.isFloatingPoint() && Depth >= 1) ||
+ (RootVT.is256BitVector() && !Subtarget.hasAVX2());
+
+ // Don't combine if we are an AVX512/EVEX target and the mask element size
+ // is different from the root element size - this would prevent writemasks
+ // from being reused.
+ bool IsMaskedShuffle = false;
+ if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
+ if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
+ Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
+ IsMaskedShuffle = true;
+ }
+ }
+
+ // If we are shuffling a broadcast (and not introducing zeros) then we can
+ // just use the broadcast directly. This works for smaller broadcast elements
+ // as well, as they already repeat across each mask element.
+ if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
+ (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
+ V1.getValueSizeInBits() >= RootSizeInBits) {
+ return CanonicalizeShuffleInput(RootVT, V1);
+ }
+
+ // Handle 128/256-bit lane shuffles of 512-bit vectors.
+ if (RootVT.is512BitVector() &&
+ (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
+ // If the upper subvectors are zeroable, then an extract+insert is more
+ // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
+ // to zero the upper subvectors.
+ if (isUndefOrZeroInRange(BaseMask, 1, NumBaseMaskElts - 1)) {
+ if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
+ return SDValue(); // Nothing to do!
+ assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&
+ "Unexpected lane shuffle");
+ Res = CanonicalizeShuffleInput(RootVT, V1);
+ unsigned SubIdx = BaseMask[0] * (NumRootElts / NumBaseMaskElts);
+ bool UseZero = isAnyZero(BaseMask);
+ Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
+ return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
+ }
+
+ // Narrow shuffle mask to v4x128.
+ SmallVector<int, 4> Mask;
+ assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
+ narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, Mask);
+
+ // Try to lower to vshuf64x2/vshuf32x4.
+ auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2, SelectionDAG &DAG) {
+ unsigned PermMask = 0;
+ // Ensure elements came from the same Op.
+ SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
+ for (int i = 0; i < 4; ++i) {
+ assert(Mask[i] >= -1 && "Illegal shuffle sentinel value");
+ if (Mask[i] < 0)
+ continue;
+
+ SDValue Op = Mask[i] >= 4 ? V2 : V1;
+ unsigned OpIndex = i / 2;
+ if (Ops[OpIndex].isUndef())
+ Ops[OpIndex] = Op;
+ else if (Ops[OpIndex] != Op)
+ return SDValue();
+
+ // Convert the 128-bit shuffle mask selection values into 128-bit
+ // selection bits defined by a vshuf64x2 instruction's immediate control
+ // byte.
+ PermMask |= (Mask[i] % 4) << (i * 2);
+ }
+
+ return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
+ CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
+ CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
+ };
+
+ // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
+ // doesn't work because our mask is for 128 bits and we don't have an MVT
+ // to match that.
+ bool PreferPERMQ =
+ UnaryShuffle && isUndefOrInRange(Mask[0], 0, 2) &&
+ isUndefOrInRange(Mask[1], 0, 2) && isUndefOrInRange(Mask[2], 2, 4) &&
+ isUndefOrInRange(Mask[3], 2, 4) &&
+ (Mask[0] < 0 || Mask[2] < 0 || Mask[0] == (Mask[2] % 2)) &&
+ (Mask[1] < 0 || Mask[3] < 0 || Mask[1] == (Mask[3] % 2));
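+ // PreferPERMQ holds when each 256-bit half only uses its own 128-bit lanes
+ // with a repeated pattern, which a VPERMQ/VPERMPD immediate can express.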
+
+ if (!isAnyZero(Mask) && !PreferPERMQ) {
+ if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
+ return SDValue(); // Nothing to do!
+ MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
+ if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG))
+ return DAG.getBitcast(RootVT, V);
+ }
+ }
+
+ // Handle 128-bit lane shuffles of 256-bit vectors.
+ if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
+ // If the upper half is zeroable, then an extract+insert is more optimal
+ // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
+ // zero the upper half.
+ if (isUndefOrZero(BaseMask[1])) {
+ if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
+ return SDValue(); // Nothing to do!
+ assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
+ Res = CanonicalizeShuffleInput(RootVT, V1);
+ Res = extract128BitVector(Res, BaseMask[0] * (NumRootElts / 2), DAG, DL);
+ return widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
+ DL, 256);
+ }
+
+ if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
+ return SDValue(); // Nothing to do!
+
+ // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
+ // we need to use the zeroing feature.
+ // Prefer blends for sequential shuffles unless we are optimizing for size.
+ if (UnaryShuffle &&
+ !(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
+ (OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) {
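+ // VPERM2X128 immediate: bits [1:0]/[5:4] select the source half for each
+ // destination half, and bits 3/7 zero that half (used for undef lanes here).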
+ unsigned PermMask = 0;
+ PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
+ PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
+ return DAG.getNode(
+ X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
+ DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
+ }
+
+ if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
+ return SDValue(); // Nothing to do!
+
+ // TODO - handle AVX512VL cases with X86ISD::SHUF128.
+ if (!UnaryShuffle && !IsMaskedShuffle) {
+ assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) &&
+ "Unexpected shuffle sentinel value");
+ // Prefer blends to X86ISD::VPERM2X128.
+ if (!((BaseMask[0] == 0 && BaseMask[1] == 3) ||
+ (BaseMask[0] == 2 && BaseMask[1] == 1))) {
+ unsigned PermMask = 0;
+ PermMask |= ((BaseMask[0] & 3) << 0);
+ PermMask |= ((BaseMask[1] & 3) << 4);
+ SDValue LHS = isInRange(BaseMask[0], 0, 2) ? V1 : V2;
+ SDValue RHS = isInRange(BaseMask[1], 0, 2) ? V1 : V2;
+ return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
+ CanonicalizeShuffleInput(RootVT, LHS),
+ CanonicalizeShuffleInput(RootVT, RHS),
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
+ }
+ }
+ }
+
+ // For masks that have been widened to 128-bit elements or more,
+ // narrow back down to 64-bit elements.
+ SmallVector<int, 64> Mask;
+ if (BaseMaskEltSizeInBits > 64) {
+ assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
+ int MaskScale = BaseMaskEltSizeInBits / 64;
+ narrowShuffleMaskElts(MaskScale, BaseMask, Mask);
+ } else {
+ Mask.assign(BaseMask.begin(), BaseMask.end());
+ }
+
+ // For masked shuffles, we're trying to match the root width for better
+ // writemask folding; attempt to scale the mask.
+ // TODO - variable shuffles might need this to be widened again.
+ if (IsMaskedShuffle && NumRootElts > Mask.size()) {
+ assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
+ int MaskScale = NumRootElts / Mask.size();
+ SmallVector<int, 64> ScaledMask;
+ narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
+ Mask = std::move(ScaledMask);
+ }
+
+ unsigned NumMaskElts = Mask.size();
+ unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
+
+ // Determine the effective mask value type.
+ FloatDomain &= (32 <= MaskEltSizeInBits);
+ MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
+ : MVT::getIntegerVT(MaskEltSizeInBits);
+ MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
+
+ // Only allow legal mask types.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
+ return SDValue();
+
+ // Attempt to match the mask against known shuffle patterns.
+ MVT ShuffleSrcVT, ShuffleVT;
+ unsigned Shuffle, PermuteImm;
+
+ // Which shuffle domains are permitted?
+ // Permit domain crossing at higher combine depths.
+ // TODO: Should we indicate which domain is preferred if both are allowed?
+ bool AllowFloatDomain = FloatDomain || (Depth >= 3);
+ bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
+ (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
+
+ // Determine zeroable mask elements.
+ APInt KnownUndef, KnownZero;
+ resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
+ APInt Zeroable = KnownUndef | KnownZero;
+
+ if (UnaryShuffle) {
+ // Attempt to match against broadcast-from-vector.
+ // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
+ if ((Subtarget.hasAVX2() ||
+ (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
+ (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
+ if (isUndefOrEqual(Mask, 0)) {
+ if (V1.getValueType() == MaskVT &&
+ V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ MayFoldLoad(V1.getOperand(0))) {
+ if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
+ return SDValue(); // Nothing to do!
+ Res = V1.getOperand(0);
+ Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
+ return DAG.getBitcast(RootVT, Res);
+ }
+ if (Subtarget.hasAVX2()) {
+ if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
+ return SDValue(); // Nothing to do!
+ Res = CanonicalizeShuffleInput(MaskVT, V1);
+ Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
+ }
+
+ SDValue NewV1 = V1; // Save operand in case early exit happens.
+ if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
+ DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
+ ShuffleVT) &&
+ (!IsMaskedShuffle ||
+ (NumRootElts == ShuffleVT.getVectorNumElements()))) {
+ if (Depth == 0 && Root.getOpcode() == Shuffle)
+ return SDValue(); // Nothing to do!
+ Res = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
+ return DAG.getBitcast(RootVT, Res);
+ }
+
+ if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
+ AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
+ PermuteImm) &&
+ (!IsMaskedShuffle ||
+ (NumRootElts == ShuffleVT.getVectorNumElements()))) {
+ if (Depth == 0 && Root.getOpcode() == Shuffle)
+ return SDValue(); // Nothing to do!
+ Res = CanonicalizeShuffleInput(ShuffleVT, V1);
+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
+ DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
+
+ // Attempt to combine to INSERTPS, but only if the inserted element has come
+ // from a scalar.
+ // TODO: Handle other insertions here as well?
+ if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
+ Subtarget.hasSSE41() &&
+ !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3})) {
+ if (MaskEltSizeInBits == 32) {
+ SDValue SrcV1 = V1, SrcV2 = V2;
+ if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
+ DAG) &&
+ SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
+ return SDValue(); // Nothing to do!
+ Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
+ CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
+ CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
+ DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
+ if (MaskEltSizeInBits == 64 &&
+ isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}) &&
+ V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ V2.getScalarValueSizeInBits() <= 32) {
+ if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
+ return SDValue(); // Nothing to do!
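+      // INSERTPS immediate encoding: bits[7:6] select the source element,
+      // bits[5:4] select the destination element and bits[3:0] form the
+      // zero mask.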
+ PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0);
+ Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
+ CanonicalizeShuffleInput(MVT::v4f32, V1),
+ CanonicalizeShuffleInput(MVT::v4f32, V2),
+ DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
+
+ SDValue NewV1 = V1; // Save operands in case early exit happens.
+ SDValue NewV2 = V2;
+ if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
+ NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
+ ShuffleVT, UnaryShuffle) &&
+ (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
+ if (Depth == 0 && Root.getOpcode() == Shuffle)
+ return SDValue(); // Nothing to do!
+ NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
+ NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
+ return DAG.getBitcast(RootVT, Res);
+ }
+
+ NewV1 = V1; // Save operands in case early exit happens.
+ NewV2 = V2;
+ if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
+ AllowIntDomain, NewV1, NewV2, DL, DAG,
+ Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
+ (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
+ if (Depth == 0 && Root.getOpcode() == Shuffle)
+ return SDValue(); // Nothing to do!
+ NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
+ NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
+ DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
+ return DAG.getBitcast(RootVT, Res);
+ }
+
+ // Typically from here on, we need an integer version of MaskVT.
+ MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
+ IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
+
+ // Annoyingly, SSE4A instructions don't map into the above match helpers.
+ if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
+ uint64_t BitLen, BitIdx;
+ if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
+ Zeroable)) {
+ if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
+ return SDValue(); // Nothing to do!
+ V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
+ Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
+ DAG.getTargetConstant(BitLen, DL, MVT::i8),
+ DAG.getTargetConstant(BitIdx, DL, MVT::i8));
+ return DAG.getBitcast(RootVT, Res);
+ }
+
+ if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
+ if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
+ return SDValue(); // Nothing to do!
+ V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
+ V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
+ Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
+ DAG.getTargetConstant(BitLen, DL, MVT::i8),
+ DAG.getTargetConstant(BitIdx, DL, MVT::i8));
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
+
+ // Match shuffle against TRUNCATE patterns.
+ if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
+ // Match against a VTRUNC instruction, accounting for src/dst sizes.
+ if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
+ Subtarget)) {
+ bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
+ ShuffleSrcVT.getVectorNumElements();
+ unsigned Opc =
+ IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
+ if (Depth == 0 && Root.getOpcode() == Opc)
+ return SDValue(); // Nothing to do!
+ V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
+ Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
+ if (ShuffleVT.getSizeInBits() < RootSizeInBits)
+ Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
+ return DAG.getBitcast(RootVT, Res);
+ }
+
+ // Do we need a more general binary truncation pattern?
+ if (RootSizeInBits < 512 &&
+ ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
+ (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
+ (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
+ isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
+ if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
+ return SDValue(); // Nothing to do!
+ ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
+ ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
+ V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
+ V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
+ ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
+ ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
+ Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
+ Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
+
+ // Don't try to re-form single instruction chains under any circumstances now
+ // that we've done encoding canonicalization for them.
+ if (Depth < 1)
+ return SDValue();
+
+ // Depth threshold above which we can efficiently use variable mask shuffles.
+ int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
+ AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
+  // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake, so we require a
+ // higher depth before combining them.
+ bool AllowBWIVPERMV3 = (Depth >= 2 || HasVariableMask);
+
+ bool MaskContainsZeros = isAnyZero(Mask);
+
+ if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
+ // If we have a single input lane-crossing shuffle then lower to VPERMV.
+ if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros) {
+ if (Subtarget.hasAVX2() &&
+ (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
+ SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
+ Res = CanonicalizeShuffleInput(MaskVT, V1);
+ Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
+ return DAG.getBitcast(RootVT, Res);
+ }
+ // AVX512 variants (non-VLX will pad to 512-bit shuffles).
+ if ((Subtarget.hasAVX512() &&
+ (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+ MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
+ (Subtarget.hasBWI() &&
+ (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
+ (Subtarget.hasVBMI() &&
+ (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
+ V1 = CanonicalizeShuffleInput(MaskVT, V1);
+ V2 = DAG.getUNDEF(MaskVT);
+ Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
+
+ // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
+ // vector as the second source (non-VLX will pad to 512-bit shuffles).
+ if (UnaryShuffle && AllowVariableMask &&
+ ((Subtarget.hasAVX512() &&
+ (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+ MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
+ MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
+ MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
+ (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
+ (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
+ (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
+ (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
+ // Adjust shuffle mask - replace SM_SentinelZero with second source index.
+ for (unsigned i = 0; i != NumMaskElts; ++i)
+ if (Mask[i] == SM_SentinelZero)
+ Mask[i] = NumMaskElts + i;
+ V1 = CanonicalizeShuffleInput(MaskVT, V1);
+ V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
+ Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
+ return DAG.getBitcast(RootVT, Res);
+ }
+
+ // If that failed and either input is extracted then try to combine as a
+ // shuffle with the larger type.
+ if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
+ Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
+ DAG, Subtarget))
+ return WideShuffle;
+
+    // If we have a dual input lane-crossing shuffle then lower to VPERMV3
+ // (non-VLX will pad to 512-bit shuffles).
+ if (AllowVariableMask && !MaskContainsZeros &&
+ ((Subtarget.hasAVX512() &&
+ (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+ MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
+ MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
+ MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
+ (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
+ (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
+ (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
+ (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
+ V1 = CanonicalizeShuffleInput(MaskVT, V1);
+ V2 = CanonicalizeShuffleInput(MaskVT, V2);
+ Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
+ return DAG.getBitcast(RootVT, Res);
+ }
+ return SDValue();
+ }
+
+ // See if we can combine a single input shuffle with zeros to a bit-mask,
+ // which is much simpler than any shuffle.
+ if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
+ isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
+ DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
+ APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
+ APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
+ APInt UndefElts(NumMaskElts, 0);
+ SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
+ for (unsigned i = 0; i != NumMaskElts; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef) {
+ UndefElts.setBit(i);
+ continue;
+ }
+ if (M == SM_SentinelZero)
+ continue;
+ EltBits[i] = AllOnes;
+ }
+ SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
+ Res = CanonicalizeShuffleInput(MaskVT, V1);
+ unsigned AndOpcode =
+ MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
+ Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
+ return DAG.getBitcast(RootVT, Res);
+ }
+
+  // If we have a single input shuffle with different shuffle patterns in the
+  // 128-bit lanes, use the variable mask form of VPERMILPS.
+  // TODO: Combine other mask types at higher depths.
+ if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
+ ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
+ (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
+ SmallVector<SDValue, 16> VPermIdx;
+ for (int M : Mask) {
+ SDValue Idx =
+ M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
+ VPermIdx.push_back(Idx);
+ }
+ SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
+ Res = CanonicalizeShuffleInput(MaskVT, V1);
+ Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
+ return DAG.getBitcast(RootVT, Res);
+ }
+
+ // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
+ // to VPERMIL2PD/VPERMIL2PS.
+ if (AllowVariableMask && Subtarget.hasXOP() &&
+ (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
+ MaskVT == MVT::v8f32)) {
+ // VPERMIL2 Operation.
+ // Bits[3] - Match Bit.
+ // Bits[2:1] - (Per Lane) PD Shuffle Mask.
+ // Bits[2:0] - (Per Lane) PS Shuffle Mask.
+ unsigned NumLanes = MaskVT.getSizeInBits() / 128;
+ unsigned NumEltsPerLane = NumMaskElts / NumLanes;
+ SmallVector<int, 8> VPerm2Idx;
+ unsigned M2ZImm = 0;
+ for (int M : Mask) {
+ if (M == SM_SentinelUndef) {
+ VPerm2Idx.push_back(-1);
+ continue;
+ }
+ if (M == SM_SentinelZero) {
+ M2ZImm = 2;
+ VPerm2Idx.push_back(8);
+ continue;
+ }
+ int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
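+      // PD selectors sit in bits[2:1] of each index, so double the index for
+      // 64-bit elements; PS selectors use bits[2:0] directly.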
+ Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
+ VPerm2Idx.push_back(Index);
+ }
+ V1 = CanonicalizeShuffleInput(MaskVT, V1);
+ V2 = CanonicalizeShuffleInput(MaskVT, V2);
+ SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
+ Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
+ DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
+ return DAG.getBitcast(RootVT, Res);
+ }
+
+ // If we have 3 or more shuffle instructions or a chain involving a variable
+ // mask, we can replace them with a single PSHUFB instruction profitably.
+  // Intel's manuals suggest only using PSHUFB if doing so replaces 5 or more
+  // instructions, but in practice PSHUFB tends to be *very* fast, so we're
+ // more aggressive.
+ if (UnaryShuffle && AllowVariableMask &&
+ ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
+ (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
+ (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
+ SmallVector<SDValue, 16> PSHUFBMask;
+ int NumBytes = RootVT.getSizeInBits() / 8;
+ int Ratio = NumBytes / NumMaskElts;
+ for (int i = 0; i < NumBytes; ++i) {
+ int M = Mask[i / Ratio];
+ if (M == SM_SentinelUndef) {
+ PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
+ continue;
+ }
+ if (M == SM_SentinelZero) {
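+        // PSHUFB zeroes a result byte when the MSB of its mask byte is set,
+        // so 0x80 encodes a zero element.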
+ PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
+ continue;
+ }
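+      // Scale the mask index from shuffle-element granularity to bytes.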
+ M = Ratio * M + i % Ratio;
+ assert((M / 16) == (i / 16) && "Lane crossing detected");
+ PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
+ }
+ MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
+ Res = CanonicalizeShuffleInput(ByteVT, V1);
+ SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
+ Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
+ return DAG.getBitcast(RootVT, Res);
+ }
+
+ // With XOP, if we have a 128-bit binary input shuffle we can always combine
+ // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
+ // slower than PSHUFB on targets that support both.
+ if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
+ // VPPERM Mask Operation
+ // Bits[4:0] - Byte Index (0 - 31)
+ // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
+ SmallVector<SDValue, 16> VPPERMMask;
+ int NumBytes = 16;
+ int Ratio = NumBytes / NumMaskElts;
+ for (int i = 0; i < NumBytes; ++i) {
+ int M = Mask[i / Ratio];
+ if (M == SM_SentinelUndef) {
+ VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
+ continue;
+ }
+ if (M == SM_SentinelZero) {
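+        // Permute operation 4 (bits[7:5]) zeroes the output byte, hence 0x80.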
+ VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
+ continue;
+ }
+ M = Ratio * M + i % Ratio;
+ VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
+ }
+ MVT ByteVT = MVT::v16i8;
+ V1 = CanonicalizeShuffleInput(ByteVT, V1);
+ V2 = CanonicalizeShuffleInput(ByteVT, V2);
+ SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
+ Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
+ return DAG.getBitcast(RootVT, Res);
+ }
+
+ // If that failed and either input is extracted then try to combine as a
+ // shuffle with the larger type.
+ if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
+ Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
+ DAG, Subtarget))
+ return WideShuffle;
+
+  // If we have a dual input shuffle then lower to VPERMV3
+  // (non-VLX will pad to 512-bit shuffles).
+ if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
+ ((Subtarget.hasAVX512() &&
+ (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
+ MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
+ MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
+ MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
+ MaskVT == MVT::v16i32)) ||
+ (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
+ (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
+ (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
+ (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
+ V1 = CanonicalizeShuffleInput(MaskVT, V1);
+ V2 = CanonicalizeShuffleInput(MaskVT, V2);
+ Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
+ return DAG.getBitcast(RootVT, Res);
+ }
+
+ // Failed to find any combines.
+ return SDValue();
+}
+
+// Combine an arbitrary chain of shuffles + extract_subvectors into a single
+// instruction if possible.
+//
+// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
+// type size to attempt to combine:
+// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
+// -->
+// extract_subvector(shuffle(x,y,m2),0)
+static SDValue combineX86ShuffleChainWithExtract(
+ ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
+ bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ unsigned NumMaskElts = BaseMask.size();
+ unsigned NumInputs = Inputs.size();
+ if (NumInputs == 0)
+ return SDValue();
+
+ EVT RootVT = Root.getValueType();
+ unsigned RootSizeInBits = RootVT.getSizeInBits();
+ assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
+
+ SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
+ SmallVector<unsigned, 4> Offsets(NumInputs, 0);
+
+ // Peek through subvectors.
+ // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
+ unsigned WideSizeInBits = RootSizeInBits;
+ for (unsigned i = 0; i != NumInputs; ++i) {
+ SDValue &Src = WideInputs[i];
+ unsigned &Offset = Offsets[i];
+ Src = peekThroughBitcasts(Src);
+ EVT BaseVT = Src.getValueType();
+ while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ Offset += Src.getConstantOperandVal(1);
+ Src = Src.getOperand(0);
+ }
+ WideSizeInBits = std::max(WideSizeInBits,
+ (unsigned)Src.getValueSizeInBits());
+ assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
+ "Unexpected subvector extraction");
+ Offset /= BaseVT.getVectorNumElements();
+ Offset *= NumMaskElts;
+ }
+
+  // Bail if we're always extracting from the lowest subvectors;
+ // combineX86ShuffleChain should match this for the current width.
+ if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
+ return SDValue();
+
+ unsigned Scale = WideSizeInBits / RootSizeInBits;
+ assert((WideSizeInBits % RootSizeInBits) == 0 &&
+ "Unexpected subvector extraction");
+
+ // If the src vector types aren't the same, see if we can extend
+ // them to match each other.
+ // TODO: Support different scalar types?
+ EVT WideSVT = WideInputs[0].getValueType().getScalarType();
+ if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
+ return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
+ Op.getValueType().getScalarType() != WideSVT;
+ }))
+ return SDValue();
+
+ for (SDValue &NewInput : WideInputs) {
+ assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
+ "Shuffle vector size mismatch");
+ if (WideSizeInBits > NewInput.getValueSizeInBits())
+ NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
+ SDLoc(NewInput), WideSizeInBits);
+ assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
+ "Unexpected subvector extraction");
+ }
+
+ // Create new mask for larger type.
+ for (unsigned i = 1; i != NumInputs; ++i)
+ Offsets[i] += i * Scale * NumMaskElts;
+
+ SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
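+  // Rebase each mask element onto its (now widened and offset) source vector.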
+ for (int &M : WideMask) {
+ if (M < 0)
+ continue;
+ M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
+ }
+ WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
+
+ // Remove unused/repeated shuffle source ops.
+ resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
+ assert(!WideInputs.empty() && "Shuffle with no inputs detected");
+
+ if (WideInputs.size() > 2)
+ return SDValue();
+
+ // Increase depth for every upper subvector we've peeked through.
+ Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
+
+ // Attempt to combine wider chain.
+ // TODO: Can we use a better Root?
+ SDValue WideRoot = WideInputs[0];
+ if (SDValue WideShuffle = combineX86ShuffleChain(
+ WideInputs, WideRoot, WideMask, Depth, HasVariableMask,
+ AllowVariableMask, DAG, Subtarget)) {
+ WideShuffle =
+ extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
+ return DAG.getBitcast(RootVT, WideShuffle);
+ }
+ return SDValue();
+}
+
+// Canonicalize the combined shuffle mask chain with horizontal ops.
+// NOTE: This may update the Ops and Mask.
+static SDValue canonicalizeShuffleMaskWithHorizOp(
+ MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
+ unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (Mask.empty() || Ops.empty())
+ return SDValue();
+
+ SmallVector<SDValue> BC;
+ for (SDValue Op : Ops)
+ BC.push_back(peekThroughBitcasts(Op));
+
+ // All ops must be the same horizop + type.
+ SDValue BC0 = BC[0];
+ EVT VT0 = BC0.getValueType();
+ unsigned Opcode0 = BC0.getOpcode();
+ if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
+ return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
+ }))
+ return SDValue();
+
+ bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
+ Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
+ bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
+ if (!isHoriz && !isPack)
+ return SDValue();
+
+ int NumElts = VT0.getVectorNumElements();
+ int NumLanes = VT0.getSizeInBits() / 128;
+ int NumEltsPerLane = NumElts / NumLanes;
+ int NumHalfEltsPerLane = NumEltsPerLane / 2;
+
+  // See if we can remove the shuffle by reordering the HOP chain so that
+ // the HOP args are pre-shuffled.
+ // TODO: Generalize to any sized/depth chain.
+ // TODO: Add support for PACKSS/PACKUS.
+ if (isHoriz && NumEltsPerLane == 4 && VT0.is128BitVector() &&
+ shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget)) {
+ SmallVector<int> ScaledMask;
+ if (scaleShuffleElements(Mask, 4, ScaledMask)) {
+ // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
+ auto GetHOpSrc = [&](int M) {
+ if (M == SM_SentinelUndef)
+ return DAG.getUNDEF(VT0);
+ if (M == SM_SentinelZero)
+ return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
+ SDValue Src0 = BC[M / NumElts];
+ SDValue Src1 = Src0.getOperand((M % 4) >= 2);
+ if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
+ return Src1.getOperand(M % 2);
+ return SDValue();
+ };
+ SDValue M0 = GetHOpSrc(ScaledMask[0]);
+ SDValue M1 = GetHOpSrc(ScaledMask[1]);
+ SDValue M2 = GetHOpSrc(ScaledMask[2]);
+ SDValue M3 = GetHOpSrc(ScaledMask[3]);
+ if (M0 && M1 && M2 && M3) {
+ SDValue LHS = DAG.getNode(Opcode0, DL, VT0, M0, M1);
+ SDValue RHS = DAG.getNode(Opcode0, DL, VT0, M2, M3);
+ return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
+ }
+ }
+ }
+
+ if (2 < Ops.size())
+ return SDValue();
+
+ SDValue BC1 = BC[BC.size() - 1];
+ if (Mask.size() == VT0.getVectorNumElements()) {
+ // Canonicalize binary shuffles of horizontal ops that use the
+    // same sources to a unary shuffle.
+ // TODO: Try to perform this fold even if the shuffle remains.
+ if (Ops.size() == 2) {
+ auto ContainsOps = [](SDValue HOp, SDValue Op) {
+ return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
+ };
+ // Commute if all BC0's ops are contained in BC1.
+ if (ContainsOps(BC1, BC0.getOperand(0)) &&
+ ContainsOps(BC1, BC0.getOperand(1))) {
+ ShuffleVectorSDNode::commuteMask(Mask);
+ std::swap(Ops[0], Ops[1]);
+ std::swap(BC0, BC1);
+ }
+
+ // If BC1 can be represented by BC0, then convert to unary shuffle.
+ if (ContainsOps(BC0, BC1.getOperand(0)) &&
+ ContainsOps(BC0, BC1.getOperand(1))) {
+ for (int &M : Mask) {
+ if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
+ continue;
+ int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
+ M -= NumElts + (SubLane * NumHalfEltsPerLane);
+ if (BC1.getOperand(SubLane) != BC0.getOperand(0))
+ M += NumHalfEltsPerLane;
+ }
+ }
+ }
+
+ // Canonicalize unary horizontal ops to only refer to lower halves.
+ for (int i = 0; i != NumElts; ++i) {
+ int &M = Mask[i];
+ if (isUndefOrZero(M))
+ continue;
+ if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
+ (M % NumEltsPerLane) >= NumHalfEltsPerLane)
+ M -= NumHalfEltsPerLane;
+ if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
+ (M % NumEltsPerLane) >= NumHalfEltsPerLane)
+ M -= NumHalfEltsPerLane;
+ }
+ }
+
+  // Combine a binary shuffle of 2 similar 'Horizontal' instructions into a
+ // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
+ // represents the LHS/RHS inputs for the lower/upper halves.
+ unsigned EltSizeInBits = RootSizeInBits / Mask.size();
+ SmallVector<int, 16> TargetMask128, WideMask128;
+ if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
+ scaleShuffleElements(TargetMask128, 2, WideMask128)) {
+ assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
+ bool SingleOp = (Ops.size() == 1);
+ if (!isHoriz || shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
+ SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
+ SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
+ Lo = Lo.getOperand(WideMask128[0] & 1);
+ Hi = Hi.getOperand(WideMask128[1] & 1);
+ if (SingleOp) {
+ MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
+ SDValue Undef = DAG.getUNDEF(SrcVT);
+ SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
+ Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
+ Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
+ Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
+ Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
+ }
+ return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
+ }
+ }
+
+ return SDValue();
+}
+
+// Attempt to constant fold all of the constant source ops.
+// Returns the folded constant vector if the entire shuffle folds to a constant.
+// TODO: Extend this to merge multiple constant Ops and update the mask.
+static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
+ ArrayRef<int> Mask, SDValue Root,
+ bool HasVariableMask,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT VT = Root.getSimpleValueType();
+
+ unsigned SizeInBits = VT.getSizeInBits();
+ unsigned NumMaskElts = Mask.size();
+ unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
+ unsigned NumOps = Ops.size();
+
+ // Extract constant bits from each source op.
+ bool OneUseConstantOp = false;
+ SmallVector<APInt, 16> UndefEltsOps(NumOps);
+ SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
+ for (unsigned i = 0; i != NumOps; ++i) {
+ SDValue SrcOp = Ops[i];
+ OneUseConstantOp |= SrcOp.hasOneUse();
+ if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
+ RawBitsOps[i]))
+ return SDValue();
+ }
+
+ // Only fold if at least one of the constants is only used once or
+  // the combined shuffle has included a variable mask shuffle; this
+ // is to avoid constant pool bloat.
+ if (!OneUseConstantOp && !HasVariableMask)
+ return SDValue();
+
+ // Shuffle the constant bits according to the mask.
+ SDLoc DL(Root);
+ APInt UndefElts(NumMaskElts, 0);
+ APInt ZeroElts(NumMaskElts, 0);
+ APInt ConstantElts(NumMaskElts, 0);
+ SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
+ APInt::getNullValue(MaskSizeInBits));
+ for (unsigned i = 0; i != NumMaskElts; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef) {
+ UndefElts.setBit(i);
+ continue;
+ } else if (M == SM_SentinelZero) {
+ ZeroElts.setBit(i);
+ continue;
+ }
+ assert(0 <= M && M < (int)(NumMaskElts * NumOps));
+
+ unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
+ unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
+
+ auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
+ if (SrcUndefElts[SrcMaskIdx]) {
+ UndefElts.setBit(i);
+ continue;
+ }
+
+ auto &SrcEltBits = RawBitsOps[SrcOpIdx];
+ APInt &Bits = SrcEltBits[SrcMaskIdx];
+ if (!Bits) {
+ ZeroElts.setBit(i);
+ continue;
+ }
+
+ ConstantElts.setBit(i);
+ ConstantBitData[i] = Bits;
+ }
+ assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
+
+ // Attempt to create a zero vector.
+ if ((UndefElts | ZeroElts).isAllOnesValue())
+ return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
+
+ // Create the constant data.
+ MVT MaskSVT;
+ if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
+ MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
+ else
+ MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
+
+ MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
+ return SDValue();
+
+ SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
+ return DAG.getBitcast(VT, CstOp);
+}
+
+namespace llvm {
+ namespace X86 {
+ enum {
+ MaxShuffleCombineDepth = 8
+ };
+ }
+} // namespace llvm
+
+/// Fully generic combining of x86 shuffle instructions.
+///
+/// This should be the last combine run over the x86 shuffle instructions. Once
+/// they have been fully optimized, this will recursively consider all chains
+/// of single-use shuffle instructions, build a generic model of the cumulative
+/// shuffle operation, and check for simpler instructions which implement this
+/// operation. We use this primarily for two purposes:
+///
+/// 1) Collapse generic shuffles to specialized single instructions when
+/// equivalent. In most cases, this is just an encoding size win, but
+/// sometimes we will collapse multiple generic shuffles into a single
+/// special-purpose shuffle.
+/// 2) Look for sequences of shuffle instructions with 3 or more total
+/// instructions, and replace them with the slightly more expensive SSSE3
+/// PSHUFB instruction if available. We do this as the last combining step
+/// to ensure we avoid using PSHUFB if we can implement the shuffle with
+/// a suitable short sequence of other instructions. The PSHUFB will either
+/// use a register or have to read from memory and so is slightly (but only
+/// slightly) more expensive than the other shuffle instructions.
+///
+/// Because this is inherently a quadratic operation (for each shuffle in
+/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
+/// This should never be an issue in practice as the shuffle lowering doesn't
+/// produce sequences of more than 8 instructions.
+///
+/// FIXME: We will currently miss some cases where the redundant shuffling
+/// would simplify under the threshold for PSHUFB formation because of
+/// combine-ordering. To fix this, we should do the redundant instruction
+/// combining in this recursive walk.
+static SDValue combineX86ShufflesRecursively(
+ ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
+ ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
+ unsigned MaxDepth, bool HasVariableMask, bool AllowVariableMask,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+ assert(RootMask.size() > 0 &&
+ (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
+ "Illegal shuffle root mask");
+ assert(Root.getSimpleValueType().isVector() &&
+ "Shuffles operate on vector types!");
+ unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
+
+ // Bound the depth of our recursive combine because this is ultimately
+ // quadratic in nature.
+ if (Depth >= MaxDepth)
+ return SDValue();
+
+ // Directly rip through bitcasts to find the underlying operand.
+ SDValue Op = SrcOps[SrcOpIndex];
+ Op = peekThroughOneUseBitcasts(Op);
+
+ EVT VT = Op.getValueType();
+ if (!VT.isVector() || !VT.isSimple())
+ return SDValue(); // Bail if we hit a non-simple non-vector.
+
+ assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
+ "Can only combine shuffles upto size of the root op.");
+
+ // Extract target shuffle mask and resolve sentinels and inputs.
+ // TODO - determine Op's demanded elts from RootMask.
+ SmallVector<int, 64> OpMask;
+ SmallVector<SDValue, 2> OpInputs;
+ APInt OpUndef, OpZero;
+ APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+ bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
+ if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
+ OpZero, DAG, Depth, false))
+ return SDValue();
+
+ // Shuffle inputs must not be larger than the shuffle result.
+ // TODO: Relax this for single input faux shuffles (trunc/extract_subvector).
+ if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
+ return OpInput.getValueSizeInBits() > VT.getSizeInBits();
+ }))
+ return SDValue();
+
+ // If the shuffle result was smaller than the root, we need to adjust the
+ // mask indices and pad the mask with undefs.
+ if (RootSizeInBits > VT.getSizeInBits()) {
+ unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
+ unsigned OpMaskSize = OpMask.size();
+ if (OpInputs.size() > 1) {
+ unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
+ for (int &M : OpMask) {
+ if (M < 0)
+ continue;
+ int EltIdx = M % OpMaskSize;
+ int OpIdx = M / OpMaskSize;
+ M = (PaddedMaskSize * OpIdx) + EltIdx;
+ }
+ }
+ OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
+ OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
+ OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
+ }
+
+ SmallVector<int, 64> Mask;
+ SmallVector<SDValue, 16> Ops;
+
+ // We don't need to merge masks if the root is empty.
+ bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
+ if (EmptyRoot) {
+ // Only resolve zeros if it will remove an input, otherwise we might end
+ // up in an infinite loop.
+ bool ResolveKnownZeros = true;
+ if (!OpZero.isNullValue()) {
+ APInt UsedInputs = APInt::getNullValue(OpInputs.size());
+ for (int i = 0, e = OpMask.size(); i != e; ++i) {
+ int M = OpMask[i];
+ if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
+ continue;
+ UsedInputs.setBit(M / OpMask.size());
+ if (UsedInputs.isAllOnesValue()) {
+ ResolveKnownZeros = false;
+ break;
+ }
+ }
+ }
+ resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
+ ResolveKnownZeros);
+
+ Mask = OpMask;
+ Ops.append(OpInputs.begin(), OpInputs.end());
+ } else {
+ resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
+
+ // Add the inputs to the Ops list, avoiding duplicates.
+ Ops.append(SrcOps.begin(), SrcOps.end());
+
+ auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
+ // Attempt to find an existing match.
+ SDValue InputBC = peekThroughBitcasts(Input);
+ for (int i = 0, e = Ops.size(); i < e; ++i)
+ if (InputBC == peekThroughBitcasts(Ops[i]))
+ return i;
+ // Match failed - should we replace an existing Op?
+ if (InsertionPoint >= 0) {
+ Ops[InsertionPoint] = Input;
+ return InsertionPoint;
+ }
+ // Add to the end of the Ops list.
+ Ops.push_back(Input);
+ return Ops.size() - 1;
+ };
+
+ SmallVector<int, 2> OpInputIdx;
+ for (SDValue OpInput : OpInputs)
+ OpInputIdx.push_back(
+ AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
+
+ assert(((RootMask.size() > OpMask.size() &&
+ RootMask.size() % OpMask.size() == 0) ||
+ (OpMask.size() > RootMask.size() &&
+ OpMask.size() % RootMask.size() == 0) ||
+ OpMask.size() == RootMask.size()) &&
+ "The smaller number of elements must divide the larger.");
+
+ // This function can be performance-critical, so we rely on the power-of-2
+ // knowledge that we have about the mask sizes to replace div/rem ops with
+ // bit-masks and shifts.
+ assert(isPowerOf2_32(RootMask.size()) &&
+ "Non-power-of-2 shuffle mask sizes");
+ assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
+ unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
+ unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
+
+ unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
+ unsigned RootRatio =
+ std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
+ unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
+ assert((RootRatio == 1 || OpRatio == 1) &&
+ "Must not have a ratio for both incoming and op masks!");
+
+ assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
+ assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
+ assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
+ unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
+ unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
+
+ Mask.resize(MaskWidth, SM_SentinelUndef);
+
+ // Merge this shuffle operation's mask into our accumulated mask. Note that
+ // this shuffle's mask will be the first applied to the input, followed by
+ // the root mask to get us all the way to the root value arrangement. The
+ // reason for this order is that we are recursing up the operation chain.
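+    // For example, with a v4 RootMask over a v8 OpMask (RootRatio == 2,
+    // OpRatio == 1), each root mask entry R is first scaled to the op-width
+    // indices 2*R+0 and 2*R+1 and then remapped through OpMask.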
+ for (unsigned i = 0; i < MaskWidth; ++i) {
+ unsigned RootIdx = i >> RootRatioLog2;
+ if (RootMask[RootIdx] < 0) {
+ // This is a zero or undef lane, we're done.
+ Mask[i] = RootMask[RootIdx];
+ continue;
+ }
+
+ unsigned RootMaskedIdx =
+ RootRatio == 1
+ ? RootMask[RootIdx]
+ : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
+
+ // Just insert the scaled root mask value if it references an input other
+ // than the SrcOp we're currently inserting.
+ if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
+ (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
+ Mask[i] = RootMaskedIdx;
+ continue;
+ }
+
+ RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
+ unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
+ if (OpMask[OpIdx] < 0) {
+ // The incoming lanes are zero or undef, it doesn't matter which ones we
+ // are using.
+ Mask[i] = OpMask[OpIdx];
+ continue;
+ }
+
+ // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
+ unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
+ : (OpMask[OpIdx] << OpRatioLog2) +
+ (RootMaskedIdx & (OpRatio - 1));
+
+ OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
+ int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
+ assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
+ OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
+
+ Mask[i] = OpMaskedIdx;
+ }
+ }
+
+ // Remove unused/repeated shuffle source ops.
+ resolveTargetShuffleInputsAndMask(Ops, Mask);
+
+ // Handle the all undef/zero cases early.
+ if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
+ return DAG.getUNDEF(Root.getValueType());
+ if (all_of(Mask, [](int Idx) { return Idx < 0; }))
+ return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
+ SDLoc(Root));
+
+ assert(!Ops.empty() && "Shuffle with no inputs detected");
+ HasVariableMask |= IsOpVariableMask;
+
+ // Update the list of shuffle nodes that have been combined so far.
+ SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
+ SrcNodes.end());
+ CombinedNodes.push_back(Op.getNode());
+
+ // See if we can recurse into each shuffle source op (if it's a target
+  // shuffle). The source op should generally only be combined if it either has
+  // a single use (i.e. current Op) or all its users have already been combined;
+  // if not, we can still combine but should prevent generation of variable
+  // shuffles to avoid constant pool bloat.
+ // Don't recurse if we already have more source ops than we can combine in
+ // the remaining recursion depth.
+ if (Ops.size() < (MaxDepth - Depth)) {
+ for (int i = 0, e = Ops.size(); i < e; ++i) {
+ // For empty roots, we need to resolve zeroable elements before combining
+ // them with other shuffles.
+ SmallVector<int, 64> ResolvedMask = Mask;
+ if (EmptyRoot)
+ resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
+ bool AllowVar = false;
+ if (Ops[i].getNode()->hasOneUse() ||
+ SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
+ AllowVar = AllowVariableMask;
+ if (SDValue Res = combineX86ShufflesRecursively(
+ Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
+ HasVariableMask, AllowVar, DAG, Subtarget))
+ return Res;
+ }
+ }
+
+ // Attempt to constant fold all of the constant source ops.
+ if (SDValue Cst = combineX86ShufflesConstants(
+ Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
+ return Cst;
+
+ // Canonicalize the combined shuffle mask chain with horizontal ops.
+ // NOTE: This will update the Ops and Mask.
+ if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
+ Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
+ return DAG.getBitcast(Root.getValueType(), HOp);
+
+ // Widen any subvector shuffle inputs we've collected.
+ if (any_of(Ops, [RootSizeInBits](SDValue Op) {
+ return Op.getValueSizeInBits() < RootSizeInBits;
+ })) {
+ for (SDValue &Op : Ops)
+ if (Op.getValueSizeInBits() < RootSizeInBits)
+ Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
+ RootSizeInBits);
+ // Reresolve - we might have repeated subvector sources.
+ resolveTargetShuffleInputsAndMask(Ops, Mask);
+ }
+
+ // We can only combine unary and binary shuffle mask cases.
+ if (Ops.size() <= 2) {
+ // Minor canonicalization of the accumulated shuffle mask to make it easier
+ // to match below. All this does is detect masks with sequential pairs of
+ // elements, and shrink them to the half-width mask. It does this in a loop
+ // so it will reduce the size of the mask to the minimal width mask which
+ // performs an equivalent shuffle.
+ while (Mask.size() > 1) {
+ SmallVector<int, 64> WidenedMask;
+ if (!canWidenShuffleElements(Mask, WidenedMask))
+ break;
+ Mask = std::move(WidenedMask);
+ }
+
+ // Canonicalization of binary shuffle masks to improve pattern matching by
+ // commuting the inputs.
+ if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
+ ShuffleVectorSDNode::commuteMask(Mask);
+ std::swap(Ops[0], Ops[1]);
+ }
+
+ // Finally, try to combine into a single shuffle instruction.
+ return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
+ AllowVariableMask, DAG, Subtarget);
+ }
+
+ // If that failed and any input is extracted then try to combine as a
+ // shuffle with the larger type.
+ return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,
+ HasVariableMask, AllowVariableMask,
+ DAG, Subtarget);
+}
+
+/// Helper entry wrapper to combineX86ShufflesRecursively.
+static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0,
+ X86::MaxShuffleCombineDepth,
+ /*HasVarMask*/ false,
+ /*AllowVarMask*/ true, DAG, Subtarget);
+}
+
+/// Get the PSHUF-style mask from PSHUF node.
+///
+/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
+/// PSHUF-style masks that can be reused with such instructions.
+static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
+ MVT VT = N.getSimpleValueType();
+ SmallVector<int, 4> Mask;
+ SmallVector<SDValue, 2> Ops;
+ bool IsUnary;
+ bool HaveMask =
+ getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
+ (void)HaveMask;
+ assert(HaveMask);
+
+  // If we have more than 128 bits, only the low 128 bits of the shuffle mask
+ // matter. Check that the upper masks are repeats and remove them.
+ if (VT.getSizeInBits() > 128) {
+ int LaneElts = 128 / VT.getScalarSizeInBits();
+#ifndef NDEBUG
+ for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
+ for (int j = 0; j < LaneElts; ++j)
+ assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
+ "Mask doesn't repeat in high 128-bit lanes!");
+#endif
+ Mask.resize(LaneElts);
+ }
+
+ switch (N.getOpcode()) {
+ case X86ISD::PSHUFD:
+ return Mask;
+ case X86ISD::PSHUFLW:
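+    // PSHUFLW only shuffles the low 4 words; drop the pass-through upper half.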
+ Mask.resize(4);
+ return Mask;
+ case X86ISD::PSHUFHW:
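+    // PSHUFHW only shuffles the high 4 words; rebase their indices to 0-3.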
+ Mask.erase(Mask.begin(), Mask.begin() + 4);
+ for (int &M : Mask)
+ M -= 4;
+ return Mask;
+ default:
+ llvm_unreachable("No valid shuffle instruction found!");
+ }
+}
+
+/// Search for a combinable shuffle across a chain ending in pshufd.
+///
+/// We walk up the chain and look for a combinable shuffle, skipping over
+/// shuffles that we could hoist this shuffle's transformation past without
+/// altering anything.
+static SDValue
+combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(N.getOpcode() == X86ISD::PSHUFD &&
+ "Called with something other than an x86 128-bit half shuffle!");
+ SDLoc DL(N);
+
+ // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
+ // of the shuffles in the chain so that we can form a fresh chain to replace
+ // this one.
+ SmallVector<SDValue, 8> Chain;
+ SDValue V = N.getOperand(0);
+ for (; V.hasOneUse(); V = V.getOperand(0)) {
+ switch (V.getOpcode()) {
+ default:
+ return SDValue(); // Nothing combined!
+
+ case ISD::BITCAST:
+ // Skip bitcasts as we always know the type for the target specific
+ // instructions.
+ continue;
+
+ case X86ISD::PSHUFD:
+ // Found another dword shuffle.
+ break;
+
+ case X86ISD::PSHUFLW:
+ // Check that the low words (being shuffled) are the identity in the
+ // dword shuffle, and the high words are self-contained.
+ if (Mask[0] != 0 || Mask[1] != 1 ||
+ !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
+ return SDValue();
+
+ Chain.push_back(V);
+ continue;
+
+ case X86ISD::PSHUFHW:
+ // Check that the high words (being shuffled) are the identity in the
+ // dword shuffle, and the low words are self-contained.
+ if (Mask[2] != 2 || Mask[3] != 3 ||
+ !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
+ return SDValue();
+
+ Chain.push_back(V);
+ continue;
+
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+ // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
+ // shuffle into a preceding word shuffle.
+ if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
+ V.getSimpleValueType().getVectorElementType() != MVT::i16)
+ return SDValue();
+
+ // Search for a half-shuffle which we can combine with.
+ unsigned CombineOp =
+ V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
+ if (V.getOperand(0) != V.getOperand(1) ||
+ !V->isOnlyUserOf(V.getOperand(0).getNode()))
+ return SDValue();
+ Chain.push_back(V);
+ V = V.getOperand(0);
+ do {
+ switch (V.getOpcode()) {
+ default:
+ return SDValue(); // Nothing to combine.
+
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ if (V.getOpcode() == CombineOp)
+ break;
+
+ Chain.push_back(V);
+
+ LLVM_FALLTHROUGH;
+ case ISD::BITCAST:
+ V = V.getOperand(0);
+ continue;
+ }
+ break;
+ } while (V.hasOneUse());
+ break;
+ }
+ // Break out of the loop if we break out of the switch.
+ break;
+ }
+
+ if (!V.hasOneUse())
+ // We fell out of the loop without finding a viable combining instruction.
+ return SDValue();
+
+ // Merge this node's mask and our incoming mask.
+ SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+ for (int &M : Mask)
+ M = VMask[M];
+ V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+
+ // Rebuild the chain around this new shuffle.
+ while (!Chain.empty()) {
+ SDValue W = Chain.pop_back_val();
+
+ if (V.getValueType() != W.getOperand(0).getValueType())
+ V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
+
+ switch (W.getOpcode()) {
+ default:
+ llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
+
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+ V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
+ break;
+
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
+ break;
+ }
+ }
+ if (V.getValueType() != N.getValueType())
+ V = DAG.getBitcast(N.getValueType(), V);
+
+ // Return the new chain to replace N.
+ return V;
+}
+
+// Attempt to commute shufps LHS loads:
+// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
+static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
+ SelectionDAG &DAG) {
+ // TODO: Add vXf64 support.
+ if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
+ return SDValue();
+
+ // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
+ auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
+ if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
+ return SDValue();
+ SDValue N0 = V.getOperand(0);
+ SDValue N1 = V.getOperand(1);
+ unsigned Imm = V.getConstantOperandVal(2);
+ if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
+ MayFoldLoad(peekThroughOneUseBitcasts(N1)))
+ return SDValue();
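+    // Commuting the sources swaps which nibble of the immediate indexes each
+    // operand; callers compensate for the resulting lo/hi half swap.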
+ Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
+ DAG.getTargetConstant(Imm, DL, MVT::i8));
+ };
+
+ switch (N.getOpcode()) {
+ case X86ISD::VPERMILPI:
+ if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
+ unsigned Imm = N.getConstantOperandVal(1);
+ return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
+ DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
+ }
+ break;
+ case X86ISD::SHUFP: {
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+ unsigned Imm = N.getConstantOperandVal(2);
+ if (N0 == N1) {
+ if (SDValue NewSHUFP = commuteSHUFP(N, N0))
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
+ DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
+ } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
+ DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
+ } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
+ DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
+ }
+ break;
+ }
+ }
+
+ return SDValue();
+}
+
+/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
+static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
+ SelectionDAG &DAG,
+ const SDLoc &DL) {
+ assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
+
+ MVT VT = V.getSimpleValueType();
+ SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
+ SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
+ unsigned SrcOpc0 = Src0.getOpcode();
+ unsigned SrcOpc1 = Src1.getOpcode();
+ EVT SrcVT0 = Src0.getValueType();
+ EVT SrcVT1 = Src1.getValueType();
+
+ if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
+ return SDValue();
+
+ switch (SrcOpc0) {
+ case X86ISD::MOVDDUP: {
+ SDValue LHS = DAG.getBitcast(VT, Src0.getOperand(0));
+ SDValue RHS =
+ DAG.getBitcast(VT, Src1.isUndef() ? Src1 : Src1.getOperand(0));
+ SDValue Res =
+ DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, RHS, V.getOperand(2));
+ Res = DAG.getNode(SrcOpc0, DL, SrcVT0, DAG.getBitcast(SrcVT0, Res));
+ return DAG.getBitcast(VT, Res);
+ }
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI:
+ case X86ISD::VSRAI:
+ case X86ISD::PSHUFD:
+ case X86ISD::VPERMILPI:
+ if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
+ SDValue LHS = DAG.getBitcast(VT, Src0.getOperand(0));
+ SDValue RHS =
+ DAG.getBitcast(VT, Src1.isUndef() ? Src1 : Src1.getOperand(0));
+ SDValue Res =
+ DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, RHS, V.getOperand(2));
+ Res = DAG.getNode(SrcOpc0, DL, SrcVT0, DAG.getBitcast(SrcVT0, Res),
+ Src0.getOperand(1));
+ return DAG.getBitcast(VT, Res);
+ }
+ break;
+ }
+
+ return SDValue();
+}
+
+/// Try to combine x86 target specific shuffles.
+static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDLoc DL(N);
+ MVT VT = N.getSimpleValueType();
+ SmallVector<int, 4> Mask;
+ unsigned Opcode = N.getOpcode();
+
+ if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
+ return R;
+
+  // Canonicalize UNARYSHUFFLE(XOR(X,-1)) -> XOR(UNARYSHUFFLE(X),-1) to
+ // help expose the 'NOT' pattern further up the DAG.
+ // TODO: This might be beneficial for any binop with a 'splattable' operand.
+ switch (Opcode) {
+ case X86ISD::MOVDDUP:
+ case X86ISD::PSHUFD: {
+ SDValue Src = N.getOperand(0);
+ if (Src.hasOneUse() && Src.getValueType() == VT) {
+ if (SDValue Not = IsNOT(Src, DAG, /*OneUse*/ true)) {
+ Not = DAG.getBitcast(VT, Not);
+ Not = Opcode == X86ISD::MOVDDUP
+ ? DAG.getNode(Opcode, DL, VT, Not)
+ : DAG.getNode(Opcode, DL, VT, Not, N.getOperand(1));
+ EVT IntVT = Not.getValueType().changeTypeToInteger();
+ SDValue AllOnes = DAG.getConstant(-1, DL, IntVT);
+ Not = DAG.getBitcast(IntVT, Not);
+ Not = DAG.getNode(ISD::XOR, DL, IntVT, Not, AllOnes);
+ return DAG.getBitcast(VT, Not);
+ }
+ }
+ break;
+ }
+ }
+
+ // Handle specific target shuffles.
+ switch (Opcode) {
+ case X86ISD::MOVDDUP: {
+ SDValue Src = N.getOperand(0);
+ // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
+ if (VT == MVT::v2f64 && Src.hasOneUse() &&
+ ISD::isNormalLoad(Src.getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(Src);
+ if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
+ SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
+ DCI.CombineTo(N.getNode(), Movddup);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ return SDValue();
+ }
+ case X86ISD::VBROADCAST: {
+ SDValue Src = N.getOperand(0);
+ SDValue BC = peekThroughBitcasts(Src);
+ EVT SrcVT = Src.getValueType();
+ EVT BCVT = BC.getValueType();
+
+ // If broadcasting from another shuffle, attempt to simplify it.
+ // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
+ if (isTargetShuffle(BC.getOpcode()) &&
+ VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
+ unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
+ SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
+ SM_SentinelUndef);
+ for (unsigned i = 0; i != Scale; ++i)
+ DemandedMask[i] = i;
+ if (SDValue Res = combineX86ShufflesRecursively(
+ {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
+ X86::MaxShuffleCombineDepth,
+ /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
+ DAG.getBitcast(SrcVT, Res));
+ }
+
+ // broadcast(bitcast(src)) -> bitcast(broadcast(src))
+ // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
+ if (Src.getOpcode() == ISD::BITCAST &&
+ SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
+ DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) {
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
+ VT.getVectorNumElements());
+ return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
+ }
+
+ // Reduce broadcast source vector to lowest 128-bits.
+ if (SrcVT.getSizeInBits() > 128)
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
+ extract128BitVector(Src, 0, DAG, DL));
+
+ // broadcast(scalar_to_vector(x)) -> broadcast(x).
+ if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
+
+ // Share broadcast with the longest vector and extract low subvector (free).
+ // Ensure the same SDValue from the SDNode use is being used.
+ for (SDNode *User : Src->uses())
+ if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
+ Src == User->getOperand(0) &&
+ User->getValueSizeInBits(0).getFixedSize() >
+ VT.getFixedSizeInBits()) {
+ return extractSubVector(SDValue(User, 0), 0, DAG, DL,
+ VT.getSizeInBits());
+ }
+
+ // vbroadcast(scalarload X) -> vbroadcast_load X
+ // For float loads, extract other uses of the scalar from the broadcast.
+ if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
+ ISD::isNormalLoad(Src.getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(Src);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ // If the load value is used only by N, replace it via CombineTo N.
+ bool NoReplaceExtract = Src.hasOneUse();
+ DCI.CombineTo(N.getNode(), BcastLd);
+ if (NoReplaceExtract) {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ } else {
+ SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
+ DAG.getIntPtrConstant(0, DL));
+ DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
+ }
+ return N; // Return N so it doesn't get rechecked!
+ }
+
+ // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
+ // i16. So shrink it ourselves if we can make a broadcast_load.
+ if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
+ Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
+ assert(Subtarget.hasAVX2() && "Expected AVX2");
+ SDValue TruncIn = Src.getOperand(0);
+
+      // If this is a truncate of a non-extending load, we can just narrow it
+      // to use a broadcast_load.
+ if (ISD::isNormalLoad(TruncIn.getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
+        // Unless it's volatile or atomic.
+ if (LN->isSimple()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
+ LN->getPointerInfo(), LN->getOriginalAlign(),
+ LN->getMemOperand()->getFlags());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(Src.getNode());
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // If this is a truncate of an i16 extload, we can directly replace it.
+ if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
+ ISD::isEXTLoad(Src.getOperand(0).getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
+ if (LN->getMemoryVT().getSizeInBits() == 16) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(Src.getNode());
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // If this is a truncate of a load that has been shifted right, we can
+ // offset the pointer and use a narrower load.
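+ // e.g. broadcasting trunc(srl(i32 load, 16)) as i16 becomes an i16
+ // broadcast_load from the original pointer plus 2 bytes (x86 is
+ // little-endian).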
+ if (TruncIn.getOpcode() == ISD::SRL &&
+ TruncIn.getOperand(0).hasOneUse() &&
+ isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
+ ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
+ unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
+ // Make sure the shift amount and the load size are divisible by 16.
+ // Don't do this if the load is volatile or atomic.
+ if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
+ LN->isSimple()) {
+ unsigned Offset = ShiftAmt / 8;
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
+ TypeSize::Fixed(Offset), DL);
+ SDValue Ops[] = { LN->getChain(), Ptr };
+ SDValue BcastLd = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
+ LN->getPointerInfo().getWithOffset(Offset),
+ LN->getOriginalAlign(),
+ LN->getMemOperand()->getFlags());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(Src.getNode());
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+ }
+
+ // vbroadcast(vzload X) -> vbroadcast_load X
+ if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
+ MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
+ if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // vbroadcast(vector load X) -> vbroadcast_load
+ if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
+ SrcVT == MVT::v4i32) &&
+ Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(Src);
+ // Unless the load is volatile or atomic.
+ if (LN->isSimple()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
+ SDValue BcastLd = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
+ LN->getPointerInfo(), LN->getOriginalAlign(),
+ LN->getMemOperand()->getFlags());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ return SDValue();
+ }
+ case X86ISD::VZEXT_MOVL: {
+ SDValue N0 = N.getOperand(0);
+
+ // If this is a vzmovl of a full vector load, replace it with a vzload, unless
+ // the load is volatile.
+ if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
+ auto *LN = cast<LoadSDNode>(N0);
+ if (SDValue VZLoad =
+ narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
+ DCI.CombineTo(N.getNode(), VZLoad);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N;
+ }
+ }
+
+ // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
+ // and can just use a VZEXT_LOAD.
+ // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
+ if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
+ auto *LN = cast<MemSDNode>(N0);
+ if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
+ SDValue VZLoad =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DCI.CombineTo(N.getNode(), VZLoad);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N;
+ }
+ }
+
+ // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
+ // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
+ // if the upper bits of the i64 are zero.
+ if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ N0.getOperand(0).hasOneUse() &&
+ N0.getOperand(0).getValueType() == MVT::i64) {
+ SDValue In = N0.getOperand(0);
+ APInt Mask = APInt::getHighBitsSet(64, 32);
+ if (DAG.MaskedValueIsZero(In, Mask)) {
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
+ MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
+ SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
+ SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
+ return DAG.getBitcast(VT, Movl);
+ }
+ }
+
+ // Load a scalar integer constant directly to XMM instead of transferring an
+ // immediate value from GPR.
+ // vzext_movl (scalar_to_vector C) --> load [C,0...]
+ if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
+ // Create a vector constant - scalar constant followed by zeros.
+ EVT ScalarVT = N0.getOperand(0).getValueType();
+ Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
+ unsigned NumElts = VT.getVectorNumElements();
+ Constant *Zero = ConstantInt::getNullValue(ScalarTy);
+ SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
+ ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
+
+ // Load the vector constant from constant pool.
+ MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
+ Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
+ return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
+ MachineMemOperand::MOLoad);
+ }
+ }
+
+ // Pull subvector inserts into undef through VZEXT_MOVL by making it an
+ // insert into a zero vector. This helps get VZEXT_MOVL closer to
+ // scalar_to_vectors where 256/512 are canonicalized to an insert and a
+ // 128-bit scalar_to_vector. This reduces the number of isel patterns.
+ if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
+ SDValue V = peekThroughOneUseBitcasts(N0);
+
+ if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
+ isNullConstant(V.getOperand(2))) {
+ SDValue In = V.getOperand(1);
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
+ In.getValueSizeInBits() /
+ VT.getScalarSizeInBits());
+ In = DAG.getBitcast(SubVT, In);
+ SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ getZeroVector(VT, Subtarget, DAG, DL), Movl,
+ V.getOperand(2));
+ }
+ }
+
+ return SDValue();
+ }
+ case X86ISD::BLENDI: {
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+
+ // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
+ // TODO: Handle MVT::v16i16 repeated blend mask.
+ if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
+ N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
+ MVT SrcVT = N0.getOperand(0).getSimpleValueType();
+ if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
+ SrcVT.getScalarSizeInBits() >= 32) {
+ unsigned BlendMask = N.getConstantOperandVal(2);
+ unsigned Size = VT.getVectorNumElements();
+ unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
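+ // Each bit of the blend mask is repeated Scale times so the same lanes are
+ // selected in the narrower domain, e.g. a v4i64 mask of 0b0101 becomes
+ // 0b00110011 for v8i32.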
+ BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
+ N1.getOperand(0),
+ DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
+ }
+ }
+ return SDValue();
+ }
+ case X86ISD::VPERMI: {
+ // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
+ // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ if (N0.getOpcode() == ISD::BITCAST &&
+ N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
+ SDValue Src = N0.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
+ return DAG.getBitcast(VT, Res);
+ }
+ return SDValue();
+ }
+ case X86ISD::VPERM2X128: {
+ // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ if (LHS.getOpcode() == ISD::BITCAST &&
+ (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
+ EVT SrcVT = LHS.getOperand(0).getValueType();
+ if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
+ return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
+ DAG.getBitcast(SrcVT, LHS),
+ DAG.getBitcast(SrcVT, RHS),
+ N->getOperand(2)));
+ }
+ }
+
+ // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
+ if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
+ return Res;
+
+ // Fold vperm2x128 subvector shuffle with an inner concat pattern.
+ // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
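+ // Each nibble of the immediate selects one of the four 128-bit halves of
+ // the two sources (0-3) for the corresponding result lane; selectors above
+ // 3 carry the zeroing bit and are rejected below.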
+ auto FindSubVector128 = [&](unsigned Idx) {
+ if (Idx > 3)
+ return SDValue();
+ SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
+ SmallVector<SDValue> SubOps;
+ if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2)
+ return SubOps[Idx & 1];
+ unsigned NumElts = Src.getValueType().getVectorNumElements();
+ if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Src.getOperand(1).getValueSizeInBits() == 128 &&
+ Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
+ return Src.getOperand(1);
+ }
+ return SDValue();
+ };
+ unsigned Imm = N.getConstantOperandVal(2);
+ if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
+ if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
+ MVT SubVT = VT.getHalfNumVectorElementsVT();
+ SubLo = DAG.getBitcast(SubVT, SubLo);
+ SubHi = DAG.getBitcast(SubVT, SubHi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
+ }
+ }
+ return SDValue();
+ }
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ Mask = getPSHUFShuffleMask(N);
+ assert(Mask.size() == 4);
+ break;
+ case X86ISD::MOVSD:
+ case X86ISD::MOVSS: {
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+
+ // Canonicalize scalar FPOps:
+ // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
+ // If commutable, allow OP(N1[0], N0[0]).
+ unsigned Opcode1 = N1.getOpcode();
+ if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
+ Opcode1 == ISD::FDIV) {
+ SDValue N10 = N1.getOperand(0);
+ SDValue N11 = N1.getOperand(1);
+ if (N10 == N0 ||
+ (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
+ if (N10 != N0)
+ std::swap(N10, N11);
+ MVT SVT = VT.getVectorElementType();
+ SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
+ N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
+ N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
+ SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
+ SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
+ return DAG.getNode(Opcode, DL, VT, N0, SclVec);
+ }
+ }
+
+ return SDValue();
+ }
+ case X86ISD::INSERTPS: {
+ assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
+ SDValue Op0 = N.getOperand(0);
+ SDValue Op1 = N.getOperand(1);
+ unsigned InsertPSMask = N.getConstantOperandVal(2);
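+ // imm8 layout: [7:6] = source lane, [5:4] = destination lane, [3:0] = zero
+ // mask. e.g. 0x1D inserts Op1[0] into lane 1 and zeroes lanes 0, 2 and 3.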
+ unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
+ unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
+ unsigned ZeroMask = InsertPSMask & 0xF;
+
+ // If we zero out all elements from Op0 then we don't need to reference it.
+ if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
+ DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
+
+ // If we zero out the element from Op1 then we don't need to reference it.
+ if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
+ DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
+
+ // Attempt to merge insertps Op1 with an inner target shuffle node.
+ SmallVector<int, 8> TargetMask1;
+ SmallVector<SDValue, 2> Ops1;
+ APInt KnownUndef1, KnownZero1;
+ if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
+ KnownZero1)) {
+ if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
+ // Zero/UNDEF insertion - zero out element and remove dependency.
+ InsertPSMask |= (1u << DstIdx);
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
+ DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
+ }
+ // Update insertps mask srcidx and reference the source input directly.
+ int M = TargetMask1[SrcIdx];
+ assert(0 <= M && M < 8 && "Shuffle index out of range");
+ InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
+ Op1 = Ops1[M < 4 ? 0 : 1];
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
+ DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
+ }
+
+ // Attempt to merge insertps Op0 with an inner target shuffle node.
+ SmallVector<int, 8> TargetMask0;
+ SmallVector<SDValue, 2> Ops0;
+ APInt KnownUndef0, KnownZero0;
+ if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
+ KnownZero0)) {
+ bool Updated = false;
+ bool UseInput00 = false;
+ bool UseInput01 = false;
+ for (int i = 0; i != 4; ++i) {
+ if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
+ // No change if element is already zero or the inserted element.
+ continue;
+ } else if (KnownUndef0[i] || KnownZero0[i]) {
+ // If the target mask is undef/zero then we must zero the element.
+ InsertPSMask |= (1u << i);
+ Updated = true;
+ continue;
+ }
+
+ // The input vector element must come from the same lane of either shuffle input.
+ int M = TargetMask0[i];
+ if (M != i && M != (i + 4))
+ return SDValue();
+
+ // Determine which inputs of the target shuffle we're using.
+ UseInput00 |= (0 <= M && M < 4);
+ UseInput01 |= (4 <= M);
+ }
+
+ // If we're not using both inputs of the target shuffle then use the
+ // referenced input directly.
+ if (UseInput00 && !UseInput01) {
+ Updated = true;
+ Op0 = Ops0[0];
+ } else if (!UseInput00 && UseInput01) {
+ Updated = true;
+ Op0 = Ops0[1];
+ }
+
+ if (Updated)
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
+ DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
+ }
+
+ // If we're inserting an element from a vbroadcast load, fold the
+ // load into the X86insertps instruction. We need to convert the scalar
+ // load to a vector and clear the source lane of the INSERTPS control.
+ if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
+ if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
+ SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
+ MemIntr->getBasePtr(),
+ MemIntr->getMemOperand());
+ SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
+ Load),
+ DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
+ return Insert;
+ }
+ }
+
+ return SDValue();
+ }
+ default:
+ return SDValue();
+ }
+
+ // Nuke no-op shuffles that show up after combining.
+ if (isNoopShuffleMask(Mask))
+ return N.getOperand(0);
+
+ // Look for simplifications involving one or two shuffle instructions.
+ SDValue V = N.getOperand(0);
+ switch (N.getOpcode()) {
+ default:
+ break;
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
+
+ // See if this reduces to a PSHUFD which is no more expensive and can
+ // combine with more operations. Note that it has to at least flip the
+ // dwords as otherwise it would have been removed as a no-op.
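+ // e.g. PSHUFLW {2,3,0,1} swaps the two low dwords, which is exactly
+ // PSHUFD {1,0,2,3} on the v4i32 bitcast of the same vector.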
+ if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
+ int DMask[] = {0, 1, 2, 3};
+ int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
+ DMask[DOffset + 0] = DOffset + 1;
+ DMask[DOffset + 1] = DOffset + 0;
+ MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+ V = DAG.getBitcast(DVT, V);
+ V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
+ getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
+ return DAG.getBitcast(VT, V);
+ }
+
+ // Look for shuffle patterns which can be implemented as a single unpack.
+ // FIXME: This doesn't handle the location of the PSHUFD generically, and
+ // only works when we have a PSHUFD followed by two half-shuffles.
+ if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
+ (V.getOpcode() == X86ISD::PSHUFLW ||
+ V.getOpcode() == X86ISD::PSHUFHW) &&
+ V.getOpcode() != N.getOpcode() &&
+ V.hasOneUse() && V.getOperand(0).hasOneUse()) {
+ SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
+ if (D.getOpcode() == X86ISD::PSHUFD) {
+ SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+ SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
+ int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+ int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+ int WordMask[8];
+ for (int i = 0; i < 4; ++i) {
+ WordMask[i + NOffset] = Mask[i] + NOffset;
+ WordMask[i + VOffset] = VMask[i] + VOffset;
+ }
+ // Map the word mask through the DWord mask.
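+ // Word i of the combined shuffle lives in dword WordMask[i]/2, which the
+ // PSHUFD took from dword DMask[WordMask[i]/2], so the original word index
+ // is 2 * DMask[WordMask[i]/2] + (WordMask[i] & 1).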
+ int MappedMask[8];
+ for (int i = 0; i < 8; ++i)
+ MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
+ if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
+ makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
+ // We can replace all three shuffles with an unpack.
+ V = DAG.getBitcast(VT, D.getOperand(0));
+ return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
+ : X86ISD::UNPCKH,
+ DL, VT, V, V);
+ }
+ }
+ }
+
+ break;
+
+ case X86ISD::PSHUFD:
+ if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
+ return NewN;
+
+ break;
+ }
+
+ return SDValue();
+}
+
+/// Checks if the shuffle mask takes subsequent elements
+/// alternately from two vectors.
+/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
+static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
+
+ int ParitySrc[2] = {-1, -1};
+ unsigned Size = Mask.size();
+ for (unsigned i = 0; i != Size; ++i) {
+ int M = Mask[i];
+ if (M < 0)
+ continue;
+
+ // Make sure we are using the matching element from the input.
+ if ((M % Size) != i)
+ return false;
+
+ // Make sure we use the same input for all elements of the same parity.
+ int Src = M / Size;
+ if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
+ return false;
+ ParitySrc[i % 2] = Src;
+ }
+
+ // Make sure each input is used.
+ if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
+ return false;
+
+ Op0Even = ParitySrc[0] == 0;
+ return true;
+}
+
+/// Returns true iff the shuffle node \p N can be replaced with an
+/// ADDSUB(SUBADD) operation. If true is returned then the operands of the
+/// ADDSUB(SUBADD) operation are written to the parameters \p Opnd0 and \p Opnd1.
+///
+/// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector
+/// shuffle nodes so it is easier to generically match. We also insert dummy
+/// vector shuffle nodes for the operands which explicitly discard the unused
+/// lanes, so that the fact that they are unused can flow through the rest of
+/// the combiner.
+static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
+ bool &IsSubAdd) {
+
+ EVT VT = N->getValueType(0);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
+ !VT.getSimpleVT().isFloatingPoint())
+ return false;
+
+ // We only handle target-independent shuffles.
+ // FIXME: It would be easy and harmless to use the target shuffle mask
+ // extraction tool to support more.
+ if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
+ return false;
+
+ SDValue V1 = N->getOperand(0);
+ SDValue V2 = N->getOperand(1);
+
+ // Make sure we have an FADD and an FSUB.
+ if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
+ (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
+ V1.getOpcode() == V2.getOpcode())
+ return false;
+
+ // If there are other uses of these operations we can't fold them.
+ if (!V1->hasOneUse() || !V2->hasOneUse())
+ return false;
+
+ // Ensure that both operations have the same operands. Note that we can
+ // commute the FADD operands.
+ SDValue LHS, RHS;
+ if (V1.getOpcode() == ISD::FSUB) {
+ LHS = V1->getOperand(0); RHS = V1->getOperand(1);
+ if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
+ (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
+ return false;
+ } else {
+ assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
+ LHS = V2->getOperand(0); RHS = V2->getOperand(1);
+ if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
+ (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
+ return false;
+ }
+
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
+ bool Op0Even;
+ if (!isAddSubOrSubAddMask(Mask, Op0Even))
+ return false;
+
+ // It's a subadd if the vector in the even parity is an FADD.
+ IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
+ : V2->getOpcode() == ISD::FADD;
+
+ Opnd0 = LHS;
+ Opnd1 = RHS;
+ return true;
+}
+
+/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
+static SDValue combineShuffleToFMAddSub(SDNode *N,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ // We only handle target-independent shuffles.
+ // FIXME: It would be easy and harmless to use the target shuffle mask
+ // extraction tool to support more.
+ if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
+ return SDValue();
+
+ MVT VT = N->getSimpleValueType(0);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
+ return SDValue();
+
+ // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ SDValue FMAdd = Op0, FMSub = Op1;
+ if (FMSub.getOpcode() != X86ISD::FMSUB)
+ std::swap(FMAdd, FMSub);
+
+ if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
+ FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
+ FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
+ FMAdd.getOperand(2) != FMSub.getOperand(2))
+ return SDValue();
+
+ // Check for correct shuffle mask.
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
+ bool Op0Even;
+ if (!isAddSubOrSubAddMask(Mask, Op0Even))
+ return SDValue();
+
+ // FMAddSub takes zeroth operand from FMSub node.
+ SDLoc DL(N);
+ bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
+ unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
+ return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
+ FMAdd.getOperand(2));
+}
+
+/// Try to combine a shuffle into a target-specific add-sub or
+/// mul-add-sub node.
+static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
+ return V;
+
+ SDValue Opnd0, Opnd1;
+ bool IsSubAdd;
+ if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
+ return SDValue();
+
+ MVT VT = N->getSimpleValueType(0);
+ SDLoc DL(N);
+
+ // Try to generate X86ISD::FMADDSUB node here.
+ SDValue Opnd2;
+ if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
+ unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
+ return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
+ }
+
+ if (IsSubAdd)
+ return SDValue();
+
+ // Do not generate X86ISD::ADDSUB node for 512-bit types even though
+ // the ADDSUB idiom has been successfully recognized. There are no known
+ // X86 targets with 512-bit ADDSUB instructions!
+ if (VT.is512BitVector())
+ return SDValue();
+
+ return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
+}
+
+// We are looking for a shuffle where both sources are concatenated with undef
+// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
+// if we can express this as a single-source shuffle, that's preferable.
+static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
+ if (!VT.is128BitVector() && !VT.is256BitVector())
+ return SDValue();
+
+ if (VT.getVectorElementType() != MVT::i32 &&
+ VT.getVectorElementType() != MVT::i64 &&
+ VT.getVectorElementType() != MVT::f32 &&
+ VT.getVectorElementType() != MVT::f64)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Check that both sources are concats with undef.
+ if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
+ N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
+ N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
+ !N1.getOperand(1).isUndef())
+ return SDValue();
+
+ // Construct the new shuffle mask. Elements from the first source retain their
+ // index, but elements from the second source no longer need to skip an undef.
+ SmallVector<int, 8> Mask;
+ int NumElts = VT.getVectorNumElements();
+
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ for (int Elt : SVOp->getMask())
+ Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
+
+ SDLoc DL(N);
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
+ N1.getOperand(0));
+ return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
+}
+
+/// Eliminate a redundant shuffle of a horizontal math op.
+static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) {
+ // TODO: Can we use getTargetShuffleInputs instead?
+ unsigned Opcode = N->getOpcode();
+ if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST)
+ if (Opcode != X86ISD::UNPCKL && Opcode != X86ISD::UNPCKH)
+ if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
+ return SDValue();
+
+ // For a broadcast, peek through an extract element of index 0 to find the
+ // horizontal op: broadcast (ext_vec_elt HOp, 0)
+ EVT VT = N->getValueType(0);
+ if (Opcode == X86ISD::VBROADCAST) {
+ SDValue SrcOp = N->getOperand(0);
+ if (SrcOp.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ SrcOp.getValueType() == MVT::f64 &&
+ SrcOp.getOperand(0).getValueType() == VT &&
+ isNullConstant(SrcOp.getOperand(1)))
+ N = SrcOp.getNode();
+ }
+
+ SDValue HOp = N->getOperand(0);
+ if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
+ HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
+ return SDValue();
+
+ // unpcklo(hop(x,y),hop(z,w)) -> permute(hop(x,z)).
+ // unpckhi(hop(x,y),hop(z,w)) -> permute(hop(y,w)).
+ // Don't fold if hop(x,y) == hop(z,w).
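+ // For v4f32: unpcklo gives [x0+x1, z0+z1, x2+x3, z2+z3], which is
+ // hadd(x,z) = [x0+x1, x2+x3, z0+z1, z2+z3] permuted with {0,2,1,3}.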
+ if (Opcode == X86ISD::UNPCKL || Opcode == X86ISD::UNPCKH) {
+ SDValue HOp2 = N->getOperand(1);
+ if (HOp.getOpcode() != HOp2.getOpcode() || VT.getScalarSizeInBits() != 32)
+ return SDValue();
+ if (HOp == HOp2)
+ return SDValue();
+ SDLoc DL(HOp);
+ unsigned LoHi = Opcode == X86ISD::UNPCKL ? 0 : 1;
+ SDValue Res = DAG.getNode(HOp.getOpcode(), DL, VT, HOp.getOperand(LoHi),
+ HOp2.getOperand(LoHi));
+ // Use SHUFPS for the permute so this will work on SSE3 targets; shuffle
+ // combining and domain handling will simplify this later on.
+ EVT ShuffleVT = VT.changeVectorElementType(MVT::f32);
+ Res = DAG.getBitcast(ShuffleVT, Res);
+ Res = DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
+ getV4X86ShuffleImm8ForMask({0, 2, 1, 3}, DL, DAG));
+ return DAG.getBitcast(VT, Res);
+ }
+
+ // 128-bit horizontal math instructions are defined to operate on adjacent
+ // lanes of each operand as:
+ // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
+ // ...similarly for v2f64 and v8i16.
+ if (!HOp.getOperand(0).isUndef() && !HOp.getOperand(1).isUndef() &&
+ HOp.getOperand(0) != HOp.getOperand(1))
+ return SDValue();
+
+ // The shuffle that we are eliminating may have allowed the horizontal op to
+ // have an undemanded (undefined) operand. Duplicate the other (defined)
+ // operand to ensure that the results are defined across all lanes without the
+ // shuffle.
+ auto updateHOp = [](SDValue HorizOp, SelectionDAG &DAG) {
+ SDValue X;
+ if (HorizOp.getOperand(0).isUndef()) {
+ assert(!HorizOp.getOperand(1).isUndef() && "Not expecting foldable h-op");
+ X = HorizOp.getOperand(1);
+ } else if (HorizOp.getOperand(1).isUndef()) {
+ assert(!HorizOp.getOperand(0).isUndef() && "Not expecting foldable h-op");
+ X = HorizOp.getOperand(0);
+ } else {
+ return HorizOp;
+ }
+ return DAG.getNode(HorizOp.getOpcode(), SDLoc(HorizOp),
+ HorizOp.getValueType(), X, X);
+ };
+
+ // When the operands of a horizontal math op are identical, the low half of
+ // the result is the same as the high half. If a target shuffle is also
+ // replicating the low and high halves (without changing the type/length of
+ // the vector), we don't need the shuffle.
+ if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) {
+ if (HOp.getScalarValueSizeInBits() == 64 && HOp.getValueType() == VT) {
+ // movddup (hadd X, X) --> hadd X, X
+ // broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X
+ assert((HOp.getValueType() == MVT::v2f64 ||
+ HOp.getValueType() == MVT::v4f64) && "Unexpected type for h-op");
+ return updateHOp(HOp, DAG);
+ }
+ return SDValue();
+ }
+
+ // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
+
+ // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
+ // but this should be tied to whatever horizontal op matching and shuffle
+ // canonicalization are producing.
+ if (HOp.getValueSizeInBits() == 128 &&
+ (isShuffleEquivalent(Mask, {0, 0}) ||
+ isShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
+ isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
+ return updateHOp(HOp, DAG);
+
+ if (HOp.getValueSizeInBits() == 256 &&
+ (isShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
+ isShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
+ isShuffleEquivalent(
+ Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))
+ return updateHOp(HOp, DAG);
+
+ return SDValue();
+}
+
+/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
+/// low half of each source vector and does not set any high half elements in
+/// the destination vector, narrow the shuffle to half its original size.
+static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
+ if (!Shuf->getValueType(0).isSimple())
+ return SDValue();
+ MVT VT = Shuf->getSimpleValueType(0);
+ if (!VT.is256BitVector() && !VT.is512BitVector())
+ return SDValue();
+
+ // See if we can ignore all of the high elements of the shuffle.
+ ArrayRef<int> Mask = Shuf->getMask();
+ if (!isUndefUpperHalf(Mask))
+ return SDValue();
+
+ // Check if the shuffle mask accesses only the low half of each input vector
+ // (half-index output is 0 or 2).
+ int HalfIdx1, HalfIdx2;
+ SmallVector<int, 8> HalfMask(Mask.size() / 2);
+ if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
+ (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
+ return SDValue();
+
+ // Create a half-width shuffle to replace the unnecessarily wide shuffle.
+ // The trick is knowing that all of the insert/extract are actually free
+ // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
+ // of narrow inputs into a narrow output, and that is always cheaper than
+ // the wide shuffle that we started with.
+ return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
+ Shuf->getOperand(1), HalfMask, HalfIdx1,
+ HalfIdx2, false, DAG, /*UseConcat*/true);
+}
+
+static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
+ if (SDValue V = narrowShuffle(Shuf, DAG))
+ return V;
+
+ // If we have legalized the vector types, look for blends of FADD and FSUB
+ // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.isTypeLegal(VT)) {
+ if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
+ return AddSub;
+
+ if (SDValue HAddSub = foldShuffleOfHorizOp(N, DAG))
+ return HAddSub;
+
+ // Merge shuffles through binops if it's likely we'll be able to merge them
+ // with other shuffles (as long as they aren't splats).
+ // shuffle(bop(shuffle(x,y),shuffle(z,w)),bop(shuffle(a,b),shuffle(c,d)))
+ // TODO: We might be able to move this to DAGCombiner::visitVECTOR_SHUFFLE.
+ if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N)) {
+ unsigned SrcOpcode = N->getOperand(0).getOpcode();
+ if (SrcOpcode == N->getOperand(1).getOpcode() && TLI.isBinOp(SrcOpcode) &&
+ N->isOnlyUserOf(N->getOperand(0).getNode()) &&
+ N->isOnlyUserOf(N->getOperand(1).getNode())) {
+ SDValue Op00 = N->getOperand(0).getOperand(0);
+ SDValue Op10 = N->getOperand(1).getOperand(0);
+ SDValue Op01 = N->getOperand(0).getOperand(1);
+ SDValue Op11 = N->getOperand(1).getOperand(1);
+ auto *SVN00 = dyn_cast<ShuffleVectorSDNode>(Op00);
+ auto *SVN10 = dyn_cast<ShuffleVectorSDNode>(Op10);
+ auto *SVN01 = dyn_cast<ShuffleVectorSDNode>(Op01);
+ auto *SVN11 = dyn_cast<ShuffleVectorSDNode>(Op11);
+ if (((SVN00 && !SVN00->isSplat()) || (SVN10 && !SVN10->isSplat())) &&
+ ((SVN01 && !SVN01->isSplat()) || (SVN11 && !SVN11->isSplat()))) {
+ SDLoc DL(N);
+ ArrayRef<int> Mask = SVN->getMask();
+ SDValue LHS = DAG.getVectorShuffle(VT, DL, Op00, Op10, Mask);
+ SDValue RHS = DAG.getVectorShuffle(VT, DL, Op01, Op11, Mask);
+ return DAG.getNode(SrcOpcode, DL, VT, LHS, RHS);
+ }
+ }
+ }
+ }
+
+ // Attempt to combine into a vector load/broadcast.
+ if (SDValue LD = combineToConsecutiveLoads(VT, SDValue(N, 0), dl, DAG,
+ Subtarget, true))
+ return LD;
+
+ // For AVX2, we sometimes want to combine
+ // (vector_shuffle <mask> (concat_vectors t1, undef)
+ // (concat_vectors t2, undef))
+ // Into:
+ // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
+ // Since the latter can be efficiently lowered with VPERMD/VPERMQ
+ if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
+ return ShufConcat;
+
+ if (isTargetShuffle(N->getOpcode())) {
+ SDValue Op(N, 0);
+ if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
+ return Shuffle;
+
+ // Try recursively combining arbitrary sequences of x86 shuffle
+ // instructions into higher-order shuffles. We do this after combining
+ // specific PSHUF instruction sequences into their minimal form so that we
+ // can evaluate how many specialized shuffle instructions are involved in
+ // a particular chain.
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
+ return Res;
+
+ // Simplify source operands based on shuffle mask.
+ // TODO - merge this into combineX86ShufflesRecursively.
+ APInt KnownUndef, KnownZero;
+ APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+ if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
+ DCI))
+ return SDValue(N, 0);
+ }
+
+ return SDValue();
+}
+
+// Simplify variable target shuffle masks based on the demanded elements.
+// TODO: Handle DemandedBits in mask indices as well?
+bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
+ SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
+ TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
+ // If we're demanding all elements, don't bother trying to simplify the mask.
+ unsigned NumElts = DemandedElts.getBitWidth();
+ if (DemandedElts.isAllOnesValue())
+ return false;
+
+ SDValue Mask = Op.getOperand(MaskIndex);
+ if (!Mask.hasOneUse())
+ return false;
+
+ // Attempt to generically simplify the variable shuffle mask.
+ APInt MaskUndef, MaskZero;
+ if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
+ Depth + 1))
+ return true;
+
+ // Attempt to extract+simplify a (constant pool load) shuffle mask.
+ // TODO: Support other types from getTargetShuffleMaskIndices?
+ SDValue BC = peekThroughOneUseBitcasts(Mask);
+ EVT BCVT = BC.getValueType();
+ auto *Load = dyn_cast<LoadSDNode>(BC);
+ if (!Load)
+ return false;
+
+ const Constant *C = getTargetConstantFromNode(Load);
+ if (!C)
+ return false;
+
+ Type *CTy = C->getType();
+ if (!CTy->isVectorTy() ||
+ CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
+ return false;
+
+ // Handle scaling for i64 elements on 32-bit targets.
+ unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
+ if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
+ return false;
+ unsigned Scale = NumCstElts / NumElts;
+
+ // Simplify mask if we have an undemanded element that is not undef.
+ bool Simplified = false;
+ SmallVector<Constant *, 32> ConstVecOps;
+ for (unsigned i = 0; i != NumCstElts; ++i) {
+ Constant *Elt = C->getAggregateElement(i);
+ if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
+ ConstVecOps.push_back(UndefValue::get(Elt->getType()));
+ Simplified = true;
+ continue;
+ }
+ ConstVecOps.push_back(Elt);
+ }
+ if (!Simplified)
+ return false;
+
+ // Generate new constant pool entry + legalize immediately for the load.
+ SDLoc DL(Op);
+ SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
+ SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
+ SDValue NewMask = TLO.DAG.getLoad(
+ BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
+ MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
+ Load->getAlign());
+ return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
+}
+
+bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
+ SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
+ TargetLoweringOpt &TLO, unsigned Depth) const {
+ int NumElts = DemandedElts.getBitWidth();
+ unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
+
+ // Handle special case opcodes.
+ switch (Opc) {
+ case X86ISD::PMULDQ:
+ case X86ISD::PMULUDQ: {
+ APInt LHSUndef, LHSZero;
+ APInt RHSUndef, RHSZero;
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
+ Depth + 1))
+ return true;
+ if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
+ Depth + 1))
+ return true;
+ // Multiply by zero.
+ KnownZero = LHSZero | RHSZero;
+ break;
+ }
+ case X86ISD::VSHL:
+ case X86ISD::VSRL:
+ case X86ISD::VSRA: {
+ // We only need the bottom 64-bits of the (128-bit) shift amount.
+ SDValue Amt = Op.getOperand(1);
+ MVT AmtVT = Amt.getSimpleValueType();
+ assert(AmtVT.is128BitVector() && "Unexpected value type");
+
+ // If the shift amount is only ever used as an SSE shift amount then we know
+ // that only the bottom 64-bits are ever used.
+ bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
+ unsigned UseOpc = Use->getOpcode();
+ return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
+ UseOpc == X86ISD::VSRA) &&
+ Use->getOperand(0) != Amt;
+ });
+
+ APInt AmtUndef, AmtZero;
+ unsigned NumAmtElts = AmtVT.getVectorNumElements();
+ APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
+ if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
+ Depth + 1, AssumeSingleUse))
+ return true;
+ LLVM_FALLTHROUGH;
+ }
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI:
+ case X86ISD::VSRAI: {
+ SDValue Src = Op.getOperand(0);
+ APInt SrcUndef;
+ if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
+ Depth + 1))
+ return true;
+
+ // Aggressively peek through ops to get at the demanded elts.
+ if (!DemandedElts.isAllOnesValue())
+ if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
+ Src, DemandedElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
+ break;
+ }
+ case X86ISD::KSHIFTL: {
+ SDValue Src = Op.getOperand(0);
+ auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
+ assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
+ unsigned ShiftAmt = Amt->getZExtValue();
+
+ if (ShiftAmt == 0)
+ return TLO.CombineTo(Op, Src);
+
+ // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
+ // single shift. We can do this if the bottom bits (which are shifted
+ // out) are never demanded.
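+ // e.g. kshiftl(kshiftr(X, 3), 5) becomes kshiftl(X, 2) when the low 5
+ // mask elements are not demanded.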
+ if (Src.getOpcode() == X86ISD::KSHIFTR) {
+ if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
+ unsigned C1 = Src.getConstantOperandVal(1);
+ unsigned NewOpc = X86ISD::KSHIFTL;
+ int Diff = ShiftAmt - C1;
+ if (Diff < 0) {
+ Diff = -Diff;
+ NewOpc = X86ISD::KSHIFTR;
+ }
+
+ SDLoc dl(Op);
+ SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
+ }
+ }
+
+ APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
+ if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
+ Depth + 1))
+ return true;
+
+ KnownUndef <<= ShiftAmt;
+ KnownZero <<= ShiftAmt;
+ KnownZero.setLowBits(ShiftAmt);
+ break;
+ }
+ case X86ISD::KSHIFTR: {
+ SDValue Src = Op.getOperand(0);
+ auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
+ assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
+ unsigned ShiftAmt = Amt->getZExtValue();
+
+ if (ShiftAmt == 0)
+ return TLO.CombineTo(Op, Src);
+
+ // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
+ // single shift. We can do this if the top bits (which are shifted
+ // out) are never demanded.
+ if (Src.getOpcode() == X86ISD::KSHIFTL) {
+ if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
+ unsigned C1 = Src.getConstantOperandVal(1);
+ unsigned NewOpc = X86ISD::KSHIFTR;
+ int Diff = ShiftAmt - C1;
+ if (Diff < 0) {
+ Diff = -Diff;
+ NewOpc = X86ISD::KSHIFTL;
+ }
+
+ SDLoc dl(Op);
+ SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
+ }
+ }
+
+ APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
+ if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
+ Depth + 1))
+ return true;
+
+ KnownUndef.lshrInPlace(ShiftAmt);
+ KnownZero.lshrInPlace(ShiftAmt);
+ KnownZero.setHighBits(ShiftAmt);
+ break;
+ }
+ case X86ISD::CVTSI2P:
+ case X86ISD::CVTUI2P: {
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+ APInt SrcUndef, SrcZero;
+ APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
+ if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
+ Depth + 1))
+ return true;
+ break;
+ }
+ case X86ISD::PACKSS:
+ case X86ISD::PACKUS: {
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+
+ APInt DemandedLHS, DemandedRHS;
+ getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
+
+ APInt SrcUndef, SrcZero;
+ if (SimplifyDemandedVectorElts(N0, DemandedLHS, SrcUndef, SrcZero, TLO,
+ Depth + 1))
+ return true;
+ if (SimplifyDemandedVectorElts(N1, DemandedRHS, SrcUndef, SrcZero, TLO,
+ Depth + 1))
+ return true;
+
+ // Aggressively peek through ops to get at the demanded elts.
+ // TODO - we should do this for all target/faux shuffles ops.
+ if (!DemandedElts.isAllOnesValue()) {
+ SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
+ TLO.DAG, Depth + 1);
+ SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
+ TLO.DAG, Depth + 1);
+ if (NewN0 || NewN1) {
+ NewN0 = NewN0 ? NewN0 : N0;
+ NewN1 = NewN1 ? NewN1 : N1;
+ return TLO.CombineTo(Op,
+ TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
+ }
+ }
+ break;
+ }
+ case X86ISD::HADD:
+ case X86ISD::HSUB:
+ case X86ISD::FHADD:
+ case X86ISD::FHSUB: {
+ APInt DemandedLHS, DemandedRHS;
+ getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
+
+ APInt LHSUndef, LHSZero;
+ if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, LHSUndef,
+ LHSZero, TLO, Depth + 1))
+ return true;
+ APInt RHSUndef, RHSZero;
+ if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, RHSUndef,
+ RHSZero, TLO, Depth + 1))
+ return true;
+ break;
+ }
+ case X86ISD::VTRUNC:
+ case X86ISD::VTRUNCS:
+ case X86ISD::VTRUNCUS: {
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+ APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
+ APInt SrcUndef, SrcZero;
+ if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
+ Depth + 1))
+ return true;
+ KnownZero = SrcZero.zextOrTrunc(NumElts);
+ KnownUndef = SrcUndef.zextOrTrunc(NumElts);
+ break;
+ }
+ case X86ISD::BLENDV: {
+ APInt SelUndef, SelZero;
+ if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
+ SelZero, TLO, Depth + 1))
+ return true;
+
+ // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
+ APInt LHSUndef, LHSZero;
+ if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
+ LHSZero, TLO, Depth + 1))
+ return true;
+
+ APInt RHSUndef, RHSZero;
+ if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
+ RHSZero, TLO, Depth + 1))
+ return true;
+
+ KnownZero = LHSZero & RHSZero;
+ KnownUndef = LHSUndef & RHSUndef;
+ break;
+ }
+ case X86ISD::VZEXT_MOVL: {
+ // If upper demanded elements are already zero then we have nothing to do.
+ SDValue Src = Op.getOperand(0);
+ APInt DemandedUpperElts = DemandedElts;
+ DemandedUpperElts.clearLowBits(1);
+ if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero())
+ return TLO.CombineTo(Op, Src);
+ break;
+ }
+ case X86ISD::VBROADCAST: {
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+ if (!SrcVT.isVector())
+ break;
+ // Don't bother broadcasting if we just need the 0'th element.
+ if (DemandedElts == 1) {
+ if (Src.getValueType() != VT)
+ Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
+ SDLoc(Op));
+ return TLO.CombineTo(Op, Src);
+ }
+ APInt SrcUndef, SrcZero;
+ APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
+ if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
+ Depth + 1))
+ return true;
+ // Aggressively peek through src to get at the demanded elt.
+ // TODO - we should do this for all target/faux shuffles ops.
+ if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
+ Src, SrcElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
+ break;
+ }
+ case X86ISD::VPERMV:
+ if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
+ Depth))
+ return true;
+ break;
+ case X86ISD::PSHUFB:
+ case X86ISD::VPERMV3:
+ case X86ISD::VPERMILPV:
+ if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
+ Depth))
+ return true;
+ break;
+ case X86ISD::VPPERM:
+ case X86ISD::VPERMIL2:
+ if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
+ Depth))
+ return true;
+ break;
+ }
+
+ // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
+ // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
+ // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
+ if ((VT.is256BitVector() || VT.is512BitVector()) &&
+ DemandedElts.lshr(NumElts / 2) == 0) {
+ unsigned SizeInBits = VT.getSizeInBits();
+ unsigned ExtSizeInBits = SizeInBits / 2;
+
+ // See if 512-bit ops only use the bottom 128-bits.
+ if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
+ ExtSizeInBits = SizeInBits / 4;
+
+ switch (Opc) {
+ // Scalar broadcast.
+ case X86ISD::VBROADCAST: {
+ SDLoc DL(Op);
+ SDValue Src = Op.getOperand(0);
+ if (Src.getValueSizeInBits() > ExtSizeInBits)
+ Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
+ EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
+ ExtSizeInBits / VT.getScalarSizeInBits());
+ SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
+ return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
+ TLO.DAG, DL, ExtSizeInBits));
+ }
+ case X86ISD::VBROADCAST_LOAD: {
+ SDLoc DL(Op);
+ auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
+ EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
+ ExtSizeInBits / VT.getScalarSizeInBits());
+ SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
+ SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
+ SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
+ MemIntr->getMemOperand());
+ TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
+ Bcst.getValue(1));
+ return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
+ TLO.DAG, DL, ExtSizeInBits));
+ }
+ // Subvector broadcast.
+ case X86ISD::SUBV_BROADCAST_LOAD: {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
+ EVT MemVT = MemIntr->getMemoryVT();
+ if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
+ SDLoc DL(Op);
+ SDValue Ld =
+ TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
+ MemIntr->getBasePtr(), MemIntr->getMemOperand());
+ TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
+ Ld.getValue(1));
+ return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
+ TLO.DAG, DL, ExtSizeInBits));
+ } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
+ SDLoc DL(Op);
+ EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
+ ExtSizeInBits / VT.getScalarSizeInBits());
+ SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
+ SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
+ SDValue Bcst =
+ TLO.DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys,
+ Ops, MemVT, MemIntr->getMemOperand());
+ TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
+ Bcst.getValue(1));
+ return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
+ TLO.DAG, DL, ExtSizeInBits));
+ }
+ break;
+ }
+ // Byte shifts by immediate.
+ case X86ISD::VSHLDQ:
+ case X86ISD::VSRLDQ:
+ // Shift by uniform.
+ case X86ISD::VSHL:
+ case X86ISD::VSRL:
+ case X86ISD::VSRA:
+ // Shift by immediate.
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI:
+ case X86ISD::VSRAI: {
+ SDLoc DL(Op);
+ SDValue Ext0 =
+ extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
+ SDValue ExtOp =
+ TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
+ SDValue UndefVec = TLO.DAG.getUNDEF(VT);
+ SDValue Insert =
+ insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
+ return TLO.CombineTo(Op, Insert);
+ }
+ case X86ISD::VPERMI: {
+ // Simplify PERMPD/PERMQ to extract_subvector.
+ // TODO: This should be done in shuffle combining.
+ if (VT == MVT::v4f64 || VT == MVT::v4i64) {
+ SmallVector<int, 4> Mask;
+ DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
+ if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
+ SDLoc DL(Op);
+ SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
+ SDValue UndefVec = TLO.DAG.getUNDEF(VT);
+ SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
+ return TLO.CombineTo(Op, Insert);
+ }
+ }
+ break;
+ }
+ // Zero upper elements.
+ case X86ISD::VZEXT_MOVL:
+ // Target unary shuffles by immediate:
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ case X86ISD::VPERMILPI:
+ // (Non-Lane Crossing) Target Shuffles.
+ case X86ISD::VPERMILPV:
+ case X86ISD::VPERMIL2:
+ case X86ISD::PSHUFB:
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+ case X86ISD::BLENDI:
+ // Integer ops.
+ case X86ISD::AVG:
+ case X86ISD::PACKSS:
+ case X86ISD::PACKUS:
+ // Horizontal Ops.
+ case X86ISD::HADD:
+ case X86ISD::HSUB:
+ case X86ISD::FHADD:
+ case X86ISD::FHSUB: {
+ SDLoc DL(Op);
+ SmallVector<SDValue, 4> Ops;
+ for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
+ SDValue SrcOp = Op.getOperand(i);
+ EVT SrcVT = SrcOp.getValueType();
+ assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
+ "Unsupported vector size");
+ Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
+ ExtSizeInBits)
+ : SrcOp);
+ }
+ MVT ExtVT = VT.getSimpleVT();
+ ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
+ ExtSizeInBits / ExtVT.getScalarSizeInBits());
+ SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
+ SDValue UndefVec = TLO.DAG.getUNDEF(VT);
+ SDValue Insert =
+ insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
+ return TLO.CombineTo(Op, Insert);
+ }
+ }
+ }
+
+ // Get target/faux shuffle mask.
+ APInt OpUndef, OpZero;
+ SmallVector<int, 64> OpMask;
+ SmallVector<SDValue, 2> OpInputs;
+ if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
+ OpZero, TLO.DAG, Depth, false))
+ return false;
+
+ // Shuffle inputs must be the same size as the result.
+ if (OpMask.size() != (unsigned)NumElts ||
+ llvm::any_of(OpInputs, [VT](SDValue V) {
+ return VT.getSizeInBits() != V.getValueSizeInBits() ||
+ !V.getValueType().isVector();
+ }))
+ return false;
+
+ KnownZero = OpZero;
+ KnownUndef = OpUndef;
+
+ // Check if shuffle mask can be simplified to undef/zero/identity.
+ int NumSrcs = OpInputs.size();
+ for (int i = 0; i != NumElts; ++i)
+ if (!DemandedElts[i])
+ OpMask[i] = SM_SentinelUndef;
+
+ if (isUndefInRange(OpMask, 0, NumElts)) {
+ KnownUndef.setAllBits();
+ return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
+ }
+ if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
+ KnownZero.setAllBits();
+ return TLO.CombineTo(
+ Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
+ }
+ for (int Src = 0; Src != NumSrcs; ++Src)
+ if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
+ return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
+
+ // Attempt to simplify inputs.
+ for (int Src = 0; Src != NumSrcs; ++Src) {
+ // TODO: Support inputs of different types.
+ if (OpInputs[Src].getValueType() != VT)
+ continue;
+
+ int Lo = Src * NumElts;
+ APInt SrcElts = APInt::getNullValue(NumElts);
+ for (int i = 0; i != NumElts; ++i)
+ if (DemandedElts[i]) {
+ int M = OpMask[i] - Lo;
+ if (0 <= M && M < NumElts)
+ SrcElts.setBit(M);
+ }
+
+ // TODO - Propagate input undef/zero elts.
+ APInt SrcUndef, SrcZero;
+ if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
+ TLO, Depth + 1))
+ return true;
+ }
+
+ // If we don't demand all elements, then attempt to combine to a simpler
+ // shuffle.
+ // We need to convert the depth to something combineX86ShufflesRecursively
+ // can handle - so pretend its Depth == 0 again, and reduce the max depth
+ // to match. This prevents combineX86ShuffleChain from returning a
+ // combined shuffle that's the same as the original root, causing an
+ // infinite loop.
+ if (!DemandedElts.isAllOnesValue()) {
+ assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
+
+ SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
+ for (int i = 0; i != NumElts; ++i)
+ if (DemandedElts[i])
+ DemandedMask[i] = i;
+
+ SDValue NewShuffle = combineX86ShufflesRecursively(
+ {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
+ /*HasVarMask*/ false,
+ /*AllowVarMask*/ true, TLO.DAG, Subtarget);
+ if (NewShuffle)
+ return TLO.CombineTo(Op, NewShuffle);
+ }
+
+ return false;
+}
+
+bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
+ SDValue Op, const APInt &OriginalDemandedBits,
+ const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
+ unsigned Depth) const {
+ EVT VT = Op.getValueType();
+ unsigned BitWidth = OriginalDemandedBits.getBitWidth();
+ unsigned Opc = Op.getOpcode();
+ switch (Opc) {
+ case X86ISD::VTRUNC: {
+ KnownBits KnownOp;
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+
+ // Simplify the input, using demanded bit information.
+ APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
+ APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
+ if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
+ return true;
+ break;
+ }
+ case X86ISD::PMULDQ:
+ case X86ISD::PMULUDQ: {
+ // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
+ KnownBits KnownOp;
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ // FIXME: Can we bound this better?
+ APInt DemandedMask = APInt::getLowBitsSet(64, 32);
+ if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
+ TLO, Depth + 1))
+ return true;
+ if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
+ TLO, Depth + 1))
+ return true;
+
+ // Aggressively peek through ops to get at the demanded low bits.
+ SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
+ LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
+ SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
+ RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
+ if (DemandedLHS || DemandedRHS) {
+ DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
+ DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
+ }
+ break;
+ }
+ case X86ISD::VSHLI: {
+ SDValue Op0 = Op.getOperand(0);
+
+ unsigned ShAmt = Op.getConstantOperandVal(1);
+ if (ShAmt >= BitWidth)
+ break;
+
+ APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
+
+ // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
+ // single shift. We can do this if the bottom bits (which are shifted
+ // out) are never demanded.
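+ // For example, with ShAmt == 6 and C1 == 4, ((X >>u 4) << 6) can become
+ // (X << 2): the two forms only differ in the low 6 bits, which are not
+ // demanded here.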
+ if (Op0.getOpcode() == X86ISD::VSRLI &&
+ OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
+ unsigned Shift2Amt = Op0.getConstantOperandVal(1);
+ if (Shift2Amt < BitWidth) {
+ int Diff = ShAmt - Shift2Amt;
+ if (Diff == 0)
+ return TLO.CombineTo(Op, Op0.getOperand(0));
+
+ unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
+ SDValue NewShift = TLO.DAG.getNode(
+ NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
+ TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
+ return TLO.CombineTo(Op, NewShift);
+ }
+ }
+
+ // If we are only demanding sign bits then we can use the shift source directly.
+ unsigned NumSignBits =
+ TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
+ unsigned UpperDemandedBits =
+ BitWidth - OriginalDemandedBits.countTrailingZeros();
+ if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
+ return TLO.CombineTo(Op, Op0);
+
+ if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
+ TLO, Depth + 1))
+ return true;
+
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero <<= ShAmt;
+ Known.One <<= ShAmt;
+
+ // Low bits known zero.
+ Known.Zero.setLowBits(ShAmt);
+ return false;
+ }
+ case X86ISD::VSRLI: {
+ unsigned ShAmt = Op.getConstantOperandVal(1);
+ if (ShAmt >= BitWidth)
+ break;
+
+ APInt DemandedMask = OriginalDemandedBits << ShAmt;
+
+ if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
+ OriginalDemandedElts, Known, TLO, Depth + 1))
+ return true;
+
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero.lshrInPlace(ShAmt);
+ Known.One.lshrInPlace(ShAmt);
+
+ // High bits known zero.
+ Known.Zero.setHighBits(ShAmt);
+ return false;
+ }
+ case X86ISD::VSRAI: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
+ if (ShAmt >= BitWidth)
+ break;
+
+ APInt DemandedMask = OriginalDemandedBits << ShAmt;
+
+ // If we just want the sign bit then we don't need to shift it.
+ if (OriginalDemandedBits.isSignMask())
+ return TLO.CombineTo(Op, Op0);
+
+ // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
+ if (Op0.getOpcode() == X86ISD::VSHLI &&
+ Op.getOperand(1) == Op0.getOperand(1)) {
+ SDValue Op00 = Op0.getOperand(0);
+ unsigned NumSignBits =
+ TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
+ if (ShAmt < NumSignBits)
+ return TLO.CombineTo(Op, Op00);
+ }
+
+ // If any of the demanded bits are produced by the sign extension, we also
+ // demand the input sign bit.
+ if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
+ DemandedMask.setSignBit();
+
+ if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
+ TLO, Depth + 1))
+ return true;
+
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero.lshrInPlace(ShAmt);
+ Known.One.lshrInPlace(ShAmt);
+
+ // If the input sign bit is known to be zero, or if none of the top bits
+ // are demanded, turn this into an unsigned shift right.
+ if (Known.Zero[BitWidth - ShAmt - 1] ||
+ OriginalDemandedBits.countLeadingZeros() >= ShAmt)
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
+
+ // High bits are known one.
+ if (Known.One[BitWidth - ShAmt - 1])
+ Known.One.setHighBits(ShAmt);
+ return false;
+ }
+ case X86ISD::PEXTRB:
+ case X86ISD::PEXTRW: {
+ SDValue Vec = Op.getOperand(0);
+ auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ MVT VecVT = Vec.getSimpleValueType();
+ unsigned NumVecElts = VecVT.getVectorNumElements();
+
+ if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
+ unsigned Idx = CIdx->getZExtValue();
+ unsigned VecBitWidth = VecVT.getScalarSizeInBits();
+
+ // If we demand no bits from the vector then we must have demanded
+ // bits from the implicit zext - simplify to zero.
+ APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
+ if (DemandedVecBits == 0)
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
+
+ APInt KnownUndef, KnownZero;
+ APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
+ if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
+ KnownZero, TLO, Depth + 1))
+ return true;
+
+ KnownBits KnownVec;
+ if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
+ KnownVec, TLO, Depth + 1))
+ return true;
+
+ if (SDValue V = SimplifyMultipleUseDemandedBits(
+ Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
+
+ Known = KnownVec.zext(BitWidth);
+ return false;
+ }
+ break;
+ }
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW: {
+ SDValue Vec = Op.getOperand(0);
+ SDValue Scl = Op.getOperand(1);
+ auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ MVT VecVT = Vec.getSimpleValueType();
+
+ if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
+ unsigned Idx = CIdx->getZExtValue();
+ if (!OriginalDemandedElts[Idx])
+ return TLO.CombineTo(Op, Vec);
+
+ KnownBits KnownVec;
+ APInt DemandedVecElts(OriginalDemandedElts);
+ DemandedVecElts.clearBit(Idx);
+ if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
+ KnownVec, TLO, Depth + 1))
+ return true;
+
+ KnownBits KnownScl;
+ unsigned NumSclBits = Scl.getScalarValueSizeInBits();
+ APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
+ if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
+ return true;
+
+ KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
+ Known = KnownBits::commonBits(KnownVec, KnownScl);
+ return false;
+ }
+ break;
+ }
+ case X86ISD::PACKSS:
+ // PACKSS saturates to MIN/MAX integer values, so if we just want the
+ // sign bit then we can simply ask for the source operands' sign bits.
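+ // (Signed saturation preserves the sign, so the sign bit of each packed
+ // element equals the sign bit of the corresponding wider source element.)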
+ // TODO - add known bits handling.
+ if (OriginalDemandedBits.isSignMask()) {
+ APInt DemandedLHS, DemandedRHS;
+ getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
+
+ KnownBits KnownLHS, KnownRHS;
+ APInt SignMask = APInt::getSignMask(BitWidth * 2);
+ if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
+ KnownLHS, TLO, Depth + 1))
+ return true;
+ if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
+ KnownRHS, TLO, Depth + 1))
+ return true;
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
+ Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
+ SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
+ Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
+ if (DemandedOp0 || DemandedOp1) {
+ SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
+ SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
+ }
+ }
+ // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
+ break;
+ case X86ISD::PCMPGT:
+ // icmp sgt(0, R) == ashr(R, BitWidth-1).
+ // iff we only need the sign bit then we can use R directly.
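+ // (0 > R is true exactly when R is negative, i.e. when R's sign bit is
+ // set, so the sign bit of the compare result matches R's sign bit.)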
+ if (OriginalDemandedBits.isSignMask() &&
+ ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
+ return TLO.CombineTo(Op, Op.getOperand(1));
+ break;
+ case X86ISD::MOVMSK: {
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+ unsigned SrcBits = SrcVT.getScalarSizeInBits();
+ unsigned NumElts = SrcVT.getVectorNumElements();
+
+ // If we don't need the sign bits at all just return zero.
+ if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
+
+ // Only demand the vector elements of the sign bits we need.
+ APInt KnownUndef, KnownZero;
+ APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
+ if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
+ TLO, Depth + 1))
+ return true;
+
+ Known.Zero = KnownZero.zextOrSelf(BitWidth);
+ Known.Zero.setHighBits(BitWidth - NumElts);
+
+ // MOVMSK only uses the MSB from each vector element.
+ KnownBits KnownSrc;
+ APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
+ if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
+ Depth + 1))
+ return true;
+
+ if (KnownSrc.One[SrcBits - 1])
+ Known.One.setLowBits(NumElts);
+ else if (KnownSrc.Zero[SrcBits - 1])
+ Known.Zero.setLowBits(NumElts);
+
+ // Attempt to avoid a multi-use op if we don't need anything from it.
+ if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
+ Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
+ return false;
+ }
+ case X86ISD::BEXTR:
+ case X86ISD::BEXTRI: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ // Only bottom 16-bits of the control bits are required.
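+ // (The control value encodes the start bit index in bits [7:0] and the
+ // extract length in bits [15:8]; any higher control bits are ignored.)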
+ if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
+ // NOTE: SimplifyDemandedBits won't do this for constants.
+ uint64_t Val1 = Cst1->getZExtValue();
+ uint64_t MaskedVal1 = Val1 & 0xFFFF;
+ if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
+ SDLoc DL(Op);
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
+ TLO.DAG.getConstant(MaskedVal1, DL, VT)));
+ }
+
+ unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
+ unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
+
+ // If the length is 0, the result is 0.
+ if (Length == 0) {
+ Known.setAllZero();
+ return false;
+ }
+
+ if ((Shift + Length) <= BitWidth) {
+ APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
+ if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
+ return true;
+
+ Known = Known.extractBits(Length, Shift);
+ Known = Known.zextOrTrunc(BitWidth);
+ return false;
+ }
+ } else {
+ assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
+ KnownBits Known1;
+ APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
+ if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
+ return true;
+
+ // If the length is 0, replace with 0.
+ KnownBits LengthBits = Known1.extractBits(8, 8);
+ if (LengthBits.isZero())
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
+ }
+
+ break;
+ }
+ case X86ISD::PDEP: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ unsigned DemandedBitsLZ = OriginalDemandedBits.countLeadingZeros();
+ APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
+
+ // If the demanded bits have leading zeroes, we don't demand those from the
+ // mask.
+ if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
+ return true;
+
+ // The number of possible 1s in the mask determines the number of LSBs of
+ // operand 0 used. Undemanded bits from the mask don't matter so filter
+ // them before counting.
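+ // For example, with a mask of 0b0101 only the low two bits of operand 0
+ // are deposited, so only popcount(mask) low bits are demanded from it.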
+ KnownBits Known2;
+ uint64_t Count = (~Known.Zero & LoMask).countPopulation();
+ APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
+ if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
+ return true;
+
+ // Zeroes are retained from the mask, but not ones.
+ Known.One.clearAllBits();
+ // The result will have at least as many trailing zeros as the non-mask
+ // operand since bits can only map to the same or higher bit position.
+ Known.Zero.setLowBits(Known2.countMinTrailingZeros());
+ return false;
+ }
+ }
+
+ return TargetLowering::SimplifyDemandedBitsForTargetNode(
+ Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
+}
+
+SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
+ SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ SelectionDAG &DAG, unsigned Depth) const {
+ int NumElts = DemandedElts.getBitWidth();
+ unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
+
+ switch (Opc) {
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW: {
+ // If we don't demand the inserted element, return the base vector.
+ SDValue Vec = Op.getOperand(0);
+ auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ MVT VecVT = Vec.getSimpleValueType();
+ if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
+ !DemandedElts[CIdx->getZExtValue()])
+ return Vec;
+ break;
+ }
+ case X86ISD::VSHLI: {
+ // If we are only demanding sign bits then we can use the shift source
+ // directly.
+ SDValue Op0 = Op.getOperand(0);
+ unsigned ShAmt = Op.getConstantOperandVal(1);
+ unsigned BitWidth = DemandedBits.getBitWidth();
+ unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
+ unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
+ if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
+ return Op0;
+ break;
+ }
+ case X86ISD::VSRAI:
+ // iff we only need the sign bit then we can use the source directly.
+ // TODO: generalize where we only demand extended signbits.
+ if (DemandedBits.isSignMask())
+ return Op.getOperand(0);
+ break;
+ case X86ISD::PCMPGT:
+ // icmp sgt(0, R) == ashr(R, BitWidth-1).
+ // iff we only need the sign bit then we can use R directly.
+ if (DemandedBits.isSignMask() &&
+ ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
+ return Op.getOperand(1);
+ break;
+ }
+
+ APInt ShuffleUndef, ShuffleZero;
+ SmallVector<int, 16> ShuffleMask;
+ SmallVector<SDValue, 2> ShuffleOps;
+ if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
+ ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
+ // If all the demanded elts are from one operand and are inline,
+ // then we can use the operand directly.
+ int NumOps = ShuffleOps.size();
+ if (ShuffleMask.size() == (unsigned)NumElts &&
+ llvm::all_of(ShuffleOps, [VT](SDValue V) {
+ return VT.getSizeInBits() == V.getValueSizeInBits();
+ })) {
+
+ if (DemandedElts.isSubsetOf(ShuffleUndef))
+ return DAG.getUNDEF(VT);
+ if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
+ return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
+
+ // Bitmask that indicates which ops have only been accessed 'inline'.
+ APInt IdentityOp = APInt::getAllOnesValue(NumOps);
+ for (int i = 0; i != NumElts; ++i) {
+ int M = ShuffleMask[i];
+ if (!DemandedElts[i] || ShuffleUndef[i])
+ continue;
+ int OpIdx = M / NumElts;
+ int EltIdx = M % NumElts;
+ if (M < 0 || EltIdx != i) {
+ IdentityOp.clearAllBits();
+ break;
+ }
+ IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
+ if (IdentityOp == 0)
+ break;
+ }
+ assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
+ "Multiple identity shuffles detected");
+
+ if (IdentityOp != 0)
+ return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
+ }
+ }
+
+ return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
+ Op, DemandedBits, DemandedElts, DAG, Depth);
+}
+
+// Helper to peek through bitops/setcc to determine size of source vector.
+// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
+static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
+ switch (Src.getOpcode()) {
+ case ISD::SETCC:
+ return Src.getOperand(0).getValueSizeInBits() == Size;
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR:
+ return checkBitcastSrcVectorSize(Src.getOperand(0), Size) &&
+ checkBitcastSrcVectorSize(Src.getOperand(1), Size);
+ }
+ return false;
+}
+
+// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
+static unsigned getAltBitOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ case ISD::AND: return X86ISD::FAND;
+ case ISD::OR: return X86ISD::FOR;
+ case ISD::XOR: return X86ISD::FXOR;
+ case X86ISD::ANDNP: return X86ISD::FANDN;
+ }
+ llvm_unreachable("Unknown bitwise opcode");
+}
+
+// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
+static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
+ const SDLoc &DL) {
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT != MVT::v4i1)
+ return SDValue();
+
+ switch (Src.getOpcode()) {
+ case ISD::SETCC:
+ if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
+ ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
+ cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
+ SDValue Op0 = Src.getOperand(0);
+ if (ISD::isNormalLoad(Op0.getNode()))
+ return DAG.getBitcast(MVT::v4f32, Op0);
+ if (Op0.getOpcode() == ISD::BITCAST &&
+ Op0.getOperand(0).getValueType() == MVT::v4f32)
+ return Op0.getOperand(0);
+ }
+ break;
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR: {
+ SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
+ SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
+ if (Op0 && Op1)
+ return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
+ Op1);
+ break;
+ }
+ }
+ return SDValue();
+}
+
+// Helper to push sign extension of vXi1 SETCC result through bitops.
+static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
+ SDValue Src, const SDLoc &DL) {
+ switch (Src.getOpcode()) {
+ case ISD::SETCC:
+ return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR:
+ return DAG.getNode(
+ Src.getOpcode(), DL, SExtVT,
+ signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
+ signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
+ }
+ llvm_unreachable("Unexpected node type for vXi1 sign extension");
+}
+
+// Try to match patterns such as
+// (i16 bitcast (v16i1 x))
+// ->
+// (i16 movmsk (16i8 sext (v16i1 x)))
+// before the illegal vector is scalarized on subtargets that don't have legal
+// vxi1 types.
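+ // (MOVMSK collects the sign bit of each vector element into a scalar, which
+ // is why the vXi1 value is sign-extended to a wider element type first.)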
+static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
+ const SDLoc &DL,
+ const X86Subtarget &Subtarget) {
+ EVT SrcVT = Src.getValueType();
+ if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
+ return SDValue();
+
+ // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
+ // legalization destroys the v4i32 type.
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
+ if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
+ V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
+ DAG.getBitcast(MVT::v4f32, V));
+ return DAG.getZExtOrTrunc(V, DL, VT);
+ }
+ }
+
+ // If the input is a truncate from v16i8 or v32i8 go ahead and use a
+ // movmskb even with avx512. This will be better than truncating to vXi1 and
+ // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
+ // vpcmpeqb/vpcmpgtb.
+ bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
+ (Src.getOperand(0).getValueType() == MVT::v16i8 ||
+ Src.getOperand(0).getValueType() == MVT::v32i8 ||
+ Src.getOperand(0).getValueType() == MVT::v64i8);
+
+ // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
+ // directly with vpmovmskb/vmovmskps/vmovmskpd.
+ if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
+ cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
+ ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
+ EVT CmpVT = Src.getOperand(0).getValueType();
+ EVT EltVT = CmpVT.getVectorElementType();
+ if (CmpVT.getSizeInBits() <= 256 &&
+ (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
+ PreferMovMsk = true;
+ }
+
+ // With AVX512 vxi1 types are legal and we prefer using k-regs.
+ // MOVMSK is supported in SSE2 or later.
+ if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
+ return SDValue();
+
+ // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
+ // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
+ // v8i16 and v16i16.
+ // For these two cases, we can shuffle the upper element bytes to a
+ // consecutive sequence at the start of the vector and treat the results as
+ // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
+ // for v16i16 this is not the case, because the shuffle is expensive, so we
+ // avoid sign-extending to this type entirely.
+ // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
+ // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
+ MVT SExtVT;
+ bool PropagateSExt = false;
+ switch (SrcVT.getSimpleVT().SimpleTy) {
+ default:
+ return SDValue();
+ case MVT::v2i1:
+ SExtVT = MVT::v2i64;
+ break;
+ case MVT::v4i1:
+ SExtVT = MVT::v4i32;
+ // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
+ // sign-extend to a 256-bit operation to avoid truncation.
+ if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) {
+ SExtVT = MVT::v4i64;
+ PropagateSExt = true;
+ }
+ break;
+ case MVT::v8i1:
+ SExtVT = MVT::v8i16;
+ // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
+ // sign-extend to a 256-bit operation to match the compare.
+ // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
+ // 256-bit because the shuffle is cheaper than sign extending the result of
+ // the compare.
+ if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256) ||
+ checkBitcastSrcVectorSize(Src, 512))) {
+ SExtVT = MVT::v8i32;
+ PropagateSExt = true;
+ }
+ break;
+ case MVT::v16i1:
+ SExtVT = MVT::v16i8;
+ // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
+ // it is not profitable to sign-extend to 256-bit because this will
+ // require an extra cross-lane shuffle which is more expensive than
+ // truncating the result of the compare to 128-bits.
+ break;
+ case MVT::v32i1:
+ SExtVT = MVT::v32i8;
+ break;
+ case MVT::v64i1:
+ // If we have AVX512F but not AVX512BW, and the input is a truncate from
+ // v64i8 (checked earlier), then split the input and make two pmovmskbs.
+ if (Subtarget.hasAVX512()) {
+ if (Subtarget.hasBWI())
+ return SDValue();
+ SExtVT = MVT::v64i8;
+ break;
+ }
+ // Split if this is a <64 x i8> comparison result.
+ if (checkBitcastSrcVectorSize(Src, 512)) {
+ SExtVT = MVT::v64i8;
+ break;
+ }
+ return SDValue();
+ }
+
+ SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
+ : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
+
+ if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
+ V = getPMOVMSKB(DL, V, DAG, Subtarget);
+ } else {
+ if (SExtVT == MVT::v8i16)
+ V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
+ DAG.getUNDEF(MVT::v8i16));
+ V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
+ }
+
+ EVT IntVT =
+ EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
+ V = DAG.getZExtOrTrunc(V, DL, IntVT);
+ return DAG.getBitcast(VT, V);
+}
+
+// Convert a vXi1 constant build vector to the same width scalar integer.
+static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
+ EVT SrcVT = Op.getValueType();
+ assert(SrcVT.getVectorElementType() == MVT::i1 &&
+ "Expected a vXi1 vector");
+ assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
+ "Expected a constant build vector");
+
+ APInt Imm(SrcVT.getVectorNumElements(), 0);
+ for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
+ SDValue In = Op.getOperand(Idx);
+ if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
+ Imm.setBit(Idx);
+ }
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
+ return DAG.getConstant(Imm, SDLoc(Op), IntVT);
+}
+
+static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
+
+ if (!DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ // Only do this if we have k-registers.
+ if (!Subtarget.hasAVX512())
+ return SDValue();
+
+ EVT DstVT = N->getValueType(0);
+ SDValue Op = N->getOperand(0);
+ EVT SrcVT = Op.getValueType();
+
+ if (!Op.hasOneUse())
+ return SDValue();
+
+ // Look for logic ops.
+ if (Op.getOpcode() != ISD::AND &&
+ Op.getOpcode() != ISD::OR &&
+ Op.getOpcode() != ISD::XOR)
+ return SDValue();
+
+ // Make sure we have a bitcast between mask registers and a scalar type.
+ if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
+ DstVT.isScalarInteger()) &&
+ !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
+ SrcVT.isScalarInteger()))
+ return SDValue();
+
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+
+ if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
+ LHS.getOperand(0).getValueType() == DstVT)
+ return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
+ DAG.getBitcast(DstVT, RHS));
+
+ if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
+ RHS.getOperand(0).getValueType() == DstVT)
+ return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
+ DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
+
+ // If the RHS is a vXi1 build vector, this is a good reason to flip too.
+ // Most of these have to move a constant from the scalar domain anyway.
+ if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
+ RHS = combinevXi1ConstantToInteger(RHS, DAG);
+ return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
+ DAG.getBitcast(DstVT, LHS), RHS);
+ }
+
+ return SDValue();
+}
+
+static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDLoc DL(BV);
+ unsigned NumElts = BV->getNumOperands();
+ SDValue Splat = BV->getSplatValue();
+
+ // Build MMX element from integer GPR or SSE float values.
+ auto CreateMMXElement = [&](SDValue V) {
+ if (V.isUndef())
+ return DAG.getUNDEF(MVT::x86mmx);
+ if (V.getValueType().isFloatingPoint()) {
+ if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
+ V = DAG.getBitcast(MVT::v2i64, V);
+ return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
+ }
+ V = DAG.getBitcast(MVT::i32, V);
+ } else {
+ V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
+ }
+ return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
+ };
+
+ // Convert build vector ops to MMX data in the bottom elements.
+ SmallVector<SDValue, 8> Ops;
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
+ if (Splat) {
+ if (Splat.isUndef())
+ return DAG.getUNDEF(MVT::x86mmx);
+
+ Splat = CreateMMXElement(Splat);
+
+ if (Subtarget.hasSSE1()) {
+ // Unpack v8i8 to splat i8 elements to lowest 16-bits.
+ if (NumElts == 8)
+ Splat = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
+ DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
+ TLI.getPointerTy(DAG.getDataLayout())),
+ Splat, Splat);
+
+ // Use PSHUFW to repeat 16-bit elements.
+ unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
+ return DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
+ DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
+ TLI.getPointerTy(DAG.getDataLayout())),
+ Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
+ }
+ Ops.append(NumElts, Splat);
+ } else {
+ for (unsigned i = 0; i != NumElts; ++i)
+ Ops.push_back(CreateMMXElement(BV->getOperand(i)));
+ }
+
+ // Use tree of PUNPCKLs to build up general MMX vector.
+ while (Ops.size() > 1) {
+ unsigned NumOps = Ops.size();
+ unsigned IntrinOp =
+ (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
+ : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
+ : Intrinsic::x86_mmx_punpcklbw));
+ SDValue Intrin = DAG.getTargetConstant(
+ IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
+ for (unsigned i = 0; i != NumOps; i += 2)
+ Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
+ Ops[i], Ops[i + 1]);
+ Ops.resize(NumOps / 2);
+ }
+
+ return Ops[0];
+}
+
+// Recursive function that attempts to find if a bool vector node was originally
+// a vector/float/double that got truncated/extended/bitcast to/from a scalar
+// integer. If so, replace the scalar ops with bool vector equivalents back down
+// the chain.
+static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned Opc = V.getOpcode();
+ switch (Opc) {
+ case ISD::BITCAST: {
+ // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
+ SDValue Src = V.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT.isVector() || SrcVT.isFloatingPoint())
+ return DAG.getBitcast(VT, Src);
+ break;
+ }
+ case ISD::TRUNCATE: {
+ // If we find a suitable source, a truncated scalar becomes a subvector.
+ SDValue Src = V.getOperand(0);
+ EVT NewSrcVT =
+ EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
+ if (TLI.isTypeLegal(NewSrcVT))
+ if (SDValue N0 =
+ combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
+ DAG.getIntPtrConstant(0, DL));
+ break;
+ }
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND: {
+ // If we find a suitable source, an extended scalar becomes a subvector.
+ SDValue Src = V.getOperand(0);
+ EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ Src.getScalarValueSizeInBits());
+ if (TLI.isTypeLegal(NewSrcVT))
+ if (SDValue N0 =
+ combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
+ : DAG.getConstant(0, DL, VT),
+ N0, DAG.getIntPtrConstant(0, DL));
+ break;
+ }
+ case ISD::OR: {
+ // If we find suitable sources, we can just move an OR to the vector domain.
+ SDValue Src0 = V.getOperand(0);
+ SDValue Src1 = V.getOperand(1);
+ if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
+ if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
+ return DAG.getNode(Opc, DL, VT, N0, N1);
+ break;
+ }
+ case ISD::SHL: {
+ // If we find a suitable source, a SHL becomes a KSHIFTL.
+ SDValue Src0 = V.getOperand(0);
+ if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
+ ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
+ break;
+
+ if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
+ if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
+ return DAG.getNode(
+ X86ISD::KSHIFTL, DL, VT, N0,
+ DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
+ break;
+ }
+ }
+ return SDValue();
+}
+
+static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT SrcVT = N0.getValueType();
+
+ // Try to match patterns such as
+ // (i16 bitcast (v16i1 x))
+ // ->
+ // (i16 movmsk (16i8 sext (v16i1 x)))
+ // before the setcc result is scalarized on subtargets that don't have legal
+ // vxi1 types.
+ if (DCI.isBeforeLegalize()) {
+ SDLoc dl(N);
+ if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
+ return V;
+
+ // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
+ // type, widen both sides to avoid a trip through memory.
+ if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
+ Subtarget.hasAVX512()) {
+ N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
+ N0 = DAG.getBitcast(MVT::v8i1, N0);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
+ // type, widen both sides to avoid a trip through memory.
+ if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
+ Subtarget.hasAVX512()) {
+ // Use zeros for the widening if we already have some zeroes. This can
+ // allow SimplifyDemandedBits to remove scalar ANDs that may be down
+ // stream of this.
+ // FIXME: It might make sense to detect a concat_vectors with a mix of
+ // zeroes and undef and turn it into insert_subvector for i1 vectors as
+ // a separate combine. What we can't do is canonicalize the operands of
+ // such a concat or we'll get into a loop with SimplifyDemandedBits.
+ if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
+ SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
+ if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
+ SrcVT = LastOp.getValueType();
+ unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
+ SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
+ Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
+ N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
+ N0 = DAG.getBitcast(MVT::i8, N0);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
+ }
+ }
+
+ unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
+ SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
+ Ops[0] = N0;
+ N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
+ N0 = DAG.getBitcast(MVT::i8, N0);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
+ }
+ } else {
+ // If we're bitcasting from iX to vXi1, see if the integer originally
+ // began as a vXi1 and whether we can remove the bitcast entirely.
+ if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
+ SrcVT.isScalarInteger() &&
+ DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+ if (SDValue V =
+ combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
+ return V;
+ }
+ }
+
+ // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
+ // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
+ // due to insert_subvector legalization on KNL. By promoting the copy to i16
+ // we can help with known bits propagation from the vXi1 domain to the
+ // scalar domain.
+ if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
+ !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N0.getOperand(0).getValueType() == MVT::v16i1 &&
+ isNullConstant(N0.getOperand(1)))
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
+ DAG.getBitcast(MVT::i16, N0.getOperand(0)));
+
+ // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
+ // and the vbroadcast_load are both integer or both fp. In some cases this
+ // will remove the bitcast entirely.
+ if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
+ VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
+ auto *BCast = cast<MemIntrinsicSDNode>(N0);
+ unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
+ unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
+ // Don't swap i8/i16 since we don't have fp types of that size.
+ if (MemSize >= 32) {
+ MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
+ : MVT::getIntegerVT(MemSize);
+ MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
+ : MVT::getIntegerVT(SrcVTSize);
+ LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
+
+ SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
+ SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
+ SDValue ResNode =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
+ MemVT, BCast->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
+ return DAG.getBitcast(VT, ResNode);
+ }
+ }
+
+ // Since MMX types are special and don't usually play with other vector types,
+ // it's better to handle them early to be sure we emit efficient code by
+ // avoiding store-load conversions.
+ if (VT == MVT::x86mmx) {
+ // Detect MMX constant vectors.
+ APInt UndefElts;
+ SmallVector<APInt, 1> EltBits;
+ if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
+ SDLoc DL(N0);
+ // Handle zero-extension of i32 with MOVD.
+ if (EltBits[0].countLeadingZeros() >= 32)
+ return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
+ DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
+ // Else, bitcast to a double.
+ // TODO - investigate supporting sext 32-bit immediates on x86_64.
+ APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
+ return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
+ }
+
+ // Detect bitcasts to x86mmx low word.
+ if (N0.getOpcode() == ISD::BUILD_VECTOR &&
+ (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
+ N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
+ bool LowUndef = true, AllUndefOrZero = true;
+ for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
+ SDValue Op = N0.getOperand(i);
+ LowUndef &= Op.isUndef() || (i >= e/2);
+ AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
+ }
+ if (AllUndefOrZero) {
+ SDValue N00 = N0.getOperand(0);
+ SDLoc dl(N00);
+ N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
+ : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
+ return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
+ }
+ }
+
+ // Detect bitcasts of 64-bit build vectors and convert to a
+ // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
+ // lowest element.
+ if (N0.getOpcode() == ISD::BUILD_VECTOR &&
+ (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
+ SrcVT == MVT::v8i8))
+ return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
+
+ // Detect bitcasts between element or subvector extraction to x86mmx.
+ if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
+ isNullConstant(N0.getOperand(1))) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getValueType().is128BitVector())
+ return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
+ DAG.getBitcast(MVT::v2i64, N00));
+ }
+
+ // Detect bitcasts from FP_TO_SINT to x86mmx.
+ if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
+ SDLoc DL(N0);
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
+ DAG.getUNDEF(MVT::v2i32));
+ return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
+ DAG.getBitcast(MVT::v2i64, Res));
+ }
+ }
+
+ // Try to remove a bitcast of a constant vXi1 vector. We have to legalize
+ // most of these to scalar anyway.
+ if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
+ SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
+ ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
+ return combinevXi1ConstantToInteger(N0, DAG);
+ }
+
+ if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
+ VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+ isa<ConstantSDNode>(N0)) {
+ auto *C = cast<ConstantSDNode>(N0);
+ if (C->isAllOnesValue())
+ return DAG.getConstant(1, SDLoc(N0), VT);
+ if (C->isNullValue())
+ return DAG.getConstant(0, SDLoc(N0), VT);
+ }
+
+ // Look for a MOVMSK that is possibly truncated and then bitcast to vXi1.
+ // Turn it into a sign bit compare that produces a k-register. This avoids
+ // a trip through a GPR.
+ if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
+ VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+ isPowerOf2_32(VT.getVectorNumElements())) {
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue Src = N0;
+
+ // Peek through truncate.
+ if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
+ Src = N0.getOperand(0);
+
+ if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
+ SDValue MovmskIn = Src.getOperand(0);
+ MVT MovmskVT = MovmskIn.getSimpleValueType();
+ unsigned MovMskElts = MovmskVT.getVectorNumElements();
+
+ // We allow extra bits of the movmsk to be used since they are known zero.
+ // We can't convert a VPMOVMSKB without avx512bw.
+ if (MovMskElts <= NumElts &&
+ (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
+ EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
+ MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
+ SDLoc dl(N);
+ MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
+ SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
+ DAG.getConstant(0, dl, IntVT), ISD::SETLT);
+ if (EVT(CmpVT) == VT)
+ return Cmp;
+
+ // Pad with zeroes up to original VT to replace the zeroes that were
+ // being used from the MOVMSK.
+ unsigned NumConcats = NumElts / MovMskElts;
+ SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
+ Ops[0] = Cmp;
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
+ }
+ }
+ }
+
+ // Try to remove bitcasts from input and output of mask arithmetic to
+ // remove GPR<->K-register crossings.
+ if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
+ return V;
+
+ // Convert a bitcasted integer logic operation that has one bitcasted
+ // floating-point operand into a floating-point logic operation. This may
+ // create a load of a constant, but that is cheaper than materializing the
+ // constant in an integer register and transferring it to an SSE register or
+ // transferring the SSE operand to integer register and back.
+ unsigned FPOpcode;
+ switch (N0.getOpcode()) {
+ case ISD::AND: FPOpcode = X86ISD::FAND; break;
+ case ISD::OR: FPOpcode = X86ISD::FOR; break;
+ case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
+ default: return SDValue();
+ }
+
+ if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
+ (Subtarget.hasSSE2() && VT == MVT::f64)))
+ return SDValue();
+
+ SDValue LogicOp0 = N0.getOperand(0);
+ SDValue LogicOp1 = N0.getOperand(1);
+ SDLoc DL0(N0);
+
+ // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
+ if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
+ LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
+ !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
+ SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
+ return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
+ }
+ // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
+ if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
+ LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
+ !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
+ SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
+ return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
+ }
+
+ return SDValue();
+}
+
+ // Given an ABS node, detect the following pattern:
+// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
+// This is useful as it is the input into a SAD pattern.
+static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
+ SDValue AbsOp1 = Abs->getOperand(0);
+ if (AbsOp1.getOpcode() != ISD::SUB)
+ return false;
+
+ Op0 = AbsOp1.getOperand(0);
+ Op1 = AbsOp1.getOperand(1);
+
+ // Check if the operands of the sub are zero-extended from vectors of i8.
+ if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
+ Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
+ Op1.getOpcode() != ISD::ZERO_EXTEND ||
+ Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
+ return false;
+
+ return true;
+}
+
+// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
+// to these zexts.
+static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
+ const SDValue &Zext1, const SDLoc &DL,
+ const X86Subtarget &Subtarget) {
+ // Find the appropriate width for the PSADBW.
+ EVT InVT = Zext0.getOperand(0).getValueType();
+ unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
+
+ // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
+ // fill in the missing vector elements with 0.
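+ // For example, a v4i8 input would be widened to v16i8 by concatenating
+ // zero vectors; the extra zero lanes contribute nothing to the SAD sum.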
+ unsigned NumConcat = RegSize / InVT.getSizeInBits();
+ SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
+ Ops[0] = Zext0.getOperand(0);
+ MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
+ SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
+ Ops[0] = Zext1.getOperand(0);
+ SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
+
+ // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
+ auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
+ return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
+ };
+ MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
+ return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
+ PSADBWBuilder);
+}
+
+ // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
+// PHMINPOSUW.
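+ // PHMINPOSUW computes the unsigned minimum of the eight i16 elements, so
+ // SMIN/SMAX/UMAX reductions are mapped onto it by XOR'ing the input with a
+ // suitable mask beforehand and XOR'ing the result back afterwards.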
+static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Bail without SSE41.
+ if (!Subtarget.hasSSE41())
+ return SDValue();
+
+ EVT ExtractVT = Extract->getValueType(0);
+ if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
+ return SDValue();
+
+ // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
+ ISD::NodeType BinOp;
+ SDValue Src = DAG.matchBinOpReduction(
+ Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
+ if (!Src)
+ return SDValue();
+
+ EVT SrcVT = Src.getValueType();
+ EVT SrcSVT = SrcVT.getScalarType();
+ if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
+ return SDValue();
+
+ SDLoc DL(Extract);
+ SDValue MinPos = Src;
+
+ // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
+ while (SrcVT.getSizeInBits() > 128) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
+ SrcVT = Lo.getValueType();
+ MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
+ }
+ assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
+ (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
+ "Unexpected value type");
+
+ // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
+ // to flip the value accordingly.
+ SDValue Mask;
+ unsigned MaskEltsBits = ExtractVT.getSizeInBits();
+ if (BinOp == ISD::SMAX)
+ Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
+ else if (BinOp == ISD::SMIN)
+ Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
+ else if (BinOp == ISD::UMAX)
+ Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
+
+ if (Mask)
+ MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
+
+ // For v16i8 cases we need to perform UMIN on pairs of byte elements,
+ // shuffling each upper element down and inserting zeros. This means that the
+ // v16i8 UMIN will leave the upper element as zero, performing zero-extension
+ // ready for the PHMINPOS.
+ if (ExtractVT == MVT::i8) {
+ SDValue Upper = DAG.getVectorShuffle(
+ SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
+ {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
+ MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
+ }
+
+ // Perform the PHMINPOS on a v8i16 vector.
+ MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
+ MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
+ MinPos = DAG.getBitcast(SrcVT, MinPos);
+
+ if (Mask)
+ MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
+ DAG.getIntPtrConstant(0, DL));
+}
+
+ // Attempt to replace an all_of/any_of/parity style horizontal reduction
+ // with a MOVMSK.
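+ // any_of lowers to (MOVMSK != 0), all_of to (MOVMSK == (1 << NumElts) - 1)
+ // and parity to PARITY(MOVMSK), as materialized at the end of this function.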
+static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Bail without SSE2.
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ EVT ExtractVT = Extract->getValueType(0);
+ unsigned BitWidth = ExtractVT.getSizeInBits();
+ if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
+ ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
+ return SDValue();
+
+ // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
+ ISD::NodeType BinOp;
+ SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
+ if (!Match && ExtractVT == MVT::i1)
+ Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
+ if (!Match)
+ return SDValue();
+
+ // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
+ // which we can't support here for now.
+ if (Match.getScalarValueSizeInBits() != BitWidth)
+ return SDValue();
+
+ SDValue Movmsk;
+ SDLoc DL(Extract);
+ EVT MatchVT = Match.getValueType();
+ unsigned NumElts = MatchVT.getVectorNumElements();
+ unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ if (ExtractVT == MVT::i1) {
+ // Special case for (pre-legalization) vXi1 reductions.
+ if (NumElts > 64 || !isPowerOf2_32(NumElts))
+ return SDValue();
+ if (TLI.isTypeLegal(MatchVT)) {
+ // If this is a legal AVX512 predicate type then we can just bitcast.
+ EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
+ Movmsk = DAG.getBitcast(MovmskVT, Match);
+ } else {
+ // For all_of(setcc(vec,0,eq)) - avoid vXi64 comparisons if we don't have
+ // PCMPEQQ (SSE41+), use PCMPEQD instead.
+ if (BinOp == ISD::AND && !Subtarget.hasSSE41() &&
+ Match.getOpcode() == ISD::SETCC &&
+ ISD::isBuildVectorAllZeros(Match.getOperand(1).getNode()) &&
+ cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
+ ISD::CondCode::SETEQ) {
+ SDValue Vec = Match.getOperand(0);
+ if (Vec.getValueType().getScalarType() == MVT::i64 &&
+ (2 * NumElts) <= MaxElts) {
+ NumElts *= 2;
+ EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+ MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
+ Match = DAG.getSetCC(
+ DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
+ DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ);
+ }
+ }
+
+ // Use combineBitcastvxi1 to create the MOVMSK.
+ while (NumElts > MaxElts) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
+ Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
+ NumElts /= 2;
+ }
+ EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
+ Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
+ }
+ if (!Movmsk)
+ return SDValue();
+ Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
+ } else {
+ // FIXME: Better handling of k-registers or 512-bit vectors?
+ unsigned MatchSizeInBits = Match.getValueSizeInBits();
+ if (!(MatchSizeInBits == 128 ||
+ (MatchSizeInBits == 256 && Subtarget.hasAVX())))
+ return SDValue();
+
+ // Make sure this isn't a vector of 1 element. The perf win from using
+ // MOVMSK diminishes with fewer elements in the reduction, but it is
+ // generally better to get the comparison over to the GPRs as soon as
+ // possible to reduce the number of vector ops.
+ if (Match.getValueType().getVectorNumElements() < 2)
+ return SDValue();
+
+ // Check that we are extracting a reduction of all sign bits.
+ if (DAG.ComputeNumSignBits(Match) != BitWidth)
+ return SDValue();
+
+ if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
+ Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
+ MatchSizeInBits = Match.getValueSizeInBits();
+ }
+
+ // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
+ MVT MaskSrcVT;
+ if (64 == BitWidth || 32 == BitWidth)
+ MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
+ MatchSizeInBits / BitWidth);
+ else
+ MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
+
+ SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
+ Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
+ NumElts = MaskSrcVT.getVectorNumElements();
+ }
+ assert((NumElts <= 32 || NumElts == 64) &&
+ "Not expecting more than 64 elements");
+
+ MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
+ if (BinOp == ISD::XOR) {
+ // parity -> (PARITY(MOVMSK X))
+ SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
+ return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
+ }
+
+ SDValue CmpC;
+ ISD::CondCode CondCode;
+ if (BinOp == ISD::OR) {
+ // any_of -> MOVMSK != 0
+ CmpC = DAG.getConstant(0, DL, CmpVT);
+ CondCode = ISD::CondCode::SETNE;
+ } else {
+ // all_of -> MOVMSK == ((1 << NumElts) - 1)
+ CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
+ DL, CmpVT);
+ CondCode = ISD::CondCode::SETEQ;
+ }
+
+ // The setcc produces an i8 of 0/1, so extend that to the result width and
+ // negate to get the final 0/-1 mask value.
+ EVT SetccVT =
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
+ SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
+ SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
+ SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
+ return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
+}
+
+static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // PSADBW is only supported on SSE2 and up.
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ EVT ExtractVT = Extract->getValueType(0);
+ // Verify the type we're extracting is either i32 or i64.
+ // FIXME: Could support other types, but this is what we have coverage for.
+ if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
+ return SDValue();
+
+ EVT VT = Extract->getOperand(0).getValueType();
+ if (!isPowerOf2_32(VT.getVectorNumElements()))
+ return SDValue();
+
+ // Match shuffle + add pyramid.
+ ISD::NodeType BinOp;
+ SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
+
+ // The operand is expected to be zero extended from i8
+ // (verified in detectZextAbsDiff).
+ // In order to convert to i64 and above, additional any/zero/sign
+ // extend is expected.
+ // The zero extend from 32 bits has no mathematical effect on the result.
+ // Also, the sign extend is effectively a zero extend here
+ // (it extends the sign bit, which is zero).
+ // So it is correct to skip the sign/zero extend instruction.
+ if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
+ Root.getOpcode() == ISD::ZERO_EXTEND ||
+ Root.getOpcode() == ISD::ANY_EXTEND))
+ Root = Root.getOperand(0);
+
+ // If there was a match, we want Root to be a select that is the root of an
+ // abs-diff pattern.
+ if (!Root || Root.getOpcode() != ISD::ABS)
+ return SDValue();
+
+ // Check whether we have an abs-diff pattern feeding into the select.
+ SDValue Zext0, Zext1;
+ if (!detectZextAbsDiff(Root, Zext0, Zext1))
+ return SDValue();
+
+ // Create the SAD instruction.
+ SDLoc DL(Extract);
+ SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
+
+ // If the original vector was wider than 8 elements, sum over the results
+ // in the SAD vector.
+ unsigned Stages = Log2_32(VT.getVectorNumElements());
+ EVT SadVT = SAD.getValueType();
+ if (Stages > 3) {
+ unsigned SadElems = SadVT.getVectorNumElements();
+
+ for (unsigned i = Stages - 3; i > 0; --i) {
+ SmallVector<int, 16> Mask(SadElems, -1);
+ for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
+ Mask[j] = MaskEnd + j;
+
+ SDValue Shuffle =
+ DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
+ SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
+ }
+ }
+
+ unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
+ // Return the lowest ExtractSizeInBits bits.
+ EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
+ SadVT.getSizeInBits() / ExtractSizeInBits);
+ SAD = DAG.getBitcast(ResVT, SAD);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
+ Extract->getOperand(1));
+}
+
+// Attempt to peek through a target shuffle and extract the scalar from the
+// source.
+static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDLoc dl(N);
+ SDValue Src = N->getOperand(0);
+ SDValue Idx = N->getOperand(1);
+
+ EVT VT = N->getValueType(0);
+ EVT SrcVT = Src.getValueType();
+ EVT SrcSVT = SrcVT.getVectorElementType();
+ unsigned SrcEltBits = SrcSVT.getSizeInBits();
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+
+ // Don't attempt this for boolean mask vectors or unknown extraction indices.
+ if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
+ return SDValue();
+
+ const APInt &IdxC = N->getConstantOperandAPInt(1);
+ if (IdxC.uge(NumSrcElts))
+ return SDValue();
+
+ SDValue SrcBC = peekThroughBitcasts(Src);
+
+ // Handle extract(bitcast(broadcast(scalar_value))).
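+ // e.g. extracting i16 element 0 of (v8i16 bitcast (X86ISD::VBROADCAST i32 %x))
+ // just truncates %x, with no vector op needed.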
+ if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
+ SDValue SrcOp = SrcBC.getOperand(0);
+ EVT SrcOpVT = SrcOp.getValueType();
+ if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
+ (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
+ unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
+ unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
+ // TODO: support non-zero offsets.
+ if (Offset == 0) {
+ SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
+ SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
+ return SrcOp;
+ }
+ }
+ }
+
+ // If we're extracting a single element from a broadcast load and there are
+ // no other users, just create a single load.
+ if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
+ unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
+ if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
+ VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
+ SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
+ MemIntr->getBasePtr(),
+ MemIntr->getPointerInfo(),
+ MemIntr->getOriginalAlign(),
+ MemIntr->getMemOperand()->getFlags());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
+ return Load;
+ }
+ }
+
+ // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
+ // TODO: Move to DAGCombine?
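+ // e.g. extracting i16 element 1 of (v8i16 bitcast (v2i64 scalar_to_vector %x))
+ // becomes (trunc (srl %x, 16)), with no vector op needed.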
+ if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
+ SrcBC.getValueType().isInteger() &&
+ (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
+ SrcBC.getScalarValueSizeInBits() ==
+ SrcBC.getOperand(0).getValueSizeInBits()) {
+ unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
+ if (IdxC.ult(Scale)) {
+ unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
+ SDValue Scl = SrcBC.getOperand(0);
+ EVT SclVT = Scl.getValueType();
+ if (Offset) {
+ Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
+ DAG.getShiftAmountConstant(Offset, SclVT, dl));
+ }
+ Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
+ Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
+ return Scl;
+ }
+ }
+
+ // Handle extract(truncate(x)) for 0'th index.
+ // TODO: Treat this as a faux shuffle?
+ // TODO: When can we use this for general indices?
+ if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
+ (SrcVT.getSizeInBits() % 128) == 0) {
+ Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
+ MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
+ return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
+ Idx);
+ }
+
+ // Resolve the target shuffle inputs and mask.
+ SmallVector<int, 16> Mask;
+ SmallVector<SDValue, 2> Ops;
+ if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
+ return SDValue();
+
+ // Shuffle inputs must be the same size as the result.
+ if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
+ return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
+ }))
+ return SDValue();
+
+ // Attempt to narrow/widen the shuffle mask to the correct size.
+ if (Mask.size() != NumSrcElts) {
+ if ((NumSrcElts % Mask.size()) == 0) {
+ SmallVector<int, 16> ScaledMask;
+ int Scale = NumSrcElts / Mask.size();
+ narrowShuffleMaskElts(Scale, Mask, ScaledMask);
+ Mask = std::move(ScaledMask);
+ } else if ((Mask.size() % NumSrcElts) == 0) {
+ // Simplify Mask based on demanded element.
+ int ExtractIdx = (int)N->getConstantOperandVal(1);
+ int Scale = Mask.size() / NumSrcElts;
+ int Lo = Scale * ExtractIdx;
+ int Hi = Scale * (ExtractIdx + 1);
+ for (int i = 0, e = (int)Mask.size(); i != e; ++i)
+ if (i < Lo || Hi <= i)
+ Mask[i] = SM_SentinelUndef;
+
+ SmallVector<int, 16> WidenedMask;
+ while (Mask.size() > NumSrcElts &&
+ canWidenShuffleElements(Mask, WidenedMask))
+ Mask = std::move(WidenedMask);
+ // TODO - investigate support for wider shuffle masks with known upper
+ // undef/zero elements for implicit zero-extension.
+ }
+ }
+
+ // Check if narrowing/widening failed.
+ if (Mask.size() != NumSrcElts)
+ return SDValue();
+
+ int SrcIdx = Mask[IdxC.getZExtValue()];
+
+ // If the shuffle source element is undef/zero then we can just accept it.
+ if (SrcIdx == SM_SentinelUndef)
+ return DAG.getUNDEF(VT);
+
+ if (SrcIdx == SM_SentinelZero)
+ return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
+ : DAG.getConstant(0, dl, VT);
+
+ SDValue SrcOp = Ops[SrcIdx / Mask.size()];
+ SrcIdx = SrcIdx % Mask.size();
+
+ // We can only extract other elements from 128-bit vectors, and only in
+ // certain circumstances depending on the SSE level.
+ // TODO: Investigate using extract_subvector for larger vectors.
+ // TODO: Investigate float/double extraction if it will be just stored.
+ if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
+ ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
+ assert(SrcSVT == VT && "Unexpected extraction type");
+ SrcOp = DAG.getBitcast(SrcVT, SrcOp);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
+ DAG.getIntPtrConstant(SrcIdx, dl));
+ }
+
+ if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
+ (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
+ assert(VT.getSizeInBits() >= SrcEltBits && "Unexpected extraction type");
+ unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
+ SrcOp = DAG.getBitcast(SrcVT, SrcOp);
+ SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
+ DAG.getTargetConstant(SrcIdx, dl, MVT::i8));
+ return DAG.getZExtOrTrunc(ExtOp, dl, VT);
+ }
+
+ return SDValue();
+}
+
+/// Extracting a scalar FP value from vector element 0 is free, so extract each
+/// operand first, then perform the math as a scalar op.
+static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
+ assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
+ SDValue Vec = ExtElt->getOperand(0);
+ SDValue Index = ExtElt->getOperand(1);
+ EVT VT = ExtElt->getValueType(0);
+ EVT VecVT = Vec.getValueType();
+
+ // TODO: If this is a unary/expensive/expand op, allow extraction from a
+ // non-zero element because the shuffle+scalar op will be cheaper?
+ if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
+ return SDValue();
+
+ // Vector FP compares don't fit the pattern of FP math ops (propagate, not
+ // extract, the condition code), so deal with those as a special-case.
+ if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
+ EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
+ if (OpVT != MVT::f32 && OpVT != MVT::f64)
+ return SDValue();
+
+ // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
+ SDLoc DL(ExtElt);
+ SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
+ Vec.getOperand(0), Index);
+ SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
+ Vec.getOperand(1), Index);
+ return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
+ }
+
+ if (VT != MVT::f32 && VT != MVT::f64)
+ return SDValue();
+
+ // Vector FP selects don't fit the pattern of FP math ops (because the
+ // condition has a different type and we have to change the opcode), so deal
+ // with those here.
+ // FIXME: This is restricted to pre type legalization by ensuring the setcc
+ // has i1 elements. If we loosen this we need to convert vector bool to a
+ // scalar bool.
+ if (Vec.getOpcode() == ISD::VSELECT &&
+ Vec.getOperand(0).getOpcode() == ISD::SETCC &&
+ Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
+ Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
+ // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
+ SDLoc DL(ExtElt);
+ SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+ Vec.getOperand(0).getValueType().getScalarType(),
+ Vec.getOperand(0), Index);
+ SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+ Vec.getOperand(1), Index);
+ SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+ Vec.getOperand(2), Index);
+ return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
+ }
+
+ // TODO: This switch could include FNEG and the x86-specific FP logic ops
+ // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
+ // missed load folding and fma+fneg combining.
+ switch (Vec.getOpcode()) {
+ case ISD::FMA: // Begin 3 operands
+ case ISD::FMAD:
+ case ISD::FADD: // Begin 2 operands
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::FCOPYSIGN:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMAXNUM_IEEE:
+ case ISD::FMAXIMUM:
+ case ISD::FMINIMUM:
+ case X86ISD::FMAX:
+ case X86ISD::FMIN:
+ case ISD::FABS: // Begin 1 operand
+ case ISD::FSQRT:
+ case ISD::FRINT:
+ case ISD::FCEIL:
+ case ISD::FTRUNC:
+ case ISD::FNEARBYINT:
+ case ISD::FROUND:
+ case ISD::FFLOOR:
+ case X86ISD::FRCP:
+ case X86ISD::FRSQRT: {
+ // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
+ SDLoc DL(ExtElt);
+ SmallVector<SDValue, 4> ExtOps;
+ for (SDValue Op : Vec->ops())
+ ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
+ return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
+ }
+ default:
+ return SDValue();
+ }
+ llvm_unreachable("All opcodes should return within switch");
+}
+
+/// Try to convert a vector reduction sequence composed of binops and shuffles
+/// into horizontal ops.
+static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
+
+ // We need at least SSE2 to do anything here.
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ ISD::NodeType Opc;
+ SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
+ {ISD::ADD, ISD::MUL, ISD::FADD}, true);
+ if (!Rdx)
+ return SDValue();
+
+ SDValue Index = ExtElt->getOperand(1);
+ assert(isNullConstant(Index) &&
+ "Reduction doesn't end in an extract from index 0");
+
+ EVT VT = ExtElt->getValueType(0);
+ EVT VecVT = Rdx.getValueType();
+ if (VecVT.getScalarType() != VT)
+ return SDValue();
+
+ SDLoc DL(ExtElt);
+
+ // vXi8 mul reduction - promote to vXi16 mul reduction.
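+ // e.g. a v16i8 multiply reduction is unpacked into two v8i16 halves that are
+ // multiplied together and then reduced with v8i16 shuffle+mul steps; only the
+ // low byte of each lane matters for the final i8 result.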
+ if (Opc == ISD::MUL) {
+ unsigned NumElts = VecVT.getVectorNumElements();
+ if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
+ return SDValue();
+ if (VecVT.getSizeInBits() >= 128) {
+ EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
+ SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
+ SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
+ Lo = DAG.getBitcast(WideVT, Lo);
+ Hi = DAG.getBitcast(WideVT, Hi);
+ Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
+ while (Rdx.getValueSizeInBits() > 128) {
+ std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
+ Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
+ }
+ } else {
+ if (VecVT == MVT::v4i8)
+ Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
+ DAG.getUNDEF(MVT::v4i8));
+ Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
+ DAG.getUNDEF(MVT::v8i8));
+ Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
+ Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
+ }
+ if (NumElts >= 8)
+ Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
+ DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
+ {4, 5, 6, 7, -1, -1, -1, -1}));
+ Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
+ DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
+ {2, 3, -1, -1, -1, -1, -1, -1}));
+ Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
+ DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
+ {1, -1, -1, -1, -1, -1, -1, -1}));
+ Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+ }
+
+ // vXi8 add reduction - sub-128-bit vector.
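+ // e.g. a v8i8 reduction is widened to v16i8 with an undef upper half; PSADBW
+ // against zero sums each 8-byte half into a 64-bit lane, and only the low
+ // lane (the defined bytes) is extracted.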
+ if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
+ if (VecVT == MVT::v4i8) {
+ // Pad with zero.
+ if (Subtarget.hasSSE41()) {
+ Rdx = DAG.getBitcast(MVT::i32, Rdx);
+ Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
+ DAG.getConstant(0, DL, MVT::v4i32), Rdx,
+ DAG.getIntPtrConstant(0, DL));
+ Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
+ } else {
+ Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
+ DAG.getConstant(0, DL, VecVT));
+ }
+ }
+ if (Rdx.getValueType() == MVT::v8i8) {
+ // Pad with undef.
+ Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
+ DAG.getUNDEF(MVT::v8i8));
+ }
+ Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
+ DAG.getConstant(0, DL, MVT::v16i8));
+ Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+ }
+
+ // Must be a >=128-bit vector with pow2 elements.
+ if ((VecVT.getSizeInBits() % 128) != 0 ||
+ !isPowerOf2_32(VecVT.getVectorNumElements()))
+ return SDValue();
+
+ // vXi8 add reduction - sum lo/hi halves then use PSADBW.
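+ // e.g. a v32i8 reduction first adds its two 128-bit halves, then folds the
+ // high 8 bytes onto the low 8 bytes, and the PSADBW against zero leaves the
+ // final i8 sum in byte 0.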
+ if (VT == MVT::i8) {
+ while (Rdx.getValueSizeInBits() > 128) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
+ VecVT = Lo.getValueType();
+ Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
+ }
+ assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
+
+ SDValue Hi = DAG.getVectorShuffle(
+ MVT::v16i8, DL, Rdx, Rdx,
+ {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
+ Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
+ Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
+ getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
+ Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+ }
+
+ // Only use (F)HADD opcodes if they aren't microcoded or we're minimizing codesize.
+ if (!shouldUseHorizontalOp(true, DAG, Subtarget))
+ return SDValue();
+
+ unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
+
+ // 256-bit horizontal instructions operate on 128-bit chunks rather than
+ // across the whole vector, so we need an extract + hop preliminary stage.
+ // This is the only step where the operands of the hop are not the same value.
+ // TODO: We could extend this to handle 512-bit or even longer vectors.
+ if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
+ ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
+ unsigned NumElts = VecVT.getVectorNumElements();
+ SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
+ SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
+ Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
+ VecVT = Rdx.getValueType();
+ }
+ if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
+ !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
+ return SDValue();
+
+ // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
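+ // e.g. a v4f32 fadd reduction becomes two HADDPS ops on the same register,
+ // followed by an extract of element 0.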
+ unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
+ for (unsigned i = 0; i != ReductionSteps; ++i)
+ Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+}
+
+/// Detect vector gather/scatter index generation and convert it from being a
+/// bunch of shuffles and extracts into a somewhat faster sequence.
+/// For i686, the best sequence is apparently storing the value and loading
+/// scalars back, while for x64 we should use 64-bit extracts and shifts.
+static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
+ return NewOp;
+
+ SDValue InputVector = N->getOperand(0);
+ SDValue EltIdx = N->getOperand(1);
+ auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
+
+ EVT SrcVT = InputVector.getValueType();
+ EVT VT = N->getValueType(0);
+ SDLoc dl(InputVector);
+ bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+
+ if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
+ return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
+
+ // Integer Constant Folding.
+ if (CIdx && VT.isInteger()) {
+ APInt UndefVecElts;
+ SmallVector<APInt, 16> EltBits;
+ unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
+ if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
+ EltBits, true, false)) {
+ uint64_t Idx = CIdx->getZExtValue();
+ if (UndefVecElts[Idx])
+ return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
+ return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
+ dl, VT);
+ }
+ }
+
+ if (IsPextr) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(
+ SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
+ return SDValue(N, 0);
+
+ // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
+ if ((InputVector.getOpcode() == X86ISD::PINSRB ||
+ InputVector.getOpcode() == X86ISD::PINSRW) &&
+ InputVector.getOperand(2) == EltIdx) {
+ assert(SrcVT == InputVector.getOperand(0).getValueType() &&
+ "Vector type mismatch");
+ SDValue Scl = InputVector.getOperand(1);
+ Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
+ return DAG.getZExtOrTrunc(Scl, dl, VT);
+ }
+
+ // TODO - Remove this once we can handle the implicit zero-extension of
+ // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
+ // combineBasicSADPattern.
+ return SDValue();
+ }
+
+ // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
+ if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
+ VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
+ SDValue MMXSrc = InputVector.getOperand(0);
+
+ // The bitcast source is a direct mmx result.
+ if (MMXSrc.getValueType() == MVT::x86mmx)
+ return DAG.getBitcast(VT, InputVector);
+ }
+
+ // Detect mmx to i32 conversion through a v2i32 elt extract.
+ if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
+ VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
+ SDValue MMXSrc = InputVector.getOperand(0);
+
+ // The bitcast source is a direct mmx result.
+ if (MMXSrc.getValueType() == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
+ }
+
+ // Check whether this extract is the root of a sum of absolute differences
+ // pattern. This has to be done here because we really want it to happen
+ // pre-legalization.
+ if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
+ return SAD;
+
+ // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
+ if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
+ return Cmp;
+
+ // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
+ if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
+ return MinMax;
+
+ // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
+ if (SDValue V = combineArithReduction(N, DAG, Subtarget))
+ return V;
+
+ if (SDValue V = scalarizeExtEltFP(N, DAG))
+ return V;
+
+ // Attempt to extract an i1 element by using MOVMSK to extract the sign bits
+ // and then testing the relevant element.
+ //
+ // Note that we only combine extracts on the *same* result number, i.e.
+ // t0 = merge_values a0, a1, a2, a3
+ // i1 = extract_vector_elt t0, Constant:i64<2>
+ // i1 = extract_vector_elt t0, Constant:i64<3>
+ // but not
+ // i1 = extract_vector_elt t0:1, Constant:i64<2>
+ // since the latter would need its own MOVMSK.
+ if (CIdx && SrcVT.getScalarType() == MVT::i1) {
+ SmallVector<SDNode *, 16> BoolExtracts;
+ unsigned ResNo = InputVector.getResNo();
+ auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) {
+ if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ isa<ConstantSDNode>(Use->getOperand(1)) &&
+ Use->getOperand(0).getResNo() == ResNo &&
+ Use->getValueType(0) == MVT::i1) {
+ BoolExtracts.push_back(Use);
+ return true;
+ }
+ return false;
+ };
+ if (all_of(InputVector->uses(), IsBoolExtract) &&
+ BoolExtracts.size() > 1) {
+ EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
+ if (SDValue BC =
+ combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
+ for (SDNode *Use : BoolExtracts) {
+ // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
+ unsigned MaskIdx = Use->getConstantOperandVal(1);
+ APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
+ SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
+ SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
+ Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
+ DCI.CombineTo(Use, Res);
+ }
+ return SDValue(N, 0);
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+/// If a vector select has an operand that is -1 or 0, try to simplify the
+/// select to a bitwise logic operation.
+/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
+static SDValue
+combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDValue Cond = N->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ EVT VT = LHS.getValueType();
+ EVT CondVT = Cond.getValueType();
+ SDLoc DL(N);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ if (N->getOpcode() != ISD::VSELECT)
+ return SDValue();
+
+ assert(CondVT.isVector() && "Vector select expects a vector selector!");
+
+ // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
+ // TODO: Can we assert that both operands are not zeros (because that should
+ // get simplified at node creation time)?
+ bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
+ bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+ // If both inputs are 0/undef, create a complete zero vector.
+ // FIXME: As noted above this should be handled by DAGCombiner/getNode.
+ if (TValIsAllZeros && FValIsAllZeros) {
+ if (VT.isFloatingPoint())
+ return DAG.getConstantFP(0.0, DL, VT);
+ return DAG.getConstant(0, DL, VT);
+ }
+
+ // To use the condition operand as a bitwise mask, it must have elements that
+ // are the same size as the select elements. I.e., the condition operand must
+ // have already been promoted from the IR select condition type <N x i1>.
+ // Don't check if the types themselves are equal because that excludes
+ // vector floating-point selects.
+ if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
+ return SDValue();
+
+ // Try to invert the condition if the true value is not all 1s and the false
+ // value is not all 0s. Only do this if the condition has one use.
+ bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
+ if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
+ // Check if the selector will be produced by CMPP*/PCMP*.
+ Cond.getOpcode() == ISD::SETCC &&
+ // Check if SETCC has already been promoted.
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
+ CondVT) {
+ bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
+
+ if (TValIsAllZeros || FValIsAllOnes) {
+ SDValue CC = Cond.getOperand(2);
+ ISD::CondCode NewCC = ISD::getSetCCInverse(
+ cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
+ Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
+ NewCC);
+ std::swap(LHS, RHS);
+ TValIsAllOnes = FValIsAllOnes;
+ FValIsAllZeros = TValIsAllZeros;
+ }
+ }
+
+ // Cond value must be 'sign splat' to be converted to a logical op.
+ if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
+ return SDValue();
+
+ // vselect Cond, 111..., 000... -> Cond
+ if (TValIsAllOnes && FValIsAllZeros)
+ return DAG.getBitcast(VT, Cond);
+
+ if (!TLI.isTypeLegal(CondVT))
+ return SDValue();
+
+ // vselect Cond, 111..., X -> or Cond, X
+ if (TValIsAllOnes) {
+ SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
+ SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
+ return DAG.getBitcast(VT, Or);
+ }
+
+ // vselect Cond, X, 000... -> and Cond, X
+ if (FValIsAllZeros) {
+ SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
+ SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
+ return DAG.getBitcast(VT, And);
+ }
+
+ // vselect Cond, 000..., X -> andn Cond, X
+ if (TValIsAllZeros) {
+ SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
+ SDValue AndN;
+ // The canonical form differs for i1 vectors - x86andnp is not used.
+ if (CondVT.getScalarType() == MVT::i1)
+ AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
+ CastRHS);
+ else
+ AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
+ return DAG.getBitcast(VT, AndN);
+ }
+
+ return SDValue();
+}
+
+/// If both arms of a vector select are concatenated vectors, split the select,
+/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
+/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
+/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
+static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ unsigned Opcode = N->getOpcode();
+ if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
+ return SDValue();
+
+ // TODO: Split 512-bit vectors too?
+ EVT VT = N->getValueType(0);
+ if (!VT.is256BitVector())
+ return SDValue();
+
+ // TODO: Split as long as any 2 of the 3 operands are concatenated?
+ SDValue Cond = N->getOperand(0);
+ SDValue TVal = N->getOperand(1);
+ SDValue FVal = N->getOperand(2);
+ SmallVector<SDValue, 4> CatOpsT, CatOpsF;
+ if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
+ !collectConcatOps(TVal.getNode(), CatOpsT) ||
+ !collectConcatOps(FVal.getNode(), CatOpsF))
+ return SDValue();
+
+ auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
+ makeBlend, /*CheckBWI*/ false);
+}
+
+static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
+ SDValue Cond = N->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ SDLoc DL(N);
+
+ auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
+ auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
+ if (!TrueC || !FalseC)
+ return SDValue();
+
+ // Don't do this for integer types that aren't legal.
+ EVT VT = N->getValueType(0);
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ // We're going to use the condition bit in math or logic ops. We could allow
+ // this with a wider condition value (post-legalization it becomes an i8),
+ // but if nothing is creating selects that late, it doesn't matter.
+ if (Cond.getValueType() != MVT::i1)
+ return SDValue();
+
+ // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
+ // 3, 5, or 9 with i32/i64, so those get transformed too.
+ // TODO: For constants that overflow or do not differ by power-of-2 or small
+ // multiplier, convert to 'and' + 'add'.
+ const APInt &TrueVal = TrueC->getAPIntValue();
+ const APInt &FalseVal = FalseC->getAPIntValue();
+ bool OV;
+ APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
+ if (OV)
+ return SDValue();
+
+ APInt AbsDiff = Diff.abs();
+ if (AbsDiff.isPowerOf2() ||
+ ((VT == MVT::i32 || VT == MVT::i64) &&
+ (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
+
+ // We need a positive multiplier constant for shift/LEA codegen. The 'not'
+ // of the condition can usually be folded into a compare predicate, but even
+ // without that, the sequence should be cheaper than a CMOV alternative.
+ if (TrueVal.slt(FalseVal)) {
+ Cond = DAG.getNOT(DL, Cond, MVT::i1);
+ std::swap(TrueC, FalseC);
+ }
+
+ // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
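+ // e.g. select %c, 7, 3 --> (zext(%c) * 4) + 3, where the multiply is later
+ // lowered to a shift.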
+ SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
+
+ // Multiply condition by the difference if non-one.
+ if (!AbsDiff.isOneValue())
+ R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
+
+ // Add the base if non-zero.
+ if (!FalseC->isNullValue())
+ R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
+
+ return R;
+ }
+
+ return SDValue();
+}
+
+/// If this is a *dynamic* select (non-constant condition) and we can match
+/// this node with one of the variable blend instructions, restructure the
+/// condition so that blends can use the high (sign) bit of each element.
+/// This function will also call SimplifyDemandedBits on already created
+/// BLENDV to perform additional simplifications.
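+/// e.g. PBLENDVB selects each result byte based only on bit 7 of the
+/// corresponding mask byte, so only the sign bit of Cond needs to be correct.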
+static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDValue Cond = N->getOperand(0);
+ if ((N->getOpcode() != ISD::VSELECT &&
+ N->getOpcode() != X86ISD::BLENDV) ||
+ ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+ return SDValue();
+
+ // Don't optimize before the condition has been transformed to a legal type
+ // and don't ever optimize vector selects that map to AVX512 mask-registers.
+ unsigned BitWidth = Cond.getScalarValueSizeInBits();
+ if (BitWidth < 8 || BitWidth > 64)
+ return SDValue();
+
+ // We can only handle the cases where VSELECT is directly legal on the
+ // subtarget. We custom lower VSELECT nodes with constant conditions and
+ // this makes it hard to see whether a dynamic VSELECT will correctly
+ // lower, so we both check the operation's status and explicitly handle the
+ // cases where a *dynamic* blend will fail even though a constant-condition
+ // blend could be custom lowered.
+ // FIXME: We should find a better way to handle this class of problems.
+ // Potentially, we should combine constant-condition vselect nodes
+ // pre-legalization into shuffles and not mark as many types as custom
+ // lowered.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT VT = N->getValueType(0);
+ if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
+ return SDValue();
+ // FIXME: We don't support i16-element blends currently. We could and
+ // should support them by making *all* the bits in the condition be set
+ // rather than just the high bit and using an i8-element blend.
+ if (VT.getVectorElementType() == MVT::i16)
+ return SDValue();
+ // Dynamic blending was only available from SSE4.1 onward.
+ if (VT.is128BitVector() && !Subtarget.hasSSE41())
+ return SDValue();
+ // Byte blends are only available in AVX2.
+ if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
+ return SDValue();
+ // There are no 512-bit blend instructions that use sign bits.
+ if (VT.is512BitVector())
+ return SDValue();
+
+ auto OnlyUsedAsSelectCond = [](SDValue Cond) {
+ for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
+ UI != UE; ++UI)
+ if ((UI->getOpcode() != ISD::VSELECT &&
+ UI->getOpcode() != X86ISD::BLENDV) ||
+ UI.getOperandNo() != 0)
+ return false;
+
+ return true;
+ };
+
+ APInt DemandedBits(APInt::getSignMask(BitWidth));
+
+ if (OnlyUsedAsSelectCond(Cond)) {
+ KnownBits Known;
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
+ return SDValue();
+
+ // If we changed the computation somewhere in the DAG, this change will
+ // affect all users of Cond. Update all the nodes so that we do not use
+ // the generic VSELECT anymore. Otherwise, we may perform wrong
+ // optimizations as we messed with the actual expectation for the vector
+ // boolean values.
+ for (SDNode *U : Cond->uses()) {
+ if (U->getOpcode() == X86ISD::BLENDV)
+ continue;
+
+ SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
+ Cond, U->getOperand(1), U->getOperand(2));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
+ DCI.AddToWorklist(U);
+ }
+ DCI.CommitTargetLoweringOpt(TLO);
+ return SDValue(N, 0);
+ }
+
+ // Otherwise we can still at least try to simplify multiple use bits.
+ if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
+ return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
+ N->getOperand(1), N->getOperand(2));
+
+ return SDValue();
+}
+
+// Try to match:
+// (or (and M, (sub 0, X)), (pandn M, X))
+// which is a special case of:
+// (select M, (sub 0, X), X)
+// Per:
+// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
+// We know that, if fNegate is 0 or 1:
+// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
+//
+// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
+// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
+// ( M ? -X : X) == ((X ^ M ) + (M & 1))
+// This lets us transform our vselect to:
+// (add (xor X, M), (and M, 1))
+// And further to:
+// (sub (xor X, M), M)
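+// e.g. with M == all-ones: (X ^ M) - M == ~X - (-1) == ~X + 1 == -X,
+// and with M == 0:         (X ^ 0) - 0 == X.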
+static SDValue combineLogicBlendIntoConditionalNegate(
+ EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+ EVT MaskVT = Mask.getValueType();
+ assert(MaskVT.isInteger() &&
+ DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
+ "Mask must be zero/all-bits");
+
+ if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
+ return SDValue();
+ if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
+ return SDValue();
+
+ auto IsNegV = [](SDNode *N, SDValue V) {
+ return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
+ ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
+ };
+
+ SDValue V;
+ if (IsNegV(Y.getNode(), X))
+ V = X;
+ else if (IsNegV(X.getNode(), Y))
+ V = Y;
+ else
+ return SDValue();
+
+ SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
+ SDValue SubOp2 = Mask;
+
+ // If the negate was on the false side of the select, then
+ // the operands of the SUB need to be swapped. PR 27251.
+ // This is because the pattern being matched above is
+// (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
+ // but if the pattern matched was
+// (vselect M, X, (sub 0, X)), which is really the negation of the pattern
+ // above, -(vselect M, (sub 0, X), X), and therefore the replacement
+ // pattern also needs to be a negation of the replacement pattern above.
+ // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
+ // sub accomplishes the negation of the replacement pattern.
+ if (V == Y)
+ std::swap(SubOp1, SubOp2);
+
+ SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
+ return DAG.getBitcast(VT, Res);
+}
+
+/// Do target-specific dag combines on SELECT and VSELECT nodes.
+static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDLoc DL(N);
+ SDValue Cond = N->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+
+ // Try simplification again because we use this function to optimize
+ // BLENDV nodes that are not handled by the generic combiner.
+ if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
+ return V;
+
+ EVT VT = LHS.getValueType();
+ EVT CondVT = Cond.getValueType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
+
+ // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
+ // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
+ // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
+ if (CondVT.isVector() && CondVT.isInteger() &&
+ CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
+ (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
+ DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
+ if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
+ DL, DAG, Subtarget))
+ return V;
+
+ // Convert vselects with constant condition into shuffles.
+ if (CondConstantVector && DCI.isBeforeLegalizeOps()) {
+ SmallVector<int, 64> Mask;
+ if (createShuffleMaskFromVSELECT(Mask, Cond))
+ return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
+ }
+
+ // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
+ // by forcing the unselected elements to zero.
+ // TODO: Can we handle more shuffles with this?
+ if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
+ LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
+ LHS.hasOneUse() && RHS.hasOneUse()) {
+ MVT SimpleVT = VT.getSimpleVT();
+ bool LHSUnary, RHSUnary;
+ SmallVector<SDValue, 1> LHSOps, RHSOps;
+ SmallVector<int, 64> LHSMask, RHSMask, CondMask;
+ if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
+ getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask,
+ LHSUnary) &&
+ getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask,
+ RHSUnary)) {
+ int NumElts = VT.getVectorNumElements();
+ for (int i = 0; i != NumElts; ++i) {
+ if (CondMask[i] < NumElts)
+ RHSMask[i] = 0x80;
+ else
+ LHSMask[i] = 0x80;
+ }
+ LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
+ getConstVector(LHSMask, SimpleVT, DAG, DL, true));
+ RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
+ getConstVector(RHSMask, SimpleVT, DAG, DL, true));
+ return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
+ }
+ }
+
+ // If we have SSE[12] support, try to form min/max nodes. SSE min/max
+ // instructions match the semantics of the common C idiom x<y?x:y but not
+ // x<=y?x:y, because of how they handle negative zero (which can be
+ // ignored in unsafe-math mode).
+ // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
+ if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
+ VT != MVT::f80 && VT != MVT::f128 &&
+ (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
+ (Subtarget.hasSSE2() ||
+ (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+ unsigned Opcode = 0;
+ // Check for x CC y ? x : y.
+ if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+ DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+ switch (CC) {
+ default: break;
+ case ISD::SETULT:
+ // Converting this to a min would handle NaNs incorrectly, and swapping
+ // the operands would cause it to handle comparisons between positive
+ // and negative zero incorrectly.
+ if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
+ !(DAG.isKnownNeverZeroFloat(LHS) ||
+ DAG.isKnownNeverZeroFloat(RHS)))
+ break;
+ std::swap(LHS, RHS);
+ }
+ Opcode = X86ISD::FMIN;
+ break;
+ case ISD::SETOLE:
+ // Converting this to a min would handle comparisons between positive
+ // and negative zero incorrectly.
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
+ !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
+ break;
+ Opcode = X86ISD::FMIN;
+ break;
+ case ISD::SETULE:
+ // Converting this to a min would handle both negative zeros and NaNs
+ // incorrectly, but we can swap the operands to fix both.
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ISD::SETOLT:
+ case ISD::SETLT:
+ case ISD::SETLE:
+ Opcode = X86ISD::FMIN;
+ break;
+
+ case ISD::SETOGE:
+ // Converting this to a max would handle comparisons between positive
+ // and negative zero incorrectly.
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
+ !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
+ break;
+ Opcode = X86ISD::FMAX;
+ break;
+ case ISD::SETUGT:
+ // Converting this to a max would handle NaNs incorrectly, and swapping
+ // the operands would cause it to handle comparisons between positive
+ // and negative zero incorrectly.
+ if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
+ !(DAG.isKnownNeverZeroFloat(LHS) ||
+ DAG.isKnownNeverZeroFloat(RHS)))
+ break;
+ std::swap(LHS, RHS);
+ }
+ Opcode = X86ISD::FMAX;
+ break;
+ case ISD::SETUGE:
+ // Converting this to a max would handle both negative zeros and NaNs
+ // incorrectly, but we can swap the operands to fix both.
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ISD::SETOGT:
+ case ISD::SETGT:
+ case ISD::SETGE:
+ Opcode = X86ISD::FMAX;
+ break;
+ }
+ // Check for x CC y ? y : x -- a min/max with reversed arms.
+ } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
+ DAG.isEqualTo(RHS, Cond.getOperand(0))) {
+ switch (CC) {
+ default: break;
+ case ISD::SETOGE:
+ // Converting this to a min would handle comparisons between positive
+ // and negative zero incorrectly, and swapping the operands would
+ // cause it to handle NaNs incorrectly.
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
+ !(DAG.isKnownNeverZeroFloat(LHS) ||
+ DAG.isKnownNeverZeroFloat(RHS))) {
+ if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+ break;
+ std::swap(LHS, RHS);
+ }
+ Opcode = X86ISD::FMIN;
+ break;
+ case ISD::SETUGT:
+ // Converting this to a min would handle NaNs incorrectly.
+ if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+ break;
+ Opcode = X86ISD::FMIN;
+ break;
+ case ISD::SETUGE:
+ // Converting this to a min would handle both negative zeros and NaNs
+ // incorrectly, but we can swap the operands to fix both.
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ISD::SETOGT:
+ case ISD::SETGT:
+ case ISD::SETGE:
+ Opcode = X86ISD::FMIN;
+ break;
+
+ case ISD::SETULT:
+ // Converting this to a max would handle NaNs incorrectly.
+ if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+ break;
+ Opcode = X86ISD::FMAX;
+ break;
+ case ISD::SETOLE:
+ // Converting this to a max would handle comparisons between positive
+ // and negative zero incorrectly, and swapping the operands would
+ // cause it to handle NaNs incorrectly.
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
+ !DAG.isKnownNeverZeroFloat(LHS) &&
+ !DAG.isKnownNeverZeroFloat(RHS)) {
+ if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+ break;
+ std::swap(LHS, RHS);
+ }
+ Opcode = X86ISD::FMAX;
+ break;
+ case ISD::SETULE:
+ // Converting this to a max would handle both negative zeros and NaNs
+ // incorrectly, but we can swap the operands to fix both.
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ISD::SETOLT:
+ case ISD::SETLT:
+ case ISD::SETLE:
+ Opcode = X86ISD::FMAX;
+ break;
+ }
+ }
+
+ if (Opcode)
+ return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
+ }
+
+ // Some mask scalar intrinsics rely on checking whether the lowest bit of the
+ // mask is set, and implement it in C code like this:
+ // A[0] = (U & 1) ? A[0] : W[0];
+ // This creates some redundant instructions that break pattern matching.
+ // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
+ if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
+ Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ SDValue AndNode = Cond.getOperand(0);
+ if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
+ isNullConstant(Cond.getOperand(1)) &&
+ isOneConstant(AndNode.getOperand(1))) {
+ // LHS and RHS are swapped because the setcc outputs 1 when the AND
+ // result is 0, and vice versa.
+ AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
+ return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
+ }
+ }
+
+ // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
+ // lowering on KNL. In this case we convert it to
+ // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
+ // The same applies to all vectors of i8 and i16 elements without BWI.
+ // Make sure we extend these even before type legalization gets a chance to
+ // split wide vectors.
+ // Since SKX these selects have a proper lowering.
+ if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
+ CondVT.getVectorElementType() == MVT::i1 &&
+ (VT.getVectorElementType() == MVT::i8 ||
+ VT.getVectorElementType() == MVT::i16)) {
+ Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
+ return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
+ }
+
+ // AVX512 - Extend select with zero to merge with target shuffle.
+ // select(mask, extract_subvector(shuffle(x)), zero) -->
+ // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
+ // TODO - support non target shuffles as well.
+ if (Subtarget.hasAVX512() && CondVT.isVector() &&
+ CondVT.getVectorElementType() == MVT::i1) {
+ auto SelectableOp = [&TLI](SDValue Op) {
+ return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ isTargetShuffle(Op.getOperand(0).getOpcode()) &&
+ isNullConstant(Op.getOperand(1)) &&
+ TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
+ Op.hasOneUse() && Op.getOperand(0).hasOneUse();
+ };
+
+ bool SelectableLHS = SelectableOp(LHS);
+ bool SelectableRHS = SelectableOp(RHS);
+ bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
+ bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+ if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
+ EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
+ : RHS.getOperand(0).getValueType();
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ EVT SrcCondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumSrcElts);
+ LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
+ VT.getSizeInBits());
+ RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
+ VT.getSizeInBits());
+ Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
+ DAG.getUNDEF(SrcCondVT), Cond,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
+ return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
+ }
+ }
+
+ if (SDValue V = combineSelectOfTwoConstants(N, DAG))
+ return V;
+
+ // Canonicalize min/max:
+ // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
+ // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
+ // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
+ // the need for an extra compare against zero. e.g.
+ // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
+ // subl %esi, %edi
+ // testl %edi, %edi
+ // movl $0, %eax
+ // cmovgl %edi, %eax
+ // =>
+ // xorl %eax, %eax
+ // subl %esi, %edi
+ // cmovsl %eax, %edi
+ //
+ // We can also canonicalize
+ // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
+ // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
+ // This allows the use of a test instruction for the compare.
+ if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
+ Cond.hasOneUse() &&
+ LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
+ (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
+ ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
+ Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
+ Cond.getOperand(0), Cond.getOperand(1), NewCC);
+ return DAG.getSelect(DL, VT, Cond, LHS, RHS);
+ }
+ if (CC == ISD::SETUGT && isOneConstant(RHS)) {
+ ISD::CondCode NewCC = ISD::SETUGE;
+ Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
+ Cond.getOperand(0), Cond.getOperand(1), NewCC);
+ return DAG.getSelect(DL, VT, Cond, LHS, RHS);
+ }
+ }
+
+ // Check if the first operand is all zeros and Cond type is vXi1.
+ // If this is an AVX512 target we can improve the use of zero masking by
+ // swapping the operands and inverting the condition.
+ if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
+ Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
+ ISD::isBuildVectorAllZeros(LHS.getNode()) &&
+ !ISD::isBuildVectorAllZeros(RHS.getNode())) {
+ // Invert the cond to not(cond) : xor(op,allones)=not(op)
+ SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
+ // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
+ return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
+ }
+
+ // Early exit check
+ if (!TLI.isTypeLegal(VT))
+ return SDValue();
+
+ if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
+ return V;
+
+ if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
+ return V;
+
+ if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
+ return V;
+
+ // select(~Cond, X, Y) -> select(Cond, Y, X)
+ if (CondVT.getScalarType() != MVT::i1) {
+ if (SDValue CondNot = IsNOT(Cond, DAG))
+ return DAG.getNode(N->getOpcode(), DL, VT,
+ DAG.getBitcast(CondVT, CondNot), RHS, LHS);
+ // pcmpgt(X, -1) -> pcmpgt(0, X) so that select/blendv only needs the sign bit.
+ if (Cond.getOpcode() == X86ISD::PCMPGT && Cond.hasOneUse() &&
+ ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode())) {
+ Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
+ DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
+ return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
+ }
+ }
+
+ // Try to optimize vXi1 selects if both operands are either all constants or
+ // bitcasts from a scalar integer type. In that case we can convert the operands
+ // to integer and use an integer select which will be converted to a CMOV.
+ // We need to take a little bit of care to avoid creating an i64 type after
+ // type legalization.
+ if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
+ VT.getVectorElementType() == MVT::i1 &&
+ (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
+ bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
+ bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
+
+ if ((LHSIsConst ||
+ (LHS.getOpcode() == ISD::BITCAST &&
+ LHS.getOperand(0).getValueType() == IntVT)) &&
+ (RHSIsConst ||
+ (RHS.getOpcode() == ISD::BITCAST &&
+ RHS.getOperand(0).getValueType() == IntVT))) {
+ if (LHSIsConst)
+ LHS = combinevXi1ConstantToInteger(LHS, DAG);
+ else
+ LHS = LHS.getOperand(0);
+
+ if (RHSIsConst)
+ RHS = combinevXi1ConstantToInteger(RHS, DAG);
+ else
+ RHS = RHS.getOperand(0);
+
+ SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
+ return DAG.getBitcast(VT, Select);
+ }
+ }
+
+ // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
+ // single bits, then invert the predicate and swap the select operands.
+ // This can be lowered using a vector shift bit-hack rather than mask and compare.
+ if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
+ N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+ Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
+ Cond.getOperand(0).getOpcode() == ISD::AND &&
+ isNullOrNullSplat(Cond.getOperand(1)) &&
+ cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
+ Cond.getOperand(0).getValueType() == VT) {
+ // The 'and' mask must be composed of power-of-2 constants.
+ SDValue And = Cond.getOperand(0);
+ auto *C = isConstOrConstSplat(And.getOperand(1));
+ if (C && C->getAPIntValue().isPowerOf2()) {
+ // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
+ SDValue NotCond =
+ DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
+ return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
+ }
+
+ // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
+ // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
+ // 16-bit lacks a proper blendv.
+ unsigned EltBitWidth = VT.getScalarSizeInBits();
+ bool CanShiftBlend =
+ TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
+ (Subtarget.hasAVX2() && EltBitWidth == 64) ||
+ (Subtarget.hasXOP()));
+ if (CanShiftBlend &&
+ ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
+ return C->getAPIntValue().isPowerOf2();
+ })) {
+ // Create a left-shift constant to get the mask bits over to the sign-bit.
+ SDValue Mask = And.getOperand(1);
+ SmallVector<int, 32> ShlVals;
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+ auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
+ ShlVals.push_back(EltBitWidth - 1 -
+ MaskVal->getAPIntValue().exactLogBase2());
+ }
+ // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
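+ // e.g. for v4i32, C = <1,2,4,8> gives C' = <31,30,29,28>.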
+ SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
+ SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
+ SDValue NewCond =
+ DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
+ return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
+ }
+ }
+
+ return SDValue();
+}
+
+/// Combine:
+/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
+/// to:
+/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
+/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
+/// Note that this is only legal for some op/cc combinations.
+static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // This combine only operates on CMP-like nodes.
+ if (!(Cmp.getOpcode() == X86ISD::CMP ||
+ (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
+ return SDValue();
+
+ // Can't replace the cmp if it has more uses than the one we're looking at.
+ // FIXME: We would like to be able to handle this, but would need to make sure
+ // all uses were updated.
+ if (!Cmp.hasOneUse())
+ return SDValue();
+
+ // This only applies to variations of the common case:
+ // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
+ // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
+ // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
+ // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
+ // With the proper condition codes (see below), overflow is accounted for.
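+ // e.g. C code such as 'if (atomic_fetch_add(&x, 1) < 0)' produces the
+ // (icmp slt x, 0) form above and ends up testing COND_LE on the LOCK ADD's
+ // own EFLAGS instead of emitting a separate CMP.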
+
+ // FIXME: We can generalize both constraints:
+ // - XOR/OR/AND (if they were made to survive AtomicExpand)
+ // - LHS != 1
+ // if the result is compared.
+
+ SDValue CmpLHS = Cmp.getOperand(0);
+ SDValue CmpRHS = Cmp.getOperand(1);
+
+ if (!CmpLHS.hasOneUse())
+ return SDValue();
+
+ unsigned Opc = CmpLHS.getOpcode();
+ if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
+ return SDValue();
+
+ SDValue OpRHS = CmpLHS.getOperand(2);
+ auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
+ if (!OpRHSC)
+ return SDValue();
+
+ APInt Addend = OpRHSC->getAPIntValue();
+ if (Opc == ISD::ATOMIC_LOAD_SUB)
+ Addend = -Addend;
+
+ auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
+ if (!CmpRHSC)
+ return SDValue();
+
+ APInt Comparison = CmpRHSC->getAPIntValue();
+
+ // If the addend is the negation of the comparison value, then we can do
+ // a full comparison by emitting the atomic arithmetic as a locked sub.
+ if (Comparison == -Addend) {
+ // The CC is fine, but we need to rewrite the LHS of the comparison as an
+ // atomic sub.
+ auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
+ auto AtomicSub = DAG.getAtomic(
+ ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
+ /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
+ /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
+ AN->getMemOperand());
+ auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
+ DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
+ DAG.getUNDEF(CmpLHS.getValueType()));
+ DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
+ return LockOp;
+ }
+
+ // We can handle comparisons with zero in a number of cases by manipulating
+ // the CC used.
+ if (!Comparison.isNullValue())
+ return SDValue();
+
+ if (CC == X86::COND_S && Addend == 1)
+ CC = X86::COND_LE;
+ else if (CC == X86::COND_NS && Addend == 1)
+ CC = X86::COND_G;
+ else if (CC == X86::COND_G && Addend == -1)
+ CC = X86::COND_GE;
+ else if (CC == X86::COND_LE && Addend == -1)
+ CC = X86::COND_L;
+ else
+ return SDValue();
+
+ SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
+ DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
+ DAG.getUNDEF(CmpLHS.getValueType()));
+ DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
+ return LockOp;
+}
+
+// Check whether a boolean test is testing a boolean value generated by
+// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
+// code.
+//
+// Simplify the following patterns:
+// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
+// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
+// to (Op EFLAGS Cond)
+//
+// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
+// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
+// to (Op EFLAGS !Cond)
+//
+// where Op could be BRCOND or CMOV.
+//
+static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
+ // This combine only operates on CMP-like nodes.
+ if (!(Cmp.getOpcode() == X86ISD::CMP ||
+ (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
+ return SDValue();
+
+ // Quit if not used as a boolean value.
+ if (CC != X86::COND_E && CC != X86::COND_NE)
+ return SDValue();
+
+ // Check CMP operands. One of them should be 0 or 1 and the other should be
+ // an SetCC or extended from it.
+ SDValue Op1 = Cmp.getOperand(0);
+ SDValue Op2 = Cmp.getOperand(1);
+
+ SDValue SetCC;
+ const ConstantSDNode* C = nullptr;
+ bool needOppositeCond = (CC == X86::COND_E);
+ bool checkAgainstTrue = false; // Is it a comparison against 1?
+
+ if ((C = dyn_cast<ConstantSDNode>(Op1)))
+ SetCC = Op2;
+ else if ((C = dyn_cast<ConstantSDNode>(Op2)))
+ SetCC = Op1;
+ else // Quit if neither operand is a constant.
+ return SDValue();
+
+ if (C->getZExtValue() == 1) {
+ needOppositeCond = !needOppositeCond;
+ checkAgainstTrue = true;
+ } else if (C->getZExtValue() != 0)
+ // Quit if the constant is neither 0 nor 1.
+ return SDValue();
+
+ bool truncatedToBoolWithAnd = false;
+ // Skip (zext $x), (trunc $x), or (and $x, 1) node.
+ while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
+ SetCC.getOpcode() == ISD::TRUNCATE ||
+ SetCC.getOpcode() == ISD::AND) {
+ if (SetCC.getOpcode() == ISD::AND) {
+ int OpIdx = -1;
+ if (isOneConstant(SetCC.getOperand(0)))
+ OpIdx = 1;
+ if (isOneConstant(SetCC.getOperand(1)))
+ OpIdx = 0;
+ if (OpIdx < 0)
+ break;
+ SetCC = SetCC.getOperand(OpIdx);
+ truncatedToBoolWithAnd = true;
+ } else
+ SetCC = SetCC.getOperand(0);
+ }
+
+ switch (SetCC.getOpcode()) {
+ case X86ISD::SETCC_CARRY:
+ // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
+ // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
+ // i.e. it's a comparison against true but the result of SETCC_CARRY is not
+ // truncated to i1 using 'and'.
+ if (checkAgainstTrue && !truncatedToBoolWithAnd)
+ break;
+ assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
+ "Invalid use of SETCC_CARRY!");
+ LLVM_FALLTHROUGH;
+ case X86ISD::SETCC:
+ // Set the condition code or opposite one if necessary.
+ CC = X86::CondCode(SetCC.getConstantOperandVal(0));
+ if (needOppositeCond)
+ CC = X86::GetOppositeBranchCondition(CC);
+ return SetCC.getOperand(1);
+ case X86ISD::CMOV: {
+ // Check whether the false/true values are canonical, i.e. 0 or 1.
+ ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
+ ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
+ // Quit if true value is not a constant.
+ if (!TVal)
+ return SDValue();
+ // Quit if false value is not a constant.
+ if (!FVal) {
+ SDValue Op = SetCC.getOperand(0);
+ // Skip 'zext' or 'trunc' node.
+ if (Op.getOpcode() == ISD::ZERO_EXTEND ||
+ Op.getOpcode() == ISD::TRUNCATE)
+ Op = Op.getOperand(0);
+ // A special case for rdrand/rdseed, where 0 is set if the false condition
+ // is found.
+ if ((Op.getOpcode() != X86ISD::RDRAND &&
+ Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
+ return SDValue();
+ }
+ // Quit if false value is not the constant 0 or 1.
+ bool FValIsFalse = true;
+ if (FVal && FVal->getZExtValue() != 0) {
+ if (FVal->getZExtValue() != 1)
+ return SDValue();
+ // If FVal is 1, opposite cond is needed.
+ needOppositeCond = !needOppositeCond;
+ FValIsFalse = false;
+ }
+ // Quit if TVal is not the constant opposite of FVal.
+ if (FValIsFalse && TVal->getZExtValue() != 1)
+ return SDValue();
+ if (!FValIsFalse && TVal->getZExtValue() != 0)
+ return SDValue();
+ CC = X86::CondCode(SetCC.getConstantOperandVal(2));
+ if (needOppositeCond)
+ CC = X86::GetOppositeBranchCondition(CC);
+ return SetCC.getOperand(3);
+ }
+ }
+
+ return SDValue();
+}
+
+/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
+/// Match:
+/// (X86or (X86setcc) (X86setcc))
+/// (X86cmp (and (X86setcc) (X86setcc)), 0)
+static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
+ X86::CondCode &CC1, SDValue &Flags,
+ bool &isAnd) {
+ if (Cond->getOpcode() == X86ISD::CMP) {
+ if (!isNullConstant(Cond->getOperand(1)))
+ return false;
+
+ Cond = Cond->getOperand(0);
+ }
+
+ isAnd = false;
+
+ SDValue SetCC0, SetCC1;
+ switch (Cond->getOpcode()) {
+ default: return false;
+ case ISD::AND:
+ case X86ISD::AND:
+ isAnd = true;
+ LLVM_FALLTHROUGH;
+ case ISD::OR:
+ case X86ISD::OR:
+ SetCC0 = Cond->getOperand(0);
+ SetCC1 = Cond->getOperand(1);
+ break;
+ };
+
+ // Make sure we have SETCC nodes, using the same flags value.
+ if (SetCC0.getOpcode() != X86ISD::SETCC ||
+ SetCC1.getOpcode() != X86ISD::SETCC ||
+ SetCC0->getOperand(1) != SetCC1->getOperand(1))
+ return false;
+
+ CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
+ CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
+ Flags = SetCC0->getOperand(1);
+ return true;
+}
+
+// When legalizing carry, we create carries via add X, -1.
+// If that comes from an actual carry, via setcc, we use the
+// carry directly.
+static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
+ if (EFLAGS.getOpcode() == X86ISD::ADD) {
+ if (isAllOnesConstant(EFLAGS.getOperand(1))) {
+ SDValue Carry = EFLAGS.getOperand(0);
+ while (Carry.getOpcode() == ISD::TRUNCATE ||
+ Carry.getOpcode() == ISD::ZERO_EXTEND ||
+ Carry.getOpcode() == ISD::SIGN_EXTEND ||
+ Carry.getOpcode() == ISD::ANY_EXTEND ||
+ (Carry.getOpcode() == ISD::AND &&
+ isOneConstant(Carry.getOperand(1))))
+ Carry = Carry.getOperand(0);
+ if (Carry.getOpcode() == X86ISD::SETCC ||
+ Carry.getOpcode() == X86ISD::SETCC_CARRY) {
+ // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
+ uint64_t CarryCC = Carry.getConstantOperandVal(0);
+ SDValue CarryOp1 = Carry.getOperand(1);
+ if (CarryCC == X86::COND_B)
+ return CarryOp1;
+ if (CarryCC == X86::COND_A) {
+ // Try to convert COND_A into COND_B in an attempt to facilitate
+ // materializing "setb reg".
+ //
+ // Do not flip "e > c", where "c" is a constant, because the Cmp
+ // instruction cannot take an immediate as its first operand.
+ //
+ if (CarryOp1.getOpcode() == X86ISD::SUB &&
+ CarryOp1.getNode()->hasOneUse() &&
+ CarryOp1.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
+ SDValue SubCommute =
+ DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
+ CarryOp1.getOperand(1), CarryOp1.getOperand(0));
+ return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
+ }
+ }
+ // If this is a check of the z flag of an add with 1, switch to the
+ // C flag.
+ if (CarryCC == X86::COND_E &&
+ CarryOp1.getOpcode() == X86ISD::ADD &&
+ isOneConstant(CarryOp1.getOperand(1)))
+ return CarryOp1;
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+/// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
+/// to avoid the inversion.
+static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
+ if (EFLAGS.getOpcode() != X86ISD::PTEST &&
+ EFLAGS.getOpcode() != X86ISD::TESTP)
+ return SDValue();
+
+ // PTEST/TESTP sets EFLAGS as:
+ // TESTZ: ZF = (Op0 & Op1) == 0
+ // TESTC: CF = (~Op0 & Op1) == 0
+ // TESTNZC: ZF == 0 && CF == 0
+ EVT VT = EFLAGS.getValueType();
+ SDValue Op0 = EFLAGS.getOperand(0);
+ SDValue Op1 = EFLAGS.getOperand(1);
+ EVT OpVT = Op0.getValueType();
+
+ // TEST*(~X,Y) == TEST*(X,Y)
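+ // (The CF and ZF roles swap: CF(~X,Y) = ((X & Y) == 0) = ZF(X,Y), and
+ // ZF(~X,Y) = ((~X & Y) == 0) = CF(X,Y).)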
+ if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
+ X86::CondCode InvCC;
+ switch (CC) {
+ case X86::COND_B:
+ // testc -> testz.
+ InvCC = X86::COND_E;
+ break;
+ case X86::COND_AE:
+ // !testc -> !testz.
+ InvCC = X86::COND_NE;
+ break;
+ case X86::COND_E:
+ // testz -> testc.
+ InvCC = X86::COND_B;
+ break;
+ case X86::COND_NE:
+ // !testz -> !testc.
+ InvCC = X86::COND_AE;
+ break;
+ case X86::COND_A:
+ case X86::COND_BE:
+ // testnzc -> testnzc (no change).
+ InvCC = CC;
+ break;
+ default:
+ InvCC = X86::COND_INVALID;
+ break;
+ }
+
+ if (InvCC != X86::COND_INVALID) {
+ CC = InvCC;
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
+ DAG.getBitcast(OpVT, NotOp0), Op1);
+ }
+ }
+
+ if (CC == X86::COND_E || CC == X86::COND_NE) {
+ // TESTZ(X,~Y) == TESTC(Y,X)
+ if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
+ CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
+ DAG.getBitcast(OpVT, NotOp1), Op0);
+ }
+
+ if (Op0 == Op1) {
+ SDValue BC = peekThroughBitcasts(Op0);
+ EVT BCVT = BC.getValueType();
+ assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
+ "Unexpected vector type");
+
+ // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
+ if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
+ DAG.getBitcast(OpVT, BC.getOperand(0)),
+ DAG.getBitcast(OpVT, BC.getOperand(1)));
+ }
+
+ // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
+ if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
+ CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
+ DAG.getBitcast(OpVT, BC.getOperand(0)),
+ DAG.getBitcast(OpVT, BC.getOperand(1)));
+ }
+
+ // If every element is an all-sign value, see if we can use MOVMSK to
+ // more efficiently extract the sign bits and compare that.
+ // TODO: Handle TESTC with comparison inversion.
+ // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
+ // MOVMSK combines to make sure it's never worse than PTEST?
+ unsigned EltBits = BCVT.getScalarSizeInBits();
+ if (DAG.ComputeNumSignBits(BC) == EltBits) {
+ assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
+ APInt SignMask = APInt::getSignMask(EltBits);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (SDValue Res =
+ TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
+ // For vXi16 cases we need to use pmovmskb and extract every other
+ // sign bit.
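+ // (PMOVMSKB yields two mask bits per i16 lane; the 0xAAAA... mask keeps
+ // only the odd bit of each pair, i.e. the true i16 sign bit.)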
+ SDLoc DL(EFLAGS);
+ if (EltBits == 16) {
+ MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
+ Res = DAG.getBitcast(MovmskVT, Res);
+ Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
+ Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
+ DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
+ } else {
+ Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
+ }
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
+ DAG.getConstant(0, DL, MVT::i32));
+ }
+ }
+ }
+
+ // TESTZ(-1,X) == TESTZ(X,X)
+ if (ISD::isBuildVectorAllOnes(Op0.getNode()))
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
+
+ // TESTZ(X,-1) == TESTZ(X,X)
+ if (ISD::isBuildVectorAllOnes(Op1.getNode()))
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
+ }
+
+ return SDValue();
+}
+
+// Attempt to simplify the MOVMSK input based on the comparison type.
+static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Handle eq/ne against zero (any_of).
+ // Handle eq/ne against -1 (all_of).
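+ // (For a MOVMSK result: == 0 means no element sign bit is set, while
+ // == (2^NumElts - 1) means every element sign bit is set.)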
+ if (!(CC == X86::COND_E || CC == X86::COND_NE))
+ return SDValue();
+ if (EFLAGS.getValueType() != MVT::i32)
+ return SDValue();
+ unsigned CmpOpcode = EFLAGS.getOpcode();
+ if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
+ return SDValue();
+ auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
+ if (!CmpConstant)
+ return SDValue();
+ const APInt &CmpVal = CmpConstant->getAPIntValue();
+
+ SDValue CmpOp = EFLAGS.getOperand(0);
+ unsigned CmpBits = CmpOp.getValueSizeInBits();
+ assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
+
+ // Peek through any truncate.
+ if (CmpOp.getOpcode() == ISD::TRUNCATE)
+ CmpOp = CmpOp.getOperand(0);
+
+ // Bail if we don't find a MOVMSK.
+ if (CmpOp.getOpcode() != X86ISD::MOVMSK)
+ return SDValue();
+
+ SDValue Vec = CmpOp.getOperand(0);
+ MVT VecVT = Vec.getSimpleValueType();
+ assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
+ "Unexpected MOVMSK operand");
+ unsigned NumElts = VecVT.getVectorNumElements();
+ unsigned NumEltBits = VecVT.getScalarSizeInBits();
+
+ bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isNullValue();
+ bool IsAllOf = CmpOpcode == X86ISD::SUB && NumElts <= CmpBits &&
+ CmpVal.isMask(NumElts);
+ if (!IsAnyOf && !IsAllOf)
+ return SDValue();
+
+ // See if we can peek through to a vector with a wider element type, if the
+ // signbits extend down to all the sub-elements as well.
+ // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
+ // potential SimplifyDemandedBits/Elts cases.
+ if (Vec.getOpcode() == ISD::BITCAST) {
+ SDValue BC = peekThroughBitcasts(Vec);
+ MVT BCVT = BC.getSimpleValueType();
+ unsigned BCNumElts = BCVT.getVectorNumElements();
+ unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
+ if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
+ BCNumEltBits > NumEltBits &&
+ DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
+ SDLoc DL(EFLAGS);
+ unsigned CmpMask = IsAnyOf ? 0 : ((1 << BCNumElts) - 1);
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
+ DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
+ DAG.getConstant(CmpMask, DL, MVT::i32));
+ }
+ }
+
+ // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
+ // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
+ if (IsAllOf && Subtarget.hasSSE41()) {
+ SDValue BC = peekThroughBitcasts(Vec);
+ if (BC.getOpcode() == X86ISD::PCMPEQ &&
+ ISD::isBuildVectorAllZeros(BC.getOperand(1).getNode())) {
+ MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+ SDValue V = DAG.getBitcast(TestVT, BC.getOperand(0));
+ return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
+ }
+ }
+
+ // See if we can avoid a PACKSS by calling MOVMSK on the sources.
+ // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
+ // sign bits prior to the comparison with zero unless we know that
+ // the vXi16 splats the sign bit down to the lower i8 half.
+ // TODO: Handle all_of patterns.
+ if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
+ SDValue VecOp0 = Vec.getOperand(0);
+ SDValue VecOp1 = Vec.getOperand(1);
+ bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
+ bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
+ // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
+ if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
+ SDLoc DL(EFLAGS);
+ SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
+ Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
+ Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
+ if (!SignExt0) {
+ Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
+ DAG.getConstant(0xAAAA, DL, MVT::i16));
+ }
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
+ DAG.getConstant(0, DL, MVT::i16));
+ }
+ // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
+ // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
+ if (CmpBits == 16 && Subtarget.hasInt256() &&
+ VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ VecOp0.getOperand(0) == VecOp1.getOperand(0) &&
+ VecOp0.getConstantOperandAPInt(1) == 0 &&
+ VecOp1.getConstantOperandAPInt(1) == 8 &&
+ (IsAnyOf || (SignExt0 && SignExt1))) {
+ SDLoc DL(EFLAGS);
+ SDValue Result = DAG.getBitcast(MVT::v32i8, VecOp0.getOperand(0));
+ Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
+ unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
+ if (!SignExt0 || !SignExt1) {
+ assert(IsAnyOf && "Only perform v16i16 signmasks for any_of patterns");
+ Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
+ DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
+ }
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
+ DAG.getConstant(CmpMask, DL, MVT::i32));
+ }
+ }
+
+ // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
+ SmallVector<int, 32> ShuffleMask;
+ SmallVector<SDValue, 2> ShuffleInputs;
+ if (NumElts == CmpBits &&
+ getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
+ ShuffleMask, DAG) &&
+ ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
+ ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
+ unsigned NumShuffleElts = ShuffleMask.size();
+ APInt DemandedElts = APInt::getNullValue(NumShuffleElts);
+ for (int M : ShuffleMask) {
+ assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
+ DemandedElts.setBit(M);
+ }
+ if (DemandedElts.isAllOnesValue()) {
+ SDLoc DL(EFLAGS);
+ SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
+ Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
+ Result =
+ DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
+ EFLAGS.getOperand(1));
+ }
+ }
+
+ return SDValue();
+}
+
+/// Optimize an EFLAGS definition used according to the condition code \p CC
+/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
+/// uses of chain values.
+static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (CC == X86::COND_B)
+ if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
+ return Flags;
+
+ if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
+ return R;
+
+ if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
+ return R;
+
+ if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
+ return R;
+
+ return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
+}
+
+/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
+static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDLoc DL(N);
+
+ SDValue FalseOp = N->getOperand(0);
+ SDValue TrueOp = N->getOperand(1);
+ X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
+ SDValue Cond = N->getOperand(3);
+
+ // cmov X, X, ?, ? --> X
+ if (TrueOp == FalseOp)
+ return TrueOp;
+
+ // Try to simplify the EFLAGS and condition code operands.
+ // We can't always do this as FCMOV only supports a subset of the X86
+ // condition codes.
+ if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
+ if (!(FalseOp.getValueType() == MVT::f80 ||
+ (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
+ (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
+ !Subtarget.hasCMov() || hasFPCMov(CC)) {
+ SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
+ Flags};
+ return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
+ }
+ }
+
+ // If this is a select between two integer constants, try to do some
+ // optimizations. Note that the operands are ordered the opposite of SELECT
+ // operands.
+ if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
+ if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
+ // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
+ // larger than FalseC (the false value).
+ if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
+ CC = X86::GetOppositeBranchCondition(CC);
+ std::swap(TrueC, FalseC);
+ std::swap(TrueOp, FalseOp);
+ }
+
+ // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
+ // This is efficient for any integer data type (including i8/i16) and
+ // shift amount.
+ if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
+ Cond = getSETCC(CC, Cond, DL, DAG);
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
+
+ unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+ Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(ShAmt, DL, MVT::i8));
+ return Cond;
+ }
+
+ // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
+ // for any integer data type, including i8/i16.
+ if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+ Cond = getSETCC(CC, Cond, DL, DAG);
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
+ FalseC->getValueType(0), Cond);
+ Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ return Cond;
+ }
+
+ // Optimize cases that will turn into an LEA instruction. This requires
+ // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+ if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+ APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
+ assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
+ "Implicit constant truncation");
+
+ bool isFastMultiplier = false;
+ if (Diff.ult(10)) {
+ switch (Diff.getZExtValue()) {
+ default: break;
+ case 1: // result = add base, cond
+ case 2: // result = lea base( , cond*2)
+ case 3: // result = lea base(cond, cond*2)
+ case 4: // result = lea base( , cond*4)
+ case 5: // result = lea base(cond, cond*4)
+ case 8: // result = lea base( , cond*8)
+ case 9: // result = lea base(cond, cond*8)
+ isFastMultiplier = true;
+ break;
+ }
+ }
+
+ if (isFastMultiplier) {
+ Cond = getSETCC(CC, Cond, DL, DAG);
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
+ Cond);
+ // Scale the condition by the difference.
+ if (Diff != 1)
+ Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(Diff, DL, Cond.getValueType()));
+
+ // Add the base if non-zero.
+ if (FalseC->getAPIntValue() != 0)
+ Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ return Cond;
+ }
+ }
+ }
+ }
+
+ // Handle these cases:
+ // (select (x != c), e, c) -> (select (x != c), e, x),
+ // (select (x == c), c, e) -> (select (x == c), x, e)
+ // where c is an integer constant, and the "select" is the combination
+ // of CMOV and CMP.
+ //
+ // The rationale for this change is that a conditional move from a constant
+ // needs two instructions, whereas a conditional move from a register needs
+ // only one instruction.
+ //
+ // CAVEAT: Replacing a constant with a symbolic value may obscure some
+ // instruction-combining opportunities. This optimization needs to be
+ // postponed as late as possible.
+ //
+ if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
+ // The DCI.xxxx conditions above are provided to postpone the optimization
+ // as late as possible.
+
+ ConstantSDNode *CmpAgainst = nullptr;
+ if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
+ (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
+ !isa<ConstantSDNode>(Cond.getOperand(0))) {
+
+ if (CC == X86::COND_NE &&
+ CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
+ CC = X86::GetOppositeBranchCondition(CC);
+ std::swap(TrueOp, FalseOp);
+ }
+
+ if (CC == X86::COND_E &&
+ CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
+ SDValue Ops[] = {FalseOp, Cond.getOperand(0),
+ DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
+ return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
+ }
+ }
+ }
+
+ // Fold and/or of setcc's to double CMOV:
+ // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
+ // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
+ //
+ // This combine lets us generate:
+ // cmovcc1 (jcc1 if we don't have CMOV)
+ // cmovcc2 (same)
+ // instead of:
+ // setcc1
+ // setcc2
+ // and/or
+ // cmovne (jne if we don't have CMOV)
+ // When we can't use the CMOV instruction, it might increase branch
+ // mispredicts.
+ // When we can use CMOV, or when there is no mispredict, this improves
+ // throughput and reduces register pressure.
+ //
+ if (CC == X86::COND_NE) {
+ SDValue Flags;
+ X86::CondCode CC0, CC1;
+ bool isAndSetCC;
+ if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
+ if (isAndSetCC) {
+ std::swap(FalseOp, TrueOp);
+ CC0 = X86::GetOppositeBranchCondition(CC0);
+ CC1 = X86::GetOppositeBranchCondition(CC1);
+ }
+
+ SDValue LOps[] = {FalseOp, TrueOp,
+ DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
+ SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
+ SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
+ Flags};
+ SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
+ return CMOV;
+ }
+ }
+
+ // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
+ // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
+ // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
+ // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
+ if ((CC == X86::COND_NE || CC == X86::COND_E) &&
+ Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
+ SDValue Add = TrueOp;
+ SDValue Const = FalseOp;
+ // Canonicalize the condition code for easier matching and output.
+ if (CC == X86::COND_E)
+ std::swap(Add, Const);
+
+ // We might have replaced the constant in the cmov with the LHS of the
+ // compare. If so change it to the RHS of the compare.
+ if (Const == Cond.getOperand(0))
+ Const = Cond.getOperand(1);
+
+ // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
+ if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
+ Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
+ (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
+ Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
+ Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
+ EVT VT = N->getValueType(0);
+ // This should constant fold.
+ SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
+ SDValue CMov =
+ DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
+ DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
+ return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
+ }
+ }
+
+ return SDValue();
+}
+
+/// Different mul shrinking modes.
+enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
+
+static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
+ EVT VT = N->getOperand(0).getValueType();
+ if (VT.getScalarSizeInBits() != 32)
+ return false;
+
+ assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
+ unsigned SignBits[2] = {1, 1};
+ bool IsPositive[2] = {false, false};
+ for (unsigned i = 0; i < 2; i++) {
+ SDValue Opd = N->getOperand(i);
+
+ SignBits[i] = DAG.ComputeNumSignBits(Opd);
+ IsPositive[i] = DAG.SignBitIsZero(Opd);
+ }
+
+ bool AllPositive = IsPositive[0] && IsPositive[1];
+ unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
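+ // For i32 elements, N sign bits means the value fits in (33 - N) signed
+ // bits; e.g. 25 sign bits => -128..127 (MULS8), and 24 sign bits on a
+ // known-non-negative value => 0..255 (MULU8).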
+ // When ranges are from -128 ~ 127, use MULS8 mode.
+ if (MinSignBits >= 25)
+ Mode = ShrinkMode::MULS8;
+ // When ranges are from 0 ~ 255, use MULU8 mode.
+ else if (AllPositive && MinSignBits >= 24)
+ Mode = ShrinkMode::MULU8;
+ // When ranges are from -32768 ~ 32767, use MULS16 mode.
+ else if (MinSignBits >= 17)
+ Mode = ShrinkMode::MULS16;
+ // When ranges are from 0 ~ 65535, use MULU16 mode.
+ else if (AllPositive && MinSignBits >= 16)
+ Mode = ShrinkMode::MULU16;
+ else
+ return false;
+ return true;
+}
+
+/// When the operands of a vector mul are extended from smaller-sized values,
+/// like i8 and i16, the type of the mul may be shrunk to generate more
+/// efficient code. Two typical patterns are handled:
+/// Pattern1:
+/// %2 = sext/zext <N x i8> %1 to <N x i32>
+/// %4 = sext/zext <N x i8> %3 to <N x i32>
+/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
+/// %5 = mul <N x i32> %2, %4
+///
+/// Pattern2:
+/// %2 = zext/sext <N x i16> %1 to <N x i32>
+/// %4 = zext/sext <N x i16> %3 to <N x i32>
+/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
+/// %5 = mul <N x i32> %2, %4
+///
+/// There are four mul shrinking modes:
+/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
+/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
+/// generate pmullw+sext32 for it (MULS8 mode).
+/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
+/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
+/// generate pmullw+zext32 for it (MULU8 mode).
+/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
+/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
+/// generate pmullw+pmulhw for it (MULS16 mode).
+/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
+/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
+/// generate pmullw+pmulhuw for it (MULU16 mode).
+static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Check for legality
+ // pmullw/pmulhw on XMM registers require SSE2.
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ // Check for profitability
+ // pmulld is supported since SSE41. It is better to use pmulld
+ // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
+ // the expansion.
+ bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
+ if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
+ return SDValue();
+
+ ShrinkMode Mode;
+ if (!canReduceVMulWidth(N, DAG, Mode))
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getOperand(0).getValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+ if ((NumElts % 2) != 0)
+ return SDValue();
+
+ EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
+
+ // Shrink the operands of mul.
+ SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
+ SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
+
+ // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
+ // lower part is needed.
+ SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
+ if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
+ return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
+ : ISD::SIGN_EXTEND,
+ DL, VT, MulLo);
+
+ EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
+ // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
+ // the higher part is also needed.
+ SDValue MulHi =
+ DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
+ ReducedVT, NewN0, NewN1);
+
+ // Repack the lower-part and higher-part results of the mul into a wider
+ // result.
+ // Generate shuffle functioning as punpcklwd.
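+ // E.g. for NumElts == 8 this mask is {0,8,1,9,2,10,3,11}, pairing each
+ // low-half MulLo element with its MulHi counterpart so that the bitcast to
+ // i32 lanes rebuilds lo | (hi << 16) (little endian).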
+ SmallVector<int, 16> ShuffleMask(NumElts);
+ for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
+ ShuffleMask[2 * i] = i;
+ ShuffleMask[2 * i + 1] = i + NumElts;
+ }
+ SDValue ResLo =
+ DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
+ ResLo = DAG.getBitcast(ResVT, ResLo);
+ // Generate shuffle functioning as punpckhwd.
+ for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
+ ShuffleMask[2 * i] = i + NumElts / 2;
+ ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
+ }
+ SDValue ResHi =
+ DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
+ ResHi = DAG.getBitcast(ResVT, ResHi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
+}
+
+static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
+ EVT VT, const SDLoc &DL) {
+
+ auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
+ SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+ DAG.getConstant(Mult, DL, VT));
+ Result = DAG.getNode(ISD::SHL, DL, VT, Result,
+ DAG.getConstant(Shift, DL, MVT::i8));
+ Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
+ N->getOperand(0));
+ return Result;
+ };
+
+ auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
+ SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+ DAG.getConstant(Mul1, DL, VT));
+ Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
+ DAG.getConstant(Mul2, DL, VT));
+ Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
+ N->getOperand(0));
+ return Result;
+ };
+
+ switch (MulAmt) {
+ default:
+ break;
+ case 11:
+ // mul x, 11 => add ((shl (mul x, 5), 1), x)
+ return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
+ case 21:
+ // mul x, 21 => add ((shl (mul x, 5), 2), x)
+ return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
+ case 41:
+ // mul x, 41 => add ((shl (mul x, 5), 3), x)
+ return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
+ case 22:
+ // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
+ return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
+ combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
+ case 19:
+ // mul x, 19 => add ((shl (mul x, 9), 1), x)
+ return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
+ case 37:
+ // mul x, 37 => add ((shl (mul x, 9), 2), x)
+ return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
+ case 73:
+ // mul x, 73 => add ((shl (mul x, 9), 3), x)
+ return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
+ case 13:
+ // mul x, 13 => add ((shl (mul x, 3), 2), x)
+ return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
+ case 23:
+ // mul x, 23 => sub ((shl (mul x, 3), 3), x)
+ return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
+ case 26:
+ // mul x, 26 => add ((mul (mul x, 5), 5), x)
+ return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
+ case 28:
+ // mul x, 28 => add ((mul (mul x, 9), 3), x)
+ return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
+ case 29:
+ // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
+ return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
+ combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
+ }
+
+ // Another trick. If this is a power of 2 + 2/4/8, we can use a shift
+ // followed by a single LEA.
+ // First check if this is a sum of two powers of 2 because that's easy.
+ // Then count how many trailing zeros there are up to the first set bit.
+ // TODO: We can do this even without LEA at a cost of two shifts and an add.
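+ // E.g. MulAmt == 20 (16 + 4): ScaleShift == 2 and ShiftAmt == 4, so we
+ // emit (x << 4) + (x << 2), where the second term can fold into the LEA
+ // scaled-index operand.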
+ if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
+ unsigned ScaleShift = countTrailingZeros(MulAmt);
+ if (ScaleShift >= 1 && ScaleShift < 4) {
+ unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
+ SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(ShiftAmt, DL, MVT::i8));
+ SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(ScaleShift, DL, MVT::i8));
+ return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
+ }
+ }
+
+ return SDValue();
+}
+
+// If the upper 17 bits of each element are zero then we can use PMADDWD,
+// which is always at least as quick as PMULLD, except on KNL.
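+// (With the top 17 bits clear, each i32 lane splits into a zero high i16 and
+// a non-negative low i16, so PMADDWD's per-lane lo*lo + hi*hi equals the
+// exact 32-bit product.)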
+static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ if (Subtarget.isPMADDWDSlow())
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ // Only support vXi32 vectors.
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
+ return SDValue();
+
+ // Make sure the type is legal or will be widened to a legal type.
+ if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
+
+ // Without BWI, we would need to split v32i16.
+ if (WVT == MVT::v32i16 && !Subtarget.hasBWI())
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // If we are zero extending two steps without SSE4.1, it's better to reduce
+ // the vmul width instead.
+ if (!Subtarget.hasSSE41() &&
+ (N0.getOpcode() == ISD::ZERO_EXTEND &&
+ N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
+ (N1.getOpcode() == ISD::ZERO_EXTEND &&
+ N1.getOperand(0).getScalarValueSizeInBits() <= 8))
+ return SDValue();
+
+ APInt Mask17 = APInt::getHighBitsSet(32, 17);
+ if (!DAG.MaskedValueIsZero(N1, Mask17) ||
+ !DAG.MaskedValueIsZero(N0, Mask17))
+ return SDValue();
+
+ // Use SplitOpsAndApply to handle AVX splitting.
+ auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
+ return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
+ { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
+ PMADDWDBuilder);
+}
+
+static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ // Only support vXi64 vectors.
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
+ VT.getVectorNumElements() < 2 ||
+ !isPowerOf2_32(VT.getVectorNumElements()))
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // PMULDQ returns the 64-bit result of the signed multiplication of the lower
+ // 32 bits. We can lower with this if the sign bits stretch that far.
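+ // (More than 32 sign bits on an i64 operand means it is already the
+ // sign-extension of its low 32 bits, which is exactly what PMULDQ consumes.)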
+ if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
+ DAG.ComputeNumSignBits(N1) > 32) {
+ auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
+ PMULDQBuilder, /*CheckBWI*/false);
+ }
+
+ // If the upper bits are zero we can use a single pmuludq.
+ APInt Mask = APInt::getHighBitsSet(64, 32);
+ if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
+ auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
+ PMULUDQBuilder, /*CheckBWI*/false);
+ }
+
+ return SDValue();
+}
+
+/// Optimize a single multiply with constant into two operations in order to
+/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
+static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+
+ if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
+ return V;
+
+ if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
+ return V;
+
+ if (DCI.isBeforeLegalize() && VT.isVector())
+ return reduceVMULWidth(N, DAG, Subtarget);
+
+ if (!MulConstantOptimization)
+ return SDValue();
+ // An imul is usually smaller than the alternative sequence.
+ if (DAG.getMachineFunction().getFunction().hasMinSize())
+ return SDValue();
+
+ if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+ return SDValue();
+
+ if (VT != MVT::i64 && VT != MVT::i32)
+ return SDValue();
+
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!C)
+ return SDValue();
+ if (isPowerOf2_64(C->getZExtValue()))
+ return SDValue();
+
+ int64_t SignMulAmt = C->getSExtValue();
+ assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
+ uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
+
+ SDLoc DL(N);
+ if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
+ SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+ DAG.getConstant(AbsMulAmt, DL, VT));
+ if (SignMulAmt < 0)
+ NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
+ NewMul);
+
+ return NewMul;
+ }
+
+ uint64_t MulAmt1 = 0;
+ uint64_t MulAmt2 = 0;
+ if ((AbsMulAmt % 9) == 0) {
+ MulAmt1 = 9;
+ MulAmt2 = AbsMulAmt / 9;
+ } else if ((AbsMulAmt % 5) == 0) {
+ MulAmt1 = 5;
+ MulAmt2 = AbsMulAmt / 5;
+ } else if ((AbsMulAmt % 3) == 0) {
+ MulAmt1 = 3;
+ MulAmt2 = AbsMulAmt / 3;
+ }
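+ // E.g. AbsMulAmt == 45 gives MulAmt1 == 9 and MulAmt2 == 5, so a positive
+ // multiply by 45 can be emitted as two LEA-friendly multiplies (x*9, then *5).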
+
+ SDValue NewMul;
+ // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
+ if (MulAmt2 &&
+ (isPowerOf2_64(MulAmt2) ||
+ (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
+
+ if (isPowerOf2_64(MulAmt2) &&
+ !(SignMulAmt >= 0 && N->hasOneUse() &&
+ N->use_begin()->getOpcode() == ISD::ADD))
+ // If the second multiplier is pow2, issue it first. We want the multiply by
+ // 3, 5, or 9 to be folded into the addressing mode unless the lone use
+ // is an add. Only do this for positive multiply amounts since the
+ // negate would prevent it from being used as an address mode anyway.
+ std::swap(MulAmt1, MulAmt2);
+
+ if (isPowerOf2_64(MulAmt1))
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
+ else
+ NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+ DAG.getConstant(MulAmt1, DL, VT));
+
+ if (isPowerOf2_64(MulAmt2))
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
+ DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
+ else
+ NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
+ DAG.getConstant(MulAmt2, DL, VT));
+
+ // Negate the result.
+ if (SignMulAmt < 0)
+ NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
+ NewMul);
+ } else if (!Subtarget.slowLEA())
+ NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
+
+ if (!NewMul) {
+ assert(C->getZExtValue() != 0 &&
+ C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
+ "Both cases that could cause potential overflows should have "
+ "already been handled.");
+ if (isPowerOf2_64(AbsMulAmt - 1)) {
+ // (mul x, 2^N + 1) => (add (shl x, N), x)
+ NewMul = DAG.getNode(
+ ISD::ADD, DL, VT, N->getOperand(0),
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
+ MVT::i8)));
+ // To negate, subtract the number from zero
+ if (SignMulAmt < 0)
+ NewMul = DAG.getNode(ISD::SUB, DL, VT,
+ DAG.getConstant(0, DL, VT), NewMul);
+ } else if (isPowerOf2_64(AbsMulAmt + 1)) {
+ // (mul x, 2^N - 1) => (sub (shl x, N), x)
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(AbsMulAmt + 1),
+ DL, MVT::i8));
+ // To negate, reverse the operands of the subtract.
+ if (SignMulAmt < 0)
+ NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
+ else
+ NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
+ } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
+ // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(AbsMulAmt - 2),
+ DL, MVT::i8));
+ NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
+ NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
+ } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
+ // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(AbsMulAmt + 2),
+ DL, MVT::i8));
+ NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
+ NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
+ }
+ }
+
+ return NewMul;
+}
+
+// Try to form a MULHU or MULHS node by looking for
+// (srl (mul ext, ext), 16)
+// TODO: This is X86 specific because we want to be able to handle wide types
+// before type legalization. But we can only do it if the vector will be
+// legalized via widening/splitting. Type legalization can't handle promotion
+// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
+// combiner.
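+// E.g. (srl (mul (zext vXi16 a to vXi32), (zext vXi16 b to vXi32)), 16)
+// becomes (zext (mulhu a, b) to vXi32).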
+static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
+ "SRL or SRA node is required here!");
+ SDLoc DL(N);
+
+ // Only do this with SSE4.1. On earlier targets reduceVMULWidth will expand
+ // the multiply.
+ if (!Subtarget.hasSSE41())
+ return SDValue();
+
+ // The operation feeding into the shift must be a multiply.
+ SDValue ShiftOperand = N->getOperand(0);
+ if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
+ return SDValue();
+
+ // Input type should be at least vXi32.
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
+ return SDValue();
+
+ // Need a shift by 16.
+ APInt ShiftAmt;
+ if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
+ ShiftAmt != 16)
+ return SDValue();
+
+ SDValue LHS = ShiftOperand.getOperand(0);
+ SDValue RHS = ShiftOperand.getOperand(1);
+
+ unsigned ExtOpc = LHS.getOpcode();
+ if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
+ RHS.getOpcode() != ExtOpc)
+ return SDValue();
+
+ // Peek through the extends.
+ LHS = LHS.getOperand(0);
+ RHS = RHS.getOperand(0);
+
+ // Ensure the input types match.
+ EVT MulVT = LHS.getValueType();
+ if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
+ return SDValue();
+
+ unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
+ SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
+
+ ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ return DAG.getNode(ExtOpc, DL, VT, Mulh);
+}
+
+static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ EVT VT = N0.getValueType();
+
+ // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
+ // since the result of setcc_c is all zeros or all ones.
+ if (VT.isInteger() && !VT.isVector() &&
+ N1C && N0.getOpcode() == ISD::AND &&
+ N0.getOperand(1).getOpcode() == ISD::Constant) {
+ SDValue N00 = N0.getOperand(0);
+ APInt Mask = N0.getConstantOperandAPInt(1);
+ Mask <<= N1C->getAPIntValue();
+ bool MaskOK = false;
+ // We can handle cases concerning bit-widening nodes containing setcc_c if
+ // we carefully interrogate the mask to make sure the transform is
+ // semantics-preserving.
+ // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
+ // of the underlying setcc_c operation if the setcc_c was zero extended.
+ // Consider the following example:
+ // zext(setcc_c) -> i32 0x0000FFFF
+ // c1 -> i32 0x0000FFFF
+ // c2 -> i32 0x00000001
+ // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
+ // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
+ if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
+ MaskOK = true;
+ } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
+ N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
+ MaskOK = true;
+ } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
+ N00.getOpcode() == ISD::ANY_EXTEND) &&
+ N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
+ MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
+ }
+ if (MaskOK && Mask != 0) {
+ SDLoc DL(N);
+ return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
+ }
+ }
+
+ // Hardware support for vector shifts is sparse, which makes us scalarize the
+ // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
+ // shl.
+ // (shl V, 1) -> add V,V
+ if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
+ if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
+ assert(N0.getValueType().isVector() && "Invalid vector shift type");
+ // We shift all of the values by one. In many cases we do not have
+ // hardware support for this operation. This is better expressed as an ADD
+ // of two values.
+ if (N1SplatC->isOne())
+ return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
+ }
+
+ return SDValue();
+}
+
+static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ unsigned Size = VT.getSizeInBits();
+
+ if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
+ return V;
+
+ // fold (ashr (shl a, [56,48,32,24,16]), SarConst)
+ // into (shl (sext_inreg a), [56,48,32,24,16] - SarConst) or
+ // into (sra (sext_inreg a), SarConst - [56,48,32,24,16])
+ // depending on the sign of (SarConst - [56,48,32,24,16]).
+
+ // Sign extensions on X86 are MOVs (movsx). The MOVs have the same code size
+ // as the SHIFTs above (only a SHIFT by 1 has lower code size).
+ // However the MOVs have 2 advantages over a SHIFT:
+ // 1. MOVs can write to a register that differs from the source.
+ // 2. MOVs accept memory operands.
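+ // E.g. for i32: (sra (shl x, 24), 25) --> (sra (sext_inreg x, i8), 1),
+ // i.e. typically a movsx of the low byte plus a one-bit arithmetic shift.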
+
+ if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
+ N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
+ N0.getOperand(1).getOpcode() != ISD::Constant)
+ return SDValue();
+
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
+ APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
+ EVT CVT = N1.getValueType();
+
+ if (SarConst.isNegative())
+ return SDValue();
+
+ for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
+ unsigned ShiftSize = SVT.getSizeInBits();
+ // Skip types without a corresponding sext/zext and ShlConst values that
+ // are not one of [56,48,32,24,16].
+ if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
+ continue;
+ SDLoc DL(N);
+ SDValue NN =
+ DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
+ SarConst = SarConst - (Size - ShiftSize);
+ if (SarConst == 0)
+ return NN;
+ else if (SarConst.isNegative())
+ return DAG.getNode(ISD::SHL, DL, VT, NN,
+ DAG.getConstant(-SarConst, DL, CVT));
+ else
+ return DAG.getNode(ISD::SRA, DL, VT, NN,
+ DAG.getConstant(SarConst, DL, CVT));
+ }
+ return SDValue();
+}
+
+static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+
+ if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
+ return V;
+
+ // Only do this on the last DAG combine as it can interfere with other
+ // combines.
+ if (!DCI.isAfterLegalizeDAG())
+ return SDValue();
+
+ // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
+ // TODO: This is a generic DAG combine that became an x86-only combine to
+ // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
+ // and-not ('andn').
+ if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
+ return SDValue();
+
+ auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
+ auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (!ShiftC || !AndC)
+ return SDValue();
+
+ // If we can shrink the constant mask below 8 bits or 32 bits, then this
+ // transform should reduce code size. It may also enable secondary transforms
+ // from improved known-bits analysis or instruction selection.
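+ // E.g. (srl (and x, 0x7F00), 8) --> (and (srl x, 8), 0x7F), letting the
+ // AND use a sign-extended imm8 encoding.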
+ APInt MaskVal = AndC->getAPIntValue();
+
+ // If this can be matched by a zero extend, don't optimize.
+ if (MaskVal.isMask()) {
+ unsigned TO = MaskVal.countTrailingOnes();
+ if (TO >= 8 && isPowerOf2_32(TO))
+ return SDValue();
+ }
+
+ APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
+ unsigned OldMaskSize = MaskVal.getMinSignedBits();
+ unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
+ if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
+ (OldMaskSize > 32 && NewMaskSize <= 32)) {
+ // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
+ SDLoc DL(N);
+ SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
+ SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
+ return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
+ }
+ return SDValue();
+}
+
+static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ unsigned Opcode = N->getOpcode();
+ assert((X86ISD::HADD == Opcode || X86ISD::FHADD == Opcode ||
+ X86ISD::HSUB == Opcode || X86ISD::FHSUB == Opcode ||
+ X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
+ "Unexpected hadd/hsub/pack opcode");
+
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT SrcVT = N0.getValueType();
+
+ // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
+ // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))); this is mainly useful for
+ // truncation trees that help us avoid lane-crossing shuffles.
+ // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
+ // TODO: We don't handle vXf64 shuffles yet.
+ if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N0.getConstantOperandAPInt(1) == 0 &&
+ N1.getConstantOperandAPInt(1) == SrcVT.getVectorNumElements() &&
+ N0.getOperand(0) == N1.getOperand(0) && VT.is128BitVector() &&
+ N0.getOperand(0).getValueType().is256BitVector() &&
+ SrcVT.getScalarSizeInBits() <= 32) {
+ // TODO - support target/faux shuffles.
+ SDValue Vec = peekThroughBitcasts(N0.getOperand(0));
+ if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Vec)) {
+ // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
+ // shuffle to a vXi64 width - we can probably relax this in the future.
+ SmallVector<int, 4> ShuffleMask;
+ if (SVN->getOperand(1).isUndef() &&
+ scaleShuffleElements(SVN->getMask(), 4, ShuffleMask)) {
+ SDLoc DL(N);
+ SDValue Lo, Hi;
+ MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
+ std::tie(Lo, Hi) = DAG.SplitVector(SVN->getOperand(0), DL);
+ Lo = DAG.getBitcast(N0.getValueType(), Lo);
+ Hi = DAG.getBitcast(N1.getValueType(), Hi);
+ SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
+ Res = DAG.getBitcast(ShufVT, Res);
+ Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
+ return DAG.getBitcast(VT, Res);
+ }
+ }
+ }
+
+ // Attempt to fold HOP(SHUFFLE(X),SHUFFLE(Y)) -> SHUFFLE(HOP(X,Y)).
+ // TODO: Merge with binary shuffle folds below.
+ if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
+ int PostShuffle[4] = {0, 1, 2, 3};
+
+ // If the op is a unary shuffle that can scale to v2x64,
+ // then we can perform this as a v4x32 post shuffle.
+ auto AdjustOp = [&](SDValue V, int Offset) {
+ auto *SVN = dyn_cast<ShuffleVectorSDNode>(V);
+ SmallVector<int, 2> ScaledMask;
+ if (!SVN || !SVN->getOperand(1).isUndef() ||
+ !scaleShuffleElements(SVN->getMask(), 2, ScaledMask) ||
+ !N->isOnlyUserOf(V.getNode()))
+ return SDValue();
+ PostShuffle[Offset + 0] = ScaledMask[0] < 0 ? -1 : Offset + ScaledMask[0];
+ PostShuffle[Offset + 1] = ScaledMask[1] < 0 ? -1 : Offset + ScaledMask[1];
+ return SVN->getOperand(0);
+ };
+
+ SDValue Src0 = AdjustOp(N0, 0);
+ SDValue Src1 = AdjustOp(N1, 2);
+ if (Src0 || Src1) {
+ Src0 = Src0 ? Src0 : N0;
+ Src1 = Src1 ? Src1 : N1;
+ SDLoc DL(N);
+ MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
+ SDValue Res = DAG.getNode(Opcode, DL, VT, Src0, Src1);
+ Res = DAG.getBitcast(ShufVT, Res);
+ Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
+ return DAG.getBitcast(VT, Res);
+ }
+ }
+
+ // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
+ // TODO: Relax shuffle scaling to support sub-128-bit subvector shuffles.
+ if (VT.is256BitVector() && Subtarget.hasInt256()) {
+ SmallVector<int> Mask0, Mask1;
+ SmallVector<SDValue> Ops0, Ops1;
+ if (getTargetShuffleInputs(N0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
+ getTargetShuffleInputs(N1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
+ !Ops0.empty() && !Ops1.empty()) {
+ SDValue Op00 = Ops0.front(), Op01 = Ops0.back();
+ SDValue Op10 = Ops1.front(), Op11 = Ops1.back();
+ SmallVector<int, 2> ShuffleMask0, ShuffleMask1;
+ if (Op00.getValueType() == SrcVT && Op01.getValueType() == SrcVT &&
+ Op10.getValueType() == SrcVT && Op11.getValueType() == SrcVT &&
+ scaleShuffleElements(Mask0, 2, ShuffleMask0) &&
+ scaleShuffleElements(Mask1, 2, ShuffleMask1)) {
+ if ((Op00 == Op11) && (Op01 == Op10)) {
+ std::swap(Op10, Op11);
+ ShuffleVectorSDNode::commuteMask(ShuffleMask1);
+ }
+ if ((Op00 == Op10) && (Op01 == Op11)) {
+ SmallVector<int, 4> ShuffleMask;
+ ShuffleMask.append(ShuffleMask0.begin(), ShuffleMask0.end());
+ ShuffleMask.append(ShuffleMask1.begin(), ShuffleMask1.end());
+ SDLoc DL(N);
+ MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
+ SDValue Res = DAG.getNode(Opcode, DL, VT, Op00, Op01);
+ Res = DAG.getBitcast(ShufVT, Res);
+ Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
+ return DAG.getBitcast(VT, Res);
+ }
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ unsigned Opcode = N->getOpcode();
+ assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
+ "Unexpected pack opcode");
+
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ unsigned NumDstElts = VT.getVectorNumElements();
+ unsigned DstBitsPerElt = VT.getScalarSizeInBits();
+ unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
+ assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
+ N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
+ "Unexpected PACKSS/PACKUS input type");
+
+ bool IsSigned = (X86ISD::PACKSS == Opcode);
+
+ // Constant Folding.
+ APInt UndefElts0, UndefElts1;
+ SmallVector<APInt, 32> EltBits0, EltBits1;
+ if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
+ (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
+ getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
+ getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumSrcElts = NumDstElts / 2;
+ unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
+ unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
+
+ APInt Undefs(NumDstElts, 0);
+ SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
+ unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
+ auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
+ auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
+
+ if (UndefElts[SrcIdx]) {
+ Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
+ continue;
+ }
+
+ APInt &Val = EltBits[SrcIdx];
+ if (IsSigned) {
+ // PACKSS: Truncate signed value with signed saturation.
+ // Source values less than dst minint are saturated to minint.
+ // Source values greater than dst maxint are saturated to maxint.
+ if (Val.isSignedIntN(DstBitsPerElt))
+ Val = Val.trunc(DstBitsPerElt);
+ else if (Val.isNegative())
+ Val = APInt::getSignedMinValue(DstBitsPerElt);
+ else
+ Val = APInt::getSignedMaxValue(DstBitsPerElt);
+ } else {
+ // PACKUS: Truncate signed value with unsigned saturation.
+ // Source values less than zero are saturated to zero.
+ // Source values greater than dst maxuint are saturated to maxuint.
+ if (Val.isIntN(DstBitsPerElt))
+ Val = Val.trunc(DstBitsPerElt);
+ else if (Val.isNegative())
+ Val = APInt::getNullValue(DstBitsPerElt);
+ else
+ Val = APInt::getAllOnesValue(DstBitsPerElt);
+ }
+ Bits[Lane * NumDstEltsPerLane + Elt] = Val;
+ }
+ }
+
+ return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
+ }
+
+ // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
+ if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
+ return V;
+
+ // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
+ // truncate to create a larger truncate.
+ if (Subtarget.hasAVX512() &&
+ N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
+ N0.getOperand(0).getValueType() == MVT::v8i32) {
+ if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
+ (!IsSigned &&
+ DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
+ if (Subtarget.hasVLX())
+ return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
+
+ // Widen input to v16i32 so we can truncate that.
+ SDLoc dl(N);
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
+ N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
+ }
+ }
+
+ // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
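+ // e.g. for v16i8: PACKUS(ZERO_EXTEND(v8i8 X), ZERO_EXTEND(v8i8 Y)) is just
+ // CONCAT(X, Y), since the extended values already fit in the destination.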
+ if (VT.is128BitVector()) {
+ unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ SDValue Src0, Src1;
+ if (N0.getOpcode() == ExtOpc &&
+ N0.getOperand(0).getValueType().is64BitVector() &&
+ N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
+ Src0 = N0.getOperand(0);
+ }
+ if (N1.getOpcode() == ExtOpc &&
+ N1.getOperand(0).getValueType().is64BitVector() &&
+ N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
+ Src1 = N1.getOperand(0);
+ }
+ if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
+ assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
+ Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
+ Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
+ }
+ }
+
+ // Attempt to combine as shuffle.
+ SDValue Op(N, 0);
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
+ return Res;
+
+ return SDValue();
+}
+
+static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
+ X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
+ "Unexpected horizontal add/sub opcode");
+
+ // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
+ if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
+ return V;
+
+ return SDValue();
+}
+
+static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
+ X86ISD::VSRL == N->getOpcode()) &&
+ "Unexpected shift opcode");
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Shift zero -> zero.
+ if (ISD::isBuildVectorAllZeros(N0.getNode()))
+ return DAG.getConstant(0, SDLoc(N), VT);
+
+ // Detect constant shift amounts.
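+ // These opcodes read the shift amount from the low 64 bits of N1, so only
+ // the first extracted element is needed.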
+ APInt UndefElts;
+ SmallVector<APInt, 32> EltBits;
+ if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
+ unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
+ return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
+ EltBits[0].getZExtValue(), DAG);
+ }
+
+ APInt KnownUndef, KnownZero;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+ if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
+ KnownZero, DCI))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ unsigned Opcode = N->getOpcode();
+ assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
+ X86ISD::VSRLI == Opcode) &&
+ "Unexpected shift opcode");
+ bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+ assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
+ "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == MVT::i8 &&
+ "Unexpected shift amount type");
+
+ // Out of range logical bit shifts are guaranteed to be zero.
+ // Out of range arithmetic bit shifts splat the sign bit.
+ unsigned ShiftVal = N->getConstantOperandVal(1);
+ if (ShiftVal >= NumBitsPerElt) {
+ if (LogicalShift)
+ return DAG.getConstant(0, SDLoc(N), VT);
+ ShiftVal = NumBitsPerElt - 1;
+ }
+
+ // (shift X, 0) -> X
+ if (!ShiftVal)
+ return N0;
+
+ // (shift 0, C) -> 0
+ if (ISD::isBuildVectorAllZeros(N0.getNode()))
+ // N0 is all zeros or undef. We guarantee that the bits shifted into the
+ // result are all zeros, not undef.
+ return DAG.getConstant(0, SDLoc(N), VT);
+
+ // (VSRAI -1, C) -> -1
+ if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
+ // N0 is all ones or undef. We guarantee that the bits shifted into the
+ // result are all ones, not undef.
+ return DAG.getConstant(-1, SDLoc(N), VT);
+
+ // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
+ if (Opcode == N0.getOpcode()) {
+ unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
+ unsigned NewShiftVal = ShiftVal + ShiftVal2;
+ if (NewShiftVal >= NumBitsPerElt) {
+ // Out of range logical bit shifts are guaranteed to be zero.
+ // Out of range arithmetic bit shifts splat the sign bit.
+ if (LogicalShift)
+ return DAG.getConstant(0, SDLoc(N), VT);
+ NewShiftVal = NumBitsPerElt - 1;
+ }
+ return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
+ DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
+ }
+
+ // We can decode 'whole byte' logical bit shifts as shuffles.
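+ // e.g. a v4i32 VSRLI by 16 moves the upper two bytes of each element into
+ // the lower two byte positions and zeroes the rest, which a byte shuffle
+ // with zero sentinels can express.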
+ if (LogicalShift && (ShiftVal % 8) == 0) {
+ SDValue Op(N, 0);
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
+ return Res;
+ }
+
+ // Constant Folding.
+ APInt UndefElts;
+ SmallVector<APInt, 32> EltBits;
+ if (N->isOnlyUserOf(N0.getNode()) &&
+ getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
+ assert(EltBits.size() == VT.getVectorNumElements() &&
+ "Unexpected shift value type");
+ // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
+ // created an undef input due to no input bits being demanded, but the user
+ // still expects 0 in the other bits.
+ for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
+ APInt &Elt = EltBits[i];
+ if (UndefElts[i])
+ Elt = 0;
+ else if (X86ISD::VSHLI == Opcode)
+ Elt <<= ShiftVal;
+ else if (X86ISD::VSRAI == Opcode)
+ Elt.ashrInPlace(ShiftVal);
+ else
+ Elt.lshrInPlace(ShiftVal);
+ }
+ // Reset undef elements since they were zeroed above.
+ UndefElts = 0;
+ return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
+ }
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0),
+ APInt::getAllOnesValue(NumBitsPerElt), DCI))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
+ (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) ||
+ N->getOpcode() == ISD::INSERT_VECTOR_ELT) &&
+ "Unexpected vector insertion");
+
+ if (N->getOpcode() == X86ISD::PINSRB || N->getOpcode() == X86ISD::PINSRW) {
+ unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0),
+ APInt::getAllOnesValue(NumBitsPerElt), DCI))
+ return SDValue(N, 0);
+ }
+
+ // Attempt to combine insertion patterns to a shuffle.
+ if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
+ SDValue Op(N, 0);
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
+ return Res;
+ }
+
+ return SDValue();
+}
+
+/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
+/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
+/// OR -> CMPNEQSS.
+static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ unsigned opcode;
+
+ // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
+ // we're requiring SSE2 for both.
+ if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue CMP0 = N0.getOperand(1);
+ SDValue CMP1 = N1.getOperand(1);
+ SDLoc DL(N);
+
+ // The SETCCs should both refer to the same CMP.
+ if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
+ return SDValue();
+
+ SDValue CMP00 = CMP0->getOperand(0);
+ SDValue CMP01 = CMP0->getOperand(1);
+ EVT VT = CMP00.getValueType();
+
+ if (VT == MVT::f32 || VT == MVT::f64) {
+ bool ExpectingFlags = false;
+ // Check for any users that want flags:
+ for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+ !ExpectingFlags && UI != UE; ++UI)
+ switch (UI->getOpcode()) {
+ default:
+ case ISD::BR_CC:
+ case ISD::BRCOND:
+ case ISD::SELECT:
+ ExpectingFlags = true;
+ break;
+ case ISD::CopyToReg:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ break;
+ }
+
+ if (!ExpectingFlags) {
+ enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
+ enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
+
+ if (cc1 == X86::COND_E || cc1 == X86::COND_NE)
+ std::swap(cc0, cc1);
+
+ if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
+ (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
+ // FIXME: need symbolic constants for these magic numbers.
+ // See X86ATTInstPrinter.cpp:printSSECC().
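+ // In the SSE compare predicate encoding, 0 selects EQ and 4 selects NEQ,
+ // giving CMPEQSS / CMPNEQSS respectively.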
+ unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
+ if (Subtarget.hasAVX512()) {
+ SDValue FSetCC =
+ DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
+ DAG.getTargetConstant(x86cc, DL, MVT::i8));
+ // Need to fill with zeros to ensure the bitcast will produce zeroes
+ // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
+ SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
+ DAG.getConstant(0, DL, MVT::v16i1),
+ FSetCC, DAG.getIntPtrConstant(0, DL));
+ return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
+ N->getSimpleValueType(0));
+ }
+ SDValue OnesOrZeroesF =
+ DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
+ CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
+
+ bool is64BitFP = (CMP00.getValueType() == MVT::f64);
+ MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
+
+ if (is64BitFP && !Subtarget.is64Bit()) {
+ // On a 32-bit target, we cannot bitcast the 64-bit float to a
+ // 64-bit integer, since that's not a legal type. Since
+ // OnesOrZeroesF is all ones or all zeroes, we don't need all the
+ // bits, but can do this little dance to extract the lowest 32 bits
+ // and work with those going forward.
+ SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
+ OnesOrZeroesF);
+ SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
+ OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
+ Vector32, DAG.getIntPtrConstant(0, DL));
+ IntVT = MVT::i32;
+ }
+
+ SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
+ SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
+ DAG.getConstant(1, DL, IntVT));
+ SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
+ ANDed);
+ return OneBitOfTruth;
+ }
+ }
+ }
+ }
+ return SDValue();
+}
+
+/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
+static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
+ assert(N->getOpcode() == ISD::AND);
+
+ MVT VT = N->getSimpleValueType(0);
+ if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
+ return SDValue();
+
+ SDValue X, Y;
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ auto GetNot = [&VT, &DAG](SDValue V) {
+ // Basic X = NOT(Y) detection.
+ if (SDValue Not = IsNOT(V, DAG))
+ return Not;
+ // Fold BROADCAST(NOT(Y)) -> BROADCAST(Y).
+ if (V.getOpcode() == X86ISD::VBROADCAST) {
+ SDValue Src = V.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ if (!SrcVT.isVector())
+ return SDValue();
+ if (SDValue Not = IsNOT(Src, DAG))
+ return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,
+ DAG.getBitcast(SrcVT, Not));
+ }
+ return SDValue();
+ };
+
+ if (SDValue Not = GetNot(N0)) {
+ X = Not;
+ Y = N1;
+ } else if (SDValue Not = GetNot(N1)) {
+ X = Not;
+ Y = N0;
+ } else
+ return SDValue();
+
+ X = DAG.getBitcast(VT, X);
+ Y = DAG.getBitcast(VT, Y);
+ return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
+}
+
+// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
+// logical operations, like in the example below.
+// or (and (truncate x, truncate y)),
+// (xor (truncate z, build_vector (constants)))
+// Given a target type \p VT, we generate
+// or (and x, y), (xor z, zext(build_vector (constants)))
+// provided that x, y and z are of type \p VT. We can do so if each operand is
+// either a truncate from VT, a vector of constants, or can itself be
+// recursively promoted.
+static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
+ unsigned Depth) {
+ // Limit recursion to avoid excessive compile times.
+ if (Depth >= SelectionDAG::MaxRecursionDepth)
+ return SDValue();
+
+ if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
+ N->getOpcode() != ISD::OR)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDLoc DL(N);
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
+ return SDValue();
+
+ if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
+ N0 = NN0;
+ else {
+ // The left side has to be a trunc.
+ if (N0.getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+
+ // The type of the truncated inputs.
+ if (N0.getOperand(0).getValueType() != VT)
+ return SDValue();
+
+ N0 = N0.getOperand(0);
+ }
+
+ if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
+ N1 = NN1;
+ else {
+ // The right side has to be a 'trunc' or a constant vector.
+ bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
+ N1.getOperand(0).getValueType() == VT;
+ if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
+ return SDValue();
+
+ if (RHSTrunc)
+ N1 = N1.getOperand(0);
+ else
+ N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
+ }
+
+ return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
+}
+
+// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
+// register. In most cases we actually compare or select YMM-sized registers
+// and mixing the two types creates horrible code. This method optimizes
+// some of the transition sequences.
+// Even with AVX-512 this is still useful for removing casts around logical
+// operations on vXi1 mask types.
+static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ assert(VT.isVector() && "Expected vector type");
+
+ SDLoc DL(N);
+ assert((N->getOpcode() == ISD::ANY_EXTEND ||
+ N->getOpcode() == ISD::ZERO_EXTEND ||
+ N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
+
+ SDValue Narrow = N->getOperand(0);
+ EVT NarrowVT = Narrow.getValueType();
+
+ // Generate the wide operation.
+ SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
+ if (!Op)
+ return SDValue();
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode");
+ case ISD::ANY_EXTEND:
+ return Op;
+ case ISD::ZERO_EXTEND:
+ return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
+ case ISD::SIGN_EXTEND:
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
+ Op, DAG.getValueType(NarrowVT));
+ }
+}
+
+static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
+ unsigned FPOpcode;
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected input node for FP logic conversion");
+ case ISD::AND: FPOpcode = X86ISD::FAND; break;
+ case ISD::OR: FPOpcode = X86ISD::FOR; break;
+ case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
+ }
+ return FPOpcode;
+}
+
+/// If both input operands of a logic op are being cast from floating point
+/// types, try to convert this into a floating point logic node to avoid
+/// unnecessary moves from SSE to integer registers.
+static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDLoc DL(N);
+
+ if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
+ return SDValue();
+
+ SDValue N00 = N0.getOperand(0);
+ SDValue N10 = N1.getOperand(0);
+ EVT N00Type = N00.getValueType();
+ EVT N10Type = N10.getValueType();
+
+ // Ensure that both types are the same and are legal scalar fp types.
+ if (N00Type != N10Type ||
+ !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
+ (Subtarget.hasSSE2() && N00Type == MVT::f64)))
+ return SDValue();
+
+ unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
+ SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
+ return DAG.getBitcast(VT, FPLogic);
+}
+
+// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
+// to reduce XMM->GPR traffic.
+static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
+ unsigned Opc = N->getOpcode();
+ assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
+ "Unexpected bit opcode");
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Both operands must be single use MOVMSK.
+ if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
+ N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
+ return SDValue();
+
+ SDValue Vec0 = N0.getOperand(0);
+ SDValue Vec1 = N1.getOperand(0);
+ EVT VecVT0 = Vec0.getValueType();
+ EVT VecVT1 = Vec1.getValueType();
+
+ // Both MOVMSK operands must be from vectors of the same size and same element
+ // size, but it's OK if they differ between fp and int types.
+ if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
+ VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
+ return SDValue();
+
+ SDLoc DL(N);
+ unsigned VecOpc =
+ VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
+ SDValue Result =
+ DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
+ return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
+}
+
+/// If this is a zero/all-bits result that is bitwise-anded with a low bits
+/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
+/// with a shift-right to eliminate loading the vector constant mask value.
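+/// For example, a v4i32 (and (pcmpgt X, Y), splat(1)) can instead be lowered
+/// as (srl (pcmpgt X, Y), 31), avoiding a constant pool load for the mask.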
+static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
+ SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
+ EVT VT0 = Op0.getValueType();
+ EVT VT1 = Op1.getValueType();
+
+ if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
+ return SDValue();
+
+ APInt SplatVal;
+ if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
+ !SplatVal.isMask())
+ return SDValue();
+
+ // Don't prevent creation of ANDN.
+ if (isBitwiseNot(Op0))
+ return SDValue();
+
+ if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
+ return SDValue();
+
+ unsigned EltBitWidth = VT0.getScalarSizeInBits();
+ if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
+ return SDValue();
+
+ SDLoc DL(N);
+ unsigned ShiftVal = SplatVal.countTrailingOnes();
+ SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
+ SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
+ return DAG.getBitcast(N->getValueType(0), Shift);
+}
+
+// Get the index node from the lowered DAG of a GEP IR instruction with one
+// indexing dimension.
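+// The lowered address is expected to look like (add (shl index, scale), base);
+// the pre-shift index operand is returned.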
+static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
+ if (Ld->isIndexed())
+ return SDValue();
+
+ SDValue Base = Ld->getBasePtr();
+
+ if (Base.getOpcode() != ISD::ADD)
+ return SDValue();
+
+ SDValue ShiftedIndex = Base.getOperand(0);
+
+ if (ShiftedIndex.getOpcode() != ISD::SHL)
+ return SDValue();
+
+ return ShiftedIndex.getOperand(0);
+
+}
+
+static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
+ if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
+ switch (VT.getSizeInBits()) {
+ default: return false;
+ case 64: return Subtarget.is64Bit();
+ case 32: return true;
+ }
+ }
+ return false;
+}
+
+// This function recognizes cases where the X86 BZHI instruction can replace an
+// 'and-load' sequence.
+// When an integer value is loaded from an array of constants defined as
+// follows:
+//
+// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
+//
+// and the loaded value is then ANDed with another input, the sequence is
+// equivalent to performing BZHI (zero high bits) on that input, using the same
+// index as the load.
+static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT VT = Node->getSimpleValueType(0);
+ SDLoc dl(Node);
+
+ // Check if subtarget has BZHI instruction for the node's type
+ if (!hasBZHI(Subtarget, VT))
+ return SDValue();
+
+ // Try matching the pattern for both operands.
+ for (unsigned i = 0; i < 2; i++) {
+ SDValue N = Node->getOperand(i);
+ LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
+
+ // Bail out if the operand is not a load instruction.
+ if (!Ld)
+ return SDValue();
+
+ const Value *MemOp = Ld->getMemOperand()->getValue();
+
+ if (!MemOp)
+ return SDValue();
+
+ if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
+ if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
+
+ Constant *Init = GV->getInitializer();
+ Type *Ty = Init->getType();
+ if (!isa<ConstantDataArray>(Init) ||
+ !Ty->getArrayElementType()->isIntegerTy() ||
+ Ty->getArrayElementType()->getScalarSizeInBits() !=
+ VT.getSizeInBits() ||
+ Ty->getArrayNumElements() >
+ Ty->getArrayElementType()->getScalarSizeInBits())
+ continue;
+
+ // Check if the array's constant elements are suitable for our case.
+ uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
+ bool ConstantsMatch = true;
+ for (uint64_t j = 0; j < ArrayElementCount; j++) {
+ ConstantInt *Elem =
+ dyn_cast<ConstantInt>(Init->getAggregateElement(j));
+ if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
+ ConstantsMatch = false;
+ break;
+ }
+ }
+ if (!ConstantsMatch)
+ continue;
+
+ // Do the transformation (For 32-bit type):
+ // -> (and (load arr[idx]), inp)
+ // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
+ // that will be replaced with one bzhi instruction.
+ SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
+ SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
+
+ // Get the Node which indexes into the array.
+ SDValue Index = getIndexFromUnindexedLoad(Ld);
+ if (!Index)
+ return SDValue();
+ Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
+
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
+ Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
+
+ SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
+ SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
+
+ return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
+ }
+ }
+ }
+ }
+ return SDValue();
+}
+
+// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef, ...))), C)
+// where C is a mask containing the same number of bits as the setcc and
+// where the setcc will freely zero the upper bits of the k-register. We can
+// replace the undefs in the concat with zeros and remove the AND. This mainly
+// helps with v2i1/v4i1 setcc being cast to a scalar.
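+// For example, (and (bitcast (v8i1 (concat (v2i1 setcc), undef, undef,
+// undef))), 0x3) can be rebuilt as (bitcast (v8i1 (concat (v2i1 setcc), zero,
+// zero, zero))) with the AND dropped.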
+static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
+
+ EVT VT = N->getValueType(0);
+
+ // Make sure this is an AND with constant. We will check the value of the
+ // constant later.
+ if (!isa<ConstantSDNode>(N->getOperand(1)))
+ return SDValue();
+
+ // This is implied by the ConstantSDNode.
+ assert(!VT.isVector() && "Expected scalar VT!");
+
+ if (N->getOperand(0).getOpcode() != ISD::BITCAST ||
+ !N->getOperand(0).hasOneUse() ||
+ !N->getOperand(0).getOperand(0).hasOneUse())
+ return SDValue();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Src = N->getOperand(0).getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
+ !TLI.isTypeLegal(SrcVT))
+ return SDValue();
+
+ if (Src.getOpcode() != ISD::CONCAT_VECTORS)
+ return SDValue();
+
+ // We only care about the first subvector of the concat; once we make the
+ // change, the AND ensures the other subvectors are ignored.
+ SDValue SubVec = Src.getOperand(0);
+ EVT SubVecVT = SubVec.getValueType();
+
+ // First subvector should be a setcc with a legal result type. The RHS of the
+ // AND should be a mask with this many bits.
+ if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||
+ !N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))
+ return SDValue();
+
+ EVT SetccVT = SubVec.getOperand(0).getValueType();
+ if (!TLI.isTypeLegal(SetccVT) ||
+ !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
+ return SDValue();
+
+ if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
+ return SDValue();
+
+ // We passed all the checks. Rebuild the concat_vectors with zeroes
+ // and cast it back to VT.
+ SDLoc dl(N);
+ SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
+ DAG.getConstant(0, dl, SubVecVT));
+ Ops[0] = SubVec;
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
+ Ops);
+ return DAG.getBitcast(VT, Concat);
+}
+
+static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+
+ // If this is SSE1 only convert to FAND to avoid scalarization.
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
+ return DAG.getBitcast(
+ MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
+ DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
+ DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
+ }
+
+ // Use a 32-bit and+zext if upper bits known zero.
+ if (VT == MVT::i64 && Subtarget.is64Bit() &&
+ !isa<ConstantSDNode>(N->getOperand(1))) {
+ APInt HiMask = APInt::getHighBitsSet(64, 32);
+ if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
+ DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
+ SDLoc dl(N);
+ SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
+ SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
+ DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
+ }
+ }
+
+ // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
+ // TODO: Support multiple SrcOps.
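+ // e.g. an all_of reduction of a v4i1 compare roughly becomes
+ // (movmsk(cmp) & 0xf) == 0xf.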
+ if (VT == MVT::i1) {
+ SmallVector<SDValue, 2> SrcOps;
+ SmallVector<APInt, 2> SrcPartials;
+ if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
+ SrcOps.size() == 1) {
+ SDLoc dl(N);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
+ EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
+ SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
+ if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
+ Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
+ if (Mask) {
+ assert(SrcPartials[0].getBitWidth() == NumElts &&
+ "Unexpected partial reduction mask");
+ SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
+ Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
+ return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
+ }
+ }
+ }
+
+ if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
+ return V;
+
+ if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
+ return R;
+
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
+ return R;
+
+ if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+ return FPLogic;
+
+ if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
+ return R;
+
+ if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
+ return ShiftRight;
+
+ if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
+ return R;
+
+ // Attempt to recursively combine a bitmask AND with shuffles.
+ if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
+ SDValue Op(N, 0);
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
+ return Res;
+ }
+
+ // Attempt to combine a scalar bitmask AND with an extracted shuffle.
+ if ((VT.getScalarSizeInBits() % 8) == 0 &&
+ N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
+ SDValue BitMask = N->getOperand(1);
+ SDValue SrcVec = N->getOperand(0).getOperand(0);
+ EVT SrcVecVT = SrcVec.getValueType();
+
+ // Check that the constant bitmask masks whole bytes.
+ APInt UndefElts;
+ SmallVector<APInt, 64> EltBits;
+ if (VT == SrcVecVT.getScalarType() &&
+ N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
+ getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
+ llvm::all_of(EltBits, [](const APInt &M) {
+ return M.isNullValue() || M.isAllOnesValue();
+ })) {
+ unsigned NumElts = SrcVecVT.getVectorNumElements();
+ unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
+ unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
+
+ // Create a root shuffle mask from the byte mask and the extracted index.
+ SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
+ for (unsigned i = 0; i != Scale; ++i) {
+ if (UndefElts[i])
+ continue;
+ int VecIdx = Scale * Idx + i;
+ ShuffleMask[VecIdx] =
+ EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
+ }
+
+ if (SDValue Shuffle = combineX86ShufflesRecursively(
+ {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
+ X86::MaxShuffleCombineDepth,
+ /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
+ N->getOperand(0).getOperand(1));
+ }
+ }
+
+ return SDValue();
+}
+
+// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
+static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
+
+ MVT VT = N->getSimpleValueType(0);
+ if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
+ return SDValue();
+
+ SDValue N0 = peekThroughBitcasts(N->getOperand(0));
+ SDValue N1 = peekThroughBitcasts(N->getOperand(1));
+ if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
+ return SDValue();
+
+ // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
+ // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
+ bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) ||
+ Subtarget.hasVLX();
+ if (!(Subtarget.hasXOP() || UseVPTERNLOG ||
+ !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
+ return SDValue();
+
+ // Attempt to extract constant byte masks.
+ APInt UndefElts0, UndefElts1;
+ SmallVector<APInt, 32> EltBits0, EltBits1;
+ if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
+ false, false))
+ return SDValue();
+ if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
+ false, false))
+ return SDValue();
+
+ for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
+ // TODO - add UNDEF elts support.
+ if (UndefElts0[i] || UndefElts1[i])
+ return SDValue();
+ if (EltBits0[i] != ~EltBits1[i])
+ return SDValue();
+ }
+
+ SDLoc DL(N);
+
+ if (UseVPTERNLOG) {
+ // Emit a VPTERNLOG node directly.
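+ // Immediate 0xCA computes (A & B) | (~A & C): select from B where the mask A
+ // is set and from C elsewhere.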
+ SDValue A = DAG.getBitcast(VT, N0.getOperand(1));
+ SDValue B = DAG.getBitcast(VT, N0.getOperand(0));
+ SDValue C = DAG.getBitcast(VT, N1.getOperand(0));
+ SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
+ return DAG.getNode(X86ISD::VPTERNLOG, DL, VT, A, B, C, Imm);
+ }
+
+ SDValue X = N->getOperand(0);
+ SDValue Y =
+ DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
+ DAG.getBitcast(VT, N1.getOperand(0)));
+ return DAG.getNode(ISD::OR, DL, VT, X, Y);
+}
+
+// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
+static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
+ if (N->getOpcode() != ISD::OR)
+ return false;
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Canonicalize AND to LHS.
+ if (N1.getOpcode() == ISD::AND)
+ std::swap(N0, N1);
+
+ // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
+ if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
+ return false;
+
+ Mask = N1.getOperand(0);
+ X = N1.getOperand(1);
+
+ // Check to see if the mask appeared in both the AND and ANDNP.
+ if (N0.getOperand(0) == Mask)
+ Y = N0.getOperand(1);
+ else if (N0.getOperand(1) == Mask)
+ Y = N0.getOperand(0);
+ else
+ return false;
+
+ // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
+ // ANDNP combine allows other combines to happen first, which then prevent
+ // this match.
+ return true;
+}
+
+// Try to fold:
+// (or (and (m, y), (pandn m, x)))
+// into:
+// (vselect m, x, y)
+// As a special case, try to fold:
+// (or (and (m, (sub 0, x)), (pandn m, x)))
+// into:
+// (sub (xor X, M), M)
+static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
+
+ EVT VT = N->getValueType(0);
+ if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (VT.is256BitVector() && Subtarget.hasInt256())))
+ return SDValue();
+
+ SDValue X, Y, Mask;
+ if (!matchLogicBlend(N, X, Y, Mask))
+ return SDValue();
+
+ // Validate that X, Y, and Mask are bitcasts, and see through them.
+ Mask = peekThroughBitcasts(Mask);
+ X = peekThroughBitcasts(X);
+ Y = peekThroughBitcasts(Y);
+
+ EVT MaskVT = Mask.getValueType();
+ unsigned EltBits = MaskVT.getScalarSizeInBits();
+
+ // TODO: Attempt to handle floating point cases as well?
+ if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
+ return SDValue();
+
+ SDLoc DL(N);
+
+ // Attempt to combine to conditional negate: (sub (xor X, M), M)
+ if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
+ DAG, Subtarget))
+ return Res;
+
+ // PBLENDVB is only available on SSE 4.1.
+ if (!Subtarget.hasSSE41())
+ return SDValue();
+
+ // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
+ if (Subtarget.hasVLX())
+ return SDValue();
+
+ MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
+
+ X = DAG.getBitcast(BlendVT, X);
+ Y = DAG.getBitcast(BlendVT, Y);
+ Mask = DAG.getBitcast(BlendVT, Mask);
+ Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
+ return DAG.getBitcast(VT, Mask);
+}
+
+// Helper function for combineOrCmpEqZeroToCtlzSrl
+// Transforms:
+// seteq(cmp x, 0)
+// into:
+// srl(ctlz x), log2(bitsize(x))
+// Input pattern is checked by caller.
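+// For a 32-bit x, (x == 0) becomes (lzcnt x) >> 5: lzcnt yields 32 only when x
+// is zero, so bit 5 of the result is exactly the seteq value.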
+static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
+ SelectionDAG &DAG) {
+ SDValue Cmp = Op.getOperand(1);
+ EVT VT = Cmp.getOperand(0).getValueType();
+ unsigned Log2b = Log2_32(VT.getSizeInBits());
+ SDLoc dl(Op);
+ SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
+ // The result of the shift is true or false, and on X86, the 32-bit
+ // encoding of shr and lzcnt is more desirable.
+ SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
+ SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
+ DAG.getConstant(Log2b, dl, MVT::i8));
+ return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
+}
+
+// Try to transform:
+// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
+// into:
+// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
+// Will also attempt to match more generic cases, eg:
+// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
+// Only applies if the target supports the FastLZCNT feature.
+static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
+ return SDValue();
+
+ auto isORCandidate = [](SDValue N) {
+ return (N->getOpcode() == ISD::OR && N->hasOneUse());
+ };
+
+ // Check that the zero extend is extending to 32 bits or more. The code
+ // generated by srl(ctlz) for 16-bit or smaller variants of the pattern would
+ // require extra instructions to clear the upper bits.
+ if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
+ !isORCandidate(N->getOperand(0)))
+ return SDValue();
+
+ // Check the node matches: setcc(eq, cmp 0)
+ auto isSetCCCandidate = [](SDValue N) {
+ return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
+ X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
+ N->getOperand(1).getOpcode() == X86ISD::CMP &&
+ isNullConstant(N->getOperand(1).getOperand(1)) &&
+ N->getOperand(1).getValueType().bitsGE(MVT::i32);
+ };
+
+ SDNode *OR = N->getOperand(0).getNode();
+ SDValue LHS = OR->getOperand(0);
+ SDValue RHS = OR->getOperand(1);
+
+ // Save nodes matching or(or, setcc(eq, cmp 0)).
+ SmallVector<SDNode *, 2> ORNodes;
+ while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
+ (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
+ ORNodes.push_back(OR);
+ OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
+ LHS = OR->getOperand(0);
+ RHS = OR->getOperand(1);
+ }
+
+ // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
+ if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
+ !isORCandidate(SDValue(OR, 0)))
+ return SDValue();
+
+ // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
+ // to or(srl(ctlz), srl(ctlz)).
+ // The dag combiner can then fold it into:
+ // srl(or(ctlz, ctlz)).
+ EVT VT = OR->getValueType(0);
+ SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
+ SDValue Ret, NewRHS;
+ if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
+ Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
+
+ if (!Ret)
+ return SDValue();
+
+ // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
+ while (!ORNodes.empty()) {
+ OR = ORNodes.pop_back_val();
+ LHS = OR->getOperand(0);
+ RHS = OR->getOperand(1);
+ // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
+ if (RHS->getOpcode() == ISD::OR)
+ std::swap(LHS, RHS);
+ NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
+ if (!NewRHS)
+ return SDValue();
+ Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
+ }
+
+ if (Ret)
+ Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
+
+ return Ret;
+}
+
+static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+
+ // If this is SSE1 only convert to FOR to avoid scalarization.
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
+ return DAG.getBitcast(MVT::v4i32,
+ DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
+ DAG.getBitcast(MVT::v4f32, N0),
+ DAG.getBitcast(MVT::v4f32, N1)));
+ }
+
+ // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
+ // TODO: Support multiple SrcOps.
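+ // e.g. an any_of reduction of a v4i1 compare roughly becomes
+ // (movmsk(cmp) & 0xf) != 0.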
+ if (VT == MVT::i1) {
+ SmallVector<SDValue, 2> SrcOps;
+ SmallVector<APInt, 2> SrcPartials;
+ if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
+ SrcOps.size() == 1) {
+ SDLoc dl(N);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
+ EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
+ SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
+ if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
+ Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
+ if (Mask) {
+ assert(SrcPartials[0].getBitWidth() == NumElts &&
+ "Unexpected partial reduction mask");
+ SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
+ SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
+ Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
+ return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
+ }
+ }
+ }
+
+ if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
+ return R;
+
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
+ return R;
+
+ if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+ return FPLogic;
+
+ if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
+ return R;
+
+ if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
+ return R;
+
+ // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
+ // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
+ // iff the upper elements of the non-shifted arg are zero.
+ // KUNPCK requires 16+ bool vector elements.
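+ // e.g. for v32i1, OR(X, KSHIFTL(Y,16)) with the upper 16 elements of X known
+ // zero becomes CONCAT_VECTORS(X[0..15], Y[0..15]), i.e. a single KUNPCKWD.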
+ if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfElts = NumElts / 2;
+ APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
+ if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
+ N1.getConstantOperandAPInt(1) == HalfElts &&
+ DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
+ SDLoc dl(N);
+ return DAG.getNode(
+ ISD::CONCAT_VECTORS, dl, VT,
+ extractSubVector(N0, 0, DAG, dl, HalfElts),
+ extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
+ }
+ if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
+ N0.getConstantOperandAPInt(1) == HalfElts &&
+ DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
+ SDLoc dl(N);
+ return DAG.getNode(
+ ISD::CONCAT_VECTORS, dl, VT,
+ extractSubVector(N1, 0, DAG, dl, HalfElts),
+ extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
+ }
+ }
+
+ // Attempt to recursively combine an OR of shuffles.
+ if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
+ SDValue Op(N, 0);
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
+ return Res;
+ }
+
+ return SDValue();
+}
+
+/// Try to turn tests against the signbit in the form of:
+/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
+/// into:
+/// SETGT(X, -1)
+static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
+ // This is only worth doing if the output type is i8 or i1.
+ EVT ResultType = N->getValueType(0);
+ if (ResultType != MVT::i8 && ResultType != MVT::i1)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // We should be performing an xor against a truncated shift.
+ if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
+ return SDValue();
+
+ // Make sure we are performing an xor against one.
+ if (!isOneConstant(N1))
+ return SDValue();
+
+ // SetCC on x86 zero extends so only act on this if it's a logical shift.
+ SDValue Shift = N0.getOperand(0);
+ if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
+ return SDValue();
+
+ // Make sure we are truncating from one of i16, i32 or i64.
+ EVT ShiftTy = Shift.getValueType();
+ if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
+ return SDValue();
+
+ // Make sure the shift amount extracts the sign bit.
+ if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
+ Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
+ return SDValue();
+
+ // Create a greater-than comparison against -1.
+ // N.B. Using SETGE against 0 works but we want a canonical looking
+ // comparison; using SETGT matches up with what TranslateX86CC expects.
+ SDLoc DL(N);
+ SDValue ShiftOp = Shift.getOperand(0);
+ EVT ShiftOpTy = ShiftOp.getValueType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), ResultType);
+ SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
+ DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
+ if (SetCCResultType != ResultType)
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
+ return Cond;
+}
+
+/// Turn vector tests of the signbit in the form of:
+/// xor (sra X, elt_size(X)-1), -1
+/// into:
+/// pcmpgt X, -1
+///
+/// This should be called before type legalization because the pattern may not
+/// persist after that.
+static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (!VT.isSimple())
+ return SDValue();
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return SDValue();
+ case MVT::v16i8:
+ case MVT::v8i16:
+ case MVT::v4i32:
+ case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
+ case MVT::v32i8:
+ case MVT::v16i16:
+ case MVT::v8i32:
+ case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
+ }
+
+ // There must be a shift right algebraic before the xor, and the xor must be a
+ // 'not' operation.
+ SDValue Shift = N->getOperand(0);
+ SDValue Ones = N->getOperand(1);
+ if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
+ !ISD::isBuildVectorAllOnes(Ones.getNode()))
+ return SDValue();
+
+ // The shift should be smearing the sign bit across each vector element.
+ auto *ShiftAmt =
+ isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
+ if (!ShiftAmt ||
+ ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
+ return SDValue();
+
+ // Create a greater-than comparison against -1. We don't use the more obvious
+ // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
+ return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
+}
+
+/// Detect patterns of truncation with unsigned saturation:
+///
+/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
+/// Return the source value x to be truncated or SDValue() if the pattern was
+/// not matched.
+///
+/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
+/// where C1 >= 0 and C2 is unsigned max of destination type.
+///
+/// (truncate (smax (smin (x, C2), C1)) to dest_type)
+/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
+///
+/// These two patterns are equivalent to:
+/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
+/// So return the smax(x, C1) value to be truncated or SDValue() if the
+/// pattern was not matched.
+static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+ const SDLoc &DL) {
+ EVT InVT = In.getValueType();
+
+ // Saturation with truncation. We truncate from InVT to VT.
+ assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
+ "Unexpected types for truncate operation");
+
+ // Match min/max and return limit value as a parameter.
+ auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
+ if (V.getOpcode() == Opcode &&
+ ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
+ return V.getOperand(0);
+ return SDValue();
+ };
+
+ APInt C1, C2;
+ if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
+ // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
+ // the element size of the destination type.
+ if (C2.isMask(VT.getScalarSizeInBits()))
+ return UMin;
+
+ if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
+ if (MatchMinMax(SMin, ISD::SMAX, C1))
+ if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
+ return SMin;
+
+ if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
+ if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
+ if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
+ C2.uge(C1)) {
+ return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
+ }
+
+ return SDValue();
+}
+
+/// Detect patterns of truncation with signed saturation:
+/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
+/// signed_max_of_dest_type)) to dest_type)
+/// or:
+/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
+/// signed_min_of_dest_type)) to dest_type).
+/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
+/// Return the source value to be truncated or SDValue() if the pattern was not
+/// matched.
+static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
+ unsigned NumDstBits = VT.getScalarSizeInBits();
+ unsigned NumSrcBits = In.getScalarValueSizeInBits();
+ assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
+
+ auto MatchMinMax = [](SDValue V, unsigned Opcode,
+ const APInt &Limit) -> SDValue {
+ APInt C;
+ if (V.getOpcode() == Opcode &&
+ ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
+ return V.getOperand(0);
+ return SDValue();
+ };
+
+ APInt SignedMax, SignedMin;
+ if (MatchPackUS) {
+ SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
+ SignedMin = APInt(NumSrcBits, 0);
+ } else {
+ SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
+ SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
+ }
+
+ if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
+ if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
+ return SMax;
+
+ if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
+ if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
+ return SMin;
+
+ return SDValue();
+}
+
+static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasSSE2() || !VT.isVector())
+ return SDValue();
+
+ EVT SVT = VT.getVectorElementType();
+ EVT InVT = In.getValueType();
+ EVT InSVT = InVT.getVectorElementType();
+
+ // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
+ // split across two registers. We can use a packusdw+perm to clamp to 0-65535
+ // and concatenate at the same time. Then we can use a final vpmovuswb to
+ // clip to 0-255.
+ if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
+ InVT == MVT::v16i32 && VT == MVT::v16i8) {
+ if (auto USatVal = detectSSatPattern(In, VT, true)) {
+ // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
+ SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
+ DL, DAG, Subtarget);
+ assert(Mid && "Failed to pack!");
+ return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
+ }
+ }
+
+ // vXi32 truncate instructions are available with AVX512F.
+ // vXi16 truncate instructions are only available with AVX512BW.
+ // For 256-bit or smaller vectors, we require VLX.
+ // FIXME: We could widen truncates to 512 bits to remove the VLX restriction.
+ // If the result type is 256 bits or larger and we have disabled 512-bit
+ // registers, we should go ahead and use the pack instructions if possible.
+ bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
+ (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
+ (InVT.getSizeInBits() > 128) &&
+ (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
+ !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
+
+ if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
+ VT.getSizeInBits() >= 64 &&
+ (SVT == MVT::i8 || SVT == MVT::i16) &&
+ (InSVT == MVT::i16 || InSVT == MVT::i32)) {
+ if (auto USatVal = detectSSatPattern(In, VT, true)) {
+ // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
+ // Only do this when the result is at least 64 bits, or we would be leaving
+ // dangling PACKSSDW nodes.
+ if (SVT == MVT::i8 && InSVT == MVT::i32) {
+ EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ VT.getVectorNumElements());
+ SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
+ DAG, Subtarget);
+ assert(Mid && "Failed to pack!");
+ SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
+ Subtarget);
+ assert(V && "Failed to pack!");
+ return V;
+ } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
+ return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
+ Subtarget);
+ }
+ if (auto SSatVal = detectSSatPattern(In, VT))
+ return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
+ Subtarget);
+ }
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
+ Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) {
+ unsigned TruncOpc = 0;
+ SDValue SatVal;
+ if (auto SSatVal = detectSSatPattern(In, VT)) {
+ SatVal = SSatVal;
+ TruncOpc = X86ISD::VTRUNCS;
+ } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) {
+ SatVal = USatVal;
+ TruncOpc = X86ISD::VTRUNCUS;
+ }
+ if (SatVal) {
+ unsigned ResElts = VT.getVectorNumElements();
+ // If the input type is less than 512 bits and we don't have VLX, we need
+ // to widen to 512 bits.
+ if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
+ unsigned NumConcats = 512 / InVT.getSizeInBits();
+ ResElts *= NumConcats;
+ SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
+ ConcatOps[0] = SatVal;
+ InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
+ NumConcats * InVT.getVectorNumElements());
+ SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
+ }
+ // Widen the result if it's narrower than 128 bits.
+ if (ResElts * SVT.getSizeInBits() < 128)
+ ResElts = 128 / SVT.getSizeInBits();
+ EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
+ SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+ }
+ }
+
+ return SDValue();
+}
+
+/// This function detects the AVG pattern between vectors of unsigned i8/i16,
+/// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
+/// X86ISD::AVG instruction.
+static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ const SDLoc &DL) {
+ if (!VT.isVector())
+ return SDValue();
+ EVT InVT = In.getValueType();
+ unsigned NumElems = VT.getVectorNumElements();
+
+ EVT ScalarVT = VT.getVectorElementType();
+ if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
+ return SDValue();
+
+ // InScalarVT is the intermediate type in the AVG pattern and it should be
+ // wider than the original input type (i8/i16).
+ EVT InScalarVT = InVT.getVectorElementType();
+ if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
+ return SDValue();
+
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ // Detect the following pattern:
+ //
+ // %1 = zext <N x i8> %a to <N x i32>
+ // %2 = zext <N x i8> %b to <N x i32>
+ // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
+ // %4 = add nuw nsw <N x i32> %3, %2
+ // %5 = lshr <N x i32> %4, <i32 1 x N>
+ // %6 = trunc <N x i32> %5 to <N x i8>
+ //
+ // In AVX512, the last instruction can also be a trunc store.
+ if (In.getOpcode() != ISD::SRL)
+ return SDValue();
+
+ // A lambda checking the given SDValue is a constant vector and each element
+ // is in the range [Min, Max].
+ auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
+ return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
+ return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
+ });
+ };
+
+ // Check if each element of the vector is right-shifted by one.
+ SDValue LHS = In.getOperand(0);
+ SDValue RHS = In.getOperand(1);
+ if (!IsConstVectorInRange(RHS, 1, 1))
+ return SDValue();
+ if (LHS.getOpcode() != ISD::ADD)
+ return SDValue();
+
+ // Detect a pattern of a + b + 1 where the order doesn't matter.
+ SDValue Operands[3];
+ Operands[0] = LHS.getOperand(0);
+ Operands[1] = LHS.getOperand(1);
+
+ auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
+ };
+
+ auto AVGSplitter = [&](SDValue Op0, SDValue Op1) {
+ // Pad to a power-of-2 vector, split+apply and extract the original vector.
+ unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
+ EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
+ if (NumElemsPow2 != NumElems) {
+ SmallVector<SDValue, 32> Ops0(NumElemsPow2, DAG.getUNDEF(ScalarVT));
+ SmallVector<SDValue, 32> Ops1(NumElemsPow2, DAG.getUNDEF(ScalarVT));
+ for (unsigned i = 0; i != NumElems; ++i) {
+ SDValue Idx = DAG.getIntPtrConstant(i, DL);
+ Ops0[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op0, Idx);
+ Ops1[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op1, Idx);
+ }
+ Op0 = DAG.getBuildVector(Pow2VT, DL, Ops0);
+ Op1 = DAG.getBuildVector(Pow2VT, DL, Ops1);
+ }
+ SDValue Res =
+ SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, {Op0, Op1}, AVGBuilder);
+ if (NumElemsPow2 == NumElems)
+ return Res;
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+ };
+
+ // Take care of the case when one of the operands is a constant vector whose
+ // element is in the range [1, 256] (or [1, 65536] for i16).
+ if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
+ Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
+ Operands[0].getOperand(0).getValueType() == VT) {
+ // The pattern is detected. Subtract one from the constant vector, then
+ // demote it and emit X86ISD::AVG instruction.
+ SDValue VecOnes = DAG.getConstant(1, DL, InVT);
+ Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
+ Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
+ return AVGSplitter(Operands[0].getOperand(0), Operands[1]);
+ }
+
+ // Matches 'add-like' patterns: either add(Op0,Op1) or zext(or(Op0,Op1)).
+ // Match the or case only if it's 'add-like', i.e. it can be replaced by an
+ // add because the operands have no common bits set.
+ auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
+ if (ISD::ADD == V.getOpcode()) {
+ Op0 = V.getOperand(0);
+ Op1 = V.getOperand(1);
+ return true;
+ }
+ if (ISD::ZERO_EXTEND != V.getOpcode())
+ return false;
+ V = V.getOperand(0);
+ if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
+ !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
+ return false;
+ Op0 = V.getOperand(0);
+ Op1 = V.getOperand(1);
+ return true;
+ };
+
+ SDValue Op0, Op1;
+ if (FindAddLike(Operands[0], Op0, Op1))
+ std::swap(Operands[0], Operands[1]);
+ else if (!FindAddLike(Operands[1], Op0, Op1))
+ return SDValue();
+ Operands[2] = Op0;
+ Operands[1] = Op1;
+
+ // Now we have three operands of two additions. Check that one of them is a
+ // constant vector with ones, and the other two can be promoted from i8/i16.
+ for (int i = 0; i < 3; ++i) {
+ if (!IsConstVectorInRange(Operands[i], 1, 1))
+ continue;
+ std::swap(Operands[i], Operands[2]);
+
+ // Check if Operands[0] and Operands[1] are results of type promotion.
+ for (int j = 0; j < 2; ++j)
+ if (Operands[j].getValueType() != VT) {
+ if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
+ Operands[j].getOperand(0).getValueType() != VT)
+ return SDValue();
+ Operands[j] = Operands[j].getOperand(0);
+ }
+
+ // The pattern is detected, emit X86ISD::AVG instruction(s).
+ return AVGSplitter(Operands[0], Operands[1]);
+ }
+
+ return SDValue();
+}
+
+static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ LoadSDNode *Ld = cast<LoadSDNode>(N);
+ EVT RegVT = Ld->getValueType(0);
+ EVT MemVT = Ld->getMemoryVT();
+ SDLoc dl(Ld);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // For chips with slow 32-byte unaligned loads, break the 32-byte operation
+ // into two 16-byte operations. Also split non-temporal aligned loads on
+ // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
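+ // e.g. an unaligned v8f32 load on such a target becomes two v4f32 loads
+ // joined by a CONCAT_VECTORS, with a TokenFactor merging the load chains.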
+ ISD::LoadExtType Ext = Ld->getExtensionType();
+ bool Fast;
+ if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
+ Ext == ISD::NON_EXTLOAD &&
+ ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
+ Ld->getAlignment() >= 16) ||
+ (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
+ *Ld->getMemOperand(), &Fast) &&
+ !Fast))) {
+ unsigned NumElems = RegVT.getVectorNumElements();
+ if (NumElems < 2)
+ return SDValue();
+
+ unsigned HalfOffset = 16;
+ SDValue Ptr1 = Ld->getBasePtr();
+ SDValue Ptr2 =
+ DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
+ EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+ NumElems / 2);
+ SDValue Load1 =
+ DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
+ Ld->getOriginalAlign(),
+ Ld->getMemOperand()->getFlags());
+ SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
+ Ld->getPointerInfo().getWithOffset(HalfOffset),
+ Ld->getOriginalAlign(),
+ Ld->getMemOperand()->getFlags());
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Load1.getValue(1), Load2.getValue(1));
+
+ SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
+ return DCI.CombineTo(N, NewVec, TF, true);
+ }
+
+ // Bool vector load - attempt to cast to an integer, as we have good
+ // (vXiY *ext(vXi1 bitcast(iX))) handling.
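+ // e.g. a v16i1 load becomes an i16 load followed by a bitcast back to v16i1.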
+ if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
+ RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
+ unsigned NumElts = RegVT.getVectorNumElements();
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
+ if (TLI.isTypeLegal(IntVT)) {
+ SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(),
+ Ld->getOriginalAlign(),
+ Ld->getMemOperand()->getFlags());
+ SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
+ return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
+ }
+ }
+
+ // If we also broadcast this as a subvector to a wider type, then just extract
+ // the lowest subvector.
+ if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
+ (RegVT.is128BitVector() || RegVT.is256BitVector())) {
+ SDValue Ptr = Ld->getBasePtr();
+ SDValue Chain = Ld->getChain();
+ for (SDNode *User : Ptr->uses()) {
+ if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
+ cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
+ cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
+ cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
+ MemVT.getSizeInBits() &&
+ !User->hasAnyUseOfValue(1) &&
+ User->getValueSizeInBits(0).getFixedSize() >
+ RegVT.getFixedSizeInBits()) {
+ SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
+ RegVT.getSizeInBits());
+ Extract = DAG.getBitcast(RegVT, Extract);
+ return DCI.CombineTo(N, Extract, SDValue(User, 1));
+ }
+ }
+ }
+
+ // Cast ptr32 and ptr64 pointers to the default address space before a load.
+ unsigned AddrSpace = Ld->getAddressSpace();
+ if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
+ AddrSpace == X86AS::PTR32_UPTR) {
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
+ SDValue Cast =
+ DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
+ return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
+ Ld->getOriginalAlign(),
+ Ld->getMemOperand()->getFlags());
+ }
+ }
+
+ return SDValue();
+}
+
+/// If V is a build vector of boolean constants and exactly one of those
+/// constants is true, return the operand index of that true element.
+/// Otherwise, return -1.
+static int getOneTrueElt(SDValue V) {
+ // This needs to be a build vector of booleans.
+ // TODO: Checking for the i1 type matches the IR definition for the mask,
+ // but the mask check could be loosened to i8 or other types. That might
+ // also require checking more than 'allOnesValue'; e.g., the x86 HW
+ // instructions only require that the MSB is set for each mask element.
+ // The ISD::MSTORE comments/definition do not specify how the mask operand
+ // is formatted.
+ auto *BV = dyn_cast<BuildVectorSDNode>(V);
+ if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
+ return -1;
+
+ int TrueIndex = -1;
+ unsigned NumElts = BV->getValueType(0).getVectorNumElements();
+ for (unsigned i = 0; i < NumElts; ++i) {
+ const SDValue &Op = BV->getOperand(i);
+ if (Op.isUndef())
+ continue;
+ auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
+ if (!ConstNode)
+ return -1;
+ if (ConstNode->getAPIntValue().countTrailingOnes() >= 1) {
+ // If we already found a one, this is too many.
+ if (TrueIndex >= 0)
+ return -1;
+ TrueIndex = i;
+ }
+ }
+ return TrueIndex;
+}
+
+/// Given a masked memory load/store operation, return true if it has one mask
+/// bit set. If it has one mask bit set, then also return the memory address of
+/// the scalar element to load/store, the vector index to insert/extract that
+/// scalar element, and the alignment for the scalar memory access.
+static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
+ SelectionDAG &DAG, SDValue &Addr,
+ SDValue &Index, Align &Alignment,
+ unsigned &Offset) {
+ int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
+ if (TrueMaskElt < 0)
+ return false;
+
+ // Get the address of the one scalar element that is specified by the mask
+ // using the appropriate offset from the base pointer.
+ EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
+ Offset = 0;
+ Addr = MaskedOp->getBasePtr();
+ if (TrueMaskElt != 0) {
+ Offset = TrueMaskElt * EltVT.getStoreSize();
+ Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
+ SDLoc(MaskedOp));
+ }
+
+ Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
+ Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
+ EltVT.getStoreSize());
+ return true;
+}
+
+/// If exactly one element of the mask is set for a non-extending masked load,
+/// it is a scalar load and vector insert.
+/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
+/// mask have already been optimized in IR, so we don't bother with those here.
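+/// For example, a masked load of <4 x i32> with mask <0,0,1,0> becomes a
+/// scalar i32 load from (base + 8) inserted into the pass-through vector at
+/// index 2.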
+static SDValue
+reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ assert(ML->isUnindexed() && "Unexpected indexed masked load!");
+ // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
+ // However, some target hooks may need to be added to know when the transform
+ // is profitable. Endianness would also have to be considered.
+
+ SDValue Addr, VecIndex;
+ Align Alignment;
+ unsigned Offset;
+ if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
+ return SDValue();
+
+ // Load the one scalar element that is specified by the mask using the
+ // appropriate offset from the base pointer.
+ SDLoc DL(ML);
+ EVT VT = ML->getValueType(0);
+ EVT EltVT = VT.getVectorElementType();
+
+ EVT CastVT = VT;
+ if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
+ EltVT = MVT::f64;
+ CastVT =
+ EVT::getVectorVT(*DAG.getContext(), EltVT, VT.getVectorNumElements());
+ }
+
+ SDValue Load =
+ DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
+ ML->getPointerInfo().getWithOffset(Offset),
+ Alignment, ML->getMemOperand()->getFlags());
+
+ SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
+
+ // Insert the loaded element into the appropriate place in the vector.
+ SDValue Insert =
+ DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
+ Insert = DAG.getBitcast(VT, Insert);
+ return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
+}
+
+static SDValue
+combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert(ML->isUnindexed() && "Unexpected indexed masked load!");
+ if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
+ return SDValue();
+
+ SDLoc DL(ML);
+ EVT VT = ML->getValueType(0);
+
+ // If we are loading the first and last elements of a vector, it is safe and
+ // always faster to load the whole vector. Replace the masked load with a
+ // vector load and select.
+ unsigned NumElts = VT.getVectorNumElements();
+ BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
+ bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
+ bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
+ if (LoadFirstElt && LoadLastElt) {
+ SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
+ ML->getMemOperand());
+ SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
+ ML->getPassThru());
+ return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
+ }
+
+ // Convert a masked load with a constant mask into a masked load and a select.
+ // This allows the select operation to use a faster kind of select instruction
+ // (for example, vblendvps -> vblendps).
+
+ // Don't try this if the pass-through operand is already undefined. That would
+ // cause an infinite loop because that's what we're about to create.
+ if (ML->getPassThru().isUndef())
+ return SDValue();
+
+ if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
+ return SDValue();
+
+ // The new masked load has an undef pass-through operand. The select uses the
+ // original pass-through operand.
+ SDValue NewML = DAG.getMaskedLoad(
+ VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
+ DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
+ ML->getAddressingMode(), ML->getExtensionType());
+ SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
+ ML->getPassThru());
+
+ return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
+}
+
+static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ auto *Mld = cast<MaskedLoadSDNode>(N);
+
+ // TODO: Expanding load with constant mask may be optimized as well.
+ if (Mld->isExpandingLoad())
+ return SDValue();
+
+ if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
+ if (SDValue ScalarLoad =
+ reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
+ return ScalarLoad;
+
+ // TODO: Do some AVX512 subsets benefit from this transform?
+ if (!Subtarget.hasAVX512())
+ if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
+ return Blend;
+ }
+
+ // If the mask value has been legalized to a non-boolean vector, try to
+ // simplify ops leading up to it. We only demand the MSB of each lane.
+ SDValue Mask = Mld->getMask();
+ if (Mask.getScalarValueSizeInBits() != 1) {
+ EVT VT = Mld->getValueType(0);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
+ if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+ if (SDValue NewMask =
+ TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
+ return DAG.getMaskedLoad(
+ VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
+ NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
+ Mld->getAddressingMode(), Mld->getExtensionType());
+ }
+
+ return SDValue();
+}
+
+/// If exactly one element of the mask is set for a non-truncating masked store,
+/// it is a vector extract and scalar store.
+/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
+/// mask have already been optimized in IR, so we don't bother with those here.
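+/// For example, a masked store of <4 x float> with mask <0,1,0,0> becomes an
+/// extract of element 1 and a scalar float store to (base + 4).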
+static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
+ // However, some target hooks may need to be added to know when the transform
+ // is profitable. Endianness would also have to be considered.
+
+ SDValue Addr, VecIndex;
+ Align Alignment;
+ unsigned Offset;
+ if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
+ return SDValue();
+
+ // Extract the one scalar element that is actually being stored.
+ SDLoc DL(MS);
+ SDValue Value = MS->getValue();
+ EVT VT = Value.getValueType();
+ EVT EltVT = VT.getVectorElementType();
+ if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
+ EltVT = MVT::f64;
+ EVT CastVT =
+ EVT::getVectorVT(*DAG.getContext(), EltVT, VT.getVectorNumElements());
+ Value = DAG.getBitcast(CastVT, Value);
+ }
+ SDValue Extract =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
+
+ // Store that element at the appropriate offset from the base pointer.
+ return DAG.getStore(MS->getChain(), DL, Extract, Addr,
+ MS->getPointerInfo().getWithOffset(Offset),
+ Alignment, MS->getMemOperand()->getFlags());
+}
+
+static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
+ if (Mst->isCompressingStore())
+ return SDValue();
+
+ EVT VT = Mst->getValue().getValueType();
+ SDLoc dl(Mst);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ if (Mst->isTruncatingStore())
+ return SDValue();
+
+ if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
+ return ScalarStore;
+
+ // If the mask value has been legalized to a non-boolean vector, try to
+ // simplify ops leading up to it. We only demand the MSB of each lane.
+ SDValue Mask = Mst->getMask();
+ if (Mask.getScalarValueSizeInBits() != 1) {
+ APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
+ if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+ if (SDValue NewMask =
+ TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
+ return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
+ Mst->getBasePtr(), Mst->getOffset(), NewMask,
+ Mst->getMemoryVT(), Mst->getMemOperand(),
+ Mst->getAddressingMode());
+ }
+
+ SDValue Value = Mst->getValue();
+ if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
+ TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
+ Mst->getMemoryVT())) {
+ return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
+ Mst->getBasePtr(), Mst->getOffset(), Mask,
+ Mst->getMemoryVT(), Mst->getMemOperand(),
+ Mst->getAddressingMode(), true);
+ }
+
+ return SDValue();
+}
+
+static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ StoreSDNode *St = cast<StoreSDNode>(N);
+ EVT StVT = St->getMemoryVT();
+ SDLoc dl(St);
+ SDValue StoredVal = St->getValue();
+ EVT VT = StoredVal.getValueType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // Convert a store of vXi1 into a store of iX and a bitcast.
+ if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
+ VT.getVectorElementType() == MVT::i1) {
+
+ EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
+ StoredVal = DAG.getBitcast(NewVT, StoredVal);
+
+ return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+ St->getPointerInfo(), St->getOriginalAlign(),
+ St->getMemOperand()->getFlags());
+ }
+
+ // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
+ // This will avoid a copy to a k-register.
+ if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
+ StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ StoredVal.getOperand(0).getValueType() == MVT::i8) {
+ SDValue Val = StoredVal.getOperand(0);
+ // We must store zeros to the unused bits.
+ Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
+ return DAG.getStore(St->getChain(), dl, Val,
+ St->getBasePtr(), St->getPointerInfo(),
+ St->getOriginalAlign(),
+ St->getMemOperand()->getFlags());
+ }
+
+ // Widen v2i1/v4i1 stores to v8i1.
+ if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
+ Subtarget.hasAVX512()) {
+ unsigned NumConcats = 8 / VT.getVectorNumElements();
+ // We must store zeros to the unused bits.
+ SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
+ Ops[0] = StoredVal;
+ StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
+ return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+ St->getPointerInfo(), St->getOriginalAlign(),
+ St->getMemOperand()->getFlags());
+ }
+
+ // Turn vXi1 stores of constants into a scalar store.
+ if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
+ VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
+ ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
+ // If it's a v64i1 store without 64-bit support, we need two stores.
+ if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
+ SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
+ StoredVal->ops().slice(0, 32));
+ Lo = combinevXi1ConstantToInteger(Lo, DAG);
+ SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
+ StoredVal->ops().slice(32, 32));
+ Hi = combinevXi1ConstantToInteger(Hi, DAG);
+
+ SDValue Ptr0 = St->getBasePtr();
+ SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
+
+ SDValue Ch0 =
+ DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
+ St->getOriginalAlign(),
+ St->getMemOperand()->getFlags());
+ SDValue Ch1 =
+ DAG.getStore(St->getChain(), dl, Hi, Ptr1,
+ St->getPointerInfo().getWithOffset(4),
+ St->getOriginalAlign(),
+ St->getMemOperand()->getFlags());
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
+ }
+
+ StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
+ return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+ St->getPointerInfo(), St->getOriginalAlign(),
+ St->getMemOperand()->getFlags());
+ }
+
+ // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
+ // Sandy Bridge, perform two 16-byte stores.
+ bool Fast;
+ if (VT.is256BitVector() && StVT == VT &&
+ TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+ *St->getMemOperand(), &Fast) &&
+ !Fast) {
+ unsigned NumElems = VT.getVectorNumElements();
+ if (NumElems < 2)
+ return SDValue();
+
+ return splitVectorStore(St, DAG);
+ }
+
+ // Split under-aligned vector non-temporal stores.
+ if (St->isNonTemporal() && StVT == VT &&
+ St->getAlignment() < VT.getStoreSize()) {
+ // ZMM/YMM nt-stores - either it can be stored as a series of shorter
+ // vectors or the legalizer can scalarize it to use MOVNTI.
+ if (VT.is256BitVector() || VT.is512BitVector()) {
+ unsigned NumElems = VT.getVectorNumElements();
+ if (NumElems < 2)
+ return SDValue();
+ return splitVectorStore(St, DAG);
+ }
+
+ // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
+ // to use MOVNTI.
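+ // (On SSE4A the scalarized f64 non-temporal stores use MOVNTSD.)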
+ if (VT.is128BitVector() && Subtarget.hasSSE2()) {
+ MVT NTVT = Subtarget.hasSSE4A()
+ ? MVT::v2f64
+ : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
+ return scalarizeVectorStore(St, NTVT, DAG);
+ }
+ }
+
+ // Try to optimize v16i16->v16i8 truncating stores when BWI is not supported
+ // but AVX512F is, by extending to v16i32 and truncating.
+ if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
+ St->getValue().getOpcode() == ISD::TRUNCATE &&
+ St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
+ TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
+ St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
+ SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
+ return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
+ MVT::v16i8, St->getMemOperand());
+ }
+
+ // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
+ if (!St->isTruncatingStore() && StoredVal.hasOneUse() &&
+ (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
+ StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
+ TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
+ bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
+ return EmitTruncSStore(IsSigned, St->getChain(),
+ dl, StoredVal.getOperand(0), St->getBasePtr(),
+ VT, St->getMemOperand(), DAG);
+ }
+
+ // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
+ if (!St->isTruncatingStore() && StoredVal.hasOneUse()) {
+ auto IsExtractedElement = [](SDValue V) {
+ if (V.getOpcode() == ISD::TRUNCATE && V.getOperand(0).hasOneUse())
+ V = V.getOperand(0);
+ unsigned Opc = V.getOpcode();
+ if (Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) {
+ if (V.getOperand(0).hasOneUse() && isNullConstant(V.getOperand(1)))
+ return V.getOperand(0);
+ }
+ return SDValue();
+ };
+ if (SDValue Extract = IsExtractedElement(StoredVal)) {
+ SDValue Trunc = peekThroughOneUseBitcasts(Extract);
+ if (Trunc.getOpcode() == X86ISD::VTRUNC) {
+ SDValue Src = Trunc.getOperand(0);
+ MVT DstVT = Trunc.getSimpleValueType();
+ MVT SrcVT = Src.getSimpleValueType();
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
+ MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
+ if (NumTruncBits == VT.getSizeInBits() &&
+ TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
+ return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
+ TruncVT, St->getMemOperand());
+ }
+ }
+ }
+ }
+
+ // Optimize trunc store (of multiple scalars) to shuffle and store.
+ // First, pack all of the elements in one place. Next, store to memory
+ // in fewer chunks.
+ if (St->isTruncatingStore() && VT.isVector()) {
+ // Check if we can detect an AVG pattern from the truncation. If yes,
+ // replace the trunc store by a normal store with the result of X86ISD::AVG
+ // instruction.
+ if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
+ if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
+ Subtarget, dl))
+ return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
+ St->getPointerInfo(), St->getOriginalAlign(),
+ St->getMemOperand()->getFlags());
+
+ if (TLI.isTruncStoreLegal(VT, StVT)) {
+ if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
+ return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
+ dl, Val, St->getBasePtr(),
+ St->getMemoryVT(), St->getMemOperand(), DAG);
+ if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
+ DAG, dl))
+ return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
+ dl, Val, St->getBasePtr(),
+ St->getMemoryVT(), St->getMemOperand(), DAG);
+ }
+
+ return SDValue();
+ }
+
+ // Cast ptr32 and ptr64 pointers to the default address space before a store.
+ unsigned AddrSpace = St->getAddressSpace();
+ if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
+ AddrSpace == X86AS::PTR32_UPTR) {
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ if (PtrVT != St->getBasePtr().getSimpleValueType()) {
+ SDValue Cast =
+ DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
+ return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
+ St->getPointerInfo(), St->getOriginalAlign(),
+ St->getMemOperand()->getFlags(), St->getAAInfo());
+ }
+ }
+
+ // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
+ // the FP state in cases where an emms may be missing.
+ // A preferable solution to the general problem is to figure out the right
+ // places to insert EMMS. This qualifies as a quick hack.
+
+ // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
+ if (VT.getSizeInBits() != 64)
+ return SDValue();
+
+ const Function &F = DAG.getMachineFunction().getFunction();
+ bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
+ bool F64IsLegal =
+ !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
+ if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
+ isa<LoadSDNode>(St->getValue()) &&
+ cast<LoadSDNode>(St->getValue())->isSimple() &&
+ St->getChain().hasOneUse() && St->isSimple()) {
+ LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
+
+ if (!ISD::isNormalLoad(Ld))
+ return SDValue();
+
+ // Avoid the transformation if there are multiple uses of the loaded value.
+ if (!Ld->hasNUsesOfValue(1, 0))
+ return SDValue();
+
+ SDLoc LdDL(Ld);
+ SDLoc StDL(N);
+ // Lower to a single movq load/store pair.
+ SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
+ Ld->getBasePtr(), Ld->getMemOperand());
+
+ // Make sure new load is placed in same chain order.
+ DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
+ return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
+ St->getMemOperand());
+ }
+
+ // This is similar to the above case, but here we handle a scalar 64-bit
+ // integer store that is extracted from a vector on a 32-bit target.
+ // If we have SSE2, then we can treat it like a floating-point double
+ // to get past legalization. The execution dependencies fixup pass will
+ // choose the optimal machine instruction for the store if this really is
+ // an integer or v2f32 rather than an f64.
+ if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
+ St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ SDValue OldExtract = St->getOperand(1);
+ SDValue ExtOp0 = OldExtract.getOperand(0);
+ unsigned VecSize = ExtOp0.getValueSizeInBits();
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
+ SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
+ SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+ BitCast, OldExtract.getOperand(1));
+ return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
+ St->getPointerInfo(), St->getOriginalAlign(),
+ St->getMemOperand()->getFlags());
+ }
+
+ return SDValue();
+}
+
+static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ auto *St = cast<MemIntrinsicSDNode>(N);
+
+ SDValue StoredVal = N->getOperand(1);
+ MVT VT = StoredVal.getSimpleValueType();
+ EVT MemVT = St->getMemoryVT();
+
+ // Figure out which elements we demand.
+ unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
+ APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
+
+ APInt KnownUndef, KnownZero;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef,
+ KnownZero, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+
+ return SDValue();
+}
+
+/// Return 'true' if this vector operation is "horizontal"
+/// and return the operands for the horizontal operation in LHS and RHS. A
+/// horizontal operation performs the binary operation on successive elements
+/// of its first operand, then on successive elements of its second operand,
+/// returning the resulting values in a vector. For example, if
+/// A = < float a0, float a1, float a2, float a3 >
+/// and
+/// B = < float b0, float b1, float b2, float b3 >
+/// then the result of doing a horizontal operation on A and B is
+/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
+/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
+/// A horizontal-op B, for some already available A and B, and if so then LHS is
+/// set to A, RHS to B, and the routine returns 'true'.
+static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget,
+ bool IsCommutative,
+ SmallVectorImpl<int> &PostShuffleMask) {
+ // If either operand is undef, bail out. The binop should be simplified.
+ if (LHS.isUndef() || RHS.isUndef())
+ return false;
+
+ // Look for the following pattern:
+ // A = < float a0, float a1, float a2, float a3 >
+ // B = < float b0, float b1, float b2, float b3 >
+ // and
+ // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
+ // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
+ // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
+ // which is A horizontal-op B.
+
+ MVT VT = LHS.getSimpleValueType();
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for horizontal add/sub");
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // TODO - can we make a general helper method that does all of this for us?
+ auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
+ SmallVectorImpl<int> &ShuffleMask) {
+ if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ if (!Op.getOperand(0).isUndef())
+ N0 = Op.getOperand(0);
+ if (!Op.getOperand(1).isUndef())
+ N1 = Op.getOperand(1);
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+ ShuffleMask.append(Mask.begin(), Mask.end());
+ return;
+ }
+ bool UseSubVector = false;
+ if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Op.getOperand(0).getValueType().is256BitVector() &&
+ llvm::isNullConstant(Op.getOperand(1))) {
+ Op = Op.getOperand(0);
+ UseSubVector = true;
+ }
+ bool IsUnary;
+ SmallVector<SDValue, 2> SrcOps;
+ SmallVector<int, 16> SrcShuffleMask;
+ SDValue BC = peekThroughBitcasts(Op);
+ if (isTargetShuffle(BC.getOpcode()) &&
+ getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false,
+ SrcOps, SrcShuffleMask, IsUnary)) {
+ if (!UseSubVector && SrcShuffleMask.size() == NumElts &&
+ SrcOps.size() <= 2) {
+ N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
+ N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
+ ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end());
+ }
+ if (UseSubVector && (SrcShuffleMask.size() == (NumElts * 2)) &&
+ SrcOps.size() == 1) {
+ N0 = extract128BitVector(SrcOps[0], 0, DAG, SDLoc(Op));
+ N1 = extract128BitVector(SrcOps[0], NumElts, DAG, SDLoc(Op));
+ ArrayRef<int> Mask = ArrayRef<int>(SrcShuffleMask).slice(0, NumElts);
+ ShuffleMask.append(Mask.begin(), Mask.end());
+ }
+ }
+ };
+
+ // View LHS in the form
+ // LHS = VECTOR_SHUFFLE A, B, LMask
+ // If LHS is not a shuffle, then pretend it is the identity shuffle:
+ // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
+ // NOTE: A default initialized SDValue represents an UNDEF of type VT.
+ SDValue A, B;
+ SmallVector<int, 16> LMask;
+ GetShuffle(LHS, A, B, LMask);
+
+ // Likewise, view RHS in the form
+ // RHS = VECTOR_SHUFFLE C, D, RMask
+ SDValue C, D;
+ SmallVector<int, 16> RMask;
+ GetShuffle(RHS, C, D, RMask);
+
+ // At least one of the operands should be a vector shuffle.
+ unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
+ if (NumShuffles == 0)
+ return false;
+
+ if (LMask.empty()) {
+ A = LHS;
+ for (unsigned i = 0; i != NumElts; ++i)
+ LMask.push_back(i);
+ }
+
+ if (RMask.empty()) {
+ C = RHS;
+ for (unsigned i = 0; i != NumElts; ++i)
+ RMask.push_back(i);
+ }
+
+ // If A and B occur in reverse order in RHS, then canonicalize by commuting
+ // RHS operands and shuffle mask.
+ if (A != C) {
+ std::swap(C, D);
+ ShuffleVectorSDNode::commuteMask(RMask);
+ }
+ // Check that the shuffles are both shuffling the same vectors.
+ if (!(A == C && B == D))
+ return false;
+
+ PostShuffleMask.clear();
+ PostShuffleMask.append(NumElts, SM_SentinelUndef);
+
+ // LHS and RHS are now:
+ // LHS = shuffle A, B, LMask
+ // RHS = shuffle A, B, RMask
+ // Check that the masks correspond to performing a horizontal operation.
+ // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
+ // so we just repeat the inner loop if this is a 256-bit op.
+ unsigned Num128BitChunks = VT.getSizeInBits() / 128;
+ unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
+ unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
+ assert((NumEltsPer128BitChunk % 2 == 0) &&
+ "Vector type should have an even number of elements in each lane");
+ for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
+ for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
+ // Ignore undefined components.
+ int LIdx = LMask[i + j], RIdx = RMask[i + j];
+ if (LIdx < 0 || RIdx < 0 ||
+ (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
+ (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
+ continue;
+
+ // Check that successive odd/even elements are being operated on. If not,
+ // this is not a horizontal operation.
+ if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
+ !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
+ return false;
+
+ // Compute the post-shuffle mask index based on where the element
+ // is stored in the HOP result, and where it needs to be moved to.
+ int Base = LIdx & ~1u;
+ int Index = ((Base % NumEltsPer128BitChunk) / 2) +
+ ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
+
+ // The low half of the 128-bit result must choose from A.
+ // The high half of the 128-bit result must choose from B,
+ // unless B is undef. In that case, we are always choosing from A.
+ if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
+ Index += NumEltsPer64BitChunk;
+ PostShuffleMask[i + j] = Index;
+ }
+ }
+
+ SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
+ SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
+
+ bool IsIdentityPostShuffle =
+ isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
+ if (IsIdentityPostShuffle)
+ PostShuffleMask.clear();
+
+ // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
+ if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
+ isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
+ return false;
+
+ // If the source nodes are already used in HorizOps then always accept this.
+ // Shuffle folding should merge these back together.
+ bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
+ return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
+ });
+ bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
+ return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
+ });
+ bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
+
+ // Assume a SingleSource HOP if we only shuffle one input and don't need to
+ // shuffle the result.
+ if (!ForceHorizOp &&
+ !shouldUseHorizontalOp(NewLHS == NewRHS &&
+ (NumShuffles < 2 || !IsIdentityPostShuffle),
+ DAG, Subtarget))
+ return false;
+
+ LHS = DAG.getBitcast(VT, NewLHS);
+ RHS = DAG.getBitcast(VT, NewRHS);
+ return true;
+}
+
+/// Do target-specific dag combines on floating-point adds/subs.
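+/// e.g. for v4f32, (fadd (shuffle X, Y, <0,2,4,6>), (shuffle X, Y, <1,3,5,7>))
+/// becomes (X86ISD::FHADD X, Y), i.e. HADDPS.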
+static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ bool IsFadd = N->getOpcode() == ISD::FADD;
+ auto HorizOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
+ assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
+
+ // Try to synthesize horizontal add/sub from adds/subs of shuffles.
+ SmallVector<int, 8> PostShuffleMask;
+ if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+ (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
+ isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsFadd,
+ PostShuffleMask)) {
+ SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
+ if (!PostShuffleMask.empty())
+ HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
+ DAG.getUNDEF(VT), PostShuffleMask);
+ return HorizBinOp;
+ }
+
+ return SDValue();
+}
+
+/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
+/// the codegen.
+/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
+/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
+/// anything that is guaranteed to be transformed by DAGCombiner.
+static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ const SDLoc &DL) {
+ assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
+ SDValue Src = N->getOperand(0);
+ unsigned SrcOpcode = Src.getOpcode();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ EVT VT = N->getValueType(0);
+ EVT SrcVT = Src.getValueType();
+
+ auto IsFreeTruncation = [VT](SDValue Op) {
+ unsigned TruncSizeInBits = VT.getScalarSizeInBits();
+
+ // See if this has been extended from a smaller/equal size to
+ // the truncation size, allowing a truncation to combine with the extend.
+ unsigned Opcode = Op.getOpcode();
+ if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
+ Opcode == ISD::ZERO_EXTEND) &&
+ Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
+ return true;
+
+ // See if this is a single-use constant which can be constant folded.
+ // NOTE: We don't peek through bitcasts here because there is currently
+ // no support for constant folding truncate+bitcast+vector_of_constants, so
+ // we'd just end up with a truncate on both operands which would get turned
+ // back into (truncate (binop)), causing an infinite loop.
+ return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
+ };
+
+ auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
+ SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
+ SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
+ return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
+ };
+
+ // Don't combine if the operation has other uses.
+ if (!Src.hasOneUse())
+ return SDValue();
+
+ // Only support vector truncation for now.
+ // TODO: i64 scalar math would benefit as well.
+ if (!VT.isVector())
+ return SDValue();
+
+ // In most cases it's only worth pre-truncating if we're only facing the cost
+ // of one truncation, i.e. if one of the inputs will constant fold or the
+ // input is repeated.
+ switch (SrcOpcode) {
+ case ISD::MUL:
+ // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
+ // better to truncate if we have the chance.
+ if (SrcVT.getScalarType() == MVT::i64 &&
+ TLI.isOperationLegal(SrcOpcode, VT) &&
+ !TLI.isOperationLegal(SrcOpcode, SrcVT))
+ return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
+ LLVM_FALLTHROUGH;
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR:
+ case ISD::ADD:
+ case ISD::SUB: {
+ SDValue Op0 = Src.getOperand(0);
+ SDValue Op1 = Src.getOperand(1);
+ if (TLI.isOperationLegal(SrcOpcode, VT) &&
+ (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
+ return TruncateArithmetic(Op0, Op1);
+ break;
+ }
+ }
+
+ return SDValue();
+}
+
+/// Truncate using ISD::AND mask and X86ISD::PACKUS.
+/// e.g. trunc <8 x i32> X to <8 x i16> -->
+/// MaskX = X & 0xffff (clear high bits to prevent saturation)
+/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
+static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue In = N->getOperand(0);
+ EVT InVT = In.getValueType();
+ EVT OutVT = N->getValueType(0);
+
+ APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
+ OutVT.getScalarSizeInBits());
+ In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
+ return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
+}
+
+/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
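+/// The input is first sign-extended in-register from the narrower type so
+/// that PACKSS's signed saturation leaves the values unchanged.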
+static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue In = N->getOperand(0);
+ EVT InVT = In.getValueType();
+ EVT OutVT = N->getValueType(0);
+ In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
+ DAG.getValueType(OutVT));
+ return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
+}
+
+/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
+/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
+/// legalization the truncation will be translated into a BUILD_VECTOR whose
+/// elements are each extracted from a vector and then truncated, and it is
+/// difficult to perform this optimization in that form.
+static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ EVT OutVT = N->getValueType(0);
+ if (!OutVT.isVector())
+ return SDValue();
+
+ SDValue In = N->getOperand(0);
+ if (!In.getValueType().isSimple())
+ return SDValue();
+
+ EVT InVT = In.getValueType();
+ unsigned NumElems = OutVT.getVectorNumElements();
+
+ // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
+ // SSE2, and we need to take care of it specially.
+ // AVX512 provides vpmovdb.
+ if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
+ return SDValue();
+
+ EVT OutSVT = OutVT.getVectorElementType();
+ EVT InSVT = InVT.getVectorElementType();
+ if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
+ (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
+ NumElems >= 8))
+ return SDValue();
+
+ // SSSE3's pshufb results in fewer instructions in the cases below.
+ if (Subtarget.hasSSSE3() && NumElems == 8 &&
+ ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
+ (InSVT == MVT::i32 && OutSVT == MVT::i16)))
+ return SDValue();
+
+ SDLoc DL(N);
+ // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
+ // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
+ // truncate 2 x v4i32 to v8i16.
+ if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
+ return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
+ if (InSVT == MVT::i32)
+ return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
+
+ return SDValue();
+}
+
+/// This function transforms vector truncation of 'extended sign-bits' or
+/// 'extended zero-bits' values (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into
+/// X86ISD::PACKSS/PACKUS operations.
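+/// e.g. a vXi16 comparison result (all bits are sign bits) truncated to vXi8
+/// can be lowered directly to PACKSSWB without any masking.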
+static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Requires SSE2.
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
+ return SDValue();
+
+ SDValue In = N->getOperand(0);
+ if (!In.getValueType().isSimple())
+ return SDValue();
+
+ MVT VT = N->getValueType(0).getSimpleVT();
+ MVT SVT = VT.getScalarType();
+
+ MVT InVT = In.getValueType().getSimpleVT();
+ MVT InSVT = InVT.getScalarType();
+
+ // Check we have a truncation suited for PACKSS/PACKUS.
+ if (!isPowerOf2_32(VT.getVectorNumElements()))
+ return SDValue();
+ if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
+ return SDValue();
+ if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
+ return SDValue();
+
+ // Truncation to sub-128bit vXi32 can be better handled with shuffles.
+ if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
+ return SDValue();
+
+ // AVX512 has fast truncate, but if the input is already going to be split,
+ // there's no harm in trying pack.
+ if (Subtarget.hasAVX512() &&
+ !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
+ InVT.is512BitVector())) {
+ // PACK should still be worth it for 128-bit vectors if the sources were
+ // originally concatenated from subvectors.
+ SmallVector<SDValue> ConcatOps;
+ if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps))
+ return SDValue();
+ }
+
+ unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
+ unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
+
+ // Use PACKUS if the input has zero-bits that extend all the way to the
+ // packed/truncated value. e.g. masks, zext_in_reg, etc.
+ KnownBits Known = DAG.computeKnownBits(In);
+ unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
+ if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
+ return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
+
+ // Use PACKSS if the input has sign-bits that extend all the way to the
+ // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
+ unsigned NumSignBits = DAG.ComputeNumSignBits(In);
+
+ // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
+ // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
+ // on and combines/simplifications can't then use it.
+ if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
+ return SDValue();
+
+ unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
+ if (NumSignBits > MinSignBits)
+ return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
+
+ // If we have a srl that only generates signbits that we will discard in
+ // the truncation then we can use PACKSS by converting the srl to a sra.
+ // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
+ if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
+ if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
+ In, APInt::getAllOnesValue(VT.getVectorNumElements()))) {
+ if (*ShAmt == MinSignBits) {
+ SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
+ return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
+ Subtarget);
+ }
+ }
+
+ return SDValue();
+}
+
+// Try to form a MULHU or MULHS node by looking for
+// (trunc (srl (mul ext, ext), 16))
+// TODO: This is X86 specific because we want to be able to handle wide types
+// before type legalization. But we can only do it if the vector will be
+// legalized via widening/splitting. Type legalization can't handle promotion
+// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
+// combiner.
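+// e.g. (trunc (srl (mul (zext vXi16 X), (zext vXi16 Y)), 16)) --> (mulhu X, Y),
+// which can then select to PMULHUW (PMULHW for the sign-extended form).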
+static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+ // First instruction should be a right shift of a multiply.
+ if (Src.getOpcode() != ISD::SRL ||
+ Src.getOperand(0).getOpcode() != ISD::MUL)
+ return SDValue();
+
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ // Only handle vXi16 types that are at least 128-bits unless they will be
+ // widened.
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
+ return SDValue();
+
+ // Input type should be at least vXi32.
+ EVT InVT = Src.getValueType();
+ if (InVT.getVectorElementType().getSizeInBits() < 32)
+ return SDValue();
+
+ // Need a shift by 16.
+ APInt ShiftAmt;
+ if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
+ ShiftAmt != 16)
+ return SDValue();
+
+ SDValue LHS = Src.getOperand(0).getOperand(0);
+ SDValue RHS = Src.getOperand(0).getOperand(1);
+
+ unsigned ExtOpc = LHS.getOpcode();
+ if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
+ RHS.getOpcode() != ExtOpc)
+ return SDValue();
+
+ // Peek through the extends.
+ LHS = LHS.getOperand(0);
+ RHS = RHS.getOperand(0);
+
+ // Ensure the input types match.
+ if (LHS.getValueType() != VT || RHS.getValueType() != VT)
+ return SDValue();
+
+ unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
+ return DAG.getNode(Opc, DL, VT, LHS, RHS);
+}
+
+// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
+// from one vector with signed bytes from another vector, adds together
+// adjacent pairs of 16-bit products, and saturates the result before
+// truncating to 16 bits.
+//
+// Which looks something like this:
+// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
+// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
+static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ const SDLoc &DL) {
+ if (!VT.isVector() || !Subtarget.hasSSSE3())
+ return SDValue();
+
+ unsigned NumElems = VT.getVectorNumElements();
+ EVT ScalarVT = VT.getVectorElementType();
+ if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
+ return SDValue();
+
+ SDValue SSatVal = detectSSatPattern(In, VT);
+ if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
+ return SDValue();
+
+ // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
+ // of multiplies from even/odd elements.
+ SDValue N0 = SSatVal.getOperand(0);
+ SDValue N1 = SSatVal.getOperand(1);
+
+ if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
+ return SDValue();
+
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ SDValue N10 = N1.getOperand(0);
+ SDValue N11 = N1.getOperand(1);
+
+ // TODO: Handle constant vectors and use knownbits/computenumsignbits?
+ // Canonicalize zero_extend to LHS.
+ if (N01.getOpcode() == ISD::ZERO_EXTEND)
+ std::swap(N00, N01);
+ if (N11.getOpcode() == ISD::ZERO_EXTEND)
+ std::swap(N10, N11);
+
+ // Ensure we have a zero_extend and a sign_extend.
+ if (N00.getOpcode() != ISD::ZERO_EXTEND ||
+ N01.getOpcode() != ISD::SIGN_EXTEND ||
+ N10.getOpcode() != ISD::ZERO_EXTEND ||
+ N11.getOpcode() != ISD::SIGN_EXTEND)
+ return SDValue();
+
+ // Peek through the extends.
+ N00 = N00.getOperand(0);
+ N01 = N01.getOperand(0);
+ N10 = N10.getOperand(0);
+ N11 = N11.getOperand(0);
+
+ // Ensure the extend is from vXi8.
+ if (N00.getValueType().getVectorElementType() != MVT::i8 ||
+ N01.getValueType().getVectorElementType() != MVT::i8 ||
+ N10.getValueType().getVectorElementType() != MVT::i8 ||
+ N11.getValueType().getVectorElementType() != MVT::i8)
+ return SDValue();
+
+ // All inputs should be build_vectors.
+ if (N00.getOpcode() != ISD::BUILD_VECTOR ||
+ N01.getOpcode() != ISD::BUILD_VECTOR ||
+ N10.getOpcode() != ISD::BUILD_VECTOR ||
+ N11.getOpcode() != ISD::BUILD_VECTOR)
+ return SDValue();
+
+ // N00/N10 are zero extended. N01/N11 are sign extended.
+
+ // For each result element we need the even elements of the two inputs
+ // multiplied together and the odd elements of the two inputs multiplied
+ // together. That is, for each element i this operation is being performed:
+ // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
+ SDValue ZExtIn, SExtIn;
+ for (unsigned i = 0; i != NumElems; ++i) {
+ SDValue N00Elt = N00.getOperand(i);
+ SDValue N01Elt = N01.getOperand(i);
+ SDValue N10Elt = N10.getOperand(i);
+ SDValue N11Elt = N11.getOperand(i);
+ // TODO: Be more tolerant to undefs.
+ if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+ auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
+ auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
+ auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
+ auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
+ if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
+ return SDValue();
+ unsigned IdxN00 = ConstN00Elt->getZExtValue();
+ unsigned IdxN01 = ConstN01Elt->getZExtValue();
+ unsigned IdxN10 = ConstN10Elt->getZExtValue();
+ unsigned IdxN11 = ConstN11Elt->getZExtValue();
+ // Add is commutative so indices can be reordered.
+ if (IdxN00 > IdxN10) {
+ std::swap(IdxN00, IdxN10);
+ std::swap(IdxN01, IdxN11);
+ }
+ // N0 indices must be the even elements. N1 indices must be the next odd elements.
+ if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
+ IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
+ return SDValue();
+ SDValue N00In = N00Elt.getOperand(0);
+ SDValue N01In = N01Elt.getOperand(0);
+ SDValue N10In = N10Elt.getOperand(0);
+ SDValue N11In = N11Elt.getOperand(0);
+ // First time we find an input capture it.
+ if (!ZExtIn) {
+ ZExtIn = N00In;
+ SExtIn = N01In;
+ }
+ if (ZExtIn != N00In || SExtIn != N01In ||
+ ZExtIn != N10In || SExtIn != N11In)
+ return SDValue();
+ }
+
+ auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ // Shrink by adding truncate nodes and let DAGCombine fold with the
+ // sources.
+ EVT InVT = Ops[0].getValueType();
+ assert(InVT.getScalarType() == MVT::i8 &&
+ "Unexpected scalar element type");
+ assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
+ EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ InVT.getVectorNumElements() / 2);
+ return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
+ PMADDBuilder);
+}
+
+static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ SDLoc DL(N);
+
+ // Attempt to pre-truncate inputs to arithmetic ops instead.
+ if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
+ return V;
+
+ // Try to detect AVG pattern first.
+ if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
+ return Avg;
+
+ // Try to detect PMADD
+ if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
+ return PMAdd;
+
+ // Try to combine truncation with signed/unsigned saturation.
+ if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
+ return Val;
+
+ // Try to combine PMULHUW/PMULHW for vXi16.
+ if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
+ return V;
+
+ // Detect a truncation to i32 of a bitcast whose source is a direct mmx
+ // result and lower it to X86ISD::MMX_MOVD2W.
+ if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
+ SDValue BCSrc = Src.getOperand(0);
+ if (BCSrc.getValueType() == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
+ }
+
+ // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
+ if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
+ return V;
+
+ return combineVectorTruncation(N, DAG, Subtarget);
+}
+
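+/// Do target-specific dag combines on X86ISD::VTRUNC nodes.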
+static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ EVT VT = N->getValueType(0);
+ SDValue In = N->getOperand(0);
+ SDLoc DL(N);
+
+ if (auto SSatVal = detectSSatPattern(In, VT))
+ return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
+ if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
+ return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits()));
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+/// Returns the negated value if the node \p N flips sign of FP value.
+///
+/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
+/// or FSUB(0, x)
+/// AVX512F does not have FXOR, so FNEG is lowered as
+/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
+/// In this case we go through all bitcasts.
+/// This also recognizes splat of a negated value and returns the splat of that
+/// value.
+static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
+ if (N->getOpcode() == ISD::FNEG)
+ return N->getOperand(0);
+
+ // Don't recurse exponentially.
+ if (Depth > SelectionDAG::MaxRecursionDepth)
+ return SDValue();
+
+ unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
+
+ SDValue Op = peekThroughBitcasts(SDValue(N, 0));
+ EVT VT = Op->getValueType(0);
+
+ // Make sure the element size doesn't change.
+ if (VT.getScalarSizeInBits() != ScalarSize)
+ return SDValue();
+
+ unsigned Opc = Op.getOpcode();
+ switch (Opc) {
+ case ISD::VECTOR_SHUFFLE: {
+ // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
+ // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
+ if (!Op.getOperand(1).isUndef())
+ return SDValue();
+ if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
+ if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
+ return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
+ cast<ShuffleVectorSDNode>(Op)->getMask());
+ break;
+ }
+ case ISD::INSERT_VECTOR_ELT: {
+ // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
+ // -V, INDEX).
+ SDValue InsVector = Op.getOperand(0);
+ SDValue InsVal = Op.getOperand(1);
+ if (!InsVector.isUndef())
+ return SDValue();
+ if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
+ if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
+ NegInsVal, Op.getOperand(2));
+ break;
+ }
+ case ISD::FSUB:
+ case ISD::XOR:
+ case X86ISD::FXOR: {
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op0 = Op.getOperand(0);
+
+ // For XOR and FXOR, we want to check if constant bits of Op1 are sign bit
+ // masks. For FSUB, we have to check if constant bits of Op0 are sign bit
+ // masks and hence we swap the operands.
+ if (Opc == ISD::FSUB)
+ std::swap(Op0, Op1);
+
+ APInt UndefElts;
+ SmallVector<APInt, 16> EltBits;
+ // Extract constant bits and see if they are all sign bit masks. Ignore the
+ // undef elements.
+ if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
+ /* AllowWholeUndefs */ true,
+ /* AllowPartialUndefs */ false)) {
+ for (unsigned I = 0, E = EltBits.size(); I < E; I++)
+ if (!UndefElts[I] && !EltBits[I].isSignMask())
+ return SDValue();
+
+ return peekThroughBitcasts(Op0);
+ }
+ }
+ }
+
+ return SDValue();
+}
+
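+/// Return the FMA-family opcode obtained from \p Opcode by negating the
+/// product (\p NegMul), the accumulator (\p NegAcc), and/or the final result
+/// (\p NegRes).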
+static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
+ bool NegRes) {
+ if (NegMul) {
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode");
+ case ISD::FMA: Opcode = X86ISD::FNMADD; break;
+ case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
+ case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
+ case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
+ case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
+ case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FNMADD: Opcode = ISD::FMA; break;
+ case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
+ case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
+ case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
+ case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
+ case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
+ }
+ }
+
+ if (NegAcc) {
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode");
+ case ISD::FMA: Opcode = X86ISD::FMSUB; break;
+ case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
+ case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
+ case X86ISD::FMSUB: Opcode = ISD::FMA; break;
+ case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
+ case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
+ case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
+ case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
+ case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
+ case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
+ case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
+ case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
+ case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
+ case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
+ case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
+ }
+ }
+
+ if (NegRes) {
+ switch (Opcode) {
+ // For accuracy reasons, we never combine fneg and fma under strict FP.
+ default: llvm_unreachable("Unexpected opcode");
+ case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
+ case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
+ case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
+ case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
+ case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
+ case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
+ case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
+ }
+ }
+
+ return Opcode;
+}
+
+/// Do target-specific dag combines on floating point negations.
+static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ EVT OrigVT = N->getValueType(0);
+ SDValue Arg = isFNEG(DAG, N);
+ if (!Arg)
+ return SDValue();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT VT = Arg.getValueType();
+ EVT SVT = VT.getScalarType();
+ SDLoc DL(N);
+
+ // Let legalize expand this if it isn't a legal type yet.
+ if (!TLI.isTypeLegal(VT))
+ return SDValue();
+
+ // If we're negating a FMUL node on a target with FMA, then we can avoid the
+ // use of a constant by performing (-0 - A*B) instead.
+ // FIXME: Check rounding control flags as well once it becomes available.
+ if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
+ Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
+ SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
+ SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
+ Arg.getOperand(1), Zero);
+ return DAG.getBitcast(OrigVT, NewNode);
+ }
+
+ bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool LegalOperations = !DCI.isBeforeLegalizeOps();
+ if (SDValue NegArg =
+ TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
+ return DAG.getBitcast(OrigVT, NegArg);
+
+ return SDValue();
+}
+
+SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
+ bool LegalOperations,
+ bool ForCodeSize,
+ NegatibleCost &Cost,
+ unsigned Depth) const {
+ // fneg patterns are removable even if they have multiple uses.
+ if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
+ Cost = NegatibleCost::Cheaper;
+ return DAG.getBitcast(Op.getValueType(), Arg);
+ }
+
+ EVT VT = Op.getValueType();
+ EVT SVT = VT.getScalarType();
+ unsigned Opc = Op.getOpcode();
+ switch (Opc) {
+ case ISD::FMA:
+ case X86ISD::FMSUB:
+ case X86ISD::FNMADD:
+ case X86ISD::FNMSUB:
+ case X86ISD::FMADD_RND:
+ case X86ISD::FMSUB_RND:
+ case X86ISD::FNMADD_RND:
+ case X86ISD::FNMSUB_RND: {
+ if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
+ !(SVT == MVT::f32 || SVT == MVT::f64) ||
+ !isOperationLegal(ISD::FMA, VT))
+ break;
+
+ // This is always negatible for free but we might be able to remove some
+ // extra operand negations as well.
+ SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
+ for (int i = 0; i != 3; ++i)
+ NewOps[i] = getCheaperNegatedExpression(
+ Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
+
+ bool NegA = !!NewOps[0];
+ bool NegB = !!NewOps[1];
+ bool NegC = !!NewOps[2];
+ unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
+
+ Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
+ : NegatibleCost::Neutral;
+
+ // Fill in the non-negated ops with the original values.
+ for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
+ if (!NewOps[i])
+ NewOps[i] = Op.getOperand(i);
+ return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
+ }
+ case X86ISD::FRCP:
+ if (SDValue NegOp0 =
+ getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
+ ForCodeSize, Cost, Depth + 1))
+ return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
+ break;
+ }
+
+ return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
+ ForCodeSize, Cost, Depth);
+}
+
+static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT VT = N->getSimpleValueType(0);
+ // If we have integer vector types available, use the integer opcodes.
+ if (!VT.isVector() || !Subtarget.hasSSE2())
+ return SDValue();
+
+ SDLoc dl(N);
+
+ unsigned IntBits = VT.getScalarSizeInBits();
+ MVT IntSVT = MVT::getIntegerVT(IntBits);
+ MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
+
+ SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
+ SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
+ unsigned IntOpcode;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected FP logic op");
+ case X86ISD::FOR: IntOpcode = ISD::OR; break;
+ case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
+ case X86ISD::FAND: IntOpcode = ISD::AND; break;
+ case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
+ }
+ SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
+ return DAG.getBitcast(VT, IntOp);
+}
+
+
+/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
+static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
+ if (N->getOpcode() != ISD::XOR)
+ return SDValue();
+
+ SDValue LHS = N->getOperand(0);
+ if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
+ return SDValue();
+
+ X86::CondCode NewCC = X86::GetOppositeBranchCondition(
+ X86::CondCode(LHS->getConstantOperandVal(0)));
+ SDLoc DL(N);
+ return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
+}
+
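+/// Do target-specific dag combines on ISD::XOR nodes.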
+static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ // If this is SSE1 only convert to FXOR to avoid scalarization.
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
+ N->getValueType(0) == MVT::v4i32) {
+ return DAG.getBitcast(
+ MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
+ DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
+ DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
+ }
+
+ if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
+ return Cmp;
+
+ if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
+ return R;
+
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ if (SDValue SetCC = foldXor1SetCC(N, DAG))
+ return SetCC;
+
+ if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
+ return RV;
+
+ if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+ return FPLogic;
+
+ return combineFneg(N, DAG, DCI, Subtarget);
+}
+
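+/// Do target-specific dag combines on X86ISD::BEXTR nodes by simplifying
+/// their inputs with demanded-bits analysis.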
+static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ unsigned NumBits = VT.getSizeInBits();
+
+ // TODO - Constant Folding.
+
+ // Simplify the inputs.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedMask(APInt::getAllOnesValue(NumBits));
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+static bool isNullFPScalarOrVectorConst(SDValue V) {
+ return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
+}
+
+/// If a value is a scalar FP zero or a vector FP zero (potentially including
+/// undefined elements), return a zero constant that may be used to fold away
+/// that value. In the case of a vector, the returned constant will not contain
+/// undefined elements even if the input parameter does. This makes it suitable
+/// to be used as a replacement operand with operations (e.g., bitwise-and) where
+/// an undef should not propagate.
+static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (!isNullFPScalarOrVectorConst(V))
+ return SDValue();
+
+ if (V.getValueType().isVector())
+ return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
+
+ return V;
+}
+
+static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
+ if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
+ (VT == MVT::f64 && Subtarget.hasSSE2()) ||
+ (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
+ return SDValue();
+
+ auto isAllOnesConstantFP = [](SDValue V) {
+ if (V.getSimpleValueType().isVector())
+ return ISD::isBuildVectorAllOnes(V.getNode());
+ auto *C = dyn_cast<ConstantFPSDNode>(V);
+ return C && C->getConstantFPValue()->isAllOnesValue();
+ };
+
+ // fand (fxor X, -1), Y --> fandn X, Y
+ if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
+ return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
+
+ // fand X, (fxor Y, -1) --> fandn Y, X
+ if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
+ return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
+
+ return SDValue();
+}
+
+/// Do target-specific dag combines on X86ISD::FAND nodes.
+static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // FAND(0.0, x) -> 0.0
+ if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
+ return V;
+
+ // FAND(x, 0.0) -> 0.0
+ if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
+ return V;
+
+ if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
+ return V;
+
+ return lowerX86FPLogicOp(N, DAG, Subtarget);
+}
+
+/// Do target-specific dag combines on X86ISD::FANDN nodes.
+static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // FANDN(0.0, x) -> x
+ if (isNullFPScalarOrVectorConst(N->getOperand(0)))
+ return N->getOperand(1);
+
+ // FANDN(x, 0.0) -> 0.0
+ if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
+ return V;
+
+ return lowerX86FPLogicOp(N, DAG, Subtarget);
+}
+
+/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
+static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
+
+ // F[X]OR(0.0, x) -> x
+ if (isNullFPScalarOrVectorConst(N->getOperand(0)))
+ return N->getOperand(1);
+
+ // F[X]OR(x, 0.0) -> x
+ if (isNullFPScalarOrVectorConst(N->getOperand(1)))
+ return N->getOperand(0);
+
+ if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
+ return NewVal;
+
+ return lowerX86FPLogicOp(N, DAG, Subtarget);
+}
+
+/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
+static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
+ assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
+
+ // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
+ if (!DAG.getTarget().Options.NoNaNsFPMath ||
+ !DAG.getTarget().Options.NoSignedZerosFPMath)
+ return SDValue();
+
+ // If NaNs and signed zeros can be ignored, convert the FMAX and FMIN nodes
+ // into FMAXC and FMINC, which are commutative operations.
+ unsigned NewOp = 0;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("unknown opcode");
+ case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
+ case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
+ }
+
+ return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
+ N->getOperand(0), N->getOperand(1));
+}
+
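+/// Do target-specific dag combines on ISD::FMINNUM and ISD::FMAXNUM nodes.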
+static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (Subtarget.useSoftFloat())
+ return SDValue();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ EVT VT = N->getValueType(0);
+ if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
+ (Subtarget.hasSSE2() && VT == MVT::f64) ||
+ (VT.isVector() && TLI.isTypeLegal(VT))))
+ return SDValue();
+
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ SDLoc DL(N);
+ auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
+
+ // If we don't have to respect NaN inputs, this is a direct translation to x86
+ // min/max instructions.
+ if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
+ return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
+
+ // If one of the operands is known non-NaN use the native min/max instructions
+ // with the non-NaN input as second operand.
+ if (DAG.isKnownNeverNaN(Op1))
+ return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
+ if (DAG.isKnownNeverNaN(Op0))
+ return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
+
+ // If we have to respect NaN inputs, this takes at least 3 instructions.
+ // Favor a library call when operating on a scalar and minimizing code size.
+ if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
+ return SDValue();
+
+ EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+ VT);
+
+ // There are 4 possibilities involving NaN inputs, and these are the required
+ // outputs:
+ // Op1
+ // Num NaN
+ // ----------------
+ // Num | Max | Op0 |
+ // Op0 ----------------
+ // NaN | Op1 | NaN |
+ // ----------------
+ //
+ // The SSE FP max/min instructions were not designed for this case, but rather
+ // to implement:
+ // Min = Op1 < Op0 ? Op1 : Op0
+ // Max = Op1 > Op0 ? Op1 : Op0
+ //
+ // So they always return Op0 if either input is a NaN. However, we can still
+ // use those instructions for fmaxnum by selecting away a NaN input.
+
+ // If either operand is NaN, the 2nd source operand (Op0) is passed through.
+ SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
+ SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
+
+ // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
+ // are NaN, the NaN value of Op1 is the result.
+ return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
+}
+
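+/// Do target-specific dag combines on X86 vector integer-to-FP conversion
+/// nodes.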
+static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ EVT VT = N->getValueType(0);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ APInt KnownUndef, KnownZero;
+ APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+ if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
+ KnownZero, DCI))
+ return SDValue(N, 0);
+
+ // Convert a full vector load into vzload when not all bits are needed.
+ SDValue In = N->getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+ if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
+ ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
+ assert(InVT.is128BitVector() && "Expected 128-bit input vector");
+ LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
+ unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
+ MVT MemVT = MVT::getIntegerVT(NumBits);
+ MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
+ if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
+ SDLoc dl(N);
+ SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
+ DAG.getBitcast(InVT, VZLoad));
+ DCI.CombineTo(N, Convert);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return SDValue(N, 0);
+ }
+ }
+
+ return SDValue();
+}
+
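+/// Do target-specific dag combines on X86 vector FP-to-integer conversion
+/// nodes (CVT[T]P2I), including their strict variants.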
+static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ bool IsStrict = N->isTargetStrictFPOpcode();
+ EVT VT = N->getValueType(0);
+
+ // Convert a full vector load into vzload when not all bits are needed.
+ SDValue In = N->getOperand(IsStrict ? 1 : 0);
+ MVT InVT = In.getSimpleValueType();
+ if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
+ ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
+ assert(InVT.is128BitVector() && "Expected 128-bit input vector");
+ LoadSDNode *LN = cast<LoadSDNode>(In);
+ unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
+ MVT MemVT = MVT::getFloatingPointVT(NumBits);
+ MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
+ if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
+ SDLoc dl(N);
+ if (IsStrict) {
+ SDValue Convert =
+ DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
+ {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
+ DCI.CombineTo(N, Convert, Convert.getValue(1));
+ } else {
+ SDValue Convert =
+ DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
+ DCI.CombineTo(N, Convert);
+ }
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return SDValue(N, 0);
+ }
+ }
+
+ return SDValue();
+}
+
+/// Do target-specific dag combines on X86ISD::ANDNP nodes.
+static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ MVT VT = N->getSimpleValueType(0);
+
+ // ANDNP(0, x) -> x
+ if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
+ return N->getOperand(1);
+
+ // ANDNP(x, 0) -> 0
+ if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
+ return DAG.getConstant(0, SDLoc(N), VT);
+
+ // Turn ANDNP back to AND if input is inverted.
+ if (SDValue Not = IsNOT(N->getOperand(0), DAG))
+ return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),
+ N->getOperand(1));
+
+ // Attempt to recursively combine a bitmask ANDNP with shuffles.
+ if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
+ SDValue Op(N, 0);
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
+ return Res;
+ }
+
+ return SDValue();
+}
+
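+/// Do target-specific dag combines on X86ISD::BT nodes. BT only uses the low
+/// log2(width) bits of the bit index, so the remaining bits can be simplified.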
+static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SDValue N1 = N->getOperand(1);
+
+ // BT ignores high bits in the bit index operand.
+ unsigned BitWidth = N1.getValueSizeInBits();
+ APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
+ if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+
+ return SDValue();
+}
+
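+/// Do target-specific dag combines on CVTPH2PS and STRICT_CVTPH2PS nodes.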
+static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
+
+ if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
+ APInt KnownUndef, KnownZero;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedElts = APInt::getLowBitsSet(8, 4);
+ if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
+ DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+
+ // Convert a full vector load into vzload when not all bits are needed.
+ if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
+ LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
+ if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
+ SDLoc dl(N);
+ if (IsStrict) {
+ SDValue Convert = DAG.getNode(
+ N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
+ {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
+ DCI.CombineTo(N, Convert, Convert.getValue(1));
+ } else {
+ SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
+ DAG.getBitcast(MVT::v8i16, VZLoad));
+ DCI.CombineTo(N, Convert);
+ }
+
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return SDValue(N, 0);
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+// Try to combine sext_in_reg of a cmov of constants by extending the constants.
+static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
+ assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
+
+ EVT DstVT = N->getValueType(0);
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
+
+ if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
+ return SDValue();
+
+ // Look through single use any_extends / truncs.
+ SDValue IntermediateBitwidthOp;
+ if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
+ N0.hasOneUse()) {
+ IntermediateBitwidthOp = N0;
+ N0 = N0.getOperand(0);
+ }
+
+ // See if we have a single use cmov.
+ if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
+ return SDValue();
+
+ SDValue CMovOp0 = N0.getOperand(0);
+ SDValue CMovOp1 = N0.getOperand(1);
+
+ // Make sure both operands are constants.
+ if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
+ !isa<ConstantSDNode>(CMovOp1.getNode()))
+ return SDValue();
+
+ SDLoc DL(N);
+
+ // If we looked through an any_extend/trunc above, apply the same op to the constants.
+ if (IntermediateBitwidthOp) {
+ unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
+ CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
+ CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
+ }
+
+ CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
+ CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
+
+ EVT CMovVT = DstVT;
+ // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
+ if (DstVT == MVT::i16) {
+ CMovVT = MVT::i32;
+ CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
+ CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
+ }
+
+ SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
+ N0.getOperand(2), N0.getOperand(3));
+
+ if (CMovVT != DstVT)
+ CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
+
+ return CMov;
+}
+
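+/// Do target-specific dag combines on ISD::SIGN_EXTEND_INREG nodes.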
+static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
+
+ if (SDValue V = combineSextInRegCmov(N, DAG))
+ return V;
+
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
+ SDLoc dl(N);
+
+ // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and AVX2
+ // since there is no sign-extended shift right operation on a vector with
+ // 64-bit elements.
+ // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
+ //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
+ if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
+ N0.getOpcode() == ISD::SIGN_EXTEND)) {
+ SDValue N00 = N0.getOperand(0);
+
+ // EXTLOAD has a better solution on AVX2: it may be replaced with an
+ // X86ISD::VSEXT node.
+ if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
+ if (!ISD::isNormalLoad(N00.getNode()))
+ return SDValue();
+
+ // Attempt to promote any comparison mask ops before the SIGN_EXTEND_INREG
+ // gets in the way.
+ if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
+
+ if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
+ SDValue Tmp =
+ DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
+ return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
+ }
+ }
+ return SDValue();
+}
+
+/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
+/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
+/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
+/// opportunities to combine math ops, use an LEA, or use a complex addressing
+/// mode. This can eliminate extend, add, and shift instructions.
+static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
+ Ext->getOpcode() != ISD::ZERO_EXTEND)
+ return SDValue();
+
+ // TODO: This should be valid for other integer types.
+ EVT VT = Ext->getValueType(0);
+ if (VT != MVT::i64)
+ return SDValue();
+
+ SDValue Add = Ext->getOperand(0);
+ if (Add.getOpcode() != ISD::ADD)
+ return SDValue();
+
+ bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
+ bool NSW = Add->getFlags().hasNoSignedWrap();
+ bool NUW = Add->getFlags().hasNoUnsignedWrap();
+
+ // We need an 'add nsw' feeding into the 'sext' or an 'add nuw' feeding into
+ // the 'zext'.
+ if ((Sext && !NSW) || (!Sext && !NUW))
+ return SDValue();
+
+ // Having a constant operand to the 'add' ensures that we are not increasing
+ // the instruction count because the constant is extended for free below.
+ // A constant operand can also become the displacement field of an LEA.
+ auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
+ if (!AddOp1)
+ return SDValue();
+
+ // Don't make the 'add' bigger if there's no hope of combining it with some
+ // other 'add' or 'shl' instruction.
+ // TODO: It may be profitable to generate simpler LEA instructions in place
+ // of single 'add' instructions, but the cost model for selecting an LEA
+ // currently has a high threshold.
+ bool HasLEAPotential = false;
+ for (auto *User : Ext->uses()) {
+ if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
+ HasLEAPotential = true;
+ break;
+ }
+ }
+ if (!HasLEAPotential)
+ return SDValue();
+
+ // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
+ int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
+ SDValue AddOp0 = Add.getOperand(0);
+ SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
+ SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
+
+ // The wider add is guaranteed to not wrap because both operands are
+ // sign-extended.
+ SDNodeFlags Flags;
+ Flags.setNoSignedWrap(NSW);
+ Flags.setNoUnsignedWrap(NUW);
+ return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
+}
+
+// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
+// operands and the result of CMOV is not used anywhere else - promote CMOV
+// itself instead of promoting its result. This could be beneficial, because:
+// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
+// (or more) pseudo-CMOVs only when they go one-after-another and
+// getting rid of result extension code after CMOV will help that.
+// 2) Promotion of constant CMOV arguments is free, hence the
+// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
+// 3) The 16-bit CMOV encoding is 4 bytes and the 32-bit CMOV is 3 bytes, so
+// this promotion is also good in terms of code size.
+// (The 64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
+// promotion.)
+static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
+ SDValue CMovN = Extend->getOperand(0);
+ if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
+ return SDValue();
+
+ EVT TargetVT = Extend->getValueType(0);
+ unsigned ExtendOpcode = Extend->getOpcode();
+ SDLoc DL(Extend);
+
+ EVT VT = CMovN.getValueType();
+ SDValue CMovOp0 = CMovN.getOperand(0);
+ SDValue CMovOp1 = CMovN.getOperand(1);
+
+ if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
+ !isa<ConstantSDNode>(CMovOp1.getNode()))
+ return SDValue();
+
+ // Only extend to i32 or i64.
+ if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
+ return SDValue();
+
+ // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from
+ // i32 are free.
+ if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
+ return SDValue();
+
+ // If this is a zero extend to i64, we should only extend to i32 and use a
+ // free zero extend to finish.
+ EVT ExtendVT = TargetVT;
+ if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
+ ExtendVT = MVT::i32;
+
+ CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
+ CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
+
+ SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
+ CMovN.getOperand(2), CMovN.getOperand(3));
+
+ // Finish extending if needed.
+ if (ExtendVT != TargetVT)
+ Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
+
+ return Res;
+}
+
+// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
+// This is more or less the reverse of combineBitcastvxi1.
+static SDValue
+combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ unsigned Opcode = N->getOpcode();
+ if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
+ Opcode != ISD::ANY_EXTEND)
+ return SDValue();
+ if (!DCI.isBeforeLegalizeOps())
+ return SDValue();
+ if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT SVT = VT.getScalarType();
+ EVT InSVT = N0.getValueType().getScalarType();
+ unsigned EltSizeInBits = SVT.getSizeInBits();
+
+ // Input type must be extending a bool vector (bit-casted from a scalar
+ // integer) to legal integer types.
+ if (!VT.isVector())
+ return SDValue();
+ if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
+ return SDValue();
+ if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
+ return SDValue();
+
+ SDValue N00 = N0.getOperand(0);
+ EVT SclVT = N0.getOperand(0).getValueType();
+ if (!SclVT.isScalarInteger())
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue Vec;
+ SmallVector<int, 32> ShuffleMask;
+ unsigned NumElts = VT.getVectorNumElements();
+ assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
+
+ // Broadcast the scalar integer to the vector elements.
+ if (NumElts > EltSizeInBits) {
+ // If the scalar integer is greater than the vector element size, then we
+ // must split it down into sub-sections for broadcasting. For example:
+ // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
+ // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
+ assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
+ unsigned Scale = NumElts / EltSizeInBits;
+ EVT BroadcastVT =
+ EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
+ Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
+ Vec = DAG.getBitcast(VT, Vec);
+
+ for (unsigned i = 0; i != Scale; ++i)
+ ShuffleMask.append(EltSizeInBits, i);
+ Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
+ } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
+ (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
+ // If we have register broadcast instructions, use the scalar size as the
+ // element type for the shuffle. Then cast to the wider element type. The
+ // widened bits won't be used, and this might allow the use of a broadcast
+ // load.
+ assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
+ unsigned Scale = EltSizeInBits / NumElts;
+ EVT BroadcastVT =
+ EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
+ Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
+ ShuffleMask.append(NumElts * Scale, 0);
+ Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
+ Vec = DAG.getBitcast(VT, Vec);
+ } else {
+ // For a smaller scalar integer, we can simply any-extend it to the vector
+ // element size (we don't care about the upper bits) and broadcast it to all
+ // elements.
+ SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
+ Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
+ ShuffleMask.append(NumElts, 0);
+ Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
+ }
+
+ // Now, mask the relevant bit in each element.
+ SmallVector<SDValue, 32> Bits;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int BitIdx = (i % EltSizeInBits);
+ APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
+ Bits.push_back(DAG.getConstant(Bit, DL, SVT));
+ }
+ SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
+ Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
+
+ // Compare against the bitmask and extend the result.
+ EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
+ Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
+ Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
+
+ // For SEXT we are now done; otherwise, shift the result down for
+ // zero-extension.
+ if (Opcode == ISD::SIGN_EXTEND)
+ return Vec;
+ return DAG.getNode(ISD::SRL, DL, VT, Vec,
+ DAG.getConstant(EltSizeInBits - 1, DL, VT));
+}
+
+// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
+// result type.
+static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+
+ // Only do this combine with AVX512 for vector extends.
+ if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
+ return SDValue();
+
+ // Only combine legal element types.
+ EVT SVT = VT.getVectorElementType();
+ if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
+ SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
+ return SDValue();
+
+ // We can only do this if the vector size is 256 bits or less.
+ unsigned Size = VT.getSizeInBits();
+ if (Size > 256 && Subtarget.useAVX512Regs())
+ return SDValue();
+
+ // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
+ // those are the only integer compares we have.
+ ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
+ if (ISD::isUnsignedIntSetCC(CC))
+ return SDValue();
+
+ // Only do this combine if the extension will be fully consumed by the setcc.
+ EVT N00VT = N0.getOperand(0).getValueType();
+ EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
+ if (Size != MatchingVecType.getSizeInBits())
+ return SDValue();
+
+ SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
+
+ if (N->getOpcode() == ISD::ZERO_EXTEND)
+ Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
+
+ return Res;
+}
+
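+/// Do target-specific dag combines on ISD::SIGN_EXTEND nodes.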
+static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
+ if (!DCI.isBeforeLegalizeOps() &&
+ N0.getOpcode() == X86ISD::SETCC_CARRY) {
+ SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
+ N0->getOperand(1));
+ bool ReplaceOtherUses = !N0.hasOneUse();
+ DCI.CombineTo(N, Setcc);
+ // Replace other uses with a truncate of the widened setcc_carry.
+ if (ReplaceOtherUses) {
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
+ N0.getValueType(), Setcc);
+ DCI.CombineTo(N0.getNode(), Trunc);
+ }
+
+ return SDValue(N, 0);
+ }
+
+ if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
+ return NewCMov;
+
+ if (!DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
+ return V;
+
+ if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
+ return V;
+
+ if (VT.isVector()) {
+ if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
+ return R;
+
+ if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
+ return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
+ }
+
+ if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
+ return NewAdd;
+
+ return SDValue();
+}
+
+static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
+
+ // Let legalize expand this if it isn't a legal type yet.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isTypeLegal(VT))
+ return SDValue();
+
+ SDValue A = N->getOperand(IsStrict ? 1 : 0);
+ SDValue B = N->getOperand(IsStrict ? 2 : 1);
+ SDValue C = N->getOperand(IsStrict ? 3 : 2);
+
+ // If the operation allows fast-math and the target does not support FMA,
+ // split this into mul+add to avoid libcall(s).
+ SDNodeFlags Flags = N->getFlags();
+ if (!IsStrict && Flags.hasAllowReassociation() &&
+ TLI.isOperationExpand(ISD::FMA, VT)) {
+ SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
+ return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
+ }
+
+ EVT ScalarVT = VT.getScalarType();
+ if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
+ return SDValue();
+
+ auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
+ bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool LegalOperations = !DCI.isBeforeLegalizeOps();
+ if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
+ CodeSize)) {
+ V = NegV;
+ return true;
+ }
+ // Look through extract_vector_elts. If it comes from an FNEG, create a
+ // new extract from the FNEG input.
+ if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ isNullConstant(V.getOperand(1))) {
+ SDValue Vec = V.getOperand(0);
+ if (SDValue NegV = TLI.getCheaperNegatedExpression(
+ Vec, DAG, LegalOperations, CodeSize)) {
+ V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
+ NegV, V.getOperand(1));
+ return true;
+ }
+ }
+
+ return false;
+ };
+
+ // Do not convert the passthru input of scalar intrinsics.
+ // FIXME: We could allow negations of the lower element only.
+ bool NegA = invertIfNegative(A);
+ bool NegB = invertIfNegative(B);
+ bool NegC = invertIfNegative(C);
+
+ if (!NegA && !NegB && !NegC)
+ return SDValue();
+
+ unsigned NewOpcode =
+ negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
+
+ if (IsStrict) {
+ assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
+ return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
+ {N->getOperand(0), A, B, C});
+ } else {
+ if (N->getNumOperands() == 4)
+ return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
+ return DAG.getNode(NewOpcode, dl, VT, A, B, C);
+ }
+}
+
+// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
+// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
+static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool LegalOperations = !DCI.isBeforeLegalizeOps();
+
+ SDValue N2 = N->getOperand(2);
+
+ SDValue NegN2 =
+ TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
+ if (!NegN2)
+ return SDValue();
+ unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
+
+ if (N->getNumOperands() == 4)
+ return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
+ NegN2, N->getOperand(3));
+ return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
+ NegN2);
+}
+
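+/// Do target-specific dag combines on ISD::ZERO_EXTEND and ISD::ANY_EXTEND
+/// nodes.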
+static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDLoc dl(N);
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
+ // FIXME: Is this needed? We don't seem to have any tests for it.
+ if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
+ N0.getOpcode() == X86ISD::SETCC_CARRY) {
+ SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
+ N0->getOperand(1));
+ bool ReplaceOtherUses = !N0.hasOneUse();
+ DCI.CombineTo(N, Setcc);
+ // Replace other uses with a truncate of the widened setcc_carry.
+ if (ReplaceOtherUses) {
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
+ N0.getValueType(), Setcc);
+ DCI.CombineTo(N0.getNode(), Trunc);
+ }
+
+ return SDValue(N, 0);
+ }
+
+ if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
+ return NewCMov;
+
+ if (DCI.isBeforeLegalizeOps())
+ if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
+ return V;
+
+ if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
+ return V;
+
+ if (VT.isVector())
+ if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
+ return R;
+
+ if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
+ return NewAdd;
+
+ if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
+ return R;
+
+ // TODO: Combine with any target/faux shuffle.
+ if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
+ VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
+ APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
+ if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
+ (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
+ return concatSubVectors(N00, N01, DAG, dl);
+ }
+ }
+
+ return SDValue();
+}
+
+/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
+/// recognizable memcmp expansion.
+static bool isOrXorXorTree(SDValue X, bool Root = true) {
+ if (X.getOpcode() == ISD::OR)
+ return isOrXorXorTree(X.getOperand(0), false) &&
+ isOrXorXorTree(X.getOperand(1), false);
+ if (Root)
+ return false;
+ return X.getOpcode() == ISD::XOR;
+}
+
+/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
+/// expansion.
+template<typename F>
+static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
+ EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
+ SDValue Op0 = X.getOperand(0);
+ SDValue Op1 = X.getOperand(1);
+ if (X.getOpcode() == ISD::OR) {
+ SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
+ SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
+ if (VecVT != CmpVT)
+ return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
+ if (HasPT)
+ return DAG.getNode(ISD::OR, DL, VecVT, A, B);
+ return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
+ } else if (X.getOpcode() == ISD::XOR) {
+ SDValue A = SToV(Op0);
+ SDValue B = SToV(Op1);
+ if (VecVT != CmpVT)
+ return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
+ if (HasPT)
+ return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
+ return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
+ }
+ llvm_unreachable("Impossible");
+}
+
+/// Try to map a 128-bit or larger integer comparison to vector instructions
+/// before type legalization splits it up into chunks.
+static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
+ assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
+
+ // We're looking for an oversized integer equality comparison.
+ SDValue X = SetCC->getOperand(0);
+ SDValue Y = SetCC->getOperand(1);
+ EVT OpVT = X.getValueType();
+ unsigned OpSize = OpVT.getSizeInBits();
+ if (!OpVT.isScalarInteger() || OpSize < 128)
+ return SDValue();
+
+ // Ignore a comparison with zero because that gets special treatment in
+ // EmitTest(). But make an exception for the special case of a pair of
+ // logically-combined vector-sized operands compared to zero. This pattern may
+ // be generated by the memcmp expansion pass with oversized integer compares
+ // (see PR33325).
+ bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
+ if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
+ return SDValue();
+
+ // Don't perform this combine if constructing the vector will be expensive.
+ auto IsVectorBitCastCheap = [](SDValue X) {
+ X = peekThroughBitcasts(X);
+ return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
+ X.getOpcode() == ISD::LOAD;
+ };
+ if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
+ !IsOrXorXorTreeCCZero)
+ return SDValue();
+
+ EVT VT = SetCC->getValueType(0);
+ SDLoc DL(SetCC);
+
+ // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
+ // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
+ // Otherwise use PCMPEQ (plus AND) and mask testing.
+ if ((OpSize == 128 && Subtarget.hasSSE2()) ||
+ (OpSize == 256 && Subtarget.hasAVX()) ||
+ (OpSize == 512 && Subtarget.useAVX512Regs())) {
+ bool HasPT = Subtarget.hasSSE41();
+
+ // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
+ // vector registers are essentially free. (Technically, widening registers
+ // prevents load folding, but the tradeoff is worth it.)
+ bool PreferKOT = Subtarget.preferMaskRegisters();
+ bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
+
+ EVT VecVT = MVT::v16i8;
+ EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
+ if (OpSize == 256) {
+ VecVT = MVT::v32i8;
+ CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
+ }
+ EVT CastVT = VecVT;
+ bool NeedsAVX512FCast = false;
+ if (OpSize == 512 || NeedZExt) {
+ if (Subtarget.hasBWI()) {
+ VecVT = MVT::v64i8;
+ CmpVT = MVT::v64i1;
+ if (OpSize == 512)
+ CastVT = VecVT;
+ } else {
+ VecVT = MVT::v16i32;
+ CmpVT = MVT::v16i1;
+ CastVT = OpSize == 512 ? VecVT :
+ OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
+ NeedsAVX512FCast = true;
+ }
+ }
+
+ auto ScalarToVector = [&](SDValue X) -> SDValue {
+ bool TmpZext = false;
+ EVT TmpCastVT = CastVT;
+ if (X.getOpcode() == ISD::ZERO_EXTEND) {
+ SDValue OrigX = X.getOperand(0);
+ unsigned OrigSize = OrigX.getScalarValueSizeInBits();
+ if (OrigSize < OpSize) {
+ if (OrigSize == 128) {
+ TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
+ X = OrigX;
+ TmpZext = true;
+ } else if (OrigSize == 256) {
+ TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
+ X = OrigX;
+ TmpZext = true;
+ }
+ }
+ }
+ X = DAG.getBitcast(TmpCastVT, X);
+ if (!NeedZExt && !TmpZext)
+ return X;
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
+ DAG.getConstant(0, DL, VecVT), X,
+ DAG.getVectorIdxConstant(0, DL));
+ };
+
+ SDValue Cmp;
+ if (IsOrXorXorTreeCCZero) {
+ // This is a bitwise-combined equality comparison of 2 pairs of vectors:
+ // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
+ // Use 2 vector equality compares and 'and' the results before doing a
+ // MOVMSK.
+ Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
+ } else {
+ SDValue VecX = ScalarToVector(X);
+ SDValue VecY = ScalarToVector(Y);
+ if (VecVT != CmpVT) {
+ Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
+ } else if (HasPT) {
+ Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
+ } else {
+ Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
+ }
+ }
+ // AVX512 should emit a setcc that will lower to kortest.
+ if (VecVT != CmpVT) {
+ EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
+ CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
+ return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
+ DAG.getConstant(0, DL, KRegVT), CC);
+ }
+ if (HasPT) {
+ SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
+ Cmp);
+ SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
+ X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
+ SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
+ }
+ // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
+ // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
+ // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
+ assert(Cmp.getValueType() == MVT::v16i8 &&
+ "Non 128-bit vector on pre-SSE41 target");
+ SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
+ SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
+ return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
+ }
+
+ return SDValue();
+}
+
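+/// Do target-specific dag combines on ISD::SETCC nodes.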
+static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ const SDValue LHS = N->getOperand(0);
+ const SDValue RHS = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ EVT OpVT = LHS.getValueType();
+ SDLoc DL(N);
+
+ if (CC == ISD::SETNE || CC == ISD::SETEQ) {
+ if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
+ return V;
+
+ if (VT == MVT::i1 && isNullConstant(RHS)) {
+ SDValue X86CC;
+ if (SDValue V =
+ MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC))
+ return DAG.getNode(ISD::TRUNCATE, DL, VT,
+ DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
+ }
+ }
+
+ if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+ (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
+ // Using temporaries to avoid messing up operand ordering for later
+ // transformations if this doesn't work.
+ SDValue Op0 = LHS;
+ SDValue Op1 = RHS;
+ ISD::CondCode TmpCC = CC;
+ // Put build_vector on the right.
+ if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
+ std::swap(Op0, Op1);
+ TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
+ }
+
+ bool IsSEXT0 =
+ (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
+ (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
+ bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
+
+ if (IsSEXT0 && IsVZero1) {
+ assert(VT == Op0.getOperand(0).getValueType() &&
+ "Unexpected operand type");
+ if (TmpCC == ISD::SETGT)
+ return DAG.getConstant(0, DL, VT);
+ if (TmpCC == ISD::SETLE)
+ return DAG.getConstant(1, DL, VT);
+ if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
+ return DAG.getNOT(DL, Op0.getOperand(0), VT);
+
+ assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
+ "Unexpected condition code!");
+ return Op0.getOperand(0);
+ }
+ }
+
+ // If we have AVX512 but not BWI, and this is a vXi16/vXi8 setcc, just
+ // pre-promote its result type since vXi1 vectors don't get promoted
+ // during type legalization.
+ // NOTE: The element count check is to ignore operand types that need to
+ // go through type promotion to a 128-bit vector.
+ if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
+ VT.getVectorElementType() == MVT::i1 &&
+ (OpVT.getVectorElementType() == MVT::i8 ||
+ OpVT.getVectorElementType() == MVT::i16)) {
+ SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
+ }
+
+ // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
+ // to avoid scalarization via legalization because v4i32 is not a legal type.
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
+ LHS.getValueType() == MVT::v4f32)
+ return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
+
+ return SDValue();
+}
+
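+/// Do target-specific dag combines on X86ISD::MOVMSK nodes.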
+static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDValue Src = N->getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+ MVT VT = N->getSimpleValueType(0);
+ unsigned NumBits = VT.getScalarSizeInBits();
+ unsigned NumElts = SrcVT.getVectorNumElements();
+
+ // Perform constant folding.
+ if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
+ assert(VT == MVT::i32 && "Unexpected result type");
+ APInt Imm(32, 0);
+ for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
+ if (!Src.getOperand(Idx).isUndef() &&
+ Src.getConstantOperandAPInt(Idx).isNegative())
+ Imm.setBit(Idx);
+ }
+ return DAG.getConstant(Imm, SDLoc(N), VT);
+ }
+
+ // Look through int->fp bitcasts that don't change the element width.
+ unsigned EltWidth = SrcVT.getScalarSizeInBits();
+ if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
+ Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
+ return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
+
+ // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
+ // with scalar comparisons.
+ if (SDValue NotSrc = IsNOT(Src, DAG)) {
+ SDLoc DL(N);
+ APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
+ NotSrc = DAG.getBitcast(SrcVT, NotSrc);
+ return DAG.getNode(ISD::XOR, DL, VT,
+ DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
+ DAG.getConstant(NotMask, DL, VT));
+ }
+
+ // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
+ // results with scalar comparisons.
+ if (Src.getOpcode() == X86ISD::PCMPGT &&
+ ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
+ SDLoc DL(N);
+ APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
+ return DAG.getNode(ISD::XOR, DL, VT,
+ DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
+ DAG.getConstant(NotMask, DL, VT));
+ }
+
+ // Simplify the inputs.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedMask(APInt::getAllOnesValue(NumBits));
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // With vector masks we only demand the upper bit of the mask.
+ SDValue Mask = cast<X86MaskedGatherScatterSDNode>(N)->getMask();
+ if (Mask.getScalarValueSizeInBits() != 1) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
+ if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+ }
+
+ return SDValue();
+}
+
+static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
+ SDValue Index, SDValue Base, SDValue Scale,
+ SelectionDAG &DAG) {
+ SDLoc DL(GorS);
+
+ if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
+ SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
+ Gather->getMask(), Base, Index, Scale };
+ return DAG.getMaskedGather(Gather->getVTList(),
+ Gather->getMemoryVT(), DL, Ops,
+ Gather->getMemOperand(),
+ Gather->getIndexType(),
+ Gather->getExtensionType());
+ }
+ auto *Scatter = cast<MaskedScatterSDNode>(GorS);
+ SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
+ Scatter->getMask(), Base, Index, Scale };
+ return DAG.getMaskedScatter(Scatter->getVTList(),
+ Scatter->getMemoryVT(), DL,
+ Ops, Scatter->getMemOperand(),
+ Scatter->getIndexType(),
+ Scatter->isTruncatingStore());
+}
+
+static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SDLoc DL(N);
+ auto *GorS = cast<MaskedGatherScatterSDNode>(N);
+ SDValue Index = GorS->getIndex();
+ SDValue Base = GorS->getBasePtr();
+ SDValue Scale = GorS->getScale();
+
+ if (DCI.isBeforeLegalize()) {
+ unsigned IndexWidth = Index.getScalarValueSizeInBits();
+
+ // Shrink constant indices if they are larger than 32-bits.
+ // Only do this before legalize types since v2i64 could become v2i32.
+ // FIXME: We could check that the type is legal if we're after legalize
+ // types, but then we would need to construct test cases where that happens.
+ // FIXME: We could support more than just constant vectors, but we need to
+ // be careful with costing. A truncate that can be optimized out would be
+ // fine. Otherwise we might only want to create a truncate if it avoids a
+ // split.
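+ // For illustration (an arbitrary constant index): a v2i64 index of <7, -3>
+ // has well over 32 sign bits per element, so it can be truncated to a v2i32
+ // index here.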
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
+ if (BV->isConstant() && IndexWidth > 32 &&
+ DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
+ unsigned NumElts = Index.getValueType().getVectorNumElements();
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+ Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
+ return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
+ }
+ }
+
+ // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
+ // there are sufficient sign bits. Only do this before legalize types to
+ // avoid creating illegal types in truncate.
+ if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
+ Index.getOpcode() == ISD::ZERO_EXTEND) &&
+ IndexWidth > 32 &&
+ Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
+ DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
+ unsigned NumElts = Index.getValueType().getVectorNumElements();
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+ Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
+ return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
+ }
+ }
+
+ if (DCI.isBeforeLegalizeOps()) {
+ unsigned IndexWidth = Index.getScalarValueSizeInBits();
+
+ // Make sure the index is either i32 or i64
+ if (IndexWidth != 32 && IndexWidth != 64) {
+ MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
+ EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
+ Index.getValueType().getVectorNumElements());
+ Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
+ return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
+ }
+ }
+
+ // With vector masks we only demand the upper bit of the mask.
+ SDValue Mask = GorS->getMask();
+ if (Mask.getScalarValueSizeInBits() != 1) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
+ if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+ }
+
+ return SDValue();
+}
+
+// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
+static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDLoc DL(N);
+ X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
+ SDValue EFLAGS = N->getOperand(1);
+
+ // Try to simplify the EFLAGS and condition code operands.
+ if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
+ return getSETCC(CC, Flags, DL, DAG);
+
+ return SDValue();
+}
+
+/// Optimize branch condition evaluation.
+static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDLoc DL(N);
+ SDValue EFLAGS = N->getOperand(3);
+ X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
+
+ // Try to simplify the EFLAGS and condition code operands.
+ // Make sure to not keep references to operands, as combineSetCCEFLAGS can
+ // RAUW them under us.
+ if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
+ SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
+ N->getOperand(1), Cond, Flags);
+ }
+
+ return SDValue();
+}
+
+// TODO: Could we move this to DAGCombine?
+static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
+ SelectionDAG &DAG) {
+ // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
+ // to optimize away operation when it's from a constant.
+ //
+ // The general transformation is:
+ // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
+ // AND(VECTOR_CMP(x,y), constant2)
+ // constant2 = UNARYOP(constant)
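+ // For illustration (an arbitrary constant): (sint_to_fp (and (vector_cmp x, y),
+ // <4 x i32> <1,1,1,1>)) becomes (bitcast (and (vector_cmp x, y),
+ // (bitcast <1.0,1.0,1.0,1.0>))), since each compare lane is all-ones or zero.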
+
+ // Early exit if this isn't a vector operation, the operand of the
+ // unary operation isn't a bitwise AND, or if the sizes of the operations
+ // aren't the same.
+ EVT VT = N->getValueType(0);
+ bool IsStrict = N->isStrictFPOpcode();
+ unsigned NumEltBits = VT.getScalarSizeInBits();
+ SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
+ if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
+ DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
+ VT.getSizeInBits() != Op0.getValueSizeInBits())
+ return SDValue();
+
+ // Now check that the other operand of the AND is a constant. We could
+ // make the transformation for non-constant splats as well, but it's unclear
+ // that would be a benefit as it would not eliminate any operations, just
+ // perform one more step in scalar code before moving to the vector unit.
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
+ // Bail out if the vector isn't a constant.
+ if (!BV->isConstant())
+ return SDValue();
+
+ // Everything checks out. Build up the new and improved node.
+ SDLoc DL(N);
+ EVT IntVT = BV->getValueType(0);
+ // Create a new constant of the appropriate type for the transformed
+ // DAG.
+ SDValue SourceConst;
+ if (IsStrict)
+ SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
+ {N->getOperand(0), SDValue(BV, 0)});
+ else
+ SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
+ // The AND node needs bitcasts to/from an integer vector type around it.
+ SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
+ SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
+ MaskConst);
+ SDValue Res = DAG.getBitcast(VT, NewAnd);
+ if (IsStrict)
+ return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
+ return Res;
+ }
+
+ return SDValue();
+}
+
+/// If we are converting a value to floating-point, try to replace scalar
+/// truncate of an extracted vector element with a bitcast. This tries to keep
+/// the sequence on XMM registers rather than moving between vector and GPRs.
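+/// For illustration (arbitrary types): (f64 sint_to_fp (i32 trunc
+/// (i64 extractelt (v2i64 X), 0))) becomes
+/// (f64 sint_to_fp (i32 extractelt (v4i32 bitcast X), 0)).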
+static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
+ // TODO: This is currently only used by combineSIntToFP, but it is generalized
+ // to allow being called by any similar cast opcode.
+ // TODO: Consider merging this into lowering: vectorizeExtractedCast().
+ SDValue Trunc = N->getOperand(0);
+ if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+
+ SDValue ExtElt = Trunc.getOperand(0);
+ if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isNullConstant(ExtElt.getOperand(1)))
+ return SDValue();
+
+ EVT TruncVT = Trunc.getValueType();
+ EVT SrcVT = ExtElt.getValueType();
+ unsigned DestWidth = TruncVT.getSizeInBits();
+ unsigned SrcWidth = SrcVT.getSizeInBits();
+ if (SrcWidth % DestWidth != 0)
+ return SDValue();
+
+ // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
+ EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
+ unsigned VecWidth = SrcVecVT.getSizeInBits();
+ unsigned NumElts = VecWidth / DestWidth;
+ EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
+ SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
+ SDLoc DL(N);
+ SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
+ BitcastVec, ExtElt.getOperand(1));
+ return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
+}
+
+static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
+ EVT VT = N->getValueType(0);
+ EVT InVT = Op0.getValueType();
+
+ // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
+ // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
+ // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
+ if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
+ SDLoc dl(N);
+ EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ InVT.getVectorNumElements());
+ SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
+
+ // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
+ {N->getOperand(0), P});
+ return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
+ }
+
+ // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
+ // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
+ // the optimization here.
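+ // For illustration (an arbitrary mask): (uint_to_fp (and X, 0x7fffffff)) has
+ // a known-zero sign bit, so it is rewritten as (sint_to_fp (and X, 0x7fffffff)).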
+ if (DAG.SignBitIsZero(Op0)) {
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
+ {N->getOperand(0), Op0});
+ return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
+ }
+
+ return SDValue();
+}
+
+static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ // First try to optimize away the conversion entirely when it's
+ // conditionally from a constant. Vectors only.
+ bool IsStrict = N->isStrictFPOpcode();
+ if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
+ return Res;
+
+ // Now move on to more general possibilities.
+ SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
+ EVT VT = N->getValueType(0);
+ EVT InVT = Op0.getValueType();
+
+ // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
+ // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
+ // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
+ if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
+ SDLoc dl(N);
+ EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ InVT.getVectorNumElements());
+ SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
+ {N->getOperand(0), P});
+ return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
+ }
+
+ // Without AVX512DQ we only support i64 to float scalar conversion. For both
+ // vectors and scalars, see if we know that the upper bits are all the sign
+ // bit, in which case we can truncate the input to i32 and convert from that.
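+ // For illustration: (sint_to_fp (i64 X)), where X is known to have at least
+ // 33 sign bits, becomes (sint_to_fp (i32 trunc X)).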
+ if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
+ unsigned BitWidth = InVT.getScalarSizeInBits();
+ unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
+ if (NumSignBits >= (BitWidth - 31)) {
+ EVT TruncVT = MVT::i32;
+ if (InVT.isVector())
+ TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
+ InVT.getVectorNumElements());
+ SDLoc dl(N);
+ if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
+ {N->getOperand(0), Trunc});
+ return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
+ }
+ // If we're after legalize and the type is v2i32 we need to shuffle and
+ // use CVTSI2P.
+ assert(InVT == MVT::v2i64 && "Unexpected VT!");
+ SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
+ SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
+ { 0, 2, -1, -1 });
+ if (IsStrict)
+ return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
+ {N->getOperand(0), Shuf});
+ return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
+ }
+ }
+
+ // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
+ // a 32-bit target where SSE doesn't support i64->FP operations.
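+ // For illustration: on a 32-bit target, (f64 sint_to_fp (i64 (load p)))
+ // becomes an x87 load-and-convert built by BuildFILD below, reusing the
+ // i64 memory operand directly.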
+ if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
+ Op0.getOpcode() == ISD::LOAD) {
+ LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
+
+ // This transformation is not supported if the result type is f16 or f128.
+ if (VT == MVT::f16 || VT == MVT::f128)
+ return SDValue();
+
+ // If we have AVX512DQ we can use packed conversion instructions unless
+ // the VT is f80.
+ if (Subtarget.hasDQI() && VT != MVT::f80)
+ return SDValue();
+
+ if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
+ Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
+ std::pair<SDValue, SDValue> Tmp =
+ Subtarget.getTargetLowering()->BuildFILD(
+ VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
+ DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
+ return Tmp.first;
+ }
+ }
+
+ if (IsStrict)
+ return SDValue();
+
+ if (SDValue V = combineToFPTruncExtElt(N, DAG))
+ return V;
+
+ return SDValue();
+}
+
+static bool needCarryOrOverflowFlag(SDValue Flags) {
+ assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
+
+ for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
+ UI != UE; ++UI) {
+ SDNode *User = *UI;
+
+ X86::CondCode CC;
+ switch (User->getOpcode()) {
+ default:
+ // Be conservative.
+ return true;
+ case X86ISD::SETCC:
+ case X86ISD::SETCC_CARRY:
+ CC = (X86::CondCode)User->getConstantOperandVal(0);
+ break;
+ case X86ISD::BRCOND:
+ CC = (X86::CondCode)User->getConstantOperandVal(2);
+ break;
+ case X86ISD::CMOV:
+ CC = (X86::CondCode)User->getConstantOperandVal(2);
+ break;
+ }
+
+ switch (CC) {
+ default: break;
+ case X86::COND_A: case X86::COND_AE:
+ case X86::COND_B: case X86::COND_BE:
+ case X86::COND_O: case X86::COND_NO:
+ case X86::COND_G: case X86::COND_GE:
+ case X86::COND_L: case X86::COND_LE:
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool onlyZeroFlagUsed(SDValue Flags) {
+ assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
+
+ for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
+ UI != UE; ++UI) {
+ SDNode *User = *UI;
+
+ unsigned CCOpNo;
+ switch (User->getOpcode()) {
+ default:
+ // Be conservative.
+ return false;
+ case X86ISD::SETCC: CCOpNo = 0; break;
+ case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
+ case X86ISD::BRCOND: CCOpNo = 2; break;
+ case X86ISD::CMOV: CCOpNo = 2; break;
+ }
+
+ X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
+ if (CC != X86::COND_E && CC != X86::COND_NE)
+ return false;
+ }
+
+ return true;
+}
+
+static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
+ // Only handle test patterns.
+ if (!isNullConstant(N->getOperand(1)))
+ return SDValue();
+
+ // If we have a CMP of a truncated binop, see if we can make a smaller binop
+ // and use its flags directly.
+ // TODO: Maybe we should try promoting compares that only use the zero flag
+ // first if we can prove the upper bits with computeKnownBits?
+ SDLoc dl(N);
+ SDValue Op = N->getOperand(0);
+ EVT VT = Op.getValueType();
+
+ // If we have a constant logical shift that's only used in a comparison
+ // against zero turn it into an equivalent AND. This allows turning it into
+ // a TEST instruction later.
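+ // For illustration (an arbitrary shift amount): (cmp (srl X, 5), 0) on an
+ // i32 X, where only the zero flag is used, becomes
+ // (cmp (and X, 0xFFFFFFE0), 0).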
+ if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
+ Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
+ onlyZeroFlagUsed(SDValue(N, 0))) {
+ unsigned BitWidth = VT.getSizeInBits();
+ const APInt &ShAmt = Op.getConstantOperandAPInt(1);
+ if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
+ unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
+ APInt Mask = Op.getOpcode() == ISD::SRL
+ ? APInt::getHighBitsSet(BitWidth, MaskBits)
+ : APInt::getLowBitsSet(BitWidth, MaskBits);
+ if (Mask.isSignedIntN(32)) {
+ Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
+ DAG.getConstant(Mask, dl, VT));
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+ DAG.getConstant(0, dl, VT));
+ }
+ }
+ }
+
+ // Look for a truncate with a single use.
+ if (Op.getOpcode() != ISD::TRUNCATE || !Op.hasOneUse())
+ return SDValue();
+
+ Op = Op.getOperand(0);
+
+ // Arithmetic op can only have one use.
+ if (!Op.hasOneUse())
+ return SDValue();
+
+ unsigned NewOpc;
+ switch (Op.getOpcode()) {
+ default: return SDValue();
+ case ISD::AND:
+ // Skip AND with a constant. We have special handling for AND with an
+ // immediate during isel to generate TEST instructions.
+ if (isa<ConstantSDNode>(Op.getOperand(1)))
+ return SDValue();
+ NewOpc = X86ISD::AND;
+ break;
+ case ISD::OR: NewOpc = X86ISD::OR; break;
+ case ISD::XOR: NewOpc = X86ISD::XOR; break;
+ case ISD::ADD:
+ // If the carry or overflow flag is used, we can't truncate.
+ if (needCarryOrOverflowFlag(SDValue(N, 0)))
+ return SDValue();
+ NewOpc = X86ISD::ADD;
+ break;
+ case ISD::SUB:
+ // If the carry or overflow flag is used, we can't truncate.
+ if (needCarryOrOverflowFlag(SDValue(N, 0)))
+ return SDValue();
+ NewOpc = X86ISD::SUB;
+ break;
+ }
+
+ // We found an op we can narrow. Truncate its inputs.
+ SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
+ SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
+
+ // Use an X86-specific opcode to avoid DAG combine messing with it.
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
+
+ // For AND, keep a CMP so that we can match the test pattern.
+ if (NewOpc == X86ISD::AND)
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+ DAG.getConstant(0, dl, VT));
+
+ // Return the flags.
+ return Op.getValue(1);
+}
+
+static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
+ "Expected X86ISD::ADD or X86ISD::SUB");
+
+ SDLoc DL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ MVT VT = LHS.getSimpleValueType();
+ unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
+
+ // If we don't use the flag result, simplify back to a generic ADD/SUB.
+ if (!N->hasAnyUseOfValue(1)) {
+ SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
+ return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
+ }
+
+ // Fold any similar generic ADD/SUB opcodes to reuse this node.
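+ // For illustration: a generic (add/sub LHS, RHS) with the same operands is
+ // replaced with value 0 of this node; for X86ISD::SUB, a generic
+ // (sub RHS, LHS) is additionally replaced with the negation of value 0.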
+ auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
+ SDValue Ops[] = {N0, N1};
+ SDVTList VTs = DAG.getVTList(N->getValueType(0));
+ if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
+ SDValue Op(N, 0);
+ if (Negate)
+ Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
+ DCI.CombineTo(GenericAddSub, Op);
+ }
+ };
+ MatchGeneric(LHS, RHS, false);
+ MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
+
+ return SDValue();
+}
+
+static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
+ if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
+ MVT VT = N->getSimpleValueType(0);
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
+ N->getOperand(0), N->getOperand(1),
+ Flags);
+ }
+
+ // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
+ // iff the flag result is dead.
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) &&
+ !N->hasAnyUseOfValue(1))
+ return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0),
+ Op0.getOperand(1), N->getOperand(2));
+
+ return SDValue();
+}
+
+// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
+static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // If the LHS and RHS of the ADC node are zero, then it can't overflow and
+ // the result is either zero or one (depending on the input carry bit).
+ // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
+ if (X86::isZeroNode(N->getOperand(0)) &&
+ X86::isZeroNode(N->getOperand(1)) &&
+ // We don't have a good way to replace an EFLAGS use, so only do this when
+ // the flag result is dead right now.
+ SDValue(N, 1).use_empty()) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
+ SDValue Res1 =
+ DAG.getNode(ISD::AND, DL, VT,
+ DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
+ N->getOperand(2)),
+ DAG.getConstant(1, DL, VT));
+ return DCI.CombineTo(N, Res1, CarryOut);
+ }
+
+ if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
+ MVT VT = N->getSimpleValueType(0);
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
+ N->getOperand(0), N->getOperand(1),
+ Flags);
+ }
+
+ return SDValue();
+}
+
+/// If this is an add or subtract where one operand is produced by a cmp+setcc,
+/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
+/// with CMP+{ADC, SBB}.
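+/// For illustration: (add X, (zext (setb flags))) can become
+/// (adc X, 0, flags), and (sub X, (sete (cmp Z, 0))) can become
+/// (sbb X, 0, (cmp Z, 1)).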
+static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
+ bool IsSub = N->getOpcode() == ISD::SUB;
+ SDValue X = N->getOperand(0);
+ SDValue Y = N->getOperand(1);
+
+ // If this is an add, canonicalize a zext operand to the RHS.
+ // TODO: Incomplete? What if both sides are zexts?
+ if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
+ Y.getOpcode() != ISD::ZERO_EXTEND)
+ std::swap(X, Y);
+
+ // Look through a one-use zext.
+ bool PeekedThroughZext = false;
+ if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
+ Y = Y.getOperand(0);
+ PeekedThroughZext = true;
+ }
+
+ // If this is an add, canonicalize a setcc operand to the RHS.
+ // TODO: Incomplete? What if both sides are setcc?
+ // TODO: Should we allow peeking through a zext of the other operand?
+ if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
+ Y.getOpcode() != X86ISD::SETCC)
+ std::swap(X, Y);
+
+ if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
+ return SDValue();
+
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
+
+ // If X is -1 or 0, then we have an opportunity to avoid constants required in
+ // the general case below.
+ auto *ConstantX = dyn_cast<ConstantSDNode>(X);
+ if (ConstantX) {
+ if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
+ (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
+ // This is a complicated way to get -1 or 0 from the carry flag:
+ // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
+ // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
+ Y.getOperand(1));
+ }
+
+ if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
+ (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
+ SDValue EFLAGS = Y->getOperand(1);
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+ EFLAGS.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+ // Swap the operands of a SUB, and we have the same pattern as above.
+ // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
+ // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
+ SDValue NewSub = DAG.getNode(
+ X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
+ NewEFLAGS);
+ }
+ }
+ }
+
+ if (CC == X86::COND_B) {
+ // X + SETB Z --> adc X, 0
+ // X - SETB Z --> sbb X, 0
+ return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
+ DAG.getVTList(VT, MVT::i32), X,
+ DAG.getConstant(0, DL, VT), Y.getOperand(1));
+ }
+
+ if (CC == X86::COND_A) {
+ SDValue EFLAGS = Y.getOperand(1);
+ // Try to convert COND_A into COND_B in an attempt to facilitate
+ // materializing "setb reg".
+ //
+ // Do not flip "e > c", where "c" is a constant, because the CMP instruction
+ // cannot take an immediate as its first operand.
+ //
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
+ EFLAGS.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+ SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
+ EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
+ return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
+ DAG.getVTList(VT, MVT::i32), X,
+ DAG.getConstant(0, DL, VT), NewEFLAGS);
+ }
+ }
+
+ if (CC == X86::COND_AE) {
+ // X + SETAE --> sbb X, -1
+ // X - SETAE --> adc X, -1
+ return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
+ DAG.getVTList(VT, MVT::i32), X,
+ DAG.getConstant(-1, DL, VT), Y.getOperand(1));
+ }
+
+ if (CC == X86::COND_BE) {
+ // X + SETBE --> sbb X, -1
+ // X - SETBE --> adc X, -1
+ SDValue EFLAGS = Y.getOperand(1);
+ // Try to convert COND_BE into COND_AE in an attempt to facilitate
+ // materializing "setae reg".
+ //
+ // Do not flip "e <= c", where "c" is a constant, because the CMP instruction
+ // cannot take an immediate as its first operand.
+ //
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
+ EFLAGS.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+ SDValue NewSub = DAG.getNode(
+ X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
+ return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
+ DAG.getVTList(VT, MVT::i32), X,
+ DAG.getConstant(-1, DL, VT), NewEFLAGS);
+ }
+ }
+
+ if (CC != X86::COND_E && CC != X86::COND_NE)
+ return SDValue();
+
+ SDValue Cmp = Y.getOperand(1);
+ if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
+ !X86::isZeroNode(Cmp.getOperand(1)) ||
+ !Cmp.getOperand(0).getValueType().isInteger())
+ return SDValue();
+
+ SDValue Z = Cmp.getOperand(0);
+ EVT ZVT = Z.getValueType();
+
+ // If X is -1 or 0, then we have an opportunity to avoid constants required in
+ // the general case below.
+ if (ConstantX) {
+ // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
+ // fake operands:
+ // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
+ // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
+ if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
+ (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
+ SDValue Zero = DAG.getConstant(0, DL, ZVT);
+ SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
+ SDValue(Neg.getNode(), 1));
+ }
+
+ // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
+ // with fake operands:
+ // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
+ // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
+ if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
+ (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
+ SDValue One = DAG.getConstant(1, DL, ZVT);
+ SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
+ SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
+ Cmp1.getValue(1));
+ }
+ }
+
+ // (cmp Z, 1) sets the carry flag if Z is 0.
+ SDValue One = DAG.getConstant(1, DL, ZVT);
+ SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
+ SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
+
+ // Add the flags type for ADC/SBB nodes.
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+
+ // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
+ // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
+ if (CC == X86::COND_NE)
+ return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
+ DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
+
+ // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
+ // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
+ return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
+ DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
+}
+
+static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
+ const SDLoc &DL, EVT VT,
+ const X86Subtarget &Subtarget) {
+ // Example of pattern we try to detect:
+ // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
+ //(add (build_vector (extract_elt t, 0),
+ // (extract_elt t, 2),
+ // (extract_elt t, 4),
+ // (extract_elt t, 6)),
+ // (build_vector (extract_elt t, 1),
+ // (extract_elt t, 3),
+ // (extract_elt t, 5),
+ // (extract_elt t, 7)))
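+ // After the truncates introduced below fold with the sign extends, this is
+ // in effect rewritten as (v4i32 X86ISD::VPMADDWD (v8i16 x0), (v8i16 x1)).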
+
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
+ Op1.getOpcode() != ISD::BUILD_VECTOR)
+ return SDValue();
+
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
+ VT.getVectorNumElements() < 4 ||
+ !isPowerOf2_32(VT.getVectorNumElements()))
+ return SDValue();
+
+ // Check if one of Op0,Op1 is of the form:
+ // (build_vector (extract_elt Mul, 0),
+ // (extract_elt Mul, 2),
+ // (extract_elt Mul, 4),
+ // ...
+ // the other is of the form:
+ // (build_vector (extract_elt Mul, 1),
+ // (extract_elt Mul, 3),
+ // (extract_elt Mul, 5),
+ // ...
+ // and identify Mul.
+ SDValue Mul;
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
+ SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
+ Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
+ // TODO: Be more tolerant to undefs.
+ if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+ auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
+ auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
+ auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
+ auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
+ if (!Const0L || !Const1L || !Const0H || !Const1H)
+ return SDValue();
+ unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
+ Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
+ // Commutativity of mul allows factors of a product to reorder.
+ if (Idx0L > Idx1L)
+ std::swap(Idx0L, Idx1L);
+ if (Idx0H > Idx1H)
+ std::swap(Idx0H, Idx1H);
+ // Commutativity of add allows pairs of factors to reorder.
+ if (Idx0L > Idx0H) {
+ std::swap(Idx0L, Idx0H);
+ std::swap(Idx1L, Idx1H);
+ }
+ if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
+ Idx1H != 2 * i + 3)
+ return SDValue();
+ if (!Mul) {
+ // First time an extract_elt's source vector is visited. It must be a MUL
+ // with twice as many vector elements as the BUILD_VECTOR.
+ // All extracts must be from the same MUL.
+ Mul = Op0L->getOperand(0);
+ if (Mul->getOpcode() != ISD::MUL ||
+ Mul.getValueType().getVectorNumElements() != 2 * e)
+ return SDValue();
+ }
+ // Check that the extract is from the same MUL previously seen.
+ if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
+ Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
+ return SDValue();
+ }
+
+ // Check if the Mul source can be safely shrunk.
+ ShrinkMode Mode;
+ if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
+ Mode == ShrinkMode::MULU16)
+ return SDValue();
+
+ EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ VT.getVectorNumElements() * 2);
+ SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
+ SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
+
+ auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ EVT InVT = Ops[0].getValueType();
+ assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
+ EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ InVT.getVectorNumElements() / 2);
+ return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
+}
+
+// Attempt to turn this pattern into PMADDWD.
+// (add (mul (sext (build_vector)), (sext (build_vector))),
+// (mul (sext (build_vector)), (sext (build_vector)))
+static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
+ const SDLoc &DL, EVT VT,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
+ return SDValue();
+
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
+ VT.getVectorNumElements() < 4 ||
+ !isPowerOf2_32(VT.getVectorNumElements()))
+ return SDValue();
+
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ SDValue N10 = N1.getOperand(0);
+ SDValue N11 = N1.getOperand(1);
+
+ // All inputs need to be sign extends.
+ // TODO: Support ZERO_EXTEND from known positive?
+ if (N00.getOpcode() != ISD::SIGN_EXTEND ||
+ N01.getOpcode() != ISD::SIGN_EXTEND ||
+ N10.getOpcode() != ISD::SIGN_EXTEND ||
+ N11.getOpcode() != ISD::SIGN_EXTEND)
+ return SDValue();
+
+ // Peek through the extends.
+ N00 = N00.getOperand(0);
+ N01 = N01.getOperand(0);
+ N10 = N10.getOperand(0);
+ N11 = N11.getOperand(0);
+
+ // Must be extending from vXi16.
+ EVT InVT = N00.getValueType();
+ if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
+ N10.getValueType() != InVT || N11.getValueType() != InVT)
+ return SDValue();
+
+ // All inputs should be build_vectors.
+ if (N00.getOpcode() != ISD::BUILD_VECTOR ||
+ N01.getOpcode() != ISD::BUILD_VECTOR ||
+ N10.getOpcode() != ISD::BUILD_VECTOR ||
+ N11.getOpcode() != ISD::BUILD_VECTOR)
+ return SDValue();
+
+ // For each result element, we need the even elements of the two source
+ // vectors multiplied together, added to the odd elements of the same two
+ // vectors multiplied together. That is, for each element i the following
+ // computation must be performed:
+ // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
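+ // For illustration, for a v4i32 result: result[0] = A[0]*B[0] + A[1]*B[1],
+ // result[1] = A[2]*B[2] + A[3]*B[3], etc., which is exactly what PMADDWD
+ // computes on vXi16 inputs.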
+ SDValue In0, In1;
+ for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
+ SDValue N00Elt = N00.getOperand(i);
+ SDValue N01Elt = N01.getOperand(i);
+ SDValue N10Elt = N10.getOperand(i);
+ SDValue N11Elt = N11.getOperand(i);
+ // TODO: Be more tolerant to undefs.
+ if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+ auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
+ auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
+ auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
+ auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
+ if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
+ return SDValue();
+ unsigned IdxN00 = ConstN00Elt->getZExtValue();
+ unsigned IdxN01 = ConstN01Elt->getZExtValue();
+ unsigned IdxN10 = ConstN10Elt->getZExtValue();
+ unsigned IdxN11 = ConstN11Elt->getZExtValue();
+ // Add is commutative so indices can be reordered.
+ if (IdxN00 > IdxN10) {
+ std::swap(IdxN00, IdxN10);
+ std::swap(IdxN01, IdxN11);
+ }
+ // N0 indices must be the even element. N1 indices must be the next odd element.
+ if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
+ IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
+ return SDValue();
+ SDValue N00In = N00Elt.getOperand(0);
+ SDValue N01In = N01Elt.getOperand(0);
+ SDValue N10In = N10Elt.getOperand(0);
+ SDValue N11In = N11Elt.getOperand(0);
+ // First time we find an input capture it.
+ if (!In0) {
+ In0 = N00In;
+ In1 = N01In;
+ }
+ // Mul is commutative so the input vectors can be in any order.
+ // Canonicalize to make the compares easier.
+ if (In0 != N00In)
+ std::swap(N00In, N01In);
+ if (In0 != N10In)
+ std::swap(N10In, N11In);
+ if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
+ return SDValue();
+ }
+
+ auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ // Shrink by adding truncate nodes and let DAGCombine fold with the
+ // sources.
+ EVT OpVT = Ops[0].getValueType();
+ assert(OpVT.getScalarType() == MVT::i16 &&
+ "Unexpected scalar element type");
+ assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
+ EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ OpVT.getVectorNumElements() / 2);
+ return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
+ PMADDBuilder);
+}
+
+static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ bool IsAdd = N->getOpcode() == ISD::ADD;
+ auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
+ assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode");
+
+ SmallVector<int, 8> PostShuffleMask;
+ if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
+ VT == MVT::v8i32) &&
+ Subtarget.hasSSSE3() &&
+ isHorizontalBinOp(HorizOpcode, Op0, Op1, DAG, Subtarget, IsAdd,
+ PostShuffleMask)) {
+ auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
+ };
+ SDValue HorizBinOp =
+ SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder);
+ if (!PostShuffleMask.empty())
+ HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
+ DAG.getUNDEF(VT), PostShuffleMask);
+ return HorizBinOp;
+ }
+
+ return SDValue();
+}
+
+static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
+ return MAdd;
+ if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
+ return MAdd;
+
+ // Try to synthesize horizontal adds from adds of shuffles.
+ if (SDValue V = combineAddOrSubToHADDorHSUB(N, DAG, Subtarget))
+ return V;
+
+ // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
+ // (sub Y, (sext (vXi1 X))).
+ // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
+ // generic DAG combine without a legal type check, but adding this there
+ // caused regressions.
+ if (VT.isVector()) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
+ Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
+ TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
+ SDLoc DL(N);
+ SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
+ return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
+ }
+
+ if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
+ Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
+ TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
+ SDLoc DL(N);
+ SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
+ return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
+ }
+ }
+
+ return combineAddOrSubToADCOrSBB(N, DAG);
+}
+
+static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+
+ if (!VT.isVector())
+ return SDValue();
+
+ // PSUBUS is supported, starting from SSE2.
+ EVT EltVT = VT.getVectorElementType();
+ if (!(Subtarget.hasSSE2() &&
+ (EltVT == MVT::i8 || EltVT == MVT::i16 || VT == MVT::v8i32 ||
+ VT == MVT::v8i64 || VT == MVT::v16i32)))
+ return SDValue();
+
+ SDValue SubusLHS, SubusRHS;
+ // Try to find umax(a,b) - b or a - umin(a,b) patterns
+ // that may be converted to subus(a,b).
+ // TODO: Need to add IR canonicalization for this code.
+ if (Op0.getOpcode() == ISD::UMAX) {
+ SubusRHS = Op1;
+ SDValue MaxLHS = Op0.getOperand(0);
+ SDValue MaxRHS = Op0.getOperand(1);
+ if (MaxLHS == Op1)
+ SubusLHS = MaxRHS;
+ else if (MaxRHS == Op1)
+ SubusLHS = MaxLHS;
+ else
+ return SDValue();
+ } else if (Op1.getOpcode() == ISD::UMIN) {
+ SubusLHS = Op0;
+ SDValue MinLHS = Op1.getOperand(0);
+ SDValue MinRHS = Op1.getOperand(1);
+ if (MinLHS == Op0)
+ SubusRHS = MinRHS;
+ else if (MinRHS == Op0)
+ SubusRHS = MinLHS;
+ else
+ return SDValue();
+ } else if (Op1.getOpcode() == ISD::TRUNCATE &&
+ Op1.getOperand(0).getOpcode() == ISD::UMIN &&
+ (EltVT == MVT::i8 || EltVT == MVT::i16)) {
+ // Special case where the UMIN has been truncated. Try to push the truncate
+ // further up. This is similar to the i32/i64 special processing.
+ SubusLHS = Op0;
+ SDValue MinLHS = Op1.getOperand(0).getOperand(0);
+ SDValue MinRHS = Op1.getOperand(0).getOperand(1);
+ EVT TruncVT = Op1.getOperand(0).getValueType();
+ if (!(Subtarget.hasSSE2() &&
+ (TruncVT == MVT::v8i32 || TruncVT == MVT::v8i64 ||
+ TruncVT == MVT::v16i32)))
+ return SDValue();
+ SDValue OpToSaturate;
+ if (MinLHS.getOpcode() == ISD::ZERO_EXTEND &&
+ MinLHS.getOperand(0) == Op0)
+ OpToSaturate = MinRHS;
+ else if (MinRHS.getOpcode() == ISD::ZERO_EXTEND &&
+ MinRHS.getOperand(0) == Op0)
+ OpToSaturate = MinLHS;
+ else
+ return SDValue();
+
+ // Saturate the non-extended input and then truncate it.
+ SDLoc DL(N);
+ SDValue SaturationConst =
+ DAG.getConstant(APInt::getLowBitsSet(TruncVT.getScalarSizeInBits(),
+ VT.getScalarSizeInBits()),
+ DL, TruncVT);
+ SDValue UMin = DAG.getNode(ISD::UMIN, DL, TruncVT, OpToSaturate,
+ SaturationConst);
+ SubusRHS = DAG.getNode(ISD::TRUNCATE, DL, VT, UMin);
+ } else
+ return SDValue();
+
+ // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
+ // special preprocessing in some cases.
+ if (EltVT == MVT::i8 || EltVT == MVT::i16)
+ return DAG.getNode(ISD::USUBSAT, SDLoc(N), VT, SubusLHS, SubusRHS);
+
+ assert((VT == MVT::v8i32 || VT == MVT::v16i32 || VT == MVT::v8i64) &&
+ "Unexpected VT!");
+
+ // The special preprocessing can only be applied if the value was zero
+ // extended from 16 bits, so we require the upper 16 bits to be zero for
+ // 32-bit values, or the upper 48 bits for 64-bit values.
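+ // For illustration: a v8i32 sub whose LHS is zero extended from 16 bits is
+ // narrowed to a v8i16 USUBSAT (the RHS is clamped with a umin first) and the
+ // result is zero extended back to v8i32.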
+ KnownBits Known = DAG.computeKnownBits(SubusLHS);
+ unsigned NumZeros = Known.countMinLeadingZeros();
+ if (NumZeros < (VT.getScalarSizeInBits() - 16))
+ return SDValue();
+
+ EVT ExtType = SubusLHS.getValueType();
+ EVT ShrinkedType;
+ if (VT == MVT::v8i32 || VT == MVT::v8i64)
+ ShrinkedType = MVT::v8i16;
+ else
+ ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
+
+ // Since SubusLHS is zero-extended, truncate SubusRHS to its size:
+ // SubusRHS = umin(0xFFF.., SubusRHS).
+ SDValue SaturationConst =
+ DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
+ ShrinkedType.getScalarSizeInBits()),
+ SDLoc(SubusLHS), ExtType);
+ SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
+ SaturationConst);
+ SDValue NewSubusLHS =
+ DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
+ SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
+ SDValue Psubus = DAG.getNode(ISD::USUBSAT, SDLoc(N), ShrinkedType,
+ NewSubusLHS, NewSubusRHS);
+
+ // Zero extend the result; it may be used somewhere as a 32-bit value. If it
+ // is not, the zext and the following trunc will be combined away.
+ return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
+}
+
+static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ // X86 can't encode an immediate LHS of a sub. See if we can push the
+ // negation into a preceding instruction.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
+ // If the RHS of the sub is an XOR with one use and a constant, invert the
+ // immediate. Then add one to the LHS of the sub so we can turn
+ // X-Y -> X+~Y+1, saving one register.
+ if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
+ isa<ConstantSDNode>(Op1.getOperand(1))) {
+ const APInt &XorC = Op1.getConstantOperandAPInt(1);
+ EVT VT = Op0.getValueType();
+ SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
+ Op1.getOperand(0),
+ DAG.getConstant(~XorC, SDLoc(Op1), VT));
+ return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
+ DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
+ }
+ }
+
+ // Try to synthesize horizontal subs from subs of shuffles.
+ if (SDValue V = combineAddOrSubToHADDorHSUB(N, DAG, Subtarget))
+ return V;
+
+ // Try to create PSUBUS if SUB's argument is max/min
+ if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
+ return V;
+
+ return combineAddOrSubToADCOrSBB(N, DAG);
+}
+
+static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT VT = N->getSimpleValueType(0);
+ SDLoc DL(N);
+
+ if (N->getOperand(0) == N->getOperand(1)) {
+ if (N->getOpcode() == X86ISD::PCMPEQ)
+ return DAG.getConstant(-1, DL, VT);
+ if (N->getOpcode() == X86ISD::PCMPGT)
+ return DAG.getConstant(0, DL, VT);
+ }
+
+ return SDValue();
+}
+
+/// Helper that combines an array of subvector ops as if they were the operands
+/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
+/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
+static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
+ ArrayRef<SDValue> Ops, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+
+ if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
+ return DAG.getUNDEF(VT);
+
+ if (llvm::all_of(Ops, [](SDValue Op) {
+ return ISD::isBuildVectorAllZeros(Op.getNode());
+ }))
+ return getZeroVector(VT, Subtarget, DAG, DL);
+
+ SDValue Op0 = Ops[0];
+ bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
+
+ // Repeated subvectors.
+ if (IsSplat &&
+ (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
+ // If this broadcast is inserted into both halves, use a larger broadcast.
+ if (Op0.getOpcode() == X86ISD::VBROADCAST)
+ return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
+
+ // If this scalar/subvector broadcast_load is inserted into both halves, use
+ // a larger broadcast_load. Update other uses to use an extracted subvector.
+ if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
+ Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(Op0);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};
+ SDValue BcastLd = DAG.getMemIntrinsicNode(Op0.getOpcode(), DL, Tys, Ops,
+ MemIntr->getMemoryVT(),
+ MemIntr->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(
+ Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
+ return BcastLd;
+ }
+
+ // If this is a simple subvector load repeated across multiple lanes, then
+ // broadcast the load. Update other uses to use an extracted subvector.
+ if (auto *Ld = dyn_cast<LoadSDNode>(Op0)) {
+ if (Ld->isSimple() && !Ld->isNonTemporal() &&
+ Ld->getExtensionType() == ISD::NON_EXTLOAD) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops,
+ Ld->getMemoryVT(), Ld->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(
+ Op0,
+ extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
+ return BcastLd;
+ }
+ }
+
+ // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
+ if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
+ (Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
+ Op0.getOperand(0),
+ DAG.getIntPtrConstant(0, DL)));
+
+ // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
+ if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ (Subtarget.hasAVX2() ||
+ (EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
+ Op0.getOperand(0).getValueType() == VT.getScalarType())
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
+
+ // concat_vectors(extract_subvector(broadcast(x)),
+ // extract_subvector(broadcast(x))) -> broadcast(x)
+ if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Op0.getOperand(0).getValueType() == VT) {
+ if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
+ Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
+ return Op0.getOperand(0);
+ }
+ }
+
+ // Repeated opcode.
+ // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
+ // but it currently struggles with different vector widths.
+ if (llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op.getOpcode() == Op0.getOpcode();
+ })) {
+ unsigned NumOps = Ops.size();
+ switch (Op0.getOpcode()) {
+ case X86ISD::SHUFP: {
+ // Add SHUFPD support if/when necessary.
+ if (!IsSplat && VT.getScalarType() == MVT::f32 &&
+ llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op.getOperand(2) == Op0.getOperand(2);
+ })) {
+ SmallVector<SDValue, 2> LHS, RHS;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ LHS.push_back(Ops[i].getOperand(0));
+ RHS.push_back(Ops[i].getOperand(1));
+ }
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS),
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS),
+ Op0.getOperand(2));
+ }
+ break;
+ }
+ case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFD:
+ if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
+ Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
+ SmallVector<SDValue, 2> Src;
+ for (unsigned i = 0; i != NumOps; ++i)
+ Src.push_back(Ops[i].getOperand(0));
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
+ Op0.getOperand(1));
+ }
+ LLVM_FALLTHROUGH;
+ case X86ISD::VPERMILPI:
+ // TODO - add support for vXf64/vXi64 shuffles.
+ if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
+ Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
+ SmallVector<SDValue, 2> Src;
+ for (unsigned i = 0; i != NumOps; ++i)
+ Src.push_back(DAG.getBitcast(MVT::v4f32, Ops[i].getOperand(0)));
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f32, Src);
+ Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
+ Op0.getOperand(1));
+ return DAG.getBitcast(VT, Res);
+ }
+ break;
+ case X86ISD::VPERMV3:
+ if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
+ MVT OpVT = Op0.getSimpleValueType();
+ int NumSrcElts = OpVT.getVectorNumElements();
+ SmallVector<int, 64> ConcatMask;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ bool IsUnary;
+ SmallVector<int, 64> SubMask;
+ SmallVector<SDValue, 2> SubOps;
+ if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
+ SubMask, IsUnary))
+ break;
+ for (int M : SubMask) {
+ if (0 <= M) {
+ M += M < NumSrcElts ? 0 : NumSrcElts;
+ M += i * NumSrcElts;
+ }
+ ConcatMask.push_back(M);
+ }
+ }
+ if (ConcatMask.size() == (NumOps * NumSrcElts)) {
+ SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
+ Ops[1].getOperand(0), DAG, DL);
+ SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
+ Ops[1].getOperand(2), DAG, DL);
+ MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
+ MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
+ SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
+ return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
+ }
+ }
+ break;
+ case X86ISD::VSHLI:
+ case X86ISD::VSRAI:
+ case X86ISD::VSRLI:
+ if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
+ (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
+ (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
+ llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op0.getOperand(1) == Op.getOperand(1);
+ })) {
+ SmallVector<SDValue, 2> Src;
+ for (unsigned i = 0; i != NumOps; ++i)
+ Src.push_back(Ops[i].getOperand(0));
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
+ Op0.getOperand(1));
+ }
+ break;
+ case X86ISD::VPERMI:
+ case X86ISD::VROTLI:
+ case X86ISD::VROTRI:
+ if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
+ llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op0.getOperand(1) == Op.getOperand(1);
+ })) {
+ SmallVector<SDValue, 2> Src;
+ for (unsigned i = 0; i != NumOps; ++i)
+ Src.push_back(Ops[i].getOperand(0));
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
+ Op0.getOperand(1));
+ }
+ break;
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case X86ISD::ANDNP:
+ // TODO: Add 256-bit support.
+ if (!IsSplat && VT.is512BitVector()) {
+ SmallVector<SDValue, 2> LHS, RHS;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ LHS.push_back(Ops[i].getOperand(0));
+ RHS.push_back(Ops[i].getOperand(1));
+ }
+ MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
+ SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
+ NumOps * SrcVT.getVectorNumElements());
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
+ }
+ break;
+ case X86ISD::HADD:
+ case X86ISD::HSUB:
+ case X86ISD::FHADD:
+ case X86ISD::FHSUB:
+ case X86ISD::PACKSS:
+ case X86ISD::PACKUS:
+ if (!IsSplat && VT.is256BitVector() &&
+ (VT.isFloatingPoint() || Subtarget.hasInt256())) {
+ SmallVector<SDValue, 2> LHS, RHS;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ LHS.push_back(Ops[i].getOperand(0));
+ RHS.push_back(Ops[i].getOperand(1));
+ }
+ MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
+ SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
+ NumOps * SrcVT.getVectorNumElements());
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
+ }
+ break;
+ case X86ISD::PALIGNR:
+ if (!IsSplat &&
+ ((VT.is256BitVector() && Subtarget.hasInt256()) ||
+ (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
+ llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op0.getOperand(2) == Op.getOperand(2);
+ })) {
+ SmallVector<SDValue, 2> LHS, RHS;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ LHS.push_back(Ops[i].getOperand(0));
+ RHS.push_back(Ops[i].getOperand(1));
+ }
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS),
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS),
+ Op0.getOperand(2));
+ }
+ break;
+ }
+ }
+
+ // Fold subvector loads into one.
+ // If needed, look through bitcasts to get to the load.
+ if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
+ bool Fast;
+ const X86TargetLowering *TLI = Subtarget.getTargetLowering();
+ if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+ *FirstLd->getMemOperand(), &Fast) &&
+ Fast) {
+ if (SDValue Ld =
+ EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
+ return Ld;
+ }
+ }
+
+ return SDValue();
+}
+
+static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ EVT SrcVT = N->getOperand(0).getValueType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // Don't do anything for i1 vectors.
+ if (VT.getVectorElementType() == MVT::i1)
+ return SDValue();
+
+ if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
+ SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
+ if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
+ DCI, Subtarget))
+ return R;
+ }
+
+ return SDValue();
+}
+
+static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ MVT OpVT = N->getSimpleValueType(0);
+
+ bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
+
+ SDLoc dl(N);
+ SDValue Vec = N->getOperand(0);
+ SDValue SubVec = N->getOperand(1);
+
+ uint64_t IdxVal = N->getConstantOperandVal(2);
+ MVT SubVecVT = SubVec.getSimpleValueType();
+
+ if (Vec.isUndef() && SubVec.isUndef())
+ return DAG.getUNDEF(OpVT);
+
+ // Inserting undefs/zeros into zeros/undefs is a zero vector.
+ if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
+ (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
+ return getZeroVector(OpVT, Subtarget, DAG, dl);
+
+ if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
+ // If we're inserting into a zero vector and then into a larger zero vector,
+ // just insert into the larger zero vector directly.
+ if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
+ uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
+ getZeroVector(OpVT, Subtarget, DAG, dl),
+ SubVec.getOperand(1),
+ DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
+ }
+
+    // If we're inserting into a zero vector and our input was extracted from
+    // an insert into a zero vector of the same type, and the extraction was
+    // at least as large as the original insertion, just insert the original
+    // subvector into a zero vector.
+ if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
+ isNullConstant(SubVec.getOperand(1)) &&
+ SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
+ SDValue Ins = SubVec.getOperand(0);
+ if (isNullConstant(Ins.getOperand(2)) &&
+ ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
+ Ins.getOperand(1).getValueSizeInBits().getFixedSize() <=
+ SubVecVT.getFixedSizeInBits())
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
+ getZeroVector(OpVT, Subtarget, DAG, dl),
+ Ins.getOperand(1), N->getOperand(2));
+ }
+ }
+
+ // Stop here if this is an i1 vector.
+ if (IsI1Vector)
+ return SDValue();
+
+ // If this is an insert of an extract, combine to a shuffle. Don't do this
+ // if the insert or extract can be represented with a subregister operation.
+ if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ SubVec.getOperand(0).getSimpleValueType() == OpVT &&
+ (IdxVal != 0 ||
+ !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
+ int ExtIdxVal = SubVec.getConstantOperandVal(1);
+ if (ExtIdxVal != 0) {
+ int VecNumElts = OpVT.getVectorNumElements();
+ int SubVecNumElts = SubVecVT.getVectorNumElements();
+ SmallVector<int, 64> Mask(VecNumElts);
+ // First create an identity shuffle mask.
+ for (int i = 0; i != VecNumElts; ++i)
+ Mask[i] = i;
+ // Now insert the extracted portion.
+ for (int i = 0; i != SubVecNumElts; ++i)
+ Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
+
+ return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
+ }
+ }
+
+ // Match concat_vector style patterns.
+ SmallVector<SDValue, 2> SubVectorOps;
+ if (collectConcatOps(N, SubVectorOps)) {
+ if (SDValue Fold =
+ combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
+ return Fold;
+
+ // If we're inserting all zeros into the upper half, change this to
+ // a concat with zero. We will match this to a move
+ // with implicit upper bit zeroing during isel.
+ // We do this here because we don't want combineConcatVectorOps to
+ // create INSERT_SUBVECTOR from CONCAT_VECTORS.
+ if (SubVectorOps.size() == 2 &&
+ ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
+ getZeroVector(OpVT, Subtarget, DAG, dl),
+ SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
+ }
+
+ // If this is a broadcast insert into an upper undef, use a larger broadcast.
+ if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
+ return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
+
+ // If this is a broadcast load inserted into an upper undef, use a larger
+ // broadcast load.
+ if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
+ SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
+ SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
+ SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
+ MemIntr->getMemoryVT(),
+ MemIntr->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
+ return BcastLd;
+ }
+
+ return SDValue();
+}
+
+/// If we are extracting a subvector of a vector select and the select condition
+/// is composed of concatenated vectors, try to narrow the select width. This
+/// is a common pattern for AVX1 integer code because 256-bit selects may be
+/// legal, but there is almost no integer math/logic available for 256-bit.
+/// This function should only be called with legal types (otherwise, the calls
+/// to get simple value types will assert).
+static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
+ SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
+ SmallVector<SDValue, 4> CatOps;
+ if (Sel.getOpcode() != ISD::VSELECT ||
+ !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
+ return SDValue();
+
+ // Note: We assume simple value types because this should only be called with
+ // legal operations/types.
+ // TODO: This can be extended to handle extraction to 256-bits.
+ MVT VT = Ext->getSimpleValueType(0);
+ if (!VT.is128BitVector())
+ return SDValue();
+
+ MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
+ if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
+ return SDValue();
+
+ MVT WideVT = Ext->getOperand(0).getSimpleValueType();
+ MVT SelVT = Sel.getSimpleValueType();
+ assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
+ "Unexpected vector type with legal operations");
+
+ unsigned SelElts = SelVT.getVectorNumElements();
+ unsigned CastedElts = WideVT.getVectorNumElements();
+ unsigned ExtIdx = Ext->getConstantOperandVal(1);
+ if (SelElts % CastedElts == 0) {
+ // The select has the same or more (narrower) elements than the extract
+ // operand. The extraction index gets scaled by that factor.
+ ExtIdx *= (SelElts / CastedElts);
+ } else if (CastedElts % SelElts == 0) {
+    // The select has fewer (wider) elements than the extract operand. Make
+    // sure that the extraction index can be divided evenly.
+ unsigned IndexDivisor = CastedElts / SelElts;
+ if (ExtIdx % IndexDivisor != 0)
+ return SDValue();
+ ExtIdx /= IndexDivisor;
+ } else {
+ llvm_unreachable("Element count of simple vector types are not divisible?");
+ }
+
+ unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
+ unsigned NarrowElts = SelElts / NarrowingFactor;
+ MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
+ SDLoc DL(Ext);
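+  // Extract the matching 128-bit piece of each select operand and rebuild the
+  // select at the narrower width.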
+ SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
+ SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
+ SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
+ SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
+ return DAG.getBitcast(VT, NarrowSel);
+}
+
+static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ // For AVX1 only, if we are extracting from a 256-bit and+not (which will
+ // eventually get combined/lowered into ANDNP) with a concatenated operand,
+ // split the 'and' into 128-bit ops to avoid the concatenate and extract.
+ // We let generic combining take over from there to simplify the
+ // insert/extract and 'not'.
+ // This pattern emerges during AVX1 legalization. We handle it before lowering
+ // to avoid complications like splitting constant vector loads.
+
+ // Capture the original wide type in the likely case that we need to bitcast
+ // back to this type.
+ if (!N->getValueType(0).isSimple())
+ return SDValue();
+
+ MVT VT = N->getSimpleValueType(0);
+ SDValue InVec = N->getOperand(0);
+ unsigned IdxVal = N->getConstantOperandVal(1);
+ SDValue InVecBC = peekThroughBitcasts(InVec);
+ EVT InVecVT = InVec.getValueType();
+ unsigned SizeInBits = VT.getSizeInBits();
+ unsigned InSizeInBits = InVecVT.getSizeInBits();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
+ TLI.isTypeLegal(InVecVT) &&
+ InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
+ auto isConcatenatedNot = [](SDValue V) {
+ V = peekThroughBitcasts(V);
+ if (!isBitwiseNot(V))
+ return false;
+ SDValue NotOp = V->getOperand(0);
+ return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
+ };
+ if (isConcatenatedNot(InVecBC.getOperand(0)) ||
+ isConcatenatedNot(InVecBC.getOperand(1))) {
+ // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
+ SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
+ DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
+ }
+ }
+
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ if (SDValue V = narrowExtractedVectorSelect(N, DAG))
+ return V;
+
+ if (ISD::isBuildVectorAllZeros(InVec.getNode()))
+ return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
+
+ if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
+ if (VT.getScalarType() == MVT::i1)
+ return DAG.getConstant(1, SDLoc(N), VT);
+ return getOnesVector(VT, DAG, SDLoc(N));
+ }
+
+ if (InVec.getOpcode() == ISD::BUILD_VECTOR)
+ return DAG.getBuildVector(
+ VT, SDLoc(N),
+ InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
+
+  // If we are extracting from an insert into a zero vector, replace with a
+  // smaller insert into zero as long as we access at least as much as the
+  // originally inserted subvector. Don't do this for i1 vectors.
+ if (VT.getVectorElementType() != MVT::i1 &&
+ InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
+ InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
+ ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
+ InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
+ SDLoc DL(N);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ getZeroVector(VT, Subtarget, DAG, DL),
+ InVec.getOperand(1), InVec.getOperand(2));
+ }
+
+  // If we're extracting an upper subvector from a broadcast, just extract the
+  // lowest subvector instead, which should allow SimplifyDemandedVectorElts
+  // to do more simplifications.
+ if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
+ InVec.getOpcode() == X86ISD::VBROADCAST_LOAD))
+ return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
+
+ // If we're extracting a broadcasted subvector, just use the lowest subvector.
+ if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
+ cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
+ return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
+
+ // Attempt to extract from the source of a shuffle vector.
+ if ((InSizeInBits % SizeInBits) == 0 &&
+ (IdxVal % VT.getVectorNumElements()) == 0) {
+ SmallVector<int, 32> ShuffleMask;
+ SmallVector<int, 32> ScaledMask;
+ SmallVector<SDValue, 2> ShuffleInputs;
+ unsigned NumSubVecs = InSizeInBits / SizeInBits;
+    // Decode the shuffle mask and scale it so it's shuffling subvectors.
+ if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
+ scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
+ unsigned SubVecIdx = IdxVal / VT.getVectorNumElements();
+ if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
+ return DAG.getUNDEF(VT);
+ if (ScaledMask[SubVecIdx] == SM_SentinelZero)
+ return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
+ SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
+ if (Src.getValueSizeInBits() == InSizeInBits) {
+ unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
+ unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements();
+ return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
+ SDLoc(N), SizeInBits);
+ }
+ }
+ }
+
+  // If we're extracting the lowest subvector and the source has only one use,
+  // we may be able to perform this with a smaller vector width.
+ unsigned InOpcode = InVec.getOpcode();
+ if (IdxVal == 0 && InVec.hasOneUse()) {
+ if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
+ // v2f64 CVTDQ2PD(v4i32).
+ if (InOpcode == ISD::SINT_TO_FP &&
+ InVec.getOperand(0).getValueType() == MVT::v4i32) {
+ return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
+ }
+ // v2f64 CVTUDQ2PD(v4i32).
+ if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
+ InVec.getOperand(0).getValueType() == MVT::v4i32) {
+ return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
+ }
+ // v2f64 CVTPS2PD(v4f32).
+ if (InOpcode == ISD::FP_EXTEND &&
+ InVec.getOperand(0).getValueType() == MVT::v4f32) {
+ return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
+ }
+ }
+ if ((InOpcode == ISD::ANY_EXTEND ||
+ InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
+ InOpcode == ISD::ZERO_EXTEND ||
+ InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
+ InOpcode == ISD::SIGN_EXTEND ||
+ InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
+ (SizeInBits == 128 || SizeInBits == 256) &&
+ InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
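+      // Extract the low part of the extend's source and redo the extension at
+      // the narrower width using the *_EXTEND_VECTOR_INREG form.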
+ SDLoc DL(N);
+ SDValue Ext = InVec.getOperand(0);
+ if (Ext.getValueSizeInBits() > SizeInBits)
+ Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
+ unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
+ return DAG.getNode(ExtOp, DL, VT, Ext);
+ }
+ if (InOpcode == ISD::VSELECT &&
+ InVec.getOperand(0).getValueType().is256BitVector() &&
+ InVec.getOperand(1).getValueType().is256BitVector() &&
+ InVec.getOperand(2).getValueType().is256BitVector()) {
+ SDLoc DL(N);
+ SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
+ SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
+ SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
+ return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
+ }
+ if (InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
+ (VT.is128BitVector() || VT.is256BitVector())) {
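+      // Truncate from a proportionally smaller extract of the source vector.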
+ SDLoc DL(N);
+ SDValue InVecSrc = InVec.getOperand(0);
+ unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
+ SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
+ return DAG.getNode(InOpcode, DL, VT, Ext);
+ }
+ }
+
+  // Always split vXi64 logical shifts where we're extracting the upper 32
+  // bits, as this is very likely to fold into a shuffle/truncation.
+ if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
+ InVecVT.getScalarSizeInBits() == 64 &&
+ InVec.getConstantOperandAPInt(1) == 32) {
+ SDLoc DL(N);
+ SDValue Ext =
+ extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
+ return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
+ }
+
+ return SDValue();
+}
+
+static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ SDLoc DL(N);
+
+  // If this is a scalar_to_vector to v1i1 from an AND with 1, bypass the AND.
+ // This occurs frequently in our masked scalar intrinsic code and our
+ // floating point select lowering with AVX512.
+ // TODO: SimplifyDemandedBits instead?
+ if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
+ if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
+ if (C->getAPIntValue().isOneValue())
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
+ Src.getOperand(0));
+
+ // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
+ if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
+ Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
+ if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
+ if (C->isNullValue())
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
+ Src.getOperand(1));
+
+ // Reduce v2i64 to v4i32 if we don't need the upper bits.
+ // TODO: Move to DAGCombine/SimplifyDemandedBits?
+ if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ auto IsAnyExt64 = [](SDValue Op) {
+ if (Op.getValueType() != MVT::i64 || !Op.hasOneUse())
+ return SDValue();
+ if (Op.getOpcode() == ISD::ANY_EXTEND &&
+ Op.getOperand(0).getScalarValueSizeInBits() <= 32)
+ return Op.getOperand(0);
+ if (auto *Ld = dyn_cast<LoadSDNode>(Op))
+ if (Ld->getExtensionType() == ISD::EXTLOAD &&
+ Ld->getMemoryVT().getScalarSizeInBits() <= 32)
+ return Op;
+ return SDValue();
+ };
+ if (SDValue ExtSrc = IsAnyExt64(peekThroughOneUseBitcasts(Src)))
+ return DAG.getBitcast(
+ VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
+ DAG.getAnyExtOrTrunc(ExtSrc, DL, MVT::i32)));
+ }
+
+ // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
+ if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
+ Src.getOperand(0).getValueType() == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
+
+ return SDValue();
+}
+
+// Simplify PMULDQ and PMULUDQ operations.
+static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // Canonicalize constant to RHS.
+ if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
+ return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
+
+ // Multiply by zero.
+ // Don't return RHS as it may contain UNDEFs.
+ if (ISD::isBuildVectorAllZeros(RHS.getNode()))
+ return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
+
+  // PMULDQ/PMULUDQ only use the lower 32 bits of each vector element.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
+ return SDValue(N, 0);
+
+  // If the input is an extend_invec and the SimplifyDemandedBits call didn't
+  // convert it to any_extend_invec (due to the LegalOperations check), do the
+  // conversion to a vector shuffle manually. This exposes combine
+  // opportunities missed by combineEXTEND_VECTOR_INREG not calling
+  // combineX86ShufflesRecursively on SSE4.1 targets.
+ // FIXME: This is basically a hack around several other issues related to
+ // ANY_EXTEND_VECTOR_INREG.
+ if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
+ (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
+ LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
+ LHS.getOperand(0).getValueType() == MVT::v4i32) {
+ SDLoc dl(N);
+ LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
+ LHS.getOperand(0), { 0, -1, 1, -1 });
+ LHS = DAG.getBitcast(MVT::v2i64, LHS);
+ return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
+ }
+ if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
+ (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
+ RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
+ RHS.getOperand(0).getValueType() == MVT::v4i32) {
+ SDLoc dl(N);
+ RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
+ RHS.getOperand(0), { 0, -1, 1, -1 });
+ RHS = DAG.getBitcast(MVT::v2i64, RHS);
+ return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
+ }
+
+ return SDValue();
+}
+
+static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue In = N->getOperand(0);
+ unsigned Opcode = N->getOpcode();
+ unsigned InOpcode = In.getOpcode();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // Try to merge vector loads and extend_inreg to an extload.
+ if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
+ In.hasOneUse()) {
+ auto *Ld = cast<LoadSDNode>(In);
+ if (Ld->isSimple()) {
+ MVT SVT = In.getSimpleValueType().getVectorElementType();
+ ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
+ ? ISD::SEXTLOAD
+ : ISD::ZEXTLOAD;
+ EVT MemVT =
+ EVT::getVectorVT(*DAG.getContext(), SVT, VT.getVectorNumElements());
+ if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
+ SDValue Load =
+ DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
+ Ld->getMemOperand()->getFlags());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
+ return Load;
+ }
+ }
+ }
+
+ // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
+ if (Opcode == InOpcode)
+ return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0));
+
+ // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
+ // -> EXTEND_VECTOR_INREG(X).
+ // TODO: Handle non-zero subvector indices.
+ if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
+ In.getOperand(0).getOpcode() == getOpcode_EXTEND(Opcode) &&
+ In.getOperand(0).getOperand(0).getValueSizeInBits() ==
+ In.getValueSizeInBits())
+ return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0).getOperand(0));
+
+ // Attempt to combine as a shuffle.
+ // TODO: General ZERO_EXTEND_VECTOR_INREG support.
+ if (Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
+ (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasSSE41())) {
+ SDValue Op(N, 0);
+ if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
+ return Res;
+ }
+
+ return SDValue();
+}
+
+static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ EVT VT = N->getValueType(0);
+
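+  // Shifting an all-zeros mask still gives an all-zeros mask.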
+ if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
+ return DAG.getConstant(0, SDLoc(N), VT);
+
+ APInt KnownUndef, KnownZero;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+ if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
+ KnownZero, DCI))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
+// Done as a combine because the lowering for fp16_to_fp and fp_to_fp16
+// produces extra instructions between the conversions due to going to scalar
+// and back.
+static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
+ return SDValue();
+
+ if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
+ return SDValue();
+
+ if (N->getValueType(0) != MVT::f32 ||
+ N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
+ return SDValue();
+
+ SDLoc dl(N);
+ SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
+ N->getOperand(0).getOperand(0));
+ Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
+ DAG.getTargetConstant(4, dl, MVT::i32));
+ Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
+ DAG.getIntPtrConstant(0, dl));
+}
+
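+// With F16C, lower a vector FP_EXTEND from f16 to f32/f64 through CVTPH2PS,
+// widening the input to at least 8 elements and extending the result to the
+// final destination type afterwards if needed.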
+static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
+ return SDValue();
+
+ bool IsStrict = N->isStrictFPOpcode();
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
+ EVT SrcVT = Src.getValueType();
+
+ if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
+ return SDValue();
+
+ if (VT.getVectorElementType() != MVT::f32 &&
+ VT.getVectorElementType() != MVT::f64)
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts == 1 || !isPowerOf2_32(NumElts))
+ return SDValue();
+
+ SDLoc dl(N);
+
+ // Convert the input to vXi16.
+ EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
+ Src = DAG.getBitcast(IntVT, Src);
+
+ // Widen to at least 8 input elements.
+ if (NumElts < 8) {
+ unsigned NumConcats = 8 / NumElts;
+ SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
+ : DAG.getConstant(0, dl, IntVT);
+ SmallVector<SDValue, 4> Ops(NumConcats, Fill);
+ Ops[0] = Src;
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
+ }
+
+ // Destination is vXf32 with at least 4 elements.
+ EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
+ std::max(4U, NumElts));
+ SDValue Cvt, Chain;
+ if (IsStrict) {
+ Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
+ {N->getOperand(0), Src});
+ Chain = Cvt.getValue(1);
+ } else {
+ Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
+ }
+
+ if (NumElts < 4) {
+ assert(NumElts == 2 && "Unexpected size");
+ Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ if (IsStrict) {
+ // Extend to the original VT if necessary.
+ if (Cvt.getValueType() != VT) {
+ Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
+ {Chain, Cvt});
+ Chain = Cvt.getValue(1);
+ }
+ return DAG.getMergeValues({Cvt, Chain}, dl);
+ }
+
+ // Extend to the original VT if necessary.
+ return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
+}
+
+// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
+// from. Limit this to cases where the loads have the same input chain and the
+// output chains are unused. This avoids any memory ordering issues.
+static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
+ N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
+ "Unknown broadcast load type");
+
+ // Only do this if the chain result is unused.
+ if (N->hasAnyUseOfValue(1))
+ return SDValue();
+
+ auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
+
+ SDValue Ptr = MemIntrin->getBasePtr();
+ SDValue Chain = MemIntrin->getChain();
+ EVT VT = N->getSimpleValueType(0);
+ EVT MemVT = MemIntrin->getMemoryVT();
+
+ // Look at other users of our base pointer and try to find a wider broadcast.
+ // The input chain and the size of the memory VT must match.
+ for (SDNode *User : Ptr->uses())
+ if (User != N && User->getOpcode() == N->getOpcode() &&
+ cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
+ cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
+ cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
+ MemVT.getSizeInBits() &&
+ !User->hasAnyUseOfValue(1) &&
+ User->getValueSizeInBits(0).getFixedSize() > VT.getFixedSizeInBits()) {
+ SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
+ VT.getSizeInBits());
+ Extract = DAG.getBitcast(VT, Extract);
+ return DCI.CombineTo(N, Extract, SDValue(User, 1));
+ }
+
+ return SDValue();
+}
+
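+// With F16C, lower a vector FP_ROUND from f32 to f16 through CVTPS2PH,
+// widening the input to at least v4f32 and extracting the low elements of the
+// result if necessary.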
+static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
+ SrcVT.getVectorElementType() != MVT::f32)
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts == 1 || !isPowerOf2_32(NumElts))
+ return SDValue();
+
+ SDLoc dl(N);
+
+ // Widen to at least 4 input elements.
+ if (NumElts < 4)
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
+ DAG.getConstantFP(0.0, dl, SrcVT));
+
+  // Destination is vXi16 with at least 8 elements.
+ EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ std::max(8U, NumElts));
+ SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
+ DAG.getTargetConstant(4, dl, MVT::i32));
+
+  // Extract down to the real number of elements.
+ if (NumElts < 8) {
+ EVT IntVT = VT.changeVectorElementTypeToInteger();
+ Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ return DAG.getBitcast(VT, Cvt);
+}
+
+static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
+ SDValue Src = N->getOperand(0);
+
+ // Turn MOVDQ2Q+simple_load into an mmx load.
+ if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
+ LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
+
+ if (LN->isSimple()) {
+ SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
+ LN->getBasePtr(),
+ LN->getPointerInfo(),
+ LN->getOriginalAlign(),
+ LN->getMemOperand()->getFlags());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
+ return NewLd;
+ }
+ }
+
+ return SDValue();
+}
+
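+// Simplify PDEP by letting demanded-bits analysis remove operand bits that
+// cannot affect the result.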
+static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0),
+ APInt::getAllOnesValue(NumBits), DCI))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::SCALAR_TO_VECTOR:
+ return combineScalarToVector(N, DAG);
+ case ISD::EXTRACT_VECTOR_ELT:
+ case X86ISD::PEXTRW:
+ case X86ISD::PEXTRB:
+ return combineExtractVectorElt(N, DAG, DCI, Subtarget);
+ case ISD::CONCAT_VECTORS:
+ return combineConcatVectors(N, DAG, DCI, Subtarget);
+ case ISD::INSERT_SUBVECTOR:
+ return combineInsertSubvector(N, DAG, DCI, Subtarget);
+ case ISD::EXTRACT_SUBVECTOR:
+ return combineExtractSubvector(N, DAG, DCI, Subtarget);
+ case ISD::VSELECT:
+ case ISD::SELECT:
+ case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
+ case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
+ case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
+ case X86ISD::CMP: return combineCMP(N, DAG);
+ case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
+ case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
+ case X86ISD::ADD:
+ case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
+ case X86ISD::SBB: return combineSBB(N, DAG);
+ case X86ISD::ADC: return combineADC(N, DAG, DCI);
+ case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
+ case ISD::SHL: return combineShiftLeft(N, DAG);
+ case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
+ case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
+ case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
+ case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
+ case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
+ case X86ISD::BEXTR:
+ case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
+ case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
+ case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
+ case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
+ case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
+ case X86ISD::VEXTRACT_STORE:
+ return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
+ case ISD::SINT_TO_FP:
+ case ISD::STRICT_SINT_TO_FP:
+ return combineSIntToFP(N, DAG, DCI, Subtarget);
+ case ISD::UINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP:
+ return combineUIntToFP(N, DAG, Subtarget);
+ case ISD::FADD:
+ case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
+ case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
+ case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
+ case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
+ case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
+ case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
+ case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
+ case X86ISD::FXOR:
+ case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
+ case X86ISD::FMIN:
+ case X86ISD::FMAX: return combineFMinFMax(N, DAG);
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
+ case X86ISD::CVTSI2P:
+ case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
+ case X86ISD::CVTP2SI:
+ case X86ISD::CVTP2UI:
+ case X86ISD::STRICT_CVTTP2SI:
+ case X86ISD::CVTTP2SI:
+ case X86ISD::STRICT_CVTTP2UI:
+ case X86ISD::CVTTP2UI:
+ return combineCVTP2I_CVTTP2I(N, DAG, DCI);
+ case X86ISD::STRICT_CVTPH2PS:
+ case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
+ case X86ISD::BT: return combineBT(N, DAG, DCI);
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
+ case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
+ case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
+ case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
+ case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
+ case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
+ case X86ISD::PACKSS:
+ case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
+ case X86ISD::HADD:
+ case X86ISD::HSUB:
+ case X86ISD::FHADD:
+ case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
+ case X86ISD::VSHL:
+ case X86ISD::VSRA:
+ case X86ISD::VSRL:
+ return combineVectorShiftVar(N, DAG, DCI, Subtarget);
+ case X86ISD::VSHLI:
+ case X86ISD::VSRAI:
+ case X86ISD::VSRLI:
+ return combineVectorShiftImm(N, DAG, DCI, Subtarget);
+ case ISD::INSERT_VECTOR_ELT:
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
+ case X86ISD::SHUFP: // Handle all target specific shuffles
+ case X86ISD::INSERTPS:
+ case X86ISD::EXTRQI:
+ case X86ISD::INSERTQI:
+ case X86ISD::VALIGN:
+ case X86ISD::PALIGNR:
+ case X86ISD::VSHLDQ:
+ case X86ISD::VSRLDQ:
+ case X86ISD::BLENDI:
+ case X86ISD::UNPCKH:
+ case X86ISD::UNPCKL:
+ case X86ISD::MOVHLPS:
+ case X86ISD::MOVLHPS:
+ case X86ISD::PSHUFB:
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFLW:
+ case X86ISD::MOVSHDUP:
+ case X86ISD::MOVSLDUP:
+ case X86ISD::MOVDDUP:
+ case X86ISD::MOVSS:
+ case X86ISD::MOVSD:
+ case X86ISD::VBROADCAST:
+ case X86ISD::VPPERM:
+ case X86ISD::VPERMI:
+ case X86ISD::VPERMV:
+ case X86ISD::VPERMV3:
+ case X86ISD::VPERMIL2:
+ case X86ISD::VPERMILPI:
+ case X86ISD::VPERMILPV:
+ case X86ISD::VPERM2X128:
+ case X86ISD::SHUF128:
+ case X86ISD::VZEXT_MOVL:
+  case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
+ case X86ISD::FMADD_RND:
+ case X86ISD::FMSUB:
+ case X86ISD::STRICT_FMSUB:
+ case X86ISD::FMSUB_RND:
+ case X86ISD::FNMADD:
+ case X86ISD::STRICT_FNMADD:
+ case X86ISD::FNMADD_RND:
+ case X86ISD::FNMSUB:
+ case X86ISD::STRICT_FNMSUB:
+ case X86ISD::FNMSUB_RND:
+ case ISD::FMA:
+ case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
+ case X86ISD::FMADDSUB_RND:
+ case X86ISD::FMSUBADD_RND:
+ case X86ISD::FMADDSUB:
+ case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
+ case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
+ case X86ISD::MGATHER:
+ case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
+ case ISD::MGATHER:
+ case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
+ case X86ISD::PCMPEQ:
+ case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
+ case X86ISD::PMULDQ:
+ case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
+ case X86ISD::KSHIFTL:
+ case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
+ case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
+ case ISD::STRICT_FP_EXTEND:
+ case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
+ case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
+ case X86ISD::VBROADCAST_LOAD:
+ case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
+ case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
+ case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
+ }
+
+ return SDValue();
+}
+
+bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
+ if (!isTypeLegal(VT))
+ return false;
+
+ // There are no vXi8 shifts.
+ if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
+ return false;
+
+ // TODO: Almost no 8-bit ops are desirable because they have no actual
+ // size/speed advantages vs. 32-bit ops, but they do have a major
+ // potential disadvantage by causing partial register stalls.
+ //
+ // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
+ // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
+ // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
+ // check for a constant operand to the multiply.
+ if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
+ return false;
+
+ // i16 instruction encodings are longer and some i16 instructions are slow,
+ // so those are not desirable.
+ if (VT == MVT::i16) {
+ switch (Opc) {
+ default:
+ break;
+ case ISD::LOAD:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::SUB:
+ case ISD::ADD:
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ return false;
+ }
+ }
+
+  // Any legal type not explicitly accounted for above is desirable.
+ return true;
+}
+
+SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
+ SDValue Value, SDValue Addr,
+ SelectionDAG &DAG) const {
+ const Module *M = DAG.getMachineFunction().getMMI().getModule();
+ Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
+ if (IsCFProtectionSupported) {
+    // When control-flow branch protection is enabled, we need to add a
+    // notrack prefix to the indirect branch. To do that we create an
+    // NT_BRIND SDNode; upon ISel, the pattern will convert it to a jmp with
+    // the notrack prefix.
+ return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
+ }
+
+ return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
+}
+
+bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
+ EVT VT = Op.getValueType();
+ bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
+ isa<ConstantSDNode>(Op.getOperand(1));
+
+ // i16 is legal, but undesirable since i16 instruction encodings are longer
+ // and some i16 instructions are slow.
+ // 8-bit multiply-by-constant can usually be expanded to something cheaper
+ // using LEA and/or other ALU ops.
+ if (VT != MVT::i16 && !Is8BitMulByConstant)
+ return false;
+
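+  // Check whether Op's only use is a store back to the address of the load
+  // feeding it, i.e. a candidate read-modify-write memory instruction.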
+ auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
+ if (!Op.hasOneUse())
+ return false;
+ SDNode *User = *Op->use_begin();
+ if (!ISD::isNormalStore(User))
+ return false;
+ auto *Ld = cast<LoadSDNode>(Load);
+ auto *St = cast<StoreSDNode>(User);
+ return Ld->getBasePtr() == St->getBasePtr();
+ };
+
+ auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
+ if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
+ return false;
+ if (!Op.hasOneUse())
+ return false;
+ SDNode *User = *Op->use_begin();
+ if (User->getOpcode() != ISD::ATOMIC_STORE)
+ return false;
+ auto *Ld = cast<AtomicSDNode>(Load);
+ auto *St = cast<AtomicSDNode>(User);
+ return Ld->getBasePtr() == St->getBasePtr();
+ };
+
+ bool Commute = false;
+ switch (Op.getOpcode()) {
+ default: return false;
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ break;
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL: {
+ SDValue N0 = Op.getOperand(0);
+ // Look out for (store (shl (load), x)).
+ if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
+ return false;
+ break;
+ }
+ case ISD::ADD:
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ Commute = true;
+ LLVM_FALLTHROUGH;
+ case ISD::SUB: {
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ // Avoid disabling potential load folding opportunities.
+ if (MayFoldLoad(N1) &&
+ (!Commute || !isa<ConstantSDNode>(N0) ||
+ (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
+ return false;
+ if (MayFoldLoad(N0) &&
+ ((Commute && !isa<ConstantSDNode>(N1)) ||
+ (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
+ return false;
+ if (IsFoldableAtomicRMW(N0, Op) ||
+ (Commute && IsFoldableAtomicRMW(N1, Op)))
+ return false;
+ }
+ }
+
+ PVT = MVT::i32;
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+// Helper to match a string against a sequence of whitespace-separated pieces.
+static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
+ S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
+
+ for (StringRef Piece : Pieces) {
+ if (!S.startswith(Piece)) // Check if the piece matches.
+ return false;
+
+ S = S.substr(Piece.size());
+ StringRef::size_type Pos = S.find_first_not_of(" \t");
+ if (Pos == 0) // We matched a prefix.
+ return false;
+
+ S = S.substr(Pos);
+ }
+
+ return S.empty();
+}
+
+static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
+
+ if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
+ if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
+ std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
+ std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
+
+ if (AsmPieces.size() == 3)
+ return true;
+ else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
+ return true;
+ }
+ }
+ return false;
+}
+
+bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
+ InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
+
+ const std::string &AsmStr = IA->getAsmString();
+
+ IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ if (!Ty || Ty->getBitWidth() % 16 != 0)
+ return false;
+
+ // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
+ SmallVector<StringRef, 4> AsmPieces;
+ SplitString(AsmStr, AsmPieces, ";\n");
+
+ switch (AsmPieces.size()) {
+ default: return false;
+ case 1:
+ // FIXME: this should verify that we are targeting a 486 or better. If not,
+ // we will turn this bswap into something that will be lowered to logical
+ // ops instead of emitting the bswap asm. For now, we don't support 486 or
+ // lower so don't worry about this.
+ // bswap $0
+ if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
+ matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
+ matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
+ matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
+ matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
+ matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
+ // No need to check constraints, nothing other than the equivalent of
+ // "=r,0" would be valid here.
+ return IntrinsicLowering::LowerToByteSwap(CI);
+ }
+
+ // rorw $$8, ${0:w} --> llvm.bswap.i16
+ if (CI->getType()->isIntegerTy(16) &&
+ IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
+ (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
+ matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
+ AsmPieces.clear();
+ StringRef ConstraintsStr = IA->getConstraintString();
+ SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
+ array_pod_sort(AsmPieces.begin(), AsmPieces.end());
+ if (clobbersFlagRegisters(AsmPieces))
+ return IntrinsicLowering::LowerToByteSwap(CI);
+ }
+ break;
+ case 3:
+ if (CI->getType()->isIntegerTy(32) &&
+ IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
+ matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
+ matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
+ matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
+ AsmPieces.clear();
+ StringRef ConstraintsStr = IA->getConstraintString();
+ SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
+ array_pod_sort(AsmPieces.begin(), AsmPieces.end());
+ if (clobbersFlagRegisters(AsmPieces))
+ return IntrinsicLowering::LowerToByteSwap(CI);
+ }
+
+ if (CI->getType()->isIntegerTy(64)) {
+ InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
+ if (Constraints.size() >= 2 &&
+ Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
+ Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
+ // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
+ if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
+ matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
+ matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
+ return IntrinsicLowering::LowerToByteSwap(CI);
+ }
+ }
+ break;
+ }
+ return false;
+}
+
+static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
+ X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
+ .Case("{@cca}", X86::COND_A)
+ .Case("{@ccae}", X86::COND_AE)
+ .Case("{@ccb}", X86::COND_B)
+ .Case("{@ccbe}", X86::COND_BE)
+ .Case("{@ccc}", X86::COND_B)
+ .Case("{@cce}", X86::COND_E)
+ .Case("{@ccz}", X86::COND_E)
+ .Case("{@ccg}", X86::COND_G)
+ .Case("{@ccge}", X86::COND_GE)
+ .Case("{@ccl}", X86::COND_L)
+ .Case("{@ccle}", X86::COND_LE)
+ .Case("{@ccna}", X86::COND_BE)
+ .Case("{@ccnae}", X86::COND_B)
+ .Case("{@ccnb}", X86::COND_AE)
+ .Case("{@ccnbe}", X86::COND_A)
+ .Case("{@ccnc}", X86::COND_AE)
+ .Case("{@ccne}", X86::COND_NE)
+ .Case("{@ccnz}", X86::COND_NE)
+ .Case("{@ccng}", X86::COND_LE)
+ .Case("{@ccnge}", X86::COND_L)
+ .Case("{@ccnl}", X86::COND_GE)
+ .Case("{@ccnle}", X86::COND_G)
+ .Case("{@ccno}", X86::COND_NO)
+ .Case("{@ccnp}", X86::COND_NP)
+ .Case("{@ccns}", X86::COND_NS)
+ .Case("{@cco}", X86::COND_O)
+ .Case("{@ccp}", X86::COND_P)
+ .Case("{@ccs}", X86::COND_S)
+ .Default(X86::COND_INVALID);
+ return Cond;
+}
+
+/// Given a constraint letter, return the type of constraint for this target.
+X86TargetLowering::ConstraintType
+X86TargetLowering::getConstraintType(StringRef Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'R':
+ case 'q':
+ case 'Q':
+ case 'f':
+ case 't':
+ case 'u':
+ case 'y':
+ case 'x':
+ case 'v':
+ case 'l':
+ case 'k': // AVX512 masking registers.
+ return C_RegisterClass;
+ case 'a':
+ case 'b':
+ case 'c':
+ case 'd':
+ case 'S':
+ case 'D':
+ case 'A':
+ return C_Register;
+ case 'I':
+ case 'J':
+ case 'K':
+ case 'N':
+ case 'G':
+ case 'L':
+ case 'M':
+ return C_Immediate;
+ case 'C':
+ case 'e':
+ case 'Z':
+ return C_Other;
+ default:
+ break;
+ }
+  } else if (Constraint.size() == 2) {
+ switch (Constraint[0]) {
+ default:
+ break;
+ case 'Y':
+ switch (Constraint[1]) {
+ default:
+ break;
+ case 'z':
+ return C_Register;
+ case 'i':
+ case 'm':
+ case 'k':
+ case 't':
+ case '2':
+ return C_RegisterClass;
+ }
+ }
+ } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
+ return C_Other;
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+ X86TargetLowering::getSingleConstraintMatchWeight(
+ AsmOperandInfo &info, const char *constraint) const {
+ ConstraintWeight weight = CW_Invalid;
+ Value *CallOperandVal = info.CallOperandVal;
+ // If we don't have a value, we can't do a match,
+ // but allow it at the lowest weight.
+ if (!CallOperandVal)
+ return CW_Default;
+ Type *type = CallOperandVal->getType();
+ // Look at the constraint type.
+ switch (*constraint) {
+ default:
+ weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+ LLVM_FALLTHROUGH;
+ case 'R':
+ case 'q':
+ case 'Q':
+ case 'a':
+ case 'b':
+ case 'c':
+ case 'd':
+ case 'S':
+ case 'D':
+ case 'A':
+ if (CallOperandVal->getType()->isIntegerTy())
+ weight = CW_SpecificReg;
+ break;
+ case 'f':
+ case 't':
+ case 'u':
+ if (type->isFloatingPointTy())
+ weight = CW_SpecificReg;
+ break;
+ case 'y':
+ if (type->isX86_MMXTy() && Subtarget.hasMMX())
+ weight = CW_SpecificReg;
+ break;
+ case 'Y':
+ if (StringRef(constraint).size() != 2)
+ break;
+ switch (constraint[1]) {
+ default:
+ return CW_Invalid;
+ // XMM0
+ case 'z':
+ if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
+ ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
+ ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
+ return CW_SpecificReg;
+ return CW_Invalid;
+ // Conditional OpMask regs (AVX512)
+ case 'k':
+ if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
+ return CW_Register;
+ return CW_Invalid;
+ // Any MMX reg
+ case 'm':
+ if (type->isX86_MMXTy() && Subtarget.hasMMX())
+ return weight;
+ return CW_Invalid;
+ // Any SSE reg when ISA >= SSE2, same as 'x'
+ case 'i':
+ case 't':
+ case '2':
+ if (!Subtarget.hasSSE2())
+ return CW_Invalid;
+ break;
+ }
+ break;
+ case 'v':
+ if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
+ weight = CW_Register;
+ LLVM_FALLTHROUGH;
+ case 'x':
+ if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
+ ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
+ weight = CW_Register;
+ break;
+ case 'k':
+ // Enable conditional vector operations using %k<#> registers.
+ if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
+ weight = CW_Register;
+ break;
+ case 'I':
+ if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
+ if (C->getZExtValue() <= 31)
+ weight = CW_Constant;
+ }
+ break;
+ case 'J':
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if (C->getZExtValue() <= 63)
+ weight = CW_Constant;
+ }
+ break;
+ case 'K':
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
+ weight = CW_Constant;
+ }
+ break;
+ case 'L':
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
+ weight = CW_Constant;
+ }
+ break;
+ case 'M':
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if (C->getZExtValue() <= 3)
+ weight = CW_Constant;
+ }
+ break;
+ case 'N':
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if (C->getZExtValue() <= 0xff)
+ weight = CW_Constant;
+ }
+ break;
+ case 'G':
+ case 'C':
+ if (isa<ConstantFP>(CallOperandVal)) {
+ weight = CW_Constant;
+ }
+ break;
+ case 'e':
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if ((C->getSExtValue() >= -0x80000000LL) &&
+ (C->getSExtValue() <= 0x7fffffffLL))
+ weight = CW_Constant;
+ }
+ break;
+ case 'Z':
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if (C->getZExtValue() <= 0xffffffff)
+ weight = CW_Constant;
+ }
+ break;
+ }
+ return weight;
+}
+
+/// Try to replace an X constraint, which matches anything, with another that
+/// has more specific requirements based on the type of the corresponding
+/// operand.
+const char *X86TargetLowering::
+LowerXConstraint(EVT ConstraintVT) const {
+ // FP X constraints get lowered to SSE1/2 registers if available, otherwise
+ // 'f' like normal targets.
+ if (ConstraintVT.isFloatingPoint()) {
+ if (Subtarget.hasSSE1())
+ return "x";
+ }
+
+ return TargetLowering::LowerXConstraint(ConstraintVT);
+}
+
+// Lower @cc targets via setcc.
+SDValue X86TargetLowering::LowerAsmOutputForConstraint(
+ SDValue &Chain, SDValue &Flag, const SDLoc &DL,
+ const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
+ X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
+ if (Cond == X86::COND_INVALID)
+ return SDValue();
+ // Check that return type is valid.
+ if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
+ OpInfo.ConstraintVT.getSizeInBits() < 8)
+ report_fatal_error("Flag output operand is of invalid type");
+
+ // Get EFLAGS register. Only update chain when copyfrom is glued.
+ if (Flag.getNode()) {
+ Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
+ Chain = Flag.getValue(1);
+ } else
+ Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
+ // Extract CC code.
+ SDValue CC = getSETCC(Cond, Flag, DL, DAG);
+  // Zero-extend to the constraint's result type.
+ SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
+
+ return Result;
+}
+
+/// Lower the specified operand into the Ops vector.
+/// If it is invalid, don't add anything to Ops.
+void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue>&Ops,
+ SelectionDAG &DAG) const {
+ SDValue Result;
+
+ // Only support length 1 constraints for now.
+ if (Constraint.length() > 1) return;
+
+ char ConstraintLetter = Constraint[0];
+ switch (ConstraintLetter) {
+ default: break;
+ case 'I':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 31) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'J':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 63) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'K':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (isInt<8>(C->getSExtValue())) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'L':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
+ (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
+ Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'M':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 3) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'N':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 255) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'O':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 127) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'e': {
+ // 32-bit signed value
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
+ C->getSExtValue())) {
+ // Widen to 64 bits here to get it sign extended.
+ Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
+ break;
+ }
+ // FIXME gcc accepts some relocatable values here too, but only in certain
+ // memory models; it's complicated.
+ }
+ return;
+ }
+ case 'Z': {
+ // 32-bit unsigned value
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
+ C->getZExtValue())) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
+ break;
+ }
+ }
+ // FIXME gcc accepts some relocatable values here too, but only in certain
+ // memory models; it's complicated.
+ return;
+ }
+ case 'i': {
+ // Literal immediates are always ok.
+ if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
+ bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
+ BooleanContent BCont = getBooleanContents(MVT::i64);
+ ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
+ : ISD::SIGN_EXTEND;
+ int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
+ : CST->getSExtValue();
+ Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
+ break;
+ }
+
+ // In any sort of PIC mode addresses need to be computed at runtime by
+ // adding in a register or some sort of table lookup. These can't
+ // be used as immediates.
+ if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
+ return;
+
+ // If we are in non-pic codegen mode, we allow the address of a global (with
+ // an optional displacement) to be used with 'i'.
+ if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
+ // If we require an extra load to get this address, as in PIC mode, we
+ // can't accept it.
+ if (isGlobalStubReference(
+ Subtarget.classifyGlobalReference(GA->getGlobal())))
+ return;
+ break;
+ }
+ }
+
+ if (Result.getNode()) {
+ Ops.push_back(Result);
+ return;
+ }
+ return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
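+
+// Illustrative sketch, not part of the lowering itself: the single-letter
+// immediate constraints handled above follow the GCC x86 machine-constraint
+// ranges, e.g. in C/C++ extended asm:
+//
+//   int x = 1;
+//   unsigned char v = 0;
+//   asm ("shll %1, %0" : "+r"(x) : "I"(3));             // 'I': constant 0..31
+//   asm volatile ("outb %0, %1" : : "a"(v), "N"(0x80)); // 'N': constant 0..255
+//
+// A constant outside the requested range is rejected by returning without
+// pushing anything onto Ops, which the caller then diagnoses as an invalid
+// operand.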
+
+/// Check if \p RC is a general purpose register class.
+/// I.e., GR* or one of their variants.
+static bool isGRClass(const TargetRegisterClass &RC) {
+ return RC.hasSuperClassEq(&X86::GR8RegClass) ||
+ RC.hasSuperClassEq(&X86::GR16RegClass) ||
+ RC.hasSuperClassEq(&X86::GR32RegClass) ||
+ RC.hasSuperClassEq(&X86::GR64RegClass) ||
+ RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
+}
+
+/// Check if \p RC is a vector register class.
+/// I.e., FR* / VR* or one of their variants.
+static bool isFRClass(const TargetRegisterClass &RC) {
+ return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
+ RC.hasSuperClassEq(&X86::FR64XRegClass) ||
+ RC.hasSuperClassEq(&X86::VR128XRegClass) ||
+ RC.hasSuperClassEq(&X86::VR256XRegClass) ||
+ RC.hasSuperClassEq(&X86::VR512RegClass);
+}
+
+/// Check if \p RC is a mask register class.
+/// I.e., VK* or one of their variants.
+static bool isVKClass(const TargetRegisterClass &RC) {
+ return RC.hasSuperClassEq(&X86::VK1RegClass) ||
+ RC.hasSuperClassEq(&X86::VK2RegClass) ||
+ RC.hasSuperClassEq(&X86::VK4RegClass) ||
+ RC.hasSuperClassEq(&X86::VK8RegClass) ||
+ RC.hasSuperClassEq(&X86::VK16RegClass) ||
+ RC.hasSuperClassEq(&X86::VK32RegClass) ||
+ RC.hasSuperClassEq(&X86::VK64RegClass);
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint,
+ MVT VT) const {
+ // First, see if this is a constraint that directly corresponds to an LLVM
+ // register class.
+ if (Constraint.size() == 1) {
+ // GCC Constraint Letters
+ switch (Constraint[0]) {
+ default: break;
+ // 'A' means [ER]AX + [ER]DX.
+ case 'A':
+ if (Subtarget.is64Bit())
+ return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
+ assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
+ "Expecting 64, 32 or 16 bit subtarget");
+ return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
+
+ // TODO: Slight differences here in allocation order and leaving
+ // RIP in the class. Do they matter any more here than they do
+ // in the normal allocation?
+ case 'k':
+ if (Subtarget.hasAVX512()) {
+ if (VT == MVT::i1)
+ return std::make_pair(0U, &X86::VK1RegClass);
+ if (VT == MVT::i8)
+ return std::make_pair(0U, &X86::VK8RegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::VK16RegClass);
+ }
+ if (Subtarget.hasBWI()) {
+ if (VT == MVT::i32)
+ return std::make_pair(0U, &X86::VK32RegClass);
+ if (VT == MVT::i64)
+ return std::make_pair(0U, &X86::VK64RegClass);
+ }
+ break;
+ case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
+ if (Subtarget.is64Bit()) {
+ if (VT == MVT::i8 || VT == MVT::i1)
+ return std::make_pair(0U, &X86::GR8RegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16RegClass);
+ if (VT == MVT::i32 || VT == MVT::f32)
+ return std::make_pair(0U, &X86::GR32RegClass);
+ if (VT != MVT::f80)
+ return std::make_pair(0U, &X86::GR64RegClass);
+ break;
+ }
+ LLVM_FALLTHROUGH;
+ // 32-bit fallthrough
+ case 'Q': // Q_REGS
+ if (VT == MVT::i8 || VT == MVT::i1)
+ return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16_ABCDRegClass);
+ if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
+ return std::make_pair(0U, &X86::GR32_ABCDRegClass);
+ if (VT != MVT::f80)
+ return std::make_pair(0U, &X86::GR64_ABCDRegClass);
+ break;
+ case 'r': // GENERAL_REGS
+ case 'l': // INDEX_REGS
+ if (VT == MVT::i8 || VT == MVT::i1)
+ return std::make_pair(0U, &X86::GR8RegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16RegClass);
+ if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
+ return std::make_pair(0U, &X86::GR32RegClass);
+ if (VT != MVT::f80)
+ return std::make_pair(0U, &X86::GR64RegClass);
+ break;
+ case 'R': // LEGACY_REGS
+ if (VT == MVT::i8 || VT == MVT::i1)
+ return std::make_pair(0U, &X86::GR8_NOREXRegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16_NOREXRegClass);
+ if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
+ return std::make_pair(0U, &X86::GR32_NOREXRegClass);
+ if (VT != MVT::f80)
+ return std::make_pair(0U, &X86::GR64_NOREXRegClass);
+ break;
+ case 'f': // FP Stack registers.
+ // If SSE is enabled for this VT, use f80 to ensure the isel moves the
+ // value to the correct fpstack register class.
+ if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
+ return std::make_pair(0U, &X86::RFP32RegClass);
+ if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
+ return std::make_pair(0U, &X86::RFP64RegClass);
+ if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
+ return std::make_pair(0U, &X86::RFP80RegClass);
+ break;
+ case 'y': // MMX_REGS if MMX allowed.
+ if (!Subtarget.hasMMX()) break;
+ return std::make_pair(0U, &X86::VR64RegClass);
+ case 'v':
+ case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
+ if (!Subtarget.hasSSE1()) break;
+ bool VConstraint = (Constraint[0] == 'v');
+
+ switch (VT.SimpleTy) {
+ default: break;
+ // Scalar SSE types.
+ case MVT::f32:
+ case MVT::i32:
+ if (VConstraint && Subtarget.hasVLX())
+ return std::make_pair(0U, &X86::FR32XRegClass);
+ return std::make_pair(0U, &X86::FR32RegClass);
+ case MVT::f64:
+ case MVT::i64:
+ if (VConstraint && Subtarget.hasVLX())
+ return std::make_pair(0U, &X86::FR64XRegClass);
+ return std::make_pair(0U, &X86::FR64RegClass);
+ case MVT::i128:
+ if (Subtarget.is64Bit()) {
+ if (VConstraint && Subtarget.hasVLX())
+ return std::make_pair(0U, &X86::VR128XRegClass);
+ return std::make_pair(0U, &X86::VR128RegClass);
+ }
+ break;
+ // Vector types and fp128.
+ case MVT::f128:
+ case MVT::v16i8:
+ case MVT::v8i16:
+ case MVT::v4i32:
+ case MVT::v2i64:
+ case MVT::v4f32:
+ case MVT::v2f64:
+ if (VConstraint && Subtarget.hasVLX())
+ return std::make_pair(0U, &X86::VR128XRegClass);
+ return std::make_pair(0U, &X86::VR128RegClass);
+ // AVX types.
+ case MVT::v32i8:
+ case MVT::v16i16:
+ case MVT::v8i32:
+ case MVT::v4i64:
+ case MVT::v8f32:
+ case MVT::v4f64:
+ if (VConstraint && Subtarget.hasVLX())
+ return std::make_pair(0U, &X86::VR256XRegClass);
+ if (Subtarget.hasAVX())
+ return std::make_pair(0U, &X86::VR256RegClass);
+ break;
+ case MVT::v64i8:
+ case MVT::v32i16:
+ case MVT::v8f64:
+ case MVT::v16f32:
+ case MVT::v16i32:
+ case MVT::v8i64:
+ if (!Subtarget.hasAVX512()) break;
+ if (VConstraint)
+ return std::make_pair(0U, &X86::VR512RegClass);
+ return std::make_pair(0U, &X86::VR512_0_15RegClass);
+ }
+ break;
+ }
+ } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
+ switch (Constraint[1]) {
+ default:
+ break;
+ case 'i':
+ case 't':
+ case '2':
+ return getRegForInlineAsmConstraint(TRI, "x", VT);
+ case 'm':
+ if (!Subtarget.hasMMX()) break;
+ return std::make_pair(0U, &X86::VR64RegClass);
+ case 'z':
+ if (!Subtarget.hasSSE1()) break;
+ switch (VT.SimpleTy) {
+ default: break;
+ // Scalar SSE types.
+ case MVT::f32:
+ case MVT::i32:
+ return std::make_pair(X86::XMM0, &X86::FR32RegClass);
+ case MVT::f64:
+ case MVT::i64:
+ return std::make_pair(X86::XMM0, &X86::FR64RegClass);
+ case MVT::f128:
+ case MVT::v16i8:
+ case MVT::v8i16:
+ case MVT::v4i32:
+ case MVT::v2i64:
+ case MVT::v4f32:
+ case MVT::v2f64:
+ return std::make_pair(X86::XMM0, &X86::VR128RegClass);
+ // AVX types.
+ case MVT::v32i8:
+ case MVT::v16i16:
+ case MVT::v8i32:
+ case MVT::v4i64:
+ case MVT::v8f32:
+ case MVT::v4f64:
+ if (Subtarget.hasAVX())
+ return std::make_pair(X86::YMM0, &X86::VR256RegClass);
+ break;
+ case MVT::v64i8:
+ case MVT::v32i16:
+ case MVT::v8f64:
+ case MVT::v16f32:
+ case MVT::v16i32:
+ case MVT::v8i64:
+ if (Subtarget.hasAVX512())
+ return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
+ break;
+ }
+ break;
+ case 'k':
+      // This register class doesn't allocate k0 for masked vector operations.
+ if (Subtarget.hasAVX512()) {
+ if (VT == MVT::i1)
+ return std::make_pair(0U, &X86::VK1WMRegClass);
+ if (VT == MVT::i8)
+ return std::make_pair(0U, &X86::VK8WMRegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::VK16WMRegClass);
+ }
+ if (Subtarget.hasBWI()) {
+ if (VT == MVT::i32)
+ return std::make_pair(0U, &X86::VK32WMRegClass);
+ if (VT == MVT::i64)
+ return std::make_pair(0U, &X86::VK64WMRegClass);
+ }
+ break;
+ }
+ }
+
+ if (parseConstraintCode(Constraint) != X86::COND_INVALID)
+ return std::make_pair(0U, &X86::GR32RegClass);
+
+ // Use the default implementation in TargetLowering to convert the register
+ // constraint into a member of a register class.
+ std::pair<Register, const TargetRegisterClass*> Res;
+ Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+
+ // Not found as a standard register?
+ if (!Res.second) {
+ // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
+ // to/from f80.
+ if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
+      // Map st(0) through st(7) to the corresponding FP register.
+ if (Constraint.size() == 7 && Constraint[0] == '{' &&
+ tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
+ Constraint[3] == '(' &&
+ (Constraint[4] >= '0' && Constraint[4] <= '7') &&
+ Constraint[5] == ')' && Constraint[6] == '}') {
+ // st(7) is not allocatable and thus not a member of RFP80. Return
+ // singleton class in cases where we have a reference to it.
+ if (Constraint[4] == '7')
+ return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
+ return std::make_pair(X86::FP0 + Constraint[4] - '0',
+ &X86::RFP80RegClass);
+ }
+
+ // GCC allows "st(0)" to be called just plain "st".
+ if (StringRef("{st}").equals_lower(Constraint))
+ return std::make_pair(X86::FP0, &X86::RFP80RegClass);
+ }
+
+ // flags -> EFLAGS
+ if (StringRef("{flags}").equals_lower(Constraint))
+ return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
+
+ // dirflag -> DF
+ // Only allow for clobber.
+ if (StringRef("{dirflag}").equals_lower(Constraint) && VT == MVT::Other)
+ return std::make_pair(X86::DF, &X86::DFCCRRegClass);
+
+ // fpsr -> FPSW
+ if (StringRef("{fpsr}").equals_lower(Constraint))
+ return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
+
+ return Res;
+ }
+
+ // Make sure it isn't a register that requires 64-bit mode.
+ if (!Subtarget.is64Bit() &&
+ (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
+ TRI->getEncodingValue(Res.first) >= 8) {
+ // Register requires REX prefix, but we're in 32-bit mode.
+ return std::make_pair(0, nullptr);
+ }
+
+ // Make sure it isn't a register that requires AVX512.
+ if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
+ TRI->getEncodingValue(Res.first) & 0x10) {
+ // Register requires EVEX prefix.
+ return std::make_pair(0, nullptr);
+ }
+
+ // Otherwise, check to see if this is a register class of the wrong value
+ // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
+ // turn into {ax},{dx}.
+ // MVT::Other is used to specify clobber names.
+ if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
+ return Res; // Correct type already, nothing to do.
+
+  // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
+  // return "eax". This should even work for things like getting 64-bit integer
+ // registers when given an f64 type.
+ const TargetRegisterClass *Class = Res.second;
+ // The generic code will match the first register class that contains the
+ // given register. Thus, based on the ordering of the tablegened file,
+ // the "plain" GR classes might not come first.
+ // Therefore, use a helper method.
+ if (isGRClass(*Class)) {
+ unsigned Size = VT.getSizeInBits();
+ if (Size == 1) Size = 8;
+ Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
+ if (DestReg > 0) {
+ bool is64Bit = Subtarget.is64Bit();
+ const TargetRegisterClass *RC =
+ Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
+ : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
+ : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
+ : Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
+ : nullptr;
+ if (Size == 64 && !is64Bit) {
+ // Model GCC's behavior here and select a fixed pair of 32-bit
+ // registers.
+ switch (DestReg) {
+ case X86::RAX:
+ return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
+ case X86::RDX:
+ return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
+ case X86::RCX:
+ return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
+ case X86::RBX:
+ return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
+ case X86::RSI:
+ return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
+ case X86::RDI:
+ return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
+ case X86::RBP:
+ return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
+ default:
+ return std::make_pair(0, nullptr);
+ }
+ }
+ if (RC && RC->contains(DestReg))
+ return std::make_pair(DestReg, RC);
+ return Res;
+ }
+ // No register found/type mismatch.
+ return std::make_pair(0, nullptr);
+ } else if (isFRClass(*Class)) {
+ // Handle references to XMM physical registers that got mapped into the
+ // wrong class. This can happen with constraints like {xmm0} where the
+ // target independent register mapper will just pick the first match it can
+ // find, ignoring the required type.
+
+ // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
+ if (VT == MVT::f32 || VT == MVT::i32)
+ Res.second = &X86::FR32XRegClass;
+ else if (VT == MVT::f64 || VT == MVT::i64)
+ Res.second = &X86::FR64XRegClass;
+ else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
+ Res.second = &X86::VR128XRegClass;
+ else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
+ Res.second = &X86::VR256XRegClass;
+ else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
+ Res.second = &X86::VR512RegClass;
+ else {
+      // Type mismatch and not a clobber: Return an error.
+ Res.first = 0;
+ Res.second = nullptr;
+ }
+ } else if (isVKClass(*Class)) {
+ if (VT == MVT::i1)
+ Res.second = &X86::VK1RegClass;
+ else if (VT == MVT::i8)
+ Res.second = &X86::VK8RegClass;
+ else if (VT == MVT::i16)
+ Res.second = &X86::VK16RegClass;
+ else if (VT == MVT::i32)
+ Res.second = &X86::VK32RegClass;
+ else if (VT == MVT::i64)
+ Res.second = &X86::VK64RegClass;
+ else {
+      // Type mismatch and not a clobber: Return an error.
+ Res.first = 0;
+ Res.second = nullptr;
+ }
+ }
+
+ return Res;
+}
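+
+// Illustrative sketch, not upstream code: typical constraints that reach this
+// hook from C/C++ extended asm.
+//
+//   unsigned lo, hi;
+//   asm ("rdtsc" : "=a"(lo), "=d"(hi));   // front end rewrites to "{ax}"/"{dx}"
+//   float f;
+//   asm ("xorps %0, %0" : "=x"(f));       // handled by the 'x' case above
+//
+// The "{ax}" form is resolved by the generic TargetLowering lookup and then
+// resized above (e.g. "{ax}" with an i32 operand becomes EAX), while names
+// such as "{st}", "{flags}" or "{dirflag}" hit the special cases directly.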
+
+int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
+ unsigned AS) const {
+ // Scaling factors are not free at all.
+ // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
+ // will take 2 allocations in the out of order engine instead of 1
+ // for plain addressing mode, i.e. inst (reg1).
+ // E.g.,
+ // vaddps (%rsi,%rdx), %ymm0, %ymm1
+ // Requires two allocations (one for the load, one for the computation)
+ // whereas:
+ // vaddps (%rsi), %ymm0, %ymm1
+ // Requires just 1 allocation, i.e., freeing allocations for other operations
+ // and having less micro operations to execute.
+ //
+ // For some X86 architectures, this is even worse because for instance for
+ // stores, the complex addressing mode forces the instruction to use the
+ // "load" ports instead of the dedicated "store" port.
+ // E.g., on Haswell:
+ // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
+ // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
+ if (isLegalAddressingMode(DL, AM, Ty, AS))
+ // Scale represents reg2 * scale, thus account for 1
+ // as soon as we use a second register.
+ return AM.Scale != 0;
+ return -1;
+}
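+
+// Rough usage sketch, assuming the usual TargetLowering::AddrMode fields
+// (BaseGV / BaseOffs / HasBaseReg / Scale) and a TLI reference to this
+// lowering; illustrative only:
+//
+//   TargetLowering::AddrMode AM;
+//   AM.HasBaseReg = true;                                // inst (reg1)
+//   int C0 = TLI.getScalingFactorCost(DL, AM, Ty, AS);   // 0
+//   AM.Scale = 2;                                        // inst (reg1, reg2, 2)
+//   int C1 = TLI.getScalingFactorCost(DL, AM, Ty, AS);   // 1 if still legal
+//
+// Any addressing mode rejected by isLegalAddressingMode costs -1 here.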
+
+bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
+ // Integer division on x86 is expensive. However, when aggressively optimizing
+ // for code size, we prefer to use a div instruction, as it is usually smaller
+ // than the alternative sequence.
+ // The exception to this is vector division. Since x86 doesn't have vector
+ // integer division, leaving the division as-is is a loss even in terms of
+ // size, because it will have to be scalarized, while the alternative code
+ // sequence can be performed in vector form.
+ bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
+ return OptSize && !VT.isVector();
+}
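+
+// Illustrative IR sketch: with 'minsize', a scalar division by a constant is
+// kept as a single div instead of the usual multiply-by-magic expansion;
+// vector divisions are still expanded because x86 has no vector integer
+// division.
+//
+//   define i32 @quot(i32 %a) minsize {
+//     %q = udiv i32 %a, 10        ; stays a div under minsize
+//     ret i32 %q
+//   }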
+
+void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
+ if (!Subtarget.is64Bit())
+ return;
+
+ // Update IsSplitCSR in X86MachineFunctionInfo.
+ X86MachineFunctionInfo *AFI =
+ Entry->getParent()->getInfo<X86MachineFunctionInfo>();
+ AFI->setIsSplitCSR(true);
+}
+
+void X86TargetLowering::insertCopiesSplitCSR(
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
+ if (!IStart)
+ return;
+
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
+ MachineBasicBlock::iterator MBBI = Entry->begin();
+ for (const MCPhysReg *I = IStart; *I; ++I) {
+ const TargetRegisterClass *RC = nullptr;
+ if (X86::GR64RegClass.contains(*I))
+ RC = &X86::GR64RegClass;
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+
+ Register NewVR = MRI->createVirtualRegister(RC);
+ // Create copy from CSR to a virtual register.
+ // FIXME: this currently does not emit CFI pseudo-instructions, it works
+ // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
+ // nounwind. If we want to generalize this later, we may need to emit
+ // CFI pseudo-instructions.
+ assert(
+ Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
+ "Function should be nounwind in insertCopiesSplitCSR!");
+ Entry->addLiveIn(*I);
+ BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
+ .addReg(*I);
+
+ // Insert the copy-back instructions right before the terminator.
+ for (auto *Exit : Exits)
+ BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
+ TII->get(TargetOpcode::COPY), *I)
+ .addReg(NewVR);
+ }
+}
+
+bool X86TargetLowering::supportSwiftError() const {
+ return Subtarget.is64Bit();
+}
+
+/// Returns true if stack probing through a function call is requested.
+bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
+ return !getStackProbeSymbolName(MF).empty();
+}
+
+/// Returns true if stack probing through inline assembly is requested.
+bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
+
+ // No inline stack probe for Windows, they have their own mechanism.
+ if (Subtarget.isOSWindows() ||
+ MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
+ return false;
+
+ // If the function specifically requests inline stack probes, emit them.
+ if (MF.getFunction().hasFnAttribute("probe-stack"))
+ return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
+ "inline-asm";
+
+ return false;
+}
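+
+// Illustrative IR sketch: inline probing is requested per function via the
+// "probe-stack" string attribute.
+//
+//   define void @f() "probe-stack"="inline-asm" {
+//     ...
+//   }
+//
+// Any other "probe-stack" value names a probe function and is returned by
+// getStackProbeSymbolName below instead.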
+
+/// Returns the name of the symbol used to emit stack probes or the empty
+/// string if not applicable.
+StringRef
+X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
+  // Inline stack probes disable the stack probe call.
+ if (hasInlineStackProbe(MF))
+ return "";
+
+ // If the function specifically requests stack probes, emit them.
+ if (MF.getFunction().hasFnAttribute("probe-stack"))
+ return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
+
+ // Generally, if we aren't on Windows, the platform ABI does not include
+ // support for stack probes, so don't emit them.
+ if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
+ MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
+ return "";
+
+ // We need a stack probe to conform to the Windows ABI. Choose the right
+ // symbol.
+ if (Subtarget.is64Bit())
+ return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
+ return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
+}
+
+unsigned
+X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
+ // The default stack probe size is 4096 if the function has no stackprobesize
+ // attribute.
+ unsigned StackProbeSize = 4096;
+ const Function &Fn = MF.getFunction();
+ if (Fn.hasFnAttribute("stack-probe-size"))
+ Fn.getFnAttribute("stack-probe-size")
+ .getValueAsString()
+ .getAsInteger(0, StackProbeSize);
+ return StackProbeSize;
+}
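+
+// Illustrative IR sketch: the probe interval can be raised per function,
+// e.g.
+//
+//   define void @g() "stack-probe-size"="8192" { ... }
+//
+// 4096 remains the default when the attribute is absent.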
+
+Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
+ if (ML->isInnermost() &&
+ ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
+ return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
+ return TargetLowering::getPrefLoopAlignment();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h
new file mode 100644
index 000000000000..76c83b7df9eb
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h
@@ -0,0 +1,1713 @@
+//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
+#define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
+
+#include "llvm/CodeGen/TargetLowering.h"
+
+namespace llvm {
+ class X86Subtarget;
+ class X86TargetMachine;
+
+ namespace X86ISD {
+ // X86 Specific DAG Nodes
+ enum NodeType : unsigned {
+ // Start the numbering where the builtin ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ /// Bit scan forward.
+ BSF,
+ /// Bit scan reverse.
+ BSR,
+
+ /// X86 funnel/double shift i16 instructions. These correspond to
+    /// X86::SHLDW and X86::SHRDW instructions, which have different amount
+    /// modulo rules from generic funnel shifts.
+ /// NOTE: The operand order matches ISD::FSHL/FSHR not SHLD/SHRD.
+ FSHL,
+ FSHR,
+
+ /// Bitwise logical AND of floating point values. This corresponds
+ /// to X86::ANDPS or X86::ANDPD.
+ FAND,
+
+ /// Bitwise logical OR of floating point values. This corresponds
+ /// to X86::ORPS or X86::ORPD.
+ FOR,
+
+ /// Bitwise logical XOR of floating point values. This corresponds
+ /// to X86::XORPS or X86::XORPD.
+ FXOR,
+
+ /// Bitwise logical ANDNOT of floating point values. This
+ /// corresponds to X86::ANDNPS or X86::ANDNPD.
+ FANDN,
+
+ /// These operations represent an abstract X86 call
+ /// instruction, which includes a bunch of information. In particular the
+ /// operands of these node are:
+ ///
+ /// #0 - The incoming token chain
+ /// #1 - The callee
+ /// #2 - The number of arg bytes the caller pushes on the stack.
+ /// #3 - The number of arg bytes the callee pops off the stack.
+ /// #4 - The value to pass in AL/AX/EAX (optional)
+ /// #5 - The value to pass in DL/DX/EDX (optional)
+ ///
+ /// The result values of these nodes are:
+ ///
+ /// #0 - The outgoing token chain
+ /// #1 - The first register result value (optional)
+ /// #2 - The second register result value (optional)
+ ///
+ CALL,
+
+ /// Same as call except it adds the NoTrack prefix.
+ NT_CALL,
+
+ /// X86 compare and logical compare instructions.
+ CMP,
+ FCMP,
+ COMI,
+ UCOMI,
+
+ /// X86 bit-test instructions.
+ BT,
+
+ /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
+ /// operand, usually produced by a CMP instruction.
+ SETCC,
+
+ /// X86 Select
+ SELECTS,
+
+    // Same as SETCC except it's materialized with an sbb and the value is all
+    // ones or all zeros.
+ SETCC_CARRY, // R = carry_bit ? ~0 : 0
+
+ /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
+ /// Operands are two FP values to compare; result is a mask of
+ /// 0s or 1s. Generally DTRT for C/C++ with NaNs.
+ FSETCC,
+
+    /// X86 FP SETCC, similar to above, but with output as an i1 mask and
+    /// a version with SAE.
+ FSETCCM,
+ FSETCCM_SAE,
+
+ /// X86 conditional moves. Operand 0 and operand 1 are the two values
+ /// to select from. Operand 2 is the condition code, and operand 3 is the
+ /// flag operand produced by a CMP or TEST instruction.
+ CMOV,
+
+ /// X86 conditional branches. Operand 0 is the chain operand, operand 1
+ /// is the block to branch if condition is true, operand 2 is the
+ /// condition code, and operand 3 is the flag operand produced by a CMP
+ /// or TEST instruction.
+ BRCOND,
+
+ /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and
+ /// operand 1 is the target address.
+ NT_BRIND,
+
+ /// Return with a flag operand. Operand 0 is the chain operand, operand
+ /// 1 is the number of bytes of stack to pop.
+ RET_FLAG,
+
+ /// Return from interrupt. Operand 0 is the number of bytes to pop.
+ IRET,
+
+ /// Repeat fill, corresponds to X86::REP_STOSx.
+ REP_STOS,
+
+ /// Repeat move, corresponds to X86::REP_MOVSx.
+ REP_MOVS,
+
+ /// On Darwin, this node represents the result of the popl
+ /// at function entry, used for PIC code.
+ GlobalBaseReg,
+
+ /// A wrapper node for TargetConstantPool, TargetJumpTable,
+ /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
+ /// MCSymbol and TargetBlockAddress.
+ Wrapper,
+
+ /// Special wrapper used under X86-64 PIC mode for RIP
+ /// relative displacements.
+ WrapperRIP,
+
+ /// Copies a 64-bit value from an MMX vector to the low word
+ /// of an XMM vector, with the high word zero filled.
+ MOVQ2DQ,
+
+ /// Copies a 64-bit value from the low word of an XMM vector
+ /// to an MMX vector.
+ MOVDQ2Q,
+
+    /// Copies a 32-bit value from the low word of an MMX
+ /// vector to a GPR.
+ MMX_MOVD2W,
+
+    /// Copies a GPR into the low 32-bit word of an MMX vector
+    /// and zeroes out the high word.
+ MMX_MOVW2D,
+
+ /// Extract an 8-bit value from a vector and zero extend it to
+ /// i32, corresponds to X86::PEXTRB.
+ PEXTRB,
+
+ /// Extract a 16-bit value from a vector and zero extend it to
+ /// i32, corresponds to X86::PEXTRW.
+ PEXTRW,
+
+ /// Insert any element of a 4 x float vector into any element
+    /// of a destination 4 x float vector.
+ INSERTPS,
+
+ /// Insert the lower 8-bits of a 32-bit value to a vector,
+ /// corresponds to X86::PINSRB.
+ PINSRB,
+
+ /// Insert the lower 16-bits of a 32-bit value to a vector,
+ /// corresponds to X86::PINSRW.
+ PINSRW,
+
+ /// Shuffle 16 8-bit values within a vector.
+ PSHUFB,
+
+ /// Compute Sum of Absolute Differences.
+ PSADBW,
+ /// Compute Double Block Packed Sum-Absolute-Differences
+ DBPSADBW,
+
+ /// Bitwise Logical AND NOT of Packed FP values.
+ ANDNP,
+
+ /// Blend where the selector is an immediate.
+ BLENDI,
+
+ /// Dynamic (non-constant condition) vector blend where only the sign bits
+ /// of the condition elements are used. This is used to enforce that the
+ /// condition mask is not valid for generic VSELECT optimizations. This
+ /// is also used to implement the intrinsics.
+ /// Operands are in VSELECT order: MASK, TRUE, FALSE
+ BLENDV,
+
+ /// Combined add and sub on an FP vector.
+ ADDSUB,
+
+ // FP vector ops with rounding mode.
+ FADD_RND,
+ FADDS,
+ FADDS_RND,
+ FSUB_RND,
+ FSUBS,
+ FSUBS_RND,
+ FMUL_RND,
+ FMULS,
+ FMULS_RND,
+ FDIV_RND,
+ FDIVS,
+ FDIVS_RND,
+ FMAX_SAE,
+ FMAXS_SAE,
+ FMIN_SAE,
+ FMINS_SAE,
+ FSQRT_RND,
+ FSQRTS,
+ FSQRTS_RND,
+
+ // FP vector get exponent.
+ FGETEXP,
+ FGETEXP_SAE,
+ FGETEXPS,
+ FGETEXPS_SAE,
+ // Extract Normalized Mantissas.
+ VGETMANT,
+ VGETMANT_SAE,
+ VGETMANTS,
+ VGETMANTS_SAE,
+ // FP Scale.
+ SCALEF,
+ SCALEF_RND,
+ SCALEFS,
+ SCALEFS_RND,
+
+ // Unsigned Integer average.
+ AVG,
+
+ /// Integer horizontal add/sub.
+ HADD,
+ HSUB,
+
+ /// Floating point horizontal add/sub.
+ FHADD,
+ FHSUB,
+
+ // Detect Conflicts Within a Vector
+ CONFLICT,
+
+ /// Floating point max and min.
+ FMAX,
+ FMIN,
+
+ /// Commutative FMIN and FMAX.
+ FMAXC,
+ FMINC,
+
+ /// Scalar intrinsic floating point max and min.
+ FMAXS,
+ FMINS,
+
+ /// Floating point reciprocal-sqrt and reciprocal approximation.
+ /// Note that these typically require refinement
+ /// in order to obtain suitable precision.
+ FRSQRT,
+ FRCP,
+
+ // AVX-512 reciprocal approximations with a little more precision.
+ RSQRT14,
+ RSQRT14S,
+ RCP14,
+ RCP14S,
+
+ // Thread Local Storage.
+ TLSADDR,
+
+ // Thread Local Storage. A call to get the start address
+ // of the TLS block for the current module.
+ TLSBASEADDR,
+
+    // Thread Local Storage. A call to an OS-provided thunk at the
+    // address from an earlier relocation.
+ TLSCALL,
+
+ // Exception Handling helpers.
+ EH_RETURN,
+
+ // SjLj exception handling setjmp.
+ EH_SJLJ_SETJMP,
+
+ // SjLj exception handling longjmp.
+ EH_SJLJ_LONGJMP,
+
+ // SjLj exception handling dispatch.
+ EH_SJLJ_SETUP_DISPATCH,
+
+ /// Tail call return. See X86TargetLowering::LowerCall for
+ /// the list of operands.
+ TC_RETURN,
+
+ // Vector move to low scalar and zero higher vector elements.
+ VZEXT_MOVL,
+
+ // Vector integer truncate.
+ VTRUNC,
+ // Vector integer truncate with unsigned/signed saturation.
+ VTRUNCUS,
+ VTRUNCS,
+
+ // Masked version of the above. Used when less than a 128-bit result is
+ // produced since the mask only applies to the lower elements and can't
+ // be represented by a select.
+ // SRC, PASSTHRU, MASK
+ VMTRUNC,
+ VMTRUNCUS,
+ VMTRUNCS,
+
+ // Vector FP extend.
+ VFPEXT,
+ VFPEXT_SAE,
+ VFPEXTS,
+ VFPEXTS_SAE,
+
+ // Vector FP round.
+ VFPROUND,
+ VFPROUND_RND,
+ VFPROUNDS,
+ VFPROUNDS_RND,
+
+ // Masked version of above. Used for v2f64->v4f32.
+ // SRC, PASSTHRU, MASK
+ VMFPROUND,
+
+ // 128-bit vector logical left / right shift
+ VSHLDQ,
+ VSRLDQ,
+
+ // Vector shift elements
+ VSHL,
+ VSRL,
+ VSRA,
+
+ // Vector variable shift
+ VSHLV,
+ VSRLV,
+ VSRAV,
+
+ // Vector shift elements by immediate
+ VSHLI,
+ VSRLI,
+ VSRAI,
+
+ // Shifts of mask registers.
+ KSHIFTL,
+ KSHIFTR,
+
+ // Bit rotate by immediate
+ VROTLI,
+ VROTRI,
+
+ // Vector packed double/float comparison.
+ CMPP,
+
+ // Vector integer comparisons.
+ PCMPEQ,
+ PCMPGT,
+
+ // v8i16 Horizontal minimum and position.
+ PHMINPOS,
+
+ MULTISHIFT,
+
+ /// Vector comparison generating mask bits for fp and
+ /// integer signed and unsigned data types.
+ CMPM,
+ // Vector mask comparison generating mask bits for FP values.
+ CMPMM,
+ // Vector mask comparison with SAE for FP values.
+ CMPMM_SAE,
+
+ // Arithmetic operations with FLAGS results.
+ ADD,
+ SUB,
+ ADC,
+ SBB,
+ SMUL,
+ UMUL,
+ OR,
+ XOR,
+ AND,
+
+ // Bit field extract.
+ BEXTR,
+ BEXTRI,
+
+ // Zero High Bits Starting with Specified Bit Position.
+ BZHI,
+
+ // Parallel extract and deposit.
+ PDEP,
+ PEXT,
+
+ // X86-specific multiply by immediate.
+ MUL_IMM,
+
+ // Vector sign bit extraction.
+ MOVMSK,
+
+ // Vector bitwise comparisons.
+ PTEST,
+
+ // Vector packed fp sign bitwise comparisons.
+ TESTP,
+
+ // OR/AND test for masks.
+ KORTEST,
+ KTEST,
+
+ // ADD for masks.
+ KADD,
+
+ // Several flavors of instructions with vector shuffle behaviors.
+    // Saturated signed/unsigned packing.
+ PACKSS,
+ PACKUS,
+ // Intra-lane alignr.
+ PALIGNR,
+ // AVX512 inter-lane alignr.
+ VALIGN,
+ PSHUFD,
+ PSHUFHW,
+ PSHUFLW,
+ SHUFP,
+ // VBMI2 Concat & Shift.
+ VSHLD,
+ VSHRD,
+ VSHLDV,
+ VSHRDV,
+ // Shuffle Packed Values at 128-bit granularity.
+ SHUF128,
+ MOVDDUP,
+ MOVSHDUP,
+ MOVSLDUP,
+ MOVLHPS,
+ MOVHLPS,
+ MOVSD,
+ MOVSS,
+ UNPCKL,
+ UNPCKH,
+ VPERMILPV,
+ VPERMILPI,
+ VPERMI,
+ VPERM2X128,
+
+ // Variable Permute (VPERM).
+ // Res = VPERMV MaskV, V0
+ VPERMV,
+
+ // 3-op Variable Permute (VPERMT2).
+ // Res = VPERMV3 V0, MaskV, V1
+ VPERMV3,
+
+ // Bitwise ternary logic.
+ VPTERNLOG,
+ // Fix Up Special Packed Float32/64 values.
+ VFIXUPIMM,
+ VFIXUPIMM_SAE,
+ VFIXUPIMMS,
+ VFIXUPIMMS_SAE,
+ // Range Restriction Calculation For Packed Pairs of Float32/64 values.
+ VRANGE,
+ VRANGE_SAE,
+ VRANGES,
+ VRANGES_SAE,
+    // Reduce - Perform Reduction Transformation on scalar/packed FP.
+ VREDUCE,
+ VREDUCE_SAE,
+ VREDUCES,
+ VREDUCES_SAE,
+ // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
+ // Also used by the legacy (V)ROUND intrinsics where we mask out the
+ // scaling part of the immediate.
+ VRNDSCALE,
+ VRNDSCALE_SAE,
+ VRNDSCALES,
+ VRNDSCALES_SAE,
+ // Tests Types Of a FP Values for packed types.
+ VFPCLASS,
+ // Tests Types Of a FP Values for scalar types.
+ VFPCLASSS,
+
+ // Broadcast (splat) scalar or element 0 of a vector. If the operand is
+ // a vector, this node may change the vector length as part of the splat.
+ VBROADCAST,
+ // Broadcast mask to vector.
+ VBROADCASTM,
+
+ /// SSE4A Extraction and Insertion.
+ EXTRQI,
+ INSERTQI,
+
+ // XOP arithmetic/logical shifts.
+ VPSHA,
+ VPSHL,
+ // XOP signed/unsigned integer comparisons.
+ VPCOM,
+ VPCOMU,
+ // XOP packed permute bytes.
+ VPPERM,
+ // XOP two source permutation.
+ VPERMIL2,
+
+ // Vector multiply packed unsigned doubleword integers.
+ PMULUDQ,
+ // Vector multiply packed signed doubleword integers.
+ PMULDQ,
+    // Vector Multiply Packed Unsigned Integers with Round and Scale.
+ MULHRS,
+
+ // Multiply and Add Packed Integers.
+ VPMADDUBSW,
+ VPMADDWD,
+
+ // AVX512IFMA multiply and add.
+ // NOTE: These are different than the instruction and perform
+ // op0 x op1 + op2.
+ VPMADD52L,
+ VPMADD52H,
+
+ // VNNI
+ VPDPBUSD,
+ VPDPBUSDS,
+ VPDPWSSD,
+ VPDPWSSDS,
+
+ // FMA nodes.
+ // We use the target independent ISD::FMA for the non-inverted case.
+ FNMADD,
+ FMSUB,
+ FNMSUB,
+ FMADDSUB,
+ FMSUBADD,
+
+ // FMA with rounding mode.
+ FMADD_RND,
+ FNMADD_RND,
+ FMSUB_RND,
+ FNMSUB_RND,
+ FMADDSUB_RND,
+ FMSUBADD_RND,
+
+ // Compress and expand.
+ COMPRESS,
+ EXPAND,
+
+ // Bits shuffle
+ VPSHUFBITQMB,
+
+ // Convert Unsigned/Integer to Floating-Point Value with rounding mode.
+ SINT_TO_FP_RND,
+ UINT_TO_FP_RND,
+ SCALAR_SINT_TO_FP,
+ SCALAR_UINT_TO_FP,
+ SCALAR_SINT_TO_FP_RND,
+ SCALAR_UINT_TO_FP_RND,
+
+ // Vector float/double to signed/unsigned integer.
+ CVTP2SI,
+ CVTP2UI,
+ CVTP2SI_RND,
+ CVTP2UI_RND,
+ // Scalar float/double to signed/unsigned integer.
+ CVTS2SI,
+ CVTS2UI,
+ CVTS2SI_RND,
+ CVTS2UI_RND,
+
+ // Vector float/double to signed/unsigned integer with truncation.
+ CVTTP2SI,
+ CVTTP2UI,
+ CVTTP2SI_SAE,
+ CVTTP2UI_SAE,
+ // Scalar float/double to signed/unsigned integer with truncation.
+ CVTTS2SI,
+ CVTTS2UI,
+ CVTTS2SI_SAE,
+ CVTTS2UI_SAE,
+
+ // Vector signed/unsigned integer to float/double.
+ CVTSI2P,
+ CVTUI2P,
+
+ // Masked versions of above. Used for v2f64->v4f32.
+ // SRC, PASSTHRU, MASK
+ MCVTP2SI,
+ MCVTP2UI,
+ MCVTTP2SI,
+ MCVTTP2UI,
+ MCVTSI2P,
+ MCVTUI2P,
+
+ // Vector float to bfloat16.
+ // Convert TWO packed single data to one packed BF16 data
+ CVTNE2PS2BF16,
+ // Convert packed single data to packed BF16 data
+ CVTNEPS2BF16,
+ // Masked version of above.
+ // SRC, PASSTHRU, MASK
+ MCVTNEPS2BF16,
+
+    // Dot product of BF16 pairs accumulated into
+ // packed single precision.
+ DPBF16PS,
+
+ // Save xmm argument registers to the stack, according to %al. An operator
+ // is needed so that this can be expanded with control flow.
+ VASTART_SAVE_XMM_REGS,
+
+ // Windows's _chkstk call to do stack probing.
+ WIN_ALLOCA,
+
+ // For allocating variable amounts of stack space when using
+    // segmented stacks. Checks if the current stacklet has enough space, and
+ // falls back to heap allocation if not.
+ SEG_ALLOCA,
+
+ // For allocating stack space when using stack clash protector.
+ // Allocation is performed by block, and each block is probed.
+ PROBED_ALLOCA,
+
+ // Memory barriers.
+ MEMBARRIER,
+ MFENCE,
+
+ // Get a random integer and indicate whether it is valid in CF.
+ RDRAND,
+
+ // Get a NIST SP800-90B & C compliant random integer and
+ // indicate whether it is valid in CF.
+ RDSEED,
+
+ // Protection keys
+ // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX.
+ // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is
+ // value for ECX.
+ RDPKRU,
+ WRPKRU,
+
+ // SSE42 string comparisons.
+    // These nodes produce 3 results: index, mask, and flags. X86ISelDAGToDAG
+    // will emit one or two instructions based on which results are used. If
+    // both flags and index/mask are used, this lets us emit a single
+    // instruction since we won't have to pick an opcode for flags. Instead we
+    // can rely on the DAG to CSE everything and decide at isel.
+ PCMPISTR,
+ PCMPESTR,
+
+ // Test if in transactional execution.
+ XTEST,
+
+ // ERI instructions.
+ RSQRT28,
+ RSQRT28_SAE,
+ RSQRT28S,
+ RSQRT28S_SAE,
+ RCP28,
+ RCP28_SAE,
+ RCP28S,
+ RCP28S_SAE,
+ EXP2,
+ EXP2_SAE,
+
+ // Conversions between float and half-float.
+ CVTPS2PH,
+ CVTPH2PS,
+ CVTPH2PS_SAE,
+
+ // Masked version of above.
+ // SRC, RND, PASSTHRU, MASK
+ MCVTPS2PH,
+
+ // Galois Field Arithmetic Instructions
+ GF2P8AFFINEINVQB,
+ GF2P8AFFINEQB,
+ GF2P8MULB,
+
+ // LWP insert record.
+ LWPINS,
+
+ // User level wait
+ UMWAIT,
+ TPAUSE,
+
+ // Enqueue Stores Instructions
+ ENQCMD,
+ ENQCMDS,
+
+ // For avx512-vp2intersect
+ VP2INTERSECT,
+
+ // User level interrupts - testui
+ TESTUI,
+
+ /// X86 strict FP compare instructions.
+ STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
+ STRICT_FCMPS,
+
+ // Vector packed double/float comparison.
+ STRICT_CMPP,
+
+ /// Vector comparison generating mask bits for fp and
+ /// integer signed and unsigned data types.
+ STRICT_CMPM,
+
+ // Vector float/double to signed/unsigned integer with truncation.
+ STRICT_CVTTP2SI,
+ STRICT_CVTTP2UI,
+
+ // Vector FP extend.
+ STRICT_VFPEXT,
+
+ // Vector FP round.
+ STRICT_VFPROUND,
+
+ // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
+ // Also used by the legacy (V)ROUND intrinsics where we mask out the
+ // scaling part of the immediate.
+ STRICT_VRNDSCALE,
+
+ // Vector signed/unsigned integer to float/double.
+ STRICT_CVTSI2P,
+ STRICT_CVTUI2P,
+
+ // Strict FMA nodes.
+ STRICT_FNMADD,
+ STRICT_FMSUB,
+ STRICT_FNMSUB,
+
+ // Conversions between float and half-float.
+ STRICT_CVTPS2PH,
+ STRICT_CVTPH2PS,
+
+    // WARNING: Only add nodes here if they are strict FP nodes. Non-memory and
+ // non-strict FP nodes should be above FIRST_TARGET_STRICTFP_OPCODE.
+
+ // Compare and swap.
+ LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ LCMPXCHG8_DAG,
+ LCMPXCHG16_DAG,
+ LCMPXCHG16_SAVE_RBX_DAG,
+
+ /// LOCK-prefixed arithmetic read-modify-write instructions.
+ /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
+ LADD,
+ LSUB,
+ LOR,
+ LXOR,
+ LAND,
+
+ // Load, scalar_to_vector, and zero extend.
+ VZEXT_LOAD,
+
+ // extract_vector_elt, store.
+ VEXTRACT_STORE,
+
+ // scalar broadcast from memory.
+ VBROADCAST_LOAD,
+
+ // subvector broadcast from memory.
+ SUBV_BROADCAST_LOAD,
+
+    // Store FP control word into i16 memory.
+ FNSTCW16m,
+
+ /// This instruction implements FP_TO_SINT with the
+ /// integer destination in memory and a FP reg source. This corresponds
+ /// to the X86::FIST*m instructions and the rounding mode change stuff. It
+ /// has two inputs (token chain and address) and two outputs (int value
+ /// and token chain). Memory VT specifies the type to store to.
+ FP_TO_INT_IN_MEM,
+
+ /// This instruction implements SINT_TO_FP with the
+ /// integer source in memory and FP reg result. This corresponds to the
+ /// X86::FILD*m instructions. It has two inputs (token chain and address)
+ /// and two outputs (FP value and token chain). The integer source type is
+ /// specified by the memory VT.
+ FILD,
+
+ /// This instruction implements a fp->int store from FP stack
+ /// slots. This corresponds to the fist instruction. It takes a
+ /// chain operand, value to store, address, and glue. The memory VT
+ /// specifies the type to store as.
+ FIST,
+
+ /// This instruction implements an extending load to FP stack slots.
+ /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
+ /// operand, and ptr to load from. The memory VT specifies the type to
+ /// load from.
+ FLD,
+
+ /// This instruction implements a truncating store from FP stack
+ /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
+ /// chain operand, value to store, address, and glue. The memory VT
+ /// specifies the type to store as.
+ FST,
+
+ /// These instructions grab the address of the next argument
+ /// from a va_list. (reads and modifies the va_list in memory)
+ VAARG_64,
+ VAARG_X32,
+
+ // Vector truncating store with unsigned/signed saturation
+ VTRUNCSTOREUS,
+ VTRUNCSTORES,
+ // Vector truncating masked store with unsigned/signed saturation
+ VMTRUNCSTOREUS,
+ VMTRUNCSTORES,
+
+ // X86 specific gather and scatter
+ MGATHER,
+ MSCATTER,
+
+ // Key locker nodes that produce flags.
+ AESENC128KL,
+ AESDEC128KL,
+ AESENC256KL,
+ AESDEC256KL,
+ AESENCWIDE128KL,
+ AESDECWIDE128KL,
+ AESENCWIDE256KL,
+ AESDECWIDE256KL,
+
+    // WARNING: Do not add anything at the end unless you want the node to
+    // have a memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
+    // opcodes will be treated as target memory ops!
+ };
+ } // end namespace X86ISD
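+
+ // Illustrative sketch: these opcodes are matched in DAG combines and isel,
+ // e.g. reading the condition code documented for SETCC above:
+ //
+ //   if (Op.getOpcode() == X86ISD::SETCC) {
+ //     unsigned CC = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ //     ...
+ //   }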
+
+ /// Define some predicates that are used for node matching.
+ namespace X86 {
+ /// Returns true if Elt is a constant zero or floating point constant +0.0.
+ bool isZeroNode(SDValue Elt);
+
+ /// Returns true of the given offset can be
+ /// fit into displacement field of the instruction.
+ bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
+ bool hasSymbolicDisplacement);
+
+ /// Determines whether the callee is required to pop its
+ /// own arguments. Callee pop is necessary to support tail calls.
+ bool isCalleePop(CallingConv::ID CallingConv,
+ bool is64Bit, bool IsVarArg, bool GuaranteeTCO);
+
+ /// If Op is a constant whose elements are all the same constant or
+ /// undefined, return true and return the constant value in \p SplatVal.
+ /// If we have undef bits that don't cover an entire element, we treat these
+ /// as zero if AllowPartialUndefs is set, else we fail and return false.
+ bool isConstantSplat(SDValue Op, APInt &SplatVal,
+ bool AllowPartialUndefs = true);
+ } // end namespace X86
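+
+ // Illustrative sketch of the splat predicate above; the surrounding names
+ // are hypothetical:
+ //
+ //   APInt SplatVal;
+ //   if (X86::isConstantSplat(N, SplatVal) && SplatVal.isSignedIntN(8)) {
+ //     // N acts as a uniform 8-bit immediate across all defined elements.
+ //   }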
+
+ //===--------------------------------------------------------------------===//
+ // X86 Implementation of the TargetLowering interface
+ class X86TargetLowering final : public TargetLowering {
+ public:
+ explicit X86TargetLowering(const X86TargetMachine &TM,
+ const X86Subtarget &STI);
+
+ unsigned getJumpTableEncoding() const override;
+ bool useSoftFloat() const override;
+
+ void markLibCallAttributes(MachineFunction *MF, unsigned CC,
+ ArgListTy &Args) const override;
+
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT VT) const override {
+ return MVT::i8;
+ }
+
+ const MCExpr *
+ LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+ const MachineBasicBlock *MBB, unsigned uid,
+ MCContext &Ctx) const override;
+
+ /// Returns relocation base for the given PIC jumptable.
+ SDValue getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const override;
+ const MCExpr *
+ getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+ unsigned JTI, MCContext &Ctx) const override;
+
+ /// Return the desired alignment for ByVal aggregate
+    /// function arguments in the caller parameter area. For X86, aggregates
+    /// that contain SSE vectors are placed at 16-byte boundaries while the
+    /// rest are at 4-byte boundaries.
+ unsigned getByValTypeAlignment(Type *Ty,
+ const DataLayout &DL) const override;
+
+ EVT getOptimalMemOpType(const MemOp &Op,
+ const AttributeList &FuncAttributes) const override;
+
+ /// Returns true if it's safe to use load / store of the
+ /// specified type to expand memcpy / memset inline. This is mostly true
+ /// for all types except for some special cases. For example, on X86
+ /// targets without SSE2 f64 load / store are done with fldl / fstpl which
+ /// also does type conversion. Note the specified type doesn't have to be
+ /// legal as the hook is used before type legalization.
+ bool isSafeMemOpType(MVT VT) const override;
+
+ /// Returns true if the target allows unaligned memory accesses of the
+ /// specified type. Returns whether it is "fast" in the last argument.
+ bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align,
+ MachineMemOperand::Flags Flags,
+ bool *Fast) const override;
+
+ /// Provide custom lowering hooks for some operations.
+ ///
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ /// Replace the results of node with an illegal result
+ /// type with new values built out of custom code.
+ ///
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) const override;
+
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+ /// Return true if the target has native support for
+ /// the specified value type and it is 'desirable' to use the type for the
+ /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
+ /// instruction encodings are longer and some i16 instructions are slow.
+ bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override;
+
+ /// Return true if the target has native support for the
+ /// specified value type and it is 'desirable' to use the type. e.g. On x86
+ /// i16 is legal, but undesirable since i16 instruction encodings are longer
+ /// and some i16 instructions are slow.
+ bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
+
+ /// Return the newly negated expression if the cost is not expensive and
+ /// set the cost in \p Cost to indicate that if it is cheaper or neutral to
+ /// do the negation.
+ SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG,
+ bool LegalOperations, bool ForCodeSize,
+ NegatibleCost &Cost,
+ unsigned Depth) const override;
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const override;
+
+ /// This method returns the name of a target specific DAG node.
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ /// Do not merge vector stores after legalization because that may conflict
+ /// with x86-specific store splitting optimizations.
+ bool mergeStoresAfterLegalization(EVT MemVT) const override {
+ return !MemVT.isVector();
+ }
+
+ bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
+ const SelectionDAG &DAG) const override;
+
+ bool isCheapToSpeculateCttz() const override;
+
+ bool isCheapToSpeculateCtlz() const override;
+
+ bool isCtlzFast() const override;
+
+ bool hasBitPreservingFPLogic(EVT VT) const override {
+ return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
+ }
+
+ bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override {
+ // If the pair to store is a mixture of float and int values, we will
+ // save two bitwise instructions and one float-to-int instruction and
+ // increase one store instruction. There is potentially a more
+ // significant benefit because it avoids the float->int domain switch
+      // for the input value. So it is more likely a win.
+ if ((LTy.isFloatingPoint() && HTy.isInteger()) ||
+ (LTy.isInteger() && HTy.isFloatingPoint()))
+ return true;
+ // If the pair only contains int values, we will save two bitwise
+ // instructions and increase one store instruction (costing one more
+      // store buffer). Since the benefit is less clear-cut, we leave such
+      // pairs out until we have a testcase proving this is a win.
+ return false;
+ }
+
+ bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
+
+ bool hasAndNotCompare(SDValue Y) const override;
+
+ bool hasAndNot(SDValue Y) const override;
+
+ bool hasBitTest(SDValue X, SDValue Y) const override;
+
+ bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
+ SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
+ unsigned OldShiftOpcode, unsigned NewShiftOpcode,
+ SelectionDAG &DAG) const override;
+
+ bool shouldFoldConstantShiftPairToMask(const SDNode *N,
+ CombineLevel Level) const override;
+
+ bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override;
+
+ bool
+ shouldTransformSignedTruncationCheck(EVT XVT,
+ unsigned KeptBits) const override {
+      // For vectors, we don't have a preference.
+ if (XVT.isVector())
+ return false;
+
+ auto VTIsOk = [](EVT VT) -> bool {
+ return VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 ||
+ VT == MVT::i64;
+ };
+
+ // We are ok with KeptBitsVT being byte/word/dword, what MOVS supports.
+ // XVT will be larger than KeptBitsVT.
+ MVT KeptBitsVT = MVT::getIntegerVT(KeptBits);
+ return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
+ }
+
+ bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override;
+
+ bool shouldSplatInsEltVarIndex(EVT VT) const override;
+
+ bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
+ return VT.isScalarInteger();
+ }
+
+ /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
+ MVT hasFastEqualityCompare(unsigned NumBits) const override;
+
+ /// Return the value type to use for ISD::SETCC.
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
+
+ bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits,
+ const APInt &DemandedElts,
+ TargetLoweringOpt &TLO) const override;
+
+ /// Determine which of the bits specified in Mask are known to be either
+ /// zero or one and return them in the KnownZero/KnownOne bitsets.
+ void computeKnownBitsForTargetNode(const SDValue Op,
+ KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
+
+ /// Determine the number of bits in the operation that are sign bits.
+ unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth) const override;
+
+ bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op,
+ const APInt &DemandedElts,
+ APInt &KnownUndef,
+ APInt &KnownZero,
+ TargetLoweringOpt &TLO,
+ unsigned Depth) const override;
+
+ bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op,
+ const APInt &DemandedElts,
+ unsigned MaskIndex,
+ TargetLoweringOpt &TLO,
+ unsigned Depth) const;
+
+ bool SimplifyDemandedBitsForTargetNode(SDValue Op,
+ const APInt &DemandedBits,
+ const APInt &DemandedElts,
+ KnownBits &Known,
+ TargetLoweringOpt &TLO,
+ unsigned Depth) const override;
+
+ SDValue SimplifyMultipleUseDemandedBitsForTargetNode(
+ SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ SelectionDAG &DAG, unsigned Depth) const override;
+
+ const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override;
+
+ SDValue unwrapAddress(SDValue N) const override;
+
+ SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
+
+ bool ExpandInlineAsm(CallInst *CI) const override;
+
+ ConstraintType getConstraintType(StringRef Constraint) const override;
+
+ /// Examine constraint string and operand type and determine a weight value.
+ /// The operand object must already have been set up with the operand type.
+ ConstraintWeight
+ getSingleConstraintMatchWeight(AsmOperandInfo &info,
+ const char *constraint) const override;
+
+ const char *LowerXConstraint(EVT ConstraintVT) const override;
+
+ /// Lower the specified operand into the Ops vector. If it is invalid, don't
+ /// add anything to Ops. If hasMemory is true it means one of the asm
+ /// constraint of the inline asm instruction being processed is 'm'.
+ void LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
+
+ unsigned
+ getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
+ if (ConstraintCode == "o")
+ return InlineAsm::Constraint_o;
+ else if (ConstraintCode == "v")
+ return InlineAsm::Constraint_v;
+ else if (ConstraintCode == "X")
+ return InlineAsm::Constraint_X;
+ return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+ }
+
+ /// Handle Lowering flag assembly outputs.
+ SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag,
+ const SDLoc &DL,
+ const AsmOperandInfo &Constraint,
+ SelectionDAG &DAG) const override;
+
+ /// Given a physical register constraint
+ /// (e.g. {edx}), return the register number and the register class for the
+ /// register. This should only be used for C_Register constraints. On
+ /// error, this returns a register number of 0.
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+
+ /// Return true if the addressing mode represented
+ /// by AM is legal for this target, for a load/store of the specified type.
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+ Type *Ty, unsigned AS,
+ Instruction *I = nullptr) const override;
+
+ /// Return true if the specified immediate is legal
+ /// icmp immediate, that is the target has icmp instructions which can
+ /// compare a register against the immediate without having to materialize
+ /// the immediate into a register.
+ bool isLegalICmpImmediate(int64_t Imm) const override;
+
+ /// Return true if the specified immediate is legal
+ /// add immediate, that is the target has add instructions which can
+ /// add a register and the immediate without having to materialize
+ /// the immediate into a register.
+ bool isLegalAddImmediate(int64_t Imm) const override;
+
+ bool isLegalStoreImmediate(int64_t Imm) const override;
+
+ /// Return the cost of the scaling factor used in the addressing
+ /// mode represented by AM for this target, for a load/store
+ /// of the specified type.
+ /// If the AM is supported, the return value must be >= 0.
+ /// If the AM is not supported, it returns a negative value.
+ int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+ unsigned AS) const override;
+
+ /// This is used to enable splatted operand transforms for vector shifts
+ /// and vector funnel shifts.
+ bool isVectorShiftByScalarCheap(Type *Ty) const override;
+
+ /// Add x86-specific opcodes to the default list.
+ bool isBinOp(unsigned Opcode) const override;
+
+ /// Returns true if the opcode is a commutative binary operation.
+ bool isCommutativeBinOp(unsigned Opcode) const override;
+
+ /// Return true if it's free to truncate a value of
+ /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
+ /// register EAX to i16 by referencing its sub-register AX.
+ bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+ bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+ bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
+
+ /// Return true if any actual instruction that defines a
+ /// value of type Ty1 implicit zero-extends the value to Ty2 in the result
+ /// register. This does not necessarily include registers defined in
+ /// unknown ways, such as incoming arguments, or copies from unknown
+ /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
+ /// does not necessarily apply to truncate instructions. e.g. on x86-64,
+ /// all instructions that define 32-bit values implicit zero-extend the
+ /// result out to 64 bits.
+ bool isZExtFree(Type *Ty1, Type *Ty2) const override;
+ bool isZExtFree(EVT VT1, EVT VT2) const override;
+ bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+ bool shouldSinkOperands(Instruction *I,
+ SmallVectorImpl<Use *> &Ops) const override;
+ bool shouldConvertPhiType(Type *From, Type *To) const override;
+
+ /// Return true if folding a vector load into ExtVal (a sign, zero, or any
+ /// extend node) is profitable.
+ bool isVectorLoadExtDesirable(SDValue) const override;
+
+ /// Return true if an FMA operation is faster than a pair of fmul and fadd
+ /// instructions. fmuladd intrinsics will be expanded to FMAs when this
+ /// method returns true, otherwise fmuladd is expanded to fmul + fadd.
+ bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+ EVT VT) const override;
+
+ /// Return true if it's profitable to narrow
+ /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
+ /// from i32 to i8 but not from i32 to i16.
+ bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
+
+ /// Given an intrinsic, checks if on the target the intrinsic will need to map
+ /// to a MemIntrinsicNode (touches memory). If this is the case, it returns
+ /// true and stores the intrinsic information into the IntrinsicInfo that was
+ /// passed to the function.
+ bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+ MachineFunction &MF,
+ unsigned Intrinsic) const override;
+
+ /// Returns true if the target can instruction select the
+ /// specified FP immediate natively. If false, the legalizer will
+ /// materialize the FP immediate as a load from a constant pool.
+ bool isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const override;
+
+ /// Targets can use this to indicate that they only support *some*
+ /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
+ /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to
+ /// be legal.
+ bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
+
+ /// Similar to isShuffleMaskLegal. Targets can use this to indicate if there
+ /// is a suitable VECTOR_SHUFFLE that can be used to replace a VAND with a
+ /// constant pool entry.
+ bool isVectorClearMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
+
+ /// Returns true if lowering to a jump table is allowed.
+ bool areJTsAllowed(const Function *Fn) const override;
+
+ /// If true, then instruction selection should
+ /// seek to shrink the FP constant of the specified type to a smaller type
+ /// in order to save space and / or reduce runtime.
+ bool ShouldShrinkFPConstant(EVT VT) const override {
+ // Don't shrink the FP constant pool if SSE2 is available since cvtss2sd is
+ // more expensive than a straight movsd. On the other hand, it's important
+ // to shrink long double FP constants since fldt is very slow.
+ return !X86ScalarSSEf64 || VT == MVT::f80;
+ }
+
+ /// Return true if we believe it is correct and profitable to reduce the
+ /// load node to a smaller type.
+ bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
+ EVT NewVT) const override;
+
+ /// Return true if the specified scalar FP type is computed in an SSE
+ /// register, not on the X87 floating point stack.
+ bool isScalarFPTypeInSSEReg(EVT VT) const {
+ return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 when SSE2 is available
+ (VT == MVT::f32 && X86ScalarSSEf32); // f32 when SSE1 is available
+ }
+
+ /// Returns true if it is beneficial to convert a load of a constant
+ /// to just the constant itself.
+ bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const override;
+
+ bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override;
+
+ bool convertSelectOfConstantsToMath(EVT VT) const override;
+
+ bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
+ SDValue C) const override;
+
+ /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
+ /// with this index.
+ bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
+ unsigned Index) const override;
+
+ /// Scalar ops always have equal or better analysis/performance/power than
+ /// the vector equivalent, so this always makes sense if the scalar op is
+ /// supported.
+ bool shouldScalarizeBinop(SDValue) const override;
+
+ /// Extract of a scalar FP value from index 0 of a vector is free.
+ bool isExtractVecEltCheap(EVT VT, unsigned Index) const override {
+ EVT EltVT = VT.getScalarType();
+ return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0;
+ }
+
+ /// Overflow nodes should get combined/lowered to optimal instructions
+ /// (they should allow eliminating explicit compares by getting flags from
+ /// math ops).
+ bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
+ bool MathUsed) const override;
+
+ bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem,
+ unsigned AddrSpace) const override {
+ // If we can replace more than 2 scalar stores, there will be a reduction
+ // in instructions even after we add a vector constant load.
+ return NumElem > 2;
+ }
+
+ bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+ const SelectionDAG &DAG,
+ const MachineMemOperand &MMO) const override;
+
+ /// Intel processors have a unified instruction and data cache
+ const char * getClearCacheBuiltinName() const override {
+ return nullptr; // nothing to do, move along.
+ }
+
+ Register getRegisterByName(const char* RegName, LLT VT,
+ const MachineFunction &MF) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ Register
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ Register
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
+
+ bool needsFixedCatchObjects() const override;
+
+ /// This method returns a target specific FastISel object,
+ /// or null if the target does not support "fast" ISel.
+ FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const override;
+
+ /// If the target has a standard location for the stack protector cookie,
+ /// returns the address of that location. Otherwise, returns nullptr.
+ Value *getIRStackGuard(IRBuilder<> &IRB) const override;
+
+ bool useLoadStackGuardNode() const override;
+ bool useStackGuardXorFP() const override;
+ void insertSSPDeclarations(Module &M) const override;
+ Value *getSDagStackGuard(const Module &M) const override;
+ Function *getSSPStackGuardCheck(const Module &M) const override;
+ SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
+ const SDLoc &DL) const override;
+
+ /// Return the address of the location where the SafeStack (unsafe stack)
+ /// pointer is stored. On x86 this may be at a fixed offset in a
+ /// non-standard address space.
+ Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
+
+ std::pair<SDValue, SDValue> BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL,
+ SDValue Chain, SDValue Pointer,
+ MachinePointerInfo PtrInfo,
+ Align Alignment,
+ SelectionDAG &DAG) const;
+
+ /// Customize the preferred legalization strategy for certain types.
+ LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
+
+ bool softPromoteHalfType() const override { return true; }
+
+ MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
+ EVT VT) const override;
+
+ unsigned getNumRegistersForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC,
+ EVT VT) const override;
+
+ unsigned getVectorTypeBreakdownForCallingConv(
+ LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
+ unsigned &NumIntermediates, MVT &RegisterVT) const override;
+
+ bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
+
+ bool supportSwiftError() const override;
+
+ bool hasStackProbeSymbol(MachineFunction &MF) const override;
+ bool hasInlineStackProbe(MachineFunction &MF) const override;
+ StringRef getStackProbeSymbolName(MachineFunction &MF) const override;
+
+ unsigned getStackProbeSize(MachineFunction &MF) const;
+
+ bool hasVectorBlend() const override { return true; }
+
+ unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
+
+ /// Lower interleaved load(s) into target specific
+ /// instructions/intrinsics.
+ bool lowerInterleavedLoad(LoadInst *LI,
+ ArrayRef<ShuffleVectorInst *> Shuffles,
+ ArrayRef<unsigned> Indices,
+ unsigned Factor) const override;
+
+ /// Lower interleaved store(s) into target specific
+ /// instructions/intrinsics.
+ bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ unsigned Factor) const override;
+
+ SDValue expandIndirectJTBranch(const SDLoc& dl, SDValue Value,
+ SDValue Addr, SelectionDAG &DAG)
+ const override;
+
+ Align getPrefLoopAlignment(MachineLoop *ML) const override;
+
+ protected:
+ std::pair<const TargetRegisterClass *, uint8_t>
+ findRepresentativeClass(const TargetRegisterInfo *TRI,
+ MVT VT) const override;
+
+ private:
+ /// Keep a reference to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget &Subtarget;
+
+ /// Select between SSE and x87 floating point ops.
+ /// When SSE is available, use it for f32 operations.
+ /// When SSE2 is available, use it for f64 operations.
+ bool X86ScalarSSEf32;
+ bool X86ScalarSSEf64;
+
+ /// A list of legal FP immediates.
+ std::vector<APFloat> LegalFPImmediates;
+
+ /// Indicate that this x86 target can instruction
+ /// select the specified FP immediate natively.
+ void addLegalFPImmediate(const APFloat& Imm) {
+ LegalFPImmediates.push_back(Imm);
+ }
+
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals,
+ uint32_t *RegMask) const;
+ SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
+ const SmallVectorImpl<ISD::InputArg> &ArgInfo,
+ const SDLoc &dl, SelectionDAG &DAG,
+ const CCValAssign &VA, MachineFrameInfo &MFI,
+ unsigned i) const;
+ SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
+ const SDLoc &dl, SelectionDAG &DAG,
+ const CCValAssign &VA,
+ ISD::ArgFlagsTy Flags, bool isByval) const;
+
+ // Call lowering helpers.
+
+ /// Check whether the call is eligible for tail call optimization. Targets
+ /// that want to do tail call optimization should implement this function.
+ bool IsEligibleForTailCallOptimization(SDValue Callee,
+ CallingConv::ID CalleeCC,
+ bool isVarArg,
+ bool isCalleeStructRet,
+ bool isCallerStructRet,
+ Type *RetTy,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SelectionDAG& DAG) const;
+ SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
+ SDValue Chain, bool IsTailCall,
+ bool Is64Bit, int FPDiff,
+ const SDLoc &dl) const;
+
+ unsigned GetAlignedArgumentStackSize(unsigned StackSize,
+ SelectionDAG &DAG) const;
+
+ unsigned getAddressSpace(void) const;
+
+ SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned,
+ SDValue &Chain) const;
+ SDValue LRINT_LLRINTHelper(SDNode *N, SelectionDAG &DAG) const;
+
+ SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+
+ unsigned getGlobalWrapperKind(const GlobalValue *GV = nullptr,
+ const unsigned char OpFlags = 0) const;
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+
+ /// Creates target global address or external symbol nodes for calls or
+ /// other uses.
+ SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
+ bool ForCall) const;
+
+ SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue
+ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &dl, SelectionDAG &DAG) const override;
+
+ bool supportSplitCSR(MachineFunction *MF) const override {
+ return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
+ MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
+ }
+ void initializeSplitCSR(MachineBasicBlock *Entry) const override;
+ void insertCopiesSplitCSR(
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+
+ bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
+
+ bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
+
+ EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
+ ISD::NodeType ExtendKind) const override;
+
+ bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
+
+ const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
+
+ TargetLoweringBase::AtomicExpansionKind
+ shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
+ bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+ TargetLoweringBase::AtomicExpansionKind
+ shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+
+ LoadInst *
+ lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
+
+ bool lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const override;
+ bool lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const override;
+
+ bool needsCmpXchgNb(Type *MemType) const;
+
+ void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
+ MachineBasicBlock *DispatchBB, int FI) const;
+
+ // Utility function to emit the low-level va_arg code for X86-64.
+ MachineBasicBlock *
+ EmitVAARGWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const;
+
+ /// Utility function to emit the xmm reg save portion of va_start.
+ MachineBasicBlock *
+ EmitVAStartSaveXMMRegsWithCustomInserter(MachineInstr &BInstr,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1,
+ MachineInstr &MI2,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredProbedAlloca(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredIndirectThunk(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
+ void emitSetJmpShadowStackFix(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
+ MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
+ MachineBasicBlock *emitLongJmpShadowStackFix(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
+ MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
+ /// Emit flags for the given setcc condition and operands. Also returns the
+ /// corresponding X86 condition code constant in X86CC.
+ SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SDValue &X86CC) const;
+
+ /// Check if replacement of SQRT with RSQRT should be disabled.
+ bool isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const override;
+
+ /// Use rsqrt* to speed up sqrt calculations.
+ SDValue getSqrtEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
+ int &RefinementSteps, bool &UseOneConstNR,
+ bool Reciprocal) const override;
+
+ /// Use rcp* to speed up fdiv calculations.
+ SDValue getRecipEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
+ int &RefinementSteps) const override;
+
+ /// Reassociate floating point divisions into multiply by reciprocal.
+ unsigned combineRepeatedFPDivisors() const override;
+
+ SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
+ SmallVectorImpl<SDNode *> &Created) const override;
+ };
+
+ namespace X86 {
+ FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo);
+ } // end namespace X86
+
+ // X86 specific Gather/Scatter nodes.
+ // The class has the same order of operands as MaskedGatherScatterSDNode for
+ // convenience.
+ class X86MaskedGatherScatterSDNode : public MemIntrinsicSDNode {
+ public:
+ // This is intended as a utility and should never be directly created.
+ X86MaskedGatherScatterSDNode() = delete;
+ ~X86MaskedGatherScatterSDNode() = delete;
+
+ const SDValue &getBasePtr() const { return getOperand(3); }
+ const SDValue &getIndex() const { return getOperand(4); }
+ const SDValue &getMask() const { return getOperand(2); }
+ const SDValue &getScale() const { return getOperand(5); }
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == X86ISD::MGATHER ||
+ N->getOpcode() == X86ISD::MSCATTER;
+ }
+ };
+
+ class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode {
+ public:
+ const SDValue &getPassThru() const { return getOperand(1); }
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == X86ISD::MGATHER;
+ }
+ };
+
+ class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode {
+ public:
+ const SDValue &getValue() const { return getOperand(1); }
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == X86ISD::MSCATTER;
+ }
+ };
+
+ /// Generate unpacklo/unpackhi shuffle mask.
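+ /// For example, for v4i32: Lo --> <0, 4, 1, 5> (or <0, 0, 1, 1> if Unary),
+ /// Hi --> <2, 6, 3, 7> (or <2, 2, 3, 3> if Unary).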
+ void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask, bool Lo,
+ bool Unary);
+
+ /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
+ /// imposed by AVX and specific to the unary pattern. Example:
+ /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
+ /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
+ void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo);
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
new file mode 100644
index 000000000000..85410c54a4d2
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
@@ -0,0 +1,175 @@
+//===---- X86IndirectBranchTracking.cpp - Enables CET IBT mechanism -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that enables Indirect Branch Tracking (IBT) as part
+// of Control-Flow Enforcement Technology (CET).
+// The pass adds ENDBR (End Branch) machine instructions at the beginning of
+// each basic block or function that is referenced by an indirect jump/call
+// instruction.
+// The ENDBR instructions have a NOP encoding and as such are ignored on
+// targets that do not support the CET IBT mechanism.
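+//
+// For example (illustrative sketch only), given an indirect call such as
+//   callq *%rax
+// every function or block that may be its target is expected to begin with
+//   endbr64      ; ENDBR32 on 32-bit targets
+// which this pass inserts.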
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-indirect-branch-tracking"
+
+cl::opt<bool> IndirectBranchTracking(
+ "x86-indirect-branch-tracking", cl::init(false), cl::Hidden,
+ cl::desc("Enable X86 indirect branch tracking pass."));
+
+STATISTIC(NumEndBranchAdded, "Number of ENDBR instructions added");
+
+namespace {
+class X86IndirectBranchTrackingPass : public MachineFunctionPass {
+public:
+ X86IndirectBranchTrackingPass() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override {
+ return "X86 Indirect Branch Tracking";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ static char ID;
+
+ /// Machine instruction info used throughout the class.
+ const X86InstrInfo *TII = nullptr;
+
+ /// Endbr opcode for the current machine function.
+ unsigned int EndbrOpcode = 0;
+
+ /// Adds a new ENDBR instruction at the beginning of the MBB.
+ /// The function will not add one if an ENDBR already exists there.
+ /// It adds an ENDBR32 or ENDBR64 opcode, depending on the target.
+ /// \returns true if the ENDBR was added and false otherwise.
+ bool addENDBR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const;
+};
+
+} // end anonymous namespace
+
+char X86IndirectBranchTrackingPass::ID = 0;
+
+FunctionPass *llvm::createX86IndirectBranchTrackingPass() {
+ return new X86IndirectBranchTrackingPass();
+}
+
+bool X86IndirectBranchTrackingPass::addENDBR(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
+ assert(TII && "Target instruction info was not initialized");
+ assert((X86::ENDBR64 == EndbrOpcode || X86::ENDBR32 == EndbrOpcode) &&
+ "Unexpected Endbr opcode");
+
+ // If I is at the end of the MBB or the instruction at I is not an ENDBR,
+ // insert an ENDBR instruction at the location of I.
+ if (I == MBB.end() || I->getOpcode() != EndbrOpcode) {
+ BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(EndbrOpcode));
+ ++NumEndBranchAdded;
+ return true;
+ }
+ return false;
+}
+
+static bool IsCallReturnTwice(llvm::MachineOperand &MOp) {
+ if (!MOp.isGlobal())
+ return false;
+ auto *CalleeFn = dyn_cast<Function>(MOp.getGlobal());
+ if (!CalleeFn)
+ return false;
+ AttributeList Attrs = CalleeFn->getAttributes();
+ return Attrs.hasFnAttribute(Attribute::ReturnsTwice);
+}
+
+bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) {
+ const X86Subtarget &SubTarget = MF.getSubtarget<X86Subtarget>();
+
+ // Check whether the cf-protection-branch module flag is enabled.
+ Metadata *isCFProtectionSupported =
+ MF.getMMI().getModule()->getModuleFlag("cf-protection-branch");
+ // NB: We need to enable IBT in jitted code if JIT compiler is CET
+ // enabled.
+ const X86TargetMachine *TM =
+ static_cast<const X86TargetMachine *>(&MF.getTarget());
+#ifdef __CET__
+ bool isJITwithCET = TM->isJIT();
+#else
+ bool isJITwithCET = false;
+#endif
+ if (!isCFProtectionSupported && !IndirectBranchTracking && !isJITwithCET)
+ return false;
+
+ // True if the current MF was changed and false otherwise.
+ bool Changed = false;
+
+ TII = SubTarget.getInstrInfo();
+ EndbrOpcode = SubTarget.is64Bit() ? X86::ENDBR64 : X86::ENDBR32;
+
+ // With the large code model, or if the function is non-internal or has its
+ // address taken, it can be reached through indirect calls. Mark the first
+ // BB with an ENDBR instruction unless the nocf_check attribute is used.
+ if ((TM->getCodeModel() == CodeModel::Large ||
+ MF.getFunction().hasAddressTaken() ||
+ !MF.getFunction().hasLocalLinkage()) &&
+ !MF.getFunction().doesNoCfCheck()) {
+ auto MBB = MF.begin();
+ Changed |= addENDBR(*MBB, MBB->begin());
+ }
+
+ for (auto &MBB : MF) {
+ // Find all basic blocks whose address was taken (for example, targets
+ // of an indirect jump) and add an ENDBR instruction.
+ if (MBB.hasAddressTaken())
+ Changed |= addENDBR(MBB, MBB.begin());
+
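+ // Calls to functions marked returns_twice (e.g. setjmp) may be returned to
+ // a second time through an indirect transfer (e.g. from longjmp), so the
+ // instruction following such a call also needs an ENDBR.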
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
+ if (I->isCall() && IsCallReturnTwice(I->getOperand(0)))
+ Changed |= addENDBR(MBB, std::next(I));
+ }
+
+ // Exception handling may indirectly jump to a catch pad, so we should add
+ // an ENDBR before catch pad instructions. For the SjLj exception model, a
+ // new BB (the new landing pad) is created that indirectly jumps to the old
+ // landing pad.
+ if (TM->Options.ExceptionModel == ExceptionHandling::SjLj) {
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
+ // New Landingpad BB without EHLabel.
+ if (MBB.isEHPad()) {
+ if (I->isDebugInstr())
+ continue;
+ Changed |= addENDBR(MBB, I);
+ break;
+ } else if (I->isEHLabel()) {
+ // Old landing pad BB (no longer a landing pad) with
+ // the old "callee" EHLabel.
+ MCSymbol *Sym = I->getOperand(0).getMCSymbol();
+ if (!MF.hasCallSiteLandingPad(Sym))
+ continue;
+ Changed |= addENDBR(MBB, std::next(I));
+ break;
+ }
+ }
+ } else if (MBB.isEHPad()) {
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
+ if (!I->isEHLabel())
+ continue;
+ Changed |= addENDBR(MBB, std::next(I));
+ break;
+ }
+ }
+ }
+ return Changed;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectThunks.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectThunks.cpp
new file mode 100644
index 000000000000..3d96d198b409
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectThunks.cpp
@@ -0,0 +1,269 @@
+//==- X86IndirectThunks.cpp - Construct indirect call/jump thunks for x86 --=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// Pass that injects an MI thunk that is used to lower indirect calls in a way
+/// that prevents speculation on some x86 processors and can be used to mitigate
+/// security vulnerabilities due to targeted speculative execution and side
+/// channels such as CVE-2017-5715.
+///
+/// Currently supported thunks include:
+/// - Retpoline -- A RET-implemented trampoline that lowers indirect calls
+/// - LVI Thunk -- A CALL/JMP-implemented thunk that forces load serialization
+/// before making an indirect call/jump
+///
+/// Note that the reason that this is implemented as a MachineFunctionPass and
+/// not a ModulePass is that ModulePasses at this point in the LLVM X86 pipeline
+/// serialize all transformations, which can consume lots of memory.
+///
+/// TODO(chandlerc): All of this code could use better comments and
+/// documentation.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/IndirectThunks.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-retpoline-thunks"
+
+static const char RetpolineNamePrefix[] = "__llvm_retpoline_";
+static const char R11RetpolineName[] = "__llvm_retpoline_r11";
+static const char EAXRetpolineName[] = "__llvm_retpoline_eax";
+static const char ECXRetpolineName[] = "__llvm_retpoline_ecx";
+static const char EDXRetpolineName[] = "__llvm_retpoline_edx";
+static const char EDIRetpolineName[] = "__llvm_retpoline_edi";
+
+static const char LVIThunkNamePrefix[] = "__llvm_lvi_thunk_";
+static const char R11LVIThunkName[] = "__llvm_lvi_thunk_r11";
+
+namespace {
+struct RetpolineThunkInserter : ThunkInserter<RetpolineThunkInserter> {
+ const char *getThunkPrefix() { return RetpolineNamePrefix; }
+ bool mayUseThunk(const MachineFunction &MF) {
+ const auto &STI = MF.getSubtarget<X86Subtarget>();
+ return (STI.useRetpolineIndirectCalls() ||
+ STI.useRetpolineIndirectBranches()) &&
+ !STI.useRetpolineExternalThunk();
+ }
+ void insertThunks(MachineModuleInfo &MMI);
+ void populateThunk(MachineFunction &MF);
+};
+
+struct LVIThunkInserter : ThunkInserter<LVIThunkInserter> {
+ const char *getThunkPrefix() { return LVIThunkNamePrefix; }
+ bool mayUseThunk(const MachineFunction &MF) {
+ return MF.getSubtarget<X86Subtarget>().useLVIControlFlowIntegrity();
+ }
+ void insertThunks(MachineModuleInfo &MMI) {
+ createThunkFunction(MMI, R11LVIThunkName);
+ }
+ void populateThunk(MachineFunction &MF) {
+ assert(MF.size() == 1);
+ MachineBasicBlock *Entry = &MF.front();
+ Entry->clear();
+
+ // This code mitigates LVI by replacing each indirect call/jump with a
+ // direct call/jump to a thunk that looks like:
+ // ```
+ // lfence
+ // jmpq *%r11
+ // ```
+ // This ensures that if the value in register %r11 was loaded from memory,
+ // then the value in %r11 is (architecturally) correct prior to the jump.
+ const TargetInstrInfo *TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+ BuildMI(&MF.front(), DebugLoc(), TII->get(X86::LFENCE));
+ BuildMI(&MF.front(), DebugLoc(), TII->get(X86::JMP64r)).addReg(X86::R11);
+ MF.front().addLiveIn(X86::R11);
+ }
+};
+
+class X86IndirectThunks : public MachineFunctionPass {
+public:
+ static char ID;
+
+ X86IndirectThunks() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override { return "X86 Indirect Thunks"; }
+
+ bool doInitialization(Module &M) override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ std::tuple<RetpolineThunkInserter, LVIThunkInserter> TIs;
+
+ // FIXME: When LLVM moves to C++17, these can become folds
+ template <typename... ThunkInserterT>
+ static void initTIs(Module &M,
+ std::tuple<ThunkInserterT...> &ThunkInserters) {
+ (void)std::initializer_list<int>{
+ (std::get<ThunkInserterT>(ThunkInserters).init(M), 0)...};
+ }
+ template <typename... ThunkInserterT>
+ static bool runTIs(MachineModuleInfo &MMI, MachineFunction &MF,
+ std::tuple<ThunkInserterT...> &ThunkInserters) {
+ bool Modified = false;
+ (void)std::initializer_list<int>{
+ Modified |= std::get<ThunkInserterT>(ThunkInserters).run(MMI, MF)...};
+ return Modified;
+ }
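+ // Sketch of the C++17 fold-expression form referenced by the FIXME above
+ // (assumes C++17; not usable while the codebase targets C++14):
+ //   initTIs: (std::get<ThunkInserterT>(ThunkInserters).init(M), ...);
+ //   runTIs:  return (std::get<ThunkInserterT>(ThunkInserters).run(MMI, MF) | ...);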
+};
+
+} // end anonymous namespace
+
+void RetpolineThunkInserter::insertThunks(MachineModuleInfo &MMI) {
+ if (MMI.getTarget().getTargetTriple().getArch() == Triple::x86_64)
+ createThunkFunction(MMI, R11RetpolineName);
+ else
+ for (StringRef Name : {EAXRetpolineName, ECXRetpolineName, EDXRetpolineName,
+ EDIRetpolineName})
+ createThunkFunction(MMI, Name);
+}
+
+void RetpolineThunkInserter::populateThunk(MachineFunction &MF) {
+ bool Is64Bit = MF.getTarget().getTargetTriple().getArch() == Triple::x86_64;
+ Register ThunkReg;
+ if (Is64Bit) {
+ assert(MF.getName() == "__llvm_retpoline_r11" &&
+ "Should only have an r11 thunk on 64-bit targets");
+
+ // __llvm_retpoline_r11:
+ // callq .Lr11_call_target
+ // .Lr11_capture_spec:
+ // pause
+ // lfence
+ // jmp .Lr11_capture_spec
+ // .align 16
+ // .Lr11_call_target:
+ // movq %r11, (%rsp)
+ // retq
+ ThunkReg = X86::R11;
+ } else {
+ // For 32-bit targets we need to emit a collection of thunks for various
+ // possible scratch registers as well as a fallback that uses EDI, which is
+ // normally callee saved.
+ // __llvm_retpoline_eax:
+ // calll .Leax_call_target
+ // .Leax_capture_spec:
+ // pause
+ // jmp .Leax_capture_spec
+ // .align 16
+ // .Leax_call_target:
+ // movl %eax, (%esp) # Clobber return addr
+ // retl
+ //
+ // __llvm_retpoline_ecx:
+ // ... # Same setup
+ // movl %ecx, (%esp)
+ // retl
+ //
+ // __llvm_retpoline_edx:
+ // ... # Same setup
+ // movl %edx, (%esp)
+ // retl
+ //
+ // __llvm_retpoline_edi:
+ // ... # Same setup
+ // movl %edi, (%esp)
+ // retl
+ if (MF.getName() == EAXRetpolineName)
+ ThunkReg = X86::EAX;
+ else if (MF.getName() == ECXRetpolineName)
+ ThunkReg = X86::ECX;
+ else if (MF.getName() == EDXRetpolineName)
+ ThunkReg = X86::EDX;
+ else if (MF.getName() == EDIRetpolineName)
+ ThunkReg = X86::EDI;
+ else
+ llvm_unreachable("Invalid thunk name on x86-32!");
+ }
+
+ const TargetInstrInfo *TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+ assert(MF.size() == 1);
+ MachineBasicBlock *Entry = &MF.front();
+ Entry->clear();
+
+ MachineBasicBlock *CaptureSpec =
+ MF.CreateMachineBasicBlock(Entry->getBasicBlock());
+ MachineBasicBlock *CallTarget =
+ MF.CreateMachineBasicBlock(Entry->getBasicBlock());
+ MCSymbol *TargetSym = MF.getContext().createTempSymbol();
+ MF.push_back(CaptureSpec);
+ MF.push_back(CallTarget);
+
+ const unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
+ const unsigned RetOpc = Is64Bit ? X86::RETQ : X86::RETL;
+
+ Entry->addLiveIn(ThunkReg);
+ BuildMI(Entry, DebugLoc(), TII->get(CallOpc)).addSym(TargetSym);
+
+ // The MIR verifier thinks that the CALL in the entry block will fall through
+ // to CaptureSpec, so mark it as the successor. Technically, CallTarget is
+ // the successor, but the MIR verifier doesn't know how to cope with that.
+ Entry->addSuccessor(CaptureSpec);
+
+ // In the capture loop for speculation, we want to stop the processor from
+ // speculating as fast as possible. On Intel processors, the PAUSE instruction
+ // will block speculation without consuming any execution resources. On AMD
+ // processors, the PAUSE instruction is (essentially) a nop, so we also use an
+ // LFENCE instruction which they have advised will stop speculation as well
+ // with minimal resource utilization. We still end the capture with a jump to
+ // form an infinite loop to fully guarantee that, no matter what implementation
+ // of the x86 ISA is used, speculation down this code path never escapes.
+ BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::PAUSE));
+ BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::LFENCE));
+ BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::JMP_1)).addMBB(CaptureSpec);
+ CaptureSpec->setHasAddressTaken();
+ CaptureSpec->addSuccessor(CaptureSpec);
+
+ CallTarget->addLiveIn(ThunkReg);
+ CallTarget->setHasAddressTaken();
+ CallTarget->setAlignment(Align(16));
+
+ // Insert return address clobber
+ const unsigned MovOpc = Is64Bit ? X86::MOV64mr : X86::MOV32mr;
+ const Register SPReg = Is64Bit ? X86::RSP : X86::ESP;
+ addRegOffset(BuildMI(CallTarget, DebugLoc(), TII->get(MovOpc)), SPReg, false,
+ 0)
+ .addReg(ThunkReg);
+
+ CallTarget->back().setPreInstrSymbol(MF, TargetSym);
+ BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc));
+}
+
+FunctionPass *llvm::createX86IndirectThunksPass() {
+ return new X86IndirectThunks();
+}
+
+char X86IndirectThunks::ID = 0;
+
+bool X86IndirectThunks::doInitialization(Module &M) {
+ initTIs(M, TIs);
+ return false;
+}
+
+bool X86IndirectThunks::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << getPassName() << '\n');
+ auto &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+ return runTIs(MMI, MF, TIs);
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InsertPrefetch.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InsertPrefetch.cpp
new file mode 100644
index 000000000000..004e6fa5ebf4
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InsertPrefetch.cpp
@@ -0,0 +1,253 @@
+//===------- X86InsertPrefetch.cpp - Insert cache prefetch hints ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass applies cache prefetch instructions based on a profile. The pass
+// assumes DiscriminateMemOps ran immediately before, to ensure debug info
+// matches the one used at profile generation time. The profile is encoded in
+// afdo format (text or binary). It contains prefetch hint recommendations.
+// Each recommendation is made in terms of debug info locations, a type (i.e.
+// nta, t{0|1|2}) and a delta. The debug info identifies an instruction with a
+// memory operand (see X86DiscriminateMemOps). The prefetch will be made for
+// a location at that memory operand + the delta specified in the
+// recommendation.
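+//
+// For example (an illustrative hint name, following the parsing in
+// findPrefetchInfo below): a hint named "__prefetch_nta_0" with value 64
+// requests a PREFETCHNTA of the matched instruction's memory operand
+// address plus 64, emitted immediately before that instruction.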
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/ProfileData/SampleProf.h"
+#include "llvm/ProfileData/SampleProfReader.h"
+#include "llvm/Transforms/IPO/SampleProfile.h"
+using namespace llvm;
+using namespace sampleprof;
+
+static cl::opt<std::string>
+ PrefetchHintsFile("prefetch-hints-file",
+ cl::desc("Path to the prefetch hints profile. See also "
+ "-x86-discriminate-memops"),
+ cl::Hidden);
+namespace {
+
+class X86InsertPrefetch : public MachineFunctionPass {
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool doInitialization(Module &) override;
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ struct PrefetchInfo {
+ unsigned InstructionID;
+ int64_t Delta;
+ };
+ typedef SmallVectorImpl<PrefetchInfo> Prefetches;
+ bool findPrefetchInfo(const FunctionSamples *Samples, const MachineInstr &MI,
+ Prefetches &prefetches) const;
+
+public:
+ static char ID;
+ X86InsertPrefetch(const std::string &PrefetchHintsFilename);
+ StringRef getPassName() const override {
+ return "X86 Insert Cache Prefetches";
+ }
+
+private:
+ std::string Filename;
+ std::unique_ptr<SampleProfileReader> Reader;
+};
+
+using PrefetchHints = SampleRecord::CallTargetMap;
+
+// Return any prefetching hints for the specified MachineInstruction. The hints
+// are returned as pairs (name, delta).
+ErrorOr<PrefetchHints> getPrefetchHints(const FunctionSamples *TopSamples,
+ const MachineInstr &MI) {
+ if (const auto &Loc = MI.getDebugLoc())
+ if (const auto *Samples = TopSamples->findFunctionSamples(Loc))
+ return Samples->findCallTargetMapAt(FunctionSamples::getOffset(Loc),
+ Loc->getBaseDiscriminator());
+ return std::error_code();
+}
+
+// The prefetch instruction can't take memory operands involving vector
+// registers.
+bool IsMemOpCompatibleWithPrefetch(const MachineInstr &MI, int Op) {
+ Register BaseReg = MI.getOperand(Op + X86::AddrBaseReg).getReg();
+ Register IndexReg = MI.getOperand(Op + X86::AddrIndexReg).getReg();
+ return (BaseReg == 0 ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) ||
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg)) &&
+ (IndexReg == 0 ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg));
+}
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// Implementation
+//===----------------------------------------------------------------------===//
+
+char X86InsertPrefetch::ID = 0;
+
+X86InsertPrefetch::X86InsertPrefetch(const std::string &PrefetchHintsFilename)
+ : MachineFunctionPass(ID), Filename(PrefetchHintsFilename) {}
+
+/// Return true if the provided MachineInstruction has cache prefetch hints. In
+/// that case, the prefetch hints are stored, in order, in the Prefetches
+/// vector.
+bool X86InsertPrefetch::findPrefetchInfo(const FunctionSamples *TopSamples,
+ const MachineInstr &MI,
+ Prefetches &Prefetches) const {
+ assert(Prefetches.empty() &&
+ "Expected caller passed empty PrefetchInfo vector.");
+ static constexpr std::pair<StringLiteral, unsigned> HintTypes[] = {
+ {"_nta_", X86::PREFETCHNTA},
+ {"_t0_", X86::PREFETCHT0},
+ {"_t1_", X86::PREFETCHT1},
+ {"_t2_", X86::PREFETCHT2},
+ };
+ static const char *SerializedPrefetchPrefix = "__prefetch";
+
+ const ErrorOr<PrefetchHints> T = getPrefetchHints(TopSamples, MI);
+ if (!T)
+ return false;
+ int16_t max_index = -1;
+ // Convert serialized prefetch hints into PrefetchInfo objects, and populate
+ // the Prefetches vector.
+ for (const auto &S_V : *T) {
+ StringRef Name = S_V.getKey();
+ if (Name.consume_front(SerializedPrefetchPrefix)) {
+ int64_t D = static_cast<int64_t>(S_V.second);
+ unsigned IID = 0;
+ for (const auto &HintType : HintTypes) {
+ if (Name.startswith(HintType.first)) {
+ Name = Name.drop_front(HintType.first.size());
+ IID = HintType.second;
+ break;
+ }
+ }
+ if (IID == 0)
+ return false;
+ uint8_t index = 0;
+ Name.consumeInteger(10, index);
+
+ if (index >= Prefetches.size())
+ Prefetches.resize(index + 1);
+ Prefetches[index] = {IID, D};
+ max_index = std::max(max_index, static_cast<int16_t>(index));
+ }
+ }
+ assert(max_index + 1 >= 0 &&
+ "Possible overflow: max_index + 1 should be positive.");
+ assert(static_cast<size_t>(max_index + 1) == Prefetches.size() &&
+ "The number of prefetch hints received should match the number of "
+ "PrefetchInfo objects returned");
+ return !Prefetches.empty();
+}
+
+bool X86InsertPrefetch::doInitialization(Module &M) {
+ if (Filename.empty())
+ return false;
+
+ LLVMContext &Ctx = M.getContext();
+ ErrorOr<std::unique_ptr<SampleProfileReader>> ReaderOrErr =
+ SampleProfileReader::create(Filename, Ctx);
+ if (std::error_code EC = ReaderOrErr.getError()) {
+ std::string Msg = "Could not open profile: " + EC.message();
+ Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg,
+ DiagnosticSeverity::DS_Warning));
+ return false;
+ }
+ Reader = std::move(ReaderOrErr.get());
+ Reader->read();
+ return true;
+}
+
+void X86InsertPrefetch::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool X86InsertPrefetch::runOnMachineFunction(MachineFunction &MF) {
+ if (!Reader)
+ return false;
+ const FunctionSamples *Samples = Reader->getSamplesFor(MF.getFunction());
+ if (!Samples)
+ return false;
+
+ bool Changed = false;
+
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ SmallVector<PrefetchInfo, 4> Prefetches;
+ for (auto &MBB : MF) {
+ for (auto MI = MBB.instr_begin(); MI != MBB.instr_end();) {
+ auto Current = MI;
+ ++MI;
+
+ int Offset = X86II::getMemoryOperandNo(Current->getDesc().TSFlags);
+ if (Offset < 0)
+ continue;
+ unsigned Bias = X86II::getOperandBias(Current->getDesc());
+ int MemOpOffset = Offset + Bias;
+ // FIXME(mtrofin): ORE message when the recommendation cannot be taken.
+ if (!IsMemOpCompatibleWithPrefetch(*Current, MemOpOffset))
+ continue;
+ Prefetches.clear();
+ if (!findPrefetchInfo(Samples, *Current, Prefetches))
+ continue;
+ assert(!Prefetches.empty() &&
+ "The Prefetches vector should contain at least a value if "
+ "findPrefetchInfo returned true.");
+ for (auto &PrefInfo : Prefetches) {
+ unsigned PFetchInstrID = PrefInfo.InstructionID;
+ int64_t Delta = PrefInfo.Delta;
+ const MCInstrDesc &Desc = TII->get(PFetchInstrID);
+ MachineInstr *PFetch =
+ MF.CreateMachineInstr(Desc, Current->getDebugLoc(), true);
+ MachineInstrBuilder MIB(MF, PFetch);
+
+ static_assert(X86::AddrBaseReg == 0 && X86::AddrScaleAmt == 1 &&
+ X86::AddrIndexReg == 2 && X86::AddrDisp == 3 &&
+ X86::AddrSegmentReg == 4,
+ "Unexpected change in X86 operand offset order.");
+
+ // This assumes X86::AddrBaseReg = 0, {...}ScaleAmt = 1, etc.
+ // FIXME(mtrofin): consider adding a:
+ // MachineInstrBuilder::set(unsigned offset, op).
+ MIB.addReg(Current->getOperand(MemOpOffset + X86::AddrBaseReg).getReg())
+ .addImm(
+ Current->getOperand(MemOpOffset + X86::AddrScaleAmt).getImm())
+ .addReg(
+ Current->getOperand(MemOpOffset + X86::AddrIndexReg).getReg())
+ .addImm(Current->getOperand(MemOpOffset + X86::AddrDisp).getImm() +
+ Delta)
+ .addReg(Current->getOperand(MemOpOffset + X86::AddrSegmentReg)
+ .getReg());
+
+ if (!Current->memoperands_empty()) {
+ MachineMemOperand *CurrentOp = *(Current->memoperands_begin());
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ CurrentOp, CurrentOp->getOffset() + Delta, CurrentOp->getSize()));
+ }
+
+ // Insert before Current. This is because Current may clobber some of
+ // the registers used to describe the input memory operand.
+ MBB.insert(Current, PFetch);
+ Changed = true;
+ }
+ }
+ }
+ return Changed;
+}
+
+FunctionPass *llvm::createX86InsertPrefetchPass() {
+ return new X86InsertPrefetch(PrefetchHintsFile);
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InsertWait.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InsertWait.cpp
new file mode 100644
index 000000000000..56d2709f5937
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InsertWait.cpp
@@ -0,0 +1,147 @@
+//===- X86InsertWait.cpp - Insert wait instructions after X87 instructions ==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which inserts x86 wait instructions after each
+// X87 instruction when strict float is enabled.
+//
+// The logic for inserting a wait instruction after an X87 instruction is:
+// 1. If the X87 instruction neither raises a float exception nor is a
+//    load/store instruction, or it is an x87 control instruction, don't
+//    insert a wait.
+// 2. If the X87 instruction is followed by an X87 instruction that itself
+//    synchronizes exceptions, don't insert a wait.
+// 3. Otherwise, insert a wait instruction.
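+//
+// For example (illustrative):
+//   fadd st, st(1)   ; may raise an FP exception
+//   wait             ; inserted by this pass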
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-insert-wait"
+
+namespace {
+
+class WaitInsert : public MachineFunctionPass {
+public:
+ static char ID;
+
+ WaitInsert() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "X86 insert wait instruction";
+ }
+};
+
+} // namespace
+
+char WaitInsert::ID = 0;
+
+FunctionPass *llvm::createX86InsertX87waitPass() { return new WaitInsert(); }
+
+/// Return true if Reg is an X87 register.
+static bool isX87Reg(unsigned Reg) {
+ return (Reg == X86::FPCW || Reg == X86::FPSW ||
+ (Reg >= X86::ST0 && Reg <= X86::ST7));
+}
+
+/// Check if the instruction is an X87 instruction.
+static bool isX87Instruction(MachineInstr &MI) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ if (isX87Reg(MO.getReg()))
+ return true;
+ }
+ return false;
+}
+
+static bool isX87ControlInstruction(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case X86::FNINIT:
+ case X86::FLDCW16m:
+ case X86::FNSTCW16m:
+ case X86::FNSTSW16r:
+ case X86::FNSTSWm:
+ case X86::FNCLEX:
+ case X86::FLDENVm:
+ case X86::FSTENVm:
+ case X86::FRSTORm:
+ case X86::FSAVEm:
+ case X86::FINCSTP:
+ case X86::FDECSTP:
+ case X86::FFREE:
+ case X86::FFREEP:
+ case X86::FNOP:
+ case X86::WAIT:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool isX87NonWaitingControlInstruction(MachineInstr &MI) {
+ // A few special control instructions don't perform a wait operation.
+ switch (MI.getOpcode()) {
+ case X86::FNINIT:
+ case X86::FNSTSW16r:
+ case X86::FNSTSWm:
+ case X86::FNSTCW16m:
+ case X86::FNCLEX:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool WaitInsert::runOnMachineFunction(MachineFunction &MF) {
+ if (!MF.getFunction().hasFnAttribute(Attribute::StrictFP))
+ return false;
+
+ const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+ const X86InstrInfo *TII = ST.getInstrInfo();
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineBasicBlock::iterator MI = MBB.begin(); MI != MBB.end(); ++MI) {
+ // Skip non-X87 instructions.
+ if (!isX87Instruction(*MI))
+ continue;
+ // If the instruction neither may raise a float exception nor is a
+ // load/store instruction, or it is an x87 control instruction, do not
+ // insert a wait.
+ if (!(MI->mayRaiseFPException() || MI->mayLoadOrStore()) ||
+ isX87ControlInstruction(*MI))
+ continue;
+ // If the following instruction is an X87 instruction and isn't an X87
+ // non-waiting control instruction, we can omit inserting a wait instruction.
+ MachineBasicBlock::iterator AfterMI = std::next(MI);
+ if (AfterMI != MBB.end() && isX87Instruction(*AfterMI) &&
+ !isX87NonWaitingControlInstruction(*AfterMI))
+ continue;
+
+ BuildMI(MBB, AfterMI, MI->getDebugLoc(), TII->get(X86::WAIT));
+ LLVM_DEBUG(dbgs() << "\nInsert wait after:\t" << *MI);
+ // Skip over the newly inserted wait instruction.
+ ++MI;
+ Changed = true;
+ }
+ }
+ return Changed;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
new file mode 100644
index 000000000000..c4150ed52854
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -0,0 +1,2017 @@
+//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// X86 target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetTransformInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86tti"
+
+/// Return a constant boolean vector that has true elements in all positions
+/// where the input constant data vector has an element with the sign bit set.
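+/// For example, <4 x i32> <i32 -1, i32 7, i32 -8, i32 0> maps to
+/// <4 x i1> <i1 true, i1 false, i1 true, i1 false>.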
+static Constant *getNegativeIsTrueBoolVec(Constant *V) {
+ VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
+ V = ConstantExpr::getBitCast(V, IntTy);
+ V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
+ V);
+ return V;
+}
+
+/// Convert the x86 XMM integer vector mask to a vector of bools based on
+/// each element's most significant bit (the sign bit).
+static Value *getBoolVecFromMask(Value *Mask) {
+ // Fold Constant Mask.
+ if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
+ return getNegativeIsTrueBoolVec(ConstantMask);
+
+ // Mask was extended from a boolean vector.
+ Value *ExtMask;
+ if (PatternMatch::match(
+ Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
+ ExtMask->getType()->isIntOrIntVectorTy(1))
+ return ExtMask;
+
+ return nullptr;
+}
+
+// TODO: If the x86 backend knew how to convert a bool vector mask back to an
+// XMM register mask efficiently, we could transform all x86 masked intrinsics
+// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
+static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
+ Value *Ptr = II.getOperand(0);
+ Value *Mask = II.getOperand(1);
+ Constant *ZeroVec = Constant::getNullValue(II.getType());
+
+ // Zero Mask - masked load instruction creates a zero vector.
+ if (isa<ConstantAggregateZero>(Mask))
+ return IC.replaceInstUsesWith(II, ZeroVec);
+
+ // The mask is constant or extended from a bool vector. Convert this x86
+ // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
+ if (Value *BoolMask = getBoolVecFromMask(Mask)) {
+ // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
+ // the LLVM intrinsic definition for the pointer argument.
+ unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
+ PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
+ Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
+
+ // The pass-through vector for an x86 masked load is a zero vector.
+ CallInst *NewMaskedLoad =
+ IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec);
+ return IC.replaceInstUsesWith(II, NewMaskedLoad);
+ }
+
+ return nullptr;
+}
+
+// TODO: If the x86 backend knew how to convert a bool vector mask back to an
+// XMM register mask efficiently, we could transform all x86 masked intrinsics
+// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
+static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
+ Value *Ptr = II.getOperand(0);
+ Value *Mask = II.getOperand(1);
+ Value *Vec = II.getOperand(2);
+
+ // Zero Mask - this masked store instruction does nothing.
+ if (isa<ConstantAggregateZero>(Mask)) {
+ IC.eraseInstFromFunction(II);
+ return true;
+ }
+
+ // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
+ // anything else at this level.
+ if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
+ return false;
+
+ // The mask is constant or extended from a bool vector. Convert this x86
+ // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
+ if (Value *BoolMask = getBoolVecFromMask(Mask)) {
+ unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
+ PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
+ Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
+
+ IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
+
+ // 'Replace uses' doesn't work for stores. Erase the original masked store.
+ IC.eraseInstFromFunction(II);
+ return true;
+ }
+
+ return false;
+}
+
+static Value *simplifyX86immShift(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ bool LogicalShift = false;
+ bool ShiftLeft = false;
+ bool IsImm = false;
+
+ switch (II.getIntrinsicID()) {
+ default:
+ llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_sse2_psrai_w:
+ case Intrinsic::x86_avx2_psrai_d:
+ case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx512_psrai_q_128:
+ case Intrinsic::x86_avx512_psrai_q_256:
+ case Intrinsic::x86_avx512_psrai_d_512:
+ case Intrinsic::x86_avx512_psrai_q_512:
+ case Intrinsic::x86_avx512_psrai_w_512:
+ IsImm = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_avx2_psra_d:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx512_psra_q_128:
+ case Intrinsic::x86_avx512_psra_q_256:
+ case Intrinsic::x86_avx512_psra_d_512:
+ case Intrinsic::x86_avx512_psra_q_512:
+ case Intrinsic::x86_avx512_psra_w_512:
+ LogicalShift = false;
+ ShiftLeft = false;
+ break;
+ case Intrinsic::x86_sse2_psrli_d:
+ case Intrinsic::x86_sse2_psrli_q:
+ case Intrinsic::x86_sse2_psrli_w:
+ case Intrinsic::x86_avx2_psrli_d:
+ case Intrinsic::x86_avx2_psrli_q:
+ case Intrinsic::x86_avx2_psrli_w:
+ case Intrinsic::x86_avx512_psrli_d_512:
+ case Intrinsic::x86_avx512_psrli_q_512:
+ case Intrinsic::x86_avx512_psrli_w_512:
+ IsImm = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::x86_sse2_psrl_d:
+ case Intrinsic::x86_sse2_psrl_q:
+ case Intrinsic::x86_sse2_psrl_w:
+ case Intrinsic::x86_avx2_psrl_d:
+ case Intrinsic::x86_avx2_psrl_q:
+ case Intrinsic::x86_avx2_psrl_w:
+ case Intrinsic::x86_avx512_psrl_d_512:
+ case Intrinsic::x86_avx512_psrl_q_512:
+ case Intrinsic::x86_avx512_psrl_w_512:
+ LogicalShift = true;
+ ShiftLeft = false;
+ break;
+ case Intrinsic::x86_sse2_pslli_d:
+ case Intrinsic::x86_sse2_pslli_q:
+ case Intrinsic::x86_sse2_pslli_w:
+ case Intrinsic::x86_avx2_pslli_d:
+ case Intrinsic::x86_avx2_pslli_q:
+ case Intrinsic::x86_avx2_pslli_w:
+ case Intrinsic::x86_avx512_pslli_d_512:
+ case Intrinsic::x86_avx512_pslli_q_512:
+ case Intrinsic::x86_avx512_pslli_w_512:
+ IsImm = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::x86_sse2_psll_d:
+ case Intrinsic::x86_sse2_psll_q:
+ case Intrinsic::x86_sse2_psll_w:
+ case Intrinsic::x86_avx2_psll_d:
+ case Intrinsic::x86_avx2_psll_q:
+ case Intrinsic::x86_avx2_psll_w:
+ case Intrinsic::x86_avx512_psll_d_512:
+ case Intrinsic::x86_avx512_psll_q_512:
+ case Intrinsic::x86_avx512_psll_w_512:
+ LogicalShift = true;
+ ShiftLeft = true;
+ break;
+ }
+ assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
+
+ auto Vec = II.getArgOperand(0);
+ auto Amt = II.getArgOperand(1);
+ auto VT = cast<FixedVectorType>(Vec->getType());
+ auto SVT = VT->getElementType();
+ auto AmtVT = Amt->getType();
+ unsigned VWidth = VT->getNumElements();
+ unsigned BitWidth = SVT->getPrimitiveSizeInBits();
+
+ // If the shift amount is guaranteed to be in-range we can replace it with a
+ // generic shift. If it's guaranteed to be out of range, logical shifts combine
+ // to zero and arithmetic shifts are clamped to (BitWidth - 1).
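+ // For example (illustrative): a shift-by-immediate whose amount is known to
+ // be in-range, roughly
+ //   %r = call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %v, i32 3)
+ // becomes
+ //   %r = lshr <4 x i32> %v, <i32 3, i32 3, i32 3, i32 3>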
+ if (IsImm) {
+ assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
+ KnownBits KnownAmtBits =
+ llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
+ if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
+ Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
+ Amt = Builder.CreateVectorSplat(VWidth, Amt);
+ return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
+ : Builder.CreateLShr(Vec, Amt))
+ : Builder.CreateAShr(Vec, Amt));
+ }
+ if (KnownAmtBits.getMinValue().uge(BitWidth)) {
+ if (LogicalShift)
+ return ConstantAggregateZero::get(VT);
+ Amt = ConstantInt::get(SVT, BitWidth - 1);
+ return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
+ }
+ } else {
+ // Ensure the first element has an in-range value and the rest of the
+ // elements in the bottom 64 bits are zero.
+ assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
+ cast<VectorType>(AmtVT)->getElementType() == SVT &&
+ "Unexpected shift-by-scalar type");
+ unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
+ APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
+ APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
+ KnownBits KnownLowerBits = llvm::computeKnownBits(
+ Amt, DemandedLower, II.getModule()->getDataLayout());
+ KnownBits KnownUpperBits = llvm::computeKnownBits(
+ Amt, DemandedUpper, II.getModule()->getDataLayout());
+ if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
+ (DemandedUpper.isNullValue() || KnownUpperBits.isZero())) {
+ SmallVector<int, 16> ZeroSplat(VWidth, 0);
+ Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
+ return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
+ : Builder.CreateLShr(Vec, Amt))
+ : Builder.CreateAShr(Vec, Amt));
+ }
+ }
+
+ // Simplify if count is constant vector.
+ auto CDV = dyn_cast<ConstantDataVector>(Amt);
+ if (!CDV)
+ return nullptr;
+
+ // SSE2/AVX2 uses all of the first 64 bits of the 128-bit vector
+ // operand to compute the shift amount.
+ assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
+ cast<VectorType>(AmtVT)->getElementType() == SVT &&
+ "Unexpected shift-by-scalar type");
+
+ // Concatenate the sub-elements to create the 64-bit value.
+ APInt Count(64, 0);
+ for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
+ unsigned SubEltIdx = (NumSubElts - 1) - i;
+ auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
+ Count <<= BitWidth;
+ Count |= SubElt->getValue().zextOrTrunc(64);
+ }
+
+ // If shift-by-zero then just return the original value.
+ if (Count.isNullValue())
+ return Vec;
+
+ // Handle cases when Shift >= BitWidth.
+ if (Count.uge(BitWidth)) {
+ // If LogicalShift - just return zero.
+ if (LogicalShift)
+ return ConstantAggregateZero::get(VT);
+
+ // If ArithmeticShift - clamp Shift to (BitWidth - 1).
+ Count = APInt(64, BitWidth - 1);
+ }
+
+ // Get a constant vector of the same type as the first operand.
+ auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
+ auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
+
+ if (ShiftLeft)
+ return Builder.CreateShl(Vec, ShiftVec);
+
+ if (LogicalShift)
+ return Builder.CreateLShr(Vec, ShiftVec);
+
+ return Builder.CreateAShr(Vec, ShiftVec);
+}
+
+// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
+// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
+// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
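+ // For example (illustrative): if every lane's shift amount is known to be
+ // smaller than the element width, roughly
+ //   %r = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %v, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
+ // becomes
+ //   %r = lshr <4 x i32> %v, <i32 1, i32 2, i32 3, i32 4>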
+static Value *simplifyX86varShift(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ bool LogicalShift = false;
+ bool ShiftLeft = false;
+
+ switch (II.getIntrinsicID()) {
+ default:
+ llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::x86_avx2_psrav_d:
+ case Intrinsic::x86_avx2_psrav_d_256:
+ case Intrinsic::x86_avx512_psrav_q_128:
+ case Intrinsic::x86_avx512_psrav_q_256:
+ case Intrinsic::x86_avx512_psrav_d_512:
+ case Intrinsic::x86_avx512_psrav_q_512:
+ case Intrinsic::x86_avx512_psrav_w_128:
+ case Intrinsic::x86_avx512_psrav_w_256:
+ case Intrinsic::x86_avx512_psrav_w_512:
+ LogicalShift = false;
+ ShiftLeft = false;
+ break;
+ case Intrinsic::x86_avx2_psrlv_d:
+ case Intrinsic::x86_avx2_psrlv_d_256:
+ case Intrinsic::x86_avx2_psrlv_q:
+ case Intrinsic::x86_avx2_psrlv_q_256:
+ case Intrinsic::x86_avx512_psrlv_d_512:
+ case Intrinsic::x86_avx512_psrlv_q_512:
+ case Intrinsic::x86_avx512_psrlv_w_128:
+ case Intrinsic::x86_avx512_psrlv_w_256:
+ case Intrinsic::x86_avx512_psrlv_w_512:
+ LogicalShift = true;
+ ShiftLeft = false;
+ break;
+ case Intrinsic::x86_avx2_psllv_d:
+ case Intrinsic::x86_avx2_psllv_d_256:
+ case Intrinsic::x86_avx2_psllv_q:
+ case Intrinsic::x86_avx2_psllv_q_256:
+ case Intrinsic::x86_avx512_psllv_d_512:
+ case Intrinsic::x86_avx512_psllv_q_512:
+ case Intrinsic::x86_avx512_psllv_w_128:
+ case Intrinsic::x86_avx512_psllv_w_256:
+ case Intrinsic::x86_avx512_psllv_w_512:
+ LogicalShift = true;
+ ShiftLeft = true;
+ break;
+ }
+ assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
+
+ auto Vec = II.getArgOperand(0);
+ auto Amt = II.getArgOperand(1);
+ auto VT = cast<FixedVectorType>(II.getType());
+ auto SVT = VT->getElementType();
+ int NumElts = VT->getNumElements();
+ int BitWidth = SVT->getIntegerBitWidth();
+
+ // If the shift amount is guaranteed to be in-range we can replace it with a
+ // generic shift.
+ APInt UpperBits =
+ APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth));
+ if (llvm::MaskedValueIsZero(Amt, UpperBits,
+ II.getModule()->getDataLayout())) {
+ return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
+ : Builder.CreateLShr(Vec, Amt))
+ : Builder.CreateAShr(Vec, Amt));
+ }
+
+ // Simplify if all shift amounts are constant/undef.
+ auto *CShift = dyn_cast<Constant>(Amt);
+ if (!CShift)
+ return nullptr;
+
+ // Collect each element's shift amount.
+ // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
+ bool AnyOutOfRange = false;
+ SmallVector<int, 8> ShiftAmts;
+ for (int I = 0; I < NumElts; ++I) {
+ auto *CElt = CShift->getAggregateElement(I);
+ if (isa_and_nonnull<UndefValue>(CElt)) {
+ ShiftAmts.push_back(-1);
+ continue;
+ }
+
+ auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
+ if (!COp)
+ return nullptr;
+
+ // Handle out of range shifts.
+ // If LogicalShift - set to BitWidth (special case).
+ // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
+ APInt ShiftVal = COp->getValue();
+ if (ShiftVal.uge(BitWidth)) {
+ AnyOutOfRange = LogicalShift;
+ ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
+ continue;
+ }
+
+ ShiftAmts.push_back((int)ShiftVal.getZExtValue());
+ }
+
+ // If all elements out of range or UNDEF, return vector of zeros/undefs.
+ // ArithmeticShift should only hit this if they are all UNDEF.
+ auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
+ if (llvm::all_of(ShiftAmts, OutOfRange)) {
+ SmallVector<Constant *, 8> ConstantVec;
+ for (int Idx : ShiftAmts) {
+ if (Idx < 0) {
+ ConstantVec.push_back(UndefValue::get(SVT));
+ } else {
+ assert(LogicalShift && "Logical shift expected");
+ ConstantVec.push_back(ConstantInt::getNullValue(SVT));
+ }
+ }
+ return ConstantVector::get(ConstantVec);
+ }
+
+ // We can't handle only some out of range values with generic logical shifts.
+ if (AnyOutOfRange)
+ return nullptr;
+
+ // Build the shift amount constant vector.
+ SmallVector<Constant *, 8> ShiftVecAmts;
+ for (int Idx : ShiftAmts) {
+ if (Idx < 0)
+ ShiftVecAmts.push_back(UndefValue::get(SVT));
+ else
+ ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
+ }
+ auto ShiftVec = ConstantVector::get(ShiftVecAmts);
+
+ if (ShiftLeft)
+ return Builder.CreateShl(Vec, ShiftVec);
+
+ if (LogicalShift)
+ return Builder.CreateLShr(Vec, ShiftVec);
+
+ return Builder.CreateAShr(Vec, ShiftVec);
+}
+
+static Value *simplifyX86pack(IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder, bool IsSigned) {
+ Value *Arg0 = II.getArgOperand(0);
+ Value *Arg1 = II.getArgOperand(1);
+ Type *ResTy = II.getType();
+
+ // Fast all undef handling.
+ if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
+ return UndefValue::get(ResTy);
+
+ auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
+ unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
+ unsigned NumSrcElts = ArgTy->getNumElements();
+ assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
+ "Unexpected packing types");
+
+ unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
+ unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
+ unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
+ assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
+ "Unexpected packing types");
+
+ // Constant folding.
+ if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
+ return nullptr;
+
+ // Clamp values - signed/unsigned packs both clamp using signed comparisons,
+ // but they differ in the min/max values used.
+ APInt MinValue, MaxValue;
+ if (IsSigned) {
+ // PACKSS: Truncate signed value with signed saturation.
+ // Source values less than dst minint are saturated to minint.
+ // Source values greater than dst maxint are saturated to maxint.
+ MinValue =
+ APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
+ MaxValue =
+ APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
+ } else {
+ // PACKUS: Truncate signed value with unsigned saturation.
+ // Source values less than zero are saturated to zero.
+ // Source values greater than dst maxuint are saturated to maxuint.
+ MinValue = APInt::getNullValue(SrcScalarSizeInBits);
+ MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
+ }
+
+ auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
+ auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
+ Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
+ Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
+ Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
+ Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
+
+ // Shuffle clamped args together at the lane level.
+ SmallVector<int, 32> PackMask;
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
+ PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
+ for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
+ PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
+ }
+ auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
+
+ // Truncate to dst size.
+ return Builder.CreateTrunc(Shuffle, ResTy);
+}
+
+static Value *simplifyX86movmsk(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Arg = II.getArgOperand(0);
+ Type *ResTy = II.getType();
+
+ // movmsk(undef) -> zero as we must ensure the upper bits are zero.
+ if (isa<UndefValue>(Arg))
+ return Constant::getNullValue(ResTy);
+
+ auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
+ // We can't easily peek through x86_mmx types.
+ if (!ArgTy)
+ return nullptr;
+
+ // Expand MOVMSK to compare/bitcast/zext:
+ // e.g. PMOVMSKB(v16i8 x):
+ // %cmp = icmp slt <16 x i8> %x, zeroinitializer
+ // %int = bitcast <16 x i1> %cmp to i16
+ // %res = zext i16 %int to i32
+ unsigned NumElts = ArgTy->getNumElements();
+ Type *IntegerVecTy = VectorType::getInteger(ArgTy);
+ Type *IntegerTy = Builder.getIntNTy(NumElts);
+
+ Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy);
+ Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy));
+ Res = Builder.CreateBitCast(Res, IntegerTy);
+ Res = Builder.CreateZExtOrTrunc(Res, ResTy);
+ return Res;
+}
+
+static Value *simplifyX86addcarry(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ Value *CarryIn = II.getArgOperand(0);
+ Value *Op1 = II.getArgOperand(1);
+ Value *Op2 = II.getArgOperand(2);
+ Type *RetTy = II.getType();
+ Type *OpTy = Op1->getType();
+ assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
+ RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
+ "Unexpected types for x86 addcarry");
+
+ // If carry-in is zero, this is just an unsigned add with overflow.
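+ // For example (illustrative), roughly:
+ //   %r = call { i8, i32 } @llvm.x86.addcarry.32(i8 0, i32 %a, i32 %b)
+ // becomes an @llvm.uadd.with.overflow.i32 call whose sum and zero-extended
+ // overflow bit are re-packed into the { i8, i32 } result.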
+ if (match(CarryIn, PatternMatch::m_ZeroInt())) {
+ Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
+ {Op1, Op2});
+ // The types have to be adjusted to match the x86 call types.
+ Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
+ Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
+ Builder.getInt8Ty());
+ Value *Res = UndefValue::get(RetTy);
+ Res = Builder.CreateInsertValue(Res, UAddOV, 0);
+ return Builder.CreateInsertValue(Res, UAddResult, 1);
+ }
+
+ return nullptr;
+}
+
+static Value *simplifyX86insertps(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
+ if (!CInt)
+ return nullptr;
+
+ auto *VecTy = cast<FixedVectorType>(II.getType());
+ assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
+
+ // The immediate permute control byte looks like this:
+ // [3:0] - zero mask for each 32-bit lane
+ // [5:4] - select one 32-bit destination lane
+ // [7:6] - select one 32-bit source lane
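+ //
+ // For example (illustrative): an immediate of 0x10 (DestLane = 1,
+ // SourceLane = 0, ZMask = 0) becomes roughly
+ //   shufflevector <4 x float> %dst, <4 x float> %src, <4 x i32> <i32 0, i32 4, i32 2, i32 3>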
+
+ uint8_t Imm = CInt->getZExtValue();
+ uint8_t ZMask = Imm & 0xf;
+ uint8_t DestLane = (Imm >> 4) & 0x3;
+ uint8_t SourceLane = (Imm >> 6) & 0x3;
+
+ ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
+
+ // If all zero mask bits are set, this was just a weird way to
+ // generate a zero vector.
+ if (ZMask == 0xf)
+ return ZeroVector;
+
+ // Initialize by passing all of the first source bits through.
+ int ShuffleMask[4] = {0, 1, 2, 3};
+
+ // We may replace the second operand with the zero vector.
+ Value *V1 = II.getArgOperand(1);
+
+ if (ZMask) {
+ // If the zero mask is being used with a single input or the zero mask
+ // overrides the destination lane, this is a shuffle with the zero vector.
+ if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
+ (ZMask & (1 << DestLane))) {
+ V1 = ZeroVector;
+ // We may still move 32-bits of the first source vector from one lane
+ // to another.
+ ShuffleMask[DestLane] = SourceLane;
+ // The zero mask may override the previous insert operation.
+ for (unsigned i = 0; i < 4; ++i)
+ if ((ZMask >> i) & 0x1)
+ ShuffleMask[i] = i + 4;
+ } else {
+ // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
+ return nullptr;
+ }
+ } else {
+ // Replace the selected destination lane with the selected source lane.
+ ShuffleMask[DestLane] = SourceLane + 4;
+ }
+
+ return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
+}
+
+/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
+/// or conversion to a shuffle vector.
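+ /// For example (illustrative): with byte-aligned constant Length and Index
+ /// the extract becomes a byte shuffle of Op0 with a zero vector, and with a
+ /// constant Op0 it folds to <2 x i64> <(Op0[0] >> Index) & (2^Length - 1), undef>.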
+static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
+ ConstantInt *CILength, ConstantInt *CIIndex,
+ InstCombiner::BuilderTy &Builder) {
+ auto LowConstantHighUndef = [&](uint64_t Val) {
+ Type *IntTy64 = Type::getInt64Ty(II.getContext());
+ Constant *Args[] = {ConstantInt::get(IntTy64, Val),
+ UndefValue::get(IntTy64)};
+ return ConstantVector::get(Args);
+ };
+
+ // See if we're dealing with constant values.
+ Constant *C0 = dyn_cast<Constant>(Op0);
+ ConstantInt *CI0 =
+ C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
+ : nullptr;
+
+ // Attempt to constant fold.
+ if (CILength && CIIndex) {
+ // From AMD documentation: "The bit index and field length are each six
+ // bits in length; other bits of the field are ignored."
+ APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
+ APInt APLength = CILength->getValue().zextOrTrunc(6);
+
+ unsigned Index = APIndex.getZExtValue();
+
+ // From AMD documentation: "a value of zero in the field length is
+ // defined as length of 64".
+ unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
+
+ // From AMD documentation: "If the sum of the bit index + length field
+ // is greater than 64, the results are undefined".
+ unsigned End = Index + Length;
+
+ // Note that both field index and field length are 8-bit quantities.
+ // Since variables 'Index' and 'Length' are unsigned values
+ // obtained from zero-extending field index and field length
+ // respectively, their sum should never wrap around.
+ if (End > 64)
+ return UndefValue::get(II.getType());
+
+ // If we are inserting whole bytes, we can convert this to a shuffle.
+ // Lowering can recognize EXTRQI shuffle masks.
+ if ((Length % 8) == 0 && (Index % 8) == 0) {
+ // Convert bit indices to byte indices.
+ Length /= 8;
+ Index /= 8;
+
+ Type *IntTy8 = Type::getInt8Ty(II.getContext());
+ auto *ShufTy = FixedVectorType::get(IntTy8, 16);
+
+ SmallVector<int, 16> ShuffleMask;
+ for (int i = 0; i != (int)Length; ++i)
+ ShuffleMask.push_back(i + Index);
+ for (int i = Length; i != 8; ++i)
+ ShuffleMask.push_back(i + 16);
+ for (int i = 8; i != 16; ++i)
+ ShuffleMask.push_back(-1);
+
+ Value *SV = Builder.CreateShuffleVector(
+ Builder.CreateBitCast(Op0, ShufTy),
+ ConstantAggregateZero::get(ShufTy), ShuffleMask);
+ return Builder.CreateBitCast(SV, II.getType());
+ }
+
+ // Constant Fold - shift Index'th bit to lowest position and mask off
+ // Length bits.
+ if (CI0) {
+ APInt Elt = CI0->getValue();
+ Elt.lshrInPlace(Index);
+ Elt = Elt.zextOrTrunc(Length);
+ return LowConstantHighUndef(Elt.getZExtValue());
+ }
+
+ // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
+ if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
+ Value *Args[] = {Op0, CILength, CIIndex};
+ Module *M = II.getModule();
+ Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
+ return Builder.CreateCall(F, Args);
+ }
+ }
+
+ // Constant Fold - extraction from zero is always {zero, undef}.
+ if (CI0 && CI0->isZero())
+ return LowConstantHighUndef(0);
+
+ return nullptr;
+}
+
+/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
+/// folding or conversion to a shuffle vector.
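+ /// For example (illustrative): with byte-aligned Length and Index this becomes
+ /// a byte shuffle of the two operands, and with constant low elements it folds
+ /// to (Op0[0] & ~(Mask << Index)) | ((Op1[0] & Mask) << Index), where
+ /// Mask = 2^Length - 1.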
+static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
+ APInt APLength, APInt APIndex,
+ InstCombiner::BuilderTy &Builder) {
+ // From AMD documentation: "The bit index and field length are each six bits
+ // in length; other bits of the field are ignored."
+ APIndex = APIndex.zextOrTrunc(6);
+ APLength = APLength.zextOrTrunc(6);
+
+ // Attempt to constant fold.
+ unsigned Index = APIndex.getZExtValue();
+
+ // From AMD documentation: "a value of zero in the field length is
+ // defined as length of 64".
+ unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
+
+ // From AMD documentation: "If the sum of the bit index + length field
+ // is greater than 64, the results are undefined".
+ unsigned End = Index + Length;
+
+ // Note that both field index and field length are 8-bit quantities.
+ // Since variables 'Index' and 'Length' are unsigned values
+ // obtained from zero-extending field index and field length
+ // respectively, their sum should never wrap around.
+ if (End > 64)
+ return UndefValue::get(II.getType());
+
+ // If we are inserting whole bytes, we can convert this to a shuffle.
+ // Lowering can recognize INSERTQI shuffle masks.
+ if ((Length % 8) == 0 && (Index % 8) == 0) {
+ // Convert bit indices to byte indices.
+ Length /= 8;
+ Index /= 8;
+
+ Type *IntTy8 = Type::getInt8Ty(II.getContext());
+ auto *ShufTy = FixedVectorType::get(IntTy8, 16);
+
+ SmallVector<int, 16> ShuffleMask;
+ for (int i = 0; i != (int)Index; ++i)
+ ShuffleMask.push_back(i);
+ for (int i = 0; i != (int)Length; ++i)
+ ShuffleMask.push_back(i + 16);
+ for (int i = Index + Length; i != 8; ++i)
+ ShuffleMask.push_back(i);
+ for (int i = 8; i != 16; ++i)
+ ShuffleMask.push_back(-1);
+
+ Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
+ Builder.CreateBitCast(Op1, ShufTy),
+ ShuffleMask);
+ return Builder.CreateBitCast(SV, II.getType());
+ }
+
+ // See if we're dealing with constant values.
+ Constant *C0 = dyn_cast<Constant>(Op0);
+ Constant *C1 = dyn_cast<Constant>(Op1);
+ ConstantInt *CI00 =
+ C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
+ : nullptr;
+ ConstantInt *CI10 =
+ C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
+ : nullptr;
+
+ // Constant Fold - insert bottom Length bits starting at the Index'th bit.
+ if (CI00 && CI10) {
+ APInt V00 = CI00->getValue();
+ APInt V10 = CI10->getValue();
+ APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
+ V00 = V00 & ~Mask;
+ V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
+ APInt Val = V00 | V10;
+ Type *IntTy64 = Type::getInt64Ty(II.getContext());
+ Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
+ UndefValue::get(IntTy64)};
+ return ConstantVector::get(Args);
+ }
+
+ // If we were an INSERTQ call, we'll save demanded elements if we convert to
+ // INSERTQI.
+ if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
+ Type *IntTy8 = Type::getInt8Ty(II.getContext());
+ Constant *CILength = ConstantInt::get(IntTy8, Length, false);
+ Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
+
+ Value *Args[] = {Op0, Op1, CILength, CIIndex};
+ Module *M = II.getModule();
+ Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
+ return Builder.CreateCall(F, Args);
+ }
+
+ return nullptr;
+}
+
+/// Attempt to convert pshufb* to shufflevector if the mask is constant.
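+ /// For example (illustrative): a constant control byte with bit 7 set (e.g.
+ /// 0x80) selects a zero byte, while a byte of 0x03 selects byte 3 of the
+ /// corresponding 128-bit lane of the source.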
+static Value *simplifyX86pshufb(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
+ if (!V)
+ return nullptr;
+
+ auto *VecTy = cast<FixedVectorType>(II.getType());
+ unsigned NumElts = VecTy->getNumElements();
+ assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
+ "Unexpected number of elements in shuffle mask!");
+
+ // Construct a shuffle mask from constant integers or UNDEFs.
+ int Indexes[64];
+
+ // Each byte in the shuffle control mask forms an index to permute the
+ // corresponding byte in the destination operand.
+ for (unsigned I = 0; I < NumElts; ++I) {
+ Constant *COp = V->getAggregateElement(I);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
+ return nullptr;
+
+ if (isa<UndefValue>(COp)) {
+ Indexes[I] = -1;
+ continue;
+ }
+
+ int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
+
+ // If the most significant bit (bit[7]) of each byte of the shuffle
+ // control mask is set, then zero is written in the result byte.
+ // The zero vector is in the right-hand side of the resulting
+ // shufflevector.
+
+ // The value of each index for the high 128-bit lane is the least
+ // significant 4 bits of the respective shuffle control byte.
+ Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
+ Indexes[I] = Index;
+ }
+
+ auto V1 = II.getArgOperand(0);
+ auto V2 = Constant::getNullValue(VecTy);
+ return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts));
+}
+
+/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
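+ /// For example (illustrative): vpermilvar.ps with a constant mask of
+ /// <3, 2, 1, 0> becomes a single-source shufflevector of the first operand
+ /// with shuffle mask <3, 2, 1, 0>.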
+static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
+ if (!V)
+ return nullptr;
+
+ auto *VecTy = cast<FixedVectorType>(II.getType());
+ unsigned NumElts = VecTy->getNumElements();
+ bool IsPD = VecTy->getScalarType()->isDoubleTy();
+ unsigned NumLaneElts = IsPD ? 2 : 4;
+ assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
+
+ // Construct a shuffle mask from constant integers or UNDEFs.
+ int Indexes[16];
+
+ // The intrinsics only read one or two bits, clear the rest.
+ for (unsigned I = 0; I < NumElts; ++I) {
+ Constant *COp = V->getAggregateElement(I);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
+ return nullptr;
+
+ if (isa<UndefValue>(COp)) {
+ Indexes[I] = -1;
+ continue;
+ }
+
+ APInt Index = cast<ConstantInt>(COp)->getValue();
+ Index = Index.zextOrTrunc(32).getLoBits(2);
+
+ // The PD variants use bit 1 to select per-lane element index, so
+ // shift down to convert to generic shuffle mask index.
+ if (IsPD)
+ Index.lshrInPlace(1);
+
+ // The _256 variants are a bit trickier since the mask bits always index
+ // into the corresponding 128-bit half. In order to convert to a generic
+ // shuffle, we have to make that explicit.
+ Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
+
+ Indexes[I] = Index.getZExtValue();
+ }
+
+ auto V1 = II.getArgOperand(0);
+ return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, NumElts));
+}
+
+/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
+static Value *simplifyX86vpermv(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ auto *V = dyn_cast<Constant>(II.getArgOperand(1));
+ if (!V)
+ return nullptr;
+
+ auto *VecTy = cast<FixedVectorType>(II.getType());
+ unsigned Size = VecTy->getNumElements();
+ assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
+ "Unexpected shuffle mask size");
+
+ // Construct a shuffle mask from constant integers or UNDEFs.
+ int Indexes[64];
+
+ for (unsigned I = 0; I < Size; ++I) {
+ Constant *COp = V->getAggregateElement(I);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
+ return nullptr;
+
+ if (isa<UndefValue>(COp)) {
+ Indexes[I] = -1;
+ continue;
+ }
+
+ uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
+ Index &= Size - 1;
+ Indexes[I] = Index;
+ }
+
+ auto V1 = II.getArgOperand(0);
+ return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, Size));
+}
+
+Optional<Instruction *>
+X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
+ auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
+ unsigned DemandedWidth) {
+ APInt UndefElts(Width, 0);
+ APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
+ return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
+ };
+
+ Intrinsic::ID IID = II.getIntrinsicID();
+ switch (IID) {
+ case Intrinsic::x86_bmi_bextr_32:
+ case Intrinsic::x86_bmi_bextr_64:
+ case Intrinsic::x86_tbm_bextri_u32:
+ case Intrinsic::x86_tbm_bextri_u64:
+ // If the RHS is a constant we can try some simplifications.
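+ // For example (illustrative): bextr with a control of 0x0804 extracts 8 bits
+ // starting at bit 4, i.e. (LHS >> 4) & 0xff, and folds to a constant when the
+ // LHS is also constant.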
+ if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
+ uint64_t Shift = C->getZExtValue();
+ uint64_t Length = (Shift >> 8) & 0xff;
+ Shift &= 0xff;
+ unsigned BitWidth = II.getType()->getIntegerBitWidth();
+ // If the length is 0 or the shift is out of range, replace with zero.
+ if (Length == 0 || Shift >= BitWidth) {
+ return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
+ }
+ // If the LHS is also a constant, we can completely constant fold this.
+ if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
+ uint64_t Result = InC->getZExtValue() >> Shift;
+ if (Length > BitWidth)
+ Length = BitWidth;
+ Result &= maskTrailingOnes<uint64_t>(Length);
+ return IC.replaceInstUsesWith(II,
+ ConstantInt::get(II.getType(), Result));
+ }
+ // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
+ // are only masking bits that a shift already cleared?
+ }
+ break;
+
+ case Intrinsic::x86_bmi_bzhi_32:
+ case Intrinsic::x86_bmi_bzhi_64:
+ // If the RHS is a constant we can try some simplifications.
+ if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
+ uint64_t Index = C->getZExtValue() & 0xff;
+ unsigned BitWidth = II.getType()->getIntegerBitWidth();
+ if (Index >= BitWidth) {
+ return IC.replaceInstUsesWith(II, II.getArgOperand(0));
+ }
+ if (Index == 0) {
+ return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
+ }
+ // If the LHS is also a constant, we can completely constant fold this.
+ if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
+ uint64_t Result = InC->getZExtValue();
+ Result &= maskTrailingOnes<uint64_t>(Index);
+ return IC.replaceInstUsesWith(II,
+ ConstantInt::get(II.getType(), Result));
+ }
+ // TODO should we convert this to an AND if the RHS is constant?
+ }
+ break;
+ case Intrinsic::x86_bmi_pext_32:
+ case Intrinsic::x86_bmi_pext_64:
+ if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
+ if (MaskC->isNullValue()) {
+ return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
+ }
+ if (MaskC->isAllOnesValue()) {
+ return IC.replaceInstUsesWith(II, II.getArgOperand(0));
+ }
+
+ if (MaskC->getValue().isShiftedMask()) {
+ // Any single contiguous sequence of 1s anywhere in the mask simply
+ // describes a subset of the input bits shifted to the appropriate
+ // position. Replace with the straightforward IR.
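+ // For example (illustrative): pext(%x, 0xF0) becomes
+ //   lshr (and %x, 0xF0), 4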
+ unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
+ Value *Input = II.getArgOperand(0);
+ Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
+ Value *Shifted = IC.Builder.CreateLShr(Masked,
+ ConstantInt::get(II.getType(),
+ ShiftAmount));
+ return IC.replaceInstUsesWith(II, Shifted);
+ }
+
+ if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
+ uint64_t Src = SrcC->getZExtValue();
+ uint64_t Mask = MaskC->getZExtValue();
+ uint64_t Result = 0;
+ uint64_t BitToSet = 1;
+
+ while (Mask) {
+ // Isolate lowest set bit.
+ uint64_t BitToTest = Mask & -Mask;
+ if (BitToTest & Src)
+ Result |= BitToSet;
+
+ BitToSet <<= 1;
+ // Clear lowest set bit.
+ Mask &= Mask - 1;
+ }
+
+ return IC.replaceInstUsesWith(II,
+ ConstantInt::get(II.getType(), Result));
+ }
+ }
+ break;
+ case Intrinsic::x86_bmi_pdep_32:
+ case Intrinsic::x86_bmi_pdep_64:
+ if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
+ if (MaskC->isNullValue()) {
+ return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
+ }
+ if (MaskC->isAllOnesValue()) {
+ return IC.replaceInstUsesWith(II, II.getArgOperand(0));
+ }
+ if (MaskC->getValue().isShiftedMask()) {
+ // Any single contiguous sequence of 1s anywhere in the mask simply
+ // describes a subset of the input bits shifted to the appropriate
+ // position. Replace with the straightforward IR.
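+ // For example (illustrative): pdep(%x, 0xF0) becomes
+ //   and (shl %x, 4), 0xF0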
+ unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
+ Value *Input = II.getArgOperand(0);
+ Value *Shifted = IC.Builder.CreateShl(Input,
+ ConstantInt::get(II.getType(),
+ ShiftAmount));
+ Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
+ return IC.replaceInstUsesWith(II, Masked);
+ }
+
+ if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
+ uint64_t Src = SrcC->getZExtValue();
+ uint64_t Mask = MaskC->getZExtValue();
+ uint64_t Result = 0;
+ uint64_t BitToTest = 1;
+
+ while (Mask) {
+ // Isolate lowest set bit.
+ uint64_t BitToSet = Mask & -Mask;
+ if (BitToTest & Src)
+ Result |= BitToSet;
+
+ BitToTest <<= 1;
+ // Clear lowest set bit.
+ Mask &= Mask - 1;
+ }
+
+ return IC.replaceInstUsesWith(II,
+ ConstantInt::get(II.getType(), Result));
+ }
+ }
+ break;
+
+ case Intrinsic::x86_sse_cvtss2si:
+ case Intrinsic::x86_sse_cvtss2si64:
+ case Intrinsic::x86_sse_cvttss2si:
+ case Intrinsic::x86_sse_cvttss2si64:
+ case Intrinsic::x86_sse2_cvtsd2si:
+ case Intrinsic::x86_sse2_cvtsd2si64:
+ case Intrinsic::x86_sse2_cvttsd2si:
+ case Intrinsic::x86_sse2_cvttsd2si64:
+ case Intrinsic::x86_avx512_vcvtss2si32:
+ case Intrinsic::x86_avx512_vcvtss2si64:
+ case Intrinsic::x86_avx512_vcvtss2usi32:
+ case Intrinsic::x86_avx512_vcvtss2usi64:
+ case Intrinsic::x86_avx512_vcvtsd2si32:
+ case Intrinsic::x86_avx512_vcvtsd2si64:
+ case Intrinsic::x86_avx512_vcvtsd2usi32:
+ case Intrinsic::x86_avx512_vcvtsd2usi64:
+ case Intrinsic::x86_avx512_cvttss2si:
+ case Intrinsic::x86_avx512_cvttss2si64:
+ case Intrinsic::x86_avx512_cvttss2usi:
+ case Intrinsic::x86_avx512_cvttss2usi64:
+ case Intrinsic::x86_avx512_cvttsd2si:
+ case Intrinsic::x86_avx512_cvttsd2si64:
+ case Intrinsic::x86_avx512_cvttsd2usi:
+ case Intrinsic::x86_avx512_cvttsd2usi64: {
+ // These intrinsics only demand the 0th element of their input vectors. If
+ // we can simplify the input based on that, do so now.
+ Value *Arg = II.getArgOperand(0);
+ unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
+ return IC.replaceOperand(II, 0, V);
+ }
+ break;
+ }
+
+ case Intrinsic::x86_mmx_pmovmskb:
+ case Intrinsic::x86_sse_movmsk_ps:
+ case Intrinsic::x86_sse2_movmsk_pd:
+ case Intrinsic::x86_sse2_pmovmskb_128:
+ case Intrinsic::x86_avx_movmsk_pd_256:
+ case Intrinsic::x86_avx_movmsk_ps_256:
+ case Intrinsic::x86_avx2_pmovmskb:
+ if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_sse_comieq_ss:
+ case Intrinsic::x86_sse_comige_ss:
+ case Intrinsic::x86_sse_comigt_ss:
+ case Intrinsic::x86_sse_comile_ss:
+ case Intrinsic::x86_sse_comilt_ss:
+ case Intrinsic::x86_sse_comineq_ss:
+ case Intrinsic::x86_sse_ucomieq_ss:
+ case Intrinsic::x86_sse_ucomige_ss:
+ case Intrinsic::x86_sse_ucomigt_ss:
+ case Intrinsic::x86_sse_ucomile_ss:
+ case Intrinsic::x86_sse_ucomilt_ss:
+ case Intrinsic::x86_sse_ucomineq_ss:
+ case Intrinsic::x86_sse2_comieq_sd:
+ case Intrinsic::x86_sse2_comige_sd:
+ case Intrinsic::x86_sse2_comigt_sd:
+ case Intrinsic::x86_sse2_comile_sd:
+ case Intrinsic::x86_sse2_comilt_sd:
+ case Intrinsic::x86_sse2_comineq_sd:
+ case Intrinsic::x86_sse2_ucomieq_sd:
+ case Intrinsic::x86_sse2_ucomige_sd:
+ case Intrinsic::x86_sse2_ucomigt_sd:
+ case Intrinsic::x86_sse2_ucomile_sd:
+ case Intrinsic::x86_sse2_ucomilt_sd:
+ case Intrinsic::x86_sse2_ucomineq_sd:
+ case Intrinsic::x86_avx512_vcomi_ss:
+ case Intrinsic::x86_avx512_vcomi_sd:
+ case Intrinsic::x86_avx512_mask_cmp_ss:
+ case Intrinsic::x86_avx512_mask_cmp_sd: {
+ // These intrinsics only demand the 0th element of their input vectors. If
+ // we can simplify the input based on that, do so now.
+ bool MadeChange = false;
+ Value *Arg0 = II.getArgOperand(0);
+ Value *Arg1 = II.getArgOperand(1);
+ unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
+ IC.replaceOperand(II, 0, V);
+ MadeChange = true;
+ }
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
+ IC.replaceOperand(II, 1, V);
+ MadeChange = true;
+ }
+ if (MadeChange) {
+ return &II;
+ }
+ break;
+ }
+
+ case Intrinsic::x86_avx512_add_ps_512:
+ case Intrinsic::x86_avx512_div_ps_512:
+ case Intrinsic::x86_avx512_mul_ps_512:
+ case Intrinsic::x86_avx512_sub_ps_512:
+ case Intrinsic::x86_avx512_add_pd_512:
+ case Intrinsic::x86_avx512_div_pd_512:
+ case Intrinsic::x86_avx512_mul_pd_512:
+ case Intrinsic::x86_avx512_sub_pd_512:
+ // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
+ // IR operations.
+ if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
+ if (R->getValue() == 4) {
+ Value *Arg0 = II.getArgOperand(0);
+ Value *Arg1 = II.getArgOperand(1);
+
+ Value *V;
+ switch (IID) {
+ default:
+ llvm_unreachable("Case stmts out of sync!");
+ case Intrinsic::x86_avx512_add_ps_512:
+ case Intrinsic::x86_avx512_add_pd_512:
+ V = IC.Builder.CreateFAdd(Arg0, Arg1);
+ break;
+ case Intrinsic::x86_avx512_sub_ps_512:
+ case Intrinsic::x86_avx512_sub_pd_512:
+ V = IC.Builder.CreateFSub(Arg0, Arg1);
+ break;
+ case Intrinsic::x86_avx512_mul_ps_512:
+ case Intrinsic::x86_avx512_mul_pd_512:
+ V = IC.Builder.CreateFMul(Arg0, Arg1);
+ break;
+ case Intrinsic::x86_avx512_div_ps_512:
+ case Intrinsic::x86_avx512_div_pd_512:
+ V = IC.Builder.CreateFDiv(Arg0, Arg1);
+ break;
+ }
+
+ return IC.replaceInstUsesWith(II, V);
+ }
+ }
+ break;
+
+ case Intrinsic::x86_avx512_mask_add_ss_round:
+ case Intrinsic::x86_avx512_mask_div_ss_round:
+ case Intrinsic::x86_avx512_mask_mul_ss_round:
+ case Intrinsic::x86_avx512_mask_sub_ss_round:
+ case Intrinsic::x86_avx512_mask_add_sd_round:
+ case Intrinsic::x86_avx512_mask_div_sd_round:
+ case Intrinsic::x86_avx512_mask_mul_sd_round:
+ case Intrinsic::x86_avx512_mask_sub_sd_round:
+ // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
+ // IR operations.
+ if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
+ if (R->getValue() == 4) {
+ // Extract the element as scalars.
+ Value *Arg0 = II.getArgOperand(0);
+ Value *Arg1 = II.getArgOperand(1);
+ Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
+ Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
+
+ Value *V;
+ switch (IID) {
+ default:
+ llvm_unreachable("Case stmts out of sync!");
+ case Intrinsic::x86_avx512_mask_add_ss_round:
+ case Intrinsic::x86_avx512_mask_add_sd_round:
+ V = IC.Builder.CreateFAdd(LHS, RHS);
+ break;
+ case Intrinsic::x86_avx512_mask_sub_ss_round:
+ case Intrinsic::x86_avx512_mask_sub_sd_round:
+ V = IC.Builder.CreateFSub(LHS, RHS);
+ break;
+ case Intrinsic::x86_avx512_mask_mul_ss_round:
+ case Intrinsic::x86_avx512_mask_mul_sd_round:
+ V = IC.Builder.CreateFMul(LHS, RHS);
+ break;
+ case Intrinsic::x86_avx512_mask_div_ss_round:
+ case Intrinsic::x86_avx512_mask_div_sd_round:
+ V = IC.Builder.CreateFDiv(LHS, RHS);
+ break;
+ }
+
+ // Handle the masking aspect of the intrinsic.
+ Value *Mask = II.getArgOperand(3);
+ auto *C = dyn_cast<ConstantInt>(Mask);
+ // We don't need a select if we know the mask bit is a 1.
+ if (!C || !C->getValue()[0]) {
+ // Cast the mask to an i1 vector and then extract the lowest element.
+ auto *MaskTy = FixedVectorType::get(
+ IC.Builder.getInt1Ty(),
+ cast<IntegerType>(Mask->getType())->getBitWidth());
+ Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
+ Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
+ // Extract the lowest element from the passthru operand.
+ Value *Passthru =
+ IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
+ V = IC.Builder.CreateSelect(Mask, V, Passthru);
+ }
+
+ // Insert the result back into the original argument 0.
+ V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
+
+ return IC.replaceInstUsesWith(II, V);
+ }
+ }
+ break;
+
+ // Constant fold ashr( <A x Bi>, Ci ).
+ // Constant fold lshr( <A x Bi>, Ci ).
+ // Constant fold shl( <A x Bi>, Ci ).
+ case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_sse2_psrai_w:
+ case Intrinsic::x86_avx2_psrai_d:
+ case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx512_psrai_q_128:
+ case Intrinsic::x86_avx512_psrai_q_256:
+ case Intrinsic::x86_avx512_psrai_d_512:
+ case Intrinsic::x86_avx512_psrai_q_512:
+ case Intrinsic::x86_avx512_psrai_w_512:
+ case Intrinsic::x86_sse2_psrli_d:
+ case Intrinsic::x86_sse2_psrli_q:
+ case Intrinsic::x86_sse2_psrli_w:
+ case Intrinsic::x86_avx2_psrli_d:
+ case Intrinsic::x86_avx2_psrli_q:
+ case Intrinsic::x86_avx2_psrli_w:
+ case Intrinsic::x86_avx512_psrli_d_512:
+ case Intrinsic::x86_avx512_psrli_q_512:
+ case Intrinsic::x86_avx512_psrli_w_512:
+ case Intrinsic::x86_sse2_pslli_d:
+ case Intrinsic::x86_sse2_pslli_q:
+ case Intrinsic::x86_sse2_pslli_w:
+ case Intrinsic::x86_avx2_pslli_d:
+ case Intrinsic::x86_avx2_pslli_q:
+ case Intrinsic::x86_avx2_pslli_w:
+ case Intrinsic::x86_avx512_pslli_d_512:
+ case Intrinsic::x86_avx512_pslli_q_512:
+ case Intrinsic::x86_avx512_pslli_w_512:
+ if (Value *V = simplifyX86immShift(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_avx2_psra_d:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx512_psra_q_128:
+ case Intrinsic::x86_avx512_psra_q_256:
+ case Intrinsic::x86_avx512_psra_d_512:
+ case Intrinsic::x86_avx512_psra_q_512:
+ case Intrinsic::x86_avx512_psra_w_512:
+ case Intrinsic::x86_sse2_psrl_d:
+ case Intrinsic::x86_sse2_psrl_q:
+ case Intrinsic::x86_sse2_psrl_w:
+ case Intrinsic::x86_avx2_psrl_d:
+ case Intrinsic::x86_avx2_psrl_q:
+ case Intrinsic::x86_avx2_psrl_w:
+ case Intrinsic::x86_avx512_psrl_d_512:
+ case Intrinsic::x86_avx512_psrl_q_512:
+ case Intrinsic::x86_avx512_psrl_w_512:
+ case Intrinsic::x86_sse2_psll_d:
+ case Intrinsic::x86_sse2_psll_q:
+ case Intrinsic::x86_sse2_psll_w:
+ case Intrinsic::x86_avx2_psll_d:
+ case Intrinsic::x86_avx2_psll_q:
+ case Intrinsic::x86_avx2_psll_w:
+ case Intrinsic::x86_avx512_psll_d_512:
+ case Intrinsic::x86_avx512_psll_q_512:
+ case Intrinsic::x86_avx512_psll_w_512: {
+ if (Value *V = simplifyX86immShift(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+
+ // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
+ // operand to compute the shift amount.
+ Value *Arg1 = II.getArgOperand(1);
+ assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
+ "Unexpected packed shift size");
+ unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
+
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
+ return IC.replaceOperand(II, 1, V);
+ }
+ break;
+ }
+
+ case Intrinsic::x86_avx2_psllv_d:
+ case Intrinsic::x86_avx2_psllv_d_256:
+ case Intrinsic::x86_avx2_psllv_q:
+ case Intrinsic::x86_avx2_psllv_q_256:
+ case Intrinsic::x86_avx512_psllv_d_512:
+ case Intrinsic::x86_avx512_psllv_q_512:
+ case Intrinsic::x86_avx512_psllv_w_128:
+ case Intrinsic::x86_avx512_psllv_w_256:
+ case Intrinsic::x86_avx512_psllv_w_512:
+ case Intrinsic::x86_avx2_psrav_d:
+ case Intrinsic::x86_avx2_psrav_d_256:
+ case Intrinsic::x86_avx512_psrav_q_128:
+ case Intrinsic::x86_avx512_psrav_q_256:
+ case Intrinsic::x86_avx512_psrav_d_512:
+ case Intrinsic::x86_avx512_psrav_q_512:
+ case Intrinsic::x86_avx512_psrav_w_128:
+ case Intrinsic::x86_avx512_psrav_w_256:
+ case Intrinsic::x86_avx512_psrav_w_512:
+ case Intrinsic::x86_avx2_psrlv_d:
+ case Intrinsic::x86_avx2_psrlv_d_256:
+ case Intrinsic::x86_avx2_psrlv_q:
+ case Intrinsic::x86_avx2_psrlv_q_256:
+ case Intrinsic::x86_avx512_psrlv_d_512:
+ case Intrinsic::x86_avx512_psrlv_q_512:
+ case Intrinsic::x86_avx512_psrlv_w_128:
+ case Intrinsic::x86_avx512_psrlv_w_256:
+ case Intrinsic::x86_avx512_psrlv_w_512:
+ if (Value *V = simplifyX86varShift(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_sse2_packssdw_128:
+ case Intrinsic::x86_sse2_packsswb_128:
+ case Intrinsic::x86_avx2_packssdw:
+ case Intrinsic::x86_avx2_packsswb:
+ case Intrinsic::x86_avx512_packssdw_512:
+ case Intrinsic::x86_avx512_packsswb_512:
+ if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_sse2_packuswb_128:
+ case Intrinsic::x86_sse41_packusdw:
+ case Intrinsic::x86_avx2_packusdw:
+ case Intrinsic::x86_avx2_packuswb:
+ case Intrinsic::x86_avx512_packusdw_512:
+ case Intrinsic::x86_avx512_packuswb_512:
+ if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_pclmulqdq:
+ case Intrinsic::x86_pclmulqdq_256:
+ case Intrinsic::x86_pclmulqdq_512: {
+ if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
+ unsigned Imm = C->getZExtValue();
+
+ bool MadeChange = false;
+ Value *Arg0 = II.getArgOperand(0);
+ Value *Arg1 = II.getArgOperand(1);
+ unsigned VWidth =
+ cast<FixedVectorType>(Arg0->getType())->getNumElements();
+
+ APInt UndefElts1(VWidth, 0);
+ APInt DemandedElts1 =
+ APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
+ if (Value *V =
+ IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
+ IC.replaceOperand(II, 0, V);
+ MadeChange = true;
+ }
+
+ APInt UndefElts2(VWidth, 0);
+ APInt DemandedElts2 =
+ APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
+ if (Value *V =
+ IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
+ IC.replaceOperand(II, 1, V);
+ MadeChange = true;
+ }
+
+ // If either input's demanded elements are all undef, the result is zero.
+ if (DemandedElts1.isSubsetOf(UndefElts1) ||
+ DemandedElts2.isSubsetOf(UndefElts2)) {
+ return IC.replaceInstUsesWith(II,
+ ConstantAggregateZero::get(II.getType()));
+ }
+
+ if (MadeChange) {
+ return &II;
+ }
+ }
+ break;
+ }
+
+ case Intrinsic::x86_sse41_insertps:
+ if (Value *V = simplifyX86insertps(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_sse4a_extrq: {
+ Value *Op0 = II.getArgOperand(0);
+ Value *Op1 = II.getArgOperand(1);
+ unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
+ unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
+ assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
+ Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
+ VWidth1 == 16 && "Unexpected operand sizes");
+
+ // See if we're dealing with constant values.
+ Constant *C1 = dyn_cast<Constant>(Op1);
+ ConstantInt *CILength =
+ C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
+ : nullptr;
+ ConstantInt *CIIndex =
+ C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
+ : nullptr;
+
+ // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
+ if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+
+ // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
+ // operands and the lowest 16-bits of the second.
+ bool MadeChange = false;
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
+ IC.replaceOperand(II, 0, V);
+ MadeChange = true;
+ }
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
+ IC.replaceOperand(II, 1, V);
+ MadeChange = true;
+ }
+ if (MadeChange) {
+ return &II;
+ }
+ break;
+ }
+
+ case Intrinsic::x86_sse4a_extrqi: {
+ // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
+ // bits of the lower 64-bits. The upper 64-bits are undefined.
+ Value *Op0 = II.getArgOperand(0);
+ unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
+ assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
+ "Unexpected operand size");
+
+ // See if we're dealing with constant values.
+ ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
+ ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
+
+ // Attempt to simplify to a constant or shuffle vector.
+ if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+
+ // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
+ // operand.
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
+ return IC.replaceOperand(II, 0, V);
+ }
+ break;
+ }
+
+ case Intrinsic::x86_sse4a_insertq: {
+ Value *Op0 = II.getArgOperand(0);
+ Value *Op1 = II.getArgOperand(1);
+ unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
+ assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
+ Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
+ cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
+ "Unexpected operand size");
+
+ // See if we're dealing with constant values.
+ Constant *C1 = dyn_cast<Constant>(Op1);
+ ConstantInt *CI11 =
+ C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
+ : nullptr;
+
+ // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
+ if (CI11) {
+ const APInt &V11 = CI11->getValue();
+ APInt Len = V11.zextOrTrunc(6);
+ APInt Idx = V11.lshr(8).zextOrTrunc(6);
+ if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ }
+
+ // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
+ // operand.
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
+ return IC.replaceOperand(II, 0, V);
+ }
+ break;
+ }
+
+ case Intrinsic::x86_sse4a_insertqi: {
+ // INSERTQI: Extract lowest Length bits from lower half of second source and
+ // insert over first source starting at Index bit. The upper 64-bits are
+ // undefined.
+ Value *Op0 = II.getArgOperand(0);
+ Value *Op1 = II.getArgOperand(1);
+ unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
+ unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
+ assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
+ Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
+ VWidth1 == 2 && "Unexpected operand sizes");
+
+ // See if we're dealing with constant values.
+ ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
+ ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
+
+ // Attempt to simplify to a constant or shuffle vector.
+ if (CILength && CIIndex) {
+ APInt Len = CILength->getValue().zextOrTrunc(6);
+ APInt Idx = CIIndex->getValue().zextOrTrunc(6);
+ if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ }
+
+ // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
+ // operands.
+ bool MadeChange = false;
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
+ IC.replaceOperand(II, 0, V);
+ MadeChange = true;
+ }
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
+ IC.replaceOperand(II, 1, V);
+ MadeChange = true;
+ }
+ if (MadeChange) {
+ return &II;
+ }
+ break;
+ }
+
+ case Intrinsic::x86_sse41_pblendvb:
+ case Intrinsic::x86_sse41_blendvps:
+ case Intrinsic::x86_sse41_blendvpd:
+ case Intrinsic::x86_avx_blendv_ps_256:
+ case Intrinsic::x86_avx_blendv_pd_256:
+ case Intrinsic::x86_avx2_pblendvb: {
+ // fold (blend A, A, Mask) -> A
+ Value *Op0 = II.getArgOperand(0);
+ Value *Op1 = II.getArgOperand(1);
+ Value *Mask = II.getArgOperand(2);
+ if (Op0 == Op1) {
+ return IC.replaceInstUsesWith(II, Op0);
+ }
+
+ // Zero Mask - select 1st argument.
+ if (isa<ConstantAggregateZero>(Mask)) {
+ return IC.replaceInstUsesWith(II, Op0);
+ }
+
+ // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
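+ // For example (illustrative): a constant pblendvb mask byte of 0x80 (sign bit
+ // set) selects that byte from Op1, while 0x00 selects it from Op0.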
+ if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
+ Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
+ return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
+ }
+
+ // Convert to a vector select if we can bypass casts and find a boolean
+ // vector condition value.
+ Value *BoolVec;
+ Mask = InstCombiner::peekThroughBitcast(Mask);
+ if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
+ BoolVec->getType()->isVectorTy() &&
+ BoolVec->getType()->getScalarSizeInBits() == 1) {
+ assert(Mask->getType()->getPrimitiveSizeInBits() ==
+ II.getType()->getPrimitiveSizeInBits() &&
+ "Not expecting mask and operands with different sizes");
+
+ unsigned NumMaskElts =
+ cast<FixedVectorType>(Mask->getType())->getNumElements();
+ unsigned NumOperandElts =
+ cast<FixedVectorType>(II.getType())->getNumElements();
+ if (NumMaskElts == NumOperandElts) {
+ return SelectInst::Create(BoolVec, Op1, Op0);
+ }
+
+ // If the mask has fewer elements than the operands, each mask bit maps to
+ // multiple elements of the operands. Bitcast back and forth.
+ if (NumMaskElts < NumOperandElts) {
+ Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
+ Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
+ Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
+ return new BitCastInst(Sel, II.getType());
+ }
+ }
+
+ break;
+ }
+
+ case Intrinsic::x86_ssse3_pshuf_b_128:
+ case Intrinsic::x86_avx2_pshuf_b:
+ case Intrinsic::x86_avx512_pshuf_b_512:
+ if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_avx_vpermilvar_ps:
+ case Intrinsic::x86_avx_vpermilvar_ps_256:
+ case Intrinsic::x86_avx512_vpermilvar_ps_512:
+ case Intrinsic::x86_avx_vpermilvar_pd:
+ case Intrinsic::x86_avx_vpermilvar_pd_256:
+ case Intrinsic::x86_avx512_vpermilvar_pd_512:
+ if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_avx2_permd:
+ case Intrinsic::x86_avx2_permps:
+ case Intrinsic::x86_avx512_permvar_df_256:
+ case Intrinsic::x86_avx512_permvar_df_512:
+ case Intrinsic::x86_avx512_permvar_di_256:
+ case Intrinsic::x86_avx512_permvar_di_512:
+ case Intrinsic::x86_avx512_permvar_hi_128:
+ case Intrinsic::x86_avx512_permvar_hi_256:
+ case Intrinsic::x86_avx512_permvar_hi_512:
+ case Intrinsic::x86_avx512_permvar_qi_128:
+ case Intrinsic::x86_avx512_permvar_qi_256:
+ case Intrinsic::x86_avx512_permvar_qi_512:
+ case Intrinsic::x86_avx512_permvar_sf_512:
+ case Intrinsic::x86_avx512_permvar_si_512:
+ if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_avx_maskload_ps:
+ case Intrinsic::x86_avx_maskload_pd:
+ case Intrinsic::x86_avx_maskload_ps_256:
+ case Intrinsic::x86_avx_maskload_pd_256:
+ case Intrinsic::x86_avx2_maskload_d:
+ case Intrinsic::x86_avx2_maskload_q:
+ case Intrinsic::x86_avx2_maskload_d_256:
+ case Intrinsic::x86_avx2_maskload_q_256:
+ if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
+ return I;
+ }
+ break;
+
+ case Intrinsic::x86_sse2_maskmov_dqu:
+ case Intrinsic::x86_avx_maskstore_ps:
+ case Intrinsic::x86_avx_maskstore_pd:
+ case Intrinsic::x86_avx_maskstore_ps_256:
+ case Intrinsic::x86_avx_maskstore_pd_256:
+ case Intrinsic::x86_avx2_maskstore_d:
+ case Intrinsic::x86_avx2_maskstore_q:
+ case Intrinsic::x86_avx2_maskstore_d_256:
+ case Intrinsic::x86_avx2_maskstore_q_256:
+ if (simplifyX86MaskedStore(II, IC)) {
+ return nullptr;
+ }
+ break;
+
+ case Intrinsic::x86_addcarry_32:
+ case Intrinsic::x86_addcarry_64:
+ if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ default:
+ break;
+ }
+ return None;
+}
+
+Optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
+ InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
+ bool &KnownBitsComputed) const {
+ switch (II.getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::x86_mmx_pmovmskb:
+ case Intrinsic::x86_sse_movmsk_ps:
+ case Intrinsic::x86_sse2_movmsk_pd:
+ case Intrinsic::x86_sse2_pmovmskb_128:
+ case Intrinsic::x86_avx_movmsk_ps_256:
+ case Intrinsic::x86_avx_movmsk_pd_256:
+ case Intrinsic::x86_avx2_pmovmskb: {
+ // MOVMSK copies the vector elements' sign bits to the low bits
+ // and zeros the high bits.
+ unsigned ArgWidth;
+ if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
+ ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
+ } else {
+ auto Arg = II.getArgOperand(0);
+ auto ArgType = cast<FixedVectorType>(Arg->getType());
+ ArgWidth = ArgType->getNumElements();
+ }
+
+ // If we don't need any of the low bits then return zero; we know that
+ // DemandedMask is non-zero already.
+ APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
+ Type *VTy = II.getType();
+ if (DemandedElts.isNullValue()) {
+ return ConstantInt::getNullValue(VTy);
+ }
+
+ // We know that the upper bits are set to zero.
+ Known.Zero.setBitsFrom(ArgWidth);
+ KnownBitsComputed = true;
+ break;
+ }
+ }
+ return None;
+}
+
+Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
+ InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
+ APInt &UndefElts2, APInt &UndefElts3,
+ std::function<void(Instruction *, unsigned, APInt, APInt &)>
+ simplifyAndSetOp) const {
+ unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
+ switch (II.getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::x86_xop_vfrcz_ss:
+ case Intrinsic::x86_xop_vfrcz_sd:
+ // The instructions for these intrinsics are specified to zero the upper
+ // bits rather than pass them through like other scalar intrinsics. So we
+ // shouldn't just use Arg0 if DemandedElts[0] is clear like we do for other
+ // intrinsics. Instead we should return a zero vector.
+ if (!DemandedElts[0]) {
+ IC.addToWorklist(&II);
+ return ConstantAggregateZero::get(II.getType());
+ }
+
+ // Only the lower element is used.
+ DemandedElts = 1;
+ simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
+
+ // Only the lower element is undefined. The high elements are zero.
+ UndefElts = UndefElts[0];
+ break;
+
+ // Unary scalar-as-vector operations that work column-wise.
+ case Intrinsic::x86_sse_rcp_ss:
+ case Intrinsic::x86_sse_rsqrt_ss:
+ simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
+
+ // If lowest element of a scalar op isn't used then use Arg0.
+ if (!DemandedElts[0]) {
+ IC.addToWorklist(&II);
+ return II.getArgOperand(0);
+ }
+    // TODO: If only the low element is demanded, lower SQRT to FSQRT (with
+    // rounding/exception checks).
+ break;
+
+ // Binary scalar-as-vector operations that work column-wise. The high
+ // elements come from operand 0. The low element is a function of both
+ // operands.
+ case Intrinsic::x86_sse_min_ss:
+ case Intrinsic::x86_sse_max_ss:
+ case Intrinsic::x86_sse_cmp_ss:
+ case Intrinsic::x86_sse2_min_sd:
+ case Intrinsic::x86_sse2_max_sd:
+ case Intrinsic::x86_sse2_cmp_sd: {
+ simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
+
+ // If lowest element of a scalar op isn't used then use Arg0.
+ if (!DemandedElts[0]) {
+ IC.addToWorklist(&II);
+ return II.getArgOperand(0);
+ }
+
+ // Only lower element is used for operand 1.
+ DemandedElts = 1;
+ simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
+
+ // Lower element is undefined if both lower elements are undefined.
+ // Consider things like undef&0. The result is known zero, not undef.
+ if (!UndefElts2[0])
+ UndefElts.clearBit(0);
+
+ break;
+ }
+
+ // Binary scalar-as-vector operations that work column-wise. The high
+ // elements come from operand 0 and the low element comes from operand 1.
+ case Intrinsic::x86_sse41_round_ss:
+ case Intrinsic::x86_sse41_round_sd: {
+ // Don't use the low element of operand 0.
+ APInt DemandedElts2 = DemandedElts;
+ DemandedElts2.clearBit(0);
+ simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
+
+ // If lowest element of a scalar op isn't used then use Arg0.
+ if (!DemandedElts[0]) {
+ IC.addToWorklist(&II);
+ return II.getArgOperand(0);
+ }
+
+ // Only lower element is used for operand 1.
+ DemandedElts = 1;
+ simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
+
+ // Take the high undef elements from operand 0 and take the lower element
+ // from operand 1.
+ UndefElts.clearBit(0);
+ UndefElts |= UndefElts2[0];
+ break;
+ }
+
+ // Three input scalar-as-vector operations that work column-wise. The high
+ // elements come from operand 0 and the low element is a function of all
+ // three inputs.
+ case Intrinsic::x86_avx512_mask_add_ss_round:
+ case Intrinsic::x86_avx512_mask_div_ss_round:
+ case Intrinsic::x86_avx512_mask_mul_ss_round:
+ case Intrinsic::x86_avx512_mask_sub_ss_round:
+ case Intrinsic::x86_avx512_mask_max_ss_round:
+ case Intrinsic::x86_avx512_mask_min_ss_round:
+ case Intrinsic::x86_avx512_mask_add_sd_round:
+ case Intrinsic::x86_avx512_mask_div_sd_round:
+ case Intrinsic::x86_avx512_mask_mul_sd_round:
+ case Intrinsic::x86_avx512_mask_sub_sd_round:
+ case Intrinsic::x86_avx512_mask_max_sd_round:
+ case Intrinsic::x86_avx512_mask_min_sd_round:
+ simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
+
+ // If lowest element of a scalar op isn't used then use Arg0.
+ if (!DemandedElts[0]) {
+ IC.addToWorklist(&II);
+ return II.getArgOperand(0);
+ }
+
+ // Only lower element is used for operand 1 and 2.
+ DemandedElts = 1;
+ simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
+ simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
+
+ // Lower element is undefined if all three lower elements are undefined.
+ // Consider things like undef&0. The result is known zero, not undef.
+ if (!UndefElts2[0] || !UndefElts3[0])
+ UndefElts.clearBit(0);
+ break;
+
+ // TODO: Add fmaddsub support?
+ case Intrinsic::x86_sse3_addsub_pd:
+ case Intrinsic::x86_sse3_addsub_ps:
+ case Intrinsic::x86_avx_addsub_pd_256:
+ case Intrinsic::x86_avx_addsub_ps_256: {
+ // If none of the even or none of the odd lanes are required, turn this
+ // into a generic FP math instruction.
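+    // Illustrative example (derived from the masks below): for
+    // llvm.x86.sse3.addsub.ps the even lanes subtract and the odd lanes add,
+    // so if only the odd (add) lanes are demanded the call becomes a plain
+    // fadd, and if only the even (sub) lanes are demanded it becomes an fsub.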
+ APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
+ APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
+ bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
+ bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
+ if (IsSubOnly || IsAddOnly) {
+ assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
+ IRBuilderBase::InsertPointGuard Guard(IC.Builder);
+ IC.Builder.SetInsertPoint(&II);
+ Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
+ return IC.Builder.CreateBinOp(
+ IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
+ }
+
+ simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
+ simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
+ UndefElts &= UndefElts2;
+ break;
+ }
+
+ case Intrinsic::x86_sse2_packssdw_128:
+ case Intrinsic::x86_sse2_packsswb_128:
+ case Intrinsic::x86_sse2_packuswb_128:
+ case Intrinsic::x86_sse41_packusdw:
+ case Intrinsic::x86_avx2_packssdw:
+ case Intrinsic::x86_avx2_packsswb:
+ case Intrinsic::x86_avx2_packusdw:
+ case Intrinsic::x86_avx2_packuswb:
+ case Intrinsic::x86_avx512_packssdw_512:
+ case Intrinsic::x86_avx512_packsswb_512:
+ case Intrinsic::x86_avx512_packusdw_512:
+ case Intrinsic::x86_avx512_packuswb_512: {
+ auto *Ty0 = II.getArgOperand(0)->getType();
+ unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
+ assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
+
+ unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
+ unsigned VWidthPerLane = VWidth / NumLanes;
+ unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
+
+ // Per lane, pack the elements of the first input and then the second.
+ // e.g.
+ // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
+ // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
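+    // Worked example of the index math below (illustration only): for v32i8
+    // PACK (NumLanes = 2, VWidthPerLane = 16, InnerVWidthPerLane = 8), result
+    // element 28 = LaneIdx 16 + Elt 4 + 8 * OpNum 1, i.e. lane 1, slot 4 of
+    // the second operand, which demands source element 1 * 8 + 4 = 12 of
+    // operand 1.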
+ for (int OpNum = 0; OpNum != 2; ++OpNum) {
+ APInt OpDemandedElts(InnerVWidth, 0);
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ unsigned LaneIdx = Lane * VWidthPerLane;
+ for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
+ unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
+ if (DemandedElts[Idx])
+ OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
+ }
+ }
+
+ // Demand elements from the operand.
+ APInt OpUndefElts(InnerVWidth, 0);
+ simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
+
+ // Pack the operand's UNDEF elements, one lane at a time.
+ OpUndefElts = OpUndefElts.zext(VWidth);
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
+ LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
+ LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
+ UndefElts |= LaneElts;
+ }
+ }
+ break;
+ }
+
+ // PSHUFB
+ case Intrinsic::x86_ssse3_pshuf_b_128:
+ case Intrinsic::x86_avx2_pshuf_b:
+ case Intrinsic::x86_avx512_pshuf_b_512:
+ // PERMILVAR
+ case Intrinsic::x86_avx_vpermilvar_ps:
+ case Intrinsic::x86_avx_vpermilvar_ps_256:
+ case Intrinsic::x86_avx512_vpermilvar_ps_512:
+ case Intrinsic::x86_avx_vpermilvar_pd:
+ case Intrinsic::x86_avx_vpermilvar_pd_256:
+ case Intrinsic::x86_avx512_vpermilvar_pd_512:
+ // PERMV
+ case Intrinsic::x86_avx2_permd:
+ case Intrinsic::x86_avx2_permps: {
+ simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
+ break;
+ }
+
+ // SSE4A instructions leave the upper 64-bits of the 128-bit result
+ // in an undefined state.
+ case Intrinsic::x86_sse4a_extrq:
+ case Intrinsic::x86_sse4a_extrqi:
+ case Intrinsic::x86_sse4a_insertq:
+ case Intrinsic::x86_sse4a_insertqi:
+ UndefElts.setHighBits(VWidth / 2);
+ break;
+ }
+ return None;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86Instr3DNow.td b/contrib/llvm-project/llvm/lib/Target/X86/X86Instr3DNow.td
new file mode 100644
index 000000000000..cd1b06365971
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86Instr3DNow.td
@@ -0,0 +1,112 @@
+//===-- X86Instr3DNow.td - The 3DNow! Instruction Set ------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the 3DNow! instruction set, which extends MMX to support
+// floating point and also adds a few more random instructions for good measure.
+//
+//===----------------------------------------------------------------------===//
+
+class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat>
+ : I<o, F, outs, ins, asm, pat>, Requires<[Has3DNow]> {
+}
+
+class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
+ : I3DNow<o, F, (outs VR64:$dst), ins,
+ !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat>, ThreeDNow {
+ let Constraints = "$src1 = $dst";
+}
+
+class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
+ : I3DNow<o, F, (outs VR64:$dst), ins,
+ !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat>, ThreeDNow;
+
+multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn,
+ X86FoldableSchedWrite sched, bit Commutable = 0,
+ string Ver = ""> {
+ let isCommutable = Commutable in
+ def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn,
+ [(set VR64:$dst, (!cast<Intrinsic>(
+ !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>,
+ Sched<[sched]>;
+ def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn,
+ [(set VR64:$dst, (!cast<Intrinsic>(
+ !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1,
+ (bitconvert (load_mmx addr:$src2))))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn,
+ X86FoldableSchedWrite sched, string Ver = ""> {
+ def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn,
+ [(set VR64:$dst, (!cast<Intrinsic>(
+ !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>,
+ Sched<[sched]>;
+ def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn,
+ [(set VR64:$dst, (!cast<Intrinsic>(
+ !strconcat("int_x86_3dnow", Ver, "_", Mn))
+ (bitconvert (load_mmx addr:$src))))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
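+// Note (derived from the multiclasses above): the intrinsic name is built as
+// "int_x86_3dnow" # Ver # "_" # Mn, so Mn = "pfadd" with Ver = "" selects
+// int_x86_3dnow_pfadd, while Mn = "pswapd" with Ver = "a" selects
+// int_x86_3dnowa_pswapd.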
+defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb", SchedWriteVecALU.MMX, 1>;
+defm PF2ID : I3DNow_conv_rm_int<0x1D, "pf2id", WriteCvtPS2I>;
+defm PFACC : I3DNow_binop_rm_int<0xAE, "pfacc", WriteFAdd>;
+defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd", WriteFAdd, 1>;
+defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq", WriteFAdd, 1>;
+defm PFCMPGE : I3DNow_binop_rm_int<0x90, "pfcmpge", WriteFAdd>;
+defm PFCMPGT : I3DNow_binop_rm_int<0xA0, "pfcmpgt", WriteFAdd>;
+defm PFMAX : I3DNow_binop_rm_int<0xA4, "pfmax", WriteFAdd>;
+defm PFMIN : I3DNow_binop_rm_int<0x94, "pfmin", WriteFAdd>;
+defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul", WriteFAdd, 1>;
+defm PFRCP : I3DNow_conv_rm_int<0x96, "pfrcp", WriteFAdd>;
+defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1", WriteFAdd>;
+defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2", WriteFAdd>;
+defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1", WriteFAdd>;
+defm PFRSQRT : I3DNow_conv_rm_int<0x97, "pfrsqrt", WriteFAdd>;
+defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub", WriteFAdd, 1>;
+defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr", WriteFAdd, 1>;
+defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd", WriteCvtI2PS>;
+defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw", SchedWriteVecIMul.MMX, 1>;
+
+let SchedRW = [WriteEMMS],
+ Defs = [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7] in
+def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",
+ [(int_x86_mmx_femms)]>, TB;
+
+// If PREFETCHWT1 is supported, we want to use it for everything but T0.
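+// (For reference: the llvm.prefetch locality argument maps 0/1/2/3 to
+// NTA/T2/T1/T0, so "everything but T0" means locality values 0-2.)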
+def PrefetchWLevel : PatFrag<(ops), (i32 imm), [{
+ return N->getSExtValue() == 3 || !Subtarget->hasPREFETCHWT1();
+}]>;
+
+// Use PREFETCHWT1 for NTA, T2, T1.
+def PrefetchWT1Level : ImmLeaf<i32, [{
+ return Imm < 3;
+}]>;
+
+let SchedRW = [WriteLoad] in {
+let Predicates = [Has3DNow, NoSSEPrefetch] in
+def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr),
+ "prefetch\t$addr",
+ [(prefetch addr:$addr, imm, imm, (i32 1))]>, TB;
+
+def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr",
+ [(prefetch addr:$addr, (i32 1), (i32 PrefetchWLevel), (i32 1))]>,
+ TB, Requires<[HasPrefetchW]>;
+
+def PREFETCHWT1 : I<0x0D, MRM2m, (outs), (ins i8mem:$addr), "prefetchwt1\t$addr",
+ [(prefetch addr:$addr, (i32 1), (i32 PrefetchWT1Level), (i32 1))]>,
+ TB, Requires<[HasPREFETCHWT1]>;
+}
+
+// "3DNowA" instructions
+defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", WriteCvtPS2I, "a">;
+defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", WriteCvtI2PS, "a">;
+defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", WriteFAdd, 0, "a">;
+defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", WriteFAdd, 0, "a">;
+defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", SchedWriteShuffle.MMX, "a">;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAMX.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAMX.td
new file mode 100644
index 000000000000..e4f3290cab9f
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAMX.td
@@ -0,0 +1,149 @@
+//===---- X86InstrAMX.td - AMX Instruction Set Extension --*- tablegen -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel AMX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// AMX instructions
+
+let Predicates = [HasAMXTILE, In64BitMode] in {
+ let SchedRW = [WriteSystem] in {
+ let hasSideEffects = 1,
+ Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
+ def LDTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
+ "ldtilecfg\t$src",
+ [(int_x86_ldtilecfg addr:$src)]>, VEX, T8PS;
+ let hasSideEffects = 1 in
+ def STTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
+ "sttilecfg\t$src",
+ [(int_x86_sttilecfg addr:$src)]>, VEX, T8PD;
+ let mayLoad = 1 in
+ def TILELOADD : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst),
+ (ins sibmem:$src),
+ "tileloadd\t{$src, $dst|$dst, $src}", []>,
+ VEX, T8XD;
+ let mayLoad = 1 in
+ def TILELOADDT1 : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst),
+ (ins sibmem:$src),
+ "tileloaddt1\t{$src, $dst|$dst, $src}", []>,
+ VEX, T8PD;
+ let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
+ def TILERELEASE : I<0x49, MRM_C0, (outs), (ins),
+ "tilerelease", [(int_x86_tilerelease)]>, VEX, T8PS;
+ let mayStore = 1 in
+ def TILESTORED : I<0x4b, MRMDestMemFSIB, (outs),
+ (ins sibmem:$dst, TILE:$src),
+ "tilestored\t{$src, $dst|$dst, $src}", []>,
+ VEX, T8XS;
+ def TILEZERO : I<0x49, MRMr0, (outs TILE:$dst), (ins),
+ "tilezero\t$dst", []>,
+ VEX, T8XD;
+
+    // Pseudo instructions for register allocation (RA).
+ let hasSideEffects = 1, mayLoad = 1,
+ Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
+ def PLDTILECFG : PseudoI <(outs TILECFG:$cfg), (ins opaquemem:$src), []>;
+
+ let hasSideEffects = 1, mayStore = 1 in
+ def PSTTILECFG : PseudoI<(outs), (ins opaquemem:$dst, TILECFG:$cfg), []>;
+
+ def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ GR16:$src2,
+ opaquemem:$src3,
+ TILECFG:$cfg), []>;
+ def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
+ GR16:$src2, opaquemem:$src3,
+ TILE:$src4, TILECFG:$cfg), []>;
+ def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ GR16:$src2,
+ TILECFG:$cfg), []>;
+
+ let usesCustomInserter = 1 in {
+ // Pseudo instructions, using immediates instead of tile registers.
+ // To be translated to the actual instructions in X86ISelLowering.cpp
+ def PTILELOADD : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>;
+ def PTILELOADDT1 : PseudoI<(outs), (ins u8imm:$src1,
+ sibmem:$src2), []>;
+ def PTILESTORED : PseudoI<(outs), (ins i8mem:$dst, u8imm:$src), []>;
+ def PTILEZERO : PseudoI<(outs), (ins u8imm:$src),
+ [(int_x86_tilezero timm:$src)]>;
+ }
+ } // SchedRW
+} // HasAMXTILE
+
+let Predicates = [HasAMXINT8, In64BitMode] in {
+ let SchedRW = [WriteSystem] in {
+ let Constraints = "$src1 = $dst" in {
+ def TDPBSSD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst),
+ (ins TILE:$src1, TILE:$src2, TILE:$src3),
+ "tdpbssd\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+ VEX_4V, T8XD;
+ def TDPBSUD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst),
+ (ins TILE:$src1, TILE:$src2, TILE:$src3),
+ "tdpbsud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+ VEX_4V, T8XS;
+ def TDPBUSD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst),
+ (ins TILE:$src1, TILE:$src2, TILE:$src3),
+ "tdpbusd\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+ VEX_4V, T8PD;
+ def TDPBUUD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst),
+ (ins TILE:$src1, TILE:$src2, TILE:$src3),
+ "tdpbuud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+ VEX_4V, T8PS;
+ }
+
+    // Pseudo instruction for register allocation (RA).
+ let Constraints = "$src4 = $dst" in
+ def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ GR16:$src2, GR16:$src3, TILE:$src4,
+ TILE:$src5, TILE:$src6, TILECFG:$cfg), []>;
+
+ let usesCustomInserter = 1 in {
+ // Pseudo instructions, using immediates instead of tile registers.
+ // To be translated to the actual instructions in X86ISelLowering.cpp
+ def PTDPBSSD : PseudoI<(outs), (ins u8imm:$src1,
+ u8imm:$src2, u8imm:$src3),
+ [(int_x86_tdpbssd timm:$src1,
+ timm:$src2, timm:$src3)]>;
+ def PTDPBSUD : PseudoI<(outs), (ins u8imm:$src1,
+ u8imm:$src2, u8imm:$src3),
+ [(int_x86_tdpbsud timm:$src1,
+ timm:$src2, timm:$src3)]>;
+ def PTDPBUSD : PseudoI<(outs), (ins u8imm:$src1,
+ u8imm:$src2, u8imm:$src3),
+ [(int_x86_tdpbusd timm:$src1,
+ timm:$src2, timm:$src3)]>;
+ def PTDPBUUD : PseudoI<(outs), (ins u8imm:$src1,
+ u8imm:$src2, u8imm:$src3),
+ [(int_x86_tdpbuud timm:$src1,
+ timm:$src2, timm:$src3)]>;
+ }
+ }
+} // HasAMXINT8
+
+let Predicates = [HasAMXBF16, In64BitMode] in {
+ let SchedRW = [WriteSystem] in {
+ let Constraints = "$src1 = $dst" in
+ def TDPBF16PS : I<0x5c, MRMSrcReg4VOp3, (outs TILE:$dst),
+ (ins TILE:$src1, TILE:$src2, TILE:$src3),
+ "tdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ []>, VEX_4V, T8XS;
+
+ let usesCustomInserter = 1 in {
+ // Pseudo instructions, using immediates instead of tile registers.
+ // To be translated to the actual instructions in X86ISelLowering.cpp
+ def PTDPBF16PS : PseudoI<(outs), (ins u8imm:$src1,
+ u8imm:$src2, u8imm:$src3),
+ [(int_x86_tdpbf16ps timm:$src1,
+ timm:$src2, timm:$src3)]>;
+ }
+ }
+} // HasAMXBF16
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td
new file mode 100644
index 000000000000..0c2b278fdd7b
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -0,0 +1,12239 @@
+//===-- X86InstrAVX512.td - AVX512 Instruction Set ---------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 AVX512 instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+// Group template arguments that can be derived from the vector type (EltNum x
+// EltVT). These are things like the register class for the writemask, etc.
+// The idea is to pass one of these as the template argument rather than the
+// individual arguments.
+// The template is also used for scalar types, in this case numelts is 1.
+class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
+ string suffix = ""> {
+ RegisterClass RC = rc;
+ ValueType EltVT = eltvt;
+ int NumElts = numelts;
+
+ // Corresponding mask register class.
+ RegisterClass KRC = !cast<RegisterClass>("VK" # NumElts);
+
+ // Corresponding mask register pair class.
+ RegisterOperand KRPC = !if (!gt(NumElts, 16), ?,
+ !cast<RegisterOperand>("VK" # NumElts # "Pair"));
+
+ // Corresponding write-mask register class.
+ RegisterClass KRCWM = !cast<RegisterClass>("VK" # NumElts # "WM");
+
+ // The mask VT.
+ ValueType KVT = !cast<ValueType>("v" # NumElts # "i1");
+
+ // Suffix used in the instruction mnemonic.
+ string Suffix = suffix;
+
+  // VTName is a string name for the vector VT. For vector types it will be
+  // "v" # NumElts # EltVT, so for a vector of 8 i32 elements it will be
+  // "v8i32". It is a little more complex for scalar types, where NumElts = 1;
+  // in that case we build v4f32 or v2f64.
+ string VTName = "v" # !if (!eq (NumElts, 1),
+ !if (!eq (EltVT.Size, 32), 4,
+ !if (!eq (EltVT.Size, 64), 2, NumElts)), NumElts) # EltVT;
+
+ // The vector VT.
+ ValueType VT = !cast<ValueType>(VTName);
+
+ string EltTypeName = !cast<string>(EltVT);
+ // Size of the element type in bits, e.g. 32 for v16i32.
+ string EltSizeName = !subst("i", "", !subst("f", "", EltTypeName));
+ int EltSize = EltVT.Size;
+
+ // "i" for integer types and "f" for floating-point types
+ string TypeVariantName = !subst(EltSizeName, "", EltTypeName);
+
+ // Size of RC in bits, e.g. 512 for VR512.
+ int Size = VT.Size;
+
+ // The corresponding memory operand, e.g. i512mem for VR512.
+ X86MemOperand MemOp = !cast<X86MemOperand>(TypeVariantName # Size # "mem");
+ X86MemOperand ScalarMemOp = !cast<X86MemOperand>(EltVT # "mem");
+ // FP scalar memory operand for intrinsics - ssmem/sdmem.
+ Operand IntScalarMemOp = !if (!eq (EltTypeName, "f32"), !cast<Operand>("ssmem"),
+ !if (!eq (EltTypeName, "f64"), !cast<Operand>("sdmem"), ?));
+
+ // Load patterns
+ PatFrag LdFrag = !cast<PatFrag>("load" # VTName);
+
+ PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" # VTName);
+
+ PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);
+ PatFrag BroadcastLdFrag = !cast<PatFrag>("X86VBroadcastld" # EltSizeName);
+
+ PatFrags ScalarIntMemFrags = !if (!eq (EltTypeName, "f32"),
+ !cast<PatFrags>("sse_load_f32"),
+ !if (!eq (EltTypeName, "f64"),
+ !cast<PatFrags>("sse_load_f64"),
+ ?));
+
+ // The string to specify embedded broadcast in assembly.
+ string BroadcastStr = "{1to" # NumElts # "}";
+
+ // 8-bit compressed displacement tuple/subvector format. This is only
+ // defined for NumElts <= 8.
+ CD8VForm CD8TupleForm = !if (!eq (!srl(NumElts, 4), 0),
+ !cast<CD8VForm>("CD8VT" # NumElts), ?);
+
+ SubRegIndex SubRegIdx = !if (!eq (Size, 128), sub_xmm,
+ !if (!eq (Size, 256), sub_ymm, ?));
+
+ Domain ExeDomain = !if (!eq (EltTypeName, "f32"), SSEPackedSingle,
+ !if (!eq (EltTypeName, "f64"), SSEPackedDouble,
+ SSEPackedInt));
+
+ RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, FR64X);
+
+ dag ImmAllZerosV = (VT immAllZerosV);
+
+ string ZSuffix = !if (!eq (Size, 128), "Z128",
+ !if (!eq (Size, 256), "Z256", "Z"));
+}
+
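+// Illustration (derived from the class above, not spelled out in the original
+// comments): v16i32_info below resolves to KRC = VK16, KRCWM = VK16WM,
+// KVT = v16i1, VTName = "v16i32", MemOp = i512mem, BroadcastStr = "{1to16}",
+// ExeDomain = SSEPackedInt and ZSuffix = "Z".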
+def v64i8_info : X86VectorVTInfo<64, i8, VR512, "b">;
+def v32i16_info : X86VectorVTInfo<32, i16, VR512, "w">;
+def v16i32_info : X86VectorVTInfo<16, i32, VR512, "d">;
+def v8i64_info : X86VectorVTInfo<8, i64, VR512, "q">;
+def v16f32_info : X86VectorVTInfo<16, f32, VR512, "ps">;
+def v8f64_info : X86VectorVTInfo<8, f64, VR512, "pd">;
+
+// "x" in v32i8x_info means RC = VR256X
+def v32i8x_info : X86VectorVTInfo<32, i8, VR256X, "b">;
+def v16i16x_info : X86VectorVTInfo<16, i16, VR256X, "w">;
+def v8i32x_info : X86VectorVTInfo<8, i32, VR256X, "d">;
+def v4i64x_info : X86VectorVTInfo<4, i64, VR256X, "q">;
+def v8f32x_info : X86VectorVTInfo<8, f32, VR256X, "ps">;
+def v4f64x_info : X86VectorVTInfo<4, f64, VR256X, "pd">;
+
+def v16i8x_info : X86VectorVTInfo<16, i8, VR128X, "b">;
+def v8i16x_info : X86VectorVTInfo<8, i16, VR128X, "w">;
+def v4i32x_info : X86VectorVTInfo<4, i32, VR128X, "d">;
+def v2i64x_info : X86VectorVTInfo<2, i64, VR128X, "q">;
+def v4f32x_info : X86VectorVTInfo<4, f32, VR128X, "ps">;
+def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">;
+
+// We map scalar types to the smallest (128-bit) vector type
+// with the appropriate element type. This allows us to use the same masking
+// logic.
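+// (For example, f32x_info below has NumElts = 1, so its VTName resolves to
+// "v4f32" and its mask classes to VK1 / VK1WM.)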
+def i32x_info : X86VectorVTInfo<1, i32, GR32, "si">;
+def i64x_info : X86VectorVTInfo<1, i64, GR64, "sq">;
+def f32x_info : X86VectorVTInfo<1, f32, VR128X, "ss">;
+def f64x_info : X86VectorVTInfo<1, f64, VR128X, "sd">;
+
+class AVX512VLVectorVTInfo<X86VectorVTInfo i512, X86VectorVTInfo i256,
+ X86VectorVTInfo i128> {
+ X86VectorVTInfo info512 = i512;
+ X86VectorVTInfo info256 = i256;
+ X86VectorVTInfo info128 = i128;
+}
+
+def avx512vl_i8_info : AVX512VLVectorVTInfo<v64i8_info, v32i8x_info,
+ v16i8x_info>;
+def avx512vl_i16_info : AVX512VLVectorVTInfo<v32i16_info, v16i16x_info,
+ v8i16x_info>;
+def avx512vl_i32_info : AVX512VLVectorVTInfo<v16i32_info, v8i32x_info,
+ v4i32x_info>;
+def avx512vl_i64_info : AVX512VLVectorVTInfo<v8i64_info, v4i64x_info,
+ v2i64x_info>;
+def avx512vl_f32_info : AVX512VLVectorVTInfo<v16f32_info, v8f32x_info,
+ v4f32x_info>;
+def avx512vl_f64_info : AVX512VLVectorVTInfo<v8f64_info, v4f64x_info,
+ v2f64x_info>;
+
+class X86KVectorVTInfo<RegisterClass _krc, RegisterClass _krcwm,
+ ValueType _vt> {
+ RegisterClass KRC = _krc;
+ RegisterClass KRCWM = _krcwm;
+ ValueType KVT = _vt;
+}
+
+def v1i1_info : X86KVectorVTInfo<VK1, VK1WM, v1i1>;
+def v2i1_info : X86KVectorVTInfo<VK2, VK2WM, v2i1>;
+def v4i1_info : X86KVectorVTInfo<VK4, VK4WM, v4i1>;
+def v8i1_info : X86KVectorVTInfo<VK8, VK8WM, v8i1>;
+def v16i1_info : X86KVectorVTInfo<VK16, VK16WM, v16i1>;
+def v32i1_info : X86KVectorVTInfo<VK32, VK32WM, v32i1>;
+def v64i1_info : X86KVectorVTInfo<VK64, VK64WM, v64i1>;
+
+// Used for matching masked operations. Ensures the operation part only has a
+// single use.
+def vselect_mask : PatFrag<(ops node:$mask, node:$src1, node:$src2),
+ (vselect node:$mask, node:$src1, node:$src2), [{
+ return isProfitableToFormMaskedOp(N);
+}]>;
+
+def X86selects_mask : PatFrag<(ops node:$mask, node:$src1, node:$src2),
+ (X86selects node:$mask, node:$src1, node:$src2), [{
+ return isProfitableToFormMaskedOp(N);
+}]>;
+
+// This multiclass generates the masking variants from the non-masking
+// variant. It only provides the assembly pieces for the masking variants.
+// It assumes custom ISel patterns for masking which can be provided as
+// template arguments.
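+// For a typical binary op this yields three records: NAME (unmasked, "$dst"),
+// NAME#k (merge-masking, "$dst {${mask}}") and NAME#kz (zero-masking,
+// "$dst {${mask}} {z}"), which print roughly as
+//   vaddps %zmm1, %zmm0, %zmm2
+//   vaddps %zmm1, %zmm0, %zmm2 {%k1}
+//   vaddps %zmm1, %zmm0, %zmm2 {%k1} {z}
+// (illustrative assembly, not taken from this file).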
+multiclass AVX512_maskable_custom<bits<8> O, Format F,
+ dag Outs,
+ dag Ins, dag MaskingIns, dag ZeroMaskingIns,
+ string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ list<dag> Pattern,
+ list<dag> MaskingPattern,
+ list<dag> ZeroMaskingPattern,
+ string MaskingConstraint = "",
+ bit IsCommutable = 0,
+ bit IsKCommutable = 0,
+ bit IsKZCommutable = IsCommutable> {
+ let isCommutable = IsCommutable in
+ def NAME: AVX512<O, F, Outs, Ins,
+ OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
+ "$dst, "#IntelSrcAsm#"}",
+ Pattern>;
+
+ // Prefer over VMOV*rrk Pat<>
+ let isCommutable = IsKCommutable in
+ def NAME#k: AVX512<O, F, Outs, MaskingIns,
+ OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
+ "$dst {${mask}}, "#IntelSrcAsm#"}",
+ MaskingPattern>,
+ EVEX_K {
+ // In case of the 3src subclass this is overridden with a let.
+ string Constraints = MaskingConstraint;
+ }
+
+  // Zero-masking does not add any restrictions on commuting the operands,
+  // so it is OK to use IsCommutable instead of IsKCommutable.
+ let isCommutable = IsKZCommutable in // Prefer over VMOV*rrkz Pat<>
+ def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns,
+ OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"#
+ "$dst {${mask}} {z}, "#IntelSrcAsm#"}",
+ ZeroMaskingPattern>,
+ EVEX_KZ;
+}
+
+
+// Common base class of AVX512_maskable and AVX512_maskable_3src.
+multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs,
+ dag Ins, dag MaskingIns, dag ZeroMaskingIns,
+ string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, dag MaskingRHS,
+ SDPatternOperator Select = vselect_mask,
+ string MaskingConstraint = "",
+ bit IsCommutable = 0,
+ bit IsKCommutable = 0,
+ bit IsKZCommutable = IsCommutable> :
+ AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
+ AttSrcAsm, IntelSrcAsm,
+ [(set _.RC:$dst, RHS)],
+ [(set _.RC:$dst, MaskingRHS)],
+ [(set _.RC:$dst,
+ (Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
+ MaskingConstraint, IsCommutable,
+ IsKCommutable, IsKZCommutable>;
+
+// This multiclass generates the unconditional/non-masking, the masking and
+// the zero-masking variant of the vector instruction. In the masking case, the
+// preserved vector elements come from a new dummy input operand tied to $dst.
+// This version uses a separate dag for non-masking and masking.
+multiclass AVX512_maskable_split<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag Ins, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, dag MaskRHS,
+ bit IsCommutable = 0, bit IsKCommutable = 0,
+ bit IsKZCommutable = IsCommutable> :
+ AVX512_maskable_custom<O, F, Outs, Ins,
+ !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
+ !con((ins _.KRCWM:$mask), Ins),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm,
+ [(set _.RC:$dst, RHS)],
+ [(set _.RC:$dst,
+ (vselect_mask _.KRCWM:$mask, MaskRHS, _.RC:$src0))],
+ [(set _.RC:$dst,
+ (vselect_mask _.KRCWM:$mask, MaskRHS, _.ImmAllZerosV))],
+ "$src0 = $dst", IsCommutable, IsKCommutable,
+ IsKZCommutable>;
+
+// This multiclass generates the unconditional/non-masking, the masking and
+// the zero-masking variant of the vector instruction. In the masking case, the
+// preserved vector elements come from a new dummy input operand tied to $dst.
+multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag Ins, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS,
+ bit IsCommutable = 0, bit IsKCommutable = 0,
+ bit IsKZCommutable = IsCommutable,
+ SDPatternOperator Select = vselect_mask> :
+ AVX512_maskable_common<O, F, _, Outs, Ins,
+ !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
+ !con((ins _.KRCWM:$mask), Ins),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
+ (Select _.KRCWM:$mask, RHS, _.RC:$src0),
+ Select, "$src0 = $dst", IsCommutable, IsKCommutable,
+ IsKZCommutable>;
+
+// This multiclass generates the unconditional/non-masking, the masking and
+// the zero-masking variant of the scalar instruction.
+multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag Ins, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS> :
+ AVX512_maskable<O, F, _, Outs, Ins, OpcodeStr, AttSrcAsm, IntelSrcAsm,
+ RHS, 0, 0, 0, X86selects_mask>;
+
+// Similar to AVX512_maskable but in this case one of the source operands
+// ($src1) is already tied to $dst so we just use that for the preserved
+// vector elements. NOTE that the NonTiedIns (the ins dag) should exclude
+// $src1.
+multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag NonTiedIns, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS,
+ bit IsCommutable = 0,
+ bit IsKCommutable = 0,
+ SDPatternOperator Select = vselect_mask,
+ bit MaskOnly = 0> :
+ AVX512_maskable_common<O, F, _, Outs,
+ !con((ins _.RC:$src1), NonTiedIns),
+ !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+ !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm,
+ !if(MaskOnly, (null_frag), RHS),
+ (Select _.KRCWM:$mask, RHS, _.RC:$src1),
+ Select, "", IsCommutable, IsKCommutable>;
+
+// Similar to AVX512_maskable_3src but in this case the input VT for the tied
+// operand differs from the output VT. This requires a bitconvert on
+// the preserved vector going into the vselect.
+// NOTE: The unmasked pattern is disabled.
+multiclass AVX512_maskable_3src_cast<bits<8> O, Format F, X86VectorVTInfo OutVT,
+ X86VectorVTInfo InVT,
+ dag Outs, dag NonTiedIns, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, bit IsCommutable = 0> :
+ AVX512_maskable_common<O, F, OutVT, Outs,
+ !con((ins InVT.RC:$src1), NonTiedIns),
+ !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns),
+ !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, (null_frag),
+ (vselect_mask InVT.KRCWM:$mask, RHS,
+ (bitconvert InVT.RC:$src1)),
+ vselect_mask, "", IsCommutable>;
+
+multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag NonTiedIns, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS,
+ bit IsCommutable = 0,
+ bit IsKCommutable = 0,
+ bit MaskOnly = 0> :
+ AVX512_maskable_3src<O, F, _, Outs, NonTiedIns, OpcodeStr, AttSrcAsm,
+ IntelSrcAsm, RHS, IsCommutable, IsKCommutable,
+ X86selects_mask, MaskOnly>;
+
+multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag Ins,
+ string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ list<dag> Pattern> :
+ AVX512_maskable_custom<O, F, Outs, Ins,
+ !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
+ !con((ins _.KRCWM:$mask), Ins),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [],
+ "$src0 = $dst">;
+
+multiclass AVX512_maskable_3src_in_asm<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag NonTiedIns,
+ string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ list<dag> Pattern> :
+ AVX512_maskable_custom<O, F, Outs,
+ !con((ins _.RC:$src1), NonTiedIns),
+ !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+ !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [],
+ "">;
+
+// Instruction with mask that puts result in mask register,
+// like "compare" and "vptest"
+multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F,
+ dag Outs,
+ dag Ins, dag MaskingIns,
+ string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ list<dag> Pattern,
+ list<dag> MaskingPattern,
+ bit IsCommutable = 0> {
+ let isCommutable = IsCommutable in {
+ def NAME: AVX512<O, F, Outs, Ins,
+ OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
+ "$dst, "#IntelSrcAsm#"}",
+ Pattern>;
+
+ def NAME#k: AVX512<O, F, Outs, MaskingIns,
+ OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
+ "$dst {${mask}}, "#IntelSrcAsm#"}",
+ MaskingPattern>, EVEX_K;
+ }
+}
+
+multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs,
+ dag Ins, dag MaskingIns,
+ string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, dag MaskingRHS,
+ bit IsCommutable = 0> :
+ AVX512_maskable_custom_cmp<O, F, Outs, Ins, MaskingIns, OpcodeStr,
+ AttSrcAsm, IntelSrcAsm,
+ [(set _.KRC:$dst, RHS)],
+ [(set _.KRC:$dst, MaskingRHS)], IsCommutable>;
+
+multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag Ins, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, dag RHS_su, bit IsCommutable = 0> :
+ AVX512_maskable_common_cmp<O, F, _, Outs, Ins,
+ !con((ins _.KRCWM:$mask), Ins),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
+ (and _.KRCWM:$mask, RHS_su), IsCommutable>;
+
+// Used by conversion instructions.
+multiclass AVX512_maskable_cvt<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs,
+ dag Ins, dag MaskingIns, dag ZeroMaskingIns,
+ string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, dag MaskingRHS, dag ZeroMaskingRHS> :
+ AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
+ AttSrcAsm, IntelSrcAsm,
+ [(set _.RC:$dst, RHS)],
+ [(set _.RC:$dst, MaskingRHS)],
+ [(set _.RC:$dst, ZeroMaskingRHS)],
+ "$src0 = $dst">;
+
+multiclass AVX512_maskable_fma<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag NonTiedIns, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, dag MaskingRHS, bit IsCommutable,
+ bit IsKCommutable> :
+ AVX512_maskable_custom<O, F, Outs,
+ !con((ins _.RC:$src1), NonTiedIns),
+ !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+ !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm,
+ [(set _.RC:$dst, RHS)],
+ [(set _.RC:$dst,
+ (vselect_mask _.KRCWM:$mask, MaskingRHS, _.RC:$src1))],
+ [(set _.RC:$dst,
+ (vselect_mask _.KRCWM:$mask, MaskingRHS, _.ImmAllZerosV))],
+ "", IsCommutable, IsKCommutable>;
+
+// Alias instruction that maps zero vector to pxor / xorp* for AVX-512.
+// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
+// swizzled by ExecutionDomainFix to pxor.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-zeros value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, Predicates = [HasAVX512], SchedRW = [WriteZero] in {
+def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "",
+ [(set VR512:$dst, (v16i32 immAllZerosV))]>;
+def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "",
+ [(set VR512:$dst, (v16i32 immAllOnesV))]>;
+}
+
+let Predicates = [HasAVX512] in {
+def : Pat<(v64i8 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v32i16 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v16f32 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>;
+}
+
+// Alias instructions that allow VPTERNLOG to be used with a mask to create
+// a mix of all ones and all zeros elements. This is done this way to force
+// the same register to be used as input for all three sources.
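+// (These are typically expanded later to a zero-masked VPTERNLOG with
+// immediate 0xFF, e.g. roughly "vpternlogd $0xff, %zmm0, %zmm0, %zmm0 {%k1} {z}";
+// illustrative expansion, not defined in this file.)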
+let isPseudo = 1, Predicates = [HasAVX512], SchedRW = [WriteVecALU] in {
+def AVX512_512_SEXT_MASK_32 : I<0, Pseudo, (outs VR512:$dst),
+ (ins VK16WM:$mask), "",
+ [(set VR512:$dst, (vselect (v16i1 VK16WM:$mask),
+ (v16i32 immAllOnesV),
+ (v16i32 immAllZerosV)))]>;
+def AVX512_512_SEXT_MASK_64 : I<0, Pseudo, (outs VR512:$dst),
+ (ins VK8WM:$mask), "",
+ [(set VR512:$dst, (vselect (v8i1 VK8WM:$mask),
+ (v8i64 immAllOnesV),
+ (v8i64 immAllZerosV)))]>;
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, Predicates = [HasAVX512], SchedRW = [WriteZero] in {
+def AVX512_128_SET0 : I<0, Pseudo, (outs VR128X:$dst), (ins), "",
+ [(set VR128X:$dst, (v4i32 immAllZerosV))]>;
+def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "",
+ [(set VR256X:$dst, (v8i32 immAllZerosV))]>;
+}
+
+let Predicates = [HasAVX512] in {
+def : Pat<(v8i16 immAllZerosV), (AVX512_128_SET0)>;
+def : Pat<(v16i8 immAllZerosV), (AVX512_128_SET0)>;
+def : Pat<(v2i64 immAllZerosV), (AVX512_128_SET0)>;
+def : Pat<(v4f32 immAllZerosV), (AVX512_128_SET0)>;
+def : Pat<(v2f64 immAllZerosV), (AVX512_128_SET0)>;
+def : Pat<(v32i8 immAllZerosV), (AVX512_256_SET0)>;
+def : Pat<(v16i16 immAllZerosV), (AVX512_256_SET0)>;
+def : Pat<(v4i64 immAllZerosV), (AVX512_256_SET0)>;
+def : Pat<(v8f32 immAllZerosV), (AVX512_256_SET0)>;
+def : Pat<(v4f64 immAllZerosV), (AVX512_256_SET0)>;
+}
+
+// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
+// This is expanded by ExpandPostRAPseudos.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasAVX512] in {
+ def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "",
+ [(set FR32X:$dst, fp32imm0)]>;
+ def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "",
+ [(set FR64X:$dst, fp64imm0)]>;
+ def AVX512_FsFLD0F128 : I<0, Pseudo, (outs VR128X:$dst), (ins), "",
+ [(set VR128X:$dst, fp128imm0)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - VECTOR INSERT
+//
+
+// Supports two different pattern operators for masked and unmasked ops. Allows
+// null_frag to be passed for one.
+multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From,
+ X86VectorVTInfo To,
+ SDPatternOperator vinsert_insert,
+ SDPatternOperator vinsert_for_mask,
+ X86FoldableSchedWrite sched> {
+ let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
+ defm rr : AVX512_maskable_split<Opcode, MRMSrcReg, To, (outs To.RC:$dst),
+ (ins To.RC:$src1, From.RC:$src2, u8imm:$src3),
+ "vinsert" # From.EltTypeName # "x" # From.NumElts,
+ "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (vinsert_insert:$src3 (To.VT To.RC:$src1),
+ (From.VT From.RC:$src2),
+ (iPTR imm)),
+ (vinsert_for_mask:$src3 (To.VT To.RC:$src1),
+ (From.VT From.RC:$src2),
+ (iPTR imm))>,
+ AVX512AIi8Base, EVEX_4V, Sched<[sched]>;
+ let mayLoad = 1 in
+ defm rm : AVX512_maskable_split<Opcode, MRMSrcMem, To, (outs To.RC:$dst),
+ (ins To.RC:$src1, From.MemOp:$src2, u8imm:$src3),
+ "vinsert" # From.EltTypeName # "x" # From.NumElts,
+ "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (vinsert_insert:$src3 (To.VT To.RC:$src1),
+ (From.VT (From.LdFrag addr:$src2)),
+ (iPTR imm)),
+ (vinsert_for_mask:$src3 (To.VT To.RC:$src1),
+ (From.VT (From.LdFrag addr:$src2)),
+ (iPTR imm))>, AVX512AIi8Base, EVEX_4V,
+ EVEX_CD8<From.EltSize, From.CD8TupleForm>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+// Passes the same pattern operator for masked and unmasked ops.
+multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From,
+ X86VectorVTInfo To,
+ SDPatternOperator vinsert_insert,
+ X86FoldableSchedWrite sched> :
+ vinsert_for_size_split<Opcode, From, To, vinsert_insert, vinsert_insert, sched>;
+
+multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From,
+ X86VectorVTInfo To, PatFrag vinsert_insert,
+ SDNodeXForm INSERT_get_vinsert_imm , list<Predicate> p> {
+ let Predicates = p in {
+ def : Pat<(vinsert_insert:$ins
+ (To.VT To.RC:$src1), (From.VT From.RC:$src2), (iPTR imm)),
+ (To.VT (!cast<Instruction>(InstrStr#"rr")
+ To.RC:$src1, From.RC:$src2,
+ (INSERT_get_vinsert_imm To.RC:$ins)))>;
+
+ def : Pat<(vinsert_insert:$ins
+ (To.VT To.RC:$src1),
+ (From.VT (From.LdFrag addr:$src2)),
+ (iPTR imm)),
+ (To.VT (!cast<Instruction>(InstrStr#"rm")
+ To.RC:$src1, addr:$src2,
+ (INSERT_get_vinsert_imm To.RC:$ins)))>;
+ }
+}
+
+multiclass vinsert_for_type<ValueType EltVT32, int Opcode128,
+ ValueType EltVT64, int Opcode256,
+ X86FoldableSchedWrite sched> {
+
+ let Predicates = [HasVLX] in
+ defm NAME # "32x4Z256" : vinsert_for_size<Opcode128,
+ X86VectorVTInfo< 4, EltVT32, VR128X>,
+ X86VectorVTInfo< 8, EltVT32, VR256X>,
+ vinsert128_insert, sched>, EVEX_V256;
+
+ defm NAME # "32x4Z" : vinsert_for_size<Opcode128,
+ X86VectorVTInfo< 4, EltVT32, VR128X>,
+ X86VectorVTInfo<16, EltVT32, VR512>,
+ vinsert128_insert, sched>, EVEX_V512;
+
+ defm NAME # "64x4Z" : vinsert_for_size<Opcode256,
+ X86VectorVTInfo< 4, EltVT64, VR256X>,
+ X86VectorVTInfo< 8, EltVT64, VR512>,
+ vinsert256_insert, sched>, VEX_W, EVEX_V512;
+
+ // Even with DQI we'd like to only use these instructions for masking.
+ let Predicates = [HasVLX, HasDQI] in
+ defm NAME # "64x2Z256" : vinsert_for_size_split<Opcode128,
+ X86VectorVTInfo< 2, EltVT64, VR128X>,
+ X86VectorVTInfo< 4, EltVT64, VR256X>,
+ null_frag, vinsert128_insert, sched>,
+ VEX_W1X, EVEX_V256;
+
+ // Even with DQI we'd like to only use these instructions for masking.
+ let Predicates = [HasDQI] in {
+ defm NAME # "64x2Z" : vinsert_for_size_split<Opcode128,
+ X86VectorVTInfo< 2, EltVT64, VR128X>,
+ X86VectorVTInfo< 8, EltVT64, VR512>,
+ null_frag, vinsert128_insert, sched>,
+ VEX_W, EVEX_V512;
+
+ defm NAME # "32x8Z" : vinsert_for_size_split<Opcode256,
+ X86VectorVTInfo< 8, EltVT32, VR256X>,
+ X86VectorVTInfo<16, EltVT32, VR512>,
+ null_frag, vinsert256_insert, sched>,
+ EVEX_V512;
+ }
+}
+
+// FIXME: Is there a better scheduler class for VINSERTF/VINSERTI?
+defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a, WriteFShuffle256>;
+defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a, WriteShuffle256>;
+
+// Codegen patterns with the alternative types.
+// Even with AVX512DQ we'll still use these for unmasked operations.
+defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
+
+defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v2f64x_info, v8f64_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v2i64x_info, v8i64_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
+
+defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v8f32x_info, v16f32_info,
+ vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
+defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v8i32x_info, v16i32_info,
+ vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
+
+// Codegen patterns with the alternative types: insert VEC128 into VEC256
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
+// Codegen patterns with the alternative types: insert VEC128 into VEC512
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v8i16x_info, v32i16_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v16i8x_info, v64i8_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
+// Codegen patterns with the alternative types: insert VEC256 into VEC512
+defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info,
+ vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
+defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info,
+ vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
+
+
+multiclass vinsert_for_mask_cast<string InstrStr, X86VectorVTInfo From,
+ X86VectorVTInfo To, X86VectorVTInfo Cast,
+ PatFrag vinsert_insert,
+ SDNodeXForm INSERT_get_vinsert_imm,
+ list<Predicate> p> {
+let Predicates = p in {
+ def : Pat<(Cast.VT
+ (vselect_mask Cast.KRCWM:$mask,
+ (bitconvert
+ (vinsert_insert:$ins (To.VT To.RC:$src1),
+ (From.VT From.RC:$src2),
+ (iPTR imm))),
+ Cast.RC:$src0)),
+ (!cast<Instruction>(InstrStr#"rrk")
+ Cast.RC:$src0, Cast.KRCWM:$mask, To.RC:$src1, From.RC:$src2,
+ (INSERT_get_vinsert_imm To.RC:$ins))>;
+ def : Pat<(Cast.VT
+ (vselect_mask Cast.KRCWM:$mask,
+ (bitconvert
+ (vinsert_insert:$ins (To.VT To.RC:$src1),
+ (From.VT
+ (bitconvert
+ (From.LdFrag addr:$src2))),
+ (iPTR imm))),
+ Cast.RC:$src0)),
+ (!cast<Instruction>(InstrStr#"rmk")
+ Cast.RC:$src0, Cast.KRCWM:$mask, To.RC:$src1, addr:$src2,
+ (INSERT_get_vinsert_imm To.RC:$ins))>;
+
+ def : Pat<(Cast.VT
+ (vselect_mask Cast.KRCWM:$mask,
+ (bitconvert
+ (vinsert_insert:$ins (To.VT To.RC:$src1),
+ (From.VT From.RC:$src2),
+ (iPTR imm))),
+ Cast.ImmAllZerosV)),
+ (!cast<Instruction>(InstrStr#"rrkz")
+ Cast.KRCWM:$mask, To.RC:$src1, From.RC:$src2,
+ (INSERT_get_vinsert_imm To.RC:$ins))>;
+ def : Pat<(Cast.VT
+ (vselect_mask Cast.KRCWM:$mask,
+ (bitconvert
+ (vinsert_insert:$ins (To.VT To.RC:$src1),
+ (From.VT (From.LdFrag addr:$src2)),
+ (iPTR imm))),
+ Cast.ImmAllZerosV)),
+ (!cast<Instruction>(InstrStr#"rmkz")
+ Cast.KRCWM:$mask, To.RC:$src1, addr:$src2,
+ (INSERT_get_vinsert_imm To.RC:$ins))>;
+}
+}
+
+defm : vinsert_for_mask_cast<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info,
+ v8f32x_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasVLX]>;
+defm : vinsert_for_mask_cast<"VINSERTF64x2Z256", v4f32x_info, v8f32x_info,
+ v4f64x_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasDQI, HasVLX]>;
+
+defm : vinsert_for_mask_cast<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info,
+ v8i32x_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasVLX]>;
+defm : vinsert_for_mask_cast<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info,
+ v8i32x_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasVLX]>;
+defm : vinsert_for_mask_cast<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info,
+ v8i32x_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasVLX]>;
+defm : vinsert_for_mask_cast<"VINSERTF64x2Z256", v4i32x_info, v8i32x_info,
+ v4i64x_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasDQI, HasVLX]>;
+defm : vinsert_for_mask_cast<"VINSERTF64x2Z256", v8i16x_info, v16i16x_info,
+ v4i64x_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasDQI, HasVLX]>;
+defm : vinsert_for_mask_cast<"VINSERTF64x2Z256", v16i8x_info, v32i8x_info,
+ v4i64x_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasDQI, HasVLX]>;
+
+defm : vinsert_for_mask_cast<"VINSERTF32x4Z", v2f64x_info, v8f64_info,
+ v16f32_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasAVX512]>;
+defm : vinsert_for_mask_cast<"VINSERTF64x2Z", v4f32x_info, v16f32_info,
+ v8f64_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasDQI]>;
+
+defm : vinsert_for_mask_cast<"VINSERTI32x4Z", v2i64x_info, v8i64_info,
+ v16i32_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasAVX512]>;
+defm : vinsert_for_mask_cast<"VINSERTI32x4Z", v8i16x_info, v32i16_info,
+ v16i32_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasAVX512]>;
+defm : vinsert_for_mask_cast<"VINSERTI32x4Z", v16i8x_info, v64i8_info,
+ v16i32_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasAVX512]>;
+defm : vinsert_for_mask_cast<"VINSERTI64x2Z", v4i32x_info, v16i32_info,
+ v8i64_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasDQI]>;
+defm : vinsert_for_mask_cast<"VINSERTI64x2Z", v8i16x_info, v32i16_info,
+ v8i64_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasDQI]>;
+defm : vinsert_for_mask_cast<"VINSERTI64x2Z", v16i8x_info, v64i8_info,
+ v8i64_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasDQI]>;
+
+defm : vinsert_for_mask_cast<"VINSERTF32x8Z", v4f64x_info, v8f64_info,
+ v16f32_info, vinsert256_insert,
+ INSERT_get_vinsert256_imm, [HasDQI]>;
+defm : vinsert_for_mask_cast<"VINSERTF64x4Z", v8f32x_info, v16f32_info,
+ v8f64_info, vinsert256_insert,
+ INSERT_get_vinsert256_imm, [HasAVX512]>;
+
+defm : vinsert_for_mask_cast<"VINSERTI32x8Z", v4i64x_info, v8i64_info,
+ v16i32_info, vinsert256_insert,
+ INSERT_get_vinsert256_imm, [HasDQI]>;
+defm : vinsert_for_mask_cast<"VINSERTI32x8Z", v16i16x_info, v32i16_info,
+ v16i32_info, vinsert256_insert,
+ INSERT_get_vinsert256_imm, [HasDQI]>;
+defm : vinsert_for_mask_cast<"VINSERTI32x8Z", v32i8x_info, v64i8_info,
+ v16i32_info, vinsert256_insert,
+ INSERT_get_vinsert256_imm, [HasDQI]>;
+defm : vinsert_for_mask_cast<"VINSERTI64x4Z", v8i32x_info, v16i32_info,
+ v8i64_info, vinsert256_insert,
+ INSERT_get_vinsert256_imm, [HasAVX512]>;
+defm : vinsert_for_mask_cast<"VINSERTI64x4Z", v16i16x_info, v32i16_info,
+ v8i64_info, vinsert256_insert,
+ INSERT_get_vinsert256_imm, [HasAVX512]>;
+defm : vinsert_for_mask_cast<"VINSERTI64x4Z", v32i8x_info, v64i8_info,
+ v8i64_info, vinsert256_insert,
+ INSERT_get_vinsert256_imm, [HasAVX512]>;
+
+// vinsertps - insert f32 to XMM
+let ExeDomain = SSEPackedSingle in {
+let isCommutable = 1 in
+def VINSERTPSZrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2, u8imm:$src3),
+ "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, timm:$src3))]>,
+ EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>;
+def VINSERTPSZrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
+ (ins VR128X:$src1, f32mem:$src2, u8imm:$src3),
+ "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR128X:$dst, (X86insertps VR128X:$src1,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
+ timm:$src3))]>,
+ EVEX_4V, EVEX_CD8<32, CD8VT1>,
+ Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 VECTOR EXTRACT
+//---
+
+// Supports two different pattern operators for masked and unmasked ops. Allows
+// null_frag to be passed for one.
+multiclass vextract_for_size_split<int Opcode,
+ X86VectorVTInfo From, X86VectorVTInfo To,
+ SDPatternOperator vextract_extract,
+ SDPatternOperator vextract_for_mask,
+ SchedWrite SchedRR, SchedWrite SchedMR> {
+
+ let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
+ defm rr : AVX512_maskable_split<Opcode, MRMDestReg, To, (outs To.RC:$dst),
+ (ins From.RC:$src1, u8imm:$idx),
+ "vextract" # To.EltTypeName # "x" # To.NumElts,
+ "$idx, $src1", "$src1, $idx",
+ (vextract_extract:$idx (From.VT From.RC:$src1), (iPTR imm)),
+ (vextract_for_mask:$idx (From.VT From.RC:$src1), (iPTR imm))>,
+ AVX512AIi8Base, EVEX, Sched<[SchedRR]>;
+
+ def mr : AVX512AIi8<Opcode, MRMDestMem, (outs),
+ (ins To.MemOp:$dst, From.RC:$src1, u8imm:$idx),
+ "vextract" # To.EltTypeName # "x" # To.NumElts #
+ "\t{$idx, $src1, $dst|$dst, $src1, $idx}",
+ [(store (To.VT (vextract_extract:$idx
+ (From.VT From.RC:$src1), (iPTR imm))),
+ addr:$dst)]>, EVEX,
+ Sched<[SchedMR]>;
+
+ let mayStore = 1, hasSideEffects = 0 in
+ def mrk : AVX512AIi8<Opcode, MRMDestMem, (outs),
+ (ins To.MemOp:$dst, To.KRCWM:$mask,
+ From.RC:$src1, u8imm:$idx),
+ "vextract" # To.EltTypeName # "x" # To.NumElts #
+ "\t{$idx, $src1, $dst {${mask}}|"
+ "$dst {${mask}}, $src1, $idx}", []>,
+ EVEX_K, EVEX, Sched<[SchedMR]>, NotMemoryFoldable;
+ }
+}
+
+// Passes the same pattern operator for masked and unmasked ops.
+multiclass vextract_for_size<int Opcode, X86VectorVTInfo From,
+ X86VectorVTInfo To,
+ SDPatternOperator vextract_extract,
+ SchedWrite SchedRR, SchedWrite SchedMR> :
+ vextract_for_size_split<Opcode, From, To, vextract_extract, vextract_extract, SchedRR, SchedMR>;
+
+// Codegen patterns for the alternative types
+multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From,
+ X86VectorVTInfo To, PatFrag vextract_extract,
+ SDNodeXForm EXTRACT_get_vextract_imm, list<Predicate> p> {
+ let Predicates = p in {
+ def : Pat<(vextract_extract:$ext (From.VT From.RC:$src1), (iPTR imm)),
+ (To.VT (!cast<Instruction>(InstrStr#"rr")
+ From.RC:$src1,
+ (EXTRACT_get_vextract_imm To.RC:$ext)))>;
+ def : Pat<(store (To.VT (vextract_extract:$ext (From.VT From.RC:$src1),
+ (iPTR imm))), addr:$dst),
+ (!cast<Instruction>(InstrStr#"mr") addr:$dst, From.RC:$src1,
+ (EXTRACT_get_vextract_imm To.RC:$ext))>;
+ }
+}
+
+multiclass vextract_for_type<ValueType EltVT32, int Opcode128,
+ ValueType EltVT64, int Opcode256,
+ SchedWrite SchedRR, SchedWrite SchedMR> {
+ let Predicates = [HasAVX512] in {
+ defm NAME # "32x4Z" : vextract_for_size<Opcode128,
+ X86VectorVTInfo<16, EltVT32, VR512>,
+ X86VectorVTInfo< 4, EltVT32, VR128X>,
+ vextract128_extract, SchedRR, SchedMR>,
+ EVEX_V512, EVEX_CD8<32, CD8VT4>;
+ defm NAME # "64x4Z" : vextract_for_size<Opcode256,
+ X86VectorVTInfo< 8, EltVT64, VR512>,
+ X86VectorVTInfo< 4, EltVT64, VR256X>,
+ vextract256_extract, SchedRR, SchedMR>,
+ VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>;
+ }
+ let Predicates = [HasVLX] in
+ defm NAME # "32x4Z256" : vextract_for_size<Opcode128,
+ X86VectorVTInfo< 8, EltVT32, VR256X>,
+ X86VectorVTInfo< 4, EltVT32, VR128X>,
+ vextract128_extract, SchedRR, SchedMR>,
+ EVEX_V256, EVEX_CD8<32, CD8VT4>;
+
+ // Even with DQI we'd like to only use these instructions for masking.
+ let Predicates = [HasVLX, HasDQI] in
+ defm NAME # "64x2Z256" : vextract_for_size_split<Opcode128,
+ X86VectorVTInfo< 4, EltVT64, VR256X>,
+ X86VectorVTInfo< 2, EltVT64, VR128X>,
+ null_frag, vextract128_extract, SchedRR, SchedMR>,
+ VEX_W1X, EVEX_V256, EVEX_CD8<64, CD8VT2>;
+
+ // Even with DQI we'd like to only use these instructions for masking.
+ let Predicates = [HasDQI] in {
+ defm NAME # "64x2Z" : vextract_for_size_split<Opcode128,
+ X86VectorVTInfo< 8, EltVT64, VR512>,
+ X86VectorVTInfo< 2, EltVT64, VR128X>,
+ null_frag, vextract128_extract, SchedRR, SchedMR>,
+ VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>;
+ defm NAME # "32x8Z" : vextract_for_size_split<Opcode256,
+ X86VectorVTInfo<16, EltVT32, VR512>,
+ X86VectorVTInfo< 8, EltVT32, VR256X>,
+ null_frag, vextract256_extract, SchedRR, SchedMR>,
+ EVEX_V512, EVEX_CD8<32, CD8VT8>;
+ }
+}
+
+// TODO - replace WriteFStore/WriteVecStore with X86SchedWriteMoveLSWidths types.
+defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b, WriteFShuffle256, WriteFStore>;
+defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b, WriteShuffle256, WriteVecStore>;
+
+// extract_subvector codegen patterns with the alternative types.
+// Even with AVX512DQ we'll still use these for unmasked operations.
+defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
+
+defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info,
+ vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
+defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info,
+ vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
+
+defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
+
+// Codegen patterns with the alternative types to extract VEC128 from VEC256.
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v16i16x_info, v8i16x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v32i8x_info, v16i8x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
+
+// Codegen patterns with the alternative types to extract VEC128 from VEC512.
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
+// Codegen patterns with the alternative types to extract VEC256 from VEC512.
+defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info,
+ vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
+defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info,
+ vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
+
+
+// A 128-bit extract from bits [255:128] of a 512-bit vector should use a
+// smaller extract to enable EVEX->VEX.
+let Predicates = [NoVLX] in {
+def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 2))),
+ (v2i64 (VEXTRACTI128rr
+ (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 2))),
+ (v2f64 (VEXTRACTF128rr
+ (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 4))),
+ (v4i32 (VEXTRACTI128rr
+ (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 4))),
+ (v4f32 (VEXTRACTF128rr
+ (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 8))),
+ (v8i16 (VEXTRACTI128rr
+ (v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 16))),
+ (v16i8 (VEXTRACTI128rr
+ (v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+}
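+// For example, with the patterns above
+//   (v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 4)))
+// selects "vextracti128 $1, %ymm, %xmm" on the low YMM half of the source,
+// which has a shorter VEX encoding than the EVEX-only 512-bit extract.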
+
+// A 128-bit extract from bits [255:128] of a 512-bit vector should use a
+// smaller extract to enable EVEX->VEX.
+let Predicates = [HasVLX] in {
+def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 2))),
+ (v2i64 (VEXTRACTI32x4Z256rr
+ (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 2))),
+ (v2f64 (VEXTRACTF32x4Z256rr
+ (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 4))),
+ (v4i32 (VEXTRACTI32x4Z256rr
+ (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 4))),
+ (v4f32 (VEXTRACTF32x4Z256rr
+ (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 8))),
+ (v8i16 (VEXTRACTI32x4Z256rr
+ (v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 16))),
+ (v16i8 (VEXTRACTI32x4Z256rr
+ (v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+}
+
+
+// Additional patterns for handling a bitcast between the vselect and the
+// extract_subvector.
+multiclass vextract_for_mask_cast<string InstrStr, X86VectorVTInfo From,
+ X86VectorVTInfo To, X86VectorVTInfo Cast,
+ PatFrag vextract_extract,
+ SDNodeXForm EXTRACT_get_vextract_imm,
+ list<Predicate> p> {
+let Predicates = p in {
+ def : Pat<(Cast.VT (vselect_mask Cast.KRCWM:$mask,
+ (bitconvert
+ (To.VT (vextract_extract:$ext
+ (From.VT From.RC:$src), (iPTR imm)))),
+ To.RC:$src0)),
+ (Cast.VT (!cast<Instruction>(InstrStr#"rrk")
+ Cast.RC:$src0, Cast.KRCWM:$mask, From.RC:$src,
+ (EXTRACT_get_vextract_imm To.RC:$ext)))>;
+
+ def : Pat<(Cast.VT (vselect_mask Cast.KRCWM:$mask,
+ (bitconvert
+ (To.VT (vextract_extract:$ext
+ (From.VT From.RC:$src), (iPTR imm)))),
+ Cast.ImmAllZerosV)),
+ (Cast.VT (!cast<Instruction>(InstrStr#"rrkz")
+ Cast.KRCWM:$mask, From.RC:$src,
+ (EXTRACT_get_vextract_imm To.RC:$ext)))>;
+}
+}
+
+defm : vextract_for_mask_cast<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info,
+ v4f32x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasVLX]>;
+defm : vextract_for_mask_cast<"VEXTRACTF64x2Z256", v8f32x_info, v4f32x_info,
+ v2f64x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasDQI, HasVLX]>;
+
+defm : vextract_for_mask_cast<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info,
+ v4i32x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasVLX]>;
+defm : vextract_for_mask_cast<"VEXTRACTI32x4Z256", v16i16x_info, v8i16x_info,
+ v4i32x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasVLX]>;
+defm : vextract_for_mask_cast<"VEXTRACTI32x4Z256", v32i8x_info, v16i8x_info,
+ v4i32x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasVLX]>;
+defm : vextract_for_mask_cast<"VEXTRACTI64x2Z256", v8i32x_info, v4i32x_info,
+ v2i64x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasDQI, HasVLX]>;
+defm : vextract_for_mask_cast<"VEXTRACTI64x2Z256", v16i16x_info, v8i16x_info,
+ v2i64x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasDQI, HasVLX]>;
+defm : vextract_for_mask_cast<"VEXTRACTI64x2Z256", v32i8x_info, v16i8x_info,
+ v2i64x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasDQI, HasVLX]>;
+
+defm : vextract_for_mask_cast<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info,
+ v4f32x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasAVX512]>;
+defm : vextract_for_mask_cast<"VEXTRACTF64x2Z", v16f32_info, v4f32x_info,
+ v2f64x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasDQI]>;
+
+defm : vextract_for_mask_cast<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info,
+ v4i32x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasAVX512]>;
+defm : vextract_for_mask_cast<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info,
+ v4i32x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasAVX512]>;
+defm : vextract_for_mask_cast<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info,
+ v4i32x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasAVX512]>;
+defm : vextract_for_mask_cast<"VEXTRACTI64x2Z", v16i32_info, v4i32x_info,
+ v2i64x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasDQI]>;
+defm : vextract_for_mask_cast<"VEXTRACTI64x2Z", v32i16_info, v8i16x_info,
+ v2i64x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasDQI]>;
+defm : vextract_for_mask_cast<"VEXTRACTI64x2Z", v64i8_info, v16i8x_info,
+ v2i64x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasDQI]>;
+
+defm : vextract_for_mask_cast<"VEXTRACTF32x8Z", v8f64_info, v4f64x_info,
+ v8f32x_info, vextract256_extract,
+ EXTRACT_get_vextract256_imm, [HasDQI]>;
+defm : vextract_for_mask_cast<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info,
+ v4f64x_info, vextract256_extract,
+ EXTRACT_get_vextract256_imm, [HasAVX512]>;
+
+defm : vextract_for_mask_cast<"VEXTRACTI32x8Z", v8i64_info, v4i64x_info,
+ v8i32x_info, vextract256_extract,
+ EXTRACT_get_vextract256_imm, [HasDQI]>;
+defm : vextract_for_mask_cast<"VEXTRACTI32x8Z", v32i16_info, v16i16x_info,
+ v8i32x_info, vextract256_extract,
+ EXTRACT_get_vextract256_imm, [HasDQI]>;
+defm : vextract_for_mask_cast<"VEXTRACTI32x8Z", v64i8_info, v32i8x_info,
+ v8i32x_info, vextract256_extract,
+ EXTRACT_get_vextract256_imm, [HasDQI]>;
+defm : vextract_for_mask_cast<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info,
+ v4i64x_info, vextract256_extract,
+ EXTRACT_get_vextract256_imm, [HasAVX512]>;
+defm : vextract_for_mask_cast<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info,
+ v4i64x_info, vextract256_extract,
+ EXTRACT_get_vextract256_imm, [HasAVX512]>;
+defm : vextract_for_mask_cast<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info,
+ v4i64x_info, vextract256_extract,
+ EXTRACT_get_vextract256_imm, [HasAVX512]>;
+
+// vextractps - extract 32 bits from XMM
+def VEXTRACTPSZrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst),
+ (ins VR128X:$src1, u8imm:$src2),
+ "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>,
+ EVEX, VEX_WIG, Sched<[WriteVecExtract]>;
+
+def VEXTRACTPSZmr : AVX512AIi8<0x17, MRMDestMem, (outs),
+ (ins f32mem:$dst, VR128X:$src1, u8imm:$src2),
+ "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2),
+ addr:$dst)]>,
+ EVEX, VEX_WIG, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecExtractSt]>;
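+// For example:
+//   vextractps $2, %xmm0, %eax     (third 32-bit element into a GPR)
+//   vextractps $2, %xmm0, (%rdi)   (same element stored to memory)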
+
+//===---------------------------------------------------------------------===//
+// AVX-512 BROADCAST
+//---
+// Broadcast with a scalar argument.
+multiclass avx512_broadcast_scalar<bits<8> opc, string OpcodeStr,
+ string Name,
+ X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> {
+ def : Pat<(DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)),
+ (!cast<Instruction>(Name#DestInfo.ZSuffix#rr)
+ (SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>;
+ def : Pat<(DestInfo.VT (vselect_mask DestInfo.KRCWM:$mask,
+ (X86VBroadcast SrcInfo.FRC:$src),
+ DestInfo.RC:$src0)),
+ (!cast<Instruction>(Name#DestInfo.ZSuffix#rrk)
+ DestInfo.RC:$src0, DestInfo.KRCWM:$mask,
+ (SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>;
+ def : Pat<(DestInfo.VT (vselect_mask DestInfo.KRCWM:$mask,
+ (X86VBroadcast SrcInfo.FRC:$src),
+ DestInfo.ImmAllZerosV)),
+ (!cast<Instruction>(Name#DestInfo.ZSuffix#rrkz)
+ DestInfo.KRCWM:$mask, (SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>;
+}
+
+// Split version to allow mask and broadcast node to be different types. This
+// helps support the 32x2 broadcasts.
+multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
+ string Name,
+ SchedWrite SchedRR, SchedWrite SchedRM,
+ X86VectorVTInfo MaskInfo,
+ X86VectorVTInfo DestInfo,
+ X86VectorVTInfo SrcInfo,
+ bit IsConvertibleToThreeAddress,
+ SDPatternOperator UnmaskedOp = X86VBroadcast,
+ SDPatternOperator UnmaskedBcastOp = SrcInfo.BroadcastLdFrag> {
+ let hasSideEffects = 0 in
+ def rr : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst), (ins SrcInfo.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (UnmaskedOp (SrcInfo.VT SrcInfo.RC:$src))))))],
+ DestInfo.ExeDomain>, T8PD, EVEX, Sched<[SchedRR]>;
+ def rrkz : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst),
+ (ins MaskInfo.KRCWM:$mask, SrcInfo.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
+ "${dst} {${mask}} {z}, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (vselect_mask MaskInfo.KRCWM:$mask,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))),
+ MaskInfo.ImmAllZerosV))],
+ DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ, Sched<[SchedRR]>;
+ let Constraints = "$src0 = $dst" in
+ def rrk : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst),
+ (ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask,
+ SrcInfo.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|",
+ "${dst} {${mask}}, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (vselect_mask MaskInfo.KRCWM:$mask,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))),
+ MaskInfo.RC:$src0))],
+ DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K, Sched<[SchedRR]>;
+
+ let hasSideEffects = 0, mayLoad = 1 in
+ def rm : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
+ (ins SrcInfo.ScalarMemOp:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (UnmaskedBcastOp addr:$src)))))],
+ DestInfo.ExeDomain>, T8PD, EVEX,
+ EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
+
+ def rmkz : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
+ (ins MaskInfo.KRCWM:$mask, SrcInfo.ScalarMemOp:$src),
+ !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
+ "${dst} {${mask}} {z}, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (vselect_mask MaskInfo.KRCWM:$mask,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (SrcInfo.BroadcastLdFrag addr:$src)))),
+ MaskInfo.ImmAllZerosV))],
+ DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ,
+ EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
+
+ let Constraints = "$src0 = $dst",
+ isConvertibleToThreeAddress = IsConvertibleToThreeAddress in
+ def rmk : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
+ (ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask,
+ SrcInfo.ScalarMemOp:$src),
+ !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|",
+ "${dst} {${mask}}, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (vselect_mask MaskInfo.KRCWM:$mask,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (SrcInfo.BroadcastLdFrag addr:$src)))),
+ MaskInfo.RC:$src0))],
+ DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K,
+ EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
+}
+
+// Helper class to force the mask and broadcast result to the same type.
+multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr, string Name,
+ SchedWrite SchedRR, SchedWrite SchedRM,
+ X86VectorVTInfo DestInfo,
+ X86VectorVTInfo SrcInfo,
+ bit IsConvertibleToThreeAddress> :
+ avx512_broadcast_rm_split<opc, OpcodeStr, Name, SchedRR, SchedRM,
+ DestInfo, DestInfo, SrcInfo,
+ IsConvertibleToThreeAddress>;
+
+multiclass avx512_fp_broadcast_sd<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
+ WriteFShuffle256Ld, _.info512, _.info128, 1>,
+ avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info512,
+ _.info128>,
+ EVEX_V512;
+ }
+
+ let Predicates = [HasVLX] in {
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
+ WriteFShuffle256Ld, _.info256, _.info128, 1>,
+ avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info256,
+ _.info128>,
+ EVEX_V256;
+ }
+}
+
+multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
+ WriteFShuffle256Ld, _.info512, _.info128, 1>,
+ avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info512,
+ _.info128>,
+ EVEX_V512;
+ }
+
+ let Predicates = [HasVLX] in {
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
+ WriteFShuffle256Ld, _.info256, _.info128, 1>,
+ avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info256,
+ _.info128>,
+ EVEX_V256;
+ defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
+ WriteFShuffle256Ld, _.info128, _.info128, 1>,
+ avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info128,
+ _.info128>,
+ EVEX_V128;
+ }
+}
+defm VBROADCASTSS : avx512_fp_broadcast_ss<0x18, "vbroadcastss",
+ avx512vl_f32_info>;
+defm VBROADCASTSD : avx512_fp_broadcast_sd<0x19, "vbroadcastsd",
+ avx512vl_f64_info>, VEX_W1X;
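+// For example, the 512-bit vbroadcastss forms cover a register splat and an
+// (optionally masked) broadcast load:
+//   vbroadcastss %xmm0, %zmm1               (splat element 0 of xmm0)
+//   vbroadcastss (%rdi), %zmm1 {%k1} {z}    (zero-masked broadcast load)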
+
+multiclass avx512_int_broadcast_reg<bits<8> opc, SchedWrite SchedRR,
+ X86VectorVTInfo _, SDPatternOperator OpNode,
+ RegisterClass SrcRC> {
+ // Fold with a mask even if it has multiple uses since it is cheap.
+ let ExeDomain = _.ExeDomain in
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins SrcRC:$src),
+ "vpbroadcast"#_.Suffix, "$src", "$src",
+ (_.VT (OpNode SrcRC:$src)), /*IsCommutable*/0,
+ /*IsKCommutable*/0, /*IsKZCommutable*/0, vselect>,
+ T8PD, EVEX, Sched<[SchedRR]>;
+}
+
+multiclass avx512_int_broadcastbw_reg<bits<8> opc, string Name, SchedWrite SchedRR,
+ X86VectorVTInfo _, SDPatternOperator OpNode,
+ RegisterClass SrcRC, SubRegIndex Subreg> {
+ let hasSideEffects = 0, ExeDomain = _.ExeDomain in
+ defm rr : AVX512_maskable_custom<opc, MRMSrcReg,
+ (outs _.RC:$dst), (ins GR32:$src),
+ !con((ins _.RC:$src0, _.KRCWM:$mask), (ins GR32:$src)),
+ !con((ins _.KRCWM:$mask), (ins GR32:$src)),
+ "vpbroadcast"#_.Suffix, "$src", "$src", [], [], [],
+ "$src0 = $dst">, T8PD, EVEX, Sched<[SchedRR]>;
+
+ def : Pat <(_.VT (OpNode SrcRC:$src)),
+ (!cast<Instruction>(Name#rr)
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>;
+
+ // Fold with a mask even if it has multiple uses since it is cheap.
+ def : Pat <(vselect _.KRCWM:$mask, (_.VT (OpNode SrcRC:$src)), _.RC:$src0),
+ (!cast<Instruction>(Name#rrk) _.RC:$src0, _.KRCWM:$mask,
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>;
+
+ def : Pat <(vselect _.KRCWM:$mask, (_.VT (OpNode SrcRC:$src)), _.ImmAllZerosV),
+ (!cast<Instruction>(Name#rrkz) _.KRCWM:$mask,
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>;
+}
+
+multiclass avx512_int_broadcastbw_reg_vl<bits<8> opc, string Name,
+ AVX512VLVectorVTInfo _, SDPatternOperator OpNode,
+ RegisterClass SrcRC, SubRegIndex Subreg, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_int_broadcastbw_reg<opc, Name#Z, WriteShuffle256, _.info512,
+ OpNode, SrcRC, Subreg>, EVEX_V512;
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_int_broadcastbw_reg<opc, Name#Z256, WriteShuffle256,
+ _.info256, OpNode, SrcRC, Subreg>, EVEX_V256;
+ defm Z128 : avx512_int_broadcastbw_reg<opc, Name#Z128, WriteShuffle,
+ _.info128, OpNode, SrcRC, Subreg>, EVEX_V128;
+ }
+}
+
+multiclass avx512_int_broadcast_reg_vl<bits<8> opc, AVX512VLVectorVTInfo _,
+ SDPatternOperator OpNode,
+ RegisterClass SrcRC, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_int_broadcast_reg<opc, WriteShuffle256, _.info512, OpNode,
+ SrcRC>, EVEX_V512;
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_int_broadcast_reg<opc, WriteShuffle256, _.info256, OpNode,
+ SrcRC>, EVEX_V256;
+ defm Z128 : avx512_int_broadcast_reg<opc, WriteShuffle, _.info128, OpNode,
+ SrcRC>, EVEX_V128;
+ }
+}
+
+defm VPBROADCASTBr : avx512_int_broadcastbw_reg_vl<0x7A, "VPBROADCASTBr",
+ avx512vl_i8_info, X86VBroadcast, GR8, sub_8bit, HasBWI>;
+defm VPBROADCASTWr : avx512_int_broadcastbw_reg_vl<0x7B, "VPBROADCASTWr",
+ avx512vl_i16_info, X86VBroadcast, GR16, sub_16bit,
+ HasBWI>;
+defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info,
+ X86VBroadcast, GR32, HasAVX512>;
+defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info,
+ X86VBroadcast, GR64, HasAVX512>, VEX_W;
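+// For example, the GPR broadcasts splat a general-purpose register into every
+// element. The byte/word variants encode a 32-bit register operand; the
+// patterns above insert the 8/16-bit source into it via INSERT_SUBREG:
+//   vpbroadcastd %eax, %zmm0
+//   vpbroadcastb %eax, %xmm1 {%k1}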
+
+multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _, Predicate prd,
+ bit IsConvertibleToThreeAddress> {
+ let Predicates = [prd] in {
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle256,
+ WriteShuffle256Ld, _.info512, _.info128,
+ IsConvertibleToThreeAddress>,
+ EVEX_V512;
+ }
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle256,
+ WriteShuffle256Ld, _.info256, _.info128,
+ IsConvertibleToThreeAddress>,
+ EVEX_V256;
+ defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle,
+ WriteShuffleXLd, _.info128, _.info128,
+ IsConvertibleToThreeAddress>,
+ EVEX_V128;
+ }
+}
+
+defm VPBROADCASTB : avx512_int_broadcast_rm_vl<0x78, "vpbroadcastb",
+ avx512vl_i8_info, HasBWI, 0>;
+defm VPBROADCASTW : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw",
+ avx512vl_i16_info, HasBWI, 0>;
+defm VPBROADCASTD : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd",
+ avx512vl_i32_info, HasAVX512, 1>;
+defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq",
+ avx512vl_i64_info, HasAVX512, 1>, VEX_W1X;
+
+multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode,
+ X86VectorVTInfo _Dst,
+ X86VectorVTInfo _Src> {
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
+ (_Dst.VT (OpNode addr:$src))>,
+ Sched<[SchedWriteShuffle.YMM.Folded]>,
+ AVX5128IBase, EVEX;
+}
+
+// This should be used for the AVX512DQ broadcast instructions. It disables
+// the unmasked patterns so that we only use the DQ instructions when masking
+// is requested.
+multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode,
+ X86VectorVTInfo _Dst,
+ X86VectorVTInfo _Src> {
+ let hasSideEffects = 0, mayLoad = 1 in
+ defm rm : AVX512_maskable_split<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
+ (null_frag),
+ (_Dst.VT (OpNode addr:$src))>,
+ Sched<[SchedWriteShuffle.YMM.Folded]>,
+ AVX5128IBase, EVEX;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 BROADCAST SUBVECTORS
+//
+
+defm VBROADCASTI32X4 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
+ X86SubVBroadcastld128, v16i32_info, v4i32x_info>,
+ EVEX_V512, EVEX_CD8<32, CD8VT4>;
+defm VBROADCASTF32X4 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4",
+ X86SubVBroadcastld128, v16f32_info, v4f32x_info>,
+ EVEX_V512, EVEX_CD8<32, CD8VT4>;
+defm VBROADCASTI64X4 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti64x4",
+ X86SubVBroadcastld256, v8i64_info, v4i64x_info>, VEX_W,
+ EVEX_V512, EVEX_CD8<64, CD8VT4>;
+defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4",
+ X86SubVBroadcastld256, v8f64_info, v4f64x_info>, VEX_W,
+ EVEX_V512, EVEX_CD8<64, CD8VT4>;
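+// For example, the subvector broadcasts load 128 or 256 bits and repeat them
+// to fill the destination:
+//   vbroadcasti32x4 (%rdi), %zmm0            (four copies of a 128-bit load)
+//   vbroadcastf64x4 (%rdi), %zmm0 {%k1} {z}  (two copies of a 256-bit load)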
+
+let Predicates = [HasAVX512] in {
+def : Pat<(v8f64 (X86SubVBroadcastld256 addr:$src)),
+ (VBROADCASTF64X4rm addr:$src)>;
+def : Pat<(v16f32 (X86SubVBroadcastld256 addr:$src)),
+ (VBROADCASTF64X4rm addr:$src)>;
+def : Pat<(v8i64 (X86SubVBroadcastld256 addr:$src)),
+ (VBROADCASTI64X4rm addr:$src)>;
+def : Pat<(v16i32 (X86SubVBroadcastld256 addr:$src)),
+ (VBROADCASTI64X4rm addr:$src)>;
+def : Pat<(v32i16 (X86SubVBroadcastld256 addr:$src)),
+ (VBROADCASTI64X4rm addr:$src)>;
+def : Pat<(v64i8 (X86SubVBroadcastld256 addr:$src)),
+ (VBROADCASTI64X4rm addr:$src)>;
+
+def : Pat<(v8f64 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF32X4rm addr:$src)>;
+def : Pat<(v16f32 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF32X4rm addr:$src)>;
+def : Pat<(v8i64 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTI32X4rm addr:$src)>;
+def : Pat<(v16i32 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTI32X4rm addr:$src)>;
+def : Pat<(v32i16 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTI32X4rm addr:$src)>;
+def : Pat<(v64i8 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTI32X4rm addr:$src)>;
+
+// Patterns for selects of bitcasted operations.
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16f32 (v8f64 (X86SubVBroadcastld128 addr:$src))),
+ (v16f32 immAllZerosV)),
+ (VBROADCASTF32X4rmkz VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16f32 (v8f64 (X86SubVBroadcastld128 addr:$src))),
+ VR512:$src0),
+ (VBROADCASTF32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16i32 (v8i64 (X86SubVBroadcastld128 addr:$src))),
+ (v16i32 immAllZerosV)),
+ (VBROADCASTI32X4rmkz VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16i32 (v8i64 (X86SubVBroadcastld128 addr:$src))),
+ VR512:$src0),
+ (VBROADCASTI32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
+
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8f64 (v16f32 (X86SubVBroadcastld256 addr:$src))),
+ (v8f64 immAllZerosV)),
+ (VBROADCASTF64X4rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8f64 (v16f32 (X86SubVBroadcastld256 addr:$src))),
+ VR512:$src0),
+ (VBROADCASTF64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8i64 (v16i32 (X86SubVBroadcastld256 addr:$src))),
+ (v8i64 immAllZerosV)),
+ (VBROADCASTI64X4rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8i64 (v16i32 (X86SubVBroadcastld256 addr:$src))),
+ VR512:$src0),
+ (VBROADCASTI64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
+}
+
+let Predicates = [HasVLX] in {
+defm VBROADCASTI32X4Z256 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
+ X86SubVBroadcastld128, v8i32x_info, v4i32x_info>,
+ EVEX_V256, EVEX_CD8<32, CD8VT4>;
+defm VBROADCASTF32X4Z256 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4",
+ X86SubVBroadcastld128, v8f32x_info, v4f32x_info>,
+ EVEX_V256, EVEX_CD8<32, CD8VT4>;
+
+def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF32X4Z256rm addr:$src)>;
+def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF32X4Z256rm addr:$src)>;
+def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTI32X4Z256rm addr:$src)>;
+def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTI32X4Z256rm addr:$src)>;
+def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTI32X4Z256rm addr:$src)>;
+def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTI32X4Z256rm addr:$src)>;
+
+// Patterns for selects of bitcasted operations.
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8f32 (v4f64 (X86SubVBroadcastld128 addr:$src))),
+ (v8f32 immAllZerosV)),
+ (VBROADCASTF32X4Z256rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8f32 (v4f64 (X86SubVBroadcastld128 addr:$src))),
+ VR256X:$src0),
+ (VBROADCASTF32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8i32 (v4i64 (X86SubVBroadcastld128 addr:$src))),
+ (v8i32 immAllZerosV)),
+ (VBROADCASTI32X4Z256rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8i32 (v4i64 (X86SubVBroadcastld128 addr:$src))),
+ VR256X:$src0),
+ (VBROADCASTI32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>;
+}
+
+let Predicates = [HasVLX, HasDQI] in {
+defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2",
+ X86SubVBroadcastld128, v4i64x_info, v2i64x_info>, VEX_W1X,
+ EVEX_V256, EVEX_CD8<64, CD8VT2>;
+defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2",
+ X86SubVBroadcastld128, v4f64x_info, v2f64x_info>, VEX_W1X,
+ EVEX_V256, EVEX_CD8<64, CD8VT2>;
+
+// Patterns for selects of bitcasted operations.
+def : Pat<(vselect_mask VK4WM:$mask,
+ (bc_v4f64 (v8f32 (X86SubVBroadcastld128 addr:$src))),
+ (v4f64 immAllZerosV)),
+ (VBROADCASTF64X2Z128rmkz VK4WM:$mask, addr:$src)>;
+def : Pat<(vselect_mask VK4WM:$mask,
+ (bc_v4f64 (v8f32 (X86SubVBroadcastld128 addr:$src))),
+ VR256X:$src0),
+ (VBROADCASTF64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
+def : Pat<(vselect_mask VK4WM:$mask,
+ (bc_v4i64 (v8i32 (X86SubVBroadcastld128 addr:$src))),
+ (v4i64 immAllZerosV)),
+ (VBROADCASTI64X2Z128rmkz VK4WM:$mask, addr:$src)>;
+def : Pat<(vselect_mask VK4WM:$mask,
+ (bc_v4i64 (v8i32 (X86SubVBroadcastld128 addr:$src))),
+ VR256X:$src0),
+ (VBROADCASTI64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
+}
+
+let Predicates = [HasDQI] in {
+defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2",
+ X86SubVBroadcastld128, v8i64_info, v2i64x_info>, VEX_W,
+ EVEX_V512, EVEX_CD8<64, CD8VT2>;
+defm VBROADCASTI32X8 : avx512_subvec_broadcast_rm_dq<0x5b, "vbroadcasti32x8",
+ X86SubVBroadcastld256, v16i32_info, v8i32x_info>,
+ EVEX_V512, EVEX_CD8<32, CD8VT8>;
+defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2",
+ X86SubVBroadcastld128, v8f64_info, v2f64x_info>, VEX_W,
+ EVEX_V512, EVEX_CD8<64, CD8VT2>;
+defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8",
+ X86SubVBroadcastld256, v16f32_info, v8f32x_info>,
+ EVEX_V512, EVEX_CD8<32, CD8VT8>;
+
+// Patterns for selects of bitcasted operations.
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16f32 (v8f64 (X86SubVBroadcastld256 addr:$src))),
+ (v16f32 immAllZerosV)),
+ (VBROADCASTF32X8rmkz VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16f32 (v8f64 (X86SubVBroadcastld256 addr:$src))),
+ VR512:$src0),
+ (VBROADCASTF32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16i32 (v8i64 (X86SubVBroadcastld256 addr:$src))),
+ (v16i32 immAllZerosV)),
+ (VBROADCASTI32X8rmkz VK16WM:$mask, addr:$src)>;
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16i32 (v8i64 (X86SubVBroadcastld256 addr:$src))),
+ VR512:$src0),
+ (VBROADCASTI32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
+
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8f64 (v16f32 (X86SubVBroadcastld128 addr:$src))),
+ (v8f64 immAllZerosV)),
+ (VBROADCASTF64X2rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8f64 (v16f32 (X86SubVBroadcastld128 addr:$src))),
+ VR512:$src0),
+ (VBROADCASTF64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8i64 (v16i32 (X86SubVBroadcastld128 addr:$src))),
+ (v8i64 immAllZerosV)),
+ (VBROADCASTI64X2rmkz VK8WM:$mask, addr:$src)>;
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8i64 (v16i32 (X86SubVBroadcastld128 addr:$src))),
+ VR512:$src0),
+ (VBROADCASTI64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
+}
+
+multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> {
+ let Predicates = [HasDQI] in
+ defm Z : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle256,
+ WriteShuffle256Ld, _Dst.info512,
+ _Src.info512, _Src.info128, 0, null_frag, null_frag>,
+ EVEX_V512;
+ let Predicates = [HasDQI, HasVLX] in
+ defm Z256 : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle256,
+ WriteShuffle256Ld, _Dst.info256,
+ _Src.info256, _Src.info128, 0, null_frag, null_frag>,
+ EVEX_V256;
+}
+
+multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> :
+ avx512_common_broadcast_32x2<opc, OpcodeStr, _Dst, _Src> {
+
+ let Predicates = [HasDQI, HasVLX] in
+ defm Z128 : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle,
+ WriteShuffleXLd, _Dst.info128,
+ _Src.info128, _Src.info128, 0, null_frag, null_frag>,
+ EVEX_V128;
+}
+
+defm VBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2",
+ avx512vl_i32_info, avx512vl_i64_info>;
+defm VBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2",
+ avx512vl_f32_info, avx512vl_f64_info>;
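+// For example, vbroadcasti32x2 repeats the low two 32-bit elements (one
+// 64-bit chunk) of the source across the destination:
+//   vbroadcasti32x2 %xmm0, %zmm1
+//   vbroadcasti32x2 (%rdi), %ymm1 {%k1} {z}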
+
+//===----------------------------------------------------------------------===//
+// AVX-512 BROADCAST MASK TO VECTOR REGISTER
+//---
+multiclass avx512_mask_broadcastm<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _, RegisterClass KRC> {
+ def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.RC:$dst), (ins KRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))]>,
+ EVEX, Sched<[WriteShuffle]>;
+}
+
+multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo, RegisterClass KRC> {
+ let Predicates = [HasCDI] in
+ defm Z : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info512, KRC>, EVEX_V512;
+ let Predicates = [HasCDI, HasVLX] in {
+ defm Z256 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info256, KRC>, EVEX_V256;
+ defm Z128 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info128, KRC>, EVEX_V128;
+ }
+}
+
+defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d",
+ avx512vl_i32_info, VK16>;
+defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q",
+ avx512vl_i64_info, VK8>, VEX_W;
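+// For example, vpbroadcastmw2d copies the zero-extended 16-bit contents of a
+// mask register into every 32-bit element:
+//   vpbroadcastmw2d %k1, %zmm0
+//   vpbroadcastmb2q %k2, %ymm1    (8-bit mask into every 64-bit element)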
+
+//===----------------------------------------------------------------------===//
+// -- VPERMI2 - 3 source operands form --
+multiclass avx512_perm_i<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
+let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain,
+ hasSideEffects = 0 in {
+ defm rr: AVX512_maskable_3src_cast<opc, MRMSrcReg, _, IdxVT, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (X86VPermt2 _.RC:$src2, IdxVT.RC:$src1, _.RC:$src3)), 1>,
+ EVEX_4V, AVX5128IBase, Sched<[sched]>;
+
+ let mayLoad = 1 in
+ defm rm: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (X86VPermt2 _.RC:$src2, IdxVT.RC:$src1,
+ (_.VT (_.LdFrag addr:$src3)))), 1>,
+ EVEX_4V, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain,
+ hasSideEffects = 0, mayLoad = 1 in
+ defm rmb: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
+ !strconcat("$src2, ${src3}", _.BroadcastStr ),
+ (_.VT (X86VPermt2 _.RC:$src2,
+ IdxVT.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3)))), 1>,
+ AVX5128IBase, EVEX_4V, EVEX_B,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ AVX512VLVectorVTInfo VTInfo,
+ AVX512VLVectorVTInfo ShuffleMask> {
+ defm NAME: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info512,
+ ShuffleMask.info512>,
+ avx512_perm_i_mb<opc, OpcodeStr, sched, VTInfo.info512,
+ ShuffleMask.info512>, EVEX_V512;
+ let Predicates = [HasVLX] in {
+ defm NAME#128: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info128,
+ ShuffleMask.info128>,
+ avx512_perm_i_mb<opc, OpcodeStr, sched, VTInfo.info128,
+ ShuffleMask.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info256,
+ ShuffleMask.info256>,
+ avx512_perm_i_mb<opc, OpcodeStr, sched, VTInfo.info256,
+ ShuffleMask.info256>, EVEX_V256;
+ }
+}
+
+multiclass avx512_perm_i_sizes_bw<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ AVX512VLVectorVTInfo VTInfo,
+ AVX512VLVectorVTInfo Idx,
+ Predicate Prd> {
+ let Predicates = [Prd] in
+ defm NAME: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info512,
+ Idx.info512>, EVEX_V512;
+ let Predicates = [Prd, HasVLX] in {
+ defm NAME#128: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info128,
+ Idx.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info256,
+ Idx.info256>, EVEX_V256;
+ }
+}
+
+defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d", WriteVarShuffle256,
+ avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q", WriteVarShuffle256,
+ avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMI2W : avx512_perm_i_sizes_bw<0x75, "vpermi2w", WriteVarShuffle256,
+ avx512vl_i16_info, avx512vl_i16_info, HasBWI>,
+ VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPERMI2B : avx512_perm_i_sizes_bw<0x75, "vpermi2b", WriteVarShuffle256,
+ avx512vl_i8_info, avx512vl_i8_info, HasVBMI>,
+ EVEX_CD8<8, CD8VF>;
+defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps", WriteFVarShuffle256,
+ avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd", WriteFVarShuffle256,
+ avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
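+// For example, vpermi2d selects each destination element from the
+// concatenation of the two data sources, using the index vector that
+// initially occupies the destination register (and is overwritten):
+//   vpermi2d %zmm2, %zmm1, %zmm0    (zmm0 = indices; zmm1, zmm2 = data)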
+
+// Extra patterns to handle the additional bitcasts that arise when the
+// passthru and index operands have different types in the FP versions.
+multiclass avx512_perm_i_lowering<string InstrStr, X86VectorVTInfo _,
+ X86VectorVTInfo IdxVT,
+ X86VectorVTInfo CastVT> {
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (X86VPermt2 (_.VT _.RC:$src2),
+ (IdxVT.VT (bitconvert
+ (CastVT.VT _.RC:$src1))),
+ _.RC:$src3),
+ (_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
+ (!cast<Instruction>(InstrStr#"rrk") _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, _.RC:$src3)>;
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (X86VPermt2 _.RC:$src2,
+ (IdxVT.VT (bitconvert
+ (CastVT.VT _.RC:$src1))),
+ (_.LdFrag addr:$src3)),
+ (_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
+ (!cast<Instruction>(InstrStr#"rmk") _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3)>;
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (X86VPermt2 _.RC:$src2,
+ (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))),
+ (_.BroadcastLdFrag addr:$src3)),
+ (_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
+ (!cast<Instruction>(InstrStr#"rmbk") _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3)>;
+}
+
+// TODO: Should we add more casts? The vXi64 case is common due to ABI.
+defm : avx512_perm_i_lowering<"VPERMI2PS", v16f32_info, v16i32_info, v8i64_info>;
+defm : avx512_perm_i_lowering<"VPERMI2PS256", v8f32x_info, v8i32x_info, v4i64x_info>;
+defm : avx512_perm_i_lowering<"VPERMI2PS128", v4f32x_info, v4i32x_info, v2i64x_info>;
+
+// VPERMT2
+multiclass avx512_perm_t<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
+let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+ defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins IdxVT.RC:$src2, _.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3)), 1>,
+ EVEX_4V, AVX5128IBase, Sched<[sched]>;
+
+ defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins IdxVT.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2,
+ (_.LdFrag addr:$src3))), 1>,
+ EVEX_4V, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
+ defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins IdxVT.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
+ !strconcat("$src2, ${src3}", _.BroadcastStr ),
+ (_.VT (X86VPermt2 _.RC:$src1,
+ IdxVT.RC:$src2,(_.VT (_.BroadcastLdFrag addr:$src3)))), 1>,
+ AVX5128IBase, EVEX_4V, EVEX_B,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ AVX512VLVectorVTInfo VTInfo,
+ AVX512VLVectorVTInfo ShuffleMask> {
+ defm NAME: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info512,
+ ShuffleMask.info512>,
+ avx512_perm_t_mb<opc, OpcodeStr, sched, VTInfo.info512,
+ ShuffleMask.info512>, EVEX_V512;
+ let Predicates = [HasVLX] in {
+ defm NAME#128: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info128,
+ ShuffleMask.info128>,
+ avx512_perm_t_mb<opc, OpcodeStr, sched, VTInfo.info128,
+ ShuffleMask.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info256,
+ ShuffleMask.info256>,
+ avx512_perm_t_mb<opc, OpcodeStr, sched, VTInfo.info256,
+ ShuffleMask.info256>, EVEX_V256;
+ }
+}
+
+multiclass avx512_perm_t_sizes_bw<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ AVX512VLVectorVTInfo VTInfo,
+ AVX512VLVectorVTInfo Idx, Predicate Prd> {
+ let Predicates = [Prd] in
+ defm NAME: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info512,
+ Idx.info512>, EVEX_V512;
+ let Predicates = [Prd, HasVLX] in {
+ defm NAME#128: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info128,
+ Idx.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info256,
+ Idx.info256>, EVEX_V256;
+ }
+}
+
+defm VPERMT2D : avx512_perm_t_sizes<0x7E, "vpermt2d", WriteVarShuffle256,
+ avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMT2Q : avx512_perm_t_sizes<0x7E, "vpermt2q", WriteVarShuffle256,
+ avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMT2W : avx512_perm_t_sizes_bw<0x7D, "vpermt2w", WriteVarShuffle256,
+ avx512vl_i16_info, avx512vl_i16_info, HasBWI>,
+ VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPERMT2B : avx512_perm_t_sizes_bw<0x7D, "vpermt2b", WriteVarShuffle256,
+ avx512vl_i8_info, avx512vl_i8_info, HasVBMI>,
+ EVEX_CD8<8, CD8VF>;
+defm VPERMT2PS : avx512_perm_t_sizes<0x7F, "vpermt2ps", WriteFVarShuffle256,
+ avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd", WriteFVarShuffle256,
+ avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
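+// For example, vpermt2d performs the same two-source shuffle as vpermi2d, but
+// here the destination register initially holds the first data source and the
+// indices come from a separate operand:
+//   vpermt2d %zmm2, %zmm1, %zmm0    (zmm0 = first table and result;
+//                                    zmm1 = indices; zmm2 = second table)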
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - BLEND using mask
+//
+
+multiclass WriteFVarBlendask<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
+ def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"), []>,
+ EVEX_4V, Sched<[sched]>;
+ def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
+ []>, EVEX_4V, EVEX_K, Sched<[sched]>;
+ def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
+ []>, EVEX_4V, EVEX_KZ, Sched<[sched]>, NotMemoryFoldable;
+ let mayLoad = 1 in {
+ def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"),
+ []>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
+ []>, EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
+ []>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, NotMemoryFoldable;
+ }
+ }
+}
+multiclass WriteFVarBlendask_rmb<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain, mayLoad = 1, hasSideEffects = 0 in {
+ def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2),
+ !strconcat(OpcodeStr,
+ "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), []>,
+ EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+
+ def rmbkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2),
+ !strconcat(OpcodeStr,
+ "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}} {z}|",
+ "$dst {${mask}} {z}, $src1, ${src2}", _.BroadcastStr, "}"), []>,
+ EVEX_4V, EVEX_KZ, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, NotMemoryFoldable;
+
+ def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2),
+ !strconcat(OpcodeStr,
+ "\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
+ "$dst, $src1, ${src2}", _.BroadcastStr, "}"), []>,
+ EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+multiclass blendmask_dq<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo VTInfo> {
+ defm Z : WriteFVarBlendask<opc, OpcodeStr, sched.ZMM, VTInfo.info512>,
+ WriteFVarBlendask_rmb<opc, OpcodeStr, sched.ZMM, VTInfo.info512>,
+ EVEX_V512;
+
+ let Predicates = [HasVLX] in {
+ defm Z256 : WriteFVarBlendask<opc, OpcodeStr, sched.YMM, VTInfo.info256>,
+ WriteFVarBlendask_rmb<opc, OpcodeStr, sched.YMM, VTInfo.info256>,
+ EVEX_V256;
+ defm Z128 : WriteFVarBlendask<opc, OpcodeStr, sched.XMM, VTInfo.info128>,
+ WriteFVarBlendask_rmb<opc, OpcodeStr, sched.XMM, VTInfo.info128>,
+ EVEX_V128;
+ }
+}
+
+multiclass blendmask_bw<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo VTInfo> {
+ let Predicates = [HasBWI] in
+ defm Z : WriteFVarBlendask<opc, OpcodeStr, sched.ZMM, VTInfo.info512>,
+ EVEX_V512;
+
+ let Predicates = [HasBWI, HasVLX] in {
+ defm Z256 : WriteFVarBlendask<opc, OpcodeStr, sched.YMM, VTInfo.info256>,
+ EVEX_V256;
+ defm Z128 : WriteFVarBlendask<opc, OpcodeStr, sched.XMM, VTInfo.info128>,
+ EVEX_V128;
+ }
+}
+
+defm VBLENDMPS : blendmask_dq<0x65, "vblendmps", SchedWriteFVarBlend,
+ avx512vl_f32_info>;
+defm VBLENDMPD : blendmask_dq<0x65, "vblendmpd", SchedWriteFVarBlend,
+ avx512vl_f64_info>, VEX_W;
+defm VPBLENDMD : blendmask_dq<0x64, "vpblendmd", SchedWriteVarBlend,
+ avx512vl_i32_info>;
+defm VPBLENDMQ : blendmask_dq<0x64, "vpblendmq", SchedWriteVarBlend,
+ avx512vl_i64_info>, VEX_W;
+defm VPBLENDMB : blendmask_bw<0x66, "vpblendmb", SchedWriteVarBlend,
+ avx512vl_i8_info>;
+defm VPBLENDMW : blendmask_bw<0x66, "vpblendmw", SchedWriteVarBlend,
+ avx512vl_i16_info>, VEX_W;
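+// For example, the blend takes elements from the second source where the mask
+// bit is set and from the first source otherwise:
+//   vblendmps %zmm2, %zmm1, %zmm0 {%k1}          zmm0[i] = k1[i] ? zmm2[i] : zmm1[i]
+//   vblendmpd (%rdi){1to8}, %zmm1, %zmm0 {%k1}   (broadcast form, D/Q elements only)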
+
+//===----------------------------------------------------------------------===//
+// Compare Instructions
+//===----------------------------------------------------------------------===//
+
+// avx512_cmp_scalar - AVX512 CMPSS and CMPSD
+
+multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
+ PatFrag OpNode_su, PatFrag OpNodeSAE_su,
+ X86FoldableSchedWrite sched> {
+ defm rr_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, $src2, $src1", "$src1, $src2, $cc",
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc),
+ (OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ timm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>, SIMD_EXC;
+ let mayLoad = 1 in
+ defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, $src2, $src1", "$src1, $src2, $cc",
+ (OpNode (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2),
+ timm:$cc),
+ (OpNode_su (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2),
+ timm:$cc)>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+
+ let Uses = [MXCSR] in
+ defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc",
+ (OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ timm:$cc),
+ (OpNodeSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ timm:$cc)>,
+ EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>;
+
+ let isCodeGenOnly = 1 in {
+ let isCommutable = 1 in
+ def rr : AVX512Ii8<0xC2, MRMSrcReg,
+ (outs _.KRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, u8imm:$cc),
+ !strconcat("vcmp", _.Suffix,
+ "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
+ [(set _.KRC:$dst, (OpNode _.FRC:$src1,
+ _.FRC:$src2,
+ timm:$cc))]>,
+ EVEX_4V, VEX_LIG, Sched<[sched]>, SIMD_EXC;
+ def rm : AVX512Ii8<0xC2, MRMSrcMem,
+ (outs _.KRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
+ !strconcat("vcmp", _.Suffix,
+ "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
+ [(set _.KRC:$dst, (OpNode _.FRC:$src1,
+ (_.ScalarLdFrag addr:$src2),
+ timm:$cc))]>,
+ EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+ }
+}
+
+def X86cmpms_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (X86cmpms node:$src1, node:$src2, node:$cc), [{
+ return N->hasOneUse();
+}]>;
+def X86cmpmsSAE_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (X86cmpmsSAE node:$src1, node:$src2, node:$cc), [{
+ return N->hasOneUse();
+}]>;
+
+let Predicates = [HasAVX512] in {
+ let ExeDomain = SSEPackedSingle in
+ defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsSAE,
+ X86cmpms_su, X86cmpmsSAE_su,
+ SchedWriteFCmp.Scl>, AVX512XSIi8Base;
+ let ExeDomain = SSEPackedDouble in
+ defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsSAE,
+ X86cmpms_su, X86cmpmsSAE_su,
+ SchedWriteFCmp.Scl>, AVX512XDIi8Base, VEX_W;
+}
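+// For example, the scalar compares write a single result bit into a mask
+// register; the immediate selects the predicate:
+//   vcmpss $0, %xmm1, %xmm0, %k0            (k0[0] = (xmm0[0] == xmm1[0]))
+//   vcmpsd $1, (%rdi), %xmm0, %k0 {%k1}     (predicate 1 = less-than)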
+
+multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, bit IsCommutable> {
+ let isCommutable = IsCommutable, hasSideEffects = 0 in
+ def rr : AVX512BI<opc, MRMSrcReg,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, EVEX_4V, Sched<[sched]>;
+ let mayLoad = 1, hasSideEffects = 0 in
+ def rm : AVX512BI<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ let isCommutable = IsCommutable, hasSideEffects = 0 in
+ def rrk : AVX512BI<opc, MRMSrcReg,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, $src2}"),
+ []>, EVEX_4V, EVEX_K, Sched<[sched]>;
+ let mayLoad = 1, hasSideEffects = 0 in
+ def rmk : AVX512BI<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, $src2}"),
+ []>, EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ bit IsCommutable> :
+ avx512_icmp_packed<opc, OpcodeStr, sched, _, IsCommutable> {
+ let mayLoad = 1, hasSideEffects = 0 in {
+ def rmb : AVX512BI<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2),
+ !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst",
+ "|$dst, $src1, ${src2}", _.BroadcastStr, "}"),
+ []>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ def rmbk : AVX512BI<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
+ _.ScalarMemOp:$src2),
+ !strconcat(OpcodeStr,
+ "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
+ []>, EVEX_4V, EVEX_K, EVEX_B,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd,
+ bit IsCommutable = 0> {
+ let Predicates = [prd] in
+ defm Z : avx512_icmp_packed<opc, OpcodeStr, sched.ZMM,
+ VTInfo.info512, IsCommutable>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_icmp_packed<opc, OpcodeStr, sched.YMM,
+ VTInfo.info256, IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_icmp_packed<opc, OpcodeStr, sched.XMM,
+ VTInfo.info128, IsCommutable>, EVEX_V128;
+ }
+}
+
+multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo VTInfo,
+ Predicate prd, bit IsCommutable = 0> {
+ let Predicates = [prd] in
+ defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, sched.ZMM,
+ VTInfo.info512, IsCommutable>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, sched.YMM,
+ VTInfo.info256, IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, sched.XMM,
+ VTInfo.info128, IsCommutable>, EVEX_V128;
+ }
+}
+
+// This fragment treats X86cmpm as commutable to help match loads in both
+// operands for PCMPEQ.
+def X86setcc_commute : SDNode<"ISD::SETCC", SDTSetCC, [SDNPCommutative]>;
+def X86pcmpgtm : PatFrag<(ops node:$src1, node:$src2),
+ (setcc node:$src1, node:$src2, SETGT)>;
+
+// AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
+// increase the pattern complexity the way an immediate would.
+let AddedComplexity = 2 in {
+// FIXME: Is there a better scheduler class for VPCMP?
+defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb",
+ SchedWriteVecALU, avx512vl_i8_info, HasBWI, 1>,
+ EVEX_CD8<8, CD8VF>, VEX_WIG;
+
+defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw",
+ SchedWriteVecALU, avx512vl_i16_info, HasBWI, 1>,
+ EVEX_CD8<16, CD8VF>, VEX_WIG;
+
+defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd",
+ SchedWriteVecALU, avx512vl_i32_info, HasAVX512, 1>,
+ EVEX_CD8<32, CD8VF>;
+
+defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq",
+ SchedWriteVecALU, avx512vl_i64_info, HasAVX512, 1>,
+ T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb",
+ SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
+ EVEX_CD8<8, CD8VF>, VEX_WIG;
+
+defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw",
+ SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
+ EVEX_CD8<16, CD8VF>, VEX_WIG;
+
+defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd",
+ SchedWriteVecALU, avx512vl_i32_info, HasAVX512>,
+ EVEX_CD8<32, CD8VF>;
+
+defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq",
+ SchedWriteVecALU, avx512vl_i64_info, HasAVX512>,
+ T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
+}
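+// For example:
+//   vpcmpeqd %zmm1, %zmm0, %k1               (k1[i] = (zmm0[i] == zmm1[i]))
+//   vpcmpgtq (%rdi){1to8}, %zmm0, %k2 {%k1}  (broadcast form, masked)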
+
+multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
+ PatFrag Frag_su, PatFrag CommFrag, PatFrag CommFrag_su,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, string Name> {
+ let isCommutable = 1 in
+ def rri : AVX512AIi8<opc, MRMSrcReg,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
+ [(set _.KRC:$dst, (_.KVT (Frag:$cc (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ cond)))]>,
+ EVEX_4V, Sched<[sched]>;
+ def rmi : AVX512AIi8<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
+ [(set _.KRC:$dst, (_.KVT
+ (Frag:$cc
+ (_.VT _.RC:$src1),
+ (_.VT (_.LdFrag addr:$src2)),
+ cond)))]>,
+ EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ let isCommutable = 1 in
+ def rrik : AVX512AIi8<opc, MRMSrcReg,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
+ u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, $src2, $cc}"),
+ [(set _.KRC:$dst, (and _.KRCWM:$mask,
+ (_.KVT (Frag_su:$cc (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ cond))))]>,
+ EVEX_4V, EVEX_K, Sched<[sched]>;
+ def rmik : AVX512AIi8<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
+ u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, $src2, $cc}"),
+ [(set _.KRC:$dst, (and _.KRCWM:$mask,
+ (_.KVT
+ (Frag_su:$cc
+ (_.VT _.RC:$src1),
+ (_.VT (_.LdFrag addr:$src2)),
+ cond))))]>,
+ EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
+
+ def : Pat<(_.KVT (CommFrag:$cc (_.LdFrag addr:$src2),
+ (_.VT _.RC:$src1), cond)),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmi")
+ _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>;
+
+ def : Pat<(and _.KRCWM:$mask,
+ (_.KVT (CommFrag_su:$cc (_.LdFrag addr:$src2),
+ (_.VT _.RC:$src1), cond))),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmik")
+ _.KRCWM:$mask, _.RC:$src1, addr:$src2,
+ (CommFrag.OperandTransform $cc))>;
+}
+
+multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag,
+ PatFrag Frag_su, PatFrag CommFrag,
+ PatFrag CommFrag_su, X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, string Name> :
+ avx512_icmp_cc<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ sched, _, Name> {
+ def rmib : AVX512AIi8<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
+ u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|",
+ "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
+ [(set _.KRC:$dst, (_.KVT (Frag:$cc
+ (_.VT _.RC:$src1),
+ (_.BroadcastLdFrag addr:$src2),
+ cond)))]>,
+ EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ def rmibk : AVX512AIi8<opc, MRMSrcMem,
+ (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
+ _.ScalarMemOp:$src2, u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
+ [(set _.KRC:$dst, (and _.KRCWM:$mask,
+ (_.KVT (Frag_su:$cc
+ (_.VT _.RC:$src1),
+ (_.BroadcastLdFrag addr:$src2),
+ cond))))]>,
+ EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+
+ def : Pat<(_.KVT (CommFrag:$cc (_.BroadcastLdFrag addr:$src2),
+ (_.VT _.RC:$src1), cond)),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmib")
+ _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>;
+
+ def : Pat<(and _.KRCWM:$mask,
+ (_.KVT (CommFrag_su:$cc (_.BroadcastLdFrag addr:$src2),
+ (_.VT _.RC:$src1), cond))),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmibk")
+ _.KRCWM:$mask, _.RC:$src1, addr:$src2,
+ (CommFrag_su.OperandTransform $cc))>;
+}
+
+multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, PatFrag Frag,
+ PatFrag Frag_su, PatFrag CommFrag,
+ PatFrag CommFrag_su, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_icmp_cc<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ sched.ZMM, VTInfo.info512, NAME>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_icmp_cc<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ sched.YMM, VTInfo.info256, NAME>, EVEX_V256;
+ defm Z128 : avx512_icmp_cc<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ sched.XMM, VTInfo.info128, NAME>, EVEX_V128;
+ }
+}
+
+multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, PatFrag Frag,
+ PatFrag Frag_su, PatFrag CommFrag,
+ PatFrag CommFrag_su, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_icmp_cc_rmb<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ sched.ZMM, VTInfo.info512, NAME>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ sched.YMM, VTInfo.info256, NAME>, EVEX_V256;
+ defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ sched.XMM, VTInfo.info128, NAME>, EVEX_V128;
+ }
+}
+
+def X86pcmpm_imm : SDNodeXForm<setcc, [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ uint8_t SSECC = X86::getVPCMPImmForCond(CC);
+ return getI8Imm(SSECC, SDLoc(N));
+}]>;
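+// getVPCMPImmForCond maps the ISD condition code onto the 3-bit VPCMP
+// predicate encoding: 0=EQ, 1=LT, 2=LE, 3=FALSE, 4=NE, 5=NLT, 6=NLE, 7=TRUE.
+// Signedness is expressed by choosing VPCMP vs. VPCMPU, not by the immediate.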
+
+// Swapped operand version of the above.
+def X86pcmpm_imm_commute : SDNodeXForm<setcc, [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ uint8_t SSECC = X86::getVPCMPImmForCond(CC);
+ SSECC = X86::getSwappedVPCMPImm(SSECC);
+ return getI8Imm(SSECC, SDLoc(N));
+}]>;
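+// For example, swapping the operands turns LT (1) into NLE (6) and LE (2)
+// into NLT (5); EQ, NE, FALSE and TRUE are symmetric and keep their encoding.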
+
+def X86pcmpm : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return !ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm>;
+
+def X86pcmpm_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return N->hasOneUse() && !ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm>;
+
+// Same as above, but commutes immediate. Use for load folding.
+def X86pcmpm_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return !ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm_commute>;
+
+def X86pcmpm_commute_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return N->hasOneUse() && !ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm_commute>;
+
+def X86pcmpum : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm>;
+
+def X86pcmpum_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return N->hasOneUse() && ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm>;
+
+// Same as above, but commutes immediate. Use for load folding.
+def X86pcmpum_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm_commute>;
+
+def X86pcmpum_commute_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return N->hasOneUse() && ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm_commute>;
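+// Naming convention for the fragments above: the "_su" (single-use) variants
+// additionally require the compare to have a single use, so the masked forms
+// are only selected when the unmasked result is not reused, and the
+// "_commute" variants emit the swapped immediate so that a load appearing as
+// the first operand can still be folded into the memory form.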
+
+// FIXME: Is there a better scheduler class for VPCMP/VPCMPU?
+defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86pcmpm, X86pcmpm_su,
+ X86pcmpm_commute, X86pcmpm_commute_su,
+ SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
+ EVEX_CD8<8, CD8VF>;
+defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86pcmpum, X86pcmpum_su,
+ X86pcmpum_commute, X86pcmpum_commute_su,
+ SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
+ EVEX_CD8<8, CD8VF>;
+
+defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86pcmpm, X86pcmpm_su,
+ X86pcmpm_commute, X86pcmpm_commute_su,
+ SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
+ VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86pcmpum, X86pcmpum_su,
+ X86pcmpum_commute, X86pcmpum_commute_su,
+ SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
+ VEX_W, EVEX_CD8<16, CD8VF>;
+
+defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86pcmpm, X86pcmpm_su,
+ X86pcmpm_commute, X86pcmpm_commute_su,
+ SchedWriteVecALU, avx512vl_i32_info,
+ HasAVX512>, EVEX_CD8<32, CD8VF>;
+defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86pcmpum, X86pcmpum_su,
+ X86pcmpum_commute, X86pcmpum_commute_su,
+ SchedWriteVecALU, avx512vl_i32_info,
+ HasAVX512>, EVEX_CD8<32, CD8VF>;
+
+defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86pcmpm, X86pcmpm_su,
+ X86pcmpm_commute, X86pcmpm_commute_su,
+ SchedWriteVecALU, avx512vl_i64_info,
+ HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86pcmpum, X86pcmpum_su,
+ X86pcmpum_commute, X86pcmpum_commute_su,
+ SchedWriteVecALU, avx512vl_i64_info,
+ HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
+
+def X86cmpm_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (X86cmpm node:$src1, node:$src2, node:$cc), [{
+ return N->hasOneUse();
+}]>;
+
+def X86cmpm_imm_commute : SDNodeXForm<timm, [{
+ uint8_t Imm = X86::getSwappedVCMPImm(N->getZExtValue() & 0x1f);
+ return getI8Imm(Imm, SDLoc(N));
+}]>;
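+// Only the low 5 bits are masked because VCMP has 32 predicate encodings.
+// Semantically, commuting the operands turns e.g. "lt" into "gt" and "le"
+// into "ge"; eq/neq and the ordered/unordered predicates are unchanged.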
+
+multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ string Name> {
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
+ defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, $src2, $src1", "$src1, $src2, $cc",
+ (X86any_cmpm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc),
+ (X86cmpm_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc),
+ 1>, Sched<[sched]>;
+
+ defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
+ (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, $src2, $src1", "$src1, $src2, $cc",
+ (X86any_cmpm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)),
+ timm:$cc),
+ (X86cmpm_su (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)),
+ timm:$cc)>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+
+ defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, ${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr#", $cc",
+ (X86any_cmpm (_.VT _.RC:$src1),
+ (_.VT (_.BroadcastLdFrag addr:$src2)),
+ timm:$cc),
+ (X86cmpm_su (_.VT _.RC:$src1),
+ (_.VT (_.BroadcastLdFrag addr:$src2)),
+ timm:$cc)>,
+ EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+
+  // Patterns for selecting when the load is in the other operand.
+ def : Pat<(X86any_cmpm (_.LdFrag addr:$src2), (_.VT _.RC:$src1),
+ timm:$cc),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2,
+ (X86cmpm_imm_commute timm:$cc))>;
+
+ def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (_.LdFrag addr:$src2),
+ (_.VT _.RC:$src1),
+ timm:$cc)),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2,
+ (X86cmpm_imm_commute timm:$cc))>;
+
+ def : Pat<(X86any_cmpm (_.BroadcastLdFrag addr:$src2),
+ (_.VT _.RC:$src1), timm:$cc),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2,
+ (X86cmpm_imm_commute timm:$cc))>;
+
+ def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (_.BroadcastLdFrag addr:$src2),
+ (_.VT _.RC:$src1),
+ timm:$cc)),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2,
+ (X86cmpm_imm_commute timm:$cc))>;
+
+ // Patterns for mask intrinsics.
+ def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc,
+ (_.KVT immAllOnesV)),
+ (!cast<Instruction>(Name#_.ZSuffix#"rri") _.RC:$src1, _.RC:$src2, timm:$cc)>;
+
+ def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc, _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix#"rrik") _.KRCWM:$mask, _.RC:$src1,
+ _.RC:$src2, timm:$cc)>;
+
+ def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), timm:$cc,
+ (_.KVT immAllOnesV)),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2, timm:$cc)>;
+
+ def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), timm:$cc,
+ _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask, _.RC:$src1,
+ addr:$src2, timm:$cc)>;
+
+ def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.BroadcastLdFrag addr:$src2)), timm:$cc,
+ (_.KVT immAllOnesV)),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2, timm:$cc)>;
+
+ def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.BroadcastLdFrag addr:$src2)), timm:$cc,
+ _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask, _.RC:$src1,
+ addr:$src2, timm:$cc)>;
+
+  // Patterns for mask intrinsics when the load is in the other operand.
+ def : Pat<(X86cmpmm (_.VT (_.LdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
+ (_.KVT immAllOnesV)),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2,
+ (X86cmpm_imm_commute timm:$cc))>;
+
+ def : Pat<(X86cmpmm (_.VT (_.LdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
+ _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2,
+ (X86cmpm_imm_commute timm:$cc))>;
+
+ def : Pat<(X86cmpmm (_.VT (_.BroadcastLdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
+ (_.KVT immAllOnesV)),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2,
+ (X86cmpm_imm_commute timm:$cc))>;
+
+ def : Pat<(X86cmpmm (_.VT (_.BroadcastLdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
+ _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2,
+ (X86cmpm_imm_commute timm:$cc))>;
+}
+
+multiclass avx512_vcmp_sae<X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+  // Comparison code form (VCMP[EQ/LT/LE/...]).
+ let Uses = [MXCSR] in
+ defm rrib : AVX512_maskable_custom_cmp<0xC2, MRMSrcReg, (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, {sae}, $src2, $src1",
+ "$src1, $src2, {sae}, $cc",
+ [(set _.KRC:$dst, (X86cmpmmSAE (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2), timm:$cc, (_.KVT immAllOnesV)))],
+ [(set _.KRC:$dst, (X86cmpmmSAE (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2), timm:$cc, _.KRCWM:$mask))]>,
+ EVEX_B, Sched<[sched]>;
+}
+
+multiclass avx512_vcmp<X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcmp_common<sched.ZMM, _.info512, NAME>,
+ avx512_vcmp_sae<sched.ZMM, _.info512>, EVEX_V512;
+
+ }
+  let Predicates = [HasAVX512, HasVLX] in {
+ defm Z128 : avx512_vcmp_common<sched.XMM, _.info128, NAME>, EVEX_V128;
+ defm Z256 : avx512_vcmp_common<sched.YMM, _.info256, NAME>, EVEX_V256;
+ }
+}
+
+defm VCMPPD : avx512_vcmp<SchedWriteFCmp, avx512vl_f64_info>,
+ AVX512PDIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VCMPPS : avx512_vcmp<SchedWriteFCmp, avx512vl_f32_info>,
+ AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+
+// Patterns to select fp compares with load as first operand.
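+// A compare whose first operand comes from memory cannot use the rm form
+// directly, so the operands are exchanged and the predicate is commuted.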
+let Predicates = [HasAVX512] in {
+ def : Pat<(v1i1 (X86cmpms (loadf64 addr:$src2), FR64X:$src1,
+ timm:$cc)),
+ (VCMPSDZrm FR64X:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>;
+
+ def : Pat<(v1i1 (X86cmpms (loadf32 addr:$src2), FR32X:$src1,
+ timm:$cc)),
+ (VCMPSSZrm FR32X:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>;
+}
+
+// ----------------------------------------------------------------
+// FPClass
+
+def X86Vfpclasss_su : PatFrag<(ops node:$src1, node:$src2),
+ (X86Vfpclasss node:$src1, node:$src2), [{
+ return N->hasOneUse();
+}]>;
+
+def X86Vfpclass_su : PatFrag<(ops node:$src1, node:$src2),
+ (X86Vfpclass node:$src1, node:$src2), [{
+ return N->hasOneUse();
+}]>;
+
+// Handle the scalar fpclass instruction: mask = op(reg_scalar, imm)
+//                                                op(mem_scalar, imm)
+multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ Predicate prd> {
+ let Predicates = [prd], ExeDomain = _.ExeDomain, Uses = [MXCSR] in {
+ def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
+ (ins _.RC:$src1, i32u8imm:$src2),
+ OpcodeStr#_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.KRC:$dst,(X86Vfpclasss (_.VT _.RC:$src1),
+ (i32 timm:$src2)))]>,
+ Sched<[sched]>;
+ def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
+ OpcodeStr#_.Suffix#
+ "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+ [(set _.KRC:$dst,(and _.KRCWM:$mask,
+ (X86Vfpclasss_su (_.VT _.RC:$src1),
+ (i32 timm:$src2))))]>,
+ EVEX_K, Sched<[sched]>;
+ def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.IntScalarMemOp:$src1, i32u8imm:$src2),
+ OpcodeStr#_.Suffix#
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.KRC:$dst,
+ (X86Vfpclasss (_.ScalarIntMemFrags addr:$src1),
+ (i32 timm:$src2)))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.IntScalarMemOp:$src1, i32u8imm:$src2),
+ OpcodeStr#_.Suffix#
+ "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+ [(set _.KRC:$dst,(and _.KRCWM:$mask,
+ (X86Vfpclasss_su (_.ScalarIntMemFrags addr:$src1),
+ (i32 timm:$src2))))]>,
+ EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+// Handle the vector fpclass instruction: mask = fpclass(reg_vec, imm)
+//                                                fpclass(mem_vec, imm)
+//                                                fpclass(broadcast(eltVt), imm)
+multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ string mem>{
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR] in {
+ def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
+ (ins _.RC:$src1, i32u8imm:$src2),
+ OpcodeStr#_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.KRC:$dst,(X86Vfpclass (_.VT _.RC:$src1),
+ (i32 timm:$src2)))]>,
+ Sched<[sched]>;
+ def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
+ OpcodeStr#_.Suffix#
+ "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+ [(set _.KRC:$dst,(and _.KRCWM:$mask,
+ (X86Vfpclass_su (_.VT _.RC:$src1),
+ (i32 timm:$src2))))]>,
+ EVEX_K, Sched<[sched]>;
+ def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr#_.Suffix#"{"#mem#"}"#
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.KRC:$dst,(X86Vfpclass
+ (_.VT (_.LdFrag addr:$src1)),
+ (i32 timm:$src2)))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr#_.Suffix#"{"#mem#"}"#
+ "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+ [(set _.KRC:$dst, (and _.KRCWM:$mask, (X86Vfpclass_su
+ (_.VT (_.LdFrag addr:$src1)),
+ (i32 timm:$src2))))]>,
+ EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.ScalarMemOp:$src1, i32u8imm:$src2),
+ OpcodeStr#_.Suffix#"\t{$src2, ${src1}"#
+ _.BroadcastStr#", $dst|$dst, ${src1}"
+ #_.BroadcastStr#", $src2}",
+ [(set _.KRC:$dst,(X86Vfpclass
+ (_.VT (_.BroadcastLdFrag addr:$src1)),
+ (i32 timm:$src2)))]>,
+ EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2),
+ OpcodeStr#_.Suffix#"\t{$src2, ${src1}"#
+ _.BroadcastStr#", $dst {${mask}}|$dst {${mask}}, ${src1}"#
+ _.BroadcastStr#", $src2}",
+ [(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclass_su
+ (_.VT (_.BroadcastLdFrag addr:$src1)),
+ (i32 timm:$src2))))]>,
+ EVEX_B, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+
+  // Also accept the register and broadcast forms with the x/y/z suffix that is
+  // otherwise only needed to disambiguate the memory form.
+ def : InstAlias<OpcodeStr#_.Suffix#mem#
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (!cast<Instruction>(NAME#"rr")
+ _.KRC:$dst, _.RC:$src1, i32u8imm:$src2), 0, "att">;
+ def : InstAlias<OpcodeStr#_.Suffix#mem#
+ "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+ (!cast<Instruction>(NAME#"rrk")
+ _.KRC:$dst, _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), 0, "att">;
+ def : InstAlias<OpcodeStr#_.Suffix#mem#
+ "\t{$src2, ${src1}"#_.BroadcastStr#", $dst|$dst, ${src1}"#
+ _.BroadcastStr#", $src2}",
+ (!cast<Instruction>(NAME#"rmb")
+ _.KRC:$dst, _.ScalarMemOp:$src1, i32u8imm:$src2), 0, "att">;
+ def : InstAlias<OpcodeStr#_.Suffix#mem#
+ "\t{$src2, ${src1}"#_.BroadcastStr#", $dst {${mask}}|"
+ "$dst {${mask}}, ${src1}"#_.BroadcastStr#", $src2}",
+ (!cast<Instruction>(NAME#"rmbk")
+ _.KRC:$dst, _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2), 0, "att">;
+}
+
+multiclass avx512_vector_fpclass_all<string OpcodeStr, AVX512VLVectorVTInfo _,
+ bits<8> opc, X86SchedWriteWidths sched,
+ Predicate prd>{
+ let Predicates = [prd] in {
+ defm Z : avx512_vector_fpclass<opc, OpcodeStr, sched.ZMM,
+ _.info512, "z">, EVEX_V512;
+ }
+ let Predicates = [prd, HasVLX] in {
+ defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, sched.XMM,
+ _.info128, "x">, EVEX_V128;
+ defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, sched.YMM,
+ _.info256, "y">, EVEX_V256;
+ }
+}
+
+multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec,
+ bits<8> opcScalar, X86SchedWriteWidths sched,
+ Predicate prd> {
+ defm PS : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f32_info, opcVec,
+ sched, prd>,
+ EVEX_CD8<32, CD8VF>;
+ defm PD : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f64_info, opcVec,
+ sched, prd>,
+ EVEX_CD8<64, CD8VF> , VEX_W;
+ defm SSZ : avx512_scalar_fpclass<opcScalar, OpcodeStr,
+ sched.Scl, f32x_info, prd>, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm SDZ : avx512_scalar_fpclass<opcScalar, OpcodeStr,
+ sched.Scl, f64x_info, prd>, VEX_LIG,
+ EVEX_CD8<64, CD8VT1>, VEX_W;
+}
+
+defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, SchedWriteFCmp,
+ HasDQI>, AVX512AIi8Base, EVEX;
+
+//-----------------------------------------------------------------
+// Mask register copy, including
+// - copy between mask registers
+// - load/store mask registers
+// - copy from GPR to mask register and vice versa
+//
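+// For example, the word-sized forms defined below assemble (AT&T syntax) as:
+//   kmovw %k2, %k1        (mask to mask)
+//   kmovw (%rdi), %k1     (load)
+//   kmovw %k1, (%rdi)     (store)
+//   kmovw %eax, %k1       (GPR to mask)
+//   kmovw %k1, %eax       (mask to GPR)
+//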
+multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
+ string OpcodeStr, RegisterClass KRC,
+ ValueType vvt, X86MemOperand x86memop> {
+ let isMoveReg = 1, hasSideEffects = 0, SchedRW = [WriteMove] in
+ def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
+ Sched<[WriteMove]>;
+ def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set KRC:$dst, (vvt (load addr:$src)))]>,
+ Sched<[WriteLoad]>;
+ def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(store KRC:$src, addr:$dst)]>,
+ Sched<[WriteStore]>;
+}
+
+multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk,
+ string OpcodeStr,
+ RegisterClass KRC, RegisterClass GRC> {
+ let hasSideEffects = 0 in {
+ def kr : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
+ Sched<[WriteMove]>;
+ def rk : I<opc_rk, MRMSrcReg, (outs GRC:$dst), (ins KRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
+ Sched<[WriteMove]>;
+ }
+}
+
+let Predicates = [HasDQI] in
+ defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8mem>,
+ avx512_mask_mov_gpr<0x92, 0x93, "kmovb", VK8, GR32>,
+ VEX, PD;
+
+let Predicates = [HasAVX512] in
+ defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>,
+ avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>,
+ VEX, PS;
+
+let Predicates = [HasBWI] in {
+  defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1, i32mem>,
+ VEX, PD, VEX_W;
+ defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>,
+ VEX, XD;
+ defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem>,
+ VEX, PS, VEX_W;
+ defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>,
+ VEX, XD, VEX_W;
+}
+
+// GR from/to mask register
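+// The GPR forms of KMOV always use a 32-bit (or, for KMOVQ, 64-bit) register,
+// so 8- and 16-bit values are routed through GR32 with
+// INSERT_SUBREG/EXTRACT_SUBREG.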
+def : Pat<(v16i1 (bitconvert (i16 GR16:$src))),
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit)), VK16)>;
+def : Pat<(i16 (bitconvert (v16i1 VK16:$src))),
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK16:$src, GR32)), sub_16bit)>;
+def : Pat<(i8 (trunc (i16 (bitconvert (v16i1 VK16:$src))))),
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK16:$src, GR32)), sub_8bit)>;
+
+def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$src, sub_8bit)), VK8)>;
+def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)), sub_8bit)>;
+
+def : Pat<(i32 (zext (i16 (bitconvert (v16i1 VK16:$src))))),
+ (KMOVWrk VK16:$src)>;
+def : Pat<(i64 (zext (i16 (bitconvert (v16i1 VK16:$src))))),
+ (SUBREG_TO_REG (i64 0), (KMOVWrk VK16:$src), sub_32bit)>;
+def : Pat<(i32 (anyext (i16 (bitconvert (v16i1 VK16:$src))))),
+ (COPY_TO_REGCLASS VK16:$src, GR32)>;
+def : Pat<(i64 (anyext (i16 (bitconvert (v16i1 VK16:$src))))),
+ (INSERT_SUBREG (IMPLICIT_DEF), (COPY_TO_REGCLASS VK16:$src, GR32), sub_32bit)>;
+
+def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
+ (KMOVBrk VK8:$src)>, Requires<[HasDQI]>;
+def : Pat<(i64 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
+ (SUBREG_TO_REG (i64 0), (KMOVBrk VK8:$src), sub_32bit)>, Requires<[HasDQI]>;
+def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))),
+ (COPY_TO_REGCLASS VK8:$src, GR32)>;
+def : Pat<(i64 (anyext (i8 (bitconvert (v8i1 VK8:$src))))),
+ (INSERT_SUBREG (IMPLICIT_DEF), (COPY_TO_REGCLASS VK8:$src, GR32), sub_32bit)>;
+
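+// 32- and 64-bit masks are the same width as the GPR, so a plain cross-class
+// copy suffices; it is later lowered to KMOVD/KMOVQ.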
+def : Pat<(v32i1 (bitconvert (i32 GR32:$src))),
+ (COPY_TO_REGCLASS GR32:$src, VK32)>;
+def : Pat<(i32 (bitconvert (v32i1 VK32:$src))),
+ (COPY_TO_REGCLASS VK32:$src, GR32)>;
+def : Pat<(v64i1 (bitconvert (i64 GR64:$src))),
+ (COPY_TO_REGCLASS GR64:$src, VK64)>;
+def : Pat<(i64 (bitconvert (v64i1 VK64:$src))),
+ (COPY_TO_REGCLASS VK64:$src, GR64)>;
+
+// Load/store kreg
+let Predicates = [HasDQI] in {
+ def : Pat<(v1i1 (load addr:$src)),
+ (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK1)>;
+ def : Pat<(v2i1 (load addr:$src)),
+ (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK2)>;
+ def : Pat<(v4i1 (load addr:$src)),
+ (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK4)>;
+}
+
+let Predicates = [HasAVX512] in {
+ def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
+ (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>;
+ def : Pat<(v16i1 (bitconvert (loadi16 addr:$src))),
+ (KMOVWkm addr:$src)>;
+}
+
+def X86kextract : SDNode<"ISD::EXTRACT_VECTOR_ELT",
+ SDTypeProfile<1, 2, [SDTCisVT<0, i8>,
+ SDTCVecEltisVT<1, i1>,
+ SDTCisPtrTy<2>]>>;
+
+let Predicates = [HasAVX512] in {
+ multiclass operation_gpr_mask_copy_lowering<RegisterClass maskRC, ValueType maskVT> {
+ def : Pat<(maskVT (scalar_to_vector GR32:$src)),
+ (COPY_TO_REGCLASS GR32:$src, maskRC)>;
+
+ def : Pat<(maskVT (scalar_to_vector GR8:$src)),
+ (COPY_TO_REGCLASS (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), maskRC)>;
+
+ def : Pat<(i8 (X86kextract maskRC:$src, (iPTR 0))),
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS maskRC:$src, GR32)), sub_8bit)>;
+
+ def : Pat<(i32 (anyext (i8 (X86kextract maskRC:$src, (iPTR 0))))),
+ (i32 (COPY_TO_REGCLASS maskRC:$src, GR32))>;
+ }
+
+ defm : operation_gpr_mask_copy_lowering<VK1, v1i1>;
+ defm : operation_gpr_mask_copy_lowering<VK2, v2i1>;
+ defm : operation_gpr_mask_copy_lowering<VK4, v4i1>;
+ defm : operation_gpr_mask_copy_lowering<VK8, v8i1>;
+ defm : operation_gpr_mask_copy_lowering<VK16, v16i1>;
+ defm : operation_gpr_mask_copy_lowering<VK32, v32i1>;
+ defm : operation_gpr_mask_copy_lowering<VK64, v64i1>;
+
+ def : Pat<(insert_subvector (v16i1 immAllZerosV),
+ (v1i1 (scalar_to_vector GR8:$src)), (iPTR 0)),
+ (KMOVWkr (AND32ri8
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit),
+ (i32 1)))>;
+}
+
+// Mask unary operation
+// - KNOT
+multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr,
+ RegisterClass KRC, SDPatternOperator OpNode,
+ X86FoldableSchedWrite sched, Predicate prd> {
+ let Predicates = [prd] in
+ def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set KRC:$dst, (OpNode KRC:$src))]>,
+ Sched<[sched]>;
+}
+
+multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode,
+ X86FoldableSchedWrite sched> {
+ defm B : avx512_mask_unop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode,
+ sched, HasDQI>, VEX, PD;
+ defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode,
+ sched, HasAVX512>, VEX, PS;
+ defm D : avx512_mask_unop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode,
+ sched, HasBWI>, VEX, PD, VEX_W;
+ defm Q : avx512_mask_unop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode,
+ sched, HasBWI>, VEX, PS, VEX_W;
+}
+
+// TODO - do we need an X86SchedWriteWidths::KMASK type?
+defm KNOT : avx512_mask_unop_all<0x44, "knot", vnot, SchedWriteVecLogic.XMM>;
+
+// KNL does not support KMOVB, so an 8-bit mask is promoted to a 16-bit mask.
+let Predicates = [HasAVX512, NoDQI] in
+def : Pat<(vnot VK8:$src),
+ (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>;
+
+def : Pat<(vnot VK4:$src),
+ (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK4:$src, VK16)), VK4)>;
+def : Pat<(vnot VK2:$src),
+ (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK2:$src, VK16)), VK2)>;
+def : Pat<(vnot VK1:$src),
+          (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK1:$src, VK16)), VK1)>;
+
+// Mask binary operation
+// - KAND, KANDN, KOR, KXNOR, KXOR
+multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr,
+ RegisterClass KRC, SDPatternOperator OpNode,
+ X86FoldableSchedWrite sched, Predicate prd,
+ bit IsCommutable> {
+ let Predicates = [prd], isCommutable = IsCommutable in
+ def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>,
+ Sched<[sched]>;
+}
+
+multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode,
+ X86FoldableSchedWrite sched, bit IsCommutable,
+ Predicate prdW = HasAVX512> {
+ defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode,
+ sched, HasDQI, IsCommutable>, VEX_4V, VEX_L, PD;
+ defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode,
+ sched, prdW, IsCommutable>, VEX_4V, VEX_L, PS;
+ defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode,
+ sched, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PD;
+ defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode,
+ sched, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PS;
+}
+
+// These nodes use 'vnot' instead of 'not' to support vectors.
+def vandn : PatFrag<(ops node:$i0, node:$i1), (and (vnot node:$i0), node:$i1)>;
+def vxnor : PatFrag<(ops node:$i0, node:$i1), (vnot (xor node:$i0, node:$i1))>;
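+// That is, kandn computes ~src1 & src2 and kxnor computes ~(src1 ^ src2).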
+
+// TODO - do we need an X86SchedWriteWidths::KMASK type?
+defm KAND : avx512_mask_binop_all<0x41, "kand", and, SchedWriteVecLogic.XMM, 1>;
+defm KOR : avx512_mask_binop_all<0x45, "kor", or, SchedWriteVecLogic.XMM, 1>;
+defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", vxnor, SchedWriteVecLogic.XMM, 1>;
+defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, SchedWriteVecLogic.XMM, 1>;
+defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, SchedWriteVecLogic.XMM, 0>;
+defm KADD : avx512_mask_binop_all<0x4A, "kadd", X86kadd, SchedWriteVecLogic.XMM, 1, HasDQI>;
+
+multiclass avx512_binop_pat<SDPatternOperator VOpNode,
+ Instruction Inst> {
+  // With AVX512F an 8-bit mask is promoted to a 16-bit mask; with DQI the
+  // v8i1 type is legal and the KxxxB instructions are used directly.
+ let Predicates = [NoDQI] in
+ def : Pat<(VOpNode VK8:$src1, VK8:$src2),
+ (COPY_TO_REGCLASS
+ (Inst (COPY_TO_REGCLASS VK8:$src1, VK16),
+ (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>;
+
+ // All types smaller than 8 bits require conversion anyway
+ def : Pat<(VOpNode VK1:$src1, VK1:$src2),
+ (COPY_TO_REGCLASS (Inst
+ (COPY_TO_REGCLASS VK1:$src1, VK16),
+ (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>;
+ def : Pat<(VOpNode VK2:$src1, VK2:$src2),
+ (COPY_TO_REGCLASS (Inst
+ (COPY_TO_REGCLASS VK2:$src1, VK16),
+ (COPY_TO_REGCLASS VK2:$src2, VK16)), VK2)>;
+ def : Pat<(VOpNode VK4:$src1, VK4:$src2),
+ (COPY_TO_REGCLASS (Inst
+ (COPY_TO_REGCLASS VK4:$src1, VK16),
+ (COPY_TO_REGCLASS VK4:$src2, VK16)), VK4)>;
+}
+
+defm : avx512_binop_pat<and, KANDWrr>;
+defm : avx512_binop_pat<vandn, KANDNWrr>;
+defm : avx512_binop_pat<or, KORWrr>;
+defm : avx512_binop_pat<vxnor, KXNORWrr>;
+defm : avx512_binop_pat<xor, KXORWrr>;
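+// For example, with these patterns an "and" or "xor" of two v4i1 masks is
+// selected as KANDWrr/KXORWrr on VK16 copies of the operands, and the result
+// is copied back to VK4.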
+
+// Mask unpacking
+multiclass avx512_mask_unpck<string Suffix, X86KVectorVTInfo Dst,
+ X86KVectorVTInfo Src, X86FoldableSchedWrite sched,
+ Predicate prd> {
+ let Predicates = [prd] in {
+ let hasSideEffects = 0 in
+ def rr : I<0x4b, MRMSrcReg, (outs Dst.KRC:$dst),
+ (ins Src.KRC:$src1, Src.KRC:$src2),
+ "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ VEX_4V, VEX_L, Sched<[sched]>;
+
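+    // Note the operand swap below: the first concat_vectors operand supplies
+    // the low half of the result, which is the second source of KUNPCK.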
+ def : Pat<(Dst.KVT (concat_vectors Src.KRC:$src1, Src.KRC:$src2)),
+ (!cast<Instruction>(NAME#rr) Src.KRC:$src2, Src.KRC:$src1)>;
+ }
+}
+
+defm KUNPCKBW : avx512_mask_unpck<"bw", v16i1_info, v8i1_info, WriteShuffle, HasAVX512>, PD;
+defm KUNPCKWD : avx512_mask_unpck<"wd", v32i1_info, v16i1_info, WriteShuffle, HasBWI>, PS;
+defm KUNPCKDQ : avx512_mask_unpck<"dq", v64i1_info, v32i1_info, WriteShuffle, HasBWI>, PS, VEX_W;
+
+// Mask bit testing
+multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+ SDNode OpNode, X86FoldableSchedWrite sched,
+ Predicate prd> {
+ let Predicates = [prd], Defs = [EFLAGS] in
+ def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>,
+ Sched<[sched]>;
+}
+
+multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched,
+ Predicate prdW = HasAVX512> {
+ defm B : avx512_mask_testop<opc, OpcodeStr#"b", VK8, OpNode, sched, HasDQI>,
+ VEX, PD;
+ defm W : avx512_mask_testop<opc, OpcodeStr#"w", VK16, OpNode, sched, prdW>,
+ VEX, PS;
+ defm Q : avx512_mask_testop<opc, OpcodeStr#"q", VK64, OpNode, sched, HasBWI>,
+ VEX, PS, VEX_W;
+ defm D : avx512_mask_testop<opc, OpcodeStr#"d", VK32, OpNode, sched, HasBWI>,
+ VEX, PD, VEX_W;
+}
+
+// TODO - do we need an X86SchedWriteWidths::KMASK type?
+defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest, SchedWriteVecLogic.XMM>;
+defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest, SchedWriteVecLogic.XMM, HasDQI>;
+
+// Mask shift
+multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+ SDNode OpNode, X86FoldableSchedWrite sched> {
+ let Predicates = [HasAVX512] in
+ def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, u8imm:$imm),
+ !strconcat(OpcodeStr,
+ "\t{$imm, $src, $dst|$dst, $src, $imm}"),
+ [(set KRC:$dst, (OpNode KRC:$src, (i8 timm:$imm)))]>,
+ Sched<[sched]>;
+}
+
+multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr,
+ SDNode OpNode, X86FoldableSchedWrite sched> {
+ defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode,
+ sched>, VEX, TAPD, VEX_W;
+ let Predicates = [HasDQI] in
+ defm B : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "b"), VK8, OpNode,
+ sched>, VEX, TAPD;
+ let Predicates = [HasBWI] in {
+ defm Q : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "q"), VK64, OpNode,
+ sched>, VEX, TAPD, VEX_W;
+ defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode,
+ sched>, VEX, TAPD;
+ }
+}
+
+defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl, WriteShuffle>;
+defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr, WriteShuffle>;
+
+// Patterns for comparing 128/256-bit integer vectors using the 512-bit instructions.
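+// The narrow operands are widened by inserting them into an undefined 512-bit
+// register, the 512-bit compare is executed, and the low bits of the resulting
+// mask are extracted with COPY_TO_REGCLASS; the extra lanes compare undefined
+// data, but their mask bits are never read.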
+multiclass axv512_icmp_packed_cc_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su,
+ string InstStr,
+ X86VectorVTInfo Narrow,
+ X86VectorVTInfo Wide> {
+def : Pat<(Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2), cond)),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(InstStr#"Zrri")
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
+ (Frag.OperandTransform $cc)), Narrow.KRC)>;
+
+def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
+ (Narrow.KVT (Frag_su:$cc (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2),
+ cond)))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrrik")
+ (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
+ (Frag_su.OperandTransform $cc)), Narrow.KRC)>;
+}
+
+multiclass axv512_icmp_packed_cc_rmb_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su,
+ PatFrag CommFrag, PatFrag CommFrag_su,
+ string InstStr,
+ X86VectorVTInfo Narrow,
+ X86VectorVTInfo Wide> {
+// Broadcast load.
+def : Pat<(Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1),
+ (Narrow.BroadcastLdFrag addr:$src2), cond)),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(InstStr#"Zrmib")
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, (Frag.OperandTransform $cc)), Narrow.KRC)>;
+
+def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
+ (Narrow.KVT
+ (Frag_su:$cc (Narrow.VT Narrow.RC:$src1),
+ (Narrow.BroadcastLdFrag addr:$src2),
+ cond)))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmibk")
+ (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, (Frag_su.OperandTransform $cc)), Narrow.KRC)>;
+
+// Commuted with broadcast load.
+def : Pat<(Narrow.KVT (CommFrag:$cc (Narrow.BroadcastLdFrag addr:$src2),
+ (Narrow.VT Narrow.RC:$src1),
+ cond)),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(InstStr#"Zrmib")
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, (CommFrag.OperandTransform $cc)), Narrow.KRC)>;
+
+def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
+ (Narrow.KVT
+ (CommFrag_su:$cc (Narrow.BroadcastLdFrag addr:$src2),
+ (Narrow.VT Narrow.RC:$src1),
+ cond)))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmibk")
+ (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, (CommFrag_su.OperandTransform $cc)), Narrow.KRC)>;
+}
+
+// Same as above, but for fp types which don't use PatFrags.
+multiclass axv512_cmp_packed_cc_no_vlx_lowering<string InstStr,
+ X86VectorVTInfo Narrow,
+ X86VectorVTInfo Wide> {
+def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2), timm:$cc)),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(InstStr#"Zrri")
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
+ timm:$cc), Narrow.KRC)>;
+
+def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
+ (X86cmpm_su (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2), timm:$cc))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrrik")
+ (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
+ timm:$cc), Narrow.KRC)>;
+
+// Broadcast load.
+def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc)),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(InstStr#"Zrmbi")
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, timm:$cc), Narrow.KRC)>;
+
+def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
+ (X86cmpm_su (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmbik")
+ (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, timm:$cc), Narrow.KRC)>;
+
+// Commuted with broadcast load.
+def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)),
+ (Narrow.VT Narrow.RC:$src1), timm:$cc)),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(InstStr#"Zrmbi")
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, (X86cmpm_imm_commute timm:$cc)), Narrow.KRC)>;
+
+def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
+ (X86cmpm_su (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)),
+ (Narrow.VT Narrow.RC:$src1), timm:$cc))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmbik")
+ (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, (X86cmpm_imm_commute timm:$cc)), Narrow.KRC)>;
+}
+
+let Predicates = [HasAVX512, NoVLX] in {
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPD", v8i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUD", v8i32x_info, v16i32_info>;
+
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPD", v4i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUD", v4i32x_info, v16i32_info>;
+
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPQ", v4i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUQ", v4i64x_info, v8i64_info>;
+
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPQ", v2i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUQ", v2i64x_info, v8i64_info>;
+
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPD", v8i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUD", v8i32x_info, v16i32_info>;
+
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPD", v4i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUD", v4i32x_info, v16i32_info>;
+
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPQ", v4i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUQ", v4i64x_info, v8i64_info>;
+
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPQ", v2i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUQ", v2i64x_info, v8i64_info>;
+
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPS", v8f32x_info, v16f32_info>;
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPS", v4f32x_info, v16f32_info>;
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPD", v4f64x_info, v8f64_info>;
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPD", v2f64x_info, v8f64_info>;
+}
+
+let Predicates = [HasBWI, NoVLX] in {
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPB", v32i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUB", v32i8x_info, v64i8_info>;
+
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPB", v16i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUB", v16i8x_info, v64i8_info>;
+
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPW", v16i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUW", v16i16x_info, v32i16_info>;
+
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPW", v8i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUW", v8i16x_info, v32i16_info>;
+}
+
+// Mask setting all 0s or 1s
+multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> {
+ let Predicates = [HasAVX512] in
+ let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1,
+ SchedRW = [WriteZero] in
+ def NAME# : I<0, Pseudo, (outs KRC:$dst), (ins), "",
+ [(set KRC:$dst, (VT Val))]>;
+}
+
+multiclass avx512_mask_setop_w<PatFrag Val> {
+ defm W : avx512_mask_setop<VK16, v16i1, Val>;
+ defm D : avx512_mask_setop<VK32, v32i1, Val>;
+ defm Q : avx512_mask_setop<VK64, v64i1, Val>;
+}
+
+defm KSET0 : avx512_mask_setop_w<immAllZerosV>;
+defm KSET1 : avx512_mask_setop_w<immAllOnesV>;
+
+// With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
+let Predicates = [HasAVX512] in {
+ def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>;
+ def : Pat<(v4i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK4)>;
+ def : Pat<(v2i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK2)>;
+ def : Pat<(v1i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK1)>;
+ def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>;
+ def : Pat<(v4i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK4)>;
+ def : Pat<(v2i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK2)>;
+ def : Pat<(v1i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK1)>;
+}
+
+// Patterns for kmask insert_subvector/extract_subvector to/from index=0
+multiclass operation_subvector_mask_lowering<RegisterClass subRC, ValueType subVT,
+ RegisterClass RC, ValueType VT> {
+ def : Pat<(subVT (extract_subvector (VT RC:$src), (iPTR 0))),
+ (subVT (COPY_TO_REGCLASS RC:$src, subRC))>;
+
+ def : Pat<(VT (insert_subvector undef, subRC:$src, (iPTR 0))),
+ (VT (COPY_TO_REGCLASS subRC:$src, RC))>;
+}
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK2, v2i1>;
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK4, v4i1>;
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK8, v8i1>;
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK16, v16i1>;
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK32, v32i1>;
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK64, v64i1>;
+
+defm : operation_subvector_mask_lowering<VK2, v2i1, VK4, v4i1>;
+defm : operation_subvector_mask_lowering<VK2, v2i1, VK8, v8i1>;
+defm : operation_subvector_mask_lowering<VK2, v2i1, VK16, v16i1>;
+defm : operation_subvector_mask_lowering<VK2, v2i1, VK32, v32i1>;
+defm : operation_subvector_mask_lowering<VK2, v2i1, VK64, v64i1>;
+
+defm : operation_subvector_mask_lowering<VK4, v4i1, VK8, v8i1>;
+defm : operation_subvector_mask_lowering<VK4, v4i1, VK16, v16i1>;
+defm : operation_subvector_mask_lowering<VK4, v4i1, VK32, v32i1>;
+defm : operation_subvector_mask_lowering<VK4, v4i1, VK64, v64i1>;
+
+defm : operation_subvector_mask_lowering<VK8, v8i1, VK16, v16i1>;
+defm : operation_subvector_mask_lowering<VK8, v8i1, VK32, v32i1>;
+defm : operation_subvector_mask_lowering<VK8, v8i1, VK64, v64i1>;
+
+defm : operation_subvector_mask_lowering<VK16, v16i1, VK32, v32i1>;
+defm : operation_subvector_mask_lowering<VK16, v16i1, VK64, v64i1>;
+
+defm : operation_subvector_mask_lowering<VK32, v32i1, VK64, v64i1>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Aligned and unaligned load and store
+//
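+// Each load multiclass below produces the unmasked form plus merge-masked
+// ({k}) and zero-masked ({k}{z}) variants; the store multiclass also defines
+// "_REV" register-to-register forms (the MRMDestReg encoding) that exist only
+// for assembly and disassembly.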
+
+multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
+ X86VectorVTInfo _, PatFrag ld_frag, PatFrag mload,
+ X86SchedWriteMoveLS Sched, string EVEX2VEXOvrd,
+ bit NoRMPattern = 0,
+ SDPatternOperator SelectOprr = vselect> {
+ let hasSideEffects = 0 in {
+ let isMoveReg = 1 in
+ def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [],
+ _.ExeDomain>, EVEX, Sched<[Sched.RR]>,
+ EVEX2VEXOverride<EVEX2VEXOvrd#"rr">;
+ def rrkz : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
+ "${dst} {${mask}} {z}, $src}"),
+ [(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask,
+ (_.VT _.RC:$src),
+ _.ImmAllZerosV)))], _.ExeDomain>,
+ EVEX, EVEX_KZ, Sched<[Sched.RR]>;
+
+ let mayLoad = 1, canFoldAsLoad = 1, isReMaterializable = 1 in
+ def rm : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.MemOp:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ !if(NoRMPattern, [],
+ [(set _.RC:$dst,
+ (_.VT (ld_frag addr:$src)))]),
+ _.ExeDomain>, EVEX, Sched<[Sched.RM]>,
+ EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
+
+ let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = 1 in {
+ def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1),
+ !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
+ "${dst} {${mask}}, $src1}"),
+ [(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask,
+ (_.VT _.RC:$src1),
+ (_.VT _.RC:$src0))))], _.ExeDomain>,
+ EVEX, EVEX_K, Sched<[Sched.RR]>;
+ def rmk : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src1),
+ !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
+ "${dst} {${mask}}, $src1}"),
+ [(set _.RC:$dst, (_.VT
+ (vselect_mask _.KRCWM:$mask,
+ (_.VT (ld_frag addr:$src1)),
+ (_.VT _.RC:$src0))))], _.ExeDomain>,
+ EVEX, EVEX_K, Sched<[Sched.RM]>;
+ }
+ def rmkz : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.MemOp:$src),
+ OpcodeStr #"\t{$src, ${dst} {${mask}} {z}|"#
+ "${dst} {${mask}} {z}, $src}",
+ [(set _.RC:$dst, (_.VT (vselect_mask _.KRCWM:$mask,
+ (_.VT (ld_frag addr:$src)), _.ImmAllZerosV)))],
+ _.ExeDomain>, EVEX, EVEX_KZ, Sched<[Sched.RM]>;
+ }
+ def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)),
+ (!cast<Instruction>(Name#_.ZSuffix#rmkz) _.KRCWM:$mask, addr:$ptr)>;
+
+ def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, _.ImmAllZerosV)),
+ (!cast<Instruction>(Name#_.ZSuffix#rmkz) _.KRCWM:$mask, addr:$ptr)>;
+
+ def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src0))),
+ (!cast<Instruction>(Name#_.ZSuffix#rmk) _.RC:$src0,
+ _.KRCWM:$mask, addr:$ptr)>;
+}
+
+multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _, Predicate prd,
+ X86SchedWriteMoveLSWidths Sched,
+ string EVEX2VEXOvrd, bit NoRMPattern = 0> {
+ let Predicates = [prd] in
+ defm Z : avx512_load<opc, OpcodeStr, NAME, _.info512,
+ _.info512.AlignedLdFrag, masked_load_aligned,
+ Sched.ZMM, "", NoRMPattern>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_load<opc, OpcodeStr, NAME, _.info256,
+ _.info256.AlignedLdFrag, masked_load_aligned,
+ Sched.YMM, EVEX2VEXOvrd#"Y", NoRMPattern>, EVEX_V256;
+ defm Z128 : avx512_load<opc, OpcodeStr, NAME, _.info128,
+ _.info128.AlignedLdFrag, masked_load_aligned,
+ Sched.XMM, EVEX2VEXOvrd, NoRMPattern>, EVEX_V128;
+ }
+}
+
+multiclass avx512_load_vl<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _, Predicate prd,
+ X86SchedWriteMoveLSWidths Sched,
+ string EVEX2VEXOvrd, bit NoRMPattern = 0,
+ SDPatternOperator SelectOprr = vselect> {
+ let Predicates = [prd] in
+ defm Z : avx512_load<opc, OpcodeStr, NAME, _.info512, _.info512.LdFrag,
+ masked_load, Sched.ZMM, "",
+ NoRMPattern, SelectOprr>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_load<opc, OpcodeStr, NAME, _.info256, _.info256.LdFrag,
+ masked_load, Sched.YMM, EVEX2VEXOvrd#"Y",
+ NoRMPattern, SelectOprr>, EVEX_V256;
+ defm Z128 : avx512_load<opc, OpcodeStr, NAME, _.info128, _.info128.LdFrag,
+ masked_load, Sched.XMM, EVEX2VEXOvrd,
+ NoRMPattern, SelectOprr>, EVEX_V128;
+ }
+}
+
+multiclass avx512_store<bits<8> opc, string OpcodeStr, string BaseName,
+ X86VectorVTInfo _, PatFrag st_frag, PatFrag mstore,
+ X86SchedWriteMoveLS Sched, string EVEX2VEXOvrd,
+ bit NoMRPattern = 0> {
+ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
+ let isMoveReg = 1 in
+ def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src),
+ OpcodeStr # "\t{$src, $dst|$dst, $src}",
+ [], _.ExeDomain>, EVEX,
+ FoldGenData<BaseName#_.ZSuffix#rr>, Sched<[Sched.RR]>,
+ EVEX2VEXOverride<EVEX2VEXOvrd#"rr_REV">;
+ def rrk_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src),
+ OpcodeStr # "\t{$src, ${dst} {${mask}}|"#
+ "${dst} {${mask}}, $src}",
+ [], _.ExeDomain>, EVEX, EVEX_K,
+ FoldGenData<BaseName#_.ZSuffix#rrk>,
+ Sched<[Sched.RR]>;
+ def rrkz_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src),
+ OpcodeStr # "\t{$src, ${dst} {${mask}} {z}|" #
+ "${dst} {${mask}} {z}, $src}",
+ [], _.ExeDomain>, EVEX, EVEX_KZ,
+ FoldGenData<BaseName#_.ZSuffix#rrkz>,
+ Sched<[Sched.RR]>;
+ }
+
+ let hasSideEffects = 0, mayStore = 1 in
+ def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ !if(NoMRPattern, [],
+ [(st_frag (_.VT _.RC:$src), addr:$dst)]),
+ _.ExeDomain>, EVEX, Sched<[Sched.MR]>,
+ EVEX2VEXOverride<EVEX2VEXOvrd#"mr">;
+ def mrk : AVX512PI<opc, MRMDestMem, (outs),
+ (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
+ OpcodeStr # "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}",
+ [], _.ExeDomain>, EVEX, EVEX_K, Sched<[Sched.MR]>,
+ NotMemoryFoldable;
+
+ def: Pat<(mstore (_.VT _.RC:$src), addr:$ptr, _.KRCWM:$mask),
+ (!cast<Instruction>(BaseName#_.ZSuffix#mrk) addr:$ptr,
+ _.KRCWM:$mask, _.RC:$src)>;
+
+ def : InstAlias<OpcodeStr#".s\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(BaseName#_.ZSuffix#"rr_REV")
+ _.RC:$dst, _.RC:$src), 0>;
+ def : InstAlias<OpcodeStr#".s\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}",
+ (!cast<Instruction>(BaseName#_.ZSuffix#"rrk_REV")
+ _.RC:$dst, _.KRCWM:$mask, _.RC:$src), 0>;
+ def : InstAlias<OpcodeStr#".s\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}",
+ (!cast<Instruction>(BaseName#_.ZSuffix#"rrkz_REV")
+ _.RC:$dst, _.KRCWM:$mask, _.RC:$src), 0>;
+}
+
+multiclass avx512_store_vl< bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _, Predicate prd,
+ X86SchedWriteMoveLSWidths Sched,
+ string EVEX2VEXOvrd, bit NoMRPattern = 0> {
+ let Predicates = [prd] in
+ defm Z : avx512_store<opc, OpcodeStr, NAME, _.info512, store,
+ masked_store, Sched.ZMM, "",
+ NoMRPattern>, EVEX_V512;
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_store<opc, OpcodeStr, NAME, _.info256, store,
+ masked_store, Sched.YMM,
+ EVEX2VEXOvrd#"Y", NoMRPattern>, EVEX_V256;
+ defm Z128 : avx512_store<opc, OpcodeStr, NAME, _.info128, store,
+ masked_store, Sched.XMM, EVEX2VEXOvrd,
+ NoMRPattern>, EVEX_V128;
+ }
+}
+
+multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _, Predicate prd,
+ X86SchedWriteMoveLSWidths Sched,
+ string EVEX2VEXOvrd, bit NoMRPattern = 0> {
+ let Predicates = [prd] in
+ defm Z : avx512_store<opc, OpcodeStr, NAME, _.info512, alignedstore,
+ masked_store_aligned, Sched.ZMM, "",
+ NoMRPattern>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_store<opc, OpcodeStr, NAME, _.info256, alignedstore,
+ masked_store_aligned, Sched.YMM,
+ EVEX2VEXOvrd#"Y", NoMRPattern>, EVEX_V256;
+ defm Z128 : avx512_store<opc, OpcodeStr, NAME, _.info128, alignedstore,
+ masked_store_aligned, Sched.XMM, EVEX2VEXOvrd,
+ NoMRPattern>, EVEX_V128;
+ }
+}
+
+defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info,
+ HasAVX512, SchedWriteFMoveLS, "VMOVAPS">,
+ avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info,
+ HasAVX512, SchedWriteFMoveLS, "VMOVAPS">,
+ PS, EVEX_CD8<32, CD8VF>;
+
+defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info,
+ HasAVX512, SchedWriteFMoveLS, "VMOVAPD">,
+ avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info,
+ HasAVX512, SchedWriteFMoveLS, "VMOVAPD">,
+ PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512,
+ SchedWriteFMoveLS, "VMOVUPS", 0, null_frag>,
+ avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512,
+ SchedWriteFMoveLS, "VMOVUPS">,
+ PS, EVEX_CD8<32, CD8VF>;
+
+defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512,
+ SchedWriteFMoveLS, "VMOVUPD", 0, null_frag>,
+ avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512,
+ SchedWriteFMoveLS, "VMOVUPD">,
+ PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info,
+ HasAVX512, SchedWriteVecMoveLS,
+ "VMOVDQA", 1>,
+ avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info,
+ HasAVX512, SchedWriteVecMoveLS,
+ "VMOVDQA", 1>,
+ PD, EVEX_CD8<32, CD8VF>;
+
+defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info,
+ HasAVX512, SchedWriteVecMoveLS,
+ "VMOVDQA">,
+ avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info,
+ HasAVX512, SchedWriteVecMoveLS,
+ "VMOVDQA">,
+ PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI,
+ SchedWriteVecMoveLS, "VMOVDQU", 1>,
+ avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, HasBWI,
+ SchedWriteVecMoveLS, "VMOVDQU", 1>,
+ XD, EVEX_CD8<8, CD8VF>;
+
+defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI,
+ SchedWriteVecMoveLS, "VMOVDQU", 1>,
+ avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, HasBWI,
+ SchedWriteVecMoveLS, "VMOVDQU", 1>,
+ XD, VEX_W, EVEX_CD8<16, CD8VF>;
+
+defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512,
+ SchedWriteVecMoveLS, "VMOVDQU", 1, null_frag>,
+ avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, HasAVX512,
+ SchedWriteVecMoveLS, "VMOVDQU", 1>,
+ XS, EVEX_CD8<32, CD8VF>;
+
+defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512,
+ SchedWriteVecMoveLS, "VMOVDQU", 0, null_frag>,
+ avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, HasAVX512,
+ SchedWriteVecMoveLS, "VMOVDQU">,
+ XS, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// Special instructions to help with spilling when we don't have VLX. We need
+// to load or store from a ZMM register instead. These are converted in
+// expandPostRAPseudos.
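+// For example, VMOVAPSZ128rm_NOVLX is rewritten there into a VMOVAPSZrm that
+// loads into the ZMM register containing the XMM destination.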
+let isReMaterializable = 1, canFoldAsLoad = 1,
+ isPseudo = 1, mayLoad = 1, hasSideEffects = 0 in {
+def VMOVAPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src),
+ "", []>, Sched<[WriteFLoadX]>;
+def VMOVAPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src),
+ "", []>, Sched<[WriteFLoadY]>;
+def VMOVUPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src),
+ "", []>, Sched<[WriteFLoadX]>;
+def VMOVUPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src),
+ "", []>, Sched<[WriteFLoadY]>;
+}
+
+let isPseudo = 1, mayStore = 1, hasSideEffects = 0 in {
+def VMOVAPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src),
+ "", []>, Sched<[WriteFStoreX]>;
+def VMOVAPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src),
+ "", []>, Sched<[WriteFStoreY]>;
+def VMOVUPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src),
+ "", []>, Sched<[WriteFStoreX]>;
+def VMOVUPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src),
+ "", []>, Sched<[WriteFStoreY]>;
+}
+
+def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 immAllZerosV),
+ (v8i64 VR512:$src))),
+ (VMOVDQA64Zrrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)),
+ VK8), VR512:$src)>;
+
+def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV),
+ (v16i32 VR512:$src))),
+ (VMOVDQA32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>;
+
+// These patterns exist to prevent the above patterns from introducing a second
+// mask inversion when one already exists.
+def : Pat<(v8i64 (vselect (xor VK8:$mask, (v8i1 immAllOnesV)),
+ (v8i64 immAllZerosV),
+ (v8i64 VR512:$src))),
+ (VMOVDQA64Zrrkz VK8:$mask, VR512:$src)>;
+def : Pat<(v16i32 (vselect (xor VK16:$mask, (v16i1 immAllOnesV)),
+ (v16i32 immAllZerosV),
+ (v16i32 VR512:$src))),
+ (VMOVDQA32Zrrkz VK16WM:$mask, VR512:$src)>;
+
+multiclass mask_move_lowering<string InstrStr, X86VectorVTInfo Narrow,
+ X86VectorVTInfo Wide> {
+ def : Pat<(Narrow.VT (vselect (Narrow.KVT Narrow.KRCWM:$mask),
+ Narrow.RC:$src1, Narrow.RC:$src0)),
+ (EXTRACT_SUBREG
+ (Wide.VT
+ (!cast<Instruction>(InstrStr#"rrk")
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src0, Narrow.SubRegIdx)),
+ (COPY_TO_REGCLASS Narrow.KRCWM:$mask, Wide.KRCWM),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)))),
+ Narrow.SubRegIdx)>;
+
+ def : Pat<(Narrow.VT (vselect (Narrow.KVT Narrow.KRCWM:$mask),
+ Narrow.RC:$src1, Narrow.ImmAllZerosV)),
+ (EXTRACT_SUBREG
+ (Wide.VT
+ (!cast<Instruction>(InstrStr#"rrkz")
+ (COPY_TO_REGCLASS Narrow.KRCWM:$mask, Wide.KRCWM),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)))),
+ Narrow.SubRegIdx)>;
+}
+
+// Patterns for handling selects of 128-bit and 256-bit vectors when VLX isn't
+// available. Use a 512-bit operation and extract.
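+// For example, a masked v8f32 select is lowered by inserting both operands
+// into ZMM registers, copying the v8i1 mask into VK16WM, issuing the masked
+// VMOVAPSZrrk (or zero-masked VMOVAPSZrrkz), and extracting the low 256 bits.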
+let Predicates = [HasAVX512, NoVLX] in {
+ defm : mask_move_lowering<"VMOVAPSZ", v4f32x_info, v16f32_info>;
+ defm : mask_move_lowering<"VMOVDQA32Z", v4i32x_info, v16i32_info>;
+ defm : mask_move_lowering<"VMOVAPSZ", v8f32x_info, v16f32_info>;
+ defm : mask_move_lowering<"VMOVDQA32Z", v8i32x_info, v16i32_info>;
+
+ defm : mask_move_lowering<"VMOVAPDZ", v2f64x_info, v8f64_info>;
+ defm : mask_move_lowering<"VMOVDQA64Z", v2i64x_info, v8i64_info>;
+ defm : mask_move_lowering<"VMOVAPDZ", v4f64x_info, v8f64_info>;
+ defm : mask_move_lowering<"VMOVDQA64Z", v4i64x_info, v8i64_info>;
+}
+
+let Predicates = [HasBWI, NoVLX] in {
+ defm : mask_move_lowering<"VMOVDQU8Z", v16i8x_info, v64i8_info>;
+ defm : mask_move_lowering<"VMOVDQU8Z", v32i8x_info, v64i8_info>;
+
+ defm : mask_move_lowering<"VMOVDQU16Z", v8i16x_info, v32i16_info>;
+ defm : mask_move_lowering<"VMOVDQU16Z", v16i16x_info, v32i16_info>;
+}
+
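+// Unmasked vector loads and stores are element-size agnostic, so the
+// remaining integer element types are simply mapped onto the 64-bit-element
+// VMOVDQA64/VMOVDQU64 forms below.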
+let Predicates = [HasAVX512] in {
+ // 512-bit load.
+ def : Pat<(alignedloadv16i32 addr:$src),
+ (VMOVDQA64Zrm addr:$src)>;
+ def : Pat<(alignedloadv32i16 addr:$src),
+ (VMOVDQA64Zrm addr:$src)>;
+ def : Pat<(alignedloadv64i8 addr:$src),
+ (VMOVDQA64Zrm addr:$src)>;
+ def : Pat<(loadv16i32 addr:$src),
+ (VMOVDQU64Zrm addr:$src)>;
+ def : Pat<(loadv32i16 addr:$src),
+ (VMOVDQU64Zrm addr:$src)>;
+ def : Pat<(loadv64i8 addr:$src),
+ (VMOVDQU64Zrm addr:$src)>;
+
+ // 512-bit store.
+ def : Pat<(alignedstore (v16i32 VR512:$src), addr:$dst),
+ (VMOVDQA64Zmr addr:$dst, VR512:$src)>;
+ def : Pat<(alignedstore (v32i16 VR512:$src), addr:$dst),
+ (VMOVDQA64Zmr addr:$dst, VR512:$src)>;
+ def : Pat<(alignedstore (v64i8 VR512:$src), addr:$dst),
+ (VMOVDQA64Zmr addr:$dst, VR512:$src)>;
+ def : Pat<(store (v16i32 VR512:$src), addr:$dst),
+ (VMOVDQU64Zmr addr:$dst, VR512:$src)>;
+ def : Pat<(store (v32i16 VR512:$src), addr:$dst),
+ (VMOVDQU64Zmr addr:$dst, VR512:$src)>;
+ def : Pat<(store (v64i8 VR512:$src), addr:$dst),
+ (VMOVDQU64Zmr addr:$dst, VR512:$src)>;
+}
+
+let Predicates = [HasVLX] in {
+ // 128-bit load.
+ def : Pat<(alignedloadv4i32 addr:$src),
+ (VMOVDQA64Z128rm addr:$src)>;
+ def : Pat<(alignedloadv8i16 addr:$src),
+ (VMOVDQA64Z128rm addr:$src)>;
+ def : Pat<(alignedloadv16i8 addr:$src),
+ (VMOVDQA64Z128rm addr:$src)>;
+ def : Pat<(loadv4i32 addr:$src),
+ (VMOVDQU64Z128rm addr:$src)>;
+ def : Pat<(loadv8i16 addr:$src),
+ (VMOVDQU64Z128rm addr:$src)>;
+ def : Pat<(loadv16i8 addr:$src),
+ (VMOVDQU64Z128rm addr:$src)>;
+
+ // 128-bit store.
+ def : Pat<(alignedstore (v4i32 VR128X:$src), addr:$dst),
+ (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst),
+ (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(alignedstore (v16i8 VR128X:$src), addr:$dst),
+ (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(store (v4i32 VR128X:$src), addr:$dst),
+ (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(store (v8i16 VR128X:$src), addr:$dst),
+ (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(store (v16i8 VR128X:$src), addr:$dst),
+ (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>;
+
+ // 256-bit load.
+ def : Pat<(alignedloadv8i32 addr:$src),
+ (VMOVDQA64Z256rm addr:$src)>;
+ def : Pat<(alignedloadv16i16 addr:$src),
+ (VMOVDQA64Z256rm addr:$src)>;
+ def : Pat<(alignedloadv32i8 addr:$src),
+ (VMOVDQA64Z256rm addr:$src)>;
+ def : Pat<(loadv8i32 addr:$src),
+ (VMOVDQU64Z256rm addr:$src)>;
+ def : Pat<(loadv16i16 addr:$src),
+ (VMOVDQU64Z256rm addr:$src)>;
+ def : Pat<(loadv32i8 addr:$src),
+ (VMOVDQU64Z256rm addr:$src)>;
+
+ // 256-bit store.
+ def : Pat<(alignedstore (v8i32 VR256X:$src), addr:$dst),
+ (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(alignedstore (v16i16 VR256X:$src), addr:$dst),
+ (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(alignedstore (v32i8 VR256X:$src), addr:$dst),
+ (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(store (v8i32 VR256X:$src), addr:$dst),
+ (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(store (v16i16 VR256X:$src), addr:$dst),
+ (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(store (v32i8 VR256X:$src), addr:$dst),
+ (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
+}
+
+// Move Int Doubleword to Packed Double Int
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst,
+ (v4i32 (scalar_to_vector GR32:$src)))]>,
+ EVEX, Sched<[WriteVecMoveFromGpr]>;
+def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
+ EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecLoad]>;
+def VMOV64toPQIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst,
+ (v2i64 (scalar_to_vector GR64:$src)))]>,
+ EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
+def VMOV64toPQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst),
+ (ins i64mem:$src),
+ "vmovq\t{$src, $dst|$dst, $src}", []>,
+ EVEX, VEX_W, EVEX_CD8<64, CD8VT1>, Sched<[WriteVecLoad]>;
+let isCodeGenOnly = 1 in {
+def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set FR64X:$dst, (bitconvert GR64:$src))]>,
+ EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>;
+def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bitconvert FR64X:$src))]>,
+ EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>;
+}
+} // ExeDomain = SSEPackedInt
+
+// Move Int Doubleword to Single Scalar
+//
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set FR32X:$dst, (bitconvert GR32:$src))]>,
+ EVEX, Sched<[WriteVecMoveFromGpr]>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+// Move doubleword from xmm register to r/m32
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (extractelt (v4i32 VR128X:$src),
+ (iPTR 0)))]>,
+ EVEX, Sched<[WriteVecMoveToGpr]>;
+def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
+ (ins i32mem:$dst, VR128X:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (extractelt (v4i32 VR128X:$src),
+ (iPTR 0))), addr:$dst)]>,
+ EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecStore]>;
+} // ExeDomain = SSEPackedInt
+
+// Move quadword from xmm1 register to r/m64
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
+ (iPTR 0)))]>,
+ PD, EVEX, VEX_W, Sched<[WriteVecMoveToGpr]>,
+ Requires<[HasAVX512]>;
+
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
+def VMOVPQIto64Zmr : I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128X:$src),
+ "vmovq\t{$src, $dst|$dst, $src}", []>, PD,
+ EVEX, VEX_W, Sched<[WriteVecStore]>,
+ Requires<[HasAVX512, In64BitMode]>;
+
+def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs),
+ (ins i64mem:$dst, VR128X:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)),
+ addr:$dst)]>,
+ EVEX, PD, VEX_W, EVEX_CD8<64, CD8VT1>,
+ Sched<[WriteVecStore]>, Requires<[HasAVX512]>;
+
+let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in
+def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst),
+ (ins VR128X:$src),
+ "vmovq\t{$src, $dst|$dst, $src}", []>,
+ EVEX, VEX_W, Sched<[SchedWriteVecLogic.XMM]>;
+} // ExeDomain = SSEPackedInt
+
+def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
+ (VMOVPQI2QIZrr VR128X:$dst, VR128X:$src), 0>;
+
+let Predicates = [HasAVX512] in {
+ def : Pat<(X86vextractstore64 (v2i64 VR128X:$src), addr:$dst),
+ (VMOVPQI2QIZmr addr:$dst, VR128X:$src)>;
+}
+
+// Move Scalar Single to Double Int
+//
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst),
+ (ins FR32X:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (bitconvert FR32X:$src))]>,
+ EVEX, Sched<[WriteVecMoveToGpr]>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+// Move Quadword Int to Packed Quadword Int
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
+ (ins i64mem:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
+ EVEX, VEX_W, EVEX_CD8<8, CD8VT8>, Sched<[WriteVecLoad]>;
+} // ExeDomain = SSEPackedInt
+
+// Allow "vmovd" but print "vmovq".
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
+ (VMOV64toPQIZrr VR128X:$dst, GR64:$src), 0>;
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
+ (VMOVPQIto64Zrr GR64:$dst, VR128X:$src), 0>;
+
+// Conversions between masks and scalar fp.
+def : Pat<(v32i1 (bitconvert FR32X:$src)),
+ (KMOVDkr (VMOVSS2DIZrr FR32X:$src))>;
+def : Pat<(f32 (bitconvert VK32:$src)),
+ (VMOVDI2SSZrr (KMOVDrk VK32:$src))>;
+
+def : Pat<(v64i1 (bitconvert FR64X:$src)),
+ (KMOVQkr (VMOVSDto64Zrr FR64X:$src))>;
+def : Pat<(f64 (bitconvert VK64:$src)),
+ (VMOV64toSDZrr (KMOVQrk VK64:$src))>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 MOVSS, MOVSD
+//===----------------------------------------------------------------------===//
+
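+// Note: the unmasked register-to-register form below is only used by
+// instruction selection when optimizing for size; when optimizing for speed,
+// blend-based lowerings (defined elsewhere) are generally preferred for their
+// better throughput.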
+multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
+ X86VectorVTInfo _> {
+ let Predicates = [HasAVX512, OptForSize] in
+ def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2)))],
+ _.ExeDomain>, EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>;
+ def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|",
+ "$dst {${mask}} {z}, $src1, $src2}"),
+ [(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+ _.ImmAllZerosV)))],
+ _.ExeDomain>, EVEX_4V, EVEX_KZ, Sched<[SchedWriteFShuffle.XMM]>;
+ let Constraints = "$src0 = $dst" in
+ def rrk : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, $src2}"),
+ [(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+ (_.VT _.RC:$src0))))],
+ _.ExeDomain>, EVEX_4V, EVEX_K, Sched<[SchedWriteFShuffle.XMM]>;
+ let canFoldAsLoad = 1, isReMaterializable = 1 in {
+ def rm : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst), (ins _.ScalarMemOp:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set _.RC:$dst, (_.VT (vzload_frag addr:$src)))],
+ _.ExeDomain>, EVEX, Sched<[WriteFLoad]>;
+ // _alt version uses FR32/FR64 register class.
+ let isCodeGenOnly = 1 in
+ def rm_alt : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))],
+ _.ExeDomain>, EVEX, Sched<[WriteFLoad]>;
+ }
+ let mayLoad = 1, hasSideEffects = 0 in {
+ let Constraints = "$src0 = $dst" in
+ def rmk : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.RC:$src0, _.KRCWM:$mask, _.ScalarMemOp:$src),
+ !strconcat(asm, "\t{$src, $dst {${mask}}|",
+ "$dst {${mask}}, $src}"),
+ [], _.ExeDomain>, EVEX, EVEX_K, Sched<[WriteFLoad]>;
+ def rmkz : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.ScalarMemOp:$src),
+ !strconcat(asm, "\t{$src, $dst {${mask}} {z}|",
+ "$dst {${mask}} {z}, $src}"),
+ [], _.ExeDomain>, EVEX, EVEX_KZ, Sched<[WriteFLoad]>;
+ }
+ def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(store _.FRC:$src, addr:$dst)], _.ExeDomain>,
+ EVEX, Sched<[WriteFStore]>;
+ let mayStore = 1, hasSideEffects = 0 in
+ def mrk: AVX512PI<0x11, MRMDestMem, (outs),
+ (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.RC:$src),
+ !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
+ [], _.ExeDomain>, EVEX, EVEX_K, Sched<[WriteFStore]>,
+ NotMemoryFoldable;
+}
+
+defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, X86vzload32, f32x_info>,
+ VEX_LIG, XS, EVEX_CD8<32, CD8VT1>;
+
+defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, X86vzload64, f64x_info>,
+ VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+
+multiclass avx512_move_scalar_lowering<string InstrStr, SDNode OpNode,
+ PatLeaf ZeroFP, X86VectorVTInfo _> {
+
+def : Pat<(_.VT (OpNode _.RC:$src0,
+ (_.VT (scalar_to_vector
+ (_.EltVT (X86selects VK1WM:$mask,
+ (_.EltVT _.FRC:$src1),
+ (_.EltVT _.FRC:$src2))))))),
+ (!cast<Instruction>(InstrStr#rrk)
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, _.RC)),
+ VK1WM:$mask,
+ (_.VT _.RC:$src0),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src1, _.RC)))>;
+
+def : Pat<(_.VT (OpNode _.RC:$src0,
+ (_.VT (scalar_to_vector
+ (_.EltVT (X86selects VK1WM:$mask,
+ (_.EltVT _.FRC:$src1),
+ (_.EltVT ZeroFP))))))),
+ (!cast<Instruction>(InstrStr#rrkz)
+ VK1WM:$mask,
+ (_.VT _.RC:$src0),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src1, _.RC)))>;
+}
+
+multiclass avx512_store_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
+ dag Mask, RegisterClass MaskRC> {
+
+def : Pat<(masked_store
+ (_.info512.VT (insert_subvector undef,
+ (_.info128.VT _.info128.RC:$src),
+ (iPTR 0))), addr:$dst, Mask),
+ (!cast<Instruction>(InstrStr#mrk) addr:$dst,
+ (COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
+ _.info128.RC:$src)>;
+
+}
+
+multiclass avx512_store_scalar_lowering_subreg<string InstrStr,
+ AVX512VLVectorVTInfo _,
+ dag Mask, RegisterClass MaskRC,
+ SubRegIndex subreg> {
+
+def : Pat<(masked_store
+ (_.info512.VT (insert_subvector undef,
+ (_.info128.VT _.info128.RC:$src),
+ (iPTR 0))), addr:$dst, Mask),
+ (!cast<Instruction>(InstrStr#mrk) addr:$dst,
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ _.info128.RC:$src)>;
+
+}
+
+// This matches the more recent codegen from clang that avoids emitting a
+// 512-bit masked store directly. Codegen widens a 128-bit masked store to 512
+// bits on AVX512F-only targets.
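+// The Mask512 dag below matches the widened insert_subvector form of the mask
+// produced on AVX512F-only targets, while Mask128 matches the natural 128-bit
+// masked store emitted when VLX is available.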
+multiclass avx512_store_scalar_lowering_subreg2<string InstrStr,
+ AVX512VLVectorVTInfo _,
+ dag Mask512, dag Mask128,
+ RegisterClass MaskRC,
+ SubRegIndex subreg> {
+
+// AVX512F pattern.
+def : Pat<(masked_store
+ (_.info512.VT (insert_subvector undef,
+ (_.info128.VT _.info128.RC:$src),
+ (iPTR 0))), addr:$dst, Mask512),
+ (!cast<Instruction>(InstrStr#mrk) addr:$dst,
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ _.info128.RC:$src)>;
+
+// AVX512VL pattern.
+def : Pat<(masked_store (_.info128.VT _.info128.RC:$src), addr:$dst, Mask128),
+ (!cast<Instruction>(InstrStr#mrk) addr:$dst,
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ _.info128.RC:$src)>;
+}
+
+multiclass avx512_load_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
+ dag Mask, RegisterClass MaskRC> {
+
+def : Pat<(_.info128.VT (extract_subvector
+ (_.info512.VT (masked_load addr:$srcAddr, Mask,
+ _.info512.ImmAllZerosV)),
+ (iPTR 0))),
+ (!cast<Instruction>(InstrStr#rmkz)
+ (COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
+ addr:$srcAddr)>;
+
+def : Pat<(_.info128.VT (extract_subvector
+ (_.info512.VT (masked_load addr:$srcAddr, Mask,
+ (_.info512.VT (insert_subvector undef,
+ (_.info128.VT (X86vzmovl _.info128.RC:$src)),
+ (iPTR 0))))),
+ (iPTR 0))),
+ (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
+ (COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
+ addr:$srcAddr)>;
+
+}
+
+multiclass avx512_load_scalar_lowering_subreg<string InstrStr,
+ AVX512VLVectorVTInfo _,
+ dag Mask, RegisterClass MaskRC,
+ SubRegIndex subreg> {
+
+def : Pat<(_.info128.VT (extract_subvector
+ (_.info512.VT (masked_load addr:$srcAddr, Mask,
+ _.info512.ImmAllZerosV)),
+ (iPTR 0))),
+ (!cast<Instruction>(InstrStr#rmkz)
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ addr:$srcAddr)>;
+
+def : Pat<(_.info128.VT (extract_subvector
+ (_.info512.VT (masked_load addr:$srcAddr, Mask,
+ (_.info512.VT (insert_subvector undef,
+ (_.info128.VT (X86vzmovl _.info128.RC:$src)),
+ (iPTR 0))))),
+ (iPTR 0))),
+ (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ addr:$srcAddr)>;
+
+}
+
+// This matches the more recent codegen from clang that avoids emitting a
+// 512-bit masked load directly. Codegen widens a 128-bit masked load to 512
+// bits on AVX512F-only targets.
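+// As with the store lowering above, Mask512 matches the widened AVX512F-only
+// form and Mask128 the native VLX form.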
+multiclass avx512_load_scalar_lowering_subreg2<string InstrStr,
+ AVX512VLVectorVTInfo _,
+ dag Mask512, dag Mask128,
+ RegisterClass MaskRC,
+ SubRegIndex subreg> {
+// AVX512F patterns.
+def : Pat<(_.info128.VT (extract_subvector
+ (_.info512.VT (masked_load addr:$srcAddr, Mask512,
+ _.info512.ImmAllZerosV)),
+ (iPTR 0))),
+ (!cast<Instruction>(InstrStr#rmkz)
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ addr:$srcAddr)>;
+
+def : Pat<(_.info128.VT (extract_subvector
+ (_.info512.VT (masked_load addr:$srcAddr, Mask512,
+ (_.info512.VT (insert_subvector undef,
+ (_.info128.VT (X86vzmovl _.info128.RC:$src)),
+ (iPTR 0))))),
+ (iPTR 0))),
+ (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ addr:$srcAddr)>;
+
+// AVX512VL patterns.
+def : Pat<(_.info128.VT (masked_load addr:$srcAddr, Mask128,
+ _.info128.ImmAllZerosV)),
+ (!cast<Instruction>(InstrStr#rmkz)
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ addr:$srcAddr)>;
+
+def : Pat<(_.info128.VT (masked_load addr:$srcAddr, Mask128,
+ (_.info128.VT (X86vzmovl _.info128.RC:$src)))),
+ (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ addr:$srcAddr)>;
+}
+
+defm : avx512_move_scalar_lowering<"VMOVSSZ", X86Movss, fp32imm0, v4f32x_info>;
+defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>;
+
+defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
+defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16, sub_16bit>;
+defm : avx512_store_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
+
+defm : avx512_store_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (insert_subvector
+ (v16i1 immAllZerosV),
+ (v4i1 (extract_subvector
+ (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (v4i1 (extract_subvector
+ (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+ (iPTR 0))), GR8, sub_8bit>;
+defm : avx512_store_scalar_lowering_subreg2<"VMOVSDZ", avx512vl_f64_info,
+ (v8i1
+ (extract_subvector
+ (v16i1
+ (insert_subvector
+ (v16i1 immAllZerosV),
+ (v2i1 (extract_subvector
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (v2i1 (extract_subvector
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
+ (iPTR 0))), GR8, sub_8bit>;
+
+defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
+defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16, sub_16bit>;
+defm : avx512_load_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
+
+defm : avx512_load_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (insert_subvector
+ (v16i1 immAllZerosV),
+ (v4i1 (extract_subvector
+ (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (v4i1 (extract_subvector
+ (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+ (iPTR 0))), GR8, sub_8bit>;
+defm : avx512_load_scalar_lowering_subreg2<"VMOVSDZ", avx512vl_f64_info,
+ (v8i1
+ (extract_subvector
+ (v16i1
+ (insert_subvector
+ (v16i1 immAllZerosV),
+ (v2i1 (extract_subvector
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (v2i1 (extract_subvector
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
+ (iPTR 0))), GR8, sub_8bit>;
+
+def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
+ (COPY_TO_REGCLASS (v4f32 (VMOVSSZrrk
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)),
+ VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)))), FR32X)>;
+
+def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), fp32imm0)),
+ (COPY_TO_REGCLASS (v4f32 (VMOVSSZrrkz VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)))), FR32X)>;
+
+def : Pat<(f32 (X86selects VK1WM:$mask, (loadf32 addr:$src), (f32 FR32X:$src0))),
+ (COPY_TO_REGCLASS
+ (v4f32 (VMOVSSZrmk (v4f32 (COPY_TO_REGCLASS FR32X:$src0, VR128X)),
+ VK1WM:$mask, addr:$src)),
+ FR32X)>;
+def : Pat<(f32 (X86selects VK1WM:$mask, (loadf32 addr:$src), fp32imm0)),
+ (COPY_TO_REGCLASS (v4f32 (VMOVSSZrmkz VK1WM:$mask, addr:$src)), FR32X)>;
+
+def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))),
+ (COPY_TO_REGCLASS (v2f64 (VMOVSDZrrk
+ (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)),
+ VK1WM:$mask, (v2f64 (IMPLICIT_DEF)),
+ (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)))), FR64X)>;
+
+def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), fp64imm0)),
+ (COPY_TO_REGCLASS (v2f64 (VMOVSDZrrkz VK1WM:$mask, (v2f64 (IMPLICIT_DEF)),
+ (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)))), FR64X)>;
+
+def : Pat<(f64 (X86selects VK1WM:$mask, (loadf64 addr:$src), (f64 FR64X:$src0))),
+ (COPY_TO_REGCLASS
+ (v2f64 (VMOVSDZrmk (v2f64 (COPY_TO_REGCLASS FR64X:$src0, VR128X)),
+ VK1WM:$mask, addr:$src)),
+ FR64X)>;
+def : Pat<(f64 (X86selects VK1WM:$mask, (loadf64 addr:$src), fp64imm0)),
+ (COPY_TO_REGCLASS (v2f64 (VMOVSDZrmkz VK1WM:$mask, addr:$src)), FR64X)>;
+
+
+def : Pat<(v4f32 (X86selects VK1WM:$mask, (v4f32 VR128X:$src1), (v4f32 VR128X:$src2))),
+ (VMOVSSZrrk VR128X:$src2, VK1WM:$mask, VR128X:$src1, VR128X:$src1)>;
+def : Pat<(v2f64 (X86selects VK1WM:$mask, (v2f64 VR128X:$src1), (v2f64 VR128X:$src2))),
+ (VMOVSDZrrk VR128X:$src2, VK1WM:$mask, VR128X:$src1, VR128X:$src1)>;
+
+def : Pat<(v4f32 (X86selects VK1WM:$mask, (v4f32 VR128X:$src1), (v4f32 immAllZerosV))),
+ (VMOVSSZrrkz VK1WM:$mask, VR128X:$src1, VR128X:$src1)>;
+def : Pat<(v2f64 (X86selects VK1WM:$mask, (v2f64 VR128X:$src1), (v2f64 immAllZerosV))),
+ (VMOVSDZrrkz VK1WM:$mask, VR128X:$src1, VR128X:$src1)>;
+
+let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
+ def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2),
+ "vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, XS, EVEX_4V, VEX_LIG,
+ FoldGenData<"VMOVSSZrr">,
+ Sched<[SchedWriteFShuffle.XMM]>;
+
+ let Constraints = "$src0 = $dst" in
+ def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
+ (ins f32x_info.RC:$src0, f32x_info.KRCWM:$mask,
+ VR128X:$src1, VR128X:$src2),
+ "vmovss\t{$src2, $src1, $dst {${mask}}|"#
+ "$dst {${mask}}, $src1, $src2}",
+ []>, EVEX_K, XS, EVEX_4V, VEX_LIG,
+ FoldGenData<"VMOVSSZrrk">,
+ Sched<[SchedWriteFShuffle.XMM]>;
+
+ def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
+ (ins f32x_info.KRCWM:$mask, VR128X:$src1, VR128X:$src2),
+ "vmovss\t{$src2, $src1, $dst {${mask}} {z}|"#
+ "$dst {${mask}} {z}, $src1, $src2}",
+ []>, EVEX_KZ, XS, EVEX_4V, VEX_LIG,
+ FoldGenData<"VMOVSSZrrkz">,
+ Sched<[SchedWriteFShuffle.XMM]>;
+
+ def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2),
+ "vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, XD, EVEX_4V, VEX_LIG, VEX_W,
+ FoldGenData<"VMOVSDZrr">,
+ Sched<[SchedWriteFShuffle.XMM]>;
+
+ let Constraints = "$src0 = $dst" in
+ def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
+ (ins f64x_info.RC:$src0, f64x_info.KRCWM:$mask,
+ VR128X:$src1, VR128X:$src2),
+ "vmovsd\t{$src2, $src1, $dst {${mask}}|"#
+ "$dst {${mask}}, $src1, $src2}",
+ []>, EVEX_K, XD, EVEX_4V, VEX_LIG,
+ VEX_W, FoldGenData<"VMOVSDZrrk">,
+ Sched<[SchedWriteFShuffle.XMM]>;
+
+ def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
+ (ins f64x_info.KRCWM:$mask, VR128X:$src1,
+ VR128X:$src2),
+ "vmovsd\t{$src2, $src1, $dst {${mask}} {z}|"#
+ "$dst {${mask}} {z}, $src1, $src2}",
+ []>, EVEX_KZ, XD, EVEX_4V, VEX_LIG,
+ VEX_W, FoldGenData<"VMOVSDZrrkz">,
+ Sched<[SchedWriteFShuffle.XMM]>;
+}
+
+def : InstAlias<"vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (VMOVSSZrr_REV VR128X:$dst, VR128X:$src1, VR128X:$src2), 0>;
+def : InstAlias<"vmovss.s\t{$src2, $src1, $dst {${mask}}|"#
+ "$dst {${mask}}, $src1, $src2}",
+ (VMOVSSZrrk_REV VR128X:$dst, VK1WM:$mask,
+ VR128X:$src1, VR128X:$src2), 0>;
+def : InstAlias<"vmovss.s\t{$src2, $src1, $dst {${mask}} {z}|"#
+ "$dst {${mask}} {z}, $src1, $src2}",
+ (VMOVSSZrrkz_REV VR128X:$dst, VK1WM:$mask,
+ VR128X:$src1, VR128X:$src2), 0>;
+def : InstAlias<"vmovsd.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (VMOVSDZrr_REV VR128X:$dst, VR128X:$src1, VR128X:$src2), 0>;
+def : InstAlias<"vmovsd.s\t{$src2, $src1, $dst {${mask}}|"#
+ "$dst {${mask}}, $src1, $src2}",
+ (VMOVSDZrrk_REV VR128X:$dst, VK1WM:$mask,
+ VR128X:$src1, VR128X:$src2), 0>;
+def : InstAlias<"vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"#
+ "$dst {${mask}} {z}, $src1, $src2}",
+ (VMOVSDZrrkz_REV VR128X:$dst, VK1WM:$mask,
+ VR128X:$src1, VR128X:$src2), 0>;
+
+let Predicates = [HasAVX512, OptForSize] in {
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
+ (VMOVSSZrr (v4f32 (AVX512_128_SET0)), VR128X:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
+ (VMOVSSZrr (v4i32 (AVX512_128_SET0)), VR128X:$src)>;
+
+ // Move low f32 and clear high bits.
+ def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
+ (v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))), sub_xmm)>;
+ def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
+ (v4i32 (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)))), sub_xmm)>;
+
+ def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
+ (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)))), sub_xmm)>;
+ def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
+ (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)))), sub_xmm)>;
+}
+
+// Use 128-bit blends for OptForSpeed since BLENDs have better throughput than
+// VMOVSS/SD. Unfortunately, this loses the ability to use XMM16-31.
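+// The blend immediates keep only the low element: i8 1 selects lane 0 from
+// the source for VBLENDPS, and i8 3 selects the low two 16-bit words (one i32
+// lane) for VPBLENDW; every other lane comes from the zeroed register.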
+let Predicates = [HasAVX512, OptForSpeed] in {
+ def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
+ (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)),
+ (i8 1))), sub_xmm)>;
+ def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
+ (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)),
+ (i8 3))), sub_xmm)>;
+}
+
+let Predicates = [HasAVX512] in {
+ def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (VMOVSSZrm addr:$src)>;
+ def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ (VMOVSDZrm addr:$src)>;
+
+  // Represent the same patterns as above, but in the form they appear for
+  // 256-bit types.
+ def : Pat<(v8f32 (X86vzload32 addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+ def : Pat<(v4f64 (X86vzload64 addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+
+  // Represent the same patterns as above, but in the form they appear for
+  // 512-bit types.
+ def : Pat<(v16f32 (X86vzload32 addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+ def : Pat<(v8f64 (X86vzload64 addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+}
+
+let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
+def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
+ (ins VR128X:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst, (v2i64 (X86vzmovl
+ (v2i64 VR128X:$src))))]>,
+ EVEX, VEX_W;
+}
+
+let Predicates = [HasAVX512] in {
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+ (VMOVDI2PDIZrr GR32:$src)>;
+
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+ (VMOV64toPQIZrr GR64:$src)>;
+
+  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
+ def : Pat<(v4i32 (X86vzload32 addr:$src)),
+ (VMOVDI2PDIZrm addr:$src)>;
+ def : Pat<(v8i32 (X86vzload32 addr:$src)),
+ (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
+ def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
+ (VMOVZPQILo2PQIZrr VR128X:$src)>;
+ def : Pat<(v2i64 (X86vzload64 addr:$src)),
+ (VMOVQI2PQIZrm addr:$src)>;
+ def : Pat<(v4i64 (X86vzload64 addr:$src)),
+ (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
+
+ // Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
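+  // (SUBREG_TO_REG with a zero immediate asserts that the upper bits of the
+  // wider register are already zero, which the vzload semantics guarantee.)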
+ def : Pat<(v16i32 (X86vzload32 addr:$src)),
+ (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
+ def : Pat<(v8i64 (X86vzload64 addr:$src)),
+ (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
+
+ def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2f64 (VMOVZPQILo2PQIZrr
+ (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))),
+ sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2i64 (VMOVZPQILo2PQIZrr
+ (v2i64 (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)))),
+ sub_xmm)>;
+
+ def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2f64 (VMOVZPQILo2PQIZrr
+ (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))),
+ sub_xmm)>;
+ def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2i64 (VMOVZPQILo2PQIZrr
+ (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)))),
+ sub_xmm)>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Non-temporals
+//===----------------------------------------------------------------------===//
+
+def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst),
+ (ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [], SSEPackedInt>, Sched<[SchedWriteVecMoveLS.ZMM.RM]>,
+ EVEX, T8PD, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+let Predicates = [HasVLX] in {
+ def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst),
+ (ins i256mem:$src),
+ "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [], SSEPackedInt>, Sched<[SchedWriteVecMoveLS.YMM.RM]>,
+ EVEX, T8PD, EVEX_V256, EVEX_CD8<64, CD8VF>;
+
+ def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst),
+ (ins i128mem:$src),
+ "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [], SSEPackedInt>, Sched<[SchedWriteVecMoveLS.XMM.RM]>,
+ EVEX, T8PD, EVEX_V128, EVEX_CD8<64, CD8VF>;
+}
+
+multiclass avx512_movnt<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86SchedWriteMoveLS Sched,
+ PatFrag st_frag = alignednontemporalstore> {
+ let SchedRW = [Sched.MR], AddedComplexity = 400 in
+ def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(st_frag (_.VT _.RC:$src), addr:$dst)],
+ _.ExeDomain>, EVEX, EVEX_CD8<_.EltSize, CD8VF>;
+}
+
+multiclass avx512_movnt_vl<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo,
+ X86SchedWriteMoveLSWidths Sched> {
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_movnt<opc, OpcodeStr, VTInfo.info512, Sched.ZMM>, EVEX_V512;
+
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z256 : avx512_movnt<opc, OpcodeStr, VTInfo.info256, Sched.YMM>, EVEX_V256;
+ defm Z128 : avx512_movnt<opc, OpcodeStr, VTInfo.info128, Sched.XMM>, EVEX_V128;
+ }
+}
+
+defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", avx512vl_i64_info,
+ SchedWriteVecMoveLSNT>, PD;
+defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", avx512vl_f64_info,
+ SchedWriteFMoveLSNT>, PD, VEX_W;
+defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", avx512vl_f32_info,
+ SchedWriteFMoveLSNT>, PS;
+
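+// AddedComplexity = 400 gives the non-temporal patterns priority over the
+// ordinary aligned load/store patterns for the same types.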
+let Predicates = [HasAVX512], AddedComplexity = 400 in {
+ def : Pat<(alignednontemporalstore (v16i32 VR512:$src), addr:$dst),
+ (VMOVNTDQZmr addr:$dst, VR512:$src)>;
+ def : Pat<(alignednontemporalstore (v32i16 VR512:$src), addr:$dst),
+ (VMOVNTDQZmr addr:$dst, VR512:$src)>;
+ def : Pat<(alignednontemporalstore (v64i8 VR512:$src), addr:$dst),
+ (VMOVNTDQZmr addr:$dst, VR512:$src)>;
+
+ def : Pat<(v8f64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZrm addr:$src)>;
+ def : Pat<(v16f32 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZrm addr:$src)>;
+ def : Pat<(v8i64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZrm addr:$src)>;
+ def : Pat<(v16i32 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZrm addr:$src)>;
+ def : Pat<(v32i16 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZrm addr:$src)>;
+ def : Pat<(v64i8 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZrm addr:$src)>;
+}
+
+let Predicates = [HasVLX], AddedComplexity = 400 in {
+ def : Pat<(alignednontemporalstore (v8i32 VR256X:$src), addr:$dst),
+ (VMOVNTDQZ256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(alignednontemporalstore (v16i16 VR256X:$src), addr:$dst),
+ (VMOVNTDQZ256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(alignednontemporalstore (v32i8 VR256X:$src), addr:$dst),
+ (VMOVNTDQZ256mr addr:$dst, VR256X:$src)>;
+
+ def : Pat<(v4f64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ256rm addr:$src)>;
+ def : Pat<(v8f32 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ256rm addr:$src)>;
+ def : Pat<(v4i64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ256rm addr:$src)>;
+ def : Pat<(v8i32 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ256rm addr:$src)>;
+ def : Pat<(v16i16 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ256rm addr:$src)>;
+ def : Pat<(v32i8 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ256rm addr:$src)>;
+
+ def : Pat<(alignednontemporalstore (v4i32 VR128X:$src), addr:$dst),
+ (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(alignednontemporalstore (v8i16 VR128X:$src), addr:$dst),
+ (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(alignednontemporalstore (v16i8 VR128X:$src), addr:$dst),
+ (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
+
+ def : Pat<(v2f64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ128rm addr:$src)>;
+ def : Pat<(v4f32 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ128rm addr:$src)>;
+ def : Pat<(v2i64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ128rm addr:$src)>;
+ def : Pat<(v4i32 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ128rm addr:$src)>;
+ def : Pat<(v8i16 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ128rm addr:$src)>;
+ def : Pat<(v16i8 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ128rm addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Integer arithmetic
+//===----------------------------------------------------------------------===//
+multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, X86FoldableSchedWrite sched,
+ bit IsCommutable = 0> {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+ IsCommutable, IsCommutable>, AVX512BIBase, EVEX_4V,
+ Sched<[sched]>;
+
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2)))>,
+ AVX512BIBase, EVEX_4V,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, X86FoldableSchedWrite sched,
+ bit IsCommutable = 0> :
+ avx512_binop_rm<opc, OpcodeStr, OpNode, _, sched, IsCommutable> {
+ defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr,
+ (_.VT (OpNode _.RC:$src1,
+ (_.BroadcastLdFrag addr:$src2)))>,
+ AVX512BIBase, EVEX_4V, EVEX_B,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+multiclass avx512_binop_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo,
+ X86SchedWriteWidths sched, Predicate prd,
+ bit IsCommutable = 0> {
+ let Predicates = [prd] in
+ defm Z : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info512, sched.ZMM,
+ IsCommutable>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info256,
+ sched.YMM, IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info128,
+ sched.XMM, IsCommutable>, EVEX_V128;
+ }
+}
+
+multiclass avx512_binop_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo,
+ X86SchedWriteWidths sched, Predicate prd,
+ bit IsCommutable = 0> {
+ let Predicates = [prd] in
+ defm Z : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, sched.ZMM,
+ IsCommutable>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info256,
+ sched.YMM, IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info128,
+ sched.XMM, IsCommutable>, EVEX_V128;
+ }
+}
+
+multiclass avx512_binop_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched, Predicate prd,
+ bit IsCommutable = 0> {
+ defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+ sched, prd, IsCommutable>,
+ VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+multiclass avx512_binop_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched, Predicate prd,
+ bit IsCommutable = 0> {
+ defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info,
+ sched, prd, IsCommutable>, EVEX_CD8<32, CD8VF>;
+}
+
+multiclass avx512_binop_rm_vl_w<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched, Predicate prd,
+ bit IsCommutable = 0> {
+ defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i16_info,
+ sched, prd, IsCommutable>, EVEX_CD8<16, CD8VF>,
+ VEX_WIG;
+}
+
+multiclass avx512_binop_rm_vl_b<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched, Predicate prd,
+ bit IsCommutable = 0> {
+ defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i8_info,
+ sched, prd, IsCommutable>, EVEX_CD8<8, CD8VF>,
+ VEX_WIG;
+}
+
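+// Convenience wrappers: the _q/_d/_w/_b multiclasses above fix the element
+// size, and the _dq/_bw/_all multiclasses below group them, with _all gating
+// the d/q forms on AVX512F and the b/w forms on BWI.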
+multiclass avx512_binop_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
+ SDNode OpNode, X86SchedWriteWidths sched,
+ Predicate prd, bit IsCommutable = 0> {
+ defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, sched, prd,
+ IsCommutable>;
+
+ defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, sched, prd,
+ IsCommutable>;
+}
+
+multiclass avx512_binop_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr,
+ SDNode OpNode, X86SchedWriteWidths sched,
+ Predicate prd, bit IsCommutable = 0> {
+ defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr#"w", OpNode, sched, prd,
+ IsCommutable>;
+
+ defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr#"b", OpNode, sched, prd,
+ IsCommutable>;
+}
+
+multiclass avx512_binop_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
+ bits<8> opc_d, bits<8> opc_q,
+ string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched,
+ bit IsCommutable = 0> {
+ defm NAME : avx512_binop_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode,
+ sched, HasAVX512, IsCommutable>,
+ avx512_binop_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode,
+ sched, HasBWI, IsCommutable>;
+}
+
+multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ SDNode OpNode,X86VectorVTInfo _Src,
+ X86VectorVTInfo _Dst, X86VectorVTInfo _Brdct,
+ bit IsCommutable = 0> {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
+ "$src2, $src1","$src1, $src2",
+ (_Dst.VT (OpNode
+ (_Src.VT _Src.RC:$src1),
+ (_Src.VT _Src.RC:$src2))),
+ IsCommutable>,
+ AVX512BIBase, EVEX_4V, Sched<[sched]>;
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
+ (_Src.LdFrag addr:$src2)))>,
+ AVX512BIBase, EVEX_4V,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+
+ defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src1, _Brdct.ScalarMemOp:$src2),
+ OpcodeStr,
+ "${src2}"#_Brdct.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_Brdct.BroadcastStr,
+ (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
+ (_Brdct.VT (_Brdct.BroadcastLdFrag addr:$src2)))))>,
+ AVX512BIBase, EVEX_4V, EVEX_B,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+defm VPADD : avx512_binop_rm_vl_all<0xFC, 0xFD, 0xFE, 0xD4, "vpadd", add,
+ SchedWriteVecALU, 1>;
+defm VPSUB : avx512_binop_rm_vl_all<0xF8, 0xF9, 0xFA, 0xFB, "vpsub", sub,
+ SchedWriteVecALU, 0>;
+defm VPADDS : avx512_binop_rm_vl_bw<0xEC, 0xED, "vpadds", saddsat,
+ SchedWriteVecALU, HasBWI, 1>;
+defm VPSUBS : avx512_binop_rm_vl_bw<0xE8, 0xE9, "vpsubs", ssubsat,
+ SchedWriteVecALU, HasBWI, 0>;
+defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", uaddsat,
+ SchedWriteVecALU, HasBWI, 1>;
+defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", usubsat,
+ SchedWriteVecALU, HasBWI, 0>;
+defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul,
+ SchedWritePMULLD, HasAVX512, 1>, T8PD;
+defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul,
+ SchedWriteVecIMul, HasBWI, 1>;
+defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul,
+ SchedWriteVecIMul, HasDQI, 1>, T8PD,
+ NotEVEX2VEXConvertible;
+defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulhw", mulhs, SchedWriteVecIMul,
+ HasBWI, 1>;
+defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SchedWriteVecIMul,
+ HasBWI, 1>;
+defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs,
+ SchedWriteVecIMul, HasBWI, 1>, T8PD;
+defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg,
+ SchedWriteVecALU, HasBWI, 1>;
+defm VPMULDQ : avx512_binop_rm_vl_q<0x28, "vpmuldq", X86pmuldq,
+ SchedWriteVecIMul, HasAVX512, 1>, T8PD;
+defm VPMULUDQ : avx512_binop_rm_vl_q<0xF4, "vpmuludq", X86pmuludq,
+ SchedWriteVecIMul, HasAVX512, 1>;
+
+multiclass avx512_binop_all<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _SrcVTInfo,
+ AVX512VLVectorVTInfo _DstVTInfo,
+ SDNode OpNode, Predicate prd, bit IsCommutable = 0> {
+ let Predicates = [prd] in
+ defm NAME#Z : avx512_binop_rm2<opc, OpcodeStr, sched.ZMM, OpNode,
+ _SrcVTInfo.info512, _DstVTInfo.info512,
+ v8i64_info, IsCommutable>,
+ EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W;
+ let Predicates = [HasVLX, prd] in {
+ defm NAME#Z256 : avx512_binop_rm2<opc, OpcodeStr, sched.YMM, OpNode,
+ _SrcVTInfo.info256, _DstVTInfo.info256,
+ v4i64x_info, IsCommutable>,
+ EVEX_V256, EVEX_CD8<64, CD8VF>, VEX_W;
+ defm NAME#Z128 : avx512_binop_rm2<opc, OpcodeStr, sched.XMM, OpNode,
+ _SrcVTInfo.info128, _DstVTInfo.info128,
+ v2i64x_info, IsCommutable>,
+ EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_W;
+ }
+}
+
+defm VPMULTISHIFTQB : avx512_binop_all<0x83, "vpmultishiftqb", SchedWriteVecALU,
+ avx512vl_i8_info, avx512vl_i8_info,
+ X86multishift, HasVBMI, 0>, T8PD;
+
+multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _Src, X86VectorVTInfo _Dst,
+ X86FoldableSchedWrite sched> {
+ defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2),
+ OpcodeStr,
+ "${src2}"#_Src.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_Src.BroadcastStr,
+ (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
+ (_Src.VT (_Src.BroadcastLdFrag addr:$src2)))))>,
+ EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode,X86VectorVTInfo _Src,
+ X86VectorVTInfo _Dst, X86FoldableSchedWrite sched,
+ bit IsCommutable = 0> {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
+ "$src2, $src1","$src1, $src2",
+ (_Dst.VT (OpNode
+ (_Src.VT _Src.RC:$src1),
+ (_Src.VT _Src.RC:$src2))),
+ IsCommutable, IsCommutable>,
+ EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V, Sched<[sched]>;
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
+ (_Src.LdFrag addr:$src2)))>,
+ EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+multiclass avx512_packs_all_i32_i16<bits<8> opc, string OpcodeStr,
+ SDNode OpNode> {
+ let Predicates = [HasBWI] in
+ defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i32_info,
+ v32i16_info, SchedWriteShuffle.ZMM>,
+ avx512_packs_rmb<opc, OpcodeStr, OpNode, v16i32_info,
+ v32i16_info, SchedWriteShuffle.ZMM>, EVEX_V512;
+ let Predicates = [HasBWI, HasVLX] in {
+ defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i32x_info,
+ v16i16x_info, SchedWriteShuffle.YMM>,
+ avx512_packs_rmb<opc, OpcodeStr, OpNode, v8i32x_info,
+ v16i16x_info, SchedWriteShuffle.YMM>,
+ EVEX_V256;
+ defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v4i32x_info,
+ v8i16x_info, SchedWriteShuffle.XMM>,
+ avx512_packs_rmb<opc, OpcodeStr, OpNode, v4i32x_info,
+ v8i16x_info, SchedWriteShuffle.XMM>,
+ EVEX_V128;
+ }
+}
+multiclass avx512_packs_all_i16_i8<bits<8> opc, string OpcodeStr,
+ SDNode OpNode> {
+ let Predicates = [HasBWI] in
+ defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v32i16_info, v64i8_info,
+ SchedWriteShuffle.ZMM>, EVEX_V512, VEX_WIG;
+ let Predicates = [HasBWI, HasVLX] in {
+ defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i16x_info,
+ v32i8x_info, SchedWriteShuffle.YMM>,
+ EVEX_V256, VEX_WIG;
+ defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i16x_info,
+ v16i8x_info, SchedWriteShuffle.XMM>,
+ EVEX_V128, VEX_WIG;
+ }
+}
+
+multiclass avx512_vpmadd<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, AVX512VLVectorVTInfo _Src,
+ AVX512VLVectorVTInfo _Dst, bit IsCommutable = 0> {
+ let Predicates = [HasBWI] in
+ defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info512,
+ _Dst.info512, SchedWriteVecIMul.ZMM,
+ IsCommutable>, EVEX_V512;
+ let Predicates = [HasBWI, HasVLX] in {
+ defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info256,
+ _Dst.info256, SchedWriteVecIMul.YMM,
+ IsCommutable>, EVEX_V256;
+ defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info128,
+ _Dst.info128, SchedWriteVecIMul.XMM,
+ IsCommutable>, EVEX_V128;
+ }
+}
+
+defm VPACKSSDW : avx512_packs_all_i32_i16<0x6B, "vpackssdw", X86Packss>, AVX512BIBase;
+defm VPACKUSDW : avx512_packs_all_i32_i16<0x2b, "vpackusdw", X86Packus>, AVX5128IBase;
+defm VPACKSSWB : avx512_packs_all_i16_i8 <0x63, "vpacksswb", X86Packss>, AVX512BIBase;
+defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase;
+
+defm VPMADDUBSW : avx512_vpmadd<0x04, "vpmaddubsw", X86vpmaddubsw,
+ avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD, VEX_WIG;
+defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd,
+ avx512vl_i16_info, avx512vl_i32_info, 1>, AVX512BIBase, VEX_WIG;
+
+defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxsb", smax,
+ SchedWriteVecALU, HasBWI, 1>, T8PD;
+defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxsw", smax,
+ SchedWriteVecALU, HasBWI, 1>;
+defm VPMAXSD : avx512_binop_rm_vl_d<0x3D, "vpmaxsd", smax,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD;
+defm VPMAXSQ : avx512_binop_rm_vl_q<0x3D, "vpmaxsq", smax,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD,
+ NotEVEX2VEXConvertible;
+
+defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxub", umax,
+ SchedWriteVecALU, HasBWI, 1>;
+defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxuw", umax,
+ SchedWriteVecALU, HasBWI, 1>, T8PD;
+defm VPMAXUD : avx512_binop_rm_vl_d<0x3F, "vpmaxud", umax,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD;
+defm VPMAXUQ : avx512_binop_rm_vl_q<0x3F, "vpmaxuq", umax,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD,
+ NotEVEX2VEXConvertible;
+
+defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpminsb", smin,
+ SchedWriteVecALU, HasBWI, 1>, T8PD;
+defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpminsw", smin,
+ SchedWriteVecALU, HasBWI, 1>;
+defm VPMINSD : avx512_binop_rm_vl_d<0x39, "vpminsd", smin,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD;
+defm VPMINSQ : avx512_binop_rm_vl_q<0x39, "vpminsq", smin,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD,
+ NotEVEX2VEXConvertible;
+
+defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminub", umin,
+ SchedWriteVecALU, HasBWI, 1>;
+defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminuw", umin,
+ SchedWriteVecALU, HasBWI, 1>, T8PD;
+defm VPMINUD : avx512_binop_rm_vl_d<0x3B, "vpminud", umin,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD;
+defm VPMINUQ : avx512_binop_rm_vl_q<0x3B, "vpminuq", umin,
+ SchedWriteVecALU, HasAVX512, 1>, T8PD,
+ NotEVEX2VEXConvertible;
+
+// PMULLQ: Use the 512-bit version to implement the 128/256-bit operations when
+// VLX is not available.
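+// For example, a v4i64 multiply is lowered by inserting both YMM operands
+// into ZMM registers, executing VPMULLQZrr, and extracting the low 256 bits
+// of the result; the upper lanes are computed but ignored.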
+let Predicates = [HasDQI, NoVLX] in {
+ def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
+ (EXTRACT_SUBREG
+ (VPMULLQZrr
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
+ sub_ymm)>;
+ def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 (X86VBroadcastld64 addr:$src2)))),
+ (EXTRACT_SUBREG
+ (VPMULLQZrmb
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
+ addr:$src2),
+ sub_ymm)>;
+
+ def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
+ (EXTRACT_SUBREG
+ (VPMULLQZrr
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
+ sub_xmm)>;
+ def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 (X86VBroadcastld64 addr:$src2)))),
+ (EXTRACT_SUBREG
+ (VPMULLQZrmb
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
+ addr:$src2),
+ sub_xmm)>;
+}
+
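+// Likewise implement the 128/256-bit quadword min/max via the 512-bit
+// instructions when VLX is unavailable, using the same widen/execute/extract
+// approach as PMULLQ above.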
+multiclass avx512_min_max_lowering<string Instr, SDNode OpNode> {
+ def : Pat<(v4i64 (OpNode VR256X:$src1, VR256X:$src2)),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(Instr#"rr")
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
+ sub_ymm)>;
+ def : Pat<(v4i64 (OpNode (v4i64 VR256X:$src1), (v4i64 (X86VBroadcastld64 addr:$src2)))),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(Instr#"rmb")
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
+ addr:$src2),
+ sub_ymm)>;
+
+ def : Pat<(v2i64 (OpNode VR128X:$src1, VR128X:$src2)),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(Instr#"rr")
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
+ sub_xmm)>;
+ def : Pat<(v2i64 (OpNode (v2i64 VR128X:$src1), (v2i64 (X86VBroadcastld64 addr:$src2)))),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(Instr#"rmb")
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
+ addr:$src2),
+ sub_xmm)>;
+}
+
+let Predicates = [HasAVX512, NoVLX] in {
+ defm : avx512_min_max_lowering<"VPMAXUQZ", umax>;
+ defm : avx512_min_max_lowering<"VPMINUQZ", umin>;
+ defm : avx512_min_max_lowering<"VPMAXSQZ", smax>;
+ defm : avx512_min_max_lowering<"VPMINSQZ", smin>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Logical Instructions
+//===----------------------------------------------------------------------===//
+
+defm VPAND : avx512_binop_rm_vl_dq<0xDB, 0xDB, "vpand", and,
+ SchedWriteVecLogic, HasAVX512, 1>;
+defm VPOR : avx512_binop_rm_vl_dq<0xEB, 0xEB, "vpor", or,
+ SchedWriteVecLogic, HasAVX512, 1>;
+defm VPXOR : avx512_binop_rm_vl_dq<0xEF, 0xEF, "vpxor", xor,
+ SchedWriteVecLogic, HasAVX512, 1>;
+defm VPANDN : avx512_binop_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp,
+ SchedWriteVecLogic, HasAVX512>;
+
+let Predicates = [HasVLX] in {
+ def : Pat<(v16i8 (and VR128X:$src1, VR128X:$src2)),
+ (VPANDQZ128rr VR128X:$src1, VR128X:$src2)>;
+ def : Pat<(v8i16 (and VR128X:$src1, VR128X:$src2)),
+ (VPANDQZ128rr VR128X:$src1, VR128X:$src2)>;
+
+ def : Pat<(v16i8 (or VR128X:$src1, VR128X:$src2)),
+ (VPORQZ128rr VR128X:$src1, VR128X:$src2)>;
+ def : Pat<(v8i16 (or VR128X:$src1, VR128X:$src2)),
+ (VPORQZ128rr VR128X:$src1, VR128X:$src2)>;
+
+ def : Pat<(v16i8 (xor VR128X:$src1, VR128X:$src2)),
+ (VPXORQZ128rr VR128X:$src1, VR128X:$src2)>;
+ def : Pat<(v8i16 (xor VR128X:$src1, VR128X:$src2)),
+ (VPXORQZ128rr VR128X:$src1, VR128X:$src2)>;
+
+ def : Pat<(v16i8 (X86andnp VR128X:$src1, VR128X:$src2)),
+ (VPANDNQZ128rr VR128X:$src1, VR128X:$src2)>;
+ def : Pat<(v8i16 (X86andnp VR128X:$src1, VR128X:$src2)),
+ (VPANDNQZ128rr VR128X:$src1, VR128X:$src2)>;
+
+ def : Pat<(and VR128X:$src1, (loadv16i8 addr:$src2)),
+ (VPANDQZ128rm VR128X:$src1, addr:$src2)>;
+ def : Pat<(and VR128X:$src1, (loadv8i16 addr:$src2)),
+ (VPANDQZ128rm VR128X:$src1, addr:$src2)>;
+
+ def : Pat<(or VR128X:$src1, (loadv16i8 addr:$src2)),
+ (VPORQZ128rm VR128X:$src1, addr:$src2)>;
+ def : Pat<(or VR128X:$src1, (loadv8i16 addr:$src2)),
+ (VPORQZ128rm VR128X:$src1, addr:$src2)>;
+
+ def : Pat<(xor VR128X:$src1, (loadv16i8 addr:$src2)),
+ (VPXORQZ128rm VR128X:$src1, addr:$src2)>;
+ def : Pat<(xor VR128X:$src1, (loadv8i16 addr:$src2)),
+ (VPXORQZ128rm VR128X:$src1, addr:$src2)>;
+
+ def : Pat<(X86andnp VR128X:$src1, (loadv16i8 addr:$src2)),
+ (VPANDNQZ128rm VR128X:$src1, addr:$src2)>;
+ def : Pat<(X86andnp VR128X:$src1, (loadv8i16 addr:$src2)),
+ (VPANDNQZ128rm VR128X:$src1, addr:$src2)>;
+
+ def : Pat<(v32i8 (and VR256X:$src1, VR256X:$src2)),
+ (VPANDQZ256rr VR256X:$src1, VR256X:$src2)>;
+ def : Pat<(v16i16 (and VR256X:$src1, VR256X:$src2)),
+ (VPANDQZ256rr VR256X:$src1, VR256X:$src2)>;
+
+ def : Pat<(v32i8 (or VR256X:$src1, VR256X:$src2)),
+ (VPORQZ256rr VR256X:$src1, VR256X:$src2)>;
+ def : Pat<(v16i16 (or VR256X:$src1, VR256X:$src2)),
+ (VPORQZ256rr VR256X:$src1, VR256X:$src2)>;
+
+ def : Pat<(v32i8 (xor VR256X:$src1, VR256X:$src2)),
+ (VPXORQZ256rr VR256X:$src1, VR256X:$src2)>;
+ def : Pat<(v16i16 (xor VR256X:$src1, VR256X:$src2)),
+ (VPXORQZ256rr VR256X:$src1, VR256X:$src2)>;
+
+ def : Pat<(v32i8 (X86andnp VR256X:$src1, VR256X:$src2)),
+ (VPANDNQZ256rr VR256X:$src1, VR256X:$src2)>;
+ def : Pat<(v16i16 (X86andnp VR256X:$src1, VR256X:$src2)),
+ (VPANDNQZ256rr VR256X:$src1, VR256X:$src2)>;
+
+ def : Pat<(and VR256X:$src1, (loadv32i8 addr:$src2)),
+ (VPANDQZ256rm VR256X:$src1, addr:$src2)>;
+ def : Pat<(and VR256X:$src1, (loadv16i16 addr:$src2)),
+ (VPANDQZ256rm VR256X:$src1, addr:$src2)>;
+
+ def : Pat<(or VR256X:$src1, (loadv32i8 addr:$src2)),
+ (VPORQZ256rm VR256X:$src1, addr:$src2)>;
+ def : Pat<(or VR256X:$src1, (loadv16i16 addr:$src2)),
+ (VPORQZ256rm VR256X:$src1, addr:$src2)>;
+
+ def : Pat<(xor VR256X:$src1, (loadv32i8 addr:$src2)),
+ (VPXORQZ256rm VR256X:$src1, addr:$src2)>;
+ def : Pat<(xor VR256X:$src1, (loadv16i16 addr:$src2)),
+ (VPXORQZ256rm VR256X:$src1, addr:$src2)>;
+
+ def : Pat<(X86andnp VR256X:$src1, (loadv32i8 addr:$src2)),
+ (VPANDNQZ256rm VR256X:$src1, addr:$src2)>;
+ def : Pat<(X86andnp VR256X:$src1, (loadv16i16 addr:$src2)),
+ (VPANDNQZ256rm VR256X:$src1, addr:$src2)>;
+}
+
+let Predicates = [HasAVX512] in {
+ def : Pat<(v64i8 (and VR512:$src1, VR512:$src2)),
+ (VPANDQZrr VR512:$src1, VR512:$src2)>;
+ def : Pat<(v32i16 (and VR512:$src1, VR512:$src2)),
+ (VPANDQZrr VR512:$src1, VR512:$src2)>;
+
+ def : Pat<(v64i8 (or VR512:$src1, VR512:$src2)),
+ (VPORQZrr VR512:$src1, VR512:$src2)>;
+ def : Pat<(v32i16 (or VR512:$src1, VR512:$src2)),
+ (VPORQZrr VR512:$src1, VR512:$src2)>;
+
+ def : Pat<(v64i8 (xor VR512:$src1, VR512:$src2)),
+ (VPXORQZrr VR512:$src1, VR512:$src2)>;
+ def : Pat<(v32i16 (xor VR512:$src1, VR512:$src2)),
+ (VPXORQZrr VR512:$src1, VR512:$src2)>;
+
+ def : Pat<(v64i8 (X86andnp VR512:$src1, VR512:$src2)),
+ (VPANDNQZrr VR512:$src1, VR512:$src2)>;
+ def : Pat<(v32i16 (X86andnp VR512:$src1, VR512:$src2)),
+ (VPANDNQZrr VR512:$src1, VR512:$src2)>;
+
+ def : Pat<(and VR512:$src1, (loadv64i8 addr:$src2)),
+ (VPANDQZrm VR512:$src1, addr:$src2)>;
+ def : Pat<(and VR512:$src1, (loadv32i16 addr:$src2)),
+ (VPANDQZrm VR512:$src1, addr:$src2)>;
+
+ def : Pat<(or VR512:$src1, (loadv64i8 addr:$src2)),
+ (VPORQZrm VR512:$src1, addr:$src2)>;
+ def : Pat<(or VR512:$src1, (loadv32i16 addr:$src2)),
+ (VPORQZrm VR512:$src1, addr:$src2)>;
+
+ def : Pat<(xor VR512:$src1, (loadv64i8 addr:$src2)),
+ (VPXORQZrm VR512:$src1, addr:$src2)>;
+ def : Pat<(xor VR512:$src1, (loadv32i16 addr:$src2)),
+ (VPXORQZrm VR512:$src1, addr:$src2)>;
+
+ def : Pat<(X86andnp VR512:$src1, (loadv64i8 addr:$src2)),
+ (VPANDNQZrm VR512:$src1, addr:$src2)>;
+ def : Pat<(X86andnp VR512:$src1, (loadv32i16 addr:$src2)),
+ (VPANDNQZrm VR512:$src1, addr:$src2)>;
+}
+
+// Patterns to catch a vselect whose type differs from that of the logic op.
+multiclass avx512_logical_lowering<string InstrStr, SDNode OpNode,
+ X86VectorVTInfo _,
+ X86VectorVTInfo IntInfo> {
+ // Masked register-register logical operations.
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (bitconvert (IntInfo.VT (OpNode _.RC:$src1, _.RC:$src2))),
+ _.RC:$src0)),
+ (!cast<Instruction>(InstrStr#rrk) _.RC:$src0, _.KRCWM:$mask,
+ _.RC:$src1, _.RC:$src2)>;
+
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (bitconvert (IntInfo.VT (OpNode _.RC:$src1, _.RC:$src2))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(InstrStr#rrkz) _.KRCWM:$mask, _.RC:$src1,
+ _.RC:$src2)>;
+
+ // Masked register-memory logical operations.
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (bitconvert (IntInfo.VT (OpNode _.RC:$src1,
+ (load addr:$src2)))),
+ _.RC:$src0)),
+ (!cast<Instruction>(InstrStr#rmk) _.RC:$src0, _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2)>;
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (bitconvert (IntInfo.VT (OpNode _.RC:$src1,
+ (load addr:$src2)))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(InstrStr#rmkz) _.KRCWM:$mask, _.RC:$src1,
+ addr:$src2)>;
+}
+
+multiclass avx512_logical_lowering_bcast<string InstrStr, SDNode OpNode,
+ X86VectorVTInfo _,
+ X86VectorVTInfo IntInfo> {
+ // Register-broadcast logical operations.
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (bitconvert
+ (IntInfo.VT (OpNode _.RC:$src1,
+ (IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))),
+ _.RC:$src0)),
+ (!cast<Instruction>(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2)>;
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (bitconvert
+ (IntInfo.VT (OpNode _.RC:$src1,
+ (IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(InstrStr#rmbkz) _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2)>;
+}
+
+multiclass avx512_logical_lowering_sizes<string InstrStr, SDNode OpNode,
+ AVX512VLVectorVTInfo SelectInfo,
+ AVX512VLVectorVTInfo IntInfo> {
+let Predicates = [HasVLX] in {
+ defm : avx512_logical_lowering<InstrStr#"Z128", OpNode, SelectInfo.info128,
+ IntInfo.info128>;
+ defm : avx512_logical_lowering<InstrStr#"Z256", OpNode, SelectInfo.info256,
+ IntInfo.info256>;
+}
+let Predicates = [HasAVX512] in {
+ defm : avx512_logical_lowering<InstrStr#"Z", OpNode, SelectInfo.info512,
+ IntInfo.info512>;
+}
+}
+
+multiclass avx512_logical_lowering_sizes_bcast<string InstrStr, SDNode OpNode,
+ AVX512VLVectorVTInfo SelectInfo,
+ AVX512VLVectorVTInfo IntInfo> {
+let Predicates = [HasVLX] in {
+ defm : avx512_logical_lowering_bcast<InstrStr#"Z128", OpNode,
+ SelectInfo.info128, IntInfo.info128>;
+ defm : avx512_logical_lowering_bcast<InstrStr#"Z256", OpNode,
+ SelectInfo.info256, IntInfo.info256>;
+}
+let Predicates = [HasAVX512] in {
+ defm : avx512_logical_lowering_bcast<InstrStr#"Z", OpNode,
+ SelectInfo.info512, IntInfo.info512>;
+}
+}
+
+multiclass avx512_logical_lowering_types<string InstrStr, SDNode OpNode> {
+ // i64 vselect with i32/i16/i8 logic op
+ defm : avx512_logical_lowering_sizes<InstrStr#"Q", OpNode, avx512vl_i64_info,
+ avx512vl_i32_info>;
+ defm : avx512_logical_lowering_sizes<InstrStr#"Q", OpNode, avx512vl_i64_info,
+ avx512vl_i16_info>;
+ defm : avx512_logical_lowering_sizes<InstrStr#"Q", OpNode, avx512vl_i64_info,
+ avx512vl_i8_info>;
+
+ // i32 vselect with i64/i16/i8 logic op
+ defm : avx512_logical_lowering_sizes<InstrStr#"D", OpNode, avx512vl_i32_info,
+ avx512vl_i64_info>;
+ defm : avx512_logical_lowering_sizes<InstrStr#"D", OpNode, avx512vl_i32_info,
+ avx512vl_i16_info>;
+ defm : avx512_logical_lowering_sizes<InstrStr#"D", OpNode, avx512vl_i32_info,
+ avx512vl_i8_info>;
+
+ // f32 vselect with i64/i32/i16/i8 logic op
+ defm : avx512_logical_lowering_sizes<InstrStr#"D", OpNode, avx512vl_f32_info,
+ avx512vl_i64_info>;
+ defm : avx512_logical_lowering_sizes<InstrStr#"D", OpNode, avx512vl_f32_info,
+ avx512vl_i32_info>;
+ defm : avx512_logical_lowering_sizes<InstrStr#"D", OpNode, avx512vl_f32_info,
+ avx512vl_i16_info>;
+ defm : avx512_logical_lowering_sizes<InstrStr#"D", OpNode, avx512vl_f32_info,
+ avx512vl_i8_info>;
+
+ // f64 vselect with i64/i32/i16/i8 logic op
+ defm : avx512_logical_lowering_sizes<InstrStr#"Q", OpNode, avx512vl_f64_info,
+ avx512vl_i64_info>;
+ defm : avx512_logical_lowering_sizes<InstrStr#"Q", OpNode, avx512vl_f64_info,
+ avx512vl_i32_info>;
+ defm : avx512_logical_lowering_sizes<InstrStr#"Q", OpNode, avx512vl_f64_info,
+ avx512vl_i16_info>;
+ defm : avx512_logical_lowering_sizes<InstrStr#"Q", OpNode, avx512vl_f64_info,
+ avx512vl_i8_info>;
+
+ defm : avx512_logical_lowering_sizes_bcast<InstrStr#"D", OpNode,
+ avx512vl_f32_info,
+ avx512vl_i32_info>;
+ defm : avx512_logical_lowering_sizes_bcast<InstrStr#"Q", OpNode,
+ avx512vl_f64_info,
+ avx512vl_i64_info>;
+}
+
+defm : avx512_logical_lowering_types<"VPAND", and>;
+defm : avx512_logical_lowering_types<"VPOR", or>;
+defm : avx512_logical_lowering_types<"VPXOR", xor>;
+defm : avx512_logical_lowering_types<"VPANDN", X86andnp>;
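An illustrative sketch (not part of the patch) of a source-level shape the avx512_logical_lowering patterns above are intended to catch: the logical AND is formed at 64-bit granularity while the merge mask applies per 32-bit element, so the vselect type differs from the logic type. The function name is hypothetical; the intrinsics are standard AVX-512F.

    #include <immintrin.h>

    // _mm512_and_si512 is a whole-register AND; _mm512_mask_mov_epi32 merges the
    // result into src under a per-dword mask. With patterns like the ones above,
    // the backend can fold the pair into a single masked vpandd-style instruction.
    __m512i masked_and_dwords(__m512i src, __mmask16 k, __m512i a, __m512i b) {
      __m512i logic = _mm512_and_si512(a, b);
      return _mm512_mask_mov_epi32(src, k, logic);
    }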
+
+//===----------------------------------------------------------------------===//
+// AVX-512 FP arithmetic
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
+ SDNode OpNode, SDNode VecNode,
+ X86FoldableSchedWrite sched, bit IsCommutable> {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
+ defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (VecNode _.RC:$src1, _.RC:$src2))>,
+ Sched<[sched]>;
+
+ defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (VecNode _.RC:$src1,
+ (_.ScalarIntMemFrags addr:$src2)))>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
+ def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>,
+ Sched<[sched]> {
+ let isCommutable = IsCommutable;
+ }
+ def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1,
+ (_.ScalarLdFrag addr:$src2)))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+ }
+}
+
+multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
+ SDNode VecNode, X86FoldableSchedWrite sched,
+ bit IsCommutable = 0> {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
+ defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
+ "$rc, $src2, $src1", "$src1, $src2, $rc",
+ (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ (i32 timm:$rc))>,
+ EVEX_B, EVEX_RC, Sched<[sched]>;
+}
+multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
+ SDNode OpNode, SDNode VecNode, SDNode SaeNode,
+ X86FoldableSchedWrite sched, bit IsCommutable,
+ string EVEX2VexOvrd> {
+ let ExeDomain = _.ExeDomain in {
+ defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (VecNode _.RC:$src1, _.RC:$src2))>,
+ Sched<[sched]>, SIMD_EXC;
+
+ defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (VecNode _.RC:$src1,
+ (_.ScalarIntMemFrags addr:$src2)))>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+
+ let isCodeGenOnly = 1, Predicates = [HasAVX512],
+ Uses = [MXCSR], mayRaiseFPException = 1 in {
+ def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>,
+ Sched<[sched]>,
+ EVEX2VEXOverride<EVEX2VexOvrd#"rr"> {
+ let isCommutable = IsCommutable;
+ }
+ def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1,
+ (_.ScalarLdFrag addr:$src2)))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>,
+ EVEX2VEXOverride<EVEX2VexOvrd#"rm">;
+ }
+
+ let Uses = [MXCSR] in
+ defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+ (SaeNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
+ EVEX_B, Sched<[sched]>;
+ }
+}
+
+multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode VecNode, SDNode RndNode,
+ X86SchedWriteSizes sched, bit IsCommutable> {
+ defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode,
+ sched.PS.Scl, IsCommutable>,
+ avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, RndNode,
+ sched.PS.Scl, IsCommutable>,
+ XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode,
+ sched.PD.Scl, IsCommutable>,
+ avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, RndNode,
+ sched.PD.Scl, IsCommutable>,
+ XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+}
+
+multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode VecNode, SDNode SaeNode,
+ X86SchedWriteSizes sched, bit IsCommutable> {
+ defm SSZ : avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, OpNode,
+ VecNode, SaeNode, sched.PS.Scl, IsCommutable,
+ NAME#"SS">,
+ XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ defm SDZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, OpNode,
+ VecNode, SaeNode, sched.PD.Scl, IsCommutable,
+ NAME#"SD">,
+ XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+}
+defm VADD : avx512_binop_s_round<0x58, "vadd", any_fadd, X86fadds, X86faddRnds,
+ SchedWriteFAddSizes, 1>;
+defm VMUL : avx512_binop_s_round<0x59, "vmul", any_fmul, X86fmuls, X86fmulRnds,
+ SchedWriteFMulSizes, 1>;
+defm VSUB : avx512_binop_s_round<0x5C, "vsub", any_fsub, X86fsubs, X86fsubRnds,
+ SchedWriteFAddSizes, 0>;
+defm VDIV : avx512_binop_s_round<0x5E, "vdiv", any_fdiv, X86fdivs, X86fdivRnds,
+ SchedWriteFDivSizes, 0>;
+defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86fmin, X86fmins, X86fminSAEs,
+ SchedWriteFCmpSizes, 0>;
+defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxSAEs,
+ SchedWriteFCmpSizes, 0>;
+
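As a brief aside (not part of the patch), the rrb_Int forms produced by avx512_fp_scalar_round above expose static (embedded) rounding; at the source level this corresponds to the *_round_* intrinsics, which encode the rounding mode in the instruction instead of reading MXCSR. A minimal sketch, assuming AVX-512F; the function name is hypothetical.

    #include <immintrin.h>

    // vaddss with embedded rounding {rn-sae}: round to nearest even, exceptions
    // suppressed, independent of the current MXCSR rounding mode.
    __m128 add_ss_rne(__m128 a, __m128 b) {
      return _mm_add_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    }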
+// MIN/MAX nodes are commutable under "unsafe-fp-math". In this case we use
+// X86fminc and X86fmaxc instead of X86fmin and X86fmax.
+multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _, SDNode OpNode,
+ X86FoldableSchedWrite sched,
+ string EVEX2VEXOvrd> {
+ let isCodeGenOnly = 1, Predicates = [HasAVX512], ExeDomain = _.ExeDomain in {
+ def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>,
+ Sched<[sched]>, EVEX2VEXOverride<EVEX2VEXOvrd#"rr"> {
+ let isCommutable = 1;
+ }
+ def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1,
+ (_.ScalarLdFrag addr:$src2)))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>,
+ EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
+ }
+}
+defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc,
+ SchedWriteFCmp.Scl, "VMINCSS">, XS,
+ EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC;
+
+defm VMINCSDZ : avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc,
+ SchedWriteFCmp.Scl, "VMINCSD">, XD,
+ VEX_W, EVEX_4V, VEX_LIG,
+ EVEX_CD8<64, CD8VT1>, SIMD_EXC;
+
+defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc,
+ SchedWriteFCmp.Scl, "VMAXCSS">, XS,
+ EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC;
+
+defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc,
+ SchedWriteFCmp.Scl, "VMAXCSD">, XD,
+ VEX_W, EVEX_4V, VEX_LIG,
+ EVEX_CD8<64, CD8VT1>, SIMD_EXC;
+
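A short illustration (not part of the patch) of why the commutable X86fminc/X86fmaxc nodes are kept separate from X86fmin/X86fmax: x86 MINSS/MAXSS return the second source operand whenever either input is NaN (and on signed-zero ties), so swapping operands can change the result unless fast-math relaxations apply. Function names are hypothetical; the intrinsics are standard SSE.

    #include <immintrin.h>
    #include <cmath>

    float min_nan_first(float x) {
      // minss(NaN, x): either input NaN -> result is the second operand, i.e. x.
      return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(std::nanf("")), _mm_set_ss(x)));
    }

    float min_nan_second(float x) {
      // minss(x, NaN): result is the second operand, i.e. NaN.
      return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(x), _mm_set_ss(std::nanf(""))));
    }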
+multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
+ SDPatternOperator MaskOpNode,
+ X86VectorVTInfo _, X86FoldableSchedWrite sched,
+ bit IsCommutable,
+ bit IsKCommutable = IsCommutable> {
+ let ExeDomain = _.ExeDomain, hasSideEffects = 0,
+ Uses = [MXCSR], mayRaiseFPException = 1 in {
+ defm rr: AVX512_maskable_split<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+ (_.VT (MaskOpNode _.RC:$src1, _.RC:$src2)), IsCommutable,
+ IsKCommutable, IsKCommutable>,
+ EVEX_4V, Sched<[sched]>;
+ let mayLoad = 1 in {
+ defm rm: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr#_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode _.RC:$src1, (_.LdFrag addr:$src2)),
+ (MaskOpNode _.RC:$src1, (_.LdFrag addr:$src2))>,
+ EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ defm rmb: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr#_.Suffix,
+ "${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr,
+ (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))),
+ (MaskOpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>,
+ EVEX_4V, EVEX_B,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+ }
+}
+
+multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNodeRnd,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
+ defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr#_.Suffix,
+ "$rc, $src2, $src1", "$src1, $src2, $rc",
+ (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 timm:$rc)))>,
+ EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
+}
+
+multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNodeSAE,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
+ defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix,
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+ (_.VT (OpNodeSAE _.RC:$src1, _.RC:$src2))>,
+ EVEX_4V, EVEX_B, Sched<[sched]>;
+}
+
+multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
+ SDPatternOperator MaskOpNode,
+ Predicate prd, X86SchedWriteSizes sched,
+ bit IsCommutable = 0,
+ bit IsPD128Commutable = IsCommutable> {
+ let Predicates = [prd] in {
+ defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v16f32_info,
+ sched.PS.ZMM, IsCommutable>, EVEX_V512, PS,
+ EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v8f64_info,
+ sched.PD.ZMM, IsCommutable>, EVEX_V512, PD, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+ }
+
+  // Define only if the AVX512VL feature is present.
+ let Predicates = [prd, HasVLX] in {
+ defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v4f32x_info,
+ sched.PS.XMM, IsCommutable>, EVEX_V128, PS,
+ EVEX_CD8<32, CD8VF>;
+ defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v8f32x_info,
+ sched.PS.YMM, IsCommutable>, EVEX_V256, PS,
+ EVEX_CD8<32, CD8VF>;
+ defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v2f64x_info,
+ sched.PD.XMM, IsPD128Commutable,
+ IsCommutable>, EVEX_V128, PD, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+ defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v4f64x_info,
+ sched.PD.YMM, IsCommutable>, EVEX_V256, PD, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+ }
+}
+
+let Uses = [MXCSR] in
+multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
+ X86SchedWriteSizes sched> {
+ defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, sched.PS.ZMM,
+ v16f32_info>,
+ EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, sched.PD.ZMM,
+ v8f64_info>,
+ EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
+}
+
+let Uses = [MXCSR] in
+multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
+ X86SchedWriteSizes sched> {
+ defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, sched.PS.ZMM,
+ v16f32_info>,
+ EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, sched.PD.ZMM,
+ v8f64_info>,
+ EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
+}
+
+defm VADD : avx512_fp_binop_p<0x58, "vadd", any_fadd, fadd, HasAVX512,
+ SchedWriteFAddSizes, 1>,
+ avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd, SchedWriteFAddSizes>;
+defm VMUL : avx512_fp_binop_p<0x59, "vmul", any_fmul, fmul, HasAVX512,
+ SchedWriteFMulSizes, 1>,
+ avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd, SchedWriteFMulSizes>;
+defm VSUB : avx512_fp_binop_p<0x5C, "vsub", any_fsub, fsub, HasAVX512,
+ SchedWriteFAddSizes>,
+ avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd, SchedWriteFAddSizes>;
+defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", any_fdiv, fdiv, HasAVX512,
+ SchedWriteFDivSizes>,
+ avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd, SchedWriteFDivSizes>;
+defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, X86fmin, HasAVX512,
+ SchedWriteFCmpSizes, 0>,
+ avx512_fp_binop_p_sae<0x5D, "vmin", X86fminSAE, SchedWriteFCmpSizes>;
+defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, X86fmax, HasAVX512,
+ SchedWriteFCmpSizes, 0>,
+ avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxSAE, SchedWriteFCmpSizes>;
+let isCodeGenOnly = 1 in {
+ defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, X86fminc, HasAVX512,
+ SchedWriteFCmpSizes, 1>;
+ defm VMAXC : avx512_fp_binop_p<0x5F, "vmax", X86fmaxc, X86fmaxc, HasAVX512,
+ SchedWriteFCmpSizes, 1>;
+}
+let Uses = []<Register>, mayRaiseFPException = 0 in {
+defm VAND : avx512_fp_binop_p<0x54, "vand", null_frag, null_frag, HasDQI,
+ SchedWriteFLogicSizes, 1>;
+defm VANDN : avx512_fp_binop_p<0x55, "vandn", null_frag, null_frag, HasDQI,
+ SchedWriteFLogicSizes, 0>;
+defm VOR : avx512_fp_binop_p<0x56, "vor", null_frag, null_frag, HasDQI,
+ SchedWriteFLogicSizes, 1>;
+defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, null_frag, HasDQI,
+ SchedWriteFLogicSizes, 1>;
+}
+
+multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
+ defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2))>,
+ EVEX_4V, Sched<[sched]>;
+ defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr#_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode _.RC:$src1, (_.LdFrag addr:$src2))>,
+ EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr#_.Suffix,
+ "${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr,
+ (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>,
+ EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
+ defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2))>,
+ Sched<[sched]>;
+ defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr#_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode _.RC:$src1, (_.ScalarIntMemFrags addr:$src2))>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr,
+ X86SchedWriteWidths sched> {
+ defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v16f32_info>,
+ avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v16f32_info>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v8f64_info>,
+ avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v8f64_info>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+ defm SSZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f32x_info>,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr#"ss", f32x_info,
+ X86scalefsRnd, sched.Scl>,
+ EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ defm SDZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f64x_info>,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr#"sd", f64x_info,
+ X86scalefsRnd, sched.Scl>,
+ EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>, VEX_W;
+
+  // Define only if the AVX512VL feature is present.
+ let Predicates = [HasVLX] in {
+ defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v4f32x_info>,
+ EVEX_V128, EVEX_CD8<32, CD8VF>;
+ defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v8f32x_info>,
+ EVEX_V256, EVEX_CD8<32, CD8VF>;
+ defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v2f64x_info>,
+ EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
+ defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v4f64x_info>,
+ EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
+ }
+}
+defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef",
+ SchedWriteFAdd>, T8PD, NotEVEX2VEXConvertible;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 VPTESTM instructions
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_vptest<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ string Name> {
+ // NOTE: Patterns are omitted in favor of manual selection in X86ISelDAGToDAG.
+ // There are just too many permutations due to commutability and bitcasts.
+ let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
+ defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (null_frag), (null_frag), 1>,
+ EVEX_4V, Sched<[sched]>;
+ let mayLoad = 1 in
+ defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (null_frag), (null_frag)>,
+ EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain, mayLoad = 1, hasSideEffects = 0 in
+ defm rmb : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr,
+ (null_frag), (null_frag)>,
+ EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_vptest<opc, OpcodeStr, sched.ZMM, _.info512, NAME>,
+ avx512_vptest_mb<opc, OpcodeStr, sched.ZMM, _.info512>, EVEX_V512;
+
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z256 : avx512_vptest<opc, OpcodeStr, sched.YMM, _.info256, NAME>,
+ avx512_vptest_mb<opc, OpcodeStr, sched.YMM, _.info256>, EVEX_V256;
+ defm Z128 : avx512_vptest<opc, OpcodeStr, sched.XMM, _.info128, NAME>,
+ avx512_vptest_mb<opc, OpcodeStr, sched.XMM, _.info128>, EVEX_V128;
+ }
+}
+
+multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched> {
+ defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", sched,
+ avx512vl_i32_info>;
+ defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", sched,
+ avx512vl_i64_info>, VEX_W;
+}
+
+multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasBWI] in {
+ defm WZ: avx512_vptest<opc, OpcodeStr#"w", sched.ZMM,
+ v32i16_info, NAME#"W">, EVEX_V512, VEX_W;
+ defm BZ: avx512_vptest<opc, OpcodeStr#"b", sched.ZMM,
+ v64i8_info, NAME#"B">, EVEX_V512;
+ }
+ let Predicates = [HasVLX, HasBWI] in {
+
+ defm WZ256: avx512_vptest<opc, OpcodeStr#"w", sched.YMM,
+ v16i16x_info, NAME#"W">, EVEX_V256, VEX_W;
+ defm WZ128: avx512_vptest<opc, OpcodeStr#"w", sched.XMM,
+ v8i16x_info, NAME#"W">, EVEX_V128, VEX_W;
+ defm BZ256: avx512_vptest<opc, OpcodeStr#"b", sched.YMM,
+ v32i8x_info, NAME#"B">, EVEX_V256;
+ defm BZ128: avx512_vptest<opc, OpcodeStr#"b", sched.XMM,
+ v16i8x_info, NAME#"B">, EVEX_V128;
+ }
+}
+
+multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr,
+ X86SchedWriteWidths sched> :
+ avx512_vptest_wb<opc_wb, OpcodeStr, sched>,
+ avx512_vptest_dq<opc_dq, OpcodeStr, sched>;
+
+defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm",
+ SchedWriteVecLogic>, T8PD;
+defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm",
+ SchedWriteVecLogic>, T8XS;
+
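A brief illustration (not part of the patch) of the semantics VPTESTM/VPTESTNM implement, which the NOTE in avx512_vptest leaves to manual selection in X86ISelDAGToDAG. Function names are hypothetical; the intrinsics are standard AVX-512F.

    #include <immintrin.h>

    // vptestmd: k[i] is set when element i of (a & b) is non-zero.
    __mmask16 nonzero_lanes(__m512i a, __m512i b) {
      return _mm512_test_epi32_mask(a, b);
    }

    // vptestnmd: k[i] is set when element i of (a & b) is zero.
    __mmask16 zero_lanes(__m512i a, __m512i b) {
      return _mm512_testn_epi32_mask(a, b);
    }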
+//===----------------------------------------------------------------------===//
+// AVX-512 Shift instructions
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
+ string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
+ defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, u8imm:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, (i8 timm:$src2)))>,
+ Sched<[sched]>;
+ defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src1, u8imm:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode (_.VT (_.LdFrag addr:$src1)),
+ (i8 timm:$src2)))>,
+ Sched<[sched.Folded]>;
+ }
+}
+
+multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
+ string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in
+ defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr,
+ "$src2, ${src1}"#_.BroadcastStr, "${src1}"#_.BroadcastStr#", $src2",
+ (_.VT (OpNode (_.BroadcastLdFrag addr:$src1), (i8 timm:$src2)))>,
+ EVEX_B, Sched<[sched.Folded]>;
+}
+
+multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, ValueType SrcVT,
+ X86VectorVTInfo _> {
+ // src2 is always 128-bit
+ let ExeDomain = _.ExeDomain in {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, VR128X:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2)))>,
+ AVX512BIBase, EVEX_4V, Sched<[sched]>;
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, i128mem:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, (SrcVT (load addr:$src2))))>,
+ AVX512BIBase,
+ EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched, ValueType SrcVT,
+ AVX512VLVectorVTInfo VTInfo,
+ Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_shift_rrm<opc, OpcodeStr, OpNode, sched.ZMM, SrcVT,
+ VTInfo.info512>, EVEX_V512,
+ EVEX_CD8<VTInfo.info512.EltSize, CD8VQ> ;
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_shift_rrm<opc, OpcodeStr, OpNode, sched.YMM, SrcVT,
+ VTInfo.info256>, EVEX_V256,
+ EVEX_CD8<VTInfo.info256.EltSize, CD8VH>;
+ defm Z128 : avx512_shift_rrm<opc, OpcodeStr, OpNode, sched.XMM, SrcVT,
+ VTInfo.info128>, EVEX_V128,
+ EVEX_CD8<VTInfo.info128.EltSize, CD8VF>;
+ }
+}
+
+multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, bits<8> opcw,
+ string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched,
+ bit NotEVEX2VEXConvertibleQ = 0> {
+ defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, sched, v4i32,
+ avx512vl_i32_info, HasAVX512>;
+ let notEVEX2VEXConvertible = NotEVEX2VEXConvertibleQ in
+ defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, sched, v2i64,
+ avx512vl_i64_info, HasAVX512>, VEX_W;
+ defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, sched, v8i16,
+ avx512vl_i16_info, HasBWI>;
+}
+
+multiclass avx512_shift_rmi_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM,
+ string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo VTInfo> {
+ let Predicates = [HasAVX512] in
+ defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ sched.ZMM, VTInfo.info512>,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, sched.ZMM,
+ VTInfo.info512>, EVEX_V512;
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ sched.YMM, VTInfo.info256>,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, sched.YMM,
+ VTInfo.info256>, EVEX_V256;
+ defm Z128: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ sched.XMM, VTInfo.info128>,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, sched.XMM,
+ VTInfo.info128>, EVEX_V128;
+ }
+}
+
+multiclass avx512_shift_rmi_w<bits<8> opcw, Format ImmFormR, Format ImmFormM,
+ string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasBWI] in
+ defm WZ: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ sched.ZMM, v32i16_info>, EVEX_V512, VEX_WIG;
+ let Predicates = [HasVLX, HasBWI] in {
+ defm WZ256: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ sched.YMM, v16i16x_info>, EVEX_V256, VEX_WIG;
+ defm WZ128: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ sched.XMM, v8i16x_info>, EVEX_V128, VEX_WIG;
+ }
+}
+
+multiclass avx512_shift_rmi_dq<bits<8> opcd, bits<8> opcq,
+ Format ImmFormR, Format ImmFormM,
+ string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched,
+ bit NotEVEX2VEXConvertibleQ = 0> {
+ defm D: avx512_shift_rmi_sizes<opcd, ImmFormR, ImmFormM, OpcodeStr#"d", OpNode,
+ sched, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+ let notEVEX2VEXConvertible = NotEVEX2VEXConvertibleQ in
+ defm Q: avx512_shift_rmi_sizes<opcq, ImmFormR, ImmFormM, OpcodeStr#"q", OpNode,
+ sched, avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W;
+}
+
+defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli,
+ SchedWriteVecShiftImm>,
+ avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli,
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
+
+defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli,
+ SchedWriteVecShiftImm>,
+ avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli,
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
+
+defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai,
+ SchedWriteVecShiftImm, 1>,
+ avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai,
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
+
+defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", X86vrotri,
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
+defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli,
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
+
+defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl,
+ SchedWriteVecShift>;
+defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra,
+ SchedWriteVecShift, 1>;
+defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl,
+ SchedWriteVecShift>;
+
+// Use the 512-bit VPSRA/VPSRAI versions to implement v2i64/v4i64 when VLX is
+// not available (NoVLX).
+let Predicates = [HasAVX512, NoVLX] in {
+ def : Pat<(v4i64 (X86vsra (v4i64 VR256X:$src1), (v2i64 VR128X:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPSRAQZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ VR128X:$src2)), sub_ymm)>;
+
+ def : Pat<(v2i64 (X86vsra (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPSRAQZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+ VR128X:$src2)), sub_xmm)>;
+
+ def : Pat<(v4i64 (X86vsrai (v4i64 VR256X:$src1), (i8 timm:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPSRAQZri
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ timm:$src2)), sub_ymm)>;
+
+ def : Pat<(v2i64 (X86vsrai (v2i64 VR128X:$src1), (i8 timm:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPSRAQZri
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+ timm:$src2)), sub_xmm)>;
+}
+
+//===-------------------------------------------------------------------===//
+// Variable Bit Shifts
+//===-------------------------------------------------------------------===//
+
+multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2)))>,
+ AVX5128IBase, EVEX_4V, Sched<[sched]>;
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1,
+ (_.VT (_.LdFrag addr:$src2))))>,
+ AVX5128IBase, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in
+ defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr,
+ (_.VT (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))>,
+ AVX5128IBase, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>, EVEX_V512;
+
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, sched.YMM, _.info256>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, sched.YMM, _.info256>, EVEX_V256;
+ defm Z128 : avx512_var_shift<opc, OpcodeStr, OpNode, sched.XMM, _.info128>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, sched.XMM, _.info128>, EVEX_V128;
+ }
+}
+
+multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, X86SchedWriteWidths sched> {
+ defm D : avx512_var_shift_sizes<opc, OpcodeStr#"d", OpNode, sched,
+ avx512vl_i32_info>;
+ defm Q : avx512_var_shift_sizes<opc, OpcodeStr#"q", OpNode, sched,
+ avx512vl_i64_info>, VEX_W;
+}
+
+// Use the 512-bit version to implement the 128/256-bit forms when VLX is not
+// available (NoVLX).
+multiclass avx512_var_shift_lowering<AVX512VLVectorVTInfo _, string OpcodeStr,
+ SDNode OpNode, list<Predicate> p> {
+ let Predicates = p in {
+ def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1),
+ (_.info256.VT _.info256.RC:$src2))),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(OpcodeStr#"Zrr")
+ (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
+ (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
+ sub_ymm)>;
+
+ def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1),
+ (_.info128.VT _.info128.RC:$src2))),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(OpcodeStr#"Zrr")
+ (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
+ (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
+ sub_xmm)>;
+ }
+}
+multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, X86SchedWriteWidths sched> {
+ let Predicates = [HasBWI] in
+ defm WZ: avx512_var_shift<opc, OpcodeStr, OpNode, sched.ZMM, v32i16_info>,
+ EVEX_V512, VEX_W;
+ let Predicates = [HasVLX, HasBWI] in {
+
+ defm WZ256: avx512_var_shift<opc, OpcodeStr, OpNode, sched.YMM, v16i16x_info>,
+ EVEX_V256, VEX_W;
+ defm WZ128: avx512_var_shift<opc, OpcodeStr, OpNode, sched.XMM, v8i16x_info>,
+ EVEX_V128, VEX_W;
+ }
+}
+
+defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", X86vshlv, SchedWriteVarVecShift>,
+ avx512_var_shift_w<0x12, "vpsllvw", X86vshlv, SchedWriteVarVecShift>;
+
+defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", X86vsrav, SchedWriteVarVecShift>,
+ avx512_var_shift_w<0x11, "vpsravw", X86vsrav, SchedWriteVarVecShift>;
+
+defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", X86vsrlv, SchedWriteVarVecShift>,
+ avx512_var_shift_w<0x10, "vpsrlvw", X86vsrlv, SchedWriteVarVecShift>;
+
+defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr, SchedWriteVarVecShift>;
+defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl, SchedWriteVarVecShift>;
+
+defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", X86vsrav, [HasAVX512, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", X86vshlv, [HasBWI, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", X86vsrav, [HasBWI, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", X86vsrlv, [HasBWI, NoVLX]>;
+
+
+// Use the 512-bit VPROL/VPROLI versions to implement v2i64/v4i64 and
+// v4i32/v8i32 when VLX is not available (NoVLX).
+let Predicates = [HasAVX512, NoVLX] in {
+ def : Pat<(v2i64 (rotl (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPROLVQZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))),
+ sub_xmm)>;
+ def : Pat<(v4i64 (rotl (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPROLVQZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))),
+ sub_ymm)>;
+
+ def : Pat<(v4i32 (rotl (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))),
+ (EXTRACT_SUBREG (v16i32
+ (VPROLVDZrr
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))),
+ sub_xmm)>;
+ def : Pat<(v8i32 (rotl (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+ (EXTRACT_SUBREG (v16i32
+ (VPROLVDZrr
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))),
+ sub_ymm)>;
+
+ def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 timm:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPROLQZri
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+ timm:$src2)), sub_xmm)>;
+ def : Pat<(v4i64 (X86vrotli (v4i64 VR256X:$src1), (i8 timm:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPROLQZri
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ timm:$src2)), sub_ymm)>;
+
+ def : Pat<(v4i32 (X86vrotli (v4i32 VR128X:$src1), (i8 timm:$src2))),
+ (EXTRACT_SUBREG (v16i32
+ (VPROLDZri
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+ timm:$src2)), sub_xmm)>;
+ def : Pat<(v8i32 (X86vrotli (v8i32 VR256X:$src1), (i8 timm:$src2))),
+ (EXTRACT_SUBREG (v16i32
+ (VPROLDZri
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ timm:$src2)), sub_ymm)>;
+}
+
+// Use the 512-bit VPROR/VPRORI versions to implement v2i64/v4i64 and
+// v4i32/v8i32 when VLX is not available (NoVLX).
+let Predicates = [HasAVX512, NoVLX] in {
+ def : Pat<(v2i64 (rotr (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPRORVQZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))),
+ sub_xmm)>;
+ def : Pat<(v4i64 (rotr (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPRORVQZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))),
+ sub_ymm)>;
+
+ def : Pat<(v4i32 (rotr (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))),
+ (EXTRACT_SUBREG (v16i32
+ (VPRORVDZrr
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))),
+ sub_xmm)>;
+ def : Pat<(v8i32 (rotr (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+ (EXTRACT_SUBREG (v16i32
+ (VPRORVDZrr
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))),
+ sub_ymm)>;
+
+ def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 timm:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPRORQZri
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+ timm:$src2)), sub_xmm)>;
+ def : Pat<(v4i64 (X86vrotri (v4i64 VR256X:$src1), (i8 timm:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPRORQZri
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ timm:$src2)), sub_ymm)>;
+
+ def : Pat<(v4i32 (X86vrotri (v4i32 VR128X:$src1), (i8 timm:$src2))),
+ (EXTRACT_SUBREG (v16i32
+ (VPRORDZri
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+ timm:$src2)), sub_xmm)>;
+ def : Pat<(v8i32 (X86vrotri (v8i32 VR256X:$src1), (i8 timm:$src2))),
+ (EXTRACT_SUBREG (v16i32
+ (VPRORDZri
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ timm:$src2)), sub_ymm)>;
+}
+
+//===-------------------------------------------------------------------===//
+// 1-src variable permutation VPERMW/D/Q
+//===-------------------------------------------------------------------===//
+
+multiclass avx512_vperm_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, sched, _.info512>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, sched, _.info512>, EVEX_V512;
+
+ let Predicates = [HasAVX512, HasVLX] in
+ defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, sched, _.info256>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, sched, _.info256>, EVEX_V256;
+}
+
+multiclass avx512_vpermi_dq_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM,
+ string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, AVX512VLVectorVTInfo VTInfo> {
+ let Predicates = [HasAVX512] in
+ defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ sched, VTInfo.info512>,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
+ sched, VTInfo.info512>, EVEX_V512;
+ let Predicates = [HasAVX512, HasVLX] in
+ defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ sched, VTInfo.info256>,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
+ sched, VTInfo.info256>, EVEX_V256;
+}
+
+multiclass avx512_vperm_bw<bits<8> opc, string OpcodeStr,
+ Predicate prd, SDNode OpNode,
+ X86FoldableSchedWrite sched, AVX512VLVectorVTInfo _> {
+ let Predicates = [prd] in
+ defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, sched, _.info512>,
+ EVEX_V512 ;
+ let Predicates = [HasVLX, prd] in {
+ defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, sched, _.info256>,
+ EVEX_V256 ;
+ defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, sched, _.info128>,
+ EVEX_V128 ;
+ }
+}
+
+defm VPERMW : avx512_vperm_bw<0x8D, "vpermw", HasBWI, X86VPermv,
+ WriteVarShuffle256, avx512vl_i16_info>, VEX_W;
+defm VPERMB : avx512_vperm_bw<0x8D, "vpermb", HasVBMI, X86VPermv,
+ WriteVarShuffle256, avx512vl_i8_info>;
+
+defm VPERMD : avx512_vperm_dq_sizes<0x36, "vpermd", X86VPermv,
+ WriteVarShuffle256, avx512vl_i32_info>;
+defm VPERMQ : avx512_vperm_dq_sizes<0x36, "vpermq", X86VPermv,
+ WriteVarShuffle256, avx512vl_i64_info>, VEX_W;
+defm VPERMPS : avx512_vperm_dq_sizes<0x16, "vpermps", X86VPermv,
+ WriteFVarShuffle256, avx512vl_f32_info>;
+defm VPERMPD : avx512_vperm_dq_sizes<0x16, "vpermpd", X86VPermv,
+ WriteFVarShuffle256, avx512vl_f64_info>, VEX_W;
+
+defm VPERMQ : avx512_vpermi_dq_sizes<0x00, MRMSrcReg, MRMSrcMem, "vpermq",
+ X86VPermi, WriteShuffle256, avx512vl_i64_info>,
+ EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPERMPD : avx512_vpermi_dq_sizes<0x01, MRMSrcReg, MRMSrcMem, "vpermpd",
+ X86VPermi, WriteFShuffle256, avx512vl_f64_info>,
+ EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - VPERMIL
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ X86VectorVTInfo Ctrl> {
+ defm rr: AVX512_maskable<OpcVar, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, Ctrl.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1,
+ (Ctrl.VT Ctrl.RC:$src2)))>,
+ T8PD, EVEX_4V, Sched<[sched]>;
+ defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode
+ _.RC:$src1,
+ (Ctrl.VT (Ctrl.LdFrag addr:$src2))))>,
+ T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr,
+ (_.VT (OpNode
+ _.RC:$src1,
+ (Ctrl.VT (Ctrl.BroadcastLdFrag addr:$src2))))>,
+ T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+multiclass avx512_permil_vec_common<string OpcodeStr, bits<8> OpcVar,
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _,
+ AVX512VLVectorVTInfo Ctrl> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, sched.ZMM,
+ _.info512, Ctrl.info512>, EVEX_V512;
+ }
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z128 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, sched.XMM,
+ _.info128, Ctrl.info128>, EVEX_V128;
+ defm Z256 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, sched.YMM,
+ _.info256, Ctrl.info256>, EVEX_V256;
+ }
+}
+
+multiclass avx512_permil<string OpcodeStr, bits<8> OpcImm, bits<8> OpcVar,
+ AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{
+ defm NAME: avx512_permil_vec_common<OpcodeStr, OpcVar, SchedWriteFVarShuffle,
+ _, Ctrl>;
+ defm NAME: avx512_shift_rmi_sizes<OpcImm, MRMSrcReg, MRMSrcMem, OpcodeStr,
+ X86VPermilpi, SchedWriteFShuffle, _>,
+ EVEX, AVX512AIi8Base, EVEX_CD8<_.info128.EltSize, CD8VF>;
+}
+
+let ExeDomain = SSEPackedSingle in
+defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info,
+ avx512vl_i32_info>;
+let ExeDomain = SSEPackedDouble in
+defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info,
+ avx512vl_i64_info>, VEX_W1X;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - VPSHUFD, VPSHUFLW, VPSHUFHW
+//===----------------------------------------------------------------------===//
+
+defm VPSHUFD : avx512_shift_rmi_sizes<0x70, MRMSrcReg, MRMSrcMem, "vpshufd",
+ X86PShufd, SchedWriteShuffle, avx512vl_i32_info>,
+ EVEX, AVX512BIi8Base, EVEX_CD8<32, CD8VF>;
+defm VPSHUFH : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshufhw",
+ X86PShufhw, SchedWriteShuffle>,
+ EVEX, AVX512XSIi8Base;
+defm VPSHUFL : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshuflw",
+ X86PShuflw, SchedWriteShuffle>,
+ EVEX, AVX512XDIi8Base;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - VPSHUFB
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasBWI] in
+ defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, sched.ZMM, v64i8_info>,
+ EVEX_V512;
+
+ let Predicates = [HasVLX, HasBWI] in {
+ defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, sched.YMM, v32i8x_info>,
+ EVEX_V256;
+ defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, sched.XMM, v16i8x_info>,
+ EVEX_V128;
+ }
+}
+
+defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb,
+ SchedWriteVarShuffle>, VEX_WIG;
+
+//===----------------------------------------------------------------------===//
+// Move Low to High and High to Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2),
+ "vmovlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))]>,
+ Sched<[SchedWriteFShuffle.XMM]>, EVEX_4V;
+let isCommutable = 1 in
+def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2),
+ "vmovhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))]>,
+ Sched<[SchedWriteFShuffle.XMM]>, EVEX_4V, NotMemoryFoldable;
+
+//===----------------------------------------------------------------------===//
+// VMOVHPS/PD VMOVLPS Instructions
+// All patterns were taken from the SSE implementation.
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode,
+ X86VectorVTInfo _> {
+ let hasSideEffects = 0, mayLoad = 1, ExeDomain = _.ExeDomain in
+ def rm : AVX512<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.RC:$src1, f64mem:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.RC:$dst,
+ (OpNode _.RC:$src1,
+ (_.VT (bitconvert
+ (v2f64 (scalar_to_vector (loadf64 addr:$src2)))))))]>,
+ Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>, EVEX_4V;
+}
+
+// No patterns for MOVLPS/MOVHPS, as the Movlhps node should only be created
+// for SSE1; the MOVLPS pattern is even more complex.
+defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", null_frag,
+ v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS;
+defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Unpckl,
+ v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W;
+defm VMOVLPSZ128 : avx512_mov_hilo_packed<0x12, "vmovlps", null_frag,
+ v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS;
+defm VMOVLPDZ128 : avx512_mov_hilo_packed<0x12, "vmovlpd", X86Movsd,
+ v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W;
+
+let Predicates = [HasAVX512] in {
+ // VMOVHPD patterns
+ def : Pat<(v2f64 (X86Unpckl VR128X:$src1, (X86vzload64 addr:$src2))),
+ (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
+
+ // VMOVLPD patterns
+ def : Pat<(v2f64 (X86Movsd VR128X:$src1, (X86vzload64 addr:$src2))),
+ (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
+}
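An illustrative note (not part of the patch): the rm forms defined above correspond to the _mm_loadh_pd/_mm_loadl_pd intrinsics, which replace one half of a double-precision vector with a 64-bit value loaded from memory. Function names are hypothetical.

    #include <immintrin.h>

    // vmovhpd load form: keep v[0], load the high element from memory.
    __m128d replace_high(__m128d v, const double *p) {
      return _mm_loadh_pd(v, p);
    }

    // vmovlpd load form: keep v[1], load the low element from memory.
    __m128d replace_low(__m128d v, const double *p) {
      return _mm_loadl_pd(v, p);
    }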
+
+let SchedRW = [WriteFStore] in {
+let mayStore = 1, hasSideEffects = 0 in
+def VMOVHPSZ128mr : AVX512PSI<0x17, MRMDestMem, (outs),
+ (ins f64mem:$dst, VR128X:$src),
+ "vmovhps\t{$src, $dst|$dst, $src}",
+ []>, EVEX, EVEX_CD8<32, CD8VT2>;
+def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs),
+ (ins f64mem:$dst, VR128X:$src),
+ "vmovhpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt
+ (v2f64 (X86Unpckh VR128X:$src, VR128X:$src)),
+ (iPTR 0))), addr:$dst)]>,
+ EVEX, EVEX_CD8<64, CD8VT1>, VEX_W;
+let mayStore = 1, hasSideEffects = 0 in
+def VMOVLPSZ128mr : AVX512PSI<0x13, MRMDestMem, (outs),
+ (ins f64mem:$dst, VR128X:$src),
+ "vmovlps\t{$src, $dst|$dst, $src}",
+ []>, EVEX, EVEX_CD8<32, CD8VT2>;
+def VMOVLPDZ128mr : AVX512PDI<0x13, MRMDestMem, (outs),
+ (ins f64mem:$dst, VR128X:$src),
+ "vmovlpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt (v2f64 VR128X:$src),
+ (iPTR 0))), addr:$dst)]>,
+ EVEX, EVEX_CD8<64, CD8VT1>, VEX_W;
+} // SchedRW
+
+let Predicates = [HasAVX512] in {
+ // VMOVHPD patterns
+ def : Pat<(store (f64 (extractelt
+ (v2f64 (X86VPermilpi VR128X:$src, (i8 1))),
+ (iPTR 0))), addr:$dst),
+ (VMOVHPDZ128mr addr:$dst, VR128X:$src)>;
+}
+//===----------------------------------------------------------------------===//
+// FMA - Fused Multiply Operations
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode, X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, string Suff> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
+ Uses = [MXCSR], mayRaiseFPException = 1 in {
+ defm r: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)),
+ (_.VT (MaskOpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>,
+ AVX512FMA3Base, Sched<[sched]>;
+
+ defm m: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))),
+ (_.VT (MaskOpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>,
+ AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>;
+
+ defm mb: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
+ !strconcat("$src2, ${src3}", _.BroadcastStr ),
+ (OpNode _.RC:$src2,
+ _.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))),
+ (MaskOpNode _.RC:$src2,
+ _.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))), 1, 0>,
+ AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, string Suff> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
+ Uses = [MXCSR] in
+ defm rb: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
+ OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
+ (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))),
+ (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))), 1, 1>,
+ AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
+}
+
+multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _, string Suff> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.ZMM, _.info512, Suff>,
+ avx512_fma3_213_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM,
+ _.info512, Suff>,
+ EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+ }
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.YMM, _.info256, Suff>,
+ EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
+ defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.XMM, _.info128, Suff>,
+ EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
+ }
+}
+
+multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode, SDNode OpNodeRnd> {
+ defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f32_info, "PS">;
+ defm PD : avx512_fma3p_213_common<opc, OpcodeStr#"pd", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f64_info, "PD">, VEX_W;
+}
+
+defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", any_fma,
+ fma, X86FmaddRnd>;
+defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86any_Fmsub,
+ X86Fmsub, X86FmsubRnd>;
+defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub,
+ X86Fmaddsub, X86FmaddsubRnd>;
+defm VFMSUBADD213 : avx512_fma3p_213_f<0xA7, "vfmsubadd213", X86Fmsubadd,
+ X86Fmsubadd, X86FmsubaddRnd>;
+defm VFNMADD213 : avx512_fma3p_213_f<0xAC, "vfnmadd213", X86any_Fnmadd,
+ X86Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86any_Fnmsub,
+ X86Fnmsub, X86FnmsubRnd>;
+
+
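+// In the 231 and 132 register-register forms the unmasked pattern is
+// null_frag: an unmasked all-register FMA is already covered by the 213 form,
+// while the masked forms stay distinct because the tied $src1 operand (the
+// passthru) plays a different role in each variant.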
+multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode, X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, string Suff> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
+ Uses = [MXCSR], mayRaiseFPException = 1 in {
+ defm r: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (null_frag),
+ (_.VT (MaskOpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
+ AVX512FMA3Base, Sched<[sched]>;
+
+ defm m: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)),
+ (_.VT (MaskOpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>,
+ AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>;
+
+ defm mb: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr, "${src3}"#_.BroadcastStr#", $src2",
+ "$src2, ${src3}"#_.BroadcastStr,
+ (_.VT (OpNode _.RC:$src2,
+ (_.VT (_.BroadcastLdFrag addr:$src3)),
+ _.RC:$src1)),
+ (_.VT (MaskOpNode _.RC:$src2,
+ (_.VT (_.BroadcastLdFrag addr:$src3)),
+ _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, string Suff> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
+ Uses = [MXCSR] in
+ defm rb: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
+ OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
+ (null_frag),
+ (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 timm:$rc))),
+ 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
+}
+
+multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _, string Suff> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.ZMM, _.info512, Suff>,
+ avx512_fma3_231_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM,
+ _.info512, Suff>,
+ EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+ }
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.YMM, _.info256, Suff>,
+ EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
+ defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.XMM, _.info128, Suff>,
+ EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
+ }
+}
+
+multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode, SDNode OpNodeRnd > {
+ defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f32_info, "PS">;
+ defm PD : avx512_fma3p_231_common<opc, OpcodeStr#"pd", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f64_info, "PD">, VEX_W;
+}
+
+defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", any_fma,
+ fma, X86FmaddRnd>;
+defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86any_Fmsub,
+ X86Fmsub, X86FmsubRnd>;
+defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub,
+ X86Fmaddsub, X86FmaddsubRnd>;
+defm VFMSUBADD231 : avx512_fma3p_231_f<0xB7, "vfmsubadd231", X86Fmsubadd,
+ X86Fmsubadd, X86FmsubaddRnd>;
+defm VFNMADD231 : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86any_Fnmadd,
+ X86Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86any_Fnmsub,
+ X86Fnmsub, X86FnmsubRnd>;
+
+multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode, X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, string Suff> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
+ Uses = [MXCSR], mayRaiseFPException = 1 in {
+ defm r: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (null_frag),
+ (_.VT (MaskOpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1>,
+ AVX512FMA3Base, Sched<[sched]>;
+
+  // The pattern is in 312 order so that the load is in a different place from
+  // the 213 and 231 patterns; this helps tablegen's duplicate pattern
+  // detection.
+ defm m: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)),
+ (_.VT (MaskOpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)), 1, 0>,
+ AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>;
+
+  // The pattern is in 312 order so that the load is in a different place from
+  // the 213 and 231 patterns; this helps tablegen's duplicate pattern
+  // detection.
+ defm mb: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr, "${src3}"#_.BroadcastStr#", $src2",
+ "$src2, ${src3}"#_.BroadcastStr,
+ (_.VT (OpNode (_.VT (_.BroadcastLdFrag addr:$src3)),
+ _.RC:$src1, _.RC:$src2)),
+ (_.VT (MaskOpNode (_.VT (_.BroadcastLdFrag addr:$src3)),
+ _.RC:$src1, _.RC:$src2)), 1, 0>,
+ AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, string Suff> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
+ Uses = [MXCSR] in
+ defm rb: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
+ OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
+ (null_frag),
+ (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 timm:$rc))),
+ 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
+}
+
+multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _, string Suff> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.ZMM, _.info512, Suff>,
+ avx512_fma3_132_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM,
+ _.info512, Suff>,
+ EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+ }
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.YMM, _.info256, Suff>,
+ EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
+ defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.XMM, _.info128, Suff>,
+ EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
+ }
+}
+
+multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode, SDNode OpNodeRnd > {
+ defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f32_info, "PS">;
+ defm PD : avx512_fma3p_132_common<opc, OpcodeStr#"pd", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f64_info, "PD">, VEX_W;
+}
+
+defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", any_fma,
+ fma, X86FmaddRnd>;
+defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86any_Fmsub,
+ X86Fmsub, X86FmsubRnd>;
+defm VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub,
+ X86Fmaddsub, X86FmaddsubRnd>;
+defm VFMSUBADD132 : avx512_fma3p_132_f<0x97, "vfmsubadd132", X86Fmsubadd,
+ X86Fmsubadd, X86FmsubaddRnd>;
+defm VFNMADD132 : avx512_fma3p_132_f<0x9C, "vfnmadd132", X86any_Fnmadd,
+ X86Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86any_Fnmsub,
+ X86Fnmsub, X86FnmsubRnd>;
+
+// Scalar FMA
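+// The _Int forms below operate on the full 128-bit register and take their
+// upper elements from $src1, matching the intrinsic semantics, while the
+// isCodeGenOnly FRC forms implement plain scalar FMA for codegen.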
+multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ dag RHS_r, dag RHS_m, dag RHS_b, bit MaskOnlyReg> {
+let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
+ defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3), OpcodeStr,
+ "$src3, $src2", "$src2, $src3", (null_frag), 1, 1>,
+ AVX512FMA3Base, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC;
+
+ let mayLoad = 1 in
+ defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.IntScalarMemOp:$src3), OpcodeStr,
+ "$src3, $src2", "$src2, $src3", (null_frag), 1, 1>,
+ AVX512FMA3Base, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>, SIMD_EXC;
+
+ let Uses = [MXCSR] in
+ defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
+ OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (null_frag), 1, 1>,
+ AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[SchedWriteFMA.Scl]>;
+
+ let isCodeGenOnly = 1, isCommutable = 1 in {
+ def r : AVX512FMA3S<opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !if(MaskOnlyReg, [], [RHS_r])>, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC;
+ def m : AVX512FMA3S<opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2, _.ScalarMemOp:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [RHS_m]>, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>, SIMD_EXC;
+
+ let Uses = [MXCSR] in
+ def rb : AVX512FMA3S<opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3, AVX512RC:$rc),
+ !strconcat(OpcodeStr,
+ "\t{$rc, $src3, $src2, $dst|$dst, $src2, $src3, $rc}"),
+ !if(MaskOnlyReg, [], [RHS_b])>, EVEX_B, EVEX_RC,
+ Sched<[SchedWriteFMA.Scl]>;
+ }// isCodeGenOnly = 1
+}// Constraints = "$src1 = $dst"
+}
+
+multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
+ string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd,
+ X86VectorVTInfo _, string SUFF> {
+ let ExeDomain = _.ExeDomain in {
+ defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix, _,
+                // Operands for the intrinsic are in 123 order to preserve
+                // passthru semantics.
+ (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
+ _.FRC:$src3))),
+ (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
+ (_.ScalarLdFrag addr:$src3)))),
+ (set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src2, _.FRC:$src1,
+ _.FRC:$src3, (i32 timm:$rc)))), 0>;
+
+ defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix, _,
+ (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3,
+ _.FRC:$src1))),
+ (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2,
+ (_.ScalarLdFrag addr:$src3), _.FRC:$src1))),
+ (set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src2, _.FRC:$src3,
+ _.FRC:$src1, (i32 timm:$rc)))), 1>;
+
+    // One pattern is in 312 order so that the load is in a different place
+    // from the 213 and 231 patterns; this helps tablegen's duplicate pattern
+    // detection.
+ defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix, _,
+ (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3,
+ _.FRC:$src2))),
+ (set _.FRC:$dst, (_.EltVT (OpNode (_.ScalarLdFrag addr:$src3),
+ _.FRC:$src1, _.FRC:$src2))),
+ (set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src1, _.FRC:$src3,
+ _.FRC:$src2, (i32 timm:$rc)))), 1>;
+ }
+}
+
+multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
+ string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasAVX512] in {
+ defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
+ OpNodeRnd, f32x_info, "SS">,
+ EVEX_CD8<32, CD8VT1>, VEX_LIG;
+ defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
+ OpNodeRnd, f64x_info, "SD">,
+ EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
+ }
+}
+
+defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", any_fma, X86FmaddRnd>;
+defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86any_Fmsub, X86FmsubRnd>;
+defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86any_Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86any_Fnmsub, X86FnmsubRnd>;
+
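+// Map scalar FMA dags that are wrapped in an X86Movss/X86Movsd (optionally
+// masked and/or carrying an explicit rounding mode) onto the _Int instruction
+// forms defined above, so the upper elements of $src1 are passed through.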
+multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode MaskedOp,
+ SDNode RndOp, string Prefix,
+ string Suffix, SDNode Move,
+ X86VectorVTInfo _, PatLeaf ZeroFP> {
+ let Predicates = [HasAVX512] in {
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (Op _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src3))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zr_Int")
+ VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (Op _.FRC:$src2, _.FRC:$src3,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zr_Int")
+ VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (Op _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src3)))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zm_Int")
+ VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src3), _.FRC:$src2))))),
+ (!cast<I>(Prefix#"132"#Suffix#"Zm_Int")
+ VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (Op _.FRC:$src2, (_.ScalarLdFrag addr:$src3),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zm_Int")
+ VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src3),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zr_Intk")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src3)),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zm_Intk")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src3), _.FRC:$src2),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"132"#Suffix#"Zm_Intk")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2, _.FRC:$src3,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zr_Intk")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2, (_.ScalarLdFrag addr:$src3),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zm_Intk")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src3),
+ (_.EltVT ZeroFP)))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zr_Intkz")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2, _.FRC:$src3,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
+ (_.EltVT ZeroFP)))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zr_Intkz")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src3)),
+ (_.EltVT ZeroFP)))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zm_Intkz")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src2, (_.ScalarLdFrag addr:$src3)),
+ (_.EltVT ZeroFP)))))),
+ (!cast<I>(Prefix#"132"#Suffix#"Zm_Intkz")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2, (_.ScalarLdFrag addr:$src3),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
+ (_.EltVT ZeroFP)))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zm_Intkz")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
+
+ // Patterns with rounding mode.
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (RndOp _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src3, (i32 timm:$rc)))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zrb_Int")
+ VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (RndOp _.FRC:$src2, _.FRC:$src3,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (i32 timm:$rc)))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zrb_Int")
+ VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects_mask VK1WM:$mask,
+ (RndOp _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src3, (i32 timm:$rc)),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zrb_Intk")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects_mask VK1WM:$mask,
+ (RndOp _.FRC:$src2, _.FRC:$src3,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (i32 timm:$rc)),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zrb_Intk")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects_mask VK1WM:$mask,
+ (RndOp _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src3, (i32 timm:$rc)),
+ (_.EltVT ZeroFP)))))),
+ (!cast<I>(Prefix#"213"#Suffix#"Zrb_Intkz")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
+
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
+ (X86selects_mask VK1WM:$mask,
+ (RndOp _.FRC:$src2, _.FRC:$src3,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (i32 timm:$rc)),
+ (_.EltVT ZeroFP)))))),
+ (!cast<I>(Prefix#"231"#Suffix#"Zrb_Intkz")
+ VR128X:$src1, VK1WM:$mask,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
+ }
+}
+
+defm : avx512_scalar_fma_patterns<any_fma, fma, X86FmaddRnd, "VFMADD",
+ "SS", X86Movss, v4f32x_info, fp32imm0>;
+defm : avx512_scalar_fma_patterns<X86any_Fmsub, X86Fmsub, X86FmsubRnd, "VFMSUB",
+ "SS", X86Movss, v4f32x_info, fp32imm0>;
+defm : avx512_scalar_fma_patterns<X86any_Fnmadd, X86Fnmadd, X86FnmaddRnd, "VFNMADD",
+ "SS", X86Movss, v4f32x_info, fp32imm0>;
+defm : avx512_scalar_fma_patterns<X86any_Fnmsub, X86Fnmsub, X86FnmsubRnd, "VFNMSUB",
+ "SS", X86Movss, v4f32x_info, fp32imm0>;
+
+defm : avx512_scalar_fma_patterns<any_fma, fma, X86FmaddRnd, "VFMADD",
+ "SD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : avx512_scalar_fma_patterns<X86any_Fmsub, X86Fmsub, X86FmsubRnd, "VFMSUB",
+ "SD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : avx512_scalar_fma_patterns<X86any_Fnmadd, X86Fnmadd, X86FnmaddRnd, "VFNMADD",
+ "SD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : avx512_scalar_fma_patterns<X86any_Fnmsub, X86Fnmsub, X86FnmsubRnd, "VFNMSUB",
+ "SD", X86Movsd, v2f64x_info, fp64imm0>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Packed Multiply of Unsigned 52-bit Integers and Add the Low 52 Bits (IFMA)
+//===----------------------------------------------------------------------===//
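+// VPMADD52LUQ/VPMADD52HUQ multiply the unsigned low 52 bits of each quadword
+// element of src2 and src3 and add the low (LUQ) or high (HUQ) 52 bits of the
+// 104-bit product to the 64-bit accumulator in src1.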
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+  // NOTE: The SDNode has the multiply operands first and the add operand last.
+ // This enables commuted load patterns to be autogenerated by tablegen.
+ let ExeDomain = _.ExeDomain in {
+ defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
+ AVX512FMA3Base, Sched<[sched]>;
+
+ defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>,
+ AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>;
+
+ defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
+ !strconcat("$src2, ${src3}", _.BroadcastStr ),
+ (OpNode _.RC:$src2,
+ (_.VT (_.BroadcastLdFrag addr:$src3)),
+ _.RC:$src1)>,
+ AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+} // Constraints = "$src1 = $dst"
+
+multiclass avx512_pmadd52_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> {
+ let Predicates = [HasIFMA] in {
+ defm Z : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>,
+ EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+ }
+ let Predicates = [HasVLX, HasIFMA] in {
+ defm Z256 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, sched.YMM, _.info256>,
+ EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
+ defm Z128 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, sched.XMM, _.info128>,
+ EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
+ }
+}
+
+defm VPMADD52LUQ : avx512_pmadd52_common<0xb4, "vpmadd52luq", x86vpmadd52l,
+ SchedWriteVecIMul, avx512vl_i64_info>,
+ VEX_W;
+defm VPMADD52HUQ : avx512_pmadd52_common<0xb5, "vpmadd52huq", x86vpmadd52h,
+ SchedWriteVecIMul, avx512vl_i64_info>,
+ VEX_W;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Scalar convert from signed/unsigned integer to float/double
+//===----------------------------------------------------------------------===//
+
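+// Each conversion is defined twice: a codegen-only form on the scalar FP
+// register class, and an _Int form on the full XMM register whose upper
+// elements are taken from $src1, as the intrinsics require.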
+multiclass avx512_vcvtsi<bits<8> opc, SDPatternOperator OpNode, X86FoldableSchedWrite sched,
+ RegisterClass SrcRC, X86VectorVTInfo DstVT,
+ X86MemOperand x86memop, PatFrag ld_frag, string asm,
+ string mem, list<Register> _Uses = [MXCSR],
+ bit _mayRaiseFPException = 1> {
+let ExeDomain = DstVT.ExeDomain, Uses = _Uses,
+ mayRaiseFPException = _mayRaiseFPException in {
+ let hasSideEffects = 0, isCodeGenOnly = 1 in {
+ def rr : SI<opc, MRMSrcReg, (outs DstVT.FRC:$dst),
+ (ins DstVT.FRC:$src1, SrcRC:$src),
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ EVEX_4V, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
+ let mayLoad = 1 in
+ def rm : SI<opc, MRMSrcMem, (outs DstVT.FRC:$dst),
+ (ins DstVT.FRC:$src1, x86memop:$src),
+ asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
+ EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ } // hasSideEffects = 0
+ def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
+ (ins DstVT.RC:$src1, SrcRC:$src2),
+ !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set DstVT.RC:$dst,
+ (OpNode (DstVT.VT DstVT.RC:$src1), SrcRC:$src2))]>,
+ EVEX_4V, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
+
+ def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst),
+ (ins DstVT.RC:$src1, x86memop:$src2),
+ asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set DstVT.RC:$dst,
+ (OpNode (DstVT.VT DstVT.RC:$src1),
+ (ld_frag addr:$src2)))]>,
+ EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+ def : InstAlias<"v"#asm#mem#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (!cast<Instruction>(NAME#"rr_Int") DstVT.RC:$dst,
+ DstVT.RC:$src1, SrcRC:$src2), 0, "att">;
+}
+
+multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode,
+ X86FoldableSchedWrite sched, RegisterClass SrcRC,
+ X86VectorVTInfo DstVT, string asm,
+ string mem> {
+ let ExeDomain = DstVT.ExeDomain, Uses = [MXCSR] in
+ def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
+ (ins DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc),
+ !strconcat(asm,
+ "\t{$src2, $rc, $src1, $dst|$dst, $src1, $rc, $src2}"),
+ [(set DstVT.RC:$dst,
+ (OpNode (DstVT.VT DstVT.RC:$src1),
+ SrcRC:$src2,
+ (i32 timm:$rc)))]>,
+ EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
+ def : InstAlias<"v"#asm#mem#"\t{$src2, $rc, $src1, $dst|$dst, $src1, $rc, $src2}",
+ (!cast<Instruction>(NAME#"rrb_Int") DstVT.RC:$dst,
+ DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc), 0, "att">;
+}
+
+multiclass avx512_vcvtsi_common<bits<8> opc, SDNode OpNode, SDNode OpNodeRnd,
+ X86FoldableSchedWrite sched,
+ RegisterClass SrcRC, X86VectorVTInfo DstVT,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ string asm, string mem> {
+ defm NAME : avx512_vcvtsi_round<opc, OpNodeRnd, sched, SrcRC, DstVT, asm, mem>,
+ avx512_vcvtsi<opc, OpNode, sched, SrcRC, DstVT, x86memop,
+ ld_frag, asm, mem>, VEX_LIG;
+}
+
+let Predicates = [HasAVX512] in {
+defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd,
+ WriteCvtI2SS, GR32,
+ v4f32x_info, i32mem, loadi32, "cvtsi2ss", "l">,
+ XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd,
+ WriteCvtI2SS, GR64,
+ v4f32x_info, i64mem, loadi64, "cvtsi2ss", "q">,
+ XS, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VCVTSI2SDZ : avx512_vcvtsi<0x2A, null_frag, WriteCvtI2SD, GR32,
+ v2f64x_info, i32mem, loadi32, "cvtsi2sd", "l", [], 0>,
+ XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd,
+ WriteCvtI2SD, GR64,
+ v2f64x_info, i64mem, loadi64, "cvtsi2sd", "q">,
+ XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTSI2SSZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">;
+def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTSI2SDZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">;
+
+def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))),
+ (VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))),
+ (VCVTSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))),
+ (VCVTSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))),
+ (VCVTSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+
+def : Pat<(f32 (any_sint_to_fp GR32:$src)),
+ (VCVTSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f32 (any_sint_to_fp GR64:$src)),
+ (VCVTSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+def : Pat<(f64 (any_sint_to_fp GR32:$src)),
+ (VCVTSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f64 (any_sint_to_fp GR64:$src)),
+ (VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+
+defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd,
+ WriteCvtI2SS, GR32,
+ v4f32x_info, i32mem, loadi32,
+ "cvtusi2ss", "l">, XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd,
+ WriteCvtI2SS, GR64,
+ v4f32x_info, i64mem, loadi64, "cvtusi2ss", "q">,
+ XS, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, null_frag, WriteCvtI2SD, GR32, v2f64x_info,
+ i32mem, loadi32, "cvtusi2sd", "l", [], 0>,
+ XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd,
+ WriteCvtI2SD, GR64,
+ v2f64x_info, i64mem, loadi64, "cvtusi2sd", "q">,
+ XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+def : InstAlias<"vcvtusi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTUSI2SSZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">;
+def : InstAlias<"vcvtusi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTUSI2SDZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">;
+
+def : Pat<(f32 (any_uint_to_fp (loadi32 addr:$src))),
+ (VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f32 (any_uint_to_fp (loadi64 addr:$src))),
+ (VCVTUSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (any_uint_to_fp (loadi32 addr:$src))),
+ (VCVTUSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (any_uint_to_fp (loadi64 addr:$src))),
+ (VCVTUSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+
+def : Pat<(f32 (any_uint_to_fp GR32:$src)),
+ (VCVTUSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f32 (any_uint_to_fp GR64:$src)),
+ (VCVTUSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+def : Pat<(f64 (any_uint_to_fp GR32:$src)),
+ (VCVTUSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f64 (any_uint_to_fp GR64:$src)),
+ (VCVTUSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Scalar convert from float/double to integer
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT,
+ X86VectorVTInfo DstVT, SDNode OpNode,
+ SDNode OpNodeRnd,
+ X86FoldableSchedWrite sched, string asm,
+ string aliasStr> {
+ let Predicates = [HasAVX512], ExeDomain = SrcVT.ExeDomain in {
+ def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src)))]>,
+ EVEX, VEX_LIG, Sched<[sched]>, SIMD_EXC;
+ let Uses = [MXCSR] in
+ def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src, AVX512RC:$rc),
+ !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"),
+ [(set DstVT.RC:$dst, (OpNodeRnd (SrcVT.VT SrcVT.RC:$src),(i32 timm:$rc)))]>,
+ EVEX, VEX_LIG, EVEX_B, EVEX_RC,
+ Sched<[sched]>;
+ def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.IntScalarMemOp:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set DstVT.RC:$dst, (OpNode
+ (SrcVT.ScalarIntMemFrags addr:$src)))]>,
+ EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+ } // Predicates = [HasAVX512]
+
+ def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "rr_Int") DstVT.RC:$dst, SrcVT.RC:$src), 0, "att">;
+ def : InstAlias<"v" # asm # aliasStr # "\t{$rc, $src, $dst|$dst, $src, $rc}",
+ (!cast<Instruction>(NAME # "rrb_Int") DstVT.RC:$dst, SrcVT.RC:$src, AVX512RC:$rc), 0, "att">;
+ def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "rm_Int") DstVT.RC:$dst,
+ SrcVT.IntScalarMemOp:$src), 0, "att">;
+}
+
+// Convert float/double to signed/unsigned int 32/64
+defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, f32x_info, i32x_info,X86cvts2si,
+ X86cvts2siRnd, WriteCvtSS2I, "cvtss2si", "{l}">,
+ XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, f32x_info, i64x_info, X86cvts2si,
+ X86cvts2siRnd, WriteCvtSS2I, "cvtss2si", "{q}">,
+ XS, VEX_W, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, f32x_info, i32x_info, X86cvts2usi,
+ X86cvts2usiRnd, WriteCvtSS2I, "cvtss2usi", "{l}">,
+ XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, f32x_info, i64x_info, X86cvts2usi,
+ X86cvts2usiRnd, WriteCvtSS2I, "cvtss2usi", "{q}">,
+ XS, VEX_W, EVEX_CD8<32, CD8VT1>;
+defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, f64x_info, i32x_info, X86cvts2si,
+ X86cvts2siRnd, WriteCvtSD2I, "cvtsd2si", "{l}">,
+ XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, f64x_info, i64x_info, X86cvts2si,
+ X86cvts2siRnd, WriteCvtSD2I, "cvtsd2si", "{q}">,
+ XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, f64x_info, i32x_info, X86cvts2usi,
+ X86cvts2usiRnd, WriteCvtSD2I, "cvtsd2usi", "{l}">,
+ XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info, X86cvts2usi,
+ X86cvts2usiRnd, WriteCvtSD2I, "cvtsd2usi", "{q}">,
+ XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+multiclass avx512_cvt_s<bits<8> opc, string asm, X86VectorVTInfo SrcVT,
+ X86VectorVTInfo DstVT, SDNode OpNode,
+ X86FoldableSchedWrite sched,
+ string aliasStr> {
+ let Predicates = [HasAVX512], ExeDomain = SrcVT.ExeDomain in {
+ let isCodeGenOnly = 1 in {
+ def rr : AVX512<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.FRC:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set DstVT.RC:$dst, (OpNode SrcVT.FRC:$src))]>,
+ EVEX, VEX_LIG, Sched<[sched]>, SIMD_EXC;
+ def rm : AVX512<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.ScalarMemOp:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set DstVT.RC:$dst, (OpNode (SrcVT.ScalarLdFrag addr:$src)))]>,
+ EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+ }
+ } // Predicates = [HasAVX512]
+}
+
+defm VCVTSS2SIZ: avx512_cvt_s<0x2D, "vcvtss2si", f32x_info, i32x_info,
+ lrint, WriteCvtSS2I,
+ "{l}">, XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2SI64Z: avx512_cvt_s<0x2D, "vcvtss2si", f32x_info, i64x_info,
+ llrint, WriteCvtSS2I,
+ "{q}">, VEX_W, XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSD2SIZ: avx512_cvt_s<0x2D, "vcvtsd2si", f64x_info, i32x_info,
+ lrint, WriteCvtSD2I,
+ "{l}">, XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2SI64Z: avx512_cvt_s<0x2D, "vcvtsd2si", f64x_info, i64x_info,
+ llrint, WriteCvtSD2I,
+ "{q}">, VEX_W, XD, EVEX_CD8<64, CD8VT1>;
+
+let Predicates = [HasAVX512] in {
+ def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64Zrr FR32:$src)>;
+ def : Pat<(i64 (lrint (loadf32 addr:$src))), (VCVTSS2SI64Zrm addr:$src)>;
+
+ def : Pat<(i64 (lrint FR64:$src)), (VCVTSD2SI64Zrr FR64:$src)>;
+ def : Pat<(i64 (lrint (loadf64 addr:$src))), (VCVTSD2SI64Zrm addr:$src)>;
+}
+
+// Patterns used for matching the vcvtsi2s{s,d} intrinsic sequences emitted by
+// clang, which would otherwise produce unnecessary vmovs{s,d} instructions.
+let Predicates = [HasAVX512] in {
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
+ (VCVTSI642SSZrr_Int VR128X:$dst, GR64:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
+ (VCVTSI642SSZrm_Int VR128X:$dst, addr:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
+ (VCVTSI2SSZrr_Int VR128X:$dst, GR32:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
+ (VCVTSI2SSZrm_Int VR128X:$dst, addr:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
+ (VCVTSI642SDZrr_Int VR128X:$dst, GR64:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
+ (VCVTSI642SDZrm_Int VR128X:$dst, addr:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
+ (VCVTSI2SDZrr_Int VR128X:$dst, GR32:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
+ (VCVTSI2SDZrm_Int VR128X:$dst, addr:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (any_uint_to_fp GR64:$src)))))),
+ (VCVTUSI642SSZrr_Int VR128X:$dst, GR64:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (any_uint_to_fp (loadi64 addr:$src))))))),
+ (VCVTUSI642SSZrm_Int VR128X:$dst, addr:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (any_uint_to_fp GR32:$src)))))),
+ (VCVTUSI2SSZrr_Int VR128X:$dst, GR32:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (any_uint_to_fp (loadi32 addr:$src))))))),
+ (VCVTUSI2SSZrm_Int VR128X:$dst, addr:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (any_uint_to_fp GR64:$src)))))),
+ (VCVTUSI642SDZrr_Int VR128X:$dst, GR64:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (any_uint_to_fp (loadi64 addr:$src))))))),
+ (VCVTUSI642SDZrm_Int VR128X:$dst, addr:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (any_uint_to_fp GR32:$src)))))),
+ (VCVTUSI2SDZrr_Int VR128X:$dst, GR32:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (any_uint_to_fp (loadi32 addr:$src))))))),
+ (VCVTUSI2SDZrm_Int VR128X:$dst, addr:$src)>;
+} // Predicates = [HasAVX512]
+
+// Convert float/double to signed/unsigned int 32/64 with truncation
+multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC,
+ X86VectorVTInfo _DstRC, SDNode OpNode,
+ SDNode OpNodeInt, SDNode OpNodeSAE,
+ X86FoldableSchedWrite sched, string aliasStr>{
+let Predicates = [HasAVX512], ExeDomain = _SrcRC.ExeDomain in {
+ let isCodeGenOnly = 1 in {
+ def rr : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>,
+ EVEX, VEX_LIG, Sched<[sched]>, SIMD_EXC;
+ def rm : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.ScalarMemOp:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>,
+ EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+ }
+
+ def rr_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set _DstRC.RC:$dst, (OpNodeInt (_SrcRC.VT _SrcRC.RC:$src)))]>,
+ EVEX, VEX_LIG, Sched<[sched]>, SIMD_EXC;
+ let Uses = [MXCSR] in
+ def rrb_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
+ !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
+ [(set _DstRC.RC:$dst, (OpNodeSAE (_SrcRC.VT _SrcRC.RC:$src)))]>,
+ EVEX, VEX_LIG, EVEX_B, Sched<[sched]>;
+ def rm_Int : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst),
+ (ins _SrcRC.IntScalarMemOp:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set _DstRC.RC:$dst,
+ (OpNodeInt (_SrcRC.ScalarIntMemFrags addr:$src)))]>,
+ EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+} //HasAVX512
+
+ def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "rr_Int") _DstRC.RC:$dst, _SrcRC.RC:$src), 0, "att">;
+ def : InstAlias<asm # aliasStr # "\t{{sae}, $src, $dst|$dst, $src, {sae}}",
+ (!cast<Instruction>(NAME # "rrb_Int") _DstRC.RC:$dst, _SrcRC.RC:$src), 0, "att">;
+ def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "rm_Int") _DstRC.RC:$dst,
+ _SrcRC.IntScalarMemOp:$src), 0, "att">;
+}
+
+defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i32x_info,
+ any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I,
+ "{l}">, XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i64x_info,
+ any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I,
+ "{q}">, VEX_W, XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i32x_info,
+ any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I,
+ "{l}">, XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i64x_info,
+ any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I,
+ "{q}">, VEX_W, XD, EVEX_CD8<64, CD8VT1>;
+
+defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i32x_info,
+ any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I,
+ "{l}">, XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i64x_info,
+ any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I,
+ "{q}">, XS,VEX_W, EVEX_CD8<32, CD8VT1>;
+defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i32x_info,
+ any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I,
+ "{l}">, XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info,
+ any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I,
+ "{q}">, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Convert from float to double and back
+//===----------------------------------------------------------------------===//
+
+let Uses = [MXCSR], mayRaiseFPException = 1 in
+multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNode,
+ X86FoldableSchedWrite sched> {
+ defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode (_.VT _.RC:$src1),
+ (_Src.VT _Src.RC:$src2)))>,
+ EVEX_4V, VEX_LIG, Sched<[sched]>;
+ defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _Src.IntScalarMemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode (_.VT _.RC:$src1),
+ (_Src.ScalarIntMemFrags addr:$src2)))>,
+ EVEX_4V, VEX_LIG,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+
+ let isCodeGenOnly = 1, hasSideEffects = 0 in {
+ def rr : I<opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _Src.FRC:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ EVEX_4V, VEX_LIG, Sched<[sched]>;
+ let mayLoad = 1 in
+ def rm : I<opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _Src.ScalarMemOp:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ EVEX_4V, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+// Scalar Conversion with SAE - suppress all exceptions
+multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNodeSAE,
+ X86FoldableSchedWrite sched> {
+ let Uses = [MXCSR] in
+ defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+ (_.VT (OpNodeSAE (_.VT _.RC:$src1),
+ (_Src.VT _Src.RC:$src2)))>,
+ EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>;
+}
+
+// Scalar Conversion with rounding control (RC)
+multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNodeRnd,
+ X86FoldableSchedWrite sched> {
+ let Uses = [MXCSR] in
+ defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr,
+ "$rc, $src2, $src1", "$src1, $src2, $rc",
+ (_.VT (OpNodeRnd (_.VT _.RC:$src1),
+ (_Src.VT _Src.RC:$src2), (i32 timm:$rc)))>,
+ EVEX_4V, VEX_LIG, Sched<[sched]>,
+ EVEX_B, EVEX_RC;
+}
+multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _src, X86VectorVTInfo _dst> {
+ let Predicates = [HasAVX512], ExeDomain = SSEPackedSingle in {
+ defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode, sched>,
+ avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src,
+ OpNodeRnd, sched>, VEX_W, EVEX_CD8<64, CD8VT1>, XD;
+ }
+}
+
+multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeSAE,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _src, X86VectorVTInfo _dst> {
+ let Predicates = [HasAVX512], ExeDomain = SSEPackedSingle in {
+ defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode, sched>,
+ avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeSAE, sched>,
+ EVEX_CD8<32, CD8VT1>, XS;
+ }
+}
+defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", X86frounds,
+ X86froundsRnd, WriteCvtSD2SS, f64x_info,
+ f32x_info>;
+defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpexts,
+ X86fpextsSAE, WriteCvtSS2SD, f32x_info,
+ f64x_info>;
+
+def : Pat<(f64 (any_fpextend FR32X:$src)),
+ (VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), FR32X:$src)>,
+ Requires<[HasAVX512]>;
+def : Pat<(f64 (any_fpextend (loadf32 addr:$src))),
+ (VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[HasAVX512, OptForSize]>;
+
+def : Pat<(f32 (any_fpround FR64X:$src)),
+ (VCVTSD2SSZrr (f32 (IMPLICIT_DEF)), FR64X:$src)>,
+ Requires<[HasAVX512]>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector
+ (f32 (any_fpround (f64 (extractelt VR128X:$src, (iPTR 0))))))))),
+ (VCVTSD2SSZrr_Int VR128X:$dst, VR128X:$src)>,
+ Requires<[HasAVX512]>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector
+ (f64 (any_fpextend (f32 (extractelt VR128X:$src, (iPTR 0))))))))),
+ (VCVTSS2SDZrr_Int VR128X:$dst, VR128X:$src)>,
+ Requires<[HasAVX512]>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Vector convert from signed/unsigned integer to float/double
+// and from float/double to signed/unsigned integer
+//===----------------------------------------------------------------------===//
+
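+// avx512_vcvt_fp generates the register, memory, and broadcast-memory forms of
+// a packed conversion; each form also gets merge-masking and zero-masking
+// variants built from the vselect_mask dags below.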
+multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNode, SDNode MaskOpNode,
+ X86FoldableSchedWrite sched,
+ string Broadcast = _.BroadcastStr,
+ string Alias = "", X86MemOperand MemOp = _Src.MemOp,
+ RegisterClass MaskRC = _.KRCWM,
+ dag LdDAG = (_.VT (OpNode (_Src.VT (_Src.LdFrag addr:$src)))),
+ dag MaskLdDAG = (_.VT (MaskOpNode (_Src.VT (_Src.LdFrag addr:$src))))> {
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
+ defm rr : AVX512_maskable_cvt<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _Src.RC:$src),
+ (ins _.RC:$src0, MaskRC:$mask, _Src.RC:$src),
+ (ins MaskRC:$mask, _Src.RC:$src),
+ OpcodeStr, "$src", "$src",
+ (_.VT (OpNode (_Src.VT _Src.RC:$src))),
+ (vselect_mask MaskRC:$mask,
+ (_.VT (MaskOpNode (_Src.VT _Src.RC:$src))),
+ _.RC:$src0),
+ (vselect_mask MaskRC:$mask,
+ (_.VT (MaskOpNode (_Src.VT _Src.RC:$src))),
+ _.ImmAllZerosV)>,
+ EVEX, Sched<[sched]>;
+
+ defm rm : AVX512_maskable_cvt<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins MemOp:$src),
+ (ins _.RC:$src0, MaskRC:$mask, MemOp:$src),
+ (ins MaskRC:$mask, MemOp:$src),
+ OpcodeStr#Alias, "$src", "$src",
+ LdDAG,
+ (vselect_mask MaskRC:$mask, MaskLdDAG, _.RC:$src0),
+ (vselect_mask MaskRC:$mask, MaskLdDAG, _.ImmAllZerosV)>,
+ EVEX, Sched<[sched.Folded]>;
+
+ defm rmb : AVX512_maskable_cvt<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _Src.ScalarMemOp:$src),
+ (ins _.RC:$src0, MaskRC:$mask, _Src.ScalarMemOp:$src),
+ (ins MaskRC:$mask, _Src.ScalarMemOp:$src),
+ OpcodeStr,
+ "${src}"#Broadcast, "${src}"#Broadcast,
+ (_.VT (OpNode (_Src.VT
+ (_Src.BroadcastLdFrag addr:$src))
+ )),
+ (vselect_mask MaskRC:$mask,
+ (_.VT
+ (MaskOpNode
+ (_Src.VT
+ (_Src.BroadcastLdFrag addr:$src)))),
+ _.RC:$src0),
+ (vselect_mask MaskRC:$mask,
+ (_.VT
+ (MaskOpNode
+ (_Src.VT
+ (_Src.BroadcastLdFrag addr:$src)))),
+ _.ImmAllZerosV)>,
+ EVEX, EVEX_B, Sched<[sched.Folded]>;
+ }
+}
+// Conversion with SAE - suppress all exceptions
+multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNodeSAE,
+ X86FoldableSchedWrite sched> {
+ let Uses = [MXCSR] in
+ defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _Src.RC:$src), OpcodeStr,
+ "{sae}, $src", "$src, {sae}",
+ (_.VT (OpNodeSAE (_Src.VT _Src.RC:$src)))>,
+ EVEX, EVEX_B, Sched<[sched]>;
+}
+
+// Conversion with rounding control (RC)
+multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNodeRnd,
+ X86FoldableSchedWrite sched> {
+ let Uses = [MXCSR] in
+ defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _Src.RC:$src, AVX512RC:$rc), OpcodeStr,
+ "$rc, $src", "$src, $rc",
+ (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 timm:$rc)))>,
+ EVEX, EVEX_B, EVEX_RC, Sched<[sched]>;
+}
+
+// Similar to avx512_vcvt_fp, but uses an extload for the memory form.
+multiclass avx512_vcvt_fpextend<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNode,
+ SDNode MaskOpNode,
+ X86FoldableSchedWrite sched,
+ string Broadcast = _.BroadcastStr,
+ string Alias = "", X86MemOperand MemOp = _Src.MemOp,
+ RegisterClass MaskRC = _.KRCWM>
+ : avx512_vcvt_fp<opc, OpcodeStr, _, _Src, OpNode, MaskOpNode, sched, Broadcast,
+ Alias, MemOp, MaskRC,
+ (_.VT (!cast<PatFrag>("extload"#_Src.VTName) addr:$src)),
+ (_.VT (!cast<PatFrag>("extload"#_Src.VTName) addr:$src))>;
+
+// Extend Float to Double
+multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fpextend<opc, OpcodeStr, v8f64_info, v8f32x_info,
+ any_fpextend, fpextend, sched.ZMM>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info,
+ X86vfpextSAE, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fpextend<opc, OpcodeStr, v2f64x_info, v4f32x_info,
+ X86any_vfpext, X86vfpext, sched.XMM, "{1to2}",
+ "", f64mem>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fpextend<opc, OpcodeStr, v4f64x_info, v4f32x_info,
+ any_fpextend, fpextend, sched.YMM>, EVEX_V256;
+ }
+}
+
+// Truncate Double to Float
+multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info,
+ X86any_vfpround, X86vfpround, sched.ZMM>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info,
+ X86vfproundRnd, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info,
+ null_frag, null_frag, sched.XMM, "{1to2}", "{x}",
+ f128mem, VK2WM>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info,
+ X86any_vfpround, X86vfpround,
+ sched.YMM, "{1to4}", "{y}">, EVEX_V256;
+ }
+
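+  // The "x"/"y" aliases below give the asm parser unambiguous mnemonics for
+  // the 128-bit and 256-bit forms, whose memory operands would otherwise be
+  // indistinguishable since both produce a v4f32 result.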
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst,
+ VK2WM:$mask, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst,
+ VK2WM:$mask, f64mem:$src), 0, "att">;
+
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst,
+ VK4WM:$mask, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst,
+ VK4WM:$mask, f64mem:$src), 0, "att">;
+}
+
+defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps", SchedWriteCvtPD2PS>,
+ VEX_W, PD, EVEX_CD8<64, CD8VF>;
+defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd", SchedWriteCvtPS2PD>,
+ PS, EVEX_CD8<32, CD8VH>;
+
+let Predicates = [HasVLX] in {
+  // Special patterns to allow use of X86vmfpround for masking. The normal
+  // instruction patterns were disabled with null_frag above, so the masked
+  // forms are matched here instead.
+ def : Pat<(X86any_vfpround (v2f64 VR128X:$src)),
+ (VCVTPD2PSZ128rr VR128X:$src)>;
+ def : Pat<(X86vmfpround (v2f64 VR128X:$src), (v4f32 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTPD2PSZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>;
+ def : Pat<(X86vmfpround (v2f64 VR128X:$src), v4f32x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTPD2PSZ128rrkz VK2WM:$mask, VR128X:$src)>;
+
+ def : Pat<(X86any_vfpround (loadv2f64 addr:$src)),
+ (VCVTPD2PSZ128rm addr:$src)>;
+ def : Pat<(X86vmfpround (loadv2f64 addr:$src), (v4f32 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTPD2PSZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86vmfpround (loadv2f64 addr:$src), v4f32x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTPD2PSZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(X86any_vfpround (v2f64 (X86VBroadcastld64 addr:$src))),
+ (VCVTPD2PSZ128rmb addr:$src)>;
+ def : Pat<(X86vmfpround (v2f64 (X86VBroadcastld64 addr:$src)),
+ (v4f32 VR128X:$src0), VK2WM:$mask),
+ (VCVTPD2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86vmfpround (v2f64 (X86VBroadcastld64 addr:$src)),
+ v4f32x_info.ImmAllZerosV, VK2WM:$mask),
+ (VCVTPD2PSZ128rmbkz VK2WM:$mask, addr:$src)>;
+}
+
+// Convert Signed/Unsigned Doubleword to Double
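+// Converting a 32-bit integer to an f64 is always exact, so these
+// instructions never read MXCSR and cannot raise a floating-point exception.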
+let Uses = []<Register>, mayRaiseFPException = 0 in
+multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode, SDNode OpNode128,
+ SDNode MaskOpNode128,
+ X86SchedWriteWidths sched> {
+ // No rounding in this op
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i32x_info, OpNode,
+ MaskOpNode, sched.ZMM>, EVEX_V512;
+
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4i32x_info,
+ OpNode128, MaskOpNode128, sched.XMM, "{1to2}",
+ "", i64mem, VK2WM,
+ (v2f64 (OpNode128 (bc_v4i32
+ (v2i64
+ (scalar_to_vector (loadi64 addr:$src)))))),
+ (v2f64 (MaskOpNode128 (bc_v4i32
+ (v2i64
+ (scalar_to_vector (loadi64 addr:$src))))))>,
+ EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode,
+ MaskOpNode, sched.YMM>, EVEX_V256;
+ }
+}
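+
+// For illustration (the address register is an arbitrary example): the Z128
+// form above converts only the low two doublewords, so its memory operand is
+// a qword (i64mem) and the load pattern is built from a 64-bit scalar load,
+//   vcvtdq2pd (%rax), %xmm0      ; reads 8 bytes, widens 2 x i32 -> 2 x f64
+// rather than from a full 128-bit vector load.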
+
+// Convert Signed/Unsigned Doubleword to Float
+multiclass avx512_cvtdq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16f32_info, v16i32_info, OpNode,
+ MaskOpNode, sched.ZMM>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v16f32_info, v16i32_info,
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
+
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i32x_info, OpNode,
+ MaskOpNode, sched.XMM>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i32x_info, OpNode,
+ MaskOpNode, sched.YMM>, EVEX_V256;
+ }
+}
+
+// Convert Float to Signed/Unsigned Doubleword with truncation
+multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode,
+ SDNode OpNodeSAE, X86SchedWriteWidths sched> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode,
+ MaskOpNode, sched.ZMM>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info,
+ OpNodeSAE, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode,
+ MaskOpNode, sched.XMM>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode,
+ MaskOpNode, sched.YMM>, EVEX_V256;
+ }
+}
+
+// Convert Float to Signed/Unsigned Doubleword
+multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode,
+ MaskOpNode, sched.ZMM>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v16i32_info, v16f32_info,
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode,
+ MaskOpNode, sched.XMM>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode,
+ MaskOpNode, sched.YMM>, EVEX_V256;
+ }
+}
+
+// Convert Double to Signed/Unsigned Doubleword with truncation
+multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode, SDNode OpNodeSAE,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode,
+ MaskOpNode, sched.ZMM>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info,
+ OpNodeSAE, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+    // We need "x"/"y" suffixes in order to distinguish between the 128- and
+    // 256-bit memory forms of these instructions in the asm parser: they have
+    // the same dest type ('v4i32x_info'). We also specify the broadcast string
+    // explicitly for the same reason.
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info,
+ null_frag, null_frag, sched.XMM, "{1to2}", "{x}", f128mem,
+ VK2WM>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
+ MaskOpNode, sched.YMM, "{1to4}", "{y}">, EVEX_V256;
+ }
+
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst,
+ VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst,
+ f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst,
+ VK2WM:$mask, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst,
+ VK2WM:$mask, f64mem:$src), 0, "att">;
+
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst,
+ VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst,
+ f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst,
+ VK4WM:$mask, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst,
+ VK4WM:$mask, f64mem:$src), 0, "att">;
+}
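+
+// For illustration (operands are arbitrary examples): the "att" aliases
+// defined by avx512_cvttpd2dq above (and by avx512_cvtpd2dq below) give the
+// AT&T parser suffixed spellings that name the source width, e.g. for the
+// vcvttpd2dq instantiation:
+//   vcvttpd2dqx %xmm1, %xmm0
+//   vcvttpd2dqy %ymm1, %xmm0
+//   vcvttpd2dqy (%rax){1to4}, %xmm0 {%k1}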
+
+// Convert Double to Signed/Unsigned Doubleword
+multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode,
+ MaskOpNode, sched.ZMM>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8i32x_info, v8f64_info,
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+    // We need "x"/"y" suffixes in order to distinguish between the 128- and
+    // 256-bit memory forms of these instructions in the asm parser: they have
+    // the same dest type ('v4i32x_info'). We also specify the broadcast string
+    // explicitly for the same reason.
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info,
+ null_frag, null_frag, sched.XMM, "{1to2}", "{x}", f128mem,
+ VK2WM>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
+ MaskOpNode, sched.YMM, "{1to4}", "{y}">, EVEX_V256;
+ }
+
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst,
+ f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst,
+ VK2WM:$mask, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst,
+ VK2WM:$mask, f64mem:$src), 0, "att">;
+
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst,
+ f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst,
+ VK4WM:$mask, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst,
+ VK4WM:$mask, f64mem:$src), 0, "att">;
+}
+
+// Convert Double to Signed/Unsigned Quadword
+multiclass avx512_cvtpd2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode,
+ MaskOpNode, sched.ZMM>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f64_info,
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode,
+ MaskOpNode, sched.XMM>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode,
+ MaskOpNode, sched.YMM>, EVEX_V256;
+ }
+}
+
+// Convert Double to Signed/Unsigned Quadword with truncation
+multiclass avx512_cvttpd2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode,
+ MaskOpNode, sched.ZMM>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f64_info,
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode,
+ MaskOpNode, sched.XMM>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode,
+ MaskOpNode, sched.YMM>, EVEX_V256;
+ }
+}
+
+// Convert Signed/Unsigned Quadword to Double
+multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i64_info, OpNode,
+ MaskOpNode, sched.ZMM>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8f64_info, v8i64_info,
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v2i64x_info, OpNode,
+ MaskOpNode, sched.XMM>, EVEX_V128, NotEVEX2VEXConvertible;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i64x_info, OpNode,
+ MaskOpNode, sched.YMM>, EVEX_V256, NotEVEX2VEXConvertible;
+ }
+}
+
+// Convert Float to Signed/Unsigned Quadword
+multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode,
+ MaskOpNode, sched.ZMM>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f32x_info,
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+    // The broadcast string is specified explicitly, since we take only 2
+    // elements from the v4f32x_info source.
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
+ MaskOpNode, sched.XMM, "{1to2}", "", f64mem, VK2WM,
+ (v2i64 (OpNode (bc_v4f32
+ (v2f64
+ (scalar_to_vector (loadf64 addr:$src)))))),
+ (v2i64 (MaskOpNode (bc_v4f32
+ (v2f64
+ (scalar_to_vector (loadf64 addr:$src))))))>,
+ EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode,
+ MaskOpNode, sched.YMM>, EVEX_V256;
+ }
+}
+
+// Convert Float to Signed/Unsigned Quadword with truncation
+multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode,
+ MaskOpNode, sched.ZMM>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info,
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+    // The broadcast string is specified explicitly, since we take only 2
+    // elements from the v4f32x_info source.
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
+ MaskOpNode, sched.XMM, "{1to2}", "", f64mem, VK2WM,
+ (v2i64 (OpNode (bc_v4f32
+ (v2f64
+ (scalar_to_vector (loadf64 addr:$src)))))),
+ (v2i64 (MaskOpNode (bc_v4f32
+ (v2f64
+ (scalar_to_vector (loadf64 addr:$src))))))>,
+ EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode,
+ MaskOpNode, sched.YMM>, EVEX_V256;
+ }
+}
+
+// Convert Signed/Unsigned Quadword to Float
+multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode,
+ MaskOpNode, sched.ZMM>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8i64_info,
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+    // We need "x"/"y" suffixes in order to distinguish between the 128- and
+    // 256-bit memory forms of these instructions in the asm parser: they have
+    // the same dest type ('v4f32x_info'). We also specify the broadcast string
+    // explicitly for the same reason.
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, null_frag,
+ null_frag, sched.XMM, "{1to2}", "{x}", i128mem, VK2WM>,
+ EVEX_V128, NotEVEX2VEXConvertible;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode,
+ MaskOpNode, sched.YMM, "{1to4}", "{y}">, EVEX_V256,
+ NotEVEX2VEXConvertible;
+ }
+
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst,
+ VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst,
+ i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst,
+ VK2WM:$mask, i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst,
+ VK2WM:$mask, i64mem:$src), 0, "att">;
+
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst,
+ VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}}|"
+ "$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst,
+ i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst,
+ VK4WM:$mask, i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst,
+ VK4WM:$mask, i64mem:$src), 0, "att">;
+}
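+
+// For illustration (operands are arbitrary examples): for the vcvtqq2ps and
+// vcvtuqq2ps instantiations below, the "att" aliases above accept e.g.
+//   vcvtqq2psx %xmm1, %xmm0
+//   vcvtqq2psy (%rax){1to4}, %xmm0
+//   vcvtqq2psx (%rax){1to2}, %xmm0 {%k1} {z}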
+
+defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", any_sint_to_fp, sint_to_fp,
+ X86any_VSintToFP, X86VSintToFP,
+ SchedWriteCvtDQ2PD>, XS, EVEX_CD8<32, CD8VH>;
+
+defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", any_sint_to_fp, sint_to_fp,
+ X86VSintToFpRnd, SchedWriteCvtDQ2PS>,
+ PS, EVEX_CD8<32, CD8VF>;
+
+defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", X86any_cvttp2si,
+ X86cvttp2si, X86cvttp2siSAE,
+ SchedWriteCvtPS2DQ>, XS, EVEX_CD8<32, CD8VF>;
+
+defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", X86any_cvttp2si,
+ X86cvttp2si, X86cvttp2siSAE,
+ SchedWriteCvtPD2DQ>,
+ PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", X86any_cvttp2ui,
+ X86cvttp2ui, X86cvttp2uiSAE,
+ SchedWriteCvtPS2DQ>, PS, EVEX_CD8<32, CD8VF>;
+
+defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", X86any_cvttp2ui,
+ X86cvttp2ui, X86cvttp2uiSAE,
+ SchedWriteCvtPD2DQ>,
+ PS, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", any_uint_to_fp,
+ uint_to_fp, X86any_VUintToFP, X86VUintToFP,
+ SchedWriteCvtDQ2PD>, XS, EVEX_CD8<32, CD8VH>;
+
+defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", any_uint_to_fp,
+ uint_to_fp, X86VUintToFpRnd,
+ SchedWriteCvtDQ2PS>, XD, EVEX_CD8<32, CD8VF>;
+
+defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtp2Int, X86cvtp2Int,
+ X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, PD,
+ EVEX_CD8<32, CD8VF>;
+
+defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtp2Int, X86cvtp2Int,
+ X86cvtp2IntRnd, SchedWriteCvtPD2DQ>, XD,
+ VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtp2UInt, X86cvtp2UInt,
+ X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>,
+ PS, EVEX_CD8<32, CD8VF>;
+
+defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtp2UInt, X86cvtp2UInt,
+ X86cvtp2UIntRnd, SchedWriteCvtPD2DQ>, VEX_W,
+ PS, EVEX_CD8<64, CD8VF>;
+
+defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtp2Int, X86cvtp2Int,
+ X86cvtp2IntRnd, SchedWriteCvtPD2DQ>, VEX_W,
+ PD, EVEX_CD8<64, CD8VF>;
+
+defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtp2Int, X86cvtp2Int,
+ X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, PD,
+ EVEX_CD8<32, CD8VH>;
+
+defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtp2UInt, X86cvtp2UInt,
+ X86cvtp2UIntRnd, SchedWriteCvtPD2DQ>, VEX_W,
+ PD, EVEX_CD8<64, CD8VF>;
+
+defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt, X86cvtp2UInt,
+ X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, PD,
+ EVEX_CD8<32, CD8VH>;
+
+defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", X86any_cvttp2si,
+ X86cvttp2si, X86cvttp2siSAE,
+ SchedWriteCvtPD2DQ>, VEX_W,
+ PD, EVEX_CD8<64, CD8VF>;
+
+defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", X86any_cvttp2si,
+ X86cvttp2si, X86cvttp2siSAE,
+ SchedWriteCvtPS2DQ>, PD,
+ EVEX_CD8<32, CD8VH>;
+
+defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", X86any_cvttp2ui,
+ X86cvttp2ui, X86cvttp2uiSAE,
+ SchedWriteCvtPD2DQ>, VEX_W,
+ PD, EVEX_CD8<64, CD8VF>;
+
+defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", X86any_cvttp2ui,
+ X86cvttp2ui, X86cvttp2uiSAE,
+ SchedWriteCvtPS2DQ>, PD,
+ EVEX_CD8<32, CD8VH>;
+
+defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", any_sint_to_fp,
+ sint_to_fp, X86VSintToFpRnd,
+ SchedWriteCvtDQ2PD>, VEX_W, XS, EVEX_CD8<64, CD8VF>;
+
+defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", any_uint_to_fp,
+ uint_to_fp, X86VUintToFpRnd, SchedWriteCvtDQ2PD>,
+ VEX_W, XS, EVEX_CD8<64, CD8VF>;
+
+defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", any_sint_to_fp,
+ sint_to_fp, X86VSintToFpRnd, SchedWriteCvtDQ2PS>,
+ VEX_W, PS, EVEX_CD8<64, CD8VF>;
+
+defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", any_uint_to_fp,
+ uint_to_fp, X86VUintToFpRnd, SchedWriteCvtDQ2PS>,
+ VEX_W, XD, EVEX_CD8<64, CD8VF>;
+
+let Predicates = [HasVLX] in {
+ // Special patterns to allow use of X86mcvtp2Int for masking. Instruction
+ // patterns have been disabled with null_frag.
+ def : Pat<(v4i32 (X86cvtp2Int (v2f64 VR128X:$src))),
+ (VCVTPD2DQZ128rr VR128X:$src)>;
+ def : Pat<(X86mcvtp2Int (v2f64 VR128X:$src), (v4i32 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTPD2DQZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>;
+ def : Pat<(X86mcvtp2Int (v2f64 VR128X:$src), v4i32x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTPD2DQZ128rrkz VK2WM:$mask, VR128X:$src)>;
+
+ def : Pat<(v4i32 (X86cvtp2Int (loadv2f64 addr:$src))),
+ (VCVTPD2DQZ128rm addr:$src)>;
+ def : Pat<(X86mcvtp2Int (loadv2f64 addr:$src), (v4i32 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTPD2DQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86mcvtp2Int (loadv2f64 addr:$src), v4i32x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTPD2DQZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v4i32 (X86cvtp2Int (v2f64 (X86VBroadcastld64 addr:$src)))),
+ (VCVTPD2DQZ128rmb addr:$src)>;
+ def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcastld64 addr:$src)),
+ (v4i32 VR128X:$src0), VK2WM:$mask),
+ (VCVTPD2DQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcastld64 addr:$src)),
+ v4i32x_info.ImmAllZerosV, VK2WM:$mask),
+ (VCVTPD2DQZ128rmbkz VK2WM:$mask, addr:$src)>;
+
+ // Special patterns to allow use of X86mcvttp2si for masking. Instruction
+ // patterns have been disabled with null_frag.
+ def : Pat<(v4i32 (X86any_cvttp2si (v2f64 VR128X:$src))),
+ (VCVTTPD2DQZ128rr VR128X:$src)>;
+ def : Pat<(X86mcvttp2si (v2f64 VR128X:$src), (v4i32 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTTPD2DQZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>;
+ def : Pat<(X86mcvttp2si (v2f64 VR128X:$src), v4i32x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTTPD2DQZ128rrkz VK2WM:$mask, VR128X:$src)>;
+
+ def : Pat<(v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))),
+ (VCVTTPD2DQZ128rm addr:$src)>;
+ def : Pat<(X86mcvttp2si (loadv2f64 addr:$src), (v4i32 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTTPD2DQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86mcvttp2si (loadv2f64 addr:$src), v4i32x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTTPD2DQZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v4i32 (X86any_cvttp2si (v2f64 (X86VBroadcastld64 addr:$src)))),
+ (VCVTTPD2DQZ128rmb addr:$src)>;
+ def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcastld64 addr:$src)),
+ (v4i32 VR128X:$src0), VK2WM:$mask),
+ (VCVTTPD2DQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcastld64 addr:$src)),
+ v4i32x_info.ImmAllZerosV, VK2WM:$mask),
+ (VCVTTPD2DQZ128rmbkz VK2WM:$mask, addr:$src)>;
+
+ // Special patterns to allow use of X86mcvtp2UInt for masking. Instruction
+ // patterns have been disabled with null_frag.
+ def : Pat<(v4i32 (X86cvtp2UInt (v2f64 VR128X:$src))),
+ (VCVTPD2UDQZ128rr VR128X:$src)>;
+ def : Pat<(X86mcvtp2UInt (v2f64 VR128X:$src), (v4i32 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTPD2UDQZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>;
+ def : Pat<(X86mcvtp2UInt (v2f64 VR128X:$src), v4i32x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTPD2UDQZ128rrkz VK2WM:$mask, VR128X:$src)>;
+
+ def : Pat<(v4i32 (X86cvtp2UInt (loadv2f64 addr:$src))),
+ (VCVTPD2UDQZ128rm addr:$src)>;
+ def : Pat<(X86mcvtp2UInt (loadv2f64 addr:$src), (v4i32 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTPD2UDQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86mcvtp2UInt (loadv2f64 addr:$src), v4i32x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTPD2UDQZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v4i32 (X86cvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)))),
+ (VCVTPD2UDQZ128rmb addr:$src)>;
+ def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)),
+ (v4i32 VR128X:$src0), VK2WM:$mask),
+ (VCVTPD2UDQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)),
+ v4i32x_info.ImmAllZerosV, VK2WM:$mask),
+ (VCVTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>;
+
+  // Special patterns to allow use of X86mcvttp2ui for masking. Instruction
+ // patterns have been disabled with null_frag.
+ def : Pat<(v4i32 (X86any_cvttp2ui (v2f64 VR128X:$src))),
+ (VCVTTPD2UDQZ128rr VR128X:$src)>;
+ def : Pat<(X86mcvttp2ui (v2f64 VR128X:$src), (v4i32 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTTPD2UDQZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>;
+ def : Pat<(X86mcvttp2ui (v2f64 VR128X:$src), v4i32x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTTPD2UDQZ128rrkz VK2WM:$mask, VR128X:$src)>;
+
+ def : Pat<(v4i32 (X86any_cvttp2ui (loadv2f64 addr:$src))),
+ (VCVTTPD2UDQZ128rm addr:$src)>;
+ def : Pat<(X86mcvttp2ui (loadv2f64 addr:$src), (v4i32 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTTPD2UDQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86mcvttp2ui (loadv2f64 addr:$src), v4i32x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTTPD2UDQZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v4i32 (X86any_cvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)))),
+ (VCVTTPD2UDQZ128rmb addr:$src)>;
+ def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)),
+ (v4i32 VR128X:$src0), VK2WM:$mask),
+ (VCVTTPD2UDQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)),
+ v4i32x_info.ImmAllZerosV, VK2WM:$mask),
+ (VCVTTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>;
+}
+
+let Predicates = [HasDQI, HasVLX] in {
+ def : Pat<(v2i64 (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
+ (VCVTPS2QQZ128rm addr:$src)>;
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
+ (VCVTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ v2i64x_info.ImmAllZerosV)),
+ (VCVTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v2i64 (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
+ (VCVTPS2UQQZ128rm addr:$src)>;
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
+ (VCVTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ v2i64x_info.ImmAllZerosV)),
+ (VCVTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v2i64 (X86any_cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
+ (VCVTTPS2QQZ128rm addr:$src)>;
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
+ (VCVTTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ v2i64x_info.ImmAllZerosV)),
+ (VCVTTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v2i64 (X86any_cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
+ (VCVTTPS2UQQZ128rm addr:$src)>;
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
+ (VCVTTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ v2i64x_info.ImmAllZerosV)),
+ (VCVTTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
+}
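+
+// For illustration: the 128-bit ps->qq/uqq conversions read only two f32
+// elements (a qword of memory), so the patterns above fold a 64-bit
+// zero-extended load (X86vzload64) straight into the rm/rmk/rmkz forms, e.g.
+//   (X86cvtp2Int (bc_v4f32 (X86vzload64 addr))) -> VCVTPS2QQZ128rm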
+
+let Predicates = [HasVLX] in {
+ def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
+ (VCVTDQ2PDZ128rm addr:$src)>;
+ def : Pat<(v2f64 (vselect_mask VK2WM:$mask,
+ (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
+ (VCVTDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(v2f64 (vselect_mask VK2WM:$mask,
+ (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ v2f64x_info.ImmAllZerosV)),
+ (VCVTDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v2f64 (X86any_VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
+ (VCVTUDQ2PDZ128rm addr:$src)>;
+ def : Pat<(v2f64 (vselect_mask VK2WM:$mask,
+ (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
+ (VCVTUDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(v2f64 (vselect_mask VK2WM:$mask,
+ (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ v2f64x_info.ImmAllZerosV)),
+ (VCVTUDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
+}
+
+let Predicates = [HasDQI, HasVLX] in {
+ // Special patterns to allow use of X86VMSintToFP for masking. Instruction
+ // patterns have been disabled with null_frag.
+ def : Pat<(v4f32 (X86any_VSintToFP (v2i64 VR128X:$src))),
+ (VCVTQQ2PSZ128rr VR128X:$src)>;
+ def : Pat<(X86VMSintToFP (v2i64 VR128X:$src), (v4f32 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTQQ2PSZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>;
+ def : Pat<(X86VMSintToFP (v2i64 VR128X:$src), v4f32x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTQQ2PSZ128rrkz VK2WM:$mask, VR128X:$src)>;
+
+ def : Pat<(v4f32 (X86any_VSintToFP (loadv2i64 addr:$src))),
+ (VCVTQQ2PSZ128rm addr:$src)>;
+ def : Pat<(X86VMSintToFP (loadv2i64 addr:$src), (v4f32 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTQQ2PSZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86VMSintToFP (loadv2i64 addr:$src), v4f32x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v4f32 (X86any_VSintToFP (v2i64 (X86VBroadcastld64 addr:$src)))),
+ (VCVTQQ2PSZ128rmb addr:$src)>;
+ def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)),
+ (v4f32 VR128X:$src0), VK2WM:$mask),
+ (VCVTQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)),
+ v4f32x_info.ImmAllZerosV, VK2WM:$mask),
+ (VCVTQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>;
+
+ // Special patterns to allow use of X86VMUintToFP for masking. Instruction
+ // patterns have been disabled with null_frag.
+ def : Pat<(v4f32 (X86any_VUintToFP (v2i64 VR128X:$src))),
+ (VCVTUQQ2PSZ128rr VR128X:$src)>;
+ def : Pat<(X86VMUintToFP (v2i64 VR128X:$src), (v4f32 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTUQQ2PSZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>;
+ def : Pat<(X86VMUintToFP (v2i64 VR128X:$src), v4f32x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTUQQ2PSZ128rrkz VK2WM:$mask, VR128X:$src)>;
+
+ def : Pat<(v4f32 (X86any_VUintToFP (loadv2i64 addr:$src))),
+ (VCVTUQQ2PSZ128rm addr:$src)>;
+ def : Pat<(X86VMUintToFP (loadv2i64 addr:$src), (v4f32 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTUQQ2PSZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86VMUintToFP (loadv2i64 addr:$src), v4f32x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTUQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v4f32 (X86any_VUintToFP (v2i64 (X86VBroadcastld64 addr:$src)))),
+ (VCVTUQQ2PSZ128rmb addr:$src)>;
+ def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)),
+ (v4f32 VR128X:$src0), VK2WM:$mask),
+ (VCVTUQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)),
+ v4f32x_info.ImmAllZerosV, VK2WM:$mask),
+ (VCVTUQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Half precision conversion instructions
+//===----------------------------------------------------------------------===//
+
+let Uses = [MXCSR], mayRaiseFPException = 1 in
+multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
+ X86MemOperand x86memop, dag ld_dag,
+ X86FoldableSchedWrite sched> {
+ defm rr : AVX512_maskable_split<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst),
+ (ins _src.RC:$src), "vcvtph2ps", "$src", "$src",
+ (X86any_cvtph2ps (_src.VT _src.RC:$src)),
+ (X86cvtph2ps (_src.VT _src.RC:$src))>,
+ T8PD, Sched<[sched]>;
+ defm rm : AVX512_maskable_split<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst),
+ (ins x86memop:$src), "vcvtph2ps", "$src", "$src",
+ (X86any_cvtph2ps (_src.VT ld_dag)),
+ (X86cvtph2ps (_src.VT ld_dag))>,
+ T8PD, Sched<[sched.Folded]>;
+}
+
+multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
+ X86FoldableSchedWrite sched> {
+ let Uses = [MXCSR] in
+ defm rrb : AVX512_maskable<0x13, MRMSrcReg, _dest, (outs _dest.RC:$dst),
+ (ins _src.RC:$src), "vcvtph2ps",
+ "{sae}, $src", "$src, {sae}",
+ (X86cvtph2psSAE (_src.VT _src.RC:$src))>,
+ T8PD, EVEX_B, Sched<[sched]>;
+}
+
+let Predicates = [HasAVX512] in
+ defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem,
+ (load addr:$src), WriteCvtPH2PSZ>,
+ avx512_cvtph2ps_sae<v16f32_info, v16i16x_info, WriteCvtPH2PSZ>,
+ EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
+
+let Predicates = [HasVLX] in {
+ defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
+ (load addr:$src), WriteCvtPH2PSY>, EVEX, EVEX_V256,
+ EVEX_CD8<32, CD8VH>;
+ defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
+ (bitconvert (v2i64 (X86vzload64 addr:$src))),
+ WriteCvtPH2PS>, EVEX, EVEX_V128,
+ EVEX_CD8<32, CD8VH>;
+
+ // Pattern match vcvtph2ps of a scalar i64 load.
+ def : Pat<(v4f32 (X86any_cvtph2ps (v8i16 (bitconvert
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+ (VCVTPH2PSZ128rm addr:$src)>;
+}
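+
+// For illustration (the address register is an arbitrary example): the
+// 128-bit form converts four f16 values, so it uses a 64-bit memory operand
+// (f64mem) and the pattern above folds a scalar i64 load,
+//   vcvtph2ps (%rax), %xmm0      ; reads 8 bytes, widens 4 x f16 -> 4 x f32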
+
+multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
+ X86MemOperand x86memop, SchedWrite RR, SchedWrite MR> {
+let ExeDomain = GenericDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
+ def rr : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
+ (ins _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _dest.RC:$dst,
+ (X86any_cvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2)))]>,
+ Sched<[RR]>;
+ let Constraints = "$src0 = $dst" in
+ def rrk : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
+ (ins _dest.RC:$src0, _src.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+ [(set _dest.RC:$dst,
+ (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2),
+ _dest.RC:$src0, _src.KRCWM:$mask))]>,
+ Sched<[RR]>, EVEX_K;
+ def rrkz : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
+ (ins _src.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}",
+ [(set _dest.RC:$dst,
+ (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2),
+ _dest.ImmAllZerosV, _src.KRCWM:$mask))]>,
+ Sched<[RR]>, EVEX_KZ;
+ let hasSideEffects = 0, mayStore = 1 in {
+ def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
+ (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ Sched<[MR]>;
+ def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs),
+ (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", []>,
+ EVEX_K, Sched<[MR]>, NotMemoryFoldable;
+ }
+}
+}
+
+multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
+ SchedWrite Sched> {
+ let hasSideEffects = 0, Uses = [MXCSR] in
+ defm rrb : AVX512_maskable_in_asm<0x1D, MRMDestReg, _dest,
+ (outs _dest.RC:$dst),
+ (ins _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph", "$src2, {sae}, $src1", "$src1, {sae}, $src2", []>,
+ EVEX_B, AVX512AIi8Base, Sched<[Sched]>;
+}
+
+let Predicates = [HasAVX512] in {
+ defm VCVTPS2PHZ : avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem,
+ WriteCvtPS2PHZ, WriteCvtPS2PHZSt>,
+ avx512_cvtps2ph_sae<v16i16x_info, v16f32_info, WriteCvtPS2PHZ>,
+ EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
+
+ def : Pat<(store (v16i16 (X86any_cvtps2ph VR512:$src1, timm:$src2)), addr:$dst),
+ (VCVTPS2PHZmr addr:$dst, VR512:$src1, timm:$src2)>;
+}
+
+let Predicates = [HasVLX] in {
+ defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem,
+ WriteCvtPS2PHY, WriteCvtPS2PHYSt>,
+ EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
+ defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f64mem,
+ WriteCvtPS2PH, WriteCvtPS2PHSt>,
+ EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
+
+ def : Pat<(store (f64 (extractelt
+ (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128X:$src1, timm:$src2))),
+ (iPTR 0))), addr:$dst),
+ (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, timm:$src2)>;
+ def : Pat<(store (i64 (extractelt
+ (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128X:$src1, timm:$src2))),
+ (iPTR 0))), addr:$dst),
+ (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, timm:$src2)>;
+ def : Pat<(store (v8i16 (X86any_cvtps2ph VR256X:$src1, timm:$src2)), addr:$dst),
+ (VCVTPS2PHZ256mr addr:$dst, VR256X:$src1, timm:$src2)>;
+}
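+
+// For illustration (operands are arbitrary examples): the 128-bit vcvtps2ph
+// result occupies only the low 64 bits of the destination, so the store
+// patterns above extract element 0 of the result viewed as v2f64/v2i64 and
+// select VCVTPS2PHZ128mr, which stores a single qword:
+//   vcvtps2ph $4, %xmm1, (%rax)  ; stores 8 bytes (4 x f16)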
+
+// Unordered/ordered scalar FP compares with SAE; they set EFLAGS.
+multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _,
+ string OpcodeStr, Domain d,
+ X86FoldableSchedWrite sched = WriteFComX> {
+ let hasSideEffects = 0, Uses = [MXCSR] in
+ def rrb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2),
+ !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"), []>,
+ EVEX, EVEX_B, VEX_LIG, EVEX_V128, Sched<[sched]>;
+}
+
+let Defs = [EFLAGS], Predicates = [HasAVX512] in {
+ defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, "vucomiss", SSEPackedSingle>,
+ AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
+ defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, "vucomisd", SSEPackedDouble>,
+ AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
+ defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, "vcomiss", SSEPackedSingle>,
+ AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
+ defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, "vcomisd", SSEPackedDouble>,
+ AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
+}
+
+let Defs = [EFLAGS], Predicates = [HasAVX512] in {
+ defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86any_fcmp, f32, f32mem, loadf32,
+ "ucomiss", SSEPackedSingle>, PS, EVEX, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86any_fcmp, f64, f64mem, loadf64,
+ "ucomisd", SSEPackedDouble>, PD, EVEX,
+ VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+ defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, X86strict_fcmps, f32, f32mem, loadf32,
+ "comiss", SSEPackedSingle>, PS, EVEX, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, X86strict_fcmps, f64, f64mem, loadf64,
+ "comisd", SSEPackedDouble>, PD, EVEX,
+ VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+ let isCodeGenOnly = 1 in {
+ defm VUCOMISSZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v4f32, ssmem,
+ sse_load_f32, "ucomiss", SSEPackedSingle>, PS, EVEX, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm VUCOMISDZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v2f64, sdmem,
+ sse_load_f64, "ucomisd", SSEPackedDouble>, PD, EVEX,
+ VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+ defm VCOMISSZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v4f32, ssmem,
+ sse_load_f32, "comiss", SSEPackedSingle>, PS, EVEX, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm VCOMISDZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v2f64, sdmem,
+ sse_load_f64, "comisd", SSEPackedDouble>, PD, EVEX,
+ VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+ }
+}
+
+/// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd
+multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+ let Predicates = [HasAVX512], ExeDomain = _.ExeDomain, Uses = [MXCSR] in {
+ defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
+ EVEX_4V, VEX_LIG, Sched<[sched]>;
+ defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1),
+ (_.ScalarIntMemFrags addr:$src2))>, EVEX_4V, VEX_LIG,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+}
+
+defm VRCP14SSZ : avx512_fp14_s<0x4D, "vrcp14ss", X86rcp14s, SchedWriteFRcp.Scl,
+ f32x_info>, EVEX_CD8<32, CD8VT1>,
+ T8PD;
+defm VRCP14SDZ : avx512_fp14_s<0x4D, "vrcp14sd", X86rcp14s, SchedWriteFRcp.Scl,
+ f64x_info>, VEX_W, EVEX_CD8<64, CD8VT1>,
+ T8PD;
+defm VRSQRT14SSZ : avx512_fp14_s<0x4F, "vrsqrt14ss", X86rsqrt14s,
+ SchedWriteFRsqrt.Scl, f32x_info>,
+ EVEX_CD8<32, CD8VT1>, T8PD;
+defm VRSQRT14SDZ : avx512_fp14_s<0x4F, "vrsqrt14sd", X86rsqrt14s,
+ SchedWriteFRsqrt.Scl, f64x_info>, VEX_W,
+ EVEX_CD8<64, CD8VT1>, T8PD;
+
+/// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
+multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
+ defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src), OpcodeStr, "$src", "$src",
+ (_.VT (OpNode _.RC:$src))>, EVEX, T8PD,
+ Sched<[sched]>;
+ defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
+ (OpNode (_.VT
+ (bitconvert (_.LdFrag addr:$src))))>, EVEX, T8PD,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src), OpcodeStr,
+ "${src}"#_.BroadcastStr, "${src}"#_.BroadcastStr,
+ (OpNode (_.VT
+ (_.BroadcastLdFrag addr:$src)))>,
+ EVEX, T8PD, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+let Uses = [MXCSR] in
+multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched> {
+ defm PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), OpNode, sched.ZMM,
+ v16f32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), OpNode, sched.ZMM,
+ v8f64_info>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+ // Define only if AVX512VL feature is present.
+ let Predicates = [HasVLX] in {
+ defm PSZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"),
+ OpNode, sched.XMM, v4f32x_info>,
+ EVEX_V128, EVEX_CD8<32, CD8VF>;
+ defm PSZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"),
+ OpNode, sched.YMM, v8f32x_info>,
+ EVEX_V256, EVEX_CD8<32, CD8VF>;
+ defm PDZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"),
+ OpNode, sched.XMM, v2f64x_info>,
+ EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
+ defm PDZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"),
+ OpNode, sched.YMM, v4f64x_info>,
+ EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
+ }
+}
+
+defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86rsqrt14, SchedWriteFRsqrt>;
+defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86rcp14, SchedWriteFRcp>;
+
+/// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
+multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ SDNode OpNode, SDNode OpNodeSAE,
+ X86FoldableSchedWrite sched> {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR] in {
+ defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
+ Sched<[sched]>, SIMD_EXC;
+
+ defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+ (OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
+ EVEX_B, Sched<[sched]>;
+
+ defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2))>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+ }
+}
+
+multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeSAE, X86FoldableSchedWrite sched> {
+ defm SSZ : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode, OpNodeSAE,
+ sched>, EVEX_CD8<32, CD8VT1>, VEX_LIG;
+ defm SDZ : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode, OpNodeSAE,
+ sched>, EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
+}
+
+let Predicates = [HasERI] in {
+ defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s, X86rcp28SAEs,
+ SchedWriteFRcp.Scl>, T8PD, EVEX_4V;
+ defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s, X86rsqrt28SAEs,
+ SchedWriteFRsqrt.Scl>, T8PD, EVEX_4V;
+}
+
+defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexps, X86fgetexpSAEs,
+ SchedWriteFRnd.Scl>, T8PD, EVEX_4V;
+
+/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd
+multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ SDNode OpNode, X86FoldableSchedWrite sched> {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
+ defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src), OpcodeStr, "$src", "$src",
+ (OpNode (_.VT _.RC:$src))>,
+ Sched<[sched]>;
+
+ defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
+ (OpNode (_.VT
+ (bitconvert (_.LdFrag addr:$src))))>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+
+ defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src), OpcodeStr,
+ "${src}"#_.BroadcastStr, "${src}"#_.BroadcastStr,
+ (OpNode (_.VT
+ (_.BroadcastLdFrag addr:$src)))>,
+ EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+multiclass avx512_fp28_p_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ SDNode OpNode, X86FoldableSchedWrite sched> {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
+ defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src), OpcodeStr,
+ "{sae}, $src", "$src, {sae}",
+ (OpNode (_.VT _.RC:$src))>,
+ EVEX_B, Sched<[sched]>;
+}
+
+multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeSAE, X86SchedWriteWidths sched> {
+ defm PSZ : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode, sched.ZMM>,
+ avx512_fp28_p_sae<opc, OpcodeStr#"ps", v16f32_info, OpNodeSAE, sched.ZMM>,
+ T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode, sched.ZMM>,
+ avx512_fp28_p_sae<opc, OpcodeStr#"pd", v8f64_info, OpNodeSAE, sched.ZMM>,
+ T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+multiclass avx512_fp_unaryop_packed<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, X86SchedWriteWidths sched> {
+ // Define only if AVX512VL feature is present.
+ let Predicates = [HasVLX] in {
+ defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode,
+ sched.XMM>,
+ EVEX_V128, T8PD, EVEX_CD8<32, CD8VF>;
+ defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode,
+ sched.YMM>,
+ EVEX_V256, T8PD, EVEX_CD8<32, CD8VF>;
+ defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode,
+ sched.XMM>,
+ EVEX_V128, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
+ defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode,
+ sched.YMM>,
+ EVEX_V256, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
+ }
+}
+
+let Predicates = [HasERI] in {
+ defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28, X86rsqrt28SAE,
+ SchedWriteFRsqrt>, EVEX;
+ defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28, X86rcp28SAE,
+ SchedWriteFRcp>, EVEX;
+ defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2, X86exp2SAE,
+ SchedWriteFAdd>, EVEX;
+}
+defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexp, X86fgetexpSAE,
+ SchedWriteFRnd>,
+ avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexp,
+ SchedWriteFRnd>, EVEX;
+
+multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _>{
+ let ExeDomain = _.ExeDomain in
+ defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src, AVX512RC:$rc), OpcodeStr, "$rc, $src", "$src, $rc",
+ (_.VT (X86fsqrtRnd _.RC:$src, (i32 timm:$rc)))>,
+ EVEX, EVEX_B, EVEX_RC, Sched<[sched]>;
+}
+
+multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _>{
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
+ defm r: AVX512_maskable_split<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src), OpcodeStr, "$src", "$src",
+ (_.VT (any_fsqrt _.RC:$src)),
+ (_.VT (fsqrt _.RC:$src))>, EVEX,
+ Sched<[sched]>;
+ defm m: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
+ (any_fsqrt (_.VT (_.LdFrag addr:$src))),
+ (fsqrt (_.VT (_.LdFrag addr:$src)))>, EVEX,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ defm mb: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src), OpcodeStr,
+ "${src}"#_.BroadcastStr, "${src}"#_.BroadcastStr,
+ (any_fsqrt (_.VT (_.BroadcastLdFrag addr:$src))),
+ (fsqrt (_.VT (_.BroadcastLdFrag addr:$src)))>,
+ EVEX, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+let Uses = [MXCSR], mayRaiseFPException = 1 in
+multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
+ X86SchedWriteSizes sched> {
+ defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
+ sched.PS.ZMM, v16f32_info>,
+ EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
+ sched.PD.ZMM, v8f64_info>,
+ EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>;
+ // Define only if AVX512VL feature is present.
+ let Predicates = [HasVLX] in {
+ defm PSZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
+ sched.PS.XMM, v4f32x_info>,
+ EVEX_V128, PS, EVEX_CD8<32, CD8VF>;
+ defm PSZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
+ sched.PS.YMM, v8f32x_info>,
+ EVEX_V256, PS, EVEX_CD8<32, CD8VF>;
+ defm PDZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
+ sched.PD.XMM, v2f64x_info>,
+ EVEX_V128, VEX_W, PD, EVEX_CD8<64, CD8VF>;
+ defm PDZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
+ sched.PD.YMM, v4f64x_info>,
+ EVEX_V256, VEX_W, PD, EVEX_CD8<64, CD8VF>;
+ }
+}
+
+let Uses = [MXCSR] in
+multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
+ X86SchedWriteSizes sched> {
+ defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"),
+ sched.PS.ZMM, v16f32_info>,
+ EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "pd"),
+ sched.PD.ZMM, v8f64_info>,
+ EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>;
+}
+
+multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, string Name> {
+ let ExeDomain = _.ExeDomain in {
+ defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (X86fsqrts (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2))>,
+ Sched<[sched]>, SIMD_EXC;
+ defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (X86fsqrts (_.VT _.RC:$src1),
+ (_.ScalarIntMemFrags addr:$src2))>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+ let Uses = [MXCSR] in
+ defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
+ "$rc, $src2, $src1", "$src1, $src2, $rc",
+ (X86fsqrtRnds (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (i32 timm:$rc))>,
+ EVEX_B, EVEX_RC, Sched<[sched]>;
+
+ let isCodeGenOnly = 1, hasSideEffects = 0, Predicates=[HasAVX512] in {
+ def r : I<opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ Sched<[sched]>, SIMD_EXC;
+ let mayLoad = 1 in
+ def m : I<opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+ }
+ }
+
+ let Predicates = [HasAVX512] in {
+ def : Pat<(_.EltVT (any_fsqrt _.FRC:$src)),
+ (!cast<Instruction>(Name#Zr)
+ (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>;
+ }
+
+ let Predicates = [HasAVX512, OptForSize] in {
+ def : Pat<(_.EltVT (any_fsqrt (load addr:$src))),
+ (!cast<Instruction>(Name#Zm)
+ (_.EltVT (IMPLICIT_DEF)), addr:$src)>;
+ }
+}
+
+multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr,
+ X86SchedWriteSizes sched> {
+ defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", sched.PS.Scl, f32x_info, NAME#"SS">,
+ EVEX_CD8<32, CD8VT1>, EVEX_4V, XS;
+ defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", sched.PD.Scl, f64x_info, NAME#"SD">,
+ EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W;
+}
+
+defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", SchedWriteFSqrtSizes>,
+ avx512_sqrt_packed_all_round<0x51, "vsqrt", SchedWriteFSqrtSizes>;
+
+defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt", SchedWriteFSqrtSizes>, VEX_LIG;
+
+multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
+ defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
+ "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ (i32 timm:$src3)))>,
+ Sched<[sched]>, SIMD_EXC;
+
+ let Uses = [MXCSR] in
+ defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
+ "$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3",
+ (_.VT (X86RndScalesSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ (i32 timm:$src3)))>, EVEX_B,
+ Sched<[sched]>;
+
+ defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2, i32u8imm:$src3),
+ OpcodeStr,
+ "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (_.VT (X86RndScales _.RC:$src1,
+ (_.ScalarIntMemFrags addr:$src2), (i32 timm:$src3)))>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+
+ let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [HasAVX512] in {
+ def r : I<opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2, i32u8imm:$src3),
+ OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, Sched<[sched]>, SIMD_EXC;
+
+ let mayLoad = 1 in
+ def m : I<opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
+ OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+ }
+ }
+
+ let Predicates = [HasAVX512] in {
+ def : Pat<(X86any_VRndScale _.FRC:$src1, timm:$src2),
+ (_.EltVT (!cast<Instruction>(NAME#r) (_.EltVT (IMPLICIT_DEF)),
+ _.FRC:$src1, timm:$src2))>;
+ }
+
+ let Predicates = [HasAVX512, OptForSize] in {
+ def : Pat<(X86any_VRndScale (_.ScalarLdFrag addr:$src1), timm:$src2),
+ (_.EltVT (!cast<Instruction>(NAME#m) (_.EltVT (IMPLICIT_DEF)),
+ addr:$src1, timm:$src2))>;
+ }
+}
+
+defm VRNDSCALESSZ : avx512_rndscale_scalar<0x0A, "vrndscaless",
+ SchedWriteFRnd.Scl, f32x_info>,
+ AVX512AIi8Base, EVEX_4V, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+
+defm VRNDSCALESDZ : avx512_rndscale_scalar<0x0B, "vrndscalesd",
+ SchedWriteFRnd.Scl, f64x_info>,
+ VEX_W, AVX512AIi8Base, EVEX_4V, VEX_LIG,
+ EVEX_CD8<64, CD8VT1>;
+
+multiclass avx512_masked_scalar<SDNode OpNode, string OpcPrefix, SDNode Move,
+ dag Mask, X86VectorVTInfo _, PatLeaf ZeroFP,
+ dag OutMask, Predicate BasePredicate> {
+ let Predicates = [BasePredicate] in {
+ def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects_mask Mask,
+ (OpNode (extractelt _.VT:$src2, (iPTR 0))),
+ (extractelt _.VT:$dst, (iPTR 0))))),
+ (!cast<Instruction>("V"#OpcPrefix#r_Intk)
+ _.VT:$dst, OutMask, _.VT:$src2, _.VT:$src1)>;
+
+ def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects_mask Mask,
+ (OpNode (extractelt _.VT:$src2, (iPTR 0))),
+ ZeroFP))),
+ (!cast<Instruction>("V"#OpcPrefix#r_Intkz)
+ OutMask, _.VT:$src2, _.VT:$src1)>;
+ }
+}
+
+defm : avx512_masked_scalar<fsqrt, "SQRTSSZ", X86Movss,
+ (v1i1 (scalar_to_vector (i8 (trunc (i32 GR32:$mask))))), v4f32x_info,
+ fp32imm0, (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
+defm : avx512_masked_scalar<fsqrt, "SQRTSDZ", X86Movsd,
+ (v1i1 (scalar_to_vector (i8 (trunc (i32 GR32:$mask))))), v2f64x_info,
+ fp64imm0, (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
+
+
+//-------------------------------------------------
+// Integer truncate and extend operations
+//-------------------------------------------------
+
+// PatFrags that contain a select and a truncate op. They take operands in the
+// same order as X86vmtrunc, X86vmtruncs, X86vmtruncus. This allows us to pass
+// either to the multiclasses.
+def select_trunc : PatFrag<(ops node:$src, node:$src0, node:$mask),
+ (vselect_mask node:$mask,
+ (trunc node:$src), node:$src0)>;
+def select_truncs : PatFrag<(ops node:$src, node:$src0, node:$mask),
+ (vselect_mask node:$mask,
+ (X86vtruncs node:$src), node:$src0)>;
+def select_truncus : PatFrag<(ops node:$src, node:$src0, node:$mask),
+ (vselect_mask node:$mask,
+ (X86vtruncus node:$src), node:$src0)>;
+
+multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDPatternOperator MaskNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo SrcInfo,
+ X86VectorVTInfo DestInfo, X86MemOperand x86memop> {
+ let ExeDomain = DestInfo.ExeDomain in {
+ def rr : AVX512XS8I<opc, MRMDestReg, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.RC:$src),
+ OpcodeStr # "\t{$src, $dst|$dst, $src}",
+ [(set DestInfo.RC:$dst,
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src))))]>,
+ EVEX, Sched<[sched]>;
+ let Constraints = "$src0 = $dst" in
+ def rrk : AVX512XS8I<opc, MRMDestReg, (outs DestInfo.RC:$dst),
+ (ins DestInfo.RC:$src0, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src),
+ OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ [(set DestInfo.RC:$dst,
+ (MaskNode (SrcInfo.VT SrcInfo.RC:$src),
+ (DestInfo.VT DestInfo.RC:$src0),
+ SrcInfo.KRCWM:$mask))]>,
+ EVEX, EVEX_K, Sched<[sched]>;
+ def rrkz : AVX512XS8I<opc, MRMDestReg, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.KRCWM:$mask, SrcInfo.RC:$src),
+ OpcodeStr # "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ [(set DestInfo.RC:$dst,
+ (DestInfo.VT (MaskNode (SrcInfo.VT SrcInfo.RC:$src),
+ DestInfo.ImmAllZerosV, SrcInfo.KRCWM:$mask)))]>,
+ EVEX, EVEX_KZ, Sched<[sched]>;
+ }
+
+ let mayStore = 1, hasSideEffects = 0, ExeDomain = DestInfo.ExeDomain in {
+ def mr : AVX512XS8I<opc, MRMDestMem, (outs),
+ (ins x86memop:$dst, SrcInfo.RC:$src),
+ OpcodeStr # "\t{$src, $dst|$dst, $src}", []>,
+ EVEX, Sched<[sched.Folded]>;
+
+ def mrk : AVX512XS8I<opc, MRMDestMem, (outs),
+ (ins x86memop:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src),
+ OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", []>,
+ EVEX, EVEX_K, Sched<[sched.Folded]>, NotMemoryFoldable;
+  } // mayStore = 1, hasSideEffects = 0
+}
+
+multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo,
+ X86VectorVTInfo DestInfo,
+ PatFrag truncFrag, PatFrag mtruncFrag,
+ string Name> {
+
+ def : Pat<(truncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst),
+ (!cast<Instruction>(Name#SrcInfo.ZSuffix#mr)
+ addr:$dst, SrcInfo.RC:$src)>;
+
+ def : Pat<(mtruncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst,
+ SrcInfo.KRCWM:$mask),
+ (!cast<Instruction>(Name#SrcInfo.ZSuffix#mrk)
+ addr:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src)>;
+}
+
+multiclass avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode128,
+ SDNode OpNode256, SDNode OpNode512,
+ SDPatternOperator MaskNode128,
+ SDPatternOperator MaskNode256,
+ SDPatternOperator MaskNode512,
+ X86FoldableSchedWrite sched,
+ AVX512VLVectorVTInfo VTSrcInfo,
+ X86VectorVTInfo DestInfoZ128,
+ X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ,
+ X86MemOperand x86memopZ128, X86MemOperand x86memopZ256,
+ X86MemOperand x86memopZ, PatFrag truncFrag,
+ PatFrag mtruncFrag, Predicate prd = HasAVX512>{
+
+ let Predicates = [HasVLX, prd] in {
+ defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode128, MaskNode128, sched,
+ VTSrcInfo.info128, DestInfoZ128, x86memopZ128>,
+ avx512_trunc_mr_lowering<VTSrcInfo.info128, DestInfoZ128,
+ truncFrag, mtruncFrag, NAME>, EVEX_V128;
+
+ defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode256, MaskNode256, sched,
+ VTSrcInfo.info256, DestInfoZ256, x86memopZ256>,
+ avx512_trunc_mr_lowering<VTSrcInfo.info256, DestInfoZ256,
+ truncFrag, mtruncFrag, NAME>, EVEX_V256;
+ }
+ let Predicates = [prd] in
+ defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode512, MaskNode512, sched,
+ VTSrcInfo.info512, DestInfoZ, x86memopZ>,
+ avx512_trunc_mr_lowering<VTSrcInfo.info512, DestInfoZ,
+ truncFrag, mtruncFrag, NAME>, EVEX_V512;
+}
+
+multiclass avx512_trunc_qb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDPatternOperator MaskNode,
+ X86FoldableSchedWrite sched, PatFrag StoreNode,
+ PatFrag MaskedStoreNode, SDNode InVecNode,
+ SDPatternOperator InVecMaskNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, InVecNode, InVecNode,
+ InVecMaskNode, InVecMaskNode, InVecMaskNode, sched,
+ avx512vl_i64_info, v16i8x_info, v16i8x_info,
+ v16i8x_info, i16mem, i32mem, i64mem, StoreNode,
+ MaskedStoreNode>, EVEX_CD8<8, CD8VO>;
+}
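+
+// Note the memory operand sizes above: truncating 2/4/8 qwords to bytes only
+// writes 2/4/8 bytes, hence i16mem/i32mem/i64mem for the 128/256/512-bit
+// forms and the CD8VO (one-eighth vector) disp8 scaling.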
+
+multiclass avx512_trunc_qw<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDPatternOperator MaskNode,
+ X86FoldableSchedWrite sched, PatFrag StoreNode,
+ PatFrag MaskedStoreNode, SDNode InVecNode,
+ SDPatternOperator InVecMaskNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, InVecNode, OpNode,
+ InVecMaskNode, InVecMaskNode, MaskNode, sched,
+ avx512vl_i64_info, v8i16x_info, v8i16x_info,
+ v8i16x_info, i32mem, i64mem, i128mem, StoreNode,
+ MaskedStoreNode>, EVEX_CD8<16, CD8VQ>;
+}
+
+multiclass avx512_trunc_qd<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDPatternOperator MaskNode,
+ X86FoldableSchedWrite sched, PatFrag StoreNode,
+ PatFrag MaskedStoreNode, SDNode InVecNode,
+ SDPatternOperator InVecMaskNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, OpNode, OpNode,
+ InVecMaskNode, MaskNode, MaskNode, sched,
+ avx512vl_i64_info, v4i32x_info, v4i32x_info,
+ v8i32x_info, i64mem, i128mem, i256mem, StoreNode,
+ MaskedStoreNode>, EVEX_CD8<32, CD8VH>;
+}
+
+multiclass avx512_trunc_db<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDPatternOperator MaskNode,
+ X86FoldableSchedWrite sched, PatFrag StoreNode,
+ PatFrag MaskedStoreNode, SDNode InVecNode,
+ SDPatternOperator InVecMaskNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, InVecNode, OpNode,
+ InVecMaskNode, InVecMaskNode, MaskNode, sched,
+ avx512vl_i32_info, v16i8x_info, v16i8x_info,
+ v16i8x_info, i32mem, i64mem, i128mem, StoreNode,
+ MaskedStoreNode>, EVEX_CD8<8, CD8VQ>;
+}
+
+multiclass avx512_trunc_dw<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDPatternOperator MaskNode,
+ X86FoldableSchedWrite sched, PatFrag StoreNode,
+ PatFrag MaskedStoreNode, SDNode InVecNode,
+ SDPatternOperator InVecMaskNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, OpNode, OpNode,
+ InVecMaskNode, MaskNode, MaskNode, sched,
+ avx512vl_i32_info, v8i16x_info, v8i16x_info,
+ v16i16x_info, i64mem, i128mem, i256mem, StoreNode,
+ MaskedStoreNode>, EVEX_CD8<16, CD8VH>;
+}
+
+multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDPatternOperator MaskNode,
+ X86FoldableSchedWrite sched, PatFrag StoreNode,
+ PatFrag MaskedStoreNode, SDNode InVecNode,
+ SDPatternOperator InVecMaskNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, OpNode, OpNode,
+ InVecMaskNode, MaskNode, MaskNode, sched,
+ avx512vl_i16_info, v16i8x_info, v16i8x_info,
+ v32i8x_info, i64mem, i128mem, i256mem, StoreNode,
+ MaskedStoreNode, HasBWI>, EVEX_CD8<16, CD8VH>;
+}
+
+defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", trunc, select_trunc,
+ WriteShuffle256, truncstorevi8,
+ masked_truncstorevi8, X86vtrunc, X86vmtrunc>;
+defm VPMOVSQB : avx512_trunc_qb<0x22, "vpmovsqb", X86vtruncs, select_truncs,
+ WriteShuffle256, truncstore_s_vi8,
+ masked_truncstore_s_vi8, X86vtruncs,
+ X86vmtruncs>;
+defm VPMOVUSQB : avx512_trunc_qb<0x12, "vpmovusqb", X86vtruncus,
+ select_truncus, WriteShuffle256,
+ truncstore_us_vi8, masked_truncstore_us_vi8,
+ X86vtruncus, X86vmtruncus>;
+
+defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", trunc, select_trunc,
+ WriteShuffle256, truncstorevi16,
+ masked_truncstorevi16, X86vtrunc, X86vmtrunc>;
+defm VPMOVSQW : avx512_trunc_qw<0x24, "vpmovsqw", X86vtruncs, select_truncs,
+ WriteShuffle256, truncstore_s_vi16,
+ masked_truncstore_s_vi16, X86vtruncs,
+ X86vmtruncs>;
+defm VPMOVUSQW : avx512_trunc_qw<0x14, "vpmovusqw", X86vtruncus,
+ select_truncus, WriteShuffle256,
+ truncstore_us_vi16, masked_truncstore_us_vi16,
+ X86vtruncus, X86vmtruncus>;
+
+defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", trunc, select_trunc,
+ WriteShuffle256, truncstorevi32,
+ masked_truncstorevi32, X86vtrunc, X86vmtrunc>;
+defm VPMOVSQD : avx512_trunc_qd<0x25, "vpmovsqd", X86vtruncs, select_truncs,
+ WriteShuffle256, truncstore_s_vi32,
+ masked_truncstore_s_vi32, X86vtruncs,
+ X86vmtruncs>;
+defm VPMOVUSQD : avx512_trunc_qd<0x15, "vpmovusqd", X86vtruncus,
+ select_truncus, WriteShuffle256,
+ truncstore_us_vi32, masked_truncstore_us_vi32,
+ X86vtruncus, X86vmtruncus>;
+
+defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", trunc, select_trunc,
+ WriteShuffle256, truncstorevi8,
+ masked_truncstorevi8, X86vtrunc, X86vmtrunc>;
+defm VPMOVSDB : avx512_trunc_db<0x21, "vpmovsdb", X86vtruncs, select_truncs,
+ WriteShuffle256, truncstore_s_vi8,
+ masked_truncstore_s_vi8, X86vtruncs,
+ X86vmtruncs>;
+defm VPMOVUSDB : avx512_trunc_db<0x11, "vpmovusdb", X86vtruncus,
+ select_truncus, WriteShuffle256,
+ truncstore_us_vi8, masked_truncstore_us_vi8,
+ X86vtruncus, X86vmtruncus>;
+
+defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", trunc, select_trunc,
+ WriteShuffle256, truncstorevi16,
+ masked_truncstorevi16, X86vtrunc, X86vmtrunc>;
+defm VPMOVSDW : avx512_trunc_dw<0x23, "vpmovsdw", X86vtruncs, select_truncs,
+ WriteShuffle256, truncstore_s_vi16,
+ masked_truncstore_s_vi16, X86vtruncs,
+ X86vmtruncs>;
+defm VPMOVUSDW : avx512_trunc_dw<0x13, "vpmovusdw", X86vtruncus,
+ select_truncus, WriteShuffle256,
+ truncstore_us_vi16, masked_truncstore_us_vi16,
+ X86vtruncus, X86vmtruncus>;
+
+defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", trunc, select_trunc,
+ WriteShuffle256, truncstorevi8,
+ masked_truncstorevi8, X86vtrunc,
+ X86vmtrunc>;
+defm VPMOVSWB : avx512_trunc_wb<0x20, "vpmovswb", X86vtruncs, select_truncs,
+ WriteShuffle256, truncstore_s_vi8,
+ masked_truncstore_s_vi8, X86vtruncs,
+ X86vmtruncs>;
+defm VPMOVUSWB : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus,
+ select_truncus, WriteShuffle256,
+ truncstore_us_vi8, masked_truncstore_us_vi8,
+ X86vtruncus, X86vmtruncus>;
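+
+// Illustrative examples of what the defms above provide (AT&T syntax):
+//   vpmovqb   %zmm0, %xmm1 {%k1}      (truncate 8 x i64 to 8 x i8)
+//   vpmovusdb %zmm0, (%rdi) {%k1}     (unsigned-saturating truncate and store)
+// The VPMOVS*/VPMOVUS* variants saturate (signed/unsigned) rather than simply
+// dropping the high bits.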
+
+let Predicates = [HasAVX512, NoVLX] in {
+def: Pat<(v8i16 (trunc (v8i32 VR256X:$src))),
+ (v8i16 (EXTRACT_SUBREG
+ (v16i16 (VPMOVDWZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src, sub_ymm)))), sub_xmm))>;
+def: Pat<(v4i32 (trunc (v4i64 VR256X:$src))),
+ (v4i32 (EXTRACT_SUBREG
+ (v8i32 (VPMOVQDZrr (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src, sub_ymm)))), sub_xmm))>;
+}
+
+let Predicates = [HasBWI, NoVLX] in {
+def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))),
+ (v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src, sub_ymm))), sub_xmm))>;
+}
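+
+// In the two NoVLX blocks above, the 256-bit source is widened with
+// INSERT_SUBREG into an IMPLICIT_DEF 512-bit value so the Z-form instruction
+// can be reused, and the low half of the result is taken back out with
+// EXTRACT_SUBREG.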
+
+// Without BWI we can't use vXi16/vXi8 vselect, so we have to use vmtrunc nodes.
+multiclass mtrunc_lowering<string InstrName, SDNode OpNode,
+ X86VectorVTInfo DestInfo,
+ X86VectorVTInfo SrcInfo> {
+ def : Pat<(DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src),
+ DestInfo.RC:$src0,
+ SrcInfo.KRCWM:$mask)),
+ (!cast<Instruction>(InstrName#"rrk") DestInfo.RC:$src0,
+ SrcInfo.KRCWM:$mask,
+ SrcInfo.RC:$src)>;
+
+ def : Pat<(DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src),
+ DestInfo.ImmAllZerosV,
+ SrcInfo.KRCWM:$mask)),
+ (!cast<Instruction>(InstrName#"rrkz") SrcInfo.KRCWM:$mask,
+ SrcInfo.RC:$src)>;
+}
+
+let Predicates = [HasVLX] in {
+defm : mtrunc_lowering<"VPMOVDWZ256", X86vmtrunc, v8i16x_info, v8i32x_info>;
+defm : mtrunc_lowering<"VPMOVSDWZ256", X86vmtruncs, v8i16x_info, v8i32x_info>;
+defm : mtrunc_lowering<"VPMOVUSDWZ256", X86vmtruncus, v8i16x_info, v8i32x_info>;
+}
+
+let Predicates = [HasAVX512] in {
+defm : mtrunc_lowering<"VPMOVDWZ", X86vmtrunc, v16i16x_info, v16i32_info>;
+defm : mtrunc_lowering<"VPMOVSDWZ", X86vmtruncs, v16i16x_info, v16i32_info>;
+defm : mtrunc_lowering<"VPMOVUSDWZ", X86vmtruncus, v16i16x_info, v16i32_info>;
+
+defm : mtrunc_lowering<"VPMOVDBZ", X86vmtrunc, v16i8x_info, v16i32_info>;
+defm : mtrunc_lowering<"VPMOVSDBZ", X86vmtruncs, v16i8x_info, v16i32_info>;
+defm : mtrunc_lowering<"VPMOVUSDBZ", X86vmtruncus, v16i8x_info, v16i32_info>;
+
+defm : mtrunc_lowering<"VPMOVQWZ", X86vmtrunc, v8i16x_info, v8i64_info>;
+defm : mtrunc_lowering<"VPMOVSQWZ", X86vmtruncs, v8i16x_info, v8i64_info>;
+defm : mtrunc_lowering<"VPMOVUSQWZ", X86vmtruncus, v8i16x_info, v8i64_info>;
+}
+
+multiclass WriteShuffle256_common<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched,
+ X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo,
+ X86MemOperand x86memop, PatFrag LdFrag, SDNode OpNode>{
+ let ExeDomain = DestInfo.ExeDomain in {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.RC:$src), OpcodeStr ,"$src", "$src",
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src)))>,
+ EVEX, Sched<[sched]>;
+
+ defm rm : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
+ (ins x86memop:$src), OpcodeStr ,"$src", "$src",
+ (DestInfo.VT (LdFrag addr:$src))>,
+ EVEX, Sched<[sched.Folded]>;
+ }
+}
+
+multiclass WriteShuffle256_BW<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode InVecNode, string ExtTy,
+ X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
+ let Predicates = [HasVLX, HasBWI] in {
+ defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v8i16x_info,
+ v16i8x_info, i64mem, LdFrag, InVecNode>,
+ EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128, VEX_WIG;
+
+ defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v16i16x_info,
+ v16i8x_info, i128mem, LdFrag, OpNode>,
+ EVEX_CD8<8, CD8VH>, T8PD, EVEX_V256, VEX_WIG;
+ }
+ let Predicates = [HasBWI] in {
+ defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v32i16_info,
+ v32i8x_info, i256mem, LdFrag, OpNode>,
+ EVEX_CD8<8, CD8VH>, T8PD, EVEX_V512, VEX_WIG;
+ }
+}
+
+multiclass WriteShuffle256_BD<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode InVecNode, string ExtTy,
+ X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v4i32x_info,
+ v16i8x_info, i32mem, LdFrag, InVecNode>,
+ EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128, VEX_WIG;
+
+ defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v8i32x_info,
+ v16i8x_info, i64mem, LdFrag, InVecNode>,
+ EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V256, VEX_WIG;
+ }
+ let Predicates = [HasAVX512] in {
+ defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v16i32_info,
+ v16i8x_info, i128mem, LdFrag, OpNode>,
+ EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V512, VEX_WIG;
+ }
+}
+
+multiclass WriteShuffle256_BQ<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode InVecNode, string ExtTy,
+ X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v2i64x_info,
+ v16i8x_info, i16mem, LdFrag, InVecNode>,
+ EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128, VEX_WIG;
+
+ defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v4i64x_info,
+ v16i8x_info, i32mem, LdFrag, InVecNode>,
+ EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256, VEX_WIG;
+ }
+ let Predicates = [HasAVX512] in {
+ defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v8i64_info,
+ v16i8x_info, i64mem, LdFrag, InVecNode>,
+ EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512, VEX_WIG;
+ }
+}
+
+multiclass WriteShuffle256_WD<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode InVecNode, string ExtTy,
+ X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v4i32x_info,
+ v8i16x_info, i64mem, LdFrag, InVecNode>,
+ EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128, VEX_WIG;
+
+ defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v8i32x_info,
+ v8i16x_info, i128mem, LdFrag, OpNode>,
+ EVEX_CD8<16, CD8VH>, T8PD, EVEX_V256, VEX_WIG;
+ }
+ let Predicates = [HasAVX512] in {
+ defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v16i32_info,
+ v16i16x_info, i256mem, LdFrag, OpNode>,
+ EVEX_CD8<16, CD8VH>, T8PD, EVEX_V512, VEX_WIG;
+ }
+}
+
+multiclass WriteShuffle256_WQ<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode InVecNode, string ExtTy,
+ X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v2i64x_info,
+ v8i16x_info, i32mem, LdFrag, InVecNode>,
+ EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128, VEX_WIG;
+
+ defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v4i64x_info,
+ v8i16x_info, i64mem, LdFrag, InVecNode>,
+ EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256, VEX_WIG;
+ }
+ let Predicates = [HasAVX512] in {
+ defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v8i64_info,
+ v8i16x_info, i128mem, LdFrag, OpNode>,
+ EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V512, VEX_WIG;
+ }
+}
+
+multiclass WriteShuffle256_DQ<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode InVecNode, string ExtTy,
+ X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> {
+
+ let Predicates = [HasVLX, HasAVX512] in {
+ defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v2i64x_info,
+ v4i32x_info, i64mem, LdFrag, InVecNode>,
+ EVEX_CD8<32, CD8VH>, T8PD, EVEX_V128;
+
+ defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v4i64x_info,
+ v4i32x_info, i128mem, LdFrag, OpNode>,
+ EVEX_CD8<32, CD8VH>, T8PD, EVEX_V256;
+ }
+ let Predicates = [HasAVX512] in {
+ defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v8i64_info,
+ v8i32x_info, i256mem, LdFrag, OpNode>,
+ EVEX_CD8<32, CD8VH>, T8PD, EVEX_V512;
+ }
+}
+
+defm VPMOVZXBW : WriteShuffle256_BW<0x30, "vpmovzxbw", zext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXBD : WriteShuffle256_BD<0x31, "vpmovzxbd", zext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXBQ : WriteShuffle256_BQ<0x32, "vpmovzxbq", zext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXWD : WriteShuffle256_WD<0x33, "vpmovzxwd", zext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXWQ : WriteShuffle256_WQ<0x34, "vpmovzxwq", zext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXDQ : WriteShuffle256_DQ<0x35, "vpmovzxdq", zext, zext_invec, "z", WriteShuffle256>;
+
+defm VPMOVSXBW: WriteShuffle256_BW<0x20, "vpmovsxbw", sext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXBD: WriteShuffle256_BD<0x21, "vpmovsxbd", sext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXBQ: WriteShuffle256_BQ<0x22, "vpmovsxbq", sext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXWD: WriteShuffle256_WD<0x23, "vpmovsxwd", sext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXWQ: WriteShuffle256_WQ<0x24, "vpmovsxwq", sext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXDQ: WriteShuffle256_DQ<0x25, "vpmovsxdq", sext, sext_invec, "s", WriteShuffle256>;
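+
+// Illustrative example (AT&T syntax) of the extension forms defined above:
+//   vpmovzxbw %xmm0, %ymm1 {%k1} {z}    (zero-extend 16 bytes to 16 words)
+// The zext_invec/sext_invec nodes are used wherever only the low portion of
+// the source register is extended; plain zext/sext is used when the whole
+// source register is consumed.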
+
+
+// Patterns for which we also need any-extend versions. aext_vector_inreg
+// is currently legalized to zext_vector_inreg.
+multiclass AVX512_pmovx_patterns_base<string OpcPrefix, SDNode ExtOp> {
+ // 256-bit patterns
+ let Predicates = [HasVLX, HasBWI] in {
+ def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
+ (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
+ }
+
+ let Predicates = [HasVLX] in {
+ def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
+ (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
+
+ def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
+ (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
+ }
+
+ // 512-bit patterns
+ let Predicates = [HasBWI] in {
+ def : Pat<(v32i16 (ExtOp (loadv32i8 addr:$src))),
+ (!cast<I>(OpcPrefix#BWZrm) addr:$src)>;
+ }
+ let Predicates = [HasAVX512] in {
+ def : Pat<(v16i32 (ExtOp (loadv16i8 addr:$src))),
+ (!cast<I>(OpcPrefix#BDZrm) addr:$src)>;
+ def : Pat<(v16i32 (ExtOp (loadv16i16 addr:$src))),
+ (!cast<I>(OpcPrefix#WDZrm) addr:$src)>;
+
+ def : Pat<(v8i64 (ExtOp (loadv8i16 addr:$src))),
+ (!cast<I>(OpcPrefix#WQZrm) addr:$src)>;
+
+ def : Pat<(v8i64 (ExtOp (loadv8i32 addr:$src))),
+ (!cast<I>(OpcPrefix#DQZrm) addr:$src)>;
+ }
+}
+
+multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
+ SDNode InVecOp> :
+ AVX512_pmovx_patterns_base<OpcPrefix, ExtOp> {
+ // 128-bit patterns
+ let Predicates = [HasVLX, HasBWI] in {
+ def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
+ def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
+ def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
+ (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
+ }
+ let Predicates = [HasVLX] in {
+ def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
+ def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
+ (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
+
+ def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
+
+ def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
+ def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
+ def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
+ (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
+
+ def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
+ def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
+ (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
+
+ def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
+ def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
+ def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
+ (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
+ }
+ let Predicates = [HasVLX] in {
+ def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
+ def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
+ def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
+ (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
+
+ def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
+ def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
+ (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
+
+ def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
+ def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
+ def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
+ (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
+ }
+ // 512-bit patterns
+ let Predicates = [HasAVX512] in {
+ def : Pat<(v8i64 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
+ def : Pat<(v8i64 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
+ def : Pat<(v8i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
+ (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
+ }
+}
+
+defm : AVX512_pmovx_patterns<"VPMOVSX", sext, sext_invec>;
+defm : AVX512_pmovx_patterns<"VPMOVZX", zext, zext_invec>;
+
+// Without BWI we can't do a trunc from v16i16 to v16i8. DAG combine can merge
+// ext+trunc aggressively, making it impossible to legalize the DAG to this
+// pattern directly.
+let Predicates = [HasAVX512, NoBWI] in {
+def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))),
+ (VPMOVDBZrr (v16i32 (VPMOVZXWDZrr VR256X:$src)))>;
+def: Pat<(v16i8 (trunc (loadv16i16 addr:$src))),
+ (VPMOVDBZrr (v16i32 (VPMOVZXWDZrm addr:$src)))>;
+}
+
+//===----------------------------------------------------------------------===//
+// GATHER - SCATTER Operations
+
+// FIXME: Improve scheduling of gather/scatter instructions.
+multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86MemOperand memop, RegisterClass MaskRC = _.KRCWM> {
+ let Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb",
+ ExeDomain = _.ExeDomain, mayLoad = 1, hasSideEffects = 0 in
+ def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst, MaskRC:$mask_wb),
+ (ins _.RC:$src1, MaskRC:$mask, memop:$src2),
+ !strconcat(OpcodeStr#_.Suffix,
+ "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+ []>, EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteLoad]>;
+}
+
+multiclass avx512_gather_q_pd<bits<8> dopc, bits<8> qopc,
+ AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
+ defm NAME#D#SUFF#Z: avx512_gather<dopc, OpcodeStr#"d", _.info512,
+ vy512xmem>, EVEX_V512, VEX_W;
+ defm NAME#Q#SUFF#Z: avx512_gather<qopc, OpcodeStr#"q", _.info512,
+ vz512mem>, EVEX_V512, VEX_W;
+let Predicates = [HasVLX] in {
+ defm NAME#D#SUFF#Z256: avx512_gather<dopc, OpcodeStr#"d", _.info256,
+ vx256xmem>, EVEX_V256, VEX_W;
+ defm NAME#Q#SUFF#Z256: avx512_gather<qopc, OpcodeStr#"q", _.info256,
+ vy256xmem>, EVEX_V256, VEX_W;
+ defm NAME#D#SUFF#Z128: avx512_gather<dopc, OpcodeStr#"d", _.info128,
+ vx128xmem>, EVEX_V128, VEX_W;
+ defm NAME#Q#SUFF#Z128: avx512_gather<qopc, OpcodeStr#"q", _.info128,
+ vx128xmem>, EVEX_V128, VEX_W;
+}
+}
+
+multiclass avx512_gather_d_ps<bits<8> dopc, bits<8> qopc,
+ AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
+ defm NAME#D#SUFF#Z: avx512_gather<dopc, OpcodeStr#"d", _.info512, vz512mem>,
+ EVEX_V512;
+ defm NAME#Q#SUFF#Z: avx512_gather<qopc, OpcodeStr#"q", _.info256, vz256mem>,
+ EVEX_V512;
+let Predicates = [HasVLX] in {
+ defm NAME#D#SUFF#Z256: avx512_gather<dopc, OpcodeStr#"d", _.info256,
+ vy256xmem>, EVEX_V256;
+ defm NAME#Q#SUFF#Z256: avx512_gather<qopc, OpcodeStr#"q", _.info128,
+ vy128xmem>, EVEX_V256;
+ defm NAME#D#SUFF#Z128: avx512_gather<dopc, OpcodeStr#"d", _.info128,
+ vx128xmem>, EVEX_V128;
+ defm NAME#Q#SUFF#Z128: avx512_gather<qopc, OpcodeStr#"q", _.info128,
+ vx64xmem, VK2WM>, EVEX_V128;
+}
+}
+
+
+defm VGATHER : avx512_gather_q_pd<0x92, 0x93, avx512vl_f64_info, "vgather", "PD">,
+ avx512_gather_d_ps<0x92, 0x93, avx512vl_f32_info, "vgather", "PS">;
+
+defm VPGATHER : avx512_gather_q_pd<0x90, 0x91, avx512vl_i64_info, "vpgather", "Q">,
+ avx512_gather_d_ps<0x90, 0x91, avx512vl_i32_info, "vpgather", "D">;
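+
+// Illustrative gather use (AT&T syntax):
+//   vgatherdps (%rax,%zmm2,4), %zmm1 {%k1}
+// The mask is both an input and an output ($mask = $mask_wb): bits are cleared
+// as elements complete. The destination is tied to $src1 and marked
+// @earlyclobber so register allocation keeps it distinct from the index.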
+
+multiclass avx512_scatter<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86MemOperand memop, RegisterClass MaskRC = _.KRCWM> {
+
+let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain,
+ hasSideEffects = 0 in
+
+ def mr : AVX5128I<opc, MRMDestMem, (outs MaskRC:$mask_wb),
+ (ins memop:$dst, MaskRC:$mask, _.RC:$src),
+ !strconcat(OpcodeStr#_.Suffix,
+ "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
+ []>, EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>,
+ Sched<[WriteStore]>;
+}
+
+multiclass avx512_scatter_q_pd<bits<8> dopc, bits<8> qopc,
+ AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
+ defm NAME#D#SUFF#Z: avx512_scatter<dopc, OpcodeStr#"d", _.info512,
+ vy512xmem>, EVEX_V512, VEX_W;
+ defm NAME#Q#SUFF#Z: avx512_scatter<qopc, OpcodeStr#"q", _.info512,
+ vz512mem>, EVEX_V512, VEX_W;
+let Predicates = [HasVLX] in {
+ defm NAME#D#SUFF#Z256: avx512_scatter<dopc, OpcodeStr#"d", _.info256,
+ vx256xmem>, EVEX_V256, VEX_W;
+ defm NAME#Q#SUFF#Z256: avx512_scatter<qopc, OpcodeStr#"q", _.info256,
+ vy256xmem>, EVEX_V256, VEX_W;
+ defm NAME#D#SUFF#Z128: avx512_scatter<dopc, OpcodeStr#"d", _.info128,
+ vx128xmem>, EVEX_V128, VEX_W;
+ defm NAME#Q#SUFF#Z128: avx512_scatter<qopc, OpcodeStr#"q", _.info128,
+ vx128xmem>, EVEX_V128, VEX_W;
+}
+}
+
+multiclass avx512_scatter_d_ps<bits<8> dopc, bits<8> qopc,
+ AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
+ defm NAME#D#SUFF#Z: avx512_scatter<dopc, OpcodeStr#"d", _.info512, vz512mem>,
+ EVEX_V512;
+ defm NAME#Q#SUFF#Z: avx512_scatter<qopc, OpcodeStr#"q", _.info256, vz256mem>,
+ EVEX_V512;
+let Predicates = [HasVLX] in {
+ defm NAME#D#SUFF#Z256: avx512_scatter<dopc, OpcodeStr#"d", _.info256,
+ vy256xmem>, EVEX_V256;
+ defm NAME#Q#SUFF#Z256: avx512_scatter<qopc, OpcodeStr#"q", _.info128,
+ vy128xmem>, EVEX_V256;
+ defm NAME#D#SUFF#Z128: avx512_scatter<dopc, OpcodeStr#"d", _.info128,
+ vx128xmem>, EVEX_V128;
+ defm NAME#Q#SUFF#Z128: avx512_scatter<qopc, OpcodeStr#"q", _.info128,
+ vx64xmem, VK2WM>, EVEX_V128;
+}
+}
+
+defm VSCATTER : avx512_scatter_q_pd<0xA2, 0xA3, avx512vl_f64_info, "vscatter", "PD">,
+ avx512_scatter_d_ps<0xA2, 0xA3, avx512vl_f32_info, "vscatter", "PS">;
+
+defm VPSCATTER : avx512_scatter_q_pd<0xA0, 0xA1, avx512vl_i64_info, "vpscatter", "Q">,
+ avx512_scatter_d_ps<0xA0, 0xA1, avx512vl_i32_info, "vpscatter", "D">;
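+
+// Illustrative scatter use (AT&T syntax):
+//   vscatterdps %zmm1, (%rax,%zmm2,4) {%k1}
+// As with gather, the mask doubles as a completion mask and is written back
+// through $mask_wb; there is no vector destination.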
+
+// prefetch
+multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr,
+ RegisterClass KRC, X86MemOperand memop> {
+ let Predicates = [HasPFI], mayLoad = 1, mayStore = 1 in
+ def m : AVX5128I<opc, F, (outs), (ins KRC:$mask, memop:$src),
+ !strconcat(OpcodeStr, "\t{$src {${mask}}|{${mask}}, $src}"), []>,
+ EVEX, EVEX_K, Sched<[WriteLoad]>;
+}
+
+defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps",
+ VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps",
+ VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd",
+ VK8WM, vy512xmem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd",
+ VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps",
+ VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps",
+ VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd",
+ VK8WM, vy512xmem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qpd",
+ VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps",
+ VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps",
+ VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd",
+ VK8WM, vy512xmem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qpd",
+ VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps",
+ VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps",
+ VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd",
+ VK8WM, vy512xmem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd",
+ VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
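+
+// These AVX-512 PF prefetch forms take only a write-mask and a vector memory
+// operand and produce no result, e.g. (illustrative):
+//   vgatherpf0dps (%rax,%zmm1,4) {%k1}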
+
+multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > {
+def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src),
+ !strconcat(OpcodeStr#Vec.Suffix, "\t{$src, $dst|$dst, $src}"),
+ [(set Vec.RC:$dst, (Vec.VT (sext Vec.KRC:$src)))]>,
+ EVEX, Sched<[WriteMove]>; // TODO - WriteVecTrunc?
+}
+
+multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo,
+ string OpcodeStr, Predicate prd> {
+let Predicates = [prd] in
+ defm Z : cvt_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256;
+ defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128;
+ }
+}
+
+defm VPMOVM2B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info, "vpmovm2" , HasBWI>;
+defm VPMOVM2W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, "vpmovm2", HasBWI> , VEX_W;
+defm VPMOVM2D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, "vpmovm2", HasDQI>;
+defm VPMOVM2Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, "vpmovm2", HasDQI> , VEX_W;
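+
+// The vpmovm2* forms above sign-extend each mask bit, so every destination
+// element becomes all-ones or all-zeros, e.g. (illustrative):
+//   vpmovm2b %k1, %zmm0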
+
+multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > {
+ def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set _.KRC:$dst, (X86pcmpgtm _.ImmAllZerosV, (_.VT _.RC:$src)))]>,
+ EVEX, Sched<[WriteMove]>;
+}
+
+// Use the 512-bit version to implement the 128/256-bit variants when VLX is not available.
+multiclass convert_vector_to_mask_lowering<X86VectorVTInfo ExtendInfo,
+ X86VectorVTInfo _,
+ string Name> {
+
+ def : Pat<(_.KVT (X86pcmpgtm _.ImmAllZerosV, (_.VT _.RC:$src))),
+ (_.KVT (COPY_TO_REGCLASS
+ (!cast<Instruction>(Name#"Zrr")
+ (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+ _.RC:$src, _.SubRegIdx)),
+ _.KRC))>;
+}
+
+multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : convert_vector_to_mask_common <opc, VTInfo.info512, OpcodeStr>,
+ EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : convert_vector_to_mask_common<opc, VTInfo.info256, OpcodeStr>,
+ EVEX_V256;
+ defm Z128 : convert_vector_to_mask_common<opc, VTInfo.info128, OpcodeStr>,
+ EVEX_V128;
+ }
+ let Predicates = [prd, NoVLX] in {
+ defm Z256_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info256, NAME>;
+ defm Z128_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info128, NAME>;
+ }
+}
+
+defm VPMOVB2M : avx512_convert_vector_to_mask<0x29, "vpmovb2m",
+ avx512vl_i8_info, HasBWI>;
+defm VPMOVW2M : avx512_convert_vector_to_mask<0x29, "vpmovw2m",
+ avx512vl_i16_info, HasBWI>, VEX_W;
+defm VPMOVD2M : avx512_convert_vector_to_mask<0x39, "vpmovd2m",
+ avx512vl_i32_info, HasDQI>;
+defm VPMOVQ2M : avx512_convert_vector_to_mask<0x39, "vpmovq2m",
+ avx512vl_i64_info, HasDQI>, VEX_W;
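+
+// Conversely, vpmov*2m collects the sign bit of each element into a mask
+// register; the pattern above expresses this as (X86pcmpgtm 0, $src), i.e.
+// "$src is negative". Illustrative use: vpmovd2m %zmm0, %k1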
+
+// Patterns for handling sext from a mask register to v16i8/v16i16 when DQI
+// is available, but BWI is not. We can't handle this in lowering because
+// a target independent DAG combine likes to combine sext and trunc.
+let Predicates = [HasDQI, NoBWI] in {
+ def : Pat<(v16i8 (sext (v16i1 VK16:$src))),
+ (VPMOVDBZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>;
+ def : Pat<(v16i16 (sext (v16i1 VK16:$src))),
+ (VPMOVDWZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>;
+}
+
+let Predicates = [HasDQI, NoBWI, HasVLX] in {
+ def : Pat<(v8i16 (sext (v8i1 VK8:$src))),
+ (VPMOVDWZ256rr (v8i32 (VPMOVM2DZ256rr VK8:$src)))>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - COMPRESS and EXPAND
+//
+
+multiclass compress_by_vec_width_common<bits<8> opc, X86VectorVTInfo _,
+ string OpcodeStr, X86FoldableSchedWrite sched> {
+ defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
+ (null_frag)>, AVX5128IBase,
+ Sched<[sched]>;
+
+ let mayStore = 1, hasSideEffects = 0 in
+ def mr : AVX5128I<opc, MRMDestMem, (outs),
+ (ins _.MemOp:$dst, _.RC:$src),
+ OpcodeStr # "\t{$src, $dst|$dst, $src}",
+ []>, EVEX_CD8<_.EltSize, CD8VT1>,
+ Sched<[sched.Folded]>;
+
+ def mrk : AVX5128I<opc, MRMDestMem, (outs),
+ (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
+ OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ []>,
+ EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>,
+ Sched<[sched.Folded]>;
+}
+
+multiclass compress_by_vec_width_lowering<X86VectorVTInfo _, string Name> {
+ def : Pat<(X86mCompressingStore (_.VT _.RC:$src), addr:$dst, _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix#mrk)
+ addr:$dst, _.KRCWM:$mask, _.RC:$src)>;
+
+ def : Pat<(X86compress (_.VT _.RC:$src), _.RC:$src0, _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix#rrk)
+ _.RC:$src0, _.KRCWM:$mask, _.RC:$src)>;
+ def : Pat<(X86compress (_.VT _.RC:$src), _.ImmAllZerosV, _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix#rrkz)
+ _.KRCWM:$mask, _.RC:$src)>;
+}
+
+multiclass compress_by_elt_width<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ AVX512VLVectorVTInfo VTInfo,
+ Predicate Pred = HasAVX512> {
+ let Predicates = [Pred] in
+ defm Z : compress_by_vec_width_common<opc, VTInfo.info512, OpcodeStr, sched>,
+ compress_by_vec_width_lowering<VTInfo.info512, NAME>, EVEX_V512;
+
+ let Predicates = [Pred, HasVLX] in {
+ defm Z256 : compress_by_vec_width_common<opc, VTInfo.info256, OpcodeStr, sched>,
+ compress_by_vec_width_lowering<VTInfo.info256, NAME>, EVEX_V256;
+ defm Z128 : compress_by_vec_width_common<opc, VTInfo.info128, OpcodeStr, sched>,
+ compress_by_vec_width_lowering<VTInfo.info128, NAME>, EVEX_V128;
+ }
+}
+
+// FIXME: Is there a better scheduler class for VPCOMPRESS?
+defm VPCOMPRESSD : compress_by_elt_width <0x8B, "vpcompressd", WriteVarShuffle256,
+ avx512vl_i32_info>, EVEX, NotMemoryFoldable;
+defm VPCOMPRESSQ : compress_by_elt_width <0x8B, "vpcompressq", WriteVarShuffle256,
+ avx512vl_i64_info>, EVEX, VEX_W, NotMemoryFoldable;
+defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", WriteVarShuffle256,
+ avx512vl_f32_info>, EVEX, NotMemoryFoldable;
+defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", WriteVarShuffle256,
+ avx512vl_f64_info>, EVEX, VEX_W, NotMemoryFoldable;
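+
+// Illustrative compress use (AT&T syntax):
+//   vpcompressd %zmm0, %zmm1 {%k1}
+// Mask-selected elements are packed contiguously into the low part of the
+// destination; the mrk store form writes only the selected elements.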
+
+// expand
+multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _,
+ string OpcodeStr, X86FoldableSchedWrite sched> {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
+ (null_frag)>, AVX5128IBase,
+ Sched<[sched]>;
+
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1",
+ (null_frag)>,
+ AVX5128IBase, EVEX_CD8<_.EltSize, CD8VT1>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+multiclass expand_by_vec_width_lowering<X86VectorVTInfo _, string Name> {
+
+ def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask, undef)),
+ (!cast<Instruction>(Name#_.ZSuffix#rmkz)
+ _.KRCWM:$mask, addr:$src)>;
+
+ def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask, _.ImmAllZerosV)),
+ (!cast<Instruction>(Name#_.ZSuffix#rmkz)
+ _.KRCWM:$mask, addr:$src)>;
+
+ def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask,
+ (_.VT _.RC:$src0))),
+ (!cast<Instruction>(Name#_.ZSuffix#rmk)
+ _.RC:$src0, _.KRCWM:$mask, addr:$src)>;
+
+ def : Pat<(X86expand (_.VT _.RC:$src), _.RC:$src0, _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix#rrk)
+ _.RC:$src0, _.KRCWM:$mask, _.RC:$src)>;
+ def : Pat<(X86expand (_.VT _.RC:$src), _.ImmAllZerosV, _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix#rrkz)
+ _.KRCWM:$mask, _.RC:$src)>;
+}
+
+multiclass expand_by_elt_width<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ AVX512VLVectorVTInfo VTInfo,
+ Predicate Pred = HasAVX512> {
+ let Predicates = [Pred] in
+ defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr, sched>,
+ expand_by_vec_width_lowering<VTInfo.info512, NAME>, EVEX_V512;
+
+ let Predicates = [Pred, HasVLX] in {
+ defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr, sched>,
+ expand_by_vec_width_lowering<VTInfo.info256, NAME>, EVEX_V256;
+ defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr, sched>,
+ expand_by_vec_width_lowering<VTInfo.info128, NAME>, EVEX_V128;
+ }
+}
+
+// FIXME: Is there a better scheduler class for VPEXPAND?
+defm VPEXPANDD : expand_by_elt_width <0x89, "vpexpandd", WriteVarShuffle256,
+ avx512vl_i32_info>, EVEX;
+defm VPEXPANDQ : expand_by_elt_width <0x89, "vpexpandq", WriteVarShuffle256,
+ avx512vl_i64_info>, EVEX, VEX_W;
+defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", WriteVarShuffle256,
+ avx512vl_f32_info>, EVEX;
+defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", WriteVarShuffle256,
+ avx512vl_f64_info>, EVEX, VEX_W;
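+
+// Expand is the inverse of compress, e.g. (illustrative):
+//   vexpandps (%rdi), %zmm1 {%k1} {z}
+// Consecutive source elements are placed into the mask-selected lanes of the
+// destination.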
+
+// Handle instructions of the form reg_vec1 = op(reg_vec, imm)
+//                                            op(mem_vec, imm)
+//                                            op(broadcast(eltVt), imm)
+// All instructions are created with FROUND_CURRENT.
+multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode MaskOpNode,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
+ defm rri : AVX512_maskable_split<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, i32u8imm:$src2),
+ OpcodeStr#_.Suffix, "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1), (i32 timm:$src2)),
+ (MaskOpNode (_.VT _.RC:$src1), (i32 timm:$src2))>,
+ Sched<[sched]>;
+ defm rmi : AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr#_.Suffix, "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 timm:$src2)),
+ (MaskOpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 timm:$src2))>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ defm rmbi : AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src1, i32u8imm:$src2),
+ OpcodeStr#_.Suffix, "$src2, ${src1}"#_.BroadcastStr,
+ "${src1}"#_.BroadcastStr#", $src2",
+ (OpNode (_.VT (_.BroadcastLdFrag addr:$src1)),
+ (i32 timm:$src2)),
+ (MaskOpNode (_.VT (_.BroadcastLdFrag addr:$src1)),
+ (i32 timm:$src2))>, EVEX_B,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+// Handle instructions of the form reg_vec1 = op(reg_vec2, imm), {sae}
+multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, X86FoldableSchedWrite sched,
+ X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
+ defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, i32u8imm:$src2),
+ OpcodeStr#_.Suffix, "$src2, {sae}, $src1",
+ "$src1, {sae}, $src2",
+ (OpNode (_.VT _.RC:$src1),
+ (i32 timm:$src2))>,
+ EVEX_B, Sched<[sched]>;
+}
+
+multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr,
+ AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode,
+ SDNode MaskOpNode, SDNode OpNodeSAE, X86SchedWriteWidths sched,
+ Predicate prd>{
+ let Predicates = [prd] in {
+ defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.ZMM, _.info512>,
+ avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNodeSAE,
+ sched.ZMM, _.info512>, EVEX_V512;
+ }
+ let Predicates = [prd, HasVLX] in {
+ defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.XMM, _.info128>, EVEX_V128;
+ defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.YMM, _.info256>, EVEX_V256;
+ }
+}
+
+// Handle instructions of the form reg_vec1 = op(reg_vec2, reg_vec3, imm)
+//                                            op(reg_vec2, mem_vec, imm)
+//                                            op(reg_vec2, broadcast(eltVt), imm)
+// All instructions are created with FROUND_CURRENT.
+multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _>{
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
+ defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (i32 timm:$src3))>,
+ Sched<[sched]>;
+ defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2))),
+ (i32 timm:$src3))>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3, ${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr#", $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (_.BroadcastLdFrag addr:$src2)),
+ (i32 timm:$src3))>, EVEX_B,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+// Handle instructions of the form reg_vec1 = op(reg_vec2, reg_vec3, imm)
+//                                            op(reg_vec2, mem_vec, imm)
+multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo DestInfo,
+ X86VectorVTInfo SrcInfo>{
+ let ExeDomain = DestInfo.ExeDomain in {
+ defm rri : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.RC:$src1, SrcInfo.RC:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
+ (SrcInfo.VT SrcInfo.RC:$src2),
+ (i8 timm:$src3)))>,
+ Sched<[sched]>;
+ defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
+ (SrcInfo.VT (bitconvert
+ (SrcInfo.LdFrag addr:$src2))),
+ (i8 timm:$src3)))>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+// Handle instructions of the form reg_vec1 = op(reg_vec2, reg_vec3, imm)
+//                                            op(reg_vec2, mem_vec, imm)
+//                                            op(reg_vec2, broadcast(eltVt), imm)
+multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _>:
+ avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, sched, _, _>{
+
+ let ExeDomain = _.ExeDomain in
+ defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, ${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr#", $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (_.BroadcastLdFrag addr:$src2)),
+ (i8 timm:$src3))>, EVEX_B,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+// Handle scalar instructions of the form reg_vec1 = op(reg_vec2, reg_vec3, imm)
+//                                                   op(reg_vec2, mem_scalar, imm)
+multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
+ defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (i32 timm:$src3))>,
+ Sched<[sched]>;
+ defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.ScalarIntMemFrags addr:$src2),
+ (i32 timm:$src3))>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+// Handle instructions of the form reg_vec1 = op(reg_vec2, reg_vec3, imm), {sae}
+multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, X86FoldableSchedWrite sched,
+ X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
+ defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3, {sae}, $src2, $src1",
+ "$src1, $src2, {sae}, $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (i32 timm:$src3))>,
+ EVEX_B, Sched<[sched]>;
+}
+
+// Handle scalar instructions of the form reg_vec1 = op(reg_vec2, reg_vec3, imm), {sae}
+multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
+ defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3, {sae}, $src2, $src1",
+ "$src1, $src2, {sae}, $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (i32 timm:$src3))>,
+ EVEX_B, Sched<[sched]>;
+}
+
+multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr,
+ AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode,
+ SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd>{
+ let Predicates = [prd] in {
+ defm Z : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>,
+ avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNodeSAE, sched.ZMM, _.info512>,
+ EVEX_V512;
+
+ }
+ let Predicates = [prd, HasVLX] in {
+ defm Z128 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, sched.XMM, _.info128>,
+ EVEX_V128;
+ defm Z256 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, sched.YMM, _.info256>,
+ EVEX_V256;
+ }
+}
+
+multiclass avx512_common_3Op_rm_imm8<bits<8> opc, SDNode OpNode, string OpStr,
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo DestInfo,
+ AVX512VLVectorVTInfo SrcInfo, Predicate Pred = HasBWI> {
+ let Predicates = [Pred] in {
+ defm Z : avx512_3Op_rm_imm8<opc, OpStr, OpNode, sched.ZMM, DestInfo.info512,
+ SrcInfo.info512>, EVEX_V512, AVX512AIi8Base, EVEX_4V;
+ }
+ let Predicates = [Pred, HasVLX] in {
+ defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, sched.XMM, DestInfo.info128,
+ SrcInfo.info128>, EVEX_V128, AVX512AIi8Base, EVEX_4V;
+ defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, sched.YMM, DestInfo.info256,
+ SrcInfo.info256>, EVEX_V256, AVX512AIi8Base, EVEX_4V;
+ }
+}
+
+multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _,
+ bits<8> opc, SDNode OpNode, X86SchedWriteWidths sched,
+ Predicate Pred = HasAVX512> {
+ let Predicates = [Pred] in {
+ defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>,
+ EVEX_V512;
+ }
+ let Predicates = [Pred, HasVLX] in {
+ defm Z128 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, sched.XMM, _.info128>,
+ EVEX_V128;
+ defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, sched.YMM, _.info256>,
+ EVEX_V256;
+ }
+}
+
+multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr,
+ X86VectorVTInfo _, bits<8> opc, SDNode OpNode,
+ SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd> {
+ let Predicates = [prd] in {
+ defm Z : avx512_fp_scalar_imm<opc, OpcodeStr, OpNode, sched.XMM, _>,
+ avx512_fp_sae_scalar_imm<opc, OpcodeStr, OpNodeSAE, sched.XMM, _>;
+ }
+}
+
+multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr,
+ bits<8> opcPs, bits<8> opcPd, SDNode OpNode,
+ SDNode MaskOpNode, SDNode OpNodeSAE,
+ X86SchedWriteWidths sched, Predicate prd>{
+ defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info,
+ opcPs, OpNode, MaskOpNode, OpNodeSAE, sched, prd>,
+ EVEX_CD8<32, CD8VF>;
+ defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info,
+ opcPd, OpNode, MaskOpNode, OpNodeSAE, sched, prd>,
+ EVEX_CD8<64, CD8VF>, VEX_W;
+}
+
+defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56,
+ X86VReduce, X86VReduce, X86VReduceSAE,
+ SchedWriteFRnd, HasDQI>, AVX512AIi8Base, EVEX;
+defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09,
+ X86any_VRndScale, X86VRndScale, X86VRndScaleSAE,
+ SchedWriteFRnd, HasAVX512>,
+ AVX512AIi8Base, EVEX;
+defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26,
+ X86VGetMant, X86VGetMant, X86VGetMantSAE,
+ SchedWriteFRnd, HasAVX512>, AVX512AIi8Base, EVEX;
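+
+// e.g. (illustrative) the packed forms above include register, full-vector
+// memory and embedded-broadcast (rmbi, EVEX.b) operands:
+//   vrndscaleps $11, (%rdi){1to16}, %zmm0 {%k1}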
+
+defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info,
+ 0x50, X86VRange, X86VRangeSAE,
+ SchedWriteFAdd, HasDQI>,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VRANGEPS : avx512_common_fp_sae_packed_imm<"vrangeps", avx512vl_f32_info,
+ 0x50, X86VRange, X86VRangeSAE,
+ SchedWriteFAdd, HasDQI>,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+
+defm VRANGESD: avx512_common_fp_sae_scalar_imm<"vrangesd",
+ f64x_info, 0x51, X86Ranges, X86RangesSAE, SchedWriteFAdd, HasDQI>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info,
+ 0x51, X86Ranges, X86RangesSAE, SchedWriteFAdd, HasDQI>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info,
+ 0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasDQI>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info,
+ 0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasDQI>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info,
+ 0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasAVX512>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info,
+ 0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasAVX512>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _,
+ X86VectorVTInfo CastInfo,
+ string EVEX2VEXOvrd> {
+ let ExeDomain = _.ExeDomain in {
+ defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (_.VT (bitconvert
+ (CastInfo.VT (X86Shuf128 _.RC:$src1, _.RC:$src2,
+ (i8 timm:$src3)))))>,
+ Sched<[sched]>, EVEX2VEXOverride<EVEX2VEXOvrd#"rr">;
+ defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (_.VT
+ (bitconvert
+ (CastInfo.VT (X86Shuf128 _.RC:$src1,
+ (CastInfo.LdFrag addr:$src2),
+ (i8 timm:$src3)))))>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>,
+ EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
+ defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, ${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr#", $src3",
+ (_.VT
+ (bitconvert
+ (CastInfo.VT
+ (X86Shuf128 _.RC:$src1,
+ (_.BroadcastLdFrag addr:$src2),
+ (i8 timm:$src3)))))>, EVEX_B,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+multiclass avx512_shuff_packed_128<string OpcodeStr, X86FoldableSchedWrite sched,
+ AVX512VLVectorVTInfo _,
+ AVX512VLVectorVTInfo CastInfo, bits<8> opc,
+ string EVEX2VEXOvrd>{
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_shuff_packed_128_common<opc, OpcodeStr, sched,
+ _.info512, CastInfo.info512, "">, EVEX_V512;
+
+ let Predicates = [HasAVX512, HasVLX] in
+ defm Z256 : avx512_shuff_packed_128_common<opc, OpcodeStr, sched,
+ _.info256, CastInfo.info256,
+ EVEX2VEXOvrd>, EVEX_V256;
+}
+
+defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4", WriteFShuffle256,
+ avx512vl_f32_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2", WriteFShuffle256,
+ avx512vl_f64_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4", WriteFShuffle256,
+ avx512vl_i32_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2", WriteFShuffle256,
+ avx512vl_i64_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
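+
+// Illustrative use: vshuff32x4 $0x4e, %zmm1, %zmm2, %zmm3 selects 128-bit
+// blocks from the two sources. Note the rri/rmi/rmbi patterns bitcast the
+// result through CastInfo.VT (the 64-bit-element type for the 32x4 flavours).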
+
+multiclass avx512_valign<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _>{
+ // NOTE: EVEX2VEXOverride changed back to Unset for 256-bit at the
+ // instantiation of this class.
+ let ExeDomain = _.ExeDomain in {
+ defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (_.VT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 timm:$src3)))>,
+ Sched<[sched]>, EVEX2VEXOverride<"VPALIGNRrri">;
+ defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (_.VT (X86VAlign _.RC:$src1,
+ (bitconvert (_.LdFrag addr:$src2)),
+ (i8 timm:$src3)))>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>,
+ EVEX2VEXOverride<"VPALIGNRrmi">;
+
+ defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, ${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr#", $src3",
+ (X86VAlign _.RC:$src1,
+ (_.VT (_.BroadcastLdFrag addr:$src2)),
+ (i8 timm:$src3))>, EVEX_B,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+multiclass avx512_valign_common<string OpcodeStr, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_valign<0x03, OpcodeStr, sched.ZMM, _.info512>,
+ AVX512AIi8Base, EVEX_4V, EVEX_V512;
+ }
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z128 : avx512_valign<0x03, OpcodeStr, sched.XMM, _.info128>,
+ AVX512AIi8Base, EVEX_4V, EVEX_V128;
+ // We can't really override the 256-bit version so change it back to unset.
+ let EVEX2VEXOverride = ? in
+ defm Z256 : avx512_valign<0x03, OpcodeStr, sched.YMM, _.info256>,
+ AVX512AIi8Base, EVEX_4V, EVEX_V256;
+ }
+}
+
+defm VALIGND: avx512_valign_common<"valignd", SchedWriteShuffle,
+ avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VALIGNQ: avx512_valign_common<"valignq", SchedWriteShuffle,
+ avx512vl_i64_info>, EVEX_CD8<64, CD8VF>,
+ VEX_W;
+
+defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr",
+ SchedWriteShuffle, avx512vl_i8_info,
+ avx512vl_i8_info>, EVEX_CD8<8, CD8VF>;
+
+// Fragments to help convert valignq into masked valignd, or valignq/valignd
+// into vpalignr.
+def ValignqImm32XForm : SDNodeXForm<timm, [{
+ return getI8Imm(N->getZExtValue() * 2, SDLoc(N));
+}]>;
+def ValignqImm8XForm : SDNodeXForm<timm, [{
+ return getI8Imm(N->getZExtValue() * 8, SDLoc(N));
+}]>;
+def ValigndImm8XForm : SDNodeXForm<timm, [{
+ return getI8Imm(N->getZExtValue() * 4, SDLoc(N));
+}]>;
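+
+// The scale factors follow from the element widths: a valignq immediate counts
+// qwords, so the equivalent valignd immediate is imm*2 (dwords) and the
+// equivalent vpalignr immediate is imm*8 (bytes); ValigndImm8XForm likewise
+// turns a dword count into a byte count (imm*4).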
+
+multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo From, X86VectorVTInfo To,
+ SDNodeXForm ImmXForm> {
+ def : Pat<(To.VT (vselect_mask To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1, From.RC:$src2,
+ timm:$src3))),
+ To.RC:$src0)),
+ (!cast<Instruction>(OpcodeStr#"rrik") To.RC:$src0, To.KRCWM:$mask,
+ To.RC:$src1, To.RC:$src2,
+ (ImmXForm timm:$src3))>;
+
+ def : Pat<(To.VT (vselect_mask To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1, From.RC:$src2,
+ timm:$src3))),
+ To.ImmAllZerosV)),
+ (!cast<Instruction>(OpcodeStr#"rrikz") To.KRCWM:$mask,
+ To.RC:$src1, To.RC:$src2,
+ (ImmXForm timm:$src3))>;
+
+ def : Pat<(To.VT (vselect_mask To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (From.LdFrag addr:$src2),
+ timm:$src3))),
+ To.RC:$src0)),
+ (!cast<Instruction>(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask,
+ To.RC:$src1, addr:$src2,
+ (ImmXForm timm:$src3))>;
+
+ def : Pat<(To.VT (vselect_mask To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (From.LdFrag addr:$src2),
+ timm:$src3))),
+ To.ImmAllZerosV)),
+ (!cast<Instruction>(OpcodeStr#"rmikz") To.KRCWM:$mask,
+ To.RC:$src1, addr:$src2,
+ (ImmXForm timm:$src3))>;
+}
+
+multiclass avx512_vpalign_mask_lowering_mb<string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo From,
+ X86VectorVTInfo To,
+ SDNodeXForm ImmXForm> :
+ avx512_vpalign_mask_lowering<OpcodeStr, OpNode, From, To, ImmXForm> {
+ def : Pat<(From.VT (OpNode From.RC:$src1,
+ (bitconvert (To.VT (To.BroadcastLdFrag addr:$src2))),
+ timm:$src3)),
+ (!cast<Instruction>(OpcodeStr#"rmbi") To.RC:$src1, addr:$src2,
+ (ImmXForm timm:$src3))>;
+
+ def : Pat<(To.VT (vselect_mask To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (bitconvert
+ (To.VT (To.BroadcastLdFrag addr:$src2))),
+ timm:$src3))),
+ To.RC:$src0)),
+ (!cast<Instruction>(OpcodeStr#"rmbik") To.RC:$src0, To.KRCWM:$mask,
+ To.RC:$src1, addr:$src2,
+ (ImmXForm timm:$src3))>;
+
+ def : Pat<(To.VT (vselect_mask To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (bitconvert
+ (To.VT (To.BroadcastLdFrag addr:$src2))),
+ timm:$src3))),
+ To.ImmAllZerosV)),
+ (!cast<Instruction>(OpcodeStr#"rmbikz") To.KRCWM:$mask,
+ To.RC:$src1, addr:$src2,
+ (ImmXForm timm:$src3))>;
+}
+
+let Predicates = [HasAVX512] in {
+ // For 512-bit we lower to the widest element type we can. So we only need
+ // to handle converting valignq to valignd.
+ defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ", X86VAlign, v8i64_info,
+ v16i32_info, ValignqImm32XForm>;
+}
+
+let Predicates = [HasVLX] in {
+ // For 128-bit we lower to the widest element type we can. So we only need
+ // to handle converting valignq to valignd.
+ defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ128", X86VAlign, v2i64x_info,
+ v4i32x_info, ValignqImm32XForm>;
+ // For 256-bit we lower to the widest element type we can. So we only need
+ // to handle converting valignq to valignd.
+ defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ256", X86VAlign, v4i64x_info,
+ v8i32x_info, ValignqImm32XForm>;
+}
+
+let Predicates = [HasVLX, HasBWI] in {
+ // We can turn 128 and 256 bit VALIGND/VALIGNQ into VPALIGNR.
+ defm : avx512_vpalign_mask_lowering<"VPALIGNRZ128", X86VAlign, v2i64x_info,
+ v16i8x_info, ValignqImm8XForm>;
+ defm : avx512_vpalign_mask_lowering<"VPALIGNRZ128", X86VAlign, v4i32x_info,
+ v16i8x_info, ValigndImm8XForm>;
+}
+
+defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw",
+ SchedWritePSADBW, avx512vl_i16_info, avx512vl_i8_info>,
+ EVEX_CD8<8, CD8VF>, NotEVEX2VEXConvertible;
+
+multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1), OpcodeStr,
+ "$src1", "$src1",
+ (_.VT (OpNode (_.VT _.RC:$src1)))>, EVEX, AVX5128IBase,
+ Sched<[sched]>;
+
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src1), OpcodeStr,
+ "$src1", "$src1",
+ (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1)))))>,
+ EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded]>;
+ }
+}
+
+multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> :
+ avx512_unary_rm<opc, OpcodeStr, OpNode, sched, _> {
+ defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src1), OpcodeStr,
+ "${src1}"#_.BroadcastStr,
+ "${src1}"#_.BroadcastStr,
+ (_.VT (OpNode (_.VT (_.BroadcastLdFrag addr:$src1))))>,
+ EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded]>;
+}
+
+multiclass avx512_unary_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, sched.ZMM, VTInfo.info512>,
+ EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, sched.YMM, VTInfo.info256>,
+ EVEX_V256;
+ defm Z128 : avx512_unary_rm<opc, OpcodeStr, OpNode, sched.XMM, VTInfo.info128>,
+ EVEX_V128;
+ }
+}
+
+multiclass avx512_unary_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo,
+ Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_unary_rmb<opc, OpcodeStr, OpNode, sched.ZMM, VTInfo.info512>,
+ EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_unary_rmb<opc, OpcodeStr, OpNode, sched.YMM, VTInfo.info256>,
+ EVEX_V256;
+ defm Z128 : avx512_unary_rmb<opc, OpcodeStr, OpNode, sched.XMM, VTInfo.info128>,
+ EVEX_V128;
+ }
+}
+
+multiclass avx512_unary_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
+ SDNode OpNode, X86SchedWriteWidths sched,
+ Predicate prd> {
+ defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr#"q", OpNode, sched,
+ avx512vl_i64_info, prd>, VEX_W;
+ defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr#"d", OpNode, sched,
+ avx512vl_i32_info, prd>;
+}
+
+multiclass avx512_unary_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr,
+ SDNode OpNode, X86SchedWriteWidths sched,
+ Predicate prd> {
+ defm W : avx512_unary_rm_vl<opc_w, OpcodeStr#"w", OpNode, sched,
+ avx512vl_i16_info, prd>, VEX_WIG;
+ defm B : avx512_unary_rm_vl<opc_b, OpcodeStr#"b", OpNode, sched,
+ avx512vl_i8_info, prd>, VEX_WIG;
+}
+
+multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
+ bits<8> opc_d, bits<8> opc_q,
+ string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched> {
+ defm NAME : avx512_unary_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode, sched,
+ HasAVX512>,
+ avx512_unary_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode, sched,
+ HasBWI>;
+}
+
+defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs,
+ SchedWriteVecALU>;
+
+// VPABS: Use the 512-bit version to implement the 128/256-bit forms when
+// VLX is not available.
+let Predicates = [HasAVX512, NoVLX] in {
+ def : Pat<(v4i64 (abs VR256X:$src)),
+ (EXTRACT_SUBREG
+ (VPABSQZrr
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)),
+ sub_ymm)>;
+ def : Pat<(v2i64 (abs VR128X:$src)),
+ (EXTRACT_SUBREG
+ (VPABSQZrr
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)),
+ sub_xmm)>;
+}
+
+// Use the 512-bit version to implement the 128/256-bit forms.
+multiclass avx512_unary_lowering<string InstrStr, SDNode OpNode,
+ AVX512VLVectorVTInfo _, Predicate prd> {
+ let Predicates = [prd, NoVLX] in {
+ def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1))),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(InstrStr # "Zrr")
+ (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)),
+ _.info256.RC:$src1,
+ _.info256.SubRegIdx)),
+ _.info256.SubRegIdx)>;
+
+ def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1))),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(InstrStr # "Zrr")
+ (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)),
+ _.info128.RC:$src1,
+ _.info128.SubRegIdx)),
+ _.info128.SubRegIdx)>;
+ }
+}
+
+defm VPLZCNT : avx512_unary_rm_vl_dq<0x44, 0x44, "vplzcnt", ctlz,
+ SchedWriteVecIMul, HasCDI>;
+
+// FIXME: Is there a better scheduler class for VPCONFLICT?
+defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict,
+ SchedWriteVecALU, HasCDI>;
+
+// VPLZCNT: Use the 512-bit version to implement the 128/256-bit forms when
+// VLX is not available.
+defm : avx512_unary_lowering<"VPLZCNTQ", ctlz, avx512vl_i64_info, HasCDI>;
+defm : avx512_unary_lowering<"VPLZCNTD", ctlz, avx512vl_i32_info, HasCDI>;
+
+//===---------------------------------------------------------------------===//
+// Counts number of ones - VPOPCNTD and VPOPCNTQ
+//===---------------------------------------------------------------------===//
+
+// FIXME: Is there a better scheduler class for VPOPCNTD/VPOPCNTQ?
+defm VPOPCNT : avx512_unary_rm_vl_dq<0x55, 0x55, "vpopcnt", ctpop,
+ SchedWriteVecALU, HasVPOPCNTDQ>;
+
+defm : avx512_unary_lowering<"VPOPCNTQ", ctpop, avx512vl_i64_info, HasVPOPCNTDQ>;
+defm : avx512_unary_lowering<"VPOPCNTD", ctpop, avx512vl_i32_info, HasVPOPCNTDQ>;
+
+//===---------------------------------------------------------------------===//
+// Replicate Single FP - MOVSHDUP and MOVSLDUP
+//===---------------------------------------------------------------------===//
+
+multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched> {
+ defm NAME: avx512_unary_rm_vl<opc, OpcodeStr, OpNode, sched,
+ avx512vl_f32_info, HasAVX512>, XS;
+}
+
+defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup,
+ SchedWriteFShuffle>;
+defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup,
+ SchedWriteFShuffle>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - MOVDDUP
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src), OpcodeStr, "$src", "$src",
+ (_.VT (X86VBroadcast (_.VT _.RC:$src)))>, EVEX,
+ Sched<[sched]>;
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
+ (_.VT (_.BroadcastLdFrag addr:$src))>,
+ EVEX, EVEX_CD8<_.EltSize, CD8VH>,
+ Sched<[sched.Folded]>;
+ }
+}
+
+multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo> {
+ defm Z : avx512_unary_rm<opc, OpcodeStr, X86Movddup, sched.ZMM,
+ VTInfo.info512>, EVEX_V512;
+
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z256 : avx512_unary_rm<opc, OpcodeStr, X86Movddup, sched.YMM,
+ VTInfo.info256>, EVEX_V256;
+ defm Z128 : avx512_movddup_128<opc, OpcodeStr, sched.XMM,
+ VTInfo.info128>, EVEX_V128;
+ }
+}
+
+multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched> {
+ defm NAME: avx512_movddup_common<opc, OpcodeStr, OpNode, sched,
+ avx512vl_f64_info>, XD, VEX_W;
+}
+
+defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup, SchedWriteFShuffle>;
+
+let Predicates = [HasVLX] in {
+def : Pat<(v2f64 (X86VBroadcast f64:$src)),
+ (VMOVDDUPZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
+
+def : Pat<(vselect_mask (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
+ (v2f64 VR128X:$src0)),
+ (VMOVDDUPZ128rrk VR128X:$src0, VK2WM:$mask,
+ (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
+def : Pat<(vselect_mask (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
+ immAllZerosV),
+ (VMOVDDUPZ128rrkz VK2WM:$mask, (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Unpack Instructions
+//===----------------------------------------------------------------------===//
+
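+// VUNPCKH/VUNPCKL are pure data movement, so drop the MXCSR use and the
+// FP-exception flag that avx512_fp_binop_p would otherwise set.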
+let Uses = []<Register>, mayRaiseFPException = 0 in {
+defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh, X86Unpckh, HasAVX512,
+ SchedWriteFShuffleSizes, 0, 1>;
+defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl, X86Unpckl, HasAVX512,
+ SchedWriteFShuffleSizes>;
+}
+
+defm VPUNPCKLBW : avx512_binop_rm_vl_b<0x60, "vpunpcklbw", X86Unpckl,
+ SchedWriteShuffle, HasBWI>;
+defm VPUNPCKHBW : avx512_binop_rm_vl_b<0x68, "vpunpckhbw", X86Unpckh,
+ SchedWriteShuffle, HasBWI>;
+defm VPUNPCKLWD : avx512_binop_rm_vl_w<0x61, "vpunpcklwd", X86Unpckl,
+ SchedWriteShuffle, HasBWI>;
+defm VPUNPCKHWD : avx512_binop_rm_vl_w<0x69, "vpunpckhwd", X86Unpckh,
+ SchedWriteShuffle, HasBWI>;
+
+defm VPUNPCKLDQ : avx512_binop_rm_vl_d<0x62, "vpunpckldq", X86Unpckl,
+ SchedWriteShuffle, HasAVX512>;
+defm VPUNPCKHDQ : avx512_binop_rm_vl_d<0x6A, "vpunpckhdq", X86Unpckh,
+ SchedWriteShuffle, HasAVX512>;
+defm VPUNPCKLQDQ : avx512_binop_rm_vl_q<0x6C, "vpunpcklqdq", X86Unpckl,
+ SchedWriteShuffle, HasAVX512>;
+defm VPUNPCKHQDQ : avx512_binop_rm_vl_q<0x6D, "vpunpckhqdq", X86Unpckh,
+ SchedWriteShuffle, HasAVX512>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Extract & Insert Integer Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_extract_elt_bw_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ def mr : AVX512Ii8<opc, MRMDestMem, (outs),
+ (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(store (_.EltVT (trunc (OpNode (_.VT _.RC:$src1), timm:$src2))),
+ addr:$dst)]>,
+ EVEX, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteVecExtractSt]>;
+}
+
+multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> {
+ let Predicates = [HasBWI] in {
+ def rr : AVX512Ii8<0x14, MRMDestReg, (outs GR32orGR64:$dst),
+ (ins _.RC:$src1, u8imm:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32orGR64:$dst,
+ (X86pextrb (_.VT _.RC:$src1), timm:$src2))]>,
+ EVEX, TAPD, Sched<[WriteVecExtract]>;
+
+ defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TAPD;
+ }
+}
+
+multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> {
+ let Predicates = [HasBWI] in {
+ def rr : AVX512Ii8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst),
+ (ins _.RC:$src1, u8imm:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32orGR64:$dst,
+ (X86pextrw (_.VT _.RC:$src1), timm:$src2))]>,
+ EVEX, PD, Sched<[WriteVecExtract]>;
+
+ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in
+ def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst),
+ (ins _.RC:$src1, u8imm:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ EVEX, TAPD, FoldGenData<NAME#rr>,
+ Sched<[WriteVecExtract]>;
+
+ defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD;
+ }
+}
+
+multiclass avx512_extract_elt_dq<string OpcodeStr, X86VectorVTInfo _,
+ RegisterClass GRC> {
+ let Predicates = [HasDQI] in {
+ def rr : AVX512Ii8<0x16, MRMDestReg, (outs GRC:$dst),
+ (ins _.RC:$src1, u8imm:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GRC:$dst,
+ (extractelt (_.VT _.RC:$src1), imm:$src2))]>,
+ EVEX, TAPD, Sched<[WriteVecExtract]>;
+
+ def mr : AVX512Ii8<0x16, MRMDestMem, (outs),
+ (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(store (extractelt (_.VT _.RC:$src1),
+ imm:$src2),addr:$dst)]>,
+ EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD,
+ Sched<[WriteVecExtractSt]>;
+ }
+}
+
+defm VPEXTRBZ : avx512_extract_elt_b<"vpextrb", v16i8x_info>, VEX_WIG;
+defm VPEXTRWZ : avx512_extract_elt_w<"vpextrw", v8i16x_info>, VEX_WIG;
+defm VPEXTRDZ : avx512_extract_elt_dq<"vpextrd", v4i32x_info, GR32>;
+defm VPEXTRQZ : avx512_extract_elt_dq<"vpextrq", v2i64x_info, GR64>, VEX_W;
+
+multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, PatFrag LdFrag,
+ SDPatternOperator immoperator> {
+ def rm : AVX512Ii8<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
+ OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set _.RC:$dst,
+ (_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), immoperator:$src3)))]>,
+ EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
+}
+
+multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, PatFrag LdFrag> {
+ let Predicates = [HasBWI] in {
+ def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src1, GR32orGR64:$src2, u8imm:$src3),
+ OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set _.RC:$dst,
+ (OpNode _.RC:$src1, GR32orGR64:$src2, timm:$src3))]>, EVEX_4V,
+ Sched<[WriteVecInsert]>;
+
+ defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag, timm>;
+ }
+}
+
+multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _, RegisterClass GRC> {
+ let Predicates = [HasDQI] in {
+ def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src1, GRC:$src2, u8imm:$src3),
+ OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set _.RC:$dst,
+ (_.VT (insertelt _.RC:$src1, GRC:$src2, imm:$src3)))]>,
+ EVEX_4V, TAPD, Sched<[WriteVecInsert]>;
+
+ defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _,
+ _.ScalarLdFrag, imm>, TAPD;
+ }
+}
+
+defm VPINSRBZ : avx512_insert_elt_bw<0x20, "vpinsrb", X86pinsrb, v16i8x_info,
+ extloadi8>, TAPD, VEX_WIG;
+defm VPINSRWZ : avx512_insert_elt_bw<0xC4, "vpinsrw", X86pinsrw, v8i16x_info,
+ extloadi16>, PD, VEX_WIG;
+defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>;
+defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, VEX_W;
+
+//===----------------------------------------------------------------------===//
+// VSHUFPS - VSHUFPD Operations
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I,
+ AVX512VLVectorVTInfo VTInfo_FP>{
+ defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp,
+ SchedWriteFShuffle>,
+ EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>,
+ AVX512AIi8Base, EVEX_4V;
+}
+
+defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_i32_info, avx512vl_f32_info>, PS;
+defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_i64_info, avx512vl_f64_info>, PD, VEX_W;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Byte shift Left/Right
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr,
+ Format MRMm, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _>{
+ def ri : AVX512<opc, MRMr,
+ (outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 timm:$src2))))]>,
+ Sched<[sched]>;
+ def mi : AVX512<opc, MRMm,
+ (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.RC:$dst,(_.VT (OpNode
+ (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i8 timm:$src2))))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr,
+ Format MRMm, string OpcodeStr,
+ X86SchedWriteWidths sched, Predicate prd>{
+ let Predicates = [prd] in
+ defm Z : avx512_shift_packed<opc, OpNode, MRMr, MRMm, OpcodeStr,
+ sched.ZMM, v64i8_info>, EVEX_V512;
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, OpcodeStr,
+ sched.YMM, v32i8x_info>, EVEX_V256;
+ defm Z128 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, OpcodeStr,
+ sched.XMM, v16i8x_info>, EVEX_V128;
+ }
+}
+defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq",
+ SchedWriteShuffle, HasBWI>,
+ AVX512PDIi8Base, EVEX_4V, VEX_WIG;
+defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq",
+ SchedWriteShuffle, HasBWI>,
+ AVX512PDIi8Base, EVEX_4V, VEX_WIG;
+
+multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode,
+ string OpcodeStr, X86FoldableSchedWrite sched,
+ X86VectorVTInfo _dst, X86VectorVTInfo _src> {
+ let isCommutable = 1 in
+ def rr : AVX512BI<opc, MRMSrcReg,
+ (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _dst.RC:$dst,(_dst.VT
+ (OpNode (_src.VT _src.RC:$src1),
+ (_src.VT _src.RC:$src2))))]>,
+ Sched<[sched]>;
+ def rm : AVX512BI<opc, MRMSrcMem,
+ (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _dst.RC:$dst,(_dst.VT
+ (OpNode (_src.VT _src.RC:$src1),
+ (_src.VT (bitconvert
+ (_src.LdFrag addr:$src2))))))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode,
+ string OpcodeStr, X86SchedWriteWidths sched,
+ Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_psadbw_packed<opc, OpNode, OpcodeStr, sched.ZMM,
+ v8i64_info, v64i8_info>, EVEX_V512;
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, sched.YMM,
+ v4i64x_info, v32i8x_info>, EVEX_V256;
+ defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, sched.XMM,
+ v2i64x_info, v16i8x_info>, EVEX_V128;
+ }
+}
+
+defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw",
+ SchedWritePSADBW, HasBWI>, EVEX_4V, VEX_WIG;
+
+// Transforms to swizzle an immediate to enable better matching when
+// memory operand isn't in the right place.
+def VPTERNLOG321_imm8 : SDNodeXForm<timm, [{
+ // Convert a VPTERNLOG immediate by swapping operand 0 and operand 2.
+ uint8_t Imm = N->getZExtValue();
+ // Swap bits 1/4 and 3/6.
+ uint8_t NewImm = Imm & 0xa5;
+ if (Imm & 0x02) NewImm |= 0x10;
+ if (Imm & 0x10) NewImm |= 0x02;
+ if (Imm & 0x08) NewImm |= 0x40;
+ if (Imm & 0x40) NewImm |= 0x08;
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
+def VPTERNLOG213_imm8 : SDNodeXForm<timm, [{
+  // Convert a VPTERNLOG immediate by swapping operand 0 and operand 1.
+ uint8_t Imm = N->getZExtValue();
+ // Swap bits 2/4 and 3/5.
+ uint8_t NewImm = Imm & 0xc3;
+ if (Imm & 0x04) NewImm |= 0x10;
+ if (Imm & 0x10) NewImm |= 0x04;
+ if (Imm & 0x08) NewImm |= 0x20;
+ if (Imm & 0x20) NewImm |= 0x08;
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
+def VPTERNLOG132_imm8 : SDNodeXForm<timm, [{
+ // Convert a VPTERNLOG immediate by swapping operand 1 and operand 2.
+ uint8_t Imm = N->getZExtValue();
+ // Swap bits 1/2 and 5/6.
+ uint8_t NewImm = Imm & 0x99;
+ if (Imm & 0x02) NewImm |= 0x04;
+ if (Imm & 0x04) NewImm |= 0x02;
+ if (Imm & 0x20) NewImm |= 0x40;
+ if (Imm & 0x40) NewImm |= 0x20;
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
+def VPTERNLOG231_imm8 : SDNodeXForm<timm, [{
+ // Convert a VPTERNLOG immediate by moving operand 1 to the end.
+ uint8_t Imm = N->getZExtValue();
+ // Move bits 1->2, 2->4, 3->6, 4->1, 5->3, 6->5
+ uint8_t NewImm = Imm & 0x81;
+ if (Imm & 0x02) NewImm |= 0x04;
+ if (Imm & 0x04) NewImm |= 0x10;
+ if (Imm & 0x08) NewImm |= 0x40;
+ if (Imm & 0x10) NewImm |= 0x02;
+ if (Imm & 0x20) NewImm |= 0x08;
+ if (Imm & 0x40) NewImm |= 0x20;
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
+def VPTERNLOG312_imm8 : SDNodeXForm<timm, [{
+ // Convert a VPTERNLOG immediate by moving operand 2 to the beginning.
+ uint8_t Imm = N->getZExtValue();
+ // Move bits 1->4, 2->1, 3->5, 4->2, 5->6, 6->3
+ uint8_t NewImm = Imm & 0x81;
+ if (Imm & 0x02) NewImm |= 0x10;
+ if (Imm & 0x04) NewImm |= 0x02;
+ if (Imm & 0x08) NewImm |= 0x20;
+ if (Imm & 0x10) NewImm |= 0x04;
+ if (Imm & 0x20) NewImm |= 0x40;
+ if (Imm & 0x40) NewImm |= 0x08;
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
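+// As an illustrative sketch (permuteTernlogImm is a hypothetical helper, not
+// defined in the backend): each transform above permutes the truth table
+// encoded by the immediate, where bit ((A << 2) | (B << 1) | C) holds the
+// result for inputs (A, B, C):
+//   uint8_t permuteTernlogImm(uint8_t Imm, const int P[3]) {
+//     // P[i] = index of the original operand read as new operand i.
+//     uint8_t NewImm = 0;
+//     for (int Idx = 0; Idx != 8; ++Idx) {
+//       int X[3] = {(Idx >> 2) & 1, (Idx >> 1) & 1, Idx & 1};
+//       int NewIdx = (X[P[0]] << 2) | (X[P[1]] << 1) | X[P[2]];
+//       if (Imm & (1 << Idx))
+//         NewImm |= 1 << NewIdx;
+//     }
+//     return NewImm;
+//   }
+// With P = {2, 1, 0} this reproduces the 1/4 and 3/6 bit swaps of
+// VPTERNLOG321_imm8 above.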
+
+multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ string Name>{
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+ defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, u8imm:$src4),
+ OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_.VT _.RC:$src3),
+ (i8 timm:$src4)), 1, 1>,
+ AVX512AIi8Base, EVEX_4V, Sched<[sched]>;
+ defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4),
+ OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_.VT (bitconvert (_.LdFrag addr:$src3))),
+ (i8 timm:$src4)), 1, 0>,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4),
+ OpcodeStr, "$src4, ${src3}"#_.BroadcastStr#", $src2",
+ "$src2, ${src3}"#_.BroadcastStr#", $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_.VT (_.BroadcastLdFrag addr:$src3)),
+ (i8 timm:$src4)), 1, 0>, EVEX_B,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }// Constraints = "$src1 = $dst"
+
+ // Additional patterns for matching passthru operand in other positions.
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (OpNode _.RC:$src3, _.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, _.RC:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i8 timm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, _.RC:$src3, (VPTERNLOG213_imm8 timm:$src4))>;
+
+ // Additional patterns for matching zero masking with loads in other
+ // positions.
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (OpNode (bitconvert (_.LdFrag addr:$src3)),
+ _.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(Name#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)),
+ _.RC:$src2, (i8 timm:$src4)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(Name#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>;
+
+ // Additional patterns for matching masked loads with different
+ // operand orders.
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)),
+ _.RC:$src2, (i8 timm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>;
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (OpNode (bitconvert (_.LdFrag addr:$src3)),
+ _.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (OpNode _.RC:$src2, _.RC:$src1,
+ (bitconvert (_.LdFrag addr:$src3)), (i8 timm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 timm:$src4))>;
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (OpNode _.RC:$src2, (bitconvert (_.LdFrag addr:$src3)),
+ _.RC:$src1, (i8 timm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 timm:$src4))>;
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (OpNode (bitconvert (_.LdFrag addr:$src3)),
+ _.RC:$src1, _.RC:$src2, (i8 timm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 timm:$src4))>;
+
+ // Additional patterns for matching zero masking with broadcasts in other
+ // positions.
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (OpNode (_.BroadcastLdFrag addr:$src3),
+ _.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(Name#_.ZSuffix#rmbikz) _.RC:$src1,
+ _.KRCWM:$mask, _.RC:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (OpNode _.RC:$src1,
+ (_.BroadcastLdFrag addr:$src3),
+ _.RC:$src2, (i8 timm:$src4)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(Name#_.ZSuffix#rmbikz) _.RC:$src1,
+ _.KRCWM:$mask, _.RC:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ // Additional patterns for matching masked broadcasts with different
+ // operand orders.
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (OpNode _.RC:$src1, (_.BroadcastLdFrag addr:$src3),
+ _.RC:$src2, (i8 timm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>;
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (OpNode (_.BroadcastLdFrag addr:$src3),
+ _.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (OpNode _.RC:$src2, _.RC:$src1,
+ (_.BroadcastLdFrag addr:$src3),
+ (i8 timm:$src4)), _.RC:$src1)),
+ (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 timm:$src4))>;
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (OpNode _.RC:$src2,
+ (_.BroadcastLdFrag addr:$src3),
+ _.RC:$src1, (i8 timm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 timm:$src4))>;
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (OpNode (_.BroadcastLdFrag addr:$src3),
+ _.RC:$src1, _.RC:$src2, (i8 timm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 timm:$src4))>;
+}
+
+multiclass avx512_common_ternlog<string OpcodeStr, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, sched.ZMM,
+ _.info512, NAME>, EVEX_V512;
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z128 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, sched.XMM,
+ _.info128, NAME>, EVEX_V128;
+ defm Z256 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, sched.YMM,
+ _.info256, NAME>, EVEX_V256;
+ }
+}
+
+defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", SchedWriteVecALU,
+ avx512vl_i32_info>;
+defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SchedWriteVecALU,
+ avx512vl_i64_info>, VEX_W;
+
+// Patterns to implement vnot using vpternlog instead of creating all ones
+// using pcmpeq or vpternlog and then xoring with that. The value 15 is chosen
+// so that the result is only dependent on src0. But we use the same source
+// for all operands to prevent a false dependency.
+// TODO: We should maybe have a more generalized algorithm for folding to
+// vpternlog.
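+// As a quick check (illustration only): the ternlog immediate is indexed by
+// ((A << 2) | (B << 1) | C), and 15 (0x0f) sets exactly the rows with A == 0,
+// so the result is ~A regardless of B and C:
+//   for (int A = 0; A != 2; ++A)
+//     for (int B = 0; B != 2; ++B)
+//       for (int C = 0; C != 2; ++C)
+//         assert(((0x0f >> ((A << 2) | (B << 1) | C)) & 1) == (A ^ 1));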
+let Predicates = [HasAVX512] in {
+ def : Pat<(xor VR512:$src, (v64i8 immAllOnesV)),
+ (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
+ def : Pat<(xor VR512:$src, (v32i16 immAllOnesV)),
+ (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
+ def : Pat<(xor VR512:$src, (v16i32 immAllOnesV)),
+ (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
+ def : Pat<(xor VR512:$src, (v8i64 immAllOnesV)),
+ (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
+}
+
+let Predicates = [HasAVX512, NoVLX] in {
+ def : Pat<(xor VR128X:$src, (v16i8 immAllOnesV)),
+ (EXTRACT_SUBREG
+ (VPTERNLOGQZrri
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (i8 15)), sub_xmm)>;
+ def : Pat<(xor VR128X:$src, (v8i16 immAllOnesV)),
+ (EXTRACT_SUBREG
+ (VPTERNLOGQZrri
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (i8 15)), sub_xmm)>;
+ def : Pat<(xor VR128X:$src, (v4i32 immAllOnesV)),
+ (EXTRACT_SUBREG
+ (VPTERNLOGQZrri
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (i8 15)), sub_xmm)>;
+ def : Pat<(xor VR128X:$src, (v2i64 immAllOnesV)),
+ (EXTRACT_SUBREG
+ (VPTERNLOGQZrri
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (i8 15)), sub_xmm)>;
+
+ def : Pat<(xor VR256X:$src, (v32i8 immAllOnesV)),
+ (EXTRACT_SUBREG
+ (VPTERNLOGQZrri
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (i8 15)), sub_ymm)>;
+ def : Pat<(xor VR256X:$src, (v16i16 immAllOnesV)),
+ (EXTRACT_SUBREG
+ (VPTERNLOGQZrri
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (i8 15)), sub_ymm)>;
+ def : Pat<(xor VR256X:$src, (v8i32 immAllOnesV)),
+ (EXTRACT_SUBREG
+ (VPTERNLOGQZrri
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (i8 15)), sub_ymm)>;
+ def : Pat<(xor VR256X:$src, (v4i64 immAllOnesV)),
+ (EXTRACT_SUBREG
+ (VPTERNLOGQZrri
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (i8 15)), sub_ymm)>;
+}
+
+let Predicates = [HasVLX] in {
+ def : Pat<(xor VR128X:$src, (v16i8 immAllOnesV)),
+ (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
+ def : Pat<(xor VR128X:$src, (v8i16 immAllOnesV)),
+ (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
+ def : Pat<(xor VR128X:$src, (v4i32 immAllOnesV)),
+ (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
+ def : Pat<(xor VR128X:$src, (v2i64 immAllOnesV)),
+ (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
+
+ def : Pat<(xor VR256X:$src, (v32i8 immAllOnesV)),
+ (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
+ def : Pat<(xor VR256X:$src, (v16i16 immAllOnesV)),
+ (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
+ def : Pat<(xor VR256X:$src, (v8i32 immAllOnesV)),
+ (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
+ def : Pat<(xor VR256X:$src, (v4i64 immAllOnesV)),
+ (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - FixupImm
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ X86VectorVTInfo TblVT>{
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain,
+ Uses = [MXCSR], mayRaiseFPException = 1 in {
+ defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
+ OpcodeStr#_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ (X86VFixupimm (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (TblVT.VT _.RC:$src3),
+ (i32 timm:$src4))>, Sched<[sched]>;
+ defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3, i32u8imm:$src4),
+ OpcodeStr#_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ (X86VFixupimm (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (TblVT.VT (bitconvert (TblVT.LdFrag addr:$src3))),
+ (i32 timm:$src4))>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
+ OpcodeStr#_.Suffix, "$src4, ${src3}"#_.BroadcastStr#", $src2",
+ "$src2, ${src3}"#_.BroadcastStr#", $src4",
+ (X86VFixupimm (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (TblVT.VT (TblVT.BroadcastLdFrag addr:$src3)),
+ (i32 timm:$src4))>,
+ EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ } // Constraints = "$src1 = $dst"
+}
+
+multiclass avx512_fixupimm_packed_sae<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, X86VectorVTInfo TblVT>
+ : avx512_fixupimm_packed<opc, OpcodeStr, sched, _, TblVT> {
+let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, Uses = [MXCSR] in {
+ defm rrib : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
+ OpcodeStr#_.Suffix, "$src4, {sae}, $src3, $src2",
+ "$src2, $src3, {sae}, $src4",
+ (X86VFixupimmSAE (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (TblVT.VT _.RC:$src3),
+ (i32 timm:$src4))>,
+ EVEX_B, Sched<[sched]>;
+ }
+}
+
+multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ X86VectorVTInfo _src3VT> {
+ let Constraints = "$src1 = $dst" , Predicates = [HasAVX512],
+ ExeDomain = _.ExeDomain in {
+ defm rri : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
+ OpcodeStr#_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ (X86VFixupimms (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_src3VT.VT _src3VT.RC:$src3),
+ (i32 timm:$src4))>, Sched<[sched]>, SIMD_EXC;
+ let Uses = [MXCSR] in
+ defm rrib : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
+ OpcodeStr#_.Suffix, "$src4, {sae}, $src3, $src2",
+ "$src2, $src3, {sae}, $src4",
+ (X86VFixupimmSAEs (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_src3VT.VT _src3VT.RC:$src3),
+ (i32 timm:$src4))>,
+ EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ defm rmi : AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
+ OpcodeStr#_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ (X86VFixupimms (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_src3VT.VT (scalar_to_vector
+ (_src3VT.ScalarLdFrag addr:$src3))),
+ (i32 timm:$src4))>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+ }
+}
+
+multiclass avx512_fixupimm_packed_all<X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _Vec,
+ AVX512VLVectorVTInfo _Tbl> {
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_fixupimm_packed_sae<0x54, "vfixupimm", sched.ZMM,
+ _Vec.info512, _Tbl.info512>, AVX512AIi8Base,
+ EVEX_4V, EVEX_V512;
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", sched.XMM,
+ _Vec.info128, _Tbl.info128>, AVX512AIi8Base,
+ EVEX_4V, EVEX_V128;
+ defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", sched.YMM,
+ _Vec.info256, _Tbl.info256>, AVX512AIi8Base,
+ EVEX_4V, EVEX_V256;
+ }
+}
+
+defm VFIXUPIMMSSZ : avx512_fixupimm_scalar<0x55, "vfixupimm",
+ SchedWriteFAdd.Scl, f32x_info, v4i32x_info>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+defm VFIXUPIMMSDZ : avx512_fixupimm_scalar<0x55, "vfixupimm",
+ SchedWriteFAdd.Scl, f64x_info, v2i64x_info>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+defm VFIXUPIMMPS : avx512_fixupimm_packed_all<SchedWriteFAdd, avx512vl_f32_info,
+ avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VFIXUPIMMPD : avx512_fixupimm_packed_all<SchedWriteFAdd, avx512vl_f64_info,
+ avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W;
+
+// Patterns used to select SSE scalar fp arithmetic instructions from
+// either:
+//
+// (1) a scalar fp operation followed by a blend
+//
+// The effect is that the backend no longer emits unnecessary vector
+// insert instructions immediately after SSE scalar fp instructions
+// like addss or mulss.
+//
+// For example, given the following code:
+// __m128 foo(__m128 A, __m128 B) {
+// A[0] += B[0];
+// return A;
+// }
+//
+// Previously we generated:
+// addss %xmm0, %xmm1
+// movss %xmm1, %xmm0
+//
+// We now generate:
+// addss %xmm1, %xmm0
+//
+// (2) a vector packed single/double fp operation followed by a vector insert
+//
+// The effect is that the backend converts the packed fp instruction
+// followed by a vector insert into a single SSE scalar fp instruction.
+//
+// For example, given the following code:
+// __m128 foo(__m128 A, __m128 B) {
+// __m128 C = A + B;
+//     return (__m128) {C[0], A[1], A[2], A[3]};
+// }
+//
+// Previously we generated:
+// addps %xmm0, %xmm1
+// movss %xmm1, %xmm0
+//
+// We now generate:
+// addss %xmm1, %xmm0
+
+// TODO: Some canonicalization in lowering would simplify the number of
+// patterns we have to try to match.
+multiclass AVX512_scalar_math_fp_patterns<SDNode Op, SDNode MaskedOp,
+ string OpcPrefix, SDNode MoveNode,
+ X86VectorVTInfo _, PatLeaf ZeroFP> {
+ let Predicates = [HasAVX512] in {
+ // extracted scalar math op with insert via movss
+ def : Pat<(MoveNode
+ (_.VT VR128X:$dst),
+ (_.VT (scalar_to_vector
+ (Op (_.EltVT (extractelt (_.VT VR128X:$dst), (iPTR 0))),
+ _.FRC:$src)))),
+ (!cast<Instruction>("V"#OpcPrefix#"Zrr_Int") _.VT:$dst,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src, VR128X)))>;
+ def : Pat<(MoveNode
+ (_.VT VR128X:$dst),
+ (_.VT (scalar_to_vector
+ (Op (_.EltVT (extractelt (_.VT VR128X:$dst), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src))))),
+ (!cast<Instruction>("V"#OpcPrefix#"Zrm_Int") _.VT:$dst, addr:$src)>;
+
+ // extracted masked scalar math op with insert via movss
+ def : Pat<(MoveNode (_.VT VR128X:$src1),
+ (scalar_to_vector
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp (_.EltVT
+ (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src2),
+ _.FRC:$src0))),
+ (!cast<Instruction>("V"#OpcPrefix#"Zrr_Intk")
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)),
+ VK1WM:$mask, _.VT:$src1,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
+ def : Pat<(MoveNode (_.VT VR128X:$src1),
+ (scalar_to_vector
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp (_.EltVT
+ (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src2)),
+ _.FRC:$src0))),
+ (!cast<Instruction>("V"#OpcPrefix#"Zrm_Intk")
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)),
+ VK1WM:$mask, _.VT:$src1, addr:$src2)>;
+
+  // extracted masked scalar math op with zero masking and insert via movss
+ def : Pat<(MoveNode (_.VT VR128X:$src1),
+ (scalar_to_vector
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp (_.EltVT
+ (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src2), (_.EltVT ZeroFP)))),
+ (!cast<I>("V"#OpcPrefix#"Zrr_Intkz")
+ VK1WM:$mask, _.VT:$src1,
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
+ def : Pat<(MoveNode (_.VT VR128X:$src1),
+ (scalar_to_vector
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp (_.EltVT
+ (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src2)), (_.EltVT ZeroFP)))),
+ (!cast<I>("V"#OpcPrefix#"Zrm_Intkz") VK1WM:$mask, _.VT:$src1, addr:$src2)>;
+ }
+}
+
+defm : AVX512_scalar_math_fp_patterns<any_fadd, fadd, "ADDSS", X86Movss, v4f32x_info, fp32imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fsub, fsub, "SUBSS", X86Movss, v4f32x_info, fp32imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fmul, fmul, "MULSS", X86Movss, v4f32x_info, fp32imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fdiv, fdiv, "DIVSS", X86Movss, v4f32x_info, fp32imm0>;
+
+defm : AVX512_scalar_math_fp_patterns<any_fadd, fadd, "ADDSD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fsub, fsub, "SUBSD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fmul, fmul, "MULSD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fdiv, fdiv, "DIVSD", X86Movsd, v2f64x_info, fp64imm0>;
+
+multiclass AVX512_scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix,
+ SDNode Move, X86VectorVTInfo _> {
+ let Predicates = [HasAVX512] in {
+ def : Pat<(_.VT (Move _.VT:$dst,
+ (scalar_to_vector (OpNode (extractelt _.VT:$src, 0))))),
+ (!cast<Instruction>("V"#OpcPrefix#"Zr_Int") _.VT:$dst, _.VT:$src)>;
+ }
+}
+
+defm : AVX512_scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32x_info>;
+defm : AVX512_scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64x_info>;
+
+//===----------------------------------------------------------------------===//
+// AES instructions
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_vaes<bits<8> Op, string OpStr, string IntPrefix> {
+ let Predicates = [HasVLX, HasVAES] in {
+ defm Z128 : AESI_binop_rm_int<Op, OpStr,
+ !cast<Intrinsic>(IntPrefix),
+ loadv2i64, 0, VR128X, i128mem>,
+ EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V128, VEX_WIG;
+ defm Z256 : AESI_binop_rm_int<Op, OpStr,
+ !cast<Intrinsic>(IntPrefix#"_256"),
+ loadv4i64, 0, VR256X, i256mem>,
+ EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V256, VEX_WIG;
+ }
+ let Predicates = [HasAVX512, HasVAES] in
+ defm Z : AESI_binop_rm_int<Op, OpStr,
+ !cast<Intrinsic>(IntPrefix#"_512"),
+ loadv8i64, 0, VR512, i512mem>,
+ EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V512, VEX_WIG;
+}
+
+defm VAESENC : avx512_vaes<0xDC, "vaesenc", "int_x86_aesni_aesenc">;
+defm VAESENCLAST : avx512_vaes<0xDD, "vaesenclast", "int_x86_aesni_aesenclast">;
+defm VAESDEC : avx512_vaes<0xDE, "vaesdec", "int_x86_aesni_aesdec">;
+defm VAESDECLAST : avx512_vaes<0xDF, "vaesdeclast", "int_x86_aesni_aesdeclast">;
+
+//===----------------------------------------------------------------------===//
+// PCLMUL instructions - Carry less multiplication
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasAVX512, HasVPCLMULQDQ] in
+defm VPCLMULQDQZ : vpclmulqdq<VR512, i512mem, loadv8i64, int_x86_pclmulqdq_512>,
+ EVEX_4V, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_WIG;
+
+let Predicates = [HasVLX, HasVPCLMULQDQ] in {
+defm VPCLMULQDQZ128 : vpclmulqdq<VR128X, i128mem, loadv2i64, int_x86_pclmulqdq>,
+ EVEX_4V, EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_WIG;
+
+defm VPCLMULQDQZ256: vpclmulqdq<VR256X, i256mem, loadv4i64,
+ int_x86_pclmulqdq_256>, EVEX_4V, EVEX_V256,
+ EVEX_CD8<64, CD8VF>, VEX_WIG;
+}
+
+// Aliases
+defm : vpclmulqdq_aliases<"VPCLMULQDQZ", VR512, i512mem>;
+defm : vpclmulqdq_aliases<"VPCLMULQDQZ128", VR128X, i128mem>;
+defm : vpclmulqdq_aliases<"VPCLMULQDQZ256", VR256X, i256mem>;
+
+//===----------------------------------------------------------------------===//
+// VBMI2
+//===----------------------------------------------------------------------===//
+
+multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
+ let Constraints = "$src1 = $dst",
+ ExeDomain = VTI.ExeDomain in {
+ defm r: AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs VTI.RC:$dst),
+ (ins VTI.RC:$src2, VTI.RC:$src3), OpStr,
+ "$src3, $src2", "$src2, $src3",
+ (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3))>,
+ AVX512FMA3Base, Sched<[sched]>;
+ defm m: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
+ (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
+ "$src3, $src2", "$src2, $src3",
+ (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
+ (VTI.VT (VTI.LdFrag addr:$src3))))>,
+ AVX512FMA3Base,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo VTI>
+ : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched, VTI> {
+ let Constraints = "$src1 = $dst",
+ ExeDomain = VTI.ExeDomain in
+ defm mb: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
+ (ins VTI.RC:$src2, VTI.ScalarMemOp:$src3), OpStr,
+ "${src3}"#VTI.BroadcastStr#", $src2",
+ "$src2, ${src3}"#VTI.BroadcastStr,
+ (OpNode VTI.RC:$src1, VTI.RC:$src2,
+ (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
+ AVX512FMA3Base, EVEX_B,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+multiclass VBMI2_shift_var_rm_common<bits<8> Op, string OpStr, SDNode OpNode,
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> {
+ let Predicates = [HasVBMI2] in
+ defm Z : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.ZMM, VTI.info512>,
+ EVEX_V512;
+ let Predicates = [HasVBMI2, HasVLX] in {
+ defm Z256 : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.YMM, VTI.info256>,
+ EVEX_V256;
+ defm Z128 : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.XMM, VTI.info128>,
+ EVEX_V128;
+ }
+}
+
+multiclass VBMI2_shift_var_rmb_common<bits<8> Op, string OpStr, SDNode OpNode,
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> {
+ let Predicates = [HasVBMI2] in
+ defm Z : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.ZMM, VTI.info512>,
+ EVEX_V512;
+ let Predicates = [HasVBMI2, HasVLX] in {
+ defm Z256 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.YMM, VTI.info256>,
+ EVEX_V256;
+ defm Z128 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.XMM, VTI.info128>,
+ EVEX_V128;
+ }
+}
+multiclass VBMI2_shift_var<bits<8> wOp, bits<8> dqOp, string Prefix,
+ SDNode OpNode, X86SchedWriteWidths sched> {
+ defm W : VBMI2_shift_var_rm_common<wOp, Prefix#"w", OpNode, sched,
+ avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>;
+ defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix#"d", OpNode, sched,
+ avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+ defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix#"q", OpNode, sched,
+ avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+multiclass VBMI2_shift_imm<bits<8> wOp, bits<8> dqOp, string Prefix,
+ SDNode OpNode, X86SchedWriteWidths sched> {
+ defm W : avx512_common_3Op_rm_imm8<wOp, OpNode, Prefix#"w", sched,
+ avx512vl_i16_info, avx512vl_i16_info, HasVBMI2>,
+ VEX_W, EVEX_CD8<16, CD8VF>;
+ defm D : avx512_common_3Op_imm8<Prefix#"d", avx512vl_i32_info, dqOp,
+ OpNode, sched, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+ defm Q : avx512_common_3Op_imm8<Prefix#"q", avx512vl_i64_info, dqOp, OpNode,
+ sched, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+}
+
+// Concat & Shift
+defm VPSHLDV : VBMI2_shift_var<0x70, 0x71, "vpshldv", X86VShldv, SchedWriteVecIMul>;
+defm VPSHRDV : VBMI2_shift_var<0x72, 0x73, "vpshrdv", X86VShrdv, SchedWriteVecIMul>;
+defm VPSHLD : VBMI2_shift_imm<0x70, 0x71, "vpshld", X86VShld, SchedWriteVecIMul>;
+defm VPSHRD : VBMI2_shift_imm<0x72, 0x73, "vpshrd", X86VShrd, SchedWriteVecIMul>;
+
+// Compress
+defm VPCOMPRESSB : compress_by_elt_width<0x63, "vpcompressb", WriteVarShuffle256,
+ avx512vl_i8_info, HasVBMI2>, EVEX,
+ NotMemoryFoldable;
+defm VPCOMPRESSW : compress_by_elt_width <0x63, "vpcompressw", WriteVarShuffle256,
+ avx512vl_i16_info, HasVBMI2>, EVEX, VEX_W,
+ NotMemoryFoldable;
+// Expand
+defm VPEXPANDB : expand_by_elt_width <0x62, "vpexpandb", WriteVarShuffle256,
+ avx512vl_i8_info, HasVBMI2>, EVEX;
+defm VPEXPANDW : expand_by_elt_width <0x62, "vpexpandw", WriteVarShuffle256,
+ avx512vl_i16_info, HasVBMI2>, EVEX, VEX_W;
+
+//===----------------------------------------------------------------------===//
+// VNNI
+//===----------------------------------------------------------------------===//
+
+let Constraints = "$src1 = $dst" in
+multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo VTI,
+ bit IsCommutable> {
+ let ExeDomain = VTI.ExeDomain in {
+ defm r : AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs VTI.RC:$dst),
+ (ins VTI.RC:$src2, VTI.RC:$src3), OpStr,
+ "$src3, $src2", "$src2, $src3",
+ (VTI.VT (OpNode VTI.RC:$src1,
+ VTI.RC:$src2, VTI.RC:$src3)),
+ IsCommutable, IsCommutable>,
+ EVEX_4V, T8PD, Sched<[sched]>;
+ defm m : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
+ (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
+ "$src3, $src2", "$src2, $src3",
+ (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
+ (VTI.VT (VTI.LdFrag addr:$src3))))>,
+ EVEX_4V, EVEX_CD8<32, CD8VF>, T8PD,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ defm mb : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
+ (ins VTI.RC:$src2, VTI.ScalarMemOp:$src3),
+ OpStr, "${src3}"#VTI.BroadcastStr#", $src2",
+ "$src2, ${src3}"#VTI.BroadcastStr,
+ (OpNode VTI.RC:$src1, VTI.RC:$src2,
+ (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
+ EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B,
+ T8PD, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+multiclass VNNI_common<bits<8> Op, string OpStr, SDNode OpNode,
+ X86SchedWriteWidths sched, bit IsCommutable> {
+ let Predicates = [HasVNNI] in
+ defm Z : VNNI_rmb<Op, OpStr, OpNode, sched.ZMM, v16i32_info,
+ IsCommutable>, EVEX_V512;
+ let Predicates = [HasVNNI, HasVLX] in {
+ defm Z256 : VNNI_rmb<Op, OpStr, OpNode, sched.YMM, v8i32x_info,
+ IsCommutable>, EVEX_V256;
+ defm Z128 : VNNI_rmb<Op, OpStr, OpNode, sched.XMM, v4i32x_info,
+ IsCommutable>, EVEX_V128;
+ }
+}
+
+// FIXME: Is there a better scheduler class for VPDP?
+defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SchedWriteVecIMul, 0>;
+defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SchedWriteVecIMul, 0>;
+defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SchedWriteVecIMul, 1>;
+defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SchedWriteVecIMul, 1>;
+
+// Patterns to match VPDPWSSD from existing instructions/intrinsics.
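+// For example (sketch only; dot_acc is a hypothetical function), source like
+//   __m512i dot_acc(__m512i Acc, __m512i A, __m512i B) {
+//     return _mm512_add_epi32(Acc, _mm512_madd_epi16(A, B));
+//   }
+// lowers to add(Acc, X86vpmaddwd(A, B)) and, when the vpmaddwd result has no
+// other uses, is folded by the patterns below into a single vpdpwssd.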
+let Predicates = [HasVNNI] in {
+ def : Pat<(v16i32 (add VR512:$src1,
+ (X86vpmaddwd_su VR512:$src2, VR512:$src3))),
+ (VPDPWSSDZr VR512:$src1, VR512:$src2, VR512:$src3)>;
+ def : Pat<(v16i32 (add VR512:$src1,
+ (X86vpmaddwd_su VR512:$src2, (load addr:$src3)))),
+ (VPDPWSSDZm VR512:$src1, VR512:$src2, addr:$src3)>;
+}
+let Predicates = [HasVNNI,HasVLX] in {
+ def : Pat<(v8i32 (add VR256X:$src1,
+ (X86vpmaddwd_su VR256X:$src2, VR256X:$src3))),
+ (VPDPWSSDZ256r VR256X:$src1, VR256X:$src2, VR256X:$src3)>;
+ def : Pat<(v8i32 (add VR256X:$src1,
+ (X86vpmaddwd_su VR256X:$src2, (load addr:$src3)))),
+ (VPDPWSSDZ256m VR256X:$src1, VR256X:$src2, addr:$src3)>;
+ def : Pat<(v4i32 (add VR128X:$src1,
+ (X86vpmaddwd_su VR128X:$src2, VR128X:$src3))),
+ (VPDPWSSDZ128r VR128X:$src1, VR128X:$src2, VR128X:$src3)>;
+ def : Pat<(v4i32 (add VR128X:$src1,
+ (X86vpmaddwd_su VR128X:$src2, (load addr:$src3)))),
+ (VPDPWSSDZ128m VR128X:$src1, VR128X:$src2, addr:$src3)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Bit Algorithms
+//===----------------------------------------------------------------------===//
+
+// FIXME: Is there a better scheduler class for VPOPCNTB/VPOPCNTW?
+defm VPOPCNTB : avx512_unary_rm_vl<0x54, "vpopcntb", ctpop, SchedWriteVecALU,
+ avx512vl_i8_info, HasBITALG>;
+defm VPOPCNTW : avx512_unary_rm_vl<0x54, "vpopcntw", ctpop, SchedWriteVecALU,
+ avx512vl_i16_info, HasBITALG>, VEX_W;
+
+defm : avx512_unary_lowering<"VPOPCNTB", ctpop, avx512vl_i8_info, HasBITALG>;
+defm : avx512_unary_lowering<"VPOPCNTW", ctpop, avx512vl_i16_info, HasBITALG>;
+
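+// Only fold the mask into vpshufbitqmb when the unmasked node has a single
+// use.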
+def X86Vpshufbitqmb_su : PatFrag<(ops node:$src1, node:$src2),
+ (X86Vpshufbitqmb node:$src1, node:$src2), [{
+ return N->hasOneUse();
+}]>;
+
+multiclass VPSHUFBITQMB_rm<X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
+ defm rr : AVX512_maskable_cmp<0x8F, MRMSrcReg, VTI, (outs VTI.KRC:$dst),
+ (ins VTI.RC:$src1, VTI.RC:$src2),
+ "vpshufbitqmb",
+ "$src2, $src1", "$src1, $src2",
+ (X86Vpshufbitqmb (VTI.VT VTI.RC:$src1),
+ (VTI.VT VTI.RC:$src2)),
+ (X86Vpshufbitqmb_su (VTI.VT VTI.RC:$src1),
+ (VTI.VT VTI.RC:$src2))>, EVEX_4V, T8PD,
+ Sched<[sched]>;
+ defm rm : AVX512_maskable_cmp<0x8F, MRMSrcMem, VTI, (outs VTI.KRC:$dst),
+ (ins VTI.RC:$src1, VTI.MemOp:$src2),
+ "vpshufbitqmb",
+ "$src2, $src1", "$src1, $src2",
+ (X86Vpshufbitqmb (VTI.VT VTI.RC:$src1),
+ (VTI.VT (VTI.LdFrag addr:$src2))),
+ (X86Vpshufbitqmb_su (VTI.VT VTI.RC:$src1),
+ (VTI.VT (VTI.LdFrag addr:$src2)))>,
+ EVEX_4V, EVEX_CD8<8, CD8VF>, T8PD,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+multiclass VPSHUFBITQMB_common<X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> {
+ let Predicates = [HasBITALG] in
+ defm Z : VPSHUFBITQMB_rm<sched.ZMM, VTI.info512>, EVEX_V512;
+ let Predicates = [HasBITALG, HasVLX] in {
+ defm Z256 : VPSHUFBITQMB_rm<sched.YMM, VTI.info256>, EVEX_V256;
+ defm Z128 : VPSHUFBITQMB_rm<sched.XMM, VTI.info128>, EVEX_V128;
+ }
+}
+
+// FIXME: Is there a better scheduler class for VPSHUFBITQMB?
+defm VPSHUFBITQMB : VPSHUFBITQMB_common<SchedWriteVecIMul, avx512vl_i8_info>;
+
+//===----------------------------------------------------------------------===//
+// GFNI
+//===----------------------------------------------------------------------===//
+
+multiclass GF2P8MULB_avx512_common<bits<8> Op, string OpStr, SDNode OpNode,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasGFNI, HasAVX512, HasBWI] in
+ defm Z : avx512_binop_rm<Op, OpStr, OpNode, v64i8_info, sched.ZMM, 1>,
+ EVEX_V512;
+ let Predicates = [HasGFNI, HasVLX, HasBWI] in {
+ defm Z256 : avx512_binop_rm<Op, OpStr, OpNode, v32i8x_info, sched.YMM, 1>,
+ EVEX_V256;
+ defm Z128 : avx512_binop_rm<Op, OpStr, OpNode, v16i8x_info, sched.XMM, 1>,
+ EVEX_V128;
+ }
+}
+
+defm VGF2P8MULB : GF2P8MULB_avx512_common<0xCF, "vgf2p8mulb", X86GF2P8mulb,
+ SchedWriteVecALU>,
+ EVEX_CD8<8, CD8VF>, T8PD;
+
+multiclass GF2P8AFFINE_avx512_rmb_imm<bits<8> Op, string OpStr, SDNode OpNode,
+ X86FoldableSchedWrite sched, X86VectorVTInfo VTI,
+ X86VectorVTInfo BcstVTI>
+ : avx512_3Op_rm_imm8<Op, OpStr, OpNode, sched, VTI, VTI> {
+ let ExeDomain = VTI.ExeDomain in
+ defm rmbi : AVX512_maskable<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
+ (ins VTI.RC:$src1, VTI.ScalarMemOp:$src2, u8imm:$src3),
+ OpStr, "$src3, ${src2}"#BcstVTI.BroadcastStr#", $src1",
+ "$src1, ${src2}"#BcstVTI.BroadcastStr#", $src3",
+ (OpNode (VTI.VT VTI.RC:$src1),
+ (bitconvert (BcstVTI.VT (X86VBroadcastld64 addr:$src2))),
+ (i8 timm:$src3))>, EVEX_B,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+multiclass GF2P8AFFINE_avx512_common<bits<8> Op, string OpStr, SDNode OpNode,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasGFNI, HasAVX512, HasBWI] in
+ defm Z : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, sched.ZMM,
+ v64i8_info, v8i64_info>, EVEX_V512;
+ let Predicates = [HasGFNI, HasVLX, HasBWI] in {
+ defm Z256 : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, sched.YMM,
+ v32i8x_info, v4i64x_info>, EVEX_V256;
+ defm Z128 : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, sched.XMM,
+ v16i8x_info, v2i64x_info>, EVEX_V128;
+ }
+}
+
+defm VGF2P8AFFINEINVQB : GF2P8AFFINE_avx512_common<0xCF, "vgf2p8affineinvqb",
+ X86GF2P8affineinvqb, SchedWriteVecIMul>,
+ EVEX_4V, EVEX_CD8<8, CD8VF>, VEX_W, AVX512AIi8Base;
+defm VGF2P8AFFINEQB : GF2P8AFFINE_avx512_common<0xCE, "vgf2p8affineqb",
+ X86GF2P8affineqb, SchedWriteVecIMul>,
+ EVEX_4V, EVEX_CD8<8, CD8VF>, VEX_W, AVX512AIi8Base;
+
+
+//===----------------------------------------------------------------------===//
+// AVX5124FMAPS
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0, mayLoad = 1, ExeDomain = SSEPackedSingle,
+ Constraints = "$src1 = $dst", Uses = [MXCSR], mayRaiseFPException = 1 in {
+defm V4FMADDPSrm : AVX512_maskable_3src_in_asm<0x9A, MRMSrcMem, v16f32_info,
+ (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3),
+ "v4fmaddps", "$src3, $src2", "$src2, $src3",
+ []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>,
+ Sched<[SchedWriteFMA.ZMM.Folded]>;
+
+defm V4FNMADDPSrm : AVX512_maskable_3src_in_asm<0xAA, MRMSrcMem, v16f32_info,
+ (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3),
+ "v4fnmaddps", "$src3, $src2", "$src2, $src3",
+ []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>,
+ Sched<[SchedWriteFMA.ZMM.Folded]>;
+
+defm V4FMADDSSrm : AVX512_maskable_3src_in_asm<0x9B, MRMSrcMem, f32x_info,
+ (outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3),
+ "v4fmaddss", "$src3, $src2", "$src2, $src3",
+ []>, VEX_LIG, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>,
+ Sched<[SchedWriteFMA.Scl.Folded]>;
+
+defm V4FNMADDSSrm : AVX512_maskable_3src_in_asm<0xAB, MRMSrcMem, f32x_info,
+ (outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3),
+ "v4fnmaddss", "$src3, $src2", "$src2, $src3",
+ []>, VEX_LIG, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>,
+ Sched<[SchedWriteFMA.Scl.Folded]>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX5124VNNIW
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0, mayLoad = 1, ExeDomain = SSEPackedInt,
+ Constraints = "$src1 = $dst" in {
+defm VP4DPWSSDrm : AVX512_maskable_3src_in_asm<0x52, MRMSrcMem, v16i32_info,
+ (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3),
+ "vp4dpwssd", "$src3, $src2", "$src2, $src3",
+ []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>,
+ Sched<[SchedWriteFMA.ZMM.Folded]>;
+
+defm VP4DPWSSDSrm : AVX512_maskable_3src_in_asm<0x53, MRMSrcMem, v16i32_info,
+ (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3),
+ "vp4dpwssds", "$src3, $src2", "$src2, $src3",
+ []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>,
+ Sched<[SchedWriteFMA.ZMM.Folded]>;
+}
+
+let hasSideEffects = 0 in {
+ let mayStore = 1, SchedRW = [WriteFStoreX] in
+ def MASKPAIR16STORE : PseudoI<(outs), (ins anymem:$dst, VK16PAIR:$src), []>;
+ let mayLoad = 1, SchedRW = [WriteFLoadX] in
+ def MASKPAIR16LOAD : PseudoI<(outs VK16PAIR:$dst), (ins anymem:$src), []>;
+}
+
+//===----------------------------------------------------------------------===//
+// VP2INTERSECT
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_vp2intersect_modes<X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+ def rr : I<0x68, MRMSrcReg,
+ (outs _.KRPC:$dst),
+ (ins _.RC:$src1, _.RC:$src2),
+ !strconcat("vp2intersect", _.Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.KRPC:$dst, (X86vp2intersect
+ _.RC:$src1, (_.VT _.RC:$src2)))]>,
+ EVEX_4V, T8XD, Sched<[sched]>;
+
+ def rm : I<0x68, MRMSrcMem,
+ (outs _.KRPC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2),
+ !strconcat("vp2intersect", _.Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.KRPC:$dst, (X86vp2intersect
+ _.RC:$src1, (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>,
+ EVEX_4V, T8XD, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+
+ def rmb : I<0x68, MRMSrcMem,
+ (outs _.KRPC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2),
+ !strconcat("vp2intersect", _.Suffix, "\t{${src2}", _.BroadcastStr,
+ ", $src1, $dst|$dst, $src1, ${src2}", _.BroadcastStr ,"}"),
+ [(set _.KRPC:$dst, (X86vp2intersect
+ _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))]>,
+ EVEX_4V, T8XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+multiclass avx512_vp2intersect<X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512, HasVP2INTERSECT] in
+ defm Z : avx512_vp2intersect_modes<sched.ZMM, _.info512>, EVEX_V512;
+
+ let Predicates = [HasAVX512, HasVP2INTERSECT, HasVLX] in {
+ defm Z256 : avx512_vp2intersect_modes<sched.YMM, _.info256>, EVEX_V256;
+ defm Z128 : avx512_vp2intersect_modes<sched.XMM, _.info128>, EVEX_V128;
+ }
+}
+
+defm VP2INTERSECTD : avx512_vp2intersect<SchedWriteVecALU, avx512vl_i32_info>;
+defm VP2INTERSECTQ : avx512_vp2intersect<SchedWriteVecALU, avx512vl_i64_info>, VEX_W;
+
+multiclass avx512_binop_all2<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _SrcVTInfo,
+ AVX512VLVectorVTInfo _DstVTInfo,
+ SDNode OpNode, Predicate prd,
+ bit IsCommutable = 0> {
+ let Predicates = [prd] in
+ defm NAME#Z : avx512_binop_rm2<opc, OpcodeStr, sched.ZMM, OpNode,
+ _SrcVTInfo.info512, _DstVTInfo.info512,
+ _SrcVTInfo.info512, IsCommutable>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+ let Predicates = [HasVLX, prd] in {
+ defm NAME#Z256 : avx512_binop_rm2<opc, OpcodeStr, sched.YMM, OpNode,
+ _SrcVTInfo.info256, _DstVTInfo.info256,
+ _SrcVTInfo.info256, IsCommutable>,
+ EVEX_V256, EVEX_CD8<32, CD8VF>;
+ defm NAME#Z128 : avx512_binop_rm2<opc, OpcodeStr, sched.XMM, OpNode,
+ _SrcVTInfo.info128, _DstVTInfo.info128,
+ _SrcVTInfo.info128, IsCommutable>,
+ EVEX_V128, EVEX_CD8<32, CD8VF>;
+ }
+}
+
+let ExeDomain = SSEPackedSingle in
+defm VCVTNE2PS2BF16 : avx512_binop_all2<0x72, "vcvtne2ps2bf16",
+ SchedWriteCvtPD2PS, //FIXME: Should be SchedWriteCvtPS2BF
+ avx512vl_f32_info, avx512vl_i16_info,
+ X86cvtne2ps2bf16, HasBF16, 0>, T8XD;
+
+// Truncate Float to BFloat16
+multiclass avx512_cvtps2bf16<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched> {
+ let ExeDomain = SSEPackedSingle in {
+ let Predicates = [HasBF16], Uses = []<Register>, mayRaiseFPException = 0 in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i16x_info, v16f32_info,
+ X86cvtneps2bf16, X86cvtneps2bf16, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasBF16, HasVLX] in {
+ let Uses = []<Register>, mayRaiseFPException = 0 in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v8i16x_info, v4f32x_info,
+ null_frag, null_frag, sched.XMM, "{1to4}", "{x}", f128mem,
+ VK4WM>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i16x_info, v8f32x_info,
+ X86cvtneps2bf16, X86cvtneps2bf16,
+ sched.YMM, "{1to8}", "{y}">, EVEX_V256;
+ }
+ } // Predicates = [HasBF16, HasVLX]
+ } // ExeDomain = SSEPackedSingle
+
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst,
+ VR128X:$src), 0>;
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst,
+ f128mem:$src), 0, "intel">;
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst,
+ VR256X:$src), 0>;
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst,
+ f256mem:$src), 0, "intel">;
+}
+
+defm VCVTNEPS2BF16 : avx512_cvtps2bf16<0x72, "vcvtneps2bf16",
+ SchedWriteCvtPD2PS>, T8XS,
+ EVEX_CD8<32, CD8VF>;
+
+let Predicates = [HasBF16, HasVLX] in {
+ // Special patterns to allow use of X86mcvtneps2bf16 for masking. Instruction
+ // patterns have been disabled with null_frag.
+ def : Pat<(v8i16 (X86cvtneps2bf16 (v4f32 VR128X:$src))),
+ (VCVTNEPS2BF16Z128rr VR128X:$src)>;
+ def : Pat<(X86mcvtneps2bf16 (v4f32 VR128X:$src), (v8i16 VR128X:$src0),
+ VK4WM:$mask),
+ (VCVTNEPS2BF16Z128rrk VR128X:$src0, VK4WM:$mask, VR128X:$src)>;
+ def : Pat<(X86mcvtneps2bf16 (v4f32 VR128X:$src), v8i16x_info.ImmAllZerosV,
+ VK4WM:$mask),
+ (VCVTNEPS2BF16Z128rrkz VK4WM:$mask, VR128X:$src)>;
+
+ def : Pat<(v8i16 (X86cvtneps2bf16 (loadv4f32 addr:$src))),
+ (VCVTNEPS2BF16Z128rm addr:$src)>;
+ def : Pat<(X86mcvtneps2bf16 (loadv4f32 addr:$src), (v8i16 VR128X:$src0),
+ VK4WM:$mask),
+ (VCVTNEPS2BF16Z128rmk VR128X:$src0, VK4WM:$mask, addr:$src)>;
+ def : Pat<(X86mcvtneps2bf16 (loadv4f32 addr:$src), v8i16x_info.ImmAllZerosV,
+ VK4WM:$mask),
+ (VCVTNEPS2BF16Z128rmkz VK4WM:$mask, addr:$src)>;
+
+ def : Pat<(v8i16 (X86cvtneps2bf16 (v4f32
+ (X86VBroadcastld32 addr:$src)))),
+ (VCVTNEPS2BF16Z128rmb addr:$src)>;
+ def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcastld32 addr:$src)),
+ (v8i16 VR128X:$src0), VK4WM:$mask),
+ (VCVTNEPS2BF16Z128rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>;
+ def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcastld32 addr:$src)),
+ v8i16x_info.ImmAllZerosV, VK4WM:$mask),
+ (VCVTNEPS2BF16Z128rmbkz VK4WM:$mask, addr:$src)>;
+}
+
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, X86VectorVTInfo src_v> {
+ defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins src_v.RC:$src2, src_v.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src1, src_v.RC:$src2, src_v.RC:$src3))>,
+ EVEX_4V, Sched<[sched]>;
+
+ defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins src_v.RC:$src2, src_v.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src1, src_v.RC:$src2,
+ (src_v.LdFrag addr:$src3)))>, EVEX_4V,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+
+ defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins src_v.RC:$src2, src_v.ScalarMemOp:$src3),
+ OpcodeStr,
+ !strconcat("${src3}", _.BroadcastStr,", $src2"),
+ !strconcat("$src2, ${src3}", _.BroadcastStr),
+ (_.VT (OpNode _.RC:$src1, src_v.RC:$src2,
+ (src_v.VT (src_v.BroadcastLdFrag addr:$src3))))>,
+ EVEX_B, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+
+}
+} // Constraints = "$src1 = $dst"
+
+multiclass avx512_dpbf16ps_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo _,
+ AVX512VLVectorVTInfo src_v, Predicate prd> {
+ let Predicates = [prd] in {
+ defm Z : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, sched.ZMM, _.info512,
+ src_v.info512>, EVEX_V512;
+ }
+ let Predicates = [HasVLX, prd] in {
+ defm Z256 : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, sched.YMM, _.info256,
+ src_v.info256>, EVEX_V256;
+ defm Z128 : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, sched.XMM, _.info128,
+ src_v.info128>, EVEX_V128;
+ }
+}
+
+let ExeDomain = SSEPackedSingle in
+defm VDPBF16PS : avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps, SchedWriteFMA,
+ avx512vl_f32_info, avx512vl_i32_info,
+ HasBF16>, T8XS, EVEX_CD8<32, CD8VF>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td
new file mode 100644
index 000000000000..e83e1e74ff52
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -0,0 +1,1545 @@
+//===-- X86InstrArithmetic.td - Integer Arithmetic Instrs --*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the integer arithmetic instructions in the X86
+// architecture.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// LEA - Load Effective Address
+let SchedRW = [WriteLEA] in {
+let hasSideEffects = 0 in
+def LEA16r : I<0x8D, MRMSrcMem,
+ (outs GR16:$dst), (ins anymem:$src),
+ "lea{w}\t{$src|$dst}, {$dst|$src}", []>, OpSize16;
+let isReMaterializable = 1 in
+def LEA32r : I<0x8D, MRMSrcMem,
+ (outs GR32:$dst), (ins anymem:$src),
+ "lea{l}\t{$src|$dst}, {$dst|$src}",
+ [(set GR32:$dst, lea32addr:$src)]>,
+ OpSize32, Requires<[Not64BitMode]>;
+
+def LEA64_32r : I<0x8D, MRMSrcMem,
+ (outs GR32:$dst), (ins lea64_32mem:$src),
+ "lea{l}\t{$src|$dst}, {$dst|$src}",
+ [(set GR32:$dst, lea64_32addr:$src)]>,
+ OpSize32, Requires<[In64BitMode]>;
+
+let isReMaterializable = 1 in
+def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src),
+ "lea{q}\t{$src|$dst}, {$dst|$src}",
+ [(set GR64:$dst, lea64addr:$src)]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Fixed-Register Multiplication and Division Instructions.
+//
+
+// SchedModel info for instructions that load one value and get the second
+// (and possibly third) value from a register.
+// This is used for instructions that put the memory operands before other
+// uses.
+class SchedLoadReg<X86FoldableSchedWrite Sched> : Sched<[Sched.Folded,
+ // Memory operand.
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ // Register reads (implicit or explicit).
+ Sched.ReadAfterFold, Sched.ReadAfterFold]>;
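+
+// For illustration (a hypothetical expansion, not literal upstream text):
+// applied to MUL16m below, SchedLoadReg<WriteIMul16> resolves to
+//   Sched<[WriteIMul16.Folded,
+//          ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+//          WriteIMul16.ReadAfterFold, WriteIMul16.ReadAfterFold]>
+// i.e. the folded-load write, default reads for the five memory-address
+// operands, and ReadAfterFold for the register reads (the implicit AX use).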
+
+// Extra precision multiplication
+
+// AL is really implied by AX, but the registers in Defs must match the
+// SDNode results (i8, i32).
+// AL,AH = AL*GR8
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def MUL8r : I<0xF6, MRM4r, (outs), (ins GR8:$src), "mul{b}\t$src",
+              // FIXME: Used for 8-bit mul, ignore the upper 8 bits of the result.
+ // This probably ought to be moved to a def : Pat<> if the
+ // syntax can be accepted.
+ [(set AL, (mul AL, GR8:$src)),
+ (implicit EFLAGS)]>, Sched<[WriteIMul8]>;
+// AX,DX = AX*GR16
+let Defs = [AX,DX,EFLAGS], Uses = [AX], hasSideEffects = 0 in
+def MUL16r : I<0xF7, MRM4r, (outs), (ins GR16:$src),
+ "mul{w}\t$src",
+ []>, OpSize16, Sched<[WriteIMul16]>;
+// EAX,EDX = EAX*GR32
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], hasSideEffects = 0 in
+def MUL32r : I<0xF7, MRM4r, (outs), (ins GR32:$src),
+ "mul{l}\t$src",
+ [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/]>,
+ OpSize32, Sched<[WriteIMul32]>;
+// RAX,RDX = RAX*GR64
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], hasSideEffects = 0 in
+def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src),
+ "mul{q}\t$src",
+ [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/]>,
+ Sched<[WriteIMul64]>;
+// AL,AH = AL*[mem8]
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def MUL8m : I<0xF6, MRM4m, (outs), (ins i8mem :$src),
+ "mul{b}\t$src",
+                  // FIXME: Used for 8-bit mul, ignore the upper 8 bits of the result.
+ // This probably ought to be moved to a def : Pat<> if the
+ // syntax can be accepted.
+ [(set AL, (mul AL, (loadi8 addr:$src))),
+ (implicit EFLAGS)]>, SchedLoadReg<WriteIMul8>;
+// AX,DX = AX*[mem16]
+let mayLoad = 1, hasSideEffects = 0 in {
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src),
+ "mul{w}\t$src", []>, OpSize16, SchedLoadReg<WriteIMul16>;
+// EAX,EDX = EAX*[mem32]
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src),
+ "mul{l}\t$src", []>, OpSize32, SchedLoadReg<WriteIMul32>;
+// RAX,RDX = RAX*[mem64]
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src),
+ "mul{q}\t$src", []>, SchedLoadReg<WriteIMul64>,
+ Requires<[In64BitMode]>;
+}
+
+let hasSideEffects = 0 in {
+// AL,AH = AL*GR8
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def IMUL8r : I<0xF6, MRM5r, (outs), (ins GR8:$src), "imul{b}\t$src", []>,
+ Sched<[WriteIMul8]>;
+// AX,DX = AX*GR16
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16r : I<0xF7, MRM5r, (outs), (ins GR16:$src), "imul{w}\t$src", []>,
+ OpSize16, Sched<[WriteIMul16]>;
+// EAX,EDX = EAX*GR32
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def IMUL32r : I<0xF7, MRM5r, (outs), (ins GR32:$src), "imul{l}\t$src", []>,
+ OpSize32, Sched<[WriteIMul32]>;
+// RAX,RDX = RAX*GR64
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src), "imul{q}\t$src", []>,
+ Sched<[WriteIMul64]>;
+
+let mayLoad = 1 in {
+// AL,AH = AL*[mem8]
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def IMUL8m : I<0xF6, MRM5m, (outs), (ins i8mem :$src),
+ "imul{b}\t$src", []>, SchedLoadReg<WriteIMul8>;
+// AX,DX = AX*[mem16]
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16m : I<0xF7, MRM5m, (outs), (ins i16mem:$src),
+ "imul{w}\t$src", []>, OpSize16, SchedLoadReg<WriteIMul16>;
+// EAX,EDX = EAX*[mem32]
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src),
+ "imul{l}\t$src", []>, OpSize32, SchedLoadReg<WriteIMul32>;
+// RAX,RDX = RAX*[mem64]
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src),
+ "imul{q}\t$src", []>, SchedLoadReg<WriteIMul64>,
+ Requires<[In64BitMode]>;
+}
+} // hasSideEffects
+
+
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst" in {
+
+let isCommutable = 1 in {
+// X = IMUL Y, Z --> X = IMUL Z, Y
+// Register-Register Signed Integer Multiply
+def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2),
+ "imul{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag GR16:$src1, GR16:$src2))]>,
+ Sched<[WriteIMul16Reg]>, TB, OpSize16;
+def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2),
+ "imul{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag GR32:$src1, GR32:$src2))]>,
+ Sched<[WriteIMul32Reg]>, TB, OpSize32;
+def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "imul{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag GR64:$src1, GR64:$src2))]>,
+ Sched<[WriteIMul64Reg]>, TB;
+} // isCommutable
+
+// Register-Memory Signed Integer Multiply
+def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$src1, i16mem:$src2),
+ "imul{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag GR16:$src1, (loadi16 addr:$src2)))]>,
+ Sched<[WriteIMul16Reg.Folded, WriteIMul16Reg.ReadAfterFold]>, TB, OpSize16;
+def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "imul{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag GR32:$src1, (loadi32 addr:$src2)))]>,
+ Sched<[WriteIMul32Reg.Folded, WriteIMul32Reg.ReadAfterFold]>, TB, OpSize32;
+def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$src1, i64mem:$src2),
+ "imul{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag GR64:$src1, (loadi64 addr:$src2)))]>,
+                 Sched<[WriteIMul64Reg.Folded, WriteIMul64Reg.ReadAfterFold]>, TB;
+} // Constraints = "$src1 = $dst"
+
+} // Defs = [EFLAGS]
+
+// Surprisingly enough, these are not two address instructions!
+let Defs = [EFLAGS] in {
+// NOTE: These are order specific, we want the ri8 forms to be listed
+// first so that they are slightly preferred to the ri forms.
+
+// Register-Integer Signed Integer Multiply
+def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8
+ (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag GR16:$src1, i16immSExt8:$src2))]>,
+ Sched<[WriteIMul16Imm]>, OpSize16;
+def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16
+ (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag GR16:$src1, imm:$src2))]>,
+ Sched<[WriteIMul16Imm]>, OpSize16;
+def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32
+ (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag GR32:$src1, imm:$src2))]>,
+ Sched<[WriteIMul32Imm]>, OpSize32;
+def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8
+ (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag GR32:$src1, i32immSExt8:$src2))]>,
+ Sched<[WriteIMul32Imm]>, OpSize32;
+def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8
+ (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag GR64:$src1, i64immSExt8:$src2))]>,
+ Sched<[WriteIMul64Imm]>;
+def IMUL64rri32 : RIi32S<0x69, MRMSrcReg, // GR64 = GR64*I32
+ (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag GR64:$src1, i64immSExt32:$src2))]>,
+ Sched<[WriteIMul64Imm]>;
+
+// Memory-Integer Signed Integer Multiply
+def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8
+ (outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag (loadi16 addr:$src1),
+ i16immSExt8:$src2))]>,
+ Sched<[WriteIMul16Imm.Folded]>, OpSize16;
+def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16
+ (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag (loadi16 addr:$src1), imm:$src2))]>,
+ Sched<[WriteIMul16Imm.Folded]>, OpSize16;
+def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8
+ (outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag (loadi32 addr:$src1),
+ i32immSExt8:$src2))]>,
+ Sched<[WriteIMul32Imm.Folded]>, OpSize32;
+def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32
+ (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag (loadi32 addr:$src1), imm:$src2))]>,
+ Sched<[WriteIMul32Imm.Folded]>, OpSize32;
+def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8
+ (outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag (loadi64 addr:$src1),
+ i64immSExt8:$src2))]>,
+ Sched<[WriteIMul64Imm.Folded]>;
+def IMUL64rmi32 : RIi32S<0x69, MRMSrcMem, // GR64 = [mem64]*I32
+ (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag (loadi64 addr:$src1),
+ i64immSExt32:$src2))]>,
+ Sched<[WriteIMul64Imm.Folded]>;
+} // Defs = [EFLAGS]
+
+// Unsigned division/remainder.
+let hasSideEffects = 1 in { // so that we don't speculatively execute
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
+ "div{b}\t$src", []>, Sched<[WriteDiv8]>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16r : I<0xF7, MRM6r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX
+ "div{w}\t$src", []>, Sched<[WriteDiv16]>, OpSize16;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def DIV32r : I<0xF7, MRM6r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX
+ "div{l}\t$src", []>, Sched<[WriteDiv32]>, OpSize32;
+// RDX:RAX/r64 = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src),
+ "div{q}\t$src", []>, Sched<[WriteDiv64]>;
+
+let mayLoad = 1 in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def DIV8m : I<0xF6, MRM6m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
+ "div{b}\t$src", []>, SchedLoadReg<WriteDiv8>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16m : I<0xF7, MRM6m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
+ "div{w}\t$src", []>, OpSize16, SchedLoadReg<WriteDiv16>;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX
+def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src),
+ "div{l}\t$src", []>, SchedLoadReg<WriteDiv32>, OpSize32;
+// RDX:RAX/[mem64] = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src),
+ "div{q}\t$src", []>, SchedLoadReg<WriteDiv64>,
+ Requires<[In64BitMode]>;
+}
+
+// Signed division/remainder.
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def IDIV8r : I<0xF6, MRM7r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
+ "idiv{b}\t$src", []>, Sched<[WriteIDiv8]>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16r: I<0xF7, MRM7r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX
+ "idiv{w}\t$src", []>, Sched<[WriteIDiv16]>, OpSize16;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def IDIV32r: I<0xF7, MRM7r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX
+ "idiv{l}\t$src", []>, Sched<[WriteIDiv32]>, OpSize32;
+// RDX:RAX/r64 = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src),
+ "idiv{q}\t$src", []>, Sched<[WriteIDiv64]>;
+
+let mayLoad = 1 in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
+ "idiv{b}\t$src", []>, SchedLoadReg<WriteIDiv8>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16m: I<0xF7, MRM7m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
+ "idiv{w}\t$src", []>, OpSize16, SchedLoadReg<WriteIDiv16>;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX
+def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src),
+ "idiv{l}\t$src", []>, OpSize32, SchedLoadReg<WriteIDiv32>;
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX
+def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src),
+ "idiv{q}\t$src", []>, SchedLoadReg<WriteIDiv64>,
+ Requires<[In64BitMode]>;
+}
+} // hasSideEffects = 1
+
+//===----------------------------------------------------------------------===//
+// Two address Instructions.
+//
+
+// unary instructions
+let CodeSize = 2 in {
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+def NEG8r : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "neg{b}\t$dst",
+ [(set GR8:$dst, (ineg GR8:$src1)),
+ (implicit EFLAGS)]>;
+def NEG16r : I<0xF7, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+ "neg{w}\t$dst",
+ [(set GR16:$dst, (ineg GR16:$src1)),
+ (implicit EFLAGS)]>, OpSize16;
+def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+ "neg{l}\t$dst",
+ [(set GR32:$dst, (ineg GR32:$src1)),
+ (implicit EFLAGS)]>, OpSize32;
+def NEG64r : RI<0xF7, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "neg{q}\t$dst",
+ [(set GR64:$dst, (ineg GR64:$src1)),
+ (implicit EFLAGS)]>;
+} // Constraints = "$src1 = $dst", SchedRW
+
+// Read-modify-write negate.
+let SchedRW = [WriteALURMW] in {
+def NEG8m : I<0xF6, MRM3m, (outs), (ins i8mem :$dst),
+ "neg{b}\t$dst",
+ [(store (ineg (loadi8 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)]>;
+def NEG16m : I<0xF7, MRM3m, (outs), (ins i16mem:$dst),
+ "neg{w}\t$dst",
+ [(store (ineg (loadi16 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)]>, OpSize16;
+def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst),
+ "neg{l}\t$dst",
+ [(store (ineg (loadi32 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)]>, OpSize32;
+def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst",
+ [(store (ineg (loadi64 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)]>,
+ Requires<[In64BitMode]>;
+} // SchedRW
+} // Defs = [EFLAGS]
+
+
+// Note: NOT does not set EFLAGS!
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+def NOT8r : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "not{b}\t$dst",
+ [(set GR8:$dst, (not GR8:$src1))]>;
+def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+ "not{w}\t$dst",
+ [(set GR16:$dst, (not GR16:$src1))]>, OpSize16;
+def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+ "not{l}\t$dst",
+ [(set GR32:$dst, (not GR32:$src1))]>, OpSize32;
+def NOT64r : RI<0xF7, MRM2r, (outs GR64:$dst), (ins GR64:$src1), "not{q}\t$dst",
+ [(set GR64:$dst, (not GR64:$src1))]>;
+} // Constraints = "$src1 = $dst", SchedRW
+
+let SchedRW = [WriteALURMW] in {
+def NOT8m : I<0xF6, MRM2m, (outs), (ins i8mem :$dst),
+ "not{b}\t$dst",
+ [(store (not (loadi8 addr:$dst)), addr:$dst)]>;
+def NOT16m : I<0xF7, MRM2m, (outs), (ins i16mem:$dst),
+ "not{w}\t$dst",
+ [(store (not (loadi16 addr:$dst)), addr:$dst)]>,
+ OpSize16;
+def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst),
+ "not{l}\t$dst",
+ [(store (not (loadi32 addr:$dst)), addr:$dst)]>,
+ OpSize32;
+def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst",
+ [(store (not (loadi64 addr:$dst)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
+} // SchedRW
+} // CodeSize
+
+def X86add_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs),
+ (X86add_flag node:$lhs, node:$rhs), [{
+ return hasNoCarryFlagUses(SDValue(N, 1));
+}]>;
+
+def X86sub_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs),
+ (X86sub_flag node:$lhs, node:$rhs), [{
+ // Only use DEC if the result is used.
+ return !SDValue(N, 0).use_empty() && hasNoCarryFlagUses(SDValue(N, 1));
+}]>;
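+
+// For illustration: an (X86sub_flag GR32:$x, 1) whose integer result is dead
+// (only EFLAGS is read) is deliberately not matched by this fragment, so that
+// a plain compare can be selected instead of DEC; likewise any user of the
+// carry flag blocks both fragments, because INC/DEC do not update CF.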
+
+// TODO: inc/dec is slow for P4, but fast for Pentium-M.
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA.
+def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "inc{b}\t$dst",
+ [(set GR8:$dst, EFLAGS, (X86add_flag_nocf GR8:$src1, 1))]>;
+def INC16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+ "inc{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86add_flag_nocf GR16:$src1, 1))]>,
+ OpSize16;
+def INC32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+ "inc{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86add_flag_nocf GR32:$src1, 1))]>,
+ OpSize32;
+def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "inc{q}\t$dst",
+ [(set GR64:$dst, EFLAGS, (X86add_flag_nocf GR64:$src1, 1))]>;
+} // isConvertibleToThreeAddress = 1, CodeSize = 2
+
+// Short forms only valid in 32-bit mode. Selected during MCInst lowering.
+let CodeSize = 1, hasSideEffects = 0 in {
+def INC16r_alt : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
+ "inc{w}\t$dst", []>,
+ OpSize16, Requires<[Not64BitMode]>;
+def INC32r_alt : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
+ "inc{l}\t$dst", []>,
+ OpSize32, Requires<[Not64BitMode]>;
+} // CodeSize = 1, hasSideEffects = 0
+} // Constraints = "$src1 = $dst", SchedRW
+
+let CodeSize = 2, SchedRW = [WriteALURMW] in {
+let Predicates = [UseIncDec] in {
+ def INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst",
+ [(store (add (loadi8 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)]>;
+ def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
+ [(store (add (loadi16 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)]>, OpSize16;
+ def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
+ [(store (add (loadi32 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)]>, OpSize32;
+} // Predicates
+let Predicates = [UseIncDec, In64BitMode] in {
+ def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst",
+ [(store (add (loadi64 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)]>;
+} // Predicates
+} // CodeSize = 2, SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA.
+def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "dec{b}\t$dst",
+ [(set GR8:$dst, EFLAGS, (X86sub_flag_nocf GR8:$src1, 1))]>;
+def DEC16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+ "dec{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86sub_flag_nocf GR16:$src1, 1))]>,
+ OpSize16;
+def DEC32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+ "dec{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86sub_flag_nocf GR32:$src1, 1))]>,
+ OpSize32;
+def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "dec{q}\t$dst",
+ [(set GR64:$dst, EFLAGS, (X86sub_flag_nocf GR64:$src1, 1))]>;
+} // isConvertibleToThreeAddress = 1, CodeSize = 2
+
+// Short forms only valid in 32-bit mode. Selected during MCInst lowering.
+let CodeSize = 1, hasSideEffects = 0 in {
+def DEC16r_alt : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
+ "dec{w}\t$dst", []>,
+ OpSize16, Requires<[Not64BitMode]>;
+def DEC32r_alt : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
+ "dec{l}\t$dst", []>,
+ OpSize32, Requires<[Not64BitMode]>;
+} // CodeSize = 1, hasSideEffects = 0
+} // Constraints = "$src1 = $dst", SchedRW
+
+
+let CodeSize = 2, SchedRW = [WriteALURMW] in {
+let Predicates = [UseIncDec] in {
+ def DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst",
+ [(store (add (loadi8 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)]>;
+ def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
+ [(store (add (loadi16 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)]>, OpSize16;
+ def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
+ [(store (add (loadi32 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)]>, OpSize32;
+} // Predicates
+let Predicates = [UseIncDec, In64BitMode] in {
+ def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+ [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)]>;
+} // Predicates
+} // CodeSize = 2, SchedRW
+} // Defs = [EFLAGS]
+
+/// X86TypeInfo - This is a bundle of information describing the relevant X86
+/// properties of a value type. For example, it can tell you which register
+/// class and preferred load node to use.
+class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass,
+ PatFrag loadnode, X86MemOperand memoperand, ImmType immkind,
+ Operand immoperand, SDPatternOperator immoperator,
+ Operand imm8operand, SDPatternOperator imm8operator,
+ bit hasOddOpcode, OperandSize opSize,
+ bit hasREX_WPrefix> {
+ /// VT - This is the value type itself.
+ ValueType VT = vt;
+
+ /// InstrSuffix - This is the suffix used on instructions with this type. For
+ /// example, i8 -> "b", i16 -> "w", i32 -> "l", i64 -> "q".
+ string InstrSuffix = instrsuffix;
+
+ /// RegClass - This is the register class associated with this type. For
+ /// example, i8 -> GR8, i16 -> GR16, i32 -> GR32, i64 -> GR64.
+ RegisterClass RegClass = regclass;
+
+ /// LoadNode - This is the load node associated with this type. For
+ /// example, i8 -> loadi8, i16 -> loadi16, i32 -> loadi32, i64 -> loadi64.
+ PatFrag LoadNode = loadnode;
+
+ /// MemOperand - This is the memory operand associated with this type. For
+ /// example, i8 -> i8mem, i16 -> i16mem, i32 -> i32mem, i64 -> i64mem.
+ X86MemOperand MemOperand = memoperand;
+
+ /// ImmEncoding - This is the encoding of an immediate of this type. For
+ /// example, i8 -> Imm8, i16 -> Imm16, i32 -> Imm32. Note that i64 -> Imm32
+  /// since the immediate field of i64 instructions is a 32-bit sign-extended
+  /// value.
+ ImmType ImmEncoding = immkind;
+
+ /// ImmOperand - This is the operand kind of an immediate of this type. For
+ /// example, i8 -> i8imm, i16 -> i16imm, i32 -> i32imm. Note that i64 ->
+  /// i64i32imm since the immediate field of i64 instructions is a 32-bit
+  /// sign-extended value.
+ Operand ImmOperand = immoperand;
+
+ /// ImmOperator - This is the operator that should be used to match an
+ /// immediate of this kind in a pattern (e.g. imm, or i64immSExt32).
+ SDPatternOperator ImmOperator = immoperator;
+
+ /// Imm8Operand - This is the operand kind to use for an imm8 of this type.
+ /// For example, i8 -> <invalid>, i16 -> i16i8imm, i32 -> i32i8imm. This is
+ /// only used for instructions that have a sign-extended imm8 field form.
+ Operand Imm8Operand = imm8operand;
+
+ /// Imm8Operator - This is the operator that should be used to match an 8-bit
+  /// sign-extended immediate of this kind in a pattern (e.g. i16immSExt8).
+ SDPatternOperator Imm8Operator = imm8operator;
+
+ /// HasOddOpcode - This bit is true if the instruction should have an odd (as
+ /// opposed to even) opcode. Operations on i8 are usually even, operations on
+ /// other datatypes are odd.
+ bit HasOddOpcode = hasOddOpcode;
+
+ /// OpSize - Selects whether the instruction needs a 0x66 prefix based on
+ /// 16-bit vs 32-bit mode. i8/i64 set this to OpSizeFixed. i16 sets this
+  /// to OpSize16. i32 sets this to OpSize32.
+ OperandSize OpSize = opSize;
+
+ /// HasREX_WPrefix - This bit is set to true if the instruction should have
+  /// the REX.W prefix. This is set for i64 types.
+ bit HasREX_WPrefix = hasREX_WPrefix;
+}
+
+def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">;
+
+
+def Xi8 : X86TypeInfo<i8, "b", GR8, loadi8, i8mem,
+ Imm8, i8imm, imm_su, i8imm, invalid_node,
+ 0, OpSizeFixed, 0>;
+def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem,
+ Imm16, i16imm, imm_su, i16i8imm, i16immSExt8_su,
+ 1, OpSize16, 0>;
+def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem,
+ Imm32, i32imm, imm_su, i32i8imm, i32immSExt8_su,
+ 1, OpSize32, 0>;
+def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem,
+ Imm32S, i64i32imm, i64immSExt32_su, i64i8imm, i64immSExt8_su,
+ 1, OpSizeFixed, 1>;
+
+/// ITy - This instruction base class takes the type info for the instruction.
+/// Using this, it:
+/// 1. Concatenates together the instruction mnemonic with the appropriate
+/// suffix letter, a tab, and the arguments.
+/// 2. Infers whether the instruction should have a 0x66 prefix byte.
+///    3. Infers whether the instruction should have a REX.W prefix.
+/// 4. Infers whether the low bit of the opcode should be 0 (for i8 operations)
+/// or 1 (for i16,i32,i64 operations).
+class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins,
+ string mnemonic, string args, list<dag> pattern>
+ : I<{opcode{7}, opcode{6}, opcode{5}, opcode{4},
+ opcode{3}, opcode{2}, opcode{1}, typeinfo.HasOddOpcode },
+ f, outs, ins,
+ !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern> {
+
+ // Infer instruction prefixes from type info.
+ let OpSize = typeinfo.OpSize;
+ let hasREX_WPrefix = typeinfo.HasREX_WPrefix;
+}
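+
+// For illustration (hypothetical values, not literal upstream text): an ITy
+// definition using opcode 0x00 keeps opcode 0x00 for Xi8 (HasOddOpcode = 0)
+// but becomes opcode 0x01 for Xi16/Xi32/Xi64 (HasOddOpcode = 1); Xi16 also
+// requests the 0x66 operand-size prefix via OpSize16, and Xi64 the REX.W
+// prefix.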
+
+// BinOpRR - Instructions like "add reg, reg, reg".
+class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ dag outlist, X86FoldableSchedWrite sched, list<dag> pattern>
+ : ITy<opcode, MRMDestReg, typeinfo, outlist,
+ (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern>,
+ Sched<[sched]>;
+
+// BinOpRR_F - Instructions like "cmp reg, reg", where the pattern has
+// just EFLAGS as a result.
+class BinOpRR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode>
+ : BinOpRR<opcode, mnemonic, typeinfo, (outs), WriteALU,
+ [(set EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))]>;
+
+// BinOpRR_RF - Instructions like "add reg, reg, reg", where the pattern has
+// both a regclass and EFLAGS as a result.
+class BinOpRR_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteALU,
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))]>;
+
+// BinOpRR_RFF - Instructions like "adc reg, reg, reg", where the pattern has
+// both a regclass and EFLAGS as a result, and has EFLAGS as input.
+class BinOpRR_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteADC,
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2,
+ EFLAGS))]>;
+
+// BinOpRR_Rev - Instructions like "add reg, reg, reg" (reversed encoding).
+class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ X86FoldableSchedWrite sched = WriteALU>
+ : ITy<opcode, MRMSrcReg, typeinfo,
+ (outs typeinfo.RegClass:$dst),
+ (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+ mnemonic, "{$src2, $dst|$dst, $src2}", []>,
+ Sched<[sched]> {
+ // The disassembler should know about this, but not the asmparser.
+ let isCodeGenOnly = 1;
+ let ForceDisassemble = 1;
+ let hasSideEffects = 0;
+}
+
+// BinOpRR_RFF_Rev - Instructions like "adc reg, reg, reg" (reversed encoding).
+class BinOpRR_RFF_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+ : BinOpRR_Rev<opcode, mnemonic, typeinfo, WriteADC>;
+
+// BinOpRR_F_Rev - Instructions like "cmp reg, reg" (reversed encoding).
+class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+ : ITy<opcode, MRMSrcReg, typeinfo, (outs),
+ (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+ mnemonic, "{$src2, $src1|$src1, $src2}", []>,
+ Sched<[WriteALU]> {
+ // The disassembler should know about this, but not the asmparser.
+ let isCodeGenOnly = 1;
+ let ForceDisassemble = 1;
+ let hasSideEffects = 0;
+}
+
+// BinOpRM - Instructions like "add reg, reg, [mem]".
+class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ dag outlist, X86FoldableSchedWrite sched, list<dag> pattern>
+ : ITy<opcode, MRMSrcMem, typeinfo, outlist,
+ (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2),
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+
+// BinOpRM_F - Instructions like "cmp reg, [mem]".
+class BinOpRM_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpRM<opcode, mnemonic, typeinfo, (outs), WriteALU,
+ [(set EFLAGS,
+ (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
+
+// BinOpRM_RF - Instructions like "add reg, reg, [mem]".
+class BinOpRM_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteALU,
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
+
+// BinOpRM_RFF - Instructions like "adc reg, reg, [mem]".
+class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteADC,
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2),
+ EFLAGS))]>;
+
+// BinOpRI - Instructions like "add reg, reg, imm".
+class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Format f, dag outlist, X86FoldableSchedWrite sched, list<dag> pattern>
+ : ITy<opcode, f, typeinfo, outlist,
+ (ins typeinfo.RegClass:$src1, typeinfo.ImmOperand:$src2),
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern>,
+ Sched<[sched]> {
+ let ImmT = typeinfo.ImmEncoding;
+}
+
+// BinOpRI_F - Instructions like "cmp reg, imm".
+class BinOpRI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpRI<opcode, mnemonic, typeinfo, f, (outs), WriteALU,
+ [(set EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
+
+// BinOpRI_RF - Instructions like "add reg, reg, imm".
+class BinOpRI_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteALU,
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
+// BinOpRI_RFF - Instructions like "adc reg, reg, imm".
+class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteADC,
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2,
+ EFLAGS))]>;
+
+// BinOpRI8 - Instructions like "add reg, reg, imm8".
+class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Format f, dag outlist, X86FoldableSchedWrite sched, list<dag> pattern>
+ : ITy<opcode, f, typeinfo, outlist,
+ (ins typeinfo.RegClass:$src1, typeinfo.Imm8Operand:$src2),
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern>,
+ Sched<[sched]> {
+ let ImmT = Imm8; // Always 8-bit immediate.
+}
+
+// BinOpRI8_F - Instructions like "cmp reg, imm8".
+class BinOpRI8_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs), WriteALU,
+ [(set EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
+
+// BinOpRI8_RF - Instructions like "add reg, reg, imm8".
+class BinOpRI8_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteALU,
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
+
+// BinOpRI8_RFF - Instructions like "adc reg, reg, imm8".
+class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteADC,
+ [(set typeinfo.RegClass:$dst, EFLAGS,
+ (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2,
+ EFLAGS))]>;
+
+// BinOpMR - Instructions like "add [mem], reg".
+class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ list<dag> pattern>
+ : ITy<opcode, MRMDestMem, typeinfo,
+ (outs), (ins typeinfo.MemOperand:$dst, typeinfo.RegClass:$src),
+ mnemonic, "{$src, $dst|$dst, $src}", pattern>;
+
+// BinOpMR_RMW - Instructions like "add [mem], reg".
+class BinOpMR_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpMR<opcode, mnemonic, typeinfo,
+ [(store (opnode (load addr:$dst), typeinfo.RegClass:$src), addr:$dst),
+ (implicit EFLAGS)]>, Sched<[WriteALURMW]>;
+
+// BinOpMR_RMW_FF - Instructions like "adc [mem], reg".
+class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode>
+ : BinOpMR<opcode, mnemonic, typeinfo,
+ [(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS),
+ addr:$dst),
+ (implicit EFLAGS)]>, Sched<[WriteADCRMW]>;
+
+// BinOpMR_F - Instructions like "cmp [mem], reg".
+class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode>
+ : BinOpMR<opcode, mnemonic, typeinfo,
+ [(set EFLAGS, (opnode (typeinfo.LoadNode addr:$dst),
+ typeinfo.RegClass:$src))]>,
+ Sched<[WriteALU.Folded, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault, ReadDefault, WriteALU.ReadAfterFold]>;
+
+// BinOpMI - Instructions like "add [mem], imm".
+class BinOpMI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Format f, list<dag> pattern>
+ : ITy<opcode, f, typeinfo,
+ (outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src),
+ mnemonic, "{$src, $dst|$dst, $src}", pattern> {
+ let ImmT = typeinfo.ImmEncoding;
+}
+
+// BinOpMI_RMW - Instructions like "add [mem], imm".
+class BinOpMI_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpMI<opcode, mnemonic, typeinfo, f,
+ [(store (opnode (typeinfo.VT (load addr:$dst)),
+ typeinfo.ImmOperator:$src), addr:$dst),
+ (implicit EFLAGS)]>, Sched<[WriteALURMW]>;
+// BinOpMI_RMW_FF - Instructions like "adc [mem], imm".
+class BinOpMI_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDNode opnode, Format f>
+ : BinOpMI<opcode, mnemonic, typeinfo, f,
+ [(store (opnode (typeinfo.VT (load addr:$dst)),
+ typeinfo.ImmOperator:$src, EFLAGS), addr:$dst),
+ (implicit EFLAGS)]>, Sched<[WriteADCRMW]>;
+
+// BinOpMI_F - Instructions like "cmp [mem], imm".
+class BinOpMI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpMI<opcode, mnemonic, typeinfo, f,
+ [(set EFLAGS, (opnode (typeinfo.LoadNode addr:$dst),
+ typeinfo.ImmOperator:$src))]>,
+ Sched<[WriteALU.Folded]>;
+
+// BinOpMI8 - Instructions like "add [mem], imm8".
+class BinOpMI8<string mnemonic, X86TypeInfo typeinfo,
+ Format f, list<dag> pattern>
+ : ITy<0x82, f, typeinfo,
+ (outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src),
+ mnemonic, "{$src, $dst|$dst, $src}", pattern> {
+ let ImmT = Imm8; // Always 8-bit immediate.
+}
+
+// BinOpMI8_RMW - Instructions like "add [mem], imm8".
+class BinOpMI8_RMW<string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpMI8<mnemonic, typeinfo, f,
+ [(store (opnode (load addr:$dst),
+ typeinfo.Imm8Operator:$src), addr:$dst),
+ (implicit EFLAGS)]>, Sched<[WriteALURMW]>;
+
+// BinOpMI8_RMW_FF - Instructions like "adc [mem], imm8".
+class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpMI8<mnemonic, typeinfo, f,
+ [(store (opnode (load addr:$dst),
+ typeinfo.Imm8Operator:$src, EFLAGS), addr:$dst),
+ (implicit EFLAGS)]>, Sched<[WriteADCRMW]>;
+
+// BinOpMI8_F - Instructions like "cmp [mem], imm8".
+class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo,
+ SDPatternOperator opnode, Format f>
+ : BinOpMI8<mnemonic, typeinfo, f,
+ [(set EFLAGS, (opnode (typeinfo.LoadNode addr:$dst),
+ typeinfo.Imm8Operator:$src))]>,
+ Sched<[WriteALU.Folded]>;
+
+// BinOpAI - Instructions like "add %eax, %eax, imm", that imp-def EFLAGS.
+class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Register areg, string operands, X86FoldableSchedWrite sched = WriteALU>
+ : ITy<opcode, RawFrm, typeinfo,
+ (outs), (ins typeinfo.ImmOperand:$src),
+ mnemonic, operands, []>, Sched<[sched]> {
+ let ImmT = typeinfo.ImmEncoding;
+ let Uses = [areg];
+ let Defs = [areg, EFLAGS];
+ let hasSideEffects = 0;
+}
+
+// BinOpAI_RFF - Instructions like "adc %eax, %eax, imm", that implicitly define
+// and use EFLAGS.
+class BinOpAI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Register areg, string operands>
+ : BinOpAI<opcode, mnemonic, typeinfo, areg, operands, WriteADC> {
+ let Uses = [areg, EFLAGS];
+}
+
+// BinOpAI_F - Instructions like "cmp %eax, %eax, imm", that imp-def EFLAGS.
+class BinOpAI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Register areg, string operands>
+ : BinOpAI<opcode, mnemonic, typeinfo, areg, operands> {
+ let Defs = [EFLAGS];
+}
+
+/// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is
+/// defined with "(set GPR:$dst, EFLAGS, (...".
+///
+/// It would be nice to get rid of the second and third argument here, but
+/// tblgen can't handle dependent type references aggressively enough: PR8330
+multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
+ string mnemonic, Format RegMRM, Format MemMRM,
+ SDNode opnodeflag, SDNode opnode,
+ bit CommutableRR, bit ConvertibleToThreeAddress,
+ bit ConvertibleToThreeAddressRR> {
+ let Defs = [EFLAGS] in {
+ let Constraints = "$src1 = $dst" in {
+ let isCommutable = CommutableRR in {
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddressRR in {
+ def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>;
+ def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>;
+ def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>;
+ def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>;
+ } // isConvertibleToThreeAddress
+ } // isCommutable
+
+ def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>, FoldGenData<NAME#8rr>;
+ def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>, FoldGenData<NAME#16rr>;
+ def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>, FoldGenData<NAME#32rr>;
+ def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>, FoldGenData<NAME#64rr>;
+
+ def NAME#8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>;
+ def NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>;
+ def NAME#32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>;
+ def NAME#64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>;
+
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>;
+
+ // NOTE: These are order specific, we want the ri8 forms to be listed
+ // first so that they are slightly preferred to the ri forms.
+ def NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, opnodeflag, RegMRM>;
+ def NAME#32ri8 : BinOpRI8_RF<0x82, mnemonic, Xi32, opnodeflag, RegMRM>;
+ def NAME#64ri8 : BinOpRI8_RF<0x82, mnemonic, Xi64, opnodeflag, RegMRM>;
+
+ def NAME#16ri : BinOpRI_RF<0x80, mnemonic, Xi16, opnodeflag, RegMRM>;
+ def NAME#32ri : BinOpRI_RF<0x80, mnemonic, Xi32, opnodeflag, RegMRM>;
+ def NAME#64ri32: BinOpRI_RF<0x80, mnemonic, Xi64, opnodeflag, RegMRM>;
+ }
+ } // Constraints = "$src1 = $dst"
+
+ let mayLoad = 1, mayStore = 1 in {
+ def NAME#8mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>;
+ }
+
+ // NOTE: These are order specific, we want the mi8 forms to be listed
+ // first so that they are slightly preferred to the mi forms.
+ def NAME#16mi8 : BinOpMI8_RMW<mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi8 : BinOpMI8_RMW<mnemonic, Xi32, opnode, MemMRM>;
+ let Predicates = [In64BitMode] in
+ def NAME#64mi8 : BinOpMI8_RMW<mnemonic, Xi64, opnode, MemMRM>;
+
+ def NAME#8mi : BinOpMI_RMW<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+ def NAME#16mi : BinOpMI_RMW<0x80, mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi : BinOpMI_RMW<0x80, mnemonic, Xi32, opnode, MemMRM>;
+ let Predicates = [In64BitMode] in
+ def NAME#64mi32 : BinOpMI_RMW<0x80, mnemonic, Xi64, opnode, MemMRM>;
+
+ // These are for the disassembler since 0x82 opcode behaves like 0x80, but
+ // not in 64-bit mode.
+ let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1,
+ hasSideEffects = 0 in {
+ let Constraints = "$src1 = $dst" in
+ def NAME#8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, null_frag, RegMRM>;
+ let mayLoad = 1, mayStore = 1 in
+ def NAME#8mi8 : BinOpMI8_RMW<mnemonic, Xi8, null_frag, MemMRM>;
+ }
+ } // Defs = [EFLAGS]
+
+ def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|al, $src}">;
+ def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|ax, $src}">;
+ def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|eax, $src}">;
+ def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|rax, $src}">;
+}
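+
+// For illustration only (parameter values here are a hypothetical sketch, not
+// copied from this file): an instantiation of ArithBinOp_RF looks like
+//   defm ADD : ArithBinOp_RF<0x00, 0x02, 0x04, "add", MRM0r, MRM0m,
+//                            X86add_flag, add, 1, 1, 1>;
+// which expands to ADD8rr, ADD32ri, ADD64mi32, the AL/AX/EAX/RAX immediate
+// forms, and the rest of the family defined above.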
+
+/// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is
+/// defined with "(set GPR:$dst, EFLAGS, (node LHS, RHS, EFLAGS))" like ADC and
+/// SBB.
+///
+/// It would be nice to get rid of the second and third argument here, but
+/// tblgen can't handle dependent type references aggressively enough: PR8330
+multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
+ string mnemonic, Format RegMRM, Format MemMRM,
+ SDNode opnode, bit CommutableRR,
+ bit ConvertibleToThreeAddress> {
+ let Uses = [EFLAGS], Defs = [EFLAGS] in {
+ let Constraints = "$src1 = $dst" in {
+ let isCommutable = CommutableRR in {
+ def NAME#8rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi8 , opnode>;
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ def NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>;
+ } // isConvertibleToThreeAddress
+ } // isCommutable
+
+ def NAME#8rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi8>, FoldGenData<NAME#8rr>;
+ def NAME#16rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi16>, FoldGenData<NAME#16rr>;
+ def NAME#32rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi32>, FoldGenData<NAME#32rr>;
+ def NAME#64rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi64>, FoldGenData<NAME#64rr>;
+
+ def NAME#8rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>;
+ def NAME#16rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>;
+ def NAME#32rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi32, opnode>;
+ def NAME#64rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi64, opnode>;
+
+ def NAME#8ri : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>;
+
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ // NOTE: These are order specific; we want the ri8 forms to be listed
+ // first so that they are slightly preferred to the ri forms.
+ def NAME#16ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi16, opnode, RegMRM>;
+ def NAME#32ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi32, opnode, RegMRM>;
+ def NAME#64ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi64, opnode, RegMRM>;
+
+ def NAME#16ri : BinOpRI_RFF<0x80, mnemonic, Xi16, opnode, RegMRM>;
+ def NAME#32ri : BinOpRI_RFF<0x80, mnemonic, Xi32, opnode, RegMRM>;
+ def NAME#64ri32: BinOpRI_RFF<0x80, mnemonic, Xi64, opnode, RegMRM>;
+ }
+ } // Constraints = "$src1 = $dst"
+
+ def NAME#8mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi64, opnode>;
+
+ // NOTE: These are order specific; we want the mi8 forms to be listed
+ // first so that they are slightly preferred to the mi forms.
+ def NAME#16mi8 : BinOpMI8_RMW_FF<mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi8 : BinOpMI8_RMW_FF<mnemonic, Xi32, opnode, MemMRM>;
+ let Predicates = [In64BitMode] in
+ def NAME#64mi8 : BinOpMI8_RMW_FF<mnemonic, Xi64, opnode, MemMRM>;
+
+ def NAME#8mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+ def NAME#16mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi32, opnode, MemMRM>;
+ let Predicates = [In64BitMode] in
+ def NAME#64mi32 : BinOpMI_RMW_FF<0x80, mnemonic, Xi64, opnode, MemMRM>;
+
+ // These are for the disassembler, since the 0x82 opcode behaves like 0x80
+ // but is not valid in 64-bit mode.
+ let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1,
+ hasSideEffects = 0 in {
+ let Constraints = "$src1 = $dst" in
+ def NAME#8ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi8, null_frag, RegMRM>;
+ let mayLoad = 1, mayStore = 1 in
+ def NAME#8mi8 : BinOpMI8_RMW_FF<mnemonic, Xi8, null_frag, MemMRM>;
+ }
+ } // Uses = [EFLAGS], Defs = [EFLAGS]
+
+ def NAME#8i8 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|al, $src}">;
+ def NAME#16i16 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|ax, $src}">;
+ def NAME#32i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|eax, $src}">;
+ def NAME#64i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|rax, $src}">;
+}
+
+/// ArithBinOp_F - This is an arithmetic binary operator where the pattern is
+/// defined with "(set EFLAGS, (...". It would be really nice to find a way
+/// to factor this with the other ArithBinOp_*.
+///
+multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
+ string mnemonic, Format RegMRM, Format MemMRM,
+ SDNode opnode,
+ bit CommutableRR, bit ConvertibleToThreeAddress> {
+ let Defs = [EFLAGS] in {
+ let isCommutable = CommutableRR in {
+ def NAME#8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>;
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>;
+ }
+ } // isCommutable
+
+ def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>, FoldGenData<NAME#8rr>;
+ def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>, FoldGenData<NAME#16rr>;
+ def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>, FoldGenData<NAME#32rr>;
+ def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>, FoldGenData<NAME#64rr>;
+
+ def NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>;
+ def NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>;
+ def NAME#32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>;
+ def NAME#64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>;
+
+ def NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>;
+
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ // NOTE: These are order specific; we want the ri8 forms to be listed
+ // first so that they are slightly preferred to the ri forms.
+ def NAME#16ri8 : BinOpRI8_F<0x82, mnemonic, Xi16, opnode, RegMRM>;
+ def NAME#32ri8 : BinOpRI8_F<0x82, mnemonic, Xi32, opnode, RegMRM>;
+ def NAME#64ri8 : BinOpRI8_F<0x82, mnemonic, Xi64, opnode, RegMRM>;
+
+ def NAME#16ri : BinOpRI_F<0x80, mnemonic, Xi16, opnode, RegMRM>;
+ def NAME#32ri : BinOpRI_F<0x80, mnemonic, Xi32, opnode, RegMRM>;
+ def NAME#64ri32: BinOpRI_F<0x80, mnemonic, Xi64, opnode, RegMRM>;
+ }
+
+ def NAME#8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16mr : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32mr : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64mr : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>;
+
+ // NOTE: These are order specific; we want the mi8 forms to be listed
+ // first so that they are slightly preferred to the mi forms.
+ def NAME#16mi8 : BinOpMI8_F<mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, opnode, MemMRM>;
+ let Predicates = [In64BitMode] in
+ def NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, opnode, MemMRM>;
+
+ def NAME#8mi : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+ def NAME#16mi : BinOpMI_F<0x80, mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi : BinOpMI_F<0x80, mnemonic, Xi32, opnode, MemMRM>;
+ let Predicates = [In64BitMode] in
+ def NAME#64mi32 : BinOpMI_F<0x80, mnemonic, Xi64, opnode, MemMRM>;
+
+ // These are for the disassembler, since the 0x82 opcode behaves like 0x80
+ // but is not valid in 64-bit mode.
+ let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1,
+ hasSideEffects = 0 in {
+ def NAME#8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, null_frag, RegMRM>;
+ let mayLoad = 1 in
+ def NAME#8mi8 : BinOpMI8_F<mnemonic, Xi8, null_frag, MemMRM>;
+ }
+ } // Defs = [EFLAGS]
+
+ def NAME#8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|al, $src}">;
+ def NAME#16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|ax, $src}">;
+ def NAME#32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|eax, $src}">;
+ def NAME#64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|rax, $src}">;
+}
+
+
+defm AND : ArithBinOp_RF<0x20, 0x22, 0x24, "and", MRM4r, MRM4m,
+ X86and_flag, and, 1, 0, 0>;
+defm OR : ArithBinOp_RF<0x08, 0x0A, 0x0C, "or", MRM1r, MRM1m,
+ X86or_flag, or, 1, 0, 0>;
+defm XOR : ArithBinOp_RF<0x30, 0x32, 0x34, "xor", MRM6r, MRM6m,
+ X86xor_flag, xor, 1, 0, 0>;
+defm ADD : ArithBinOp_RF<0x00, 0x02, 0x04, "add", MRM0r, MRM0m,
+ X86add_flag, add, 1, 1, 1>;
+let isCompare = 1 in {
+defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m,
+ X86sub_flag, sub, 0, 1, 0>;
+}
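+
+// As a concrete illustration of the NAME# scheme above, the ADD instantiation
+// produces, among others, ADD32rr, ADD32rr_REV, ADD32rm, ADD32ri, ADD32ri8,
+// ADD32mr, ADD32mi and ADD32mi8, plus the EAX-relative ADD32i32 form.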
+
+// Version of XOR8rr that uses GR8_NOREX. This is used by the handling of
+// __builtin_parity where the last step xors an h-register with an l-register.
+let isCodeGenOnly = 1, hasSideEffects = 0, Constraints = "$src1 = $dst",
+ Defs = [EFLAGS], isCommutable = 1 in
+def XOR8rr_NOREX : I<0x30, MRMDestReg, (outs GR8_NOREX:$dst),
+ (ins GR8_NOREX:$src1, GR8_NOREX:$src2),
+ "xor{b}\t{$src2, $dst|$dst, $src2}", []>,
+ Sched<[WriteALU]>;
+
+// Arithmetic.
+defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag,
+ 1, 0>;
+defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag,
+ 0, 0>;
+
+let isCompare = 1 in {
+defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>;
+}
+
+// Patterns to recognize loads on the LHS of an ADC. We can't make X86adc_flag
+// commutable since it has EFLAGS as an input.
+def : Pat<(X86adc_flag (loadi8 addr:$src2), GR8:$src1, EFLAGS),
+ (ADC8rm GR8:$src1, addr:$src2)>;
+def : Pat<(X86adc_flag (loadi16 addr:$src2), GR16:$src1, EFLAGS),
+ (ADC16rm GR16:$src1, addr:$src2)>;
+def : Pat<(X86adc_flag (loadi32 addr:$src2), GR32:$src1, EFLAGS),
+ (ADC32rm GR32:$src1, addr:$src2)>;
+def : Pat<(X86adc_flag (loadi64 addr:$src2), GR64:$src1, EFLAGS),
+ (ADC64rm GR64:$src1, addr:$src2)>;
+
+// Patterns to recognize RMW ADC with loads in operand 1.
+def : Pat<(store (X86adc_flag GR8:$src, (loadi8 addr:$dst), EFLAGS),
+ addr:$dst),
+ (ADC8mr addr:$dst, GR8:$src)>;
+def : Pat<(store (X86adc_flag GR16:$src, (loadi16 addr:$dst), EFLAGS),
+ addr:$dst),
+ (ADC16mr addr:$dst, GR16:$src)>;
+def : Pat<(store (X86adc_flag GR32:$src, (loadi32 addr:$dst), EFLAGS),
+ addr:$dst),
+ (ADC32mr addr:$dst, GR32:$src)>;
+def : Pat<(store (X86adc_flag GR64:$src, (loadi64 addr:$dst), EFLAGS),
+ addr:$dst),
+ (ADC64mr addr:$dst, GR64:$src)>;
+
+// Patterns for basic arithmetic ops with relocImm for the immediate field.
+multiclass ArithBinOp_RF_relocImm_Pats<SDNode OpNodeFlag, SDNode OpNode> {
+ def : Pat<(OpNodeFlag GR8:$src1, relocImm8_su:$src2),
+ (!cast<Instruction>(NAME#"8ri") GR8:$src1, relocImm8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR16:$src1, i16relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"16ri8") GR16:$src1, i16relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR16:$src1, relocImm16_su:$src2),
+ (!cast<Instruction>(NAME#"16ri") GR16:$src1, relocImm16_su:$src2)>;
+ def : Pat<(OpNodeFlag GR32:$src1, i32relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"32ri8") GR32:$src1, i32relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR32:$src1, relocImm32_su:$src2),
+ (!cast<Instruction>(NAME#"32ri") GR32:$src1, relocImm32_su:$src2)>;
+ def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"64ri8") GR64:$src1, i64relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt32_su:$src2),
+ (!cast<Instruction>(NAME#"64ri32") GR64:$src1, i64relocImmSExt32_su:$src2)>;
+
+ def : Pat<(store (OpNode (load addr:$dst), relocImm8_su:$src), addr:$dst),
+ (!cast<Instruction>(NAME#"8mi") addr:$dst, relocImm8_su:$src)>;
+ def : Pat<(store (OpNode (load addr:$dst), i16relocImmSExt8_su:$src), addr:$dst),
+ (!cast<Instruction>(NAME#"16mi8") addr:$dst, i16relocImmSExt8_su:$src)>;
+ def : Pat<(store (OpNode (load addr:$dst), relocImm16_su:$src), addr:$dst),
+ (!cast<Instruction>(NAME#"16mi") addr:$dst, relocImm16_su:$src)>;
+ def : Pat<(store (OpNode (load addr:$dst), i32relocImmSExt8_su:$src), addr:$dst),
+ (!cast<Instruction>(NAME#"32mi8") addr:$dst, i32relocImmSExt8_su:$src)>;
+ def : Pat<(store (OpNode (load addr:$dst), relocImm32_su:$src), addr:$dst),
+ (!cast<Instruction>(NAME#"32mi") addr:$dst, relocImm32_su:$src)>;
+ def : Pat<(store (OpNode (load addr:$dst), i64relocImmSExt8_su:$src), addr:$dst),
+ (!cast<Instruction>(NAME#"64mi8") addr:$dst, i64relocImmSExt8_su:$src)>;
+ def : Pat<(store (OpNode (load addr:$dst), i64relocImmSExt32_su:$src), addr:$dst),
+ (!cast<Instruction>(NAME#"64mi32") addr:$dst, i64relocImmSExt32_su:$src)>;
+}
+
+multiclass ArithBinOp_RFF_relocImm_Pats<SDNode OpNodeFlag> {
+ def : Pat<(OpNodeFlag GR8:$src1, relocImm8_su:$src2, EFLAGS),
+ (!cast<Instruction>(NAME#"8ri") GR8:$src1, relocImm8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR16:$src1, i16relocImmSExt8_su:$src2, EFLAGS),
+ (!cast<Instruction>(NAME#"16ri8") GR16:$src1, i16relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR16:$src1, relocImm16_su:$src2, EFLAGS),
+ (!cast<Instruction>(NAME#"16ri") GR16:$src1, relocImm16_su:$src2)>;
+ def : Pat<(OpNodeFlag GR32:$src1, i32relocImmSExt8_su:$src2, EFLAGS),
+ (!cast<Instruction>(NAME#"32ri8") GR32:$src1, i32relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR32:$src1, relocImm32_su:$src2, EFLAGS),
+ (!cast<Instruction>(NAME#"32ri") GR32:$src1, relocImm32_su:$src2)>;
+ def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt8_su:$src2, EFLAGS),
+ (!cast<Instruction>(NAME#"64ri8") GR64:$src1, i64relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt32_su:$src2, EFLAGS),
+ (!cast<Instruction>(NAME#"64ri32") GR64:$src1, i64relocImmSExt32_su:$src2)>;
+
+ def : Pat<(store (OpNodeFlag (load addr:$dst), relocImm8_su:$src, EFLAGS), addr:$dst),
+ (!cast<Instruction>(NAME#"8mi") addr:$dst, relocImm8_su:$src)>;
+ def : Pat<(store (OpNodeFlag (load addr:$dst), i16relocImmSExt8_su:$src, EFLAGS), addr:$dst),
+ (!cast<Instruction>(NAME#"16mi8") addr:$dst, i16relocImmSExt8_su:$src)>;
+ def : Pat<(store (OpNodeFlag (load addr:$dst), relocImm16_su:$src, EFLAGS), addr:$dst),
+ (!cast<Instruction>(NAME#"16mi") addr:$dst, relocImm16_su:$src)>;
+ def : Pat<(store (OpNodeFlag (load addr:$dst), i32relocImmSExt8_su:$src, EFLAGS), addr:$dst),
+ (!cast<Instruction>(NAME#"32mi8") addr:$dst, i32relocImmSExt8_su:$src)>;
+ def : Pat<(store (OpNodeFlag (load addr:$dst), relocImm32_su:$src, EFLAGS), addr:$dst),
+ (!cast<Instruction>(NAME#"32mi") addr:$dst, relocImm32_su:$src)>;
+ def : Pat<(store (OpNodeFlag (load addr:$dst), i64relocImmSExt8_su:$src, EFLAGS), addr:$dst),
+ (!cast<Instruction>(NAME#"64mi8") addr:$dst, i64relocImmSExt8_su:$src)>;
+ def : Pat<(store (OpNodeFlag (load addr:$dst), i64relocImmSExt32_su:$src, EFLAGS), addr:$dst),
+ (!cast<Instruction>(NAME#"64mi32") addr:$dst, i64relocImmSExt32_su:$src)>;
+}
+
+multiclass ArithBinOp_F_relocImm_Pats<SDNode OpNodeFlag> {
+ def : Pat<(OpNodeFlag GR8:$src1, relocImm8_su:$src2),
+ (!cast<Instruction>(NAME#"8ri") GR8:$src1, relocImm8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR16:$src1, i16relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"16ri8") GR16:$src1, i16relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR16:$src1, relocImm16_su:$src2),
+ (!cast<Instruction>(NAME#"16ri") GR16:$src1, relocImm16_su:$src2)>;
+ def : Pat<(OpNodeFlag GR32:$src1, i32relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"32ri8") GR32:$src1, i32relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR32:$src1, relocImm32_su:$src2),
+ (!cast<Instruction>(NAME#"32ri") GR32:$src1, relocImm32_su:$src2)>;
+ def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"64ri8") GR64:$src1, i64relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt32_su:$src2),
+ (!cast<Instruction>(NAME#"64ri32") GR64:$src1, i64relocImmSExt32_su:$src2)>;
+
+ def : Pat<(OpNodeFlag (loadi8 addr:$src1), relocImm8_su:$src2),
+ (!cast<Instruction>(NAME#"8mi") addr:$src1, relocImm8_su:$src2)>;
+ def : Pat<(OpNodeFlag (loadi16 addr:$src1), i16relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"16mi8") addr:$src1, i16relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag (loadi16 addr:$src1), relocImm16_su:$src2),
+ (!cast<Instruction>(NAME#"16mi") addr:$src1, relocImm16_su:$src2)>;
+ def : Pat<(OpNodeFlag (loadi32 addr:$src1), i32relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"32mi8") addr:$src1, i32relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag (loadi32 addr:$src1), relocImm32_su:$src2),
+ (!cast<Instruction>(NAME#"32mi") addr:$src1, relocImm32_su:$src2)>;
+ def : Pat<(OpNodeFlag (loadi64 addr:$src1), i64relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"64mi8") addr:$src1, i64relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag (loadi64 addr:$src1), i64relocImmSExt32_su:$src2),
+ (!cast<Instruction>(NAME#"64mi32") addr:$src1, i64relocImmSExt32_su:$src2)>;
+}
+
+defm AND : ArithBinOp_RF_relocImm_Pats<X86and_flag, and>;
+defm OR : ArithBinOp_RF_relocImm_Pats<X86or_flag, or>;
+defm XOR : ArithBinOp_RF_relocImm_Pats<X86xor_flag, xor>;
+defm ADD : ArithBinOp_RF_relocImm_Pats<X86add_flag, add>;
+defm SUB : ArithBinOp_RF_relocImm_Pats<X86sub_flag, sub>;
+
+defm ADC : ArithBinOp_RFF_relocImm_Pats<X86adc_flag>;
+defm SBB : ArithBinOp_RFF_relocImm_Pats<X86sbb_flag>;
+
+defm CMP : ArithBinOp_F_relocImm_Pats<X86cmp>;
+
+// ADC is commutable, but we can't indicate that to tablegen. So manually
+// reverse the operands.
+def : Pat<(X86adc_flag relocImm8_su:$src2, GR8:$src1, EFLAGS),
+ (ADC8ri GR8:$src1, relocImm8_su:$src2)>;
+def : Pat<(X86adc_flag i16relocImmSExt8_su:$src2, GR16:$src1, EFLAGS),
+ (ADC16ri8 GR16:$src1, i16relocImmSExt8_su:$src2)>;
+def : Pat<(X86adc_flag relocImm16_su:$src2, GR16:$src1, EFLAGS),
+ (ADC16ri GR16:$src1, relocImm16_su:$src2)>;
+def : Pat<(X86adc_flag i32relocImmSExt8_su:$src2, GR32:$src1, EFLAGS),
+ (ADC32ri8 GR32:$src1, i32relocImmSExt8_su:$src2)>;
+def : Pat<(X86adc_flag relocImm32_su:$src2, GR32:$src1, EFLAGS),
+ (ADC32ri GR32:$src1, relocImm32_su:$src2)>;
+def : Pat<(X86adc_flag i64relocImmSExt8_su:$src2, GR64:$src1, EFLAGS),
+ (ADC64ri8 GR64:$src1, i64relocImmSExt8_su:$src2)>;
+def : Pat<(X86adc_flag i64relocImmSExt32_su:$src2, GR64:$src1, EFLAGS),
+ (ADC64ri32 GR64:$src1, i64relocImmSExt32_su:$src2)>;
+
+def : Pat<(store (X86adc_flag relocImm8_su:$src, (load addr:$dst), EFLAGS), addr:$dst),
+ (ADC8mi addr:$dst, relocImm8_su:$src)>;
+def : Pat<(store (X86adc_flag i16relocImmSExt8_su:$src, (load addr:$dst), EFLAGS), addr:$dst),
+ (ADC16mi8 addr:$dst, i16relocImmSExt8_su:$src)>;
+def : Pat<(store (X86adc_flag relocImm16_su:$src, (load addr:$dst), EFLAGS), addr:$dst),
+ (ADC16mi addr:$dst, relocImm16_su:$src)>;
+def : Pat<(store (X86adc_flag i32relocImmSExt8_su:$src, (load addr:$dst), EFLAGS), addr:$dst),
+ (ADC32mi8 addr:$dst, i32relocImmSExt8_su:$src)>;
+def : Pat<(store (X86adc_flag relocImm32_su:$src, (load addr:$dst), EFLAGS), addr:$dst),
+ (ADC32mi addr:$dst, relocImm32_su:$src)>;
+def : Pat<(store (X86adc_flag i64relocImmSExt8_su:$src, (load addr:$dst), EFLAGS), addr:$dst),
+ (ADC64mi8 addr:$dst, i64relocImmSExt8_su:$src)>;
+def : Pat<(store (X86adc_flag i64relocImmSExt32_su:$src, (load addr:$dst), EFLAGS), addr:$dst),
+ (ADC64mi32 addr:$dst, i64relocImmSExt32_su:$src)>;
+
+//===----------------------------------------------------------------------===//
+// Semantically, test instructions are similar to AND, except they don't
+// generate a result. From an encoding perspective, they are very different:
+// they don't have all the usual imm8 and REV forms, and are encoded into a
+// different space.
+def X86testpat : PatFrag<(ops node:$lhs, node:$rhs),
+ (X86cmp (and_su node:$lhs, node:$rhs), 0)>;
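+
+// For example, "testl $8, %ecx" only sets EFLAGS from the value of ECX & 8;
+// instruction selection reaches TEST32ri below through X86testpat, i.e. as
+// (X86cmp (and_su GR32:$src1, imm:$src2), 0), with no register result.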
+
+let isCompare = 1 in {
+ let Defs = [EFLAGS] in {
+ let isCommutable = 1 in {
+ // Avoid selecting these and instead use a test+and. Post processing will
+ // combine them. This gives a bunch of other patterns that start with an
+ // 'and' a chance to match.
+ def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , null_frag>;
+ def TEST16rr : BinOpRR_F<0x84, "test", Xi16, null_frag>;
+ def TEST32rr : BinOpRR_F<0x84, "test", Xi32, null_frag>;
+ def TEST64rr : BinOpRR_F<0x84, "test", Xi64, null_frag>;
+ } // isCommutable
+
+ let hasSideEffects = 0, mayLoad = 1 in {
+ def TEST8mr : BinOpMR_F<0x84, "test", Xi8 , null_frag>;
+ def TEST16mr : BinOpMR_F<0x84, "test", Xi16, null_frag>;
+ def TEST32mr : BinOpMR_F<0x84, "test", Xi32, null_frag>;
+ def TEST64mr : BinOpMR_F<0x84, "test", Xi64, null_frag>;
+ }
+
+ def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>;
+ def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>;
+ def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>;
+ def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>;
+
+ def TEST8mi : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>;
+ def TEST16mi : BinOpMI_F<0xF6, "test", Xi16, X86testpat, MRM0m>;
+ def TEST32mi : BinOpMI_F<0xF6, "test", Xi32, X86testpat, MRM0m>;
+ let Predicates = [In64BitMode] in
+ def TEST64mi32 : BinOpMI_F<0xF6, "test", Xi64, X86testpat, MRM0m>;
+ } // Defs = [EFLAGS]
+
+ def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL,
+ "{$src, %al|al, $src}">;
+ def TEST16i16 : BinOpAI_F<0xA8, "test", Xi16, AX,
+ "{$src, %ax|ax, $src}">;
+ def TEST32i32 : BinOpAI_F<0xA8, "test", Xi32, EAX,
+ "{$src, %eax|eax, $src}">;
+ def TEST64i32 : BinOpAI_F<0xA8, "test", Xi64, RAX,
+ "{$src, %rax|rax, $src}">;
+} // isCompare
+
+// Patterns to match a relocImm into the immediate field.
+def : Pat<(X86testpat GR8:$src1, relocImm8_su:$src2),
+ (TEST8ri GR8:$src1, relocImm8_su:$src2)>;
+def : Pat<(X86testpat GR16:$src1, relocImm16_su:$src2),
+ (TEST16ri GR16:$src1, relocImm16_su:$src2)>;
+def : Pat<(X86testpat GR32:$src1, relocImm32_su:$src2),
+ (TEST32ri GR32:$src1, relocImm32_su:$src2)>;
+def : Pat<(X86testpat GR64:$src1, i64relocImmSExt32_su:$src2),
+ (TEST64ri32 GR64:$src1, i64relocImmSExt32_su:$src2)>;
+
+def : Pat<(X86testpat (loadi8 addr:$src1), relocImm8_su:$src2),
+ (TEST8mi addr:$src1, relocImm8_su:$src2)>;
+def : Pat<(X86testpat (loadi16 addr:$src1), relocImm16_su:$src2),
+ (TEST16mi addr:$src1, relocImm16_su:$src2)>;
+def : Pat<(X86testpat (loadi32 addr:$src1), relocImm32_su:$src2),
+ (TEST32mi addr:$src1, relocImm32_su:$src2)>;
+def : Pat<(X86testpat (loadi64 addr:$src1), i64relocImmSExt32_su:$src2),
+ (TEST64mi32 addr:$src1, i64relocImmSExt32_su:$src2)>;
+
+//===----------------------------------------------------------------------===//
+// ANDN Instruction
+//
+multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
+ PatFrag ld_frag, X86FoldableSchedWrite sched> {
+ def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))]>,
+ Sched<[sched]>;
+ def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, EFLAGS,
+ (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+// Complexity is reduced to give 'and' with an immediate a chance to match first.
+let Predicates = [HasBMI], Defs = [EFLAGS], AddedComplexity = -6 in {
+ defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32, WriteALU>, T8PS, VEX_4V;
+ defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64, WriteALU>, T8PS, VEX_4V, VEX_W;
+}
+
+let Predicates = [HasBMI], AddedComplexity = -6 in {
+ def : Pat<(and (not GR32:$src1), GR32:$src2),
+ (ANDN32rr GR32:$src1, GR32:$src2)>;
+ def : Pat<(and (not GR64:$src1), GR64:$src2),
+ (ANDN64rr GR64:$src1, GR64:$src2)>;
+ def : Pat<(and (not GR32:$src1), (loadi32 addr:$src2)),
+ (ANDN32rm GR32:$src1, addr:$src2)>;
+ def : Pat<(and (not GR64:$src1), (loadi64 addr:$src2)),
+ (ANDN64rm GR64:$src1, addr:$src2)>;
+}
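+
+// For example, (and (not GR32:$src1), GR32:$src2) selects ANDN32rr, which
+// computes ~src1 & src2 in a single instruction and sets EFLAGS, instead of
+// a separate NOT32r followed by AND32rr.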
+
+//===----------------------------------------------------------------------===//
+// MULX Instruction
+//
+multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
+ X86FoldableSchedWrite sched> {
+let hasSideEffects = 0 in {
+ def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
+ !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
+ []>, T8XD, VEX_4V, Sched<[sched, WriteIMulH]>;
+
+ let mayLoad = 1 in
+ def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
+ !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
+ []>, T8XD, VEX_4V, Sched<[sched.Folded, WriteIMulH]>;
+
+ // Pseudo instructions to be used when the low result isn't used. The
+ // instruction is defined to keep the high result if both destinations are
+ // the same.
+ def Hrr : PseudoI<(outs RC:$dst), (ins RC:$src),
+ []>, Sched<[sched]>;
+
+ let mayLoad = 1 in
+ def Hrm : PseudoI<(outs RC:$dst), (ins x86memop:$src),
+ []>, Sched<[sched.Folded]>;
+}
+}
+
+let Predicates = [HasBMI2] in {
+ let Uses = [EDX] in
+ defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem, WriteIMul32>;
+ let Uses = [RDX] in
+ defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem, WriteIMul64>, VEX_W;
+}
+
+//===----------------------------------------------------------------------===//
+// ADCX and ADOX Instructions
+//
+// We don't have patterns for these as there is no advantage over ADC for
+// most code.
+let Predicates = [HasADX], Defs = [EFLAGS], Uses = [EFLAGS],
+ Constraints = "$src1 = $dst", hasSideEffects = 0 in {
+ let SchedRW = [WriteADC], isCommutable = 1 in {
+ def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "adcx{l}\t{$src2, $dst|$dst, $src2}", []>, T8PD;
+ def ADCX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "adcx{q}\t{$src2, $dst|$dst, $src2}", []>, T8PD;
+
+ def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "adox{l}\t{$src2, $dst|$dst, $src2}", []>, T8XS;
+
+ def ADOX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "adox{q}\t{$src2, $dst|$dst, $src2}", []>, T8XS;
+ } // SchedRW
+
+ let mayLoad = 1, SchedRW = [WriteADC.Folded, WriteADC.ReadAfterFold] in {
+ def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "adcx{l}\t{$src2, $dst|$dst, $src2}", []>, T8PD;
+
+ def ADCX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$src1, i64mem:$src2),
+ "adcx{q}\t{$src2, $dst|$dst, $src2}", []>, T8PD;
+
+ def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "adox{l}\t{$src2, $dst|$dst, $src2}", []>, T8XS;
+
+ def ADOX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$src1, i64mem:$src2),
+ "adox{q}\t{$src2, $dst|$dst, $src2}", []>, T8XS;
+ } // mayLoad, SchedRW
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrBuilder.h
new file mode 100644
index 000000000000..07079ef87fd4
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrBuilder.h
@@ -0,0 +1,232 @@
+//===-- X86InstrBuilder.h - Functions to aid building x86 insts -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to handle X86'isms in a clean way.
+//
+// The BuildMem function may be used with the BuildMI function to add entire
+// memory references in a single, typed, function call. X86 memory references
+// can be very complex expressions (described in the README), so wrapping them
+// up behind an easier to use interface makes sense. Descriptions of the
+// functions are included below.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Base, Scale, Index, Displacement, Segment.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H
+#define LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include <cassert>
+
+namespace llvm {
+
+/// X86AddressMode - This struct holds a generalized full x86 address mode.
+/// The base register can be a frame index, which will eventually be replaced
+/// with BP or SP, with Disp adjusted accordingly. The displacement may
+/// also include the offset of a global value.
+struct X86AddressMode {
+ enum {
+ RegBase,
+ FrameIndexBase
+ } BaseType;
+
+ union {
+ unsigned Reg;
+ int FrameIndex;
+ } Base;
+
+ unsigned Scale;
+ unsigned IndexReg;
+ int Disp;
+ const GlobalValue *GV;
+ unsigned GVOpFlags;
+
+ X86AddressMode()
+ : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(nullptr),
+ GVOpFlags(0) {
+ Base.Reg = 0;
+ }
+
+ void getFullAddress(SmallVectorImpl<MachineOperand> &MO) {
+ assert(Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8);
+
+ if (BaseType == X86AddressMode::RegBase)
+ MO.push_back(MachineOperand::CreateReg(Base.Reg, false, false, false,
+ false, false, false, 0, false));
+ else {
+ assert(BaseType == X86AddressMode::FrameIndexBase);
+ MO.push_back(MachineOperand::CreateFI(Base.FrameIndex));
+ }
+
+ MO.push_back(MachineOperand::CreateImm(Scale));
+ MO.push_back(MachineOperand::CreateReg(IndexReg, false, false, false, false,
+ false, false, 0, false));
+
+ if (GV)
+ MO.push_back(MachineOperand::CreateGA(GV, Disp, GVOpFlags));
+ else
+ MO.push_back(MachineOperand::CreateImm(Disp));
+
+ MO.push_back(MachineOperand::CreateReg(0, false, false, false, false, false,
+ false, 0, false));
+ }
+};
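+
+// As an illustration, the reference DWORD PTR [EBX + 4*ESI + 16] corresponds
+// to Base.Reg = X86::EBX, Scale = 4, IndexReg = X86::ESI, Disp = 16 and a
+// null GV; getFullAddress then emits the usual five memory operands
+// (base, scale, index, displacement, segment) in that order.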
+
+/// Compute the addressing mode from a machine instruction starting with the
+/// given operand.
+static inline X86AddressMode getAddressFromInstr(const MachineInstr *MI,
+ unsigned Operand) {
+ X86AddressMode AM;
+ const MachineOperand &Op0 = MI->getOperand(Operand);
+ if (Op0.isReg()) {
+ AM.BaseType = X86AddressMode::RegBase;
+ AM.Base.Reg = Op0.getReg();
+ } else {
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ AM.Base.FrameIndex = Op0.getIndex();
+ }
+
+ const MachineOperand &Op1 = MI->getOperand(Operand + 1);
+ AM.Scale = Op1.getImm();
+
+ const MachineOperand &Op2 = MI->getOperand(Operand + 2);
+ AM.IndexReg = Op2.getReg();
+
+ const MachineOperand &Op3 = MI->getOperand(Operand + 3);
+ if (Op3.isGlobal())
+ AM.GV = Op3.getGlobal();
+ else
+ AM.Disp = Op3.getImm();
+
+ return AM;
+}
+
+/// addDirectMem - This function is used to add a direct memory reference to the
+/// current instruction -- that is, a dereference of an address in a register,
+/// with no scale, index or displacement. An example is: DWORD PTR [EAX].
+///
+static inline const MachineInstrBuilder &
+addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) {
+ // Because memory references are always represented with five
+ // values, this adds: Reg, 1, NoReg, 0, NoReg to the instruction.
+ return MIB.addReg(Reg).addImm(1).addReg(0).addImm(0).addReg(0);
+}
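+
+// For example, a load of ECX through the address in EAX ("movl (%eax), %ecx")
+// could be built as (MBB, MI, DL and TII being the usual insertion context):
+//   addDirectMem(BuildMI(MBB, MI, DL, TII.get(X86::MOV32rm), X86::ECX),
+//                X86::EAX);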
+
+/// Replace the address used in the instruction with the direct memory
+/// reference.
+static inline void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand,
+ unsigned Reg) {
+ // Direct memory address is in a form of: Reg/FI, 1 (Scale), NoReg, 0, NoReg.
+ MI->getOperand(Operand).ChangeToRegister(Reg, /*isDef=*/false);
+ MI->getOperand(Operand + 1).setImm(1);
+ MI->getOperand(Operand + 2).setReg(0);
+ MI->getOperand(Operand + 3).ChangeToImmediate(0);
+ MI->getOperand(Operand + 4).setReg(0);
+}
+
+static inline const MachineInstrBuilder &
+addOffset(const MachineInstrBuilder &MIB, int Offset) {
+ return MIB.addImm(1).addReg(0).addImm(Offset).addReg(0);
+}
+
+static inline const MachineInstrBuilder &
+addOffset(const MachineInstrBuilder &MIB, const MachineOperand& Offset) {
+ return MIB.addImm(1).addReg(0).add(Offset).addReg(0);
+}
+
+/// addRegOffset - This function is used to add a memory reference of the form
+/// [Reg + Offset], i.e., one with no scale or index, but with a
+/// displacement. An example is: DWORD PTR [EAX + 4].
+///
+static inline const MachineInstrBuilder &
+addRegOffset(const MachineInstrBuilder &MIB,
+ unsigned Reg, bool isKill, int Offset) {
+ return addOffset(MIB.addReg(Reg, getKillRegState(isKill)), Offset);
+}
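+
+// For example, reloading EAX from [EBP - 4] ("movl -4(%ebp), %eax") could be
+// written as:
+//   addRegOffset(BuildMI(MBB, MI, DL, TII.get(X86::MOV32rm), X86::EAX),
+//                X86::EBP, /*isKill=*/false, -4);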
+
+/// addRegReg - This function is used to add a memory reference of the form:
+/// [Reg + Reg].
+static inline const MachineInstrBuilder &addRegReg(const MachineInstrBuilder &MIB,
+ unsigned Reg1, bool isKill1,
+ unsigned Reg2, bool isKill2) {
+ return MIB.addReg(Reg1, getKillRegState(isKill1)).addImm(1)
+ .addReg(Reg2, getKillRegState(isKill2)).addImm(0).addReg(0);
+}
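+
+// For example, a load of EAX from [ESI + EDI] ("movl (%esi,%edi), %eax")
+// could be built as:
+//   addRegReg(BuildMI(MBB, MI, DL, TII.get(X86::MOV32rm), X86::EAX),
+//             X86::ESI, /*isKill1=*/false, X86::EDI, /*isKill2=*/false);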
+
+static inline const MachineInstrBuilder &
+addFullAddress(const MachineInstrBuilder &MIB,
+ const X86AddressMode &AM) {
+ assert(AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8);
+
+ if (AM.BaseType == X86AddressMode::RegBase)
+ MIB.addReg(AM.Base.Reg);
+ else {
+ assert(AM.BaseType == X86AddressMode::FrameIndexBase);
+ MIB.addFrameIndex(AM.Base.FrameIndex);
+ }
+
+ MIB.addImm(AM.Scale).addReg(AM.IndexReg);
+ if (AM.GV)
+ MIB.addGlobalAddress(AM.GV, AM.Disp, AM.GVOpFlags);
+ else
+ MIB.addImm(AM.Disp);
+
+ return MIB.addReg(0);
+}
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function. This
+/// reference uses the FrameIndex as its base register until it is resolved.
+/// This allows a constant offset to be specified as well...
+///
+static inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
+ MachineInstr *MI = MIB;
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const MCInstrDesc &MCID = MI->getDesc();
+ auto Flags = MachineMemOperand::MONone;
+ if (MCID.mayLoad())
+ Flags |= MachineMemOperand::MOLoad;
+ if (MCID.mayStore())
+ Flags |= MachineMemOperand::MOStore;
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FI, Offset), Flags,
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
+ return addOffset(MIB.addFrameIndex(FI), Offset)
+ .addMemOperand(MMO);
+}
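+
+// For example, a spill of EAX into stack slot FI is typically created as:
+//   addFrameReference(BuildMI(MBB, MI, DL, TII.get(X86::MOV32mr)), FI)
+//       .addReg(X86::EAX, getKillRegState(isKill));
+// The frame index is rewritten to a concrete [ESP/EBP + disp] address later,
+// during prolog/epilog insertion.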
+
+/// addConstantPoolReference - This function is used to add a reference to the
+/// base of a constant value spilled to the per-function constant pool. The
+/// reference uses the abstract ConstantPoolIndex which is retained until
+/// either machine code emission or assembly output. In PIC mode on x86-32,
+/// the GlobalBaseReg parameter can be used to make this a
+/// GlobalBaseReg-relative reference.
+///
+static inline const MachineInstrBuilder &
+addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI,
+ unsigned GlobalBaseReg, unsigned char OpFlags) {
+ //FIXME: factor this
+ return MIB.addReg(GlobalBaseReg).addImm(1).addReg(0)
+ .addConstantPoolIndex(CPI, 0, OpFlags).addReg(0);
+}
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrCMovSetCC.td
new file mode 100644
index 000000000000..330b8c7a8a43
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -0,0 +1,127 @@
+//===-- X86InstrCMovSetCC.td - Conditional Move and SetCC --*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 conditional move and set on condition
+// instructions.
+//
+//===----------------------------------------------------------------------===//
+
+
+// CMOV instructions.
+let isCodeGenOnly = 1, ForceDisassemble = 1 in {
+let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
+ isCommutable = 1, SchedRW = [WriteCMOV] in {
+ def CMOV16rr
+ : I<0x40, MRMSrcRegCC, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, ccode:$cond),
+ "cmov${cond}{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst,
+ (X86cmov GR16:$src1, GR16:$src2, timm:$cond, EFLAGS))]>,
+ TB, OpSize16;
+ def CMOV32rr
+ : I<0x40, MRMSrcRegCC, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, ccode:$cond),
+ "cmov${cond}{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst,
+ (X86cmov GR32:$src1, GR32:$src2, timm:$cond, EFLAGS))]>,
+ TB, OpSize32;
+ def CMOV64rr
+ :RI<0x40, MRMSrcRegCC, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, ccode:$cond),
+ "cmov${cond}{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst,
+ (X86cmov GR64:$src1, GR64:$src2, timm:$cond, EFLAGS))]>, TB;
+}
+
+let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
+ SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold] in {
+ def CMOV16rm
+ : I<0x40, MRMSrcMemCC, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2, ccode:$cond),
+ "cmov${cond}{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ timm:$cond, EFLAGS))]>, TB, OpSize16;
+ def CMOV32rm
+ : I<0x40, MRMSrcMemCC, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2, ccode:$cond),
+ "cmov${cond}{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ timm:$cond, EFLAGS))]>, TB, OpSize32;
+ def CMOV64rm
+ :RI<0x40, MRMSrcMemCC, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2, ccode:$cond),
+ "cmov${cond}{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ timm:$cond, EFLAGS))]>, TB;
+} // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst"
+} // isCodeGenOnly = 1, ForceDisassemble = 1
+
+def inv_cond_XFORM : SDNodeXForm<imm, [{
+ X86::CondCode CC = static_cast<X86::CondCode>(N->getZExtValue());
+ return CurDAG->getTargetConstant(X86::GetOppositeBranchCondition(CC),
+ SDLoc(N), MVT::i8);
+}]>;
+
+// Conditional moves with folded loads with operands swapped and conditions
+// inverted.
+let Predicates = [HasCMov] in {
+ def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, timm:$cond, EFLAGS),
+ (CMOV16rm GR16:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
+ def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, timm:$cond, EFLAGS),
+ (CMOV32rm GR32:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
+ def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, timm:$cond, EFLAGS),
+ (CMOV64rm GR64:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
+}
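+
+// For example, (X86cmov (loadi32 addr:$m), GR32:$r, COND_E, EFLAGS) cannot
+// fold the load directly, since only the second operand of CMOV32rm is a
+// memory operand; the pattern above instead emits CMOV32rm GR32:$r, addr:$m
+// with the condition flipped to COND_NE by inv_cond_XFORM.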
+
+// SetCC instructions.
+let Uses = [EFLAGS], isCodeGenOnly = 1, ForceDisassemble = 1 in {
+ def SETCCr : I<0x90, MRMXrCC, (outs GR8:$dst), (ins ccode:$cond),
+ "set${cond}\t$dst",
+ [(set GR8:$dst, (X86setcc timm:$cond, EFLAGS))]>,
+ TB, Sched<[WriteSETCC]>;
+ def SETCCm : I<0x90, MRMXmCC, (outs), (ins i8mem:$dst, ccode:$cond),
+ "set${cond}\t$dst",
+ [(store (X86setcc timm:$cond, EFLAGS), addr:$dst)]>,
+ TB, Sched<[WriteSETCCStore]>;
+} // Uses = [EFLAGS]
+
+multiclass CMOV_SETCC_Aliases<string Cond, int CC> {
+ def : InstAlias<"cmov"#Cond#"{w}\t{$src, $dst|$dst, $src}",
+ (CMOV16rr GR16:$dst, GR16:$src, CC), 0>;
+ def : InstAlias<"cmov"#Cond#"{w}\t{$src, $dst|$dst, $src}",
+ (CMOV16rm GR16:$dst, i16mem:$src, CC), 0>;
+ def : InstAlias<"cmov"#Cond#"{l}\t{$src, $dst|$dst, $src}",
+ (CMOV32rr GR32:$dst, GR32:$src, CC), 0>;
+ def : InstAlias<"cmov"#Cond#"{l}\t{$src, $dst|$dst, $src}",
+ (CMOV32rm GR32:$dst, i32mem:$src, CC), 0>;
+ def : InstAlias<"cmov"#Cond#"{q}\t{$src, $dst|$dst, $src}",
+ (CMOV64rr GR64:$dst, GR64:$src, CC), 0>;
+ def : InstAlias<"cmov"#Cond#"{q}\t{$src, $dst|$dst, $src}",
+ (CMOV64rm GR64:$dst, i64mem:$src, CC), 0>;
+
+ def : InstAlias<"set"#Cond#"\t$dst", (SETCCr GR8:$dst, CC), 0>;
+ def : InstAlias<"set"#Cond#"\t$dst", (SETCCm i8mem:$dst, CC), 0>;
+}
+
+defm : CMOV_SETCC_Aliases<"o" , 0>;
+defm : CMOV_SETCC_Aliases<"no", 1>;
+defm : CMOV_SETCC_Aliases<"b" , 2>;
+defm : CMOV_SETCC_Aliases<"ae", 3>;
+defm : CMOV_SETCC_Aliases<"e" , 4>;
+defm : CMOV_SETCC_Aliases<"ne", 5>;
+defm : CMOV_SETCC_Aliases<"be", 6>;
+defm : CMOV_SETCC_Aliases<"a" , 7>;
+defm : CMOV_SETCC_Aliases<"s" , 8>;
+defm : CMOV_SETCC_Aliases<"ns", 9>;
+defm : CMOV_SETCC_Aliases<"p" , 10>;
+defm : CMOV_SETCC_Aliases<"np", 11>;
+defm : CMOV_SETCC_Aliases<"l" , 12>;
+defm : CMOV_SETCC_Aliases<"ge", 13>;
+defm : CMOV_SETCC_Aliases<"le", 14>;
+defm : CMOV_SETCC_Aliases<"g" , 15>;
+
+// SALC is an undocumented instruction. Information about it can be found at
+// http://www.rcollins.org/secrets/opcodes/SALC.html
+// Set AL if carry.
+let Uses = [EFLAGS], Defs = [AL], SchedRW = [WriteALU] in {
+ def SALC : I<0xD6, RawFrm, (outs), (ins), "salc", []>, Requires<[Not64BitMode]>;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td
new file mode 100644
index 000000000000..7a2facf226d8
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -0,0 +1,2179 @@
+//===- X86InstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the various pseudo instructions used by the compiler,
+// as well as Pat patterns used during instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Pattern Matching Support
+
+def GetLo32XForm : SDNodeXForm<imm, [{
+ // Transformation function: get the low 32 bits.
+ return getI32Imm((uint32_t)N->getZExtValue(), SDLoc(N));
+}]>;
+
+
+//===----------------------------------------------------------------------===//
+// Random Pseudo Instructions.
+
+// PIC base construction. This expands to code that looks like this:
+// call $next_inst
+// popl %destreg
+let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP, SSP],
+ SchedRW = [WriteJump] in
+ def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label),
+ "", []>;
+
+// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
+let Defs = [ESP, EFLAGS, SSP], Uses = [ESP, SSP], SchedRW = [WriteALU] in {
+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs),
+ (ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
+ "#ADJCALLSTACKDOWN", []>, Requires<[NotLP64]>;
+def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKUP",
+ [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+ Requires<[NotLP64]>;
+}
+def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
+ (ADJCALLSTACKDOWN32 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[NotLP64]>;
+
+
+// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
+let Defs = [RSP, EFLAGS, SSP], Uses = [RSP, SSP], SchedRW = [WriteALU] in {
+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs),
+ (ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
+ "#ADJCALLSTACKDOWN", []>, Requires<[IsLP64]>;
+def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKUP",
+ [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+ Requires<[IsLP64]>;
+}
+def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
+ (ADJCALLSTACKDOWN64 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[IsLP64]>;
+
+let SchedRW = [WriteSystem] in {
+
+// x86-64 va_start lowering magic.
+let usesCustomInserter = 1, Defs = [EFLAGS] in {
+def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
+ (outs),
+ (ins GR8:$al,
+ i32imm:$regsavefi, i32imm:$offset,
+ variable_ops),
+ "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset",
+ [(X86vastart_save_xmm_regs GR8:$al,
+ timm:$regsavefi,
+ timm:$offset),
+ (implicit EFLAGS)]>;
+
+// The VAARG_64 and VAARG_X32 pseudo-instructions take the address of the
+// va_list, and place the address of the next argument into a register.
+let Defs = [EFLAGS] in {
+def VAARG_64 : I<0, Pseudo,
+ (outs GR64:$dst),
+ (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align),
+ "#VAARG_64 $dst, $ap, $size, $mode, $align",
+ [(set GR64:$dst,
+ (X86vaarg64 addr:$ap, timm:$size, timm:$mode, timm:$align)),
+ (implicit EFLAGS)]>, Requires<[In64BitMode, IsLP64]>;
+def VAARG_X32 : I<0, Pseudo,
+ (outs GR32:$dst),
+ (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align),
+ "#VAARG_X32 $dst, $ap, $size, $mode, $align",
+ [(set GR32:$dst,
+ (X86vaargx32 addr:$ap, timm:$size, timm:$mode, timm:$align)),
+ (implicit EFLAGS)]>, Requires<[In64BitMode, NotLP64]>;
+}
+
+// When using segmented stacks, these are lowered into instructions which first
+// check if the current stacklet has enough free memory. If it does, memory is
+// allocated by bumping the stack pointer. Otherwise memory is allocated from
+// the heap.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size),
+ "# variable sized alloca for segmented stacks",
+ [(set GR32:$dst,
+ (X86SegAlloca GR32:$size))]>,
+ Requires<[NotLP64]>;
+
+let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
+def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
+ "# variable sized alloca for segmented stacks",
+ [(set GR64:$dst,
+ (X86SegAlloca GR64:$size))]>,
+ Requires<[In64BitMode]>;
+
+// To protect against stack clash, dynamic allocation should perform a memory
+// probe at each page.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+def PROBED_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size),
+ "# variable sized alloca with probing",
+ [(set GR32:$dst,
+ (X86ProbedAlloca GR32:$size))]>,
+ Requires<[NotLP64]>;
+
+let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
+def PROBED_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
+ "# variable sized alloca with probing",
+ [(set GR64:$dst,
+ (X86ProbedAlloca GR64:$size))]>,
+ Requires<[In64BitMode]>;
+}
+
+let hasNoSchedulingInfo = 1 in
+def STACKALLOC_W_PROBING : I<0, Pseudo, (outs), (ins i64imm:$stacksize),
+ "# fixed size alloca with probing",
+ []>;
+
+// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows
+// targets. These calls are needed to probe the stack when allocating more than
+// 4k bytes in one go. Touching the stack at 4K increments is necessary to
+// ensure that the guard pages used by the OS virtual memory manager are
+// allocated in correct sequence.
+// The main point of having a separate instruction is the extra unmodelled
+// effects (compared to ordinary calls), such as the stack pointer change.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+def WIN_ALLOCA_32 : I<0, Pseudo, (outs), (ins GR32:$size),
+ "# dynamic stack allocation",
+ [(X86WinAlloca GR32:$size)]>,
+ Requires<[NotLP64]>;
+
+let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
+def WIN_ALLOCA_64 : I<0, Pseudo, (outs), (ins GR64:$size),
+ "# dynamic stack allocation",
+ [(X86WinAlloca GR64:$size)]>,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+// These instructions XOR the frame pointer into a GPR. They are used in some
+// stack protection schemes. These are post-RA pseudos because we only know the
+// frame register after register allocation.
+let Constraints = "$src = $dst", isMoveImm = 1, isPseudo = 1, Defs = [EFLAGS] in {
+ def XOR32_FP : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src),
+ "xorl\t$$FP, $src", []>,
+ Requires<[NotLP64]>, Sched<[WriteALU]>;
+ def XOR64_FP : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src),
+ "xorq\t$$FP $src", []>,
+ Requires<[In64BitMode]>, Sched<[WriteALU]>;
+}
+
+//===----------------------------------------------------------------------===//
+// EH Pseudo Instructions
+//
+let SchedRW = [WriteSystem] in {
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+ hasCtrlDep = 1, isCodeGenOnly = 1 in {
+def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr),
+ "ret\t#eh_return, addr: $addr",
+ [(X86ehret GR32:$addr)]>, Sched<[WriteJumpLd]>;
+
+}
+
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+ hasCtrlDep = 1, isCodeGenOnly = 1 in {
+def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr),
+ "ret\t#eh_return, addr: $addr",
+ [(X86ehret GR64:$addr)]>, Sched<[WriteJumpLd]>;
+
+}
+
+let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
+ isCodeGenOnly = 1, isReturn = 1, isEHScopeReturn = 1 in {
+ def CLEANUPRET : I<0, Pseudo, (outs), (ins), "# CLEANUPRET", [(cleanupret)]>;
+
+ // CATCHRET needs a custom inserter for SEH.
+ let usesCustomInserter = 1 in
+ def CATCHRET : I<0, Pseudo, (outs), (ins brtarget32:$dst, brtarget32:$from),
+ "# CATCHRET",
+ [(catchret bb:$dst, bb:$from)]>;
+}
+
+let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+ usesCustomInserter = 1 in {
+ def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf),
+ "#EH_SJLJ_SETJMP32",
+ [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
+ Requires<[Not64BitMode]>;
+ def EH_SjLj_SetJmp64 : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$buf),
+ "#EH_SJLJ_SETJMP64",
+ [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
+ Requires<[In64BitMode]>;
+ let isTerminator = 1 in {
+ def EH_SjLj_LongJmp32 : I<0, Pseudo, (outs), (ins i32mem:$buf),
+ "#EH_SJLJ_LONGJMP32",
+ [(X86eh_sjlj_longjmp addr:$buf)]>,
+ Requires<[Not64BitMode]>;
+ def EH_SjLj_LongJmp64 : I<0, Pseudo, (outs), (ins i64mem:$buf),
+ "#EH_SJLJ_LONGJMP64",
+ [(X86eh_sjlj_longjmp addr:$buf)]>,
+ Requires<[In64BitMode]>;
+ }
+}
+
+let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in {
+ def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst),
+ "#EH_SjLj_Setup\t$dst", []>;
+}
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions used by unwind info.
+//
+let isPseudo = 1, SchedRW = [WriteSystem] in {
+ def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg),
+ "#SEH_PushReg $reg", []>;
+ def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
+ "#SEH_SaveReg $reg, $dst", []>;
+ def SEH_SaveXMM : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
+ "#SEH_SaveXMM $reg, $dst", []>;
+ def SEH_StackAlloc : I<0, Pseudo, (outs), (ins i32imm:$size),
+ "#SEH_StackAlloc $size", []>;
+ def SEH_StackAlign : I<0, Pseudo, (outs), (ins i32imm:$align),
+ "#SEH_StackAlign $align", []>;
+ def SEH_SetFrame : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$offset),
+ "#SEH_SetFrame $reg, $offset", []>;
+ def SEH_PushFrame : I<0, Pseudo, (outs), (ins i1imm:$mode),
+ "#SEH_PushFrame $mode", []>;
+ def SEH_EndPrologue : I<0, Pseudo, (outs), (ins),
+ "#SEH_EndPrologue", []>;
+ def SEH_Epilogue : I<0, Pseudo, (outs), (ins),
+ "#SEH_Epilogue", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions used by segmented stacks.
+//
+
+// This is lowered into a RET instruction by MCInstLower. We need
+// this so that we don't have to have a MachineBasicBlock which ends
+// with a RET and also has successors.
+let isPseudo = 1, SchedRW = [WriteJumpLd] in {
+def MORESTACK_RET: I<0, Pseudo, (outs), (ins), "", []>;
+
+// This instruction is lowered to a RET followed by a MOV. The two
+// instructions are not generated at a higher level because the verifier
+// would then see a MachineBasicBlock ending with a non-terminator.
+def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), "", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Alias instruction mapping movr0 to xor.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
+let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
+ isPseudo = 1, isMoveImm = 1, AddedComplexity = 10 in
+def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+ [(set GR32:$dst, 0)]>, Sched<[WriteZero]>;
+
+// Other widths can also make use of the 32-bit xor, which may have a smaller
+// encoding and avoid partial register updates.
+let AddedComplexity = 10 in {
+def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>;
+def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>;
+def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)>;
+}
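+
+// For example, materializing (i64 0) selects MOV32r0 under a SUBREG_TO_REG
+// and is emitted as "xorl %eax, %eax" when EAX is chosen; a 32-bit write
+// implicitly zeroes the upper 32 bits, so no REX.W-prefixed form is needed.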
+
+let Predicates = [OptForSize, Not64BitMode],
+ AddedComplexity = 10 in {
+ let SchedRW = [WriteALU] in {
+ // Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC,
+ // which only require 3 bytes compared to MOV32ri which requires 5.
+ let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in {
+ def MOV32r1 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+ [(set GR32:$dst, 1)]>;
+ def MOV32r_1 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+ [(set GR32:$dst, -1)]>;
+ }
+ } // SchedRW
+
+ // MOV16ri is 4 bytes, so the instructions above are smaller.
+ def : Pat<(i16 1), (EXTRACT_SUBREG (MOV32r1), sub_16bit)>;
+ def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>;
+}
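+
+// For example, MOV32r1 is later expanded to "xorl %eax, %eax; incl %eax"
+// (for EAX): 2 bytes for the xor plus 1 byte for the inc in 32-bit mode,
+// versus 5 bytes for "movl $1, %eax". MOV32r_1 uses decl instead of incl.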
+
+let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 5,
+ SchedRW = [WriteALU] in {
+// AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1.
+def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "",
+ [(set GR32:$dst, i32immSExt8:$src)]>,
+ Requires<[OptForMinSize, NotWin64WithoutFP]>;
+def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "",
+ [(set GR64:$dst, i64immSExt8:$src)]>,
+ Requires<[OptForMinSize, NotWin64WithoutFP]>;
+}
+
+// Materialize an i64 constant where the top 32 bits are zero. This could
+// theoretically use MOV32ri with a SUBREG_TO_REG to represent the
+// zero-extension; however, that would make it more difficult to rematerialize.
+let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1,
+ isPseudo = 1, SchedRW = [WriteMove] in
+def MOV32ri64 : I<0, Pseudo, (outs GR64:$dst), (ins i64i32imm:$src), "",
+ [(set GR64:$dst, i64immZExt32:$src)]>;
+
+// This 64-bit pseudo-move can also be used for labels in the x86-64 small code
+// model.
+def mov64imm32 : ComplexPattern<i64, 1, "selectMOV64Imm32", [X86Wrapper]>;
+def : Pat<(i64 mov64imm32:$src), (MOV32ri64 mov64imm32:$src)>;
+
+// Use sbb to materialize carry bit.
+let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteADC],
+ hasSideEffects = 0 in {
+// FIXME: These are pseudo ops that should be replaced with Pat<> patterns.
+// However, Pat<> can't replicate the destination reg into the inputs of the
+// result.
+def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "", []>;
+def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "", []>;
+} // Uses = [EFLAGS], Defs = [EFLAGS]
+
+//===----------------------------------------------------------------------===//
+// String Pseudo Instructions
+//
+let SchedRW = [WriteMicrocoded] in {
+let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in {
+def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins),
+ "{rep;movsb (%esi), %es:(%edi)|rep movsb es:[edi], [esi]}",
+ [(X86rep_movs i8)]>, REP, AdSize32,
+ Requires<[NotLP64]>;
+def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins),
+ "{rep;movsw (%esi), %es:(%edi)|rep movsw es:[edi], [esi]}",
+ [(X86rep_movs i16)]>, REP, AdSize32, OpSize16,
+ Requires<[NotLP64]>;
+def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins),
+ "{rep;movsl (%esi), %es:(%edi)|rep movsd es:[edi], [esi]}",
+ [(X86rep_movs i32)]>, REP, AdSize32, OpSize32,
+ Requires<[NotLP64]>;
+def REP_MOVSQ_32 : RI<0xA5, RawFrm, (outs), (ins),
+ "{rep;movsq (%esi), %es:(%edi)|rep movsq es:[edi], [esi]}",
+ [(X86rep_movs i64)]>, REP, AdSize32,
+ Requires<[NotLP64, In64BitMode]>;
+}
+
+let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in {
+def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins),
+ "{rep;movsb (%rsi), %es:(%rdi)|rep movsb es:[rdi], [rsi]}",
+ [(X86rep_movs i8)]>, REP, AdSize64,
+ Requires<[IsLP64]>;
+def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins),
+ "{rep;movsw (%rsi), %es:(%rdi)|rep movsw es:[rdi], [rsi]}",
+ [(X86rep_movs i16)]>, REP, AdSize64, OpSize16,
+ Requires<[IsLP64]>;
+def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins),
+ "{rep;movsl (%rsi), %es:(%rdi)|rep movsdi es:[rdi], [rsi]}",
+ [(X86rep_movs i32)]>, REP, AdSize64, OpSize32,
+ Requires<[IsLP64]>;
+def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins),
+ "{rep;movsq (%rsi), %es:(%rdi)|rep movsq es:[rdi], [rsi]}",
+ [(X86rep_movs i64)]>, REP, AdSize64,
+ Requires<[IsLP64]>;
+}
+
+// FIXME: Should use "(X86rep_stos AL)" as the pattern.
+let Defs = [ECX,EDI], isCodeGenOnly = 1 in {
+ let Uses = [AL,ECX,EDI] in
+ def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins),
+ "{rep;stosb %al, %es:(%edi)|rep stosb es:[edi], al}",
+ [(X86rep_stos i8)]>, REP, AdSize32,
+ Requires<[NotLP64]>;
+ let Uses = [AX,ECX,EDI] in
+ def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins),
+ "{rep;stosw %ax, %es:(%edi)|rep stosw es:[edi], ax}",
+ [(X86rep_stos i16)]>, REP, AdSize32, OpSize16,
+ Requires<[NotLP64]>;
+ let Uses = [EAX,ECX,EDI] in
+ def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins),
+ "{rep;stosl %eax, %es:(%edi)|rep stosd es:[edi], eax}",
+ [(X86rep_stos i32)]>, REP, AdSize32, OpSize32,
+ Requires<[NotLP64]>;
+ let Uses = [RAX,RCX,RDI] in
+ def REP_STOSQ_32 : RI<0xAB, RawFrm, (outs), (ins),
+ "{rep;stosq %rax, %es:(%edi)|rep stosq es:[edi], rax}",
+ [(X86rep_stos i64)]>, REP, AdSize32,
+ Requires<[NotLP64, In64BitMode]>;
+}
+
+let Defs = [RCX,RDI], isCodeGenOnly = 1 in {
+ let Uses = [AL,RCX,RDI] in
+ def REP_STOSB_64 : I<0xAA, RawFrm, (outs), (ins),
+ "{rep;stosb %al, %es:(%rdi)|rep stosb es:[rdi], al}",
+ [(X86rep_stos i8)]>, REP, AdSize64,
+ Requires<[IsLP64]>;
+ let Uses = [AX,RCX,RDI] in
+ def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins),
+ "{rep;stosw %ax, %es:(%rdi)|rep stosw es:[rdi], ax}",
+ [(X86rep_stos i16)]>, REP, AdSize64, OpSize16,
+ Requires<[IsLP64]>;
+ let Uses = [RAX,RCX,RDI] in
+ def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins),
+ "{rep;stosl %eax, %es:(%rdi)|rep stosd es:[rdi], eax}",
+ [(X86rep_stos i32)]>, REP, AdSize64, OpSize32,
+ Requires<[IsLP64]>;
+
+ let Uses = [RAX,RCX,RDI] in
+ def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins),
+ "{rep;stosq %rax, %es:(%rdi)|rep stosq es:[rdi], rax}",
+ [(X86rep_stos i64)]>, REP, AdSize64,
+ Requires<[IsLP64]>;
+}
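+
+// Illustrative lowering: an inline-expanded memset(dst, c, n) can become
+//   movb c, %al ; movq n, %rcx ; movq dst, %rdi ; rep stosb
+// i.e. REP_STOSB_64; the wider forms may be used when the value and length
+// allow it.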
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Thread Local Storage Instructions
+//
+let SchedRW = [WriteSystem] in {
+
+// ELF TLS Support
+// All calls clobber the non-callee saved registers. ESP is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead.
+let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
+ ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS, DF],
+ usesCustomInserter = 1, Uses = [ESP, SSP] in {
+def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+ "# TLS_addr32",
+ [(X86tlsaddr tls32addr:$sym)]>,
+ Requires<[Not64BitMode]>;
+def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+ "# TLS_base_addr32",
+ [(X86tlsbaseaddr tls32baseaddr:$sym)]>,
+ Requires<[Not64BitMode]>;
+}
+
+// All calls clobber the non-callee saved registers. RSP is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead.
+let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+ FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
+ ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS, DF],
+ usesCustomInserter = 1, Uses = [RSP, SSP] in {
+def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+ "# TLS_addr64",
+ [(X86tlsaddr tls64addr:$sym)]>,
+ Requires<[In64BitMode, IsLP64]>;
+def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+ "# TLS_base_addr64",
+ [(X86tlsbaseaddr tls64baseaddr:$sym)]>,
+ Requires<[In64BitMode, IsLP64]>;
+def TLS_addrX32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+ "# TLS_addrX32",
+ [(X86tlsaddr tls32addr:$sym)]>,
+ Requires<[In64BitMode, NotLP64]>;
+def TLS_base_addrX32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+ "# TLS_base_addrX32",
+ [(X86tlsbaseaddr tls32baseaddr:$sym)]>,
+ Requires<[In64BitMode, NotLP64]>;
+}
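+
+// For reference, TLS_addr64 is printed as the canonical general-dynamic
+// sequence, roughly
+//   leaq sym@tlsgd(%rip), %rdi ; call __tls_get_addr@PLT
+// (with padding prefixes so the linker can relax it), returning the
+// variable's address in %rax.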
+
+// Darwin TLS Support
+// For i386, the address of the thunk is passed on the stack; on return, the
+// address of the variable is in %eax. %ecx is trashed during the function
+// call. All other registers are preserved.
+let Defs = [EAX, ECX, EFLAGS, DF],
+ Uses = [ESP, SSP],
+ usesCustomInserter = 1 in
+def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+ "# TLSCall_32",
+ [(X86TLSCall addr:$sym)]>,
+ Requires<[Not64BitMode]>;
+
+// For x86_64, the address of the thunk is passed in %rdi, but the
+// pseudo directly uses the symbol, so do not add an implicit use of
+// %rdi. The lowering will do the right thing with RDI.
+// On return the address of the variable is in %rax. All other
+// registers are preserved.
+let Defs = [RAX, EFLAGS, DF],
+ Uses = [RSP, SSP],
+ usesCustomInserter = 1 in
+def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+ "# TLSCall_64",
+ [(X86TLSCall addr:$sym)]>,
+ Requires<[In64BitMode]>;
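+
+// For reference, TLSCall_64 is lowered to roughly
+//   movq _var@TLVP(%rip), %rdi ; callq *(%rdi)
+// with the variable's address returned in %rax, matching the Defs above.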
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Conditional Move Pseudo Instructions
+
+// CMOV* - Used to implement the SELECT DAG operation. Expanded after
+// instruction selection into a branch sequence.
+multiclass CMOVrr_PSEUDO<RegisterClass RC, ValueType VT> {
+ def CMOV#NAME : I<0, Pseudo,
+ (outs RC:$dst), (ins RC:$t, RC:$f, i8imm:$cond),
+ "#CMOV_"#NAME#" PSEUDO!",
+ [(set RC:$dst, (VT (X86cmov RC:$t, RC:$f, timm:$cond,
+ EFLAGS)))]>;
+}
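+
+// In outline, the custom inserter expands each CMOV_* pseudo into a small
+// diamond/triangle: a JCC on $cond over a block, with a PHI in the join block
+// choosing between $t and $f. See X86TargetLowering::EmitLoweredSelect.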
+
+let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in {
+  // X86 doesn't have 8-bit conditional moves. Use a custom inserter to
+  // emit control flow. An alternative would be to mark i8 SELECT as Promote;
+  // however, that requires promoting the operands and can induce additional
+  // i8 register pressure.
+ defm _GR8 : CMOVrr_PSEUDO<GR8, i8>;
+
+ let Predicates = [NoCMov] in {
+ defm _GR32 : CMOVrr_PSEUDO<GR32, i32>;
+ defm _GR16 : CMOVrr_PSEUDO<GR16, i16>;
+ } // Predicates = [NoCMov]
+
+  // fcmov doesn't handle all possible EFLAGS combinations, so provide a
+  // fallback when there is no SSE1/SSE2.
+ let Predicates = [FPStackf32] in
+ defm _RFP32 : CMOVrr_PSEUDO<RFP32, f32>;
+
+ let Predicates = [FPStackf64] in
+ defm _RFP64 : CMOVrr_PSEUDO<RFP64, f64>;
+
+ defm _RFP80 : CMOVrr_PSEUDO<RFP80, f80>;
+
+ let Predicates = [HasMMX] in
+ defm _VR64 : CMOVrr_PSEUDO<VR64, x86mmx>;
+
+ let Predicates = [HasSSE1,NoAVX512] in
+ defm _FR32 : CMOVrr_PSEUDO<FR32, f32>;
+ let Predicates = [HasSSE2,NoAVX512] in
+ defm _FR64 : CMOVrr_PSEUDO<FR64, f64>;
+ let Predicates = [HasAVX512] in {
+ defm _FR32X : CMOVrr_PSEUDO<FR32X, f32>;
+ defm _FR64X : CMOVrr_PSEUDO<FR64X, f64>;
+ }
+ let Predicates = [NoVLX] in {
+ defm _VR128 : CMOVrr_PSEUDO<VR128, v2i64>;
+ defm _VR256 : CMOVrr_PSEUDO<VR256, v4i64>;
+ }
+ let Predicates = [HasVLX] in {
+ defm _VR128X : CMOVrr_PSEUDO<VR128X, v2i64>;
+ defm _VR256X : CMOVrr_PSEUDO<VR256X, v4i64>;
+ }
+ defm _VR512 : CMOVrr_PSEUDO<VR512, v8i64>;
+ defm _VK1 : CMOVrr_PSEUDO<VK1, v1i1>;
+ defm _VK2 : CMOVrr_PSEUDO<VK2, v2i1>;
+ defm _VK4 : CMOVrr_PSEUDO<VK4, v4i1>;
+ defm _VK8 : CMOVrr_PSEUDO<VK8, v8i1>;
+ defm _VK16 : CMOVrr_PSEUDO<VK16, v16i1>;
+ defm _VK32 : CMOVrr_PSEUDO<VK32, v32i1>;
+ defm _VK64 : CMOVrr_PSEUDO<VK64, v64i1>;
+} // usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS]
+
+def : Pat<(f128 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
+
+let Predicates = [NoVLX] in {
+ def : Pat<(v16i8 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
+ def : Pat<(v8i16 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
+ def : Pat<(v4i32 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
+ def : Pat<(v4f32 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
+ def : Pat<(v2f64 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
+
+ def : Pat<(v32i8 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>;
+ def : Pat<(v16i16 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>;
+ def : Pat<(v8i32 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>;
+ def : Pat<(v8f32 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>;
+ def : Pat<(v4f64 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>;
+}
+let Predicates = [HasVLX] in {
+ def : Pat<(v16i8 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
+ def : Pat<(v8i16 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
+ def : Pat<(v4i32 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
+ def : Pat<(v4f32 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
+ def : Pat<(v2f64 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
+
+ def : Pat<(v32i8 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
+ def : Pat<(v16i16 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
+ def : Pat<(v8i32 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
+ def : Pat<(v8f32 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
+ def : Pat<(v4f64 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
+}
+
+def : Pat<(v64i8 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
+def : Pat<(v32i16 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
+def : Pat<(v16i32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
+def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
+def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
+
+//===----------------------------------------------------------------------===//
+// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+// FIXME: Use normal instructions and add lock prefix dynamically.
+
+// Memory barriers
+
+let isCodeGenOnly = 1, Defs = [EFLAGS] in
+def OR32mi8Locked : Ii8<0x83, MRM1m, (outs), (ins i32mem:$dst, i32i8imm:$zero),
+ "or{l}\t{$zero, $dst|$dst, $zero}", []>,
+ Requires<[Not64BitMode]>, OpSize32, LOCK,
+ Sched<[WriteALURMW]>;
+
+let hasSideEffects = 1 in
+def Int_MemBarrier : I<0, Pseudo, (outs), (ins),
+ "#MEMBARRIER",
+ [(X86MemBarrier)]>, Sched<[WriteLoad]>;
+
+// RegOpc corresponds to the mr version of the instruction
+// ImmOpc corresponds to the mi version of the instruction
+// ImmOpc8 corresponds to the mi8 version of the instruction
+// ImmMod corresponds to the instruction format of the mi and mi8 versions
+multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8,
+ Format ImmMod, SDNode Op, string mnemonic> {
+let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
+ SchedRW = [WriteALURMW] in {
+
+def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+ RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 },
+ MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+ !strconcat(mnemonic, "{b}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, GR8:$src2))]>, LOCK;
+
+def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+ RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+ MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ !strconcat(mnemonic, "{w}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, GR16:$src2))]>,
+ OpSize16, LOCK;
+
+def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+ RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+ MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ !strconcat(mnemonic, "{l}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, GR32:$src2))]>,
+ OpSize32, LOCK;
+
+def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+ RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+ MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ !strconcat(mnemonic, "{q}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, GR64:$src2))]>, LOCK;
+
+// NOTE: These are order-specific; we want the mi8 forms to be listed
+// first so that they are slightly preferred to the mi forms.
+def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+ ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+ ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
+ !strconcat(mnemonic, "{w}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, i16immSExt8:$src2))]>,
+ OpSize16, LOCK;
+
+def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+ ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+ ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
+ !strconcat(mnemonic, "{l}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, i32immSExt8:$src2))]>,
+ OpSize32, LOCK;
+
+def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+ ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+ ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
+ !strconcat(mnemonic, "{q}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, i64immSExt8:$src2))]>,
+ LOCK;
+
+def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+ ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
+ ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
+ !strconcat(mnemonic, "{b}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, (i8 imm:$src2)))]>, LOCK;
+
+def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+ ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+ ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
+ !strconcat(mnemonic, "{w}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, (i16 imm:$src2)))]>,
+ OpSize16, LOCK;
+
+def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+ ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+ ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2),
+ !strconcat(mnemonic, "{l}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, (i32 imm:$src2)))]>,
+ OpSize32, LOCK;
+
+def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+ ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+ ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
+ !strconcat(mnemonic, "{q}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, i64immSExt32:$src2))]>,
+ LOCK;
+}
+
+}
+
+defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, X86lock_add, "add">;
+defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, X86lock_sub, "sub">;
+defm LOCK_OR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, X86lock_or , "or">;
+defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, X86lock_and, "and">;
+defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, X86lock_xor, "xor">;
+
+def X86lock_add_nocf : PatFrag<(ops node:$lhs, node:$rhs),
+ (X86lock_add node:$lhs, node:$rhs), [{
+ return hasNoCarryFlagUses(SDValue(N, 0));
+}]>;
+
+def X86lock_sub_nocf : PatFrag<(ops node:$lhs, node:$rhs),
+ (X86lock_sub node:$lhs, node:$rhs), [{
+ return hasNoCarryFlagUses(SDValue(N, 0));
+}]>;
+
+let Predicates = [UseIncDec] in {
+ let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
+ SchedRW = [WriteALURMW] in {
+ def LOCK_INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst),
+ "inc{b}\t$dst",
+ [(set EFLAGS, (X86lock_add_nocf addr:$dst, (i8 1)))]>,
+ LOCK;
+ def LOCK_INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst),
+ "inc{w}\t$dst",
+ [(set EFLAGS, (X86lock_add_nocf addr:$dst, (i16 1)))]>,
+ OpSize16, LOCK;
+ def LOCK_INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst),
+ "inc{l}\t$dst",
+ [(set EFLAGS, (X86lock_add_nocf addr:$dst, (i32 1)))]>,
+ OpSize32, LOCK;
+ def LOCK_INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst),
+ "inc{q}\t$dst",
+ [(set EFLAGS, (X86lock_add_nocf addr:$dst, (i64 1)))]>,
+ LOCK;
+
+ def LOCK_DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst),
+ "dec{b}\t$dst",
+ [(set EFLAGS, (X86lock_sub_nocf addr:$dst, (i8 1)))]>,
+ LOCK;
+ def LOCK_DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst),
+ "dec{w}\t$dst",
+ [(set EFLAGS, (X86lock_sub_nocf addr:$dst, (i16 1)))]>,
+ OpSize16, LOCK;
+ def LOCK_DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst),
+ "dec{l}\t$dst",
+ [(set EFLAGS, (X86lock_sub_nocf addr:$dst, (i32 1)))]>,
+ OpSize32, LOCK;
+ def LOCK_DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst),
+ "dec{q}\t$dst",
+ [(set EFLAGS, (X86lock_sub_nocf addr:$dst, (i64 1)))]>,
+ LOCK;
+ }
+
+ // Additional patterns for -1 constant.
+ def : Pat<(X86lock_add addr:$dst, (i8 -1)), (LOCK_DEC8m addr:$dst)>;
+ def : Pat<(X86lock_add addr:$dst, (i16 -1)), (LOCK_DEC16m addr:$dst)>;
+ def : Pat<(X86lock_add addr:$dst, (i32 -1)), (LOCK_DEC32m addr:$dst)>;
+ def : Pat<(X86lock_add addr:$dst, (i64 -1)), (LOCK_DEC64m addr:$dst)>;
+ def : Pat<(X86lock_sub addr:$dst, (i8 -1)), (LOCK_INC8m addr:$dst)>;
+ def : Pat<(X86lock_sub addr:$dst, (i16 -1)), (LOCK_INC16m addr:$dst)>;
+ def : Pat<(X86lock_sub addr:$dst, (i32 -1)), (LOCK_INC32m addr:$dst)>;
+ def : Pat<(X86lock_sub addr:$dst, (i64 -1)), (LOCK_INC64m addr:$dst)>;
+}
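+
+// For example, an atomic fetch_add of +1 whose carry-flag result is unused
+// (the usual case) selects LOCK_INC32m and is emitted as "lock incl (mem)",
+// which is one byte shorter than "lock addl $1, (mem)".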
+
+// Atomic compare and swap.
+multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
+ string mnemonic, SDPatternOperator frag> {
+let isCodeGenOnly = 1, SchedRW = [WriteCMPXCHGRMW] in {
+ let Defs = [AL, EFLAGS], Uses = [AL] in
+ def NAME#8 : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap),
+ !strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR8:$swap, 1)]>, TB, LOCK;
+ let Defs = [AX, EFLAGS], Uses = [AX] in
+ def NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap),
+ !strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR16:$swap, 2)]>, TB, OpSize16, LOCK;
+ let Defs = [EAX, EFLAGS], Uses = [EAX] in
+ def NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap),
+ !strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR32:$swap, 4)]>, TB, OpSize32, LOCK;
+ let Defs = [RAX, EFLAGS], Uses = [RAX] in
+ def NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap),
+ !strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR64:$swap, 8)]>, TB, LOCK;
+}
+}
+
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
+ Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW],
+ isCodeGenOnly = 1, usesCustomInserter = 1 in {
+def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr),
+ "cmpxchg8b\t$ptr",
+ [(X86cas8 addr:$ptr)]>, TB, LOCK;
+}
+
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
+ Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
+ isCodeGenOnly = 1, mayLoad = 1, mayStore = 1, hasSideEffects = 0 in {
+def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr),
+ "cmpxchg16b\t$ptr",
+ []>, TB, LOCK;
+}
+
+// This pseudo must be used when the frame uses RBX as
+// the base pointer. Indeed, in such a situation RBX is a reserved
+// register and the register allocator will ignore any use/def of
+// it. In other words, the register allocator will not fix the clobbering of
+// RBX that will happen when setting the arguments for the instruction.
+//
+// Unlike the actual related instruction, we mark that this one
+// defines RBX (instead of using RBX).
+// The rationale is that we will define RBX during the expansion of
+// the pseudo. The argument feeding RBX is rbx_input.
+//
+// The additional argument, $rbx_save, is a temporary register used to
+// save the value of RBX across the actual instruction.
+//
+// To make sure the register assigned to $rbx_save does not interfere with
+// the definition of the actual instruction, we use a definition $dst which
+// is tied to $rbx_save. That way, the live-range of $rbx_save spans across
+// the instruction and we are sure we will have a valid register to restore
+// the value of RBX.
+let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX],
+ Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
+ isCodeGenOnly = 1, isPseudo = 1,
+ mayLoad = 1, mayStore = 1, hasSideEffects = 0,
+ Constraints = "$rbx_save = $dst" in {
+def LCMPXCHG16B_SAVE_RBX :
+ I<0, Pseudo, (outs GR64:$dst),
+ (ins i128mem:$ptr, GR64:$rbx_input, GR64:$rbx_save), "", []>;
+}
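+
+// Expansion sketch: $rbx_save holds the incoming base-pointer value of RBX;
+// the expansion moves $rbx_input into RBX, issues the real cmpxchg16b, and
+// then restores RBX from $rbx_save (kept live through the tied $dst).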
+
+// Pseudo instruction that doesn't read/write RBX. Will be turned into either
+// LCMPXCHG16B_SAVE_RBX or LCMPXCHG16B via a custom inserter.
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RCX, RDX],
+ Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
+ isCodeGenOnly = 1, isPseudo = 1,
+ mayLoad = 1, mayStore = 1, hasSideEffects = 0,
+ usesCustomInserter = 1 in {
+def LCMPXCHG16B_NO_RBX :
+ I<0, Pseudo, (outs), (ins i128mem:$ptr, GR64:$rbx_input), "",
+ [(X86cas16 addr:$ptr, GR64:$rbx_input)]>;
+}
+
+// This pseudo must be used when the frame uses RBX/EBX as
+// the base pointer. See the comment for LCMPXCHG16B_SAVE_RBX above.
+let Defs = [EBX], Uses = [ECX, EAX],
+ Predicates = [HasMWAITX], SchedRW = [WriteSystem],
+ isCodeGenOnly = 1, isPseudo = 1, Constraints = "$rbx_save = $dst" in {
+def MWAITX_SAVE_RBX :
+ I<0, Pseudo, (outs GR64:$dst),
+ (ins GR32:$ebx_input, GR64:$rbx_save),
+ "mwaitx",
+ []>;
+}
+
+// Pseudo mwaitx instruction to use for custom insertion.
+let Predicates = [HasMWAITX], SchedRW = [WriteSystem],
+ isCodeGenOnly = 1, isPseudo = 1,
+ usesCustomInserter = 1 in {
+def MWAITX :
+ I<0, Pseudo, (outs), (ins GR32:$ecx, GR32:$eax, GR32:$ebx),
+ "mwaitx",
+ [(int_x86_mwaitx GR32:$ecx, GR32:$eax, GR32:$ebx)]>;
+}
+
+
+defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg", X86cas>;
+
+// Atomic exchange and add
+multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
+ string frag> {
+ let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1,
+ SchedRW = [WriteALURMW] in {
+ def NAME#8 : I<opc8, MRMSrcMem, (outs GR8:$dst),
+ (ins GR8:$val, i8mem:$ptr),
+ !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
+ [(set GR8:$dst,
+ (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))]>;
+ def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$val, i16mem:$ptr),
+ !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR16:$dst,
+ (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))]>,
+ OpSize16;
+ def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$val, i32mem:$ptr),
+ !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR32:$dst,
+ (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))]>,
+ OpSize32;
+ def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$val, i64mem:$ptr),
+ !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR64:$dst,
+ (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))]>;
+ }
+}
+
+defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add">, TB, LOCK;
+
+/* The following multiclass tries to make sure that in code like
+ * x.store (immediate op x.load(acquire), release)
+ * and
+ * x.store (register op x.load(acquire), release)
+ * an operation directly on memory is generated instead of wasting a register.
+ * It is not automatic as atomic_store/load are only lowered to MOV instructions
+ * extremely late to prevent them from being accidentally reordered in the backend
+ * (see below the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions)
+ */
+multiclass RELEASE_BINOP_MI<string Name, SDNode op> {
+ def : Pat<(atomic_store_8 addr:$dst,
+ (op (atomic_load_8 addr:$dst), (i8 imm:$src))),
+ (!cast<Instruction>(Name#"8mi") addr:$dst, imm:$src)>;
+ def : Pat<(atomic_store_16 addr:$dst,
+ (op (atomic_load_16 addr:$dst), (i16 imm:$src))),
+ (!cast<Instruction>(Name#"16mi") addr:$dst, imm:$src)>;
+ def : Pat<(atomic_store_32 addr:$dst,
+ (op (atomic_load_32 addr:$dst), (i32 imm:$src))),
+ (!cast<Instruction>(Name#"32mi") addr:$dst, imm:$src)>;
+ def : Pat<(atomic_store_64 addr:$dst,
+ (op (atomic_load_64 addr:$dst), (i64immSExt32:$src))),
+ (!cast<Instruction>(Name#"64mi32") addr:$dst, (i64immSExt32:$src))>;
+
+ def : Pat<(atomic_store_8 addr:$dst,
+ (op (atomic_load_8 addr:$dst), (i8 GR8:$src))),
+ (!cast<Instruction>(Name#"8mr") addr:$dst, GR8:$src)>;
+ def : Pat<(atomic_store_16 addr:$dst,
+ (op (atomic_load_16 addr:$dst), (i16 GR16:$src))),
+ (!cast<Instruction>(Name#"16mr") addr:$dst, GR16:$src)>;
+ def : Pat<(atomic_store_32 addr:$dst,
+ (op (atomic_load_32 addr:$dst), (i32 GR32:$src))),
+ (!cast<Instruction>(Name#"32mr") addr:$dst, GR32:$src)>;
+ def : Pat<(atomic_store_64 addr:$dst,
+ (op (atomic_load_64 addr:$dst), (i64 GR64:$src))),
+ (!cast<Instruction>(Name#"64mr") addr:$dst, GR64:$src)>;
+}
+defm : RELEASE_BINOP_MI<"ADD", add>;
+defm : RELEASE_BINOP_MI<"AND", and>;
+defm : RELEASE_BINOP_MI<"OR", or>;
+defm : RELEASE_BINOP_MI<"XOR", xor>;
+defm : RELEASE_BINOP_MI<"SUB", sub>;
+
+// Atomic load + floating point patterns.
+// FIXME: This could also handle SIMD operations with *ps and *pd instructions.
+multiclass ATOMIC_LOAD_FP_BINOP_MI<string Name, SDNode op> {
+ def : Pat<(op FR32:$src1, (bitconvert (i32 (atomic_load_32 addr:$src2)))),
+ (!cast<Instruction>(Name#"SSrm") FR32:$src1, addr:$src2)>,
+ Requires<[UseSSE1]>;
+ def : Pat<(op FR32:$src1, (bitconvert (i32 (atomic_load_32 addr:$src2)))),
+ (!cast<Instruction>("V"#Name#"SSrm") FR32:$src1, addr:$src2)>,
+ Requires<[UseAVX]>;
+ def : Pat<(op FR32X:$src1, (bitconvert (i32 (atomic_load_32 addr:$src2)))),
+ (!cast<Instruction>("V"#Name#"SSZrm") FR32X:$src1, addr:$src2)>,
+ Requires<[HasAVX512]>;
+
+ def : Pat<(op FR64:$src1, (bitconvert (i64 (atomic_load_64 addr:$src2)))),
+ (!cast<Instruction>(Name#"SDrm") FR64:$src1, addr:$src2)>,
+ Requires<[UseSSE2]>;
+ def : Pat<(op FR64:$src1, (bitconvert (i64 (atomic_load_64 addr:$src2)))),
+ (!cast<Instruction>("V"#Name#"SDrm") FR64:$src1, addr:$src2)>,
+ Requires<[UseAVX]>;
+ def : Pat<(op FR64X:$src1, (bitconvert (i64 (atomic_load_64 addr:$src2)))),
+ (!cast<Instruction>("V"#Name#"SDZrm") FR64X:$src1, addr:$src2)>,
+ Requires<[HasAVX512]>;
+}
+defm : ATOMIC_LOAD_FP_BINOP_MI<"ADD", fadd>;
+// FIXME: Add fsub, fmul, fdiv, ...
+
+multiclass RELEASE_UNOP<string Name, dag dag8, dag dag16, dag dag32,
+ dag dag64> {
+ def : Pat<(atomic_store_8 addr:$dst, dag8),
+ (!cast<Instruction>(Name#8m) addr:$dst)>;
+ def : Pat<(atomic_store_16 addr:$dst, dag16),
+ (!cast<Instruction>(Name#16m) addr:$dst)>;
+ def : Pat<(atomic_store_32 addr:$dst, dag32),
+ (!cast<Instruction>(Name#32m) addr:$dst)>;
+ def : Pat<(atomic_store_64 addr:$dst, dag64),
+ (!cast<Instruction>(Name#64m) addr:$dst)>;
+}
+
+let Predicates = [UseIncDec] in {
+ defm : RELEASE_UNOP<"INC",
+ (add (atomic_load_8 addr:$dst), (i8 1)),
+ (add (atomic_load_16 addr:$dst), (i16 1)),
+ (add (atomic_load_32 addr:$dst), (i32 1)),
+ (add (atomic_load_64 addr:$dst), (i64 1))>;
+ defm : RELEASE_UNOP<"DEC",
+ (add (atomic_load_8 addr:$dst), (i8 -1)),
+ (add (atomic_load_16 addr:$dst), (i16 -1)),
+ (add (atomic_load_32 addr:$dst), (i32 -1)),
+ (add (atomic_load_64 addr:$dst), (i64 -1))>;
+}
+
+defm : RELEASE_UNOP<"NEG",
+ (ineg (i8 (atomic_load_8 addr:$dst))),
+ (ineg (i16 (atomic_load_16 addr:$dst))),
+ (ineg (i32 (atomic_load_32 addr:$dst))),
+ (ineg (i64 (atomic_load_64 addr:$dst)))>;
+defm : RELEASE_UNOP<"NOT",
+ (not (i8 (atomic_load_8 addr:$dst))),
+ (not (i16 (atomic_load_16 addr:$dst))),
+ (not (i32 (atomic_load_32 addr:$dst))),
+ (not (i64 (atomic_load_64 addr:$dst)))>;
+
+def : Pat<(atomic_store_8 addr:$dst, (i8 imm:$src)),
+ (MOV8mi addr:$dst, imm:$src)>;
+def : Pat<(atomic_store_16 addr:$dst, (i16 imm:$src)),
+ (MOV16mi addr:$dst, imm:$src)>;
+def : Pat<(atomic_store_32 addr:$dst, (i32 imm:$src)),
+ (MOV32mi addr:$dst, imm:$src)>;
+def : Pat<(atomic_store_64 addr:$dst, (i64immSExt32:$src)),
+ (MOV64mi32 addr:$dst, i64immSExt32:$src)>;
+
+def : Pat<(atomic_store_8 addr:$dst, GR8:$src),
+ (MOV8mr addr:$dst, GR8:$src)>;
+def : Pat<(atomic_store_16 addr:$dst, GR16:$src),
+ (MOV16mr addr:$dst, GR16:$src)>;
+def : Pat<(atomic_store_32 addr:$dst, GR32:$src),
+ (MOV32mr addr:$dst, GR32:$src)>;
+def : Pat<(atomic_store_64 addr:$dst, GR64:$src),
+ (MOV64mr addr:$dst, GR64:$src)>;
+
+def : Pat<(i8 (atomic_load_8 addr:$src)), (MOV8rm addr:$src)>;
+def : Pat<(i16 (atomic_load_16 addr:$src)), (MOV16rm addr:$src)>;
+def : Pat<(i32 (atomic_load_32 addr:$src)), (MOV32rm addr:$src)>;
+def : Pat<(i64 (atomic_load_64 addr:$src)), (MOV64rm addr:$src)>;
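+
+// These work because, on x86, ordinary aligned loads and stores are already
+// atomic and provide acquire/release ordering; only seq_cst stores need
+// something stronger, and those are handled elsewhere (typically via XCHG).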
+
+// Floating point loads/stores.
+def : Pat<(atomic_store_32 addr:$dst, (i32 (bitconvert (f32 FR32:$src)))),
+ (MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>;
+def : Pat<(atomic_store_32 addr:$dst, (i32 (bitconvert (f32 FR32:$src)))),
+ (VMOVSSmr addr:$dst, FR32:$src)>, Requires<[UseAVX]>;
+def : Pat<(atomic_store_32 addr:$dst, (i32 (bitconvert (f32 FR32:$src)))),
+ (VMOVSSZmr addr:$dst, FR32:$src)>, Requires<[HasAVX512]>;
+
+def : Pat<(atomic_store_64 addr:$dst, (i64 (bitconvert (f64 FR64:$src)))),
+ (MOVSDmr addr:$dst, FR64:$src)>, Requires<[UseSSE2]>;
+def : Pat<(atomic_store_64 addr:$dst, (i64 (bitconvert (f64 FR64:$src)))),
+ (VMOVSDmr addr:$dst, FR64:$src)>, Requires<[UseAVX]>;
+def : Pat<(atomic_store_64 addr:$dst, (i64 (bitconvert (f64 FR64:$src)))),
+ (VMOVSDmr addr:$dst, FR64:$src)>, Requires<[HasAVX512]>;
+
+def : Pat<(f32 (bitconvert (i32 (atomic_load_32 addr:$src)))),
+ (MOVSSrm_alt addr:$src)>, Requires<[UseSSE1]>;
+def : Pat<(f32 (bitconvert (i32 (atomic_load_32 addr:$src)))),
+ (VMOVSSrm_alt addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(f32 (bitconvert (i32 (atomic_load_32 addr:$src)))),
+ (VMOVSSZrm_alt addr:$src)>, Requires<[HasAVX512]>;
+
+def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))),
+ (MOVSDrm_alt addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))),
+ (VMOVSDrm_alt addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))),
+ (VMOVSDZrm_alt addr:$src)>, Requires<[HasAVX512]>;
+
+//===----------------------------------------------------------------------===//
+// DAG Pattern Matching Rules
+//===----------------------------------------------------------------------===//
+
+// Use AND/OR to store 0/-1 in memory when optimizing for minsize. This saves
+// binary size compared to a regular MOV, but it introduces an unnecessary
+// load, so is not suitable for regular or optsize functions.
+let Predicates = [OptForMinSize] in {
+def : Pat<(simple_store (i16 0), addr:$dst), (AND16mi8 addr:$dst, 0)>;
+def : Pat<(simple_store (i32 0), addr:$dst), (AND32mi8 addr:$dst, 0)>;
+def : Pat<(simple_store (i64 0), addr:$dst), (AND64mi8 addr:$dst, 0)>;
+def : Pat<(simple_store (i16 -1), addr:$dst), (OR16mi8 addr:$dst, -1)>;
+def : Pat<(simple_store (i32 -1), addr:$dst), (OR32mi8 addr:$dst, -1)>;
+def : Pat<(simple_store (i64 -1), addr:$dst), (OR64mi8 addr:$dst, -1)>;
+}
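+
+// For example, "movl $0, (mem)" carries a 4-byte immediate (C7 /0 imm32),
+// while "andl $0, (mem)" uses the sign-extended 8-bit form (83 /4 imm8),
+// saving 3 bytes at the cost of turning the store into a load-modify-store.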
+
+// In the kernel code model, we can get the address of a label
+// into a register with 'movq'. FIXME: This is a hack; the 'imm' predicate of
+// MOV64ri32 should accept these.
+def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
+ (MOV64ri32 tconstpool :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
+ (MOV64ri32 tjumptable :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+ (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+ (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper mcsym:$dst)),
+ (MOV64ri32 mcsym:$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
+ (MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>;
+
+// With the small code model and static relocation (-static), it is safe to
+// store global addresses directly as immediates. FIXME: This is really a
+// hack; the 'imm' predicate for MOV64mi32 should handle this sort of thing.
+def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tconstpool:$src)>,
+ Requires<[NearData, IsNotPIC]>;
+def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tjumptable:$src)>,
+ Requires<[NearData, IsNotPIC]>;
+def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tglobaladdr:$src)>,
+ Requires<[NearData, IsNotPIC]>;
+def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, texternalsym:$src)>,
+ Requires<[NearData, IsNotPIC]>;
+def : Pat<(store (i64 (X86Wrapper mcsym:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, mcsym:$src)>,
+ Requires<[NearData, IsNotPIC]>;
+def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tblockaddress:$src)>,
+ Requires<[NearData, IsNotPIC]>;
+
+def : Pat<(i32 (X86RecoverFrameAlloc mcsym:$dst)), (MOV32ri mcsym:$dst)>;
+def : Pat<(i64 (X86RecoverFrameAlloc mcsym:$dst)), (MOV64ri mcsym:$dst)>;
+
+// Calls
+
+// tls has some funny stuff here...
+// This corresponds to movabs $foo@tpoff, %rax
+def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)),
+ (MOV64ri32 tglobaltlsaddr :$dst)>;
+// This corresponds to add $foo@tpoff, %rax
+def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)),
+ (ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>;
+
+
+// Direct PC relative function call for small code model. 32-bit displacement
+// sign extended to 64-bit.
+def : Pat<(X86call (i64 tglobaladdr:$dst)),
+ (CALL64pcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i64 texternalsym:$dst)),
+ (CALL64pcrel32 texternalsym:$dst)>;
+
+// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they
+// can never use callee-saved registers. That is the purpose of the GR64_TC
+// register classes.
+//
+// The only volatile register that is never used by the calling convention is
+// %r11; the worst case is a call to a vararg function with 6 arguments, which
+// uses every other volatile argument register.
+//
+// Match an X86tcret that uses less than 7 volatile registers.
+def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
+ (X86tcret node:$ptr, node:$off), [{
+ // X86tcret args: (*chain, ptr, imm, regs..., glue)
+ unsigned NumRegs = 0;
+ for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i)
+ if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6)
+ return false;
+ return true;
+}]>;
+
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
+ (TCRETURNri ptr_rc_tailcall:$dst, timm:$off)>,
+ Requires<[Not64BitMode, NotUseIndirectThunkCalls]>;
+
+// FIXME: This is disabled for 32-bit PIC mode because the global base
+// register which is part of the address mode may be assigned a
+// callee-saved register.
+def : Pat<(X86tcret (load addr:$dst), timm:$off),
+ (TCRETURNmi addr:$dst, timm:$off)>,
+ Requires<[Not64BitMode, IsNotPIC, NotUseIndirectThunkCalls]>;
+
+def : Pat<(X86tcret (i32 tglobaladdr:$dst), timm:$off),
+ (TCRETURNdi tglobaladdr:$dst, timm:$off)>,
+ Requires<[NotLP64]>;
+
+def : Pat<(X86tcret (i32 texternalsym:$dst), timm:$off),
+ (TCRETURNdi texternalsym:$dst, timm:$off)>,
+ Requires<[NotLP64]>;
+
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
+ (TCRETURNri64 ptr_rc_tailcall:$dst, timm:$off)>,
+ Requires<[In64BitMode, NotUseIndirectThunkCalls]>;
+
+// Don't fold loads into X86tcret requiring more than 6 regs.
+// There wouldn't be enough scratch registers for base+index.
+def : Pat<(X86tcret_6regs (load addr:$dst), timm:$off),
+ (TCRETURNmi64 addr:$dst, timm:$off)>,
+ Requires<[In64BitMode, NotUseIndirectThunkCalls]>;
+
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
+ (INDIRECT_THUNK_TCRETURN64 ptr_rc_tailcall:$dst, timm:$off)>,
+ Requires<[In64BitMode, UseIndirectThunkCalls]>;
+
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
+ (INDIRECT_THUNK_TCRETURN32 ptr_rc_tailcall:$dst, timm:$off)>,
+ Requires<[Not64BitMode, UseIndirectThunkCalls]>;
+
+def : Pat<(X86tcret (i64 tglobaladdr:$dst), timm:$off),
+ (TCRETURNdi64 tglobaladdr:$dst, timm:$off)>,
+ Requires<[IsLP64]>;
+
+def : Pat<(X86tcret (i64 texternalsym:$dst), timm:$off),
+ (TCRETURNdi64 texternalsym:$dst, timm:$off)>,
+ Requires<[IsLP64]>;
+
+// Normal calls, with various flavors of addresses.
+def : Pat<(X86call (i32 tglobaladdr:$dst)),
+ (CALLpcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i32 texternalsym:$dst)),
+ (CALLpcrel32 texternalsym:$dst)>;
+def : Pat<(X86call (i32 imm:$dst)),
+ (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>;
+
+// Comparisons.
+
+// TEST R,R is smaller than CMP R,0
+def : Pat<(X86cmp GR8:$src1, 0),
+ (TEST8rr GR8:$src1, GR8:$src1)>;
+def : Pat<(X86cmp GR16:$src1, 0),
+ (TEST16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(X86cmp GR32:$src1, 0),
+ (TEST32rr GR32:$src1, GR32:$src1)>;
+def : Pat<(X86cmp GR64:$src1, 0),
+ (TEST64rr GR64:$src1, GR64:$src1)>;
+
+// zextload bool -> zextload byte
+// An i1 is stored in one byte in zero-extended form; the upper bits are
+// expected to have been cleaned up before the store.
+def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>;
+def : Pat<(zextloadi16i1 addr:$src),
+ (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>;
+def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
+def : Pat<(zextloadi64i1 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+
+// extload bool -> extload byte
+// When extloading from 16-bit and smaller memory locations into 64-bit
+// registers, use zero-extending loads so that the entire 64-bit register is
+// defined, avoiding partial-register updates.
+
+def : Pat<(extloadi8i1 addr:$src), (MOV8rm addr:$src)>;
+def : Pat<(extloadi16i1 addr:$src),
+ (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>;
+def : Pat<(extloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
+def : Pat<(extloadi16i8 addr:$src),
+ (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>;
+def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>;
+def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
+
+// For other extloads, use subregs, since the high contents of the register are
+// defined after an extload.
+// NOTE: The extloadi64i32 pattern needs to be first as it will try to form
+// 32-bit loads for 4 byte aligned i8/i16 loads.
+def : Pat<(extloadi64i32 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
+def : Pat<(extloadi64i1 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+def : Pat<(extloadi64i8 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+def : Pat<(extloadi64i16 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
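+
+// For example, (zextloadi64i1 addr) becomes a plain "movzbl (mem), %eax";
+// since the 32-bit movz already zeroes bits 63:32, the 64-bit result only
+// needs a SUBREG_TO_REG wrapper rather than a separate extension.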
+
+// anyext. Define these to do an explicit zero-extend to
+// avoid partial-register updates.
+def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG
+ (MOVZX32rr8 GR8 :$src), sub_16bit)>;
+def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>;
+
+// Except for i16 -> i32, since isel expects i16 ops to be promoted to i32.
+def : Pat<(i32 (anyext GR16:$src)),
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>;
+
+def : Pat<(i64 (anyext GR8 :$src)),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8 :$src), sub_32bit)>;
+def : Pat<(i64 (anyext GR16:$src)),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>;
+def : Pat<(i64 (anyext GR32:$src)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, sub_32bit)>;
+
+// If this is an anyext of the remainder of an 8-bit sdivrem, use a MOVSX
+// instead of a MOVZX. The sdivrem lowering will emit a MOVSX to move
+// %ah to the lower byte of a register. By using a MOVSX here we allow a
+// post-isel peephole to merge the two MOVSX instructions into one.
+def anyext_sdiv : PatFrag<(ops node:$lhs), (anyext node:$lhs),[{
+ return (N->getOperand(0).getOpcode() == ISD::SDIVREM &&
+ N->getOperand(0).getResNo() == 1);
+}]>;
+def : Pat<(i32 (anyext_sdiv GR8:$src)), (MOVSX32rr8 GR8:$src)>;
+
+// Any instruction that defines a 32-bit result zeroes the high half of the
+// 64-bit register. Truncate can be lowered to EXTRACT_SUBREG, and CopyFromReg
+// may be copying from a truncate, so neither of those guarantees it. Any other
+// 32-bit operation will zero-extend up to 64 bits. AssertSext/AssertZext
+// aren't saying anything about the upper 32 bits; they're probably just
+// qualifying a CopyFromReg.
+def def32 : PatLeaf<(i32 GR32:$src), [{
+ return N->getOpcode() != ISD::TRUNCATE &&
+ N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+ N->getOpcode() != ISD::CopyFromReg &&
+ N->getOpcode() != ISD::AssertSext &&
+ N->getOpcode() != ISD::AssertZext;
+}]>;
+
+// In the case of a 32-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i64 (zext def32:$src)),
+ (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
+def : Pat<(i64 (and (anyext def32:$src), 0x00000000FFFFFFFF)),
+ (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
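+
+// For example, (i64 (zext (add GR32:$a, GR32:$b))) needs no extra instruction:
+// the 32-bit addl has already zeroed bits 63:32, so the result is simply
+// re-tagged as a 64-bit value with SUBREG_TO_REG.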
+
+//===----------------------------------------------------------------------===//
+// Pattern match OR as ADD
+//===----------------------------------------------------------------------===//
+
+// If safe, we prefer to pattern match OR as ADD at isel time. ADD can be
+// 3-addressified into an LEA instruction to avoid copies. However, we also
+// want to finally emit these instructions as an or at the end of the code
+// generator to make the generated code easier to read. To do this, we select
+// into "disjoint bits" pseudo ops.
+
+// Treat an 'or' node as an 'add' node if the or'ed bits are known to be zero.
+def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+ return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
+
+ KnownBits Known0 = CurDAG->computeKnownBits(N->getOperand(0), 0);
+ KnownBits Known1 = CurDAG->computeKnownBits(N->getOperand(1), 0);
+ return (~Known0.Zero & ~Known1.Zero) == 0;
+}]>;
+
+
+// (or x1, x2) -> (add x1, x2) if two operands are known not to share bits.
+// Try this before selecting to OR.
+let SchedRW = [WriteALU] in {
+
+let isConvertibleToThreeAddress = 1, isPseudo = 1,
+ Constraints = "$src1 = $dst", Defs = [EFLAGS] in {
+let isCommutable = 1 in {
+def ADD8rr_DB : I<0, Pseudo, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "", // orb/addb REG, REG
+ [(set GR8:$dst, (or_is_add GR8:$src1, GR8:$src2))]>;
+def ADD16rr_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "", // orw/addw REG, REG
+ [(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>;
+def ADD32rr_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "", // orl/addl REG, REG
+ [(set GR32:$dst, (or_is_add GR32:$src1, GR32:$src2))]>;
+def ADD64rr_DB : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "", // orq/addq REG, REG
+ [(set GR64:$dst, (or_is_add GR64:$src1, GR64:$src2))]>;
+} // isCommutable
+
+// NOTE: These are order-specific; we want the ri8 forms to be listed
+// first so that they are slightly preferred to the ri forms.
+
+def ADD8ri_DB : I<0, Pseudo,
+ (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "", // orb/addb REG, imm8
+ [(set GR8:$dst, (or_is_add GR8:$src1, imm:$src2))]>;
+def ADD16ri8_DB : I<0, Pseudo,
+ (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+ "", // orw/addw REG, imm8
+ [(set GR16:$dst,(or_is_add GR16:$src1,i16immSExt8:$src2))]>;
+def ADD16ri_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "", // orw/addw REG, imm
+ [(set GR16:$dst, (or_is_add GR16:$src1, imm:$src2))]>;
+
+def ADD32ri8_DB : I<0, Pseudo,
+ (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+ "", // orl/addl REG, imm8
+ [(set GR32:$dst,(or_is_add GR32:$src1,i32immSExt8:$src2))]>;
+def ADD32ri_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "", // orl/addl REG, imm
+ [(set GR32:$dst, (or_is_add GR32:$src1, imm:$src2))]>;
+
+
+def ADD64ri8_DB : I<0, Pseudo,
+ (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "", // orq/addq REG, imm8
+ [(set GR64:$dst, (or_is_add GR64:$src1,
+ i64immSExt8:$src2))]>;
+def ADD64ri32_DB : I<0, Pseudo,
+ (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "", // orq/addq REG, imm
+ [(set GR64:$dst, (or_is_add GR64:$src1,
+ i64immSExt32:$src2))]>;
+} // isConvertibleToThreeAddress, isPseudo, Constraints, Defs
+} // SchedRW
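+
+// For example, (or GR32:$p, 4) where the low bits of $p are known zero (say,
+// an aligned address) selects ADD32ri8_DB; it can then be three-addressified
+// to an LEA if that avoids a copy, and is otherwise printed back as "orl".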
+
+//===----------------------------------------------------------------------===//
+// Pattern match SUB as XOR
+//===----------------------------------------------------------------------===//
+
+// An immediate in the LHS of a subtract can't be encoded in the instruction.
+// If there is no possibility of a borrow we can use an XOR instead of a SUB
+// to enable the immediate to be folded.
+// TODO: Move this to a DAG combine?
+
+def sub_is_xor : PatFrag<(ops node:$lhs, node:$rhs), (sub node:$lhs, node:$rhs),[{
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+ KnownBits Known = CurDAG->computeKnownBits(N->getOperand(1));
+
+ // If all possible ones in the RHS are set in the LHS then there can't be
+ // a borrow and we can use xor.
+ return (~Known.Zero).isSubsetOf(CN->getAPIntValue());
+ }
+
+ return false;
+}]>;
+
+let AddedComplexity = 5 in {
+def : Pat<(sub_is_xor imm:$src2, GR8:$src1),
+ (XOR8ri GR8:$src1, imm:$src2)>;
+def : Pat<(sub_is_xor i16immSExt8:$src2, GR16:$src1),
+ (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(sub_is_xor imm:$src2, GR16:$src1),
+ (XOR16ri GR16:$src1, imm:$src2)>;
+def : Pat<(sub_is_xor i32immSExt8:$src2, GR32:$src1),
+ (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(sub_is_xor imm:$src2, GR32:$src1),
+ (XOR32ri GR32:$src1, imm:$src2)>;
+def : Pat<(sub_is_xor i64immSExt8:$src2, GR64:$src1),
+ (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(sub_is_xor i64immSExt32:$src2, GR64:$src1),
+ (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>;
+}
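+
+// For example, (sub 7, GR8:$x) where $x is known to fit in the low 3 bits
+// cannot borrow, so it is selected as "xorb $7, $x": 7 - x == 7 ^ x whenever
+// every possibly-set bit of x is also set in 7.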
+
+//===----------------------------------------------------------------------===//
+// Some peepholes
+//===----------------------------------------------------------------------===//
+
+// Odd encoding trick: -128 fits into an 8-bit immediate field while
+// +128 doesn't, so in this special case use a sub instead of an add.
+def : Pat<(add GR16:$src1, 128),
+ (SUB16ri8 GR16:$src1, -128)>;
+def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst),
+ (SUB16mi8 addr:$dst, -128)>;
+
+def : Pat<(add GR32:$src1, 128),
+ (SUB32ri8 GR32:$src1, -128)>;
+def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst),
+ (SUB32mi8 addr:$dst, -128)>;
+
+def : Pat<(add GR64:$src1, 128),
+ (SUB64ri8 GR64:$src1, -128)>;
+def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst),
+ (SUB64mi8 addr:$dst, -128)>;
+
+def : Pat<(X86add_flag_nocf GR16:$src1, 128),
+ (SUB16ri8 GR16:$src1, -128)>;
+def : Pat<(X86add_flag_nocf GR32:$src1, 128),
+ (SUB32ri8 GR32:$src1, -128)>;
+def : Pat<(X86add_flag_nocf GR64:$src1, 128),
+ (SUB64ri8 GR64:$src1, -128)>;
+
+// The same trick applies for 32-bit immediate fields in 64-bit
+// instructions.
+def : Pat<(add GR64:$src1, 0x0000000080000000),
+ (SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
+def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst),
+ (SUB64mi32 addr:$dst, 0xffffffff80000000)>;
+
+def : Pat<(X86add_flag_nocf GR64:$src1, 0x0000000080000000),
+ (SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
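+
+// For example, "addl $128, %reg" needs a 32-bit immediate, whereas the
+// equivalent "subl $-128, %reg" fits the sign-extended 8-bit immediate form
+// (83 /5 imm8) and is therefore shorter.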
+
+// To avoid needing to materialize an immediate in a register, use a 32-bit and
+// with implicit zero-extension instead of a 64-bit and if the immediate has at
+// least 32 bits of leading zeros. If, in addition, the low 32 bits can be
+// represented by sign-extending an 8-bit constant, use that.
+// This can also reduce instruction size by eliminating the need for the REX
+// prefix.
+
+// AddedComplexity is needed to give priority over i64immSExt8 and i64immSExt32.
+let AddedComplexity = 1 in {
+def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm),
+ (SUBREG_TO_REG
+ (i64 0),
+ (AND32ri8
+ (EXTRACT_SUBREG GR64:$src, sub_32bit),
+ (i32 (GetLo32XForm imm:$imm))),
+ sub_32bit)>;
+
+def : Pat<(and GR64:$src, i64immZExt32:$imm),
+ (SUBREG_TO_REG
+ (i64 0),
+ (AND32ri
+ (EXTRACT_SUBREG GR64:$src, sub_32bit),
+ (i32 (GetLo32XForm imm:$imm))),
+ sub_32bit)>;
+} // AddedComplexity = 1
+
+
+// AddedComplexity is needed due to the increased complexity on the
+// i64immZExt32SExt8 and i64immZExt32 patterns above. Applying this to all
+// the MOVZX patterns keeps them together in the DAGISel tables.
+let AddedComplexity = 1 in {
+// r & (2^16-1) ==> movz
+def : Pat<(and GR32:$src1, 0xffff),
+ (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR32:$src1, 0xff),
+ (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR16:$src1, 0xff),
+ (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG GR16:$src1, sub_8bit)),
+ sub_16bit)>;
+
+// r & (2^32-1) ==> movz
+def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
+ (SUBREG_TO_REG (i64 0),
+ (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)),
+ sub_32bit)>;
+// r & (2^16-1) ==> movz
+def : Pat<(and GR64:$src, 0xffff),
+ (SUBREG_TO_REG (i64 0),
+ (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))),
+ sub_32bit)>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR64:$src, 0xff),
+ (SUBREG_TO_REG (i64 0),
+ (MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))),
+ sub_32bit)>;
+} // AddedComplexity = 1
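+
+// For example, "andl $0xffff, %eax" (which would need a 32-bit immediate) is
+// instead selected as "movzwl %ax, %eax", and the 64-bit
+// "andq $0xffffffff, %rax" becomes a plain "movl %eax, %eax".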
+
+
+// Try to use BTS/BTR/BTC for single-bit operations on the upper 32 bits.
+
+def BTRXForm : SDNodeXForm<imm, [{
+ // Transformation function: Find the lowest 0.
+ return getI64Imm((uint8_t)N->getAPIntValue().countTrailingOnes(), SDLoc(N));
+}]>;
+
+def BTCBTSXForm : SDNodeXForm<imm, [{
+ // Transformation function: Find the lowest 1.
+ return getI64Imm((uint8_t)N->getAPIntValue().countTrailingZeros(), SDLoc(N));
+}]>;
+
+def BTRMask64 : ImmLeaf<i64, [{
+ return !isUInt<32>(Imm) && !isInt<32>(Imm) && isPowerOf2_64(~Imm);
+}]>;
+
+def BTCBTSMask64 : ImmLeaf<i64, [{
+ return !isInt<32>(Imm) && isPowerOf2_64(Imm);
+}]>;
+
+// For now only do this for optsize.
+let AddedComplexity = 1, Predicates=[OptForSize] in {
+ def : Pat<(and GR64:$src1, BTRMask64:$mask),
+ (BTR64ri8 GR64:$src1, (BTRXForm imm:$mask))>;
+ def : Pat<(or GR64:$src1, BTCBTSMask64:$mask),
+ (BTS64ri8 GR64:$src1, (BTCBTSXForm imm:$mask))>;
+ def : Pat<(xor GR64:$src1, BTCBTSMask64:$mask),
+ (BTC64ri8 GR64:$src1, (BTCBTSXForm imm:$mask))>;
+}
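+
+// For example, at optsize "orq $(1 << 40), %rax" cannot encode its immediate
+// and would otherwise need a movabsq into a scratch register; these patterns
+// select "btsq $40, %rax" instead (BTR for clearing, BTC for toggling a bit).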
+
+
+// sext_inreg patterns
+def : Pat<(sext_inreg GR32:$src, i16),
+ (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>;
+def : Pat<(sext_inreg GR32:$src, i8),
+ (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>;
+
+def : Pat<(sext_inreg GR16:$src, i8),
+ (EXTRACT_SUBREG (MOVSX32rr8 (EXTRACT_SUBREG GR16:$src, sub_8bit)),
+ sub_16bit)>;
+
+def : Pat<(sext_inreg GR64:$src, i32),
+ (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>;
+def : Pat<(sext_inreg GR64:$src, i16),
+ (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, sub_16bit))>;
+def : Pat<(sext_inreg GR64:$src, i8),
+ (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, sub_8bit))>;
+
+// sext, sext_load, zext, zext_load
+def: Pat<(i16 (sext GR8:$src)),
+ (EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>;
+def: Pat<(sextloadi16i8 addr:$src),
+ (EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>;
+def: Pat<(i16 (zext GR8:$src)),
+ (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>;
+def: Pat<(zextloadi16i8 addr:$src),
+ (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>;
+
+// trunc patterns
+def : Pat<(i16 (trunc GR32:$src)),
+ (EXTRACT_SUBREG GR32:$src, sub_16bit)>;
+def : Pat<(i8 (trunc GR32:$src)),
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+ sub_8bit)>,
+ Requires<[Not64BitMode]>;
+def : Pat<(i8 (trunc GR16:$src)),
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit)>,
+ Requires<[Not64BitMode]>;
+def : Pat<(i32 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, sub_32bit)>;
+def : Pat<(i16 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, sub_16bit)>;
+def : Pat<(i8 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, sub_8bit)>;
+def : Pat<(i8 (trunc GR32:$src)),
+ (EXTRACT_SUBREG GR32:$src, sub_8bit)>,
+ Requires<[In64BitMode]>;
+def : Pat<(i8 (trunc GR16:$src)),
+ (EXTRACT_SUBREG GR16:$src, sub_8bit)>,
+ Requires<[In64BitMode]>;
+
+def immff00_ffff : ImmLeaf<i32, [{
+ return Imm >= 0xff00 && Imm <= 0xffff;
+}]>;
+
+// h-register tricks
+def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
+ (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)>,
+ Requires<[Not64BitMode]>;
+def : Pat<(i8 (trunc (srl_su (i32 (anyext GR16:$src)), (i8 8)))),
+ (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)>,
+ Requires<[Not64BitMode]>;
+def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
+ (EXTRACT_SUBREG GR32:$src, sub_8bit_hi)>,
+ Requires<[Not64BitMode]>;
+def : Pat<(srl GR16:$src, (i8 8)),
+ (EXTRACT_SUBREG
+ (MOVZX32rr8_NOREX (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)),
+ sub_16bit)>;
+def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32rr8_NOREX (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>;
+def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32rr8_NOREX (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>;
+def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
+ (MOVZX32rr8_NOREX (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;
+def : Pat<(srl (and_su GR32:$src, immff00_ffff), (i8 8)),
+ (MOVZX32rr8_NOREX (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;
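+
+// For example, (x >> 8) & 0xff with x in %eax is selected as
+// "movzbl %ah, %ecx" (MOVZX32rr8_NOREX of sub_8bit_hi) instead of a shift
+// followed by a mask; register choices here are illustrative.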
+
+// h-register tricks.
+// For now, be conservative on x86-64 and use an h-register extract only if the
+// value is immediately zero-extended or stored, which are somewhat common
+// cases. This uses a bunch of code to prevent a register requiring a REX prefix
+// from being allocated in the same instruction as the h register, as there's
+// currently no way to describe this requirement to the register allocator.
+
+// h-register extract and zero-extend.
+def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)),
+ (SUBREG_TO_REG
+ (i64 0),
+ (MOVZX32rr8_NOREX
+ (EXTRACT_SUBREG GR64:$src, sub_8bit_hi)),
+ sub_32bit)>;
+def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))),
+ (SUBREG_TO_REG
+ (i64 0),
+ (MOVZX32rr8_NOREX
+ (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)),
+ sub_32bit)>;
+def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))),
+ (SUBREG_TO_REG
+ (i64 0),
+ (MOVZX32rr8_NOREX
+ (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)),
+ sub_32bit)>;
+
+// h-register extract and store.
+def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG GR64:$src, sub_8bit_hi))>;
+def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+
+// Special pattern to catch the last step of __builtin_parity handling. Our
+// goal is to use an xor of an h-register with the corresponding l-register.
+// The above patterns would handle this on non-64-bit targets, but for 64-bit
+// we need to be more careful. We're using a NOREX instruction here in case
+// register allocation fails to keep the two registers together. So we need to
+// make sure we can't accidentally mix R8-R15 with an h-register.
+def : Pat<(X86xor_flag (i8 (trunc GR32:$src)),
+ (i8 (trunc (srl_su GR32:$src, (i8 8))))),
+ (XOR8rr_NOREX (EXTRACT_SUBREG GR32:$src, sub_8bit),
+ (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;
+
+// (shl x, 1) ==> (add x, x)
+// Note that if x is undef (immediate or otherwise), we could theoretically
+// end up with the two uses of x getting different values, producing a result
+// where the least significant bit is not 0. However, the probability of this
+// happening is considered low enough that this is officially not a
+// "real problem".
+def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr GR8 :$src1, GR8 :$src1)>;
+def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
+def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
+
+def shiftMask8 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{
+ return isUnneededShiftMask(N, 3);
+}]>;
+
+def shiftMask16 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{
+ return isUnneededShiftMask(N, 4);
+}]>;
+
+def shiftMask32 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{
+ return isUnneededShiftMask(N, 5);
+}]>;
+
+def shiftMask64 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{
+ return isUnneededShiftMask(N, 6);
+}]>;
+
+
+// Shift amount is implicitly masked.
+multiclass MaskedShiftAmountPats<SDNode frag, string name> {
+ // (shift x (and y, 31)) ==> (shift x, y)
+ def : Pat<(frag GR8:$src1, (shiftMask32 CL)),
+ (!cast<Instruction>(name # "8rCL") GR8:$src1)>;
+ def : Pat<(frag GR16:$src1, (shiftMask32 CL)),
+ (!cast<Instruction>(name # "16rCL") GR16:$src1)>;
+ def : Pat<(frag GR32:$src1, (shiftMask32 CL)),
+ (!cast<Instruction>(name # "32rCL") GR32:$src1)>;
+ def : Pat<(store (frag (loadi8 addr:$dst), (shiftMask32 CL)), addr:$dst),
+ (!cast<Instruction>(name # "8mCL") addr:$dst)>;
+ def : Pat<(store (frag (loadi16 addr:$dst), (shiftMask32 CL)), addr:$dst),
+ (!cast<Instruction>(name # "16mCL") addr:$dst)>;
+ def : Pat<(store (frag (loadi32 addr:$dst), (shiftMask32 CL)), addr:$dst),
+ (!cast<Instruction>(name # "32mCL") addr:$dst)>;
+
+ // (shift x (and y, 63)) ==> (shift x, y)
+ def : Pat<(frag GR64:$src1, (shiftMask64 CL)),
+ (!cast<Instruction>(name # "64rCL") GR64:$src1)>;
+ def : Pat<(store (frag (loadi64 addr:$dst), (shiftMask64 CL)), addr:$dst),
+ (!cast<Instruction>(name # "64mCL") addr:$dst)>;
+}
+
+defm : MaskedShiftAmountPats<shl, "SHL">;
+defm : MaskedShiftAmountPats<srl, "SHR">;
+defm : MaskedShiftAmountPats<sra, "SAR">;
+
+// ROL/ROR instructions allow a stronger mask optimization than shift for 8- and
+// 16-bit. We can remove a mask of any (bitwidth - 1) on the rotation amount
+// because over-rotating produces the same result. This is noted in the Intel
+// docs with: "tempCOUNT <- (COUNT & COUNTMASK) MOD SIZE". Masking the rotation
+// amount could affect EFLAGS results, but that does not matter because we are
+// not tracking flags for these nodes.
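+// For example, an 8-bit rotate by a masked amount, written roughly as
+//   (x << (n & 7)) | (x >> (8 - (n & 7)))
+// is matched as a rotl whose amount mask is redundant, so it can be selected
+// to a single ROL8rCL with no separate AND of the count.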
+multiclass MaskedRotateAmountPats<SDNode frag, string name> {
+ // (rot x (and y, BitWidth - 1)) ==> (rot x, y)
+ def : Pat<(frag GR8:$src1, (shiftMask8 CL)),
+ (!cast<Instruction>(name # "8rCL") GR8:$src1)>;
+ def : Pat<(frag GR16:$src1, (shiftMask16 CL)),
+ (!cast<Instruction>(name # "16rCL") GR16:$src1)>;
+ def : Pat<(frag GR32:$src1, (shiftMask32 CL)),
+ (!cast<Instruction>(name # "32rCL") GR32:$src1)>;
+ def : Pat<(store (frag (loadi8 addr:$dst), (shiftMask8 CL)), addr:$dst),
+ (!cast<Instruction>(name # "8mCL") addr:$dst)>;
+ def : Pat<(store (frag (loadi16 addr:$dst), (shiftMask16 CL)), addr:$dst),
+ (!cast<Instruction>(name # "16mCL") addr:$dst)>;
+ def : Pat<(store (frag (loadi32 addr:$dst), (shiftMask32 CL)), addr:$dst),
+ (!cast<Instruction>(name # "32mCL") addr:$dst)>;
+
+ // (rot x (and y, 63)) ==> (rot x, y)
+ def : Pat<(frag GR64:$src1, (shiftMask64 CL)),
+ (!cast<Instruction>(name # "64rCL") GR64:$src1)>;
+ def : Pat<(store (frag (loadi64 addr:$dst), (shiftMask64 CL)), addr:$dst),
+ (!cast<Instruction>(name # "64mCL") addr:$dst)>;
+}
+
+
+defm : MaskedRotateAmountPats<rotl, "ROL">;
+defm : MaskedRotateAmountPats<rotr, "ROR">;
+
+// Double "funnel" shift amount is implicitly masked.
+// (fshl/fshr x (and y, 31)) ==> (fshl/fshr x, y) (NOTE: modulo32)
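+// For example, a 32-bit funnel shift (llvm.fshl.i32) whose variable count
+// carries a redundant (n & 31) mask still selects to a single SHLD32rrCL:
+// the mask is dropped because SHLD already masks its CL count to 5 bits.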
+def : Pat<(X86fshl GR16:$src1, GR16:$src2, (shiftMask32 CL)),
+ (SHLD16rrCL GR16:$src1, GR16:$src2)>;
+def : Pat<(X86fshr GR16:$src2, GR16:$src1, (shiftMask32 CL)),
+ (SHRD16rrCL GR16:$src1, GR16:$src2)>;
+
+// (fshl/fshr x (and y, 31)) ==> (fshl/fshr x, y)
+def : Pat<(fshl GR32:$src1, GR32:$src2, (shiftMask32 CL)),
+ (SHLD32rrCL GR32:$src1, GR32:$src2)>;
+def : Pat<(fshr GR32:$src2, GR32:$src1, (shiftMask32 CL)),
+ (SHRD32rrCL GR32:$src1, GR32:$src2)>;
+
+// (fshl/fshr x (and y, 63)) ==> (fshl/fshr x, y)
+def : Pat<(fshl GR64:$src1, GR64:$src2, (shiftMask64 CL)),
+ (SHLD64rrCL GR64:$src1, GR64:$src2)>;
+def : Pat<(fshr GR64:$src2, GR64:$src1, (shiftMask64 CL)),
+ (SHRD64rrCL GR64:$src1, GR64:$src2)>;
+
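+// With BMI2, SARX/SHRX/SHLX take the shift amount in an arbitrary register
+// (and do not write EFLAGS), so the same redundant masks can be dropped there
+// as well. For example, "x >> (n & 31)" compiled with BMI2 enabled can become
+// a single "shrxl %esi, %edi, %eax"-style instruction with no AND and no move
+// into CL.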
+let Predicates = [HasBMI2] in {
+ let AddedComplexity = 1 in {
+ def : Pat<(sra GR32:$src1, (shiftMask32 GR8:$src2)),
+ (SARX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(sra GR64:$src1, (shiftMask64 GR8:$src2)),
+ (SARX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(srl GR32:$src1, (shiftMask32 GR8:$src2)),
+ (SHRX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(srl GR64:$src1, (shiftMask64 GR8:$src2)),
+ (SHRX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(shl GR32:$src1, (shiftMask32 GR8:$src2)),
+ (SHLX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(shl GR64:$src1, (shiftMask64 GR8:$src2)),
+ (SHLX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ }
+
+ def : Pat<(sra (loadi32 addr:$src1), (shiftMask32 GR8:$src2)),
+ (SARX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(sra (loadi64 addr:$src1), (shiftMask64 GR8:$src2)),
+ (SARX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(srl (loadi32 addr:$src1), (shiftMask32 GR8:$src2)),
+ (SHRX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(srl (loadi64 addr:$src1), (shiftMask64 GR8:$src2)),
+ (SHRX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(shl (loadi32 addr:$src1), (shiftMask32 GR8:$src2)),
+ (SHLX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(shl (loadi64 addr:$src1), (shiftMask64 GR8:$src2)),
+ (SHLX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+}
+
+// Use BTR/BTS/BTC for clearing/setting/toggling a bit in a variable location.
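+// For example, "x |= 1u << n" can become a BTS, "x ^= 1u << n" a BTC, and
+// "x &= ~(1u << n)" a BTR; the (rotl -2, n) form below is how ~(1 << n) is
+// typically canonicalized by DAG combining, since rotating ...11111110 left
+// by n clears exactly bit n.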
+multiclass one_bit_patterns<RegisterClass RC, ValueType VT, Instruction BTR,
+ Instruction BTS, Instruction BTC,
+ PatFrag ShiftMask> {
+ def : Pat<(and RC:$src1, (rotl -2, GR8:$src2)),
+ (BTR RC:$src1,
+ (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(or RC:$src1, (shl 1, GR8:$src2)),
+ (BTS RC:$src1,
+ (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(xor RC:$src1, (shl 1, GR8:$src2)),
+ (BTC RC:$src1,
+ (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ // Similar to above, but removing unneeded masking of the shift amount.
+ def : Pat<(and RC:$src1, (rotl -2, (ShiftMask GR8:$src2))),
+ (BTR RC:$src1,
+ (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(or RC:$src1, (shl 1, (ShiftMask GR8:$src2))),
+ (BTS RC:$src1,
+ (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(xor RC:$src1, (shl 1, (ShiftMask GR8:$src2))),
+ (BTC RC:$src1,
+ (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+}
+
+defm : one_bit_patterns<GR16, i16, BTR16rr, BTS16rr, BTC16rr, shiftMask16>;
+defm : one_bit_patterns<GR32, i32, BTR32rr, BTS32rr, BTC32rr, shiftMask32>;
+defm : one_bit_patterns<GR64, i64, BTR64rr, BTS64rr, BTC64rr, shiftMask64>;
+
+//===----------------------------------------------------------------------===//
+// EFLAGS-defining Patterns
+//===----------------------------------------------------------------------===//
+
+// add reg, reg
+def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(add GR64:$src1, GR64:$src2), (ADD64rr GR64:$src1, GR64:$src2)>;
+
+// add reg, mem
+def : Pat<(add GR8:$src1, (loadi8 addr:$src2)),
+ (ADD8rm GR8:$src1, addr:$src2)>;
+def : Pat<(add GR16:$src1, (loadi16 addr:$src2)),
+ (ADD16rm GR16:$src1, addr:$src2)>;
+def : Pat<(add GR32:$src1, (loadi32 addr:$src2)),
+ (ADD32rm GR32:$src1, addr:$src2)>;
+def : Pat<(add GR64:$src1, (loadi64 addr:$src2)),
+ (ADD64rm GR64:$src1, addr:$src2)>;
+
+// add reg, imm
+def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri GR8:$src1 , imm:$src2)>;
+def : Pat<(add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>;
+def : Pat<(add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>;
+def : Pat<(add GR16:$src1, i16immSExt8:$src2),
+ (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(add GR32:$src1, i32immSExt8:$src2),
+ (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(add GR64:$src1, i64immSExt8:$src2),
+ (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(add GR64:$src1, i64immSExt32:$src2),
+ (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// sub reg, reg
+def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(sub GR64:$src1, GR64:$src2), (SUB64rr GR64:$src1, GR64:$src2)>;
+
+// sub reg, mem
+def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)),
+ (SUB8rm GR8:$src1, addr:$src2)>;
+def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)),
+ (SUB16rm GR16:$src1, addr:$src2)>;
+def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)),
+ (SUB32rm GR32:$src1, addr:$src2)>;
+def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)),
+ (SUB64rm GR64:$src1, addr:$src2)>;
+
+// sub reg, imm
+def : Pat<(sub GR8:$src1, imm:$src2),
+ (SUB8ri GR8:$src1, imm:$src2)>;
+def : Pat<(sub GR16:$src1, imm:$src2),
+ (SUB16ri GR16:$src1, imm:$src2)>;
+def : Pat<(sub GR32:$src1, imm:$src2),
+ (SUB32ri GR32:$src1, imm:$src2)>;
+def : Pat<(sub GR16:$src1, i16immSExt8:$src2),
+ (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(sub GR32:$src1, i32immSExt8:$src2),
+ (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(sub GR64:$src1, i64immSExt8:$src2),
+ (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(sub GR64:$src1, i64immSExt32:$src2),
+ (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// sub 0, reg
+def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r GR8 :$src)>;
+def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>;
+def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>;
+def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>;
+
+// mul reg, reg
+def : Pat<(mul GR16:$src1, GR16:$src2),
+ (IMUL16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(mul GR32:$src1, GR32:$src2),
+ (IMUL32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(mul GR64:$src1, GR64:$src2),
+ (IMUL64rr GR64:$src1, GR64:$src2)>;
+
+// mul reg, mem
+def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)),
+ (IMUL16rm GR16:$src1, addr:$src2)>;
+def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)),
+ (IMUL32rm GR32:$src1, addr:$src2)>;
+def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)),
+ (IMUL64rm GR64:$src1, addr:$src2)>;
+
+// mul reg, imm
+def : Pat<(mul GR16:$src1, imm:$src2),
+ (IMUL16rri GR16:$src1, imm:$src2)>;
+def : Pat<(mul GR32:$src1, imm:$src2),
+ (IMUL32rri GR32:$src1, imm:$src2)>;
+def : Pat<(mul GR16:$src1, i16immSExt8:$src2),
+ (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(mul GR32:$src1, i32immSExt8:$src2),
+ (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(mul GR64:$src1, i64immSExt8:$src2),
+ (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(mul GR64:$src1, i64immSExt32:$src2),
+ (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// reg = mul mem, imm
+def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
+ (IMUL16rmi addr:$src1, imm:$src2)>;
+def : Pat<(mul (loadi32 addr:$src1), imm:$src2),
+ (IMUL32rmi addr:$src1, imm:$src2)>;
+def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2),
+ (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>;
+def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2),
+ (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>;
+def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2),
+ (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>;
+def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
+ (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>;
+
+// Increment/Decrement reg.
+// Do not select INC/DEC if they are slow
+let Predicates = [UseIncDec] in {
+ def : Pat<(add GR8:$src, 1), (INC8r GR8:$src)>;
+ def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>;
+ def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>;
+ def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>;
+ def : Pat<(add GR8:$src, -1), (DEC8r GR8:$src)>;
+ def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>;
+ def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>;
+ def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>;
+
+ def : Pat<(X86add_flag_nocf GR8:$src, -1), (DEC8r GR8:$src)>;
+ def : Pat<(X86add_flag_nocf GR16:$src, -1), (DEC16r GR16:$src)>;
+ def : Pat<(X86add_flag_nocf GR32:$src, -1), (DEC32r GR32:$src)>;
+ def : Pat<(X86add_flag_nocf GR64:$src, -1), (DEC64r GR64:$src)>;
+ def : Pat<(X86sub_flag_nocf GR8:$src, -1), (INC8r GR8:$src)>;
+ def : Pat<(X86sub_flag_nocf GR16:$src, -1), (INC16r GR16:$src)>;
+ def : Pat<(X86sub_flag_nocf GR32:$src, -1), (INC32r GR32:$src)>;
+ def : Pat<(X86sub_flag_nocf GR64:$src, -1), (INC64r GR64:$src)>;
+}
+
+// or reg/reg.
+def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(or GR16:$src1, GR16:$src2), (OR16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(or GR32:$src1, GR32:$src2), (OR32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(or GR64:$src1, GR64:$src2), (OR64rr GR64:$src1, GR64:$src2)>;
+
+// or reg/mem
+def : Pat<(or GR8:$src1, (loadi8 addr:$src2)),
+ (OR8rm GR8:$src1, addr:$src2)>;
+def : Pat<(or GR16:$src1, (loadi16 addr:$src2)),
+ (OR16rm GR16:$src1, addr:$src2)>;
+def : Pat<(or GR32:$src1, (loadi32 addr:$src2)),
+ (OR32rm GR32:$src1, addr:$src2)>;
+def : Pat<(or GR64:$src1, (loadi64 addr:$src2)),
+ (OR64rm GR64:$src1, addr:$src2)>;
+
+// or reg/imm
+def : Pat<(or GR8:$src1 , imm:$src2), (OR8ri GR8 :$src1, imm:$src2)>;
+def : Pat<(or GR16:$src1, imm:$src2), (OR16ri GR16:$src1, imm:$src2)>;
+def : Pat<(or GR32:$src1, imm:$src2), (OR32ri GR32:$src1, imm:$src2)>;
+def : Pat<(or GR16:$src1, i16immSExt8:$src2),
+ (OR16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(or GR32:$src1, i32immSExt8:$src2),
+ (OR32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(or GR64:$src1, i64immSExt8:$src2),
+ (OR64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(or GR64:$src1, i64immSExt32:$src2),
+ (OR64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// xor reg/reg
+def : Pat<(xor GR8 :$src1, GR8 :$src2), (XOR8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(xor GR16:$src1, GR16:$src2), (XOR16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(xor GR32:$src1, GR32:$src2), (XOR32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(xor GR64:$src1, GR64:$src2), (XOR64rr GR64:$src1, GR64:$src2)>;
+
+// xor reg/mem
+def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)),
+ (XOR8rm GR8:$src1, addr:$src2)>;
+def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)),
+ (XOR16rm GR16:$src1, addr:$src2)>;
+def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)),
+ (XOR32rm GR32:$src1, addr:$src2)>;
+def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)),
+ (XOR64rm GR64:$src1, addr:$src2)>;
+
+// xor reg/imm
+def : Pat<(xor GR8:$src1, imm:$src2),
+ (XOR8ri GR8:$src1, imm:$src2)>;
+def : Pat<(xor GR16:$src1, imm:$src2),
+ (XOR16ri GR16:$src1, imm:$src2)>;
+def : Pat<(xor GR32:$src1, imm:$src2),
+ (XOR32ri GR32:$src1, imm:$src2)>;
+def : Pat<(xor GR16:$src1, i16immSExt8:$src2),
+ (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(xor GR32:$src1, i32immSExt8:$src2),
+ (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(xor GR64:$src1, i64immSExt8:$src2),
+ (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(xor GR64:$src1, i64immSExt32:$src2),
+ (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// and reg/reg
+def : Pat<(and GR8 :$src1, GR8 :$src2), (AND8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(and GR16:$src1, GR16:$src2), (AND16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(and GR32:$src1, GR32:$src2), (AND32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(and GR64:$src1, GR64:$src2), (AND64rr GR64:$src1, GR64:$src2)>;
+
+// and reg/mem
+def : Pat<(and GR8:$src1, (loadi8 addr:$src2)),
+ (AND8rm GR8:$src1, addr:$src2)>;
+def : Pat<(and GR16:$src1, (loadi16 addr:$src2)),
+ (AND16rm GR16:$src1, addr:$src2)>;
+def : Pat<(and GR32:$src1, (loadi32 addr:$src2)),
+ (AND32rm GR32:$src1, addr:$src2)>;
+def : Pat<(and GR64:$src1, (loadi64 addr:$src2)),
+ (AND64rm GR64:$src1, addr:$src2)>;
+
+// and reg/imm
+def : Pat<(and GR8:$src1, imm:$src2),
+ (AND8ri GR8:$src1, imm:$src2)>;
+def : Pat<(and GR16:$src1, imm:$src2),
+ (AND16ri GR16:$src1, imm:$src2)>;
+def : Pat<(and GR32:$src1, imm:$src2),
+ (AND32ri GR32:$src1, imm:$src2)>;
+def : Pat<(and GR16:$src1, i16immSExt8:$src2),
+ (AND16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(and GR32:$src1, i32immSExt8:$src2),
+ (AND32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(and GR64:$src1, i64immSExt8:$src2),
+ (AND64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(and GR64:$src1, i64immSExt32:$src2),
+ (AND64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Bit scan instruction patterns to match explicit zero-undef behavior.
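+// For example, "__builtin_ctz(x)" (whose result is undefined for x == 0)
+// maps to cttz_zero_undef and can be selected to a single BSF with no zero
+// check, whereas a plain cttz would need extra code (or TZCNT) to define the
+// x == 0 case.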
+def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>;
+def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>;
+def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>;
+def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>;
+def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>;
+def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
+
+// When HasMOVBE is enabled it is possible to get a non-legalized
+// register-register 16-bit bswap. This maps it to a ROL instruction.
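+// For example, "__builtin_bswap16(x)" swaps the two bytes of a 16-bit value,
+// which is the same as rotating it by 8, so it can be emitted as a single
+// "rolw $8, ..." even though neither BSWAP nor MOVBE provides a usable 16-bit
+// register-to-register form.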
+let Predicates = [HasMOVBE] in {
+ def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrControl.td
new file mode 100644
index 000000000000..4f7867744017
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrControl.td
@@ -0,0 +1,430 @@
+//===-- X86InstrControl.td - Control Flow Instructions -----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 jump, return, call, and related instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions.
+//
+
+// Return instructions.
+//
+// The X86retflag return instructions are variadic because we may add ST0 and
+// ST1 arguments when returning values on the x87 stack.
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+ hasCtrlDep = 1, FPForm = SpecialFP, SchedRW = [WriteJumpLd] in {
+ def RETL : I <0xC3, RawFrm, (outs), (ins variable_ops),
+ "ret{l}", []>, OpSize32, Requires<[Not64BitMode]>;
+ def RETQ : I <0xC3, RawFrm, (outs), (ins variable_ops),
+ "ret{q}", []>, OpSize32, Requires<[In64BitMode]>;
+ def RETW : I <0xC3, RawFrm, (outs), (ins),
+ "ret{w}", []>, OpSize16;
+ def RETIL : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+ "ret{l}\t$amt", []>, OpSize32, Requires<[Not64BitMode]>;
+ def RETIQ : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+ "ret{q}\t$amt", []>, OpSize32, Requires<[In64BitMode]>;
+ def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
+ "ret{w}\t$amt", []>, OpSize16;
+ def LRETL : I <0xCB, RawFrm, (outs), (ins),
+ "{l}ret{l|f}", []>, OpSize32;
+ def LRETQ : RI <0xCB, RawFrm, (outs), (ins),
+ "{l}ret{|f}q", []>, Requires<[In64BitMode]>;
+ def LRETW : I <0xCB, RawFrm, (outs), (ins),
+ "{l}ret{w|f}", []>, OpSize16;
+ def LRETIL : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+ "{l}ret{l|f}\t$amt", []>, OpSize32;
+ def LRETIQ : RIi16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+ "{l}ret{|f}q\t$amt", []>, Requires<[In64BitMode]>;
+ def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+ "{l}ret{w|f}\t$amt", []>, OpSize16;
+
+  // These are the machine return-from-interrupt instructions. Sometimes we
+  // need to perform a post-epilogue stack adjustment, so codegen emits the
+  // pseudo form, which expands to include an SP adjustment if necessary.
+ def IRET16 : I <0xcf, RawFrm, (outs), (ins), "iret{w}", []>,
+ OpSize16;
+ def IRET32 : I <0xcf, RawFrm, (outs), (ins), "iret{l|d}", []>, OpSize32;
+ def IRET64 : RI <0xcf, RawFrm, (outs), (ins), "iretq", []>, Requires<[In64BitMode]>;
+ let isCodeGenOnly = 1 in
+ def IRET : PseudoI<(outs), (ins i32imm:$adj), [(X86iret timm:$adj)]>;
+ def RET : PseudoI<(outs), (ins i32imm:$adj, variable_ops), [(X86retflag timm:$adj)]>;
+}
+
+// Unconditional branches.
+let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
+ def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
+ "jmp\t$dst", [(br bb:$dst)]>;
+ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
+ def JMP_2 : Ii16PCRel<0xE9, RawFrm, (outs), (ins brtarget16:$dst),
+ "jmp\t$dst", []>, OpSize16;
+ def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget32:$dst),
+ "jmp\t$dst", []>, OpSize32;
+ }
+}
+
+// Conditional Branches.
+let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump],
+ isCodeGenOnly = 1, ForceDisassemble = 1 in {
+ def JCC_1 : Ii8PCRel <0x70, AddCCFrm, (outs),
+ (ins brtarget8:$dst, ccode:$cond),
+ "j${cond}\t$dst",
+ [(X86brcond bb:$dst, timm:$cond, EFLAGS)]>;
+ let hasSideEffects = 0 in {
+ def JCC_2 : Ii16PCRel<0x80, AddCCFrm, (outs),
+ (ins brtarget16:$dst, ccode:$cond),
+ "j${cond}\t$dst",
+ []>, OpSize16, TB;
+ def JCC_4 : Ii32PCRel<0x80, AddCCFrm, (outs),
+ (ins brtarget32:$dst, ccode:$cond),
+ "j${cond}\t$dst",
+ []>, TB, OpSize32;
+ }
+}
+
+def : InstAlias<"jo\t$dst", (JCC_1 brtarget8:$dst, 0), 0>;
+def : InstAlias<"jno\t$dst", (JCC_1 brtarget8:$dst, 1), 0>;
+def : InstAlias<"jb\t$dst", (JCC_1 brtarget8:$dst, 2), 0>;
+def : InstAlias<"jae\t$dst", (JCC_1 brtarget8:$dst, 3), 0>;
+def : InstAlias<"je\t$dst", (JCC_1 brtarget8:$dst, 4), 0>;
+def : InstAlias<"jne\t$dst", (JCC_1 brtarget8:$dst, 5), 0>;
+def : InstAlias<"jbe\t$dst", (JCC_1 brtarget8:$dst, 6), 0>;
+def : InstAlias<"ja\t$dst", (JCC_1 brtarget8:$dst, 7), 0>;
+def : InstAlias<"js\t$dst", (JCC_1 brtarget8:$dst, 8), 0>;
+def : InstAlias<"jns\t$dst", (JCC_1 brtarget8:$dst, 9), 0>;
+def : InstAlias<"jp\t$dst", (JCC_1 brtarget8:$dst, 10), 0>;
+def : InstAlias<"jnp\t$dst", (JCC_1 brtarget8:$dst, 11), 0>;
+def : InstAlias<"jl\t$dst", (JCC_1 brtarget8:$dst, 12), 0>;
+def : InstAlias<"jge\t$dst", (JCC_1 brtarget8:$dst, 13), 0>;
+def : InstAlias<"jle\t$dst", (JCC_1 brtarget8:$dst, 14), 0>;
+def : InstAlias<"jg\t$dst", (JCC_1 brtarget8:$dst, 15), 0>;
+
+// jcx/jecx/jrcx instructions.
+let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in {
+  // These are the 32-bit versions of this instruction for the asmparser. In
+  // 32-bit mode, the form with the address-size prefix is jcxz and the
+  // unprefixed form is jecxz.
+ let Uses = [CX] in
+ def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+ "jcxz\t$dst", []>, AdSize16, Requires<[Not64BitMode]>;
+ let Uses = [ECX] in
+ def JECXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+ "jecxz\t$dst", []>, AdSize32;
+
+ let Uses = [RCX] in
+ def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+ "jrcxz\t$dst", []>, AdSize64, Requires<[In64BitMode]>;
+}
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+ def JMP16r : I<0xFF, MRM4r, (outs), (ins GR16:$dst), "jmp{w}\t{*}$dst",
+ [(brind GR16:$dst)]>, Requires<[Not64BitMode]>,
+ OpSize16, Sched<[WriteJump]>;
+ def JMP16m : I<0xFF, MRM4m, (outs), (ins i16mem:$dst), "jmp{w}\t{*}$dst",
+ [(brind (loadi16 addr:$dst))]>, Requires<[Not64BitMode]>,
+ OpSize16, Sched<[WriteJumpLd]>;
+
+ def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst",
+ [(brind GR32:$dst)]>, Requires<[Not64BitMode]>,
+ OpSize32, Sched<[WriteJump]>;
+ def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst",
+ [(brind (loadi32 addr:$dst))]>, Requires<[Not64BitMode]>,
+ OpSize32, Sched<[WriteJumpLd]>;
+
+ def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst",
+ [(brind GR64:$dst)]>, Requires<[In64BitMode]>,
+ Sched<[WriteJump]>;
+ def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst",
+ [(brind (loadi64 addr:$dst))]>, Requires<[In64BitMode]>,
+ Sched<[WriteJumpLd]>;
+
+ // Win64 wants indirect jumps leaving the function to have a REX_W prefix.
+ // These are switched from TAILJMPr/m64_REX in MCInstLower.
+ let isCodeGenOnly = 1, hasREX_WPrefix = 1 in {
+ def JMP64r_REX : I<0xFF, MRM4r, (outs), (ins GR64:$dst),
+ "rex64 jmp{q}\t{*}$dst", []>, Sched<[WriteJump]>;
+ let mayLoad = 1 in
+ def JMP64m_REX : I<0xFF, MRM4m, (outs), (ins i64mem:$dst),
+ "rex64 jmp{q}\t{*}$dst", []>, Sched<[WriteJumpLd]>;
+
+ }
+
+ // Non-tracking jumps for IBT, use with caution.
+ let isCodeGenOnly = 1 in {
+ def JMP16r_NT : I<0xFF, MRM4r, (outs), (ins GR16 : $dst), "jmp{w}\t{*}$dst",
+ [(X86NoTrackBrind GR16 : $dst)]>, Requires<[Not64BitMode]>,
+ OpSize16, Sched<[WriteJump]>, NOTRACK;
+
+ def JMP16m_NT : I<0xFF, MRM4m, (outs), (ins i16mem : $dst), "jmp{w}\t{*}$dst",
+ [(X86NoTrackBrind (loadi16 addr : $dst))]>,
+ Requires<[Not64BitMode]>, OpSize16, Sched<[WriteJumpLd]>,
+ NOTRACK;
+
+ def JMP32r_NT : I<0xFF, MRM4r, (outs), (ins GR32 : $dst), "jmp{l}\t{*}$dst",
+ [(X86NoTrackBrind GR32 : $dst)]>, Requires<[Not64BitMode]>,
+ OpSize32, Sched<[WriteJump]>, NOTRACK;
+ def JMP32m_NT : I<0xFF, MRM4m, (outs), (ins i32mem : $dst), "jmp{l}\t{*}$dst",
+ [(X86NoTrackBrind (loadi32 addr : $dst))]>,
+ Requires<[Not64BitMode]>, OpSize32, Sched<[WriteJumpLd]>,
+ NOTRACK;
+
+ def JMP64r_NT : I<0xFF, MRM4r, (outs), (ins GR64 : $dst), "jmp{q}\t{*}$dst",
+ [(X86NoTrackBrind GR64 : $dst)]>, Requires<[In64BitMode]>,
+ Sched<[WriteJump]>, NOTRACK;
+ def JMP64m_NT : I<0xFF, MRM4m, (outs), (ins i64mem : $dst), "jmp{q}\t{*}$dst",
+ [(X86NoTrackBrind(loadi64 addr : $dst))]>,
+ Requires<[In64BitMode]>, Sched<[WriteJumpLd]>, NOTRACK;
+ }
+
+ let Predicates = [Not64BitMode], AsmVariantName = "att" in {
+ def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs),
+ (ins i16imm:$off, i16imm:$seg),
+ "ljmp{w}\t$seg, $off", []>,
+ OpSize16, Sched<[WriteJump]>;
+ def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs),
+ (ins i32imm:$off, i16imm:$seg),
+ "ljmp{l}\t$seg, $off", []>,
+ OpSize32, Sched<[WriteJump]>;
+ }
+ let mayLoad = 1 in {
+ def FARJMP64m : RI<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
+ "ljmp{q}\t{*}$dst", []>, Sched<[WriteJump]>, Requires<[In64BitMode]>;
+
+ let AsmVariantName = "att" in
+ def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
+ "ljmp{w}\t{*}$dst", []>, OpSize16, Sched<[WriteJumpLd]>;
+ def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
+ "{l}jmp{l}\t{*}$dst", []>, OpSize32, Sched<[WriteJumpLd]>;
+ }
+}
+
+// Loop instructions
+let SchedRW = [WriteJump] in {
+def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", []>;
+def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", []>;
+def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Call Instructions...
+//
+let isCall = 1 in
+ // All calls clobber the non-callee saved registers. ESP is marked as
+ // a use to prevent stack-pointer assignments that appear immediately
+ // before calls from potentially appearing dead. Uses for argument
+ // registers are added manually.
+ let Uses = [ESP, SSP] in {
+ def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
+ (outs), (ins i32imm_brtarget:$dst),
+ "call{l}\t$dst", []>, OpSize32,
+ Requires<[Not64BitMode]>, Sched<[WriteJump]>;
+ let hasSideEffects = 0 in
+ def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
+ (outs), (ins i16imm_brtarget:$dst),
+ "call{w}\t$dst", []>, OpSize16,
+ Sched<[WriteJump]>;
+ def CALL16r : I<0xFF, MRM2r, (outs), (ins GR16:$dst),
+ "call{w}\t{*}$dst", [(X86call GR16:$dst)]>,
+ OpSize16, Requires<[Not64BitMode]>, Sched<[WriteJump]>;
+ def CALL16m : I<0xFF, MRM2m, (outs), (ins i16mem:$dst),
+ "call{w}\t{*}$dst", [(X86call (loadi16 addr:$dst))]>,
+ OpSize16, Requires<[Not64BitMode,FavorMemIndirectCall]>,
+ Sched<[WriteJumpLd]>;
+ def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst),
+ "call{l}\t{*}$dst", [(X86call GR32:$dst)]>, OpSize32,
+ Requires<[Not64BitMode,NotUseIndirectThunkCalls]>,
+ Sched<[WriteJump]>;
+ def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst),
+ "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))]>,
+ OpSize32,
+ Requires<[Not64BitMode,FavorMemIndirectCall,
+ NotUseIndirectThunkCalls]>,
+ Sched<[WriteJumpLd]>;
+
+ // Non-tracking calls for IBT, use with caution.
+ let isCodeGenOnly = 1 in {
+ def CALL16r_NT : I<0xFF, MRM2r, (outs), (ins GR16 : $dst),
+ "call{w}\t{*}$dst",[(X86NoTrackCall GR16 : $dst)]>,
+ OpSize16, Requires<[Not64BitMode]>, Sched<[WriteJump]>, NOTRACK;
+ def CALL16m_NT : I<0xFF, MRM2m, (outs), (ins i16mem : $dst),
+ "call{w}\t{*}$dst",[(X86NoTrackCall(loadi16 addr : $dst))]>,
+ OpSize16, Requires<[Not64BitMode,FavorMemIndirectCall]>,
+ Sched<[WriteJumpLd]>, NOTRACK;
+ def CALL32r_NT : I<0xFF, MRM2r, (outs), (ins GR32 : $dst),
+ "call{l}\t{*}$dst",[(X86NoTrackCall GR32 : $dst)]>,
+ OpSize32, Requires<[Not64BitMode]>, Sched<[WriteJump]>, NOTRACK;
+ def CALL32m_NT : I<0xFF, MRM2m, (outs), (ins i32mem : $dst),
+ "call{l}\t{*}$dst",[(X86NoTrackCall(loadi32 addr : $dst))]>,
+ OpSize32, Requires<[Not64BitMode,FavorMemIndirectCall]>,
+ Sched<[WriteJumpLd]>, NOTRACK;
+ }
+
+ let Predicates = [Not64BitMode], AsmVariantName = "att" in {
+ def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs),
+ (ins i16imm:$off, i16imm:$seg),
+ "lcall{w}\t$seg, $off", []>,
+ OpSize16, Sched<[WriteJump]>;
+ def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs),
+ (ins i32imm:$off, i16imm:$seg),
+ "lcall{l}\t$seg, $off", []>,
+ OpSize32, Sched<[WriteJump]>;
+ }
+
+ let mayLoad = 1 in {
+ def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
+ "lcall{w}\t{*}$dst", []>, OpSize16, Sched<[WriteJumpLd]>;
+ def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
+ "{l}call{l}\t{*}$dst", []>, OpSize32, Sched<[WriteJumpLd]>;
+ }
+ }
+
+
+// Tail call pseudo instructions.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
+ isCodeGenOnly = 1, Uses = [ESP, SSP] in {
+ def TCRETURNdi : PseudoI<(outs), (ins i32imm_brtarget:$dst, i32imm:$offset),
+ []>, Sched<[WriteJump]>, NotMemoryFoldable;
+ def TCRETURNri : PseudoI<(outs), (ins ptr_rc_tailcall:$dst, i32imm:$offset),
+ []>, Sched<[WriteJump]>, NotMemoryFoldable;
+ let mayLoad = 1 in
+ def TCRETURNmi : PseudoI<(outs), (ins i32mem_TC:$dst, i32imm:$offset),
+ []>, Sched<[WriteJumpLd]>;
+
+ def TAILJMPd : PseudoI<(outs), (ins i32imm_brtarget:$dst),
+ []>, Sched<[WriteJump]>;
+
+ def TAILJMPr : PseudoI<(outs), (ins ptr_rc_tailcall:$dst),
+ []>, Sched<[WriteJump]>;
+ let mayLoad = 1 in
+ def TAILJMPm : PseudoI<(outs), (ins i32mem_TC:$dst),
+ []>, Sched<[WriteJumpLd]>;
+}
+
+// Conditional tail calls are similar to the above, but they are branches
+// rather than barriers, and they use EFLAGS.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
+ isCodeGenOnly = 1, SchedRW = [WriteJump] in
+ let Uses = [ESP, EFLAGS, SSP] in {
+ def TCRETURNdicc : PseudoI<(outs),
+ (ins i32imm_brtarget:$dst, i32imm:$offset, i32imm:$cond),
+ []>;
+
+ // This gets substituted to a conditional jump instruction in MC lowering.
+ def TAILJMPd_CC : PseudoI<(outs), (ins i32imm_brtarget:$dst, i32imm:$cond), []>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Call Instructions...
+//
+
+// RSP is marked as a use to prevent stack-pointer assignments that appear
+// immediately before calls from potentially appearing dead. Uses for argument
+// registers are added manually.
+let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
+ // NOTE: this pattern doesn't match "X86call imm", because we do not know
+ // that the offset between an arbitrary immediate and the call will fit in
+ // the 32-bit pcrel field that we have.
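+  // (A call through an arbitrary absolute address is instead materialized
+  // into a register and selected as CALL64r, e.g. roughly
+  // "movabsq $imm, %rax; callq *%rax".)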
+ def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
+ (outs), (ins i64i32imm_brtarget:$dst),
+ "call{q}\t$dst", []>, OpSize32,
+ Requires<[In64BitMode]>;
+ def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
+ "call{q}\t{*}$dst", [(X86call GR64:$dst)]>,
+ Requires<[In64BitMode,NotUseIndirectThunkCalls]>;
+ def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
+ "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))]>,
+ Requires<[In64BitMode,FavorMemIndirectCall,
+ NotUseIndirectThunkCalls]>;
+
+ // Non-tracking calls for IBT, use with caution.
+ let isCodeGenOnly = 1 in {
+ def CALL64r_NT : I<0xFF, MRM2r, (outs), (ins GR64 : $dst),
+ "call{q}\t{*}$dst",[(X86NoTrackCall GR64 : $dst)]>,
+ Requires<[In64BitMode]>, NOTRACK;
+ def CALL64m_NT : I<0xFF, MRM2m, (outs), (ins i64mem : $dst),
+ "call{q}\t{*}$dst",
+ [(X86NoTrackCall(loadi64 addr : $dst))]>,
+ Requires<[In64BitMode,FavorMemIndirectCall]>, NOTRACK;
+ }
+
+ let mayLoad = 1 in
+ def FARCALL64m : RI<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
+ "lcall{q}\t{*}$dst", []>;
+}
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
+ isCodeGenOnly = 1, Uses = [RSP, SSP] in {
+ def TCRETURNdi64 : PseudoI<(outs),
+ (ins i64i32imm_brtarget:$dst, i32imm:$offset),
+ []>, Sched<[WriteJump]>;
+ def TCRETURNri64 : PseudoI<(outs),
+ (ins ptr_rc_tailcall:$dst, i32imm:$offset),
+ []>, Sched<[WriteJump]>, NotMemoryFoldable;
+ let mayLoad = 1 in
+ def TCRETURNmi64 : PseudoI<(outs),
+ (ins i64mem_TC:$dst, i32imm:$offset),
+ []>, Sched<[WriteJumpLd]>, NotMemoryFoldable;
+
+ def TAILJMPd64 : PseudoI<(outs), (ins i64i32imm_brtarget:$dst),
+ []>, Sched<[WriteJump]>;
+
+ def TAILJMPr64 : PseudoI<(outs), (ins ptr_rc_tailcall:$dst),
+ []>, Sched<[WriteJump]>;
+
+ let mayLoad = 1 in
+ def TAILJMPm64 : PseudoI<(outs), (ins i64mem_TC:$dst),
+ []>, Sched<[WriteJumpLd]>;
+
+ // Win64 wants indirect jumps leaving the function to have a REX_W prefix.
+ let hasREX_WPrefix = 1 in {
+ def TAILJMPr64_REX : PseudoI<(outs), (ins ptr_rc_tailcall:$dst),
+ []>, Sched<[WriteJump]>;
+
+ let mayLoad = 1 in
+ def TAILJMPm64_REX : PseudoI<(outs), (ins i64mem_TC:$dst),
+ []>, Sched<[WriteJumpLd]>;
+ }
+}
+
+let isPseudo = 1, isCall = 1, isCodeGenOnly = 1,
+ Uses = [RSP, SSP],
+ usesCustomInserter = 1,
+ SchedRW = [WriteJump] in {
+ def INDIRECT_THUNK_CALL32 :
+ PseudoI<(outs), (ins GR32:$dst), [(X86call GR32:$dst)]>,
+ Requires<[Not64BitMode,UseIndirectThunkCalls]>;
+
+ def INDIRECT_THUNK_CALL64 :
+ PseudoI<(outs), (ins GR64:$dst), [(X86call GR64:$dst)]>,
+ Requires<[In64BitMode,UseIndirectThunkCalls]>;
+
+ // Indirect thunk variant of indirect tail calls.
+ let isTerminator = 1, isReturn = 1, isBarrier = 1 in {
+ def INDIRECT_THUNK_TCRETURN64 :
+ PseudoI<(outs), (ins GR64:$dst, i32imm:$offset), []>;
+ def INDIRECT_THUNK_TCRETURN32 :
+ PseudoI<(outs), (ins GR32:$dst, i32imm:$offset), []>;
+ }
+}
+
+// Conditional tail calls are similar to the above, but they are branches
+// rather than barriers, and they use EFLAGS.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
+ isCodeGenOnly = 1, SchedRW = [WriteJump] in
+ let Uses = [RSP, EFLAGS, SSP] in {
+ def TCRETURNdi64cc : PseudoI<(outs),
+ (ins i64i32imm_brtarget:$dst, i32imm:$offset,
+ i32imm:$cond), []>;
+
+ // This gets substituted to a conditional jump instruction in MC lowering.
+ def TAILJMPd64_CC : PseudoI<(outs),
+ (ins i64i32imm_brtarget:$dst, i32imm:$cond), []>;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrExtension.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrExtension.td
new file mode 100644
index 000000000000..7a4eb138ec34
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrExtension.td
@@ -0,0 +1,222 @@
+//===-- X86InstrExtension.td - Sign and Zero Extensions ----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the sign and zero extension operations.
+//
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0 in {
+ let Defs = [AX], Uses = [AL] in // AX = signext(AL)
+ def CBW : I<0x98, RawFrm, (outs), (ins),
+ "{cbtw|cbw}", []>, OpSize16, Sched<[WriteALU]>;
+ let Defs = [EAX], Uses = [AX] in // EAX = signext(AX)
+ def CWDE : I<0x98, RawFrm, (outs), (ins),
+ "{cwtl|cwde}", []>, OpSize32, Sched<[WriteALU]>;
+ let Defs = [RAX], Uses = [EAX] in // RAX = signext(EAX)
+ def CDQE : RI<0x98, RawFrm, (outs), (ins),
+ "{cltq|cdqe}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>;
+
+ // FIXME: CWD/CDQ/CQO shouldn't Def the A register, but the fast register
+ // allocator crashes if you remove it.
+ let Defs = [AX,DX], Uses = [AX] in // DX:AX = signext(AX)
+ def CWD : I<0x99, RawFrm, (outs), (ins),
+ "{cwtd|cwd}", []>, OpSize16, Sched<[WriteALU]>;
+ let Defs = [EAX,EDX], Uses = [EAX] in // EDX:EAX = signext(EAX)
+ def CDQ : I<0x99, RawFrm, (outs), (ins),
+ "{cltd|cdq}", []>, OpSize32, Sched<[WriteALU]>;
+ let Defs = [RAX,RDX], Uses = [RAX] in // RDX:RAX = signext(RAX)
+ def CQO : RI<0x99, RawFrm, (outs), (ins),
+ "{cqto|cqo}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>;
+}
+
+// Sign/Zero extenders
+let hasSideEffects = 0 in {
+def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
+ "movs{bw|x}\t{$src, $dst|$dst, $src}", []>,
+ TB, OpSize16, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
+ "movs{bw|x}\t{$src, $dst|$dst, $src}", []>,
+ TB, OpSize16, Sched<[WriteALULd]>;
+} // hasSideEffects = 0
+def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src),
+ "movs{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sext GR8:$src))]>, TB,
+ OpSize32, Sched<[WriteALU]>;
+def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
+ "movs{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sextloadi32i8 addr:$src))]>, TB,
+ OpSize32, Sched<[WriteALULd]>;
+def MOVSX32rr16: I<0xBF, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
+ "movs{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sext GR16:$src))]>, TB,
+ OpSize32, Sched<[WriteALU]>;
+def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+ "movs{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sextloadi32i16 addr:$src))]>,
+ OpSize32, TB, Sched<[WriteALULd]>;
+
+let hasSideEffects = 0 in {
+def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
+ "movz{bw|x}\t{$src, $dst|$dst, $src}", []>,
+ TB, OpSize16, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
+ "movz{bw|x}\t{$src, $dst|$dst, $src}", []>,
+ TB, OpSize16, Sched<[WriteALULd]>;
+} // hasSideEffects = 0
+def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zext GR8:$src))]>, TB,
+ OpSize32, Sched<[WriteALU]>;
+def MOVZX32rm8 : I<0xB6, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zextloadi32i8 addr:$src))]>, TB,
+ OpSize32, Sched<[WriteALULd]>;
+def MOVZX32rr16: I<0xB7, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
+ "movz{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zext GR16:$src))]>, TB,
+ OpSize32, Sched<[WriteALU]>;
+def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+ "movz{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zextloadi32i16 addr:$src))]>,
+ TB, OpSize32, Sched<[WriteALULd]>;
+
+// These instructions exist as a consequence of the operand-size prefix having
+// control of the destination size, but not the input size. They are supported
+// only for the disassembler.
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+def MOVSX16rr16: I<0xBF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "movs{ww|x}\t{$src, $dst|$dst, $src}",
+ []>, TB, OpSize16, Sched<[WriteALU]>, NotMemoryFoldable;
+def MOVZX16rr16: I<0xB7, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "movz{ww|x}\t{$src, $dst|$dst, $src}",
+ []>, TB, OpSize16, Sched<[WriteALU]>, NotMemoryFoldable;
+let mayLoad = 1 in {
+def MOVSX16rm16: I<0xBF, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "movs{ww|x}\t{$src, $dst|$dst, $src}",
+ []>, OpSize16, TB, Sched<[WriteALULd]>, NotMemoryFoldable;
+def MOVZX16rm16: I<0xB7, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "movz{ww|x}\t{$src, $dst|$dst, $src}",
+ []>, TB, OpSize16, Sched<[WriteALULd]>, NotMemoryFoldable;
+} // mayLoad = 1
+} // isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0
+
+// These are the same as the regular MOVZX32rr8 and MOVZX32rm8
+// except that they use GR32_NOREX for the output operand register class
+// instead of GR32. This allows them to operate on h registers on x86-64.
+let hasSideEffects = 0, isCodeGenOnly = 1 in {
+def MOVZX32rr8_NOREX : I<0xB6, MRMSrcReg,
+ (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src}",
+ []>, TB, OpSize32, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVZX32rm8_NOREX : I<0xB6, MRMSrcMem,
+ (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src}",
+ []>, TB, OpSize32, Sched<[WriteALULd]>;
+
+def MOVSX32rr8_NOREX : I<0xBE, MRMSrcReg,
+ (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
+ "movs{bl|x}\t{$src, $dst|$dst, $src}",
+ []>, TB, OpSize32, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVSX32rm8_NOREX : I<0xBE, MRMSrcMem,
+ (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
+ "movs{bl|x}\t{$src, $dst|$dst, $src}",
+ []>, TB, OpSize32, Sched<[WriteALULd]>;
+}
+
+// MOVSX64rr8 always has a REX prefix and it has an 8-bit register
+// operand, which makes it a rare instruction with an 8-bit register
+// operand that can never access an h register. If support for h registers
+// were generalized, this would require a special register class.
+def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
+ "movs{bq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR8:$src))]>, TB,
+ Sched<[WriteALU]>;
+def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
+ "movs{bq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i8 addr:$src))]>,
+ TB, Sched<[WriteALULd]>;
+def MOVSX64rr16: RI<0xBF, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+ "movs{wq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR16:$src))]>, TB,
+ Sched<[WriteALU]>;
+def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "movs{wq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i16 addr:$src))]>,
+ TB, Sched<[WriteALULd]>;
+def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
+ "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR32:$src))]>,
+ Sched<[WriteALU]>, Requires<[In64BitMode]>;
+def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
+ "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i32 addr:$src))]>,
+ Sched<[WriteALULd]>, Requires<[In64BitMode]>;
+
+// These instructions exist as a consequence of the operand-size prefix having
+// control of the destination size, but not the input size. They are supported
+// only for the disassembler.
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+def MOVSX16rr32: I<0x63, MRMSrcReg, (outs GR16:$dst), (ins GR32:$src),
+ "movs{lq|xd}\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteALU]>, OpSize16, Requires<[In64BitMode]>;
+def MOVSX32rr32: I<0x63, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "movs{lq|xd}\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteALU]>, OpSize32, Requires<[In64BitMode]>;
+let mayLoad = 1 in {
+def MOVSX16rm32: I<0x63, MRMSrcMem, (outs GR16:$dst), (ins i32mem:$src),
+ "movs{lq|xd}\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteALULd]>, OpSize16, Requires<[In64BitMode]>;
+def MOVSX32rm32: I<0x63, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "movs{lq|xd}\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteALULd]>, OpSize32, Requires<[In64BitMode]>;
+} // mayLoad = 1
+} // isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0
+
+// movzbq and movzwq encodings for the disassembler
+let hasSideEffects = 0 in {
+def MOVZX64rr8 : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src),
+ "movz{bq|x}\t{$src, $dst|$dst, $src}", []>,
+ TB, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVZX64rm8 : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src),
+ "movz{bq|x}\t{$src, $dst|$dst, $src}", []>,
+ TB, Sched<[WriteALULd]>;
+def MOVZX64rr16 : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+ "movz{wq|x}\t{$src, $dst|$dst, $src}", []>,
+ TB, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVZX64rm16 : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "movz{wq|x}\t{$src, $dst|$dst, $src}", []>,
+ TB, Sched<[WriteALULd]>;
+}
+
+// 64-bit zero-extension patterns use SUBREG_TO_REG and an operation writing a
+// 32-bit register.
+def : Pat<(i64 (zext GR8:$src)),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8:$src), sub_32bit)>;
+def : Pat<(zextloadi64i8 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+
+def : Pat<(i64 (zext GR16:$src)),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16:$src), sub_32bit)>;
+def : Pat<(zextloadi64i16 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
+
+// The preferred way to do 32-bit-to-64-bit zero extension on x86-64 is to use
+// a SUBREG_TO_REG to utilize implicit zero-extension; however, this isn't
+// possible when the 32-bit value is defined by a truncate or is copied from
+// something where the high bits aren't necessarily all zero. In such cases,
+// we fall back to these explicit zext instructions.
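+// For example, "(uint64_t)x" for a plain 32-bit register value x is just a
+// "movl %edi, %eax"-style copy: writing a 32-bit register implicitly zeroes
+// bits 63:32, and SUBREG_TO_REG models that implicit zero-extension without
+// any additional zeroing instruction.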
+def : Pat<(i64 (zext GR32:$src)),
+ (SUBREG_TO_REG (i64 0), (MOV32rr GR32:$src), sub_32bit)>;
+def : Pat<(i64 (zextloadi64i32 addr:$src)),
+ (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA.td
new file mode 100644
index 000000000000..f9be3a783279
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA.td
@@ -0,0 +1,640 @@
+//===-- X86InstrFMA.td - FMA Instruction Set ---------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes FMA (Fused Multiply-Add) instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FMA3 - Intel 3 operand Fused Multiply-Add instructions
+//===----------------------------------------------------------------------===//
+
+// For all FMA opcodes declared in fma3p_rm_* and fma3s_rm_* multiclasses
+// defined below, both the register and memory variants are commutable.
+// For the register form the commutable operands are 1, 2 and 3.
+// For the memory variant the folded operand must be operand 3. Thus,
+// in that case, only operands 1 and 2 can be swapped.
+// Commuting some of the operands may require an opcode change.
+// FMA*213*:
+// operands 1 and 2 (memory & register forms): *213* --> *213*(no changes);
+// operands 1 and 3 (register forms only): *213* --> *231*;
+// operands 2 and 3 (register forms only): *213* --> *132*.
+// FMA*132*:
+// operands 1 and 2 (memory & register forms): *132* --> *231*;
+// operands 1 and 3 (register forms only): *132* --> *132*(no changes);
+// operands 2 and 3 (register forms only): *132* --> *213*.
+// FMA*231*:
+// operands 1 and 2 (memory & register forms): *231* --> *132*;
+// operands 1 and 3 (register forms only): *231* --> *213*;
+// operands 2 and 3 (register forms only): *231* --> *231*(no changes).
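+//
+// As a concrete illustration, the 213 register form computes
+//   dst = src2 * src1 + src3   (e.g. VFMADD213PS xmm1, xmm2, xmm3)
+// so swapping operands 1 and 3 changes which value is added versus which are
+// multiplied, and the instruction must be rewritten into the 231 form (which
+// computes dst = src2 * src3 + src1) to preserve the result.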
+
+multiclass fma3p_rm_213<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ ValueType VT, X86MemOperand x86memop, PatFrag MemFrag,
+ SDNode Op, X86FoldableSchedWrite sched> {
+ def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst, (VT (Op RC:$src2, RC:$src1, RC:$src3)))]>,
+ Sched<[sched]>;
+
+ let mayLoad = 1 in
+ def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst, (VT (Op RC:$src2, RC:$src1,
+ (MemFrag addr:$src3))))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
+}
+
+multiclass fma3p_rm_231<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ ValueType VT, X86MemOperand x86memop, PatFrag MemFrag,
+ SDNode Op, X86FoldableSchedWrite sched> {
+ let hasSideEffects = 0 in
+ def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>, Sched<[sched]>;
+
+ let mayLoad = 1 in
+ def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst, (VT (Op RC:$src2, (MemFrag addr:$src3),
+ RC:$src1)))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
+}
+
+multiclass fma3p_rm_132<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ ValueType VT, X86MemOperand x86memop, PatFrag MemFrag,
+ SDNode Op, X86FoldableSchedWrite sched> {
+ let hasSideEffects = 0 in
+ def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>, Sched<[sched]>;
+
+  // The pattern is in 312 order so that the load is in a different place
+  // from the 213 and 231 patterns; this helps tablegen's duplicate pattern
+  // detection.
+ let mayLoad = 1 in
+ def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst, (VT (Op (MemFrag addr:$src3), RC:$src1,
+ RC:$src2)))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
+}
+
+let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1,
+ Uses = [MXCSR], mayRaiseFPException = 1 in
+multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpcodeStr, string PackTy, string Suff,
+ PatFrag MemFrag128, PatFrag MemFrag256,
+ SDNode Op, ValueType OpTy128, ValueType OpTy256,
+ X86SchedWriteWidths sched> {
+ defm NAME#213#Suff : fma3p_rm_213<opc213, !strconcat(OpcodeStr, "213", PackTy),
+ VR128, OpTy128, f128mem, MemFrag128, Op, sched.XMM>;
+ defm NAME#231#Suff : fma3p_rm_231<opc231, !strconcat(OpcodeStr, "231", PackTy),
+ VR128, OpTy128, f128mem, MemFrag128, Op, sched.XMM>;
+ defm NAME#132#Suff : fma3p_rm_132<opc132, !strconcat(OpcodeStr, "132", PackTy),
+ VR128, OpTy128, f128mem, MemFrag128, Op, sched.XMM>;
+
+ defm NAME#213#Suff#Y : fma3p_rm_213<opc213, !strconcat(OpcodeStr, "213", PackTy),
+ VR256, OpTy256, f256mem, MemFrag256, Op, sched.YMM>,
+ VEX_L;
+ defm NAME#231#Suff#Y : fma3p_rm_231<opc231, !strconcat(OpcodeStr, "231", PackTy),
+ VR256, OpTy256, f256mem, MemFrag256, Op, sched.YMM>,
+ VEX_L;
+ defm NAME#132#Suff#Y : fma3p_rm_132<opc132, !strconcat(OpcodeStr, "132", PackTy),
+ VR256, OpTy256, f256mem, MemFrag256, Op, sched.YMM>,
+ VEX_L;
+}
+
+// Fused Multiply-Add
+let ExeDomain = SSEPackedSingle in {
+ defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", "PS",
+ loadv4f32, loadv8f32, any_fma, v4f32, v8f32,
+ SchedWriteFMA>;
+ defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS",
+ loadv4f32, loadv8f32, X86any_Fmsub, v4f32, v8f32,
+ SchedWriteFMA>;
+ defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", "PS",
+ loadv4f32, loadv8f32, X86Fmaddsub, v4f32, v8f32,
+ SchedWriteFMA>;
+ defm VFMSUBADD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps", "PS",
+ loadv4f32, loadv8f32, X86Fmsubadd, v4f32, v8f32,
+ SchedWriteFMA>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+ defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", "PD",
+ loadv2f64, loadv4f64, any_fma, v2f64,
+ v4f64, SchedWriteFMA>, VEX_W;
+ defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", "PD",
+ loadv2f64, loadv4f64, X86any_Fmsub, v2f64,
+ v4f64, SchedWriteFMA>, VEX_W;
+ defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", "PD",
+ loadv2f64, loadv4f64, X86Fmaddsub,
+ v2f64, v4f64, SchedWriteFMA>, VEX_W;
+ defm VFMSUBADD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd", "PD",
+ loadv2f64, loadv4f64, X86Fmsubadd,
+ v2f64, v4f64, SchedWriteFMA>, VEX_W;
+}
+
+// Fused Negative Multiply-Add
+let ExeDomain = SSEPackedSingle in {
+ defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", "PS", loadv4f32,
+ loadv8f32, X86any_Fnmadd, v4f32, v8f32, SchedWriteFMA>;
+ defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", "PS", loadv4f32,
+ loadv8f32, X86any_Fnmsub, v4f32, v8f32, SchedWriteFMA>;
+}
+let ExeDomain = SSEPackedDouble in {
+ defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", "PD", loadv2f64,
+ loadv4f64, X86any_Fnmadd, v2f64, v4f64, SchedWriteFMA>, VEX_W;
+ defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", "PD", loadv2f64,
+ loadv4f64, X86any_Fnmsub, v2f64, v4f64, SchedWriteFMA>, VEX_W;
+}
+
+// All source register operands of FMA opcodes defined in fma3s_rm multiclass
+// can be commuted. In many cases such commute transformation requires an opcode
+// adjustment, for example, commuting the operands 1 and 2 in FMA*132 form
+// would require an opcode change to FMA*231:
+// FMA*132* reg1, reg2, reg3; // reg1 * reg3 + reg2;
+// -->
+// FMA*231* reg2, reg1, reg3; // reg1 * reg3 + reg2;
+// Please see more detailed comment at the very beginning of the section
+// defining FMA3 opcodes above.
+multiclass fma3s_rm_213<bits<8> opc, string OpcodeStr,
+ X86MemOperand x86memop, RegisterClass RC,
+ SDPatternOperator OpNode,
+ X86FoldableSchedWrite sched> {
+ def r : FMA3S<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>,
+ Sched<[sched]>;
+
+ let mayLoad = 1 in
+ def m : FMA3S<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
+}
+
+multiclass fma3s_rm_231<bits<8> opc, string OpcodeStr,
+ X86MemOperand x86memop, RegisterClass RC,
+ SDPatternOperator OpNode, X86FoldableSchedWrite sched> {
+ let hasSideEffects = 0 in
+ def r : FMA3S<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>, Sched<[sched]>;
+
+ let mayLoad = 1 in
+ def m : FMA3S<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpNode RC:$src2, (load addr:$src3), RC:$src1))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
+}
+
+multiclass fma3s_rm_132<bits<8> opc, string OpcodeStr,
+ X86MemOperand x86memop, RegisterClass RC,
+ SDPatternOperator OpNode, X86FoldableSchedWrite sched> {
+ let hasSideEffects = 0 in
+ def r : FMA3S<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>, Sched<[sched]>;
+
+  // The pattern is in 312 order so that the load is in a different place
+  // from the 213 and 231 patterns; this helps tablegen's duplicate pattern
+  // detection.
+ let mayLoad = 1 in
+ def m : FMA3S<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpNode (load addr:$src3), RC:$src1, RC:$src2))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
+}
+
+let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
+ hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in
+multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpStr, string PackTy, string Suff,
+ SDNode OpNode, RegisterClass RC,
+ X86MemOperand x86memop, X86FoldableSchedWrite sched> {
+ defm NAME#213#Suff : fma3s_rm_213<opc213, !strconcat(OpStr, "213", PackTy),
+ x86memop, RC, OpNode, sched>;
+ defm NAME#231#Suff : fma3s_rm_231<opc231, !strconcat(OpStr, "231", PackTy),
+ x86memop, RC, OpNode, sched>;
+ defm NAME#132#Suff : fma3s_rm_132<opc132, !strconcat(OpStr, "132", PackTy),
+ x86memop, RC, OpNode, sched>;
+}
+
+// These FMA*_Int instructions are defined specially for being used when
+// the scalar FMA intrinsics are lowered to machine instructions, and in that
+// sense, they are similar to existing ADD*_Int, SUB*_Int, MUL*_Int, etc.
+// instructions.
+//
+// All of the FMA*_Int opcodes are defined as commutable here.
+// Commuting the 2nd and 3rd source register operands of FMAs is quite trivial
+// and the corresponding optimizations have been developed.
+// Commuting the 1st operand of FMA*_Int requires some additional analysis:
+// the commute optimization is legal only if all users of FMA*_Int use only
+// the lowest element of the FMA*_Int instruction. Even though such analysis
+// may not be implemented yet, we allow the routines doing the actual commute
+// transformation to decide whether a given instruction is commutable or not.
+let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0,
+ Uses = [MXCSR], mayRaiseFPException = 1 in
+multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
+ Operand memopr, RegisterClass RC,
+ X86FoldableSchedWrite sched> {
+ def r_Int : FMA3S_Int<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>, Sched<[sched]>;
+
+ let mayLoad = 1 in
+ def m_Int : FMA3S_Int<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, memopr:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
+}
+
+// The FMA 213 form is created for lowering of scalar FMA intrinsics
+// to machine instructions.
+// The FMA 132 form can trivially be obtained by commuting the 2nd and 3rd
+// operands of the 213 form.
+// The FMA 231 form can be obtained only by commuting the 1st operand of the
+// 213 or 132 forms, which is legal only after special analysis of all uses of
+// the initial instruction. Such analysis does not exist yet, so the 231 form
+// of the FMA*_Int instructions is introduced under the optimistic assumption
+// that such analysis will be implemented eventually.
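+//
+// For reference (standard FMA3 semantics; operand 1 is the tied
+// source/destination):
+//   132 form: dst = src1 * src3 + src2
+//   213 form: dst = src2 * src1 + src3
+//   231 form: dst = src2 * src3 + src1
+// so swapping src2 and src3 moves between the 213 and 132 forms, while the
+// 231 form is the one that keeps the tied operand out of the multiply.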
+multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpStr, string PackTy, string Suff,
+ RegisterClass RC, Operand memop,
+ X86FoldableSchedWrite sched> {
+ defm NAME#132#Suff : fma3s_rm_int<opc132, !strconcat(OpStr, "132", PackTy),
+ memop, RC, sched>;
+ defm NAME#213#Suff : fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy),
+ memop, RC, sched>;
+ defm NAME#231#Suff : fma3s_rm_int<opc231, !strconcat(OpStr, "231", PackTy),
+ memop, RC, sched>;
+}
+
+multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpStr, SDNode OpNode, X86FoldableSchedWrite sched> {
+ let ExeDomain = SSEPackedSingle in
+ defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", OpNode,
+ FR32, f32mem, sched>,
+ fma3s_int_forms<opc132, opc213, opc231, OpStr, "ss", "SS",
+ VR128, ssmem, sched>;
+
+ let ExeDomain = SSEPackedDouble in
+ defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "SD", OpNode,
+ FR64, f64mem, sched>,
+ fma3s_int_forms<opc132, opc213, opc231, OpStr, "sd", "SD",
+ VR128, sdmem, sched>, VEX_W;
+}
+
+defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", any_fma,
+ SchedWriteFMA.Scl>, VEX_LIG;
+defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86any_Fmsub,
+ SchedWriteFMA.Scl>, VEX_LIG;
+
+defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86any_Fnmadd,
+ SchedWriteFMA.Scl>, VEX_LIG;
+defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86any_Fnmsub,
+ SchedWriteFMA.Scl>, VEX_LIG;
+
+multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix,
+ SDNode Move, ValueType VT, ValueType EltVT,
+ RegisterClass RC, PatFrag mem_frag> {
+ let Predicates = [HasFMA, NoAVX512] in {
+ def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector
+ (Op RC:$src2,
+ (EltVT (extractelt (VT VR128:$src1), (iPTR 0))),
+ RC:$src3))))),
+ (!cast<Instruction>(Prefix#"213"#Suffix#"r_Int")
+ VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
+ (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>;
+
+ def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector
+ (Op RC:$src2, RC:$src3,
+ (EltVT (extractelt (VT VR128:$src1), (iPTR 0)))))))),
+ (!cast<Instruction>(Prefix#"231"#Suffix#"r_Int")
+ VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
+ (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>;
+
+ def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector
+ (Op RC:$src2,
+ (EltVT (extractelt (VT VR128:$src1), (iPTR 0))),
+ (mem_frag addr:$src3)))))),
+ (!cast<Instruction>(Prefix#"213"#Suffix#"m_Int")
+ VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
+ addr:$src3)>;
+
+ def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector
+ (Op (EltVT (extractelt (VT VR128:$src1), (iPTR 0))),
+ (mem_frag addr:$src3), RC:$src2))))),
+ (!cast<Instruction>(Prefix#"132"#Suffix#"m_Int")
+ VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
+ addr:$src3)>;
+
+ def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector
+ (Op RC:$src2, (mem_frag addr:$src3),
+ (EltVT (extractelt (VT VR128:$src1), (iPTR 0)))))))),
+ (!cast<Instruction>(Prefix#"231"#Suffix#"m_Int")
+ VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
+ addr:$src3)>;
+ }
+}
+
+defm : scalar_fma_patterns<any_fma, "VFMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86any_Fmsub, "VFMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86any_Fnmadd, "VFNMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86any_Fnmsub, "VFNMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+
+defm : scalar_fma_patterns<any_fma, "VFMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<X86any_Fmsub, "VFMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<X86any_Fnmadd, "VFNMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<X86any_Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+
+//===----------------------------------------------------------------------===//
+// FMA4 - AMD 4 operand Fused Multiply-Add instructions
+//===----------------------------------------------------------------------===//
+
+let Uses = [MXCSR], mayRaiseFPException = 1 in
+multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, ValueType OpVT, SDNode OpNode,
+ PatFrag mem_frag, X86FoldableSchedWrite sched> {
+ let isCommutable = 1 in
+ def rr : FMA4S<opc, MRMSrcRegOp4, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG,
+ Sched<[sched]>;
+ def rm : FMA4S<opc, MRMSrcMemOp4, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst, (OpNode RC:$src1, RC:$src2,
+ (mem_frag addr:$src3)))]>, VEX_W, VEX_LIG,
+ Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
+ def mr : FMA4S<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>, VEX_LIG,
+ Sched<[sched.Folded, sched.ReadAfterFold,
+ // x86memop:$src2
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // RC:$src3
+ sched.ReadAfterFold]>;
+// For disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rr_REV : FMA4S<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
+ VEX_LIG, FoldGenData<NAME#rr>, Sched<[sched]>;
+}
+
+multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
+ ValueType VT, X86FoldableSchedWrite sched> {
+let isCodeGenOnly = 1, hasSideEffects = 0,
+ Uses = [MXCSR], mayRaiseFPException = 1 in {
+ def rr_Int : FMA4S_Int<opc, MRMSrcRegOp4, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, VEX_W, VEX_LIG, Sched<[sched]>;
+ let mayLoad = 1 in
+ def rm_Int : FMA4S_Int<opc, MRMSrcMemOp4, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, VEX_W, VEX_LIG,
+ Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
+ let mayLoad = 1 in
+ def mr_Int : FMA4S_Int<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, memop:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>,
+ VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold,
+ // memop:$src2
+ ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault, ReadDefault,
+ // VR128::$src3
+ sched.ReadAfterFold]>;
+ def rr_Int_REV : FMA4S_Int<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, VEX_LIG, FoldGenData<NAME#rr_Int>, Sched<[sched]>;
+} // isCodeGenOnly = 1
+}
+
+let Uses = [MXCSR], mayRaiseFPException = 1 in
+multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT128, ValueType OpVT256,
+ PatFrag ld_frag128, PatFrag ld_frag256,
+ X86SchedWriteWidths sched> {
+ let isCommutable = 1 in
+ def rr : FMA4<opc, MRMSrcRegOp4, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>,
+ VEX_W, Sched<[sched.XMM]>;
+ def rm : FMA4<opc, MRMSrcMemOp4, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2,
+ (ld_frag128 addr:$src3)))]>, VEX_W,
+ Sched<[sched.XMM.Folded, sched.XMM.ReadAfterFold, sched.XMM.ReadAfterFold]>;
+ def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>,
+ Sched<[sched.XMM.Folded, sched.XMM.ReadAfterFold,
+ // f128mem:$src2
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // VR128::$src3
+ sched.XMM.ReadAfterFold]>;
+ let isCommutable = 1 in
+ def Yrr : FMA4<opc, MRMSrcRegOp4, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst,
+ (OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>,
+ VEX_W, VEX_L, Sched<[sched.YMM]>;
+ def Yrm : FMA4<opc, MRMSrcMemOp4, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2,
+ (ld_frag256 addr:$src3)))]>, VEX_W, VEX_L,
+ Sched<[sched.YMM.Folded, sched.YMM.ReadAfterFold, sched.YMM.ReadAfterFold]>;
+ def Ymr : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f256mem:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst, (OpNode VR256:$src1,
+ (ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L,
+ Sched<[sched.YMM.Folded, sched.YMM.ReadAfterFold,
+ // f256mem:$src2
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // VR256::$src3
+ sched.YMM.ReadAfterFold]>;
+// For disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+ def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
+ Sched<[sched.XMM]>, FoldGenData<NAME#rr>;
+ def Yrr_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
+ VEX_L, Sched<[sched.YMM]>, FoldGenData<NAME#Yrr>;
+} // isCodeGenOnly = 1
+}
+
+let ExeDomain = SSEPackedSingle in {
+ // Scalar Instructions
+ defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, any_fma, loadf32,
+ SchedWriteFMA.Scl>,
+ fma4s_int<0x6A, "vfmaddss", ssmem, v4f32,
+ SchedWriteFMA.Scl>;
+ defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86any_Fmsub, loadf32,
+ SchedWriteFMA.Scl>,
+ fma4s_int<0x6E, "vfmsubss", ssmem, v4f32,
+ SchedWriteFMA.Scl>;
+ defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
+ X86any_Fnmadd, loadf32, SchedWriteFMA.Scl>,
+ fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32,
+ SchedWriteFMA.Scl>;
+ defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
+ X86any_Fnmsub, loadf32, SchedWriteFMA.Scl>,
+ fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32,
+ SchedWriteFMA.Scl>;
+ // Packed Instructions
+ defm VFMADDPS4 : fma4p<0x68, "vfmaddps", any_fma, v4f32, v8f32,
+ loadv4f32, loadv8f32, SchedWriteFMA>;
+ defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86any_Fmsub, v4f32, v8f32,
+ loadv4f32, loadv8f32, SchedWriteFMA>;
+ defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86any_Fnmadd, v4f32, v8f32,
+ loadv4f32, loadv8f32, SchedWriteFMA>;
+ defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86any_Fnmsub, v4f32, v8f32,
+ loadv4f32, loadv8f32, SchedWriteFMA>;
+ defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32,
+ loadv4f32, loadv8f32, SchedWriteFMA>;
+ defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32,
+ loadv4f32, loadv8f32, SchedWriteFMA>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+ // Scalar Instructions
+ defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, any_fma, loadf64,
+ SchedWriteFMA.Scl>,
+ fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64,
+ SchedWriteFMA.Scl>;
+ defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86any_Fmsub, loadf64,
+ SchedWriteFMA.Scl>,
+ fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64,
+ SchedWriteFMA.Scl>;
+ defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
+ X86any_Fnmadd, loadf64, SchedWriteFMA.Scl>,
+ fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64,
+ SchedWriteFMA.Scl>;
+ defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
+ X86any_Fnmsub, loadf64, SchedWriteFMA.Scl>,
+ fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64,
+ SchedWriteFMA.Scl>;
+ // Packed Instructions
+ defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", any_fma, v2f64, v4f64,
+ loadv2f64, loadv4f64, SchedWriteFMA>;
+ defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86any_Fmsub, v2f64, v4f64,
+ loadv2f64, loadv4f64, SchedWriteFMA>;
+ defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86any_Fnmadd, v2f64, v4f64,
+ loadv2f64, loadv4f64, SchedWriteFMA>;
+ defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86any_Fnmsub, v2f64, v4f64,
+ loadv2f64, loadv4f64, SchedWriteFMA>;
+ defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64,
+ loadv2f64, loadv4f64, SchedWriteFMA>;
+ defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64,
+ loadv2f64, loadv4f64, SchedWriteFMA>;
+}
+
+multiclass scalar_fma4_patterns<SDNode Op, string Name,
+ ValueType VT, ValueType EltVT,
+ RegisterClass RC, PatFrag mem_frag> {
+ let Predicates = [HasFMA4] in {
+ def : Pat<(VT (X86vzmovl (VT (scalar_to_vector
+ (Op RC:$src1, RC:$src2, RC:$src3))))),
+ (!cast<Instruction>(Name#"rr_Int")
+ (VT (COPY_TO_REGCLASS RC:$src1, VR128)),
+ (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
+ (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>;
+
+ def : Pat<(VT (X86vzmovl (VT (scalar_to_vector
+ (Op RC:$src1, RC:$src2,
+ (mem_frag addr:$src3)))))),
+ (!cast<Instruction>(Name#"rm_Int")
+ (VT (COPY_TO_REGCLASS RC:$src1, VR128)),
+ (VT (COPY_TO_REGCLASS RC:$src2, VR128)), addr:$src3)>;
+
+ def : Pat<(VT (X86vzmovl (VT (scalar_to_vector
+ (Op RC:$src1, (mem_frag addr:$src2),
+ RC:$src3))))),
+ (!cast<Instruction>(Name#"mr_Int")
+ (VT (COPY_TO_REGCLASS RC:$src1, VR128)), addr:$src2,
+ (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>;
+ }
+}
+
+defm : scalar_fma4_patterns<any_fma, "VFMADDSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSS4", v4f32, f32, FR32, loadf32>;
+
+defm : scalar_fma4_patterns<any_fma, "VFMADDSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSD4", v2f64, f64, FR64, loadf64>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
new file mode 100644
index 000000000000..6d803e931b68
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
@@ -0,0 +1,164 @@
+//===-- X86InstrFMA3Info.cpp - X86 FMA3 Instruction Information -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the classes providing information
+// about existing X86 FMA3 opcodes, classifying and grouping them.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrFMA3Info.h"
+#include "X86InstrInfo.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Threading.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+
+#define FMA3GROUP(Name, Suf, Attrs) \
+ { { X86::Name##132##Suf, X86::Name##213##Suf, X86::Name##231##Suf }, Attrs },
+
+#define FMA3GROUP_MASKED(Name, Suf, Attrs) \
+ FMA3GROUP(Name, Suf, Attrs) \
+ FMA3GROUP(Name, Suf##k, Attrs | X86InstrFMA3Group::KMergeMasked) \
+ FMA3GROUP(Name, Suf##kz, Attrs | X86InstrFMA3Group::KZeroMasked)
+
+#define FMA3GROUP_PACKED_WIDTHS(Name, Suf, Attrs) \
+ FMA3GROUP(Name, Suf##Ym, Attrs) \
+ FMA3GROUP(Name, Suf##Yr, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Z128m, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Z128r, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Z256m, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Z256r, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Zm, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Zr, Attrs) \
+ FMA3GROUP(Name, Suf##m, Attrs) \
+ FMA3GROUP(Name, Suf##r, Attrs)
+
+#define FMA3GROUP_PACKED(Name, Attrs) \
+ FMA3GROUP_PACKED_WIDTHS(Name, PD, Attrs) \
+ FMA3GROUP_PACKED_WIDTHS(Name, PS, Attrs)
+
+#define FMA3GROUP_SCALAR_WIDTHS(Name, Suf, Attrs) \
+ FMA3GROUP(Name, Suf##Zm, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Zm_Int, Attrs | X86InstrFMA3Group::Intrinsic) \
+ FMA3GROUP(Name, Suf##Zr, Attrs) \
+ FMA3GROUP_MASKED(Name, Suf##Zr_Int, Attrs | X86InstrFMA3Group::Intrinsic) \
+ FMA3GROUP(Name, Suf##m, Attrs) \
+ FMA3GROUP(Name, Suf##m_Int, Attrs | X86InstrFMA3Group::Intrinsic) \
+ FMA3GROUP(Name, Suf##r, Attrs) \
+ FMA3GROUP(Name, Suf##r_Int, Attrs | X86InstrFMA3Group::Intrinsic)
+
+#define FMA3GROUP_SCALAR(Name, Attrs) \
+ FMA3GROUP_SCALAR_WIDTHS(Name, SD, Attrs) \
+ FMA3GROUP_SCALAR_WIDTHS(Name, SS, Attrs)
+
+#define FMA3GROUP_FULL(Name, Attrs) \
+ FMA3GROUP_PACKED(Name, Attrs) \
+ FMA3GROUP_SCALAR(Name, Attrs)
+
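+// For illustration, one scalar-table entry produced by the macros above
+// (token pasting as in FMA3GROUP) looks like:
+//   FMA3GROUP(VFMADD, SSr, 0)
+//     => { { X86::VFMADD132SSr, X86::VFMADD213SSr, X86::VFMADD231SSr }, 0 },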
+static const X86InstrFMA3Group Groups[] = {
+ FMA3GROUP_FULL(VFMADD, 0)
+ FMA3GROUP_PACKED(VFMADDSUB, 0)
+ FMA3GROUP_FULL(VFMSUB, 0)
+ FMA3GROUP_PACKED(VFMSUBADD, 0)
+ FMA3GROUP_FULL(VFNMADD, 0)
+ FMA3GROUP_FULL(VFNMSUB, 0)
+};
+
+#define FMA3GROUP_PACKED_AVX512_WIDTHS(Name, Type, Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, Type##Z128##Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, Type##Z256##Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, Type##Z##Suf, Attrs)
+
+#define FMA3GROUP_PACKED_AVX512(Name, Suf, Attrs) \
+ FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PD, Suf, Attrs) \
+ FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PS, Suf, Attrs)
+
+#define FMA3GROUP_PACKED_AVX512_ROUND(Name, Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, PDZ##Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, PSZ##Suf, Attrs)
+
+#define FMA3GROUP_SCALAR_AVX512_ROUND(Name, Suf, Attrs) \
+ FMA3GROUP(Name, SDZ##Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, SDZ##Suf##_Int, Attrs) \
+ FMA3GROUP(Name, SSZ##Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, SSZ##Suf##_Int, Attrs)
+
+static const X86InstrFMA3Group BroadcastGroups[] = {
+ FMA3GROUP_PACKED_AVX512(VFMADD, mb, 0)
+ FMA3GROUP_PACKED_AVX512(VFMADDSUB, mb, 0)
+ FMA3GROUP_PACKED_AVX512(VFMSUB, mb, 0)
+ FMA3GROUP_PACKED_AVX512(VFMSUBADD, mb, 0)
+ FMA3GROUP_PACKED_AVX512(VFNMADD, mb, 0)
+ FMA3GROUP_PACKED_AVX512(VFNMSUB, mb, 0)
+};
+
+static const X86InstrFMA3Group RoundGroups[] = {
+ FMA3GROUP_PACKED_AVX512_ROUND(VFMADD, rb, 0)
+ FMA3GROUP_SCALAR_AVX512_ROUND(VFMADD, rb, X86InstrFMA3Group::Intrinsic)
+ FMA3GROUP_PACKED_AVX512_ROUND(VFMADDSUB, rb, 0)
+ FMA3GROUP_PACKED_AVX512_ROUND(VFMSUB, rb, 0)
+ FMA3GROUP_SCALAR_AVX512_ROUND(VFMSUB, rb, X86InstrFMA3Group::Intrinsic)
+ FMA3GROUP_PACKED_AVX512_ROUND(VFMSUBADD, rb, 0)
+ FMA3GROUP_PACKED_AVX512_ROUND(VFNMADD, rb, 0)
+ FMA3GROUP_SCALAR_AVX512_ROUND(VFNMADD, rb, X86InstrFMA3Group::Intrinsic)
+ FMA3GROUP_PACKED_AVX512_ROUND(VFNMSUB, rb, 0)
+ FMA3GROUP_SCALAR_AVX512_ROUND(VFNMSUB, rb, X86InstrFMA3Group::Intrinsic)
+};
+
+static void verifyTables() {
+#ifndef NDEBUG
+ static std::atomic<bool> TableChecked(false);
+ if (!TableChecked.load(std::memory_order_relaxed)) {
+ assert(llvm::is_sorted(Groups) && llvm::is_sorted(RoundGroups) &&
+ llvm::is_sorted(BroadcastGroups) && "FMA3 tables not sorted!");
+ TableChecked.store(true, std::memory_order_relaxed);
+ }
+#endif
+}
+
+/// Returns a pointer to the group of FMA3 opcodes that contains the given
+/// \p Opcode. If the given \p Opcode is not recognized as FMA3 and is not
+/// included in any FMA3 group, then nullptr is returned.
+const X86InstrFMA3Group *llvm::getFMA3Group(unsigned Opcode, uint64_t TSFlags) {
+
+ // FMA3 instructions have a well defined encoding pattern we can exploit.
+ uint8_t BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
+ bool IsFMA3 = ((TSFlags & X86II::EncodingMask) == X86II::VEX ||
+ (TSFlags & X86II::EncodingMask) == X86II::EVEX) &&
+ (TSFlags & X86II::OpMapMask) == X86II::T8 &&
+ (TSFlags & X86II::OpPrefixMask) == X86II::PD &&
+ ((BaseOpcode >= 0x96 && BaseOpcode <= 0x9F) ||
+ (BaseOpcode >= 0xA6 && BaseOpcode <= 0xAF) ||
+ (BaseOpcode >= 0xB6 && BaseOpcode <= 0xBF));
+ if (!IsFMA3)
+ return nullptr;
+
+ verifyTables();
+
+ ArrayRef<X86InstrFMA3Group> Table;
+ if (TSFlags & X86II::EVEX_RC)
+ Table = makeArrayRef(RoundGroups);
+ else if (TSFlags & X86II::EVEX_B)
+ Table = makeArrayRef(BroadcastGroups);
+ else
+ Table = makeArrayRef(Groups);
+
+ // FMA 132 instructions have an opcode of 0x96-0x9F
+ // FMA 213 instructions have an opcode of 0xA6-0xAF
+ // FMA 231 instructions have an opcode of 0xB6-0xBF
+ unsigned FormIndex = ((BaseOpcode - 0x90) >> 4) & 0x3;
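+  // For example, VFMADD213SS has BaseOpcode 0xA9, so
+  // ((0xA9 - 0x90) >> 4) & 0x3 == 1, selecting the 213 slot of each group.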
+
+ auto I = partition_point(Table, [=](const X86InstrFMA3Group &Group) {
+ return Group.Opcodes[FormIndex] < Opcode;
+ });
+ assert(I != Table.end() && I->Opcodes[FormIndex] == Opcode &&
+ "Couldn't find FMA3 opcode!");
+ return I;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.h b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.h
new file mode 100644
index 000000000000..ce0a7cc7f82e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.h
@@ -0,0 +1,97 @@
+//===- X86InstrFMA3Info.h - X86 FMA3 Instruction Information ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the classes providing information
+// about existing X86 FMA3 opcodes, classifying and grouping them.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H
+#define LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H
+
+#include <cstdint>
+
+namespace llvm {
+
+/// This class is used to group {132, 213, 231} forms of FMA opcodes together.
+/// Each group holds exactly 3 opcodes, one per form. Each group also has an
+/// attributes field describing it.
+struct X86InstrFMA3Group {
+ /// An array holding 3 forms of FMA opcodes.
+ uint16_t Opcodes[3];
+
+ /// This bitfield specifies the attributes associated with the created
+ /// FMA groups of opcodes.
+ uint16_t Attributes;
+
+ enum {
+ Form132,
+ Form213,
+ Form231,
+ };
+
+ enum : uint16_t {
+ /// This bit must be set in the 'Attributes' field of FMA group if such
+ /// group of FMA opcodes consists of FMA intrinsic opcodes.
+ Intrinsic = 0x1,
+
+ /// This bit must be set in the 'Attributes' field of FMA group if such
+ /// group of FMA opcodes consists of AVX512 opcodes accepting a k-mask and
+ /// passing the elements from the 1st operand to the result of the operation
+  /// when the corresponding bits in the k-mask are unset.
+ KMergeMasked = 0x2,
+
+ /// This bit must be set in the 'Attributes' field of FMA group if such
+ /// group of FMA opcodes consists of AVX512 opcodes accepting a k-zeromask.
+ KZeroMasked = 0x4,
+ };
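+
+  // For example, with the names built by the FMA3GROUP_MASKED macro in
+  // X86InstrFMA3Info.cpp, a "...k" opcode such as X86::VFMADD213PSZ128rk is
+  // KMergeMasked, and the "...kz" variant X86::VFMADD213PSZ128rkz is
+  // KZeroMasked.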
+
+ /// Returns the 132 form of FMA opcode.
+ unsigned get132Opcode() const {
+ return Opcodes[Form132];
+ }
+
+ /// Returns the 213 form of FMA opcode.
+ unsigned get213Opcode() const {
+ return Opcodes[Form213];
+ }
+
+ /// Returns the 231 form of FMA opcode.
+ unsigned get231Opcode() const {
+ return Opcodes[Form231];
+ }
+
+ /// Returns true iff the group of FMA opcodes holds intrinsic opcodes.
+ bool isIntrinsic() const { return (Attributes & Intrinsic) != 0; }
+
+ /// Returns true iff the group of FMA opcodes holds k-merge-masked opcodes.
+ bool isKMergeMasked() const {
+ return (Attributes & KMergeMasked) != 0;
+ }
+
+ /// Returns true iff the group of FMA opcodes holds k-zero-masked opcodes.
+  bool isKZeroMasked() const { return (Attributes & KZeroMasked) != 0; }
+
+ /// Returns true iff the group of FMA opcodes holds any of k-masked opcodes.
+ bool isKMasked() const {
+ return (Attributes & (KMergeMasked | KZeroMasked)) != 0;
+ }
+
+ bool operator<(const X86InstrFMA3Group &RHS) const {
+ return Opcodes[0] < RHS.Opcodes[0];
+ }
+};
+
+/// Returns a pointer to the group of FMA3 opcodes that contains the given
+/// \p Opcode. If the given \p Opcode is not recognized as FMA3 and is not
+/// included in any FMA3 group, then nullptr is returned.
+const X86InstrFMA3Group *getFMA3Group(unsigned Opcode, uint64_t TSFlags);
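+
+// Illustrative use (a sketch only; MI is assumed to be a MachineInstr from
+// X86 code):
+//   if (const X86InstrFMA3Group *Group =
+//           getFMA3Group(MI.getOpcode(), MI.getDesc().TSFlags)) {
+//     unsigned Opc231 = Group->get231Opcode(); // 231 form of the same FMA
+//   }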
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFPStack.td
new file mode 100644
index 000000000000..961b4e590365
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -0,0 +1,815 @@
+//===- X86InstrFPStack.td - FPU Instruction Set ------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 x87 FPU instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FPStack specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86Fld : SDTypeProfile<1, 1, [SDTCisFP<0>,
+ SDTCisPtrTy<1>]>;
+def SDTX86Fst : SDTypeProfile<0, 2, [SDTCisFP<0>,
+ SDTCisPtrTy<1>]>;
+def SDTX86Fild : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
+def SDTX86Fist : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
+
+def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86fst : SDNode<"X86ISD::FST", SDTX86Fst,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86fist : SDNode<"X86ISD::FIST", SDTX86Fist,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86fp_to_mem : SDNode<"X86ISD::FP_TO_INT_IN_MEM", SDTX86Fst,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore,
+ [SDNPHasChain, SDNPMayStore, SDNPSideEffect,
+ SDNPMemOperand]>;
+
+def X86fstf32 : PatFrag<(ops node:$val, node:$ptr),
+ (X86fst node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f32;
+}]>;
+def X86fstf64 : PatFrag<(ops node:$val, node:$ptr),
+ (X86fst node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f64;
+}]>;
+def X86fstf80 : PatFrag<(ops node:$val, node:$ptr),
+ (X86fst node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f80;
+}]>;
+
+def X86fldf32 : PatFrag<(ops node:$ptr), (X86fld node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f32;
+}]>;
+def X86fldf64 : PatFrag<(ops node:$ptr), (X86fld node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f64;
+}]>;
+def X86fldf80 : PatFrag<(ops node:$ptr), (X86fld node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f80;
+}]>;
+
+def X86fild16 : PatFrag<(ops node:$ptr), (X86fild node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def X86fild32 : PatFrag<(ops node:$ptr), (X86fild node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+def X86fild64 : PatFrag<(ops node:$ptr), (X86fild node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+def X86fist32 : PatFrag<(ops node:$val, node:$ptr),
+ (X86fist node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def X86fist64 : PatFrag<(ops node:$val, node:$ptr),
+ (X86fist node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+def X86fp_to_i16mem : PatFrag<(ops node:$val, node:$ptr),
+ (X86fp_to_mem node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def X86fp_to_i32mem : PatFrag<(ops node:$val, node:$ptr),
+ (X86fp_to_mem node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+def X86fp_to_i64mem : PatFrag<(ops node:$val, node:$ptr),
+ (X86fp_to_mem node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+//===----------------------------------------------------------------------===//
+// FPStack pattern fragments
+//===----------------------------------------------------------------------===//
+
+def fpimm0 : FPImmLeaf<fAny, [{
+ return Imm.isExactlyValue(+0.0);
+}]>;
+
+def fpimmneg0 : FPImmLeaf<fAny, [{
+ return Imm.isExactlyValue(-0.0);
+}]>;
+
+def fpimm1 : FPImmLeaf<fAny, [{
+ return Imm.isExactlyValue(+1.0);
+}]>;
+
+def fpimmneg1 : FPImmLeaf<fAny, [{
+ return Imm.isExactlyValue(-1.0);
+}]>;
+
+// Some 'special' instructions - expanded after instruction selection.
+// They clobber EFLAGS due to the OR instruction used internally.
+// FIXME: Can we model this in SelectionDAG?
+let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [EFLAGS] in {
+ def FP32_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP32:$src),
+ [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>;
+ def FP32_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP32:$src),
+ [(X86fp_to_i32mem RFP32:$src, addr:$dst)]>;
+ def FP32_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP32:$src),
+ [(X86fp_to_i64mem RFP32:$src, addr:$dst)]>;
+ def FP64_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP64:$src),
+ [(X86fp_to_i16mem RFP64:$src, addr:$dst)]>;
+ def FP64_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP64:$src),
+ [(X86fp_to_i32mem RFP64:$src, addr:$dst)]>;
+ def FP64_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP64:$src),
+ [(X86fp_to_i64mem RFP64:$src, addr:$dst)]>;
+ def FP80_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP80:$src),
+ [(X86fp_to_i16mem RFP80:$src, addr:$dst)]>;
+ def FP80_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP80:$src),
+ [(X86fp_to_i32mem RFP80:$src, addr:$dst)]>;
+ def FP80_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP80:$src),
+ [(X86fp_to_i64mem RFP80:$src, addr:$dst)]>;
+}
+
+// All FP Stack operations are represented with four instructions here. The
+// first three instructions, generated by the instruction selector, use "RFP32",
+// "RFP64" or "RFP80" registers: traditional register files to reference 32-bit,
+// 64-bit or 80-bit floating point values. These sizes apply to the values,
+// not the registers, which are always 80 bits; RFP32, RFP64 and RFP80 can be
+// copied to each other without losing information. These instructions are all
+// pseudo instructions and use the "_Fp" suffix.
+// In some cases there are additional variants with a mixture of different
+// register sizes.
+// The fourth instruction is defined with FPI, which is the actual instruction
+// emitted by the assembler. These use "RST" registers, although frequently
+// the actual register(s) used are implicit. These are always 80 bits.
+// The FP stackifier pass converts one to the other after register allocation
+// occurs.
+//
+// Note that the FpI instruction should have instruction selection info (e.g.
+// a pattern) and the FPI instruction should have emission info (e.g. opcode
+// encoding and asm printing info).
+
+// FpIf32, FpIf64 - Floating Point Pseudo Instruction template.
+// f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1.
+// f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2.
+// f80 instructions cannot use SSE and use neither of these.
+class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32]>;
+class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64]>;
+
+// Factoring for arithmetic.
+multiclass FPBinary_rr<SDNode OpNode> {
+// Register op register -> register
+// These are separated out because they have no reversed form.
+def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), TwoArgFP,
+ [(set RFP32:$dst, (OpNode RFP32:$src1, RFP32:$src2))]>;
+def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), TwoArgFP,
+ [(set RFP64:$dst, (OpNode RFP64:$src1, RFP64:$src2))]>;
+def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP,
+ [(set RFP80:$dst, (OpNode RFP80:$src1, RFP80:$src2))]>;
+}
+// The FopST0 series are not included here because of the irregularities
+// in where the 'r' goes in assembly output.
+// These instructions cannot address 80-bit memory.
+multiclass FPBinary<SDNode OpNode, Format fp, string asmstring,
+ bit Forward = 1> {
+// ST(0) = ST(0) + [mem]
+def _Fp32m : FpIf32<(outs RFP32:$dst),
+ (ins RFP32:$src1, f32mem:$src2), OneArgFPRW,
+ [!if(Forward,
+ (set RFP32:$dst,
+ (OpNode RFP32:$src1, (loadf32 addr:$src2))),
+ (set RFP32:$dst,
+ (OpNode (loadf32 addr:$src2), RFP32:$src1)))]>;
+def _Fp64m : FpIf64<(outs RFP64:$dst),
+ (ins RFP64:$src1, f64mem:$src2), OneArgFPRW,
+ [!if(Forward,
+ (set RFP64:$dst,
+ (OpNode RFP64:$src1, (loadf64 addr:$src2))),
+ (set RFP64:$dst,
+ (OpNode (loadf64 addr:$src2), RFP64:$src1)))]>;
+def _Fp64m32: FpIf64<(outs RFP64:$dst),
+ (ins RFP64:$src1, f32mem:$src2), OneArgFPRW,
+ [!if(Forward,
+ (set RFP64:$dst,
+ (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2)))),
+ (set RFP64:$dst,
+ (OpNode (f64 (extloadf32 addr:$src2)), RFP64:$src1)))]>;
+def _Fp80m32: FpI_<(outs RFP80:$dst),
+ (ins RFP80:$src1, f32mem:$src2), OneArgFPRW,
+ [!if(Forward,
+ (set RFP80:$dst,
+ (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2)))),
+ (set RFP80:$dst,
+ (OpNode (f80 (extloadf32 addr:$src2)), RFP80:$src1)))]>;
+def _Fp80m64: FpI_<(outs RFP80:$dst),
+ (ins RFP80:$src1, f64mem:$src2), OneArgFPRW,
+ [!if(Forward,
+ (set RFP80:$dst,
+ (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2)))),
+ (set RFP80:$dst,
+ (OpNode (f80 (extloadf64 addr:$src2)), RFP80:$src1)))]>;
+let mayLoad = 1 in
+def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src),
+ !strconcat("f", asmstring, "{s}\t$src")>;
+let mayLoad = 1 in
+def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src),
+ !strconcat("f", asmstring, "{l}\t$src")>;
+// ST(0) = ST(0) + [memint]
+def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2),
+ OneArgFPRW,
+ [!if(Forward,
+ (set RFP32:$dst,
+ (OpNode RFP32:$src1, (X86fild16 addr:$src2))),
+ (set RFP32:$dst,
+ (OpNode (X86fild16 addr:$src2), RFP32:$src1)))]>;
+def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2),
+ OneArgFPRW,
+ [!if(Forward,
+ (set RFP32:$dst,
+ (OpNode RFP32:$src1, (X86fild32 addr:$src2))),
+ (set RFP32:$dst,
+ (OpNode (X86fild32 addr:$src2), RFP32:$src1)))]>;
+def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2),
+ OneArgFPRW,
+ [!if(Forward,
+ (set RFP64:$dst,
+ (OpNode RFP64:$src1, (X86fild16 addr:$src2))),
+ (set RFP64:$dst,
+ (OpNode (X86fild16 addr:$src2), RFP64:$src1)))]>;
+def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2),
+ OneArgFPRW,
+ [!if(Forward,
+ (set RFP64:$dst,
+ (OpNode RFP64:$src1, (X86fild32 addr:$src2))),
+ (set RFP64:$dst,
+ (OpNode (X86fild32 addr:$src2), RFP64:$src1)))]>;
+def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2),
+ OneArgFPRW,
+ [!if(Forward,
+ (set RFP80:$dst,
+ (OpNode RFP80:$src1, (X86fild16 addr:$src2))),
+ (set RFP80:$dst,
+ (OpNode (X86fild16 addr:$src2), RFP80:$src1)))]>;
+def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2),
+ OneArgFPRW,
+ [!if(Forward,
+ (set RFP80:$dst,
+ (OpNode RFP80:$src1, (X86fild32 addr:$src2))),
+ (set RFP80:$dst,
+ (OpNode (X86fild32 addr:$src2), RFP80:$src1)))]>;
+let mayLoad = 1 in
+def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src),
+ !strconcat("fi", asmstring, "{s}\t$src")>;
+let mayLoad = 1 in
+def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src),
+ !strconcat("fi", asmstring, "{l}\t$src")>;
+}
+
+let Uses = [FPCW], mayRaiseFPException = 1 in {
+// FPBinary_rr just defines pseudo-instructions; there is no need to set
+// scheduling resources for them.
+let hasNoSchedulingInfo = 1 in {
+defm ADD : FPBinary_rr<any_fadd>;
+defm SUB : FPBinary_rr<any_fsub>;
+defm MUL : FPBinary_rr<any_fmul>;
+defm DIV : FPBinary_rr<any_fdiv>;
+}
+
+// Sets the scheduling resources for the actual NAME#_F<size>m definitions.
+let SchedRW = [WriteFAddLd] in {
+defm ADD : FPBinary<any_fadd, MRM0m, "add">;
+defm SUB : FPBinary<any_fsub, MRM4m, "sub">;
+defm SUBR: FPBinary<any_fsub, MRM5m, "subr", 0>;
+}
+
+let SchedRW = [WriteFMulLd] in {
+defm MUL : FPBinary<any_fmul, MRM1m, "mul">;
+}
+
+let SchedRW = [WriteFDivLd] in {
+defm DIV : FPBinary<any_fdiv, MRM6m, "div">;
+defm DIVR: FPBinary<any_fdiv, MRM7m, "divr", 0>;
+}
+} // Uses = [FPCW], mayRaiseFPException = 1
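+
+// Note: SUBR and DIVR above pass Forward = 0, so their load-folding patterns
+// place the memory operand first, i.e. they match "mem - ST(0)" and
+// "mem / ST(0)" rather than the forward forms.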
+
+class FPST0rInst<Format fp, string asm>
+ : FPI<0xD8, fp, (outs), (ins RSTi:$op), asm>;
+class FPrST0Inst<Format fp, string asm>
+ : FPI<0xDC, fp, (outs), (ins RSTi:$op), asm>;
+class FPrST0PInst<Format fp, string asm>
+ : FPI<0xDE, fp, (outs), (ins RSTi:$op), asm>;
+
+// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion
+// of some of the 'reverse' forms of the fsub and fdiv instructions. As such,
+// we have to put some 'r's in and take them out of weird places.
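+// For example, "fsub{|r}p\t{%st, $op|$op, st}" below uses {AT&T|Intel}
+// alternatives, so the same encoding prints as "fsubp" under AT&T syntax but
+// as "fsubrp" under Intel syntax; that is where the extra 'r's get put in and
+// taken out.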
+let SchedRW = [WriteFAdd], Uses = [FPCW], mayRaiseFPException = 1 in {
+def ADD_FST0r : FPST0rInst <MRM0r, "fadd\t{$op, %st|st, $op}">;
+def ADD_FrST0 : FPrST0Inst <MRM0r, "fadd\t{%st, $op|$op, st}">;
+def ADD_FPrST0 : FPrST0PInst<MRM0r, "faddp\t{%st, $op|$op, st}">;
+def SUBR_FST0r : FPST0rInst <MRM5r, "fsubr\t{$op, %st|st, $op}">;
+def SUB_FrST0 : FPrST0Inst <MRM5r, "fsub{r}\t{%st, $op|$op, st}">;
+def SUB_FPrST0 : FPrST0PInst<MRM5r, "fsub{r}p\t{%st, $op|$op, st}">;
+def SUB_FST0r : FPST0rInst <MRM4r, "fsub\t{$op, %st|st, $op}">;
+def SUBR_FrST0 : FPrST0Inst <MRM4r, "fsub{|r}\t{%st, $op|$op, st}">;
+def SUBR_FPrST0 : FPrST0PInst<MRM4r, "fsub{|r}p\t{%st, $op|$op, st}">;
+} // SchedRW
+let SchedRW = [WriteFCom], Uses = [FPCW], mayRaiseFPException = 1 in {
+def COM_FST0r : FPST0rInst <MRM2r, "fcom\t$op">;
+def COMP_FST0r : FPST0rInst <MRM3r, "fcomp\t$op">;
+} // SchedRW
+let SchedRW = [WriteFMul], Uses = [FPCW], mayRaiseFPException = 1 in {
+def MUL_FST0r : FPST0rInst <MRM1r, "fmul\t{$op, %st|st, $op}">;
+def MUL_FrST0 : FPrST0Inst <MRM1r, "fmul\t{%st, $op|$op, st}">;
+def MUL_FPrST0 : FPrST0PInst<MRM1r, "fmulp\t{%st, $op|$op, st}">;
+} // SchedRW
+let SchedRW = [WriteFDiv], Uses = [FPCW], mayRaiseFPException = 1 in {
+def DIVR_FST0r : FPST0rInst <MRM7r, "fdivr\t{$op, %st|st, $op}">;
+def DIV_FrST0 : FPrST0Inst <MRM7r, "fdiv{r}\t{%st, $op|$op, st}">;
+def DIV_FPrST0 : FPrST0PInst<MRM7r, "fdiv{r}p\t{%st, $op|$op, st}">;
+def DIV_FST0r : FPST0rInst <MRM6r, "fdiv\t{$op, %st|st, $op}">;
+def DIVR_FrST0 : FPrST0Inst <MRM6r, "fdiv{|r}\t{%st, $op|$op, st}">;
+def DIVR_FPrST0 : FPrST0PInst<MRM6r, "fdiv{|r}p\t{%st, $op|$op, st}">;
+} // SchedRW
+
+// Unary operations.
+multiclass FPUnary<SDNode OpNode, Format fp, string asmstring> {
+def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW,
+ [(set RFP32:$dst, (OpNode RFP32:$src))]>;
+def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW,
+ [(set RFP64:$dst, (OpNode RFP64:$src))]>;
+def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW,
+ [(set RFP80:$dst, (OpNode RFP80:$src))]>;
+def _F : FPI<0xD9, fp, (outs), (ins), asmstring>;
+}
+
+let SchedRW = [WriteFSign] in {
+defm CHS : FPUnary<fneg, MRM_E0, "fchs">;
+defm ABS : FPUnary<fabs, MRM_E1, "fabs">;
+}
+
+let Uses = [FPCW], mayRaiseFPException = 1 in {
+let SchedRW = [WriteFSqrt80] in
+defm SQRT: FPUnary<any_fsqrt, MRM_FA, "fsqrt">;
+
+let SchedRW = [WriteFCom] in {
+let hasSideEffects = 0 in {
+def TST_Fp32 : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>;
+def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>;
+def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>;
+} // hasSideEffects
+
+def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst">;
+} // SchedRW
+} // Uses = [FPCW], mayRaiseFPException = 1
+
+// Versions of FP instructions that take a single memory operand. These were
+// added for the disassembler; remove them once they are covered by patterns
+// elsewhere.
+let SchedRW = [WriteFComLd], Uses = [FPCW], mayRaiseFPException = 1,
+ mayLoad = 1 in {
+def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">;
+def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">;
+
+def FCOM64m : FPI<0xDC, MRM2m, (outs), (ins f64mem:$src), "fcom{l}\t$src">;
+def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">;
+
+def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">;
+def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">;
+
+def FICOM32m : FPI<0xDA, MRM2m, (outs), (ins i32mem:$src), "ficom{l}\t$src">;
+def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">;
+} // SchedRW
+
+let SchedRW = [WriteMicrocoded] in {
+let Defs = [FPSW, FPCW], mayLoad = 1 in {
+def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins anymem:$src), "fldenv\t$src">;
+def FRSTORm : FPI<0xDD, MRM4m, (outs), (ins anymem:$src), "frstor\t$src">;
+}
+
+let Defs = [FPSW, FPCW], Uses = [FPSW, FPCW], mayStore = 1 in {
+def FSTENVm : FPI<0xD9, MRM6m, (outs), (ins anymem:$dst), "fnstenv\t$dst">;
+def FSAVEm : FPI<0xDD, MRM6m, (outs), (ins anymem:$dst), "fnsave\t$dst">;
+}
+
+let Uses = [FPSW], mayStore = 1 in
+def FNSTSWm : FPI<0xDD, MRM7m, (outs), (ins i16mem:$dst), "fnstsw\t$dst">;
+
+let mayLoad = 1 in
+def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f80mem:$src), "fbld\t$src">;
+let Uses = [FPCW], mayRaiseFPException = 1, mayStore = 1 in
+def FBSTPm : FPI<0xDF, MRM6m, (outs), (ins f80mem:$dst), "fbstp\t$dst">;
+} // SchedRW
+
+// Floating point cmovs.
+class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32, HasCMov]>;
+class FpIf64CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64, HasCMov]>;
+
+multiclass FPCMov<PatLeaf cc> {
+ def _Fp32 : FpIf32CMov<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2),
+ CondMovFP,
+ [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2,
+ cc, EFLAGS))]>;
+ def _Fp64 : FpIf64CMov<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2),
+ CondMovFP,
+ [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2,
+ cc, EFLAGS))]>;
+ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2),
+ CondMovFP,
+ [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2,
+ cc, EFLAGS))]>,
+ Requires<[HasCMov]>;
+}
+
+let SchedRW = [WriteFCMOV] in {
+let Uses = [EFLAGS], Constraints = "$src1 = $dst" in {
+defm CMOVB : FPCMov<X86_COND_B>;
+defm CMOVBE : FPCMov<X86_COND_BE>;
+defm CMOVE : FPCMov<X86_COND_E>;
+defm CMOVP : FPCMov<X86_COND_P>;
+defm CMOVNB : FPCMov<X86_COND_AE>;
+defm CMOVNBE: FPCMov<X86_COND_A>;
+defm CMOVNE : FPCMov<X86_COND_NE>;
+defm CMOVNP : FPCMov<X86_COND_NP>;
+} // Uses = [EFLAGS], Constraints = "$src1 = $dst"
+
+let Predicates = [HasCMov] in {
+// These are not factored because there's no clean way to pass DA/DB.
+def CMOVB_F : FPI<0xDA, MRM0r, (outs), (ins RSTi:$op),
+ "fcmovb\t{$op, %st|st, $op}">;
+def CMOVBE_F : FPI<0xDA, MRM2r, (outs), (ins RSTi:$op),
+ "fcmovbe\t{$op, %st|st, $op}">;
+def CMOVE_F : FPI<0xDA, MRM1r, (outs), (ins RSTi:$op),
+ "fcmove\t{$op, %st|st, $op}">;
+def CMOVP_F : FPI<0xDA, MRM3r, (outs), (ins RSTi:$op),
+ "fcmovu\t{$op, %st|st, $op}">;
+def CMOVNB_F : FPI<0xDB, MRM0r, (outs), (ins RSTi:$op),
+ "fcmovnb\t{$op, %st|st, $op}">;
+def CMOVNBE_F: FPI<0xDB, MRM2r, (outs), (ins RSTi:$op),
+ "fcmovnbe\t{$op, %st|st, $op}">;
+def CMOVNE_F : FPI<0xDB, MRM1r, (outs), (ins RSTi:$op),
+ "fcmovne\t{$op, %st|st, $op}">;
+def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RSTi:$op),
+ "fcmovnu\t{$op, %st|st, $op}">;
+} // Predicates = [HasCMov]
+} // SchedRW
+
+let mayRaiseFPException = 1 in {
+// Floating point loads & stores.
+let SchedRW = [WriteLoad], Uses = [FPCW] in {
+let canFoldAsLoad = 1 in {
+def LD_Fp32m : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (loadf32 addr:$src))]>;
+def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (loadf64 addr:$src))]>;
+def LD_Fp80m : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (loadf80 addr:$src))]>;
+} // canFoldAsLoad
+def LD_Fp32m64 : FpIf64<(outs RFP64:$dst), (ins f32mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (f64 (extloadf32 addr:$src)))]>;
+def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (f80 (extloadf64 addr:$src)))]>;
+def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>;
+let mayRaiseFPException = 0 in {
+def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild16 addr:$src))]>;
+def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild32 addr:$src))]>;
+def ILD_Fp64m32: FpIf32<(outs RFP32:$dst), (ins i64mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild64 addr:$src))]>;
+def ILD_Fp16m64: FpIf64<(outs RFP64:$dst), (ins i16mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild16 addr:$src))]>;
+def ILD_Fp32m64: FpIf64<(outs RFP64:$dst), (ins i32mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild32 addr:$src))]>;
+def ILD_Fp64m64: FpIf64<(outs RFP64:$dst), (ins i64mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild64 addr:$src))]>;
+def ILD_Fp16m80: FpI_<(outs RFP80:$dst), (ins i16mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (X86fild16 addr:$src))]>;
+def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (X86fild32 addr:$src))]>;
+def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (X86fild64 addr:$src))]>;
+} // mayRaiseFPException = 0
+} // SchedRW
+
+let SchedRW = [WriteStore], Uses = [FPCW] in {
+def ST_Fp32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP,
+ [(store RFP32:$src, addr:$op)]>;
+def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP,
+ [(truncstoref32 RFP64:$src, addr:$op)]>;
+def ST_Fp64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP,
+ [(store RFP64:$src, addr:$op)]>;
+def ST_Fp80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP,
+ [(truncstoref32 RFP80:$src, addr:$op)]>;
+def ST_Fp80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP,
+ [(truncstoref64 RFP80:$src, addr:$op)]>;
+// FST does not support 80-bit memory target; FSTP must be used.
+
+let mayStore = 1, hasSideEffects = 0 in {
+def ST_FpP32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, []>;
+def ST_FpP64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, []>;
+def ST_FpP80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>;
+} // mayStore
+
+def ST_FpP80m : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP,
+ [(store RFP80:$src, addr:$op)]>;
+
+let mayStore = 1, hasSideEffects = 0 in {
+def IST_Fp16m32 : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP,
+ [(X86fist32 RFP32:$src, addr:$op)]>;
+def IST_Fp64m32 : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP,
+ [(X86fist64 RFP32:$src, addr:$op)]>;
+def IST_Fp16m64 : FpIf64<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp32m64 : FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP,
+ [(X86fist32 RFP64:$src, addr:$op)]>;
+def IST_Fp64m64 : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP,
+ [(X86fist64 RFP64:$src, addr:$op)]>;
+def IST_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>;
+def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP,
+ [(X86fist32 RFP80:$src, addr:$op)]>;
+def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP,
+ [(X86fist64 RFP80:$src, addr:$op)]>;
+} // mayStore
+} // SchedRW, Uses = [FPCW]
+
+let mayLoad = 1, SchedRW = [WriteLoad], Uses = [FPCW] in {
+def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src">;
+def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src">;
+def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src">;
+let mayRaiseFPException = 0 in {
+def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src">;
+def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src">;
+def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src">;
+}
+}
+let mayStore = 1, SchedRW = [WriteStore], Uses = [FPCW] in {
+def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst">;
+def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst">;
+def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst">;
+def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst">;
+def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst">;
+def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst">;
+def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst">;
+def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst">;
+def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst">;
+def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst">;
+}
+
+// FISTTP requires SSE3 even though it's an FPStack op.
+let Predicates = [HasSSE3], SchedRW = [WriteStore], Uses = [FPCW] in {
+def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i16mem RFP32:$src, addr:$op)]>;
+def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i32mem RFP32:$src, addr:$op)]>;
+def ISTT_Fp64m32 : FpI_<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i64mem RFP32:$src, addr:$op)]>;
+def ISTT_Fp16m64 : FpI_<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i16mem RFP64:$src, addr:$op)]>;
+def ISTT_Fp32m64 : FpI_<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i32mem RFP64:$src, addr:$op)]>;
+def ISTT_Fp64m64 : FpI_<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i64mem RFP64:$src, addr:$op)]>;
+def ISTT_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP,
+ [(X86fp_to_i16mem RFP80:$src, addr:$op)]>;
+def ISTT_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP,
+ [(X86fp_to_i32mem RFP80:$src, addr:$op)]>;
+def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP,
+ [(X86fp_to_i64mem RFP80:$src, addr:$op)]>;
+} // Predicates = [HasSSE3]
+
+let mayStore = 1, SchedRW = [WriteStore], Uses = [FPCW] in {
+def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst">;
+def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst">;
+def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), "fisttp{ll}\t$dst">;
+}
+
+// FP Stack manipulation instructions.
+let SchedRW = [WriteMove], Uses = [FPCW] in {
+def LD_Frr : FPI<0xD9, MRM0r, (outs), (ins RSTi:$op), "fld\t$op">;
+def ST_Frr : FPI<0xDD, MRM2r, (outs), (ins RSTi:$op), "fst\t$op">;
+def ST_FPrr : FPI<0xDD, MRM3r, (outs), (ins RSTi:$op), "fstp\t$op">;
+let mayRaiseFPException = 0 in
+def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RSTi:$op), "fxch\t$op">;
+}
+
+// Floating point constant loads.
+let SchedRW = [WriteZero], Uses = [FPCW] in {
+def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
+ [(set RFP32:$dst, fpimm0)]>;
+def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
+ [(set RFP32:$dst, fpimm1)]>;
+def LD_Fp064 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
+ [(set RFP64:$dst, fpimm0)]>;
+def LD_Fp164 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
+ [(set RFP64:$dst, fpimm1)]>;
+def LD_Fp080 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
+ [(set RFP80:$dst, fpimm0)]>;
+def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
+ [(set RFP80:$dst, fpimm1)]>;
+}
+
+let SchedRW = [WriteFLD0], Uses = [FPCW], mayRaiseFPException = 0 in
+def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz">;
+
+let SchedRW = [WriteFLD1], Uses = [FPCW], mayRaiseFPException = 0 in
+def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1">;
+
+let SchedRW = [WriteFLDC], Defs = [FPSW], Uses = [FPCW], mayRaiseFPException = 0 in {
+def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", []>;
+def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", []>;
+def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", []>;
+def FLDLG2 : I<0xD9, MRM_EC, (outs), (ins), "fldlg2", []>;
+def FLDLN2 : I<0xD9, MRM_ED, (outs), (ins), "fldln2", []>;
+} // SchedRW
+
+// Floating point compares.
+let SchedRW = [WriteFCom], Uses = [FPCW], hasSideEffects = 0 in {
+def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, []>;
+def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, []>;
+def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, []>;
+def COM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, []>;
+def COM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, []>;
+def COM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, []>;
+} // SchedRW
+} // mayRaiseFPException = 1
+
+let SchedRW = [WriteFCom], mayRaiseFPException = 1 in {
+// CC = ST(0) cmp ST(i)
+let Defs = [EFLAGS, FPSW], Uses = [FPCW] in {
+def UCOM_FpIr32: FpI_<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+ [(set EFLAGS, (X86any_fcmp RFP32:$lhs, RFP32:$rhs))]>,
+ Requires<[FPStackf32, HasCMov]>;
+def UCOM_FpIr64: FpI_<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+ [(set EFLAGS, (X86any_fcmp RFP64:$lhs, RFP64:$rhs))]>,
+ Requires<[FPStackf64, HasCMov]>;
+def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+ [(set EFLAGS, (X86any_fcmp RFP80:$lhs, RFP80:$rhs))]>,
+ Requires<[HasCMov]>;
+def COM_FpIr32: FpI_<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+ [(set EFLAGS, (X86strict_fcmps RFP32:$lhs, RFP32:$rhs))]>,
+ Requires<[FPStackf32, HasCMov]>;
+def COM_FpIr64: FpI_<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+ [(set EFLAGS, (X86strict_fcmps RFP64:$lhs, RFP64:$rhs))]>,
+ Requires<[FPStackf64, HasCMov]>;
+def COM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+ [(set EFLAGS, (X86strict_fcmps RFP80:$lhs, RFP80:$rhs))]>,
+ Requires<[HasCMov]>;
+}
+
+let Uses = [ST0, FPCW] in {
+def UCOM_Fr : FPI<0xDD, MRM4r, // FPSW = cmp ST(0) with ST(i)
+ (outs), (ins RSTi:$reg), "fucom\t$reg">;
+def UCOM_FPr : FPI<0xDD, MRM5r, // FPSW = cmp ST(0) with ST(i), pop
+ (outs), (ins RSTi:$reg), "fucomp\t$reg">;
+def UCOM_FPPr : FPI<0xDA, MRM_E9, // cmp ST(0) with ST(1), pop, pop
+ (outs), (ins), "fucompp">;
+}
+
+let Defs = [EFLAGS, FPSW], Uses = [ST0, FPCW] in {
+def UCOM_FIr : FPI<0xDB, MRM5r, // CC = cmp ST(0) with ST(i)
+ (outs), (ins RSTi:$reg), "fucomi\t{$reg, %st|st, $reg}">;
+def UCOM_FIPr : FPI<0xDF, MRM5r, // CC = cmp ST(0) with ST(i), pop
+ (outs), (ins RSTi:$reg), "fucompi\t{$reg, %st|st, $reg}">;
+
+def COM_FIr : FPI<0xDB, MRM6r, (outs), (ins RSTi:$reg),
+ "fcomi\t{$reg, %st|st, $reg}">;
+def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RSTi:$reg),
+ "fcompi\t{$reg, %st|st, $reg}">;
+}
+} // SchedRW
+
+// Floating point flag ops.
+let SchedRW = [WriteALU] in {
+let Defs = [AX, FPSW], Uses = [FPSW], hasSideEffects = 0 in
+def FNSTSW16r : I<0xDF, MRM_E0, // AX = fp flags
+ (outs), (ins), "fnstsw\t{%ax|ax}", []>;
+let Defs = [FPSW], Uses = [FPCW] in
+def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control word
+ (outs), (ins i16mem:$dst), "fnstcw\t$dst",
+ [(X86fp_cwd_get16 addr:$dst)]>;
+} // SchedRW
+let Defs = [FPSW,FPCW], mayLoad = 1 in
+def FLDCW16m : I<0xD9, MRM5m, // X87 control word = [mem16]
+ (outs), (ins i16mem:$dst), "fldcw\t$dst", []>,
+ Sched<[WriteLoad]>;
+
+// FPU control instructions
+let SchedRW = [WriteMicrocoded] in {
+def FFREE : FPI<0xDD, MRM0r, (outs), (ins RSTi:$reg), "ffree\t$reg">;
+def FFREEP : FPI<0xDF, MRM0r, (outs), (ins RSTi:$reg), "ffreep\t$reg">;
+
+let Defs = [FPSW, FPCW] in
+def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", []>;
+// Clear exceptions
+let Defs = [FPSW] in
+def FNCLEX : I<0xDB, MRM_E2, (outs), (ins), "fnclex", []>;
+} // SchedRW
+
+// Operand-less floating-point instructions for the disassembler.
+let Defs = [FPSW] in
+def FNOP : I<0xD9, MRM_D0, (outs), (ins), "fnop", []>, Sched<[WriteNop]>;
+
+let SchedRW = [WriteMicrocoded] in {
+let Defs = [FPSW] in {
+def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", []>;
+def FXAM : I<0xD9, MRM_E5, (outs), (ins), "fxam", []>;
+def FDECSTP : I<0xD9, MRM_F6, (outs), (ins), "fdecstp", []>;
+def FINCSTP : I<0xD9, MRM_F7, (outs), (ins), "fincstp", []>;
+let Uses = [FPCW], mayRaiseFPException = 1 in {
+def F2XM1 : I<0xD9, MRM_F0, (outs), (ins), "f2xm1", []>;
+def FYL2X : I<0xD9, MRM_F1, (outs), (ins), "fyl2x", []>;
+def FPTAN : I<0xD9, MRM_F2, (outs), (ins), "fptan", []>;
+def FPATAN : I<0xD9, MRM_F3, (outs), (ins), "fpatan", []>;
+def FXTRACT : I<0xD9, MRM_F4, (outs), (ins), "fxtract", []>;
+def FPREM1 : I<0xD9, MRM_F5, (outs), (ins), "fprem1", []>;
+def FPREM : I<0xD9, MRM_F8, (outs), (ins), "fprem", []>;
+def FYL2XP1 : I<0xD9, MRM_F9, (outs), (ins), "fyl2xp1", []>;
+def FSIN : I<0xD9, MRM_FE, (outs), (ins), "fsin", []>;
+def FCOS : I<0xD9, MRM_FF, (outs), (ins), "fcos", []>;
+def FSINCOS : I<0xD9, MRM_FB, (outs), (ins), "fsincos", []>;
+def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", []>;
+def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", []>;
+def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", []>;
+} // Uses = [FPCW], mayRaiseFPException = 1
+} // Defs = [FPSW]
+
+let Uses = [FPSW, FPCW] in {
+def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaquemem:$dst),
+ "fxsave\t$dst", [(int_x86_fxsave addr:$dst)]>, PS,
+ Requires<[HasFXSR]>;
+def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaquemem:$dst),
+ "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)]>,
+ PS, Requires<[HasFXSR, In64BitMode]>;
+} // Uses = [FPSW, FPCW]
+
+let Defs = [FPSW, FPCW] in {
+def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaquemem:$src),
+ "fxrstor\t$src", [(int_x86_fxrstor addr:$src)]>,
+ PS, Requires<[HasFXSR]>;
+def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaquemem:$src),
+ "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)]>,
+ PS, Requires<[HasFXSR, In64BitMode]>;
+} // Defs = [FPSW, FPCW]
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Required for RET of f32 / f64 / f80 values.
+def : Pat<(X86fldf32 addr:$src), (LD_Fp32m addr:$src)>;
+def : Pat<(X86fldf32 addr:$src), (LD_Fp32m64 addr:$src)>;
+def : Pat<(X86fldf64 addr:$src), (LD_Fp64m addr:$src)>;
+def : Pat<(X86fldf32 addr:$src), (LD_Fp32m80 addr:$src)>;
+def : Pat<(X86fldf64 addr:$src), (LD_Fp64m80 addr:$src)>;
+def : Pat<(X86fldf80 addr:$src), (LD_Fp80m addr:$src)>;
+
+// Required for CALLs which return f32 / f64 / f80 values.
+def : Pat<(X86fstf32 RFP32:$src, addr:$op), (ST_Fp32m addr:$op, RFP32:$src)>;
+def : Pat<(X86fstf32 RFP64:$src, addr:$op), (ST_Fp64m32 addr:$op, RFP64:$src)>;
+def : Pat<(X86fstf64 RFP64:$src, addr:$op), (ST_Fp64m addr:$op, RFP64:$src)>;
+def : Pat<(X86fstf32 RFP80:$src, addr:$op), (ST_Fp80m32 addr:$op, RFP80:$src)>;
+def : Pat<(X86fstf64 RFP80:$src, addr:$op), (ST_Fp80m64 addr:$op, RFP80:$src)>;
+def : Pat<(X86fstf80 RFP80:$src, addr:$op), (ST_FpP80m addr:$op, RFP80:$src)>;
+
+// Floating point constants -0.0 and -1.0
+def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStackf32]>;
+def : Pat<(f32 fpimmneg1), (CHS_Fp32 (LD_Fp132))>, Requires<[FPStackf32]>;
+def : Pat<(f64 fpimmneg0), (CHS_Fp64 (LD_Fp064))>, Requires<[FPStackf64]>;
+def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStackf64]>;
+def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>;
+def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>;
+
+// FP extensions map onto simple pseudo-value conversions if they are to/from
+// the FP stack.
+def : Pat<(f64 (any_fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f80 (any_fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f80 (any_fpextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>,
+ Requires<[FPStackf64]>;
+
+// FP truncations map onto simple pseudo-value conversions if they are to/from
+// the FP stack. We have validated that only value-preserving truncations make
+// it through isel.
+def : Pat<(f32 (any_fpround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f32 (any_fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f64 (any_fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>,
+ Requires<[FPStackf64]>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.cpp
new file mode 100644
index 000000000000..17fe7f0bd310
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -0,0 +1,5697 @@
+//===-- X86InstrFoldTables.cpp - X86 Instruction Folding Tables -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 memory folding tables.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrFoldTables.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include <vector>
+
+using namespace llvm;
+
+// These tables are sorted by their RegOp value, allowing them to be binary
+// searched at runtime without the need for additional storage. The enum values
+// are currently emitted in X86GenInstrInfo.inc in alphabetical order, which
+// makes keeping these tables sorted a simple matter of alphabetizing them.
+//
+// We also have a tablegen emitter that tries to autogenerate these tables
+// by comparing encoding information. This can be enabled by passing
+// X86_GEN_FOLD_TABLES=ON to CMake, which will produce X86GenFoldTables.inc
+// in the build area. There are currently some bugs in the autogenerated table
+// that require manual review before entries can be copied from it into this
+// table. It is unclear if we will ever be able to fully automate this, because
+// as new instructions are added into holes in the X86 opcode map they can
+// pair up with old instructions and create new table entries that would be
+// incorrect. The manual review process gives us a chance to catch these before
+// they become observable bugs.
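+// An illustrative, self-contained sketch of the lookup pattern the comment
+// above describes: the tables are kept sorted by their register-form opcode,
+// so a fold query is a single binary search. The names below (FoldEntry,
+// lookupFoldSketch) are hypothetical stand-ins for exposition only, not the
+// actual lookup helpers used by X86InstrInfo, and the block is wrapped in
+// #if 0 so it stays out of the build.
+#if 0
+#include <algorithm>
+#include <cstdint>
+
+struct FoldEntry {
+  uint16_t RegOp;  // register-form opcode (the sort key)
+  uint16_t MemOp;  // memory-form opcode to fold to
+  uint16_t Flags;  // TB_* style flags
+};
+
+// Table must be sorted by RegOp, mirroring the tables in this file.
+static const FoldEntry *lookupFoldSketch(const FoldEntry *Begin,
+                                         const FoldEntry *End,
+                                         uint16_t RegOp) {
+  const FoldEntry *I = std::lower_bound(
+      Begin, End, RegOp,
+      [](const FoldEntry &E, uint16_t Op) { return E.RegOp < Op; });
+  return (I != End && I->RegOp == RegOp) ? I : nullptr;
+}
+#endif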
+static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
+ { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE },
+ { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE },
+ { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE },
+ { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE },
+ { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE },
+ { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE },
+ { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE },
+ { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE },
+ { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE },
+ { X86::ADD8ri_DB, X86::ADD8mi, TB_NO_REVERSE },
+ { X86::ADD8rr_DB, X86::ADD8mr, TB_NO_REVERSE },
+ { X86::ADC16ri, X86::ADC16mi, 0 },
+ { X86::ADC16ri8, X86::ADC16mi8, 0 },
+ { X86::ADC16rr, X86::ADC16mr, 0 },
+ { X86::ADC32ri, X86::ADC32mi, 0 },
+ { X86::ADC32ri8, X86::ADC32mi8, 0 },
+ { X86::ADC32rr, X86::ADC32mr, 0 },
+ { X86::ADC64ri32, X86::ADC64mi32, 0 },
+ { X86::ADC64ri8, X86::ADC64mi8, 0 },
+ { X86::ADC64rr, X86::ADC64mr, 0 },
+ { X86::ADC8ri, X86::ADC8mi, 0 },
+ { X86::ADC8ri8, X86::ADC8mi8, 0 },
+ { X86::ADC8rr, X86::ADC8mr, 0 },
+ { X86::ADD16ri, X86::ADD16mi, 0 },
+ { X86::ADD16ri8, X86::ADD16mi8, 0 },
+ { X86::ADD16rr, X86::ADD16mr, 0 },
+ { X86::ADD32ri, X86::ADD32mi, 0 },
+ { X86::ADD32ri8, X86::ADD32mi8, 0 },
+ { X86::ADD32rr, X86::ADD32mr, 0 },
+ { X86::ADD64ri32, X86::ADD64mi32, 0 },
+ { X86::ADD64ri8, X86::ADD64mi8, 0 },
+ { X86::ADD64rr, X86::ADD64mr, 0 },
+ { X86::ADD8ri, X86::ADD8mi, 0 },
+ { X86::ADD8ri8, X86::ADD8mi8, 0 },
+ { X86::ADD8rr, X86::ADD8mr, 0 },
+ { X86::AND16ri, X86::AND16mi, 0 },
+ { X86::AND16ri8, X86::AND16mi8, 0 },
+ { X86::AND16rr, X86::AND16mr, 0 },
+ { X86::AND32ri, X86::AND32mi, 0 },
+ { X86::AND32ri8, X86::AND32mi8, 0 },
+ { X86::AND32rr, X86::AND32mr, 0 },
+ { X86::AND64ri32, X86::AND64mi32, 0 },
+ { X86::AND64ri8, X86::AND64mi8, 0 },
+ { X86::AND64rr, X86::AND64mr, 0 },
+ { X86::AND8ri, X86::AND8mi, 0 },
+ { X86::AND8ri8, X86::AND8mi8, 0 },
+ { X86::AND8rr, X86::AND8mr, 0 },
+ { X86::BTC16ri8, X86::BTC16mi8, 0 },
+ { X86::BTC32ri8, X86::BTC32mi8, 0 },
+ { X86::BTC64ri8, X86::BTC64mi8, 0 },
+ { X86::BTR16ri8, X86::BTR16mi8, 0 },
+ { X86::BTR32ri8, X86::BTR32mi8, 0 },
+ { X86::BTR64ri8, X86::BTR64mi8, 0 },
+ { X86::BTS16ri8, X86::BTS16mi8, 0 },
+ { X86::BTS32ri8, X86::BTS32mi8, 0 },
+ { X86::BTS64ri8, X86::BTS64mi8, 0 },
+ { X86::DEC16r, X86::DEC16m, 0 },
+ { X86::DEC32r, X86::DEC32m, 0 },
+ { X86::DEC64r, X86::DEC64m, 0 },
+ { X86::DEC8r, X86::DEC8m, 0 },
+ { X86::INC16r, X86::INC16m, 0 },
+ { X86::INC32r, X86::INC32m, 0 },
+ { X86::INC64r, X86::INC64m, 0 },
+ { X86::INC8r, X86::INC8m, 0 },
+ { X86::NEG16r, X86::NEG16m, 0 },
+ { X86::NEG32r, X86::NEG32m, 0 },
+ { X86::NEG64r, X86::NEG64m, 0 },
+ { X86::NEG8r, X86::NEG8m, 0 },
+ { X86::NOT16r, X86::NOT16m, 0 },
+ { X86::NOT32r, X86::NOT32m, 0 },
+ { X86::NOT64r, X86::NOT64m, 0 },
+ { X86::NOT8r, X86::NOT8m, 0 },
+ { X86::OR16ri, X86::OR16mi, 0 },
+ { X86::OR16ri8, X86::OR16mi8, 0 },
+ { X86::OR16rr, X86::OR16mr, 0 },
+ { X86::OR32ri, X86::OR32mi, 0 },
+ { X86::OR32ri8, X86::OR32mi8, 0 },
+ { X86::OR32rr, X86::OR32mr, 0 },
+ { X86::OR64ri32, X86::OR64mi32, 0 },
+ { X86::OR64ri8, X86::OR64mi8, 0 },
+ { X86::OR64rr, X86::OR64mr, 0 },
+ { X86::OR8ri, X86::OR8mi, 0 },
+ { X86::OR8ri8, X86::OR8mi8, 0 },
+ { X86::OR8rr, X86::OR8mr, 0 },
+ { X86::RCL16r1, X86::RCL16m1, 0 },
+ { X86::RCL16rCL, X86::RCL16mCL, 0 },
+ { X86::RCL16ri, X86::RCL16mi, 0 },
+ { X86::RCL32r1, X86::RCL32m1, 0 },
+ { X86::RCL32rCL, X86::RCL32mCL, 0 },
+ { X86::RCL32ri, X86::RCL32mi, 0 },
+ { X86::RCL64r1, X86::RCL64m1, 0 },
+ { X86::RCL64rCL, X86::RCL64mCL, 0 },
+ { X86::RCL64ri, X86::RCL64mi, 0 },
+ { X86::RCL8r1, X86::RCL8m1, 0 },
+ { X86::RCL8rCL, X86::RCL8mCL, 0 },
+ { X86::RCL8ri, X86::RCL8mi, 0 },
+ { X86::RCR16r1, X86::RCR16m1, 0 },
+ { X86::RCR16rCL, X86::RCR16mCL, 0 },
+ { X86::RCR16ri, X86::RCR16mi, 0 },
+ { X86::RCR32r1, X86::RCR32m1, 0 },
+ { X86::RCR32rCL, X86::RCR32mCL, 0 },
+ { X86::RCR32ri, X86::RCR32mi, 0 },
+ { X86::RCR64r1, X86::RCR64m1, 0 },
+ { X86::RCR64rCL, X86::RCR64mCL, 0 },
+ { X86::RCR64ri, X86::RCR64mi, 0 },
+ { X86::RCR8r1, X86::RCR8m1, 0 },
+ { X86::RCR8rCL, X86::RCR8mCL, 0 },
+ { X86::RCR8ri, X86::RCR8mi, 0 },
+ { X86::ROL16r1, X86::ROL16m1, 0 },
+ { X86::ROL16rCL, X86::ROL16mCL, 0 },
+ { X86::ROL16ri, X86::ROL16mi, 0 },
+ { X86::ROL32r1, X86::ROL32m1, 0 },
+ { X86::ROL32rCL, X86::ROL32mCL, 0 },
+ { X86::ROL32ri, X86::ROL32mi, 0 },
+ { X86::ROL64r1, X86::ROL64m1, 0 },
+ { X86::ROL64rCL, X86::ROL64mCL, 0 },
+ { X86::ROL64ri, X86::ROL64mi, 0 },
+ { X86::ROL8r1, X86::ROL8m1, 0 },
+ { X86::ROL8rCL, X86::ROL8mCL, 0 },
+ { X86::ROL8ri, X86::ROL8mi, 0 },
+ { X86::ROR16r1, X86::ROR16m1, 0 },
+ { X86::ROR16rCL, X86::ROR16mCL, 0 },
+ { X86::ROR16ri, X86::ROR16mi, 0 },
+ { X86::ROR32r1, X86::ROR32m1, 0 },
+ { X86::ROR32rCL, X86::ROR32mCL, 0 },
+ { X86::ROR32ri, X86::ROR32mi, 0 },
+ { X86::ROR64r1, X86::ROR64m1, 0 },
+ { X86::ROR64rCL, X86::ROR64mCL, 0 },
+ { X86::ROR64ri, X86::ROR64mi, 0 },
+ { X86::ROR8r1, X86::ROR8m1, 0 },
+ { X86::ROR8rCL, X86::ROR8mCL, 0 },
+ { X86::ROR8ri, X86::ROR8mi, 0 },
+ { X86::SAR16r1, X86::SAR16m1, 0 },
+ { X86::SAR16rCL, X86::SAR16mCL, 0 },
+ { X86::SAR16ri, X86::SAR16mi, 0 },
+ { X86::SAR32r1, X86::SAR32m1, 0 },
+ { X86::SAR32rCL, X86::SAR32mCL, 0 },
+ { X86::SAR32ri, X86::SAR32mi, 0 },
+ { X86::SAR64r1, X86::SAR64m1, 0 },
+ { X86::SAR64rCL, X86::SAR64mCL, 0 },
+ { X86::SAR64ri, X86::SAR64mi, 0 },
+ { X86::SAR8r1, X86::SAR8m1, 0 },
+ { X86::SAR8rCL, X86::SAR8mCL, 0 },
+ { X86::SAR8ri, X86::SAR8mi, 0 },
+ { X86::SBB16ri, X86::SBB16mi, 0 },
+ { X86::SBB16ri8, X86::SBB16mi8, 0 },
+ { X86::SBB16rr, X86::SBB16mr, 0 },
+ { X86::SBB32ri, X86::SBB32mi, 0 },
+ { X86::SBB32ri8, X86::SBB32mi8, 0 },
+ { X86::SBB32rr, X86::SBB32mr, 0 },
+ { X86::SBB64ri32, X86::SBB64mi32, 0 },
+ { X86::SBB64ri8, X86::SBB64mi8, 0 },
+ { X86::SBB64rr, X86::SBB64mr, 0 },
+ { X86::SBB8ri, X86::SBB8mi, 0 },
+ { X86::SBB8ri8, X86::SBB8mi8, 0 },
+ { X86::SBB8rr, X86::SBB8mr, 0 },
+ { X86::SHL16r1, X86::SHL16m1, 0 },
+ { X86::SHL16rCL, X86::SHL16mCL, 0 },
+ { X86::SHL16ri, X86::SHL16mi, 0 },
+ { X86::SHL32r1, X86::SHL32m1, 0 },
+ { X86::SHL32rCL, X86::SHL32mCL, 0 },
+ { X86::SHL32ri, X86::SHL32mi, 0 },
+ { X86::SHL64r1, X86::SHL64m1, 0 },
+ { X86::SHL64rCL, X86::SHL64mCL, 0 },
+ { X86::SHL64ri, X86::SHL64mi, 0 },
+ { X86::SHL8r1, X86::SHL8m1, 0 },
+ { X86::SHL8rCL, X86::SHL8mCL, 0 },
+ { X86::SHL8ri, X86::SHL8mi, 0 },
+ { X86::SHLD16rrCL, X86::SHLD16mrCL, 0 },
+ { X86::SHLD16rri8, X86::SHLD16mri8, 0 },
+ { X86::SHLD32rrCL, X86::SHLD32mrCL, 0 },
+ { X86::SHLD32rri8, X86::SHLD32mri8, 0 },
+ { X86::SHLD64rrCL, X86::SHLD64mrCL, 0 },
+ { X86::SHLD64rri8, X86::SHLD64mri8, 0 },
+ { X86::SHR16r1, X86::SHR16m1, 0 },
+ { X86::SHR16rCL, X86::SHR16mCL, 0 },
+ { X86::SHR16ri, X86::SHR16mi, 0 },
+ { X86::SHR32r1, X86::SHR32m1, 0 },
+ { X86::SHR32rCL, X86::SHR32mCL, 0 },
+ { X86::SHR32ri, X86::SHR32mi, 0 },
+ { X86::SHR64r1, X86::SHR64m1, 0 },
+ { X86::SHR64rCL, X86::SHR64mCL, 0 },
+ { X86::SHR64ri, X86::SHR64mi, 0 },
+ { X86::SHR8r1, X86::SHR8m1, 0 },
+ { X86::SHR8rCL, X86::SHR8mCL, 0 },
+ { X86::SHR8ri, X86::SHR8mi, 0 },
+ { X86::SHRD16rrCL, X86::SHRD16mrCL, 0 },
+ { X86::SHRD16rri8, X86::SHRD16mri8, 0 },
+ { X86::SHRD32rrCL, X86::SHRD32mrCL, 0 },
+ { X86::SHRD32rri8, X86::SHRD32mri8, 0 },
+ { X86::SHRD64rrCL, X86::SHRD64mrCL, 0 },
+ { X86::SHRD64rri8, X86::SHRD64mri8, 0 },
+ { X86::SUB16ri, X86::SUB16mi, 0 },
+ { X86::SUB16ri8, X86::SUB16mi8, 0 },
+ { X86::SUB16rr, X86::SUB16mr, 0 },
+ { X86::SUB32ri, X86::SUB32mi, 0 },
+ { X86::SUB32ri8, X86::SUB32mi8, 0 },
+ { X86::SUB32rr, X86::SUB32mr, 0 },
+ { X86::SUB64ri32, X86::SUB64mi32, 0 },
+ { X86::SUB64ri8, X86::SUB64mi8, 0 },
+ { X86::SUB64rr, X86::SUB64mr, 0 },
+ { X86::SUB8ri, X86::SUB8mi, 0 },
+ { X86::SUB8ri8, X86::SUB8mi8, 0 },
+ { X86::SUB8rr, X86::SUB8mr, 0 },
+ { X86::XOR16ri, X86::XOR16mi, 0 },
+ { X86::XOR16ri8, X86::XOR16mi8, 0 },
+ { X86::XOR16rr, X86::XOR16mr, 0 },
+ { X86::XOR32ri, X86::XOR32mi, 0 },
+ { X86::XOR32ri8, X86::XOR32mi8, 0 },
+ { X86::XOR32rr, X86::XOR32mr, 0 },
+ { X86::XOR64ri32, X86::XOR64mi32, 0 },
+ { X86::XOR64ri8, X86::XOR64mi8, 0 },
+ { X86::XOR64rr, X86::XOR64mr, 0 },
+ { X86::XOR8ri, X86::XOR8mi, 0 },
+ { X86::XOR8ri8, X86::XOR8mi8, 0 },
+ { X86::XOR8rr, X86::XOR8mr, 0 },
+};
+
+static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
+ { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD },
+ { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD },
+ { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD },
+ { X86::CALL16r, X86::CALL16m, TB_FOLDED_LOAD },
+ { X86::CALL16r_NT, X86::CALL16m_NT, TB_FOLDED_LOAD },
+ { X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD },
+ { X86::CALL32r_NT, X86::CALL32m_NT, TB_FOLDED_LOAD },
+ { X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD },
+ { X86::CALL64r_NT, X86::CALL64m_NT, TB_FOLDED_LOAD },
+ { X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD },
+ { X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD },
+ { X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD },
+ { X86::CMP32ri, X86::CMP32mi, TB_FOLDED_LOAD },
+ { X86::CMP32ri8, X86::CMP32mi8, TB_FOLDED_LOAD },
+ { X86::CMP32rr, X86::CMP32mr, TB_FOLDED_LOAD },
+ { X86::CMP64ri32, X86::CMP64mi32, TB_FOLDED_LOAD },
+ { X86::CMP64ri8, X86::CMP64mi8, TB_FOLDED_LOAD },
+ { X86::CMP64rr, X86::CMP64mr, TB_FOLDED_LOAD },
+ { X86::CMP8ri, X86::CMP8mi, TB_FOLDED_LOAD },
+ { X86::CMP8ri8, X86::CMP8mi8, TB_FOLDED_LOAD },
+ { X86::CMP8rr, X86::CMP8mr, TB_FOLDED_LOAD },
+ { X86::DIV16r, X86::DIV16m, TB_FOLDED_LOAD },
+ { X86::DIV32r, X86::DIV32m, TB_FOLDED_LOAD },
+ { X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD },
+ { X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD },
+ { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE },
+ { X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD },
+ { X86::IDIV32r, X86::IDIV32m, TB_FOLDED_LOAD },
+ { X86::IDIV64r, X86::IDIV64m, TB_FOLDED_LOAD },
+ { X86::IDIV8r, X86::IDIV8m, TB_FOLDED_LOAD },
+ { X86::IMUL16r, X86::IMUL16m, TB_FOLDED_LOAD },
+ { X86::IMUL32r, X86::IMUL32m, TB_FOLDED_LOAD },
+ { X86::IMUL64r, X86::IMUL64m, TB_FOLDED_LOAD },
+ { X86::IMUL8r, X86::IMUL8m, TB_FOLDED_LOAD },
+ { X86::JMP16r, X86::JMP16m, TB_FOLDED_LOAD },
+ { X86::JMP16r_NT, X86::JMP16m_NT, TB_FOLDED_LOAD },
+ { X86::JMP32r, X86::JMP32m, TB_FOLDED_LOAD },
+ { X86::JMP32r_NT, X86::JMP32m_NT, TB_FOLDED_LOAD },
+ { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD },
+ { X86::JMP64r_NT, X86::JMP64m_NT, TB_FOLDED_LOAD },
+ { X86::MMX_MOVD64from64rr, X86::MMX_MOVD64from64rm, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::MMX_MOVD64grr, X86::MMX_MOVD64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE },
+ { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE },
+ { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE },
+ { X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE },
+ { X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE },
+ { X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE },
+ { X86::MOV64toSDrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE },
+ { X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE },
+ { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE },
+ { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVDI2SSrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE },
+ { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE },
+ { X86::MOVPQIto64rr, X86::MOVPQI2QImr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::MOVSDto64rr, X86::MOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::MOVSS2DIrr, X86::MOVSSmr, TB_FOLDED_STORE },
+ { X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE },
+ { X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE },
+ { X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD },
+ { X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD },
+ { X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD },
+ { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD },
+ { X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE },
+ { X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE },
+ { X86::PTWRITE64r, X86::PTWRITE64m, TB_FOLDED_LOAD },
+ { X86::PTWRITEr, X86::PTWRITEm, TB_FOLDED_LOAD },
+ { X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD },
+ { X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD },
+ { X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD },
+ { X86::SETCCr, X86::SETCCm, TB_FOLDED_STORE },
+ { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD },
+ { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD },
+ { X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD },
+ { X86::TCRETURNri, X86::TCRETURNmi, TB_FOLDED_LOAD | TB_NO_FORWARD },
+ { X86::TCRETURNri64, X86::TCRETURNmi64, TB_FOLDED_LOAD | TB_NO_FORWARD },
+ { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD },
+ { X86::TEST16rr, X86::TEST16mr, TB_FOLDED_LOAD },
+ { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD },
+ { X86::TEST32rr, X86::TEST32mr, TB_FOLDED_LOAD },
+ { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD },
+ { X86::TEST64rr, X86::TEST64mr, TB_FOLDED_LOAD },
+ { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD },
+ { X86::TEST8rr, X86::TEST8mr, TB_FOLDED_LOAD },
+ { X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE },
+ { X86::VCVTPS2PHZ256rr, X86::VCVTPS2PHZ256mr, TB_FOLDED_STORE },
+ { X86::VCVTPS2PHZrr, X86::VCVTPS2PHZmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF32x4Z256rr, X86::VEXTRACTF32x4Z256mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF32x4Zrr, X86::VEXTRACTF32x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF32x8Zrr, X86::VEXTRACTF32x8Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF64x2Z256rr, X86::VEXTRACTF64x2Z256mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF64x2Zrr, X86::VEXTRACTF64x2Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF64x4Zrr, X86::VEXTRACTF64x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI32x4Z256rr, X86::VEXTRACTI32x4Z256mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI32x4Zrr, X86::VEXTRACTI32x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI32x8Zrr, X86::VEXTRACTI32x8Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI64x2Z256rr, X86::VEXTRACTI64x2Z256mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI64x2Zrr, X86::VEXTRACTI64x2Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI64x4Zrr, X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTPSrr, X86::VEXTRACTPSmr, TB_FOLDED_STORE },
+ { X86::VMOV64toSDZrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOV64toSDrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
+ { X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
+ { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDI2SSZrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOVDI2SSrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
+ { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
+ { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE },
+ { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE },
+ { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE },
+ { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256mr, TB_FOLDED_STORE },
+ { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zmr, TB_FOLDED_STORE },
+ { X86::VMOVDQUYrr, X86::VMOVDQUYmr, TB_FOLDED_STORE },
+ { X86::VMOVDQUrr, X86::VMOVDQUmr, TB_FOLDED_STORE },
+ { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
+ { X86::VMOVPDI2DIrr, X86::VMOVPDI2DImr, TB_FOLDED_STORE },
+ { X86::VMOVPQIto64Zrr, X86::VMOVPQI2QIZmr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOVSDto64Zrr, X86::VMOVSDZmr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOVSDto64rr, X86::VMOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOVSS2DIZrr, X86::VMOVSSZmr, TB_FOLDED_STORE },
+ { X86::VMOVSS2DIrr, X86::VMOVSSmr, TB_FOLDED_STORE },
+ { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE },
+ { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128mr, TB_FOLDED_STORE },
+ { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256mr, TB_FOLDED_STORE },
+ { X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE },
+ { X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE },
+ { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE },
+ { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128mr, TB_FOLDED_STORE },
+ { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256mr, TB_FOLDED_STORE },
+ { X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE },
+ { X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE },
+ { X86::VPEXTRDZrr, X86::VPEXTRDZmr, TB_FOLDED_STORE },
+ { X86::VPEXTRDrr, X86::VPEXTRDmr, TB_FOLDED_STORE },
+ { X86::VPEXTRQZrr, X86::VPEXTRQZmr, TB_FOLDED_STORE },
+ { X86::VPEXTRQrr, X86::VPEXTRQmr, TB_FOLDED_STORE },
+ { X86::VPMOVDBZrr, X86::VPMOVDBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVDWZ256rr, X86::VPMOVDWZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVDWZrr, X86::VPMOVDWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVQDZ256rr, X86::VPMOVQDZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVQDZrr, X86::VPMOVQDZmr, TB_FOLDED_STORE },
+ { X86::VPMOVQWZrr, X86::VPMOVQWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSDBZrr, X86::VPMOVSDBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSDWZ256rr, X86::VPMOVSDWZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVSDWZrr, X86::VPMOVSDWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSQDZ256rr, X86::VPMOVSQDZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVSQDZrr, X86::VPMOVSQDZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSQWZrr, X86::VPMOVSQWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSWBZ256rr, X86::VPMOVSWBZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVSWBZrr, X86::VPMOVSWBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSDBZrr, X86::VPMOVUSDBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSDWZ256rr, X86::VPMOVUSDWZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVUSDWZrr, X86::VPMOVUSDWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSQDZ256rr, X86::VPMOVUSQDZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVUSQDZrr, X86::VPMOVUSQDZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSQWZrr, X86::VPMOVUSQWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSWBZ256rr, X86::VPMOVUSWBZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVUSWBZrr, X86::VPMOVUSWBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVWBZ256rr, X86::VPMOVWBZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVWBZrr, X86::VPMOVWBZmr, TB_FOLDED_STORE },
+};
+
+static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
+ { X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 },
+ { X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 },
+ { X86::BEXTR32rr, X86::BEXTR32rm, 0 },
+ { X86::BEXTR64rr, X86::BEXTR64rm, 0 },
+ { X86::BEXTRI32ri, X86::BEXTRI32mi, 0 },
+ { X86::BEXTRI64ri, X86::BEXTRI64mi, 0 },
+ { X86::BLCFILL32rr, X86::BLCFILL32rm, 0 },
+ { X86::BLCFILL64rr, X86::BLCFILL64rm, 0 },
+ { X86::BLCI32rr, X86::BLCI32rm, 0 },
+ { X86::BLCI64rr, X86::BLCI64rm, 0 },
+ { X86::BLCIC32rr, X86::BLCIC32rm, 0 },
+ { X86::BLCIC64rr, X86::BLCIC64rm, 0 },
+ { X86::BLCMSK32rr, X86::BLCMSK32rm, 0 },
+ { X86::BLCMSK64rr, X86::BLCMSK64rm, 0 },
+ { X86::BLCS32rr, X86::BLCS32rm, 0 },
+ { X86::BLCS64rr, X86::BLCS64rm, 0 },
+ { X86::BLSFILL32rr, X86::BLSFILL32rm, 0 },
+ { X86::BLSFILL64rr, X86::BLSFILL64rm, 0 },
+ { X86::BLSI32rr, X86::BLSI32rm, 0 },
+ { X86::BLSI64rr, X86::BLSI64rm, 0 },
+ { X86::BLSIC32rr, X86::BLSIC32rm, 0 },
+ { X86::BLSIC64rr, X86::BLSIC64rm, 0 },
+ { X86::BLSMSK32rr, X86::BLSMSK32rm, 0 },
+ { X86::BLSMSK64rr, X86::BLSMSK64rm, 0 },
+ { X86::BLSR32rr, X86::BLSR32rm, 0 },
+ { X86::BLSR64rr, X86::BLSR64rm, 0 },
+ { X86::BSF16rr, X86::BSF16rm, 0 },
+ { X86::BSF32rr, X86::BSF32rm, 0 },
+ { X86::BSF64rr, X86::BSF64rm, 0 },
+ { X86::BSR16rr, X86::BSR16rm, 0 },
+ { X86::BSR32rr, X86::BSR32rm, 0 },
+ { X86::BSR64rr, X86::BSR64rm, 0 },
+ { X86::BZHI32rr, X86::BZHI32rm, 0 },
+ { X86::BZHI64rr, X86::BZHI64rm, 0 },
+ { X86::CMP16rr, X86::CMP16rm, 0 },
+ { X86::CMP32rr, X86::CMP32rm, 0 },
+ { X86::CMP64rr, X86::CMP64rm, 0 },
+ { X86::CMP8rr, X86::CMP8rm, 0 },
+ { X86::COMISDrr, X86::COMISDrm, 0 },
+ { X86::COMISDrr_Int, X86::COMISDrm_Int, TB_NO_REVERSE },
+ { X86::COMISSrr, X86::COMISSrm, 0 },
+ { X86::COMISSrr_Int, X86::COMISSrm_Int, TB_NO_REVERSE },
+ { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_NO_REVERSE },
+ { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 },
+ { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 },
+ { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 },
+ { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 },
+ { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_NO_REVERSE },
+ { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, 0 },
+ { X86::CVTSD2SI64rr_Int, X86::CVTSD2SI64rm_Int, TB_NO_REVERSE },
+ { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 },
+ { X86::CVTSD2SIrr_Int, X86::CVTSD2SIrm_Int, TB_NO_REVERSE },
+ { X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 },
+ { X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 },
+ { X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 },
+ { X86::CVTSI642SDrr, X86::CVTSI642SDrm, 0 },
+ { X86::CVTSI642SSrr, X86::CVTSI642SSrm, 0 },
+ { X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 },
+ { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 },
+ { X86::CVTSS2SI64rr_Int, X86::CVTSS2SI64rm_Int, TB_NO_REVERSE },
+ { X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 },
+ { X86::CVTSS2SIrr_Int, X86::CVTSS2SIrm_Int, TB_NO_REVERSE },
+ { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 },
+ { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 },
+ { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 },
+ { X86::CVTTSD2SI64rr_Int, X86::CVTTSD2SI64rm_Int, TB_NO_REVERSE },
+ { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 },
+ { X86::CVTTSD2SIrr_Int, X86::CVTTSD2SIrm_Int, TB_NO_REVERSE },
+ { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 },
+ { X86::CVTTSS2SI64rr_Int, X86::CVTTSS2SI64rm_Int, TB_NO_REVERSE },
+ { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 },
+ { X86::CVTTSS2SIrr_Int, X86::CVTTSS2SIrm_Int, TB_NO_REVERSE },
+ { X86::IMUL16rri, X86::IMUL16rmi, 0 },
+ { X86::IMUL16rri8, X86::IMUL16rmi8, 0 },
+ { X86::IMUL32rri, X86::IMUL32rmi, 0 },
+ { X86::IMUL32rri8, X86::IMUL32rmi8, 0 },
+ { X86::IMUL64rri32, X86::IMUL64rmi32, 0 },
+ { X86::IMUL64rri8, X86::IMUL64rmi8, 0 },
+ { X86::LWPINS32rri, X86::LWPINS32rmi, 0 },
+ { X86::LWPINS64rri, X86::LWPINS64rmi, 0 },
+ { X86::LWPVAL32rri, X86::LWPVAL32rmi, 0 },
+ { X86::LWPVAL64rri, X86::LWPVAL64rmi, 0 },
+ { X86::LZCNT16rr, X86::LZCNT16rm, 0 },
+ { X86::LZCNT32rr, X86::LZCNT32rm, 0 },
+ { X86::LZCNT64rr, X86::LZCNT64rm, 0 },
+ { X86::MMX_CVTPD2PIirr, X86::MMX_CVTPD2PIirm, TB_ALIGN_16 },
+ { X86::MMX_CVTPI2PDirr, X86::MMX_CVTPI2PDirm, 0 },
+ { X86::MMX_CVTPS2PIirr, X86::MMX_CVTPS2PIirm, TB_NO_REVERSE },
+ { X86::MMX_CVTTPD2PIirr, X86::MMX_CVTTPD2PIirm, TB_ALIGN_16 },
+ { X86::MMX_CVTTPS2PIirr, X86::MMX_CVTTPS2PIirm, TB_NO_REVERSE },
+ { X86::MMX_MOVD64to64rr, X86::MMX_MOVQ64rm, 0 },
+ { X86::MMX_PABSBrr, X86::MMX_PABSBrm, 0 },
+ { X86::MMX_PABSDrr, X86::MMX_PABSDrm, 0 },
+ { X86::MMX_PABSWrr, X86::MMX_PABSWrm, 0 },
+ { X86::MMX_PSHUFWri, X86::MMX_PSHUFWmi, 0 },
+ { X86::MOV16rr, X86::MOV16rm, 0 },
+ { X86::MOV32rr, X86::MOV32rm, 0 },
+ { X86::MOV64rr, X86::MOV64rm, 0 },
+ { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, TB_NO_REVERSE },
+ { X86::MOV64toSDrr, X86::MOVSDrm_alt, TB_NO_REVERSE },
+ { X86::MOV8rr, X86::MOV8rm, 0 },
+ { X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 },
+ { X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 },
+ { X86::MOVDDUPrr, X86::MOVDDUPrm, TB_NO_REVERSE },
+ { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
+ { X86::MOVDI2SSrr, X86::MOVSSrm_alt, 0 },
+ { X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 },
+ { X86::MOVDQUrr, X86::MOVDQUrm, 0 },
+ { X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 },
+ { X86::MOVSLDUPrr, X86::MOVSLDUPrm, TB_ALIGN_16 },
+ { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 },
+ { X86::MOVSX32rr16, X86::MOVSX32rm16, 0 },
+ { X86::MOVSX32rr8, X86::MOVSX32rm8, 0 },
+ { X86::MOVSX32rr8_NOREX, X86::MOVSX32rm8_NOREX, 0 },
+ { X86::MOVSX64rr16, X86::MOVSX64rm16, 0 },
+ { X86::MOVSX64rr32, X86::MOVSX64rm32, 0 },
+ { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 },
+ { X86::MOVUPDrr, X86::MOVUPDrm, 0 },
+ { X86::MOVUPSrr, X86::MOVUPSrm, 0 },
+ { X86::MOVZPQILo2PQIrr, X86::MOVQI2PQIrm, TB_NO_REVERSE },
+ { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 },
+ { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },
+ { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 },
+ { X86::MOVZX32rr8_NOREX, X86::MOVZX32rm8_NOREX, 0 },
+ { X86::MOVZX64rr16, X86::MOVZX64rm16, 0 },
+ { X86::MOVZX64rr8, X86::MOVZX64rm8, 0 },
+ { X86::PABSBrr, X86::PABSBrm, TB_ALIGN_16 },
+ { X86::PABSDrr, X86::PABSDrm, TB_ALIGN_16 },
+ { X86::PABSWrr, X86::PABSWrm, TB_ALIGN_16 },
+ { X86::PCMPESTRIrr, X86::PCMPESTRIrm, 0 },
+ { X86::PCMPESTRMrr, X86::PCMPESTRMrm, 0 },
+ { X86::PCMPISTRIrr, X86::PCMPISTRIrm, 0 },
+ { X86::PCMPISTRMrr, X86::PCMPISTRMrm, 0 },
+ { X86::PF2IDrr, X86::PF2IDrm, 0 },
+ { X86::PF2IWrr, X86::PF2IWrm, 0 },
+ { X86::PFRCPrr, X86::PFRCPrm, 0 },
+ { X86::PFRSQRTrr, X86::PFRSQRTrm, 0 },
+ { X86::PHMINPOSUWrr, X86::PHMINPOSUWrm, TB_ALIGN_16 },
+ { X86::PI2FDrr, X86::PI2FDrm, 0 },
+ { X86::PI2FWrr, X86::PI2FWrm, 0 },
+ { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_NO_REVERSE },
+ { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_NO_REVERSE },
+ { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_NO_REVERSE },
+ { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_NO_REVERSE },
+ { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_NO_REVERSE },
+ { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_NO_REVERSE },
+ { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_NO_REVERSE },
+ { X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_NO_REVERSE },
+ { X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_NO_REVERSE },
+ { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_NO_REVERSE },
+ { X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_NO_REVERSE },
+ { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_NO_REVERSE },
+ { X86::POPCNT16rr, X86::POPCNT16rm, 0 },
+ { X86::POPCNT32rr, X86::POPCNT32rm, 0 },
+ { X86::POPCNT64rr, X86::POPCNT64rm, 0 },
+ { X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 },
+ { X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 },
+ { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 },
+ { X86::PSWAPDrr, X86::PSWAPDrm, 0 },
+ { X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 },
+ { X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 },
+ { X86::RCPSSr, X86::RCPSSm, 0 },
+ { X86::RORX32ri, X86::RORX32mi, 0 },
+ { X86::RORX64ri, X86::RORX64mi, 0 },
+ { X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 },
+ { X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 },
+ { X86::ROUNDSDr, X86::ROUNDSDm, 0 },
+ { X86::ROUNDSSr, X86::ROUNDSSm, 0 },
+ { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 },
+ { X86::RSQRTSSr, X86::RSQRTSSm, 0 },
+ { X86::SARX32rr, X86::SARX32rm, 0 },
+ { X86::SARX64rr, X86::SARX64rm, 0 },
+ { X86::SHLX32rr, X86::SHLX32rm, 0 },
+ { X86::SHLX64rr, X86::SHLX64rm, 0 },
+ { X86::SHRX32rr, X86::SHRX32rm, 0 },
+ { X86::SHRX64rr, X86::SHRX64rm, 0 },
+ { X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 },
+ { X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 },
+ { X86::SQRTSDr, X86::SQRTSDm, 0 },
+ { X86::SQRTSSr, X86::SQRTSSm, 0 },
+ { X86::T1MSKC32rr, X86::T1MSKC32rm, 0 },
+ { X86::T1MSKC64rr, X86::T1MSKC64rm, 0 },
+ { X86::TZCNT16rr, X86::TZCNT16rm, 0 },
+ { X86::TZCNT32rr, X86::TZCNT32rm, 0 },
+ { X86::TZCNT64rr, X86::TZCNT64rm, 0 },
+ { X86::TZMSK32rr, X86::TZMSK32rm, 0 },
+ { X86::TZMSK64rr, X86::TZMSK64rm, 0 },
+ { X86::UCOMISDrr, X86::UCOMISDrm, 0 },
+ { X86::UCOMISDrr_Int, X86::UCOMISDrm_Int, TB_NO_REVERSE },
+ { X86::UCOMISSrr, X86::UCOMISSrm, 0 },
+ { X86::UCOMISSrr_Int, X86::UCOMISSrm_Int, TB_NO_REVERSE },
+ { X86::VAESIMCrr, X86::VAESIMCrm, 0 },
+ { X86::VAESKEYGENASSIST128rr,X86::VAESKEYGENASSIST128rm,0 },
+ { X86::VBROADCASTF32X2Z256rr,X86::VBROADCASTF32X2Z256rm,TB_NO_REVERSE },
+ { X86::VBROADCASTF32X2Zrr, X86::VBROADCASTF32X2Zrm, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z128rr,X86::VBROADCASTI32X2Z128rm,TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z256rr,X86::VBROADCASTI32X2Z256rm,TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Zrr, X86::VBROADCASTI32X2Zrm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256rr, X86::VBROADCASTSDZ256rm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZrr, X86::VBROADCASTSDZrm, TB_NO_REVERSE },
+ { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ128rr, X86::VBROADCASTSSZ128rm, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256rr, X86::VBROADCASTSSZ256rm, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZrr, X86::VBROADCASTSSZrm, TB_NO_REVERSE },
+ { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE },
+ { X86::VCOMISDZrr, X86::VCOMISDZrm, 0 },
+ { X86::VCOMISDZrr_Int, X86::VCOMISDZrm_Int, TB_NO_REVERSE },
+ { X86::VCOMISDrr, X86::VCOMISDrm, 0 },
+ { X86::VCOMISDrr_Int, X86::VCOMISDrm_Int, TB_NO_REVERSE },
+ { X86::VCOMISSZrr, X86::VCOMISSZrm, 0 },
+ { X86::VCOMISSZrr_Int, X86::VCOMISSZrm_Int, TB_NO_REVERSE },
+ { X86::VCOMISSrr, X86::VCOMISSrm, 0 },
+ { X86::VCOMISSrr_Int, X86::VCOMISSrm_Int, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0 },
+ { X86::VCVTDQ2PDZ128rr, X86::VCVTDQ2PDZ128rm, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDZ256rr, X86::VCVTDQ2PDZ256rm, 0 },
+ { X86::VCVTDQ2PDZrr, X86::VCVTDQ2PDZrm, 0 },
+ { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, TB_NO_REVERSE },
+ { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 },
+ { X86::VCVTDQ2PSZ128rr, X86::VCVTDQ2PSZ128rm, 0 },
+ { X86::VCVTDQ2PSZ256rr, X86::VCVTDQ2PSZ256rm, 0 },
+ { X86::VCVTDQ2PSZrr, X86::VCVTDQ2PSZrm, 0 },
+ { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 },
+ { X86::VCVTNEPS2BF16Z128rr, X86::VCVTNEPS2BF16Z128rm, 0 },
+ { X86::VCVTNEPS2BF16Z256rr, X86::VCVTNEPS2BF16Z256rm, 0 },
+ { X86::VCVTNEPS2BF16Zrr, X86::VCVTNEPS2BF16Zrm, 0 },
+ { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 },
+ { X86::VCVTPD2DQZ128rr, X86::VCVTPD2DQZ128rm, 0 },
+ { X86::VCVTPD2DQZ256rr, X86::VCVTPD2DQZ256rm, 0 },
+ { X86::VCVTPD2DQZrr, X86::VCVTPD2DQZrm, 0 },
+ { X86::VCVTPD2DQrr, X86::VCVTPD2DQrm, 0 },
+ { X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 },
+ { X86::VCVTPD2PSZ128rr, X86::VCVTPD2PSZ128rm, 0 },
+ { X86::VCVTPD2PSZ256rr, X86::VCVTPD2PSZ256rm, 0 },
+ { X86::VCVTPD2PSZrr, X86::VCVTPD2PSZrm, 0 },
+ { X86::VCVTPD2PSrr, X86::VCVTPD2PSrm, 0 },
+ { X86::VCVTPD2QQZ128rr, X86::VCVTPD2QQZ128rm, 0 },
+ { X86::VCVTPD2QQZ256rr, X86::VCVTPD2QQZ256rm, 0 },
+ { X86::VCVTPD2QQZrr, X86::VCVTPD2QQZrm, 0 },
+ { X86::VCVTPD2UDQZ128rr, X86::VCVTPD2UDQZ128rm, 0 },
+ { X86::VCVTPD2UDQZ256rr, X86::VCVTPD2UDQZ256rm, 0 },
+ { X86::VCVTPD2UDQZrr, X86::VCVTPD2UDQZrm, 0 },
+ { X86::VCVTPD2UQQZ128rr, X86::VCVTPD2UQQZ128rm, 0 },
+ { X86::VCVTPD2UQQZ256rr, X86::VCVTPD2UQQZ256rm, 0 },
+ { X86::VCVTPD2UQQZrr, X86::VCVTPD2UQQZrm, 0 },
+ { X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 },
+ { X86::VCVTPH2PSZ128rr, X86::VCVTPH2PSZ128rm, TB_NO_REVERSE },
+ { X86::VCVTPH2PSZ256rr, X86::VCVTPH2PSZ256rm, 0 },
+ { X86::VCVTPH2PSZrr, X86::VCVTPH2PSZrm, 0 },
+ { X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, TB_NO_REVERSE },
+ { X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 },
+ { X86::VCVTPS2DQZ128rr, X86::VCVTPS2DQZ128rm, 0 },
+ { X86::VCVTPS2DQZ256rr, X86::VCVTPS2DQZ256rm, 0 },
+ { X86::VCVTPS2DQZrr, X86::VCVTPS2DQZrm, 0 },
+ { X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 },
+ { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, 0 },
+ { X86::VCVTPS2PDZ128rr, X86::VCVTPS2PDZ128rm, TB_NO_REVERSE },
+ { X86::VCVTPS2PDZ256rr, X86::VCVTPS2PDZ256rm, 0 },
+ { X86::VCVTPS2PDZrr, X86::VCVTPS2PDZrm, 0 },
+ { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, TB_NO_REVERSE },
+ { X86::VCVTPS2QQZ128rr, X86::VCVTPS2QQZ128rm, TB_NO_REVERSE },
+ { X86::VCVTPS2QQZ256rr, X86::VCVTPS2QQZ256rm, 0 },
+ { X86::VCVTPS2QQZrr, X86::VCVTPS2QQZrm, 0 },
+ { X86::VCVTPS2UDQZ128rr, X86::VCVTPS2UDQZ128rm, 0 },
+ { X86::VCVTPS2UDQZ256rr, X86::VCVTPS2UDQZ256rm, 0 },
+ { X86::VCVTPS2UDQZrr, X86::VCVTPS2UDQZrm, 0 },
+ { X86::VCVTPS2UQQZ128rr, X86::VCVTPS2UQQZ128rm, TB_NO_REVERSE },
+ { X86::VCVTPS2UQQZ256rr, X86::VCVTPS2UQQZ256rm, 0 },
+ { X86::VCVTPS2UQQZrr, X86::VCVTPS2UQQZrm, 0 },
+ { X86::VCVTQQ2PDZ128rr, X86::VCVTQQ2PDZ128rm, 0 },
+ { X86::VCVTQQ2PDZ256rr, X86::VCVTQQ2PDZ256rm, 0 },
+ { X86::VCVTQQ2PDZrr, X86::VCVTQQ2PDZrm, 0 },
+ { X86::VCVTQQ2PSZ128rr, X86::VCVTQQ2PSZ128rm, 0 },
+ { X86::VCVTQQ2PSZ256rr, X86::VCVTQQ2PSZ256rm, 0 },
+ { X86::VCVTQQ2PSZrr, X86::VCVTQQ2PSZrm, 0 },
+ { X86::VCVTSD2SI64Zrr, X86::VCVTSD2SI64Zrm, 0 },
+ { X86::VCVTSD2SI64Zrr_Int, X86::VCVTSD2SI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 },
+ { X86::VCVTSD2SI64rr_Int, X86::VCVTSD2SI64rm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2SIZrr, X86::VCVTSD2SIZrm, 0 },
+ { X86::VCVTSD2SIZrr_Int, X86::VCVTSD2SIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 },
+ { X86::VCVTSD2SIrr_Int, X86::VCVTSD2SIrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2USI64Zrr_Int, X86::VCVTSD2USI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2USIZrr_Int, X86::VCVTSD2USIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SI64Zrr, X86::VCVTSS2SI64Zrm, 0 },
+ { X86::VCVTSS2SI64Zrr_Int, X86::VCVTSS2SI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 },
+ { X86::VCVTSS2SI64rr_Int, X86::VCVTSS2SI64rm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SIZrr, X86::VCVTSS2SIZrm, 0 },
+ { X86::VCVTSS2SIZrr_Int, X86::VCVTSS2SIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 },
+ { X86::VCVTSS2SIrr_Int, X86::VCVTSS2SIrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2USI64Zrr_Int, X86::VCVTSS2USI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2USIZrr_Int, X86::VCVTSS2USIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 },
+ { X86::VCVTTPD2DQZ128rr, X86::VCVTTPD2DQZ128rm, 0 },
+ { X86::VCVTTPD2DQZ256rr, X86::VCVTTPD2DQZ256rm, 0 },
+ { X86::VCVTTPD2DQZrr, X86::VCVTTPD2DQZrm, 0 },
+ { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQrm, 0 },
+ { X86::VCVTTPD2QQZ128rr, X86::VCVTTPD2QQZ128rm, 0 },
+ { X86::VCVTTPD2QQZ256rr, X86::VCVTTPD2QQZ256rm, 0 },
+ { X86::VCVTTPD2QQZrr, X86::VCVTTPD2QQZrm, 0 },
+ { X86::VCVTTPD2UDQZ128rr, X86::VCVTTPD2UDQZ128rm, 0 },
+ { X86::VCVTTPD2UDQZ256rr, X86::VCVTTPD2UDQZ256rm, 0 },
+ { X86::VCVTTPD2UDQZrr, X86::VCVTTPD2UDQZrm, 0 },
+ { X86::VCVTTPD2UQQZ128rr, X86::VCVTTPD2UQQZ128rm, 0 },
+ { X86::VCVTTPD2UQQZ256rr, X86::VCVTTPD2UQQZ256rm, 0 },
+ { X86::VCVTTPD2UQQZrr, X86::VCVTTPD2UQQZrm, 0 },
+ { X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 },
+ { X86::VCVTTPS2DQZ128rr, X86::VCVTTPS2DQZ128rm, 0 },
+ { X86::VCVTTPS2DQZ256rr, X86::VCVTTPS2DQZ256rm, 0 },
+ { X86::VCVTTPS2DQZrr, X86::VCVTTPS2DQZrm, 0 },
+ { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 },
+ { X86::VCVTTPS2QQZ128rr, X86::VCVTTPS2QQZ128rm, TB_NO_REVERSE },
+ { X86::VCVTTPS2QQZ256rr, X86::VCVTTPS2QQZ256rm, 0 },
+ { X86::VCVTTPS2QQZrr, X86::VCVTTPS2QQZrm, 0 },
+ { X86::VCVTTPS2UDQZ128rr, X86::VCVTTPS2UDQZ128rm, 0 },
+ { X86::VCVTTPS2UDQZ256rr, X86::VCVTTPS2UDQZ256rm, 0 },
+ { X86::VCVTTPS2UDQZrr, X86::VCVTTPS2UDQZrm, 0 },
+ { X86::VCVTTPS2UQQZ128rr, X86::VCVTTPS2UQQZ128rm, TB_NO_REVERSE },
+ { X86::VCVTTPS2UQQZ256rr, X86::VCVTTPS2UQQZ256rm, 0 },
+ { X86::VCVTTPS2UQQZrr, X86::VCVTTPS2UQQZrm, 0 },
+ { X86::VCVTTSD2SI64Zrr, X86::VCVTTSD2SI64Zrm, 0 },
+ { X86::VCVTTSD2SI64Zrr_Int, X86::VCVTTSD2SI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 },
+ { X86::VCVTTSD2SI64rr_Int, X86::VCVTTSD2SI64rm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSD2SIZrr, X86::VCVTTSD2SIZrm, 0 },
+ { X86::VCVTTSD2SIZrr_Int, X86::VCVTTSD2SIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 },
+ { X86::VCVTTSD2SIrr_Int, X86::VCVTTSD2SIrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSD2USI64Zrr, X86::VCVTTSD2USI64Zrm, 0 },
+ { X86::VCVTTSD2USI64Zrr_Int, X86::VCVTTSD2USI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSD2USIZrr, X86::VCVTTSD2USIZrm, 0 },
+ { X86::VCVTTSD2USIZrr_Int, X86::VCVTTSD2USIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSS2SI64Zrr, X86::VCVTTSS2SI64Zrm, 0 },
+ { X86::VCVTTSS2SI64Zrr_Int, X86::VCVTTSS2SI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 },
+ { X86::VCVTTSS2SI64rr_Int, X86::VCVTTSS2SI64rm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSS2SIZrr, X86::VCVTTSS2SIZrm, 0 },
+ { X86::VCVTTSS2SIZrr_Int, X86::VCVTTSS2SIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 },
+ { X86::VCVTTSS2SIrr_Int, X86::VCVTTSS2SIrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSS2USI64Zrr, X86::VCVTTSS2USI64Zrm, 0 },
+ { X86::VCVTTSS2USI64Zrr_Int, X86::VCVTTSS2USI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTSS2USIZrr, X86::VCVTTSS2USIZrm, 0 },
+ { X86::VCVTTSS2USIZrr_Int, X86::VCVTTSS2USIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTUDQ2PDZ128rr, X86::VCVTUDQ2PDZ128rm, TB_NO_REVERSE },
+ { X86::VCVTUDQ2PDZ256rr, X86::VCVTUDQ2PDZ256rm, 0 },
+ { X86::VCVTUDQ2PDZrr, X86::VCVTUDQ2PDZrm, 0 },
+ { X86::VCVTUDQ2PSZ128rr, X86::VCVTUDQ2PSZ128rm, 0 },
+ { X86::VCVTUDQ2PSZ256rr, X86::VCVTUDQ2PSZ256rm, 0 },
+ { X86::VCVTUDQ2PSZrr, X86::VCVTUDQ2PSZrm, 0 },
+ { X86::VCVTUQQ2PDZ128rr, X86::VCVTUQQ2PDZ128rm, 0 },
+ { X86::VCVTUQQ2PDZ256rr, X86::VCVTUQQ2PDZ256rm, 0 },
+ { X86::VCVTUQQ2PDZrr, X86::VCVTUQQ2PDZrm, 0 },
+ { X86::VCVTUQQ2PSZ128rr, X86::VCVTUQQ2PSZ128rm, 0 },
+ { X86::VCVTUQQ2PSZ256rr, X86::VCVTUQQ2PSZ256rm, 0 },
+ { X86::VCVTUQQ2PSZrr, X86::VCVTUQQ2PSZrm, 0 },
+ { X86::VEXP2PDZr, X86::VEXP2PDZm, 0 },
+ { X86::VEXP2PSZr, X86::VEXP2PSZm, 0 },
+ { X86::VEXPANDPDZ128rr, X86::VEXPANDPDZ128rm, TB_NO_REVERSE },
+ { X86::VEXPANDPDZ256rr, X86::VEXPANDPDZ256rm, TB_NO_REVERSE },
+ { X86::VEXPANDPDZrr, X86::VEXPANDPDZrm, TB_NO_REVERSE },
+ { X86::VEXPANDPSZ128rr, X86::VEXPANDPSZ128rm, TB_NO_REVERSE },
+ { X86::VEXPANDPSZ256rr, X86::VEXPANDPSZ256rm, TB_NO_REVERSE },
+ { X86::VEXPANDPSZrr, X86::VEXPANDPSZrm, TB_NO_REVERSE },
+ { X86::VFPCLASSPDZ128rr, X86::VFPCLASSPDZ128rm, 0 },
+ { X86::VFPCLASSPDZ256rr, X86::VFPCLASSPDZ256rm, 0 },
+ { X86::VFPCLASSPDZrr, X86::VFPCLASSPDZrm, 0 },
+ { X86::VFPCLASSPSZ128rr, X86::VFPCLASSPSZ128rm, 0 },
+ { X86::VFPCLASSPSZ256rr, X86::VFPCLASSPSZ256rm, 0 },
+ { X86::VFPCLASSPSZrr, X86::VFPCLASSPSZrm, 0 },
+ { X86::VFPCLASSSDZrr, X86::VFPCLASSSDZrm, TB_NO_REVERSE },
+ { X86::VFPCLASSSSZrr, X86::VFPCLASSSSZrm, TB_NO_REVERSE },
+ { X86::VFRCZPDYrr, X86::VFRCZPDYrm, 0 },
+ { X86::VFRCZPDrr, X86::VFRCZPDrm, 0 },
+ { X86::VFRCZPSYrr, X86::VFRCZPSYrm, 0 },
+ { X86::VFRCZPSrr, X86::VFRCZPSrm, 0 },
+ { X86::VFRCZSDrr, X86::VFRCZSDrm, TB_NO_REVERSE },
+ { X86::VFRCZSSrr, X86::VFRCZSSrm, TB_NO_REVERSE },
+ { X86::VGETEXPPDZ128r, X86::VGETEXPPDZ128m, 0 },
+ { X86::VGETEXPPDZ256r, X86::VGETEXPPDZ256m, 0 },
+ { X86::VGETEXPPDZr, X86::VGETEXPPDZm, 0 },
+ { X86::VGETEXPPSZ128r, X86::VGETEXPPSZ128m, 0 },
+ { X86::VGETEXPPSZ256r, X86::VGETEXPPSZ256m, 0 },
+ { X86::VGETEXPPSZr, X86::VGETEXPPSZm, 0 },
+ { X86::VGETMANTPDZ128rri, X86::VGETMANTPDZ128rmi, 0 },
+ { X86::VGETMANTPDZ256rri, X86::VGETMANTPDZ256rmi, 0 },
+ { X86::VGETMANTPDZrri, X86::VGETMANTPDZrmi, 0 },
+ { X86::VGETMANTPSZ128rri, X86::VGETMANTPSZ128rmi, 0 },
+ { X86::VGETMANTPSZ256rri, X86::VGETMANTPSZ256rmi, 0 },
+ { X86::VGETMANTPSZrri, X86::VGETMANTPSZrmi, 0 },
+ { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, TB_NO_REVERSE },
+ { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, TB_NO_REVERSE },
+ { X86::VMOV64toSDZrr, X86::VMOVSDZrm_alt, TB_NO_REVERSE },
+ { X86::VMOV64toSDrr, X86::VMOVSDrm_alt, TB_NO_REVERSE },
+ { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 },
+ { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 },
+ { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 },
+ { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 },
+ { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 },
+ { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 },
+ { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 },
+ { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 },
+ { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 },
+ { X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 },
+ { X86::VMOVDDUPYrr, X86::VMOVDDUPYrm, 0 },
+ { X86::VMOVDDUPZ128rr, X86::VMOVDDUPZ128rm, TB_NO_REVERSE },
+ { X86::VMOVDDUPZ256rr, X86::VMOVDDUPZ256rm, 0 },
+ { X86::VMOVDDUPZrr, X86::VMOVDDUPZrm, 0 },
+ { X86::VMOVDDUPrr, X86::VMOVDDUPrm, TB_NO_REVERSE },
+ { X86::VMOVDI2PDIZrr, X86::VMOVDI2PDIZrm, 0 },
+ { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 },
+ { X86::VMOVDI2SSZrr, X86::VMOVSSZrm_alt, 0 },
+ { X86::VMOVDI2SSrr, X86::VMOVSSrm_alt, 0 },
+ { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 },
+ { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 },
+ { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 },
+ { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128rm, TB_ALIGN_16 },
+ { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256rm, TB_ALIGN_32 },
+ { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrm, TB_ALIGN_64 },
+ { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 },
+ { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 },
+ { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128rm, 0 },
+ { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256rm, 0 },
+ { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zrm, 0 },
+ { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128rm, 0 },
+ { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256rm, 0 },
+ { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zrm, 0 },
+ { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 },
+ { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 },
+ { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 },
+ { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128rm, 0 },
+ { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256rm, 0 },
+ { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zrm, 0 },
+ { X86::VMOVDQUYrr, X86::VMOVDQUYrm, 0 },
+ { X86::VMOVDQUrr, X86::VMOVDQUrm, 0 },
+ { X86::VMOVSHDUPYrr, X86::VMOVSHDUPYrm, 0 },
+ { X86::VMOVSHDUPZ128rr, X86::VMOVSHDUPZ128rm, 0 },
+ { X86::VMOVSHDUPZ256rr, X86::VMOVSHDUPZ256rm, 0 },
+ { X86::VMOVSHDUPZrr, X86::VMOVSHDUPZrm, 0 },
+ { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 },
+ { X86::VMOVSLDUPYrr, X86::VMOVSLDUPYrm, 0 },
+ { X86::VMOVSLDUPZ128rr, X86::VMOVSLDUPZ128rm, 0 },
+ { X86::VMOVSLDUPZ256rr, X86::VMOVSLDUPZ256rm, 0 },
+ { X86::VMOVSLDUPZrr, X86::VMOVSLDUPZrm, 0 },
+ { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, 0 },
+ { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 },
+ { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 },
+ { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 },
+ { X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 },
+ { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 },
+ { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 },
+ { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 },
+ { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 },
+ { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 },
+ { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 },
+ { X86::VMOVZPQILo2PQIZrr, X86::VMOVQI2PQIZrm, TB_NO_REVERSE },
+ { X86::VMOVZPQILo2PQIrr, X86::VMOVQI2PQIrm, TB_NO_REVERSE },
+ { X86::VPABSBYrr, X86::VPABSBYrm, 0 },
+ { X86::VPABSBZ128rr, X86::VPABSBZ128rm, 0 },
+ { X86::VPABSBZ256rr, X86::VPABSBZ256rm, 0 },
+ { X86::VPABSBZrr, X86::VPABSBZrm, 0 },
+ { X86::VPABSBrr, X86::VPABSBrm, 0 },
+ { X86::VPABSDYrr, X86::VPABSDYrm, 0 },
+ { X86::VPABSDZ128rr, X86::VPABSDZ128rm, 0 },
+ { X86::VPABSDZ256rr, X86::VPABSDZ256rm, 0 },
+ { X86::VPABSDZrr, X86::VPABSDZrm, 0 },
+ { X86::VPABSDrr, X86::VPABSDrm, 0 },
+ { X86::VPABSQZ128rr, X86::VPABSQZ128rm, 0 },
+ { X86::VPABSQZ256rr, X86::VPABSQZ256rm, 0 },
+ { X86::VPABSQZrr, X86::VPABSQZrm, 0 },
+ { X86::VPABSWYrr, X86::VPABSWYrm, 0 },
+ { X86::VPABSWZ128rr, X86::VPABSWZ128rm, 0 },
+ { X86::VPABSWZ256rr, X86::VPABSWZ256rm, 0 },
+ { X86::VPABSWZrr, X86::VPABSWZrm, 0 },
+ { X86::VPABSWrr, X86::VPABSWrm, 0 },
+ { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ128rr, X86::VPBROADCASTBZ128rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ256rr, X86::VPBROADCASTBZ256rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZrr, X86::VPBROADCASTBZrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTBrr , X86::VPBROADCASTBrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ128rr, X86::VPBROADCASTDZ128rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ256rr, X86::VPBROADCASTDZ256rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZrr, X86::VPBROADCASTDZrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ128rr, X86::VPBROADCASTQZ128rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ256rr, X86::VPBROADCASTQZ256rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZrr, X86::VPBROADCASTQZrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ128rr, X86::VPBROADCASTWZ128rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ256rr, X86::VPBROADCASTWZ256rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZrr, X86::VPBROADCASTWZrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, TB_NO_REVERSE },
+ { X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 },
+ { X86::VPCMPESTRMrr, X86::VPCMPESTRMrm, 0 },
+ { X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 },
+ { X86::VPCMPISTRMrr, X86::VPCMPISTRMrm, 0 },
+ { X86::VPCONFLICTDZ128rr, X86::VPCONFLICTDZ128rm, 0 },
+ { X86::VPCONFLICTDZ256rr, X86::VPCONFLICTDZ256rm, 0 },
+ { X86::VPCONFLICTDZrr, X86::VPCONFLICTDZrm, 0 },
+ { X86::VPCONFLICTQZ128rr, X86::VPCONFLICTQZ128rm, 0 },
+ { X86::VPCONFLICTQZ256rr, X86::VPCONFLICTQZ256rm, 0 },
+ { X86::VPCONFLICTQZrr, X86::VPCONFLICTQZrm, 0 },
+ { X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 },
+ { X86::VPERMILPDZ128ri, X86::VPERMILPDZ128mi, 0 },
+ { X86::VPERMILPDZ256ri, X86::VPERMILPDZ256mi, 0 },
+ { X86::VPERMILPDZri, X86::VPERMILPDZmi, 0 },
+ { X86::VPERMILPDri, X86::VPERMILPDmi, 0 },
+ { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 },
+ { X86::VPERMILPSZ128ri, X86::VPERMILPSZ128mi, 0 },
+ { X86::VPERMILPSZ256ri, X86::VPERMILPSZ256mi, 0 },
+ { X86::VPERMILPSZri, X86::VPERMILPSZmi, 0 },
+ { X86::VPERMILPSri, X86::VPERMILPSmi, 0 },
+ { X86::VPERMPDYri, X86::VPERMPDYmi, 0 },
+ { X86::VPERMPDZ256ri, X86::VPERMPDZ256mi, 0 },
+ { X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
+ { X86::VPERMQYri, X86::VPERMQYmi, 0 },
+ { X86::VPERMQZ256ri, X86::VPERMQZ256mi, 0 },
+ { X86::VPERMQZri, X86::VPERMQZmi, 0 },
+ { X86::VPEXPANDBZ128rr, X86::VPEXPANDBZ128rm, TB_NO_REVERSE },
+ { X86::VPEXPANDBZ256rr, X86::VPEXPANDBZ256rm, TB_NO_REVERSE },
+ { X86::VPEXPANDBZrr, X86::VPEXPANDBZrm, TB_NO_REVERSE },
+ { X86::VPEXPANDDZ128rr, X86::VPEXPANDDZ128rm, TB_NO_REVERSE },
+ { X86::VPEXPANDDZ256rr, X86::VPEXPANDDZ256rm, TB_NO_REVERSE },
+ { X86::VPEXPANDDZrr, X86::VPEXPANDDZrm, TB_NO_REVERSE },
+ { X86::VPEXPANDQZ128rr, X86::VPEXPANDQZ128rm, TB_NO_REVERSE },
+ { X86::VPEXPANDQZ256rr, X86::VPEXPANDQZ256rm, TB_NO_REVERSE },
+ { X86::VPEXPANDQZrr, X86::VPEXPANDQZrm, TB_NO_REVERSE },
+ { X86::VPEXPANDWZ128rr, X86::VPEXPANDWZ128rm, TB_NO_REVERSE },
+ { X86::VPEXPANDWZ256rr, X86::VPEXPANDWZ256rm, TB_NO_REVERSE },
+ { X86::VPEXPANDWZrr, X86::VPEXPANDWZrm, TB_NO_REVERSE },
+ { X86::VPHADDBDrr, X86::VPHADDBDrm, 0 },
+ { X86::VPHADDBQrr, X86::VPHADDBQrm, 0 },
+ { X86::VPHADDBWrr, X86::VPHADDBWrm, 0 },
+ { X86::VPHADDDQrr, X86::VPHADDDQrm, 0 },
+ { X86::VPHADDUBDrr, X86::VPHADDUBDrm, 0 },
+ { X86::VPHADDUBQrr, X86::VPHADDUBQrm, 0 },
+ { X86::VPHADDUBWrr, X86::VPHADDUBWrm, 0 },
+ { X86::VPHADDUDQrr, X86::VPHADDUDQrm, 0 },
+ { X86::VPHADDUWDrr, X86::VPHADDUWDrm, 0 },
+ { X86::VPHADDUWQrr, X86::VPHADDUWQrm, 0 },
+ { X86::VPHADDWDrr, X86::VPHADDWDrm, 0 },
+ { X86::VPHADDWQrr, X86::VPHADDWQrm, 0 },
+ { X86::VPHMINPOSUWrr, X86::VPHMINPOSUWrm, 0 },
+ { X86::VPHSUBBWrr, X86::VPHSUBBWrm, 0 },
+ { X86::VPHSUBDQrr, X86::VPHSUBDQrm, 0 },
+ { X86::VPHSUBWDrr, X86::VPHSUBWDrm, 0 },
+ { X86::VPLZCNTDZ128rr, X86::VPLZCNTDZ128rm, 0 },
+ { X86::VPLZCNTDZ256rr, X86::VPLZCNTDZ256rm, 0 },
+ { X86::VPLZCNTDZrr, X86::VPLZCNTDZrm, 0 },
+ { X86::VPLZCNTQZ128rr, X86::VPLZCNTQZ128rm, 0 },
+ { X86::VPLZCNTQZ256rr, X86::VPLZCNTQZ256rm, 0 },
+ { X86::VPLZCNTQZrr, X86::VPLZCNTQZrm, 0 },
+ { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBDZ128rr, X86::VPMOVSXBDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBDZ256rr, X86::VPMOVSXBDZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBDZrr, X86::VPMOVSXBDZrm, 0 },
+ { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ128rr, X86::VPMOVSXBQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ256rr, X86::VPMOVSXBQZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZrr, X86::VPMOVSXBQZrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBWYrr, X86::VPMOVSXBWYrm, 0 },
+ { X86::VPMOVSXBWZ128rr, X86::VPMOVSXBWZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ256rr, X86::VPMOVSXBWZ256rm, 0 },
+ { X86::VPMOVSXBWZrr, X86::VPMOVSXBWZrm, 0 },
+ { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, TB_NO_REVERSE },
+ { X86::VPMOVSXDQYrr, X86::VPMOVSXDQYrm, 0 },
+ { X86::VPMOVSXDQZ128rr, X86::VPMOVSXDQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXDQZ256rr, X86::VPMOVSXDQZ256rm, 0 },
+ { X86::VPMOVSXDQZrr, X86::VPMOVSXDQZrm, 0 },
+ { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, TB_NO_REVERSE },
+ { X86::VPMOVSXWDYrr, X86::VPMOVSXWDYrm, 0 },
+ { X86::VPMOVSXWDZ128rr, X86::VPMOVSXWDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXWDZ256rr, X86::VPMOVSXWDZ256rm, 0 },
+ { X86::VPMOVSXWDZrr, X86::VPMOVSXWDZrm, 0 },
+ { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, TB_NO_REVERSE },
+ { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZ128rr, X86::VPMOVSXWQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZ256rr, X86::VPMOVSXWQZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZrr, X86::VPMOVSXWQZrm, 0 },
+ { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ128rr, X86::VPMOVZXBDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ256rr, X86::VPMOVZXBDZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZrr, X86::VPMOVZXBDZrm, 0 },
+ { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ128rr, X86::VPMOVZXBQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ256rr, X86::VPMOVZXBQZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZrr, X86::VPMOVZXBQZrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBWYrr, X86::VPMOVZXBWYrm, 0 },
+ { X86::VPMOVZXBWZ128rr, X86::VPMOVZXBWZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ256rr, X86::VPMOVZXBWZ256rm, 0 },
+ { X86::VPMOVZXBWZrr, X86::VPMOVZXBWZrm, 0 },
+ { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, TB_NO_REVERSE },
+ { X86::VPMOVZXDQYrr, X86::VPMOVZXDQYrm, 0 },
+ { X86::VPMOVZXDQZ128rr, X86::VPMOVZXDQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXDQZ256rr, X86::VPMOVZXDQZ256rm, 0 },
+ { X86::VPMOVZXDQZrr, X86::VPMOVZXDQZrm, 0 },
+ { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, TB_NO_REVERSE },
+ { X86::VPMOVZXWDYrr, X86::VPMOVZXWDYrm, 0 },
+ { X86::VPMOVZXWDZ128rr, X86::VPMOVZXWDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXWDZ256rr, X86::VPMOVZXWDZ256rm, 0 },
+ { X86::VPMOVZXWDZrr, X86::VPMOVZXWDZrm, 0 },
+ { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, TB_NO_REVERSE },
+ { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZ128rr, X86::VPMOVZXWQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZ256rr, X86::VPMOVZXWQZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZrr, X86::VPMOVZXWQZrm, 0 },
+ { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, TB_NO_REVERSE },
+ { X86::VPOPCNTBZ128rr, X86::VPOPCNTBZ128rm, 0 },
+ { X86::VPOPCNTBZ256rr, X86::VPOPCNTBZ256rm, 0 },
+ { X86::VPOPCNTBZrr, X86::VPOPCNTBZrm, 0 },
+ { X86::VPOPCNTDZ128rr, X86::VPOPCNTDZ128rm, 0 },
+ { X86::VPOPCNTDZ256rr, X86::VPOPCNTDZ256rm, 0 },
+ { X86::VPOPCNTDZrr, X86::VPOPCNTDZrm, 0 },
+ { X86::VPOPCNTQZ128rr, X86::VPOPCNTQZ128rm, 0 },
+ { X86::VPOPCNTQZ256rr, X86::VPOPCNTQZ256rm, 0 },
+ { X86::VPOPCNTQZrr, X86::VPOPCNTQZrm, 0 },
+ { X86::VPOPCNTWZ128rr, X86::VPOPCNTWZ128rm, 0 },
+ { X86::VPOPCNTWZ256rr, X86::VPOPCNTWZ256rm, 0 },
+ { X86::VPOPCNTWZrr, X86::VPOPCNTWZrm, 0 },
+ { X86::VPROLDZ128ri, X86::VPROLDZ128mi, 0 },
+ { X86::VPROLDZ256ri, X86::VPROLDZ256mi, 0 },
+ { X86::VPROLDZri, X86::VPROLDZmi, 0 },
+ { X86::VPROLQZ128ri, X86::VPROLQZ128mi, 0 },
+ { X86::VPROLQZ256ri, X86::VPROLQZ256mi, 0 },
+ { X86::VPROLQZri, X86::VPROLQZmi, 0 },
+ { X86::VPRORDZ128ri, X86::VPRORDZ128mi, 0 },
+ { X86::VPRORDZ256ri, X86::VPRORDZ256mi, 0 },
+ { X86::VPRORDZri, X86::VPRORDZmi, 0 },
+ { X86::VPRORQZ128ri, X86::VPRORQZ128mi, 0 },
+ { X86::VPRORQZ256ri, X86::VPRORQZ256mi, 0 },
+ { X86::VPRORQZri, X86::VPRORQZmi, 0 },
+ { X86::VPROTBri, X86::VPROTBmi, 0 },
+ { X86::VPROTBrr, X86::VPROTBmr, 0 },
+ { X86::VPROTDri, X86::VPROTDmi, 0 },
+ { X86::VPROTDrr, X86::VPROTDmr, 0 },
+ { X86::VPROTQri, X86::VPROTQmi, 0 },
+ { X86::VPROTQrr, X86::VPROTQmr, 0 },
+ { X86::VPROTWri, X86::VPROTWmi, 0 },
+ { X86::VPROTWrr, X86::VPROTWmr, 0 },
+ { X86::VPSHABrr, X86::VPSHABmr, 0 },
+ { X86::VPSHADrr, X86::VPSHADmr, 0 },
+ { X86::VPSHAQrr, X86::VPSHAQmr, 0 },
+ { X86::VPSHAWrr, X86::VPSHAWmr, 0 },
+ { X86::VPSHLBrr, X86::VPSHLBmr, 0 },
+ { X86::VPSHLDrr, X86::VPSHLDmr, 0 },
+ { X86::VPSHLQrr, X86::VPSHLQmr, 0 },
+ { X86::VPSHLWrr, X86::VPSHLWmr, 0 },
+ { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 },
+ { X86::VPSHUFDZ128ri, X86::VPSHUFDZ128mi, 0 },
+ { X86::VPSHUFDZ256ri, X86::VPSHUFDZ256mi, 0 },
+ { X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 },
+ { X86::VPSHUFDri, X86::VPSHUFDmi, 0 },
+ { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 },
+ { X86::VPSHUFHWZ128ri, X86::VPSHUFHWZ128mi, 0 },
+ { X86::VPSHUFHWZ256ri, X86::VPSHUFHWZ256mi, 0 },
+ { X86::VPSHUFHWZri, X86::VPSHUFHWZmi, 0 },
+ { X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 },
+ { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 },
+ { X86::VPSHUFLWZ128ri, X86::VPSHUFLWZ128mi, 0 },
+ { X86::VPSHUFLWZ256ri, X86::VPSHUFLWZ256mi, 0 },
+ { X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 },
+ { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 },
+ { X86::VPSLLDQZ128ri, X86::VPSLLDQZ128mi, 0 },
+ { X86::VPSLLDQZ256ri, X86::VPSLLDQZ256mi, 0 },
+ { X86::VPSLLDQZri, X86::VPSLLDQZmi, 0 },
+ { X86::VPSLLDZ128ri, X86::VPSLLDZ128mi, 0 },
+ { X86::VPSLLDZ256ri, X86::VPSLLDZ256mi, 0 },
+ { X86::VPSLLDZri, X86::VPSLLDZmi, 0 },
+ { X86::VPSLLQZ128ri, X86::VPSLLQZ128mi, 0 },
+ { X86::VPSLLQZ256ri, X86::VPSLLQZ256mi, 0 },
+ { X86::VPSLLQZri, X86::VPSLLQZmi, 0 },
+ { X86::VPSLLWZ128ri, X86::VPSLLWZ128mi, 0 },
+ { X86::VPSLLWZ256ri, X86::VPSLLWZ256mi, 0 },
+ { X86::VPSLLWZri, X86::VPSLLWZmi, 0 },
+ { X86::VPSRADZ128ri, X86::VPSRADZ128mi, 0 },
+ { X86::VPSRADZ256ri, X86::VPSRADZ256mi, 0 },
+ { X86::VPSRADZri, X86::VPSRADZmi, 0 },
+ { X86::VPSRAQZ128ri, X86::VPSRAQZ128mi, 0 },
+ { X86::VPSRAQZ256ri, X86::VPSRAQZ256mi, 0 },
+ { X86::VPSRAQZri, X86::VPSRAQZmi, 0 },
+ { X86::VPSRAWZ128ri, X86::VPSRAWZ128mi, 0 },
+ { X86::VPSRAWZ256ri, X86::VPSRAWZ256mi, 0 },
+ { X86::VPSRAWZri, X86::VPSRAWZmi, 0 },
+ { X86::VPSRLDQZ128ri, X86::VPSRLDQZ128mi, 0 },
+ { X86::VPSRLDQZ256ri, X86::VPSRLDQZ256mi, 0 },
+ { X86::VPSRLDQZri, X86::VPSRLDQZmi, 0 },
+ { X86::VPSRLDZ128ri, X86::VPSRLDZ128mi, 0 },
+ { X86::VPSRLDZ256ri, X86::VPSRLDZ256mi, 0 },
+ { X86::VPSRLDZri, X86::VPSRLDZmi, 0 },
+ { X86::VPSRLQZ128ri, X86::VPSRLQZ128mi, 0 },
+ { X86::VPSRLQZ256ri, X86::VPSRLQZ256mi, 0 },
+ { X86::VPSRLQZri, X86::VPSRLQZmi, 0 },
+ { X86::VPSRLWZ128ri, X86::VPSRLWZ128mi, 0 },
+ { X86::VPSRLWZ256ri, X86::VPSRLWZ256mi, 0 },
+ { X86::VPSRLWZri, X86::VPSRLWZmi, 0 },
+ { X86::VPTESTYrr, X86::VPTESTYrm, 0 },
+ { X86::VPTESTrr, X86::VPTESTrm, 0 },
+ { X86::VRCP14PDZ128r, X86::VRCP14PDZ128m, 0 },
+ { X86::VRCP14PDZ256r, X86::VRCP14PDZ256m, 0 },
+ { X86::VRCP14PDZr, X86::VRCP14PDZm, 0 },
+ { X86::VRCP14PSZ128r, X86::VRCP14PSZ128m, 0 },
+ { X86::VRCP14PSZ256r, X86::VRCP14PSZ256m, 0 },
+ { X86::VRCP14PSZr, X86::VRCP14PSZm, 0 },
+ { X86::VRCP28PDZr, X86::VRCP28PDZm, 0 },
+ { X86::VRCP28PSZr, X86::VRCP28PSZm, 0 },
+ { X86::VRCPPSYr, X86::VRCPPSYm, 0 },
+ { X86::VRCPPSr, X86::VRCPPSm, 0 },
+ { X86::VREDUCEPDZ128rri, X86::VREDUCEPDZ128rmi, 0 },
+ { X86::VREDUCEPDZ256rri, X86::VREDUCEPDZ256rmi, 0 },
+ { X86::VREDUCEPDZrri, X86::VREDUCEPDZrmi, 0 },
+ { X86::VREDUCEPSZ128rri, X86::VREDUCEPSZ128rmi, 0 },
+ { X86::VREDUCEPSZ256rri, X86::VREDUCEPSZ256rmi, 0 },
+ { X86::VREDUCEPSZrri, X86::VREDUCEPSZrmi, 0 },
+ { X86::VRNDSCALEPDZ128rri, X86::VRNDSCALEPDZ128rmi, 0 },
+ { X86::VRNDSCALEPDZ256rri, X86::VRNDSCALEPDZ256rmi, 0 },
+ { X86::VRNDSCALEPDZrri, X86::VRNDSCALEPDZrmi, 0 },
+ { X86::VRNDSCALEPSZ128rri, X86::VRNDSCALEPSZ128rmi, 0 },
+ { X86::VRNDSCALEPSZ256rri, X86::VRNDSCALEPSZ256rmi, 0 },
+ { X86::VRNDSCALEPSZrri, X86::VRNDSCALEPSZrmi, 0 },
+ { X86::VROUNDPDYr, X86::VROUNDPDYm, 0 },
+ { X86::VROUNDPDr, X86::VROUNDPDm, 0 },
+ { X86::VROUNDPSYr, X86::VROUNDPSYm, 0 },
+ { X86::VROUNDPSr, X86::VROUNDPSm, 0 },
+ { X86::VRSQRT14PDZ128r, X86::VRSQRT14PDZ128m, 0 },
+ { X86::VRSQRT14PDZ256r, X86::VRSQRT14PDZ256m, 0 },
+ { X86::VRSQRT14PDZr, X86::VRSQRT14PDZm, 0 },
+ { X86::VRSQRT14PSZ128r, X86::VRSQRT14PSZ128m, 0 },
+ { X86::VRSQRT14PSZ256r, X86::VRSQRT14PSZ256m, 0 },
+ { X86::VRSQRT14PSZr, X86::VRSQRT14PSZm, 0 },
+ { X86::VRSQRT28PDZr, X86::VRSQRT28PDZm, 0 },
+ { X86::VRSQRT28PSZr, X86::VRSQRT28PSZm, 0 },
+ { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 },
+ { X86::VRSQRTPSr, X86::VRSQRTPSm, 0 },
+ { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 },
+ { X86::VSQRTPDZ128r, X86::VSQRTPDZ128m, 0 },
+ { X86::VSQRTPDZ256r, X86::VSQRTPDZ256m, 0 },
+ { X86::VSQRTPDZr, X86::VSQRTPDZm, 0 },
+ { X86::VSQRTPDr, X86::VSQRTPDm, 0 },
+ { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 },
+ { X86::VSQRTPSZ128r, X86::VSQRTPSZ128m, 0 },
+ { X86::VSQRTPSZ256r, X86::VSQRTPSZ256m, 0 },
+ { X86::VSQRTPSZr, X86::VSQRTPSZm, 0 },
+ { X86::VSQRTPSr, X86::VSQRTPSm, 0 },
+ { X86::VTESTPDYrr, X86::VTESTPDYrm, 0 },
+ { X86::VTESTPDrr, X86::VTESTPDrm, 0 },
+ { X86::VTESTPSYrr, X86::VTESTPSYrm, 0 },
+ { X86::VTESTPSrr, X86::VTESTPSrm, 0 },
+ { X86::VUCOMISDZrr, X86::VUCOMISDZrm, 0 },
+ { X86::VUCOMISDZrr_Int, X86::VUCOMISDZrm_Int, TB_NO_REVERSE },
+ { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 },
+ { X86::VUCOMISDrr_Int, X86::VUCOMISDrm_Int, TB_NO_REVERSE },
+ { X86::VUCOMISSZrr, X86::VUCOMISSZrm, 0 },
+ { X86::VUCOMISSZrr_Int, X86::VUCOMISSZrm_Int, TB_NO_REVERSE },
+ { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 },
+ { X86::VUCOMISSrr_Int, X86::VUCOMISSrm_Int, TB_NO_REVERSE },
+};
+
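// --- Illustrative sketch (editorial note, not part of the diff above) -----
// Each X86MemoryFoldTableEntry in these tables pairs a register-form opcode
// with its memory-form equivalent plus folding flags (TB_NO_REVERSE,
// TB_ALIGN_16, ...). The tables are kept sorted by the register-form opcode
// so a lookup can binary-search them. Below is a minimal, hypothetical
// lookup helper along those lines; FoldEntry and lookupFold are stand-ins
// invented for this sketch, not the names used by the file being diffed.
#include <algorithm>
#include <cstdint>

struct FoldEntry {   // stand-in for X86MemoryFoldTableEntry
  uint16_t RegOp;    // register-form opcode
  uint16_t MemOp;    // memory-form opcode
  uint16_t Flags;    // TB_* flags controlling how the fold may be used
};

// Returns the entry whose RegOp matches, or nullptr if the opcode has no
// memory-folded form. Assumes [Begin, End) is sorted by RegOp.
const FoldEntry *lookupFold(const FoldEntry *Begin, const FoldEntry *End,
                            uint16_t RegOp) {
  const FoldEntry *It = std::lower_bound(
      Begin, End, RegOp,
      [](const FoldEntry &E, uint16_t Op) { return E.RegOp < Op; });
  return (It != End && It->RegOp == RegOp) ? It : nullptr;
}
// ---------------------------------------------------------------------------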
+static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
+ { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE },
+ { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE },
+ { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE },
+ { X86::ADD8rr_DB, X86::ADD8rm, TB_NO_REVERSE },
+ { X86::ADC16rr, X86::ADC16rm, 0 },
+ { X86::ADC32rr, X86::ADC32rm, 0 },
+ { X86::ADC64rr, X86::ADC64rm, 0 },
+ { X86::ADC8rr, X86::ADC8rm, 0 },
+ { X86::ADCX32rr, X86::ADCX32rm, 0 },
+ { X86::ADCX64rr, X86::ADCX64rm, 0 },
+ { X86::ADD16rr, X86::ADD16rm, 0 },
+ { X86::ADD32rr, X86::ADD32rm, 0 },
+ { X86::ADD64rr, X86::ADD64rm, 0 },
+ { X86::ADD8rr, X86::ADD8rm, 0 },
+ { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 },
+ { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 },
+ { X86::ADDSDrr, X86::ADDSDrm, 0 },
+ { X86::ADDSDrr_Int, X86::ADDSDrm_Int, TB_NO_REVERSE },
+ { X86::ADDSSrr, X86::ADDSSrm, 0 },
+ { X86::ADDSSrr_Int, X86::ADDSSrm_Int, TB_NO_REVERSE },
+ { X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 },
+ { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 },
+ { X86::ADOX32rr, X86::ADOX32rm, 0 },
+ { X86::ADOX64rr, X86::ADOX64rm, 0 },
+ { X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 },
+ { X86::AESDECrr, X86::AESDECrm, TB_ALIGN_16 },
+ { X86::AESENCLASTrr, X86::AESENCLASTrm, TB_ALIGN_16 },
+ { X86::AESENCrr, X86::AESENCrm, TB_ALIGN_16 },
+ { X86::AND16rr, X86::AND16rm, 0 },
+ { X86::AND32rr, X86::AND32rm, 0 },
+ { X86::AND64rr, X86::AND64rm, 0 },
+ { X86::AND8rr, X86::AND8rm, 0 },
+ { X86::ANDN32rr, X86::ANDN32rm, 0 },
+ { X86::ANDN64rr, X86::ANDN64rm, 0 },
+ { X86::ANDNPDrr, X86::ANDNPDrm, TB_ALIGN_16 },
+ { X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16 },
+ { X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16 },
+ { X86::ANDPSrr, X86::ANDPSrm, TB_ALIGN_16 },
+ { X86::BLENDPDrri, X86::BLENDPDrmi, TB_ALIGN_16 },
+ { X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 },
+ { X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 },
+ { X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 },
+ { X86::CMOV16rr, X86::CMOV16rm, 0 },
+ { X86::CMOV32rr, X86::CMOV32rm, 0 },
+ { X86::CMOV64rr, X86::CMOV64rm, 0 },
+ { X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 },
+ { X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 },
+ { X86::CMPSDrr, X86::CMPSDrm, 0 },
+ { X86::CMPSDrr_Int, X86::CMPSDrm_Int, TB_NO_REVERSE },
+ { X86::CMPSSrr, X86::CMPSSrm, 0 },
+ { X86::CMPSSrr_Int, X86::CMPSSrm_Int, TB_NO_REVERSE },
+ { X86::CRC32r32r16, X86::CRC32r32m16, 0 },
+ { X86::CRC32r32r32, X86::CRC32r32m32, 0 },
+ { X86::CRC32r32r8, X86::CRC32r32m8, 0 },
+ { X86::CRC32r64r64, X86::CRC32r64m64, 0 },
+ { X86::CRC32r64r8, X86::CRC32r64m8, 0 },
+ { X86::CVTSD2SSrr_Int, X86::CVTSD2SSrm_Int, TB_NO_REVERSE },
+ { X86::CVTSI2SDrr_Int, X86::CVTSI2SDrm_Int, 0 },
+ { X86::CVTSI2SSrr_Int, X86::CVTSI2SSrm_Int, 0 },
+ { X86::CVTSI642SDrr_Int, X86::CVTSI642SDrm_Int, 0 },
+ { X86::CVTSI642SSrr_Int, X86::CVTSI642SSrm_Int, 0 },
+ { X86::CVTSS2SDrr_Int, X86::CVTSS2SDrm_Int, TB_NO_REVERSE },
+ { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 },
+ { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 },
+ { X86::DIVSDrr, X86::DIVSDrm, 0 },
+ { X86::DIVSDrr_Int, X86::DIVSDrm_Int, TB_NO_REVERSE },
+ { X86::DIVSSrr, X86::DIVSSrm, 0 },
+ { X86::DIVSSrr_Int, X86::DIVSSrm_Int, TB_NO_REVERSE },
+ { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 },
+ { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 },
+ { X86::GF2P8AFFINEINVQBrri, X86::GF2P8AFFINEINVQBrmi, TB_ALIGN_16 },
+ { X86::GF2P8AFFINEQBrri, X86::GF2P8AFFINEQBrmi, TB_ALIGN_16 },
+ { X86::GF2P8MULBrr, X86::GF2P8MULBrm, TB_ALIGN_16 },
+ { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 },
+ { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 },
+ { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 },
+ { X86::HSUBPSrr, X86::HSUBPSrm, TB_ALIGN_16 },
+ { X86::IMUL16rr, X86::IMUL16rm, 0 },
+ { X86::IMUL32rr, X86::IMUL32rm, 0 },
+ { X86::IMUL64rr, X86::IMUL64rm, 0 },
+ { X86::MAXCPDrr, X86::MAXCPDrm, TB_ALIGN_16 },
+ { X86::MAXCPSrr, X86::MAXCPSrm, TB_ALIGN_16 },
+ { X86::MAXCSDrr, X86::MAXCSDrm, 0 },
+ { X86::MAXCSSrr, X86::MAXCSSrm, 0 },
+ { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 },
+ { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 },
+ { X86::MAXSDrr, X86::MAXSDrm, 0 },
+ { X86::MAXSDrr_Int, X86::MAXSDrm_Int, TB_NO_REVERSE },
+ { X86::MAXSSrr, X86::MAXSSrm, 0 },
+ { X86::MAXSSrr_Int, X86::MAXSSrm_Int, TB_NO_REVERSE },
+ { X86::MINCPDrr, X86::MINCPDrm, TB_ALIGN_16 },
+ { X86::MINCPSrr, X86::MINCPSrm, TB_ALIGN_16 },
+ { X86::MINCSDrr, X86::MINCSDrm, 0 },
+ { X86::MINCSSrr, X86::MINCSSrm, 0 },
+ { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 },
+ { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 },
+ { X86::MINSDrr, X86::MINSDrm, 0 },
+ { X86::MINSDrr_Int, X86::MINSDrm_Int, TB_NO_REVERSE },
+ { X86::MINSSrr, X86::MINSSrm, 0 },
+ { X86::MINSSrr_Int, X86::MINSSrm_Int, TB_NO_REVERSE },
+ { X86::MMX_CVTPI2PSirr, X86::MMX_CVTPI2PSirm, 0 },
+ { X86::MMX_PACKSSDWirr, X86::MMX_PACKSSDWirm, 0 },
+ { X86::MMX_PACKSSWBirr, X86::MMX_PACKSSWBirm, 0 },
+ { X86::MMX_PACKUSWBirr, X86::MMX_PACKUSWBirm, 0 },
+ { X86::MMX_PADDBirr, X86::MMX_PADDBirm, 0 },
+ { X86::MMX_PADDDirr, X86::MMX_PADDDirm, 0 },
+ { X86::MMX_PADDQirr, X86::MMX_PADDQirm, 0 },
+ { X86::MMX_PADDSBirr, X86::MMX_PADDSBirm, 0 },
+ { X86::MMX_PADDSWirr, X86::MMX_PADDSWirm, 0 },
+ { X86::MMX_PADDUSBirr, X86::MMX_PADDUSBirm, 0 },
+ { X86::MMX_PADDUSWirr, X86::MMX_PADDUSWirm, 0 },
+ { X86::MMX_PADDWirr, X86::MMX_PADDWirm, 0 },
+ { X86::MMX_PALIGNRrri, X86::MMX_PALIGNRrmi, 0 },
+ { X86::MMX_PANDNirr, X86::MMX_PANDNirm, 0 },
+ { X86::MMX_PANDirr, X86::MMX_PANDirm, 0 },
+ { X86::MMX_PAVGBirr, X86::MMX_PAVGBirm, 0 },
+ { X86::MMX_PAVGWirr, X86::MMX_PAVGWirm, 0 },
+ { X86::MMX_PCMPEQBirr, X86::MMX_PCMPEQBirm, 0 },
+ { X86::MMX_PCMPEQDirr, X86::MMX_PCMPEQDirm, 0 },
+ { X86::MMX_PCMPEQWirr, X86::MMX_PCMPEQWirm, 0 },
+ { X86::MMX_PCMPGTBirr, X86::MMX_PCMPGTBirm, 0 },
+ { X86::MMX_PCMPGTDirr, X86::MMX_PCMPGTDirm, 0 },
+ { X86::MMX_PCMPGTWirr, X86::MMX_PCMPGTWirm, 0 },
+ { X86::MMX_PHADDDrr, X86::MMX_PHADDDrm, 0 },
+ { X86::MMX_PHADDSWrr, X86::MMX_PHADDSWrm, 0 },
+ { X86::MMX_PHADDWrr, X86::MMX_PHADDWrm, 0 },
+ { X86::MMX_PHSUBDrr, X86::MMX_PHSUBDrm, 0 },
+ { X86::MMX_PHSUBSWrr, X86::MMX_PHSUBSWrm, 0 },
+ { X86::MMX_PHSUBWrr, X86::MMX_PHSUBWrm, 0 },
+ { X86::MMX_PINSRWrr, X86::MMX_PINSRWrm, TB_NO_REVERSE },
+ { X86::MMX_PMADDUBSWrr, X86::MMX_PMADDUBSWrm, 0 },
+ { X86::MMX_PMADDWDirr, X86::MMX_PMADDWDirm, 0 },
+ { X86::MMX_PMAXSWirr, X86::MMX_PMAXSWirm, 0 },
+ { X86::MMX_PMAXUBirr, X86::MMX_PMAXUBirm, 0 },
+ { X86::MMX_PMINSWirr, X86::MMX_PMINSWirm, 0 },
+ { X86::MMX_PMINUBirr, X86::MMX_PMINUBirm, 0 },
+ { X86::MMX_PMULHRSWrr, X86::MMX_PMULHRSWrm, 0 },
+ { X86::MMX_PMULHUWirr, X86::MMX_PMULHUWirm, 0 },
+ { X86::MMX_PMULHWirr, X86::MMX_PMULHWirm, 0 },
+ { X86::MMX_PMULLWirr, X86::MMX_PMULLWirm, 0 },
+ { X86::MMX_PMULUDQirr, X86::MMX_PMULUDQirm, 0 },
+ { X86::MMX_PORirr, X86::MMX_PORirm, 0 },
+ { X86::MMX_PSADBWirr, X86::MMX_PSADBWirm, 0 },
+ { X86::MMX_PSHUFBrr, X86::MMX_PSHUFBrm, 0 },
+ { X86::MMX_PSIGNBrr, X86::MMX_PSIGNBrm, 0 },
+ { X86::MMX_PSIGNDrr, X86::MMX_PSIGNDrm, 0 },
+ { X86::MMX_PSIGNWrr, X86::MMX_PSIGNWrm, 0 },
+ { X86::MMX_PSLLDrr, X86::MMX_PSLLDrm, 0 },
+ { X86::MMX_PSLLQrr, X86::MMX_PSLLQrm, 0 },
+ { X86::MMX_PSLLWrr, X86::MMX_PSLLWrm, 0 },
+ { X86::MMX_PSRADrr, X86::MMX_PSRADrm, 0 },
+ { X86::MMX_PSRAWrr, X86::MMX_PSRAWrm, 0 },
+ { X86::MMX_PSRLDrr, X86::MMX_PSRLDrm, 0 },
+ { X86::MMX_PSRLQrr, X86::MMX_PSRLQrm, 0 },
+ { X86::MMX_PSRLWrr, X86::MMX_PSRLWrm, 0 },
+ { X86::MMX_PSUBBirr, X86::MMX_PSUBBirm, 0 },
+ { X86::MMX_PSUBDirr, X86::MMX_PSUBDirm, 0 },
+ { X86::MMX_PSUBQirr, X86::MMX_PSUBQirm, 0 },
+ { X86::MMX_PSUBSBirr, X86::MMX_PSUBSBirm, 0 },
+ { X86::MMX_PSUBSWirr, X86::MMX_PSUBSWirm, 0 },
+ { X86::MMX_PSUBUSBirr, X86::MMX_PSUBUSBirm, 0 },
+ { X86::MMX_PSUBUSWirr, X86::MMX_PSUBUSWirm, 0 },
+ { X86::MMX_PSUBWirr, X86::MMX_PSUBWirm, 0 },
+ { X86::MMX_PUNPCKHBWirr, X86::MMX_PUNPCKHBWirm, 0 },
+ { X86::MMX_PUNPCKHDQirr, X86::MMX_PUNPCKHDQirm, 0 },
+ { X86::MMX_PUNPCKHWDirr, X86::MMX_PUNPCKHWDirm, 0 },
+ { X86::MMX_PUNPCKLBWirr, X86::MMX_PUNPCKLBWirm, TB_NO_REVERSE },
+ { X86::MMX_PUNPCKLDQirr, X86::MMX_PUNPCKLDQirm, TB_NO_REVERSE },
+ { X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, TB_NO_REVERSE },
+ { X86::MMX_PXORirr, X86::MMX_PXORirm, 0 },
+ { X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE },
+ { X86::MOVSDrr, X86::MOVLPDrm, TB_NO_REVERSE },
+ { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 },
+ { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 },
+ { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 },
+ { X86::MULSDrr, X86::MULSDrm, 0 },
+ { X86::MULSDrr_Int, X86::MULSDrm_Int, TB_NO_REVERSE },
+ { X86::MULSSrr, X86::MULSSrm, 0 },
+ { X86::MULSSrr_Int, X86::MULSSrm_Int, TB_NO_REVERSE },
+ { X86::MULX32rr, X86::MULX32rm, 0 },
+ { X86::MULX64rr, X86::MULX64rm, 0 },
+ { X86::OR16rr, X86::OR16rm, 0 },
+ { X86::OR32rr, X86::OR32rm, 0 },
+ { X86::OR64rr, X86::OR64rm, 0 },
+ { X86::OR8rr, X86::OR8rm, 0 },
+ { X86::ORPDrr, X86::ORPDrm, TB_ALIGN_16 },
+ { X86::ORPSrr, X86::ORPSrm, TB_ALIGN_16 },
+ { X86::PACKSSDWrr, X86::PACKSSDWrm, TB_ALIGN_16 },
+ { X86::PACKSSWBrr, X86::PACKSSWBrm, TB_ALIGN_16 },
+ { X86::PACKUSDWrr, X86::PACKUSDWrm, TB_ALIGN_16 },
+ { X86::PACKUSWBrr, X86::PACKUSWBrm, TB_ALIGN_16 },
+ { X86::PADDBrr, X86::PADDBrm, TB_ALIGN_16 },
+ { X86::PADDDrr, X86::PADDDrm, TB_ALIGN_16 },
+ { X86::PADDQrr, X86::PADDQrm, TB_ALIGN_16 },
+ { X86::PADDSBrr, X86::PADDSBrm, TB_ALIGN_16 },
+ { X86::PADDSWrr, X86::PADDSWrm, TB_ALIGN_16 },
+ { X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 },
+ { X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 },
+ { X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 },
+ { X86::PALIGNRrri, X86::PALIGNRrmi, TB_ALIGN_16 },
+ { X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 },
+ { X86::PANDrr, X86::PANDrm, TB_ALIGN_16 },
+ { X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 },
+ { X86::PAVGUSBrr, X86::PAVGUSBrm, 0 },
+ { X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 },
+ { X86::PBLENDVBrr0, X86::PBLENDVBrm0, TB_ALIGN_16 },
+ { X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 },
+ { X86::PCLMULQDQrr, X86::PCLMULQDQrm, TB_ALIGN_16 },
+ { X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 },
+ { X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 },
+ { X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 },
+ { X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 },
+ { X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 },
+ { X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 },
+ { X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16 },
+ { X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 },
+ { X86::PDEP32rr, X86::PDEP32rm, 0 },
+ { X86::PDEP64rr, X86::PDEP64rm, 0 },
+ { X86::PEXT32rr, X86::PEXT32rm, 0 },
+ { X86::PEXT64rr, X86::PEXT64rm, 0 },
+ { X86::PFACCrr, X86::PFACCrm, 0 },
+ { X86::PFADDrr, X86::PFADDrm, 0 },
+ { X86::PFCMPEQrr, X86::PFCMPEQrm, 0 },
+ { X86::PFCMPGErr, X86::PFCMPGErm, 0 },
+ { X86::PFCMPGTrr, X86::PFCMPGTrm, 0 },
+ { X86::PFMAXrr, X86::PFMAXrm, 0 },
+ { X86::PFMINrr, X86::PFMINrm, 0 },
+ { X86::PFMULrr, X86::PFMULrm, 0 },
+ { X86::PFNACCrr, X86::PFNACCrm, 0 },
+ { X86::PFPNACCrr, X86::PFPNACCrm, 0 },
+ { X86::PFRCPIT1rr, X86::PFRCPIT1rm, 0 },
+ { X86::PFRCPIT2rr, X86::PFRCPIT2rm, 0 },
+ { X86::PFRSQIT1rr, X86::PFRSQIT1rm, 0 },
+ { X86::PFSUBRrr, X86::PFSUBRrm, 0 },
+ { X86::PFSUBrr, X86::PFSUBrm, 0 },
+ { X86::PHADDDrr, X86::PHADDDrm, TB_ALIGN_16 },
+ { X86::PHADDSWrr, X86::PHADDSWrm, TB_ALIGN_16 },
+ { X86::PHADDWrr, X86::PHADDWrm, TB_ALIGN_16 },
+ { X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 },
+ { X86::PHSUBSWrr, X86::PHSUBSWrm, TB_ALIGN_16 },
+ { X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 },
+ { X86::PINSRBrr, X86::PINSRBrm, TB_NO_REVERSE },
+ { X86::PINSRDrr, X86::PINSRDrm, 0 },
+ { X86::PINSRQrr, X86::PINSRQrm, 0 },
+ { X86::PINSRWrr, X86::PINSRWrm, TB_NO_REVERSE },
+ { X86::PMADDUBSWrr, X86::PMADDUBSWrm, TB_ALIGN_16 },
+ { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 },
+ { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 },
+ { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 },
+ { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 },
+ { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 },
+ { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 },
+ { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 },
+ { X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 },
+ { X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 },
+ { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 },
+ { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 },
+ { X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 },
+ { X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 },
+ { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 },
+ { X86::PMULHRSWrr, X86::PMULHRSWrm, TB_ALIGN_16 },
+ { X86::PMULHRWrr, X86::PMULHRWrm, 0 },
+ { X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 },
+ { X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 },
+ { X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 },
+ { X86::PMULLWrr, X86::PMULLWrm, TB_ALIGN_16 },
+ { X86::PMULUDQrr, X86::PMULUDQrm, TB_ALIGN_16 },
+ { X86::PORrr, X86::PORrm, TB_ALIGN_16 },
+ { X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 },
+ { X86::PSHUFBrr, X86::PSHUFBrm, TB_ALIGN_16 },
+ { X86::PSIGNBrr, X86::PSIGNBrm, TB_ALIGN_16 },
+ { X86::PSIGNDrr, X86::PSIGNDrm, TB_ALIGN_16 },
+ { X86::PSIGNWrr, X86::PSIGNWrm, TB_ALIGN_16 },
+ { X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 },
+ { X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 },
+ { X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 },
+ { X86::PSRADrr, X86::PSRADrm, TB_ALIGN_16 },
+ { X86::PSRAWrr, X86::PSRAWrm, TB_ALIGN_16 },
+ { X86::PSRLDrr, X86::PSRLDrm, TB_ALIGN_16 },
+ { X86::PSRLQrr, X86::PSRLQrm, TB_ALIGN_16 },
+ { X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 },
+ { X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 },
+ { X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 },
+ { X86::PSUBQrr, X86::PSUBQrm, TB_ALIGN_16 },
+ { X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 },
+ { X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 },
+ { X86::PSUBUSBrr, X86::PSUBUSBrm, TB_ALIGN_16 },
+ { X86::PSUBUSWrr, X86::PSUBUSWrm, TB_ALIGN_16 },
+ { X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 },
+ { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 },
+ { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, TB_ALIGN_16 },
+ { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, TB_ALIGN_16 },
+ { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 },
+ { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 },
+ { X86::RCPSSr_Int, X86::RCPSSm_Int, TB_NO_REVERSE },
+ { X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, TB_NO_REVERSE },
+ { X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, TB_NO_REVERSE },
+ { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE },
+ { X86::SBB16rr, X86::SBB16rm, 0 },
+ { X86::SBB32rr, X86::SBB32rm, 0 },
+ { X86::SBB64rr, X86::SBB64rm, 0 },
+ { X86::SBB8rr, X86::SBB8rm, 0 },
+ { X86::SHA1MSG1rr, X86::SHA1MSG1rm, TB_ALIGN_16 },
+ { X86::SHA1MSG2rr, X86::SHA1MSG2rm, TB_ALIGN_16 },
+ { X86::SHA1NEXTErr, X86::SHA1NEXTErm, TB_ALIGN_16 },
+ { X86::SHA1RNDS4rri, X86::SHA1RNDS4rmi, TB_ALIGN_16 },
+ { X86::SHA256MSG1rr, X86::SHA256MSG1rm, TB_ALIGN_16 },
+ { X86::SHA256MSG2rr, X86::SHA256MSG2rm, TB_ALIGN_16 },
+ { X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 },
+ { X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 },
+ { X86::SHUFPSrri, X86::SHUFPSrmi, TB_ALIGN_16 },
+ { X86::SQRTSDr_Int, X86::SQRTSDm_Int, TB_NO_REVERSE },
+ { X86::SQRTSSr_Int, X86::SQRTSSm_Int, TB_NO_REVERSE },
+ { X86::SUB16rr, X86::SUB16rm, 0 },
+ { X86::SUB32rr, X86::SUB32rm, 0 },
+ { X86::SUB64rr, X86::SUB64rm, 0 },
+ { X86::SUB8rr, X86::SUB8rm, 0 },
+ { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 },
+ { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 },
+ { X86::SUBSDrr, X86::SUBSDrm, 0 },
+ { X86::SUBSDrr_Int, X86::SUBSDrm_Int, TB_NO_REVERSE },
+ { X86::SUBSSrr, X86::SUBSSrm, 0 },
+ { X86::SUBSSrr_Int, X86::SUBSSrm_Int, TB_NO_REVERSE },
+ { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 },
+ { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 },
+ { X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 },
+ { X86::UNPCKLPSrr, X86::UNPCKLPSrm, TB_ALIGN_16 },
+ { X86::VADDPDYrr, X86::VADDPDYrm, 0 },
+ { X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0 },
+ { X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 },
+ { X86::VADDPDZrr, X86::VADDPDZrm, 0 },
+ { X86::VADDPDrr, X86::VADDPDrm, 0 },
+ { X86::VADDPSYrr, X86::VADDPSYrm, 0 },
+ { X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 },
+ { X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 },
+ { X86::VADDPSZrr, X86::VADDPSZrm, 0 },
+ { X86::VADDPSrr, X86::VADDPSrm, 0 },
+ { X86::VADDSDZrr, X86::VADDSDZrm, 0 },
+ { X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, TB_NO_REVERSE },
+ { X86::VADDSDrr, X86::VADDSDrm, 0 },
+ { X86::VADDSDrr_Int, X86::VADDSDrm_Int, TB_NO_REVERSE },
+ { X86::VADDSSZrr, X86::VADDSSZrm, 0 },
+ { X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, TB_NO_REVERSE },
+ { X86::VADDSSrr, X86::VADDSSrm, 0 },
+ { X86::VADDSSrr_Int, X86::VADDSSrm_Int, TB_NO_REVERSE },
+ { X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, 0 },
+ { X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 },
+ { X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, 0 },
+ { X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 },
+ { X86::VAESDECLASTYrr, X86::VAESDECLASTYrm, 0 },
+ { X86::VAESDECLASTZ128rr, X86::VAESDECLASTZ128rm, 0 },
+ { X86::VAESDECLASTZ256rr, X86::VAESDECLASTZ256rm, 0 },
+ { X86::VAESDECLASTZrr, X86::VAESDECLASTZrm, 0 },
+ { X86::VAESDECLASTrr, X86::VAESDECLASTrm, 0 },
+ { X86::VAESDECYrr, X86::VAESDECYrm, 0 },
+ { X86::VAESDECZ128rr, X86::VAESDECZ128rm, 0 },
+ { X86::VAESDECZ256rr, X86::VAESDECZ256rm, 0 },
+ { X86::VAESDECZrr, X86::VAESDECZrm, 0 },
+ { X86::VAESDECrr, X86::VAESDECrm, 0 },
+ { X86::VAESENCLASTYrr, X86::VAESENCLASTYrm, 0 },
+ { X86::VAESENCLASTZ128rr, X86::VAESENCLASTZ128rm, 0 },
+ { X86::VAESENCLASTZ256rr, X86::VAESENCLASTZ256rm, 0 },
+ { X86::VAESENCLASTZrr, X86::VAESENCLASTZrm, 0 },
+ { X86::VAESENCLASTrr, X86::VAESENCLASTrm, 0 },
+ { X86::VAESENCYrr, X86::VAESENCYrm, 0 },
+ { X86::VAESENCZ128rr, X86::VAESENCZ128rm, 0 },
+ { X86::VAESENCZ256rr, X86::VAESENCZ256rm, 0 },
+ { X86::VAESENCZrr, X86::VAESENCZrm, 0 },
+ { X86::VAESENCrr, X86::VAESENCrm, 0 },
+ { X86::VALIGNDZ128rri, X86::VALIGNDZ128rmi, 0 },
+ { X86::VALIGNDZ256rri, X86::VALIGNDZ256rmi, 0 },
+ { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 },
+ { X86::VALIGNQZ128rri, X86::VALIGNQZ128rmi, 0 },
+ { X86::VALIGNQZ256rri, X86::VALIGNQZ256rmi, 0 },
+ { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 },
+ { X86::VANDNPDYrr, X86::VANDNPDYrm, 0 },
+ { X86::VANDNPDZ128rr, X86::VANDNPDZ128rm, 0 },
+ { X86::VANDNPDZ256rr, X86::VANDNPDZ256rm, 0 },
+ { X86::VANDNPDZrr, X86::VANDNPDZrm, 0 },
+ { X86::VANDNPDrr, X86::VANDNPDrm, 0 },
+ { X86::VANDNPSYrr, X86::VANDNPSYrm, 0 },
+ { X86::VANDNPSZ128rr, X86::VANDNPSZ128rm, 0 },
+ { X86::VANDNPSZ256rr, X86::VANDNPSZ256rm, 0 },
+ { X86::VANDNPSZrr, X86::VANDNPSZrm, 0 },
+ { X86::VANDNPSrr, X86::VANDNPSrm, 0 },
+ { X86::VANDPDYrr, X86::VANDPDYrm, 0 },
+ { X86::VANDPDZ128rr, X86::VANDPDZ128rm, 0 },
+ { X86::VANDPDZ256rr, X86::VANDPDZ256rm, 0 },
+ { X86::VANDPDZrr, X86::VANDPDZrm, 0 },
+ { X86::VANDPDrr, X86::VANDPDrm, 0 },
+ { X86::VANDPSYrr, X86::VANDPSYrm, 0 },
+ { X86::VANDPSZ128rr, X86::VANDPSZ128rm, 0 },
+ { X86::VANDPSZ256rr, X86::VANDPSZ256rm, 0 },
+ { X86::VANDPSZrr, X86::VANDPSZrm, 0 },
+ { X86::VANDPSrr, X86::VANDPSrm, 0 },
+ { X86::VBLENDMPDZ128rr, X86::VBLENDMPDZ128rm, 0 },
+ { X86::VBLENDMPDZ256rr, X86::VBLENDMPDZ256rm, 0 },
+ { X86::VBLENDMPDZrr, X86::VBLENDMPDZrm, 0 },
+ { X86::VBLENDMPSZ128rr, X86::VBLENDMPSZ128rm, 0 },
+ { X86::VBLENDMPSZ256rr, X86::VBLENDMPSZ256rm, 0 },
+ { X86::VBLENDMPSZrr, X86::VBLENDMPSZrm, 0 },
+ { X86::VBLENDPDYrri, X86::VBLENDPDYrmi, 0 },
+ { X86::VBLENDPDrri, X86::VBLENDPDrmi, 0 },
+ { X86::VBLENDPSYrri, X86::VBLENDPSYrmi, 0 },
+ { X86::VBLENDPSrri, X86::VBLENDPSrmi, 0 },
+ { X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, 0 },
+ { X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 },
+ { X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 },
+ { X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 },
+ { X86::VBROADCASTF32X2Z256rrkz, X86::VBROADCASTF32X2Z256rmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTF32X2Zrrkz, X86::VBROADCASTF32X2Zrmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z128rrkz, X86::VBROADCASTI32X2Z128rmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z256rrkz, X86::VBROADCASTI32X2Z256rmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Zrrkz, X86::VBROADCASTI32X2Zrmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256rrkz, X86::VBROADCASTSDZ256rmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZrrkz, X86::VBROADCASTSDZrmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ128rrkz, X86::VBROADCASTSSZ128rmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256rrkz, X86::VBROADCASTSSZ256rmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZrrkz, X86::VBROADCASTSSZrmkz, TB_NO_REVERSE },
+ { X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 },
+ { X86::VCMPPDZ128rri, X86::VCMPPDZ128rmi, 0 },
+ { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 },
+ { X86::VCMPPDZrri, X86::VCMPPDZrmi, 0 },
+ { X86::VCMPPDrri, X86::VCMPPDrmi, 0 },
+ { X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 },
+ { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmi, 0 },
+ { X86::VCMPPSZ256rri, X86::VCMPPSZ256rmi, 0 },
+ { X86::VCMPPSZrri, X86::VCMPPSZrmi, 0 },
+ { X86::VCMPPSrri, X86::VCMPPSrmi, 0 },
+ { X86::VCMPSDZrr, X86::VCMPSDZrm, 0 },
+ { X86::VCMPSDZrr_Int, X86::VCMPSDZrm_Int, TB_NO_REVERSE },
+ { X86::VCMPSDrr, X86::VCMPSDrm, 0 },
+ { X86::VCMPSDrr_Int, X86::VCMPSDrm_Int, TB_NO_REVERSE },
+ { X86::VCMPSSZrr, X86::VCMPSSZrm, 0 },
+ { X86::VCMPSSZrr_Int, X86::VCMPSSZrm_Int, TB_NO_REVERSE },
+ { X86::VCMPSSrr, X86::VCMPSSrm, 0 },
+ { X86::VCMPSSrr_Int, X86::VCMPSSrm_Int, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDZ128rrkz, X86::VCVTDQ2PDZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDZ256rrkz, X86::VCVTDQ2PDZ256rmkz, 0 },
+ { X86::VCVTDQ2PDZrrkz, X86::VCVTDQ2PDZrmkz, 0 },
+ { X86::VCVTDQ2PSZ128rrkz, X86::VCVTDQ2PSZ128rmkz, 0 },
+ { X86::VCVTDQ2PSZ256rrkz, X86::VCVTDQ2PSZ256rmkz, 0 },
+ { X86::VCVTDQ2PSZrrkz, X86::VCVTDQ2PSZrmkz, 0 },
+ { X86::VCVTNE2PS2BF16Z128rr, X86::VCVTNE2PS2BF16Z128rm, 0 },
+ { X86::VCVTNE2PS2BF16Z256rr, X86::VCVTNE2PS2BF16Z256rm, 0 },
+ { X86::VCVTNE2PS2BF16Zrr, X86::VCVTNE2PS2BF16Zrm, 0 },
+ { X86::VCVTNEPS2BF16Z128rrkz, X86::VCVTNEPS2BF16Z128rmkz, 0 },
+ { X86::VCVTNEPS2BF16Z256rrkz, X86::VCVTNEPS2BF16Z256rmkz, 0 },
+ { X86::VCVTNEPS2BF16Zrrkz, X86::VCVTNEPS2BF16Zrmkz, 0 },
+ { X86::VCVTPD2DQZ128rrkz, X86::VCVTPD2DQZ128rmkz, 0 },
+ { X86::VCVTPD2DQZ256rrkz, X86::VCVTPD2DQZ256rmkz, 0 },
+ { X86::VCVTPD2DQZrrkz, X86::VCVTPD2DQZrmkz, 0 },
+ { X86::VCVTPD2PSZ128rrkz, X86::VCVTPD2PSZ128rmkz, 0 },
+ { X86::VCVTPD2PSZ256rrkz, X86::VCVTPD2PSZ256rmkz, 0 },
+ { X86::VCVTPD2PSZrrkz, X86::VCVTPD2PSZrmkz, 0 },
+ { X86::VCVTPD2QQZ128rrkz, X86::VCVTPD2QQZ128rmkz, 0 },
+ { X86::VCVTPD2QQZ256rrkz, X86::VCVTPD2QQZ256rmkz, 0 },
+ { X86::VCVTPD2QQZrrkz, X86::VCVTPD2QQZrmkz, 0 },
+ { X86::VCVTPD2UDQZ128rrkz, X86::VCVTPD2UDQZ128rmkz, 0 },
+ { X86::VCVTPD2UDQZ256rrkz, X86::VCVTPD2UDQZ256rmkz, 0 },
+ { X86::VCVTPD2UDQZrrkz, X86::VCVTPD2UDQZrmkz, 0 },
+ { X86::VCVTPD2UQQZ128rrkz, X86::VCVTPD2UQQZ128rmkz, 0 },
+ { X86::VCVTPD2UQQZ256rrkz, X86::VCVTPD2UQQZ256rmkz, 0 },
+ { X86::VCVTPD2UQQZrrkz, X86::VCVTPD2UQQZrmkz, 0 },
+ { X86::VCVTPH2PSZ128rrkz, X86::VCVTPH2PSZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTPH2PSZ256rrkz, X86::VCVTPH2PSZ256rmkz, 0 },
+ { X86::VCVTPH2PSZrrkz, X86::VCVTPH2PSZrmkz, 0 },
+ { X86::VCVTPS2DQZ128rrkz, X86::VCVTPS2DQZ128rmkz, 0 },
+ { X86::VCVTPS2DQZ256rrkz, X86::VCVTPS2DQZ256rmkz, 0 },
+ { X86::VCVTPS2DQZrrkz, X86::VCVTPS2DQZrmkz, 0 },
+ { X86::VCVTPS2PDZ128rrkz, X86::VCVTPS2PDZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTPS2PDZ256rrkz, X86::VCVTPS2PDZ256rmkz, 0 },
+ { X86::VCVTPS2PDZrrkz, X86::VCVTPS2PDZrmkz, 0 },
+ { X86::VCVTPS2QQZ128rrkz, X86::VCVTPS2QQZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTPS2QQZ256rrkz, X86::VCVTPS2QQZ256rmkz, 0 },
+ { X86::VCVTPS2QQZrrkz, X86::VCVTPS2QQZrmkz, 0 },
+ { X86::VCVTPS2UDQZ128rrkz, X86::VCVTPS2UDQZ128rmkz, 0 },
+ { X86::VCVTPS2UDQZ256rrkz, X86::VCVTPS2UDQZ256rmkz, 0 },
+ { X86::VCVTPS2UDQZrrkz, X86::VCVTPS2UDQZrmkz, 0 },
+ { X86::VCVTPS2UQQZ128rrkz, X86::VCVTPS2UQQZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTPS2UQQZ256rrkz, X86::VCVTPS2UQQZ256rmkz, 0 },
+ { X86::VCVTPS2UQQZrrkz, X86::VCVTPS2UQQZrmkz, 0 },
+ { X86::VCVTQQ2PDZ128rrkz, X86::VCVTQQ2PDZ128rmkz, 0 },
+ { X86::VCVTQQ2PDZ256rrkz, X86::VCVTQQ2PDZ256rmkz, 0 },
+ { X86::VCVTQQ2PDZrrkz, X86::VCVTQQ2PDZrmkz, 0 },
+ { X86::VCVTQQ2PSZ128rrkz, X86::VCVTQQ2PSZ128rmkz, 0 },
+ { X86::VCVTQQ2PSZ256rrkz, X86::VCVTQQ2PSZ256rmkz, 0 },
+ { X86::VCVTQQ2PSZrrkz, X86::VCVTQQ2PSZrmkz, 0 },
+ { X86::VCVTSD2SSZrr, X86::VCVTSD2SSZrm, 0 },
+ { X86::VCVTSD2SSZrr_Int, X86::VCVTSD2SSZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 },
+ { X86::VCVTSD2SSrr_Int, X86::VCVTSD2SSrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSI2SDZrr, X86::VCVTSI2SDZrm, 0 },
+ { X86::VCVTSI2SDZrr_Int, X86::VCVTSI2SDZrm_Int, 0 },
+ { X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 },
+ { X86::VCVTSI2SDrr_Int, X86::VCVTSI2SDrm_Int, 0 },
+ { X86::VCVTSI2SSZrr, X86::VCVTSI2SSZrm, 0 },
+ { X86::VCVTSI2SSZrr_Int, X86::VCVTSI2SSZrm_Int, 0 },
+ { X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 },
+ { X86::VCVTSI2SSrr_Int, X86::VCVTSI2SSrm_Int, 0 },
+ { X86::VCVTSI642SDZrr, X86::VCVTSI642SDZrm, 0 },
+ { X86::VCVTSI642SDZrr_Int, X86::VCVTSI642SDZrm_Int, 0 },
+ { X86::VCVTSI642SDrr, X86::VCVTSI642SDrm, 0 },
+ { X86::VCVTSI642SDrr_Int, X86::VCVTSI642SDrm_Int, 0 },
+ { X86::VCVTSI642SSZrr, X86::VCVTSI642SSZrm, 0 },
+ { X86::VCVTSI642SSZrr_Int, X86::VCVTSI642SSZrm_Int, 0 },
+ { X86::VCVTSI642SSrr, X86::VCVTSI642SSrm, 0 },
+ { X86::VCVTSI642SSrr_Int, X86::VCVTSI642SSrm_Int, 0 },
+ { X86::VCVTSS2SDZrr, X86::VCVTSS2SDZrm, 0 },
+ { X86::VCVTSS2SDZrr_Int, X86::VCVTSS2SDZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 },
+ { X86::VCVTSS2SDrr_Int, X86::VCVTSS2SDrm_Int, TB_NO_REVERSE },
+ { X86::VCVTTPD2DQZ128rrkz, X86::VCVTTPD2DQZ128rmkz, 0 },
+ { X86::VCVTTPD2DQZ256rrkz, X86::VCVTTPD2DQZ256rmkz, 0 },
+ { X86::VCVTTPD2DQZrrkz, X86::VCVTTPD2DQZrmkz, 0 },
+ { X86::VCVTTPD2QQZ128rrkz, X86::VCVTTPD2QQZ128rmkz, 0 },
+ { X86::VCVTTPD2QQZ256rrkz, X86::VCVTTPD2QQZ256rmkz, 0 },
+ { X86::VCVTTPD2QQZrrkz, X86::VCVTTPD2QQZrmkz, 0 },
+ { X86::VCVTTPD2UDQZ128rrkz, X86::VCVTTPD2UDQZ128rmkz, 0 },
+ { X86::VCVTTPD2UDQZ256rrkz, X86::VCVTTPD2UDQZ256rmkz, 0 },
+ { X86::VCVTTPD2UDQZrrkz, X86::VCVTTPD2UDQZrmkz, 0 },
+ { X86::VCVTTPD2UQQZ128rrkz, X86::VCVTTPD2UQQZ128rmkz, 0 },
+ { X86::VCVTTPD2UQQZ256rrkz, X86::VCVTTPD2UQQZ256rmkz, 0 },
+ { X86::VCVTTPD2UQQZrrkz, X86::VCVTTPD2UQQZrmkz, 0 },
+ { X86::VCVTTPS2DQZ128rrkz, X86::VCVTTPS2DQZ128rmkz, 0 },
+ { X86::VCVTTPS2DQZ256rrkz, X86::VCVTTPS2DQZ256rmkz, 0 },
+ { X86::VCVTTPS2DQZrrkz, X86::VCVTTPS2DQZrmkz, 0 },
+ { X86::VCVTTPS2QQZ128rrkz, X86::VCVTTPS2QQZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTTPS2QQZ256rrkz, X86::VCVTTPS2QQZ256rmkz, 0 },
+ { X86::VCVTTPS2QQZrrkz, X86::VCVTTPS2QQZrmkz, 0 },
+ { X86::VCVTTPS2UDQZ128rrkz, X86::VCVTTPS2UDQZ128rmkz, 0 },
+ { X86::VCVTTPS2UDQZ256rrkz, X86::VCVTTPS2UDQZ256rmkz, 0 },
+ { X86::VCVTTPS2UDQZrrkz, X86::VCVTTPS2UDQZrmkz, 0 },
+ { X86::VCVTTPS2UQQZ128rrkz, X86::VCVTTPS2UQQZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTTPS2UQQZ256rrkz, X86::VCVTTPS2UQQZ256rmkz, 0 },
+ { X86::VCVTTPS2UQQZrrkz, X86::VCVTTPS2UQQZrmkz, 0 },
+ { X86::VCVTUDQ2PDZ128rrkz, X86::VCVTUDQ2PDZ128rmkz, TB_NO_REVERSE },
+ { X86::VCVTUDQ2PDZ256rrkz, X86::VCVTUDQ2PDZ256rmkz, 0 },
+ { X86::VCVTUDQ2PDZrrkz, X86::VCVTUDQ2PDZrmkz, 0 },
+ { X86::VCVTUDQ2PSZ128rrkz, X86::VCVTUDQ2PSZ128rmkz, 0 },
+ { X86::VCVTUDQ2PSZ256rrkz, X86::VCVTUDQ2PSZ256rmkz, 0 },
+ { X86::VCVTUDQ2PSZrrkz, X86::VCVTUDQ2PSZrmkz, 0 },
+ { X86::VCVTUQQ2PDZ128rrkz, X86::VCVTUQQ2PDZ128rmkz, 0 },
+ { X86::VCVTUQQ2PDZ256rrkz, X86::VCVTUQQ2PDZ256rmkz, 0 },
+ { X86::VCVTUQQ2PDZrrkz, X86::VCVTUQQ2PDZrmkz, 0 },
+ { X86::VCVTUQQ2PSZ128rrkz, X86::VCVTUQQ2PSZ128rmkz, 0 },
+ { X86::VCVTUQQ2PSZ256rrkz, X86::VCVTUQQ2PSZ256rmkz, 0 },
+ { X86::VCVTUQQ2PSZrrkz, X86::VCVTUQQ2PSZrmkz, 0 },
+ { X86::VCVTUSI2SDZrr, X86::VCVTUSI2SDZrm, 0 },
+ { X86::VCVTUSI2SDZrr_Int, X86::VCVTUSI2SDZrm_Int, 0 },
+ { X86::VCVTUSI2SSZrr, X86::VCVTUSI2SSZrm, 0 },
+ { X86::VCVTUSI2SSZrr_Int, X86::VCVTUSI2SSZrm_Int, 0 },
+ { X86::VCVTUSI642SDZrr, X86::VCVTUSI642SDZrm, 0 },
+ { X86::VCVTUSI642SDZrr_Int, X86::VCVTUSI642SDZrm_Int, 0 },
+ { X86::VCVTUSI642SSZrr, X86::VCVTUSI642SSZrm, 0 },
+ { X86::VCVTUSI642SSZrr_Int, X86::VCVTUSI642SSZrm_Int, 0 },
+ { X86::VDBPSADBWZ128rri, X86::VDBPSADBWZ128rmi, 0 },
+ { X86::VDBPSADBWZ256rri, X86::VDBPSADBWZ256rmi, 0 },
+ { X86::VDBPSADBWZrri, X86::VDBPSADBWZrmi, 0 },
+ { X86::VDIVPDYrr, X86::VDIVPDYrm, 0 },
+ { X86::VDIVPDZ128rr, X86::VDIVPDZ128rm, 0 },
+ { X86::VDIVPDZ256rr, X86::VDIVPDZ256rm, 0 },
+ { X86::VDIVPDZrr, X86::VDIVPDZrm, 0 },
+ { X86::VDIVPDrr, X86::VDIVPDrm, 0 },
+ { X86::VDIVPSYrr, X86::VDIVPSYrm, 0 },
+ { X86::VDIVPSZ128rr, X86::VDIVPSZ128rm, 0 },
+ { X86::VDIVPSZ256rr, X86::VDIVPSZ256rm, 0 },
+ { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 },
+ { X86::VDIVPSrr, X86::VDIVPSrm, 0 },
+ { X86::VDIVSDZrr, X86::VDIVSDZrm, 0 },
+ { X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, TB_NO_REVERSE },
+ { X86::VDIVSDrr, X86::VDIVSDrm, 0 },
+ { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, TB_NO_REVERSE },
+ { X86::VDIVSSZrr, X86::VDIVSSZrm, 0 },
+ { X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, TB_NO_REVERSE },
+ { X86::VDIVSSrr, X86::VDIVSSrm, 0 },
+ { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, TB_NO_REVERSE },
+ { X86::VDPPDrri, X86::VDPPDrmi, 0 },
+ { X86::VDPPSYrri, X86::VDPPSYrmi, 0 },
+ { X86::VDPPSrri, X86::VDPPSrmi, 0 },
+ { X86::VEXP2PDZrkz, X86::VEXP2PDZmkz, 0 },
+ { X86::VEXP2PSZrkz, X86::VEXP2PSZmkz, 0 },
+ { X86::VEXPANDPDZ128rrkz, X86::VEXPANDPDZ128rmkz, TB_NO_REVERSE },
+ { X86::VEXPANDPDZ256rrkz, X86::VEXPANDPDZ256rmkz, TB_NO_REVERSE },
+ { X86::VEXPANDPDZrrkz, X86::VEXPANDPDZrmkz, TB_NO_REVERSE },
+ { X86::VEXPANDPSZ128rrkz, X86::VEXPANDPSZ128rmkz, TB_NO_REVERSE },
+ { X86::VEXPANDPSZ256rrkz, X86::VEXPANDPSZ256rmkz, TB_NO_REVERSE },
+ { X86::VEXPANDPSZrrkz, X86::VEXPANDPSZrmkz, TB_NO_REVERSE },
+ { X86::VFMADDPD4Yrr, X86::VFMADDPD4Ymr, 0 },
+ { X86::VFMADDPD4rr, X86::VFMADDPD4mr, 0 },
+ { X86::VFMADDPS4Yrr, X86::VFMADDPS4Ymr, 0 },
+ { X86::VFMADDPS4rr, X86::VFMADDPS4mr, 0 },
+ { X86::VFMADDSD4rr, X86::VFMADDSD4mr, 0 },
+ { X86::VFMADDSD4rr_Int, X86::VFMADDSD4mr_Int, TB_NO_REVERSE },
+ { X86::VFMADDSS4rr, X86::VFMADDSS4mr, 0 },
+ { X86::VFMADDSS4rr_Int, X86::VFMADDSS4mr_Int, TB_NO_REVERSE },
+ { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Ymr, 0 },
+ { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, 0 },
+ { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Ymr, 0 },
+ { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, 0 },
+ { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Ymr, 0 },
+ { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, 0 },
+ { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Ymr, 0 },
+ { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, 0 },
+ { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Ymr, 0 },
+ { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, 0 },
+ { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Ymr, 0 },
+ { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, 0 },
+ { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, 0 },
+ { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4mr_Int, TB_NO_REVERSE },
+ { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, 0 },
+ { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4mr_Int, TB_NO_REVERSE },
+ { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Ymr, 0 },
+ { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, 0 },
+ { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Ymr, 0 },
+ { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, 0 },
+ { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, 0 },
+ { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4mr_Int, TB_NO_REVERSE },
+ { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, 0 },
+ { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4mr_Int, TB_NO_REVERSE },
+ { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Ymr, 0 },
+ { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, 0 },
+ { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Ymr, 0 },
+ { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, 0 },
+ { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, 0 },
+ { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4mr_Int, TB_NO_REVERSE },
+ { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, 0 },
+ { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4mr_Int, TB_NO_REVERSE },
+ { X86::VFPCLASSPDZ128rrk, X86::VFPCLASSPDZ128rmk, 0 },
+ { X86::VFPCLASSPDZ256rrk, X86::VFPCLASSPDZ256rmk, 0 },
+ { X86::VFPCLASSPDZrrk, X86::VFPCLASSPDZrmk, 0 },
+ { X86::VFPCLASSPSZ128rrk, X86::VFPCLASSPSZ128rmk, 0 },
+ { X86::VFPCLASSPSZ256rrk, X86::VFPCLASSPSZ256rmk, 0 },
+ { X86::VFPCLASSPSZrrk, X86::VFPCLASSPSZrmk, 0 },
+ { X86::VFPCLASSSDZrrk, X86::VFPCLASSSDZrmk, TB_NO_REVERSE },
+ { X86::VFPCLASSSSZrrk, X86::VFPCLASSSSZrmk, TB_NO_REVERSE },
+ { X86::VGETEXPPDZ128rkz, X86::VGETEXPPDZ128mkz, 0 },
+ { X86::VGETEXPPDZ256rkz, X86::VGETEXPPDZ256mkz, 0 },
+ { X86::VGETEXPPDZrkz, X86::VGETEXPPDZmkz, 0 },
+ { X86::VGETEXPPSZ128rkz, X86::VGETEXPPSZ128mkz, 0 },
+ { X86::VGETEXPPSZ256rkz, X86::VGETEXPPSZ256mkz, 0 },
+ { X86::VGETEXPPSZrkz, X86::VGETEXPPSZmkz, 0 },
+ { X86::VGETEXPSDZr, X86::VGETEXPSDZm, TB_NO_REVERSE },
+ { X86::VGETEXPSSZr, X86::VGETEXPSSZm, TB_NO_REVERSE },
+ { X86::VGETMANTPDZ128rrikz, X86::VGETMANTPDZ128rmikz, 0 },
+ { X86::VGETMANTPDZ256rrikz, X86::VGETMANTPDZ256rmikz, 0 },
+ { X86::VGETMANTPDZrrikz, X86::VGETMANTPDZrmikz, 0 },
+ { X86::VGETMANTPSZ128rrikz, X86::VGETMANTPSZ128rmikz, 0 },
+ { X86::VGETMANTPSZ256rrikz, X86::VGETMANTPSZ256rmikz, 0 },
+ { X86::VGETMANTPSZrrikz, X86::VGETMANTPSZrmikz, 0 },
+ { X86::VGETMANTSDZrri, X86::VGETMANTSDZrmi, TB_NO_REVERSE },
+ { X86::VGETMANTSSZrri, X86::VGETMANTSSZrmi, TB_NO_REVERSE },
+ { X86::VGF2P8AFFINEINVQBYrri, X86::VGF2P8AFFINEINVQBYrmi, 0 },
+ { X86::VGF2P8AFFINEINVQBZ128rri, X86::VGF2P8AFFINEINVQBZ128rmi, 0 },
+ { X86::VGF2P8AFFINEINVQBZ256rri, X86::VGF2P8AFFINEINVQBZ256rmi, 0 },
+ { X86::VGF2P8AFFINEINVQBZrri, X86::VGF2P8AFFINEINVQBZrmi, 0 },
+ { X86::VGF2P8AFFINEINVQBrri, X86::VGF2P8AFFINEINVQBrmi, 0 },
+ { X86::VGF2P8AFFINEQBYrri, X86::VGF2P8AFFINEQBYrmi, 0 },
+ { X86::VGF2P8AFFINEQBZ128rri, X86::VGF2P8AFFINEQBZ128rmi, 0 },
+ { X86::VGF2P8AFFINEQBZ256rri, X86::VGF2P8AFFINEQBZ256rmi, 0 },
+ { X86::VGF2P8AFFINEQBZrri, X86::VGF2P8AFFINEQBZrmi, 0 },
+ { X86::VGF2P8AFFINEQBrri, X86::VGF2P8AFFINEQBrmi, 0 },
+ { X86::VGF2P8MULBYrr, X86::VGF2P8MULBYrm, 0 },
+ { X86::VGF2P8MULBZ128rr, X86::VGF2P8MULBZ128rm, 0 },
+ { X86::VGF2P8MULBZ256rr, X86::VGF2P8MULBZ256rm, 0 },
+ { X86::VGF2P8MULBZrr, X86::VGF2P8MULBZrm, 0 },
+ { X86::VGF2P8MULBrr, X86::VGF2P8MULBrm, 0 },
+ { X86::VHADDPDYrr, X86::VHADDPDYrm, 0 },
+ { X86::VHADDPDrr, X86::VHADDPDrm, 0 },
+ { X86::VHADDPSYrr, X86::VHADDPSYrm, 0 },
+ { X86::VHADDPSrr, X86::VHADDPSrm, 0 },
+ { X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 },
+ { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 },
+ { X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 },
+ { X86::VHSUBPSrr, X86::VHSUBPSrm, 0 },
+ { X86::VINSERTF128rr, X86::VINSERTF128rm, 0 },
+ { X86::VINSERTF32x4Z256rr, X86::VINSERTF32x4Z256rm, 0 },
+ { X86::VINSERTF32x4Zrr, X86::VINSERTF32x4Zrm, 0 },
+ { X86::VINSERTF32x8Zrr, X86::VINSERTF32x8Zrm, 0 },
+ { X86::VINSERTF64x2Z256rr, X86::VINSERTF64x2Z256rm, 0 },
+ { X86::VINSERTF64x2Zrr, X86::VINSERTF64x2Zrm, 0 },
+ { X86::VINSERTF64x4Zrr, X86::VINSERTF64x4Zrm, 0 },
+ { X86::VINSERTI128rr, X86::VINSERTI128rm, 0 },
+ { X86::VINSERTI32x4Z256rr, X86::VINSERTI32x4Z256rm, 0 },
+ { X86::VINSERTI32x4Zrr, X86::VINSERTI32x4Zrm, 0 },
+ { X86::VINSERTI32x8Zrr, X86::VINSERTI32x8Zrm, 0 },
+ { X86::VINSERTI64x2Z256rr, X86::VINSERTI64x2Z256rm, 0 },
+ { X86::VINSERTI64x2Zrr, X86::VINSERTI64x2Zrm, 0 },
+ { X86::VINSERTI64x4Zrr, X86::VINSERTI64x4Zrm, 0 },
+ { X86::VMAXCPDYrr, X86::VMAXCPDYrm, 0 },
+ { X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rm, 0 },
+ { X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rm, 0 },
+ { X86::VMAXCPDZrr, X86::VMAXCPDZrm, 0 },
+ { X86::VMAXCPDrr, X86::VMAXCPDrm, 0 },
+ { X86::VMAXCPSYrr, X86::VMAXCPSYrm, 0 },
+ { X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rm, 0 },
+ { X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rm, 0 },
+ { X86::VMAXCPSZrr, X86::VMAXCPSZrm, 0 },
+ { X86::VMAXCPSrr, X86::VMAXCPSrm, 0 },
+ { X86::VMAXCSDZrr, X86::VMAXCSDZrm, 0 },
+ { X86::VMAXCSDrr, X86::VMAXCSDrm, 0 },
+ { X86::VMAXCSSZrr, X86::VMAXCSSZrm, 0 },
+ { X86::VMAXCSSrr, X86::VMAXCSSrm, 0 },
+ { X86::VMAXPDYrr, X86::VMAXPDYrm, 0 },
+ { X86::VMAXPDZ128rr, X86::VMAXPDZ128rm, 0 },
+ { X86::VMAXPDZ256rr, X86::VMAXPDZ256rm, 0 },
+ { X86::VMAXPDZrr, X86::VMAXPDZrm, 0 },
+ { X86::VMAXPDrr, X86::VMAXPDrm, 0 },
+ { X86::VMAXPSYrr, X86::VMAXPSYrm, 0 },
+ { X86::VMAXPSZ128rr, X86::VMAXPSZ128rm, 0 },
+ { X86::VMAXPSZ256rr, X86::VMAXPSZ256rm, 0 },
+ { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 },
+ { X86::VMAXPSrr, X86::VMAXPSrm, 0 },
+ { X86::VMAXSDZrr, X86::VMAXSDZrm, 0 },
+ { X86::VMAXSDZrr_Int, X86::VMAXSDZrm_Int, TB_NO_REVERSE },
+ { X86::VMAXSDrr, X86::VMAXSDrm, 0 },
+ { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, TB_NO_REVERSE },
+ { X86::VMAXSSZrr, X86::VMAXSSZrm, 0 },
+ { X86::VMAXSSZrr_Int, X86::VMAXSSZrm_Int, TB_NO_REVERSE },
+ { X86::VMAXSSrr, X86::VMAXSSrm, 0 },
+ { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, TB_NO_REVERSE },
+ { X86::VMINCPDYrr, X86::VMINCPDYrm, 0 },
+ { X86::VMINCPDZ128rr, X86::VMINCPDZ128rm, 0 },
+ { X86::VMINCPDZ256rr, X86::VMINCPDZ256rm, 0 },
+ { X86::VMINCPDZrr, X86::VMINCPDZrm, 0 },
+ { X86::VMINCPDrr, X86::VMINCPDrm, 0 },
+ { X86::VMINCPSYrr, X86::VMINCPSYrm, 0 },
+ { X86::VMINCPSZ128rr, X86::VMINCPSZ128rm, 0 },
+ { X86::VMINCPSZ256rr, X86::VMINCPSZ256rm, 0 },
+ { X86::VMINCPSZrr, X86::VMINCPSZrm, 0 },
+ { X86::VMINCPSrr, X86::VMINCPSrm, 0 },
+ { X86::VMINCSDZrr, X86::VMINCSDZrm, 0 },
+ { X86::VMINCSDrr, X86::VMINCSDrm, 0 },
+ { X86::VMINCSSZrr, X86::VMINCSSZrm, 0 },
+ { X86::VMINCSSrr, X86::VMINCSSrm, 0 },
+ { X86::VMINPDYrr, X86::VMINPDYrm, 0 },
+ { X86::VMINPDZ128rr, X86::VMINPDZ128rm, 0 },
+ { X86::VMINPDZ256rr, X86::VMINPDZ256rm, 0 },
+ { X86::VMINPDZrr, X86::VMINPDZrm, 0 },
+ { X86::VMINPDrr, X86::VMINPDrm, 0 },
+ { X86::VMINPSYrr, X86::VMINPSYrm, 0 },
+ { X86::VMINPSZ128rr, X86::VMINPSZ128rm, 0 },
+ { X86::VMINPSZ256rr, X86::VMINPSZ256rm, 0 },
+ { X86::VMINPSZrr, X86::VMINPSZrm, 0 },
+ { X86::VMINPSrr, X86::VMINPSrm, 0 },
+ { X86::VMINSDZrr, X86::VMINSDZrm, 0 },
+ { X86::VMINSDZrr_Int, X86::VMINSDZrm_Int, TB_NO_REVERSE },
+ { X86::VMINSDrr, X86::VMINSDrm, 0 },
+ { X86::VMINSDrr_Int, X86::VMINSDrm_Int, TB_NO_REVERSE },
+ { X86::VMINSSZrr, X86::VMINSSZrm, 0 },
+ { X86::VMINSSZrr_Int, X86::VMINSSZrm_Int, TB_NO_REVERSE },
+ { X86::VMINSSrr, X86::VMINSSrm, 0 },
+ { X86::VMINSSrr_Int, X86::VMINSSrm_Int, TB_NO_REVERSE },
+ { X86::VMOVAPDZ128rrkz, X86::VMOVAPDZ128rmkz, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVAPDZ256rrkz, X86::VMOVAPDZ256rmkz, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVAPDZrrkz, X86::VMOVAPDZrmkz, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVAPSZ128rrkz, X86::VMOVAPSZ128rmkz, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVAPSZ256rrkz, X86::VMOVAPSZ256rmkz, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVAPSZrrkz, X86::VMOVAPSZrmkz, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVDDUPZ128rrkz, X86::VMOVDDUPZ128rmkz, TB_NO_REVERSE },
+ { X86::VMOVDDUPZ256rrkz, X86::VMOVDDUPZ256rmkz, 0 },
+ { X86::VMOVDDUPZrrkz, X86::VMOVDDUPZrmkz, 0 },
+ { X86::VMOVDQA32Z128rrkz, X86::VMOVDQA32Z128rmkz, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVDQA32Z256rrkz, X86::VMOVDQA32Z256rmkz, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVDQA32Zrrkz, X86::VMOVDQA32Zrmkz, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVDQA64Z128rrkz, X86::VMOVDQA64Z128rmkz, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVDQA64Z256rrkz, X86::VMOVDQA64Z256rmkz, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVDQA64Zrrkz, X86::VMOVDQA64Zrmkz, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVDQU16Z128rrkz, X86::VMOVDQU16Z128rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU16Z256rrkz, X86::VMOVDQU16Z256rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU16Zrrkz, X86::VMOVDQU16Zrmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU32Z128rrkz, X86::VMOVDQU32Z128rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU32Z256rrkz, X86::VMOVDQU32Z256rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU32Zrrkz, X86::VMOVDQU32Zrmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU64Z128rrkz, X86::VMOVDQU64Z128rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU64Z256rrkz, X86::VMOVDQU64Z256rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU64Zrrkz, X86::VMOVDQU64Zrmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU8Z128rrkz, X86::VMOVDQU8Z128rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU8Z256rrkz, X86::VMOVDQU8Z256rmkz, TB_NO_REVERSE },
+ { X86::VMOVDQU8Zrrkz, X86::VMOVDQU8Zrmkz, TB_NO_REVERSE },
+ { X86::VMOVLHPSZrr, X86::VMOVHPSZ128rm, TB_NO_REVERSE },
+ { X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE },
+ { X86::VMOVSDZrr, X86::VMOVLPDZ128rm, TB_NO_REVERSE },
+ { X86::VMOVSDrr, X86::VMOVLPDrm, TB_NO_REVERSE },
+ { X86::VMOVSHDUPZ128rrkz, X86::VMOVSHDUPZ128rmkz, 0 },
+ { X86::VMOVSHDUPZ256rrkz, X86::VMOVSHDUPZ256rmkz, 0 },
+ { X86::VMOVSHDUPZrrkz, X86::VMOVSHDUPZrmkz, 0 },
+ { X86::VMOVSLDUPZ128rrkz, X86::VMOVSLDUPZ128rmkz, 0 },
+ { X86::VMOVSLDUPZ256rrkz, X86::VMOVSLDUPZ256rmkz, 0 },
+ { X86::VMOVSLDUPZrrkz, X86::VMOVSLDUPZrmkz, 0 },
+ { X86::VMOVUPDZ128rrkz, X86::VMOVUPDZ128rmkz, TB_NO_REVERSE },
+ { X86::VMOVUPDZ256rrkz, X86::VMOVUPDZ256rmkz, TB_NO_REVERSE },
+ { X86::VMOVUPDZrrkz, X86::VMOVUPDZrmkz, TB_NO_REVERSE },
+ { X86::VMOVUPSZ128rrkz, X86::VMOVUPSZ128rmkz, TB_NO_REVERSE },
+ { X86::VMOVUPSZ256rrkz, X86::VMOVUPSZ256rmkz, TB_NO_REVERSE },
+ { X86::VMOVUPSZrrkz, X86::VMOVUPSZrmkz, TB_NO_REVERSE },
+ { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 },
+ { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 },
+ { X86::VMULPDYrr, X86::VMULPDYrm, 0 },
+ { X86::VMULPDZ128rr, X86::VMULPDZ128rm, 0 },
+ { X86::VMULPDZ256rr, X86::VMULPDZ256rm, 0 },
+ { X86::VMULPDZrr, X86::VMULPDZrm, 0 },
+ { X86::VMULPDrr, X86::VMULPDrm, 0 },
+ { X86::VMULPSYrr, X86::VMULPSYrm, 0 },
+ { X86::VMULPSZ128rr, X86::VMULPSZ128rm, 0 },
+ { X86::VMULPSZ256rr, X86::VMULPSZ256rm, 0 },
+ { X86::VMULPSZrr, X86::VMULPSZrm, 0 },
+ { X86::VMULPSrr, X86::VMULPSrm, 0 },
+ { X86::VMULSDZrr, X86::VMULSDZrm, 0 },
+ { X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, TB_NO_REVERSE },
+ { X86::VMULSDrr, X86::VMULSDrm, 0 },
+ { X86::VMULSDrr_Int, X86::VMULSDrm_Int, TB_NO_REVERSE },
+ { X86::VMULSSZrr, X86::VMULSSZrm, 0 },
+ { X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, TB_NO_REVERSE },
+ { X86::VMULSSrr, X86::VMULSSrm, 0 },
+ { X86::VMULSSrr_Int, X86::VMULSSrm_Int, TB_NO_REVERSE },
+ { X86::VORPDYrr, X86::VORPDYrm, 0 },
+ { X86::VORPDZ128rr, X86::VORPDZ128rm, 0 },
+ { X86::VORPDZ256rr, X86::VORPDZ256rm, 0 },
+ { X86::VORPDZrr, X86::VORPDZrm, 0 },
+ { X86::VORPDrr, X86::VORPDrm, 0 },
+ { X86::VORPSYrr, X86::VORPSYrm, 0 },
+ { X86::VORPSZ128rr, X86::VORPSZ128rm, 0 },
+ { X86::VORPSZ256rr, X86::VORPSZ256rm, 0 },
+ { X86::VORPSZrr, X86::VORPSZrm, 0 },
+ { X86::VORPSrr, X86::VORPSrm, 0 },
+ { X86::VP2INTERSECTDZ128rr, X86::VP2INTERSECTDZ128rm, 0 },
+ { X86::VP2INTERSECTDZ256rr, X86::VP2INTERSECTDZ256rm, 0 },
+ { X86::VP2INTERSECTDZrr, X86::VP2INTERSECTDZrm, 0 },
+ { X86::VP2INTERSECTQZ128rr, X86::VP2INTERSECTQZ128rm, 0 },
+ { X86::VP2INTERSECTQZ256rr, X86::VP2INTERSECTQZ256rm, 0 },
+ { X86::VP2INTERSECTQZrr, X86::VP2INTERSECTQZrm, 0 },
+ { X86::VPABSBZ128rrkz, X86::VPABSBZ128rmkz, 0 },
+ { X86::VPABSBZ256rrkz, X86::VPABSBZ256rmkz, 0 },
+ { X86::VPABSBZrrkz, X86::VPABSBZrmkz, 0 },
+ { X86::VPABSDZ128rrkz, X86::VPABSDZ128rmkz, 0 },
+ { X86::VPABSDZ256rrkz, X86::VPABSDZ256rmkz, 0 },
+ { X86::VPABSDZrrkz, X86::VPABSDZrmkz, 0 },
+ { X86::VPABSQZ128rrkz, X86::VPABSQZ128rmkz, 0 },
+ { X86::VPABSQZ256rrkz, X86::VPABSQZ256rmkz, 0 },
+ { X86::VPABSQZrrkz, X86::VPABSQZrmkz, 0 },
+ { X86::VPABSWZ128rrkz, X86::VPABSWZ128rmkz, 0 },
+ { X86::VPABSWZ256rrkz, X86::VPABSWZ256rmkz, 0 },
+ { X86::VPABSWZrrkz, X86::VPABSWZrmkz, 0 },
+ { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, 0 },
+ { X86::VPACKSSDWZ128rr, X86::VPACKSSDWZ128rm, 0 },
+ { X86::VPACKSSDWZ256rr, X86::VPACKSSDWZ256rm, 0 },
+ { X86::VPACKSSDWZrr, X86::VPACKSSDWZrm, 0 },
+ { X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 },
+ { X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, 0 },
+ { X86::VPACKSSWBZ128rr, X86::VPACKSSWBZ128rm, 0 },
+ { X86::VPACKSSWBZ256rr, X86::VPACKSSWBZ256rm, 0 },
+ { X86::VPACKSSWBZrr, X86::VPACKSSWBZrm, 0 },
+ { X86::VPACKSSWBrr, X86::VPACKSSWBrm, 0 },
+ { X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, 0 },
+ { X86::VPACKUSDWZ128rr, X86::VPACKUSDWZ128rm, 0 },
+ { X86::VPACKUSDWZ256rr, X86::VPACKUSDWZ256rm, 0 },
+ { X86::VPACKUSDWZrr, X86::VPACKUSDWZrm, 0 },
+ { X86::VPACKUSDWrr, X86::VPACKUSDWrm, 0 },
+ { X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, 0 },
+ { X86::VPACKUSWBZ128rr, X86::VPACKUSWBZ128rm, 0 },
+ { X86::VPACKUSWBZ256rr, X86::VPACKUSWBZ256rm, 0 },
+ { X86::VPACKUSWBZrr, X86::VPACKUSWBZrm, 0 },
+ { X86::VPACKUSWBrr, X86::VPACKUSWBrm, 0 },
+ { X86::VPADDBYrr, X86::VPADDBYrm, 0 },
+ { X86::VPADDBZ128rr, X86::VPADDBZ128rm, 0 },
+ { X86::VPADDBZ256rr, X86::VPADDBZ256rm, 0 },
+ { X86::VPADDBZrr, X86::VPADDBZrm, 0 },
+ { X86::VPADDBrr, X86::VPADDBrm, 0 },
+ { X86::VPADDDYrr, X86::VPADDDYrm, 0 },
+ { X86::VPADDDZ128rr, X86::VPADDDZ128rm, 0 },
+ { X86::VPADDDZ256rr, X86::VPADDDZ256rm, 0 },
+ { X86::VPADDDZrr, X86::VPADDDZrm, 0 },
+ { X86::VPADDDrr, X86::VPADDDrm, 0 },
+ { X86::VPADDQYrr, X86::VPADDQYrm, 0 },
+ { X86::VPADDQZ128rr, X86::VPADDQZ128rm, 0 },
+ { X86::VPADDQZ256rr, X86::VPADDQZ256rm, 0 },
+ { X86::VPADDQZrr, X86::VPADDQZrm, 0 },
+ { X86::VPADDQrr, X86::VPADDQrm, 0 },
+ { X86::VPADDSBYrr, X86::VPADDSBYrm, 0 },
+ { X86::VPADDSBZ128rr, X86::VPADDSBZ128rm, 0 },
+ { X86::VPADDSBZ256rr, X86::VPADDSBZ256rm, 0 },
+ { X86::VPADDSBZrr, X86::VPADDSBZrm, 0 },
+ { X86::VPADDSBrr, X86::VPADDSBrm, 0 },
+ { X86::VPADDSWYrr, X86::VPADDSWYrm, 0 },
+ { X86::VPADDSWZ128rr, X86::VPADDSWZ128rm, 0 },
+ { X86::VPADDSWZ256rr, X86::VPADDSWZ256rm, 0 },
+ { X86::VPADDSWZrr, X86::VPADDSWZrm, 0 },
+ { X86::VPADDSWrr, X86::VPADDSWrm, 0 },
+ { X86::VPADDUSBYrr, X86::VPADDUSBYrm, 0 },
+ { X86::VPADDUSBZ128rr, X86::VPADDUSBZ128rm, 0 },
+ { X86::VPADDUSBZ256rr, X86::VPADDUSBZ256rm, 0 },
+ { X86::VPADDUSBZrr, X86::VPADDUSBZrm, 0 },
+ { X86::VPADDUSBrr, X86::VPADDUSBrm, 0 },
+ { X86::VPADDUSWYrr, X86::VPADDUSWYrm, 0 },
+ { X86::VPADDUSWZ128rr, X86::VPADDUSWZ128rm, 0 },
+ { X86::VPADDUSWZ256rr, X86::VPADDUSWZ256rm, 0 },
+ { X86::VPADDUSWZrr, X86::VPADDUSWZrm, 0 },
+ { X86::VPADDUSWrr, X86::VPADDUSWrm, 0 },
+ { X86::VPADDWYrr, X86::VPADDWYrm, 0 },
+ { X86::VPADDWZ128rr, X86::VPADDWZ128rm, 0 },
+ { X86::VPADDWZ256rr, X86::VPADDWZ256rm, 0 },
+ { X86::VPADDWZrr, X86::VPADDWZrm, 0 },
+ { X86::VPADDWrr, X86::VPADDWrm, 0 },
+ { X86::VPALIGNRYrri, X86::VPALIGNRYrmi, 0 },
+ { X86::VPALIGNRZ128rri, X86::VPALIGNRZ128rmi, 0 },
+ { X86::VPALIGNRZ256rri, X86::VPALIGNRZ256rmi, 0 },
+ { X86::VPALIGNRZrri, X86::VPALIGNRZrmi, 0 },
+ { X86::VPALIGNRrri, X86::VPALIGNRrmi, 0 },
+ { X86::VPANDDZ128rr, X86::VPANDDZ128rm, 0 },
+ { X86::VPANDDZ256rr, X86::VPANDDZ256rm, 0 },
+ { X86::VPANDDZrr, X86::VPANDDZrm, 0 },
+ { X86::VPANDNDZ128rr, X86::VPANDNDZ128rm, 0 },
+ { X86::VPANDNDZ256rr, X86::VPANDNDZ256rm, 0 },
+ { X86::VPANDNDZrr, X86::VPANDNDZrm, 0 },
+ { X86::VPANDNQZ128rr, X86::VPANDNQZ128rm, 0 },
+ { X86::VPANDNQZ256rr, X86::VPANDNQZ256rm, 0 },
+ { X86::VPANDNQZrr, X86::VPANDNQZrm, 0 },
+ { X86::VPANDNYrr, X86::VPANDNYrm, 0 },
+ { X86::VPANDNrr, X86::VPANDNrm, 0 },
+ { X86::VPANDQZ128rr, X86::VPANDQZ128rm, 0 },
+ { X86::VPANDQZ256rr, X86::VPANDQZ256rm, 0 },
+ { X86::VPANDQZrr, X86::VPANDQZrm, 0 },
+ { X86::VPANDYrr, X86::VPANDYrm, 0 },
+ { X86::VPANDrr, X86::VPANDrm, 0 },
+ { X86::VPAVGBYrr, X86::VPAVGBYrm, 0 },
+ { X86::VPAVGBZ128rr, X86::VPAVGBZ128rm, 0 },
+ { X86::VPAVGBZ256rr, X86::VPAVGBZ256rm, 0 },
+ { X86::VPAVGBZrr, X86::VPAVGBZrm, 0 },
+ { X86::VPAVGBrr, X86::VPAVGBrm, 0 },
+ { X86::VPAVGWYrr, X86::VPAVGWYrm, 0 },
+ { X86::VPAVGWZ128rr, X86::VPAVGWZ128rm, 0 },
+ { X86::VPAVGWZ256rr, X86::VPAVGWZ256rm, 0 },
+ { X86::VPAVGWZrr, X86::VPAVGWZrm, 0 },
+ { X86::VPAVGWrr, X86::VPAVGWrm, 0 },
+ { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 },
+ { X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 },
+ { X86::VPBLENDMBZ128rr, X86::VPBLENDMBZ128rm, 0 },
+ { X86::VPBLENDMBZ256rr, X86::VPBLENDMBZ256rm, 0 },
+ { X86::VPBLENDMBZrr, X86::VPBLENDMBZrm, 0 },
+ { X86::VPBLENDMDZ128rr, X86::VPBLENDMDZ128rm, 0 },
+ { X86::VPBLENDMDZ256rr, X86::VPBLENDMDZ256rm, 0 },
+ { X86::VPBLENDMDZrr, X86::VPBLENDMDZrm, 0 },
+ { X86::VPBLENDMQZ128rr, X86::VPBLENDMQZ128rm, 0 },
+ { X86::VPBLENDMQZ256rr, X86::VPBLENDMQZ256rm, 0 },
+ { X86::VPBLENDMQZrr, X86::VPBLENDMQZrm, 0 },
+ { X86::VPBLENDMWZ128rr, X86::VPBLENDMWZ128rm, 0 },
+ { X86::VPBLENDMWZ256rr, X86::VPBLENDMWZ256rm, 0 },
+ { X86::VPBLENDMWZrr, X86::VPBLENDMWZrm, 0 },
+ { X86::VPBLENDVBYrr, X86::VPBLENDVBYrm, 0 },
+ { X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0 },
+ { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 },
+ { X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 },
+ { X86::VPBROADCASTBZ128rrkz, X86::VPBROADCASTBZ128rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ256rrkz, X86::VPBROADCASTBZ256rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZrrkz, X86::VPBROADCASTBZrmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ128rrkz, X86::VPBROADCASTDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ256rrkz, X86::VPBROADCASTDZ256rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZrrkz, X86::VPBROADCASTDZrmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ128rrkz, X86::VPBROADCASTQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ256rrkz, X86::VPBROADCASTQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZrrkz, X86::VPBROADCASTQZrmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ128rrkz, X86::VPBROADCASTWZ128rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ256rrkz, X86::VPBROADCASTWZ256rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZrrkz, X86::VPBROADCASTWZrmkz, TB_NO_REVERSE },
+ { X86::VPCLMULQDQYrr, X86::VPCLMULQDQYrm, 0 },
+ { X86::VPCLMULQDQZ128rr, X86::VPCLMULQDQZ128rm, 0 },
+ { X86::VPCLMULQDQZ256rr, X86::VPCLMULQDQZ256rm, 0 },
+ { X86::VPCLMULQDQZrr, X86::VPCLMULQDQZrm, 0 },
+ { X86::VPCLMULQDQrr, X86::VPCLMULQDQrm, 0 },
+ { X86::VPCMOVYrrr, X86::VPCMOVYrmr, 0 },
+ { X86::VPCMOVrrr, X86::VPCMOVrmr, 0 },
+ { X86::VPCMPBZ128rri, X86::VPCMPBZ128rmi, 0 },
+ { X86::VPCMPBZ256rri, X86::VPCMPBZ256rmi, 0 },
+ { X86::VPCMPBZrri, X86::VPCMPBZrmi, 0 },
+ { X86::VPCMPDZ128rri, X86::VPCMPDZ128rmi, 0 },
+ { X86::VPCMPDZ256rri, X86::VPCMPDZ256rmi, 0 },
+ { X86::VPCMPDZrri, X86::VPCMPDZrmi, 0 },
+ { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 },
+ { X86::VPCMPEQBZ128rr, X86::VPCMPEQBZ128rm, 0 },
+ { X86::VPCMPEQBZ256rr, X86::VPCMPEQBZ256rm, 0 },
+ { X86::VPCMPEQBZrr, X86::VPCMPEQBZrm, 0 },
+ { X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 },
+ { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 },
+ { X86::VPCMPEQDZ128rr, X86::VPCMPEQDZ128rm, 0 },
+ { X86::VPCMPEQDZ256rr, X86::VPCMPEQDZ256rm, 0 },
+ { X86::VPCMPEQDZrr, X86::VPCMPEQDZrm, 0 },
+ { X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 },
+ { X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, 0 },
+ { X86::VPCMPEQQZ128rr, X86::VPCMPEQQZ128rm, 0 },
+ { X86::VPCMPEQQZ256rr, X86::VPCMPEQQZ256rm, 0 },
+ { X86::VPCMPEQQZrr, X86::VPCMPEQQZrm, 0 },
+ { X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 },
+ { X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, 0 },
+ { X86::VPCMPEQWZ128rr, X86::VPCMPEQWZ128rm, 0 },
+ { X86::VPCMPEQWZ256rr, X86::VPCMPEQWZ256rm, 0 },
+ { X86::VPCMPEQWZrr, X86::VPCMPEQWZrm, 0 },
+ { X86::VPCMPEQWrr, X86::VPCMPEQWrm, 0 },
+ { X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, 0 },
+ { X86::VPCMPGTBZ128rr, X86::VPCMPGTBZ128rm, 0 },
+ { X86::VPCMPGTBZ256rr, X86::VPCMPGTBZ256rm, 0 },
+ { X86::VPCMPGTBZrr, X86::VPCMPGTBZrm, 0 },
+ { X86::VPCMPGTBrr, X86::VPCMPGTBrm, 0 },
+ { X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, 0 },
+ { X86::VPCMPGTDZ128rr, X86::VPCMPGTDZ128rm, 0 },
+ { X86::VPCMPGTDZ256rr, X86::VPCMPGTDZ256rm, 0 },
+ { X86::VPCMPGTDZrr, X86::VPCMPGTDZrm, 0 },
+ { X86::VPCMPGTDrr, X86::VPCMPGTDrm, 0 },
+ { X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, 0 },
+ { X86::VPCMPGTQZ128rr, X86::VPCMPGTQZ128rm, 0 },
+ { X86::VPCMPGTQZ256rr, X86::VPCMPGTQZ256rm, 0 },
+ { X86::VPCMPGTQZrr, X86::VPCMPGTQZrm, 0 },
+ { X86::VPCMPGTQrr, X86::VPCMPGTQrm, 0 },
+ { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 },
+ { X86::VPCMPGTWZ128rr, X86::VPCMPGTWZ128rm, 0 },
+ { X86::VPCMPGTWZ256rr, X86::VPCMPGTWZ256rm, 0 },
+ { X86::VPCMPGTWZrr, X86::VPCMPGTWZrm, 0 },
+ { X86::VPCMPGTWrr, X86::VPCMPGTWrm, 0 },
+ { X86::VPCMPQZ128rri, X86::VPCMPQZ128rmi, 0 },
+ { X86::VPCMPQZ256rri, X86::VPCMPQZ256rmi, 0 },
+ { X86::VPCMPQZrri, X86::VPCMPQZrmi, 0 },
+ { X86::VPCMPUBZ128rri, X86::VPCMPUBZ128rmi, 0 },
+ { X86::VPCMPUBZ256rri, X86::VPCMPUBZ256rmi, 0 },
+ { X86::VPCMPUBZrri, X86::VPCMPUBZrmi, 0 },
+ { X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmi, 0 },
+ { X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmi, 0 },
+ { X86::VPCMPUDZrri, X86::VPCMPUDZrmi, 0 },
+ { X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmi, 0 },
+ { X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmi, 0 },
+ { X86::VPCMPUQZrri, X86::VPCMPUQZrmi, 0 },
+ { X86::VPCMPUWZ128rri, X86::VPCMPUWZ128rmi, 0 },
+ { X86::VPCMPUWZ256rri, X86::VPCMPUWZ256rmi, 0 },
+ { X86::VPCMPUWZrri, X86::VPCMPUWZrmi, 0 },
+ { X86::VPCMPWZ128rri, X86::VPCMPWZ128rmi, 0 },
+ { X86::VPCMPWZ256rri, X86::VPCMPWZ256rmi, 0 },
+ { X86::VPCMPWZrri, X86::VPCMPWZrmi, 0 },
+ { X86::VPCOMBri, X86::VPCOMBmi, 0 },
+ { X86::VPCOMDri, X86::VPCOMDmi, 0 },
+ { X86::VPCOMQri, X86::VPCOMQmi, 0 },
+ { X86::VPCOMUBri, X86::VPCOMUBmi, 0 },
+ { X86::VPCOMUDri, X86::VPCOMUDmi, 0 },
+ { X86::VPCOMUQri, X86::VPCOMUQmi, 0 },
+ { X86::VPCOMUWri, X86::VPCOMUWmi, 0 },
+ { X86::VPCOMWri, X86::VPCOMWmi, 0 },
+ { X86::VPCONFLICTDZ128rrkz, X86::VPCONFLICTDZ128rmkz, 0 },
+ { X86::VPCONFLICTDZ256rrkz, X86::VPCONFLICTDZ256rmkz, 0 },
+ { X86::VPCONFLICTDZrrkz, X86::VPCONFLICTDZrmkz, 0 },
+ { X86::VPCONFLICTQZ128rrkz, X86::VPCONFLICTQZ128rmkz, 0 },
+ { X86::VPCONFLICTQZ256rrkz, X86::VPCONFLICTQZ256rmkz, 0 },
+ { X86::VPCONFLICTQZrrkz, X86::VPCONFLICTQZrmkz, 0 },
+ { X86::VPERM2F128rr, X86::VPERM2F128rm, 0 },
+ { X86::VPERM2I128rr, X86::VPERM2I128rm, 0 },
+ { X86::VPERMBZ128rr, X86::VPERMBZ128rm, 0 },
+ { X86::VPERMBZ256rr, X86::VPERMBZ256rm, 0 },
+ { X86::VPERMBZrr, X86::VPERMBZrm, 0 },
+ { X86::VPERMDYrr, X86::VPERMDYrm, 0 },
+ { X86::VPERMDZ256rr, X86::VPERMDZ256rm, 0 },
+ { X86::VPERMDZrr, X86::VPERMDZrm, 0 },
+ { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYmr, 0 },
+ { X86::VPERMIL2PDrr, X86::VPERMIL2PDmr, 0 },
+ { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYmr, 0 },
+ { X86::VPERMIL2PSrr, X86::VPERMIL2PSmr, 0 },
+ { X86::VPERMILPDYrr, X86::VPERMILPDYrm, 0 },
+ { X86::VPERMILPDZ128rikz, X86::VPERMILPDZ128mikz, 0 },
+ { X86::VPERMILPDZ128rr, X86::VPERMILPDZ128rm, 0 },
+ { X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz, 0 },
+ { X86::VPERMILPDZ256rr, X86::VPERMILPDZ256rm, 0 },
+ { X86::VPERMILPDZrikz, X86::VPERMILPDZmikz, 0 },
+ { X86::VPERMILPDZrr, X86::VPERMILPDZrm, 0 },
+ { X86::VPERMILPDrr, X86::VPERMILPDrm, 0 },
+ { X86::VPERMILPSYrr, X86::VPERMILPSYrm, 0 },
+ { X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz, 0 },
+ { X86::VPERMILPSZ128rr, X86::VPERMILPSZ128rm, 0 },
+ { X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz, 0 },
+ { X86::VPERMILPSZ256rr, X86::VPERMILPSZ256rm, 0 },
+ { X86::VPERMILPSZrikz, X86::VPERMILPSZmikz, 0 },
+ { X86::VPERMILPSZrr, X86::VPERMILPSZrm, 0 },
+ { X86::VPERMILPSrr, X86::VPERMILPSrm, 0 },
+ { X86::VPERMPDZ256rikz, X86::VPERMPDZ256mikz, 0 },
+ { X86::VPERMPDZ256rr, X86::VPERMPDZ256rm, 0 },
+ { X86::VPERMPDZrikz, X86::VPERMPDZmikz, 0 },
+ { X86::VPERMPDZrr, X86::VPERMPDZrm, 0 },
+ { X86::VPERMPSYrr, X86::VPERMPSYrm, 0 },
+ { X86::VPERMPSZ256rr, X86::VPERMPSZ256rm, 0 },
+ { X86::VPERMPSZrr, X86::VPERMPSZrm, 0 },
+ { X86::VPERMQZ256rikz, X86::VPERMQZ256mikz, 0 },
+ { X86::VPERMQZ256rr, X86::VPERMQZ256rm, 0 },
+ { X86::VPERMQZrikz, X86::VPERMQZmikz, 0 },
+ { X86::VPERMQZrr, X86::VPERMQZrm, 0 },
+ { X86::VPERMWZ128rr, X86::VPERMWZ128rm, 0 },
+ { X86::VPERMWZ256rr, X86::VPERMWZ256rm, 0 },
+ { X86::VPERMWZrr, X86::VPERMWZrm, 0 },
+ { X86::VPEXPANDBZ128rrkz, X86::VPEXPANDBZ128rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDBZ256rrkz, X86::VPEXPANDBZ256rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDBZrrkz, X86::VPEXPANDBZrmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDDZ128rrkz, X86::VPEXPANDDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDDZ256rrkz, X86::VPEXPANDDZ256rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDDZrrkz, X86::VPEXPANDDZrmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDQZ128rrkz, X86::VPEXPANDQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDQZ256rrkz, X86::VPEXPANDQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDQZrrkz, X86::VPEXPANDQZrmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDWZ128rrkz, X86::VPEXPANDWZ128rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDWZ256rrkz, X86::VPEXPANDWZ256rmkz, TB_NO_REVERSE },
+ { X86::VPEXPANDWZrrkz, X86::VPEXPANDWZrmkz, TB_NO_REVERSE },
+ { X86::VPHADDDYrr, X86::VPHADDDYrm, 0 },
+ { X86::VPHADDDrr, X86::VPHADDDrm, 0 },
+ { X86::VPHADDSWYrr, X86::VPHADDSWYrm, 0 },
+ { X86::VPHADDSWrr, X86::VPHADDSWrm, 0 },
+ { X86::VPHADDWYrr, X86::VPHADDWYrm, 0 },
+ { X86::VPHADDWrr, X86::VPHADDWrm, 0 },
+ { X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 },
+ { X86::VPHSUBDrr, X86::VPHSUBDrm, 0 },
+ { X86::VPHSUBSWYrr, X86::VPHSUBSWYrm, 0 },
+ { X86::VPHSUBSWrr, X86::VPHSUBSWrm, 0 },
+ { X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 },
+ { X86::VPHSUBWrr, X86::VPHSUBWrm, 0 },
+ { X86::VPINSRBZrr, X86::VPINSRBZrm, TB_NO_REVERSE },
+ { X86::VPINSRBrr, X86::VPINSRBrm, TB_NO_REVERSE },
+ { X86::VPINSRDZrr, X86::VPINSRDZrm, 0 },
+ { X86::VPINSRDrr, X86::VPINSRDrm, 0 },
+ { X86::VPINSRQZrr, X86::VPINSRQZrm, 0 },
+ { X86::VPINSRQrr, X86::VPINSRQrm, 0 },
+ { X86::VPINSRWZrr, X86::VPINSRWZrm, TB_NO_REVERSE },
+ { X86::VPINSRWrr, X86::VPINSRWrm, TB_NO_REVERSE },
+ { X86::VPLZCNTDZ128rrkz, X86::VPLZCNTDZ128rmkz, 0 },
+ { X86::VPLZCNTDZ256rrkz, X86::VPLZCNTDZ256rmkz, 0 },
+ { X86::VPLZCNTDZrrkz, X86::VPLZCNTDZrmkz, 0 },
+ { X86::VPLZCNTQZ128rrkz, X86::VPLZCNTQZ128rmkz, 0 },
+ { X86::VPLZCNTQZ256rrkz, X86::VPLZCNTQZ256rmkz, 0 },
+ { X86::VPLZCNTQZrrkz, X86::VPLZCNTQZrmkz, 0 },
+ { X86::VPMACSDDrr, X86::VPMACSDDrm, 0 },
+ { X86::VPMACSDQHrr, X86::VPMACSDQHrm, 0 },
+ { X86::VPMACSDQLrr, X86::VPMACSDQLrm, 0 },
+ { X86::VPMACSSDDrr, X86::VPMACSSDDrm, 0 },
+ { X86::VPMACSSDQHrr, X86::VPMACSSDQHrm, 0 },
+ { X86::VPMACSSDQLrr, X86::VPMACSSDQLrm, 0 },
+ { X86::VPMACSSWDrr, X86::VPMACSSWDrm, 0 },
+ { X86::VPMACSSWWrr, X86::VPMACSSWWrm, 0 },
+ { X86::VPMACSWDrr, X86::VPMACSWDrm, 0 },
+ { X86::VPMACSWWrr, X86::VPMACSWWrm, 0 },
+ { X86::VPMADCSSWDrr, X86::VPMADCSSWDrm, 0 },
+ { X86::VPMADCSWDrr, X86::VPMADCSWDrm, 0 },
+ { X86::VPMADDUBSWYrr, X86::VPMADDUBSWYrm, 0 },
+ { X86::VPMADDUBSWZ128rr, X86::VPMADDUBSWZ128rm, 0 },
+ { X86::VPMADDUBSWZ256rr, X86::VPMADDUBSWZ256rm, 0 },
+ { X86::VPMADDUBSWZrr, X86::VPMADDUBSWZrm, 0 },
+ { X86::VPMADDUBSWrr, X86::VPMADDUBSWrm, 0 },
+ { X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 },
+ { X86::VPMADDWDZ128rr, X86::VPMADDWDZ128rm, 0 },
+ { X86::VPMADDWDZ256rr, X86::VPMADDWDZ256rm, 0 },
+ { X86::VPMADDWDZrr, X86::VPMADDWDZrm, 0 },
+ { X86::VPMADDWDrr, X86::VPMADDWDrm, 0 },
+ { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 },
+ { X86::VPMAXSBZ128rr, X86::VPMAXSBZ128rm, 0 },
+ { X86::VPMAXSBZ256rr, X86::VPMAXSBZ256rm, 0 },
+ { X86::VPMAXSBZrr, X86::VPMAXSBZrm, 0 },
+ { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 },
+ { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 },
+ { X86::VPMAXSDZ128rr, X86::VPMAXSDZ128rm, 0 },
+ { X86::VPMAXSDZ256rr, X86::VPMAXSDZ256rm, 0 },
+ { X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 },
+ { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 },
+ { X86::VPMAXSQZ128rr, X86::VPMAXSQZ128rm, 0 },
+ { X86::VPMAXSQZ256rr, X86::VPMAXSQZ256rm, 0 },
+ { X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 },
+ { X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 },
+ { X86::VPMAXSWZ128rr, X86::VPMAXSWZ128rm, 0 },
+ { X86::VPMAXSWZ256rr, X86::VPMAXSWZ256rm, 0 },
+ { X86::VPMAXSWZrr, X86::VPMAXSWZrm, 0 },
+ { X86::VPMAXSWrr, X86::VPMAXSWrm, 0 },
+ { X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 },
+ { X86::VPMAXUBZ128rr, X86::VPMAXUBZ128rm, 0 },
+ { X86::VPMAXUBZ256rr, X86::VPMAXUBZ256rm, 0 },
+ { X86::VPMAXUBZrr, X86::VPMAXUBZrm, 0 },
+ { X86::VPMAXUBrr, X86::VPMAXUBrm, 0 },
+ { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 },
+ { X86::VPMAXUDZ128rr, X86::VPMAXUDZ128rm, 0 },
+ { X86::VPMAXUDZ256rr, X86::VPMAXUDZ256rm, 0 },
+ { X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 },
+ { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 },
+ { X86::VPMAXUQZ128rr, X86::VPMAXUQZ128rm, 0 },
+ { X86::VPMAXUQZ256rr, X86::VPMAXUQZ256rm, 0 },
+ { X86::VPMAXUQZrr, X86::VPMAXUQZrm, 0 },
+ { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 },
+ { X86::VPMAXUWZ128rr, X86::VPMAXUWZ128rm, 0 },
+ { X86::VPMAXUWZ256rr, X86::VPMAXUWZ256rm, 0 },
+ { X86::VPMAXUWZrr, X86::VPMAXUWZrm, 0 },
+ { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 },
+ { X86::VPMINSBYrr, X86::VPMINSBYrm, 0 },
+ { X86::VPMINSBZ128rr, X86::VPMINSBZ128rm, 0 },
+ { X86::VPMINSBZ256rr, X86::VPMINSBZ256rm, 0 },
+ { X86::VPMINSBZrr, X86::VPMINSBZrm, 0 },
+ { X86::VPMINSBrr, X86::VPMINSBrm, 0 },
+ { X86::VPMINSDYrr, X86::VPMINSDYrm, 0 },
+ { X86::VPMINSDZ128rr, X86::VPMINSDZ128rm, 0 },
+ { X86::VPMINSDZ256rr, X86::VPMINSDZ256rm, 0 },
+ { X86::VPMINSDZrr, X86::VPMINSDZrm, 0 },
+ { X86::VPMINSDrr, X86::VPMINSDrm, 0 },
+ { X86::VPMINSQZ128rr, X86::VPMINSQZ128rm, 0 },
+ { X86::VPMINSQZ256rr, X86::VPMINSQZ256rm, 0 },
+ { X86::VPMINSQZrr, X86::VPMINSQZrm, 0 },
+ { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 },
+ { X86::VPMINSWZ128rr, X86::VPMINSWZ128rm, 0 },
+ { X86::VPMINSWZ256rr, X86::VPMINSWZ256rm, 0 },
+ { X86::VPMINSWZrr, X86::VPMINSWZrm, 0 },
+ { X86::VPMINSWrr, X86::VPMINSWrm, 0 },
+ { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 },
+ { X86::VPMINUBZ128rr, X86::VPMINUBZ128rm, 0 },
+ { X86::VPMINUBZ256rr, X86::VPMINUBZ256rm, 0 },
+ { X86::VPMINUBZrr, X86::VPMINUBZrm, 0 },
+ { X86::VPMINUBrr, X86::VPMINUBrm, 0 },
+ { X86::VPMINUDYrr, X86::VPMINUDYrm, 0 },
+ { X86::VPMINUDZ128rr, X86::VPMINUDZ128rm, 0 },
+ { X86::VPMINUDZ256rr, X86::VPMINUDZ256rm, 0 },
+ { X86::VPMINUDZrr, X86::VPMINUDZrm, 0 },
+ { X86::VPMINUDrr, X86::VPMINUDrm, 0 },
+ { X86::VPMINUQZ128rr, X86::VPMINUQZ128rm, 0 },
+ { X86::VPMINUQZ256rr, X86::VPMINUQZ256rm, 0 },
+ { X86::VPMINUQZrr, X86::VPMINUQZrm, 0 },
+ { X86::VPMINUWYrr, X86::VPMINUWYrm, 0 },
+ { X86::VPMINUWZ128rr, X86::VPMINUWZ128rm, 0 },
+ { X86::VPMINUWZ256rr, X86::VPMINUWZ256rm, 0 },
+ { X86::VPMINUWZrr, X86::VPMINUWZrm, 0 },
+ { X86::VPMINUWrr, X86::VPMINUWrm, 0 },
+ { X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBDZrrkz, X86::VPMOVSXBDZrmkz, 0 },
+ { X86::VPMOVSXBQZ128rrkz, X86::VPMOVSXBQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ256rrkz, X86::VPMOVSXBQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZrrkz, X86::VPMOVSXBQZrmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ128rrkz, X86::VPMOVSXBWZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ256rrkz, X86::VPMOVSXBWZ256rmkz, 0 },
+ { X86::VPMOVSXBWZrrkz, X86::VPMOVSXBWZrmkz, 0 },
+ { X86::VPMOVSXDQZ128rrkz, X86::VPMOVSXDQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXDQZ256rrkz, X86::VPMOVSXDQZ256rmkz, 0 },
+ { X86::VPMOVSXDQZrrkz, X86::VPMOVSXDQZrmkz, 0 },
+ { X86::VPMOVSXWDZ128rrkz, X86::VPMOVSXWDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXWDZ256rrkz, X86::VPMOVSXWDZ256rmkz, 0 },
+ { X86::VPMOVSXWDZrrkz, X86::VPMOVSXWDZrmkz, 0 },
+ { X86::VPMOVSXWQZ128rrkz, X86::VPMOVSXWQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZ256rrkz, X86::VPMOVSXWQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZrrkz, X86::VPMOVSXWQZrmkz, 0 },
+ { X86::VPMOVZXBDZ128rrkz, X86::VPMOVZXBDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ256rrkz, X86::VPMOVZXBDZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZrrkz, X86::VPMOVZXBDZrmkz, 0 },
+ { X86::VPMOVZXBQZ128rrkz, X86::VPMOVZXBQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ256rrkz, X86::VPMOVZXBQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZrrkz, X86::VPMOVZXBQZrmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ128rrkz, X86::VPMOVZXBWZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ256rrkz, X86::VPMOVZXBWZ256rmkz, 0 },
+ { X86::VPMOVZXBWZrrkz, X86::VPMOVZXBWZrmkz, 0 },
+ { X86::VPMOVZXDQZ128rrkz, X86::VPMOVZXDQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXDQZ256rrkz, X86::VPMOVZXDQZ256rmkz, 0 },
+ { X86::VPMOVZXDQZrrkz, X86::VPMOVZXDQZrmkz, 0 },
+ { X86::VPMOVZXWDZ128rrkz, X86::VPMOVZXWDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXWDZ256rrkz, X86::VPMOVZXWDZ256rmkz, 0 },
+ { X86::VPMOVZXWDZrrkz, X86::VPMOVZXWDZrmkz, 0 },
+ { X86::VPMOVZXWQZ128rrkz, X86::VPMOVZXWQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZ256rrkz, X86::VPMOVZXWQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZrrkz, X86::VPMOVZXWQZrmkz, 0 },
+ { X86::VPMULDQYrr, X86::VPMULDQYrm, 0 },
+ { X86::VPMULDQZ128rr, X86::VPMULDQZ128rm, 0 },
+ { X86::VPMULDQZ256rr, X86::VPMULDQZ256rm, 0 },
+ { X86::VPMULDQZrr, X86::VPMULDQZrm, 0 },
+ { X86::VPMULDQrr, X86::VPMULDQrm, 0 },
+ { X86::VPMULHRSWYrr, X86::VPMULHRSWYrm, 0 },
+ { X86::VPMULHRSWZ128rr, X86::VPMULHRSWZ128rm, 0 },
+ { X86::VPMULHRSWZ256rr, X86::VPMULHRSWZ256rm, 0 },
+ { X86::VPMULHRSWZrr, X86::VPMULHRSWZrm, 0 },
+ { X86::VPMULHRSWrr, X86::VPMULHRSWrm, 0 },
+ { X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 },
+ { X86::VPMULHUWZ128rr, X86::VPMULHUWZ128rm, 0 },
+ { X86::VPMULHUWZ256rr, X86::VPMULHUWZ256rm, 0 },
+ { X86::VPMULHUWZrr, X86::VPMULHUWZrm, 0 },
+ { X86::VPMULHUWrr, X86::VPMULHUWrm, 0 },
+ { X86::VPMULHWYrr, X86::VPMULHWYrm, 0 },
+ { X86::VPMULHWZ128rr, X86::VPMULHWZ128rm, 0 },
+ { X86::VPMULHWZ256rr, X86::VPMULHWZ256rm, 0 },
+ { X86::VPMULHWZrr, X86::VPMULHWZrm, 0 },
+ { X86::VPMULHWrr, X86::VPMULHWrm, 0 },
+ { X86::VPMULLDYrr, X86::VPMULLDYrm, 0 },
+ { X86::VPMULLDZ128rr, X86::VPMULLDZ128rm, 0 },
+ { X86::VPMULLDZ256rr, X86::VPMULLDZ256rm, 0 },
+ { X86::VPMULLDZrr, X86::VPMULLDZrm, 0 },
+ { X86::VPMULLDrr, X86::VPMULLDrm, 0 },
+ { X86::VPMULLQZ128rr, X86::VPMULLQZ128rm, 0 },
+ { X86::VPMULLQZ256rr, X86::VPMULLQZ256rm, 0 },
+ { X86::VPMULLQZrr, X86::VPMULLQZrm, 0 },
+ { X86::VPMULLWYrr, X86::VPMULLWYrm, 0 },
+ { X86::VPMULLWZ128rr, X86::VPMULLWZ128rm, 0 },
+ { X86::VPMULLWZ256rr, X86::VPMULLWZ256rm, 0 },
+ { X86::VPMULLWZrr, X86::VPMULLWZrm, 0 },
+ { X86::VPMULLWrr, X86::VPMULLWrm, 0 },
+ { X86::VPMULTISHIFTQBZ128rr, X86::VPMULTISHIFTQBZ128rm, 0 },
+ { X86::VPMULTISHIFTQBZ256rr, X86::VPMULTISHIFTQBZ256rm, 0 },
+ { X86::VPMULTISHIFTQBZrr, X86::VPMULTISHIFTQBZrm, 0 },
+ { X86::VPMULUDQYrr, X86::VPMULUDQYrm, 0 },
+ { X86::VPMULUDQZ128rr, X86::VPMULUDQZ128rm, 0 },
+ { X86::VPMULUDQZ256rr, X86::VPMULUDQZ256rm, 0 },
+ { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
+ { X86::VPMULUDQrr, X86::VPMULUDQrm, 0 },
+ { X86::VPOPCNTBZ128rrkz, X86::VPOPCNTBZ128rmkz, 0 },
+ { X86::VPOPCNTBZ256rrkz, X86::VPOPCNTBZ256rmkz, 0 },
+ { X86::VPOPCNTBZrrkz, X86::VPOPCNTBZrmkz, 0 },
+ { X86::VPOPCNTDZ128rrkz, X86::VPOPCNTDZ128rmkz, 0 },
+ { X86::VPOPCNTDZ256rrkz, X86::VPOPCNTDZ256rmkz, 0 },
+ { X86::VPOPCNTDZrrkz, X86::VPOPCNTDZrmkz, 0 },
+ { X86::VPOPCNTQZ128rrkz, X86::VPOPCNTQZ128rmkz, 0 },
+ { X86::VPOPCNTQZ256rrkz, X86::VPOPCNTQZ256rmkz, 0 },
+ { X86::VPOPCNTQZrrkz, X86::VPOPCNTQZrmkz, 0 },
+ { X86::VPOPCNTWZ128rrkz, X86::VPOPCNTWZ128rmkz, 0 },
+ { X86::VPOPCNTWZ256rrkz, X86::VPOPCNTWZ256rmkz, 0 },
+ { X86::VPOPCNTWZrrkz, X86::VPOPCNTWZrmkz, 0 },
+ { X86::VPORDZ128rr, X86::VPORDZ128rm, 0 },
+ { X86::VPORDZ256rr, X86::VPORDZ256rm, 0 },
+ { X86::VPORDZrr, X86::VPORDZrm, 0 },
+ { X86::VPORQZ128rr, X86::VPORQZ128rm, 0 },
+ { X86::VPORQZ256rr, X86::VPORQZ256rm, 0 },
+ { X86::VPORQZrr, X86::VPORQZrm, 0 },
+ { X86::VPORYrr, X86::VPORYrm, 0 },
+ { X86::VPORrr, X86::VPORrm, 0 },
+ { X86::VPPERMrrr, X86::VPPERMrmr, 0 },
+ { X86::VPROLDZ128rikz, X86::VPROLDZ128mikz, 0 },
+ { X86::VPROLDZ256rikz, X86::VPROLDZ256mikz, 0 },
+ { X86::VPROLDZrikz, X86::VPROLDZmikz, 0 },
+ { X86::VPROLQZ128rikz, X86::VPROLQZ128mikz, 0 },
+ { X86::VPROLQZ256rikz, X86::VPROLQZ256mikz, 0 },
+ { X86::VPROLQZrikz, X86::VPROLQZmikz, 0 },
+ { X86::VPROLVDZ128rr, X86::VPROLVDZ128rm, 0 },
+ { X86::VPROLVDZ256rr, X86::VPROLVDZ256rm, 0 },
+ { X86::VPROLVDZrr, X86::VPROLVDZrm, 0 },
+ { X86::VPROLVQZ128rr, X86::VPROLVQZ128rm, 0 },
+ { X86::VPROLVQZ256rr, X86::VPROLVQZ256rm, 0 },
+ { X86::VPROLVQZrr, X86::VPROLVQZrm, 0 },
+ { X86::VPRORDZ128rikz, X86::VPRORDZ128mikz, 0 },
+ { X86::VPRORDZ256rikz, X86::VPRORDZ256mikz, 0 },
+ { X86::VPRORDZrikz, X86::VPRORDZmikz, 0 },
+ { X86::VPRORQZ128rikz, X86::VPRORQZ128mikz, 0 },
+ { X86::VPRORQZ256rikz, X86::VPRORQZ256mikz, 0 },
+ { X86::VPRORQZrikz, X86::VPRORQZmikz, 0 },
+ { X86::VPRORVDZ128rr, X86::VPRORVDZ128rm, 0 },
+ { X86::VPRORVDZ256rr, X86::VPRORVDZ256rm, 0 },
+ { X86::VPRORVDZrr, X86::VPRORVDZrm, 0 },
+ { X86::VPRORVQZ128rr, X86::VPRORVQZ128rm, 0 },
+ { X86::VPRORVQZ256rr, X86::VPRORVQZ256rm, 0 },
+ { X86::VPRORVQZrr, X86::VPRORVQZrm, 0 },
+ { X86::VPROTBrr, X86::VPROTBrm, 0 },
+ { X86::VPROTDrr, X86::VPROTDrm, 0 },
+ { X86::VPROTQrr, X86::VPROTQrm, 0 },
+ { X86::VPROTWrr, X86::VPROTWrm, 0 },
+ { X86::VPSADBWYrr, X86::VPSADBWYrm, 0 },
+ { X86::VPSADBWZ128rr, X86::VPSADBWZ128rm, 0 },
+ { X86::VPSADBWZ256rr, X86::VPSADBWZ256rm, 0 },
+ { X86::VPSADBWZrr, X86::VPSADBWZrm, 0 },
+ { X86::VPSADBWrr, X86::VPSADBWrm, 0 },
+ { X86::VPSHABrr, X86::VPSHABrm, 0 },
+ { X86::VPSHADrr, X86::VPSHADrm, 0 },
+ { X86::VPSHAQrr, X86::VPSHAQrm, 0 },
+ { X86::VPSHAWrr, X86::VPSHAWrm, 0 },
+ { X86::VPSHLBrr, X86::VPSHLBrm, 0 },
+ { X86::VPSHLDDZ128rri, X86::VPSHLDDZ128rmi, 0 },
+ { X86::VPSHLDDZ256rri, X86::VPSHLDDZ256rmi, 0 },
+ { X86::VPSHLDDZrri, X86::VPSHLDDZrmi, 0 },
+ { X86::VPSHLDQZ128rri, X86::VPSHLDQZ128rmi, 0 },
+ { X86::VPSHLDQZ256rri, X86::VPSHLDQZ256rmi, 0 },
+ { X86::VPSHLDQZrri, X86::VPSHLDQZrmi, 0 },
+ { X86::VPSHLDWZ128rri, X86::VPSHLDWZ128rmi, 0 },
+ { X86::VPSHLDWZ256rri, X86::VPSHLDWZ256rmi, 0 },
+ { X86::VPSHLDWZrri, X86::VPSHLDWZrmi, 0 },
+ { X86::VPSHLDrr, X86::VPSHLDrm, 0 },
+ { X86::VPSHLQrr, X86::VPSHLQrm, 0 },
+ { X86::VPSHLWrr, X86::VPSHLWrm, 0 },
+ { X86::VPSHRDDZ128rri, X86::VPSHRDDZ128rmi, 0 },
+ { X86::VPSHRDDZ256rri, X86::VPSHRDDZ256rmi, 0 },
+ { X86::VPSHRDDZrri, X86::VPSHRDDZrmi, 0 },
+ { X86::VPSHRDQZ128rri, X86::VPSHRDQZ128rmi, 0 },
+ { X86::VPSHRDQZ256rri, X86::VPSHRDQZ256rmi, 0 },
+ { X86::VPSHRDQZrri, X86::VPSHRDQZrmi, 0 },
+ { X86::VPSHRDWZ128rri, X86::VPSHRDWZ128rmi, 0 },
+ { X86::VPSHRDWZ256rri, X86::VPSHRDWZ256rmi, 0 },
+ { X86::VPSHRDWZrri, X86::VPSHRDWZrmi, 0 },
+ { X86::VPSHUFBITQMBZ128rr, X86::VPSHUFBITQMBZ128rm, 0 },
+ { X86::VPSHUFBITQMBZ256rr, X86::VPSHUFBITQMBZ256rm, 0 },
+ { X86::VPSHUFBITQMBZrr, X86::VPSHUFBITQMBZrm, 0 },
+ { X86::VPSHUFBYrr, X86::VPSHUFBYrm, 0 },
+ { X86::VPSHUFBZ128rr, X86::VPSHUFBZ128rm, 0 },
+ { X86::VPSHUFBZ256rr, X86::VPSHUFBZ256rm, 0 },
+ { X86::VPSHUFBZrr, X86::VPSHUFBZrm, 0 },
+ { X86::VPSHUFBrr, X86::VPSHUFBrm, 0 },
+ { X86::VPSHUFDZ128rikz, X86::VPSHUFDZ128mikz, 0 },
+ { X86::VPSHUFDZ256rikz, X86::VPSHUFDZ256mikz, 0 },
+ { X86::VPSHUFDZrikz, X86::VPSHUFDZmikz, 0 },
+ { X86::VPSHUFHWZ128rikz, X86::VPSHUFHWZ128mikz, 0 },
+ { X86::VPSHUFHWZ256rikz, X86::VPSHUFHWZ256mikz, 0 },
+ { X86::VPSHUFHWZrikz, X86::VPSHUFHWZmikz, 0 },
+ { X86::VPSHUFLWZ128rikz, X86::VPSHUFLWZ128mikz, 0 },
+ { X86::VPSHUFLWZ256rikz, X86::VPSHUFLWZ256mikz, 0 },
+ { X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 },
+ { X86::VPSIGNBYrr, X86::VPSIGNBYrm, 0 },
+ { X86::VPSIGNBrr, X86::VPSIGNBrm, 0 },
+ { X86::VPSIGNDYrr, X86::VPSIGNDYrm, 0 },
+ { X86::VPSIGNDrr, X86::VPSIGNDrm, 0 },
+ { X86::VPSIGNWYrr, X86::VPSIGNWYrm, 0 },
+ { X86::VPSIGNWrr, X86::VPSIGNWrm, 0 },
+ { X86::VPSLLDYrr, X86::VPSLLDYrm, 0 },
+ { X86::VPSLLDZ128rikz, X86::VPSLLDZ128mikz, 0 },
+ { X86::VPSLLDZ128rr, X86::VPSLLDZ128rm, 0 },
+ { X86::VPSLLDZ256rikz, X86::VPSLLDZ256mikz, 0 },
+ { X86::VPSLLDZ256rr, X86::VPSLLDZ256rm, 0 },
+ { X86::VPSLLDZrikz, X86::VPSLLDZmikz, 0 },
+ { X86::VPSLLDZrr, X86::VPSLLDZrm, 0 },
+ { X86::VPSLLDrr, X86::VPSLLDrm, 0 },
+ { X86::VPSLLQYrr, X86::VPSLLQYrm, 0 },
+ { X86::VPSLLQZ128rikz, X86::VPSLLQZ128mikz, 0 },
+ { X86::VPSLLQZ128rr, X86::VPSLLQZ128rm, 0 },
+ { X86::VPSLLQZ256rikz, X86::VPSLLQZ256mikz, 0 },
+ { X86::VPSLLQZ256rr, X86::VPSLLQZ256rm, 0 },
+ { X86::VPSLLQZrikz, X86::VPSLLQZmikz, 0 },
+ { X86::VPSLLQZrr, X86::VPSLLQZrm, 0 },
+ { X86::VPSLLQrr, X86::VPSLLQrm, 0 },
+ { X86::VPSLLVDYrr, X86::VPSLLVDYrm, 0 },
+ { X86::VPSLLVDZ128rr, X86::VPSLLVDZ128rm, 0 },
+ { X86::VPSLLVDZ256rr, X86::VPSLLVDZ256rm, 0 },
+ { X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 },
+ { X86::VPSLLVDrr, X86::VPSLLVDrm, 0 },
+ { X86::VPSLLVQYrr, X86::VPSLLVQYrm, 0 },
+ { X86::VPSLLVQZ128rr, X86::VPSLLVQZ128rm, 0 },
+ { X86::VPSLLVQZ256rr, X86::VPSLLVQZ256rm, 0 },
+ { X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 },
+ { X86::VPSLLVQrr, X86::VPSLLVQrm, 0 },
+ { X86::VPSLLVWZ128rr, X86::VPSLLVWZ128rm, 0 },
+ { X86::VPSLLVWZ256rr, X86::VPSLLVWZ256rm, 0 },
+ { X86::VPSLLVWZrr, X86::VPSLLVWZrm, 0 },
+ { X86::VPSLLWYrr, X86::VPSLLWYrm, 0 },
+ { X86::VPSLLWZ128rikz, X86::VPSLLWZ128mikz, 0 },
+ { X86::VPSLLWZ128rr, X86::VPSLLWZ128rm, 0 },
+ { X86::VPSLLWZ256rikz, X86::VPSLLWZ256mikz, 0 },
+ { X86::VPSLLWZ256rr, X86::VPSLLWZ256rm, 0 },
+ { X86::VPSLLWZrikz, X86::VPSLLWZmikz, 0 },
+ { X86::VPSLLWZrr, X86::VPSLLWZrm, 0 },
+ { X86::VPSLLWrr, X86::VPSLLWrm, 0 },
+ { X86::VPSRADYrr, X86::VPSRADYrm, 0 },
+ { X86::VPSRADZ128rikz, X86::VPSRADZ128mikz, 0 },
+ { X86::VPSRADZ128rr, X86::VPSRADZ128rm, 0 },
+ { X86::VPSRADZ256rikz, X86::VPSRADZ256mikz, 0 },
+ { X86::VPSRADZ256rr, X86::VPSRADZ256rm, 0 },
+ { X86::VPSRADZrikz, X86::VPSRADZmikz, 0 },
+ { X86::VPSRADZrr, X86::VPSRADZrm, 0 },
+ { X86::VPSRADrr, X86::VPSRADrm, 0 },
+ { X86::VPSRAQZ128rikz, X86::VPSRAQZ128mikz, 0 },
+ { X86::VPSRAQZ128rr, X86::VPSRAQZ128rm, 0 },
+ { X86::VPSRAQZ256rikz, X86::VPSRAQZ256mikz, 0 },
+ { X86::VPSRAQZ256rr, X86::VPSRAQZ256rm, 0 },
+ { X86::VPSRAQZrikz, X86::VPSRAQZmikz, 0 },
+ { X86::VPSRAQZrr, X86::VPSRAQZrm, 0 },
+ { X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 },
+ { X86::VPSRAVDZ128rr, X86::VPSRAVDZ128rm, 0 },
+ { X86::VPSRAVDZ256rr, X86::VPSRAVDZ256rm, 0 },
+ { X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 },
+ { X86::VPSRAVDrr, X86::VPSRAVDrm, 0 },
+ { X86::VPSRAVQZ128rr, X86::VPSRAVQZ128rm, 0 },
+ { X86::VPSRAVQZ256rr, X86::VPSRAVQZ256rm, 0 },
+ { X86::VPSRAVQZrr, X86::VPSRAVQZrm, 0 },
+ { X86::VPSRAVWZ128rr, X86::VPSRAVWZ128rm, 0 },
+ { X86::VPSRAVWZ256rr, X86::VPSRAVWZ256rm, 0 },
+ { X86::VPSRAVWZrr, X86::VPSRAVWZrm, 0 },
+ { X86::VPSRAWYrr, X86::VPSRAWYrm, 0 },
+ { X86::VPSRAWZ128rikz, X86::VPSRAWZ128mikz, 0 },
+ { X86::VPSRAWZ128rr, X86::VPSRAWZ128rm, 0 },
+ { X86::VPSRAWZ256rikz, X86::VPSRAWZ256mikz, 0 },
+ { X86::VPSRAWZ256rr, X86::VPSRAWZ256rm, 0 },
+ { X86::VPSRAWZrikz, X86::VPSRAWZmikz, 0 },
+ { X86::VPSRAWZrr, X86::VPSRAWZrm, 0 },
+ { X86::VPSRAWrr, X86::VPSRAWrm, 0 },
+ { X86::VPSRLDYrr, X86::VPSRLDYrm, 0 },
+ { X86::VPSRLDZ128rikz, X86::VPSRLDZ128mikz, 0 },
+ { X86::VPSRLDZ128rr, X86::VPSRLDZ128rm, 0 },
+ { X86::VPSRLDZ256rikz, X86::VPSRLDZ256mikz, 0 },
+ { X86::VPSRLDZ256rr, X86::VPSRLDZ256rm, 0 },
+ { X86::VPSRLDZrikz, X86::VPSRLDZmikz, 0 },
+ { X86::VPSRLDZrr, X86::VPSRLDZrm, 0 },
+ { X86::VPSRLDrr, X86::VPSRLDrm, 0 },
+ { X86::VPSRLQYrr, X86::VPSRLQYrm, 0 },
+ { X86::VPSRLQZ128rikz, X86::VPSRLQZ128mikz, 0 },
+ { X86::VPSRLQZ128rr, X86::VPSRLQZ128rm, 0 },
+ { X86::VPSRLQZ256rikz, X86::VPSRLQZ256mikz, 0 },
+ { X86::VPSRLQZ256rr, X86::VPSRLQZ256rm, 0 },
+ { X86::VPSRLQZrikz, X86::VPSRLQZmikz, 0 },
+ { X86::VPSRLQZrr, X86::VPSRLQZrm, 0 },
+ { X86::VPSRLQrr, X86::VPSRLQrm, 0 },
+ { X86::VPSRLVDYrr, X86::VPSRLVDYrm, 0 },
+ { X86::VPSRLVDZ128rr, X86::VPSRLVDZ128rm, 0 },
+ { X86::VPSRLVDZ256rr, X86::VPSRLVDZ256rm, 0 },
+ { X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 },
+ { X86::VPSRLVDrr, X86::VPSRLVDrm, 0 },
+ { X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 },
+ { X86::VPSRLVQZ128rr, X86::VPSRLVQZ128rm, 0 },
+ { X86::VPSRLVQZ256rr, X86::VPSRLVQZ256rm, 0 },
+ { X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 },
+ { X86::VPSRLVQrr, X86::VPSRLVQrm, 0 },
+ { X86::VPSRLVWZ128rr, X86::VPSRLVWZ128rm, 0 },
+ { X86::VPSRLVWZ256rr, X86::VPSRLVWZ256rm, 0 },
+ { X86::VPSRLVWZrr, X86::VPSRLVWZrm, 0 },
+ { X86::VPSRLWYrr, X86::VPSRLWYrm, 0 },
+ { X86::VPSRLWZ128rikz, X86::VPSRLWZ128mikz, 0 },
+ { X86::VPSRLWZ128rr, X86::VPSRLWZ128rm, 0 },
+ { X86::VPSRLWZ256rikz, X86::VPSRLWZ256mikz, 0 },
+ { X86::VPSRLWZ256rr, X86::VPSRLWZ256rm, 0 },
+ { X86::VPSRLWZrikz, X86::VPSRLWZmikz, 0 },
+ { X86::VPSRLWZrr, X86::VPSRLWZrm, 0 },
+ { X86::VPSRLWrr, X86::VPSRLWrm, 0 },
+ { X86::VPSUBBYrr, X86::VPSUBBYrm, 0 },
+ { X86::VPSUBBZ128rr, X86::VPSUBBZ128rm, 0 },
+ { X86::VPSUBBZ256rr, X86::VPSUBBZ256rm, 0 },
+ { X86::VPSUBBZrr, X86::VPSUBBZrm, 0 },
+ { X86::VPSUBBrr, X86::VPSUBBrm, 0 },
+ { X86::VPSUBDYrr, X86::VPSUBDYrm, 0 },
+ { X86::VPSUBDZ128rr, X86::VPSUBDZ128rm, 0 },
+ { X86::VPSUBDZ256rr, X86::VPSUBDZ256rm, 0 },
+ { X86::VPSUBDZrr, X86::VPSUBDZrm, 0 },
+ { X86::VPSUBDrr, X86::VPSUBDrm, 0 },
+ { X86::VPSUBQYrr, X86::VPSUBQYrm, 0 },
+ { X86::VPSUBQZ128rr, X86::VPSUBQZ128rm, 0 },
+ { X86::VPSUBQZ256rr, X86::VPSUBQZ256rm, 0 },
+ { X86::VPSUBQZrr, X86::VPSUBQZrm, 0 },
+ { X86::VPSUBQrr, X86::VPSUBQrm, 0 },
+ { X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 },
+ { X86::VPSUBSBZ128rr, X86::VPSUBSBZ128rm, 0 },
+ { X86::VPSUBSBZ256rr, X86::VPSUBSBZ256rm, 0 },
+ { X86::VPSUBSBZrr, X86::VPSUBSBZrm, 0 },
+ { X86::VPSUBSBrr, X86::VPSUBSBrm, 0 },
+ { X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 },
+ { X86::VPSUBSWZ128rr, X86::VPSUBSWZ128rm, 0 },
+ { X86::VPSUBSWZ256rr, X86::VPSUBSWZ256rm, 0 },
+ { X86::VPSUBSWZrr, X86::VPSUBSWZrm, 0 },
+ { X86::VPSUBSWrr, X86::VPSUBSWrm, 0 },
+ { X86::VPSUBUSBYrr, X86::VPSUBUSBYrm, 0 },
+ { X86::VPSUBUSBZ128rr, X86::VPSUBUSBZ128rm, 0 },
+ { X86::VPSUBUSBZ256rr, X86::VPSUBUSBZ256rm, 0 },
+ { X86::VPSUBUSBZrr, X86::VPSUBUSBZrm, 0 },
+ { X86::VPSUBUSBrr, X86::VPSUBUSBrm, 0 },
+ { X86::VPSUBUSWYrr, X86::VPSUBUSWYrm, 0 },
+ { X86::VPSUBUSWZ128rr, X86::VPSUBUSWZ128rm, 0 },
+ { X86::VPSUBUSWZ256rr, X86::VPSUBUSWZ256rm, 0 },
+ { X86::VPSUBUSWZrr, X86::VPSUBUSWZrm, 0 },
+ { X86::VPSUBUSWrr, X86::VPSUBUSWrm, 0 },
+ { X86::VPSUBWYrr, X86::VPSUBWYrm, 0 },
+ { X86::VPSUBWZ128rr, X86::VPSUBWZ128rm, 0 },
+ { X86::VPSUBWZ256rr, X86::VPSUBWZ256rm, 0 },
+ { X86::VPSUBWZrr, X86::VPSUBWZrm, 0 },
+ { X86::VPSUBWrr, X86::VPSUBWrm, 0 },
+ { X86::VPTESTMBZ128rr, X86::VPTESTMBZ128rm, 0 },
+ { X86::VPTESTMBZ256rr, X86::VPTESTMBZ256rm, 0 },
+ { X86::VPTESTMBZrr, X86::VPTESTMBZrm, 0 },
+ { X86::VPTESTMDZ128rr, X86::VPTESTMDZ128rm, 0 },
+ { X86::VPTESTMDZ256rr, X86::VPTESTMDZ256rm, 0 },
+ { X86::VPTESTMDZrr, X86::VPTESTMDZrm, 0 },
+ { X86::VPTESTMQZ128rr, X86::VPTESTMQZ128rm, 0 },
+ { X86::VPTESTMQZ256rr, X86::VPTESTMQZ256rm, 0 },
+ { X86::VPTESTMQZrr, X86::VPTESTMQZrm, 0 },
+ { X86::VPTESTMWZ128rr, X86::VPTESTMWZ128rm, 0 },
+ { X86::VPTESTMWZ256rr, X86::VPTESTMWZ256rm, 0 },
+ { X86::VPTESTMWZrr, X86::VPTESTMWZrm, 0 },
+ { X86::VPTESTNMBZ128rr, X86::VPTESTNMBZ128rm, 0 },
+ { X86::VPTESTNMBZ256rr, X86::VPTESTNMBZ256rm, 0 },
+ { X86::VPTESTNMBZrr, X86::VPTESTNMBZrm, 0 },
+ { X86::VPTESTNMDZ128rr, X86::VPTESTNMDZ128rm, 0 },
+ { X86::VPTESTNMDZ256rr, X86::VPTESTNMDZ256rm, 0 },
+ { X86::VPTESTNMDZrr, X86::VPTESTNMDZrm, 0 },
+ { X86::VPTESTNMQZ128rr, X86::VPTESTNMQZ128rm, 0 },
+ { X86::VPTESTNMQZ256rr, X86::VPTESTNMQZ256rm, 0 },
+ { X86::VPTESTNMQZrr, X86::VPTESTNMQZrm, 0 },
+ { X86::VPTESTNMWZ128rr, X86::VPTESTNMWZ128rm, 0 },
+ { X86::VPTESTNMWZ256rr, X86::VPTESTNMWZ256rm, 0 },
+ { X86::VPTESTNMWZrr, X86::VPTESTNMWZrm, 0 },
+ { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 },
+ { X86::VPUNPCKHBWZ128rr, X86::VPUNPCKHBWZ128rm, 0 },
+ { X86::VPUNPCKHBWZ256rr, X86::VPUNPCKHBWZ256rm, 0 },
+ { X86::VPUNPCKHBWZrr, X86::VPUNPCKHBWZrm, 0 },
+ { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 },
+ { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 },
+ { X86::VPUNPCKHDQZ128rr, X86::VPUNPCKHDQZ128rm, 0 },
+ { X86::VPUNPCKHDQZ256rr, X86::VPUNPCKHDQZ256rm, 0 },
+ { X86::VPUNPCKHDQZrr, X86::VPUNPCKHDQZrm, 0 },
+ { X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 },
+ { X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, 0 },
+ { X86::VPUNPCKHQDQZ128rr, X86::VPUNPCKHQDQZ128rm, 0 },
+ { X86::VPUNPCKHQDQZ256rr, X86::VPUNPCKHQDQZ256rm, 0 },
+ { X86::VPUNPCKHQDQZrr, X86::VPUNPCKHQDQZrm, 0 },
+ { X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, 0 },
+ { X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, 0 },
+ { X86::VPUNPCKHWDZ128rr, X86::VPUNPCKHWDZ128rm, 0 },
+ { X86::VPUNPCKHWDZ256rr, X86::VPUNPCKHWDZ256rm, 0 },
+ { X86::VPUNPCKHWDZrr, X86::VPUNPCKHWDZrm, 0 },
+ { X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, 0 },
+ { X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, 0 },
+ { X86::VPUNPCKLBWZ128rr, X86::VPUNPCKLBWZ128rm, 0 },
+ { X86::VPUNPCKLBWZ256rr, X86::VPUNPCKLBWZ256rm, 0 },
+ { X86::VPUNPCKLBWZrr, X86::VPUNPCKLBWZrm, 0 },
+ { X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, 0 },
+ { X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, 0 },
+ { X86::VPUNPCKLDQZ128rr, X86::VPUNPCKLDQZ128rm, 0 },
+ { X86::VPUNPCKLDQZ256rr, X86::VPUNPCKLDQZ256rm, 0 },
+ { X86::VPUNPCKLDQZrr, X86::VPUNPCKLDQZrm, 0 },
+ { X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, 0 },
+ { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 },
+ { X86::VPUNPCKLQDQZ128rr, X86::VPUNPCKLQDQZ128rm, 0 },
+ { X86::VPUNPCKLQDQZ256rr, X86::VPUNPCKLQDQZ256rm, 0 },
+ { X86::VPUNPCKLQDQZrr, X86::VPUNPCKLQDQZrm, 0 },
+ { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 },
+ { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, 0 },
+ { X86::VPUNPCKLWDZ128rr, X86::VPUNPCKLWDZ128rm, 0 },
+ { X86::VPUNPCKLWDZ256rr, X86::VPUNPCKLWDZ256rm, 0 },
+ { X86::VPUNPCKLWDZrr, X86::VPUNPCKLWDZrm, 0 },
+ { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 },
+ { X86::VPXORDZ128rr, X86::VPXORDZ128rm, 0 },
+ { X86::VPXORDZ256rr, X86::VPXORDZ256rm, 0 },
+ { X86::VPXORDZrr, X86::VPXORDZrm, 0 },
+ { X86::VPXORQZ128rr, X86::VPXORQZ128rm, 0 },
+ { X86::VPXORQZ256rr, X86::VPXORQZ256rm, 0 },
+ { X86::VPXORQZrr, X86::VPXORQZrm, 0 },
+ { X86::VPXORYrr, X86::VPXORYrm, 0 },
+ { X86::VPXORrr, X86::VPXORrm, 0 },
+ { X86::VRANGEPDZ128rri, X86::VRANGEPDZ128rmi, 0 },
+ { X86::VRANGEPDZ256rri, X86::VRANGEPDZ256rmi, 0 },
+ { X86::VRANGEPDZrri, X86::VRANGEPDZrmi, 0 },
+ { X86::VRANGEPSZ128rri, X86::VRANGEPSZ128rmi, 0 },
+ { X86::VRANGEPSZ256rri, X86::VRANGEPSZ256rmi, 0 },
+ { X86::VRANGEPSZrri, X86::VRANGEPSZrmi, 0 },
+ { X86::VRANGESDZrri, X86::VRANGESDZrmi, TB_NO_REVERSE },
+ { X86::VRANGESSZrri, X86::VRANGESSZrmi, TB_NO_REVERSE },
+ { X86::VRCP14PDZ128rkz, X86::VRCP14PDZ128mkz, 0 },
+ { X86::VRCP14PDZ256rkz, X86::VRCP14PDZ256mkz, 0 },
+ { X86::VRCP14PDZrkz, X86::VRCP14PDZmkz, 0 },
+ { X86::VRCP14PSZ128rkz, X86::VRCP14PSZ128mkz, 0 },
+ { X86::VRCP14PSZ256rkz, X86::VRCP14PSZ256mkz, 0 },
+ { X86::VRCP14PSZrkz, X86::VRCP14PSZmkz, 0 },
+ { X86::VRCP14SDZrr, X86::VRCP14SDZrm, TB_NO_REVERSE },
+ { X86::VRCP14SSZrr, X86::VRCP14SSZrm, TB_NO_REVERSE },
+ { X86::VRCP28PDZrkz, X86::VRCP28PDZmkz, 0 },
+ { X86::VRCP28PSZrkz, X86::VRCP28PSZmkz, 0 },
+ { X86::VRCP28SDZr, X86::VRCP28SDZm, TB_NO_REVERSE },
+ { X86::VRCP28SSZr, X86::VRCP28SSZm, TB_NO_REVERSE },
+ { X86::VRCPSSr, X86::VRCPSSm, 0 },
+ { X86::VRCPSSr_Int, X86::VRCPSSm_Int, TB_NO_REVERSE },
+ { X86::VREDUCEPDZ128rrikz, X86::VREDUCEPDZ128rmikz, 0 },
+ { X86::VREDUCEPDZ256rrikz, X86::VREDUCEPDZ256rmikz, 0 },
+ { X86::VREDUCEPDZrrikz, X86::VREDUCEPDZrmikz, 0 },
+ { X86::VREDUCEPSZ128rrikz, X86::VREDUCEPSZ128rmikz, 0 },
+ { X86::VREDUCEPSZ256rrikz, X86::VREDUCEPSZ256rmikz, 0 },
+ { X86::VREDUCEPSZrrikz, X86::VREDUCEPSZrmikz, 0 },
+ { X86::VREDUCESDZrri, X86::VREDUCESDZrmi, TB_NO_REVERSE },
+ { X86::VREDUCESSZrri, X86::VREDUCESSZrmi, TB_NO_REVERSE },
+ { X86::VRNDSCALEPDZ128rrikz, X86::VRNDSCALEPDZ128rmikz, 0 },
+ { X86::VRNDSCALEPDZ256rrikz, X86::VRNDSCALEPDZ256rmikz, 0 },
+ { X86::VRNDSCALEPDZrrikz, X86::VRNDSCALEPDZrmikz, 0 },
+ { X86::VRNDSCALEPSZ128rrikz, X86::VRNDSCALEPSZ128rmikz, 0 },
+ { X86::VRNDSCALEPSZ256rrikz, X86::VRNDSCALEPSZ256rmikz, 0 },
+ { X86::VRNDSCALEPSZrrikz, X86::VRNDSCALEPSZrmikz, 0 },
+ { X86::VRNDSCALESDZr, X86::VRNDSCALESDZm, 0 },
+ { X86::VRNDSCALESDZr_Int, X86::VRNDSCALESDZm_Int, TB_NO_REVERSE },
+ { X86::VRNDSCALESSZr, X86::VRNDSCALESSZm, 0 },
+ { X86::VRNDSCALESSZr_Int, X86::VRNDSCALESSZm_Int, TB_NO_REVERSE },
+ { X86::VROUNDSDr, X86::VROUNDSDm, 0 },
+ { X86::VROUNDSDr_Int, X86::VROUNDSDm_Int, TB_NO_REVERSE },
+ { X86::VROUNDSSr, X86::VROUNDSSm, 0 },
+ { X86::VROUNDSSr_Int, X86::VROUNDSSm_Int, TB_NO_REVERSE },
+ { X86::VRSQRT14PDZ128rkz, X86::VRSQRT14PDZ128mkz, 0 },
+ { X86::VRSQRT14PDZ256rkz, X86::VRSQRT14PDZ256mkz, 0 },
+ { X86::VRSQRT14PDZrkz, X86::VRSQRT14PDZmkz, 0 },
+ { X86::VRSQRT14PSZ128rkz, X86::VRSQRT14PSZ128mkz, 0 },
+ { X86::VRSQRT14PSZ256rkz, X86::VRSQRT14PSZ256mkz, 0 },
+ { X86::VRSQRT14PSZrkz, X86::VRSQRT14PSZmkz, 0 },
+ { X86::VRSQRT14SDZrr, X86::VRSQRT14SDZrm, TB_NO_REVERSE },
+ { X86::VRSQRT14SSZrr, X86::VRSQRT14SSZrm, TB_NO_REVERSE },
+ { X86::VRSQRT28PDZrkz, X86::VRSQRT28PDZmkz, 0 },
+ { X86::VRSQRT28PSZrkz, X86::VRSQRT28PSZmkz, 0 },
+ { X86::VRSQRT28SDZr, X86::VRSQRT28SDZm, TB_NO_REVERSE },
+ { X86::VRSQRT28SSZr, X86::VRSQRT28SSZm, TB_NO_REVERSE },
+ { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
+ { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, TB_NO_REVERSE },
+ { X86::VSCALEFPDZ128rr, X86::VSCALEFPDZ128rm, 0 },
+ { X86::VSCALEFPDZ256rr, X86::VSCALEFPDZ256rm, 0 },
+ { X86::VSCALEFPDZrr, X86::VSCALEFPDZrm, 0 },
+ { X86::VSCALEFPSZ128rr, X86::VSCALEFPSZ128rm, 0 },
+ { X86::VSCALEFPSZ256rr, X86::VSCALEFPSZ256rm, 0 },
+ { X86::VSCALEFPSZrr, X86::VSCALEFPSZrm, 0 },
+ { X86::VSCALEFSDZrr, X86::VSCALEFSDZrm, TB_NO_REVERSE },
+ { X86::VSCALEFSSZrr, X86::VSCALEFSSZrm, TB_NO_REVERSE },
+ { X86::VSHUFF32X4Z256rri, X86::VSHUFF32X4Z256rmi, 0 },
+ { X86::VSHUFF32X4Zrri, X86::VSHUFF32X4Zrmi, 0 },
+ { X86::VSHUFF64X2Z256rri, X86::VSHUFF64X2Z256rmi, 0 },
+ { X86::VSHUFF64X2Zrri, X86::VSHUFF64X2Zrmi, 0 },
+ { X86::VSHUFI32X4Z256rri, X86::VSHUFI32X4Z256rmi, 0 },
+ { X86::VSHUFI32X4Zrri, X86::VSHUFI32X4Zrmi, 0 },
+ { X86::VSHUFI64X2Z256rri, X86::VSHUFI64X2Z256rmi, 0 },
+ { X86::VSHUFI64X2Zrri, X86::VSHUFI64X2Zrmi, 0 },
+ { X86::VSHUFPDYrri, X86::VSHUFPDYrmi, 0 },
+ { X86::VSHUFPDZ128rri, X86::VSHUFPDZ128rmi, 0 },
+ { X86::VSHUFPDZ256rri, X86::VSHUFPDZ256rmi, 0 },
+ { X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 },
+ { X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 },
+ { X86::VSHUFPSYrri, X86::VSHUFPSYrmi, 0 },
+ { X86::VSHUFPSZ128rri, X86::VSHUFPSZ128rmi, 0 },
+ { X86::VSHUFPSZ256rri, X86::VSHUFPSZ256rmi, 0 },
+ { X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 },
+ { X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 },
+ { X86::VSQRTPDZ128rkz, X86::VSQRTPDZ128mkz, 0 },
+ { X86::VSQRTPDZ256rkz, X86::VSQRTPDZ256mkz, 0 },
+ { X86::VSQRTPDZrkz, X86::VSQRTPDZmkz, 0 },
+ { X86::VSQRTPSZ128rkz, X86::VSQRTPSZ128mkz, 0 },
+ { X86::VSQRTPSZ256rkz, X86::VSQRTPSZ256mkz, 0 },
+ { X86::VSQRTPSZrkz, X86::VSQRTPSZmkz, 0 },
+ { X86::VSQRTSDZr, X86::VSQRTSDZm, 0 },
+ { X86::VSQRTSDZr_Int, X86::VSQRTSDZm_Int, TB_NO_REVERSE },
+ { X86::VSQRTSDr, X86::VSQRTSDm, 0 },
+ { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, TB_NO_REVERSE },
+ { X86::VSQRTSSZr, X86::VSQRTSSZm, 0 },
+ { X86::VSQRTSSZr_Int, X86::VSQRTSSZm_Int, TB_NO_REVERSE },
+ { X86::VSQRTSSr, X86::VSQRTSSm, 0 },
+ { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, TB_NO_REVERSE },
+ { X86::VSUBPDYrr, X86::VSUBPDYrm, 0 },
+ { X86::VSUBPDZ128rr, X86::VSUBPDZ128rm, 0 },
+ { X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0 },
+ { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 },
+ { X86::VSUBPDrr, X86::VSUBPDrm, 0 },
+ { X86::VSUBPSYrr, X86::VSUBPSYrm, 0 },
+ { X86::VSUBPSZ128rr, X86::VSUBPSZ128rm, 0 },
+ { X86::VSUBPSZ256rr, X86::VSUBPSZ256rm, 0 },
+ { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 },
+ { X86::VSUBPSrr, X86::VSUBPSrm, 0 },
+ { X86::VSUBSDZrr, X86::VSUBSDZrm, 0 },
+ { X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, TB_NO_REVERSE },
+ { X86::VSUBSDrr, X86::VSUBSDrm, 0 },
+ { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, TB_NO_REVERSE },
+ { X86::VSUBSSZrr, X86::VSUBSSZrm, 0 },
+ { X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, TB_NO_REVERSE },
+ { X86::VSUBSSrr, X86::VSUBSSrm, 0 },
+ { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, TB_NO_REVERSE },
+ { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, 0 },
+ { X86::VUNPCKHPDZ128rr, X86::VUNPCKHPDZ128rm, 0 },
+ { X86::VUNPCKHPDZ256rr, X86::VUNPCKHPDZ256rm, 0 },
+ { X86::VUNPCKHPDZrr, X86::VUNPCKHPDZrm, 0 },
+ { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 },
+ { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, 0 },
+ { X86::VUNPCKHPSZ128rr, X86::VUNPCKHPSZ128rm, 0 },
+ { X86::VUNPCKHPSZ256rr, X86::VUNPCKHPSZ256rm, 0 },
+ { X86::VUNPCKHPSZrr, X86::VUNPCKHPSZrm, 0 },
+ { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 },
+ { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, 0 },
+ { X86::VUNPCKLPDZ128rr, X86::VUNPCKLPDZ128rm, 0 },
+ { X86::VUNPCKLPDZ256rr, X86::VUNPCKLPDZ256rm, 0 },
+ { X86::VUNPCKLPDZrr, X86::VUNPCKLPDZrm, 0 },
+ { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 },
+ { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, 0 },
+ { X86::VUNPCKLPSZ128rr, X86::VUNPCKLPSZ128rm, 0 },
+ { X86::VUNPCKLPSZ256rr, X86::VUNPCKLPSZ256rm, 0 },
+ { X86::VUNPCKLPSZrr, X86::VUNPCKLPSZrm, 0 },
+ { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, 0 },
+ { X86::VXORPDYrr, X86::VXORPDYrm, 0 },
+ { X86::VXORPDZ128rr, X86::VXORPDZ128rm, 0 },
+ { X86::VXORPDZ256rr, X86::VXORPDZ256rm, 0 },
+ { X86::VXORPDZrr, X86::VXORPDZrm, 0 },
+ { X86::VXORPDrr, X86::VXORPDrm, 0 },
+ { X86::VXORPSYrr, X86::VXORPSYrm, 0 },
+ { X86::VXORPSZ128rr, X86::VXORPSZ128rm, 0 },
+ { X86::VXORPSZ256rr, X86::VXORPSZ256rm, 0 },
+ { X86::VXORPSZrr, X86::VXORPSZrm, 0 },
+ { X86::VXORPSrr, X86::VXORPSrm, 0 },
+ { X86::XOR16rr, X86::XOR16rm, 0 },
+ { X86::XOR32rr, X86::XOR32rm, 0 },
+ { X86::XOR64rr, X86::XOR64rm, 0 },
+ { X86::XOR8rr, X86::XOR8rm, 0 },
+ { X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16 },
+ { X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16 },
+};
+
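(Editorial note, not part of the diff.) The block above closes one fold table; MemoryFoldTable3 below begins the next group of entries. Each row pairs a register-form opcode with its memory-form equivalent, plus flags such as TB_NO_REVERSE (the fold cannot be undone back to the register form) or TB_ALIGN_16 (the memory operand must be 16-byte aligned). The tables are kept sorted by the register-form opcode so a lookup can binary-search them. The snippet below is a minimal sketch of such a lookup; the struct name, field names, and helper name are assumptions chosen for the example, not the identifiers used in X86InstrFoldTables.cpp.

// Illustrative sketch only: query a fold table that is sorted by the
// register-form opcode, as the tables in this diff are.
#include <algorithm>
#include <cstddef>
#include <cstdint>

struct FoldTableEntry {      // hypothetical stand-in for the real entry type
  uint16_t RegOp;            // register-form opcode (e.g. a *rr instruction)
  uint16_t MemOp;            // memory-form opcode   (e.g. the matching *rm)
  uint16_t Flags;            // 0, or bits like TB_NO_REVERSE / TB_ALIGN_16
};

// Binary-search for RegOp; return the matching entry or nullptr.
inline const FoldTableEntry *
lookupMemoryFold(const FoldTableEntry *Table, size_t Size, uint16_t RegOp) {
  const FoldTableEntry *End = Table + Size;
  const FoldTableEntry *I = std::lower_bound(
      Table, End, RegOp,
      [](const FoldTableEntry &E, uint16_t Op) { return E.RegOp < Op; });
  return (I != End && I->RegOp == RegOp) ? I : nullptr;
}

A caller would then check the entry's Flags before rewriting the instruction, e.g. skipping the fold when alignment cannot be proven for a TB_ALIGN_16 entry.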
+static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
+ { X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 },
+ { X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 },
+ { X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 },
+ { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 },
+ { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 },
+ { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 },
+ { X86::VADDSDZrr_Intkz, X86::VADDSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VADDSSZrr_Intkz, X86::VADDSSZrm_Intkz, TB_NO_REVERSE },
+ { X86::VALIGNDZ128rrikz, X86::VALIGNDZ128rmikz, 0 },
+ { X86::VALIGNDZ256rrikz, X86::VALIGNDZ256rmikz, 0 },
+ { X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0 },
+ { X86::VALIGNQZ128rrikz, X86::VALIGNQZ128rmikz, 0 },
+ { X86::VALIGNQZ256rrikz, X86::VALIGNQZ256rmikz, 0 },
+ { X86::VALIGNQZrrikz, X86::VALIGNQZrmikz, 0 },
+ { X86::VANDNPDZ128rrkz, X86::VANDNPDZ128rmkz, 0 },
+ { X86::VANDNPDZ256rrkz, X86::VANDNPDZ256rmkz, 0 },
+ { X86::VANDNPDZrrkz, X86::VANDNPDZrmkz, 0 },
+ { X86::VANDNPSZ128rrkz, X86::VANDNPSZ128rmkz, 0 },
+ { X86::VANDNPSZ256rrkz, X86::VANDNPSZ256rmkz, 0 },
+ { X86::VANDNPSZrrkz, X86::VANDNPSZrmkz, 0 },
+ { X86::VANDPDZ128rrkz, X86::VANDPDZ128rmkz, 0 },
+ { X86::VANDPDZ256rrkz, X86::VANDPDZ256rmkz, 0 },
+ { X86::VANDPDZrrkz, X86::VANDPDZrmkz, 0 },
+ { X86::VANDPSZ128rrkz, X86::VANDPSZ128rmkz, 0 },
+ { X86::VANDPSZ256rrkz, X86::VANDPSZ256rmkz, 0 },
+ { X86::VANDPSZrrkz, X86::VANDPSZrmkz, 0 },
+ { X86::VBLENDMPDZ128rrk, X86::VBLENDMPDZ128rmk, 0 },
+ { X86::VBLENDMPDZ256rrk, X86::VBLENDMPDZ256rmk, 0 },
+ { X86::VBLENDMPDZrrk, X86::VBLENDMPDZrmk, 0 },
+ { X86::VBLENDMPSZ128rrk, X86::VBLENDMPSZ128rmk, 0 },
+ { X86::VBLENDMPSZ256rrk, X86::VBLENDMPSZ256rmk, 0 },
+ { X86::VBLENDMPSZrrk, X86::VBLENDMPSZrmk, 0 },
+ { X86::VBROADCASTF32X2Z256rrk, X86::VBROADCASTF32X2Z256rmk, TB_NO_REVERSE },
+ { X86::VBROADCASTF32X2Zrrk, X86::VBROADCASTF32X2Zrmk, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z128rrk, X86::VBROADCASTI32X2Z128rmk, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z256rrk, X86::VBROADCASTI32X2Z256rmk, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Zrrk, X86::VBROADCASTI32X2Zrmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256rrk, X86::VBROADCASTSDZ256rmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZrrk, X86::VBROADCASTSDZrmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ128rrk, X86::VBROADCASTSSZ128rmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256rrk, X86::VBROADCASTSSZ256rmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZrrk, X86::VBROADCASTSSZrmk, TB_NO_REVERSE },
+ { X86::VCMPPDZ128rrik, X86::VCMPPDZ128rmik, 0 },
+ { X86::VCMPPDZ256rrik, X86::VCMPPDZ256rmik, 0 },
+ { X86::VCMPPDZrrik, X86::VCMPPDZrmik, 0 },
+ { X86::VCMPPSZ128rrik, X86::VCMPPSZ128rmik, 0 },
+ { X86::VCMPPSZ256rrik, X86::VCMPPSZ256rmik, 0 },
+ { X86::VCMPPSZrrik, X86::VCMPPSZrmik, 0 },
+ { X86::VCMPSDZrr_Intk, X86::VCMPSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VCMPSSZrr_Intk, X86::VCMPSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDZ128rrk, X86::VCVTDQ2PDZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDZ256rrk, X86::VCVTDQ2PDZ256rmk, 0 },
+ { X86::VCVTDQ2PDZrrk, X86::VCVTDQ2PDZrmk, 0 },
+ { X86::VCVTDQ2PSZ128rrk, X86::VCVTDQ2PSZ128rmk, 0 },
+ { X86::VCVTDQ2PSZ256rrk, X86::VCVTDQ2PSZ256rmk, 0 },
+ { X86::VCVTDQ2PSZrrk, X86::VCVTDQ2PSZrmk, 0 },
+ { X86::VCVTNE2PS2BF16Z128rrkz, X86::VCVTNE2PS2BF16Z128rmkz, 0 },
+ { X86::VCVTNE2PS2BF16Z256rrkz, X86::VCVTNE2PS2BF16Z256rmkz, 0 },
+ { X86::VCVTNE2PS2BF16Zrrkz, X86::VCVTNE2PS2BF16Zrmkz, 0 },
+ { X86::VCVTNEPS2BF16Z128rrk, X86::VCVTNEPS2BF16Z128rmk, 0 },
+ { X86::VCVTNEPS2BF16Z256rrk, X86::VCVTNEPS2BF16Z256rmk, 0 },
+ { X86::VCVTNEPS2BF16Zrrk, X86::VCVTNEPS2BF16Zrmk, 0 },
+ { X86::VCVTPD2DQZ128rrk, X86::VCVTPD2DQZ128rmk, 0 },
+ { X86::VCVTPD2DQZ256rrk, X86::VCVTPD2DQZ256rmk, 0 },
+ { X86::VCVTPD2DQZrrk, X86::VCVTPD2DQZrmk, 0 },
+ { X86::VCVTPD2PSZ128rrk, X86::VCVTPD2PSZ128rmk, 0 },
+ { X86::VCVTPD2PSZ256rrk, X86::VCVTPD2PSZ256rmk, 0 },
+ { X86::VCVTPD2PSZrrk, X86::VCVTPD2PSZrmk, 0 },
+ { X86::VCVTPD2QQZ128rrk, X86::VCVTPD2QQZ128rmk, 0 },
+ { X86::VCVTPD2QQZ256rrk, X86::VCVTPD2QQZ256rmk, 0 },
+ { X86::VCVTPD2QQZrrk, X86::VCVTPD2QQZrmk, 0 },
+ { X86::VCVTPD2UDQZ128rrk, X86::VCVTPD2UDQZ128rmk, 0 },
+ { X86::VCVTPD2UDQZ256rrk, X86::VCVTPD2UDQZ256rmk, 0 },
+ { X86::VCVTPD2UDQZrrk, X86::VCVTPD2UDQZrmk, 0 },
+ { X86::VCVTPD2UQQZ128rrk, X86::VCVTPD2UQQZ128rmk, 0 },
+ { X86::VCVTPD2UQQZ256rrk, X86::VCVTPD2UQQZ256rmk, 0 },
+ { X86::VCVTPD2UQQZrrk, X86::VCVTPD2UQQZrmk, 0 },
+ { X86::VCVTPH2PSZ128rrk, X86::VCVTPH2PSZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTPH2PSZ256rrk, X86::VCVTPH2PSZ256rmk, 0 },
+ { X86::VCVTPH2PSZrrk, X86::VCVTPH2PSZrmk, 0 },
+ { X86::VCVTPS2DQZ128rrk, X86::VCVTPS2DQZ128rmk, 0 },
+ { X86::VCVTPS2DQZ256rrk, X86::VCVTPS2DQZ256rmk, 0 },
+ { X86::VCVTPS2DQZrrk, X86::VCVTPS2DQZrmk, 0 },
+ { X86::VCVTPS2PDZ128rrk, X86::VCVTPS2PDZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTPS2PDZ256rrk, X86::VCVTPS2PDZ256rmk, 0 },
+ { X86::VCVTPS2PDZrrk, X86::VCVTPS2PDZrmk, 0 },
+ { X86::VCVTPS2QQZ128rrk, X86::VCVTPS2QQZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTPS2QQZ256rrk, X86::VCVTPS2QQZ256rmk, 0 },
+ { X86::VCVTPS2QQZrrk, X86::VCVTPS2QQZrmk, 0 },
+ { X86::VCVTPS2UDQZ128rrk, X86::VCVTPS2UDQZ128rmk, 0 },
+ { X86::VCVTPS2UDQZ256rrk, X86::VCVTPS2UDQZ256rmk, 0 },
+ { X86::VCVTPS2UDQZrrk, X86::VCVTPS2UDQZrmk, 0 },
+ { X86::VCVTPS2UQQZ128rrk, X86::VCVTPS2UQQZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTPS2UQQZ256rrk, X86::VCVTPS2UQQZ256rmk, 0 },
+ { X86::VCVTPS2UQQZrrk, X86::VCVTPS2UQQZrmk, 0 },
+ { X86::VCVTQQ2PDZ128rrk, X86::VCVTQQ2PDZ128rmk, 0 },
+ { X86::VCVTQQ2PDZ256rrk, X86::VCVTQQ2PDZ256rmk, 0 },
+ { X86::VCVTQQ2PDZrrk, X86::VCVTQQ2PDZrmk, 0 },
+ { X86::VCVTQQ2PSZ128rrk, X86::VCVTQQ2PSZ128rmk, 0 },
+ { X86::VCVTQQ2PSZ256rrk, X86::VCVTQQ2PSZ256rmk, 0 },
+ { X86::VCVTQQ2PSZrrk, X86::VCVTQQ2PSZrmk, 0 },
+ { X86::VCVTSD2SSZrr_Intkz, X86::VCVTSD2SSZrm_Intkz, TB_NO_REVERSE },
+ { X86::VCVTSS2SDZrr_Intkz, X86::VCVTSS2SDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VCVTTPD2DQZ128rrk, X86::VCVTTPD2DQZ128rmk, 0 },
+ { X86::VCVTTPD2DQZ256rrk, X86::VCVTTPD2DQZ256rmk, 0 },
+ { X86::VCVTTPD2DQZrrk, X86::VCVTTPD2DQZrmk, 0 },
+ { X86::VCVTTPD2QQZ128rrk, X86::VCVTTPD2QQZ128rmk, 0 },
+ { X86::VCVTTPD2QQZ256rrk, X86::VCVTTPD2QQZ256rmk, 0 },
+ { X86::VCVTTPD2QQZrrk, X86::VCVTTPD2QQZrmk, 0 },
+ { X86::VCVTTPD2UDQZ128rrk, X86::VCVTTPD2UDQZ128rmk, 0 },
+ { X86::VCVTTPD2UDQZ256rrk, X86::VCVTTPD2UDQZ256rmk, 0 },
+ { X86::VCVTTPD2UDQZrrk, X86::VCVTTPD2UDQZrmk, 0 },
+ { X86::VCVTTPD2UQQZ128rrk, X86::VCVTTPD2UQQZ128rmk, 0 },
+ { X86::VCVTTPD2UQQZ256rrk, X86::VCVTTPD2UQQZ256rmk, 0 },
+ { X86::VCVTTPD2UQQZrrk, X86::VCVTTPD2UQQZrmk, 0 },
+ { X86::VCVTTPS2DQZ128rrk, X86::VCVTTPS2DQZ128rmk, 0 },
+ { X86::VCVTTPS2DQZ256rrk, X86::VCVTTPS2DQZ256rmk, 0 },
+ { X86::VCVTTPS2DQZrrk, X86::VCVTTPS2DQZrmk, 0 },
+ { X86::VCVTTPS2QQZ128rrk, X86::VCVTTPS2QQZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTTPS2QQZ256rrk, X86::VCVTTPS2QQZ256rmk, 0 },
+ { X86::VCVTTPS2QQZrrk, X86::VCVTTPS2QQZrmk, 0 },
+ { X86::VCVTTPS2UDQZ128rrk, X86::VCVTTPS2UDQZ128rmk, 0 },
+ { X86::VCVTTPS2UDQZ256rrk, X86::VCVTTPS2UDQZ256rmk, 0 },
+ { X86::VCVTTPS2UDQZrrk, X86::VCVTTPS2UDQZrmk, 0 },
+ { X86::VCVTTPS2UQQZ128rrk, X86::VCVTTPS2UQQZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTTPS2UQQZ256rrk, X86::VCVTTPS2UQQZ256rmk, 0 },
+ { X86::VCVTTPS2UQQZrrk, X86::VCVTTPS2UQQZrmk, 0 },
+ { X86::VCVTUDQ2PDZ128rrk, X86::VCVTUDQ2PDZ128rmk, TB_NO_REVERSE },
+ { X86::VCVTUDQ2PDZ256rrk, X86::VCVTUDQ2PDZ256rmk, 0 },
+ { X86::VCVTUDQ2PDZrrk, X86::VCVTUDQ2PDZrmk, 0 },
+ { X86::VCVTUDQ2PSZ128rrk, X86::VCVTUDQ2PSZ128rmk, 0 },
+ { X86::VCVTUDQ2PSZ256rrk, X86::VCVTUDQ2PSZ256rmk, 0 },
+ { X86::VCVTUDQ2PSZrrk, X86::VCVTUDQ2PSZrmk, 0 },
+ { X86::VCVTUQQ2PDZ128rrk, X86::VCVTUQQ2PDZ128rmk, 0 },
+ { X86::VCVTUQQ2PDZ256rrk, X86::VCVTUQQ2PDZ256rmk, 0 },
+ { X86::VCVTUQQ2PDZrrk, X86::VCVTUQQ2PDZrmk, 0 },
+ { X86::VCVTUQQ2PSZ128rrk, X86::VCVTUQQ2PSZ128rmk, 0 },
+ { X86::VCVTUQQ2PSZ256rrk, X86::VCVTUQQ2PSZ256rmk, 0 },
+ { X86::VCVTUQQ2PSZrrk, X86::VCVTUQQ2PSZrmk, 0 },
+ { X86::VDBPSADBWZ128rrikz, X86::VDBPSADBWZ128rmikz, 0 },
+ { X86::VDBPSADBWZ256rrikz, X86::VDBPSADBWZ256rmikz, 0 },
+ { X86::VDBPSADBWZrrikz, X86::VDBPSADBWZrmikz, 0 },
+ { X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 },
+ { X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 },
+ { X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 },
+ { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 },
+ { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 },
+ { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 },
+ { X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE },
+ { X86::VDPBF16PSZ128r, X86::VDPBF16PSZ128m, 0 },
+ { X86::VDPBF16PSZ256r, X86::VDPBF16PSZ256m, 0 },
+ { X86::VDPBF16PSZr, X86::VDPBF16PSZm, 0 },
+ { X86::VEXP2PDZrk, X86::VEXP2PDZmk, 0 },
+ { X86::VEXP2PSZrk, X86::VEXP2PSZmk, 0 },
+ { X86::VEXPANDPDZ128rrk, X86::VEXPANDPDZ128rmk, TB_NO_REVERSE },
+ { X86::VEXPANDPDZ256rrk, X86::VEXPANDPDZ256rmk, TB_NO_REVERSE },
+ { X86::VEXPANDPDZrrk, X86::VEXPANDPDZrmk, TB_NO_REVERSE },
+ { X86::VEXPANDPSZ128rrk, X86::VEXPANDPSZ128rmk, TB_NO_REVERSE },
+ { X86::VEXPANDPSZ256rrk, X86::VEXPANDPSZ256rmk, TB_NO_REVERSE },
+ { X86::VEXPANDPSZrrk, X86::VEXPANDPSZrmk, TB_NO_REVERSE },
+ { X86::VFIXUPIMMPDZ128rri, X86::VFIXUPIMMPDZ128rmi, 0 },
+ { X86::VFIXUPIMMPDZ256rri, X86::VFIXUPIMMPDZ256rmi, 0 },
+ { X86::VFIXUPIMMPDZrri, X86::VFIXUPIMMPDZrmi, 0 },
+ { X86::VFIXUPIMMPSZ128rri, X86::VFIXUPIMMPSZ128rmi, 0 },
+ { X86::VFIXUPIMMPSZ256rri, X86::VFIXUPIMMPSZ256rmi, 0 },
+ { X86::VFIXUPIMMPSZrri, X86::VFIXUPIMMPSZrmi, 0 },
+ { X86::VFIXUPIMMSDZrri, X86::VFIXUPIMMSDZrmi, TB_NO_REVERSE },
+ { X86::VFIXUPIMMSSZrri, X86::VFIXUPIMMSSZrmi, TB_NO_REVERSE },
+ { X86::VFMADD132PDYr, X86::VFMADD132PDYm, 0 },
+ { X86::VFMADD132PDZ128r, X86::VFMADD132PDZ128m, 0 },
+ { X86::VFMADD132PDZ256r, X86::VFMADD132PDZ256m, 0 },
+ { X86::VFMADD132PDZr, X86::VFMADD132PDZm, 0 },
+ { X86::VFMADD132PDr, X86::VFMADD132PDm, 0 },
+ { X86::VFMADD132PSYr, X86::VFMADD132PSYm, 0 },
+ { X86::VFMADD132PSZ128r, X86::VFMADD132PSZ128m, 0 },
+ { X86::VFMADD132PSZ256r, X86::VFMADD132PSZ256m, 0 },
+ { X86::VFMADD132PSZr, X86::VFMADD132PSZm, 0 },
+ { X86::VFMADD132PSr, X86::VFMADD132PSm, 0 },
+ { X86::VFMADD132SDZr, X86::VFMADD132SDZm, 0 },
+ { X86::VFMADD132SDZr_Int, X86::VFMADD132SDZm_Int, TB_NO_REVERSE },
+ { X86::VFMADD132SDr, X86::VFMADD132SDm, 0 },
+ { X86::VFMADD132SDr_Int, X86::VFMADD132SDm_Int, TB_NO_REVERSE },
+ { X86::VFMADD132SSZr, X86::VFMADD132SSZm, 0 },
+ { X86::VFMADD132SSZr_Int, X86::VFMADD132SSZm_Int, TB_NO_REVERSE },
+ { X86::VFMADD132SSr, X86::VFMADD132SSm, 0 },
+ { X86::VFMADD132SSr_Int, X86::VFMADD132SSm_Int, TB_NO_REVERSE },
+ { X86::VFMADD213PDYr, X86::VFMADD213PDYm, 0 },
+ { X86::VFMADD213PDZ128r, X86::VFMADD213PDZ128m, 0 },
+ { X86::VFMADD213PDZ256r, X86::VFMADD213PDZ256m, 0 },
+ { X86::VFMADD213PDZr, X86::VFMADD213PDZm, 0 },
+ { X86::VFMADD213PDr, X86::VFMADD213PDm, 0 },
+ { X86::VFMADD213PSYr, X86::VFMADD213PSYm, 0 },
+ { X86::VFMADD213PSZ128r, X86::VFMADD213PSZ128m, 0 },
+ { X86::VFMADD213PSZ256r, X86::VFMADD213PSZ256m, 0 },
+ { X86::VFMADD213PSZr, X86::VFMADD213PSZm, 0 },
+ { X86::VFMADD213PSr, X86::VFMADD213PSm, 0 },
+ { X86::VFMADD213SDZr, X86::VFMADD213SDZm, 0 },
+ { X86::VFMADD213SDZr_Int, X86::VFMADD213SDZm_Int, TB_NO_REVERSE },
+ { X86::VFMADD213SDr, X86::VFMADD213SDm, 0 },
+ { X86::VFMADD213SDr_Int, X86::VFMADD213SDm_Int, TB_NO_REVERSE },
+ { X86::VFMADD213SSZr, X86::VFMADD213SSZm, 0 },
+ { X86::VFMADD213SSZr_Int, X86::VFMADD213SSZm_Int, TB_NO_REVERSE },
+ { X86::VFMADD213SSr, X86::VFMADD213SSm, 0 },
+ { X86::VFMADD213SSr_Int, X86::VFMADD213SSm_Int, TB_NO_REVERSE },
+ { X86::VFMADD231PDYr, X86::VFMADD231PDYm, 0 },
+ { X86::VFMADD231PDZ128r, X86::VFMADD231PDZ128m, 0 },
+ { X86::VFMADD231PDZ256r, X86::VFMADD231PDZ256m, 0 },
+ { X86::VFMADD231PDZr, X86::VFMADD231PDZm, 0 },
+ { X86::VFMADD231PDr, X86::VFMADD231PDm, 0 },
+ { X86::VFMADD231PSYr, X86::VFMADD231PSYm, 0 },
+ { X86::VFMADD231PSZ128r, X86::VFMADD231PSZ128m, 0 },
+ { X86::VFMADD231PSZ256r, X86::VFMADD231PSZ256m, 0 },
+ { X86::VFMADD231PSZr, X86::VFMADD231PSZm, 0 },
+ { X86::VFMADD231PSr, X86::VFMADD231PSm, 0 },
+ { X86::VFMADD231SDZr, X86::VFMADD231SDZm, 0 },
+ { X86::VFMADD231SDZr_Int, X86::VFMADD231SDZm_Int, TB_NO_REVERSE },
+ { X86::VFMADD231SDr, X86::VFMADD231SDm, 0 },
+ { X86::VFMADD231SDr_Int, X86::VFMADD231SDm_Int, TB_NO_REVERSE },
+ { X86::VFMADD231SSZr, X86::VFMADD231SSZm, 0 },
+ { X86::VFMADD231SSZr_Int, X86::VFMADD231SSZm_Int, TB_NO_REVERSE },
+ { X86::VFMADD231SSr, X86::VFMADD231SSm, 0 },
+ { X86::VFMADD231SSr_Int, X86::VFMADD231SSm_Int, TB_NO_REVERSE },
+ { X86::VFMADDPD4Yrr, X86::VFMADDPD4Yrm, 0 },
+ { X86::VFMADDPD4rr, X86::VFMADDPD4rm, 0 },
+ { X86::VFMADDPS4Yrr, X86::VFMADDPS4Yrm, 0 },
+ { X86::VFMADDPS4rr, X86::VFMADDPS4rm, 0 },
+ { X86::VFMADDSD4rr, X86::VFMADDSD4rm, 0 },
+ { X86::VFMADDSD4rr_Int, X86::VFMADDSD4rm_Int, TB_NO_REVERSE },
+ { X86::VFMADDSS4rr, X86::VFMADDSS4rm, 0 },
+ { X86::VFMADDSS4rr_Int, X86::VFMADDSS4rm_Int, TB_NO_REVERSE },
+ { X86::VFMADDSUB132PDYr, X86::VFMADDSUB132PDYm, 0 },
+ { X86::VFMADDSUB132PDZ128r, X86::VFMADDSUB132PDZ128m, 0 },
+ { X86::VFMADDSUB132PDZ256r, X86::VFMADDSUB132PDZ256m, 0 },
+ { X86::VFMADDSUB132PDZr, X86::VFMADDSUB132PDZm, 0 },
+ { X86::VFMADDSUB132PDr, X86::VFMADDSUB132PDm, 0 },
+ { X86::VFMADDSUB132PSYr, X86::VFMADDSUB132PSYm, 0 },
+ { X86::VFMADDSUB132PSZ128r, X86::VFMADDSUB132PSZ128m, 0 },
+ { X86::VFMADDSUB132PSZ256r, X86::VFMADDSUB132PSZ256m, 0 },
+ { X86::VFMADDSUB132PSZr, X86::VFMADDSUB132PSZm, 0 },
+ { X86::VFMADDSUB132PSr, X86::VFMADDSUB132PSm, 0 },
+ { X86::VFMADDSUB213PDYr, X86::VFMADDSUB213PDYm, 0 },
+ { X86::VFMADDSUB213PDZ128r, X86::VFMADDSUB213PDZ128m, 0 },
+ { X86::VFMADDSUB213PDZ256r, X86::VFMADDSUB213PDZ256m, 0 },
+ { X86::VFMADDSUB213PDZr, X86::VFMADDSUB213PDZm, 0 },
+ { X86::VFMADDSUB213PDr, X86::VFMADDSUB213PDm, 0 },
+ { X86::VFMADDSUB213PSYr, X86::VFMADDSUB213PSYm, 0 },
+ { X86::VFMADDSUB213PSZ128r, X86::VFMADDSUB213PSZ128m, 0 },
+ { X86::VFMADDSUB213PSZ256r, X86::VFMADDSUB213PSZ256m, 0 },
+ { X86::VFMADDSUB213PSZr, X86::VFMADDSUB213PSZm, 0 },
+ { X86::VFMADDSUB213PSr, X86::VFMADDSUB213PSm, 0 },
+ { X86::VFMADDSUB231PDYr, X86::VFMADDSUB231PDYm, 0 },
+ { X86::VFMADDSUB231PDZ128r, X86::VFMADDSUB231PDZ128m, 0 },
+ { X86::VFMADDSUB231PDZ256r, X86::VFMADDSUB231PDZ256m, 0 },
+ { X86::VFMADDSUB231PDZr, X86::VFMADDSUB231PDZm, 0 },
+ { X86::VFMADDSUB231PDr, X86::VFMADDSUB231PDm, 0 },
+ { X86::VFMADDSUB231PSYr, X86::VFMADDSUB231PSYm, 0 },
+ { X86::VFMADDSUB231PSZ128r, X86::VFMADDSUB231PSZ128m, 0 },
+ { X86::VFMADDSUB231PSZ256r, X86::VFMADDSUB231PSZ256m, 0 },
+ { X86::VFMADDSUB231PSZr, X86::VFMADDSUB231PSZm, 0 },
+ { X86::VFMADDSUB231PSr, X86::VFMADDSUB231PSm, 0 },
+ { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Yrm, 0 },
+ { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, 0 },
+ { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Yrm, 0 },
+ { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, 0 },
+ { X86::VFMSUB132PDYr, X86::VFMSUB132PDYm, 0 },
+ { X86::VFMSUB132PDZ128r, X86::VFMSUB132PDZ128m, 0 },
+ { X86::VFMSUB132PDZ256r, X86::VFMSUB132PDZ256m, 0 },
+ { X86::VFMSUB132PDZr, X86::VFMSUB132PDZm, 0 },
+ { X86::VFMSUB132PDr, X86::VFMSUB132PDm, 0 },
+ { X86::VFMSUB132PSYr, X86::VFMSUB132PSYm, 0 },
+ { X86::VFMSUB132PSZ128r, X86::VFMSUB132PSZ128m, 0 },
+ { X86::VFMSUB132PSZ256r, X86::VFMSUB132PSZ256m, 0 },
+ { X86::VFMSUB132PSZr, X86::VFMSUB132PSZm, 0 },
+ { X86::VFMSUB132PSr, X86::VFMSUB132PSm, 0 },
+ { X86::VFMSUB132SDZr, X86::VFMSUB132SDZm, 0 },
+ { X86::VFMSUB132SDZr_Int, X86::VFMSUB132SDZm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB132SDr, X86::VFMSUB132SDm, 0 },
+ { X86::VFMSUB132SDr_Int, X86::VFMSUB132SDm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB132SSZr, X86::VFMSUB132SSZm, 0 },
+ { X86::VFMSUB132SSZr_Int, X86::VFMSUB132SSZm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB132SSr, X86::VFMSUB132SSm, 0 },
+ { X86::VFMSUB132SSr_Int, X86::VFMSUB132SSm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB213PDYr, X86::VFMSUB213PDYm, 0 },
+ { X86::VFMSUB213PDZ128r, X86::VFMSUB213PDZ128m, 0 },
+ { X86::VFMSUB213PDZ256r, X86::VFMSUB213PDZ256m, 0 },
+ { X86::VFMSUB213PDZr, X86::VFMSUB213PDZm, 0 },
+ { X86::VFMSUB213PDr, X86::VFMSUB213PDm, 0 },
+ { X86::VFMSUB213PSYr, X86::VFMSUB213PSYm, 0 },
+ { X86::VFMSUB213PSZ128r, X86::VFMSUB213PSZ128m, 0 },
+ { X86::VFMSUB213PSZ256r, X86::VFMSUB213PSZ256m, 0 },
+ { X86::VFMSUB213PSZr, X86::VFMSUB213PSZm, 0 },
+ { X86::VFMSUB213PSr, X86::VFMSUB213PSm, 0 },
+ { X86::VFMSUB213SDZr, X86::VFMSUB213SDZm, 0 },
+ { X86::VFMSUB213SDZr_Int, X86::VFMSUB213SDZm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB213SDr, X86::VFMSUB213SDm, 0 },
+ { X86::VFMSUB213SDr_Int, X86::VFMSUB213SDm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB213SSZr, X86::VFMSUB213SSZm, 0 },
+ { X86::VFMSUB213SSZr_Int, X86::VFMSUB213SSZm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB213SSr, X86::VFMSUB213SSm, 0 },
+ { X86::VFMSUB213SSr_Int, X86::VFMSUB213SSm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB231PDYr, X86::VFMSUB231PDYm, 0 },
+ { X86::VFMSUB231PDZ128r, X86::VFMSUB231PDZ128m, 0 },
+ { X86::VFMSUB231PDZ256r, X86::VFMSUB231PDZ256m, 0 },
+ { X86::VFMSUB231PDZr, X86::VFMSUB231PDZm, 0 },
+ { X86::VFMSUB231PDr, X86::VFMSUB231PDm, 0 },
+ { X86::VFMSUB231PSYr, X86::VFMSUB231PSYm, 0 },
+ { X86::VFMSUB231PSZ128r, X86::VFMSUB231PSZ128m, 0 },
+ { X86::VFMSUB231PSZ256r, X86::VFMSUB231PSZ256m, 0 },
+ { X86::VFMSUB231PSZr, X86::VFMSUB231PSZm, 0 },
+ { X86::VFMSUB231PSr, X86::VFMSUB231PSm, 0 },
+ { X86::VFMSUB231SDZr, X86::VFMSUB231SDZm, 0 },
+ { X86::VFMSUB231SDZr_Int, X86::VFMSUB231SDZm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB231SDr, X86::VFMSUB231SDm, 0 },
+ { X86::VFMSUB231SDr_Int, X86::VFMSUB231SDm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB231SSZr, X86::VFMSUB231SSZm, 0 },
+ { X86::VFMSUB231SSZr_Int, X86::VFMSUB231SSZm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB231SSr, X86::VFMSUB231SSm, 0 },
+ { X86::VFMSUB231SSr_Int, X86::VFMSUB231SSm_Int, TB_NO_REVERSE },
+ { X86::VFMSUBADD132PDYr, X86::VFMSUBADD132PDYm, 0 },
+ { X86::VFMSUBADD132PDZ128r, X86::VFMSUBADD132PDZ128m, 0 },
+ { X86::VFMSUBADD132PDZ256r, X86::VFMSUBADD132PDZ256m, 0 },
+ { X86::VFMSUBADD132PDZr, X86::VFMSUBADD132PDZm, 0 },
+ { X86::VFMSUBADD132PDr, X86::VFMSUBADD132PDm, 0 },
+ { X86::VFMSUBADD132PSYr, X86::VFMSUBADD132PSYm, 0 },
+ { X86::VFMSUBADD132PSZ128r, X86::VFMSUBADD132PSZ128m, 0 },
+ { X86::VFMSUBADD132PSZ256r, X86::VFMSUBADD132PSZ256m, 0 },
+ { X86::VFMSUBADD132PSZr, X86::VFMSUBADD132PSZm, 0 },
+ { X86::VFMSUBADD132PSr, X86::VFMSUBADD132PSm, 0 },
+ { X86::VFMSUBADD213PDYr, X86::VFMSUBADD213PDYm, 0 },
+ { X86::VFMSUBADD213PDZ128r, X86::VFMSUBADD213PDZ128m, 0 },
+ { X86::VFMSUBADD213PDZ256r, X86::VFMSUBADD213PDZ256m, 0 },
+ { X86::VFMSUBADD213PDZr, X86::VFMSUBADD213PDZm, 0 },
+ { X86::VFMSUBADD213PDr, X86::VFMSUBADD213PDm, 0 },
+ { X86::VFMSUBADD213PSYr, X86::VFMSUBADD213PSYm, 0 },
+ { X86::VFMSUBADD213PSZ128r, X86::VFMSUBADD213PSZ128m, 0 },
+ { X86::VFMSUBADD213PSZ256r, X86::VFMSUBADD213PSZ256m, 0 },
+ { X86::VFMSUBADD213PSZr, X86::VFMSUBADD213PSZm, 0 },
+ { X86::VFMSUBADD213PSr, X86::VFMSUBADD213PSm, 0 },
+ { X86::VFMSUBADD231PDYr, X86::VFMSUBADD231PDYm, 0 },
+ { X86::VFMSUBADD231PDZ128r, X86::VFMSUBADD231PDZ128m, 0 },
+ { X86::VFMSUBADD231PDZ256r, X86::VFMSUBADD231PDZ256m, 0 },
+ { X86::VFMSUBADD231PDZr, X86::VFMSUBADD231PDZm, 0 },
+ { X86::VFMSUBADD231PDr, X86::VFMSUBADD231PDm, 0 },
+ { X86::VFMSUBADD231PSYr, X86::VFMSUBADD231PSYm, 0 },
+ { X86::VFMSUBADD231PSZ128r, X86::VFMSUBADD231PSZ128m, 0 },
+ { X86::VFMSUBADD231PSZ256r, X86::VFMSUBADD231PSZ256m, 0 },
+ { X86::VFMSUBADD231PSZr, X86::VFMSUBADD231PSZm, 0 },
+ { X86::VFMSUBADD231PSr, X86::VFMSUBADD231PSm, 0 },
+ { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Yrm, 0 },
+ { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, 0 },
+ { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Yrm, 0 },
+ { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, 0 },
+ { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Yrm, 0 },
+ { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, 0 },
+ { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Yrm, 0 },
+ { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, 0 },
+ { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, 0 },
+ { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4rm_Int, TB_NO_REVERSE },
+ { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, 0 },
+ { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4rm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD132PDYr, X86::VFNMADD132PDYm, 0 },
+ { X86::VFNMADD132PDZ128r, X86::VFNMADD132PDZ128m, 0 },
+ { X86::VFNMADD132PDZ256r, X86::VFNMADD132PDZ256m, 0 },
+ { X86::VFNMADD132PDZr, X86::VFNMADD132PDZm, 0 },
+ { X86::VFNMADD132PDr, X86::VFNMADD132PDm, 0 },
+ { X86::VFNMADD132PSYr, X86::VFNMADD132PSYm, 0 },
+ { X86::VFNMADD132PSZ128r, X86::VFNMADD132PSZ128m, 0 },
+ { X86::VFNMADD132PSZ256r, X86::VFNMADD132PSZ256m, 0 },
+ { X86::VFNMADD132PSZr, X86::VFNMADD132PSZm, 0 },
+ { X86::VFNMADD132PSr, X86::VFNMADD132PSm, 0 },
+ { X86::VFNMADD132SDZr, X86::VFNMADD132SDZm, 0 },
+ { X86::VFNMADD132SDZr_Int, X86::VFNMADD132SDZm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD132SDr, X86::VFNMADD132SDm, 0 },
+ { X86::VFNMADD132SDr_Int, X86::VFNMADD132SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD132SSZr, X86::VFNMADD132SSZm, 0 },
+ { X86::VFNMADD132SSZr_Int, X86::VFNMADD132SSZm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD132SSr, X86::VFNMADD132SSm, 0 },
+ { X86::VFNMADD132SSr_Int, X86::VFNMADD132SSm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD213PDYr, X86::VFNMADD213PDYm, 0 },
+ { X86::VFNMADD213PDZ128r, X86::VFNMADD213PDZ128m, 0 },
+ { X86::VFNMADD213PDZ256r, X86::VFNMADD213PDZ256m, 0 },
+ { X86::VFNMADD213PDZr, X86::VFNMADD213PDZm, 0 },
+ { X86::VFNMADD213PDr, X86::VFNMADD213PDm, 0 },
+ { X86::VFNMADD213PSYr, X86::VFNMADD213PSYm, 0 },
+ { X86::VFNMADD213PSZ128r, X86::VFNMADD213PSZ128m, 0 },
+ { X86::VFNMADD213PSZ256r, X86::VFNMADD213PSZ256m, 0 },
+ { X86::VFNMADD213PSZr, X86::VFNMADD213PSZm, 0 },
+ { X86::VFNMADD213PSr, X86::VFNMADD213PSm, 0 },
+ { X86::VFNMADD213SDZr, X86::VFNMADD213SDZm, 0 },
+ { X86::VFNMADD213SDZr_Int, X86::VFNMADD213SDZm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD213SDr, X86::VFNMADD213SDm, 0 },
+ { X86::VFNMADD213SDr_Int, X86::VFNMADD213SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD213SSZr, X86::VFNMADD213SSZm, 0 },
+ { X86::VFNMADD213SSZr_Int, X86::VFNMADD213SSZm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD213SSr, X86::VFNMADD213SSm, 0 },
+ { X86::VFNMADD213SSr_Int, X86::VFNMADD213SSm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD231PDYr, X86::VFNMADD231PDYm, 0 },
+ { X86::VFNMADD231PDZ128r, X86::VFNMADD231PDZ128m, 0 },
+ { X86::VFNMADD231PDZ256r, X86::VFNMADD231PDZ256m, 0 },
+ { X86::VFNMADD231PDZr, X86::VFNMADD231PDZm, 0 },
+ { X86::VFNMADD231PDr, X86::VFNMADD231PDm, 0 },
+ { X86::VFNMADD231PSYr, X86::VFNMADD231PSYm, 0 },
+ { X86::VFNMADD231PSZ128r, X86::VFNMADD231PSZ128m, 0 },
+ { X86::VFNMADD231PSZ256r, X86::VFNMADD231PSZ256m, 0 },
+ { X86::VFNMADD231PSZr, X86::VFNMADD231PSZm, 0 },
+ { X86::VFNMADD231PSr, X86::VFNMADD231PSm, 0 },
+ { X86::VFNMADD231SDZr, X86::VFNMADD231SDZm, 0 },
+ { X86::VFNMADD231SDZr_Int, X86::VFNMADD231SDZm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD231SDr, X86::VFNMADD231SDm, 0 },
+ { X86::VFNMADD231SDr_Int, X86::VFNMADD231SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD231SSZr, X86::VFNMADD231SSZm, 0 },
+ { X86::VFNMADD231SSZr_Int, X86::VFNMADD231SSZm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD231SSr, X86::VFNMADD231SSm, 0 },
+ { X86::VFNMADD231SSr_Int, X86::VFNMADD231SSm_Int, TB_NO_REVERSE },
+ { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Yrm, 0 },
+ { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, 0 },
+ { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Yrm, 0 },
+ { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, 0 },
+ { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, 0 },
+ { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4rm_Int, TB_NO_REVERSE },
+ { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, 0 },
+ { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4rm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB132PDYr, X86::VFNMSUB132PDYm, 0 },
+ { X86::VFNMSUB132PDZ128r, X86::VFNMSUB132PDZ128m, 0 },
+ { X86::VFNMSUB132PDZ256r, X86::VFNMSUB132PDZ256m, 0 },
+ { X86::VFNMSUB132PDZr, X86::VFNMSUB132PDZm, 0 },
+ { X86::VFNMSUB132PDr, X86::VFNMSUB132PDm, 0 },
+ { X86::VFNMSUB132PSYr, X86::VFNMSUB132PSYm, 0 },
+ { X86::VFNMSUB132PSZ128r, X86::VFNMSUB132PSZ128m, 0 },
+ { X86::VFNMSUB132PSZ256r, X86::VFNMSUB132PSZ256m, 0 },
+ { X86::VFNMSUB132PSZr, X86::VFNMSUB132PSZm, 0 },
+ { X86::VFNMSUB132PSr, X86::VFNMSUB132PSm, 0 },
+ { X86::VFNMSUB132SDZr, X86::VFNMSUB132SDZm, 0 },
+ { X86::VFNMSUB132SDZr_Int, X86::VFNMSUB132SDZm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB132SDr, X86::VFNMSUB132SDm, 0 },
+ { X86::VFNMSUB132SDr_Int, X86::VFNMSUB132SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB132SSZr, X86::VFNMSUB132SSZm, 0 },
+ { X86::VFNMSUB132SSZr_Int, X86::VFNMSUB132SSZm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB132SSr, X86::VFNMSUB132SSm, 0 },
+ { X86::VFNMSUB132SSr_Int, X86::VFNMSUB132SSm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB213PDYr, X86::VFNMSUB213PDYm, 0 },
+ { X86::VFNMSUB213PDZ128r, X86::VFNMSUB213PDZ128m, 0 },
+ { X86::VFNMSUB213PDZ256r, X86::VFNMSUB213PDZ256m, 0 },
+ { X86::VFNMSUB213PDZr, X86::VFNMSUB213PDZm, 0 },
+ { X86::VFNMSUB213PDr, X86::VFNMSUB213PDm, 0 },
+ { X86::VFNMSUB213PSYr, X86::VFNMSUB213PSYm, 0 },
+ { X86::VFNMSUB213PSZ128r, X86::VFNMSUB213PSZ128m, 0 },
+ { X86::VFNMSUB213PSZ256r, X86::VFNMSUB213PSZ256m, 0 },
+ { X86::VFNMSUB213PSZr, X86::VFNMSUB213PSZm, 0 },
+ { X86::VFNMSUB213PSr, X86::VFNMSUB213PSm, 0 },
+ { X86::VFNMSUB213SDZr, X86::VFNMSUB213SDZm, 0 },
+ { X86::VFNMSUB213SDZr_Int, X86::VFNMSUB213SDZm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB213SDr, X86::VFNMSUB213SDm, 0 },
+ { X86::VFNMSUB213SDr_Int, X86::VFNMSUB213SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB213SSZr, X86::VFNMSUB213SSZm, 0 },
+ { X86::VFNMSUB213SSZr_Int, X86::VFNMSUB213SSZm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB213SSr, X86::VFNMSUB213SSm, 0 },
+ { X86::VFNMSUB213SSr_Int, X86::VFNMSUB213SSm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB231PDYr, X86::VFNMSUB231PDYm, 0 },
+ { X86::VFNMSUB231PDZ128r, X86::VFNMSUB231PDZ128m, 0 },
+ { X86::VFNMSUB231PDZ256r, X86::VFNMSUB231PDZ256m, 0 },
+ { X86::VFNMSUB231PDZr, X86::VFNMSUB231PDZm, 0 },
+ { X86::VFNMSUB231PDr, X86::VFNMSUB231PDm, 0 },
+ { X86::VFNMSUB231PSYr, X86::VFNMSUB231PSYm, 0 },
+ { X86::VFNMSUB231PSZ128r, X86::VFNMSUB231PSZ128m, 0 },
+ { X86::VFNMSUB231PSZ256r, X86::VFNMSUB231PSZ256m, 0 },
+ { X86::VFNMSUB231PSZr, X86::VFNMSUB231PSZm, 0 },
+ { X86::VFNMSUB231PSr, X86::VFNMSUB231PSm, 0 },
+ { X86::VFNMSUB231SDZr, X86::VFNMSUB231SDZm, 0 },
+ { X86::VFNMSUB231SDZr_Int, X86::VFNMSUB231SDZm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB231SDr, X86::VFNMSUB231SDm, 0 },
+ { X86::VFNMSUB231SDr_Int, X86::VFNMSUB231SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB231SSZr, X86::VFNMSUB231SSZm, 0 },
+ { X86::VFNMSUB231SSZr_Int, X86::VFNMSUB231SSZm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB231SSr, X86::VFNMSUB231SSm, 0 },
+ { X86::VFNMSUB231SSr_Int, X86::VFNMSUB231SSm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Yrm, 0 },
+ { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, 0 },
+ { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Yrm, 0 },
+ { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, 0 },
+ { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, 0 },
+ { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4rm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, 0 },
+ { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4rm_Int, TB_NO_REVERSE },
+ { X86::VGETEXPPDZ128rk, X86::VGETEXPPDZ128mk, 0 },
+ { X86::VGETEXPPDZ256rk, X86::VGETEXPPDZ256mk, 0 },
+ { X86::VGETEXPPDZrk, X86::VGETEXPPDZmk, 0 },
+ { X86::VGETEXPPSZ128rk, X86::VGETEXPPSZ128mk, 0 },
+ { X86::VGETEXPPSZ256rk, X86::VGETEXPPSZ256mk, 0 },
+ { X86::VGETEXPPSZrk, X86::VGETEXPPSZmk, 0 },
+ { X86::VGETEXPSDZrkz, X86::VGETEXPSDZmkz, TB_NO_REVERSE },
+ { X86::VGETEXPSSZrkz, X86::VGETEXPSSZmkz, TB_NO_REVERSE },
+ { X86::VGETMANTPDZ128rrik, X86::VGETMANTPDZ128rmik, 0 },
+ { X86::VGETMANTPDZ256rrik, X86::VGETMANTPDZ256rmik, 0 },
+ { X86::VGETMANTPDZrrik, X86::VGETMANTPDZrmik, 0 },
+ { X86::VGETMANTPSZ128rrik, X86::VGETMANTPSZ128rmik, 0 },
+ { X86::VGETMANTPSZ256rrik, X86::VGETMANTPSZ256rmik, 0 },
+ { X86::VGETMANTPSZrrik, X86::VGETMANTPSZrmik, 0 },
+ { X86::VGETMANTSDZrrikz, X86::VGETMANTSDZrmikz, TB_NO_REVERSE },
+ { X86::VGETMANTSSZrrikz, X86::VGETMANTSSZrmikz, TB_NO_REVERSE },
+ { X86::VGF2P8AFFINEINVQBZ128rrikz, X86::VGF2P8AFFINEINVQBZ128rmikz, 0 },
+ { X86::VGF2P8AFFINEINVQBZ256rrikz, X86::VGF2P8AFFINEINVQBZ256rmikz, 0 },
+ { X86::VGF2P8AFFINEINVQBZrrikz, X86::VGF2P8AFFINEINVQBZrmikz, 0 },
+ { X86::VGF2P8AFFINEQBZ128rrikz, X86::VGF2P8AFFINEQBZ128rmikz, 0 },
+ { X86::VGF2P8AFFINEQBZ256rrikz, X86::VGF2P8AFFINEQBZ256rmikz, 0 },
+ { X86::VGF2P8AFFINEQBZrrikz, X86::VGF2P8AFFINEQBZrmikz, 0 },
+ { X86::VGF2P8MULBZ128rrkz, X86::VGF2P8MULBZ128rmkz, 0 },
+ { X86::VGF2P8MULBZ256rrkz, X86::VGF2P8MULBZ256rmkz, 0 },
+ { X86::VGF2P8MULBZrrkz, X86::VGF2P8MULBZrmkz, 0 },
+ { X86::VINSERTF32x4Z256rrkz, X86::VINSERTF32x4Z256rmkz, 0 },
+ { X86::VINSERTF32x4Zrrkz, X86::VINSERTF32x4Zrmkz, 0 },
+ { X86::VINSERTF32x8Zrrkz, X86::VINSERTF32x8Zrmkz, 0 },
+ { X86::VINSERTF64x2Z256rrkz, X86::VINSERTF64x2Z256rmkz, 0 },
+ { X86::VINSERTF64x2Zrrkz, X86::VINSERTF64x2Zrmkz, 0 },
+ { X86::VINSERTF64x4Zrrkz, X86::VINSERTF64x4Zrmkz, 0 },
+ { X86::VINSERTI32x4Z256rrkz, X86::VINSERTI32x4Z256rmkz, 0 },
+ { X86::VINSERTI32x4Zrrkz, X86::VINSERTI32x4Zrmkz, 0 },
+ { X86::VINSERTI32x8Zrrkz, X86::VINSERTI32x8Zrmkz, 0 },
+ { X86::VINSERTI64x2Z256rrkz, X86::VINSERTI64x2Z256rmkz, 0 },
+ { X86::VINSERTI64x2Zrrkz, X86::VINSERTI64x2Zrmkz, 0 },
+ { X86::VINSERTI64x4Zrrkz, X86::VINSERTI64x4Zrmkz, 0 },
+ { X86::VMAXCPDZ128rrkz, X86::VMAXCPDZ128rmkz, 0 },
+ { X86::VMAXCPDZ256rrkz, X86::VMAXCPDZ256rmkz, 0 },
+ { X86::VMAXCPDZrrkz, X86::VMAXCPDZrmkz, 0 },
+ { X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmkz, 0 },
+ { X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmkz, 0 },
+ { X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0 },
+ { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 },
+ { X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 },
+ { X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 },
+ { X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 },
+ { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 },
+ { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 },
+ { X86::VMAXSDZrr_Intkz, X86::VMAXSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VMAXSSZrr_Intkz, X86::VMAXSSZrm_Intkz, TB_NO_REVERSE },
+ { X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmkz, 0 },
+ { X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmkz, 0 },
+ { X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0 },
+ { X86::VMINCPSZ128rrkz, X86::VMINCPSZ128rmkz, 0 },
+ { X86::VMINCPSZ256rrkz, X86::VMINCPSZ256rmkz, 0 },
+ { X86::VMINCPSZrrkz, X86::VMINCPSZrmkz, 0 },
+ { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 },
+ { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 },
+ { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 },
+ { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 },
+ { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 },
+ { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 },
+ { X86::VMINSDZrr_Intkz, X86::VMINSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VMINSSZrr_Intkz, X86::VMINSSZrm_Intkz, TB_NO_REVERSE },
+ { X86::VMOVAPDZ128rrk, X86::VMOVAPDZ128rmk, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVAPDZ256rrk, X86::VMOVAPDZ256rmk, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVAPDZrrk, X86::VMOVAPDZrmk, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVAPSZ128rrk, X86::VMOVAPSZ128rmk, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVAPSZ256rrk, X86::VMOVAPSZ256rmk, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVAPSZrrk, X86::VMOVAPSZrmk, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVDDUPZ128rrk, X86::VMOVDDUPZ128rmk, TB_NO_REVERSE },
+ { X86::VMOVDDUPZ256rrk, X86::VMOVDDUPZ256rmk, 0 },
+ { X86::VMOVDDUPZrrk, X86::VMOVDDUPZrmk, 0 },
+ { X86::VMOVDQA32Z128rrk, X86::VMOVDQA32Z128rmk, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVDQA32Z256rrk, X86::VMOVDQA32Z256rmk, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVDQA32Zrrk, X86::VMOVDQA32Zrmk, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVDQA64Z128rrk, X86::VMOVDQA64Z128rmk, TB_NO_REVERSE | TB_ALIGN_16 },
+ { X86::VMOVDQA64Z256rrk, X86::VMOVDQA64Z256rmk, TB_NO_REVERSE | TB_ALIGN_32 },
+ { X86::VMOVDQA64Zrrk, X86::VMOVDQA64Zrmk, TB_NO_REVERSE | TB_ALIGN_64 },
+ { X86::VMOVDQU16Z128rrk, X86::VMOVDQU16Z128rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU16Z256rrk, X86::VMOVDQU16Z256rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU16Zrrk, X86::VMOVDQU16Zrmk, TB_NO_REVERSE },
+ { X86::VMOVDQU32Z128rrk, X86::VMOVDQU32Z128rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU32Z256rrk, X86::VMOVDQU32Z256rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU32Zrrk, X86::VMOVDQU32Zrmk, TB_NO_REVERSE },
+ { X86::VMOVDQU64Z128rrk, X86::VMOVDQU64Z128rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU64Z256rrk, X86::VMOVDQU64Z256rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU64Zrrk, X86::VMOVDQU64Zrmk, TB_NO_REVERSE },
+ { X86::VMOVDQU8Z128rrk, X86::VMOVDQU8Z128rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU8Z256rrk, X86::VMOVDQU8Z256rmk, TB_NO_REVERSE },
+ { X86::VMOVDQU8Zrrk, X86::VMOVDQU8Zrmk, TB_NO_REVERSE },
+ { X86::VMOVSHDUPZ128rrk, X86::VMOVSHDUPZ128rmk, 0 },
+ { X86::VMOVSHDUPZ256rrk, X86::VMOVSHDUPZ256rmk, 0 },
+ { X86::VMOVSHDUPZrrk, X86::VMOVSHDUPZrmk, 0 },
+ { X86::VMOVSLDUPZ128rrk, X86::VMOVSLDUPZ128rmk, 0 },
+ { X86::VMOVSLDUPZ256rrk, X86::VMOVSLDUPZ256rmk, 0 },
+ { X86::VMOVSLDUPZrrk, X86::VMOVSLDUPZrmk, 0 },
+ { X86::VMOVUPDZ128rrk, X86::VMOVUPDZ128rmk, TB_NO_REVERSE },
+ { X86::VMOVUPDZ256rrk, X86::VMOVUPDZ256rmk, TB_NO_REVERSE },
+ { X86::VMOVUPDZrrk, X86::VMOVUPDZrmk, TB_NO_REVERSE },
+ { X86::VMOVUPSZ128rrk, X86::VMOVUPSZ128rmk, TB_NO_REVERSE },
+ { X86::VMOVUPSZ256rrk, X86::VMOVUPSZ256rmk, TB_NO_REVERSE },
+ { X86::VMOVUPSZrrk, X86::VMOVUPSZrmk, TB_NO_REVERSE },
+ { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 },
+ { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 },
+ { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 },
+ { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 },
+ { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 },
+ { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 },
+ { X86::VMULSDZrr_Intkz, X86::VMULSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VMULSSZrr_Intkz, X86::VMULSSZrm_Intkz, TB_NO_REVERSE },
+ { X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 },
+ { X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 },
+ { X86::VORPDZrrkz, X86::VORPDZrmkz, 0 },
+ { X86::VORPSZ128rrkz, X86::VORPSZ128rmkz, 0 },
+ { X86::VORPSZ256rrkz, X86::VORPSZ256rmkz, 0 },
+ { X86::VORPSZrrkz, X86::VORPSZrmkz, 0 },
+ { X86::VPABSBZ128rrk, X86::VPABSBZ128rmk, 0 },
+ { X86::VPABSBZ256rrk, X86::VPABSBZ256rmk, 0 },
+ { X86::VPABSBZrrk, X86::VPABSBZrmk, 0 },
+ { X86::VPABSDZ128rrk, X86::VPABSDZ128rmk, 0 },
+ { X86::VPABSDZ256rrk, X86::VPABSDZ256rmk, 0 },
+ { X86::VPABSDZrrk, X86::VPABSDZrmk, 0 },
+ { X86::VPABSQZ128rrk, X86::VPABSQZ128rmk, 0 },
+ { X86::VPABSQZ256rrk, X86::VPABSQZ256rmk, 0 },
+ { X86::VPABSQZrrk, X86::VPABSQZrmk, 0 },
+ { X86::VPABSWZ128rrk, X86::VPABSWZ128rmk, 0 },
+ { X86::VPABSWZ256rrk, X86::VPABSWZ256rmk, 0 },
+ { X86::VPABSWZrrk, X86::VPABSWZrmk, 0 },
+ { X86::VPACKSSDWZ128rrkz, X86::VPACKSSDWZ128rmkz, 0 },
+ { X86::VPACKSSDWZ256rrkz, X86::VPACKSSDWZ256rmkz, 0 },
+ { X86::VPACKSSDWZrrkz, X86::VPACKSSDWZrmkz, 0 },
+ { X86::VPACKSSWBZ128rrkz, X86::VPACKSSWBZ128rmkz, 0 },
+ { X86::VPACKSSWBZ256rrkz, X86::VPACKSSWBZ256rmkz, 0 },
+ { X86::VPACKSSWBZrrkz, X86::VPACKSSWBZrmkz, 0 },
+ { X86::VPACKUSDWZ128rrkz, X86::VPACKUSDWZ128rmkz, 0 },
+ { X86::VPACKUSDWZ256rrkz, X86::VPACKUSDWZ256rmkz, 0 },
+ { X86::VPACKUSDWZrrkz, X86::VPACKUSDWZrmkz, 0 },
+ { X86::VPACKUSWBZ128rrkz, X86::VPACKUSWBZ128rmkz, 0 },
+ { X86::VPACKUSWBZ256rrkz, X86::VPACKUSWBZ256rmkz, 0 },
+ { X86::VPACKUSWBZrrkz, X86::VPACKUSWBZrmkz, 0 },
+ { X86::VPADDBZ128rrkz, X86::VPADDBZ128rmkz, 0 },
+ { X86::VPADDBZ256rrkz, X86::VPADDBZ256rmkz, 0 },
+ { X86::VPADDBZrrkz, X86::VPADDBZrmkz, 0 },
+ { X86::VPADDDZ128rrkz, X86::VPADDDZ128rmkz, 0 },
+ { X86::VPADDDZ256rrkz, X86::VPADDDZ256rmkz, 0 },
+ { X86::VPADDDZrrkz, X86::VPADDDZrmkz, 0 },
+ { X86::VPADDQZ128rrkz, X86::VPADDQZ128rmkz, 0 },
+ { X86::VPADDQZ256rrkz, X86::VPADDQZ256rmkz, 0 },
+ { X86::VPADDQZrrkz, X86::VPADDQZrmkz, 0 },
+ { X86::VPADDSBZ128rrkz, X86::VPADDSBZ128rmkz, 0 },
+ { X86::VPADDSBZ256rrkz, X86::VPADDSBZ256rmkz, 0 },
+ { X86::VPADDSBZrrkz, X86::VPADDSBZrmkz, 0 },
+ { X86::VPADDSWZ128rrkz, X86::VPADDSWZ128rmkz, 0 },
+ { X86::VPADDSWZ256rrkz, X86::VPADDSWZ256rmkz, 0 },
+ { X86::VPADDSWZrrkz, X86::VPADDSWZrmkz, 0 },
+ { X86::VPADDUSBZ128rrkz, X86::VPADDUSBZ128rmkz, 0 },
+ { X86::VPADDUSBZ256rrkz, X86::VPADDUSBZ256rmkz, 0 },
+ { X86::VPADDUSBZrrkz, X86::VPADDUSBZrmkz, 0 },
+ { X86::VPADDUSWZ128rrkz, X86::VPADDUSWZ128rmkz, 0 },
+ { X86::VPADDUSWZ256rrkz, X86::VPADDUSWZ256rmkz, 0 },
+ { X86::VPADDUSWZrrkz, X86::VPADDUSWZrmkz, 0 },
+ { X86::VPADDWZ128rrkz, X86::VPADDWZ128rmkz, 0 },
+ { X86::VPADDWZ256rrkz, X86::VPADDWZ256rmkz, 0 },
+ { X86::VPADDWZrrkz, X86::VPADDWZrmkz, 0 },
+ { X86::VPALIGNRZ128rrikz, X86::VPALIGNRZ128rmikz, 0 },
+ { X86::VPALIGNRZ256rrikz, X86::VPALIGNRZ256rmikz, 0 },
+ { X86::VPALIGNRZrrikz, X86::VPALIGNRZrmikz, 0 },
+ { X86::VPANDDZ128rrkz, X86::VPANDDZ128rmkz, 0 },
+ { X86::VPANDDZ256rrkz, X86::VPANDDZ256rmkz, 0 },
+ { X86::VPANDDZrrkz, X86::VPANDDZrmkz, 0 },
+ { X86::VPANDNDZ128rrkz, X86::VPANDNDZ128rmkz, 0 },
+ { X86::VPANDNDZ256rrkz, X86::VPANDNDZ256rmkz, 0 },
+ { X86::VPANDNDZrrkz, X86::VPANDNDZrmkz, 0 },
+ { X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 },
+ { X86::VPANDNQZ256rrkz, X86::VPANDNQZ256rmkz, 0 },
+ { X86::VPANDNQZrrkz, X86::VPANDNQZrmkz, 0 },
+ { X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 },
+ { X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 },
+ { X86::VPANDQZrrkz, X86::VPANDQZrmkz, 0 },
+ { X86::VPAVGBZ128rrkz, X86::VPAVGBZ128rmkz, 0 },
+ { X86::VPAVGBZ256rrkz, X86::VPAVGBZ256rmkz, 0 },
+ { X86::VPAVGBZrrkz, X86::VPAVGBZrmkz, 0 },
+ { X86::VPAVGWZ128rrkz, X86::VPAVGWZ128rmkz, 0 },
+ { X86::VPAVGWZ256rrkz, X86::VPAVGWZ256rmkz, 0 },
+ { X86::VPAVGWZrrkz, X86::VPAVGWZrmkz, 0 },
+ { X86::VPBLENDMBZ128rrk, X86::VPBLENDMBZ128rmk, 0 },
+ { X86::VPBLENDMBZ256rrk, X86::VPBLENDMBZ256rmk, 0 },
+ { X86::VPBLENDMBZrrk, X86::VPBLENDMBZrmk, 0 },
+ { X86::VPBLENDMDZ128rrk, X86::VPBLENDMDZ128rmk, 0 },
+ { X86::VPBLENDMDZ256rrk, X86::VPBLENDMDZ256rmk, 0 },
+ { X86::VPBLENDMDZrrk, X86::VPBLENDMDZrmk, 0 },
+ { X86::VPBLENDMQZ128rrk, X86::VPBLENDMQZ128rmk, 0 },
+ { X86::VPBLENDMQZ256rrk, X86::VPBLENDMQZ256rmk, 0 },
+ { X86::VPBLENDMQZrrk, X86::VPBLENDMQZrmk, 0 },
+ { X86::VPBLENDMWZ128rrk, X86::VPBLENDMWZ128rmk, 0 },
+ { X86::VPBLENDMWZ256rrk, X86::VPBLENDMWZ256rmk, 0 },
+ { X86::VPBLENDMWZrrk, X86::VPBLENDMWZrmk, 0 },
+ { X86::VPBROADCASTBZ128rrk, X86::VPBROADCASTBZ128rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ256rrk, X86::VPBROADCASTBZ256rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZrrk, X86::VPBROADCASTBZrmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ128rrk, X86::VPBROADCASTDZ128rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ256rrk, X86::VPBROADCASTDZ256rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZrrk, X86::VPBROADCASTDZrmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ128rrk, X86::VPBROADCASTQZ128rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ256rrk, X86::VPBROADCASTQZ256rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZrrk, X86::VPBROADCASTQZrmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ128rrk, X86::VPBROADCASTWZ128rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ256rrk, X86::VPBROADCASTWZ256rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZrrk, X86::VPBROADCASTWZrmk, TB_NO_REVERSE },
+ { X86::VPCMOVYrrr, X86::VPCMOVYrrm, 0 },
+ { X86::VPCMOVrrr, X86::VPCMOVrrm, 0 },
+ { X86::VPCMPBZ128rrik, X86::VPCMPBZ128rmik, 0 },
+ { X86::VPCMPBZ256rrik, X86::VPCMPBZ256rmik, 0 },
+ { X86::VPCMPBZrrik, X86::VPCMPBZrmik, 0 },
+ { X86::VPCMPDZ128rrik, X86::VPCMPDZ128rmik, 0 },
+ { X86::VPCMPDZ256rrik, X86::VPCMPDZ256rmik, 0 },
+ { X86::VPCMPDZrrik, X86::VPCMPDZrmik, 0 },
+ { X86::VPCMPEQBZ128rrk, X86::VPCMPEQBZ128rmk, 0 },
+ { X86::VPCMPEQBZ256rrk, X86::VPCMPEQBZ256rmk, 0 },
+ { X86::VPCMPEQBZrrk, X86::VPCMPEQBZrmk, 0 },
+ { X86::VPCMPEQDZ128rrk, X86::VPCMPEQDZ128rmk, 0 },
+ { X86::VPCMPEQDZ256rrk, X86::VPCMPEQDZ256rmk, 0 },
+ { X86::VPCMPEQDZrrk, X86::VPCMPEQDZrmk, 0 },
+ { X86::VPCMPEQQZ128rrk, X86::VPCMPEQQZ128rmk, 0 },
+ { X86::VPCMPEQQZ256rrk, X86::VPCMPEQQZ256rmk, 0 },
+ { X86::VPCMPEQQZrrk, X86::VPCMPEQQZrmk, 0 },
+ { X86::VPCMPEQWZ128rrk, X86::VPCMPEQWZ128rmk, 0 },
+ { X86::VPCMPEQWZ256rrk, X86::VPCMPEQWZ256rmk, 0 },
+ { X86::VPCMPEQWZrrk, X86::VPCMPEQWZrmk, 0 },
+ { X86::VPCMPGTBZ128rrk, X86::VPCMPGTBZ128rmk, 0 },
+ { X86::VPCMPGTBZ256rrk, X86::VPCMPGTBZ256rmk, 0 },
+ { X86::VPCMPGTBZrrk, X86::VPCMPGTBZrmk, 0 },
+ { X86::VPCMPGTDZ128rrk, X86::VPCMPGTDZ128rmk, 0 },
+ { X86::VPCMPGTDZ256rrk, X86::VPCMPGTDZ256rmk, 0 },
+ { X86::VPCMPGTDZrrk, X86::VPCMPGTDZrmk, 0 },
+ { X86::VPCMPGTQZ128rrk, X86::VPCMPGTQZ128rmk, 0 },
+ { X86::VPCMPGTQZ256rrk, X86::VPCMPGTQZ256rmk, 0 },
+ { X86::VPCMPGTQZrrk, X86::VPCMPGTQZrmk, 0 },
+ { X86::VPCMPGTWZ128rrk, X86::VPCMPGTWZ128rmk, 0 },
+ { X86::VPCMPGTWZ256rrk, X86::VPCMPGTWZ256rmk, 0 },
+ { X86::VPCMPGTWZrrk, X86::VPCMPGTWZrmk, 0 },
+ { X86::VPCMPQZ128rrik, X86::VPCMPQZ128rmik, 0 },
+ { X86::VPCMPQZ256rrik, X86::VPCMPQZ256rmik, 0 },
+ { X86::VPCMPQZrrik, X86::VPCMPQZrmik, 0 },
+ { X86::VPCMPUBZ128rrik, X86::VPCMPUBZ128rmik, 0 },
+ { X86::VPCMPUBZ256rrik, X86::VPCMPUBZ256rmik, 0 },
+ { X86::VPCMPUBZrrik, X86::VPCMPUBZrmik, 0 },
+ { X86::VPCMPUDZ128rrik, X86::VPCMPUDZ128rmik, 0 },
+ { X86::VPCMPUDZ256rrik, X86::VPCMPUDZ256rmik, 0 },
+ { X86::VPCMPUDZrrik, X86::VPCMPUDZrmik, 0 },
+ { X86::VPCMPUQZ128rrik, X86::VPCMPUQZ128rmik, 0 },
+ { X86::VPCMPUQZ256rrik, X86::VPCMPUQZ256rmik, 0 },
+ { X86::VPCMPUQZrrik, X86::VPCMPUQZrmik, 0 },
+ { X86::VPCMPUWZ128rrik, X86::VPCMPUWZ128rmik, 0 },
+ { X86::VPCMPUWZ256rrik, X86::VPCMPUWZ256rmik, 0 },
+ { X86::VPCMPUWZrrik, X86::VPCMPUWZrmik, 0 },
+ { X86::VPCMPWZ128rrik, X86::VPCMPWZ128rmik, 0 },
+ { X86::VPCMPWZ256rrik, X86::VPCMPWZ256rmik, 0 },
+ { X86::VPCMPWZrrik, X86::VPCMPWZrmik, 0 },
+ { X86::VPCONFLICTDZ128rrk, X86::VPCONFLICTDZ128rmk, 0 },
+ { X86::VPCONFLICTDZ256rrk, X86::VPCONFLICTDZ256rmk, 0 },
+ { X86::VPCONFLICTDZrrk, X86::VPCONFLICTDZrmk, 0 },
+ { X86::VPCONFLICTQZ128rrk, X86::VPCONFLICTQZ128rmk, 0 },
+ { X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmk, 0 },
+ { X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmk, 0 },
+ { X86::VPDPBUSDSYrr, X86::VPDPBUSDSYrm, 0 },
+ { X86::VPDPBUSDSZ128r, X86::VPDPBUSDSZ128m, 0 },
+ { X86::VPDPBUSDSZ256r, X86::VPDPBUSDSZ256m, 0 },
+ { X86::VPDPBUSDSZr, X86::VPDPBUSDSZm, 0 },
+ { X86::VPDPBUSDSrr, X86::VPDPBUSDSrm, 0 },
+ { X86::VPDPBUSDYrr, X86::VPDPBUSDYrm, 0 },
+ { X86::VPDPBUSDZ128r, X86::VPDPBUSDZ128m, 0 },
+ { X86::VPDPBUSDZ256r, X86::VPDPBUSDZ256m, 0 },
+ { X86::VPDPBUSDZr, X86::VPDPBUSDZm, 0 },
+ { X86::VPDPBUSDrr, X86::VPDPBUSDrm, 0 },
+ { X86::VPDPWSSDSYrr, X86::VPDPWSSDSYrm, 0 },
+ { X86::VPDPWSSDSZ128r, X86::VPDPWSSDSZ128m, 0 },
+ { X86::VPDPWSSDSZ256r, X86::VPDPWSSDSZ256m, 0 },
+ { X86::VPDPWSSDSZr, X86::VPDPWSSDSZm, 0 },
+ { X86::VPDPWSSDSrr, X86::VPDPWSSDSrm, 0 },
+ { X86::VPDPWSSDYrr, X86::VPDPWSSDYrm, 0 },
+ { X86::VPDPWSSDZ128r, X86::VPDPWSSDZ128m, 0 },
+ { X86::VPDPWSSDZ256r, X86::VPDPWSSDZ256m, 0 },
+ { X86::VPDPWSSDZr, X86::VPDPWSSDZm, 0 },
+ { X86::VPDPWSSDrr, X86::VPDPWSSDrm, 0 },
+ { X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 },
+ { X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 },
+ { X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 },
+ { X86::VPERMDZ256rrkz, X86::VPERMDZ256rmkz, 0 },
+ { X86::VPERMDZrrkz, X86::VPERMDZrmkz, 0 },
+ { X86::VPERMI2B128rr, X86::VPERMI2B128rm, 0 },
+ { X86::VPERMI2B256rr, X86::VPERMI2B256rm, 0 },
+ { X86::VPERMI2Brr, X86::VPERMI2Brm, 0 },
+ { X86::VPERMI2D128rr, X86::VPERMI2D128rm, 0 },
+ { X86::VPERMI2D256rr, X86::VPERMI2D256rm, 0 },
+ { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 },
+ { X86::VPERMI2PD128rr, X86::VPERMI2PD128rm, 0 },
+ { X86::VPERMI2PD256rr, X86::VPERMI2PD256rm, 0 },
+ { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 },
+ { X86::VPERMI2PS128rr, X86::VPERMI2PS128rm, 0 },
+ { X86::VPERMI2PS256rr, X86::VPERMI2PS256rm, 0 },
+ { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 },
+ { X86::VPERMI2Q128rr, X86::VPERMI2Q128rm, 0 },
+ { X86::VPERMI2Q256rr, X86::VPERMI2Q256rm, 0 },
+ { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 },
+ { X86::VPERMI2W128rr, X86::VPERMI2W128rm, 0 },
+ { X86::VPERMI2W256rr, X86::VPERMI2W256rm, 0 },
+ { X86::VPERMI2Wrr, X86::VPERMI2Wrm, 0 },
+ { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYrm, 0 },
+ { X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 },
+ { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYrm, 0 },
+ { X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 },
+ { X86::VPERMILPDZ128rik, X86::VPERMILPDZ128mik, 0 },
+ { X86::VPERMILPDZ128rrkz, X86::VPERMILPDZ128rmkz, 0 },
+ { X86::VPERMILPDZ256rik, X86::VPERMILPDZ256mik, 0 },
+ { X86::VPERMILPDZ256rrkz, X86::VPERMILPDZ256rmkz, 0 },
+ { X86::VPERMILPDZrik, X86::VPERMILPDZmik, 0 },
+ { X86::VPERMILPDZrrkz, X86::VPERMILPDZrmkz, 0 },
+ { X86::VPERMILPSZ128rik, X86::VPERMILPSZ128mik, 0 },
+ { X86::VPERMILPSZ128rrkz, X86::VPERMILPSZ128rmkz, 0 },
+ { X86::VPERMILPSZ256rik, X86::VPERMILPSZ256mik, 0 },
+ { X86::VPERMILPSZ256rrkz, X86::VPERMILPSZ256rmkz, 0 },
+ { X86::VPERMILPSZrik, X86::VPERMILPSZmik, 0 },
+ { X86::VPERMILPSZrrkz, X86::VPERMILPSZrmkz, 0 },
+ { X86::VPERMPDZ256rik, X86::VPERMPDZ256mik, 0 },
+ { X86::VPERMPDZ256rrkz, X86::VPERMPDZ256rmkz, 0 },
+ { X86::VPERMPDZrik, X86::VPERMPDZmik, 0 },
+ { X86::VPERMPDZrrkz, X86::VPERMPDZrmkz, 0 },
+ { X86::VPERMPSZ256rrkz, X86::VPERMPSZ256rmkz, 0 },
+ { X86::VPERMPSZrrkz, X86::VPERMPSZrmkz, 0 },
+ { X86::VPERMQZ256rik, X86::VPERMQZ256mik, 0 },
+ { X86::VPERMQZ256rrkz, X86::VPERMQZ256rmkz, 0 },
+ { X86::VPERMQZrik, X86::VPERMQZmik, 0 },
+ { X86::VPERMQZrrkz, X86::VPERMQZrmkz, 0 },
+ { X86::VPERMT2B128rr, X86::VPERMT2B128rm, 0 },
+ { X86::VPERMT2B256rr, X86::VPERMT2B256rm, 0 },
+ { X86::VPERMT2Brr, X86::VPERMT2Brm, 0 },
+ { X86::VPERMT2D128rr, X86::VPERMT2D128rm, 0 },
+ { X86::VPERMT2D256rr, X86::VPERMT2D256rm, 0 },
+ { X86::VPERMT2Drr, X86::VPERMT2Drm, 0 },
+ { X86::VPERMT2PD128rr, X86::VPERMT2PD128rm, 0 },
+ { X86::VPERMT2PD256rr, X86::VPERMT2PD256rm, 0 },
+ { X86::VPERMT2PDrr, X86::VPERMT2PDrm, 0 },
+ { X86::VPERMT2PS128rr, X86::VPERMT2PS128rm, 0 },
+ { X86::VPERMT2PS256rr, X86::VPERMT2PS256rm, 0 },
+ { X86::VPERMT2PSrr, X86::VPERMT2PSrm, 0 },
+ { X86::VPERMT2Q128rr, X86::VPERMT2Q128rm, 0 },
+ { X86::VPERMT2Q256rr, X86::VPERMT2Q256rm, 0 },
+ { X86::VPERMT2Qrr, X86::VPERMT2Qrm, 0 },
+ { X86::VPERMT2W128rr, X86::VPERMT2W128rm, 0 },
+ { X86::VPERMT2W256rr, X86::VPERMT2W256rm, 0 },
+ { X86::VPERMT2Wrr, X86::VPERMT2Wrm, 0 },
+ { X86::VPERMWZ128rrkz, X86::VPERMWZ128rmkz, 0 },
+ { X86::VPERMWZ256rrkz, X86::VPERMWZ256rmkz, 0 },
+ { X86::VPERMWZrrkz, X86::VPERMWZrmkz, 0 },
+ { X86::VPEXPANDBZ128rrk, X86::VPEXPANDBZ128rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDBZ256rrk, X86::VPEXPANDBZ256rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDBZrrk, X86::VPEXPANDBZrmk, TB_NO_REVERSE },
+ { X86::VPEXPANDDZ128rrk, X86::VPEXPANDDZ128rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDDZ256rrk, X86::VPEXPANDDZ256rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDDZrrk, X86::VPEXPANDDZrmk, TB_NO_REVERSE },
+ { X86::VPEXPANDQZ128rrk, X86::VPEXPANDQZ128rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDQZ256rrk, X86::VPEXPANDQZ256rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDQZrrk, X86::VPEXPANDQZrmk, TB_NO_REVERSE },
+ { X86::VPEXPANDWZ128rrk, X86::VPEXPANDWZ128rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDWZ256rrk, X86::VPEXPANDWZ256rmk, TB_NO_REVERSE },
+ { X86::VPEXPANDWZrrk, X86::VPEXPANDWZrmk, TB_NO_REVERSE },
+ { X86::VPLZCNTDZ128rrk, X86::VPLZCNTDZ128rmk, 0 },
+ { X86::VPLZCNTDZ256rrk, X86::VPLZCNTDZ256rmk, 0 },
+ { X86::VPLZCNTDZrrk, X86::VPLZCNTDZrmk, 0 },
+ { X86::VPLZCNTQZ128rrk, X86::VPLZCNTQZ128rmk, 0 },
+ { X86::VPLZCNTQZ256rrk, X86::VPLZCNTQZ256rmk, 0 },
+ { X86::VPLZCNTQZrrk, X86::VPLZCNTQZrmk, 0 },
+ { X86::VPMADD52HUQZ128r, X86::VPMADD52HUQZ128m, 0 },
+ { X86::VPMADD52HUQZ256r, X86::VPMADD52HUQZ256m, 0 },
+ { X86::VPMADD52HUQZr, X86::VPMADD52HUQZm, 0 },
+ { X86::VPMADD52LUQZ128r, X86::VPMADD52LUQZ128m, 0 },
+ { X86::VPMADD52LUQZ256r, X86::VPMADD52LUQZ256m, 0 },
+ { X86::VPMADD52LUQZr, X86::VPMADD52LUQZm, 0 },
+ { X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 },
+ { X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 },
+ { X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 },
+ { X86::VPMADDWDZ128rrkz, X86::VPMADDWDZ128rmkz, 0 },
+ { X86::VPMADDWDZ256rrkz, X86::VPMADDWDZ256rmkz, 0 },
+ { X86::VPMADDWDZrrkz, X86::VPMADDWDZrmkz, 0 },
+ { X86::VPMAXSBZ128rrkz, X86::VPMAXSBZ128rmkz, 0 },
+ { X86::VPMAXSBZ256rrkz, X86::VPMAXSBZ256rmkz, 0 },
+ { X86::VPMAXSBZrrkz, X86::VPMAXSBZrmkz, 0 },
+ { X86::VPMAXSDZ128rrkz, X86::VPMAXSDZ128rmkz, 0 },
+ { X86::VPMAXSDZ256rrkz, X86::VPMAXSDZ256rmkz, 0 },
+ { X86::VPMAXSDZrrkz, X86::VPMAXSDZrmkz, 0 },
+ { X86::VPMAXSQZ128rrkz, X86::VPMAXSQZ128rmkz, 0 },
+ { X86::VPMAXSQZ256rrkz, X86::VPMAXSQZ256rmkz, 0 },
+ { X86::VPMAXSQZrrkz, X86::VPMAXSQZrmkz, 0 },
+ { X86::VPMAXSWZ128rrkz, X86::VPMAXSWZ128rmkz, 0 },
+ { X86::VPMAXSWZ256rrkz, X86::VPMAXSWZ256rmkz, 0 },
+ { X86::VPMAXSWZrrkz, X86::VPMAXSWZrmkz, 0 },
+ { X86::VPMAXUBZ128rrkz, X86::VPMAXUBZ128rmkz, 0 },
+ { X86::VPMAXUBZ256rrkz, X86::VPMAXUBZ256rmkz, 0 },
+ { X86::VPMAXUBZrrkz, X86::VPMAXUBZrmkz, 0 },
+ { X86::VPMAXUDZ128rrkz, X86::VPMAXUDZ128rmkz, 0 },
+ { X86::VPMAXUDZ256rrkz, X86::VPMAXUDZ256rmkz, 0 },
+ { X86::VPMAXUDZrrkz, X86::VPMAXUDZrmkz, 0 },
+ { X86::VPMAXUQZ128rrkz, X86::VPMAXUQZ128rmkz, 0 },
+ { X86::VPMAXUQZ256rrkz, X86::VPMAXUQZ256rmkz, 0 },
+ { X86::VPMAXUQZrrkz, X86::VPMAXUQZrmkz, 0 },
+ { X86::VPMAXUWZ128rrkz, X86::VPMAXUWZ128rmkz, 0 },
+ { X86::VPMAXUWZ256rrkz, X86::VPMAXUWZ256rmkz, 0 },
+ { X86::VPMAXUWZrrkz, X86::VPMAXUWZrmkz, 0 },
+ { X86::VPMINSBZ128rrkz, X86::VPMINSBZ128rmkz, 0 },
+ { X86::VPMINSBZ256rrkz, X86::VPMINSBZ256rmkz, 0 },
+ { X86::VPMINSBZrrkz, X86::VPMINSBZrmkz, 0 },
+ { X86::VPMINSDZ128rrkz, X86::VPMINSDZ128rmkz, 0 },
+ { X86::VPMINSDZ256rrkz, X86::VPMINSDZ256rmkz, 0 },
+ { X86::VPMINSDZrrkz, X86::VPMINSDZrmkz, 0 },
+ { X86::VPMINSQZ128rrkz, X86::VPMINSQZ128rmkz, 0 },
+ { X86::VPMINSQZ256rrkz, X86::VPMINSQZ256rmkz, 0 },
+ { X86::VPMINSQZrrkz, X86::VPMINSQZrmkz, 0 },
+ { X86::VPMINSWZ128rrkz, X86::VPMINSWZ128rmkz, 0 },
+ { X86::VPMINSWZ256rrkz, X86::VPMINSWZ256rmkz, 0 },
+ { X86::VPMINSWZrrkz, X86::VPMINSWZrmkz, 0 },
+ { X86::VPMINUBZ128rrkz, X86::VPMINUBZ128rmkz, 0 },
+ { X86::VPMINUBZ256rrkz, X86::VPMINUBZ256rmkz, 0 },
+ { X86::VPMINUBZrrkz, X86::VPMINUBZrmkz, 0 },
+ { X86::VPMINUDZ128rrkz, X86::VPMINUDZ128rmkz, 0 },
+ { X86::VPMINUDZ256rrkz, X86::VPMINUDZ256rmkz, 0 },
+ { X86::VPMINUDZrrkz, X86::VPMINUDZrmkz, 0 },
+ { X86::VPMINUQZ128rrkz, X86::VPMINUQZ128rmkz, 0 },
+ { X86::VPMINUQZ256rrkz, X86::VPMINUQZ256rmkz, 0 },
+ { X86::VPMINUQZrrkz, X86::VPMINUQZrmkz, 0 },
+ { X86::VPMINUWZ128rrkz, X86::VPMINUWZ128rmkz, 0 },
+ { X86::VPMINUWZ256rrkz, X86::VPMINUWZ256rmkz, 0 },
+ { X86::VPMINUWZrrkz, X86::VPMINUWZrmkz, 0 },
+ { X86::VPMOVSXBDZ128rrk, X86::VPMOVSXBDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBDZ256rrk, X86::VPMOVSXBDZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBDZrrk, X86::VPMOVSXBDZrmk, 0 },
+ { X86::VPMOVSXBQZ128rrk, X86::VPMOVSXBQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ256rrk, X86::VPMOVSXBQZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZrrk, X86::VPMOVSXBQZrmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ128rrk, X86::VPMOVSXBWZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ256rrk, X86::VPMOVSXBWZ256rmk, 0 },
+ { X86::VPMOVSXBWZrrk, X86::VPMOVSXBWZrmk, 0 },
+ { X86::VPMOVSXDQZ128rrk, X86::VPMOVSXDQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXDQZ256rrk, X86::VPMOVSXDQZ256rmk, 0 },
+ { X86::VPMOVSXDQZrrk, X86::VPMOVSXDQZrmk, 0 },
+ { X86::VPMOVSXWDZ128rrk, X86::VPMOVSXWDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXWDZ256rrk, X86::VPMOVSXWDZ256rmk, 0 },
+ { X86::VPMOVSXWDZrrk, X86::VPMOVSXWDZrmk, 0 },
+ { X86::VPMOVSXWQZ128rrk, X86::VPMOVSXWQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZ256rrk, X86::VPMOVSXWQZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZrrk, X86::VPMOVSXWQZrmk, 0 },
+ { X86::VPMOVZXBDZ128rrk, X86::VPMOVZXBDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ256rrk, X86::VPMOVZXBDZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZrrk, X86::VPMOVZXBDZrmk, 0 },
+ { X86::VPMOVZXBQZ128rrk, X86::VPMOVZXBQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ256rrk, X86::VPMOVZXBQZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZrrk, X86::VPMOVZXBQZrmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ128rrk, X86::VPMOVZXBWZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ256rrk, X86::VPMOVZXBWZ256rmk, 0 },
+ { X86::VPMOVZXBWZrrk, X86::VPMOVZXBWZrmk, 0 },
+ { X86::VPMOVZXDQZ128rrk, X86::VPMOVZXDQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXDQZ256rrk, X86::VPMOVZXDQZ256rmk, 0 },
+ { X86::VPMOVZXDQZrrk, X86::VPMOVZXDQZrmk, 0 },
+ { X86::VPMOVZXWDZ128rrk, X86::VPMOVZXWDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXWDZ256rrk, X86::VPMOVZXWDZ256rmk, 0 },
+ { X86::VPMOVZXWDZrrk, X86::VPMOVZXWDZrmk, 0 },
+ { X86::VPMOVZXWQZ128rrk, X86::VPMOVZXWQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZ256rrk, X86::VPMOVZXWQZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZrrk, X86::VPMOVZXWQZrmk, 0 },
+ { X86::VPMULDQZ128rrkz, X86::VPMULDQZ128rmkz, 0 },
+ { X86::VPMULDQZ256rrkz, X86::VPMULDQZ256rmkz, 0 },
+ { X86::VPMULDQZrrkz, X86::VPMULDQZrmkz, 0 },
+ { X86::VPMULHRSWZ128rrkz, X86::VPMULHRSWZ128rmkz, 0 },
+ { X86::VPMULHRSWZ256rrkz, X86::VPMULHRSWZ256rmkz, 0 },
+ { X86::VPMULHRSWZrrkz, X86::VPMULHRSWZrmkz, 0 },
+ { X86::VPMULHUWZ128rrkz, X86::VPMULHUWZ128rmkz, 0 },
+ { X86::VPMULHUWZ256rrkz, X86::VPMULHUWZ256rmkz, 0 },
+ { X86::VPMULHUWZrrkz, X86::VPMULHUWZrmkz, 0 },
+ { X86::VPMULHWZ128rrkz, X86::VPMULHWZ128rmkz, 0 },
+ { X86::VPMULHWZ256rrkz, X86::VPMULHWZ256rmkz, 0 },
+ { X86::VPMULHWZrrkz, X86::VPMULHWZrmkz, 0 },
+ { X86::VPMULLDZ128rrkz, X86::VPMULLDZ128rmkz, 0 },
+ { X86::VPMULLDZ256rrkz, X86::VPMULLDZ256rmkz, 0 },
+ { X86::VPMULLDZrrkz, X86::VPMULLDZrmkz, 0 },
+ { X86::VPMULLQZ128rrkz, X86::VPMULLQZ128rmkz, 0 },
+ { X86::VPMULLQZ256rrkz, X86::VPMULLQZ256rmkz, 0 },
+ { X86::VPMULLQZrrkz, X86::VPMULLQZrmkz, 0 },
+ { X86::VPMULLWZ128rrkz, X86::VPMULLWZ128rmkz, 0 },
+ { X86::VPMULLWZ256rrkz, X86::VPMULLWZ256rmkz, 0 },
+ { X86::VPMULLWZrrkz, X86::VPMULLWZrmkz, 0 },
+ { X86::VPMULTISHIFTQBZ128rrkz, X86::VPMULTISHIFTQBZ128rmkz, 0 },
+ { X86::VPMULTISHIFTQBZ256rrkz, X86::VPMULTISHIFTQBZ256rmkz, 0 },
+ { X86::VPMULTISHIFTQBZrrkz, X86::VPMULTISHIFTQBZrmkz, 0 },
+ { X86::VPMULUDQZ128rrkz, X86::VPMULUDQZ128rmkz, 0 },
+ { X86::VPMULUDQZ256rrkz, X86::VPMULUDQZ256rmkz, 0 },
+ { X86::VPMULUDQZrrkz, X86::VPMULUDQZrmkz, 0 },
+ { X86::VPOPCNTBZ128rrk, X86::VPOPCNTBZ128rmk, 0 },
+ { X86::VPOPCNTBZ256rrk, X86::VPOPCNTBZ256rmk, 0 },
+ { X86::VPOPCNTBZrrk, X86::VPOPCNTBZrmk, 0 },
+ { X86::VPOPCNTDZ128rrk, X86::VPOPCNTDZ128rmk, 0 },
+ { X86::VPOPCNTDZ256rrk, X86::VPOPCNTDZ256rmk, 0 },
+ { X86::VPOPCNTDZrrk, X86::VPOPCNTDZrmk, 0 },
+ { X86::VPOPCNTQZ128rrk, X86::VPOPCNTQZ128rmk, 0 },
+ { X86::VPOPCNTQZ256rrk, X86::VPOPCNTQZ256rmk, 0 },
+ { X86::VPOPCNTQZrrk, X86::VPOPCNTQZrmk, 0 },
+ { X86::VPOPCNTWZ128rrk, X86::VPOPCNTWZ128rmk, 0 },
+ { X86::VPOPCNTWZ256rrk, X86::VPOPCNTWZ256rmk, 0 },
+ { X86::VPOPCNTWZrrk, X86::VPOPCNTWZrmk, 0 },
+ { X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 },
+ { X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 },
+ { X86::VPORDZrrkz, X86::VPORDZrmkz, 0 },
+ { X86::VPORQZ128rrkz, X86::VPORQZ128rmkz, 0 },
+ { X86::VPORQZ256rrkz, X86::VPORQZ256rmkz, 0 },
+ { X86::VPORQZrrkz, X86::VPORQZrmkz, 0 },
+ { X86::VPPERMrrr, X86::VPPERMrrm, 0 },
+ { X86::VPROLDZ128rik, X86::VPROLDZ128mik, 0 },
+ { X86::VPROLDZ256rik, X86::VPROLDZ256mik, 0 },
+ { X86::VPROLDZrik, X86::VPROLDZmik, 0 },
+ { X86::VPROLQZ128rik, X86::VPROLQZ128mik, 0 },
+ { X86::VPROLQZ256rik, X86::VPROLQZ256mik, 0 },
+ { X86::VPROLQZrik, X86::VPROLQZmik, 0 },
+ { X86::VPROLVDZ128rrkz, X86::VPROLVDZ128rmkz, 0 },
+ { X86::VPROLVDZ256rrkz, X86::VPROLVDZ256rmkz, 0 },
+ { X86::VPROLVDZrrkz, X86::VPROLVDZrmkz, 0 },
+ { X86::VPROLVQZ128rrkz, X86::VPROLVQZ128rmkz, 0 },
+ { X86::VPROLVQZ256rrkz, X86::VPROLVQZ256rmkz, 0 },
+ { X86::VPROLVQZrrkz, X86::VPROLVQZrmkz, 0 },
+ { X86::VPRORDZ128rik, X86::VPRORDZ128mik, 0 },
+ { X86::VPRORDZ256rik, X86::VPRORDZ256mik, 0 },
+ { X86::VPRORDZrik, X86::VPRORDZmik, 0 },
+ { X86::VPRORQZ128rik, X86::VPRORQZ128mik, 0 },
+ { X86::VPRORQZ256rik, X86::VPRORQZ256mik, 0 },
+ { X86::VPRORQZrik, X86::VPRORQZmik, 0 },
+ { X86::VPRORVDZ128rrkz, X86::VPRORVDZ128rmkz, 0 },
+ { X86::VPRORVDZ256rrkz, X86::VPRORVDZ256rmkz, 0 },
+ { X86::VPRORVDZrrkz, X86::VPRORVDZrmkz, 0 },
+ { X86::VPRORVQZ128rrkz, X86::VPRORVQZ128rmkz, 0 },
+ { X86::VPRORVQZ256rrkz, X86::VPRORVQZ256rmkz, 0 },
+ { X86::VPRORVQZrrkz, X86::VPRORVQZrmkz, 0 },
+ { X86::VPSHLDDZ128rrikz, X86::VPSHLDDZ128rmikz, 0 },
+ { X86::VPSHLDDZ256rrikz, X86::VPSHLDDZ256rmikz, 0 },
+ { X86::VPSHLDDZrrikz, X86::VPSHLDDZrmikz, 0 },
+ { X86::VPSHLDQZ128rrikz, X86::VPSHLDQZ128rmikz, 0 },
+ { X86::VPSHLDQZ256rrikz, X86::VPSHLDQZ256rmikz, 0 },
+ { X86::VPSHLDQZrrikz, X86::VPSHLDQZrmikz, 0 },
+ { X86::VPSHLDVDZ128r, X86::VPSHLDVDZ128m, 0 },
+ { X86::VPSHLDVDZ256r, X86::VPSHLDVDZ256m, 0 },
+ { X86::VPSHLDVDZr, X86::VPSHLDVDZm, 0 },
+ { X86::VPSHLDVQZ128r, X86::VPSHLDVQZ128m, 0 },
+ { X86::VPSHLDVQZ256r, X86::VPSHLDVQZ256m, 0 },
+ { X86::VPSHLDVQZr, X86::VPSHLDVQZm, 0 },
+ { X86::VPSHLDVWZ128r, X86::VPSHLDVWZ128m, 0 },
+ { X86::VPSHLDVWZ256r, X86::VPSHLDVWZ256m, 0 },
+ { X86::VPSHLDVWZr, X86::VPSHLDVWZm, 0 },
+ { X86::VPSHLDWZ128rrikz, X86::VPSHLDWZ128rmikz, 0 },
+ { X86::VPSHLDWZ256rrikz, X86::VPSHLDWZ256rmikz, 0 },
+ { X86::VPSHLDWZrrikz, X86::VPSHLDWZrmikz, 0 },
+ { X86::VPSHRDDZ128rrikz, X86::VPSHRDDZ128rmikz, 0 },
+ { X86::VPSHRDDZ256rrikz, X86::VPSHRDDZ256rmikz, 0 },
+ { X86::VPSHRDDZrrikz, X86::VPSHRDDZrmikz, 0 },
+ { X86::VPSHRDQZ128rrikz, X86::VPSHRDQZ128rmikz, 0 },
+ { X86::VPSHRDQZ256rrikz, X86::VPSHRDQZ256rmikz, 0 },
+ { X86::VPSHRDQZrrikz, X86::VPSHRDQZrmikz, 0 },
+ { X86::VPSHRDVDZ128r, X86::VPSHRDVDZ128m, 0 },
+ { X86::VPSHRDVDZ256r, X86::VPSHRDVDZ256m, 0 },
+ { X86::VPSHRDVDZr, X86::VPSHRDVDZm, 0 },
+ { X86::VPSHRDVQZ128r, X86::VPSHRDVQZ128m, 0 },
+ { X86::VPSHRDVQZ256r, X86::VPSHRDVQZ256m, 0 },
+ { X86::VPSHRDVQZr, X86::VPSHRDVQZm, 0 },
+ { X86::VPSHRDVWZ128r, X86::VPSHRDVWZ128m, 0 },
+ { X86::VPSHRDVWZ256r, X86::VPSHRDVWZ256m, 0 },
+ { X86::VPSHRDVWZr, X86::VPSHRDVWZm, 0 },
+ { X86::VPSHRDWZ128rrikz, X86::VPSHRDWZ128rmikz, 0 },
+ { X86::VPSHRDWZ256rrikz, X86::VPSHRDWZ256rmikz, 0 },
+ { X86::VPSHRDWZrrikz, X86::VPSHRDWZrmikz, 0 },
+ { X86::VPSHUFBITQMBZ128rrk, X86::VPSHUFBITQMBZ128rmk, 0 },
+ { X86::VPSHUFBITQMBZ256rrk, X86::VPSHUFBITQMBZ256rmk, 0 },
+ { X86::VPSHUFBITQMBZrrk, X86::VPSHUFBITQMBZrmk, 0 },
+ { X86::VPSHUFBZ128rrkz, X86::VPSHUFBZ128rmkz, 0 },
+ { X86::VPSHUFBZ256rrkz, X86::VPSHUFBZ256rmkz, 0 },
+ { X86::VPSHUFBZrrkz, X86::VPSHUFBZrmkz, 0 },
+ { X86::VPSHUFDZ128rik, X86::VPSHUFDZ128mik, 0 },
+ { X86::VPSHUFDZ256rik, X86::VPSHUFDZ256mik, 0 },
+ { X86::VPSHUFDZrik, X86::VPSHUFDZmik, 0 },
+ { X86::VPSHUFHWZ128rik, X86::VPSHUFHWZ128mik, 0 },
+ { X86::VPSHUFHWZ256rik, X86::VPSHUFHWZ256mik, 0 },
+ { X86::VPSHUFHWZrik, X86::VPSHUFHWZmik, 0 },
+ { X86::VPSHUFLWZ128rik, X86::VPSHUFLWZ128mik, 0 },
+ { X86::VPSHUFLWZ256rik, X86::VPSHUFLWZ256mik, 0 },
+ { X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 },
+ { X86::VPSLLDZ128rik, X86::VPSLLDZ128mik, 0 },
+ { X86::VPSLLDZ128rrkz, X86::VPSLLDZ128rmkz, 0 },
+ { X86::VPSLLDZ256rik, X86::VPSLLDZ256mik, 0 },
+ { X86::VPSLLDZ256rrkz, X86::VPSLLDZ256rmkz, 0 },
+ { X86::VPSLLDZrik, X86::VPSLLDZmik, 0 },
+ { X86::VPSLLDZrrkz, X86::VPSLLDZrmkz, 0 },
+ { X86::VPSLLQZ128rik, X86::VPSLLQZ128mik, 0 },
+ { X86::VPSLLQZ128rrkz, X86::VPSLLQZ128rmkz, 0 },
+ { X86::VPSLLQZ256rik, X86::VPSLLQZ256mik, 0 },
+ { X86::VPSLLQZ256rrkz, X86::VPSLLQZ256rmkz, 0 },
+ { X86::VPSLLQZrik, X86::VPSLLQZmik, 0 },
+ { X86::VPSLLQZrrkz, X86::VPSLLQZrmkz, 0 },
+ { X86::VPSLLVDZ128rrkz, X86::VPSLLVDZ128rmkz, 0 },
+ { X86::VPSLLVDZ256rrkz, X86::VPSLLVDZ256rmkz, 0 },
+ { X86::VPSLLVDZrrkz, X86::VPSLLVDZrmkz, 0 },
+ { X86::VPSLLVQZ128rrkz, X86::VPSLLVQZ128rmkz, 0 },
+ { X86::VPSLLVQZ256rrkz, X86::VPSLLVQZ256rmkz, 0 },
+ { X86::VPSLLVQZrrkz, X86::VPSLLVQZrmkz, 0 },
+ { X86::VPSLLVWZ128rrkz, X86::VPSLLVWZ128rmkz, 0 },
+ { X86::VPSLLVWZ256rrkz, X86::VPSLLVWZ256rmkz, 0 },
+ { X86::VPSLLVWZrrkz, X86::VPSLLVWZrmkz, 0 },
+ { X86::VPSLLWZ128rik, X86::VPSLLWZ128mik, 0 },
+ { X86::VPSLLWZ128rrkz, X86::VPSLLWZ128rmkz, 0 },
+ { X86::VPSLLWZ256rik, X86::VPSLLWZ256mik, 0 },
+ { X86::VPSLLWZ256rrkz, X86::VPSLLWZ256rmkz, 0 },
+ { X86::VPSLLWZrik, X86::VPSLLWZmik, 0 },
+ { X86::VPSLLWZrrkz, X86::VPSLLWZrmkz, 0 },
+ { X86::VPSRADZ128rik, X86::VPSRADZ128mik, 0 },
+ { X86::VPSRADZ128rrkz, X86::VPSRADZ128rmkz, 0 },
+ { X86::VPSRADZ256rik, X86::VPSRADZ256mik, 0 },
+ { X86::VPSRADZ256rrkz, X86::VPSRADZ256rmkz, 0 },
+ { X86::VPSRADZrik, X86::VPSRADZmik, 0 },
+ { X86::VPSRADZrrkz, X86::VPSRADZrmkz, 0 },
+ { X86::VPSRAQZ128rik, X86::VPSRAQZ128mik, 0 },
+ { X86::VPSRAQZ128rrkz, X86::VPSRAQZ128rmkz, 0 },
+ { X86::VPSRAQZ256rik, X86::VPSRAQZ256mik, 0 },
+ { X86::VPSRAQZ256rrkz, X86::VPSRAQZ256rmkz, 0 },
+ { X86::VPSRAQZrik, X86::VPSRAQZmik, 0 },
+ { X86::VPSRAQZrrkz, X86::VPSRAQZrmkz, 0 },
+ { X86::VPSRAVDZ128rrkz, X86::VPSRAVDZ128rmkz, 0 },
+ { X86::VPSRAVDZ256rrkz, X86::VPSRAVDZ256rmkz, 0 },
+ { X86::VPSRAVDZrrkz, X86::VPSRAVDZrmkz, 0 },
+ { X86::VPSRAVQZ128rrkz, X86::VPSRAVQZ128rmkz, 0 },
+ { X86::VPSRAVQZ256rrkz, X86::VPSRAVQZ256rmkz, 0 },
+ { X86::VPSRAVQZrrkz, X86::VPSRAVQZrmkz, 0 },
+ { X86::VPSRAVWZ128rrkz, X86::VPSRAVWZ128rmkz, 0 },
+ { X86::VPSRAVWZ256rrkz, X86::VPSRAVWZ256rmkz, 0 },
+ { X86::VPSRAVWZrrkz, X86::VPSRAVWZrmkz, 0 },
+ { X86::VPSRAWZ128rik, X86::VPSRAWZ128mik, 0 },
+ { X86::VPSRAWZ128rrkz, X86::VPSRAWZ128rmkz, 0 },
+ { X86::VPSRAWZ256rik, X86::VPSRAWZ256mik, 0 },
+ { X86::VPSRAWZ256rrkz, X86::VPSRAWZ256rmkz, 0 },
+ { X86::VPSRAWZrik, X86::VPSRAWZmik, 0 },
+ { X86::VPSRAWZrrkz, X86::VPSRAWZrmkz, 0 },
+ { X86::VPSRLDZ128rik, X86::VPSRLDZ128mik, 0 },
+ { X86::VPSRLDZ128rrkz, X86::VPSRLDZ128rmkz, 0 },
+ { X86::VPSRLDZ256rik, X86::VPSRLDZ256mik, 0 },
+ { X86::VPSRLDZ256rrkz, X86::VPSRLDZ256rmkz, 0 },
+ { X86::VPSRLDZrik, X86::VPSRLDZmik, 0 },
+ { X86::VPSRLDZrrkz, X86::VPSRLDZrmkz, 0 },
+ { X86::VPSRLQZ128rik, X86::VPSRLQZ128mik, 0 },
+ { X86::VPSRLQZ128rrkz, X86::VPSRLQZ128rmkz, 0 },
+ { X86::VPSRLQZ256rik, X86::VPSRLQZ256mik, 0 },
+ { X86::VPSRLQZ256rrkz, X86::VPSRLQZ256rmkz, 0 },
+ { X86::VPSRLQZrik, X86::VPSRLQZmik, 0 },
+ { X86::VPSRLQZrrkz, X86::VPSRLQZrmkz, 0 },
+ { X86::VPSRLVDZ128rrkz, X86::VPSRLVDZ128rmkz, 0 },
+ { X86::VPSRLVDZ256rrkz, X86::VPSRLVDZ256rmkz, 0 },
+ { X86::VPSRLVDZrrkz, X86::VPSRLVDZrmkz, 0 },
+ { X86::VPSRLVQZ128rrkz, X86::VPSRLVQZ128rmkz, 0 },
+ { X86::VPSRLVQZ256rrkz, X86::VPSRLVQZ256rmkz, 0 },
+ { X86::VPSRLVQZrrkz, X86::VPSRLVQZrmkz, 0 },
+ { X86::VPSRLVWZ128rrkz, X86::VPSRLVWZ128rmkz, 0 },
+ { X86::VPSRLVWZ256rrkz, X86::VPSRLVWZ256rmkz, 0 },
+ { X86::VPSRLVWZrrkz, X86::VPSRLVWZrmkz, 0 },
+ { X86::VPSRLWZ128rik, X86::VPSRLWZ128mik, 0 },
+ { X86::VPSRLWZ128rrkz, X86::VPSRLWZ128rmkz, 0 },
+ { X86::VPSRLWZ256rik, X86::VPSRLWZ256mik, 0 },
+ { X86::VPSRLWZ256rrkz, X86::VPSRLWZ256rmkz, 0 },
+ { X86::VPSRLWZrik, X86::VPSRLWZmik, 0 },
+ { X86::VPSRLWZrrkz, X86::VPSRLWZrmkz, 0 },
+ { X86::VPSUBBZ128rrkz, X86::VPSUBBZ128rmkz, 0 },
+ { X86::VPSUBBZ256rrkz, X86::VPSUBBZ256rmkz, 0 },
+ { X86::VPSUBBZrrkz, X86::VPSUBBZrmkz, 0 },
+ { X86::VPSUBDZ128rrkz, X86::VPSUBDZ128rmkz, 0 },
+ { X86::VPSUBDZ256rrkz, X86::VPSUBDZ256rmkz, 0 },
+ { X86::VPSUBDZrrkz, X86::VPSUBDZrmkz, 0 },
+ { X86::VPSUBQZ128rrkz, X86::VPSUBQZ128rmkz, 0 },
+ { X86::VPSUBQZ256rrkz, X86::VPSUBQZ256rmkz, 0 },
+ { X86::VPSUBQZrrkz, X86::VPSUBQZrmkz, 0 },
+ { X86::VPSUBSBZ128rrkz, X86::VPSUBSBZ128rmkz, 0 },
+ { X86::VPSUBSBZ256rrkz, X86::VPSUBSBZ256rmkz, 0 },
+ { X86::VPSUBSBZrrkz, X86::VPSUBSBZrmkz, 0 },
+ { X86::VPSUBSWZ128rrkz, X86::VPSUBSWZ128rmkz, 0 },
+ { X86::VPSUBSWZ256rrkz, X86::VPSUBSWZ256rmkz, 0 },
+ { X86::VPSUBSWZrrkz, X86::VPSUBSWZrmkz, 0 },
+ { X86::VPSUBUSBZ128rrkz, X86::VPSUBUSBZ128rmkz, 0 },
+ { X86::VPSUBUSBZ256rrkz, X86::VPSUBUSBZ256rmkz, 0 },
+ { X86::VPSUBUSBZrrkz, X86::VPSUBUSBZrmkz, 0 },
+ { X86::VPSUBUSWZ128rrkz, X86::VPSUBUSWZ128rmkz, 0 },
+ { X86::VPSUBUSWZ256rrkz, X86::VPSUBUSWZ256rmkz, 0 },
+ { X86::VPSUBUSWZrrkz, X86::VPSUBUSWZrmkz, 0 },
+ { X86::VPSUBWZ128rrkz, X86::VPSUBWZ128rmkz, 0 },
+ { X86::VPSUBWZ256rrkz, X86::VPSUBWZ256rmkz, 0 },
+ { X86::VPSUBWZrrkz, X86::VPSUBWZrmkz, 0 },
+ { X86::VPTERNLOGDZ128rri, X86::VPTERNLOGDZ128rmi, 0 },
+ { X86::VPTERNLOGDZ256rri, X86::VPTERNLOGDZ256rmi, 0 },
+ { X86::VPTERNLOGDZrri, X86::VPTERNLOGDZrmi, 0 },
+ { X86::VPTERNLOGQZ128rri, X86::VPTERNLOGQZ128rmi, 0 },
+ { X86::VPTERNLOGQZ256rri, X86::VPTERNLOGQZ256rmi, 0 },
+ { X86::VPTERNLOGQZrri, X86::VPTERNLOGQZrmi, 0 },
+ { X86::VPTESTMBZ128rrk, X86::VPTESTMBZ128rmk, 0 },
+ { X86::VPTESTMBZ256rrk, X86::VPTESTMBZ256rmk, 0 },
+ { X86::VPTESTMBZrrk, X86::VPTESTMBZrmk, 0 },
+ { X86::VPTESTMDZ128rrk, X86::VPTESTMDZ128rmk, 0 },
+ { X86::VPTESTMDZ256rrk, X86::VPTESTMDZ256rmk, 0 },
+ { X86::VPTESTMDZrrk, X86::VPTESTMDZrmk, 0 },
+ { X86::VPTESTMQZ128rrk, X86::VPTESTMQZ128rmk, 0 },
+ { X86::VPTESTMQZ256rrk, X86::VPTESTMQZ256rmk, 0 },
+ { X86::VPTESTMQZrrk, X86::VPTESTMQZrmk, 0 },
+ { X86::VPTESTMWZ128rrk, X86::VPTESTMWZ128rmk, 0 },
+ { X86::VPTESTMWZ256rrk, X86::VPTESTMWZ256rmk, 0 },
+ { X86::VPTESTMWZrrk, X86::VPTESTMWZrmk, 0 },
+ { X86::VPTESTNMBZ128rrk, X86::VPTESTNMBZ128rmk, 0 },
+ { X86::VPTESTNMBZ256rrk, X86::VPTESTNMBZ256rmk, 0 },
+ { X86::VPTESTNMBZrrk, X86::VPTESTNMBZrmk, 0 },
+ { X86::VPTESTNMDZ128rrk, X86::VPTESTNMDZ128rmk, 0 },
+ { X86::VPTESTNMDZ256rrk, X86::VPTESTNMDZ256rmk, 0 },
+ { X86::VPTESTNMDZrrk, X86::VPTESTNMDZrmk, 0 },
+ { X86::VPTESTNMQZ128rrk, X86::VPTESTNMQZ128rmk, 0 },
+ { X86::VPTESTNMQZ256rrk, X86::VPTESTNMQZ256rmk, 0 },
+ { X86::VPTESTNMQZrrk, X86::VPTESTNMQZrmk, 0 },
+ { X86::VPTESTNMWZ128rrk, X86::VPTESTNMWZ128rmk, 0 },
+ { X86::VPTESTNMWZ256rrk, X86::VPTESTNMWZ256rmk, 0 },
+ { X86::VPTESTNMWZrrk, X86::VPTESTNMWZrmk, 0 },
+ { X86::VPUNPCKHBWZ128rrkz, X86::VPUNPCKHBWZ128rmkz, 0 },
+ { X86::VPUNPCKHBWZ256rrkz, X86::VPUNPCKHBWZ256rmkz, 0 },
+ { X86::VPUNPCKHBWZrrkz, X86::VPUNPCKHBWZrmkz, 0 },
+ { X86::VPUNPCKHDQZ128rrkz, X86::VPUNPCKHDQZ128rmkz, 0 },
+ { X86::VPUNPCKHDQZ256rrkz, X86::VPUNPCKHDQZ256rmkz, 0 },
+ { X86::VPUNPCKHDQZrrkz, X86::VPUNPCKHDQZrmkz, 0 },
+ { X86::VPUNPCKHQDQZ128rrkz, X86::VPUNPCKHQDQZ128rmkz, 0 },
+ { X86::VPUNPCKHQDQZ256rrkz, X86::VPUNPCKHQDQZ256rmkz, 0 },
+ { X86::VPUNPCKHQDQZrrkz, X86::VPUNPCKHQDQZrmkz, 0 },
+ { X86::VPUNPCKHWDZ128rrkz, X86::VPUNPCKHWDZ128rmkz, 0 },
+ { X86::VPUNPCKHWDZ256rrkz, X86::VPUNPCKHWDZ256rmkz, 0 },
+ { X86::VPUNPCKHWDZrrkz, X86::VPUNPCKHWDZrmkz, 0 },
+ { X86::VPUNPCKLBWZ128rrkz, X86::VPUNPCKLBWZ128rmkz, 0 },
+ { X86::VPUNPCKLBWZ256rrkz, X86::VPUNPCKLBWZ256rmkz, 0 },
+ { X86::VPUNPCKLBWZrrkz, X86::VPUNPCKLBWZrmkz, 0 },
+ { X86::VPUNPCKLDQZ128rrkz, X86::VPUNPCKLDQZ128rmkz, 0 },
+ { X86::VPUNPCKLDQZ256rrkz, X86::VPUNPCKLDQZ256rmkz, 0 },
+ { X86::VPUNPCKLDQZrrkz, X86::VPUNPCKLDQZrmkz, 0 },
+ { X86::VPUNPCKLQDQZ128rrkz, X86::VPUNPCKLQDQZ128rmkz, 0 },
+ { X86::VPUNPCKLQDQZ256rrkz, X86::VPUNPCKLQDQZ256rmkz, 0 },
+ { X86::VPUNPCKLQDQZrrkz, X86::VPUNPCKLQDQZrmkz, 0 },
+ { X86::VPUNPCKLWDZ128rrkz, X86::VPUNPCKLWDZ128rmkz, 0 },
+ { X86::VPUNPCKLWDZ256rrkz, X86::VPUNPCKLWDZ256rmkz, 0 },
+ { X86::VPUNPCKLWDZrrkz, X86::VPUNPCKLWDZrmkz, 0 },
+ { X86::VPXORDZ128rrkz, X86::VPXORDZ128rmkz, 0 },
+ { X86::VPXORDZ256rrkz, X86::VPXORDZ256rmkz, 0 },
+ { X86::VPXORDZrrkz, X86::VPXORDZrmkz, 0 },
+ { X86::VPXORQZ128rrkz, X86::VPXORQZ128rmkz, 0 },
+ { X86::VPXORQZ256rrkz, X86::VPXORQZ256rmkz, 0 },
+ { X86::VPXORQZrrkz, X86::VPXORQZrmkz, 0 },
+ { X86::VRANGEPDZ128rrikz, X86::VRANGEPDZ128rmikz, 0 },
+ { X86::VRANGEPDZ256rrikz, X86::VRANGEPDZ256rmikz, 0 },
+ { X86::VRANGEPDZrrikz, X86::VRANGEPDZrmikz, 0 },
+ { X86::VRANGEPSZ128rrikz, X86::VRANGEPSZ128rmikz, 0 },
+ { X86::VRANGEPSZ256rrikz, X86::VRANGEPSZ256rmikz, 0 },
+ { X86::VRANGEPSZrrikz, X86::VRANGEPSZrmikz, 0 },
+ { X86::VRANGESDZrrikz, X86::VRANGESDZrmikz, TB_NO_REVERSE },
+ { X86::VRANGESSZrrikz, X86::VRANGESSZrmikz, TB_NO_REVERSE },
+ { X86::VRCP14PDZ128rk, X86::VRCP14PDZ128mk, 0 },
+ { X86::VRCP14PDZ256rk, X86::VRCP14PDZ256mk, 0 },
+ { X86::VRCP14PDZrk, X86::VRCP14PDZmk, 0 },
+ { X86::VRCP14PSZ128rk, X86::VRCP14PSZ128mk, 0 },
+ { X86::VRCP14PSZ256rk, X86::VRCP14PSZ256mk, 0 },
+ { X86::VRCP14PSZrk, X86::VRCP14PSZmk, 0 },
+ { X86::VRCP14SDZrrkz, X86::VRCP14SDZrmkz, TB_NO_REVERSE },
+ { X86::VRCP14SSZrrkz, X86::VRCP14SSZrmkz, TB_NO_REVERSE },
+ { X86::VRCP28PDZrk, X86::VRCP28PDZmk, 0 },
+ { X86::VRCP28PSZrk, X86::VRCP28PSZmk, 0 },
+ { X86::VRCP28SDZrkz, X86::VRCP28SDZmkz, TB_NO_REVERSE },
+ { X86::VRCP28SSZrkz, X86::VRCP28SSZmkz, TB_NO_REVERSE },
+ { X86::VREDUCEPDZ128rrik, X86::VREDUCEPDZ128rmik, 0 },
+ { X86::VREDUCEPDZ256rrik, X86::VREDUCEPDZ256rmik, 0 },
+ { X86::VREDUCEPDZrrik, X86::VREDUCEPDZrmik, 0 },
+ { X86::VREDUCEPSZ128rrik, X86::VREDUCEPSZ128rmik, 0 },
+ { X86::VREDUCEPSZ256rrik, X86::VREDUCEPSZ256rmik, 0 },
+ { X86::VREDUCEPSZrrik, X86::VREDUCEPSZrmik, 0 },
+ { X86::VREDUCESDZrrikz, X86::VREDUCESDZrmikz, TB_NO_REVERSE },
+ { X86::VREDUCESSZrrikz, X86::VREDUCESSZrmikz, TB_NO_REVERSE },
+ { X86::VRNDSCALEPDZ128rrik, X86::VRNDSCALEPDZ128rmik, 0 },
+ { X86::VRNDSCALEPDZ256rrik, X86::VRNDSCALEPDZ256rmik, 0 },
+ { X86::VRNDSCALEPDZrrik, X86::VRNDSCALEPDZrmik, 0 },
+ { X86::VRNDSCALEPSZ128rrik, X86::VRNDSCALEPSZ128rmik, 0 },
+ { X86::VRNDSCALEPSZ256rrik, X86::VRNDSCALEPSZ256rmik, 0 },
+ { X86::VRNDSCALEPSZrrik, X86::VRNDSCALEPSZrmik, 0 },
+ { X86::VRNDSCALESDZr_Intkz, X86::VRNDSCALESDZm_Intkz, TB_NO_REVERSE },
+ { X86::VRNDSCALESSZr_Intkz, X86::VRNDSCALESSZm_Intkz, TB_NO_REVERSE },
+ { X86::VRSQRT14PDZ128rk, X86::VRSQRT14PDZ128mk, 0 },
+ { X86::VRSQRT14PDZ256rk, X86::VRSQRT14PDZ256mk, 0 },
+ { X86::VRSQRT14PDZrk, X86::VRSQRT14PDZmk, 0 },
+ { X86::VRSQRT14PSZ128rk, X86::VRSQRT14PSZ128mk, 0 },
+ { X86::VRSQRT14PSZ256rk, X86::VRSQRT14PSZ256mk, 0 },
+ { X86::VRSQRT14PSZrk, X86::VRSQRT14PSZmk, 0 },
+ { X86::VRSQRT14SDZrrkz, X86::VRSQRT14SDZrmkz, TB_NO_REVERSE },
+ { X86::VRSQRT14SSZrrkz, X86::VRSQRT14SSZrmkz, TB_NO_REVERSE },
+ { X86::VRSQRT28PDZrk, X86::VRSQRT28PDZmk, 0 },
+ { X86::VRSQRT28PSZrk, X86::VRSQRT28PSZmk, 0 },
+ { X86::VRSQRT28SDZrkz, X86::VRSQRT28SDZmkz, TB_NO_REVERSE },
+ { X86::VRSQRT28SSZrkz, X86::VRSQRT28SSZmkz, TB_NO_REVERSE },
+ { X86::VSCALEFPDZ128rrkz, X86::VSCALEFPDZ128rmkz, 0 },
+ { X86::VSCALEFPDZ256rrkz, X86::VSCALEFPDZ256rmkz, 0 },
+ { X86::VSCALEFPDZrrkz, X86::VSCALEFPDZrmkz, 0 },
+ { X86::VSCALEFPSZ128rrkz, X86::VSCALEFPSZ128rmkz, 0 },
+ { X86::VSCALEFPSZ256rrkz, X86::VSCALEFPSZ256rmkz, 0 },
+ { X86::VSCALEFPSZrrkz, X86::VSCALEFPSZrmkz, 0 },
+ { X86::VSCALEFSDZrrkz, X86::VSCALEFSDZrmkz, TB_NO_REVERSE },
+ { X86::VSCALEFSSZrrkz, X86::VSCALEFSSZrmkz, TB_NO_REVERSE },
+ { X86::VSHUFF32X4Z256rrikz, X86::VSHUFF32X4Z256rmikz, 0 },
+ { X86::VSHUFF32X4Zrrikz, X86::VSHUFF32X4Zrmikz, 0 },
+ { X86::VSHUFF64X2Z256rrikz, X86::VSHUFF64X2Z256rmikz, 0 },
+ { X86::VSHUFF64X2Zrrikz, X86::VSHUFF64X2Zrmikz, 0 },
+ { X86::VSHUFI32X4Z256rrikz, X86::VSHUFI32X4Z256rmikz, 0 },
+ { X86::VSHUFI32X4Zrrikz, X86::VSHUFI32X4Zrmikz, 0 },
+ { X86::VSHUFI64X2Z256rrikz, X86::VSHUFI64X2Z256rmikz, 0 },
+ { X86::VSHUFI64X2Zrrikz, X86::VSHUFI64X2Zrmikz, 0 },
+ { X86::VSHUFPDZ128rrikz, X86::VSHUFPDZ128rmikz, 0 },
+ { X86::VSHUFPDZ256rrikz, X86::VSHUFPDZ256rmikz, 0 },
+ { X86::VSHUFPDZrrikz, X86::VSHUFPDZrmikz, 0 },
+ { X86::VSHUFPSZ128rrikz, X86::VSHUFPSZ128rmikz, 0 },
+ { X86::VSHUFPSZ256rrikz, X86::VSHUFPSZ256rmikz, 0 },
+ { X86::VSHUFPSZrrikz, X86::VSHUFPSZrmikz, 0 },
+ { X86::VSQRTPDZ128rk, X86::VSQRTPDZ128mk, 0 },
+ { X86::VSQRTPDZ256rk, X86::VSQRTPDZ256mk, 0 },
+ { X86::VSQRTPDZrk, X86::VSQRTPDZmk, 0 },
+ { X86::VSQRTPSZ128rk, X86::VSQRTPSZ128mk, 0 },
+ { X86::VSQRTPSZ256rk, X86::VSQRTPSZ256mk, 0 },
+ { X86::VSQRTPSZrk, X86::VSQRTPSZmk, 0 },
+ { X86::VSQRTSDZr_Intkz, X86::VSQRTSDZm_Intkz, TB_NO_REVERSE },
+ { X86::VSQRTSSZr_Intkz, X86::VSQRTSSZm_Intkz, TB_NO_REVERSE },
+ { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 },
+ { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 },
+ { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 },
+ { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 },
+ { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 },
+ { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 },
+ { X86::VSUBSDZrr_Intkz, X86::VSUBSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VSUBSSZrr_Intkz, X86::VSUBSSZrm_Intkz, TB_NO_REVERSE },
+ { X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0 },
+ { X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0 },
+ { X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0 },
+ { X86::VUNPCKHPSZ128rrkz, X86::VUNPCKHPSZ128rmkz, 0 },
+ { X86::VUNPCKHPSZ256rrkz, X86::VUNPCKHPSZ256rmkz, 0 },
+ { X86::VUNPCKHPSZrrkz, X86::VUNPCKHPSZrmkz, 0 },
+ { X86::VUNPCKLPDZ128rrkz, X86::VUNPCKLPDZ128rmkz, 0 },
+ { X86::VUNPCKLPDZ256rrkz, X86::VUNPCKLPDZ256rmkz, 0 },
+ { X86::VUNPCKLPDZrrkz, X86::VUNPCKLPDZrmkz, 0 },
+ { X86::VUNPCKLPSZ128rrkz, X86::VUNPCKLPSZ128rmkz, 0 },
+ { X86::VUNPCKLPSZ256rrkz, X86::VUNPCKLPSZ256rmkz, 0 },
+ { X86::VUNPCKLPSZrrkz, X86::VUNPCKLPSZrmkz, 0 },
+ { X86::VXORPDZ128rrkz, X86::VXORPDZ128rmkz, 0 },
+ { X86::VXORPDZ256rrkz, X86::VXORPDZ256rmkz, 0 },
+ { X86::VXORPDZrrkz, X86::VXORPDZrmkz, 0 },
+ { X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 },
+ { X86::VXORPSZ256rrkz, X86::VXORPSZ256rmkz, 0 },
+ { X86::VXORPSZrrkz, X86::VXORPSZrmkz, 0 },
+};
+
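+// Fold table for instructions whose operand 4 can be rewritten as a load:
+// each entry maps { register-form opcode, memory-form opcode, flags }.
+// TB_NO_REVERSE marks folds that must not be unfolded back to the register
+// form (the memory form reads fewer bytes than the full register).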
+static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
+ { X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 },
+ { X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 },
+ { X86::VADDPDZrrk, X86::VADDPDZrmk, 0 },
+ { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 },
+ { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 },
+ { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 },
+ { X86::VADDSDZrr_Intk, X86::VADDSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VADDSSZrr_Intk, X86::VADDSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VALIGNDZ128rrik, X86::VALIGNDZ128rmik, 0 },
+ { X86::VALIGNDZ256rrik, X86::VALIGNDZ256rmik, 0 },
+ { X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0 },
+ { X86::VALIGNQZ128rrik, X86::VALIGNQZ128rmik, 0 },
+ { X86::VALIGNQZ256rrik, X86::VALIGNQZ256rmik, 0 },
+ { X86::VALIGNQZrrik, X86::VALIGNQZrmik, 0 },
+ { X86::VANDNPDZ128rrk, X86::VANDNPDZ128rmk, 0 },
+ { X86::VANDNPDZ256rrk, X86::VANDNPDZ256rmk, 0 },
+ { X86::VANDNPDZrrk, X86::VANDNPDZrmk, 0 },
+ { X86::VANDNPSZ128rrk, X86::VANDNPSZ128rmk, 0 },
+ { X86::VANDNPSZ256rrk, X86::VANDNPSZ256rmk, 0 },
+ { X86::VANDNPSZrrk, X86::VANDNPSZrmk, 0 },
+ { X86::VANDPDZ128rrk, X86::VANDPDZ128rmk, 0 },
+ { X86::VANDPDZ256rrk, X86::VANDPDZ256rmk, 0 },
+ { X86::VANDPDZrrk, X86::VANDPDZrmk, 0 },
+ { X86::VANDPSZ128rrk, X86::VANDPSZ128rmk, 0 },
+ { X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0 },
+ { X86::VANDPSZrrk, X86::VANDPSZrmk, 0 },
+ { X86::VCVTNE2PS2BF16Z128rrk, X86::VCVTNE2PS2BF16Z128rmk, 0 },
+ { X86::VCVTNE2PS2BF16Z256rrk, X86::VCVTNE2PS2BF16Z256rmk, 0 },
+ { X86::VCVTNE2PS2BF16Zrrk, X86::VCVTNE2PS2BF16Zrmk, 0 },
+ { X86::VCVTSD2SSZrr_Intk, X86::VCVTSD2SSZrm_Intk, TB_NO_REVERSE },
+ { X86::VCVTSS2SDZrr_Intk, X86::VCVTSS2SDZrm_Intk, TB_NO_REVERSE },
+ { X86::VDBPSADBWZ128rrik, X86::VDBPSADBWZ128rmik, 0 },
+ { X86::VDBPSADBWZ256rrik, X86::VDBPSADBWZ256rmik, 0 },
+ { X86::VDBPSADBWZrrik, X86::VDBPSADBWZrmik, 0 },
+ { X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 },
+ { X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 },
+ { X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 },
+ { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 },
+ { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 },
+ { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 },
+ { X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VDPBF16PSZ128rk, X86::VDPBF16PSZ128mk, 0 },
+ { X86::VDPBF16PSZ128rkz, X86::VDPBF16PSZ128mkz, 0 },
+ { X86::VDPBF16PSZ256rk, X86::VDPBF16PSZ256mk, 0 },
+ { X86::VDPBF16PSZ256rkz, X86::VDPBF16PSZ256mkz, 0 },
+ { X86::VDPBF16PSZrk, X86::VDPBF16PSZmk, 0 },
+ { X86::VDPBF16PSZrkz, X86::VDPBF16PSZmkz, 0 },
+ { X86::VFIXUPIMMPDZ128rrik, X86::VFIXUPIMMPDZ128rmik, 0 },
+ { X86::VFIXUPIMMPDZ128rrikz, X86::VFIXUPIMMPDZ128rmikz, 0 },
+ { X86::VFIXUPIMMPDZ256rrik, X86::VFIXUPIMMPDZ256rmik, 0 },
+ { X86::VFIXUPIMMPDZ256rrikz, X86::VFIXUPIMMPDZ256rmikz, 0 },
+ { X86::VFIXUPIMMPDZrrik, X86::VFIXUPIMMPDZrmik, 0 },
+ { X86::VFIXUPIMMPDZrrikz, X86::VFIXUPIMMPDZrmikz, 0 },
+ { X86::VFIXUPIMMPSZ128rrik, X86::VFIXUPIMMPSZ128rmik, 0 },
+ { X86::VFIXUPIMMPSZ128rrikz, X86::VFIXUPIMMPSZ128rmikz, 0 },
+ { X86::VFIXUPIMMPSZ256rrik, X86::VFIXUPIMMPSZ256rmik, 0 },
+ { X86::VFIXUPIMMPSZ256rrikz, X86::VFIXUPIMMPSZ256rmikz, 0 },
+ { X86::VFIXUPIMMPSZrrik, X86::VFIXUPIMMPSZrmik, 0 },
+ { X86::VFIXUPIMMPSZrrikz, X86::VFIXUPIMMPSZrmikz, 0 },
+ { X86::VFIXUPIMMSDZrrik, X86::VFIXUPIMMSDZrmik, TB_NO_REVERSE },
+ { X86::VFIXUPIMMSDZrrikz, X86::VFIXUPIMMSDZrmikz, TB_NO_REVERSE },
+ { X86::VFIXUPIMMSSZrrik, X86::VFIXUPIMMSSZrmik, TB_NO_REVERSE },
+ { X86::VFIXUPIMMSSZrrikz, X86::VFIXUPIMMSSZrmikz, TB_NO_REVERSE },
+ { X86::VFMADD132PDZ128rk, X86::VFMADD132PDZ128mk, 0 },
+ { X86::VFMADD132PDZ128rkz, X86::VFMADD132PDZ128mkz, 0 },
+ { X86::VFMADD132PDZ256rk, X86::VFMADD132PDZ256mk, 0 },
+ { X86::VFMADD132PDZ256rkz, X86::VFMADD132PDZ256mkz, 0 },
+ { X86::VFMADD132PDZrk, X86::VFMADD132PDZmk, 0 },
+ { X86::VFMADD132PDZrkz, X86::VFMADD132PDZmkz, 0 },
+ { X86::VFMADD132PSZ128rk, X86::VFMADD132PSZ128mk, 0 },
+ { X86::VFMADD132PSZ128rkz, X86::VFMADD132PSZ128mkz, 0 },
+ { X86::VFMADD132PSZ256rk, X86::VFMADD132PSZ256mk, 0 },
+ { X86::VFMADD132PSZ256rkz, X86::VFMADD132PSZ256mkz, 0 },
+ { X86::VFMADD132PSZrk, X86::VFMADD132PSZmk, 0 },
+ { X86::VFMADD132PSZrkz, X86::VFMADD132PSZmkz, 0 },
+ { X86::VFMADD132SDZr_Intk, X86::VFMADD132SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD132SDZr_Intkz, X86::VFMADD132SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADD132SSZr_Intk, X86::VFMADD132SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD132SSZr_Intkz, X86::VFMADD132SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADD213PDZ128rk, X86::VFMADD213PDZ128mk, 0 },
+ { X86::VFMADD213PDZ128rkz, X86::VFMADD213PDZ128mkz, 0 },
+ { X86::VFMADD213PDZ256rk, X86::VFMADD213PDZ256mk, 0 },
+ { X86::VFMADD213PDZ256rkz, X86::VFMADD213PDZ256mkz, 0 },
+ { X86::VFMADD213PDZrk, X86::VFMADD213PDZmk, 0 },
+ { X86::VFMADD213PDZrkz, X86::VFMADD213PDZmkz, 0 },
+ { X86::VFMADD213PSZ128rk, X86::VFMADD213PSZ128mk, 0 },
+ { X86::VFMADD213PSZ128rkz, X86::VFMADD213PSZ128mkz, 0 },
+ { X86::VFMADD213PSZ256rk, X86::VFMADD213PSZ256mk, 0 },
+ { X86::VFMADD213PSZ256rkz, X86::VFMADD213PSZ256mkz, 0 },
+ { X86::VFMADD213PSZrk, X86::VFMADD213PSZmk, 0 },
+ { X86::VFMADD213PSZrkz, X86::VFMADD213PSZmkz, 0 },
+ { X86::VFMADD213SDZr_Intk, X86::VFMADD213SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD213SDZr_Intkz, X86::VFMADD213SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADD213SSZr_Intk, X86::VFMADD213SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD213SSZr_Intkz, X86::VFMADD213SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADD231PDZ128rk, X86::VFMADD231PDZ128mk, 0 },
+ { X86::VFMADD231PDZ128rkz, X86::VFMADD231PDZ128mkz, 0 },
+ { X86::VFMADD231PDZ256rk, X86::VFMADD231PDZ256mk, 0 },
+ { X86::VFMADD231PDZ256rkz, X86::VFMADD231PDZ256mkz, 0 },
+ { X86::VFMADD231PDZrk, X86::VFMADD231PDZmk, 0 },
+ { X86::VFMADD231PDZrkz, X86::VFMADD231PDZmkz, 0 },
+ { X86::VFMADD231PSZ128rk, X86::VFMADD231PSZ128mk, 0 },
+ { X86::VFMADD231PSZ128rkz, X86::VFMADD231PSZ128mkz, 0 },
+ { X86::VFMADD231PSZ256rk, X86::VFMADD231PSZ256mk, 0 },
+ { X86::VFMADD231PSZ256rkz, X86::VFMADD231PSZ256mkz, 0 },
+ { X86::VFMADD231PSZrk, X86::VFMADD231PSZmk, 0 },
+ { X86::VFMADD231PSZrkz, X86::VFMADD231PSZmkz, 0 },
+ { X86::VFMADD231SDZr_Intk, X86::VFMADD231SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD231SDZr_Intkz, X86::VFMADD231SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADD231SSZr_Intk, X86::VFMADD231SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD231SSZr_Intkz, X86::VFMADD231SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADDSUB132PDZ128rk, X86::VFMADDSUB132PDZ128mk, 0 },
+ { X86::VFMADDSUB132PDZ128rkz, X86::VFMADDSUB132PDZ128mkz, 0 },
+ { X86::VFMADDSUB132PDZ256rk, X86::VFMADDSUB132PDZ256mk, 0 },
+ { X86::VFMADDSUB132PDZ256rkz, X86::VFMADDSUB132PDZ256mkz, 0 },
+ { X86::VFMADDSUB132PDZrk, X86::VFMADDSUB132PDZmk, 0 },
+ { X86::VFMADDSUB132PDZrkz, X86::VFMADDSUB132PDZmkz, 0 },
+ { X86::VFMADDSUB132PSZ128rk, X86::VFMADDSUB132PSZ128mk, 0 },
+ { X86::VFMADDSUB132PSZ128rkz, X86::VFMADDSUB132PSZ128mkz, 0 },
+ { X86::VFMADDSUB132PSZ256rk, X86::VFMADDSUB132PSZ256mk, 0 },
+ { X86::VFMADDSUB132PSZ256rkz, X86::VFMADDSUB132PSZ256mkz, 0 },
+ { X86::VFMADDSUB132PSZrk, X86::VFMADDSUB132PSZmk, 0 },
+ { X86::VFMADDSUB132PSZrkz, X86::VFMADDSUB132PSZmkz, 0 },
+ { X86::VFMADDSUB213PDZ128rk, X86::VFMADDSUB213PDZ128mk, 0 },
+ { X86::VFMADDSUB213PDZ128rkz, X86::VFMADDSUB213PDZ128mkz, 0 },
+ { X86::VFMADDSUB213PDZ256rk, X86::VFMADDSUB213PDZ256mk, 0 },
+ { X86::VFMADDSUB213PDZ256rkz, X86::VFMADDSUB213PDZ256mkz, 0 },
+ { X86::VFMADDSUB213PDZrk, X86::VFMADDSUB213PDZmk, 0 },
+ { X86::VFMADDSUB213PDZrkz, X86::VFMADDSUB213PDZmkz, 0 },
+ { X86::VFMADDSUB213PSZ128rk, X86::VFMADDSUB213PSZ128mk, 0 },
+ { X86::VFMADDSUB213PSZ128rkz, X86::VFMADDSUB213PSZ128mkz, 0 },
+ { X86::VFMADDSUB213PSZ256rk, X86::VFMADDSUB213PSZ256mk, 0 },
+ { X86::VFMADDSUB213PSZ256rkz, X86::VFMADDSUB213PSZ256mkz, 0 },
+ { X86::VFMADDSUB213PSZrk, X86::VFMADDSUB213PSZmk, 0 },
+ { X86::VFMADDSUB213PSZrkz, X86::VFMADDSUB213PSZmkz, 0 },
+ { X86::VFMADDSUB231PDZ128rk, X86::VFMADDSUB231PDZ128mk, 0 },
+ { X86::VFMADDSUB231PDZ128rkz, X86::VFMADDSUB231PDZ128mkz, 0 },
+ { X86::VFMADDSUB231PDZ256rk, X86::VFMADDSUB231PDZ256mk, 0 },
+ { X86::VFMADDSUB231PDZ256rkz, X86::VFMADDSUB231PDZ256mkz, 0 },
+ { X86::VFMADDSUB231PDZrk, X86::VFMADDSUB231PDZmk, 0 },
+ { X86::VFMADDSUB231PDZrkz, X86::VFMADDSUB231PDZmkz, 0 },
+ { X86::VFMADDSUB231PSZ128rk, X86::VFMADDSUB231PSZ128mk, 0 },
+ { X86::VFMADDSUB231PSZ128rkz, X86::VFMADDSUB231PSZ128mkz, 0 },
+ { X86::VFMADDSUB231PSZ256rk, X86::VFMADDSUB231PSZ256mk, 0 },
+ { X86::VFMADDSUB231PSZ256rkz, X86::VFMADDSUB231PSZ256mkz, 0 },
+ { X86::VFMADDSUB231PSZrk, X86::VFMADDSUB231PSZmk, 0 },
+ { X86::VFMADDSUB231PSZrkz, X86::VFMADDSUB231PSZmkz, 0 },
+ { X86::VFMSUB132PDZ128rk, X86::VFMSUB132PDZ128mk, 0 },
+ { X86::VFMSUB132PDZ128rkz, X86::VFMSUB132PDZ128mkz, 0 },
+ { X86::VFMSUB132PDZ256rk, X86::VFMSUB132PDZ256mk, 0 },
+ { X86::VFMSUB132PDZ256rkz, X86::VFMSUB132PDZ256mkz, 0 },
+ { X86::VFMSUB132PDZrk, X86::VFMSUB132PDZmk, 0 },
+ { X86::VFMSUB132PDZrkz, X86::VFMSUB132PDZmkz, 0 },
+ { X86::VFMSUB132PSZ128rk, X86::VFMSUB132PSZ128mk, 0 },
+ { X86::VFMSUB132PSZ128rkz, X86::VFMSUB132PSZ128mkz, 0 },
+ { X86::VFMSUB132PSZ256rk, X86::VFMSUB132PSZ256mk, 0 },
+ { X86::VFMSUB132PSZ256rkz, X86::VFMSUB132PSZ256mkz, 0 },
+ { X86::VFMSUB132PSZrk, X86::VFMSUB132PSZmk, 0 },
+ { X86::VFMSUB132PSZrkz, X86::VFMSUB132PSZmkz, 0 },
+ { X86::VFMSUB132SDZr_Intk, X86::VFMSUB132SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB132SDZr_Intkz, X86::VFMSUB132SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUB132SSZr_Intk, X86::VFMSUB132SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB132SSZr_Intkz, X86::VFMSUB132SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUB213PDZ128rk, X86::VFMSUB213PDZ128mk, 0 },
+ { X86::VFMSUB213PDZ128rkz, X86::VFMSUB213PDZ128mkz, 0 },
+ { X86::VFMSUB213PDZ256rk, X86::VFMSUB213PDZ256mk, 0 },
+ { X86::VFMSUB213PDZ256rkz, X86::VFMSUB213PDZ256mkz, 0 },
+ { X86::VFMSUB213PDZrk, X86::VFMSUB213PDZmk, 0 },
+ { X86::VFMSUB213PDZrkz, X86::VFMSUB213PDZmkz, 0 },
+ { X86::VFMSUB213PSZ128rk, X86::VFMSUB213PSZ128mk, 0 },
+ { X86::VFMSUB213PSZ128rkz, X86::VFMSUB213PSZ128mkz, 0 },
+ { X86::VFMSUB213PSZ256rk, X86::VFMSUB213PSZ256mk, 0 },
+ { X86::VFMSUB213PSZ256rkz, X86::VFMSUB213PSZ256mkz, 0 },
+ { X86::VFMSUB213PSZrk, X86::VFMSUB213PSZmk, 0 },
+ { X86::VFMSUB213PSZrkz, X86::VFMSUB213PSZmkz, 0 },
+ { X86::VFMSUB213SDZr_Intk, X86::VFMSUB213SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB213SDZr_Intkz, X86::VFMSUB213SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUB213SSZr_Intk, X86::VFMSUB213SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB213SSZr_Intkz, X86::VFMSUB213SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUB231PDZ128rk, X86::VFMSUB231PDZ128mk, 0 },
+ { X86::VFMSUB231PDZ128rkz, X86::VFMSUB231PDZ128mkz, 0 },
+ { X86::VFMSUB231PDZ256rk, X86::VFMSUB231PDZ256mk, 0 },
+ { X86::VFMSUB231PDZ256rkz, X86::VFMSUB231PDZ256mkz, 0 },
+ { X86::VFMSUB231PDZrk, X86::VFMSUB231PDZmk, 0 },
+ { X86::VFMSUB231PDZrkz, X86::VFMSUB231PDZmkz, 0 },
+ { X86::VFMSUB231PSZ128rk, X86::VFMSUB231PSZ128mk, 0 },
+ { X86::VFMSUB231PSZ128rkz, X86::VFMSUB231PSZ128mkz, 0 },
+ { X86::VFMSUB231PSZ256rk, X86::VFMSUB231PSZ256mk, 0 },
+ { X86::VFMSUB231PSZ256rkz, X86::VFMSUB231PSZ256mkz, 0 },
+ { X86::VFMSUB231PSZrk, X86::VFMSUB231PSZmk, 0 },
+ { X86::VFMSUB231PSZrkz, X86::VFMSUB231PSZmkz, 0 },
+ { X86::VFMSUB231SDZr_Intk, X86::VFMSUB231SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB231SDZr_Intkz, X86::VFMSUB231SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUB231SSZr_Intk, X86::VFMSUB231SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB231SSZr_Intkz, X86::VFMSUB231SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUBADD132PDZ128rk, X86::VFMSUBADD132PDZ128mk, 0 },
+ { X86::VFMSUBADD132PDZ128rkz, X86::VFMSUBADD132PDZ128mkz, 0 },
+ { X86::VFMSUBADD132PDZ256rk, X86::VFMSUBADD132PDZ256mk, 0 },
+ { X86::VFMSUBADD132PDZ256rkz, X86::VFMSUBADD132PDZ256mkz, 0 },
+ { X86::VFMSUBADD132PDZrk, X86::VFMSUBADD132PDZmk, 0 },
+ { X86::VFMSUBADD132PDZrkz, X86::VFMSUBADD132PDZmkz, 0 },
+ { X86::VFMSUBADD132PSZ128rk, X86::VFMSUBADD132PSZ128mk, 0 },
+ { X86::VFMSUBADD132PSZ128rkz, X86::VFMSUBADD132PSZ128mkz, 0 },
+ { X86::VFMSUBADD132PSZ256rk, X86::VFMSUBADD132PSZ256mk, 0 },
+ { X86::VFMSUBADD132PSZ256rkz, X86::VFMSUBADD132PSZ256mkz, 0 },
+ { X86::VFMSUBADD132PSZrk, X86::VFMSUBADD132PSZmk, 0 },
+ { X86::VFMSUBADD132PSZrkz, X86::VFMSUBADD132PSZmkz, 0 },
+ { X86::VFMSUBADD213PDZ128rk, X86::VFMSUBADD213PDZ128mk, 0 },
+ { X86::VFMSUBADD213PDZ128rkz, X86::VFMSUBADD213PDZ128mkz, 0 },
+ { X86::VFMSUBADD213PDZ256rk, X86::VFMSUBADD213PDZ256mk, 0 },
+ { X86::VFMSUBADD213PDZ256rkz, X86::VFMSUBADD213PDZ256mkz, 0 },
+ { X86::VFMSUBADD213PDZrk, X86::VFMSUBADD213PDZmk, 0 },
+ { X86::VFMSUBADD213PDZrkz, X86::VFMSUBADD213PDZmkz, 0 },
+ { X86::VFMSUBADD213PSZ128rk, X86::VFMSUBADD213PSZ128mk, 0 },
+ { X86::VFMSUBADD213PSZ128rkz, X86::VFMSUBADD213PSZ128mkz, 0 },
+ { X86::VFMSUBADD213PSZ256rk, X86::VFMSUBADD213PSZ256mk, 0 },
+ { X86::VFMSUBADD213PSZ256rkz, X86::VFMSUBADD213PSZ256mkz, 0 },
+ { X86::VFMSUBADD213PSZrk, X86::VFMSUBADD213PSZmk, 0 },
+ { X86::VFMSUBADD213PSZrkz, X86::VFMSUBADD213PSZmkz, 0 },
+ { X86::VFMSUBADD231PDZ128rk, X86::VFMSUBADD231PDZ128mk, 0 },
+ { X86::VFMSUBADD231PDZ128rkz, X86::VFMSUBADD231PDZ128mkz, 0 },
+ { X86::VFMSUBADD231PDZ256rk, X86::VFMSUBADD231PDZ256mk, 0 },
+ { X86::VFMSUBADD231PDZ256rkz, X86::VFMSUBADD231PDZ256mkz, 0 },
+ { X86::VFMSUBADD231PDZrk, X86::VFMSUBADD231PDZmk, 0 },
+ { X86::VFMSUBADD231PDZrkz, X86::VFMSUBADD231PDZmkz, 0 },
+ { X86::VFMSUBADD231PSZ128rk, X86::VFMSUBADD231PSZ128mk, 0 },
+ { X86::VFMSUBADD231PSZ128rkz, X86::VFMSUBADD231PSZ128mkz, 0 },
+ { X86::VFMSUBADD231PSZ256rk, X86::VFMSUBADD231PSZ256mk, 0 },
+ { X86::VFMSUBADD231PSZ256rkz, X86::VFMSUBADD231PSZ256mkz, 0 },
+ { X86::VFMSUBADD231PSZrk, X86::VFMSUBADD231PSZmk, 0 },
+ { X86::VFMSUBADD231PSZrkz, X86::VFMSUBADD231PSZmkz, 0 },
+ { X86::VFNMADD132PDZ128rk, X86::VFNMADD132PDZ128mk, 0 },
+ { X86::VFNMADD132PDZ128rkz, X86::VFNMADD132PDZ128mkz, 0 },
+ { X86::VFNMADD132PDZ256rk, X86::VFNMADD132PDZ256mk, 0 },
+ { X86::VFNMADD132PDZ256rkz, X86::VFNMADD132PDZ256mkz, 0 },
+ { X86::VFNMADD132PDZrk, X86::VFNMADD132PDZmk, 0 },
+ { X86::VFNMADD132PDZrkz, X86::VFNMADD132PDZmkz, 0 },
+ { X86::VFNMADD132PSZ128rk, X86::VFNMADD132PSZ128mk, 0 },
+ { X86::VFNMADD132PSZ128rkz, X86::VFNMADD132PSZ128mkz, 0 },
+ { X86::VFNMADD132PSZ256rk, X86::VFNMADD132PSZ256mk, 0 },
+ { X86::VFNMADD132PSZ256rkz, X86::VFNMADD132PSZ256mkz, 0 },
+ { X86::VFNMADD132PSZrk, X86::VFNMADD132PSZmk, 0 },
+ { X86::VFNMADD132PSZrkz, X86::VFNMADD132PSZmkz, 0 },
+ { X86::VFNMADD132SDZr_Intk, X86::VFNMADD132SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD132SDZr_Intkz, X86::VFNMADD132SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMADD132SSZr_Intk, X86::VFNMADD132SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD132SSZr_Intkz, X86::VFNMADD132SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMADD213PDZ128rk, X86::VFNMADD213PDZ128mk, 0 },
+ { X86::VFNMADD213PDZ128rkz, X86::VFNMADD213PDZ128mkz, 0 },
+ { X86::VFNMADD213PDZ256rk, X86::VFNMADD213PDZ256mk, 0 },
+ { X86::VFNMADD213PDZ256rkz, X86::VFNMADD213PDZ256mkz, 0 },
+ { X86::VFNMADD213PDZrk, X86::VFNMADD213PDZmk, 0 },
+ { X86::VFNMADD213PDZrkz, X86::VFNMADD213PDZmkz, 0 },
+ { X86::VFNMADD213PSZ128rk, X86::VFNMADD213PSZ128mk, 0 },
+ { X86::VFNMADD213PSZ128rkz, X86::VFNMADD213PSZ128mkz, 0 },
+ { X86::VFNMADD213PSZ256rk, X86::VFNMADD213PSZ256mk, 0 },
+ { X86::VFNMADD213PSZ256rkz, X86::VFNMADD213PSZ256mkz, 0 },
+ { X86::VFNMADD213PSZrk, X86::VFNMADD213PSZmk, 0 },
+ { X86::VFNMADD213PSZrkz, X86::VFNMADD213PSZmkz, 0 },
+ { X86::VFNMADD213SDZr_Intk, X86::VFNMADD213SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD213SDZr_Intkz, X86::VFNMADD213SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMADD213SSZr_Intk, X86::VFNMADD213SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD213SSZr_Intkz, X86::VFNMADD213SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMADD231PDZ128rk, X86::VFNMADD231PDZ128mk, 0 },
+ { X86::VFNMADD231PDZ128rkz, X86::VFNMADD231PDZ128mkz, 0 },
+ { X86::VFNMADD231PDZ256rk, X86::VFNMADD231PDZ256mk, 0 },
+ { X86::VFNMADD231PDZ256rkz, X86::VFNMADD231PDZ256mkz, 0 },
+ { X86::VFNMADD231PDZrk, X86::VFNMADD231PDZmk, 0 },
+ { X86::VFNMADD231PDZrkz, X86::VFNMADD231PDZmkz, 0 },
+ { X86::VFNMADD231PSZ128rk, X86::VFNMADD231PSZ128mk, 0 },
+ { X86::VFNMADD231PSZ128rkz, X86::VFNMADD231PSZ128mkz, 0 },
+ { X86::VFNMADD231PSZ256rk, X86::VFNMADD231PSZ256mk, 0 },
+ { X86::VFNMADD231PSZ256rkz, X86::VFNMADD231PSZ256mkz, 0 },
+ { X86::VFNMADD231PSZrk, X86::VFNMADD231PSZmk, 0 },
+ { X86::VFNMADD231PSZrkz, X86::VFNMADD231PSZmkz, 0 },
+ { X86::VFNMADD231SDZr_Intk, X86::VFNMADD231SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD231SDZr_Intkz, X86::VFNMADD231SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMADD231SSZr_Intk, X86::VFNMADD231SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD231SSZr_Intkz, X86::VFNMADD231SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB132PDZ128rk, X86::VFNMSUB132PDZ128mk, 0 },
+ { X86::VFNMSUB132PDZ128rkz, X86::VFNMSUB132PDZ128mkz, 0 },
+ { X86::VFNMSUB132PDZ256rk, X86::VFNMSUB132PDZ256mk, 0 },
+ { X86::VFNMSUB132PDZ256rkz, X86::VFNMSUB132PDZ256mkz, 0 },
+ { X86::VFNMSUB132PDZrk, X86::VFNMSUB132PDZmk, 0 },
+ { X86::VFNMSUB132PDZrkz, X86::VFNMSUB132PDZmkz, 0 },
+ { X86::VFNMSUB132PSZ128rk, X86::VFNMSUB132PSZ128mk, 0 },
+ { X86::VFNMSUB132PSZ128rkz, X86::VFNMSUB132PSZ128mkz, 0 },
+ { X86::VFNMSUB132PSZ256rk, X86::VFNMSUB132PSZ256mk, 0 },
+ { X86::VFNMSUB132PSZ256rkz, X86::VFNMSUB132PSZ256mkz, 0 },
+ { X86::VFNMSUB132PSZrk, X86::VFNMSUB132PSZmk, 0 },
+ { X86::VFNMSUB132PSZrkz, X86::VFNMSUB132PSZmkz, 0 },
+ { X86::VFNMSUB132SDZr_Intk, X86::VFNMSUB132SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB132SDZr_Intkz, X86::VFNMSUB132SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB132SSZr_Intk, X86::VFNMSUB132SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB132SSZr_Intkz, X86::VFNMSUB132SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB213PDZ128rk, X86::VFNMSUB213PDZ128mk, 0 },
+ { X86::VFNMSUB213PDZ128rkz, X86::VFNMSUB213PDZ128mkz, 0 },
+ { X86::VFNMSUB213PDZ256rk, X86::VFNMSUB213PDZ256mk, 0 },
+ { X86::VFNMSUB213PDZ256rkz, X86::VFNMSUB213PDZ256mkz, 0 },
+ { X86::VFNMSUB213PDZrk, X86::VFNMSUB213PDZmk, 0 },
+ { X86::VFNMSUB213PDZrkz, X86::VFNMSUB213PDZmkz, 0 },
+ { X86::VFNMSUB213PSZ128rk, X86::VFNMSUB213PSZ128mk, 0 },
+ { X86::VFNMSUB213PSZ128rkz, X86::VFNMSUB213PSZ128mkz, 0 },
+ { X86::VFNMSUB213PSZ256rk, X86::VFNMSUB213PSZ256mk, 0 },
+ { X86::VFNMSUB213PSZ256rkz, X86::VFNMSUB213PSZ256mkz, 0 },
+ { X86::VFNMSUB213PSZrk, X86::VFNMSUB213PSZmk, 0 },
+ { X86::VFNMSUB213PSZrkz, X86::VFNMSUB213PSZmkz, 0 },
+ { X86::VFNMSUB213SDZr_Intk, X86::VFNMSUB213SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB213SDZr_Intkz, X86::VFNMSUB213SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB213SSZr_Intk, X86::VFNMSUB213SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB213SSZr_Intkz, X86::VFNMSUB213SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB231PDZ128rk, X86::VFNMSUB231PDZ128mk, 0 },
+ { X86::VFNMSUB231PDZ128rkz, X86::VFNMSUB231PDZ128mkz, 0 },
+ { X86::VFNMSUB231PDZ256rk, X86::VFNMSUB231PDZ256mk, 0 },
+ { X86::VFNMSUB231PDZ256rkz, X86::VFNMSUB231PDZ256mkz, 0 },
+ { X86::VFNMSUB231PDZrk, X86::VFNMSUB231PDZmk, 0 },
+ { X86::VFNMSUB231PDZrkz, X86::VFNMSUB231PDZmkz, 0 },
+ { X86::VFNMSUB231PSZ128rk, X86::VFNMSUB231PSZ128mk, 0 },
+ { X86::VFNMSUB231PSZ128rkz, X86::VFNMSUB231PSZ128mkz, 0 },
+ { X86::VFNMSUB231PSZ256rk, X86::VFNMSUB231PSZ256mk, 0 },
+ { X86::VFNMSUB231PSZ256rkz, X86::VFNMSUB231PSZ256mkz, 0 },
+ { X86::VFNMSUB231PSZrk, X86::VFNMSUB231PSZmk, 0 },
+ { X86::VFNMSUB231PSZrkz, X86::VFNMSUB231PSZmkz, 0 },
+ { X86::VFNMSUB231SDZr_Intk, X86::VFNMSUB231SDZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB231SDZr_Intkz, X86::VFNMSUB231SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB231SSZr_Intk, X86::VFNMSUB231SSZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB231SSZr_Intkz, X86::VFNMSUB231SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VGETEXPSDZrk, X86::VGETEXPSDZmk, TB_NO_REVERSE },
+ { X86::VGETEXPSSZrk, X86::VGETEXPSSZmk, TB_NO_REVERSE },
+ { X86::VGETMANTSDZrrik, X86::VGETMANTSDZrmik, TB_NO_REVERSE },
+ { X86::VGETMANTSSZrrik, X86::VGETMANTSSZrmik, TB_NO_REVERSE },
+ { X86::VGF2P8AFFINEINVQBZ128rrik, X86::VGF2P8AFFINEINVQBZ128rmik, 0 },
+ { X86::VGF2P8AFFINEINVQBZ256rrik, X86::VGF2P8AFFINEINVQBZ256rmik, 0 },
+ { X86::VGF2P8AFFINEINVQBZrrik, X86::VGF2P8AFFINEINVQBZrmik, 0 },
+ { X86::VGF2P8AFFINEQBZ128rrik, X86::VGF2P8AFFINEQBZ128rmik, 0 },
+ { X86::VGF2P8AFFINEQBZ256rrik, X86::VGF2P8AFFINEQBZ256rmik, 0 },
+ { X86::VGF2P8AFFINEQBZrrik, X86::VGF2P8AFFINEQBZrmik, 0 },
+ { X86::VGF2P8MULBZ128rrk, X86::VGF2P8MULBZ128rmk, 0 },
+ { X86::VGF2P8MULBZ256rrk, X86::VGF2P8MULBZ256rmk, 0 },
+ { X86::VGF2P8MULBZrrk, X86::VGF2P8MULBZrmk, 0 },
+ { X86::VINSERTF32x4Z256rrk, X86::VINSERTF32x4Z256rmk, 0 },
+ { X86::VINSERTF32x4Zrrk, X86::VINSERTF32x4Zrmk, 0 },
+ { X86::VINSERTF32x8Zrrk, X86::VINSERTF32x8Zrmk, 0 },
+ { X86::VINSERTF64x2Z256rrk, X86::VINSERTF64x2Z256rmk, 0 },
+ { X86::VINSERTF64x2Zrrk, X86::VINSERTF64x2Zrmk, 0 },
+ { X86::VINSERTF64x4Zrrk, X86::VINSERTF64x4Zrmk, 0 },
+ { X86::VINSERTI32x4Z256rrk, X86::VINSERTI32x4Z256rmk, 0 },
+ { X86::VINSERTI32x4Zrrk, X86::VINSERTI32x4Zrmk, 0 },
+ { X86::VINSERTI32x8Zrrk, X86::VINSERTI32x8Zrmk, 0 },
+ { X86::VINSERTI64x2Z256rrk, X86::VINSERTI64x2Z256rmk, 0 },
+ { X86::VINSERTI64x2Zrrk, X86::VINSERTI64x2Zrmk, 0 },
+ { X86::VINSERTI64x4Zrrk, X86::VINSERTI64x4Zrmk, 0 },
+ { X86::VMAXCPDZ128rrk, X86::VMAXCPDZ128rmk, 0 },
+ { X86::VMAXCPDZ256rrk, X86::VMAXCPDZ256rmk, 0 },
+ { X86::VMAXCPDZrrk, X86::VMAXCPDZrmk, 0 },
+ { X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmk, 0 },
+ { X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmk, 0 },
+ { X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0 },
+ { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 },
+ { X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 },
+ { X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 },
+ { X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 },
+ { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 },
+ { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 },
+ { X86::VMAXSDZrr_Intk, X86::VMAXSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VMAXSSZrr_Intk, X86::VMAXSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmk, 0 },
+ { X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmk, 0 },
+ { X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0 },
+ { X86::VMINCPSZ128rrk, X86::VMINCPSZ128rmk, 0 },
+ { X86::VMINCPSZ256rrk, X86::VMINCPSZ256rmk, 0 },
+ { X86::VMINCPSZrrk, X86::VMINCPSZrmk, 0 },
+ { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 },
+ { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 },
+ { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 },
+ { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 },
+ { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 },
+ { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 },
+ { X86::VMINSDZrr_Intk, X86::VMINSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VMINSSZrr_Intk, X86::VMINSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 },
+ { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 },
+ { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 },
+ { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 },
+ { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 },
+ { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 },
+ { X86::VMULSDZrr_Intk, X86::VMULSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VMULSSZrr_Intk, X86::VMULSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 },
+ { X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 },
+ { X86::VORPDZrrk, X86::VORPDZrmk, 0 },
+ { X86::VORPSZ128rrk, X86::VORPSZ128rmk, 0 },
+ { X86::VORPSZ256rrk, X86::VORPSZ256rmk, 0 },
+ { X86::VORPSZrrk, X86::VORPSZrmk, 0 },
+ { X86::VPACKSSDWZ128rrk, X86::VPACKSSDWZ128rmk, 0 },
+ { X86::VPACKSSDWZ256rrk, X86::VPACKSSDWZ256rmk, 0 },
+ { X86::VPACKSSDWZrrk, X86::VPACKSSDWZrmk, 0 },
+ { X86::VPACKSSWBZ128rrk, X86::VPACKSSWBZ128rmk, 0 },
+ { X86::VPACKSSWBZ256rrk, X86::VPACKSSWBZ256rmk, 0 },
+ { X86::VPACKSSWBZrrk, X86::VPACKSSWBZrmk, 0 },
+ { X86::VPACKUSDWZ128rrk, X86::VPACKUSDWZ128rmk, 0 },
+ { X86::VPACKUSDWZ256rrk, X86::VPACKUSDWZ256rmk, 0 },
+ { X86::VPACKUSDWZrrk, X86::VPACKUSDWZrmk, 0 },
+ { X86::VPACKUSWBZ128rrk, X86::VPACKUSWBZ128rmk, 0 },
+ { X86::VPACKUSWBZ256rrk, X86::VPACKUSWBZ256rmk, 0 },
+ { X86::VPACKUSWBZrrk, X86::VPACKUSWBZrmk, 0 },
+ { X86::VPADDBZ128rrk, X86::VPADDBZ128rmk, 0 },
+ { X86::VPADDBZ256rrk, X86::VPADDBZ256rmk, 0 },
+ { X86::VPADDBZrrk, X86::VPADDBZrmk, 0 },
+ { X86::VPADDDZ128rrk, X86::VPADDDZ128rmk, 0 },
+ { X86::VPADDDZ256rrk, X86::VPADDDZ256rmk, 0 },
+ { X86::VPADDDZrrk, X86::VPADDDZrmk, 0 },
+ { X86::VPADDQZ128rrk, X86::VPADDQZ128rmk, 0 },
+ { X86::VPADDQZ256rrk, X86::VPADDQZ256rmk, 0 },
+ { X86::VPADDQZrrk, X86::VPADDQZrmk, 0 },
+ { X86::VPADDSBZ128rrk, X86::VPADDSBZ128rmk, 0 },
+ { X86::VPADDSBZ256rrk, X86::VPADDSBZ256rmk, 0 },
+ { X86::VPADDSBZrrk, X86::VPADDSBZrmk, 0 },
+ { X86::VPADDSWZ128rrk, X86::VPADDSWZ128rmk, 0 },
+ { X86::VPADDSWZ256rrk, X86::VPADDSWZ256rmk, 0 },
+ { X86::VPADDSWZrrk, X86::VPADDSWZrmk, 0 },
+ { X86::VPADDUSBZ128rrk, X86::VPADDUSBZ128rmk, 0 },
+ { X86::VPADDUSBZ256rrk, X86::VPADDUSBZ256rmk, 0 },
+ { X86::VPADDUSBZrrk, X86::VPADDUSBZrmk, 0 },
+ { X86::VPADDUSWZ128rrk, X86::VPADDUSWZ128rmk, 0 },
+ { X86::VPADDUSWZ256rrk, X86::VPADDUSWZ256rmk, 0 },
+ { X86::VPADDUSWZrrk, X86::VPADDUSWZrmk, 0 },
+ { X86::VPADDWZ128rrk, X86::VPADDWZ128rmk, 0 },
+ { X86::VPADDWZ256rrk, X86::VPADDWZ256rmk, 0 },
+ { X86::VPADDWZrrk, X86::VPADDWZrmk, 0 },
+ { X86::VPALIGNRZ128rrik, X86::VPALIGNRZ128rmik, 0 },
+ { X86::VPALIGNRZ256rrik, X86::VPALIGNRZ256rmik, 0 },
+ { X86::VPALIGNRZrrik, X86::VPALIGNRZrmik, 0 },
+ { X86::VPANDDZ128rrk, X86::VPANDDZ128rmk, 0 },
+ { X86::VPANDDZ256rrk, X86::VPANDDZ256rmk, 0 },
+ { X86::VPANDDZrrk, X86::VPANDDZrmk, 0 },
+ { X86::VPANDNDZ128rrk, X86::VPANDNDZ128rmk, 0 },
+ { X86::VPANDNDZ256rrk, X86::VPANDNDZ256rmk, 0 },
+ { X86::VPANDNDZrrk, X86::VPANDNDZrmk, 0 },
+ { X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 },
+ { X86::VPANDNQZ256rrk, X86::VPANDNQZ256rmk, 0 },
+ { X86::VPANDNQZrrk, X86::VPANDNQZrmk, 0 },
+ { X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 },
+ { X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 },
+ { X86::VPANDQZrrk, X86::VPANDQZrmk, 0 },
+ { X86::VPAVGBZ128rrk, X86::VPAVGBZ128rmk, 0 },
+ { X86::VPAVGBZ256rrk, X86::VPAVGBZ256rmk, 0 },
+ { X86::VPAVGBZrrk, X86::VPAVGBZrmk, 0 },
+ { X86::VPAVGWZ128rrk, X86::VPAVGWZ128rmk, 0 },
+ { X86::VPAVGWZ256rrk, X86::VPAVGWZ256rmk, 0 },
+ { X86::VPAVGWZrrk, X86::VPAVGWZrmk, 0 },
+ { X86::VPDPBUSDSZ128rk, X86::VPDPBUSDSZ128mk, 0 },
+ { X86::VPDPBUSDSZ128rkz, X86::VPDPBUSDSZ128mkz, 0 },
+ { X86::VPDPBUSDSZ256rk, X86::VPDPBUSDSZ256mk, 0 },
+ { X86::VPDPBUSDSZ256rkz, X86::VPDPBUSDSZ256mkz, 0 },
+ { X86::VPDPBUSDSZrk, X86::VPDPBUSDSZmk, 0 },
+ { X86::VPDPBUSDSZrkz, X86::VPDPBUSDSZmkz, 0 },
+ { X86::VPDPBUSDZ128rk, X86::VPDPBUSDZ128mk, 0 },
+ { X86::VPDPBUSDZ128rkz, X86::VPDPBUSDZ128mkz, 0 },
+ { X86::VPDPBUSDZ256rk, X86::VPDPBUSDZ256mk, 0 },
+ { X86::VPDPBUSDZ256rkz, X86::VPDPBUSDZ256mkz, 0 },
+ { X86::VPDPBUSDZrk, X86::VPDPBUSDZmk, 0 },
+ { X86::VPDPBUSDZrkz, X86::VPDPBUSDZmkz, 0 },
+ { X86::VPDPWSSDSZ128rk, X86::VPDPWSSDSZ128mk, 0 },
+ { X86::VPDPWSSDSZ128rkz, X86::VPDPWSSDSZ128mkz, 0 },
+ { X86::VPDPWSSDSZ256rk, X86::VPDPWSSDSZ256mk, 0 },
+ { X86::VPDPWSSDSZ256rkz, X86::VPDPWSSDSZ256mkz, 0 },
+ { X86::VPDPWSSDSZrk, X86::VPDPWSSDSZmk, 0 },
+ { X86::VPDPWSSDSZrkz, X86::VPDPWSSDSZmkz, 0 },
+ { X86::VPDPWSSDZ128rk, X86::VPDPWSSDZ128mk, 0 },
+ { X86::VPDPWSSDZ128rkz, X86::VPDPWSSDZ128mkz, 0 },
+ { X86::VPDPWSSDZ256rk, X86::VPDPWSSDZ256mk, 0 },
+ { X86::VPDPWSSDZ256rkz, X86::VPDPWSSDZ256mkz, 0 },
+ { X86::VPDPWSSDZrk, X86::VPDPWSSDZmk, 0 },
+ { X86::VPDPWSSDZrkz, X86::VPDPWSSDZmkz, 0 },
+ { X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 },
+ { X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 },
+ { X86::VPERMBZrrk, X86::VPERMBZrmk, 0 },
+ { X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 },
+ { X86::VPERMDZrrk, X86::VPERMDZrmk, 0 },
+ { X86::VPERMI2B128rrk, X86::VPERMI2B128rmk, 0 },
+ { X86::VPERMI2B128rrkz, X86::VPERMI2B128rmkz, 0 },
+ { X86::VPERMI2B256rrk, X86::VPERMI2B256rmk, 0 },
+ { X86::VPERMI2B256rrkz, X86::VPERMI2B256rmkz, 0 },
+ { X86::VPERMI2Brrk, X86::VPERMI2Brmk, 0 },
+ { X86::VPERMI2Brrkz, X86::VPERMI2Brmkz, 0 },
+ { X86::VPERMI2D128rrk, X86::VPERMI2D128rmk, 0 },
+ { X86::VPERMI2D128rrkz, X86::VPERMI2D128rmkz, 0 },
+ { X86::VPERMI2D256rrk, X86::VPERMI2D256rmk, 0 },
+ { X86::VPERMI2D256rrkz, X86::VPERMI2D256rmkz, 0 },
+ { X86::VPERMI2Drrk, X86::VPERMI2Drmk, 0 },
+ { X86::VPERMI2Drrkz, X86::VPERMI2Drmkz, 0 },
+ { X86::VPERMI2PD128rrk, X86::VPERMI2PD128rmk, 0 },
+ { X86::VPERMI2PD128rrkz, X86::VPERMI2PD128rmkz, 0 },
+ { X86::VPERMI2PD256rrk, X86::VPERMI2PD256rmk, 0 },
+ { X86::VPERMI2PD256rrkz, X86::VPERMI2PD256rmkz, 0 },
+ { X86::VPERMI2PDrrk, X86::VPERMI2PDrmk, 0 },
+ { X86::VPERMI2PDrrkz, X86::VPERMI2PDrmkz, 0 },
+ { X86::VPERMI2PS128rrk, X86::VPERMI2PS128rmk, 0 },
+ { X86::VPERMI2PS128rrkz, X86::VPERMI2PS128rmkz, 0 },
+ { X86::VPERMI2PS256rrk, X86::VPERMI2PS256rmk, 0 },
+ { X86::VPERMI2PS256rrkz, X86::VPERMI2PS256rmkz, 0 },
+ { X86::VPERMI2PSrrk, X86::VPERMI2PSrmk, 0 },
+ { X86::VPERMI2PSrrkz, X86::VPERMI2PSrmkz, 0 },
+ { X86::VPERMI2Q128rrk, X86::VPERMI2Q128rmk, 0 },
+ { X86::VPERMI2Q128rrkz, X86::VPERMI2Q128rmkz, 0 },
+ { X86::VPERMI2Q256rrk, X86::VPERMI2Q256rmk, 0 },
+ { X86::VPERMI2Q256rrkz, X86::VPERMI2Q256rmkz, 0 },
+ { X86::VPERMI2Qrrk, X86::VPERMI2Qrmk, 0 },
+ { X86::VPERMI2Qrrkz, X86::VPERMI2Qrmkz, 0 },
+ { X86::VPERMI2W128rrk, X86::VPERMI2W128rmk, 0 },
+ { X86::VPERMI2W128rrkz, X86::VPERMI2W128rmkz, 0 },
+ { X86::VPERMI2W256rrk, X86::VPERMI2W256rmk, 0 },
+ { X86::VPERMI2W256rrkz, X86::VPERMI2W256rmkz, 0 },
+ { X86::VPERMI2Wrrk, X86::VPERMI2Wrmk, 0 },
+ { X86::VPERMI2Wrrkz, X86::VPERMI2Wrmkz, 0 },
+ { X86::VPERMILPDZ128rrk, X86::VPERMILPDZ128rmk, 0 },
+ { X86::VPERMILPDZ256rrk, X86::VPERMILPDZ256rmk, 0 },
+ { X86::VPERMILPDZrrk, X86::VPERMILPDZrmk, 0 },
+ { X86::VPERMILPSZ128rrk, X86::VPERMILPSZ128rmk, 0 },
+ { X86::VPERMILPSZ256rrk, X86::VPERMILPSZ256rmk, 0 },
+ { X86::VPERMILPSZrrk, X86::VPERMILPSZrmk, 0 },
+ { X86::VPERMPDZ256rrk, X86::VPERMPDZ256rmk, 0 },
+ { X86::VPERMPDZrrk, X86::VPERMPDZrmk, 0 },
+ { X86::VPERMPSZ256rrk, X86::VPERMPSZ256rmk, 0 },
+ { X86::VPERMPSZrrk, X86::VPERMPSZrmk, 0 },
+ { X86::VPERMQZ256rrk, X86::VPERMQZ256rmk, 0 },
+ { X86::VPERMQZrrk, X86::VPERMQZrmk, 0 },
+ { X86::VPERMT2B128rrk, X86::VPERMT2B128rmk, 0 },
+ { X86::VPERMT2B128rrkz, X86::VPERMT2B128rmkz, 0 },
+ { X86::VPERMT2B256rrk, X86::VPERMT2B256rmk, 0 },
+ { X86::VPERMT2B256rrkz, X86::VPERMT2B256rmkz, 0 },
+ { X86::VPERMT2Brrk, X86::VPERMT2Brmk, 0 },
+ { X86::VPERMT2Brrkz, X86::VPERMT2Brmkz, 0 },
+ { X86::VPERMT2D128rrk, X86::VPERMT2D128rmk, 0 },
+ { X86::VPERMT2D128rrkz, X86::VPERMT2D128rmkz, 0 },
+ { X86::VPERMT2D256rrk, X86::VPERMT2D256rmk, 0 },
+ { X86::VPERMT2D256rrkz, X86::VPERMT2D256rmkz, 0 },
+ { X86::VPERMT2Drrk, X86::VPERMT2Drmk, 0 },
+ { X86::VPERMT2Drrkz, X86::VPERMT2Drmkz, 0 },
+ { X86::VPERMT2PD128rrk, X86::VPERMT2PD128rmk, 0 },
+ { X86::VPERMT2PD128rrkz, X86::VPERMT2PD128rmkz, 0 },
+ { X86::VPERMT2PD256rrk, X86::VPERMT2PD256rmk, 0 },
+ { X86::VPERMT2PD256rrkz, X86::VPERMT2PD256rmkz, 0 },
+ { X86::VPERMT2PDrrk, X86::VPERMT2PDrmk, 0 },
+ { X86::VPERMT2PDrrkz, X86::VPERMT2PDrmkz, 0 },
+ { X86::VPERMT2PS128rrk, X86::VPERMT2PS128rmk, 0 },
+ { X86::VPERMT2PS128rrkz, X86::VPERMT2PS128rmkz, 0 },
+ { X86::VPERMT2PS256rrk, X86::VPERMT2PS256rmk, 0 },
+ { X86::VPERMT2PS256rrkz, X86::VPERMT2PS256rmkz, 0 },
+ { X86::VPERMT2PSrrk, X86::VPERMT2PSrmk, 0 },
+ { X86::VPERMT2PSrrkz, X86::VPERMT2PSrmkz, 0 },
+ { X86::VPERMT2Q128rrk, X86::VPERMT2Q128rmk, 0 },
+ { X86::VPERMT2Q128rrkz, X86::VPERMT2Q128rmkz, 0 },
+ { X86::VPERMT2Q256rrk, X86::VPERMT2Q256rmk, 0 },
+ { X86::VPERMT2Q256rrkz, X86::VPERMT2Q256rmkz, 0 },
+ { X86::VPERMT2Qrrk, X86::VPERMT2Qrmk, 0 },
+ { X86::VPERMT2Qrrkz, X86::VPERMT2Qrmkz, 0 },
+ { X86::VPERMT2W128rrk, X86::VPERMT2W128rmk, 0 },
+ { X86::VPERMT2W128rrkz, X86::VPERMT2W128rmkz, 0 },
+ { X86::VPERMT2W256rrk, X86::VPERMT2W256rmk, 0 },
+ { X86::VPERMT2W256rrkz, X86::VPERMT2W256rmkz, 0 },
+ { X86::VPERMT2Wrrk, X86::VPERMT2Wrmk, 0 },
+ { X86::VPERMT2Wrrkz, X86::VPERMT2Wrmkz, 0 },
+ { X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 },
+ { X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 },
+ { X86::VPERMWZrrk, X86::VPERMWZrmk, 0 },
+ { X86::VPMADD52HUQZ128rk, X86::VPMADD52HUQZ128mk, 0 },
+ { X86::VPMADD52HUQZ128rkz, X86::VPMADD52HUQZ128mkz, 0 },
+ { X86::VPMADD52HUQZ256rk, X86::VPMADD52HUQZ256mk, 0 },
+ { X86::VPMADD52HUQZ256rkz, X86::VPMADD52HUQZ256mkz, 0 },
+ { X86::VPMADD52HUQZrk, X86::VPMADD52HUQZmk, 0 },
+ { X86::VPMADD52HUQZrkz, X86::VPMADD52HUQZmkz, 0 },
+ { X86::VPMADD52LUQZ128rk, X86::VPMADD52LUQZ128mk, 0 },
+ { X86::VPMADD52LUQZ128rkz, X86::VPMADD52LUQZ128mkz, 0 },
+ { X86::VPMADD52LUQZ256rk, X86::VPMADD52LUQZ256mk, 0 },
+ { X86::VPMADD52LUQZ256rkz, X86::VPMADD52LUQZ256mkz, 0 },
+ { X86::VPMADD52LUQZrk, X86::VPMADD52LUQZmk, 0 },
+ { X86::VPMADD52LUQZrkz, X86::VPMADD52LUQZmkz, 0 },
+ { X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 },
+ { X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 },
+ { X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 },
+ { X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 },
+ { X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 },
+ { X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 },
+ { X86::VPMAXSBZ128rrk, X86::VPMAXSBZ128rmk, 0 },
+ { X86::VPMAXSBZ256rrk, X86::VPMAXSBZ256rmk, 0 },
+ { X86::VPMAXSBZrrk, X86::VPMAXSBZrmk, 0 },
+ { X86::VPMAXSDZ128rrk, X86::VPMAXSDZ128rmk, 0 },
+ { X86::VPMAXSDZ256rrk, X86::VPMAXSDZ256rmk, 0 },
+ { X86::VPMAXSDZrrk, X86::VPMAXSDZrmk, 0 },
+ { X86::VPMAXSQZ128rrk, X86::VPMAXSQZ128rmk, 0 },
+ { X86::VPMAXSQZ256rrk, X86::VPMAXSQZ256rmk, 0 },
+ { X86::VPMAXSQZrrk, X86::VPMAXSQZrmk, 0 },
+ { X86::VPMAXSWZ128rrk, X86::VPMAXSWZ128rmk, 0 },
+ { X86::VPMAXSWZ256rrk, X86::VPMAXSWZ256rmk, 0 },
+ { X86::VPMAXSWZrrk, X86::VPMAXSWZrmk, 0 },
+ { X86::VPMAXUBZ128rrk, X86::VPMAXUBZ128rmk, 0 },
+ { X86::VPMAXUBZ256rrk, X86::VPMAXUBZ256rmk, 0 },
+ { X86::VPMAXUBZrrk, X86::VPMAXUBZrmk, 0 },
+ { X86::VPMAXUDZ128rrk, X86::VPMAXUDZ128rmk, 0 },
+ { X86::VPMAXUDZ256rrk, X86::VPMAXUDZ256rmk, 0 },
+ { X86::VPMAXUDZrrk, X86::VPMAXUDZrmk, 0 },
+ { X86::VPMAXUQZ128rrk, X86::VPMAXUQZ128rmk, 0 },
+ { X86::VPMAXUQZ256rrk, X86::VPMAXUQZ256rmk, 0 },
+ { X86::VPMAXUQZrrk, X86::VPMAXUQZrmk, 0 },
+ { X86::VPMAXUWZ128rrk, X86::VPMAXUWZ128rmk, 0 },
+ { X86::VPMAXUWZ256rrk, X86::VPMAXUWZ256rmk, 0 },
+ { X86::VPMAXUWZrrk, X86::VPMAXUWZrmk, 0 },
+ { X86::VPMINSBZ128rrk, X86::VPMINSBZ128rmk, 0 },
+ { X86::VPMINSBZ256rrk, X86::VPMINSBZ256rmk, 0 },
+ { X86::VPMINSBZrrk, X86::VPMINSBZrmk, 0 },
+ { X86::VPMINSDZ128rrk, X86::VPMINSDZ128rmk, 0 },
+ { X86::VPMINSDZ256rrk, X86::VPMINSDZ256rmk, 0 },
+ { X86::VPMINSDZrrk, X86::VPMINSDZrmk, 0 },
+ { X86::VPMINSQZ128rrk, X86::VPMINSQZ128rmk, 0 },
+ { X86::VPMINSQZ256rrk, X86::VPMINSQZ256rmk, 0 },
+ { X86::VPMINSQZrrk, X86::VPMINSQZrmk, 0 },
+ { X86::VPMINSWZ128rrk, X86::VPMINSWZ128rmk, 0 },
+ { X86::VPMINSWZ256rrk, X86::VPMINSWZ256rmk, 0 },
+ { X86::VPMINSWZrrk, X86::VPMINSWZrmk, 0 },
+ { X86::VPMINUBZ128rrk, X86::VPMINUBZ128rmk, 0 },
+ { X86::VPMINUBZ256rrk, X86::VPMINUBZ256rmk, 0 },
+ { X86::VPMINUBZrrk, X86::VPMINUBZrmk, 0 },
+ { X86::VPMINUDZ128rrk, X86::VPMINUDZ128rmk, 0 },
+ { X86::VPMINUDZ256rrk, X86::VPMINUDZ256rmk, 0 },
+ { X86::VPMINUDZrrk, X86::VPMINUDZrmk, 0 },
+ { X86::VPMINUQZ128rrk, X86::VPMINUQZ128rmk, 0 },
+ { X86::VPMINUQZ256rrk, X86::VPMINUQZ256rmk, 0 },
+ { X86::VPMINUQZrrk, X86::VPMINUQZrmk, 0 },
+ { X86::VPMINUWZ128rrk, X86::VPMINUWZ128rmk, 0 },
+ { X86::VPMINUWZ256rrk, X86::VPMINUWZ256rmk, 0 },
+ { X86::VPMINUWZrrk, X86::VPMINUWZrmk, 0 },
+ { X86::VPMULDQZ128rrk, X86::VPMULDQZ128rmk, 0 },
+ { X86::VPMULDQZ256rrk, X86::VPMULDQZ256rmk, 0 },
+ { X86::VPMULDQZrrk, X86::VPMULDQZrmk, 0 },
+ { X86::VPMULHRSWZ128rrk, X86::VPMULHRSWZ128rmk, 0 },
+ { X86::VPMULHRSWZ256rrk, X86::VPMULHRSWZ256rmk, 0 },
+ { X86::VPMULHRSWZrrk, X86::VPMULHRSWZrmk, 0 },
+ { X86::VPMULHUWZ128rrk, X86::VPMULHUWZ128rmk, 0 },
+ { X86::VPMULHUWZ256rrk, X86::VPMULHUWZ256rmk, 0 },
+ { X86::VPMULHUWZrrk, X86::VPMULHUWZrmk, 0 },
+ { X86::VPMULHWZ128rrk, X86::VPMULHWZ128rmk, 0 },
+ { X86::VPMULHWZ256rrk, X86::VPMULHWZ256rmk, 0 },
+ { X86::VPMULHWZrrk, X86::VPMULHWZrmk, 0 },
+ { X86::VPMULLDZ128rrk, X86::VPMULLDZ128rmk, 0 },
+ { X86::VPMULLDZ256rrk, X86::VPMULLDZ256rmk, 0 },
+ { X86::VPMULLDZrrk, X86::VPMULLDZrmk, 0 },
+ { X86::VPMULLQZ128rrk, X86::VPMULLQZ128rmk, 0 },
+ { X86::VPMULLQZ256rrk, X86::VPMULLQZ256rmk, 0 },
+ { X86::VPMULLQZrrk, X86::VPMULLQZrmk, 0 },
+ { X86::VPMULLWZ128rrk, X86::VPMULLWZ128rmk, 0 },
+ { X86::VPMULLWZ256rrk, X86::VPMULLWZ256rmk, 0 },
+ { X86::VPMULLWZrrk, X86::VPMULLWZrmk, 0 },
+ { X86::VPMULTISHIFTQBZ128rrk, X86::VPMULTISHIFTQBZ128rmk, 0 },
+ { X86::VPMULTISHIFTQBZ256rrk, X86::VPMULTISHIFTQBZ256rmk, 0 },
+ { X86::VPMULTISHIFTQBZrrk, X86::VPMULTISHIFTQBZrmk, 0 },
+ { X86::VPMULUDQZ128rrk, X86::VPMULUDQZ128rmk, 0 },
+ { X86::VPMULUDQZ256rrk, X86::VPMULUDQZ256rmk, 0 },
+ { X86::VPMULUDQZrrk, X86::VPMULUDQZrmk, 0 },
+ { X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 },
+ { X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 },
+ { X86::VPORDZrrk, X86::VPORDZrmk, 0 },
+ { X86::VPORQZ128rrk, X86::VPORQZ128rmk, 0 },
+ { X86::VPORQZ256rrk, X86::VPORQZ256rmk, 0 },
+ { X86::VPORQZrrk, X86::VPORQZrmk, 0 },
+ { X86::VPROLVDZ128rrk, X86::VPROLVDZ128rmk, 0 },
+ { X86::VPROLVDZ256rrk, X86::VPROLVDZ256rmk, 0 },
+ { X86::VPROLVDZrrk, X86::VPROLVDZrmk, 0 },
+ { X86::VPROLVQZ128rrk, X86::VPROLVQZ128rmk, 0 },
+ { X86::VPROLVQZ256rrk, X86::VPROLVQZ256rmk, 0 },
+ { X86::VPROLVQZrrk, X86::VPROLVQZrmk, 0 },
+ { X86::VPRORVDZ128rrk, X86::VPRORVDZ128rmk, 0 },
+ { X86::VPRORVDZ256rrk, X86::VPRORVDZ256rmk, 0 },
+ { X86::VPRORVDZrrk, X86::VPRORVDZrmk, 0 },
+ { X86::VPRORVQZ128rrk, X86::VPRORVQZ128rmk, 0 },
+ { X86::VPRORVQZ256rrk, X86::VPRORVQZ256rmk, 0 },
+ { X86::VPRORVQZrrk, X86::VPRORVQZrmk, 0 },
+ { X86::VPSHLDDZ128rrik, X86::VPSHLDDZ128rmik, 0 },
+ { X86::VPSHLDDZ256rrik, X86::VPSHLDDZ256rmik, 0 },
+ { X86::VPSHLDDZrrik, X86::VPSHLDDZrmik, 0 },
+ { X86::VPSHLDQZ128rrik, X86::VPSHLDQZ128rmik, 0 },
+ { X86::VPSHLDQZ256rrik, X86::VPSHLDQZ256rmik, 0 },
+ { X86::VPSHLDQZrrik, X86::VPSHLDQZrmik, 0 },
+ { X86::VPSHLDVDZ128rk, X86::VPSHLDVDZ128mk, 0 },
+ { X86::VPSHLDVDZ128rkz, X86::VPSHLDVDZ128mkz, 0 },
+ { X86::VPSHLDVDZ256rk, X86::VPSHLDVDZ256mk, 0 },
+ { X86::VPSHLDVDZ256rkz, X86::VPSHLDVDZ256mkz, 0 },
+ { X86::VPSHLDVDZrk, X86::VPSHLDVDZmk, 0 },
+ { X86::VPSHLDVDZrkz, X86::VPSHLDVDZmkz, 0 },
+ { X86::VPSHLDVQZ128rk, X86::VPSHLDVQZ128mk, 0 },
+ { X86::VPSHLDVQZ128rkz, X86::VPSHLDVQZ128mkz, 0 },
+ { X86::VPSHLDVQZ256rk, X86::VPSHLDVQZ256mk, 0 },
+ { X86::VPSHLDVQZ256rkz, X86::VPSHLDVQZ256mkz, 0 },
+ { X86::VPSHLDVQZrk, X86::VPSHLDVQZmk, 0 },
+ { X86::VPSHLDVQZrkz, X86::VPSHLDVQZmkz, 0 },
+ { X86::VPSHLDVWZ128rk, X86::VPSHLDVWZ128mk, 0 },
+ { X86::VPSHLDVWZ128rkz, X86::VPSHLDVWZ128mkz, 0 },
+ { X86::VPSHLDVWZ256rk, X86::VPSHLDVWZ256mk, 0 },
+ { X86::VPSHLDVWZ256rkz, X86::VPSHLDVWZ256mkz, 0 },
+ { X86::VPSHLDVWZrk, X86::VPSHLDVWZmk, 0 },
+ { X86::VPSHLDVWZrkz, X86::VPSHLDVWZmkz, 0 },
+ { X86::VPSHLDWZ128rrik, X86::VPSHLDWZ128rmik, 0 },
+ { X86::VPSHLDWZ256rrik, X86::VPSHLDWZ256rmik, 0 },
+ { X86::VPSHLDWZrrik, X86::VPSHLDWZrmik, 0 },
+ { X86::VPSHRDDZ128rrik, X86::VPSHRDDZ128rmik, 0 },
+ { X86::VPSHRDDZ256rrik, X86::VPSHRDDZ256rmik, 0 },
+ { X86::VPSHRDDZrrik, X86::VPSHRDDZrmik, 0 },
+ { X86::VPSHRDQZ128rrik, X86::VPSHRDQZ128rmik, 0 },
+ { X86::VPSHRDQZ256rrik, X86::VPSHRDQZ256rmik, 0 },
+ { X86::VPSHRDQZrrik, X86::VPSHRDQZrmik, 0 },
+ { X86::VPSHRDVDZ128rk, X86::VPSHRDVDZ128mk, 0 },
+ { X86::VPSHRDVDZ128rkz, X86::VPSHRDVDZ128mkz, 0 },
+ { X86::VPSHRDVDZ256rk, X86::VPSHRDVDZ256mk, 0 },
+ { X86::VPSHRDVDZ256rkz, X86::VPSHRDVDZ256mkz, 0 },
+ { X86::VPSHRDVDZrk, X86::VPSHRDVDZmk, 0 },
+ { X86::VPSHRDVDZrkz, X86::VPSHRDVDZmkz, 0 },
+ { X86::VPSHRDVQZ128rk, X86::VPSHRDVQZ128mk, 0 },
+ { X86::VPSHRDVQZ128rkz, X86::VPSHRDVQZ128mkz, 0 },
+ { X86::VPSHRDVQZ256rk, X86::VPSHRDVQZ256mk, 0 },
+ { X86::VPSHRDVQZ256rkz, X86::VPSHRDVQZ256mkz, 0 },
+ { X86::VPSHRDVQZrk, X86::VPSHRDVQZmk, 0 },
+ { X86::VPSHRDVQZrkz, X86::VPSHRDVQZmkz, 0 },
+ { X86::VPSHRDVWZ128rk, X86::VPSHRDVWZ128mk, 0 },
+ { X86::VPSHRDVWZ128rkz, X86::VPSHRDVWZ128mkz, 0 },
+ { X86::VPSHRDVWZ256rk, X86::VPSHRDVWZ256mk, 0 },
+ { X86::VPSHRDVWZ256rkz, X86::VPSHRDVWZ256mkz, 0 },
+ { X86::VPSHRDVWZrk, X86::VPSHRDVWZmk, 0 },
+ { X86::VPSHRDVWZrkz, X86::VPSHRDVWZmkz, 0 },
+ { X86::VPSHRDWZ128rrik, X86::VPSHRDWZ128rmik, 0 },
+ { X86::VPSHRDWZ256rrik, X86::VPSHRDWZ256rmik, 0 },
+ { X86::VPSHRDWZrrik, X86::VPSHRDWZrmik, 0 },
+ { X86::VPSHUFBZ128rrk, X86::VPSHUFBZ128rmk, 0 },
+ { X86::VPSHUFBZ256rrk, X86::VPSHUFBZ256rmk, 0 },
+ { X86::VPSHUFBZrrk, X86::VPSHUFBZrmk, 0 },
+ { X86::VPSLLDZ128rrk, X86::VPSLLDZ128rmk, 0 },
+ { X86::VPSLLDZ256rrk, X86::VPSLLDZ256rmk, 0 },
+ { X86::VPSLLDZrrk, X86::VPSLLDZrmk, 0 },
+ { X86::VPSLLQZ128rrk, X86::VPSLLQZ128rmk, 0 },
+ { X86::VPSLLQZ256rrk, X86::VPSLLQZ256rmk, 0 },
+ { X86::VPSLLQZrrk, X86::VPSLLQZrmk, 0 },
+ { X86::VPSLLVDZ128rrk, X86::VPSLLVDZ128rmk, 0 },
+ { X86::VPSLLVDZ256rrk, X86::VPSLLVDZ256rmk, 0 },
+ { X86::VPSLLVDZrrk, X86::VPSLLVDZrmk, 0 },
+ { X86::VPSLLVQZ128rrk, X86::VPSLLVQZ128rmk, 0 },
+ { X86::VPSLLVQZ256rrk, X86::VPSLLVQZ256rmk, 0 },
+ { X86::VPSLLVQZrrk, X86::VPSLLVQZrmk, 0 },
+ { X86::VPSLLVWZ128rrk, X86::VPSLLVWZ128rmk, 0 },
+ { X86::VPSLLVWZ256rrk, X86::VPSLLVWZ256rmk, 0 },
+ { X86::VPSLLVWZrrk, X86::VPSLLVWZrmk, 0 },
+ { X86::VPSLLWZ128rrk, X86::VPSLLWZ128rmk, 0 },
+ { X86::VPSLLWZ256rrk, X86::VPSLLWZ256rmk, 0 },
+ { X86::VPSLLWZrrk, X86::VPSLLWZrmk, 0 },
+ { X86::VPSRADZ128rrk, X86::VPSRADZ128rmk, 0 },
+ { X86::VPSRADZ256rrk, X86::VPSRADZ256rmk, 0 },
+ { X86::VPSRADZrrk, X86::VPSRADZrmk, 0 },
+ { X86::VPSRAQZ128rrk, X86::VPSRAQZ128rmk, 0 },
+ { X86::VPSRAQZ256rrk, X86::VPSRAQZ256rmk, 0 },
+ { X86::VPSRAQZrrk, X86::VPSRAQZrmk, 0 },
+ { X86::VPSRAVDZ128rrk, X86::VPSRAVDZ128rmk, 0 },
+ { X86::VPSRAVDZ256rrk, X86::VPSRAVDZ256rmk, 0 },
+ { X86::VPSRAVDZrrk, X86::VPSRAVDZrmk, 0 },
+ { X86::VPSRAVQZ128rrk, X86::VPSRAVQZ128rmk, 0 },
+ { X86::VPSRAVQZ256rrk, X86::VPSRAVQZ256rmk, 0 },
+ { X86::VPSRAVQZrrk, X86::VPSRAVQZrmk, 0 },
+ { X86::VPSRAVWZ128rrk, X86::VPSRAVWZ128rmk, 0 },
+ { X86::VPSRAVWZ256rrk, X86::VPSRAVWZ256rmk, 0 },
+ { X86::VPSRAVWZrrk, X86::VPSRAVWZrmk, 0 },
+ { X86::VPSRAWZ128rrk, X86::VPSRAWZ128rmk, 0 },
+ { X86::VPSRAWZ256rrk, X86::VPSRAWZ256rmk, 0 },
+ { X86::VPSRAWZrrk, X86::VPSRAWZrmk, 0 },
+ { X86::VPSRLDZ128rrk, X86::VPSRLDZ128rmk, 0 },
+ { X86::VPSRLDZ256rrk, X86::VPSRLDZ256rmk, 0 },
+ { X86::VPSRLDZrrk, X86::VPSRLDZrmk, 0 },
+ { X86::VPSRLQZ128rrk, X86::VPSRLQZ128rmk, 0 },
+ { X86::VPSRLQZ256rrk, X86::VPSRLQZ256rmk, 0 },
+ { X86::VPSRLQZrrk, X86::VPSRLQZrmk, 0 },
+ { X86::VPSRLVDZ128rrk, X86::VPSRLVDZ128rmk, 0 },
+ { X86::VPSRLVDZ256rrk, X86::VPSRLVDZ256rmk, 0 },
+ { X86::VPSRLVDZrrk, X86::VPSRLVDZrmk, 0 },
+ { X86::VPSRLVQZ128rrk, X86::VPSRLVQZ128rmk, 0 },
+ { X86::VPSRLVQZ256rrk, X86::VPSRLVQZ256rmk, 0 },
+ { X86::VPSRLVQZrrk, X86::VPSRLVQZrmk, 0 },
+ { X86::VPSRLVWZ128rrk, X86::VPSRLVWZ128rmk, 0 },
+ { X86::VPSRLVWZ256rrk, X86::VPSRLVWZ256rmk, 0 },
+ { X86::VPSRLVWZrrk, X86::VPSRLVWZrmk, 0 },
+ { X86::VPSRLWZ128rrk, X86::VPSRLWZ128rmk, 0 },
+ { X86::VPSRLWZ256rrk, X86::VPSRLWZ256rmk, 0 },
+ { X86::VPSRLWZrrk, X86::VPSRLWZrmk, 0 },
+ { X86::VPSUBBZ128rrk, X86::VPSUBBZ128rmk, 0 },
+ { X86::VPSUBBZ256rrk, X86::VPSUBBZ256rmk, 0 },
+ { X86::VPSUBBZrrk, X86::VPSUBBZrmk, 0 },
+ { X86::VPSUBDZ128rrk, X86::VPSUBDZ128rmk, 0 },
+ { X86::VPSUBDZ256rrk, X86::VPSUBDZ256rmk, 0 },
+ { X86::VPSUBDZrrk, X86::VPSUBDZrmk, 0 },
+ { X86::VPSUBQZ128rrk, X86::VPSUBQZ128rmk, 0 },
+ { X86::VPSUBQZ256rrk, X86::VPSUBQZ256rmk, 0 },
+ { X86::VPSUBQZrrk, X86::VPSUBQZrmk, 0 },
+ { X86::VPSUBSBZ128rrk, X86::VPSUBSBZ128rmk, 0 },
+ { X86::VPSUBSBZ256rrk, X86::VPSUBSBZ256rmk, 0 },
+ { X86::VPSUBSBZrrk, X86::VPSUBSBZrmk, 0 },
+ { X86::VPSUBSWZ128rrk, X86::VPSUBSWZ128rmk, 0 },
+ { X86::VPSUBSWZ256rrk, X86::VPSUBSWZ256rmk, 0 },
+ { X86::VPSUBSWZrrk, X86::VPSUBSWZrmk, 0 },
+ { X86::VPSUBUSBZ128rrk, X86::VPSUBUSBZ128rmk, 0 },
+ { X86::VPSUBUSBZ256rrk, X86::VPSUBUSBZ256rmk, 0 },
+ { X86::VPSUBUSBZrrk, X86::VPSUBUSBZrmk, 0 },
+ { X86::VPSUBUSWZ128rrk, X86::VPSUBUSWZ128rmk, 0 },
+ { X86::VPSUBUSWZ256rrk, X86::VPSUBUSWZ256rmk, 0 },
+ { X86::VPSUBUSWZrrk, X86::VPSUBUSWZrmk, 0 },
+ { X86::VPSUBWZ128rrk, X86::VPSUBWZ128rmk, 0 },
+ { X86::VPSUBWZ256rrk, X86::VPSUBWZ256rmk, 0 },
+ { X86::VPSUBWZrrk, X86::VPSUBWZrmk, 0 },
+ { X86::VPTERNLOGDZ128rrik, X86::VPTERNLOGDZ128rmik, 0 },
+ { X86::VPTERNLOGDZ128rrikz, X86::VPTERNLOGDZ128rmikz, 0 },
+ { X86::VPTERNLOGDZ256rrik, X86::VPTERNLOGDZ256rmik, 0 },
+ { X86::VPTERNLOGDZ256rrikz, X86::VPTERNLOGDZ256rmikz, 0 },
+ { X86::VPTERNLOGDZrrik, X86::VPTERNLOGDZrmik, 0 },
+ { X86::VPTERNLOGDZrrikz, X86::VPTERNLOGDZrmikz, 0 },
+ { X86::VPTERNLOGQZ128rrik, X86::VPTERNLOGQZ128rmik, 0 },
+ { X86::VPTERNLOGQZ128rrikz, X86::VPTERNLOGQZ128rmikz, 0 },
+ { X86::VPTERNLOGQZ256rrik, X86::VPTERNLOGQZ256rmik, 0 },
+ { X86::VPTERNLOGQZ256rrikz, X86::VPTERNLOGQZ256rmikz, 0 },
+ { X86::VPTERNLOGQZrrik, X86::VPTERNLOGQZrmik, 0 },
+ { X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmikz, 0 },
+ { X86::VPUNPCKHBWZ128rrk, X86::VPUNPCKHBWZ128rmk, 0 },
+ { X86::VPUNPCKHBWZ256rrk, X86::VPUNPCKHBWZ256rmk, 0 },
+ { X86::VPUNPCKHBWZrrk, X86::VPUNPCKHBWZrmk, 0 },
+ { X86::VPUNPCKHDQZ128rrk, X86::VPUNPCKHDQZ128rmk, 0 },
+ { X86::VPUNPCKHDQZ256rrk, X86::VPUNPCKHDQZ256rmk, 0 },
+ { X86::VPUNPCKHDQZrrk, X86::VPUNPCKHDQZrmk, 0 },
+ { X86::VPUNPCKHQDQZ128rrk, X86::VPUNPCKHQDQZ128rmk, 0 },
+ { X86::VPUNPCKHQDQZ256rrk, X86::VPUNPCKHQDQZ256rmk, 0 },
+ { X86::VPUNPCKHQDQZrrk, X86::VPUNPCKHQDQZrmk, 0 },
+ { X86::VPUNPCKHWDZ128rrk, X86::VPUNPCKHWDZ128rmk, 0 },
+ { X86::VPUNPCKHWDZ256rrk, X86::VPUNPCKHWDZ256rmk, 0 },
+ { X86::VPUNPCKHWDZrrk, X86::VPUNPCKHWDZrmk, 0 },
+ { X86::VPUNPCKLBWZ128rrk, X86::VPUNPCKLBWZ128rmk, 0 },
+ { X86::VPUNPCKLBWZ256rrk, X86::VPUNPCKLBWZ256rmk, 0 },
+ { X86::VPUNPCKLBWZrrk, X86::VPUNPCKLBWZrmk, 0 },
+ { X86::VPUNPCKLDQZ128rrk, X86::VPUNPCKLDQZ128rmk, 0 },
+ { X86::VPUNPCKLDQZ256rrk, X86::VPUNPCKLDQZ256rmk, 0 },
+ { X86::VPUNPCKLDQZrrk, X86::VPUNPCKLDQZrmk, 0 },
+ { X86::VPUNPCKLQDQZ128rrk, X86::VPUNPCKLQDQZ128rmk, 0 },
+ { X86::VPUNPCKLQDQZ256rrk, X86::VPUNPCKLQDQZ256rmk, 0 },
+ { X86::VPUNPCKLQDQZrrk, X86::VPUNPCKLQDQZrmk, 0 },
+ { X86::VPUNPCKLWDZ128rrk, X86::VPUNPCKLWDZ128rmk, 0 },
+ { X86::VPUNPCKLWDZ256rrk, X86::VPUNPCKLWDZ256rmk, 0 },
+ { X86::VPUNPCKLWDZrrk, X86::VPUNPCKLWDZrmk, 0 },
+ { X86::VPXORDZ128rrk, X86::VPXORDZ128rmk, 0 },
+ { X86::VPXORDZ256rrk, X86::VPXORDZ256rmk, 0 },
+ { X86::VPXORDZrrk, X86::VPXORDZrmk, 0 },
+ { X86::VPXORQZ128rrk, X86::VPXORQZ128rmk, 0 },
+ { X86::VPXORQZ256rrk, X86::VPXORQZ256rmk, 0 },
+ { X86::VPXORQZrrk, X86::VPXORQZrmk, 0 },
+ { X86::VRANGEPDZ128rrik, X86::VRANGEPDZ128rmik, 0 },
+ { X86::VRANGEPDZ256rrik, X86::VRANGEPDZ256rmik, 0 },
+ { X86::VRANGEPDZrrik, X86::VRANGEPDZrmik, 0 },
+ { X86::VRANGEPSZ128rrik, X86::VRANGEPSZ128rmik, 0 },
+ { X86::VRANGEPSZ256rrik, X86::VRANGEPSZ256rmik, 0 },
+ { X86::VRANGEPSZrrik, X86::VRANGEPSZrmik, 0 },
+ { X86::VRANGESDZrrik, X86::VRANGESDZrmik, TB_NO_REVERSE },
+ { X86::VRANGESSZrrik, X86::VRANGESSZrmik, TB_NO_REVERSE },
+ { X86::VRCP14SDZrrk, X86::VRCP14SDZrmk, TB_NO_REVERSE },
+ { X86::VRCP14SSZrrk, X86::VRCP14SSZrmk, TB_NO_REVERSE },
+ { X86::VRCP28SDZrk, X86::VRCP28SDZmk, TB_NO_REVERSE },
+ { X86::VRCP28SSZrk, X86::VRCP28SSZmk, TB_NO_REVERSE },
+ { X86::VREDUCESDZrrik, X86::VREDUCESDZrmik, TB_NO_REVERSE },
+ { X86::VREDUCESSZrrik, X86::VREDUCESSZrmik, TB_NO_REVERSE },
+ { X86::VRNDSCALESDZr_Intk, X86::VRNDSCALESDZm_Intk, TB_NO_REVERSE },
+ { X86::VRNDSCALESSZr_Intk, X86::VRNDSCALESSZm_Intk, TB_NO_REVERSE },
+ { X86::VRSQRT14SDZrrk, X86::VRSQRT14SDZrmk, TB_NO_REVERSE },
+ { X86::VRSQRT14SSZrrk, X86::VRSQRT14SSZrmk, TB_NO_REVERSE },
+ { X86::VRSQRT28SDZrk, X86::VRSQRT28SDZmk, TB_NO_REVERSE },
+ { X86::VRSQRT28SSZrk, X86::VRSQRT28SSZmk, TB_NO_REVERSE },
+ { X86::VSCALEFPDZ128rrk, X86::VSCALEFPDZ128rmk, 0 },
+ { X86::VSCALEFPDZ256rrk, X86::VSCALEFPDZ256rmk, 0 },
+ { X86::VSCALEFPDZrrk, X86::VSCALEFPDZrmk, 0 },
+ { X86::VSCALEFPSZ128rrk, X86::VSCALEFPSZ128rmk, 0 },
+ { X86::VSCALEFPSZ256rrk, X86::VSCALEFPSZ256rmk, 0 },
+ { X86::VSCALEFPSZrrk, X86::VSCALEFPSZrmk, 0 },
+ { X86::VSCALEFSDZrrk, X86::VSCALEFSDZrmk, TB_NO_REVERSE },
+ { X86::VSCALEFSSZrrk, X86::VSCALEFSSZrmk, TB_NO_REVERSE },
+ { X86::VSHUFF32X4Z256rrik, X86::VSHUFF32X4Z256rmik, 0 },
+ { X86::VSHUFF32X4Zrrik, X86::VSHUFF32X4Zrmik, 0 },
+ { X86::VSHUFF64X2Z256rrik, X86::VSHUFF64X2Z256rmik, 0 },
+ { X86::VSHUFF64X2Zrrik, X86::VSHUFF64X2Zrmik, 0 },
+ { X86::VSHUFI32X4Z256rrik, X86::VSHUFI32X4Z256rmik, 0 },
+ { X86::VSHUFI32X4Zrrik, X86::VSHUFI32X4Zrmik, 0 },
+ { X86::VSHUFI64X2Z256rrik, X86::VSHUFI64X2Z256rmik, 0 },
+ { X86::VSHUFI64X2Zrrik, X86::VSHUFI64X2Zrmik, 0 },
+ { X86::VSHUFPDZ128rrik, X86::VSHUFPDZ128rmik, 0 },
+ { X86::VSHUFPDZ256rrik, X86::VSHUFPDZ256rmik, 0 },
+ { X86::VSHUFPDZrrik, X86::VSHUFPDZrmik, 0 },
+ { X86::VSHUFPSZ128rrik, X86::VSHUFPSZ128rmik, 0 },
+ { X86::VSHUFPSZ256rrik, X86::VSHUFPSZ256rmik, 0 },
+ { X86::VSHUFPSZrrik, X86::VSHUFPSZrmik, 0 },
+ { X86::VSQRTSDZr_Intk, X86::VSQRTSDZm_Intk, TB_NO_REVERSE },
+ { X86::VSQRTSSZr_Intk, X86::VSQRTSSZm_Intk, TB_NO_REVERSE },
+ { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 },
+ { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 },
+ { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 },
+ { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 },
+ { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 },
+ { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 },
+ { X86::VSUBSDZrr_Intk, X86::VSUBSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VSUBSSZrr_Intk, X86::VSUBSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0 },
+ { X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0 },
+ { X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0 },
+ { X86::VUNPCKHPSZ128rrk, X86::VUNPCKHPSZ128rmk, 0 },
+ { X86::VUNPCKHPSZ256rrk, X86::VUNPCKHPSZ256rmk, 0 },
+ { X86::VUNPCKHPSZrrk, X86::VUNPCKHPSZrmk, 0 },
+ { X86::VUNPCKLPDZ128rrk, X86::VUNPCKLPDZ128rmk, 0 },
+ { X86::VUNPCKLPDZ256rrk, X86::VUNPCKLPDZ256rmk, 0 },
+ { X86::VUNPCKLPDZrrk, X86::VUNPCKLPDZrmk, 0 },
+ { X86::VUNPCKLPSZ128rrk, X86::VUNPCKLPSZ128rmk, 0 },
+ { X86::VUNPCKLPSZ256rrk, X86::VUNPCKLPSZ256rmk, 0 },
+ { X86::VUNPCKLPSZrrk, X86::VUNPCKLPSZrmk, 0 },
+ { X86::VXORPDZ128rrk, X86::VXORPDZ128rmk, 0 },
+ { X86::VXORPDZ256rrk, X86::VXORPDZ256rmk, 0 },
+ { X86::VXORPDZrrk, X86::VXORPDZrmk, 0 },
+ { X86::VXORPSZ128rrk, X86::VXORPSZ128rmk, 0 },
+ { X86::VXORPSZ256rrk, X86::VXORPSZ256rmk, 0 },
+ { X86::VXORPSZrrk, X86::VXORPSZrmk, 0 },
+};
+
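+// Broadcast fold table for operand 2: the register form is mapped to the
+// embedded-broadcast memory form, with TB_BCAST_* recording the broadcast
+// element type (SS/SD for 32/64-bit FP, D/Q for 32/64-bit integer).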
+static const X86MemoryFoldTableEntry BroadcastFoldTable2[] = {
+ { X86::VADDPDZ128rr, X86::VADDPDZ128rmb, TB_BCAST_SD },
+ { X86::VADDPDZ256rr, X86::VADDPDZ256rmb, TB_BCAST_SD },
+ { X86::VADDPDZrr, X86::VADDPDZrmb, TB_BCAST_SD },
+ { X86::VADDPSZ128rr, X86::VADDPSZ128rmb, TB_BCAST_SS },
+ { X86::VADDPSZ256rr, X86::VADDPSZ256rmb, TB_BCAST_SS },
+ { X86::VADDPSZrr, X86::VADDPSZrmb, TB_BCAST_SS },
+ { X86::VCMPPDZ128rri, X86::VCMPPDZ128rmbi, TB_BCAST_SD },
+ { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmbi, TB_BCAST_SD },
+ { X86::VCMPPDZrri, X86::VCMPPDZrmbi, TB_BCAST_SD },
+ { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmbi, TB_BCAST_SS },
+ { X86::VCMPPSZ256rri, X86::VCMPPSZ256rmbi, TB_BCAST_SS },
+ { X86::VCMPPSZrri, X86::VCMPPSZrmbi, TB_BCAST_SS },
+ { X86::VDIVPDZ128rr, X86::VDIVPDZ128rmb, TB_BCAST_SD },
+ { X86::VDIVPDZ256rr, X86::VDIVPDZ256rmb, TB_BCAST_SD },
+ { X86::VDIVPDZrr, X86::VDIVPDZrmb, TB_BCAST_SD },
+ { X86::VDIVPSZ128rr, X86::VDIVPSZ128rmb, TB_BCAST_SS },
+ { X86::VDIVPSZ256rr, X86::VDIVPSZ256rmb, TB_BCAST_SS },
+ { X86::VDIVPSZrr, X86::VDIVPSZrmb, TB_BCAST_SS },
+ { X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rmb, TB_BCAST_SD },
+ { X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rmb, TB_BCAST_SD },
+ { X86::VMAXCPDZrr, X86::VMAXCPDZrmb, TB_BCAST_SD },
+ { X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rmb, TB_BCAST_SS },
+ { X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rmb, TB_BCAST_SS },
+ { X86::VMAXCPSZrr, X86::VMAXCPSZrmb, TB_BCAST_SS },
+ { X86::VMAXPDZ128rr, X86::VMAXPDZ128rmb, TB_BCAST_SD },
+ { X86::VMAXPDZ256rr, X86::VMAXPDZ256rmb, TB_BCAST_SD },
+ { X86::VMAXPDZrr, X86::VMAXPDZrmb, TB_BCAST_SD },
+ { X86::VMAXPSZ128rr, X86::VMAXPSZ128rmb, TB_BCAST_SS },
+ { X86::VMAXPSZ256rr, X86::VMAXPSZ256rmb, TB_BCAST_SS },
+ { X86::VMAXPSZrr, X86::VMAXPSZrmb, TB_BCAST_SS },
+ { X86::VMINCPDZ128rr, X86::VMINCPDZ128rmb, TB_BCAST_SD },
+ { X86::VMINCPDZ256rr, X86::VMINCPDZ256rmb, TB_BCAST_SD },
+ { X86::VMINCPDZrr, X86::VMINCPDZrmb, TB_BCAST_SD },
+ { X86::VMINCPSZ128rr, X86::VMINCPSZ128rmb, TB_BCAST_SS },
+ { X86::VMINCPSZ256rr, X86::VMINCPSZ256rmb, TB_BCAST_SS },
+ { X86::VMINCPSZrr, X86::VMINCPSZrmb, TB_BCAST_SS },
+ { X86::VMINPDZ128rr, X86::VMINPDZ128rmb, TB_BCAST_SD },
+ { X86::VMINPDZ256rr, X86::VMINPDZ256rmb, TB_BCAST_SD },
+ { X86::VMINPDZrr, X86::VMINPDZrmb, TB_BCAST_SD },
+ { X86::VMINPSZ128rr, X86::VMINPSZ128rmb, TB_BCAST_SS },
+ { X86::VMINPSZ256rr, X86::VMINPSZ256rmb, TB_BCAST_SS },
+ { X86::VMINPSZrr, X86::VMINPSZrmb, TB_BCAST_SS },
+ { X86::VMULPDZ128rr, X86::VMULPDZ128rmb, TB_BCAST_SD },
+ { X86::VMULPDZ256rr, X86::VMULPDZ256rmb, TB_BCAST_SD },
+ { X86::VMULPDZrr, X86::VMULPDZrmb, TB_BCAST_SD },
+ { X86::VMULPSZ128rr, X86::VMULPSZ128rmb, TB_BCAST_SS },
+ { X86::VMULPSZ256rr, X86::VMULPSZ256rmb, TB_BCAST_SS },
+ { X86::VMULPSZrr, X86::VMULPSZrmb, TB_BCAST_SS },
+ { X86::VPADDDZ128rr, X86::VPADDDZ128rmb, TB_BCAST_D },
+ { X86::VPADDDZ256rr, X86::VPADDDZ256rmb, TB_BCAST_D },
+ { X86::VPADDDZrr, X86::VPADDDZrmb, TB_BCAST_D },
+ { X86::VPADDQZ128rr, X86::VPADDQZ128rmb, TB_BCAST_Q },
+ { X86::VPADDQZ256rr, X86::VPADDQZ256rmb, TB_BCAST_Q },
+ { X86::VPADDQZrr, X86::VPADDQZrmb, TB_BCAST_Q },
+ { X86::VPANDDZ128rr, X86::VPANDDZ128rmb, TB_BCAST_D },
+ { X86::VPANDDZ256rr, X86::VPANDDZ256rmb, TB_BCAST_D },
+ { X86::VPANDDZrr, X86::VPANDDZrmb, TB_BCAST_D },
+ { X86::VPANDNDZ128rr, X86::VPANDNDZ128rmb, TB_BCAST_D },
+ { X86::VPANDNDZ256rr, X86::VPANDNDZ256rmb, TB_BCAST_D },
+ { X86::VPANDNDZrr, X86::VPANDNDZrmb, TB_BCAST_D },
+ { X86::VPANDNQZ128rr, X86::VPANDNQZ128rmb, TB_BCAST_Q },
+ { X86::VPANDNQZ256rr, X86::VPANDNQZ256rmb, TB_BCAST_Q },
+ { X86::VPANDNQZrr, X86::VPANDNQZrmb, TB_BCAST_Q },
+ { X86::VPANDQZ128rr, X86::VPANDQZ128rmb, TB_BCAST_Q },
+ { X86::VPANDQZ256rr, X86::VPANDQZ256rmb, TB_BCAST_Q },
+ { X86::VPANDQZrr, X86::VPANDQZrmb, TB_BCAST_Q },
+ { X86::VPCMPDZ128rri, X86::VPCMPDZ128rmib, TB_BCAST_D },
+ { X86::VPCMPDZ256rri, X86::VPCMPDZ256rmib, TB_BCAST_D },
+ { X86::VPCMPDZrri, X86::VPCMPDZrmib, TB_BCAST_D },
+ { X86::VPCMPEQDZ128rr, X86::VPCMPEQDZ128rmb, TB_BCAST_D },
+ { X86::VPCMPEQDZ256rr, X86::VPCMPEQDZ256rmb, TB_BCAST_D },
+ { X86::VPCMPEQDZrr, X86::VPCMPEQDZrmb, TB_BCAST_D },
+ { X86::VPCMPEQQZ128rr, X86::VPCMPEQQZ128rmb, TB_BCAST_Q },
+ { X86::VPCMPEQQZ256rr, X86::VPCMPEQQZ256rmb, TB_BCAST_Q },
+ { X86::VPCMPEQQZrr, X86::VPCMPEQQZrmb, TB_BCAST_Q },
+ { X86::VPCMPGTDZ128rr, X86::VPCMPGTDZ128rmb, TB_BCAST_D },
+ { X86::VPCMPGTDZ256rr, X86::VPCMPGTDZ256rmb, TB_BCAST_D },
+ { X86::VPCMPGTDZrr, X86::VPCMPGTDZrmb, TB_BCAST_D },
+ { X86::VPCMPGTQZ128rr, X86::VPCMPGTQZ128rmb, TB_BCAST_Q },
+ { X86::VPCMPGTQZ256rr, X86::VPCMPGTQZ256rmb, TB_BCAST_Q },
+ { X86::VPCMPGTQZrr, X86::VPCMPGTQZrmb, TB_BCAST_Q },
+ { X86::VPCMPQZ128rri, X86::VPCMPQZ128rmib, TB_BCAST_Q },
+ { X86::VPCMPQZ256rri, X86::VPCMPQZ256rmib, TB_BCAST_Q },
+ { X86::VPCMPQZrri, X86::VPCMPQZrmib, TB_BCAST_Q },
+ { X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmib, TB_BCAST_D },
+ { X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmib, TB_BCAST_D },
+ { X86::VPCMPUDZrri, X86::VPCMPUDZrmib, TB_BCAST_D },
+ { X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmib, TB_BCAST_Q },
+ { X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmib, TB_BCAST_Q },
+ { X86::VPCMPUQZrri, X86::VPCMPUQZrmib, TB_BCAST_Q },
+ { X86::VPMAXSDZ128rr, X86::VPMAXSDZ128rmb, TB_BCAST_D },
+ { X86::VPMAXSDZ256rr, X86::VPMAXSDZ256rmb, TB_BCAST_D },
+ { X86::VPMAXSDZrr, X86::VPMAXSDZrmb, TB_BCAST_D },
+ { X86::VPMAXSQZ128rr, X86::VPMAXSQZ128rmb, TB_BCAST_Q },
+ { X86::VPMAXSQZ256rr, X86::VPMAXSQZ256rmb, TB_BCAST_Q },
+ { X86::VPMAXSQZrr, X86::VPMAXSQZrmb, TB_BCAST_Q },
+ { X86::VPMAXUDZ128rr, X86::VPMAXUDZ128rmb, TB_BCAST_D },
+ { X86::VPMAXUDZ256rr, X86::VPMAXUDZ256rmb, TB_BCAST_D },
+ { X86::VPMAXUDZrr, X86::VPMAXUDZrmb, TB_BCAST_D },
+ { X86::VPMAXUQZ128rr, X86::VPMAXUQZ128rmb, TB_BCAST_Q },
+ { X86::VPMAXUQZ256rr, X86::VPMAXUQZ256rmb, TB_BCAST_Q },
+ { X86::VPMAXUQZrr, X86::VPMAXUQZrmb, TB_BCAST_Q },
+ { X86::VPMINSDZ128rr, X86::VPMINSDZ128rmb, TB_BCAST_D },
+ { X86::VPMINSDZ256rr, X86::VPMINSDZ256rmb, TB_BCAST_D },
+ { X86::VPMINSDZrr, X86::VPMINSDZrmb, TB_BCAST_D },
+ { X86::VPMINSQZ128rr, X86::VPMINSQZ128rmb, TB_BCAST_Q },
+ { X86::VPMINSQZ256rr, X86::VPMINSQZ256rmb, TB_BCAST_Q },
+ { X86::VPMINSQZrr, X86::VPMINSQZrmb, TB_BCAST_Q },
+ { X86::VPMINUDZ128rr, X86::VPMINUDZ128rmb, TB_BCAST_D },
+ { X86::VPMINUDZ256rr, X86::VPMINUDZ256rmb, TB_BCAST_D },
+ { X86::VPMINUDZrr, X86::VPMINUDZrmb, TB_BCAST_D },
+ { X86::VPMINUQZ128rr, X86::VPMINUQZ128rmb, TB_BCAST_Q },
+ { X86::VPMINUQZ256rr, X86::VPMINUQZ256rmb, TB_BCAST_Q },
+ { X86::VPMINUQZrr, X86::VPMINUQZrmb, TB_BCAST_Q },
+ { X86::VPMULLDZ128rr, X86::VPMULLDZ128rmb, TB_BCAST_D },
+ { X86::VPMULLDZ256rr, X86::VPMULLDZ256rmb, TB_BCAST_D },
+ { X86::VPMULLDZrr, X86::VPMULLDZrmb, TB_BCAST_D },
+ { X86::VPMULLQZ128rr, X86::VPMULLQZ128rmb, TB_BCAST_Q },
+ { X86::VPMULLQZ256rr, X86::VPMULLQZ256rmb, TB_BCAST_Q },
+ { X86::VPMULLQZrr, X86::VPMULLQZrmb, TB_BCAST_Q },
+ { X86::VPORDZ128rr, X86::VPORDZ128rmb, TB_BCAST_D },
+ { X86::VPORDZ256rr, X86::VPORDZ256rmb, TB_BCAST_D },
+ { X86::VPORDZrr, X86::VPORDZrmb, TB_BCAST_D },
+ { X86::VPORQZ128rr, X86::VPORQZ128rmb, TB_BCAST_Q },
+ { X86::VPORQZ256rr, X86::VPORQZ256rmb, TB_BCAST_Q },
+ { X86::VPORQZrr, X86::VPORQZrmb, TB_BCAST_Q },
+ { X86::VPTESTMDZ128rr, X86::VPTESTMDZ128rmb, TB_BCAST_D },
+ { X86::VPTESTMDZ256rr, X86::VPTESTMDZ256rmb, TB_BCAST_D },
+ { X86::VPTESTMDZrr, X86::VPTESTMDZrmb, TB_BCAST_D },
+ { X86::VPTESTMQZ128rr, X86::VPTESTMQZ128rmb, TB_BCAST_Q },
+ { X86::VPTESTMQZ256rr, X86::VPTESTMQZ256rmb, TB_BCAST_Q },
+ { X86::VPTESTMQZrr, X86::VPTESTMQZrmb, TB_BCAST_Q },
+ { X86::VPTESTNMDZ128rr, X86::VPTESTNMDZ128rmb, TB_BCAST_D },
+ { X86::VPTESTNMDZ256rr, X86::VPTESTNMDZ256rmb, TB_BCAST_D },
+ { X86::VPTESTNMDZrr, X86::VPTESTNMDZrmb, TB_BCAST_D },
+ { X86::VPTESTNMQZ128rr, X86::VPTESTNMQZ128rmb, TB_BCAST_Q },
+ { X86::VPTESTNMQZ256rr, X86::VPTESTNMQZ256rmb, TB_BCAST_Q },
+ { X86::VPTESTNMQZrr, X86::VPTESTNMQZrmb, TB_BCAST_Q },
+ { X86::VPXORDZ128rr, X86::VPXORDZ128rmb, TB_BCAST_D },
+ { X86::VPXORDZ256rr, X86::VPXORDZ256rmb, TB_BCAST_D },
+ { X86::VPXORDZrr, X86::VPXORDZrmb, TB_BCAST_D },
+ { X86::VPXORQZ128rr, X86::VPXORQZ128rmb, TB_BCAST_Q },
+ { X86::VPXORQZ256rr, X86::VPXORQZ256rmb, TB_BCAST_Q },
+ { X86::VPXORQZrr, X86::VPXORQZrmb, TB_BCAST_Q },
+ { X86::VSUBPDZ128rr, X86::VSUBPDZ128rmb, TB_BCAST_SD },
+ { X86::VSUBPDZ256rr, X86::VSUBPDZ256rmb, TB_BCAST_SD },
+ { X86::VSUBPDZrr, X86::VSUBPDZrmb, TB_BCAST_SD },
+ { X86::VSUBPSZ128rr, X86::VSUBPSZ128rmb, TB_BCAST_SS },
+ { X86::VSUBPSZ256rr, X86::VSUBPSZ256rmb, TB_BCAST_SS },
+ { X86::VSUBPSZrr, X86::VSUBPSZrmb, TB_BCAST_SS },
+};
+
+static const X86MemoryFoldTableEntry BroadcastFoldTable3[] = {
+ { X86::VFMADD132PDZ128r, X86::VFMADD132PDZ128mb, TB_BCAST_SD },
+ { X86::VFMADD132PDZ256r, X86::VFMADD132PDZ256mb, TB_BCAST_SD },
+ { X86::VFMADD132PDZr, X86::VFMADD132PDZmb, TB_BCAST_SD },
+ { X86::VFMADD132PSZ128r, X86::VFMADD132PSZ128mb, TB_BCAST_SS },
+ { X86::VFMADD132PSZ256r, X86::VFMADD132PSZ256mb, TB_BCAST_SS },
+ { X86::VFMADD132PSZr, X86::VFMADD132PSZmb, TB_BCAST_SS },
+ { X86::VFMADD213PDZ128r, X86::VFMADD213PDZ128mb, TB_BCAST_SD },
+ { X86::VFMADD213PDZ256r, X86::VFMADD213PDZ256mb, TB_BCAST_SD },
+ { X86::VFMADD213PDZr, X86::VFMADD213PDZmb, TB_BCAST_SD },
+ { X86::VFMADD213PSZ128r, X86::VFMADD213PSZ128mb, TB_BCAST_SS },
+ { X86::VFMADD213PSZ256r, X86::VFMADD213PSZ256mb, TB_BCAST_SS },
+ { X86::VFMADD213PSZr, X86::VFMADD213PSZmb, TB_BCAST_SS },
+ { X86::VFMADD231PDZ128r, X86::VFMADD231PDZ128mb, TB_BCAST_SD },
+ { X86::VFMADD231PDZ256r, X86::VFMADD231PDZ256mb, TB_BCAST_SD },
+ { X86::VFMADD231PDZr, X86::VFMADD231PDZmb, TB_BCAST_SD },
+ { X86::VFMADD231PSZ128r, X86::VFMADD231PSZ128mb, TB_BCAST_SS },
+ { X86::VFMADD231PSZ256r, X86::VFMADD231PSZ256mb, TB_BCAST_SS },
+ { X86::VFMADD231PSZr, X86::VFMADD231PSZmb, TB_BCAST_SS },
+ { X86::VFMADDSUB132PDZ128r, X86::VFMADDSUB132PDZ128mb, TB_BCAST_SD },
+ { X86::VFMADDSUB132PDZ256r, X86::VFMADDSUB132PDZ256mb, TB_BCAST_SD },
+ { X86::VFMADDSUB132PDZr, X86::VFMADDSUB132PDZmb, TB_BCAST_SD },
+ { X86::VFMADDSUB132PSZ128r, X86::VFMADDSUB132PSZ128mb, TB_BCAST_SS },
+ { X86::VFMADDSUB132PSZ256r, X86::VFMADDSUB132PSZ256mb, TB_BCAST_SS },
+ { X86::VFMADDSUB132PSZr, X86::VFMADDSUB132PSZmb, TB_BCAST_SS },
+ { X86::VFMADDSUB213PDZ128r, X86::VFMADDSUB213PDZ128mb, TB_BCAST_SD },
+ { X86::VFMADDSUB213PDZ256r, X86::VFMADDSUB213PDZ256mb, TB_BCAST_SD },
+ { X86::VFMADDSUB213PDZr, X86::VFMADDSUB213PDZmb, TB_BCAST_SD },
+ { X86::VFMADDSUB213PSZ128r, X86::VFMADDSUB213PSZ128mb, TB_BCAST_SS },
+ { X86::VFMADDSUB213PSZ256r, X86::VFMADDSUB213PSZ256mb, TB_BCAST_SS },
+ { X86::VFMADDSUB213PSZr, X86::VFMADDSUB213PSZmb, TB_BCAST_SS },
+ { X86::VFMADDSUB231PDZ128r, X86::VFMADDSUB231PDZ128mb, TB_BCAST_SD },
+ { X86::VFMADDSUB231PDZ256r, X86::VFMADDSUB231PDZ256mb, TB_BCAST_SD },
+ { X86::VFMADDSUB231PDZr, X86::VFMADDSUB231PDZmb, TB_BCAST_SD },
+ { X86::VFMADDSUB231PSZ128r, X86::VFMADDSUB231PSZ128mb, TB_BCAST_SS },
+ { X86::VFMADDSUB231PSZ256r, X86::VFMADDSUB231PSZ256mb, TB_BCAST_SS },
+ { X86::VFMADDSUB231PSZr, X86::VFMADDSUB231PSZmb, TB_BCAST_SS },
+ { X86::VFMSUB132PDZ128r, X86::VFMSUB132PDZ128mb, TB_BCAST_SD },
+ { X86::VFMSUB132PDZ256r, X86::VFMSUB132PDZ256mb, TB_BCAST_SD },
+ { X86::VFMSUB132PDZr, X86::VFMSUB132PDZmb, TB_BCAST_SD },
+ { X86::VFMSUB132PSZ128r, X86::VFMSUB132PSZ128mb, TB_BCAST_SS },
+ { X86::VFMSUB132PSZ256r, X86::VFMSUB132PSZ256mb, TB_BCAST_SS },
+ { X86::VFMSUB132PSZr, X86::VFMSUB132PSZmb, TB_BCAST_SS },
+ { X86::VFMSUB213PDZ128r, X86::VFMSUB213PDZ128mb, TB_BCAST_SD },
+ { X86::VFMSUB213PDZ256r, X86::VFMSUB213PDZ256mb, TB_BCAST_SD },
+ { X86::VFMSUB213PDZr, X86::VFMSUB213PDZmb, TB_BCAST_SD },
+ { X86::VFMSUB213PSZ128r, X86::VFMSUB213PSZ128mb, TB_BCAST_SS },
+ { X86::VFMSUB213PSZ256r, X86::VFMSUB213PSZ256mb, TB_BCAST_SS },
+ { X86::VFMSUB213PSZr, X86::VFMSUB213PSZmb, TB_BCAST_SS },
+ { X86::VFMSUB231PDZ128r, X86::VFMSUB231PDZ128mb, TB_BCAST_SD },
+ { X86::VFMSUB231PDZ256r, X86::VFMSUB231PDZ256mb, TB_BCAST_SD },
+ { X86::VFMSUB231PDZr, X86::VFMSUB231PDZmb, TB_BCAST_SD },
+ { X86::VFMSUB231PSZ128r, X86::VFMSUB231PSZ128mb, TB_BCAST_SS },
+ { X86::VFMSUB231PSZ256r, X86::VFMSUB231PSZ256mb, TB_BCAST_SS },
+ { X86::VFMSUB231PSZr, X86::VFMSUB231PSZmb, TB_BCAST_SS },
+ { X86::VFMSUBADD132PDZ128r, X86::VFMSUBADD132PDZ128mb, TB_BCAST_SD },
+ { X86::VFMSUBADD132PDZ256r, X86::VFMSUBADD132PDZ256mb, TB_BCAST_SD },
+ { X86::VFMSUBADD132PDZr, X86::VFMSUBADD132PDZmb, TB_BCAST_SD },
+ { X86::VFMSUBADD132PSZ128r, X86::VFMSUBADD132PSZ128mb, TB_BCAST_SS },
+ { X86::VFMSUBADD132PSZ256r, X86::VFMSUBADD132PSZ256mb, TB_BCAST_SS },
+ { X86::VFMSUBADD132PSZr, X86::VFMSUBADD132PSZmb, TB_BCAST_SS },
+ { X86::VFMSUBADD213PDZ128r, X86::VFMSUBADD213PDZ128mb, TB_BCAST_SD },
+ { X86::VFMSUBADD213PDZ256r, X86::VFMSUBADD213PDZ256mb, TB_BCAST_SD },
+ { X86::VFMSUBADD213PDZr, X86::VFMSUBADD213PDZmb, TB_BCAST_SD },
+ { X86::VFMSUBADD213PSZ128r, X86::VFMSUBADD213PSZ128mb, TB_BCAST_SS },
+ { X86::VFMSUBADD213PSZ256r, X86::VFMSUBADD213PSZ256mb, TB_BCAST_SS },
+ { X86::VFMSUBADD213PSZr, X86::VFMSUBADD213PSZmb, TB_BCAST_SS },
+ { X86::VFMSUBADD231PDZ128r, X86::VFMSUBADD231PDZ128mb, TB_BCAST_SD },
+ { X86::VFMSUBADD231PDZ256r, X86::VFMSUBADD231PDZ256mb, TB_BCAST_SD },
+ { X86::VFMSUBADD231PDZr, X86::VFMSUBADD231PDZmb, TB_BCAST_SD },
+ { X86::VFMSUBADD231PSZ128r, X86::VFMSUBADD231PSZ128mb, TB_BCAST_SS },
+ { X86::VFMSUBADD231PSZ256r, X86::VFMSUBADD231PSZ256mb, TB_BCAST_SS },
+ { X86::VFMSUBADD231PSZr, X86::VFMSUBADD231PSZmb, TB_BCAST_SS },
+ { X86::VFNMADD132PDZ128r, X86::VFNMADD132PDZ128mb, TB_BCAST_SD },
+ { X86::VFNMADD132PDZ256r, X86::VFNMADD132PDZ256mb, TB_BCAST_SD },
+ { X86::VFNMADD132PDZr, X86::VFNMADD132PDZmb, TB_BCAST_SD },
+ { X86::VFNMADD132PSZ128r, X86::VFNMADD132PSZ128mb, TB_BCAST_SS },
+ { X86::VFNMADD132PSZ256r, X86::VFNMADD132PSZ256mb, TB_BCAST_SS },
+ { X86::VFNMADD132PSZr, X86::VFNMADD132PSZmb, TB_BCAST_SS },
+ { X86::VFNMADD213PDZ128r, X86::VFNMADD213PDZ128mb, TB_BCAST_SD },
+ { X86::VFNMADD213PDZ256r, X86::VFNMADD213PDZ256mb, TB_BCAST_SD },
+ { X86::VFNMADD213PDZr, X86::VFNMADD213PDZmb, TB_BCAST_SD },
+ { X86::VFNMADD213PSZ128r, X86::VFNMADD213PSZ128mb, TB_BCAST_SS },
+ { X86::VFNMADD213PSZ256r, X86::VFNMADD213PSZ256mb, TB_BCAST_SS },
+ { X86::VFNMADD213PSZr, X86::VFNMADD213PSZmb, TB_BCAST_SS },
+ { X86::VFNMADD231PDZ128r, X86::VFNMADD231PDZ128mb, TB_BCAST_SD },
+ { X86::VFNMADD231PDZ256r, X86::VFNMADD231PDZ256mb, TB_BCAST_SD },
+ { X86::VFNMADD231PDZr, X86::VFNMADD231PDZmb, TB_BCAST_SD },
+ { X86::VFNMADD231PSZ128r, X86::VFNMADD231PSZ128mb, TB_BCAST_SS },
+ { X86::VFNMADD231PSZ256r, X86::VFNMADD231PSZ256mb, TB_BCAST_SS },
+ { X86::VFNMADD231PSZr, X86::VFNMADD231PSZmb, TB_BCAST_SS },
+ { X86::VFNMSUB132PDZ128r, X86::VFNMSUB132PDZ128mb, TB_BCAST_SD },
+ { X86::VFNMSUB132PDZ256r, X86::VFNMSUB132PDZ256mb, TB_BCAST_SD },
+ { X86::VFNMSUB132PDZr, X86::VFNMSUB132PDZmb, TB_BCAST_SD },
+ { X86::VFNMSUB132PSZ128r, X86::VFNMSUB132PSZ128mb, TB_BCAST_SS },
+ { X86::VFNMSUB132PSZ256r, X86::VFNMSUB132PSZ256mb, TB_BCAST_SS },
+ { X86::VFNMSUB132PSZr, X86::VFNMSUB132PSZmb, TB_BCAST_SS },
+ { X86::VFNMSUB213PDZ128r, X86::VFNMSUB213PDZ128mb, TB_BCAST_SD },
+ { X86::VFNMSUB213PDZ256r, X86::VFNMSUB213PDZ256mb, TB_BCAST_SD },
+ { X86::VFNMSUB213PDZr, X86::VFNMSUB213PDZmb, TB_BCAST_SD },
+ { X86::VFNMSUB213PSZ128r, X86::VFNMSUB213PSZ128mb, TB_BCAST_SS },
+ { X86::VFNMSUB213PSZ256r, X86::VFNMSUB213PSZ256mb, TB_BCAST_SS },
+ { X86::VFNMSUB213PSZr, X86::VFNMSUB213PSZmb, TB_BCAST_SS },
+ { X86::VFNMSUB231PDZ128r, X86::VFNMSUB231PDZ128mb, TB_BCAST_SD },
+ { X86::VFNMSUB231PDZ256r, X86::VFNMSUB231PDZ256mb, TB_BCAST_SD },
+ { X86::VFNMSUB231PDZr, X86::VFNMSUB231PDZmb, TB_BCAST_SD },
+ { X86::VFNMSUB231PSZ128r, X86::VFNMSUB231PSZ128mb, TB_BCAST_SS },
+ { X86::VFNMSUB231PSZ256r, X86::VFNMSUB231PSZ256mb, TB_BCAST_SS },
+ { X86::VFNMSUB231PSZr, X86::VFNMSUB231PSZmb, TB_BCAST_SS },
+ { X86::VPTERNLOGDZ128rri, X86::VPTERNLOGDZ128rmbi, TB_BCAST_D },
+ { X86::VPTERNLOGDZ256rri, X86::VPTERNLOGDZ256rmbi, TB_BCAST_D },
+ { X86::VPTERNLOGDZrri, X86::VPTERNLOGDZrmbi, TB_BCAST_D },
+ { X86::VPTERNLOGQZ128rri, X86::VPTERNLOGQZ128rmbi, TB_BCAST_Q },
+ { X86::VPTERNLOGQZ256rri, X86::VPTERNLOGQZ256rmbi, TB_BCAST_Q },
+ { X86::VPTERNLOGQZrri, X86::VPTERNLOGQZrmbi, TB_BCAST_Q },
+};
+
+static const X86MemoryFoldTableEntry *
+lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) {
+#ifndef NDEBUG
+ // Make sure the tables are sorted and contain no duplicate entries.
+ static std::atomic<bool> FoldTablesChecked(false);
+ if (!FoldTablesChecked.load(std::memory_order_relaxed)) {
+ assert(llvm::is_sorted(MemoryFoldTable2Addr) &&
+ std::adjacent_find(std::begin(MemoryFoldTable2Addr),
+ std::end(MemoryFoldTable2Addr)) ==
+ std::end(MemoryFoldTable2Addr) &&
+ "MemoryFoldTable2Addr is not sorted and unique!");
+ assert(llvm::is_sorted(MemoryFoldTable0) &&
+ std::adjacent_find(std::begin(MemoryFoldTable0),
+ std::end(MemoryFoldTable0)) ==
+ std::end(MemoryFoldTable0) &&
+ "MemoryFoldTable0 is not sorted and unique!");
+ assert(llvm::is_sorted(MemoryFoldTable1) &&
+ std::adjacent_find(std::begin(MemoryFoldTable1),
+ std::end(MemoryFoldTable1)) ==
+ std::end(MemoryFoldTable1) &&
+ "MemoryFoldTable1 is not sorted and unique!");
+ assert(llvm::is_sorted(MemoryFoldTable2) &&
+ std::adjacent_find(std::begin(MemoryFoldTable2),
+ std::end(MemoryFoldTable2)) ==
+ std::end(MemoryFoldTable2) &&
+ "MemoryFoldTable2 is not sorted and unique!");
+ assert(llvm::is_sorted(MemoryFoldTable3) &&
+ std::adjacent_find(std::begin(MemoryFoldTable3),
+ std::end(MemoryFoldTable3)) ==
+ std::end(MemoryFoldTable3) &&
+ "MemoryFoldTable3 is not sorted and unique!");
+ assert(llvm::is_sorted(MemoryFoldTable4) &&
+ std::adjacent_find(std::begin(MemoryFoldTable4),
+ std::end(MemoryFoldTable4)) ==
+ std::end(MemoryFoldTable4) &&
+ "MemoryFoldTable4 is not sorted and unique!");
+ assert(llvm::is_sorted(BroadcastFoldTable2) &&
+ std::adjacent_find(std::begin(BroadcastFoldTable2),
+ std::end(BroadcastFoldTable2)) ==
+ std::end(BroadcastFoldTable2) &&
+ "BroadcastFoldTable2 is not sorted and unique!");
+ assert(llvm::is_sorted(BroadcastFoldTable3) &&
+ std::adjacent_find(std::begin(BroadcastFoldTable3),
+ std::end(BroadcastFoldTable3)) ==
+ std::end(BroadcastFoldTable3) &&
+ "BroadcastFoldTable3 is not sorted and unique!");
+ FoldTablesChecked.store(true, std::memory_order_relaxed);
+ }
+#endif
+
+ const X86MemoryFoldTableEntry *Data = llvm::lower_bound(Table, RegOp);
+ if (Data != Table.end() && Data->KeyOp == RegOp &&
+ !(Data->Flags & TB_NO_FORWARD))
+ return Data;
+ return nullptr;
+}
+
+const X86MemoryFoldTableEntry *
+llvm::lookupTwoAddrFoldTable(unsigned RegOp) {
+ return lookupFoldTableImpl(MemoryFoldTable2Addr, RegOp);
+}
+
+const X86MemoryFoldTableEntry *
+llvm::lookupFoldTable(unsigned RegOp, unsigned OpNum) {
+ ArrayRef<X86MemoryFoldTableEntry> FoldTable;
+ if (OpNum == 0)
+ FoldTable = makeArrayRef(MemoryFoldTable0);
+ else if (OpNum == 1)
+ FoldTable = makeArrayRef(MemoryFoldTable1);
+ else if (OpNum == 2)
+ FoldTable = makeArrayRef(MemoryFoldTable2);
+ else if (OpNum == 3)
+ FoldTable = makeArrayRef(MemoryFoldTable3);
+ else if (OpNum == 4)
+ FoldTable = makeArrayRef(MemoryFoldTable4);
+ else
+ return nullptr;
+
+ return lookupFoldTableImpl(FoldTable, RegOp);
+}
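For context, a minimal usage sketch of the lookup interface defined above. It is not taken from the LLVM tree; the opcode and operand number are assumed to come from the caller, and the helper name is hypothetical.

    #include "X86InstrFoldTables.h"

    // Hypothetical helper: returns the memory-form opcode for folding operand
    // OpNum of RegOpcode, or 0 if no fold is known. Real callers also honor
    // the TB_* constraints carried in Entry->Flags (alignment, etc.).
    unsigned memoryFormFor(unsigned RegOpcode, unsigned OpNum) {
      if (const llvm::X86MemoryFoldTableEntry *Entry =
              llvm::lookupFoldTable(RegOpcode, OpNum))
        return Entry->DstOp;
      return 0;
    }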
+
+namespace {
+
+// This class stores the memory unfolding tables. It is instantiated as a
+// ManagedStatic to lazily init the unfolding table.
+struct X86MemUnfoldTable {
+ // Stores memory unfolding table entries sorted by opcode.
+ std::vector<X86MemoryFoldTableEntry> Table;
+
+ X86MemUnfoldTable() {
+ for (const X86MemoryFoldTableEntry &Entry : MemoryFoldTable2Addr)
+ // Index 0, folded load and store, no alignment requirement.
+ addTableEntry(Entry, TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
+
+ for (const X86MemoryFoldTableEntry &Entry : MemoryFoldTable0)
+ // Index 0, mix of loads and stores.
+ addTableEntry(Entry, TB_INDEX_0);
+
+ for (const X86MemoryFoldTableEntry &Entry : MemoryFoldTable1)
+ // Index 1, folded load
+ addTableEntry(Entry, TB_INDEX_1 | TB_FOLDED_LOAD);
+
+ for (const X86MemoryFoldTableEntry &Entry : MemoryFoldTable2)
+ // Index 2, folded load
+ addTableEntry(Entry, TB_INDEX_2 | TB_FOLDED_LOAD);
+
+ for (const X86MemoryFoldTableEntry &Entry : MemoryFoldTable3)
+ // Index 3, folded load
+ addTableEntry(Entry, TB_INDEX_3 | TB_FOLDED_LOAD);
+
+ for (const X86MemoryFoldTableEntry &Entry : MemoryFoldTable4)
+ // Index 4, folded load
+ addTableEntry(Entry, TB_INDEX_4 | TB_FOLDED_LOAD);
+
+ // Broadcast tables.
+ for (const X86MemoryFoldTableEntry &Entry : BroadcastFoldTable2)
+ // Index 2, folded broadcast
+ addTableEntry(Entry, TB_INDEX_2 | TB_FOLDED_LOAD | TB_FOLDED_BCAST);
+
+ for (const X86MemoryFoldTableEntry &Entry : BroadcastFoldTable3)
+ // Index 3, folded broadcast
+ addTableEntry(Entry, TB_INDEX_3 | TB_FOLDED_LOAD | TB_FOLDED_BCAST);
+
+ // Sort the memory->reg unfold table.
+ array_pod_sort(Table.begin(), Table.end());
+
+ // Now that it's sorted, ensure it's unique.
+ assert(std::adjacent_find(Table.begin(), Table.end()) == Table.end() &&
+ "Memory unfolding table is not unique!");
+ }
+
+ void addTableEntry(const X86MemoryFoldTableEntry &Entry,
+ uint16_t ExtraFlags) {
+ // NOTE: This swaps KeyOp and DstOp so the table is keyed by the memory opcode.
+ if ((Entry.Flags & TB_NO_REVERSE) == 0)
+ Table.push_back({Entry.DstOp, Entry.KeyOp,
+ static_cast<uint16_t>(Entry.Flags | ExtraFlags) });
+ }
+};
+}
+
+static ManagedStatic<X86MemUnfoldTable> MemUnfoldTable;
+
+const X86MemoryFoldTableEntry *
+llvm::lookupUnfoldTable(unsigned MemOp) {
+ auto &Table = MemUnfoldTable->Table;
+ auto I = llvm::lower_bound(Table, MemOp);
+ if (I != Table.end() && I->KeyOp == MemOp)
+ return &*I;
+ return nullptr;
+}
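To make the KeyOp/DstOp swap in addTableEntry concrete, here is a hedged round-trip sketch for the VMULPDZrr broadcast entry listed earlier. It assumes that entry is not marked TB_NO_REVERSE and that the code is compiled inside the X86 target where the generated opcode enum is visible; it is not an actual LLVM unit test.

    #include "MCTargetDesc/X86MCTargetDesc.h" // X86::* opcode enum
    #include "X86InstrFoldTables.h"
    using namespace llvm;

    // Forward entry in BroadcastFoldTable2:
    //   { X86::VMULPDZrr, X86::VMULPDZrmb, TB_BCAST_SD }
    // The unfold table re-keys it by the memory opcode and adds
    // TB_INDEX_2 | TB_FOLDED_LOAD | TB_FOLDED_BCAST.
    bool checkVMulPdBroadcastRoundTrip() {
      const X86MemoryFoldTableEntry *U = lookupUnfoldTable(X86::VMULPDZrmb);
      return U && U->DstOp == X86::VMULPDZrr &&
             (U->Flags & TB_INDEX_MASK) == TB_INDEX_2 &&
             (U->Flags & TB_FOLDED_LOAD) != 0 &&
             (U->Flags & TB_FOLDED_BCAST) != 0;
    }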
+
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.h b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.h
new file mode 100644
index 000000000000..b7aca27ab2bb
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.h
@@ -0,0 +1,97 @@
+//===-- X86InstrFoldTables.h - X86 Instruction Folding Tables ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the interface to query the X86 memory folding tables.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86INSTRFOLDTABLES_H
+#define LLVM_LIB_TARGET_X86_X86INSTRFOLDTABLES_H
+
+#include <cstdint>
+
+namespace llvm {
+
+enum {
+ // Select which memory operand is being unfolded.
+ // (stored in bits 0 - 2)
+ TB_INDEX_0 = 0,
+ TB_INDEX_1 = 1,
+ TB_INDEX_2 = 2,
+ TB_INDEX_3 = 3,
+ TB_INDEX_4 = 4,
+ TB_INDEX_MASK = 0x7,
+
+ // Do not insert the reverse map (MemOp -> RegOp) into the table.
+ // This may be needed because there is a many -> one mapping.
+ TB_NO_REVERSE = 1 << 3,
+
+ // Do not insert the forward map (RegOp -> MemOp) into the table.
+ // This is needed for Native Client, which prohibits branch
+ // instructions from using a memory operand.
+ TB_NO_FORWARD = 1 << 4,
+
+ TB_FOLDED_LOAD = 1 << 5,
+ TB_FOLDED_STORE = 1 << 6,
+ TB_FOLDED_BCAST = 1 << 7,
+
+ // Minimum alignment required for load/store.
+ // Used for RegOp->MemOp conversion. Encoded as Log2(Align) + 1 to allow 0
+ // to mean align of 0.
+ // (stored in bits 8 - 11)
+ TB_ALIGN_SHIFT = 8,
+ TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT,
+ TB_ALIGN_16 = 5 << TB_ALIGN_SHIFT,
+ TB_ALIGN_32 = 6 << TB_ALIGN_SHIFT,
+ TB_ALIGN_64 = 7 << TB_ALIGN_SHIFT,
+ TB_ALIGN_MASK = 0xf << TB_ALIGN_SHIFT,
+
+ // Broadcast type.
+ // (stored in bits 12 - 13)
+ TB_BCAST_TYPE_SHIFT = 12,
+ TB_BCAST_D = 0 << TB_BCAST_TYPE_SHIFT,
+ TB_BCAST_Q = 1 << TB_BCAST_TYPE_SHIFT,
+ TB_BCAST_SS = 2 << TB_BCAST_TYPE_SHIFT,
+ TB_BCAST_SD = 3 << TB_BCAST_TYPE_SHIFT,
+ TB_BCAST_MASK = 0x3 << TB_BCAST_TYPE_SHIFT,
+
+ // Unused bits 14-15
+};
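A hypothetical decoder for the flag word, shown only to illustrate the bit layout above; it is not part of the LLVM sources and assumes the TB_* enumerators are in scope (e.g. inside namespace llvm).

    #include <cstdint>

    struct DecodedFoldFlags {
      unsigned MemOperandIndex;  // TB_INDEX_* value (bits 0-2)
      bool FoldedLoad, FoldedStore, FoldedBroadcast;
      unsigned MinAlign;         // required alignment in bytes, 0 if none
      unsigned BroadcastField;   // raw TB_BCAST_* value (bits 12-13)
    };

    DecodedFoldFlags decodeFoldFlags(uint16_t Flags) {
      DecodedFoldFlags D;
      D.MemOperandIndex = Flags & TB_INDEX_MASK;
      D.FoldedLoad      = (Flags & TB_FOLDED_LOAD) != 0;
      D.FoldedStore     = (Flags & TB_FOLDED_STORE) != 0;
      D.FoldedBroadcast = (Flags & TB_FOLDED_BCAST) != 0;
      unsigned AlignBits = (Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
      D.MinAlign        = AlignBits ? 1u << (AlignBits - 1) : 0; // Log2(Align) + 1
      D.BroadcastField  = Flags & TB_BCAST_MASK;
      return D;
    }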
+
+// This struct is used for both the folding and unfolding tables. The KeyOp
+// is used to determine the sorting order.
+struct X86MemoryFoldTableEntry {
+ uint16_t KeyOp;
+ uint16_t DstOp;
+ uint16_t Flags;
+
+ bool operator<(const X86MemoryFoldTableEntry &RHS) const {
+ return KeyOp < RHS.KeyOp;
+ }
+ bool operator==(const X86MemoryFoldTableEntry &RHS) const {
+ return KeyOp == RHS.KeyOp;
+ }
+ friend bool operator<(const X86MemoryFoldTableEntry &TE, unsigned Opcode) {
+ return TE.KeyOp < Opcode;
+ }
+};
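The asymmetric friend operator< is what lets llvm::lower_bound (and plain std::lower_bound) search a sorted entry array directly with a raw opcode, as lookupFoldTableImpl does in the .cpp file above. A self-contained sketch with stand-in names:

    #include <algorithm>
    #include <cstdint>

    struct Entry {
      uint16_t KeyOp, DstOp, Flags;
      friend bool operator<(const Entry &TE, unsigned Opcode) {
        return TE.KeyOp < Opcode; // the only direction lower_bound needs
      }
    };

    // The array must be sorted by KeyOp, as the asserts in the .cpp enforce.
    const Entry *lookup(const Entry *First, const Entry *Last, unsigned Opcode) {
      const Entry *I = std::lower_bound(First, Last, Opcode);
      return (I != Last && I->KeyOp == Opcode) ? I : nullptr;
    }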
+
+// Look up the memory folding table entry for folding a load and a store into
+// operand 0.
+const X86MemoryFoldTableEntry *lookupTwoAddrFoldTable(unsigned RegOp);
+
+// Look up the memory folding table entry for folding a load or store with
+// operand OpNum.
+const X86MemoryFoldTableEntry *lookupFoldTable(unsigned RegOp, unsigned OpNum);
+
+// Look up the memory unfolding table entry for this instruction.
+const X86MemoryFoldTableEntry *lookupUnfoldTable(unsigned MemOp);
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td
new file mode 100644
index 000000000000..686b19fc0a6c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td
@@ -0,0 +1,1011 @@
+//===-- X86InstrFormats.td - X86 Instruction Formats -------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+// Format specifies the encoding used by the instruction. This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<7> val> {
+ bits<7> Value = val;
+}
+
+def Pseudo : Format<0>;
+def RawFrm : Format<1>;
+def AddRegFrm : Format<2>;
+def RawFrmMemOffs : Format<3>;
+def RawFrmSrc : Format<4>;
+def RawFrmDst : Format<5>;
+def RawFrmDstSrc : Format<6>;
+def RawFrmImm8 : Format<7>;
+def RawFrmImm16 : Format<8>;
+def AddCCFrm : Format<9>;
+def PrefixByte : Format<10>;
+def MRMr0 : Format<21>;
+def MRMSrcMemFSIB : Format<22>;
+def MRMDestMemFSIB : Format<23>;
+def MRMDestMem : Format<24>;
+def MRMSrcMem : Format<25>;
+def MRMSrcMem4VOp3 : Format<26>;
+def MRMSrcMemOp4 : Format<27>;
+def MRMSrcMemCC : Format<28>;
+def MRMXmCC: Format<30>;
+def MRMXm : Format<31>;
+def MRM0m : Format<32>; def MRM1m : Format<33>; def MRM2m : Format<34>;
+def MRM3m : Format<35>; def MRM4m : Format<36>; def MRM5m : Format<37>;
+def MRM6m : Format<38>; def MRM7m : Format<39>;
+def MRMDestReg : Format<40>;
+def MRMSrcReg : Format<41>;
+def MRMSrcReg4VOp3 : Format<42>;
+def MRMSrcRegOp4 : Format<43>;
+def MRMSrcRegCC : Format<44>;
+def MRMXrCC: Format<46>;
+def MRMXr : Format<47>;
+def MRM0r : Format<48>; def MRM1r : Format<49>; def MRM2r : Format<50>;
+def MRM3r : Format<51>; def MRM4r : Format<52>; def MRM5r : Format<53>;
+def MRM6r : Format<54>; def MRM7r : Format<55>;
+def MRM0X : Format<56>; def MRM1X : Format<57>; def MRM2X : Format<58>;
+def MRM3X : Format<59>; def MRM4X : Format<60>; def MRM5X : Format<61>;
+def MRM6X : Format<62>; def MRM7X : Format<63>;
+def MRM_C0 : Format<64>; def MRM_C1 : Format<65>; def MRM_C2 : Format<66>;
+def MRM_C3 : Format<67>; def MRM_C4 : Format<68>; def MRM_C5 : Format<69>;
+def MRM_C6 : Format<70>; def MRM_C7 : Format<71>; def MRM_C8 : Format<72>;
+def MRM_C9 : Format<73>; def MRM_CA : Format<74>; def MRM_CB : Format<75>;
+def MRM_CC : Format<76>; def MRM_CD : Format<77>; def MRM_CE : Format<78>;
+def MRM_CF : Format<79>; def MRM_D0 : Format<80>; def MRM_D1 : Format<81>;
+def MRM_D2 : Format<82>; def MRM_D3 : Format<83>; def MRM_D4 : Format<84>;
+def MRM_D5 : Format<85>; def MRM_D6 : Format<86>; def MRM_D7 : Format<87>;
+def MRM_D8 : Format<88>; def MRM_D9 : Format<89>; def MRM_DA : Format<90>;
+def MRM_DB : Format<91>; def MRM_DC : Format<92>; def MRM_DD : Format<93>;
+def MRM_DE : Format<94>; def MRM_DF : Format<95>; def MRM_E0 : Format<96>;
+def MRM_E1 : Format<97>; def MRM_E2 : Format<98>; def MRM_E3 : Format<99>;
+def MRM_E4 : Format<100>; def MRM_E5 : Format<101>; def MRM_E6 : Format<102>;
+def MRM_E7 : Format<103>; def MRM_E8 : Format<104>; def MRM_E9 : Format<105>;
+def MRM_EA : Format<106>; def MRM_EB : Format<107>; def MRM_EC : Format<108>;
+def MRM_ED : Format<109>; def MRM_EE : Format<110>; def MRM_EF : Format<111>;
+def MRM_F0 : Format<112>; def MRM_F1 : Format<113>; def MRM_F2 : Format<114>;
+def MRM_F3 : Format<115>; def MRM_F4 : Format<116>; def MRM_F5 : Format<117>;
+def MRM_F6 : Format<118>; def MRM_F7 : Format<119>; def MRM_F8 : Format<120>;
+def MRM_F9 : Format<121>; def MRM_FA : Format<122>; def MRM_FB : Format<123>;
+def MRM_FC : Format<124>; def MRM_FD : Format<125>; def MRM_FE : Format<126>;
+def MRM_FF : Format<127>;
+
+// ImmType - This specifies the immediate type used by an instruction. This is
+// part of the ad-hoc solution used to emit machine instruction encodings by our
+// machine code emitter.
+class ImmType<bits<4> val> {
+ bits<4> Value = val;
+}
+def NoImm : ImmType<0>;
+def Imm8 : ImmType<1>;
+def Imm8PCRel : ImmType<2>;
+def Imm8Reg : ImmType<3>; // Register encoded in [7:4].
+def Imm16 : ImmType<4>;
+def Imm16PCRel : ImmType<5>;
+def Imm32 : ImmType<6>;
+def Imm32PCRel : ImmType<7>;
+def Imm32S : ImmType<8>;
+def Imm64 : ImmType<9>;
+
+// FPFormat - This specifies what form this FP instruction has. This is used by
+// the Floating-Point stackifier pass.
+class FPFormat<bits<3> val> {
+ bits<3> Value = val;
+}
+def NotFP : FPFormat<0>;
+def ZeroArgFP : FPFormat<1>;
+def OneArgFP : FPFormat<2>;
+def OneArgFPRW : FPFormat<3>;
+def TwoArgFP : FPFormat<4>;
+def CompareFP : FPFormat<5>;
+def CondMovFP : FPFormat<6>;
+def SpecialFP : FPFormat<7>;
+
+// Class specifying the SSE execution domain, used by the SSEDomainFix pass.
+// Keep in sync with tables in X86InstrInfo.cpp.
+class Domain<bits<2> val> {
+ bits<2> Value = val;
+}
+def GenericDomain : Domain<0>;
+def SSEPackedSingle : Domain<1>;
+def SSEPackedDouble : Domain<2>;
+def SSEPackedInt : Domain<3>;
+
+// Class specifying the vector form used to decompress the 8-bit
+// compressed displacement (CDisp8).
+class CD8VForm<bits<3> val> {
+ bits<3> Value = val;
+}
+def CD8VF : CD8VForm<0>; // v := VL
+def CD8VH : CD8VForm<1>; // v := VL/2
+def CD8VQ : CD8VForm<2>; // v := VL/4
+def CD8VO : CD8VForm<3>; // v := VL/8
+// The tuple (subvector) forms.
+def CD8VT1 : CD8VForm<4>; // v := 1
+def CD8VT2 : CD8VForm<5>; // v := 2
+def CD8VT4 : CD8VForm<6>; // v := 4
+def CD8VT8 : CD8VForm<7>; // v := 8
+
+// Class specifying the prefix used as an opcode extension.
+class Prefix<bits<3> val> {
+ bits<3> Value = val;
+}
+def NoPrfx : Prefix<0>;
+def PD : Prefix<1>;
+def XS : Prefix<2>;
+def XD : Prefix<3>;
+def PS : Prefix<4>; // Similar to NoPrfx, but disassembler uses this to know
+ // that other instructions with this opcode use PD/XS/XD
+ // and if any of those is not supported they shouldn't
+ // decode to this instruction. e.g. ANDSS/ANDSD don't
+ // exist, but the 0xf2/0xf3 encoding shouldn't
+ // disassemble to ANDPS.
+
+// Class specifying the opcode map.
+class Map<bits<3> val> {
+ bits<3> Value = val;
+}
+def OB : Map<0>;
+def TB : Map<1>;
+def T8 : Map<2>;
+def TA : Map<3>;
+def XOP8 : Map<4>;
+def XOP9 : Map<5>;
+def XOPA : Map<6>;
+def ThreeDNow : Map<7>;
+
+// Class specifying the encoding
+class Encoding<bits<2> val> {
+ bits<2> Value = val;
+}
+def EncNormal : Encoding<0>;
+def EncVEX : Encoding<1>;
+def EncXOP : Encoding<2>;
+def EncEVEX : Encoding<3>;
+
+// Operand size for encodings that change based on mode.
+class OperandSize<bits<2> val> {
+ bits<2> Value = val;
+}
+def OpSizeFixed : OperandSize<0>; // Never needs a 0x66 prefix.
+def OpSize16 : OperandSize<1>; // Needs 0x66 prefix in 32-bit mode.
+def OpSize32 : OperandSize<2>; // Needs 0x66 prefix in 16-bit mode.
+
+// Address size for encodings that change based on mode.
+class AddressSize<bits<2> val> {
+ bits<2> Value = val;
+}
+def AdSizeX : AddressSize<0>; // Address size determined using addr operand.
+def AdSize16 : AddressSize<1>; // Encodes a 16-bit address.
+def AdSize32 : AddressSize<2>; // Encodes a 32-bit address.
+def AdSize64 : AddressSize<3>; // Encodes a 64-bit address.
+
+// Prefix byte classes which are used to indicate to the ad-hoc machine code
+// emitter that various prefix bytes are required.
+class OpSize16 { OperandSize OpSize = OpSize16; }
+class OpSize32 { OperandSize OpSize = OpSize32; }
+class AdSize16 { AddressSize AdSize = AdSize16; }
+class AdSize32 { AddressSize AdSize = AdSize32; }
+class AdSize64 { AddressSize AdSize = AdSize64; }
+class REX_W { bit hasREX_WPrefix = 1; }
+class LOCK { bit hasLockPrefix = 1; }
+class REP { bit hasREPPrefix = 1; }
+class TB { Map OpMap = TB; }
+class T8 { Map OpMap = T8; }
+class TA { Map OpMap = TA; }
+class XOP8 { Map OpMap = XOP8; Prefix OpPrefix = PS; }
+class XOP9 { Map OpMap = XOP9; Prefix OpPrefix = PS; }
+class XOPA { Map OpMap = XOPA; Prefix OpPrefix = PS; }
+class ThreeDNow { Map OpMap = ThreeDNow; }
+class OBXS { Prefix OpPrefix = XS; }
+class PS : TB { Prefix OpPrefix = PS; }
+class PD : TB { Prefix OpPrefix = PD; }
+class XD : TB { Prefix OpPrefix = XD; }
+class XS : TB { Prefix OpPrefix = XS; }
+class T8PS : T8 { Prefix OpPrefix = PS; }
+class T8PD : T8 { Prefix OpPrefix = PD; }
+class T8XD : T8 { Prefix OpPrefix = XD; }
+class T8XS : T8 { Prefix OpPrefix = XS; }
+class TAPS : TA { Prefix OpPrefix = PS; }
+class TAPD : TA { Prefix OpPrefix = PD; }
+class TAXD : TA { Prefix OpPrefix = XD; }
+class TAXS : TA { Prefix OpPrefix = XS; }
+class VEX { Encoding OpEnc = EncVEX; }
+class VEX_W { bit HasVEX_W = 1; }
+class VEX_WIG { bit IgnoresVEX_W = 1; }
+// Special version of VEX_W that can be changed to VEX.W==0 for EVEX2VEX.
+class VEX_W1X { bit HasVEX_W = 1; bit EVEX_W1_VEX_W0 = 1; }
+class VEX_4V : VEX { bit hasVEX_4V = 1; }
+class VEX_L { bit hasVEX_L = 1; }
+class VEX_LIG { bit ignoresVEX_L = 1; }
+class EVEX { Encoding OpEnc = EncEVEX; }
+class EVEX_4V : EVEX { bit hasVEX_4V = 1; }
+class EVEX_K { bit hasEVEX_K = 1; }
+class EVEX_KZ : EVEX_K { bit hasEVEX_Z = 1; }
+class EVEX_B { bit hasEVEX_B = 1; }
+class EVEX_RC { bit hasEVEX_RC = 1; }
+class EVEX_V512 { bit hasEVEX_L2 = 1; bit hasVEX_L = 0; }
+class EVEX_V256 { bit hasEVEX_L2 = 0; bit hasVEX_L = 1; }
+class EVEX_V128 { bit hasEVEX_L2 = 0; bit hasVEX_L = 0; }
+class NOTRACK { bit hasNoTrackPrefix = 1; }
+class SIMD_EXC { list<Register> Uses = [MXCSR]; bit mayRaiseFPException = 1; }
+
+// Specify AVX512 8-bit compressed displacement encoding based on the vector
+// element size in bits (8, 16, 32, 64) and the CDisp8 form.
+class EVEX_CD8<int esize, CD8VForm form> {
+ int CD8_EltSize = !srl(esize, 3);
+ bits<3> CD8_Form = form.Value;
+}
+
+class XOP { Encoding OpEnc = EncXOP; }
+class XOP_4V : XOP { bit hasVEX_4V = 1; }
+
+// Specify the alternative register form instruction to replace the current
+// instruction in case it was picked during generation of memory folding tables
+class FoldGenData<string _RegisterForm> {
+ string FoldGenRegForm = _RegisterForm;
+}
+
+// Provide a specific instruction to be used by the EVEX2VEX conversion.
+class EVEX2VEXOverride<string VEXInstrName> {
+ string EVEX2VEXOverride = VEXInstrName;
+}
+
+// Mark the instruction as "illegal to memory fold/unfold"
+class NotMemoryFoldable { bit isMemoryFoldable = 0; }
+
+// Prevent EVEX->VEX conversion from considering this instruction.
+class NotEVEX2VEXConvertible { bit notEVEX2VEXConvertible = 1; }
+
+// Force the instruction to use VEX encoding.
+class ExplicitVEXPrefix { bit ExplicitVEXPrefix = 1; }
+
+class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
+ string AsmStr, Domain d = GenericDomain>
+ : Instruction {
+ let Namespace = "X86";
+
+ bits<8> Opcode = opcod;
+ Format Form = f;
+ bits<7> FormBits = Form.Value;
+ ImmType ImmT = i;
+
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ string AsmString = AsmStr;
+
+ // If this is a pseudo instruction, mark it isCodeGenOnly.
+ let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo");
+
+ //
+ // Attributes specific to X86 instructions...
+ //
+ bit ForceDisassemble = 0; // Force instruction to disassemble even though it's
+ // isCodeGenOnly. Needed to hide an ambiguous
+ // AsmString from the parser, but still disassemble.
+
+ OperandSize OpSize = OpSizeFixed; // Does this instruction's encoding change
+ // based on operand size of the mode?
+ bits<2> OpSizeBits = OpSize.Value;
+ AddressSize AdSize = AdSizeX; // Does this instruction's encoding change
+ // based on address size of the mode?
+ bits<2> AdSizeBits = AdSize.Value;
+
+ Prefix OpPrefix = NoPrfx; // Which prefix byte does this inst have?
+ bits<3> OpPrefixBits = OpPrefix.Value;
+ Map OpMap = OB; // Which opcode map does this inst have?
+ bits<3> OpMapBits = OpMap.Value;
+ bit hasREX_WPrefix = 0; // Does this inst require the REX.W prefix?
+ FPFormat FPForm = NotFP; // What flavor of FP instruction is this?
+ bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix?
+ Domain ExeDomain = d;
+ bit hasREPPrefix = 0; // Does this inst have a REP prefix?
+ Encoding OpEnc = EncNormal; // Encoding used by this instruction
+ bits<2> OpEncBits = OpEnc.Value;
+ bit HasVEX_W = 0; // Does this inst set the VEX_W field?
+ bit IgnoresVEX_W = 0; // Does this inst ignore VEX_W field?
+ bit EVEX_W1_VEX_W0 = 0; // This EVEX inst with VEX.W==1 can become a VEX
+ // instruction with VEX.W == 0.
+ bit hasVEX_4V = 0; // Does this inst require the VEX.VVVV field?
+ bit hasVEX_L = 0; // Does this inst use large (256-bit) registers?
+ bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit
+ bit hasEVEX_K = 0; // Does this inst require masking?
+ bit hasEVEX_Z = 0; // Does this inst set the EVEX_Z field?
+ bit hasEVEX_L2 = 0; // Does this inst set the EVEX_L2 field?
+ bit hasEVEX_B = 0; // Does this inst set the EVEX_B field?
+ bits<3> CD8_Form = 0; // Compressed disp8 form - vector-width.
+ // Declare it int rather than bits<4> so that all bits are defined when
+ // assigning to bits<7>.
+ int CD8_EltSize = 0; // Compressed disp8 form - element-size in bytes.
+ bit hasEVEX_RC = 0; // Explicitly specified rounding control in FP instruction.
+ bit hasNoTrackPrefix = 0; // Does this inst have a 0x3E (NoTrack) prefix?
+
+ // Vector size in bytes.
+ bits<7> VectSize = !if(hasEVEX_L2, 64, !if(hasVEX_L, 32, 16));
+
+ // The scaling factor for AVX512's compressed displacement is either
+ // - the size of a power-of-two number of elements or
+ // - the size of a single element for broadcasts or
+ // - the total vector size divided by a power-of-two number.
+ // Possible values are: 0 (non-AVX512 inst), 1, 2, 4, 8, 16, 32 and 64.
+ bits<7> CD8_Scale = !if (!eq (OpEnc.Value, EncEVEX.Value),
+ !if (CD8_Form{2},
+ !shl(CD8_EltSize, CD8_Form{1-0}),
+ !if (hasEVEX_B,
+ CD8_EltSize,
+ !srl(VectSize, CD8_Form{1-0}))), 0);
+
+ // Used by the memory folding table generation (TableGen backend) to point to
+ // an alternative instruction to replace the current one if it was picked.
+ string FoldGenRegForm = ?;
+
+ // Used to provide an explicit VEX instruction for the EVEX2VEX conversion.
+ string EVEX2VEXOverride = ?;
+
+ bit isMemoryFoldable = 1; // Is it allowed to memory fold/unfold this instruction?
+ bit notEVEX2VEXConvertible = 0; // Prevent EVEX->VEX conversion.
+ bit ExplicitVEXPrefix = 0; // Force the instruction to use VEX encoding.
+
+ // TSFlags layout should be kept in sync with X86BaseInfo.h.
+ let TSFlags{6-0} = FormBits;
+ let TSFlags{8-7} = OpSizeBits;
+ let TSFlags{10-9} = AdSizeBits;
+ // No need for 3rd bit, we don't need to distinguish NoPrfx from PS.
+ let TSFlags{12-11} = OpPrefixBits{1-0};
+ let TSFlags{15-13} = OpMapBits;
+ let TSFlags{16} = hasREX_WPrefix;
+ let TSFlags{20-17} = ImmT.Value;
+ let TSFlags{23-21} = FPForm.Value;
+ let TSFlags{24} = hasLockPrefix;
+ let TSFlags{25} = hasREPPrefix;
+ let TSFlags{27-26} = ExeDomain.Value;
+ let TSFlags{29-28} = OpEncBits;
+ let TSFlags{37-30} = Opcode;
+ // Currently no need for second bit in TSFlags - W Ignore is equivalent to 0.
+ let TSFlags{38} = HasVEX_W;
+ let TSFlags{39} = hasVEX_4V;
+ let TSFlags{40} = hasVEX_L;
+ let TSFlags{41} = hasEVEX_K;
+ let TSFlags{42} = hasEVEX_Z;
+ let TSFlags{43} = hasEVEX_L2;
+ let TSFlags{44} = hasEVEX_B;
+ // If we run out of TSFlags bits, it's possible to encode this in 3 bits.
+ let TSFlags{51-45} = CD8_Scale;
+ let TSFlags{52} = hasEVEX_RC;
+ let TSFlags{53} = hasNoTrackPrefix;
+ let TSFlags{54} = ExplicitVEXPrefix;
+}
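Taken together with CD8_Form and CD8_EltSize, the CD8_Scale expression above reduces to simple arithmetic. Below is a hedged C++ transcription, purely illustrative: the function name and parameters are mine, and the real value is computed by TableGen at build time.

    // Mirrors the TableGen !if chain that computes CD8_Scale.
    unsigned cd8Scale(bool IsEVEX, unsigned CD8Form, unsigned CD8EltSize,
                      bool HasEVEX_B, unsigned VectSizeBytes) {
      if (!IsEVEX)
        return 0;                              // non-AVX512 instruction
      if (CD8Form & 4)                         // tuple forms CD8VT1/2/4/8
        return CD8EltSize << (CD8Form & 3);    // size of 1, 2, 4 or 8 elements
      if (HasEVEX_B)                           // embedded broadcast
        return CD8EltSize;                     // a single element
      return VectSizeBytes >> (CD8Form & 3);   // CD8VF/VH/VQ/VO: VL, VL/2, VL/4, VL/8
    }
    // Example: a 512-bit instruction with EVEX_CD8<32, CD8VF> and no EVEX.B gives
    // cd8Scale(true, 0, 4, false, 64) == 64, so disp8 == 1 encodes a 64-byte offset.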
+
+class PseudoI<dag oops, dag iops, list<dag> pattern>
+ : X86Inst<0, Pseudo, NoImm, oops, iops, ""> {
+ let Pattern = pattern;
+}
+
+class I<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, Domain d = GenericDomain>
+ : X86Inst<o, f, NoImm, outs, ins, asm, d> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii8<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, Domain d = GenericDomain>
+ : X86Inst<o, f, Imm8, outs, ins, asm, d> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii8Reg<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, Domain d = GenericDomain>
+ : X86Inst<o, f, Imm8Reg, outs, ins, asm, d> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : X86Inst<o, f, Imm8PCRel, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : X86Inst<o, f, Imm16, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : X86Inst<o, f, Imm32, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii32S<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : X86Inst<o, f, Imm32S, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class Ii64<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : X86Inst<o, f, Imm64, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : X86Inst<o, f, Imm16PCRel, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : X86Inst<o, f, Imm32PCRel, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+// FPStack Instruction Templates:
+// FPI - Floating Point Instruction template.
+class FPI<bits<8> o, Format F, dag outs, dag ins, string asm>
+ : I<o, F, outs, ins, asm, []> {
+ let Defs = [FPSW];
+}
+
+// FpI_ - Floating Point Pseudo Instruction template. Not Predicated.
+class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern>
+ : PseudoI<outs, ins, pattern> {
+ let FPForm = fp;
+ let Defs = [FPSW];
+}
+
+// Templates for instructions that use a 16- or 32-bit segmented address as
+// their only operand: lcall (FAR CALL) and ljmp (FAR JMP)
+//
+// Iseg16 - 16-bit segment selector, 16-bit offset
+// Iseg32 - 16-bit segment selector, 32-bit offset
+
+class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : X86Inst<o, f, Imm16, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : X86Inst<o, f, Imm32, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+// SI - SSE 1 & 2 scalar instructions
+class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, Domain d = GenericDomain>
+ : I<o, F, outs, ins, asm, pattern, d> {
+ let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+ !if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX],
+ !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
+ !if(!eq(OpPrefix.Value, XD.Value), [UseSSE2],
+ !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
+ [UseSSE1])))));
+
+ // AVX instructions have a 'v' prefix in the mnemonic
+ let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+ !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+ asm));
+}
+
+// SI_Int - SSE 1 & 2 scalar intrinsics - vex form available on AVX512
+class SI_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, Domain d = GenericDomain>
+ : I<o, F, outs, ins, asm, pattern, d> {
+ let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+ !if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX],
+ !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
+ !if(!eq(OpPrefix.Value, XD.Value), [UseSSE2],
+ !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
+ [UseSSE1])))));
+
+ // AVX instructions have a 'v' prefix in the mnemonic
+ let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+ !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+ asm));
+}
+// SIi8 - SSE 1 & 2 scalar instructions - vex form available on AVX512
+class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern> {
+ let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+ !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+ !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
+ [UseSSE2])));
+
+ // AVX instructions have a 'v' prefix in the mnemonic
+ let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+ !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+ asm));
+}
+
+// PI - SSE 1 & 2 packed instructions
+class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
+ Domain d>
+ : I<o, F, outs, ins, asm, pattern, d> {
+ let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+ !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+ !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
+ [UseSSE1])));
+
+ // AVX instructions have a 'v' prefix in the mnemonic
+ let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+ !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+ asm));
+}
+
+// MMXPI - SSE 1 & 2 packed instructions with MMX operands
+class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
+ Domain d>
+ : I<o, F, outs, ins, asm, pattern, d> {
+ let Predicates = !if(!eq(OpPrefix.Value, PD.Value), [HasMMX, HasSSE2],
+ [HasMMX, HasSSE1]);
+}
+
+// PIi8 - SSE 1 & 2 packed instructions with immediate
+class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, Domain d>
+ : Ii8<o, F, outs, ins, asm, pattern, d> {
+ let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+ !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+ !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
+ [UseSSE1])));
+
+ // AVX instructions have a 'v' prefix in the mnemonic
+ let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+ !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+ asm));
+}
+
+// SSE1 Instruction Templates:
+//
+// SSI - SSE1 instructions with XS prefix.
+// PSI - SSE1 instructions with PS prefix.
+// PSIi8 - SSE1 instructions with ImmT == Imm8 and PS prefix.
+// VSSI - SSE1 instructions with XS prefix in AVX form.
+// VPSI - SSE1 instructions with PS prefix in AVX form, packed single.
+
+class SSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE1]>;
+class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE1]>;
+class PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, PS,
+ Requires<[UseSSE1]>;
+class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedSingle>, PS,
+ Requires<[UseSSE1]>;
+class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XS,
+ Requires<[HasAVX]>;
+class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedSingle>, PS,
+ Requires<[HasAVX]>;
+
+// SSE2 Instruction Templates:
+//
+// SDI - SSE2 instructions with XD prefix.
+// SDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix.
+// S2SI - SSE2 instructions with XS prefix.
+// S2SIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix.
+// PDI - SSE2 instructions with PD prefix, packed double domain.
+// PDIi8 - SSE2 instructions with ImmT == Imm8 and PD prefix.
+// VSDI - SSE2 scalar instructions with XD prefix in AVX form.
+// VPDI - SSE2 vector instructions with PD prefix in AVX form,
+// packed double domain.
+// VS2I - SSE2 scalar instructions with PD prefix in AVX form.
+// S2I - SSE2 scalar instructions with PD prefix.
+// MMXSDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix as well as
+// MMX operands.
+// MMXS2SIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as
+// MMX operands.
+
+class SDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, XD, Requires<[UseSSE2]>;
+class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[UseSSE2]>;
+class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>;
+class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>;
+class PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD,
+ Requires<[UseSSE2]>;
+class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD,
+ Requires<[UseSSE2]>;
+class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XD,
+ Requires<[UseAVX]>;
+class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XS,
+ Requires<[HasAVX]>;
+class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedDouble>,
+ PD, Requires<[HasAVX]>;
+class VS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern>, PD,
+ Requires<[UseAVX]>;
+class S2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, PD, Requires<[UseSSE2]>;
+class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasMMX, HasSSE2]>;
+class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasMMX, HasSSE2]>;
+
+// SSE3 Instruction Templates:
+//
+// S3I - SSE3 instructions with PD prefixes.
+// S3SI - SSE3 instructions with XS prefix.
+// S3DI - SSE3 instructions with XD prefix.
+
+class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, XS,
+ Requires<[UseSSE3]>;
+class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, XD,
+ Requires<[UseSSE3]>;
+class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD,
+ Requires<[UseSSE3]>;
+
+
+// SSSE3 Instruction Templates:
+//
+// SS38I - SSSE3 instructions with T8 prefix.
+// SS3AI - SSSE3 instructions with TA prefix.
+// MMXSS38I - SSSE3 instructions with T8 prefix and MMX operands.
+// MMXSS3AI - SSSE3 instructions with TA prefix and MMX operands.
+//
+// Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit version
+// uses the MMX registers. The 64-bit versions are grouped with the MMX
+// classes. They need to be enabled even if AVX is enabled.
+
+class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
+ Requires<[UseSSSE3]>;
+class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
+ Requires<[UseSSSE3]>;
+class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PS,
+ Requires<[HasMMX, HasSSSE3]>;
+class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPS,
+ Requires<[HasMMX, HasSSSE3]>;
+
+// SSE4.1 Instruction Templates:
+//
+// SS48I - SSE 4.1 instructions with T8 prefix.
+// SS4AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8.
+//
+class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
+ Requires<[UseSSE41]>;
+class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
+ Requires<[UseSSE41]>;
+
+// SSE4.2 Instruction Templates:
+//
+// SS428I - SSE 4.2 instructions with T8 prefix.
+class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
+ Requires<[UseSSE42]>;
+
+// SS42FI - SSE 4.2 instructions with T8XD prefix.
+// NOTE: 'HasSSE42' is used because SS42FI is only used for CRC32 insns.
+class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, T8XD, Requires<[HasSSE42]>;
+
+// SS42AI - SSE 4.2 instructions with TA prefix.
+class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
+ Requires<[UseSSE42]>;
+
+// AVX Instruction Templates:
+// Instructions introduced in AVX (no SSE equivalent forms)
+//
+// AVX8I - AVX instructions with T8PD prefix.
+// AVXAIi8 - AVX instructions with TAPD prefix and ImmT = Imm8.
+class AVX8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
+ Requires<[HasAVX]>;
+class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
+ Requires<[HasAVX]>;
+
+// AVX2 Instruction Templates:
+// Instructions introduced in AVX2 (no SSE equivalent forms)
+//
+// AVX28I - AVX2 instructions with T8PD prefix.
+// AVX2AIi8 - AVX2 instructions with TAPD prefix and ImmT = Imm8.
+class AVX28I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
+ Requires<[HasAVX2]>;
+class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
+ Requires<[HasAVX2]>;
+
+
+// AVX-512 Instruction Templates:
+// Instructions introduced in AVX-512 (no SSE equivalent forms)
+//
+// AVX5128I - AVX-512 instructions with T8PD prefix.
+// AVX512AIi8 - AVX-512 instructions with TAPD prefix and ImmT = Imm8.
+// AVX512PDI - AVX-512 instructions with PD, double packed.
+// AVX512PSI - AVX-512 instructions with PS, single packed.
+// AVX512XS8I - AVX-512 instructions with T8 and XS prefixes.
+// AVX512XSI - AVX-512 instructions with XS prefix, generic domain.
+// AVX512BI - AVX-512 instructions with PD, int packed domain.
+// AVX512SI - AVX-512 scalar instructions with PD prefix.
+
+class AVX5128I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
+ Requires<[HasAVX512]>;
+class AVX5128IBase : T8PD {
+ Domain ExeDomain = SSEPackedInt;
+}
+class AVX512XS8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8XS,
+ Requires<[HasAVX512]>;
+class AVX512XSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, XS,
+ Requires<[HasAVX512]>;
+class AVX512XDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, XD,
+ Requires<[HasAVX512]>;
+class AVX512BI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, PD,
+ Requires<[HasAVX512]>;
+class AVX512BIBase : PD {
+ Domain ExeDomain = SSEPackedInt;
+}
+class AVX512BIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, PD,
+ Requires<[HasAVX512]>;
+class AVX512BIi8Base : PD {
+ Domain ExeDomain = SSEPackedInt;
+ ImmType ImmT = Imm8;
+}
+class AVX512XSIi8Base : XS {
+ Domain ExeDomain = SSEPackedInt;
+ ImmType ImmT = Imm8;
+}
+class AVX512XDIi8Base : XD {
+ Domain ExeDomain = SSEPackedInt;
+ ImmType ImmT = Imm8;
+}
+class AVX512PSIi8Base : PS {
+ Domain ExeDomain = SSEPackedSingle;
+ ImmType ImmT = Imm8;
+}
+class AVX512PDIi8Base : PD {
+ Domain ExeDomain = SSEPackedDouble;
+ ImmType ImmT = Imm8;
+}
+class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
+ Requires<[HasAVX512]>;
+class AVX512AIi8Base : TAPD {
+ ImmType ImmT = Imm8;
+}
+class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>,
+ Requires<[HasAVX512]>;
+class AVX512PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD,
+ Requires<[HasAVX512]>;
+class AVX512PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, PS,
+ Requires<[HasAVX512]>;
+class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, Domain d>
+ : Ii8<o, F, outs, ins, asm, pattern, d>, Requires<[HasAVX512]>;
+class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, Domain d>
+ : I<o, F, outs, ins, asm, pattern, d>, Requires<[HasAVX512]>;
+class AVX512FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern>
+ : I<o, F, outs, ins, asm, pattern>, T8PD,
+ EVEX_4V, Requires<[HasAVX512]>;
+class AVX512FMA3Base : T8PD, EVEX_4V;
+
+class AVX512<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern>
+ : I<o, F, outs, ins, asm, pattern>, Requires<[HasAVX512]>;
+
+// AES Instruction Templates:
+//
+// AES8I
+// These use the same encoding as the SSE4.2 T8 and TA encodings.
+class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
+ Requires<[NoAVX, HasAES]>;
+
+class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
+ Requires<[NoAVX, HasAES]>;
+
+// PCLMUL Instruction Templates
+class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD;
+
+// FMA3 Instruction Templates
+class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern>
+ : I<o, F, outs, ins, asm, pattern>, T8PD,
+ VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoVLX]>;
+class FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern>
+ : I<o, F, outs, ins, asm, pattern>, T8PD,
+ VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoAVX512]>;
+class FMA3S_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern>
+ : I<o, F, outs, ins, asm, pattern>, T8PD,
+ VEX_4V, FMASC, Requires<[HasFMA, NoAVX512]>;
+
+// FMA4 Instruction Templates
+class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern>
+ : Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD,
+ VEX_4V, FMASC, Requires<[HasFMA4, NoVLX]>;
+class FMA4S<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern>
+ : Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD,
+ VEX_4V, FMASC, Requires<[HasFMA4, NoAVX512]>;
+class FMA4S_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern>
+ : Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD,
+ VEX_4V, FMASC, Requires<[HasFMA4]>;
+
+// XOP 2, 3 and 4 Operand Instruction Template
+class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>,
+ XOP9, Requires<[HasXOP]>;
+
+// XOP 2 and 3 Operand Instruction Templates with imm byte
+class IXOPi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedDouble>,
+ XOP8, Requires<[HasXOP]>;
+// XOP 4 Operand Instruction Templates with imm byte
+class IXOPi8Reg<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8Reg<o, F, outs, ins, asm, pattern, SSEPackedDouble>,
+ XOP8, Requires<[HasXOP]>;
+
+// XOP 5 operand instruction (VEX encoding!)
+class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern>
+ : Ii8Reg<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
+ VEX_4V, Requires<[HasXOP]>;
+
+// X86-64 Instruction templates...
+//
+
+class RI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, REX_W;
+class RIi8 <bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, REX_W;
+class RIi16 <bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii16<o, F, outs, ins, asm, pattern>, REX_W;
+class RIi32 <bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii32<o, F, outs, ins, asm, pattern>, REX_W;
+class RIi32S <bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii32S<o, F, outs, ins, asm, pattern>, REX_W;
+class RIi64<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii64<o, F, outs, ins, asm, pattern>, REX_W;
+
+class RS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : S2I<o, F, outs, ins, asm, pattern>, REX_W;
+class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : VS2I<o, F, outs, ins, asm, pattern>, VEX_W;
+
+// MMX Instruction templates
+//
+
+// MMXI    - MMX instructions with PS prefix.
+// MMXI32  - MMX instructions with PS prefix valid only in 32-bit mode.
+// MMXI64  - MMX instructions with PS prefix valid only in 64-bit mode.
+// MMXRI   - MMX instructions with PS prefix and REX.W.
+// MMX2I   - MMX / SSE2 instructions with PD prefix.
+// MMXIi8  - MMX instructions with ImmT == Imm8 and PS prefix.
+// MMXID - MMX instructions with XD prefix.
+// MMXIS - MMX instructions with XS prefix.
+class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, PS, Requires<[HasMMX]>;
+class MMXI32<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, PS, Requires<[HasMMX,Not64BitMode]>;
+class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, PS, Requires<[HasMMX,In64BitMode]>;
+class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, PS, REX_W, Requires<[HasMMX]>;
+class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, PD, Requires<[HasMMX]>;
+class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, PS, Requires<[HasMMX]>;
+class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasMMX]>;
+class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasMMX]>;
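+
+// Editorial sketch, not part of the upstream patch: a minimal example of how a
+// concrete MMX instruction definition might instantiate one of the templates
+// above. "MMX_EXAMPLE_EMMS" is a hypothetical name and the empty pattern list
+// is a placeholder.
+def MMX_EXAMPLE_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", []>;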
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
new file mode 100644
index 000000000000..777c5a158b4c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -0,0 +1,1195 @@
+//===-- X86InstrFragmentsSIMD.td - x86 SIMD ISA ------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides pattern fragments useful for SIMD instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MMX specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+// Low word of MMX to GPR.
+def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1,
+ [SDTCisVT<0, i32>, SDTCisVT<1, x86mmx>]>>;
+// GPR to low word of MMX.
+def MMX_X86movw2d : SDNode<"X86ISD::MMX_MOVW2D", SDTypeProfile<1, 1,
+ [SDTCisVT<0, x86mmx>, SDTCisVT<1, i32>]>>;
+
+//===----------------------------------------------------------------------===//
+// MMX Pattern Fragments
+//===----------------------------------------------------------------------===//
+
+def load_mmx : PatFrag<(ops node:$ptr), (x86mmx (load node:$ptr))>;
+
+//===----------------------------------------------------------------------===//
+// SSE specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>,
+ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i8>]>;
+
+def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>;
+def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>;
+def X86fmins : SDNode<"X86ISD::FMINS", SDTFPBinOp>;
+def X86fmaxs : SDNode<"X86ISD::FMAXS", SDTFPBinOp>;
+
+// Commutative and Associative FMIN and FMAX.
+def X86fminc : SDNode<"X86ISD::FMINC", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86fmaxc : SDNode<"X86ISD::FMAXC", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+
+def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp>;
+def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>;
+def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
+def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
+def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
+def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>;
+def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>;
+def X86comi : SDNode<"X86ISD::COMI", SDTX86FCmp>;
+def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86FCmp>;
+
+def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<0, 1>,
+ SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>;
+
+def X86pshufb : SDNode<"X86ISD::PSHUFB",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i8>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+def X86psadbw : SDNode<"X86ISD::PSADBW",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
+ SDTCVecEltisVT<1, i8>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>]>, [SDNPCommutative]>;
+def X86dbpsadbw : SDNode<"X86ISD::DBPSADBW",
+ SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, i8>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>>;
+def X86andnp : SDNode<"X86ISD::ANDNP",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+def X86multishift : SDNode<"X86ISD::MULTISHIFT",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisSameAs<1,2>]>>;
+def X86pextrb : SDNode<"X86ISD::PEXTRB",
+ SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v16i8>,
+ SDTCisVT<2, i8>]>>;
+def X86pextrw : SDNode<"X86ISD::PEXTRW",
+ SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v8i16>,
+ SDTCisVT<2, i8>]>>;
+def X86pinsrb : SDNode<"X86ISD::PINSRB",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i8>]>>;
+def X86pinsrw : SDNode<"X86ISD::PINSRW",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i8>]>>;
+def X86insertps : SDNode<"X86ISD::INSERTPS",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, v4f32>, SDTCisVT<3, i8>]>>;
+def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL",
+ SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
+
+def X86vzld : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86vextractst : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86VBroadcastld : SDNode<"X86ISD::VBROADCAST_LOAD", SDTLoad,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86SubVBroadcastld : SDNode<"X86ISD::SUBV_BROADCAST_LOAD", SDTLoad,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisOpSmallerThanOp<0, 1>]>;
+def SDTVmtrunc : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisOpSmallerThanOp<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCVecEltisVT<3, i1>,
+ SDTCisSameNumEltsAs<1, 3>]>;
+
+def X86vtrunc : SDNode<"X86ISD::VTRUNC", SDTVtrunc>;
+def X86vtruncs : SDNode<"X86ISD::VTRUNCS", SDTVtrunc>;
+def X86vtruncus : SDNode<"X86ISD::VTRUNCUS", SDTVtrunc>;
+def X86vmtrunc : SDNode<"X86ISD::VMTRUNC", SDTVmtrunc>;
+def X86vmtruncs : SDNode<"X86ISD::VMTRUNCS", SDTVmtrunc>;
+def X86vmtruncus : SDNode<"X86ISD::VMTRUNCUS", SDTVmtrunc>;
+
+def X86vfpext : SDNode<"X86ISD::VFPEXT",
+ SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f64>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCisSameSizeAs<0, 1>]>>;
+
+def X86strict_vfpext : SDNode<"X86ISD::STRICT_VFPEXT",
+ SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f64>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCisSameSizeAs<0, 1>]>,
+ [SDNPHasChain]>;
+
+def X86any_vfpext : PatFrags<(ops node:$src),
+ [(X86strict_vfpext node:$src),
+ (X86vfpext node:$src)]>;
+
+def X86vfpround: SDNode<"X86ISD::VFPROUND",
+ SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, f64>,
+ SDTCisOpSmallerThanOp<0, 1>]>>;
+
+def X86strict_vfpround: SDNode<"X86ISD::STRICT_VFPROUND",
+ SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, f64>,
+ SDTCisOpSmallerThanOp<0, 1>]>,
+ [SDNPHasChain]>;
+
+def X86any_vfpround : PatFrags<(ops node:$src),
+ [(X86strict_vfpround node:$src),
+ (X86vfpround node:$src)]>;
+
+def X86frounds : SDNode<"X86ISD::VFPROUNDS",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
+ SDTCisSameAs<0, 1>,
+ SDTCVecEltisVT<2, f64>,
+ SDTCisSameSizeAs<0, 2>]>>;
+
+def X86froundsRnd: SDNode<"X86ISD::VFPROUNDS_RND",
+ SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>,
+ SDTCisSameAs<0, 1>,
+ SDTCVecEltisVT<2, f64>,
+ SDTCisSameSizeAs<0, 2>,
+ SDTCisVT<3, i32>]>>;
+
+def X86fpexts : SDNode<"X86ISD::VFPEXTS",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>,
+ SDTCisSameAs<0, 1>,
+ SDTCVecEltisVT<2, f32>,
+ SDTCisSameSizeAs<0, 2>]>>;
+def X86fpextsSAE : SDNode<"X86ISD::VFPEXTS_SAE",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>,
+ SDTCisSameAs<0, 1>,
+ SDTCVecEltisVT<2, f32>,
+ SDTCisSameSizeAs<0, 2>]>>;
+
+def X86vmfpround: SDNode<"X86ISD::VMFPROUND",
+ SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, f64>,
+ SDTCisSameSizeAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCVecEltisVT<3, i1>,
+ SDTCisSameNumEltsAs<1, 3>]>>;
+
+def X86vshiftimm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i8>, SDTCisInt<0>]>;
+
+def X86vshldq : SDNode<"X86ISD::VSHLDQ", X86vshiftimm>;
+def X86vshrdq : SDNode<"X86ISD::VSRLDQ", X86vshiftimm>;
+def X86pcmpeq : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpgt : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>;
+
+def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>;
+def X86strict_cmpp : SDNode<"X86ISD::STRICT_CMPP", SDTX86VFCMP, [SDNPHasChain]>;
+def X86any_cmpp : PatFrags<(ops node:$src1, node:$src2, node:$src3),
+ [(X86strict_cmpp node:$src1, node:$src2, node:$src3),
+ (X86cmpp node:$src1, node:$src2, node:$src3)]>;
+
+def X86CmpMaskCC :
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
+ SDTCisVec<1>, SDTCisSameAs<2, 1>,
+ SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>]>;
+def X86MaskCmpMaskCC :
+ SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
+ SDTCisVec<1>, SDTCisSameAs<2, 1>,
+ SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>, SDTCisSameAs<4, 0>]>;
+def X86CmpMaskCCScalar :
+ SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisFP<1>, SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i8>]>;
+
+def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>;
+def X86cmpmm : SDNode<"X86ISD::CMPMM", X86MaskCmpMaskCC>;
+def X86strict_cmpm : SDNode<"X86ISD::STRICT_CMPM", X86CmpMaskCC, [SDNPHasChain]>;
+def X86any_cmpm : PatFrags<(ops node:$src1, node:$src2, node:$src3),
+ [(X86strict_cmpm node:$src1, node:$src2, node:$src3),
+ (X86cmpm node:$src1, node:$src2, node:$src3)]>;
+def X86cmpmmSAE : SDNode<"X86ISD::CMPMM_SAE", X86MaskCmpMaskCC>;
+def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>;
+def X86cmpmsSAE : SDNode<"X86ISD::FSETCCM_SAE", X86CmpMaskCCScalar>;
+
+def X86phminpos: SDNode<"X86ISD::PHMINPOS",
+ SDTypeProfile<1, 1, [SDTCisVT<0, v8i16>, SDTCisVT<1, v8i16>]>>;
+
+def X86vshiftuniform : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisVec<2>, SDTCisInt<0>,
+ SDTCisInt<2>]>;
+
+def X86vshl : SDNode<"X86ISD::VSHL", X86vshiftuniform>;
+def X86vsrl : SDNode<"X86ISD::VSRL", X86vshiftuniform>;
+def X86vsra : SDNode<"X86ISD::VSRA", X86vshiftuniform>;
+
+def X86vshiftvariable : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>, SDTCisInt<0>]>;
+
+def X86vshlv : SDNode<"X86ISD::VSHLV", X86vshiftvariable>;
+def X86vsrlv : SDNode<"X86ISD::VSRLV", X86vshiftvariable>;
+def X86vsrav : SDNode<"X86ISD::VSRAV", X86vshiftvariable>;
+
+def X86vshli : SDNode<"X86ISD::VSHLI", X86vshiftimm>;
+def X86vsrli : SDNode<"X86ISD::VSRLI", X86vshiftimm>;
+def X86vsrai : SDNode<"X86ISD::VSRAI", X86vshiftimm>;
+
+def X86kshiftl : SDNode<"X86ISD::KSHIFTL",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>,
+ SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i8>]>>;
+def X86kshiftr : SDNode<"X86ISD::KSHIFTR",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>,
+ SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i8>]>>;
+
+def X86kadd : SDNode<"X86ISD::KADD", SDTIntBinOp, [SDNPCommutative]>;
+
+def X86vrotli : SDNode<"X86ISD::VROTLI", X86vshiftimm>;
+def X86vrotri : SDNode<"X86ISD::VROTRI", X86vshiftimm>;
+
+def X86vpshl : SDNode<"X86ISD::VPSHL", X86vshiftvariable>;
+def X86vpsha : SDNode<"X86ISD::VPSHA", X86vshiftvariable>;
+
+def X86vpcom : SDNode<"X86ISD::VPCOM",
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisVT<3, i8>, SDTCisInt<0>]>>;
+def X86vpcomu : SDNode<"X86ISD::VPCOMU",
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisVT<3, i8>, SDTCisInt<0>]>>;
+def X86vpermil2 : SDNode<"X86ISD::VPERMIL2",
+ SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisFP<0>, SDTCisInt<3>,
+ SDTCisSameNumEltsAs<0, 3>,
+ SDTCisSameSizeAs<0,3>,
+ SDTCisVT<4, i8>]>>;
+def X86vpperm : SDNode<"X86ISD::VPPERM",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>, SDTCisSameAs<0, 3>]>>;
+
+def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+ SDTCisVec<1>,
+ SDTCisSameAs<2, 1>]>;
+
+def X86mulhrs : SDNode<"X86ISD::MULHRS", SDTIntBinOp, [SDNPCommutative]>;
+def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp, [SDNPCommutative]>;
+def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
+def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
+def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>;
+def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>;
+
+def X86movmsk : SDNode<"X86ISD::MOVMSK",
+ SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVec<1>]>>;
+
+def X86selects : SDNode<"X86ISD::SELECTS",
+ SDTypeProfile<1, 3, [SDTCisVT<1, v1i1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<2, 3>]>>;
+
+def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
+ SDTCisSameAs<0,1>,
+ SDTCisSameAs<1,2>]>,
+ [SDNPCommutative]>;
+def X86pmuldq : SDNode<"X86ISD::PMULDQ",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
+ SDTCisSameAs<0,1>,
+ SDTCisSameAs<1,2>]>,
+ [SDNPCommutative]>;
+
+def X86extrqi : SDNode<"X86ISD::EXTRQI",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i8>, SDTCisVT<3, i8>]>>;
+def X86insertqi : SDNode<"X86ISD::INSERTQI",
+ SDTypeProfile<1, 4, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisVT<3, i8>,
+ SDTCisVT<4, i8>]>>;
+
+// Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get
+// translated into one of the target nodes below during lowering.
+// Note: this is a work in progress...
+def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
+def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>;
+def SDTShuff2OpFP : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisFP<0>,
+ SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>;
+
+def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisFP<0>, SDTCisInt<2>,
+ SDTCisSameNumEltsAs<0,2>,
+ SDTCisSameSizeAs<0,2>]>;
+def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
+ SDTCisSameAs<0,1>, SDTCisVT<2, i8>]>;
+def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>;
+def SDTFPBinOpImm: SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>,
+ SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisVT<3, i32>]>;
+def SDTFPTernaryOpImm: SDTypeProfile<1, 4, [SDTCisFP<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisInt<3>,
+ SDTCisSameSizeAs<0, 3>,
+ SDTCisSameNumEltsAs<0, 3>,
+ SDTCisVT<4, i32>]>;
+def SDTFPUnaryOpImm: SDTypeProfile<1, 2, [SDTCisFP<0>,
+ SDTCisSameAs<0,1>,
+ SDTCisVT<2, i32>]>;
+
+def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>,
+ SDTCisInt<0>, SDTCisInt<1>]>;
+
+def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>;
+
+def SDTTernlog : SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisVec<0>,
+ SDTCisSameAs<0,1>, SDTCisSameAs<0,2>,
+ SDTCisSameAs<0,3>, SDTCisVT<4, i8>]>;
+
+def SDTFPBinOpRound : SDTypeProfile<1, 3, [ // fadd_round, fmul_round, etc.
+ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisVT<3, i32>]>;
+
+def SDTFPUnaryOpRound : SDTypeProfile<1, 2, [ // fsqrt_round, fgetexp_round, etc.
+ SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisVT<2, i32>]>;
+
+def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisSameAs<1,3>,
+ SDTCisFP<0>, SDTCisVT<4, i32>]>;
+
+def X86PAlignr : SDNode<"X86ISD::PALIGNR",
+ SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i8>,
+ SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisVT<3, i8>]>>;
+def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>;
+
+def X86VShld : SDNode<"X86ISD::VSHLD", SDTShuff3OpI>;
+def X86VShrd : SDNode<"X86ISD::VSHRD", SDTShuff3OpI>;
+def X86VShldv : SDNode<"X86ISD::VSHLDV",
+ SDTypeProfile<1, 3, [SDTCisVec<0>,
+ SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisSameAs<0,3>]>>;
+def X86VShrdv : SDNode<"X86ISD::VSHRDV",
+ SDTypeProfile<1, 3, [SDTCisVec<0>,
+ SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisSameAs<0,3>]>>;
+
+def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>;
+
+def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>;
+def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>;
+def X86PShuflw : SDNode<"X86ISD::PSHUFLW", SDTShuff2OpI>;
+
+def X86Shufp : SDNode<"X86ISD::SHUFP", SDTShuff3OpI>;
+def X86Shuf128 : SDNode<"X86ISD::SHUF128", SDTShuff3OpI>;
+
+def X86Movddup : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>;
+def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>;
+def X86Movsldup : SDNode<"X86ISD::MOVSLDUP", SDTShuff1Op>;
+
+def X86Movsd : SDNode<"X86ISD::MOVSD",
+ SDTypeProfile<1, 2, [SDTCisVT<0, v2f64>,
+ SDTCisVT<1, v2f64>,
+ SDTCisVT<2, v2f64>]>>;
+def X86Movss : SDNode<"X86ISD::MOVSS",
+ SDTypeProfile<1, 2, [SDTCisVT<0, v4f32>,
+ SDTCisVT<1, v4f32>,
+ SDTCisVT<2, v4f32>]>>;
+
+def X86Movlhps : SDNode<"X86ISD::MOVLHPS",
+ SDTypeProfile<1, 2, [SDTCisVT<0, v4f32>,
+ SDTCisVT<1, v4f32>,
+ SDTCisVT<2, v4f32>]>>;
+def X86Movhlps : SDNode<"X86ISD::MOVHLPS",
+ SDTypeProfile<1, 2, [SDTCisVT<0, v4f32>,
+ SDTCisVT<1, v4f32>,
+ SDTCisVT<2, v4f32>]>>;
+
+def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<0>,
+ SDTCisVec<1>, SDTCisInt<1>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>,
+ SDTCisOpSmallerThanOp<0, 1>]>;
+def X86Packss : SDNode<"X86ISD::PACKSS", SDTPack>;
+def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>;
+
+def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>;
+def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
+
+def X86vpmaddubsw : SDNode<"X86ISD::VPMADDUBSW",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, i8>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>]>>;
+def X86vpmaddwd : SDNode<"X86ISD::VPMADDWD",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i32>,
+ SDTCVecEltisVT<1, i16>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>]>,
+ [SDNPCommutative]>;
+
+def X86VPermilpv : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>;
+def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>;
+def X86VPermv : SDNode<"X86ISD::VPERMV",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<1>,
+ SDTCisSameNumEltsAs<0,1>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>;
+def X86VPermt2 : SDNode<"X86ISD::VPERMV3",
+ SDTypeProfile<1, 3, [SDTCisVec<0>,
+ SDTCisSameAs<0,1>, SDTCisInt<2>,
+ SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2>,
+ SDTCisSameSizeAs<0,2>,
+ SDTCisSameAs<0,3>]>, []>;
+
+def X86vpternlog : SDNode<"X86ISD::VPTERNLOG", SDTTernlog>;
+
+def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
+
+def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPTernaryOpImm>;
+def X86VFixupimmSAE : SDNode<"X86ISD::VFIXUPIMM_SAE", SDTFPTernaryOpImm>;
+def X86VFixupimms : SDNode<"X86ISD::VFIXUPIMMS", SDTFPTernaryOpImm>;
+def X86VFixupimmSAEs : SDNode<"X86ISD::VFIXUPIMMS_SAE", SDTFPTernaryOpImm>;
+def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImm>;
+def X86VRangeSAE : SDNode<"X86ISD::VRANGE_SAE", SDTFPBinOpImm>;
+def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImm>;
+def X86VReduceSAE : SDNode<"X86ISD::VREDUCE_SAE", SDTFPUnaryOpImm>;
+def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImm>;
+def X86strict_VRndScale : SDNode<"X86ISD::STRICT_VRNDSCALE", SDTFPUnaryOpImm,
+ [SDNPHasChain]>;
+def X86any_VRndScale : PatFrags<(ops node:$src1, node:$src2),
+ [(X86strict_VRndScale node:$src1, node:$src2),
+ (X86VRndScale node:$src1, node:$src2)]>;
+
+def X86VRndScaleSAE: SDNode<"X86ISD::VRNDSCALE_SAE", SDTFPUnaryOpImm>;
+def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImm>;
+def X86VGetMantSAE : SDNode<"X86ISD::VGETMANT_SAE", SDTFPUnaryOpImm>;
+def X86Vfpclass : SDNode<"X86ISD::VFPCLASS",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>,
+ SDTCisFP<1>,
+ SDTCisSameNumEltsAs<0,1>,
+ SDTCisVT<2, i32>]>, []>;
+def X86Vfpclasss : SDNode<"X86ISD::VFPCLASSS",
+ SDTypeProfile<1, 2, [SDTCisVT<0, v1i1>,
+ SDTCisFP<1>, SDTCisVT<2, i32>]>,[]>;
+
+def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
+def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
+
+def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
+def X86Blendv : SDNode<"X86ISD::BLENDV",
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<2, 3>,
+ SDTCisSameNumEltsAs<0, 1>,
+ SDTCisSameSizeAs<0, 1>]>>;
+
+def X86Addsub : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>;
+
+def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>;
+def X86fadds : SDNode<"X86ISD::FADDS", SDTFPBinOp>;
+def X86faddRnds : SDNode<"X86ISD::FADDS_RND", SDTFPBinOpRound>;
+def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>;
+def X86fsubs : SDNode<"X86ISD::FSUBS", SDTFPBinOp>;
+def X86fsubRnds : SDNode<"X86ISD::FSUBS_RND", SDTFPBinOpRound>;
+def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>;
+def X86fmuls : SDNode<"X86ISD::FMULS", SDTFPBinOp>;
+def X86fmulRnds : SDNode<"X86ISD::FMULS_RND", SDTFPBinOpRound>;
+def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>;
+def X86fdivs : SDNode<"X86ISD::FDIVS", SDTFPBinOp>;
+def X86fdivRnds : SDNode<"X86ISD::FDIVS_RND", SDTFPBinOpRound>;
+def X86fmaxSAE : SDNode<"X86ISD::FMAX_SAE", SDTFPBinOp>;
+def X86fmaxSAEs : SDNode<"X86ISD::FMAXS_SAE", SDTFPBinOp>;
+def X86fminSAE : SDNode<"X86ISD::FMIN_SAE", SDTFPBinOp>;
+def X86fminSAEs : SDNode<"X86ISD::FMINS_SAE", SDTFPBinOp>;
+def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOp>;
+def X86scalefRnd : SDNode<"X86ISD::SCALEF_RND", SDTFPBinOpRound>;
+def X86scalefs : SDNode<"X86ISD::SCALEFS", SDTFPBinOp>;
+def X86scalefsRnd: SDNode<"X86ISD::SCALEFS_RND", SDTFPBinOpRound>;
+def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>;
+def X86fsqrts : SDNode<"X86ISD::FSQRTS", SDTFPBinOp>;
+def X86fsqrtRnds : SDNode<"X86ISD::FSQRTS_RND", SDTFPBinOpRound>;
+def X86fgetexp : SDNode<"X86ISD::FGETEXP", SDTFPUnaryOp>;
+def X86fgetexpSAE : SDNode<"X86ISD::FGETEXP_SAE", SDTFPUnaryOp>;
+def X86fgetexps : SDNode<"X86ISD::FGETEXPS", SDTFPBinOp>;
+def X86fgetexpSAEs : SDNode<"X86ISD::FGETEXPS_SAE", SDTFPBinOp>;
+
+def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86strict_Fnmadd : SDNode<"X86ISD::STRICT_FNMADD", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>;
+def X86any_Fnmadd : PatFrags<(ops node:$src1, node:$src2, node:$src3),
+ [(X86strict_Fnmadd node:$src1, node:$src2, node:$src3),
+ (X86Fnmadd node:$src1, node:$src2, node:$src3)]>;
+def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86strict_Fmsub : SDNode<"X86ISD::STRICT_FMSUB", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>;
+def X86any_Fmsub : PatFrags<(ops node:$src1, node:$src2, node:$src3),
+ [(X86strict_Fmsub node:$src1, node:$src2, node:$src3),
+ (X86Fmsub node:$src1, node:$src2, node:$src3)]>;
+def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86strict_Fnmsub : SDNode<"X86ISD::STRICT_FNMSUB", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>;
+def X86any_Fnmsub : PatFrags<(ops node:$src1, node:$src2, node:$src3),
+ [(X86strict_Fnmsub node:$src1, node:$src2, node:$src3),
+ (X86Fnmsub node:$src1, node:$src2, node:$src3)]>;
+def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFPTernaryOp, [SDNPCommutative]>;
+
+def X86FmaddRnd : SDNode<"X86ISD::FMADD_RND", SDTFmaRound, [SDNPCommutative]>;
+def X86FnmaddRnd : SDNode<"X86ISD::FNMADD_RND", SDTFmaRound, [SDNPCommutative]>;
+def X86FmsubRnd : SDNode<"X86ISD::FMSUB_RND", SDTFmaRound, [SDNPCommutative]>;
+def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound, [SDNPCommutative]>;
+def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound, [SDNPCommutative]>;
+def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound, [SDNPCommutative]>;
+
+def X86vp2intersect : SDNode<"X86ISD::VP2INTERSECT",
+ SDTypeProfile<1, 2, [SDTCisVT<0, untyped>,
+ SDTCisVec<1>, SDTCisSameAs<1, 2>]>>;
+
+def SDTIFma : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
+def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma, [SDNPCommutative]>;
+def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTIFma, [SDNPCommutative]>;
+
+def X86rsqrt14 : SDNode<"X86ISD::RSQRT14", SDTFPUnaryOp>;
+def X86rcp14 : SDNode<"X86ISD::RCP14", SDTFPUnaryOp>;
+
+// VNNI
+def SDTVnni : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
+def X86Vpdpbusd : SDNode<"X86ISD::VPDPBUSD", SDTVnni>;
+def X86Vpdpbusds : SDNode<"X86ISD::VPDPBUSDS", SDTVnni>;
+def X86Vpdpwssd : SDNode<"X86ISD::VPDPWSSD", SDTVnni>;
+def X86Vpdpwssds : SDNode<"X86ISD::VPDPWSSDS", SDTVnni>;
+
+def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOp>;
+def X86rsqrt28SAE: SDNode<"X86ISD::RSQRT28_SAE", SDTFPUnaryOp>;
+def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOp>;
+def X86rcp28SAE : SDNode<"X86ISD::RCP28_SAE", SDTFPUnaryOp>;
+def X86exp2 : SDNode<"X86ISD::EXP2", SDTFPUnaryOp>;
+def X86exp2SAE : SDNode<"X86ISD::EXP2_SAE", SDTFPUnaryOp>;
+
+def X86rsqrt14s : SDNode<"X86ISD::RSQRT14S", SDTFPBinOp>;
+def X86rcp14s : SDNode<"X86ISD::RCP14S", SDTFPBinOp>;
+def X86rsqrt28s : SDNode<"X86ISD::RSQRT28S", SDTFPBinOp>;
+def X86rsqrt28SAEs : SDNode<"X86ISD::RSQRT28S_SAE", SDTFPBinOp>;
+def X86rcp28s : SDNode<"X86ISD::RCP28S", SDTFPBinOp>;
+def X86rcp28SAEs : SDNode<"X86ISD::RCP28S_SAE", SDTFPBinOp>;
+def X86Ranges : SDNode<"X86ISD::VRANGES", SDTFPBinOpImm>;
+def X86RndScales : SDNode<"X86ISD::VRNDSCALES", SDTFPBinOpImm>;
+def X86Reduces : SDNode<"X86ISD::VREDUCES", SDTFPBinOpImm>;
+def X86GetMants : SDNode<"X86ISD::VGETMANTS", SDTFPBinOpImm>;
+def X86RangesSAE : SDNode<"X86ISD::VRANGES_SAE", SDTFPBinOpImm>;
+def X86RndScalesSAE : SDNode<"X86ISD::VRNDSCALES_SAE", SDTFPBinOpImm>;
+def X86ReducesSAE : SDNode<"X86ISD::VREDUCES_SAE", SDTFPBinOpImm>;
+def X86GetMantsSAE : SDNode<"X86ISD::VGETMANTS_SAE", SDTFPBinOpImm>;
+
+def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisVec<1>,
+ SDTCisSameAs<0, 2>, SDTCVecEltisVT<3, i1>,
+ SDTCisSameNumEltsAs<0, 3>]>, []>;
+def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisVec<1>,
+ SDTCisSameAs<0, 2>, SDTCVecEltisVT<3, i1>,
+ SDTCisSameNumEltsAs<0, 3>]>, []>;
+
+// vpshufbitqmb
+def X86Vpshufbitqmb : SDNode<"X86ISD::VPSHUFBITQMB",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisSameAs<1,2>,
+ SDTCVecEltisVT<0,i1>,
+ SDTCisSameNumEltsAs<0,1>]>>;
+
+def SDTintToFP: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisFP<0>,
+ SDTCisSameAs<0,1>, SDTCisInt<2>]>;
+def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>,
+ SDTCisSameAs<0,1>, SDTCisInt<2>,
+ SDTCisVT<3, i32>]>;
+
+def SDTFloatToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCisFP<1>]>;
+def SDTFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCisFP<1>,
+ SDTCisVT<2, i32>]>;
+def SDTSFloatToInt: SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisFP<1>,
+ SDTCisVec<1>]>;
+def SDTSFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisFP<1>,
+ SDTCisVec<1>, SDTCisVT<2, i32>]>;
+
+def SDTVintToFP: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<0>, SDTCisInt<1>]>;
+def SDTVintToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<0>, SDTCisInt<1>,
+ SDTCisVT<2, i32>]>;
+
+// Scalar
+def X86SintToFp : SDNode<"X86ISD::SCALAR_SINT_TO_FP", SDTintToFP>;
+def X86SintToFpRnd : SDNode<"X86ISD::SCALAR_SINT_TO_FP_RND", SDTintToFPRound>;
+def X86UintToFp : SDNode<"X86ISD::SCALAR_UINT_TO_FP", SDTintToFP>;
+def X86UintToFpRnd : SDNode<"X86ISD::SCALAR_UINT_TO_FP_RND", SDTintToFPRound>;
+
+def X86cvtts2Int : SDNode<"X86ISD::CVTTS2SI", SDTSFloatToInt>;
+def X86cvtts2UInt : SDNode<"X86ISD::CVTTS2UI", SDTSFloatToInt>;
+def X86cvtts2IntSAE : SDNode<"X86ISD::CVTTS2SI_SAE", SDTSFloatToInt>;
+def X86cvtts2UIntSAE : SDNode<"X86ISD::CVTTS2UI_SAE", SDTSFloatToInt>;
+
+def X86cvts2si : SDNode<"X86ISD::CVTS2SI", SDTSFloatToInt>;
+def X86cvts2usi : SDNode<"X86ISD::CVTS2UI", SDTSFloatToInt>;
+def X86cvts2siRnd : SDNode<"X86ISD::CVTS2SI_RND", SDTSFloatToIntRnd>;
+def X86cvts2usiRnd : SDNode<"X86ISD::CVTS2UI_RND", SDTSFloatToIntRnd>;
+
+// Vector with rounding mode
+
+// cvtt fp-to-int nodes
+def X86cvttp2siSAE : SDNode<"X86ISD::CVTTP2SI_SAE", SDTFloatToInt>;
+def X86cvttp2uiSAE : SDNode<"X86ISD::CVTTP2UI_SAE", SDTFloatToInt>;
+
+def X86VSintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTVintToFPRound>;
+def X86VUintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTVintToFPRound>;
+
+// cvt fp-to-int nodes
+def X86cvtp2IntRnd : SDNode<"X86ISD::CVTP2SI_RND", SDTFloatToIntRnd>;
+def X86cvtp2UIntRnd : SDNode<"X86ISD::CVTP2UI_RND", SDTFloatToIntRnd>;
+
+// Vector without rounding mode
+
+// cvtt fp-to-int nodes
+def X86cvttp2si : SDNode<"X86ISD::CVTTP2SI", SDTFloatToInt>;
+def X86cvttp2ui : SDNode<"X86ISD::CVTTP2UI", SDTFloatToInt>;
+def X86strict_cvttp2si : SDNode<"X86ISD::STRICT_CVTTP2SI", SDTFloatToInt, [SDNPHasChain]>;
+def X86strict_cvttp2ui : SDNode<"X86ISD::STRICT_CVTTP2UI", SDTFloatToInt, [SDNPHasChain]>;
+def X86any_cvttp2si : PatFrags<(ops node:$src),
+ [(X86strict_cvttp2si node:$src),
+ (X86cvttp2si node:$src)]>;
+def X86any_cvttp2ui : PatFrags<(ops node:$src),
+ [(X86strict_cvttp2ui node:$src),
+ (X86cvttp2ui node:$src)]>;
+
+def X86VSintToFP : SDNode<"X86ISD::CVTSI2P", SDTVintToFP>;
+def X86VUintToFP : SDNode<"X86ISD::CVTUI2P", SDTVintToFP>;
+def X86strict_VSintToFP : SDNode<"X86ISD::STRICT_CVTSI2P", SDTVintToFP, [SDNPHasChain]>;
+def X86strict_VUintToFP : SDNode<"X86ISD::STRICT_CVTUI2P", SDTVintToFP, [SDNPHasChain]>;
+def X86any_VSintToFP : PatFrags<(ops node:$src),
+ [(X86strict_VSintToFP node:$src),
+ (X86VSintToFP node:$src)]>;
+def X86any_VUintToFP : PatFrags<(ops node:$src),
+ [(X86strict_VUintToFP node:$src),
+ (X86VUintToFP node:$src)]>;
+
+
+// cvt fp-to-int nodes
+def X86cvtp2Int : SDNode<"X86ISD::CVTP2SI", SDTFloatToInt>;
+def X86cvtp2UInt : SDNode<"X86ISD::CVTP2UI", SDTFloatToInt>;
+
+
+// Masked versions of above
+def SDTMVintToFP: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<0>, SDTCisInt<1>,
+ SDTCisSameSizeAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCVecEltisVT<3, i1>,
+ SDTCisSameNumEltsAs<1, 3>]>;
+def SDTMFloatToInt: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCisFP<1>,
+ SDTCisSameSizeAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCVecEltisVT<3, i1>,
+ SDTCisSameNumEltsAs<1, 3>]>;
+
+def X86VMSintToFP : SDNode<"X86ISD::MCVTSI2P", SDTMVintToFP>;
+def X86VMUintToFP : SDNode<"X86ISD::MCVTUI2P", SDTMVintToFP>;
+
+def X86mcvtp2Int : SDNode<"X86ISD::MCVTP2SI", SDTMFloatToInt>;
+def X86mcvtp2UInt : SDNode<"X86ISD::MCVTP2UI", SDTMFloatToInt>;
+def X86mcvttp2si : SDNode<"X86ISD::MCVTTP2SI", SDTMFloatToInt>;
+def X86mcvttp2ui : SDNode<"X86ISD::MCVTTP2UI", SDTMFloatToInt>;
+
+def SDTcvtph2ps : SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, i16>]>;
+def X86cvtph2ps : SDNode<"X86ISD::CVTPH2PS", SDTcvtph2ps>;
+def X86strict_cvtph2ps : SDNode<"X86ISD::STRICT_CVTPH2PS", SDTcvtph2ps,
+ [SDNPHasChain]>;
+def X86any_cvtph2ps : PatFrags<(ops node:$src),
+ [(X86strict_cvtph2ps node:$src),
+ (X86cvtph2ps node:$src)]>;
+
+def X86cvtph2psSAE : SDNode<"X86ISD::CVTPH2PS_SAE", SDTcvtph2ps>;
+
+def SDTcvtps2ph : SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCisVT<2, i32>]>;
+def X86cvtps2ph : SDNode<"X86ISD::CVTPS2PH", SDTcvtps2ph>;
+def X86strict_cvtps2ph : SDNode<"X86ISD::STRICT_CVTPS2PH", SDTcvtps2ph,
+ [SDNPHasChain]>;
+def X86any_cvtps2ph : PatFrags<(ops node:$src1, node:$src2),
+ [(X86strict_cvtps2ph node:$src1, node:$src2),
+ (X86cvtps2ph node:$src1, node:$src2)]>;
+
+def X86mcvtps2ph : SDNode<"X86ISD::MCVTPS2PH",
+ SDTypeProfile<1, 4, [SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCisVT<2, i32>,
+ SDTCisSameAs<0, 3>,
+ SDTCVecEltisVT<4, i1>,
+ SDTCisSameNumEltsAs<1, 4>]> >;
+def X86vfpextSAE : SDNode<"X86ISD::VFPEXT_SAE",
+ SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f64>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCisOpSmallerThanOp<1, 0>]>>;
+def X86vfproundRnd: SDNode<"X86ISD::VFPROUND_RND",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, f64>,
+ SDTCisOpSmallerThanOp<0, 1>,
+ SDTCisVT<2, i32>]>>;
+
+// cvt fp to bfloat16
+def X86cvtne2ps2bf16 : SDNode<"X86ISD::CVTNE2PS2BF16",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>]>>;
+def X86mcvtneps2bf16 : SDNode<"X86ISD::MCVTNEPS2BF16",
+ SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCisSameAs<0, 2>,
+ SDTCVecEltisVT<3, i1>,
+ SDTCisSameNumEltsAs<1, 3>]>>;
+def X86cvtneps2bf16 : SDNode<"X86ISD::CVTNEPS2BF16",
+ SDTypeProfile<1, 1, [SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, f32>]>>;
+def X86dpbf16ps : SDNode<"X86ISD::DPBF16PS",
+ SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>,
+ SDTCisSameAs<0,1>,
+ SDTCVecEltisVT<2, i32>,
+ SDTCisSameAs<2,3>]>>;
+
+// Galois field arithmetic
+def X86GF2P8affineinvqb : SDNode<"X86ISD::GF2P8AFFINEINVQB", SDTBlend>;
+def X86GF2P8affineqb : SDNode<"X86ISD::GF2P8AFFINEQB", SDTBlend>;
+def X86GF2P8mulb : SDNode<"X86ISD::GF2P8MULB", SDTIntBinOp>;
+
+def SDTX86MaskedStore: SDTypeProfile<0, 3, [ // masked store
+ SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2>
+]>;
+
+//===----------------------------------------------------------------------===//
+// SSE pattern fragments
+//===----------------------------------------------------------------------===//
+
+// 128-bit load pattern fragments
+def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
+def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
+def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
+def loadv4i32 : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>;
+def loadv8i16 : PatFrag<(ops node:$ptr), (v8i16 (load node:$ptr))>;
+def loadv16i8 : PatFrag<(ops node:$ptr), (v16i8 (load node:$ptr))>;
+
+// 256-bit load pattern fragments
+def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
+def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
+def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
+def loadv8i32 : PatFrag<(ops node:$ptr), (v8i32 (load node:$ptr))>;
+def loadv16i16 : PatFrag<(ops node:$ptr), (v16i16 (load node:$ptr))>;
+def loadv32i8 : PatFrag<(ops node:$ptr), (v32i8 (load node:$ptr))>;
+
+// 512-bit load pattern fragments
+def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>;
+def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>;
+def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>;
+def loadv16i32 : PatFrag<(ops node:$ptr), (v16i32 (load node:$ptr))>;
+def loadv32i16 : PatFrag<(ops node:$ptr), (v32i16 (load node:$ptr))>;
+def loadv64i8 : PatFrag<(ops node:$ptr), (v64i8 (load node:$ptr))>;
+
+// 128-/256-/512-bit extload pattern fragments
+def extloadv2f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>;
+def extloadv4f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>;
+def extloadv8f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>;
+
+// Like 'store', but always requires vector size alignment.
+def alignedstore : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ auto *St = cast<StoreSDNode>(N);
+ return St->getAlignment() >= St->getMemoryVT().getStoreSize();
+}]>;
+
+// Like 'load', but always requires vector size alignment.
+def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ auto *Ld = cast<LoadSDNode>(N);
+ return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();
+}]>;
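+
+// Editorial sketch, not part of the upstream patch: how the aligned fragments
+// above might appear in a selection pattern. "MYMOVAPSmr" is a hypothetical
+// instruction name standing in for an aligned vector store.
+def : Pat<(alignedstore (v4f32 VR128:$src), addr:$dst),
+          (MYMOVAPSmr addr:$dst, VR128:$src)>;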
+
+// 128-bit aligned load pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
+def alignedloadv4f32 : PatFrag<(ops node:$ptr),
+ (v4f32 (alignedload node:$ptr))>;
+def alignedloadv2f64 : PatFrag<(ops node:$ptr),
+ (v2f64 (alignedload node:$ptr))>;
+def alignedloadv2i64 : PatFrag<(ops node:$ptr),
+ (v2i64 (alignedload node:$ptr))>;
+def alignedloadv4i32 : PatFrag<(ops node:$ptr),
+ (v4i32 (alignedload node:$ptr))>;
+def alignedloadv8i16 : PatFrag<(ops node:$ptr),
+ (v8i16 (alignedload node:$ptr))>;
+def alignedloadv16i8 : PatFrag<(ops node:$ptr),
+ (v16i8 (alignedload node:$ptr))>;
+
+// 256-bit aligned load pattern fragments
+// NOTE: all 256-bit integer vector loads are promoted to v4i64
+def alignedloadv8f32 : PatFrag<(ops node:$ptr),
+ (v8f32 (alignedload node:$ptr))>;
+def alignedloadv4f64 : PatFrag<(ops node:$ptr),
+ (v4f64 (alignedload node:$ptr))>;
+def alignedloadv4i64 : PatFrag<(ops node:$ptr),
+ (v4i64 (alignedload node:$ptr))>;
+def alignedloadv8i32 : PatFrag<(ops node:$ptr),
+ (v8i32 (alignedload node:$ptr))>;
+def alignedloadv16i16 : PatFrag<(ops node:$ptr),
+ (v16i16 (alignedload node:$ptr))>;
+def alignedloadv32i8 : PatFrag<(ops node:$ptr),
+ (v32i8 (alignedload node:$ptr))>;
+
+// 512-bit aligned load pattern fragments
+def alignedloadv16f32 : PatFrag<(ops node:$ptr),
+ (v16f32 (alignedload node:$ptr))>;
+def alignedloadv8f64 : PatFrag<(ops node:$ptr),
+ (v8f64 (alignedload node:$ptr))>;
+def alignedloadv8i64 : PatFrag<(ops node:$ptr),
+ (v8i64 (alignedload node:$ptr))>;
+def alignedloadv16i32 : PatFrag<(ops node:$ptr),
+ (v16i32 (alignedload node:$ptr))>;
+def alignedloadv32i16 : PatFrag<(ops node:$ptr),
+ (v32i16 (alignedload node:$ptr))>;
+def alignedloadv64i8 : PatFrag<(ops node:$ptr),
+ (v64i8 (alignedload node:$ptr))>;
+
+// Like 'load', but uses special alignment checks suitable for use in
+// memory operands in most SSE instructions, which are required to
+// be naturally aligned on some targets but not on others. If the subtarget
+// allows unaligned accesses, match any load, though this may require
+// setting a feature bit in the processor (on startup, for example).
+// Opteron 10h and later implement such a feature.
+def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ auto *Ld = cast<LoadSDNode>(N);
+ return Subtarget->hasSSEUnalignedMem() ||
+ Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();
+}]>;
+
+// 128-bit memop pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
+def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
+def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
+def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
+def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>;
+def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop node:$ptr))>;
+def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>;
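+
+// Editorial sketch, not part of the upstream patch: a typical use of the memop
+// fragments, folding a load whose alignment requirement depends on the
+// subtarget into an arithmetic instruction. "MYADDPSrm" is a hypothetical
+// instruction name.
+def : Pat<(v4f32 (fadd VR128:$src1, (memopv4f32 addr:$src2))),
+          (MYADDPSrm VR128:$src1, addr:$src2)>;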
+
+// 128-bit bitconvert pattern fragments
+def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
+def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
+def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>;
+def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>;
+def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>;
+def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>;
+
+// 256-bit bitconvert pattern fragments
+def bc_v32i8 : PatFrag<(ops node:$in), (v32i8 (bitconvert node:$in))>;
+def bc_v16i16 : PatFrag<(ops node:$in), (v16i16 (bitconvert node:$in))>;
+def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>;
+def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>;
+def bc_v8f32 : PatFrag<(ops node:$in), (v8f32 (bitconvert node:$in))>;
+def bc_v4f64 : PatFrag<(ops node:$in), (v4f64 (bitconvert node:$in))>;
+
+// 512-bit bitconvert pattern fragments
+def bc_v64i8 : PatFrag<(ops node:$in), (v64i8 (bitconvert node:$in))>;
+def bc_v32i16 : PatFrag<(ops node:$in), (v32i16 (bitconvert node:$in))>;
+def bc_v16i32 : PatFrag<(ops node:$in), (v16i32 (bitconvert node:$in))>;
+def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>;
+def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>;
+def bc_v16f32 : PatFrag<(ops node:$in), (v16f32 (bitconvert node:$in))>;
+
+def X86vzload32 : PatFrag<(ops node:$src),
+ (X86vzld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 4;
+}]>;
+
+def X86vzload64 : PatFrag<(ops node:$src),
+ (X86vzld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
+}]>;
+
+def X86vextractstore64 : PatFrag<(ops node:$val, node:$ptr),
+ (X86vextractst node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
+}]>;
+
+def X86VBroadcastld8 : PatFrag<(ops node:$src),
+ (X86VBroadcastld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 1;
+}]>;
+
+def X86VBroadcastld16 : PatFrag<(ops node:$src),
+ (X86VBroadcastld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 2;
+}]>;
+
+def X86VBroadcastld32 : PatFrag<(ops node:$src),
+ (X86VBroadcastld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 4;
+}]>;
+
+def X86VBroadcastld64 : PatFrag<(ops node:$src),
+ (X86VBroadcastld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
+}]>;
+
+def X86SubVBroadcastld128 : PatFrag<(ops node:$src),
+ (X86SubVBroadcastld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 16;
+}]>;
+
+def X86SubVBroadcastld256 : PatFrag<(ops node:$src),
+ (X86SubVBroadcastld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 32;
+}]>;
+
+// Scalar SSE intrinsic fragments to match several different types of loads.
+// Used by scalar SSE intrinsic instructions which have 128-bit types, but
+// only load a single element.
+// FIXME: We should add more canonicalization in DAGCombine, particularly
+// removing the simple_load case.
+def sse_load_f32 : PatFrags<(ops node:$ptr),
+ [(v4f32 (simple_load node:$ptr)),
+ (v4f32 (X86vzload32 node:$ptr)),
+ (v4f32 (scalar_to_vector (loadf32 node:$ptr)))]>;
+def sse_load_f64 : PatFrags<(ops node:$ptr),
+ [(v2f64 (simple_load node:$ptr)),
+ (v2f64 (X86vzload64 node:$ptr)),
+ (v2f64 (scalar_to_vector (loadf64 node:$ptr)))]>;
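+
+// Editorial sketch, not part of the upstream patch: a scalar-intrinsic style
+// pattern using sse_load_f32, so that a single *rm instruction form can match
+// any of the load shapes listed above. "MYSQRTSSm_Int" is a hypothetical
+// instruction name.
+def : Pat<(v4f32 (X86fsqrts VR128:$src1, (sse_load_f32 addr:$src2))),
+          (MYSQRTSSm_Int VR128:$src1, addr:$src2)>;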
+
+def ssmem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>;
+def sdmem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>;
+
+
+def fp32imm0 : PatLeaf<(f32 fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
+def fp64imm0 : PatLeaf<(f64 fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
+def fp128imm0 : PatLeaf<(f128 fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
+// EXTRACT_get_vextract128_imm xform function: convert extract_subvector index
+// to VEXTRACTF128/VEXTRACTI128 imm.
+def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{
+ return getExtractVEXTRACTImmediate(N, 128, SDLoc(N));
+}]>;
+
+// INSERT_get_vinsert128_imm xform function: convert insert_subvector index to
+// VINSERTF128/VINSERTI128 imm.
+def INSERT_get_vinsert128_imm : SDNodeXForm<insert_subvector, [{
+ return getInsertVINSERTImmediate(N, 128, SDLoc(N));
+}]>;
+
+// EXTRACT_get_vextract256_imm xform function: convert extract_subvector index
+// to VEXTRACTF64x4 imm.
+def EXTRACT_get_vextract256_imm : SDNodeXForm<extract_subvector, [{
+ return getExtractVEXTRACTImmediate(N, 256, SDLoc(N));
+}]>;
+
+// INSERT_get_vinsert256_imm xform function: convert insert_subvector index to
+// VINSERTF64x4 imm.
+def INSERT_get_vinsert256_imm : SDNodeXForm<insert_subvector, [{
+ return getInsertVINSERTImmediate(N, 256, SDLoc(N));
+}]>;
+
+def vextract128_extract : PatFrag<(ops node:$bigvec, node:$index),
+ (extract_subvector node:$bigvec,
+ node:$index), [{
+ // Index 0 can be handled via extract_subreg.
+ return !isNullConstant(N->getOperand(1));
+}], EXTRACT_get_vextract128_imm>;
+
+def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
+ node:$index),
+ (insert_subvector node:$bigvec, node:$smallvec,
+ node:$index), [{}],
+ INSERT_get_vinsert128_imm>;
+
+def vextract256_extract : PatFrag<(ops node:$bigvec, node:$index),
+ (extract_subvector node:$bigvec,
+ node:$index), [{
+ // Index 0 can be handled via extract_subreg.
+ return !isNullConstant(N->getOperand(1));
+}], EXTRACT_get_vextract256_imm>;
+
+def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
+ node:$index),
+ (insert_subvector node:$bigvec, node:$smallvec,
+ node:$index), [{}],
+ INSERT_get_vinsert256_imm>;
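+
+// Editorial sketch, not part of the upstream patch: how the insert fragment and
+// its index transform are typically combined, converting the subvector index
+// into an immediate on the output side. "MYINSERTF128rr" is a hypothetical
+// instruction name.
+def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
+                                  (iPTR imm)),
+          (MYINSERTF128rr VR256:$src1, VR128:$src2,
+                          (INSERT_get_vinsert128_imm VR256:$ins))>;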
+
+def masked_load : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_ld node:$src1, undef, node:$src2, node:$src3), [{
+ return !cast<MaskedLoadSDNode>(N)->isExpandingLoad() &&
+ cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD &&
+ cast<MaskedLoadSDNode>(N)->isUnindexed();
+}]>;
+
+def masked_load_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_load node:$src1, node:$src2, node:$src3), [{
+ // Use the node type to determine the size the alignment needs to match.
+ // We can't use memory VT because type widening changes the node VT, but
+ // not the memory VT.
+ auto *Ld = cast<MaskedLoadSDNode>(N);
+ return Ld->getAlignment() >= Ld->getValueType(0).getStoreSize();
+}]>;
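+
+// Editorial sketch, not part of the upstream patch: a zero-masking load pattern
+// built on masked_load; per the fragment above its operands are
+// (pointer, mask, passthru). "MYMOVDQA32Zrmkz" is a hypothetical instruction
+// name.
+def : Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 immAllZerosV))),
+          (MYMOVDQA32Zrmkz VK16WM:$mask, addr:$ptr)>;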
+
+def X86mExpandingLoad : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_ld node:$src1, undef, node:$src2, node:$src3), [{
+ return cast<MaskedLoadSDNode>(N)->isExpandingLoad() &&
+ cast<MaskedLoadSDNode>(N)->isUnindexed();
+}]>;
+
+// Masked store fragments.
+// X86mstore can't be implemented in core DAG files because some targets
+// do not support vector types (llvm-tblgen will fail).
+def masked_store : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_st node:$src1, node:$src2, undef, node:$src3), [{
+ return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
+ !cast<MaskedStoreSDNode>(N)->isCompressingStore() &&
+ cast<MaskedStoreSDNode>(N)->isUnindexed();
+}]>;
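+
+// Editorial sketch, not part of the upstream patch: a masked store pattern
+// built on masked_store; per the fragment above its operands are
+// (value, pointer, mask). "MYMOVDQA32Zmrk" is a hypothetical instruction name.
+def : Pat<(masked_store (v16i32 VR512:$src), addr:$dst, VK16WM:$mask),
+          (MYMOVDQA32Zmrk addr:$dst, VK16WM:$mask, VR512:$src)>;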
+
+def masked_store_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_store node:$src1, node:$src2, node:$src3), [{
+ // Use the node type to determine the size the alignment needs to match.
+ // We can't use memory VT because type widening changes the node VT, but
+ // not the memory VT.
+ auto *St = cast<MaskedStoreSDNode>(N);
+ return St->getAlignment() >= St->getOperand(1).getValueType().getStoreSize();
+}]>;
+
+def X86mCompressingStore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_st node:$src1, node:$src2, undef, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->isCompressingStore() &&
+ cast<MaskedStoreSDNode>(N)->isUnindexed();
+}]>;
+
+// Masked truncstore fragments.
+// X86mtruncstore can't be implemented in core DAG files because some targets
+// don't support vector types (llvm-tblgen will fail).
+def X86mtruncstore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_st node:$src1, node:$src2, undef, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
+ cast<MaskedStoreSDNode>(N)->isUnindexed();
+}]>;
+def masked_truncstorevi8 :
+ PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86mtruncstore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def masked_truncstorevi16 :
+ PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86mtruncstore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+def masked_truncstorevi32 :
+ PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86mtruncstore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
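+
+// Editorial sketch, not part of the upstream patch: a masked truncating store
+// that narrows i32 elements to i8 via masked_truncstorevi8. "MYPMOVDBZmrk" is a
+// hypothetical instruction name.
+def : Pat<(masked_truncstorevi8 (v16i32 VR512:$src), addr:$dst, VK16WM:$mask),
+          (MYPMOVDBZmrk addr:$dst, VK16WM:$mask, VR512:$src)>;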
+
+def X86TruncSStore : SDNode<"X86ISD::VTRUNCSTORES", SDTStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def X86TruncUSStore : SDNode<"X86ISD::VTRUNCSTOREUS", SDTStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def X86MTruncSStore : SDNode<"X86ISD::VMTRUNCSTORES", SDTX86MaskedStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def X86MTruncUSStore : SDNode<"X86ISD::VMTRUNCSTOREUS", SDTX86MaskedStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def truncstore_s_vi8 : PatFrag<(ops node:$val, node:$ptr),
+ (X86TruncSStore node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+
+def truncstore_us_vi8 : PatFrag<(ops node:$val, node:$ptr),
+ (X86TruncUSStore node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+
+def truncstore_s_vi16 : PatFrag<(ops node:$val, node:$ptr),
+ (X86TruncSStore node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+
+def truncstore_us_vi16 : PatFrag<(ops node:$val, node:$ptr),
+ (X86TruncUSStore node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+
+def truncstore_s_vi32 : PatFrag<(ops node:$val, node:$ptr),
+ (X86TruncSStore node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
+def truncstore_us_vi32 : PatFrag<(ops node:$val, node:$ptr),
+ (X86TruncUSStore node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
+def masked_truncstore_s_vi8 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+
+def masked_truncstore_us_vi8 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+
+def masked_truncstore_s_vi16 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+
+def masked_truncstore_us_vi16 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+
+def masked_truncstore_s_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
+def masked_truncstore_us_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp
new file mode 100644
index 000000000000..d9bab14f0c08
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -0,0 +1,9065 @@
+//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrFoldTables.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-instr-info"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "X86GenInstrInfo.inc"
+
+static cl::opt<bool>
+ NoFusing("disable-spill-fusing",
+ cl::desc("Disable fusing of spill code into instructions"),
+ cl::Hidden);
+static cl::opt<bool>
+PrintFailedFusing("print-failed-fuse-candidates",
+ cl::desc("Print instructions that the allocator wants to"
+ " fuse, but the X86 backend currently can't"),
+ cl::Hidden);
+static cl::opt<bool>
+ReMatPICStubLoad("remat-pic-stub-load",
+ cl::desc("Re-materialize load from stub in PIC mode"),
+ cl::init(false), cl::Hidden);
+static cl::opt<unsigned>
+PartialRegUpdateClearance("partial-reg-update-clearance",
+ cl::desc("Clearance between two register writes "
+ "for inserting XOR to avoid partial "
+ "register update"),
+ cl::init(64), cl::Hidden);
+static cl::opt<unsigned>
+UndefRegClearance("undef-reg-clearance",
+ cl::desc("How many idle instructions we would like before "
+ "certain undef register reads"),
+ cl::init(128), cl::Hidden);
+
+
+// Pin the vtable to this file.
+void X86InstrInfo::anchor() {}
+
+X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
+ : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
+ : X86::ADJCALLSTACKDOWN32),
+ (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
+ : X86::ADJCALLSTACKUP32),
+ X86::CATCHRET,
+ (STI.is64Bit() ? X86::RETQ : X86::RETL)),
+ Subtarget(STI), RI(STI.getTargetTriple()) {
+}
+
+bool
+X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+ Register &SrcReg, Register &DstReg,
+ unsigned &SubIdx) const {
+ switch (MI.getOpcode()) {
+ default: break;
+ case X86::MOVSX16rr8:
+ case X86::MOVZX16rr8:
+ case X86::MOVSX32rr8:
+ case X86::MOVZX32rr8:
+ case X86::MOVSX64rr8:
+ if (!Subtarget.is64Bit())
+ // It's not always legal to reference the low 8-bit of the larger
+ // register in 32-bit mode.
+ return false;
+ LLVM_FALLTHROUGH;
+ case X86::MOVSX32rr16:
+ case X86::MOVZX32rr16:
+ case X86::MOVSX64rr16:
+ case X86::MOVSX64rr32: {
+ if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
+ // Be conservative.
+ return false;
+ SrcReg = MI.getOperand(1).getReg();
+ DstReg = MI.getOperand(0).getReg();
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::MOVSX16rr8:
+ case X86::MOVZX16rr8:
+ case X86::MOVSX32rr8:
+ case X86::MOVZX32rr8:
+ case X86::MOVSX64rr8:
+ SubIdx = X86::sub_8bit;
+ break;
+ case X86::MOVSX32rr16:
+ case X86::MOVZX32rr16:
+ case X86::MOVSX64rr16:
+ SubIdx = X86::sub_16bit;
+ break;
+ case X86::MOVSX64rr32:
+ SubIdx = X86::sub_32bit;
+ break;
+ }
+ return true;
+ }
+ }
+ return false;
+}
+
+bool X86InstrInfo::isDataInvariant(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ // By default, assume that the instruction is not data invariant.
+ return false;
+
+ // Some target-independent operations that trivially lower to data-invariant
+ // instructions.
+ case TargetOpcode::COPY:
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::SUBREG_TO_REG:
+ return true;
+
+  // On x86 it is believed that imul is constant time w.r.t. its input data.
+  // However, these instructions set flags and are perhaps the most
+  // surprisingly constant-time operations, so we call them out here
+  // separately.
+ case X86::IMUL16rr:
+ case X86::IMUL16rri8:
+ case X86::IMUL16rri:
+ case X86::IMUL32rr:
+ case X86::IMUL32rri8:
+ case X86::IMUL32rri:
+ case X86::IMUL64rr:
+ case X86::IMUL64rri32:
+ case X86::IMUL64rri8:
+
+  // Bit scanning and counting instructions are somewhat surprising: they scan
+  // across bits and do other fairly complex operations like popcnt, yet are
+  // still believed to be constant time on x86. However, these set flags.
+ case X86::BSF16rr:
+ case X86::BSF32rr:
+ case X86::BSF64rr:
+ case X86::BSR16rr:
+ case X86::BSR32rr:
+ case X86::BSR64rr:
+ case X86::LZCNT16rr:
+ case X86::LZCNT32rr:
+ case X86::LZCNT64rr:
+ case X86::POPCNT16rr:
+ case X86::POPCNT32rr:
+ case X86::POPCNT64rr:
+ case X86::TZCNT16rr:
+ case X86::TZCNT32rr:
+ case X86::TZCNT64rr:
+
+ // Bit manipulation instructions are effectively combinations of basic
+ // arithmetic ops, and should still execute in constant time. These also
+ // set flags.
+ case X86::BLCFILL32rr:
+ case X86::BLCFILL64rr:
+ case X86::BLCI32rr:
+ case X86::BLCI64rr:
+ case X86::BLCIC32rr:
+ case X86::BLCIC64rr:
+ case X86::BLCMSK32rr:
+ case X86::BLCMSK64rr:
+ case X86::BLCS32rr:
+ case X86::BLCS64rr:
+ case X86::BLSFILL32rr:
+ case X86::BLSFILL64rr:
+ case X86::BLSI32rr:
+ case X86::BLSI64rr:
+ case X86::BLSIC32rr:
+ case X86::BLSIC64rr:
+ case X86::BLSMSK32rr:
+ case X86::BLSMSK64rr:
+ case X86::BLSR32rr:
+ case X86::BLSR64rr:
+ case X86::TZMSK32rr:
+ case X86::TZMSK64rr:
+
+ // Bit extracting and clearing instructions should execute in constant time,
+ // and set flags.
+ case X86::BEXTR32rr:
+ case X86::BEXTR64rr:
+ case X86::BEXTRI32ri:
+ case X86::BEXTRI64ri:
+ case X86::BZHI32rr:
+ case X86::BZHI64rr:
+
+ // Shift and rotate.
+ case X86::ROL8r1:
+ case X86::ROL16r1:
+ case X86::ROL32r1:
+ case X86::ROL64r1:
+ case X86::ROL8rCL:
+ case X86::ROL16rCL:
+ case X86::ROL32rCL:
+ case X86::ROL64rCL:
+ case X86::ROL8ri:
+ case X86::ROL16ri:
+ case X86::ROL32ri:
+ case X86::ROL64ri:
+ case X86::ROR8r1:
+ case X86::ROR16r1:
+ case X86::ROR32r1:
+ case X86::ROR64r1:
+ case X86::ROR8rCL:
+ case X86::ROR16rCL:
+ case X86::ROR32rCL:
+ case X86::ROR64rCL:
+ case X86::ROR8ri:
+ case X86::ROR16ri:
+ case X86::ROR32ri:
+ case X86::ROR64ri:
+ case X86::SAR8r1:
+ case X86::SAR16r1:
+ case X86::SAR32r1:
+ case X86::SAR64r1:
+ case X86::SAR8rCL:
+ case X86::SAR16rCL:
+ case X86::SAR32rCL:
+ case X86::SAR64rCL:
+ case X86::SAR8ri:
+ case X86::SAR16ri:
+ case X86::SAR32ri:
+ case X86::SAR64ri:
+ case X86::SHL8r1:
+ case X86::SHL16r1:
+ case X86::SHL32r1:
+ case X86::SHL64r1:
+ case X86::SHL8rCL:
+ case X86::SHL16rCL:
+ case X86::SHL32rCL:
+ case X86::SHL64rCL:
+ case X86::SHL8ri:
+ case X86::SHL16ri:
+ case X86::SHL32ri:
+ case X86::SHL64ri:
+ case X86::SHR8r1:
+ case X86::SHR16r1:
+ case X86::SHR32r1:
+ case X86::SHR64r1:
+ case X86::SHR8rCL:
+ case X86::SHR16rCL:
+ case X86::SHR32rCL:
+ case X86::SHR64rCL:
+ case X86::SHR8ri:
+ case X86::SHR16ri:
+ case X86::SHR32ri:
+ case X86::SHR64ri:
+ case X86::SHLD16rrCL:
+ case X86::SHLD32rrCL:
+ case X86::SHLD64rrCL:
+ case X86::SHLD16rri8:
+ case X86::SHLD32rri8:
+ case X86::SHLD64rri8:
+ case X86::SHRD16rrCL:
+ case X86::SHRD32rrCL:
+ case X86::SHRD64rrCL:
+ case X86::SHRD16rri8:
+ case X86::SHRD32rri8:
+ case X86::SHRD64rri8:
+
+ // Basic arithmetic is constant time on the input but does set flags.
+ case X86::ADC8rr:
+ case X86::ADC8ri:
+ case X86::ADC16rr:
+ case X86::ADC16ri:
+ case X86::ADC16ri8:
+ case X86::ADC32rr:
+ case X86::ADC32ri:
+ case X86::ADC32ri8:
+ case X86::ADC64rr:
+ case X86::ADC64ri8:
+ case X86::ADC64ri32:
+ case X86::ADD8rr:
+ case X86::ADD8ri:
+ case X86::ADD16rr:
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ case X86::ADD32rr:
+ case X86::ADD32ri:
+ case X86::ADD32ri8:
+ case X86::ADD64rr:
+ case X86::ADD64ri8:
+ case X86::ADD64ri32:
+ case X86::AND8rr:
+ case X86::AND8ri:
+ case X86::AND16rr:
+ case X86::AND16ri:
+ case X86::AND16ri8:
+ case X86::AND32rr:
+ case X86::AND32ri:
+ case X86::AND32ri8:
+ case X86::AND64rr:
+ case X86::AND64ri8:
+ case X86::AND64ri32:
+ case X86::OR8rr:
+ case X86::OR8ri:
+ case X86::OR16rr:
+ case X86::OR16ri:
+ case X86::OR16ri8:
+ case X86::OR32rr:
+ case X86::OR32ri:
+ case X86::OR32ri8:
+ case X86::OR64rr:
+ case X86::OR64ri8:
+ case X86::OR64ri32:
+ case X86::SBB8rr:
+ case X86::SBB8ri:
+ case X86::SBB16rr:
+ case X86::SBB16ri:
+ case X86::SBB16ri8:
+ case X86::SBB32rr:
+ case X86::SBB32ri:
+ case X86::SBB32ri8:
+ case X86::SBB64rr:
+ case X86::SBB64ri8:
+ case X86::SBB64ri32:
+ case X86::SUB8rr:
+ case X86::SUB8ri:
+ case X86::SUB16rr:
+ case X86::SUB16ri:
+ case X86::SUB16ri8:
+ case X86::SUB32rr:
+ case X86::SUB32ri:
+ case X86::SUB32ri8:
+ case X86::SUB64rr:
+ case X86::SUB64ri8:
+ case X86::SUB64ri32:
+ case X86::XOR8rr:
+ case X86::XOR8ri:
+ case X86::XOR16rr:
+ case X86::XOR16ri:
+ case X86::XOR16ri8:
+ case X86::XOR32rr:
+ case X86::XOR32ri:
+ case X86::XOR32ri8:
+ case X86::XOR64rr:
+ case X86::XOR64ri8:
+ case X86::XOR64ri32:
+ // Arithmetic with just 32-bit and 64-bit variants and no immediates.
+ case X86::ADCX32rr:
+ case X86::ADCX64rr:
+ case X86::ADOX32rr:
+ case X86::ADOX64rr:
+ case X86::ANDN32rr:
+ case X86::ANDN64rr:
+ // Unary arithmetic operations.
+ case X86::DEC8r:
+ case X86::DEC16r:
+ case X86::DEC32r:
+ case X86::DEC64r:
+ case X86::INC8r:
+ case X86::INC16r:
+ case X86::INC32r:
+ case X86::INC64r:
+ case X86::NEG8r:
+ case X86::NEG16r:
+ case X86::NEG32r:
+ case X86::NEG64r:
+
+ // Unlike other arithmetic, NOT doesn't set EFLAGS.
+ case X86::NOT8r:
+ case X86::NOT16r:
+ case X86::NOT32r:
+ case X86::NOT64r:
+
+ // Various move instructions used to zero or sign extend things. Note that we
+ // intentionally don't support the _NOREX variants as we can't handle that
+  // register constraint anyway.
+ case X86::MOVSX16rr8:
+ case X86::MOVSX32rr8:
+ case X86::MOVSX32rr16:
+ case X86::MOVSX64rr8:
+ case X86::MOVSX64rr16:
+ case X86::MOVSX64rr32:
+ case X86::MOVZX16rr8:
+ case X86::MOVZX32rr8:
+ case X86::MOVZX32rr16:
+ case X86::MOVZX64rr8:
+ case X86::MOVZX64rr16:
+ case X86::MOV32rr:
+
+ // Arithmetic instructions that are both constant time and don't set flags.
+ case X86::RORX32ri:
+ case X86::RORX64ri:
+ case X86::SARX32rr:
+ case X86::SARX64rr:
+ case X86::SHLX32rr:
+ case X86::SHLX64rr:
+ case X86::SHRX32rr:
+ case X86::SHRX64rr:
+
+ // LEA doesn't actually access memory, and its arithmetic is constant time.
+ case X86::LEA16r:
+ case X86::LEA32r:
+ case X86::LEA64_32r:
+ case X86::LEA64r:
+ return true;
+ }
+}
+
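+// Like isDataInvariant, but for instructions that load from memory: returns
+// true when the instruction is believed to execute in time independent of the
+// *loaded* value (the address itself is still observable via the access).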
+bool X86InstrInfo::isDataInvariantLoad(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ // By default, assume that the load will immediately leak.
+ return false;
+
+  // On x86 it is believed that imul is constant time w.r.t. the loaded data.
+  // However, these instructions set flags and are perhaps the most
+  // surprisingly constant-time operations, so we call them out here
+  // separately.
+ case X86::IMUL16rm:
+ case X86::IMUL16rmi8:
+ case X86::IMUL16rmi:
+ case X86::IMUL32rm:
+ case X86::IMUL32rmi8:
+ case X86::IMUL32rmi:
+ case X86::IMUL64rm:
+ case X86::IMUL64rmi32:
+ case X86::IMUL64rmi8:
+
+  // Bit scanning and counting instructions are somewhat surprising: they scan
+  // across bits and do other fairly complex operations like popcnt, yet are
+  // still believed to be constant time on x86. However, these set flags.
+ case X86::BSF16rm:
+ case X86::BSF32rm:
+ case X86::BSF64rm:
+ case X86::BSR16rm:
+ case X86::BSR32rm:
+ case X86::BSR64rm:
+ case X86::LZCNT16rm:
+ case X86::LZCNT32rm:
+ case X86::LZCNT64rm:
+ case X86::POPCNT16rm:
+ case X86::POPCNT32rm:
+ case X86::POPCNT64rm:
+ case X86::TZCNT16rm:
+ case X86::TZCNT32rm:
+ case X86::TZCNT64rm:
+
+ // Bit manipulation instructions are effectively combinations of basic
+ // arithmetic ops, and should still execute in constant time. These also
+ // set flags.
+ case X86::BLCFILL32rm:
+ case X86::BLCFILL64rm:
+ case X86::BLCI32rm:
+ case X86::BLCI64rm:
+ case X86::BLCIC32rm:
+ case X86::BLCIC64rm:
+ case X86::BLCMSK32rm:
+ case X86::BLCMSK64rm:
+ case X86::BLCS32rm:
+ case X86::BLCS64rm:
+ case X86::BLSFILL32rm:
+ case X86::BLSFILL64rm:
+ case X86::BLSI32rm:
+ case X86::BLSI64rm:
+ case X86::BLSIC32rm:
+ case X86::BLSIC64rm:
+ case X86::BLSMSK32rm:
+ case X86::BLSMSK64rm:
+ case X86::BLSR32rm:
+ case X86::BLSR64rm:
+ case X86::TZMSK32rm:
+ case X86::TZMSK64rm:
+
+ // Bit extracting and clearing instructions should execute in constant time,
+ // and set flags.
+ case X86::BEXTR32rm:
+ case X86::BEXTR64rm:
+ case X86::BEXTRI32mi:
+ case X86::BEXTRI64mi:
+ case X86::BZHI32rm:
+ case X86::BZHI64rm:
+
+ // Basic arithmetic is constant time on the input but does set flags.
+ case X86::ADC8rm:
+ case X86::ADC16rm:
+ case X86::ADC32rm:
+ case X86::ADC64rm:
+ case X86::ADCX32rm:
+ case X86::ADCX64rm:
+ case X86::ADD8rm:
+ case X86::ADD16rm:
+ case X86::ADD32rm:
+ case X86::ADD64rm:
+ case X86::ADOX32rm:
+ case X86::ADOX64rm:
+ case X86::AND8rm:
+ case X86::AND16rm:
+ case X86::AND32rm:
+ case X86::AND64rm:
+ case X86::ANDN32rm:
+ case X86::ANDN64rm:
+ case X86::OR8rm:
+ case X86::OR16rm:
+ case X86::OR32rm:
+ case X86::OR64rm:
+ case X86::SBB8rm:
+ case X86::SBB16rm:
+ case X86::SBB32rm:
+ case X86::SBB64rm:
+ case X86::SUB8rm:
+ case X86::SUB16rm:
+ case X86::SUB32rm:
+ case X86::SUB64rm:
+ case X86::XOR8rm:
+ case X86::XOR16rm:
+ case X86::XOR32rm:
+ case X86::XOR64rm:
+
+ // Integer multiply w/o affecting flags is still believed to be constant
+ // time on x86. Called out separately as this is among the most surprising
+ // instructions to exhibit that behavior.
+ case X86::MULX32rm:
+ case X86::MULX64rm:
+
+ // Arithmetic instructions that are both constant time and don't set flags.
+ case X86::RORX32mi:
+ case X86::RORX64mi:
+ case X86::SARX32rm:
+ case X86::SARX64rm:
+ case X86::SHLX32rm:
+ case X86::SHLX64rm:
+ case X86::SHRX32rm:
+ case X86::SHRX64rm:
+
+ // Conversions are believed to be constant time and don't set flags.
+ case X86::CVTTSD2SI64rm:
+ case X86::VCVTTSD2SI64rm:
+ case X86::VCVTTSD2SI64Zrm:
+ case X86::CVTTSD2SIrm:
+ case X86::VCVTTSD2SIrm:
+ case X86::VCVTTSD2SIZrm:
+ case X86::CVTTSS2SI64rm:
+ case X86::VCVTTSS2SI64rm:
+ case X86::VCVTTSS2SI64Zrm:
+ case X86::CVTTSS2SIrm:
+ case X86::VCVTTSS2SIrm:
+ case X86::VCVTTSS2SIZrm:
+ case X86::CVTSI2SDrm:
+ case X86::VCVTSI2SDrm:
+ case X86::VCVTSI2SDZrm:
+ case X86::CVTSI2SSrm:
+ case X86::VCVTSI2SSrm:
+ case X86::VCVTSI2SSZrm:
+ case X86::CVTSI642SDrm:
+ case X86::VCVTSI642SDrm:
+ case X86::VCVTSI642SDZrm:
+ case X86::CVTSI642SSrm:
+ case X86::VCVTSI642SSrm:
+ case X86::VCVTSI642SSZrm:
+ case X86::CVTSS2SDrm:
+ case X86::VCVTSS2SDrm:
+ case X86::VCVTSS2SDZrm:
+ case X86::CVTSD2SSrm:
+ case X86::VCVTSD2SSrm:
+ case X86::VCVTSD2SSZrm:
+ // AVX512 added unsigned integer conversions.
+ case X86::VCVTTSD2USI64Zrm:
+ case X86::VCVTTSD2USIZrm:
+ case X86::VCVTTSS2USI64Zrm:
+ case X86::VCVTTSS2USIZrm:
+ case X86::VCVTUSI2SDZrm:
+ case X86::VCVTUSI642SDZrm:
+ case X86::VCVTUSI2SSZrm:
+ case X86::VCVTUSI642SSZrm:
+
+ // Loads to register don't set flags.
+ case X86::MOV8rm:
+ case X86::MOV8rm_NOREX:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::MOVSX16rm8:
+ case X86::MOVSX32rm16:
+ case X86::MOVSX32rm8:
+ case X86::MOVSX32rm8_NOREX:
+ case X86::MOVSX64rm16:
+ case X86::MOVSX64rm32:
+ case X86::MOVSX64rm8:
+ case X86::MOVZX16rm8:
+ case X86::MOVZX32rm16:
+ case X86::MOVZX32rm8:
+ case X86::MOVZX32rm8_NOREX:
+ case X86::MOVZX64rm16:
+ case X86::MOVZX64rm8:
+ return true;
+ }
+}
+
+int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+
+ if (isFrameInstr(MI)) {
+ int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign());
+ SPAdj -= getFrameAdjustment(MI);
+ if (!isFrameSetup(MI))
+ SPAdj = -SPAdj;
+ return SPAdj;
+ }
+
+ // To know whether a call adjusts the stack, we need information
+ // that is bound to the following ADJCALLSTACKUP pseudo.
+ // Look for the next ADJCALLSTACKUP that follows the call.
+ if (MI.isCall()) {
+ const MachineBasicBlock *MBB = MI.getParent();
+ auto I = ++MachineBasicBlock::const_iterator(MI);
+ for (auto E = MBB->end(); I != E; ++I) {
+ if (I->getOpcode() == getCallFrameDestroyOpcode() ||
+ I->isCall())
+ break;
+ }
+
+ // If we could not find a frame destroy opcode, then it has already
+ // been simplified, so we don't care.
+ if (I->getOpcode() != getCallFrameDestroyOpcode())
+ return 0;
+
+ return -(I->getOperand(1).getImm());
+ }
+
+  // Currently we only handle PUSHes that we can reasonably expect to see
+  // in call sequences.
+ switch (MI.getOpcode()) {
+ default:
+ return 0;
+ case X86::PUSH32i8:
+ case X86::PUSH32r:
+ case X86::PUSH32rmm:
+ case X86::PUSH32rmr:
+ case X86::PUSHi32:
+ return 4;
+ case X86::PUSH64i8:
+ case X86::PUSH64r:
+ case X86::PUSH64rmm:
+ case X86::PUSH64rmr:
+ case X86::PUSH64i32:
+ return 8;
+ }
+}
+
+/// Return true and the FrameIndex if the specified operand and the operands
+/// that follow it form a reference to the stack frame.
+bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
+ int &FrameIndex) const {
+ if (MI.getOperand(Op + X86::AddrBaseReg).isFI() &&
+ MI.getOperand(Op + X86::AddrScaleAmt).isImm() &&
+ MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
+ MI.getOperand(Op + X86::AddrDisp).isImm() &&
+ MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 &&
+ MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 &&
+ MI.getOperand(Op + X86::AddrDisp).getImm() == 0) {
+ FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex();
+ return true;
+ }
+ return false;
+}
+
+static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) {
+ switch (Opcode) {
+ default:
+ return false;
+ case X86::MOV8rm:
+ case X86::KMOVBkm:
+ MemBytes = 1;
+ return true;
+ case X86::MOV16rm:
+ case X86::KMOVWkm:
+ MemBytes = 2;
+ return true;
+ case X86::MOV32rm:
+ case X86::MOVSSrm:
+ case X86::MOVSSrm_alt:
+ case X86::VMOVSSrm:
+ case X86::VMOVSSrm_alt:
+ case X86::VMOVSSZrm:
+ case X86::VMOVSSZrm_alt:
+ case X86::KMOVDkm:
+ MemBytes = 4;
+ return true;
+ case X86::MOV64rm:
+ case X86::LD_Fp64m:
+ case X86::MOVSDrm:
+ case X86::MOVSDrm_alt:
+ case X86::VMOVSDrm:
+ case X86::VMOVSDrm_alt:
+ case X86::VMOVSDZrm:
+ case X86::VMOVSDZrm_alt:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ case X86::KMOVQkm:
+ MemBytes = 8;
+ return true;
+ case X86::MOVAPSrm:
+ case X86::MOVUPSrm:
+ case X86::MOVAPDrm:
+ case X86::MOVUPDrm:
+ case X86::MOVDQArm:
+ case X86::MOVDQUrm:
+ case X86::VMOVAPSrm:
+ case X86::VMOVUPSrm:
+ case X86::VMOVAPDrm:
+ case X86::VMOVUPDrm:
+ case X86::VMOVDQArm:
+ case X86::VMOVDQUrm:
+ case X86::VMOVAPSZ128rm:
+ case X86::VMOVUPSZ128rm:
+ case X86::VMOVAPSZ128rm_NOVLX:
+ case X86::VMOVUPSZ128rm_NOVLX:
+ case X86::VMOVAPDZ128rm:
+ case X86::VMOVUPDZ128rm:
+ case X86::VMOVDQU8Z128rm:
+ case X86::VMOVDQU16Z128rm:
+ case X86::VMOVDQA32Z128rm:
+ case X86::VMOVDQU32Z128rm:
+ case X86::VMOVDQA64Z128rm:
+ case X86::VMOVDQU64Z128rm:
+ MemBytes = 16;
+ return true;
+ case X86::VMOVAPSYrm:
+ case X86::VMOVUPSYrm:
+ case X86::VMOVAPDYrm:
+ case X86::VMOVUPDYrm:
+ case X86::VMOVDQAYrm:
+ case X86::VMOVDQUYrm:
+ case X86::VMOVAPSZ256rm:
+ case X86::VMOVUPSZ256rm:
+ case X86::VMOVAPSZ256rm_NOVLX:
+ case X86::VMOVUPSZ256rm_NOVLX:
+ case X86::VMOVAPDZ256rm:
+ case X86::VMOVUPDZ256rm:
+ case X86::VMOVDQU8Z256rm:
+ case X86::VMOVDQU16Z256rm:
+ case X86::VMOVDQA32Z256rm:
+ case X86::VMOVDQU32Z256rm:
+ case X86::VMOVDQA64Z256rm:
+ case X86::VMOVDQU64Z256rm:
+ MemBytes = 32;
+ return true;
+ case X86::VMOVAPSZrm:
+ case X86::VMOVUPSZrm:
+ case X86::VMOVAPDZrm:
+ case X86::VMOVUPDZrm:
+ case X86::VMOVDQU8Zrm:
+ case X86::VMOVDQU16Zrm:
+ case X86::VMOVDQA32Zrm:
+ case X86::VMOVDQU32Zrm:
+ case X86::VMOVDQA64Zrm:
+ case X86::VMOVDQU64Zrm:
+ MemBytes = 64;
+ return true;
+ }
+}
+
+static bool isFrameStoreOpcode(int Opcode, unsigned &MemBytes) {
+ switch (Opcode) {
+ default:
+ return false;
+ case X86::MOV8mr:
+ case X86::KMOVBmk:
+ MemBytes = 1;
+ return true;
+ case X86::MOV16mr:
+ case X86::KMOVWmk:
+ MemBytes = 2;
+ return true;
+ case X86::MOV32mr:
+ case X86::MOVSSmr:
+ case X86::VMOVSSmr:
+ case X86::VMOVSSZmr:
+ case X86::KMOVDmk:
+ MemBytes = 4;
+ return true;
+ case X86::MOV64mr:
+ case X86::ST_FpP64m:
+ case X86::MOVSDmr:
+ case X86::VMOVSDmr:
+ case X86::VMOVSDZmr:
+ case X86::MMX_MOVD64mr:
+ case X86::MMX_MOVQ64mr:
+ case X86::MMX_MOVNTQmr:
+ case X86::KMOVQmk:
+ MemBytes = 8;
+ return true;
+ case X86::MOVAPSmr:
+ case X86::MOVUPSmr:
+ case X86::MOVAPDmr:
+ case X86::MOVUPDmr:
+ case X86::MOVDQAmr:
+ case X86::MOVDQUmr:
+ case X86::VMOVAPSmr:
+ case X86::VMOVUPSmr:
+ case X86::VMOVAPDmr:
+ case X86::VMOVUPDmr:
+ case X86::VMOVDQAmr:
+ case X86::VMOVDQUmr:
+ case X86::VMOVUPSZ128mr:
+ case X86::VMOVAPSZ128mr:
+ case X86::VMOVUPSZ128mr_NOVLX:
+ case X86::VMOVAPSZ128mr_NOVLX:
+ case X86::VMOVUPDZ128mr:
+ case X86::VMOVAPDZ128mr:
+ case X86::VMOVDQA32Z128mr:
+ case X86::VMOVDQU32Z128mr:
+ case X86::VMOVDQA64Z128mr:
+ case X86::VMOVDQU64Z128mr:
+ case X86::VMOVDQU8Z128mr:
+ case X86::VMOVDQU16Z128mr:
+ MemBytes = 16;
+ return true;
+ case X86::VMOVUPSYmr:
+ case X86::VMOVAPSYmr:
+ case X86::VMOVUPDYmr:
+ case X86::VMOVAPDYmr:
+ case X86::VMOVDQUYmr:
+ case X86::VMOVDQAYmr:
+ case X86::VMOVUPSZ256mr:
+ case X86::VMOVAPSZ256mr:
+ case X86::VMOVUPSZ256mr_NOVLX:
+ case X86::VMOVAPSZ256mr_NOVLX:
+ case X86::VMOVUPDZ256mr:
+ case X86::VMOVAPDZ256mr:
+ case X86::VMOVDQU8Z256mr:
+ case X86::VMOVDQU16Z256mr:
+ case X86::VMOVDQA32Z256mr:
+ case X86::VMOVDQU32Z256mr:
+ case X86::VMOVDQA64Z256mr:
+ case X86::VMOVDQU64Z256mr:
+ MemBytes = 32;
+ return true;
+ case X86::VMOVUPSZmr:
+ case X86::VMOVAPSZmr:
+ case X86::VMOVUPDZmr:
+ case X86::VMOVAPDZmr:
+ case X86::VMOVDQU8Zmr:
+ case X86::VMOVDQU16Zmr:
+ case X86::VMOVDQA32Zmr:
+ case X86::VMOVDQU32Zmr:
+ case X86::VMOVDQA64Zmr:
+ case X86::VMOVDQU64Zmr:
+ MemBytes = 64;
+ return true;
+ }
+ return false;
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ unsigned Dummy;
+ return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy);
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex,
+ unsigned &MemBytes) const {
+ if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
+ if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
+ return MI.getOperand(0).getReg();
+ return 0;
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
+ int &FrameIndex) const {
+ unsigned Dummy;
+ if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) {
+ unsigned Reg;
+ if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
+ return Reg;
+ // Check for post-frame index elimination operations
+ SmallVector<const MachineMemOperand *, 1> Accesses;
+ if (hasLoadFromStackSlot(MI, Accesses)) {
+ FrameIndex =
+ cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
+ ->getFrameIndex();
+ return 1;
+ }
+ }
+ return 0;
+}
+
+unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ unsigned Dummy;
+ return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy);
+}
+
+unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex,
+ unsigned &MemBytes) const {
+ if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
+ if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
+ isFrameOperand(MI, 0, FrameIndex))
+ return MI.getOperand(X86::AddrNumOperands).getReg();
+ return 0;
+}
+
+unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
+ int &FrameIndex) const {
+ unsigned Dummy;
+ if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) {
+ unsigned Reg;
+ if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
+ return Reg;
+ // Check for post-frame index elimination operations
+ SmallVector<const MachineMemOperand *, 1> Accesses;
+ if (hasStoreToStackSlot(MI, Accesses)) {
+ FrameIndex =
+ cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
+ ->getFrameIndex();
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/// Return true if the register is a PIC base, i.e. defined by X86::MOVPC32r.
+static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
+ // Don't waste compile time scanning use-def chains of physregs.
+ if (!BaseReg.isVirtual())
+ return false;
+ bool isPICBase = false;
+ for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg),
+ E = MRI.def_instr_end(); I != E; ++I) {
+ MachineInstr *DefMI = &*I;
+ if (DefMI->getOpcode() != X86::MOVPC32r)
+ return false;
+ assert(!isPICBase && "More than one PIC base?");
+ isPICBase = true;
+ }
+ return isPICBase;
+}
+
+bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
+ AAResults *AA) const {
+ switch (MI.getOpcode()) {
+ default:
+ // This function should only be called for opcodes with the ReMaterializable
+ // flag set.
+ llvm_unreachable("Unknown rematerializable operation!");
+ break;
+
+ case X86::LOAD_STACK_GUARD:
+ case X86::AVX1_SETALLONES:
+ case X86::AVX2_SETALLONES:
+ case X86::AVX512_128_SET0:
+ case X86::AVX512_256_SET0:
+ case X86::AVX512_512_SET0:
+ case X86::AVX512_512_SETALLONES:
+ case X86::AVX512_FsFLD0SD:
+ case X86::AVX512_FsFLD0SS:
+ case X86::AVX512_FsFLD0F128:
+ case X86::AVX_SET0:
+ case X86::FsFLD0SD:
+ case X86::FsFLD0SS:
+ case X86::FsFLD0F128:
+ case X86::KSET0D:
+ case X86::KSET0Q:
+ case X86::KSET0W:
+ case X86::KSET1D:
+ case X86::KSET1Q:
+ case X86::KSET1W:
+ case X86::MMX_SET0:
+ case X86::MOV32ImmSExti8:
+ case X86::MOV32r0:
+ case X86::MOV32r1:
+ case X86::MOV32r_1:
+ case X86::MOV32ri64:
+ case X86::MOV64ImmSExti8:
+ case X86::V_SET0:
+ case X86::V_SETALLONES:
+ case X86::MOV16ri:
+ case X86::MOV32ri:
+ case X86::MOV64ri:
+ case X86::MOV64ri32:
+ case X86::MOV8ri:
+ return true;
+
+ case X86::MOV8rm:
+ case X86::MOV8rm_NOREX:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::MOVSSrm:
+ case X86::MOVSSrm_alt:
+ case X86::MOVSDrm:
+ case X86::MOVSDrm_alt:
+ case X86::MOVAPSrm:
+ case X86::MOVUPSrm:
+ case X86::MOVAPDrm:
+ case X86::MOVUPDrm:
+ case X86::MOVDQArm:
+ case X86::MOVDQUrm:
+ case X86::VMOVSSrm:
+ case X86::VMOVSSrm_alt:
+ case X86::VMOVSDrm:
+ case X86::VMOVSDrm_alt:
+ case X86::VMOVAPSrm:
+ case X86::VMOVUPSrm:
+ case X86::VMOVAPDrm:
+ case X86::VMOVUPDrm:
+ case X86::VMOVDQArm:
+ case X86::VMOVDQUrm:
+ case X86::VMOVAPSYrm:
+ case X86::VMOVUPSYrm:
+ case X86::VMOVAPDYrm:
+ case X86::VMOVUPDYrm:
+ case X86::VMOVDQAYrm:
+ case X86::VMOVDQUYrm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ // AVX-512
+ case X86::VMOVSSZrm:
+ case X86::VMOVSSZrm_alt:
+ case X86::VMOVSDZrm:
+ case X86::VMOVSDZrm_alt:
+ case X86::VMOVAPDZ128rm:
+ case X86::VMOVAPDZ256rm:
+ case X86::VMOVAPDZrm:
+ case X86::VMOVAPSZ128rm:
+ case X86::VMOVAPSZ256rm:
+ case X86::VMOVAPSZ128rm_NOVLX:
+ case X86::VMOVAPSZ256rm_NOVLX:
+ case X86::VMOVAPSZrm:
+ case X86::VMOVDQA32Z128rm:
+ case X86::VMOVDQA32Z256rm:
+ case X86::VMOVDQA32Zrm:
+ case X86::VMOVDQA64Z128rm:
+ case X86::VMOVDQA64Z256rm:
+ case X86::VMOVDQA64Zrm:
+ case X86::VMOVDQU16Z128rm:
+ case X86::VMOVDQU16Z256rm:
+ case X86::VMOVDQU16Zrm:
+ case X86::VMOVDQU32Z128rm:
+ case X86::VMOVDQU32Z256rm:
+ case X86::VMOVDQU32Zrm:
+ case X86::VMOVDQU64Z128rm:
+ case X86::VMOVDQU64Z256rm:
+ case X86::VMOVDQU64Zrm:
+ case X86::VMOVDQU8Z128rm:
+ case X86::VMOVDQU8Z256rm:
+ case X86::VMOVDQU8Zrm:
+ case X86::VMOVUPDZ128rm:
+ case X86::VMOVUPDZ256rm:
+ case X86::VMOVUPDZrm:
+ case X86::VMOVUPSZ128rm:
+ case X86::VMOVUPSZ256rm:
+ case X86::VMOVUPSZ128rm_NOVLX:
+ case X86::VMOVUPSZ256rm_NOVLX:
+ case X86::VMOVUPSZrm: {
+ // Loads from constant pools are trivially rematerializable.
+ if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
+ MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
+ MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
+ MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
+ MI.isDereferenceableInvariantLoad(AA)) {
+ Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
+ if (BaseReg == 0 || BaseReg == X86::RIP)
+ return true;
+      // Only re-materialize a PIC stub load if it has been explicitly enabled.
+ if (!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())
+ return false;
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ return regIsPICBase(BaseReg, MRI);
+ }
+ return false;
+ }
+
+ case X86::LEA32r:
+ case X86::LEA64r: {
+ if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
+ MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
+ MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
+ !MI.getOperand(1 + X86::AddrDisp).isReg()) {
+ // lea fi#, lea GV, etc. are all rematerializable.
+ if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
+ return true;
+ Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
+ if (BaseReg == 0)
+ return true;
+ // Allow re-materialization of lea PICBase + x.
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ return regIsPICBase(BaseReg, MRI);
+ }
+ return false;
+ }
+ }
+}
+
+void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ Register DestReg, unsigned SubIdx,
+ const MachineInstr &Orig,
+ const TargetRegisterInfo &TRI) const {
+ bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
+ if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) !=
+ MachineBasicBlock::LQR_Dead) {
+ // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
+ // effects.
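+    // e.g. MOV32r0 is normally expanded to 'xor %reg, %reg', which would
+    // clobber the live EFLAGS here, so an explicit 'mov $0, %reg' is used
+    // instead.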
+ int Value;
+ switch (Orig.getOpcode()) {
+ case X86::MOV32r0: Value = 0; break;
+ case X86::MOV32r1: Value = 1; break;
+ case X86::MOV32r_1: Value = -1; break;
+ default:
+ llvm_unreachable("Unexpected instruction!");
+ }
+
+ const DebugLoc &DL = Orig.getDebugLoc();
+ BuildMI(MBB, I, DL, get(X86::MOV32ri))
+ .add(Orig.getOperand(0))
+ .addImm(Value);
+ } else {
+ MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
+ MBB.insert(I, MI);
+ }
+
+ MachineInstr &NewMI = *std::prev(I);
+ NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI);
+}
+
+/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
+bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr &MI) const {
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (MO.isReg() && MO.isDef() &&
+ MO.getReg() == X86::EFLAGS && !MO.isDead()) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Return the shift count for a machine operand, truncated to the width the
+/// hardware actually honors.
+inline static unsigned getTruncatedShiftCount(const MachineInstr &MI,
+ unsigned ShiftAmtOperandIdx) {
+ // The shift count is six bits with the REX.W prefix and five bits without.
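+  // e.g. an immediate shift count of 36 is honored as-is under REX.W but acts
+  // as a shift by 4 (36 & 31) otherwise.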
+ unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
+ unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm();
+ return Imm & ShiftCountMask;
+}
+
+/// Check whether the given shift count can be represented by the scale factor
+/// of a LEA instruction.
+inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
+ // Left shift instructions can be transformed into load-effective-address
+ // instructions if we can encode them appropriately.
+ // A LEA instruction utilizes a SIB byte to encode its scale factor.
+ // The SIB.scale field is two bits wide which means that we can encode any
+ // shift amount less than 4.
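+  // e.g. a shift by 3 maps to a scale of 8 (the largest encodable scale); a
+  // shift by 4 would need a scale of 16, which SIB.scale cannot express.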
+ return ShAmt < 4 && ShAmt > 0;
+}
+
+bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
+ unsigned Opc, bool AllowSP, Register &NewSrc,
+ bool &isKill, MachineOperand &ImplicitOp,
+ LiveVariables *LV) const {
+ MachineFunction &MF = *MI.getParent()->getParent();
+ const TargetRegisterClass *RC;
+ if (AllowSP) {
+ RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
+ } else {
+ RC = Opc != X86::LEA32r ?
+ &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
+ }
+ Register SrcReg = Src.getReg();
+
+  // For both LEA64 and LEA32 the register already has essentially the right
+  // type (32-bit or 64-bit); we may just need to forbid SP.
+ if (Opc != X86::LEA64_32r) {
+ NewSrc = SrcReg;
+ isKill = Src.isKill();
+ assert(!Src.isUndef() && "Undef op doesn't need optimization");
+
+ if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC))
+ return false;
+
+ return true;
+ }
+
+ // This is for an LEA64_32r and incoming registers are 32-bit. One way or
+ // another we need to add 64-bit registers to the final MI.
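+  // e.g. turning 'addl %esi, %edi' into 'leal (%rdi,%rsi), %edi' requires the
+  // address operands of the LEA to name the 64-bit super-registers.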
+ if (SrcReg.isPhysical()) {
+ ImplicitOp = Src;
+ ImplicitOp.setImplicit();
+
+ NewSrc = getX86SubSuperRegister(Src.getReg(), 64);
+ isKill = Src.isKill();
+ assert(!Src.isUndef() && "Undef op doesn't need optimization");
+ } else {
+    // This is a virtual register of the wrong class; we have to create a
+    // temporary 64-bit vreg to feed into the LEA.
+ NewSrc = MF.getRegInfo().createVirtualRegister(RC);
+ MachineInstr *Copy =
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
+ .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
+ .add(Src);
+
+    // The temporary register is obviously going to be dead after we're done
+    // with it.
+ isKill = true;
+
+ if (LV)
+ LV->replaceKillInstruction(SrcReg, MI, *Copy);
+ }
+
+ // We've set all the parameters without issue.
+ return true;
+}
+
+MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
+ unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI,
+ LiveVariables *LV, bool Is8BitOp) const {
+ // We handle 8-bit adds and various 16-bit opcodes in the switch below.
+ MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
+ assert((Is8BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
+ *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
+ "Unexpected type for LEA transform");
+
+ // TODO: For a 32-bit target, we need to adjust the LEA variables with
+ // something like this:
+ // Opcode = X86::LEA32r;
+ // InRegLEA = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+ // OutRegLEA =
+ // Is8BitOp ? RegInfo.createVirtualRegister(&X86::GR32ABCD_RegClass)
+ // : RegInfo.createVirtualRegister(&X86::GR32RegClass);
+ if (!Subtarget.is64Bit())
+ return nullptr;
+
+ unsigned Opcode = X86::LEA64_32r;
+ Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+ Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+
+  // Build and insert into an implicit UNDEF value. This is OK because
+  // we will be shifting and then extracting the lower 8/16 bits.
+  // This has the potential to cause a partial register stall, e.g.:
+  //   movw    (%rbp,%rcx,2), %dx
+  //   leal    -65(%rdx), %esi
+  // But testing has shown this *does* help performance in 64-bit mode (at
+  // least on modern x86 machines).
+ MachineBasicBlock::iterator MBBI = MI.getIterator();
+ Register Dest = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ bool IsDead = MI.getOperand(0).isDead();
+ bool IsKill = MI.getOperand(1).isKill();
+ unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit;
+ assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
+ BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
+ MachineInstr *InsMI =
+ BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
+ .addReg(InRegLEA, RegState::Define, SubReg)
+ .addReg(Src, getKillRegState(IsKill));
+
+ MachineInstrBuilder MIB =
+ BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
+ switch (MIOpc) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::SHL8ri:
+ case X86::SHL16ri: {
+ unsigned ShAmt = MI.getOperand(2).getImm();
+ MIB.addReg(0).addImm(1ULL << ShAmt)
+ .addReg(InRegLEA, RegState::Kill).addImm(0).addReg(0);
+ break;
+ }
+ case X86::INC8r:
+ case X86::INC16r:
+ addRegOffset(MIB, InRegLEA, true, 1);
+ break;
+ case X86::DEC8r:
+ case X86::DEC16r:
+ addRegOffset(MIB, InRegLEA, true, -1);
+ break;
+ case X86::ADD8ri:
+ case X86::ADD8ri_DB:
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ case X86::ADD16ri_DB:
+ case X86::ADD16ri8_DB:
+ addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
+ break;
+ case X86::ADD8rr:
+ case X86::ADD8rr_DB:
+ case X86::ADD16rr:
+ case X86::ADD16rr_DB: {
+ Register Src2 = MI.getOperand(2).getReg();
+ bool IsKill2 = MI.getOperand(2).isKill();
+ assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization");
+ unsigned InRegLEA2 = 0;
+ MachineInstr *InsMI2 = nullptr;
+ if (Src == Src2) {
+      // ADD8rr/ADD16rr killed %reg1028, %reg1028: both sources are the same
+      // register, so a single insert_subreg is enough.
+ addRegReg(MIB, InRegLEA, true, InRegLEA, false);
+ } else {
+ if (Subtarget.is64Bit())
+ InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+ else
+ InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+ // Build and insert into an implicit UNDEF value. This is OK because
+ // we will be shifting and then extracting the lower 8/16-bits.
+ BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA2);
+ InsMI2 = BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
+ .addReg(InRegLEA2, RegState::Define, SubReg)
+ .addReg(Src2, getKillRegState(IsKill2));
+ addRegReg(MIB, InRegLEA, true, InRegLEA2, true);
+ }
+ if (LV && IsKill2 && InsMI2)
+ LV->replaceKillInstruction(Src2, MI, *InsMI2);
+ break;
+ }
+ }
+
+ MachineInstr *NewMI = MIB;
+ MachineInstr *ExtMI =
+ BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
+ .addReg(Dest, RegState::Define | getDeadRegState(IsDead))
+ .addReg(OutRegLEA, RegState::Kill, SubReg);
+
+ if (LV) {
+ // Update live variables.
+ LV->getVarInfo(InRegLEA).Kills.push_back(NewMI);
+ LV->getVarInfo(OutRegLEA).Kills.push_back(ExtMI);
+ if (IsKill)
+ LV->replaceKillInstruction(Src, MI, *InsMI);
+ if (IsDead)
+ LV->replaceKillInstruction(Dest, MI, *ExtMI);
+ }
+
+ return ExtMI;
+}
+
+/// This method must be implemented by targets that
+/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
+/// may be able to convert a two-address instruction into a true
+/// three-address instruction on demand. This allows the X86 target (for
+/// example) to convert ADD and SHL instructions into LEA instructions if they
+/// would require register copies due to two-addressness.
+///
+/// This method returns a null pointer if the transformation cannot be
+/// performed, otherwise it returns the new instruction.
+///
+MachineInstr *
+X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineInstr &MI, LiveVariables *LV) const {
+  // The following opcodes also set the condition code register(s). Only
+  // convert them to the equivalent LEA if the condition code register defs
+  // are dead!
+ if (hasLiveCondCodeDef(MI))
+ return nullptr;
+
+ MachineFunction &MF = *MI.getParent()->getParent();
+  // All input instructions are two-address instructions. Get the known
+  // operands.
+ const MachineOperand &Dest = MI.getOperand(0);
+ const MachineOperand &Src = MI.getOperand(1);
+
+ // Ideally, operations with undef should be folded before we get here, but we
+ // can't guarantee it. Bail out because optimizing undefs is a waste of time.
+ // Without this, we have to forward undef state to new register operands to
+ // avoid machine verifier errors.
+ if (Src.isUndef())
+ return nullptr;
+ if (MI.getNumOperands() > 2)
+ if (MI.getOperand(2).isReg() && MI.getOperand(2).isUndef())
+ return nullptr;
+
+ MachineInstr *NewMI = nullptr;
+ bool Is64Bit = Subtarget.is64Bit();
+
+ bool Is8BitOp = false;
+ unsigned MIOpc = MI.getOpcode();
+ switch (MIOpc) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::SHL64ri: {
+ assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
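+    // e.g. 'shl $3, %reg' can be rewritten as 'lea (,%reg,8), %dst'.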
+
+ // LEA can't handle RSP.
+ if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
+ Src.getReg(), &X86::GR64_NOSPRegClass))
+ return nullptr;
+
+ NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
+ .add(Dest)
+ .addReg(0)
+ .addImm(1ULL << ShAmt)
+ .add(Src)
+ .addImm(0)
+ .addReg(0);
+ break;
+ }
+ case X86::SHL32ri: {
+ assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
+
+ unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
+
+ // LEA can't handle ESP.
+ bool isKill;
+ Register SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+ SrcReg, isKill, ImplicitOp, LV))
+ return nullptr;
+
+ MachineInstrBuilder MIB =
+ BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .addReg(0)
+ .addImm(1ULL << ShAmt)
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addImm(0)
+ .addReg(0);
+ if (ImplicitOp.getReg() != 0)
+ MIB.add(ImplicitOp);
+ NewMI = MIB;
+
+ break;
+ }
+ case X86::SHL8ri:
+ Is8BitOp = true;
+ LLVM_FALLTHROUGH;
+ case X86::SHL16ri: {
+ assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt))
+ return nullptr;
+ return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
+ }
+ case X86::INC64r:
+ case X86::INC32r: {
+ assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
+ unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r :
+ (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ bool isKill;
+ Register SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill,
+ ImplicitOp, LV))
+ return nullptr;
+
+ MachineInstrBuilder MIB =
+ BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .addReg(SrcReg, getKillRegState(isKill));
+ if (ImplicitOp.getReg() != 0)
+ MIB.add(ImplicitOp);
+
+ NewMI = addOffset(MIB, 1);
+ break;
+ }
+ case X86::DEC64r:
+ case X86::DEC32r: {
+ assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
+ unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
+ : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
+
+ bool isKill;
+ Register SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill,
+ ImplicitOp, LV))
+ return nullptr;
+
+ MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .addReg(SrcReg, getKillRegState(isKill));
+ if (ImplicitOp.getReg() != 0)
+ MIB.add(ImplicitOp);
+
+ NewMI = addOffset(MIB, -1);
+
+ break;
+ }
+ case X86::DEC8r:
+ case X86::INC8r:
+ Is8BitOp = true;
+ LLVM_FALLTHROUGH;
+ case X86::DEC16r:
+ case X86::INC16r:
+ return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
+ case X86::ADD64rr:
+ case X86::ADD64rr_DB:
+ case X86::ADD32rr:
+ case X86::ADD32rr_DB: {
+ assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Opc;
+ if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
+ Opc = X86::LEA64r;
+ else
+ Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
+
+ bool isKill;
+ Register SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
+ SrcReg, isKill, ImplicitOp, LV))
+ return nullptr;
+
+ const MachineOperand &Src2 = MI.getOperand(2);
+ bool isKill2;
+ Register SrcReg2;
+ MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
+ SrcReg2, isKill2, ImplicitOp2, LV))
+ return nullptr;
+
+ MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest);
+ if (ImplicitOp.getReg() != 0)
+ MIB.add(ImplicitOp);
+ if (ImplicitOp2.getReg() != 0)
+ MIB.add(ImplicitOp2);
+
+ NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
+ if (LV && Src2.isKill())
+ LV->replaceKillInstruction(SrcReg2, MI, *NewMI);
+ break;
+ }
+ case X86::ADD8rr:
+ case X86::ADD8rr_DB:
+ Is8BitOp = true;
+ LLVM_FALLTHROUGH;
+ case X86::ADD16rr:
+ case X86::ADD16rr_DB:
+ return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
+ case X86::ADD64ri32:
+ case X86::ADD64ri8:
+ case X86::ADD64ri32_DB:
+ case X86::ADD64ri8_DB:
+ assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
+ NewMI = addOffset(
+ BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
+ MI.getOperand(2));
+ break;
+ case X86::ADD32ri:
+ case X86::ADD32ri8:
+ case X86::ADD32ri_DB:
+ case X86::ADD32ri8_DB: {
+ assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
+
+ bool isKill;
+ Register SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
+ SrcReg, isKill, ImplicitOp, LV))
+ return nullptr;
+
+ MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .addReg(SrcReg, getKillRegState(isKill));
+ if (ImplicitOp.getReg() != 0)
+ MIB.add(ImplicitOp);
+
+ NewMI = addOffset(MIB, MI.getOperand(2));
+ break;
+ }
+ case X86::ADD8ri:
+ case X86::ADD8ri_DB:
+ Is8BitOp = true;
+ LLVM_FALLTHROUGH;
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ case X86::ADD16ri_DB:
+ case X86::ADD16ri8_DB:
+ return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
+ case X86::SUB8ri:
+ case X86::SUB16ri8:
+ case X86::SUB16ri:
+ /// FIXME: Support these similar to ADD8ri/ADD16ri*.
+ return nullptr;
+ case X86::SUB32ri8:
+ case X86::SUB32ri: {
+ if (!MI.getOperand(2).isImm())
+ return nullptr;
+ int64_t Imm = MI.getOperand(2).getImm();
+ if (!isInt<32>(-Imm))
+ return nullptr;
+
+ assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
+
+ bool isKill;
+ Register SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
+ SrcReg, isKill, ImplicitOp, LV))
+ return nullptr;
+
+ MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .addReg(SrcReg, getKillRegState(isKill));
+ if (ImplicitOp.getReg() != 0)
+ MIB.add(ImplicitOp);
+
+ NewMI = addOffset(MIB, -Imm);
+ break;
+ }
+
+ case X86::SUB64ri8:
+ case X86::SUB64ri32: {
+ if (!MI.getOperand(2).isImm())
+ return nullptr;
+ int64_t Imm = MI.getOperand(2).getImm();
+ if (!isInt<32>(-Imm))
+ return nullptr;
+
+ assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
+
+ MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(),
+ get(X86::LEA64r)).add(Dest).add(Src);
+ NewMI = addOffset(MIB, -Imm);
+ break;
+ }
+
+ case X86::VMOVDQU8Z128rmk:
+ case X86::VMOVDQU8Z256rmk:
+ case X86::VMOVDQU8Zrmk:
+ case X86::VMOVDQU16Z128rmk:
+ case X86::VMOVDQU16Z256rmk:
+ case X86::VMOVDQU16Zrmk:
+ case X86::VMOVDQU32Z128rmk: case X86::VMOVDQA32Z128rmk:
+ case X86::VMOVDQU32Z256rmk: case X86::VMOVDQA32Z256rmk:
+ case X86::VMOVDQU32Zrmk: case X86::VMOVDQA32Zrmk:
+ case X86::VMOVDQU64Z128rmk: case X86::VMOVDQA64Z128rmk:
+ case X86::VMOVDQU64Z256rmk: case X86::VMOVDQA64Z256rmk:
+ case X86::VMOVDQU64Zrmk: case X86::VMOVDQA64Zrmk:
+ case X86::VMOVUPDZ128rmk: case X86::VMOVAPDZ128rmk:
+ case X86::VMOVUPDZ256rmk: case X86::VMOVAPDZ256rmk:
+ case X86::VMOVUPDZrmk: case X86::VMOVAPDZrmk:
+ case X86::VMOVUPSZ128rmk: case X86::VMOVAPSZ128rmk:
+ case X86::VMOVUPSZ256rmk: case X86::VMOVAPSZ256rmk:
+ case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk:
+ case X86::VBROADCASTSDZ256rmk:
+ case X86::VBROADCASTSDZrmk:
+ case X86::VBROADCASTSSZ128rmk:
+ case X86::VBROADCASTSSZ256rmk:
+ case X86::VBROADCASTSSZrmk:
+ case X86::VPBROADCASTDZ128rmk:
+ case X86::VPBROADCASTDZ256rmk:
+ case X86::VPBROADCASTDZrmk:
+ case X86::VPBROADCASTQZ128rmk:
+ case X86::VPBROADCASTQZ256rmk:
+ case X86::VPBROADCASTQZrmk: {
+ unsigned Opc;
+ switch (MIOpc) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break;
+ case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break;
+ case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break;
+ case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break;
+ case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break;
+ case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break;
+ case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
+ case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
+ case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
+ case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
+ case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
+ case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
+ case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
+ case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
+ case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
+ case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
+ case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
+ case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
+ case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
+ case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
+ case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
+ case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
+ case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
+ case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
+ case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
+ case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
+ case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
+ case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
+ case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
+ case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
+ case X86::VBROADCASTSDZ256rmk: Opc = X86::VBLENDMPDZ256rmbk; break;
+ case X86::VBROADCASTSDZrmk: Opc = X86::VBLENDMPDZrmbk; break;
+ case X86::VBROADCASTSSZ128rmk: Opc = X86::VBLENDMPSZ128rmbk; break;
+ case X86::VBROADCASTSSZ256rmk: Opc = X86::VBLENDMPSZ256rmbk; break;
+ case X86::VBROADCASTSSZrmk: Opc = X86::VBLENDMPSZrmbk; break;
+ case X86::VPBROADCASTDZ128rmk: Opc = X86::VPBLENDMDZ128rmbk; break;
+ case X86::VPBROADCASTDZ256rmk: Opc = X86::VPBLENDMDZ256rmbk; break;
+ case X86::VPBROADCASTDZrmk: Opc = X86::VPBLENDMDZrmbk; break;
+ case X86::VPBROADCASTQZ128rmk: Opc = X86::VPBLENDMQZ128rmbk; break;
+ case X86::VPBROADCASTQZ256rmk: Opc = X86::VPBLENDMQZ256rmbk; break;
+ case X86::VPBROADCASTQZrmk: Opc = X86::VPBLENDMQZrmbk; break;
+ }
+
+ NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .add(MI.getOperand(2))
+ .add(Src)
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4))
+ .add(MI.getOperand(5))
+ .add(MI.getOperand(6))
+ .add(MI.getOperand(7));
+ break;
+ }
+
+ case X86::VMOVDQU8Z128rrk:
+ case X86::VMOVDQU8Z256rrk:
+ case X86::VMOVDQU8Zrrk:
+ case X86::VMOVDQU16Z128rrk:
+ case X86::VMOVDQU16Z256rrk:
+ case X86::VMOVDQU16Zrrk:
+ case X86::VMOVDQU32Z128rrk: case X86::VMOVDQA32Z128rrk:
+ case X86::VMOVDQU32Z256rrk: case X86::VMOVDQA32Z256rrk:
+ case X86::VMOVDQU32Zrrk: case X86::VMOVDQA32Zrrk:
+ case X86::VMOVDQU64Z128rrk: case X86::VMOVDQA64Z128rrk:
+ case X86::VMOVDQU64Z256rrk: case X86::VMOVDQA64Z256rrk:
+ case X86::VMOVDQU64Zrrk: case X86::VMOVDQA64Zrrk:
+ case X86::VMOVUPDZ128rrk: case X86::VMOVAPDZ128rrk:
+ case X86::VMOVUPDZ256rrk: case X86::VMOVAPDZ256rrk:
+ case X86::VMOVUPDZrrk: case X86::VMOVAPDZrrk:
+ case X86::VMOVUPSZ128rrk: case X86::VMOVAPSZ128rrk:
+ case X86::VMOVUPSZ256rrk: case X86::VMOVAPSZ256rrk:
+ case X86::VMOVUPSZrrk: case X86::VMOVAPSZrrk: {
+ unsigned Opc;
+ switch (MIOpc) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::VMOVDQU8Z128rrk: Opc = X86::VPBLENDMBZ128rrk; break;
+ case X86::VMOVDQU8Z256rrk: Opc = X86::VPBLENDMBZ256rrk; break;
+ case X86::VMOVDQU8Zrrk: Opc = X86::VPBLENDMBZrrk; break;
+ case X86::VMOVDQU16Z128rrk: Opc = X86::VPBLENDMWZ128rrk; break;
+ case X86::VMOVDQU16Z256rrk: Opc = X86::VPBLENDMWZ256rrk; break;
+ case X86::VMOVDQU16Zrrk: Opc = X86::VPBLENDMWZrrk; break;
+ case X86::VMOVDQU32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
+ case X86::VMOVDQU32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
+ case X86::VMOVDQU32Zrrk: Opc = X86::VPBLENDMDZrrk; break;
+ case X86::VMOVDQU64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
+ case X86::VMOVDQU64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
+ case X86::VMOVDQU64Zrrk: Opc = X86::VPBLENDMQZrrk; break;
+ case X86::VMOVUPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break;
+ case X86::VMOVUPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break;
+ case X86::VMOVUPDZrrk: Opc = X86::VBLENDMPDZrrk; break;
+ case X86::VMOVUPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break;
+ case X86::VMOVUPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break;
+ case X86::VMOVUPSZrrk: Opc = X86::VBLENDMPSZrrk; break;
+ case X86::VMOVDQA32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
+ case X86::VMOVDQA32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
+ case X86::VMOVDQA32Zrrk: Opc = X86::VPBLENDMDZrrk; break;
+ case X86::VMOVDQA64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
+ case X86::VMOVDQA64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
+ case X86::VMOVDQA64Zrrk: Opc = X86::VPBLENDMQZrrk; break;
+ case X86::VMOVAPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break;
+ case X86::VMOVAPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break;
+ case X86::VMOVAPDZrrk: Opc = X86::VBLENDMPDZrrk; break;
+ case X86::VMOVAPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break;
+ case X86::VMOVAPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break;
+ case X86::VMOVAPSZrrk: Opc = X86::VBLENDMPSZrrk; break;
+ }
+
+ NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .add(MI.getOperand(2))
+ .add(Src)
+ .add(MI.getOperand(3));
+ break;
+ }
+ }
+
+ if (!NewMI) return nullptr;
+
+ if (LV) { // Update live variables
+ if (Src.isKill())
+ LV->replaceKillInstruction(Src.getReg(), MI, *NewMI);
+ if (Dest.isDead())
+ LV->replaceKillInstruction(Dest.getReg(), MI, *NewMI);
+ }
+
+ MFI->insert(MI.getIterator(), NewMI); // Insert the new inst
+ return NewMI;
+}
+
+/// This determines which of the three possible cases of a three-source
+/// commute the given source indexes correspond to, taking any mask operands
+/// into account. Commuting a passthru operand is never allowed; the function
+/// asserts if the indexes do not match one of the cases below.
+/// Case 0 - Possible to commute the first and second operands.
+/// Case 1 - Possible to commute the first and third operands.
+/// Case 2 - Possible to commute the second and third operands.
+static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
+ unsigned SrcOpIdx2) {
+ // Put the lowest index to SrcOpIdx1 to simplify the checks below.
+ if (SrcOpIdx1 > SrcOpIdx2)
+ std::swap(SrcOpIdx1, SrcOpIdx2);
+
+ unsigned Op1 = 1, Op2 = 2, Op3 = 3;
+ if (X86II::isKMasked(TSFlags)) {
+ Op2++;
+ Op3++;
+ }
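+  // For a masked instruction the mask register occupies operand 2, so the
+  // second and third sources shift to operand indexes 3 and 4.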
+
+ if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2)
+ return 0;
+ if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3)
+ return 1;
+ if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
+ return 2;
+ llvm_unreachable("Unknown three src commute case.");
+}
+
+unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
+ const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
+ const X86InstrFMA3Group &FMA3Group) const {
+
+ unsigned Opc = MI.getOpcode();
+
+  // TODO: Commuting the 1st operand of FMA*_Int requires some additional
+  // analysis. The commute optimization is legal only if all users of FMA*_Int
+  // use only the lowest element of the FMA*_Int instruction. Such an analysis
+  // is not implemented yet, so commuting operand 1 of the intrinsic forms is
+  // simply not supported here. When such an analysis becomes available, this
+  // will be the right place to call it.
+ assert(!(FMA3Group.isIntrinsic() && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) &&
+ "Intrinsic instructions can't commute operand 1");
+
+ // Determine which case this commute is or if it can't be done.
+ unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
+ SrcOpIdx2);
+ assert(Case < 3 && "Unexpected case number!");
+
+ // Define the FMA forms mapping array that helps to map input FMA form
+ // to output FMA form to preserve the operation semantics after
+ // commuting the operands.
+ const unsigned Form132Index = 0;
+ const unsigned Form213Index = 1;
+ const unsigned Form231Index = 2;
+ static const unsigned FormMapping[][3] = {
+ // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
+ // FMA132 A, C, b; ==> FMA231 C, A, b;
+ // FMA213 B, A, c; ==> FMA213 A, B, c;
+ // FMA231 C, A, b; ==> FMA132 A, C, b;
+ { Form231Index, Form213Index, Form132Index },
+ // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
+ // FMA132 A, c, B; ==> FMA132 B, c, A;
+ // FMA213 B, a, C; ==> FMA231 C, a, B;
+ // FMA231 C, a, B; ==> FMA213 B, a, C;
+ { Form132Index, Form231Index, Form213Index },
+ // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
+ // FMA132 a, C, B; ==> FMA213 a, B, C;
+ // FMA213 b, A, C; ==> FMA132 b, C, A;
+ // FMA231 c, A, B; ==> FMA231 c, B, A;
+ { Form213Index, Form132Index, Form231Index }
+ };
+
+ unsigned FMAForms[3];
+ FMAForms[0] = FMA3Group.get132Opcode();
+ FMAForms[1] = FMA3Group.get213Opcode();
+ FMAForms[2] = FMA3Group.get231Opcode();
+ unsigned FormIndex;
+ for (FormIndex = 0; FormIndex < 3; FormIndex++)
+ if (Opc == FMAForms[FormIndex])
+ break;
+
+ // Everything is ready, just adjust the FMA opcode and return it.
+ FormIndex = FormMapping[Case][FormIndex];
+ return FMAForms[FormIndex];
+}
+
+static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
+ unsigned SrcOpIdx2) {
+ // Determine which case this commute is or if it can't be done.
+ unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
+ SrcOpIdx2);
+ assert(Case < 3 && "Unexpected case value!");
+
+ // For each case we need to swap two pairs of bits in the final immediate.
+ static const uint8_t SwapMasks[3][4] = {
+ { 0x04, 0x10, 0x08, 0x20 }, // Swap bits 2/4 and 3/5.
+ { 0x02, 0x10, 0x08, 0x40 }, // Swap bits 1/4 and 3/6.
+ { 0x02, 0x04, 0x20, 0x40 }, // Swap bits 1/2 and 5/6.
+ };
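+  // The VPTERNLOG immediate is an 8-entry truth table indexed by the bits
+  // (src1 << 2) | (src2 << 1) | src3; swapping two sources exchanges exactly
+  // the table entries whose indexes differ in those two bit positions, which
+  // is what the mask pairs above encode.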
+
+ uint8_t Imm = MI.getOperand(MI.getNumOperands()-1).getImm();
+ // Clear out the bits we are swapping.
+ uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
+ SwapMasks[Case][2] | SwapMasks[Case][3]);
+ // If the immediate had a bit of the pair set, then set the opposite bit.
+ if (Imm & SwapMasks[Case][0]) NewImm |= SwapMasks[Case][1];
+ if (Imm & SwapMasks[Case][1]) NewImm |= SwapMasks[Case][0];
+ if (Imm & SwapMasks[Case][2]) NewImm |= SwapMasks[Case][3];
+ if (Imm & SwapMasks[Case][3]) NewImm |= SwapMasks[Case][2];
+ MI.getOperand(MI.getNumOperands()-1).setImm(NewImm);
+}
+
+// Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
+// commuted.
+static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
+#define VPERM_CASES(Suffix) \
+ case X86::VPERMI2##Suffix##128rr: case X86::VPERMT2##Suffix##128rr: \
+ case X86::VPERMI2##Suffix##256rr: case X86::VPERMT2##Suffix##256rr: \
+ case X86::VPERMI2##Suffix##rr: case X86::VPERMT2##Suffix##rr: \
+ case X86::VPERMI2##Suffix##128rm: case X86::VPERMT2##Suffix##128rm: \
+ case X86::VPERMI2##Suffix##256rm: case X86::VPERMT2##Suffix##256rm: \
+ case X86::VPERMI2##Suffix##rm: case X86::VPERMT2##Suffix##rm: \
+ case X86::VPERMI2##Suffix##128rrkz: case X86::VPERMT2##Suffix##128rrkz: \
+ case X86::VPERMI2##Suffix##256rrkz: case X86::VPERMT2##Suffix##256rrkz: \
+ case X86::VPERMI2##Suffix##rrkz: case X86::VPERMT2##Suffix##rrkz: \
+ case X86::VPERMI2##Suffix##128rmkz: case X86::VPERMT2##Suffix##128rmkz: \
+ case X86::VPERMI2##Suffix##256rmkz: case X86::VPERMT2##Suffix##256rmkz: \
+ case X86::VPERMI2##Suffix##rmkz: case X86::VPERMT2##Suffix##rmkz:
+
+#define VPERM_CASES_BROADCAST(Suffix) \
+ VPERM_CASES(Suffix) \
+ case X86::VPERMI2##Suffix##128rmb: case X86::VPERMT2##Suffix##128rmb: \
+ case X86::VPERMI2##Suffix##256rmb: case X86::VPERMT2##Suffix##256rmb: \
+ case X86::VPERMI2##Suffix##rmb: case X86::VPERMT2##Suffix##rmb: \
+ case X86::VPERMI2##Suffix##128rmbkz: case X86::VPERMT2##Suffix##128rmbkz: \
+ case X86::VPERMI2##Suffix##256rmbkz: case X86::VPERMT2##Suffix##256rmbkz: \
+ case X86::VPERMI2##Suffix##rmbkz: case X86::VPERMT2##Suffix##rmbkz:
+
+ switch (Opcode) {
+ default: return false;
+ VPERM_CASES(B)
+ VPERM_CASES_BROADCAST(D)
+ VPERM_CASES_BROADCAST(PD)
+ VPERM_CASES_BROADCAST(PS)
+ VPERM_CASES_BROADCAST(Q)
+ VPERM_CASES(W)
+ return true;
+ }
+#undef VPERM_CASES_BROADCAST
+#undef VPERM_CASES
+}
+
+// Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
+// from the I opcode to the T opcode and vice versa.
+static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
+#define VPERM_CASES(Orig, New) \
+ case X86::Orig##128rr: return X86::New##128rr; \
+ case X86::Orig##128rrkz: return X86::New##128rrkz; \
+ case X86::Orig##128rm: return X86::New##128rm; \
+ case X86::Orig##128rmkz: return X86::New##128rmkz; \
+ case X86::Orig##256rr: return X86::New##256rr; \
+ case X86::Orig##256rrkz: return X86::New##256rrkz; \
+ case X86::Orig##256rm: return X86::New##256rm; \
+ case X86::Orig##256rmkz: return X86::New##256rmkz; \
+ case X86::Orig##rr: return X86::New##rr; \
+ case X86::Orig##rrkz: return X86::New##rrkz; \
+ case X86::Orig##rm: return X86::New##rm; \
+ case X86::Orig##rmkz: return X86::New##rmkz;
+
+#define VPERM_CASES_BROADCAST(Orig, New) \
+ VPERM_CASES(Orig, New) \
+ case X86::Orig##128rmb: return X86::New##128rmb; \
+ case X86::Orig##128rmbkz: return X86::New##128rmbkz; \
+ case X86::Orig##256rmb: return X86::New##256rmb; \
+ case X86::Orig##256rmbkz: return X86::New##256rmbkz; \
+ case X86::Orig##rmb: return X86::New##rmb; \
+ case X86::Orig##rmbkz: return X86::New##rmbkz;
+
+ switch (Opcode) {
+ VPERM_CASES(VPERMI2B, VPERMT2B)
+ VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
+ VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
+ VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
+ VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
+ VPERM_CASES(VPERMI2W, VPERMT2W)
+ VPERM_CASES(VPERMT2B, VPERMI2B)
+ VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
+ VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
+ VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
+ VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
+ VPERM_CASES(VPERMT2W, VPERMI2W)
+ }
+
+ llvm_unreachable("Unreachable!");
+#undef VPERM_CASES_BROADCAST
+#undef VPERM_CASES
+}
+
+MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const {
+ auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & {
+ if (NewMI)
+ return *MI.getParent()->getParent()->CloneMachineInstr(&MI);
+ return MI;
+ };
+
+ switch (MI.getOpcode()) {
+ case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
+ case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
+ case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
+ case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
+ case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
+ case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
+ unsigned Opc;
+ unsigned Size;
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
+ case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
+ case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
+ case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
+ case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
+ case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
+ }
+ unsigned Amt = MI.getOperand(3).getImm();
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ WorkingMI.getOperand(3).setImm(Size - Amt);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ case X86::PFSUBrr:
+ case X86::PFSUBRrr: {
+ // PFSUB x, y: x = x - y
+ // PFSUBR x, y: x = y - x
+ unsigned Opc =
+ (X86::PFSUBRrr == MI.getOpcode() ? X86::PFSUBrr : X86::PFSUBRrr);
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ case X86::BLENDPDrri:
+ case X86::BLENDPSrri:
+ case X86::VBLENDPDrri:
+ case X86::VBLENDPSrri:
+ // If we're optimizing for size, try to use MOVSD/MOVSS.
+ if (MI.getParent()->getParent()->getFunction().hasOptSize()) {
+ unsigned Mask, Opc;
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::BLENDPDrri: Opc = X86::MOVSDrr; Mask = 0x03; break;
+ case X86::BLENDPSrri: Opc = X86::MOVSSrr; Mask = 0x0F; break;
+ case X86::VBLENDPDrri: Opc = X86::VMOVSDrr; Mask = 0x03; break;
+ case X86::VBLENDPSrri: Opc = X86::VMOVSSrr; Mask = 0x0F; break;
+ }
+ if ((MI.getOperand(3).getImm() ^ Mask) == 1) {
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ WorkingMI.RemoveOperand(3);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI,
+ /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ }
+ LLVM_FALLTHROUGH;
+ case X86::PBLENDWrri:
+ case X86::VBLENDPDYrri:
+ case X86::VBLENDPSYrri:
+ case X86::VPBLENDDrri:
+ case X86::VPBLENDWrri:
+ case X86::VPBLENDDYrri:
+ case X86::VPBLENDWYrri:{
+ int8_t Mask;
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::BLENDPDrri: Mask = (int8_t)0x03; break;
+ case X86::BLENDPSrri: Mask = (int8_t)0x0F; break;
+ case X86::PBLENDWrri: Mask = (int8_t)0xFF; break;
+ case X86::VBLENDPDrri: Mask = (int8_t)0x03; break;
+ case X86::VBLENDPSrri: Mask = (int8_t)0x0F; break;
+ case X86::VBLENDPDYrri: Mask = (int8_t)0x0F; break;
+ case X86::VBLENDPSYrri: Mask = (int8_t)0xFF; break;
+ case X86::VPBLENDDrri: Mask = (int8_t)0x0F; break;
+ case X86::VPBLENDWrri: Mask = (int8_t)0xFF; break;
+ case X86::VPBLENDDYrri: Mask = (int8_t)0xFF; break;
+ case X86::VPBLENDWYrri: Mask = (int8_t)0xFF; break;
+ }
+ // Only the least significant bits of Imm are used.
+ // Using int8_t to ensure it will be sign extended to the int64_t that
+ // setImm takes in order to match isel behavior.
+ int8_t Imm = MI.getOperand(3).getImm() & Mask;
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.getOperand(3).setImm(Mask ^ Imm);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
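+ // For illustration (hypothetical immediate, tracing the blend case above):
+ // a VPBLENDDrri with Imm = 0x05 takes lanes 0 and 2 from the second source;
+ // with Mask = 0x0F the commuted form uses 0x0F ^ 0x05 = 0x0A, selecting the
+ // complementary lanes, which is what swapping the two sources requires.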
+ case X86::INSERTPSrr:
+ case X86::VINSERTPSrr:
+ case X86::VINSERTPSZrr: {
+ unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
+ unsigned ZMask = Imm & 15;
+ unsigned DstIdx = (Imm >> 4) & 3;
+ unsigned SrcIdx = (Imm >> 6) & 3;
+
+ // We can commute insertps if we zero 2 of the elements, the insertion is
+ // "inline" and we don't overwrite the insertion with a zero.
+ if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 &&
+ countPopulation(ZMask) == 2) {
+ unsigned AltIdx = findFirstSet((ZMask | (1 << DstIdx)) ^ 15);
+ assert(AltIdx < 4 && "Illegal insertion index");
+ unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask;
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(AltImm);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ return nullptr;
+ }
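+ // For illustration (hypothetical immediate, tracing the insertps case
+ // above): Imm = 0x06 gives SrcIdx = DstIdx = 0 and ZMask = 0b0110, so two
+ // lanes are zeroed and the insertion is "inline". AltIdx becomes 3 (the only
+ // lane that is neither zeroed nor the destination), and the commuted
+ // immediate is (3 << 6) | (3 << 4) | 0b0110 = 0xF6.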
+ case X86::MOVSDrr:
+ case X86::MOVSSrr:
+ case X86::VMOVSDrr:
+ case X86::VMOVSSrr:{
+ // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
+ if (Subtarget.hasSSE41()) {
+ unsigned Mask, Opc;
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break;
+ case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break;
+ case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break;
+ case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break;
+ }
+
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ WorkingMI.addOperand(MachineOperand::CreateImm(Mask));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+
+ // Convert to SHUFPD.
+ assert(MI.getOpcode() == X86::MOVSDrr &&
+ "Can only commute MOVSDrr without SSE4.1");
+
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(X86::SHUFPDrri));
+ WorkingMI.addOperand(MachineOperand::CreateImm(0x02));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ case X86::SHUFPDrri: {
+ // Commute to MOVSD.
+ assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!");
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(X86::MOVSDrr));
+ WorkingMI.RemoveOperand(3);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ case X86::PCLMULQDQrr:
+ case X86::VPCLMULQDQrr:
+ case X86::VPCLMULQDQYrr:
+ case X86::VPCLMULQDQZrr:
+ case X86::VPCLMULQDQZ128rr:
+ case X86::VPCLMULQDQZ256rr: {
+ // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
+ // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
+ unsigned Imm = MI.getOperand(3).getImm();
+ unsigned Src1Hi = Imm & 0x01;
+ unsigned Src2Hi = Imm & 0x10;
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
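+ // For illustration (hypothetical immediate, tracing the PCLMULQDQ case
+ // above): Imm = 0x01 multiplies SRC1's high quadword by SRC2's low quadword;
+ // after the sources are swapped the same product is described by
+ // Imm = (1 << 4) | 0 = 0x10.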
+ case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri:
+ case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri:
+ case X86::VPCMPBZrri: case X86::VPCMPUBZrri:
+ case X86::VPCMPDZ128rri: case X86::VPCMPUDZ128rri:
+ case X86::VPCMPDZ256rri: case X86::VPCMPUDZ256rri:
+ case X86::VPCMPDZrri: case X86::VPCMPUDZrri:
+ case X86::VPCMPQZ128rri: case X86::VPCMPUQZ128rri:
+ case X86::VPCMPQZ256rri: case X86::VPCMPUQZ256rri:
+ case X86::VPCMPQZrri: case X86::VPCMPUQZrri:
+ case X86::VPCMPWZ128rri: case X86::VPCMPUWZ128rri:
+ case X86::VPCMPWZ256rri: case X86::VPCMPUWZ256rri:
+ case X86::VPCMPWZrri: case X86::VPCMPUWZrri:
+ case X86::VPCMPBZ128rrik: case X86::VPCMPUBZ128rrik:
+ case X86::VPCMPBZ256rrik: case X86::VPCMPUBZ256rrik:
+ case X86::VPCMPBZrrik: case X86::VPCMPUBZrrik:
+ case X86::VPCMPDZ128rrik: case X86::VPCMPUDZ128rrik:
+ case X86::VPCMPDZ256rrik: case X86::VPCMPUDZ256rrik:
+ case X86::VPCMPDZrrik: case X86::VPCMPUDZrrik:
+ case X86::VPCMPQZ128rrik: case X86::VPCMPUQZ128rrik:
+ case X86::VPCMPQZ256rrik: case X86::VPCMPUQZ256rrik:
+ case X86::VPCMPQZrrik: case X86::VPCMPUQZrrik:
+ case X86::VPCMPWZ128rrik: case X86::VPCMPUWZ128rrik:
+ case X86::VPCMPWZ256rrik: case X86::VPCMPUWZ256rrik:
+ case X86::VPCMPWZrrik: case X86::VPCMPUWZrrik: {
+ // Flip comparison mode immediate (if necessary).
+ unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7;
+ Imm = X86::getSwappedVPCMPImm(Imm);
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ case X86::VPCOMBri: case X86::VPCOMUBri:
+ case X86::VPCOMDri: case X86::VPCOMUDri:
+ case X86::VPCOMQri: case X86::VPCOMUQri:
+ case X86::VPCOMWri: case X86::VPCOMUWri: {
+ // Flip comparison mode immediate (if necessary).
+ unsigned Imm = MI.getOperand(3).getImm() & 0x7;
+ Imm = X86::getSwappedVPCOMImm(Imm);
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.getOperand(3).setImm(Imm);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ case X86::VCMPSDZrr:
+ case X86::VCMPSSZrr:
+ case X86::VCMPPDZrri:
+ case X86::VCMPPSZrri:
+ case X86::VCMPPDZ128rri:
+ case X86::VCMPPSZ128rri:
+ case X86::VCMPPDZ256rri:
+ case X86::VCMPPSZ256rri:
+ case X86::VCMPPDZrrik:
+ case X86::VCMPPSZrrik:
+ case X86::VCMPPDZ128rrik:
+ case X86::VCMPPSZ128rrik:
+ case X86::VCMPPDZ256rrik:
+ case X86::VCMPPSZ256rrik: {
+ unsigned Imm =
+ MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f;
+ Imm = X86::getSwappedVCMPImm(Imm);
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.getOperand(MI.getNumExplicitOperands() - 1).setImm(Imm);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ case X86::VPERM2F128rr:
+ case X86::VPERM2I128rr: {
+ // Flip permute source immediate.
+ // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
+ // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
+ int8_t Imm = MI.getOperand(3).getImm() & 0xFF;
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.getOperand(3).setImm(Imm ^ 0x22);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
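+ // For illustration (hypothetical immediate, tracing the VPERM2 case above):
+ // Imm = 0x21 selects Op0.hi for the low half and Op1.lo for the high half;
+ // flipping the operand-select bits with Imm ^ 0x22 = 0x03 picks the same
+ // halves from the swapped operands.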
+ case X86::MOVHLPSrr:
+ case X86::UNPCKHPDrr:
+ case X86::VMOVHLPSrr:
+ case X86::VUNPCKHPDrr:
+ case X86::VMOVHLPSZrr:
+ case X86::VUNPCKHPDZ128rr: {
+ assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!");
+
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break;
+ case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break;
+ case X86::VMOVHLPSrr: Opc = X86::VUNPCKHPDrr; break;
+ case X86::VUNPCKHPDrr: Opc = X86::VMOVHLPSrr; break;
+ case X86::VMOVHLPSZrr: Opc = X86::VUNPCKHPDZ128rr; break;
+ case X86::VUNPCKHPDZ128rr: Opc = X86::VMOVHLPSZrr; break;
+ }
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr: {
+ auto &WorkingMI = cloneIfNew(MI);
+ unsigned OpNo = MI.getDesc().getNumOperands() - 1;
+ X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
+ WorkingMI.getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
+ case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
+ case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
+ case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
+ case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
+ case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
+ case X86::VPTERNLOGDZrrik:
+ case X86::VPTERNLOGDZ128rrik:
+ case X86::VPTERNLOGDZ256rrik:
+ case X86::VPTERNLOGQZrrik:
+ case X86::VPTERNLOGQZ128rrik:
+ case X86::VPTERNLOGQZ256rrik:
+ case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
+ case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
+ case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
+ case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
+ case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
+ case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
+ case X86::VPTERNLOGDZ128rmbi:
+ case X86::VPTERNLOGDZ256rmbi:
+ case X86::VPTERNLOGDZrmbi:
+ case X86::VPTERNLOGQZ128rmbi:
+ case X86::VPTERNLOGQZ256rmbi:
+ case X86::VPTERNLOGQZrmbi:
+ case X86::VPTERNLOGDZ128rmbikz:
+ case X86::VPTERNLOGDZ256rmbikz:
+ case X86::VPTERNLOGDZrmbikz:
+ case X86::VPTERNLOGQZ128rmbikz:
+ case X86::VPTERNLOGQZ256rmbikz:
+ case X86::VPTERNLOGQZrmbikz: {
+ auto &WorkingMI = cloneIfNew(MI);
+ commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ default: {
+ if (isCommutableVPERMV3Instruction(MI.getOpcode())) {
+ unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode());
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+
+ const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(),
+ MI.getDesc().TSFlags);
+ if (FMA3Group) {
+ unsigned Opc =
+ getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+ }
+ }
+}
+
+bool
+X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
+ unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2,
+ bool IsIntrinsic) const {
+ uint64_t TSFlags = MI.getDesc().TSFlags;
+
+ unsigned FirstCommutableVecOp = 1;
+ unsigned LastCommutableVecOp = 3;
+ unsigned KMaskOp = -1U;
+ if (X86II::isKMasked(TSFlags)) {
+ // For k-zero-masked operations it is Ok to commute the first vector
+ // operand, unless this is an intrinsic instruction.
+ // For regular k-masked operations a conservative choice is done as the
+ // elements of the first vector operand, for which the corresponding bit
+ // in the k-mask operand is set to 0, are copied to the result of the
+ // instruction.
+ // TODO/FIXME: The commute still may be legal if it is known that the
+ // k-mask operand is set to either all ones or all zeroes.
+ // It is also Ok to commute the 1st operand if all users of MI use only
+ // the elements enabled by the k-mask operand. For example,
+ // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
+ // : v1[i];
+ // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
+ // // Ok, to commute v1 in FMADD213PSZrk.
+
+ // The k-mask operand has index = 2 for masked and zero-masked operations.
+ KMaskOp = 2;
+
+ // The operand with index = 1 is used as a source for those elements for
+ // which the corresponding bit in the k-mask is set to 0.
+ if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic)
+ FirstCommutableVecOp = 3;
+
+ LastCommutableVecOp++;
+ } else if (IsIntrinsic) {
+ // Commuting the first operand of an intrinsic instruction isn't possible
+ // unless we can prove that only the lowest element of the result is used.
+ FirstCommutableVecOp = 2;
+ }
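+
+ // For illustration (tracing the index selection above): a merge-masked FMA
+ // such as VFMADD213PSZrk has operands (dst, tied src1, k-mask, src2, src3),
+ // so KMaskOp = 2, FirstCommutableVecOp = 3 and LastCommutableVecOp = 4, and
+ // only the last two vector sources may be swapped. For the zero-masked form
+ // the tied first source is commutable as well, giving the range [1, 4] minus
+ // the k-mask operand.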
+
+ if (isMem(MI, LastCommutableVecOp))
+ LastCommutableVecOp--;
+
+ // Only the first RegOpsNum operands are commutable.
+ // Also, the value 'CommuteAnyOperandIndex' is valid here as it means
+ // that the operand is not specified/fixed.
+ if (SrcOpIdx1 != CommuteAnyOperandIndex &&
+ (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
+ SrcOpIdx1 == KMaskOp))
+ return false;
+ if (SrcOpIdx2 != CommuteAnyOperandIndex &&
+ (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
+ SrcOpIdx2 == KMaskOp))
+ return false;
+
+ // Look for two different register operands assumed to be commutable
+ // regardless of the FMA opcode. The FMA opcode is adjusted later.
+ if (SrcOpIdx1 == CommuteAnyOperandIndex ||
+ SrcOpIdx2 == CommuteAnyOperandIndex) {
+ unsigned CommutableOpIdx2 = SrcOpIdx2;
+
+ // At least one of the operands to be commuted is not specified and
+ // this method is free to choose appropriate commutable operands.
+ if (SrcOpIdx1 == SrcOpIdx2)
+ // Neither of the operands is fixed. By default, set one of the
+ // commutable operands to the last register operand of the instruction.
+ CommutableOpIdx2 = LastCommutableVecOp;
+ else if (SrcOpIdx2 == CommuteAnyOperandIndex)
+ // Only one of operands is not fixed.
+ CommutableOpIdx2 = SrcOpIdx1;
+
+ // CommutableOpIdx2 is well defined now. Let's choose another commutable
+ // operand and assign its index to CommutableOpIdx1.
+ Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
+
+ unsigned CommutableOpIdx1;
+ for (CommutableOpIdx1 = LastCommutableVecOp;
+ CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
+ // Just ignore and skip the k-mask operand.
+ if (CommutableOpIdx1 == KMaskOp)
+ continue;
+
+ // The commuted operands must have different registers.
+ // Otherwise, the commute transformation does not change anything and
+ // is useless then.
+ if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg())
+ break;
+ }
+
+ // No appropriate commutable operands were found.
+ if (CommutableOpIdx1 < FirstCommutableVecOp)
+ return false;
+
+ // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2
+ // to return those values.
+ if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
+ CommutableOpIdx1, CommutableOpIdx2))
+ return false;
+ }
+
+ return true;
+}
+
+bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
+ unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const {
+ const MCInstrDesc &Desc = MI.getDesc();
+ if (!Desc.isCommutable())
+ return false;
+
+ switch (MI.getOpcode()) {
+ case X86::CMPSDrr:
+ case X86::CMPSSrr:
+ case X86::CMPPDrri:
+ case X86::CMPPSrri:
+ case X86::VCMPSDrr:
+ case X86::VCMPSSrr:
+ case X86::VCMPPDrri:
+ case X86::VCMPPSrri:
+ case X86::VCMPPDYrri:
+ case X86::VCMPPSYrri:
+ case X86::VCMPSDZrr:
+ case X86::VCMPSSZrr:
+ case X86::VCMPPDZrri:
+ case X86::VCMPPSZrri:
+ case X86::VCMPPDZ128rri:
+ case X86::VCMPPSZ128rri:
+ case X86::VCMPPDZ256rri:
+ case X86::VCMPPSZ256rri:
+ case X86::VCMPPDZrrik:
+ case X86::VCMPPSZrrik:
+ case X86::VCMPPDZ128rrik:
+ case X86::VCMPPSZ128rrik:
+ case X86::VCMPPDZ256rrik:
+ case X86::VCMPPSZ256rrik: {
+ unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0;
+
+ // Float comparison can be safely commuted for
+ // Ordered/Unordered/Equal/NotEqual tests
+ unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7;
+ switch (Imm) {
+ default:
+ // EVEX versions can be commuted.
+ if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX)
+ break;
+ return false;
+ case 0x00: // EQUAL
+ case 0x03: // UNORDERED
+ case 0x04: // NOT EQUAL
+ case 0x07: // ORDERED
+ break;
+ }
+
+ // The indices of the commutable operands are 1 and 2 (or 2 and 3
+ // when masked).
+ // Assign them to the returned operand indices here.
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
+ 2 + OpOffset);
+ }
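+ // For illustration (tracing the check above): CMPPSrri with Imm = 0x00 (EQ)
+ // is symmetric and can simply have its operands swapped, while Imm = 0x01
+ // (LT) is rejected for the SSE/AVX forms; the EVEX forms are still accepted
+ // because their immediate is rewritten via getSwappedVCMPImm when the
+ // commute is actually performed.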
+ case X86::MOVSSrr:
+ // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can
+ // form sse4.1 blend. We assume VMOVSSrr/VMOVSDrr is always commutable since
+ // AVX implies sse4.1.
+ if (Subtarget.hasSSE41())
+ return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ return false;
+ case X86::SHUFPDrri:
+ // We can commute this to MOVSD.
+ if (MI.getOperand(3).getImm() == 0x02)
+ return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ return false;
+ case X86::MOVHLPSrr:
+ case X86::UNPCKHPDrr:
+ case X86::VMOVHLPSrr:
+ case X86::VUNPCKHPDrr:
+ case X86::VMOVHLPSZrr:
+ case X86::VUNPCKHPDZ128rr:
+ if (Subtarget.hasSSE2())
+ return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ return false;
+ case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
+ case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
+ case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
+ case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
+ case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
+ case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
+ case X86::VPTERNLOGDZrrik:
+ case X86::VPTERNLOGDZ128rrik:
+ case X86::VPTERNLOGDZ256rrik:
+ case X86::VPTERNLOGQZrrik:
+ case X86::VPTERNLOGQZ128rrik:
+ case X86::VPTERNLOGQZ256rrik:
+ case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
+ case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
+ case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
+ case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
+ case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
+ case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
+ case X86::VPTERNLOGDZ128rmbi:
+ case X86::VPTERNLOGDZ256rmbi:
+ case X86::VPTERNLOGDZrmbi:
+ case X86::VPTERNLOGQZ128rmbi:
+ case X86::VPTERNLOGQZ256rmbi:
+ case X86::VPTERNLOGQZrmbi:
+ case X86::VPTERNLOGDZ128rmbikz:
+ case X86::VPTERNLOGDZ256rmbikz:
+ case X86::VPTERNLOGDZrmbikz:
+ case X86::VPTERNLOGQZ128rmbikz:
+ case X86::VPTERNLOGQZ256rmbikz:
+ case X86::VPTERNLOGQZrmbikz:
+ return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ case X86::VPDPWSSDYrr:
+ case X86::VPDPWSSDrr:
+ case X86::VPDPWSSDSYrr:
+ case X86::VPDPWSSDSrr:
+ case X86::VPDPWSSDZ128r:
+ case X86::VPDPWSSDZ128rk:
+ case X86::VPDPWSSDZ128rkz:
+ case X86::VPDPWSSDZ256r:
+ case X86::VPDPWSSDZ256rk:
+ case X86::VPDPWSSDZ256rkz:
+ case X86::VPDPWSSDZr:
+ case X86::VPDPWSSDZrk:
+ case X86::VPDPWSSDZrkz:
+ case X86::VPDPWSSDSZ128r:
+ case X86::VPDPWSSDSZ128rk:
+ case X86::VPDPWSSDSZ128rkz:
+ case X86::VPDPWSSDSZ256r:
+ case X86::VPDPWSSDSZ256rk:
+ case X86::VPDPWSSDSZ256rkz:
+ case X86::VPDPWSSDSZr:
+ case X86::VPDPWSSDSZrk:
+ case X86::VPDPWSSDSZrkz:
+ case X86::VPMADD52HUQZ128r:
+ case X86::VPMADD52HUQZ128rk:
+ case X86::VPMADD52HUQZ128rkz:
+ case X86::VPMADD52HUQZ256r:
+ case X86::VPMADD52HUQZ256rk:
+ case X86::VPMADD52HUQZ256rkz:
+ case X86::VPMADD52HUQZr:
+ case X86::VPMADD52HUQZrk:
+ case X86::VPMADD52HUQZrkz:
+ case X86::VPMADD52LUQZ128r:
+ case X86::VPMADD52LUQZ128rk:
+ case X86::VPMADD52LUQZ128rkz:
+ case X86::VPMADD52LUQZ256r:
+ case X86::VPMADD52LUQZ256rk:
+ case X86::VPMADD52LUQZ256rkz:
+ case X86::VPMADD52LUQZr:
+ case X86::VPMADD52LUQZrk:
+ case X86::VPMADD52LUQZrkz: {
+ unsigned CommutableOpIdx1 = 2;
+ unsigned CommutableOpIdx2 = 3;
+ if (X86II::isKMasked(Desc.TSFlags)) {
+ // Skip the mask register.
+ ++CommutableOpIdx1;
+ ++CommutableOpIdx2;
+ }
+ if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
+ CommutableOpIdx1, CommutableOpIdx2))
+ return false;
+ if (!MI.getOperand(SrcOpIdx1).isReg() ||
+ !MI.getOperand(SrcOpIdx2).isReg())
+ // No idea.
+ return false;
+ return true;
+ }
+
+ default:
+ const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(),
+ MI.getDesc().TSFlags);
+ if (FMA3Group)
+ return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2,
+ FMA3Group->isIntrinsic());
+
+ // Handle masked instructions, since we need to skip over the mask input
+ // and the preserved input.
+ if (X86II::isKMasked(Desc.TSFlags)) {
+ // First assume that the first input is the mask operand and skip past it.
+ unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
+ unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
+ // Check if the first input is tied. If there isn't one then we only
+ // need to skip the mask operand which we did above.
+ if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
+ MCOI::TIED_TO) != -1)) {
+ // If this is zero masking instruction with a tied operand, we need to
+ // move the first index back to the first input since this must
+ // be a 3 input instruction and we want the first two non-mask inputs.
+ // Otherwise this is a 2 input instruction with a preserved input and
+ // mask, so we need to move the indices to skip one more input.
+ if (X86II::isKMergeMasked(Desc.TSFlags)) {
+ ++CommutableOpIdx1;
+ ++CommutableOpIdx2;
+ } else {
+ --CommutableOpIdx1;
+ }
+ }
+
+ if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
+ CommutableOpIdx1, CommutableOpIdx2))
+ return false;
+
+ if (!MI.getOperand(SrcOpIdx1).isReg() ||
+ !MI.getOperand(SrcOpIdx2).isReg())
+ // No idea.
+ return false;
+ return true;
+ }
+
+ return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ }
+ return false;
+}
+
+X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default: return X86::COND_INVALID;
+ case X86::JCC_1:
+ return static_cast<X86::CondCode>(
+ MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
+ }
+}
+
+/// Return condition code of a SETCC opcode.
+X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default: return X86::COND_INVALID;
+ case X86::SETCCr: case X86::SETCCm:
+ return static_cast<X86::CondCode>(
+ MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
+ }
+}
+
+/// Return condition code of a CMov opcode.
+X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default: return X86::COND_INVALID;
+ case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr:
+ case X86::CMOV16rm: case X86::CMOV32rm: case X86::CMOV64rm:
+ return static_cast<X86::CondCode>(
+ MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
+ }
+}
+
+/// Return the inverse of the specified condition,
+/// e.g. turning COND_E to COND_NE.
+X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
+ switch (CC) {
+ default: llvm_unreachable("Illegal condition code!");
+ case X86::COND_E: return X86::COND_NE;
+ case X86::COND_NE: return X86::COND_E;
+ case X86::COND_L: return X86::COND_GE;
+ case X86::COND_LE: return X86::COND_G;
+ case X86::COND_G: return X86::COND_LE;
+ case X86::COND_GE: return X86::COND_L;
+ case X86::COND_B: return X86::COND_AE;
+ case X86::COND_BE: return X86::COND_A;
+ case X86::COND_A: return X86::COND_BE;
+ case X86::COND_AE: return X86::COND_B;
+ case X86::COND_S: return X86::COND_NS;
+ case X86::COND_NS: return X86::COND_S;
+ case X86::COND_P: return X86::COND_NP;
+ case X86::COND_NP: return X86::COND_P;
+ case X86::COND_O: return X86::COND_NO;
+ case X86::COND_NO: return X86::COND_O;
+ case X86::COND_NE_OR_P: return X86::COND_E_AND_NP;
+ case X86::COND_E_AND_NP: return X86::COND_NE_OR_P;
+ }
+}
+
+/// Assuming the flags are set by MI(a,b), return the condition code if we
+/// modify the instructions such that flags are set by MI(b,a).
+static X86::CondCode getSwappedCondition(X86::CondCode CC) {
+ switch (CC) {
+ default: return X86::COND_INVALID;
+ case X86::COND_E: return X86::COND_E;
+ case X86::COND_NE: return X86::COND_NE;
+ case X86::COND_L: return X86::COND_G;
+ case X86::COND_LE: return X86::COND_GE;
+ case X86::COND_G: return X86::COND_L;
+ case X86::COND_GE: return X86::COND_LE;
+ case X86::COND_B: return X86::COND_A;
+ case X86::COND_BE: return X86::COND_AE;
+ case X86::COND_A: return X86::COND_B;
+ case X86::COND_AE: return X86::COND_BE;
+ }
+}
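+
+// For illustration (hypothetical use): if the flags were produced by
+// "CMP a, b" and the consumer tests COND_L (a < b), then after rewriting the
+// compare as "CMP b, a" the consumer must test getSwappedCondition(COND_L),
+// i.e. COND_G, to observe the same predicate.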
+
+std::pair<X86::CondCode, bool>
+X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
+ X86::CondCode CC = X86::COND_INVALID;
+ bool NeedSwap = false;
+ switch (Predicate) {
+ default: break;
+ // Floating-point Predicates
+ case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
+ case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
+ case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
+ case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
+ case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
+ case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
+ case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
+ case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
+ case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
+
+ // Integer Predicates
+ case CmpInst::ICMP_EQ: CC = X86::COND_E; break;
+ case CmpInst::ICMP_NE: CC = X86::COND_NE; break;
+ case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
+ case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
+ case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
+ case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
+ case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
+ case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
+ case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
+ case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
+ }
+
+ return std::make_pair(CC, NeedSwap);
+}
+
+/// Return a setcc opcode based on whether it has a memory operand.
+unsigned X86::getSETOpc(bool HasMemoryOperand) {
+ return HasMemoryOperand ? X86::SETCCm : X86::SETCCr;
+}
+
+/// Return a cmov opcode for the given register size in bytes, and operand type.
+unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand) {
+ switch(RegBytes) {
+ default: llvm_unreachable("Illegal register size!");
+ case 2: return HasMemoryOperand ? X86::CMOV16rm : X86::CMOV16rr;
+ case 4: return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV32rr;
+ case 8: return HasMemoryOperand ? X86::CMOV64rm : X86::CMOV64rr;
+ }
+}
+
+/// Get the VPCMP immediate for the given condition.
+unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) {
+ switch (CC) {
+ default: llvm_unreachable("Unexpected SETCC condition");
+ case ISD::SETNE: return 4;
+ case ISD::SETEQ: return 0;
+ case ISD::SETULT:
+ case ISD::SETLT: return 1;
+ case ISD::SETUGT:
+ case ISD::SETGT: return 6;
+ case ISD::SETUGE:
+ case ISD::SETGE: return 5;
+ case ISD::SETULE:
+ case ISD::SETLE: return 2;
+ }
+}
+
+/// Get the VPCMP immediate if the operands are swapped.
+unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
+ switch (Imm) {
+ default: llvm_unreachable("Unreachable!");
+ case 0x01: Imm = 0x06; break; // LT -> NLE
+ case 0x02: Imm = 0x05; break; // LE -> NLT
+ case 0x05: Imm = 0x02; break; // NLT -> LE
+ case 0x06: Imm = 0x01; break; // NLE -> LT
+ case 0x00: // EQ
+ case 0x03: // FALSE
+ case 0x04: // NE
+ case 0x07: // TRUE
+ break;
+ }
+
+ return Imm;
+}
+
+/// Get the VPCOM immediate if the operands are swapped.
+unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
+ switch (Imm) {
+ default: llvm_unreachable("Unreachable!");
+ case 0x00: Imm = 0x02; break; // LT -> GT
+ case 0x01: Imm = 0x03; break; // LE -> GE
+ case 0x02: Imm = 0x00; break; // GT -> LT
+ case 0x03: Imm = 0x01; break; // GE -> LE
+ case 0x04: // EQ
+ case 0x05: // NE
+ case 0x06: // FALSE
+ case 0x07: // TRUE
+ break;
+ }
+
+ return Imm;
+}
+
+/// Get the VCMP immediate if the operands are swapped.
+unsigned X86::getSwappedVCMPImm(unsigned Imm) {
+ // Only need the lower 2 bits to distinguish.
+ switch (Imm & 0x3) {
+ default: llvm_unreachable("Unreachable!");
+ case 0x00: case 0x03:
+ // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted.
+ break;
+ case 0x01: case 0x02:
+ // Need to toggle bits 3:0. Bit 4 stays the same.
+ Imm ^= 0xf;
+ break;
+ }
+
+ return Imm;
+}
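+
+// For illustration (hypothetical immediate): the ordered less-than predicate
+// 0x01 has low bits 0b01, so it is XORed with 0xf and becomes 0x0E, the
+// ordered greater-than predicate, which is the right test once the two
+// sources are exchanged; 0x00 (EQ) and 0x03 (UNORD) pass through unchanged.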
+
+bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ case X86::TCRETURNdi:
+ case X86::TCRETURNri:
+ case X86::TCRETURNmi:
+ case X86::TCRETURNdi64:
+ case X86::TCRETURNri64:
+ case X86::TCRETURNmi64:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool X86InstrInfo::canMakeTailCallConditional(
+ SmallVectorImpl<MachineOperand> &BranchCond,
+ const MachineInstr &TailCall) const {
+ if (TailCall.getOpcode() != X86::TCRETURNdi &&
+ TailCall.getOpcode() != X86::TCRETURNdi64) {
+ // Only direct calls can be done with a conditional branch.
+ return false;
+ }
+
+ const MachineFunction *MF = TailCall.getParent()->getParent();
+ if (Subtarget.isTargetWin64() && MF->hasWinCFI()) {
+ // Conditional tail calls confuse the Win64 unwinder.
+ return false;
+ }
+
+ assert(BranchCond.size() == 1);
+ if (BranchCond[0].getImm() > X86::LAST_VALID_COND) {
+ // Can't make a conditional tail call with this condition.
+ return false;
+ }
+
+ const X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+ if (X86FI->getTCReturnAddrDelta() != 0 ||
+ TailCall.getOperand(1).getImm() != 0) {
+ // A conditional tail call cannot do any stack adjustment.
+ return false;
+ }
+
+ return true;
+}
+
+void X86InstrInfo::replaceBranchWithTailCall(
+ MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &BranchCond,
+ const MachineInstr &TailCall) const {
+ assert(canMakeTailCallConditional(BranchCond, TailCall));
+
+ MachineBasicBlock::iterator I = MBB.end();
+ while (I != MBB.begin()) {
+ --I;
+ if (I->isDebugInstr())
+ continue;
+ if (!I->isBranch())
+ assert(0 && "Can't find the branch to replace!");
+
+ X86::CondCode CC = X86::getCondFromBranch(*I);
+ assert(BranchCond.size() == 1);
+ if (CC != BranchCond[0].getImm())
+ continue;
+
+ break;
+ }
+
+ unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc
+ : X86::TCRETURNdi64cc;
+
+ auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
+ MIB->addOperand(TailCall.getOperand(0)); // Destination.
+ MIB.addImm(0); // Stack offset (not used).
+ MIB->addOperand(BranchCond[0]); // Condition.
+ MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
+
+ // Add implicit uses and defs of all live regs potentially clobbered by the
+ // call. This way they still appear live across the call.
+ LivePhysRegs LiveRegs(getRegisterInfo());
+ LiveRegs.addLiveOuts(MBB);
+ SmallVector<std::pair<MCPhysReg, const MachineOperand *>, 8> Clobbers;
+ LiveRegs.stepForward(*MIB, Clobbers);
+ for (const auto &C : Clobbers) {
+ MIB.addReg(C.first, RegState::Implicit);
+ MIB.addReg(C.first, RegState::Implicit | RegState::Define);
+ }
+
+ I->eraseFromParent();
+}
+
+// Given a MBB and its TBB, find the FBB which was a fallthrough MBB (it may
+// not be a fallthrough MBB now due to layout changes). Return nullptr if the
+// fallthrough MBB cannot be identified.
+static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB,
+ MachineBasicBlock *TBB) {
+ // Look for non-EHPad successors other than TBB. If we find exactly one, it
+ // is the fallthrough MBB. If we find zero, then TBB is both the target MBB
+ // and fallthrough MBB. If we find more than one, we cannot identify the
+ // fallthrough MBB and should return nullptr.
+ MachineBasicBlock *FallthroughBB = nullptr;
+ for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) {
+ if ((*SI)->isEHPad() || (*SI == TBB && FallthroughBB))
+ continue;
+ // Return a nullptr if we found more than one fallthrough successor.
+ if (FallthroughBB && FallthroughBB != TBB)
+ return nullptr;
+ FallthroughBB = *SI;
+ }
+ return FallthroughBB;
+}
+
+bool X86InstrInfo::AnalyzeBranchImpl(
+ MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
+
+ // Start from the bottom of the block and work up, examining the
+ // terminator instructions.
+ MachineBasicBlock::iterator I = MBB.end();
+ MachineBasicBlock::iterator UnCondBrIter = MBB.end();
+ while (I != MBB.begin()) {
+ --I;
+ if (I->isDebugInstr())
+ continue;
+
+ // Working from the bottom, when we see a non-terminator instruction, we're
+ // done.
+ if (!isUnpredicatedTerminator(*I))
+ break;
+
+ // A terminator that isn't a branch can't easily be handled by this
+ // analysis.
+ if (!I->isBranch())
+ return true;
+
+ // Handle unconditional branches.
+ if (I->getOpcode() == X86::JMP_1) {
+ UnCondBrIter = I;
+
+ if (!AllowModify) {
+ TBB = I->getOperand(0).getMBB();
+ continue;
+ }
+
+ // If the block has any instructions after a JMP, delete them.
+ while (std::next(I) != MBB.end())
+ std::next(I)->eraseFromParent();
+
+ Cond.clear();
+ FBB = nullptr;
+
+ // Delete the JMP if it's equivalent to a fall-through.
+ if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+ TBB = nullptr;
+ I->eraseFromParent();
+ I = MBB.end();
+ UnCondBrIter = MBB.end();
+ continue;
+ }
+
+ // TBB is used to indicate the unconditional destination.
+ TBB = I->getOperand(0).getMBB();
+ continue;
+ }
+
+ // Handle conditional branches.
+ X86::CondCode BranchCode = X86::getCondFromBranch(*I);
+ if (BranchCode == X86::COND_INVALID)
+ return true; // Can't handle indirect branch.
+
+ // In practice we should never have an undef EFLAGS operand; if we do,
+ // abort here as we are not prepared to preserve the flag.
+ if (I->findRegisterUseOperand(X86::EFLAGS)->isUndef())
+ return true;
+
+ // Working from the bottom, handle the first conditional branch.
+ if (Cond.empty()) {
+ MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
+ if (AllowModify && UnCondBrIter != MBB.end() &&
+ MBB.isLayoutSuccessor(TargetBB)) {
+ // If we can modify the code and it ends in something like:
+ //
+ // jCC L1
+ // jmp L2
+ // L1:
+ // ...
+ // L2:
+ //
+ // Then we can change this to:
+ //
+ // jnCC L2
+ // L1:
+ // ...
+ // L2:
+ //
+ // Which is a bit more efficient.
+ // We conditionally jump to the fall-through block.
+ BranchCode = GetOppositeBranchCondition(BranchCode);
+ MachineBasicBlock::iterator OldInst = I;
+
+ BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JCC_1))
+ .addMBB(UnCondBrIter->getOperand(0).getMBB())
+ .addImm(BranchCode);
+ BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1))
+ .addMBB(TargetBB);
+
+ OldInst->eraseFromParent();
+ UnCondBrIter->eraseFromParent();
+
+ // Restart the analysis.
+ UnCondBrIter = MBB.end();
+ I = MBB.end();
+ continue;
+ }
+
+ FBB = TBB;
+ TBB = I->getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(BranchCode));
+ CondBranches.push_back(&*I);
+ continue;
+ }
+
+ // Handle subsequent conditional branches. Only handle the case where all
+ // conditional branches branch to the same destination and their condition
+ // opcodes fit one of the special multi-branch idioms.
+ assert(Cond.size() == 1);
+ assert(TBB);
+
+ // If the conditions are the same, we can leave them alone.
+ X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
+ auto NewTBB = I->getOperand(0).getMBB();
+ if (OldBranchCode == BranchCode && TBB == NewTBB)
+ continue;
+
+ // If they differ, see if they fit one of the known patterns. Theoretically,
+ // we could handle more patterns here, but we shouldn't expect to see them
+ // if instruction selection has done a reasonable job.
+ if (TBB == NewTBB &&
+ ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
+ (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
+ BranchCode = X86::COND_NE_OR_P;
+ } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
+ (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
+ if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB)))
+ return true;
+
+ // X86::COND_E_AND_NP usually has two different branch destinations.
+ //
+ // JP B1
+ // JE B2
+ // JMP B1
+ // B1:
+ // B2:
+ //
+ // Here this condition branches to B2 only if NP && E. It has another
+ // equivalent form:
+ //
+ // JNE B1
+ // JNP B2
+ // JMP B1
+ // B1:
+ // B2:
+ //
+ // Similarly it branches to B2 only if E && NP. That is why this condition
+ // is named with COND_E_AND_NP.
+ BranchCode = X86::COND_E_AND_NP;
+ } else
+ return true;
+
+ // Update the MachineOperand.
+ Cond[0].setImm(BranchCode);
+ CondBranches.push_back(&*I);
+ }
+
+ return false;
+}
+
+bool X86InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ SmallVector<MachineInstr *, 4> CondBranches;
+ return AnalyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
+}
+
+bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
+ MachineBranchPredicate &MBP,
+ bool AllowModify) const {
+ using namespace std::placeholders;
+
+ SmallVector<MachineOperand, 4> Cond;
+ SmallVector<MachineInstr *, 4> CondBranches;
+ if (AnalyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
+ AllowModify))
+ return true;
+
+ if (Cond.size() != 1)
+ return true;
+
+ assert(MBP.TrueDest && "expected!");
+
+ if (!MBP.FalseDest)
+ MBP.FalseDest = MBB.getNextNode();
+
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+ MachineInstr *ConditionDef = nullptr;
+ bool SingleUseCondition = true;
+
+ for (auto I = std::next(MBB.rbegin()), E = MBB.rend(); I != E; ++I) {
+ if (I->modifiesRegister(X86::EFLAGS, TRI)) {
+ ConditionDef = &*I;
+ break;
+ }
+
+ if (I->readsRegister(X86::EFLAGS, TRI))
+ SingleUseCondition = false;
+ }
+
+ if (!ConditionDef)
+ return true;
+
+ if (SingleUseCondition) {
+ for (auto *Succ : MBB.successors())
+ if (Succ->isLiveIn(X86::EFLAGS))
+ SingleUseCondition = false;
+ }
+
+ MBP.ConditionDef = ConditionDef;
+ MBP.SingleUseCondition = SingleUseCondition;
+
+ // Currently we only recognize the simple pattern:
+ //
+ // test %reg, %reg
+ // je %label
+ //
+ const unsigned TestOpcode =
+ Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
+
+ if (ConditionDef->getOpcode() == TestOpcode &&
+ ConditionDef->getNumOperands() == 3 &&
+ ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
+ (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
+ MBP.LHS = ConditionDef->getOperand(0);
+ MBP.RHS = MachineOperand::CreateImm(0);
+ MBP.Predicate = Cond[0].getImm() == X86::COND_NE
+ ? MachineBranchPredicate::PRED_NE
+ : MachineBranchPredicate::PRED_EQ;
+ return false;
+ }
+
+ return true;
+}
+
+unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved) const {
+ assert(!BytesRemoved && "code size not handled");
+
+ MachineBasicBlock::iterator I = MBB.end();
+ unsigned Count = 0;
+
+ while (I != MBB.begin()) {
+ --I;
+ if (I->isDebugInstr())
+ continue;
+ if (I->getOpcode() != X86::JMP_1 &&
+ X86::getCondFromBranch(*I) == X86::COND_INVALID)
+ break;
+ // Remove the branch.
+ I->eraseFromParent();
+ I = MBB.end();
+ ++Count;
+ }
+
+ return Count;
+}
+
+unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded) const {
+ // Shouldn't be a fall through.
+ assert(TBB && "insertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 1 || Cond.size() == 0) &&
+ "X86 branch conditions have one component!");
+ assert(!BytesAdded && "code size not handled");
+
+ if (Cond.empty()) {
+ // Unconditional branch?
+ assert(!FBB && "Unconditional branch with multiple successors!");
+ BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
+ return 1;
+ }
+
+ // If FBB is null, it is implied to be a fall-through block.
+ bool FallThru = FBB == nullptr;
+
+ // Conditional branch.
+ unsigned Count = 0;
+ X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
+ switch (CC) {
+ case X86::COND_NE_OR_P:
+ // Synthesize NE_OR_P with two branches.
+ BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NE);
+ ++Count;
+ BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_P);
+ ++Count;
+ break;
+ case X86::COND_E_AND_NP:
+ // Use the next block of MBB as FBB if it is null.
+ if (FBB == nullptr) {
+ FBB = getFallThroughMBB(&MBB, TBB);
+ assert(FBB && "MBB cannot be the last block in function when the false "
+ "body is a fall-through.");
+ }
+ // Synthesize COND_E_AND_NP with two branches.
+ BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(FBB).addImm(X86::COND_NE);
+ ++Count;
+ BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NP);
+ ++Count;
+ break;
+ default: {
+ BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(CC);
+ ++Count;
+ }
+ }
+ if (!FallThru) {
+ // Two-way Conditional branch. Insert the second branch.
+ BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
+ ++Count;
+ }
+ return Count;
+}
+
+bool X86InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
+ ArrayRef<MachineOperand> Cond,
+ Register DstReg, Register TrueReg,
+ Register FalseReg, int &CondCycles,
+ int &TrueCycles, int &FalseCycles) const {
+ // Not all subtargets have cmov instructions.
+ if (!Subtarget.hasCMov())
+ return false;
+ if (Cond.size() != 1)
+ return false;
+ // We cannot do the composite conditions, at least not in SSA form.
+ if ((X86::CondCode)Cond[0].getImm() > X86::LAST_VALID_COND)
+ return false;
+
+ // Check register classes.
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *RC =
+ RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+ if (!RC)
+ return false;
+
+ // We have cmov instructions for 16, 32, and 64 bit general purpose registers.
+ if (X86::GR16RegClass.hasSubClassEq(RC) ||
+ X86::GR32RegClass.hasSubClassEq(RC) ||
+ X86::GR64RegClass.hasSubClassEq(RC)) {
+ // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
+ // Bridge. Probably Ivy Bridge as well.
+ CondCycles = 2;
+ TrueCycles = 2;
+ FalseCycles = 2;
+ return true;
+ }
+
+ // Can't do vectors.
+ return false;
+}
+
+void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, Register DstReg,
+ ArrayRef<MachineOperand> Cond, Register TrueReg,
+ Register FalseReg) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
+ assert(Cond.size() == 1 && "Invalid Cond array");
+ unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8,
+ false /*HasMemoryOperand*/);
+ BuildMI(MBB, I, DL, get(Opc), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .addImm(Cond[0].getImm());
+}
+
+/// Test if the given register is a physical h register.
+static bool isHReg(unsigned Reg) {
+ return X86::GR8_ABCD_HRegClass.contains(Reg);
+}
+
+// Try and copy between VR128/VR64 and GR64 registers.
+static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
+ const X86Subtarget &Subtarget) {
+ bool HasAVX = Subtarget.hasAVX();
+ bool HasAVX512 = Subtarget.hasAVX512();
+
+ // SrcReg(MaskReg) -> DestReg(GR64)
+ // SrcReg(MaskReg) -> DestReg(GR32)
+
+ // All KMASK RegClasses hold the same k registers, so we can test against any of them.
+ if (X86::VK16RegClass.contains(SrcReg)) {
+ if (X86::GR64RegClass.contains(DestReg)) {
+ assert(Subtarget.hasBWI());
+ return X86::KMOVQrk;
+ }
+ if (X86::GR32RegClass.contains(DestReg))
+ return Subtarget.hasBWI() ? X86::KMOVDrk : X86::KMOVWrk;
+ }
+
+ // SrcReg(GR64) -> DestReg(MaskReg)
+ // SrcReg(GR32) -> DestReg(MaskReg)
+
+ // All KMASK RegClasses hold the same k registers, so we can test against any of them.
+ if (X86::VK16RegClass.contains(DestReg)) {
+ if (X86::GR64RegClass.contains(SrcReg)) {
+ assert(Subtarget.hasBWI());
+ return X86::KMOVQkr;
+ }
+ if (X86::GR32RegClass.contains(SrcReg))
+ return Subtarget.hasBWI() ? X86::KMOVDkr : X86::KMOVWkr;
+ }
+
+
+ // SrcReg(VR128) -> DestReg(GR64)
+ // SrcReg(VR64) -> DestReg(GR64)
+ // SrcReg(GR64) -> DestReg(VR128)
+ // SrcReg(GR64) -> DestReg(VR64)
+
+ if (X86::GR64RegClass.contains(DestReg)) {
+ if (X86::VR128XRegClass.contains(SrcReg))
+ // Copy from a VR128 register to a GR64 register.
+ return HasAVX512 ? X86::VMOVPQIto64Zrr :
+ HasAVX ? X86::VMOVPQIto64rr :
+ X86::MOVPQIto64rr;
+ if (X86::VR64RegClass.contains(SrcReg))
+ // Copy from a VR64 register to a GR64 register.
+ return X86::MMX_MOVD64from64rr;
+ } else if (X86::GR64RegClass.contains(SrcReg)) {
+ // Copy from a GR64 register to a VR128 register.
+ if (X86::VR128XRegClass.contains(DestReg))
+ return HasAVX512 ? X86::VMOV64toPQIZrr :
+ HasAVX ? X86::VMOV64toPQIrr :
+ X86::MOV64toPQIrr;
+ // Copy from a GR64 register to a VR64 register.
+ if (X86::VR64RegClass.contains(DestReg))
+ return X86::MMX_MOVD64to64rr;
+ }
+
+ // SrcReg(VR128) -> DestReg(GR32)
+ // SrcReg(GR32) -> DestReg(VR128)
+
+ if (X86::GR32RegClass.contains(DestReg) &&
+ X86::VR128XRegClass.contains(SrcReg))
+ // Copy from a VR128 register to a GR32 register.
+ return HasAVX512 ? X86::VMOVPDI2DIZrr :
+ HasAVX ? X86::VMOVPDI2DIrr :
+ X86::MOVPDI2DIrr;
+
+ if (X86::VR128XRegClass.contains(DestReg) &&
+ X86::GR32RegClass.contains(SrcReg))
+ // Copy from a GR32 register to a VR128 register.
+ return HasAVX512 ? X86::VMOVDI2PDIZrr :
+ HasAVX ? X86::VMOVDI2PDIrr :
+ X86::MOVDI2PDIrr;
+ return 0;
+}
+
+void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, MCRegister DestReg,
+ MCRegister SrcReg, bool KillSrc) const {
+ // First deal with the normal symmetric copies.
+ bool HasAVX = Subtarget.hasAVX();
+ bool HasVLX = Subtarget.hasVLX();
+ unsigned Opc = 0;
+ if (X86::GR64RegClass.contains(DestReg, SrcReg))
+ Opc = X86::MOV64rr;
+ else if (X86::GR32RegClass.contains(DestReg, SrcReg))
+ Opc = X86::MOV32rr;
+ else if (X86::GR16RegClass.contains(DestReg, SrcReg))
+ Opc = X86::MOV16rr;
+ else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
+ // Copying to or from a physical H register on x86-64 requires a NOREX
+ // move. Otherwise use a normal move.
+ if ((isHReg(DestReg) || isHReg(SrcReg)) &&
+ Subtarget.is64Bit()) {
+ Opc = X86::MOV8rr_NOREX;
+ // Both operands must be encodable without a REX prefix.
+ assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
+ "8-bit H register can not be copied outside GR8_NOREX");
+ } else
+ Opc = X86::MOV8rr;
+ }
+ else if (X86::VR64RegClass.contains(DestReg, SrcReg))
+ Opc = X86::MMX_MOVQ64rr;
+ else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
+ if (HasVLX)
+ Opc = X86::VMOVAPSZ128rr;
+ else if (X86::VR128RegClass.contains(DestReg, SrcReg))
+ Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
+ else {
+ // If this is an extended register and we don't have VLX, we need to use a
+ // 512-bit move.
+ Opc = X86::VMOVAPSZrr;
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_xmm,
+ &X86::VR512RegClass);
+ SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm,
+ &X86::VR512RegClass);
+ }
+ } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
+ if (HasVLX)
+ Opc = X86::VMOVAPSZ256rr;
+ else if (X86::VR256RegClass.contains(DestReg, SrcReg))
+ Opc = X86::VMOVAPSYrr;
+ else {
+ // If this is an extended register and we don't have VLX, we need to use a
+ // 512-bit move.
+ Opc = X86::VMOVAPSZrr;
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_ymm,
+ &X86::VR512RegClass);
+ SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm,
+ &X86::VR512RegClass);
+ }
+ } else if (X86::VR512RegClass.contains(DestReg, SrcReg))
+ Opc = X86::VMOVAPSZrr;
+ // All KMASK RegClasses hold the same k registers, so we can test against any of them.
+ else if (X86::VK16RegClass.contains(DestReg, SrcReg))
+ Opc = Subtarget.hasBWI() ? X86::KMOVQkk : X86::KMOVWkk;
+ if (!Opc)
+ Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
+
+ if (Opc) {
+ BuildMI(MBB, MI, DL, get(Opc), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) {
+ // FIXME: We use a fatal error here because historically LLVM has tried to
+ // lower some of these physreg copies and we want to ensure we get
+ // reasonable bug reports if someone encounters a case no other testing
+ // found. This path should be removed after the LLVM 7 release.
+ report_fatal_error("Unable to copy EFLAGS physical register!");
+ }
+
+ LLVM_DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to "
+ << RI.getName(DestReg) << '\n');
+ report_fatal_error("Cannot emit physreg copy instruction");
+}
+
+Optional<DestSourcePair>
+X86InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
+ if (MI.isMoveReg())
+ return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
+ return None;
+}
+
+static unsigned getLoadStoreRegOpcode(Register Reg,
+ const TargetRegisterClass *RC,
+ bool IsStackAligned,
+ const X86Subtarget &STI, bool load) {
+ bool HasAVX = STI.hasAVX();
+ bool HasAVX512 = STI.hasAVX512();
+ bool HasVLX = STI.hasVLX();
+
+ switch (STI.getRegisterInfo()->getSpillSize(*RC)) {
+ default:
+ llvm_unreachable("Unknown spill size");
+ case 1:
+ assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
+ if (STI.is64Bit())
+ // Copying to or from a physical H register on x86-64 requires a NOREX
+ // move. Otherwise use a normal move.
+ if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
+ return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
+ return load ? X86::MOV8rm : X86::MOV8mr;
+ case 2:
+ if (X86::VK16RegClass.hasSubClassEq(RC))
+ return load ? X86::KMOVWkm : X86::KMOVWmk;
+ assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
+ return load ? X86::MOV16rm : X86::MOV16mr;
+ case 4:
+ if (X86::GR32RegClass.hasSubClassEq(RC))
+ return load ? X86::MOV32rm : X86::MOV32mr;
+ if (X86::FR32XRegClass.hasSubClassEq(RC))
+ return load ?
+ (HasAVX512 ? X86::VMOVSSZrm_alt :
+ HasAVX ? X86::VMOVSSrm_alt :
+ X86::MOVSSrm_alt) :
+ (HasAVX512 ? X86::VMOVSSZmr :
+ HasAVX ? X86::VMOVSSmr :
+ X86::MOVSSmr);
+ if (X86::RFP32RegClass.hasSubClassEq(RC))
+ return load ? X86::LD_Fp32m : X86::ST_Fp32m;
+ if (X86::VK32RegClass.hasSubClassEq(RC)) {
+ assert(STI.hasBWI() && "KMOVD requires BWI");
+ return load ? X86::KMOVDkm : X86::KMOVDmk;
+ }
+ // All of these mask pair classes have the same spill size, so the same kind
+ // of kmov instructions can be used with all of them.
+ if (X86::VK1PAIRRegClass.hasSubClassEq(RC) ||
+ X86::VK2PAIRRegClass.hasSubClassEq(RC) ||
+ X86::VK4PAIRRegClass.hasSubClassEq(RC) ||
+ X86::VK8PAIRRegClass.hasSubClassEq(RC) ||
+ X86::VK16PAIRRegClass.hasSubClassEq(RC))
+ return load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE;
+ llvm_unreachable("Unknown 4-byte regclass");
+ case 8:
+ if (X86::GR64RegClass.hasSubClassEq(RC))
+ return load ? X86::MOV64rm : X86::MOV64mr;
+ if (X86::FR64XRegClass.hasSubClassEq(RC))
+ return load ?
+ (HasAVX512 ? X86::VMOVSDZrm_alt :
+ HasAVX ? X86::VMOVSDrm_alt :
+ X86::MOVSDrm_alt) :
+ (HasAVX512 ? X86::VMOVSDZmr :
+ HasAVX ? X86::VMOVSDmr :
+ X86::MOVSDmr);
+ if (X86::VR64RegClass.hasSubClassEq(RC))
+ return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
+ if (X86::RFP64RegClass.hasSubClassEq(RC))
+ return load ? X86::LD_Fp64m : X86::ST_Fp64m;
+ if (X86::VK64RegClass.hasSubClassEq(RC)) {
+ assert(STI.hasBWI() && "KMOVQ requires BWI");
+ return load ? X86::KMOVQkm : X86::KMOVQmk;
+ }
+ llvm_unreachable("Unknown 8-byte regclass");
+ case 10:
+ assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
+ return load ? X86::LD_Fp80m : X86::ST_FpP80m;
+ case 16: {
+ if (X86::VR128XRegClass.hasSubClassEq(RC)) {
+ // If stack is realigned we can use aligned stores.
+ if (IsStackAligned)
+ return load ?
+ (HasVLX ? X86::VMOVAPSZ128rm :
+ HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX :
+ HasAVX ? X86::VMOVAPSrm :
+ X86::MOVAPSrm):
+ (HasVLX ? X86::VMOVAPSZ128mr :
+ HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX :
+ HasAVX ? X86::VMOVAPSmr :
+ X86::MOVAPSmr);
+ else
+ return load ?
+ (HasVLX ? X86::VMOVUPSZ128rm :
+ HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX :
+ HasAVX ? X86::VMOVUPSrm :
+ X86::MOVUPSrm):
+ (HasVLX ? X86::VMOVUPSZ128mr :
+ HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX :
+ HasAVX ? X86::VMOVUPSmr :
+ X86::MOVUPSmr);
+ }
+ if (X86::BNDRRegClass.hasSubClassEq(RC)) {
+ if (STI.is64Bit())
+ return load ? X86::BNDMOV64rm : X86::BNDMOV64mr;
+ else
+ return load ? X86::BNDMOV32rm : X86::BNDMOV32mr;
+ }
+ llvm_unreachable("Unknown 16-byte regclass");
+ }
+ case 32:
+ assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
+ // If the stack is realigned, we can use aligned stores.
+ if (IsStackAligned)
+ return load ?
+ (HasVLX ? X86::VMOVAPSZ256rm :
+ HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX :
+ X86::VMOVAPSYrm) :
+ (HasVLX ? X86::VMOVAPSZ256mr :
+ HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX :
+ X86::VMOVAPSYmr);
+ else
+ return load ?
+ (HasVLX ? X86::VMOVUPSZ256rm :
+ HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX :
+ X86::VMOVUPSYrm) :
+ (HasVLX ? X86::VMOVUPSZ256mr :
+ HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX :
+ X86::VMOVUPSYmr);
+ case 64:
+ assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
+ assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
+ if (IsStackAligned)
+ return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
+ else
+ return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
+ }
+}
+
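+// Decompose the memory operand of MemI into an ExtAddrMode (base register,
+// index register, scale and displacement). Returns None when the base is a
+// frame index or the displacement is symbolic rather than an immediate.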
+Optional<ExtAddrMode>
+X86InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
+ const TargetRegisterInfo *TRI) const {
+ const MCInstrDesc &Desc = MemI.getDesc();
+ int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
+ if (MemRefBegin < 0)
+ return None;
+
+ MemRefBegin += X86II::getOperandBias(Desc);
+
+ auto &BaseOp = MemI.getOperand(MemRefBegin + X86::AddrBaseReg);
+ if (!BaseOp.isReg()) // Can be an MO_FrameIndex
+ return None;
+
+ const MachineOperand &DispMO = MemI.getOperand(MemRefBegin + X86::AddrDisp);
+ // Displacement can be symbolic
+ if (!DispMO.isImm())
+ return None;
+
+ ExtAddrMode AM;
+ AM.BaseReg = BaseOp.getReg();
+ AM.ScaledReg = MemI.getOperand(MemRefBegin + X86::AddrIndexReg).getReg();
+ AM.Scale = MemI.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm();
+ AM.Displacement = DispMO.getImm();
+ return AM;
+}
+
+bool X86InstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
+ const Register Reg,
+ int64_t &ImmVal) const {
+ if (MI.getOpcode() != X86::MOV32ri && MI.getOpcode() != X86::MOV64ri)
+ return false;
+ // Mov Src can be a global address.
+ if (!MI.getOperand(1).isImm() || MI.getOperand(0).getReg() != Reg)
+ return false;
+ ImmVal = MI.getOperand(1).getImm();
+ return true;
+}
+
+bool X86InstrInfo::preservesZeroValueInReg(
+ const MachineInstr *MI, const Register NullValueReg,
+ const TargetRegisterInfo *TRI) const {
+ if (!MI->modifiesRegister(NullValueReg, TRI))
+ return true;
+ switch (MI->getOpcode()) {
+ // Shifting a register that holds null right or left by an immediate leaves
+ // it null, i.e. rax = shl rax, X.
+ case X86::SHR64ri:
+ case X86::SHR32ri:
+ case X86::SHL64ri:
+ case X86::SHL32ri:
+ assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() &&
+ "expected for shift opcode!");
+ return MI->getOperand(0).getReg() == NullValueReg &&
+ MI->getOperand(1).getReg() == NullValueReg;
+ // Zero extend of a sub-reg of NullValueReg into itself does not change the
+ // null value.
+ case X86::MOV32rr:
+ return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) {
+ return TRI->isSubRegisterEq(NullValueReg, MO.getReg());
+ });
+ default:
+ return false;
+ }
+ llvm_unreachable("Should be handled above!");
+}
+
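+// Report the base operand and immediate offset of a simple memory access.
+// Only the [base + disp] form is handled: a scale other than 1 or a real
+// index register makes this return false. The width is taken from the
+// attached machine memory operand when present, otherwise it is reported as
+// 0 (see the FIXME below).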
+bool X86InstrInfo::getMemOperandsWithOffsetWidth(
+ const MachineInstr &MemOp, SmallVectorImpl<const MachineOperand *> &BaseOps,
+ int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+ const TargetRegisterInfo *TRI) const {
+ const MCInstrDesc &Desc = MemOp.getDesc();
+ int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
+ if (MemRefBegin < 0)
+ return false;
+
+ MemRefBegin += X86II::getOperandBias(Desc);
+
+ const MachineOperand *BaseOp =
+ &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
+ if (!BaseOp->isReg()) // Can be an MO_FrameIndex
+ return false;
+
+ if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
+ return false;
+
+ if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
+ X86::NoRegister)
+ return false;
+
+ const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp);
+
+ // Displacement can be symbolic
+ if (!DispMO.isImm())
+ return false;
+
+ Offset = DispMO.getImm();
+
+ if (!BaseOp->isReg())
+ return false;
+
+ OffsetIsScalable = false;
+ // FIXME: Relying on memoperands() may not be the right thing to do here.
+ // Check with the X86 maintainers, and fix it accordingly. For now it is ok,
+ // since there is no use of `Width` in the X86 back-end at the moment.
+ Width =
+ !MemOp.memoperands_empty() ? MemOp.memoperands().front()->getSize() : 0;
+ BaseOps.push_back(BaseOp);
+ return true;
+}
+
+static unsigned getStoreRegOpcode(Register SrcReg,
+ const TargetRegisterClass *RC,
+ bool IsStackAligned,
+ const X86Subtarget &STI) {
+ return getLoadStoreRegOpcode(SrcReg, RC, IsStackAligned, STI, false);
+}
+
+static unsigned getLoadRegOpcode(Register DestReg,
+ const TargetRegisterClass *RC,
+ bool IsStackAligned, const X86Subtarget &STI) {
+ return getLoadStoreRegOpcode(DestReg, RC, IsStackAligned, STI, true);
+}
+
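+// Spill SrcReg to the stack slot FrameIdx. AMX tile registers are special:
+// TILESTORED needs a stride in a GPR, so a fresh GR64_NOSP virtual register
+// is materialized with the constant 64 and patched in as the index register
+// of the frame address. TILECFG registers are saved with PSTTILECFG; all
+// other classes go through getStoreRegOpcode, using aligned opcodes when the
+// stack alignment suffices or the stack can be realigned.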
+void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ Register SrcReg, bool isKill, int FrameIdx,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ const MachineFunction &MF = *MBB.getParent();
+ assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
+ "Stack slot too small for store");
+ if (RC->getID() == X86::TILERegClassID) {
+ unsigned Opc = X86::TILESTORED;
+ // tilestored %tmm, (%sp, %idx)
+ MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
+ Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+ BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
+ MachineInstr *NewMI =
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
+ .addReg(SrcReg, getKillRegState(isKill));
+ MachineOperand &MO = NewMI->getOperand(2);
+ MO.setReg(VirtReg);
+ MO.setIsKill(true);
+ } else if (RC->getID() == X86::TILECFGRegClassID) {
+ unsigned Opc = X86::PSTTILECFG;
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
+ .addReg(SrcReg, getKillRegState(isKill));
+ } else {
+ unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
+ bool isAligned =
+ (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
+ RI.canRealignStack(MF);
+ unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
+ .addReg(SrcReg, getKillRegState(isKill));
+ }
+}
+
+void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ Register DestReg, int FrameIdx,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ if (RC->getID() == X86::TILERegClassID) {
+ unsigned Opc = X86::TILELOADD;
+ // tileloadd (%sp, %idx), %tmm
+ MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
+ Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+ MachineInstr *NewMI =
+ BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
+ NewMI = addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
+ FrameIdx);
+ MachineOperand &MO = NewMI->getOperand(3);
+ MO.setReg(VirtReg);
+ MO.setIsKill(true);
+ } else if (RC->getID() == X86::TILECFGRegClassID) {
+ unsigned Opc = X86::PLDTILECFG;
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
+ FrameIdx);
+ } else {
+ const MachineFunction &MF = *MBB.getParent();
+ unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
+ bool isAligned =
+ (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
+ RI.canRealignStack(MF);
+ unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
+ FrameIdx);
+ }
+}
+
+bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int &CmpMask,
+ int &CmpValue) const {
+ switch (MI.getOpcode()) {
+ default: break;
+ case X86::CMP64ri32:
+ case X86::CMP64ri8:
+ case X86::CMP32ri:
+ case X86::CMP32ri8:
+ case X86::CMP16ri:
+ case X86::CMP16ri8:
+ case X86::CMP8ri:
+ SrcReg = MI.getOperand(0).getReg();
+ SrcReg2 = 0;
+ if (MI.getOperand(1).isImm()) {
+ CmpMask = ~0;
+ CmpValue = MI.getOperand(1).getImm();
+ } else {
+ CmpMask = CmpValue = 0;
+ }
+ return true;
+ // A SUB can be used to perform a comparison.
+ case X86::SUB64rm:
+ case X86::SUB32rm:
+ case X86::SUB16rm:
+ case X86::SUB8rm:
+ SrcReg = MI.getOperand(1).getReg();
+ SrcReg2 = 0;
+ CmpMask = 0;
+ CmpValue = 0;
+ return true;
+ case X86::SUB64rr:
+ case X86::SUB32rr:
+ case X86::SUB16rr:
+ case X86::SUB8rr:
+ SrcReg = MI.getOperand(1).getReg();
+ SrcReg2 = MI.getOperand(2).getReg();
+ CmpMask = 0;
+ CmpValue = 0;
+ return true;
+ case X86::SUB64ri32:
+ case X86::SUB64ri8:
+ case X86::SUB32ri:
+ case X86::SUB32ri8:
+ case X86::SUB16ri:
+ case X86::SUB16ri8:
+ case X86::SUB8ri:
+ SrcReg = MI.getOperand(1).getReg();
+ SrcReg2 = 0;
+ if (MI.getOperand(2).isImm()) {
+ CmpMask = ~0;
+ CmpValue = MI.getOperand(2).getImm();
+ } else {
+ CmpMask = CmpValue = 0;
+ }
+ return true;
+ case X86::CMP64rr:
+ case X86::CMP32rr:
+ case X86::CMP16rr:
+ case X86::CMP8rr:
+ SrcReg = MI.getOperand(0).getReg();
+ SrcReg2 = MI.getOperand(1).getReg();
+ CmpMask = 0;
+ CmpValue = 0;
+ return true;
+ case X86::TEST8rr:
+ case X86::TEST16rr:
+ case X86::TEST32rr:
+ case X86::TEST64rr:
+ SrcReg = MI.getOperand(0).getReg();
+ if (MI.getOperand(1).getReg() != SrcReg)
+ return false;
+ // Compare against zero.
+ SrcReg2 = 0;
+ CmpMask = ~0;
+ CmpValue = 0;
+ return true;
+ }
+ return false;
+}
+
+/// Check whether the first instruction, whose only
+/// purpose is to update flags, can be made redundant.
+/// CMPrr can be made redundant by SUBrr if the operands are the same.
+/// This function can be extended later on.
+/// SrcReg, SrcReg2: register operands for FlagI.
+/// ImmValue: immediate for FlagI if it takes an immediate.
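+/// For example (illustrative operands), CMP32rr %a, %b is redundant with an
+/// earlier SUB32rr whose sources are %a and %b in either order (the caller
+/// later swaps the users' condition codes for the reversed order), and
+/// CMP32ri %a, imm is redundant with an earlier SUB32ri %a, imm.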
+inline static bool isRedundantFlagInstr(const MachineInstr &FlagI,
+ Register SrcReg, Register SrcReg2,
+ int ImmMask, int ImmValue,
+ const MachineInstr &OI) {
+ if (((FlagI.getOpcode() == X86::CMP64rr && OI.getOpcode() == X86::SUB64rr) ||
+ (FlagI.getOpcode() == X86::CMP32rr && OI.getOpcode() == X86::SUB32rr) ||
+ (FlagI.getOpcode() == X86::CMP16rr && OI.getOpcode() == X86::SUB16rr) ||
+ (FlagI.getOpcode() == X86::CMP8rr && OI.getOpcode() == X86::SUB8rr)) &&
+ ((OI.getOperand(1).getReg() == SrcReg &&
+ OI.getOperand(2).getReg() == SrcReg2) ||
+ (OI.getOperand(1).getReg() == SrcReg2 &&
+ OI.getOperand(2).getReg() == SrcReg)))
+ return true;
+
+ if (ImmMask != 0 &&
+ ((FlagI.getOpcode() == X86::CMP64ri32 &&
+ OI.getOpcode() == X86::SUB64ri32) ||
+ (FlagI.getOpcode() == X86::CMP64ri8 &&
+ OI.getOpcode() == X86::SUB64ri8) ||
+ (FlagI.getOpcode() == X86::CMP32ri && OI.getOpcode() == X86::SUB32ri) ||
+ (FlagI.getOpcode() == X86::CMP32ri8 &&
+ OI.getOpcode() == X86::SUB32ri8) ||
+ (FlagI.getOpcode() == X86::CMP16ri && OI.getOpcode() == X86::SUB16ri) ||
+ (FlagI.getOpcode() == X86::CMP16ri8 &&
+ OI.getOpcode() == X86::SUB16ri8) ||
+ (FlagI.getOpcode() == X86::CMP8ri && OI.getOpcode() == X86::SUB8ri)) &&
+ OI.getOperand(1).getReg() == SrcReg &&
+ OI.getOperand(2).getImm() == ImmValue)
+ return true;
+ return false;
+}
+
+/// Check whether the definition can be converted
+/// to remove a comparison against zero.
+inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag) {
+ NoSignFlag = false;
+
+ switch (MI.getOpcode()) {
+ default: return false;
+
+ // The shift instructions only modify ZF if their shift count is non-zero.
+ // N.B.: The processor truncates the shift count depending on the encoding.
+ case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri:case X86::SAR64ri:
+ case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri:case X86::SHR64ri:
+ return getTruncatedShiftCount(MI, 2) != 0;
+
+ // Some left shift instructions can be turned into LEA instructions but only
+ // if their flags aren't used. Avoid transforming such instructions.
+ case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri:case X86::SHL64ri:{
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (isTruncatedShiftCountForLEA(ShAmt)) return false;
+ return ShAmt != 0;
+ }
+
+ case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8:
+ case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8:
+ return getTruncatedShiftCount(MI, 3) != 0;
+
+ case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri:
+ case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8:
+ case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr:
+ case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm:
+ case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm:
+ case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r:
+ case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
+ case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8:
+ case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr:
+ case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm:
+ case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm:
+ case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r:
+ case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
+ case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8:
+ case X86::AND8ri: case X86::AND64rr: case X86::AND32rr:
+ case X86::AND16rr: case X86::AND8rr: case X86::AND64rm:
+ case X86::AND32rm: case X86::AND16rm: case X86::AND8rm:
+ case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri:
+ case X86::XOR32ri8: case X86::XOR16ri: case X86::XOR16ri8:
+ case X86::XOR8ri: case X86::XOR64rr: case X86::XOR32rr:
+ case X86::XOR16rr: case X86::XOR8rr: case X86::XOR64rm:
+ case X86::XOR32rm: case X86::XOR16rm: case X86::XOR8rm:
+ case X86::OR64ri32: case X86::OR64ri8: case X86::OR32ri:
+ case X86::OR32ri8: case X86::OR16ri: case X86::OR16ri8:
+ case X86::OR8ri: case X86::OR64rr: case X86::OR32rr:
+ case X86::OR16rr: case X86::OR8rr: case X86::OR64rm:
+ case X86::OR32rm: case X86::OR16rm: case X86::OR8rm:
+ case X86::ADC64ri32: case X86::ADC64ri8: case X86::ADC32ri:
+ case X86::ADC32ri8: case X86::ADC16ri: case X86::ADC16ri8:
+ case X86::ADC8ri: case X86::ADC64rr: case X86::ADC32rr:
+ case X86::ADC16rr: case X86::ADC8rr: case X86::ADC64rm:
+ case X86::ADC32rm: case X86::ADC16rm: case X86::ADC8rm:
+ case X86::SBB64ri32: case X86::SBB64ri8: case X86::SBB32ri:
+ case X86::SBB32ri8: case X86::SBB16ri: case X86::SBB16ri8:
+ case X86::SBB8ri: case X86::SBB64rr: case X86::SBB32rr:
+ case X86::SBB16rr: case X86::SBB8rr: case X86::SBB64rm:
+ case X86::SBB32rm: case X86::SBB16rm: case X86::SBB8rm:
+ case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
+ case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1:case X86::SAR64r1:
+ case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1:case X86::SHR64r1:
+ case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1:case X86::SHL64r1:
+ case X86::ANDN32rr: case X86::ANDN32rm:
+ case X86::ANDN64rr: case X86::ANDN64rm:
+ case X86::BLSI32rr: case X86::BLSI32rm:
+ case X86::BLSI64rr: case X86::BLSI64rm:
+ case X86::BLSMSK32rr:case X86::BLSMSK32rm:
+ case X86::BLSMSK64rr:case X86::BLSMSK64rm:
+ case X86::BLSR32rr: case X86::BLSR32rm:
+ case X86::BLSR64rr: case X86::BLSR64rm:
+ case X86::BZHI32rr: case X86::BZHI32rm:
+ case X86::BZHI64rr: case X86::BZHI64rm:
+ case X86::LZCNT16rr: case X86::LZCNT16rm:
+ case X86::LZCNT32rr: case X86::LZCNT32rm:
+ case X86::LZCNT64rr: case X86::LZCNT64rm:
+ case X86::POPCNT16rr:case X86::POPCNT16rm:
+ case X86::POPCNT32rr:case X86::POPCNT32rm:
+ case X86::POPCNT64rr:case X86::POPCNT64rm:
+ case X86::TZCNT16rr: case X86::TZCNT16rm:
+ case X86::TZCNT32rr: case X86::TZCNT32rm:
+ case X86::TZCNT64rr: case X86::TZCNT64rm:
+ case X86::BLCFILL32rr: case X86::BLCFILL32rm:
+ case X86::BLCFILL64rr: case X86::BLCFILL64rm:
+ case X86::BLCI32rr: case X86::BLCI32rm:
+ case X86::BLCI64rr: case X86::BLCI64rm:
+ case X86::BLCIC32rr: case X86::BLCIC32rm:
+ case X86::BLCIC64rr: case X86::BLCIC64rm:
+ case X86::BLCMSK32rr: case X86::BLCMSK32rm:
+ case X86::BLCMSK64rr: case X86::BLCMSK64rm:
+ case X86::BLCS32rr: case X86::BLCS32rm:
+ case X86::BLCS64rr: case X86::BLCS64rm:
+ case X86::BLSFILL32rr: case X86::BLSFILL32rm:
+ case X86::BLSFILL64rr: case X86::BLSFILL64rm:
+ case X86::BLSIC32rr: case X86::BLSIC32rm:
+ case X86::BLSIC64rr: case X86::BLSIC64rm:
+ case X86::T1MSKC32rr: case X86::T1MSKC32rm:
+ case X86::T1MSKC64rr: case X86::T1MSKC64rm:
+ case X86::TZMSK32rr: case X86::TZMSK32rm:
+ case X86::TZMSK64rr: case X86::TZMSK64rm:
+ return true;
+ case X86::BEXTR32rr: case X86::BEXTR64rr:
+ case X86::BEXTR32rm: case X86::BEXTR64rm:
+ case X86::BEXTRI32ri: case X86::BEXTRI32mi:
+ case X86::BEXTRI64ri: case X86::BEXTRI64mi:
+ // BEXTR doesn't update the sign flag so we can't use it.
+ NoSignFlag = true;
+ return true;
+ }
+}
+
+/// Check whether the use can be converted to remove a comparison against zero.
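+/// For example, NEG sets CF exactly when its source was non-zero, so a user
+/// testing "equal to zero" after the removed compare can instead test
+/// COND_AE (CF == 0) on the NEG; the caller performs that condition rewrite.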
+static X86::CondCode isUseDefConvertible(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default: return X86::COND_INVALID;
+ case X86::NEG8r:
+ case X86::NEG16r:
+ case X86::NEG32r:
+ case X86::NEG64r:
+ return X86::COND_AE;
+ case X86::LZCNT16rr:
+ case X86::LZCNT32rr:
+ case X86::LZCNT64rr:
+ return X86::COND_B;
+ case X86::POPCNT16rr:
+ case X86::POPCNT32rr:
+ case X86::POPCNT64rr:
+ return X86::COND_E;
+ case X86::TZCNT16rr:
+ case X86::TZCNT32rr:
+ case X86::TZCNT64rr:
+ return X86::COND_B;
+ case X86::BSF16rr:
+ case X86::BSF32rr:
+ case X86::BSF64rr:
+ case X86::BSR16rr:
+ case X86::BSR32rr:
+ case X86::BSR64rr:
+ return X86::COND_E;
+ case X86::BLSI32rr:
+ case X86::BLSI64rr:
+ return X86::COND_AE;
+ case X86::BLSR32rr:
+ case X86::BLSR64rr:
+ case X86::BLSMSK32rr:
+ case X86::BLSMSK64rr:
+ return X86::COND_B;
+ // TODO: TBM instructions.
+ }
+}
+
+/// Check if there exists an earlier instruction that
+/// operates on the same source operands and sets flags in the same way as
+/// Compare; remove Compare if possible.
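+/// Two main cases are handled: a compare against zero whose flags are already
+/// produced by the instruction defining SrcReg (see isDefConvertible and
+/// isUseDefConvertible), and a CMP that repeats the operands of an earlier
+/// SUB (see isRedundantFlagInstr). In both cases the compare is erased and
+/// the EFLAGS users are retargeted, with adjusted condition codes if needed.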
+bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
+ Register SrcReg2, int CmpMask,
+ int CmpValue,
+ const MachineRegisterInfo *MRI) const {
+ // Check whether we can replace SUB with CMP.
+ switch (CmpInstr.getOpcode()) {
+ default: break;
+ case X86::SUB64ri32:
+ case X86::SUB64ri8:
+ case X86::SUB32ri:
+ case X86::SUB32ri8:
+ case X86::SUB16ri:
+ case X86::SUB16ri8:
+ case X86::SUB8ri:
+ case X86::SUB64rm:
+ case X86::SUB32rm:
+ case X86::SUB16rm:
+ case X86::SUB8rm:
+ case X86::SUB64rr:
+ case X86::SUB32rr:
+ case X86::SUB16rr:
+ case X86::SUB8rr: {
+ if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
+ return false;
+ // There is no use of the destination register, so we can replace SUB with
+ // CMP.
+ unsigned NewOpcode = 0;
+ switch (CmpInstr.getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::SUB64rm: NewOpcode = X86::CMP64rm; break;
+ case X86::SUB32rm: NewOpcode = X86::CMP32rm; break;
+ case X86::SUB16rm: NewOpcode = X86::CMP16rm; break;
+ case X86::SUB8rm: NewOpcode = X86::CMP8rm; break;
+ case X86::SUB64rr: NewOpcode = X86::CMP64rr; break;
+ case X86::SUB32rr: NewOpcode = X86::CMP32rr; break;
+ case X86::SUB16rr: NewOpcode = X86::CMP16rr; break;
+ case X86::SUB8rr: NewOpcode = X86::CMP8rr; break;
+ case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break;
+ case X86::SUB64ri8: NewOpcode = X86::CMP64ri8; break;
+ case X86::SUB32ri: NewOpcode = X86::CMP32ri; break;
+ case X86::SUB32ri8: NewOpcode = X86::CMP32ri8; break;
+ case X86::SUB16ri: NewOpcode = X86::CMP16ri; break;
+ case X86::SUB16ri8: NewOpcode = X86::CMP16ri8; break;
+ case X86::SUB8ri: NewOpcode = X86::CMP8ri; break;
+ }
+ CmpInstr.setDesc(get(NewOpcode));
+ CmpInstr.RemoveOperand(0);
+ // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
+ if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
+ NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
+ return false;
+ }
+ }
+
+ // Get the unique definition of SrcReg.
+ MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+ if (!MI) return false;
+
+ // CmpInstr is the first instruction of the BB.
+ MachineBasicBlock::iterator I = CmpInstr, Def = MI;
+
+ // If we are comparing against zero, check whether we can use MI to update
+ // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
+ bool IsCmpZero = (CmpMask != 0 && CmpValue == 0);
+ if (IsCmpZero && MI->getParent() != CmpInstr.getParent())
+ return false;
+
+ // If we have a use of the source register between the def and our compare
+ // instruction we can eliminate the compare iff the use sets EFLAGS in the
+ // right way.
+ bool ShouldUpdateCC = false;
+ bool NoSignFlag = false;
+ X86::CondCode NewCC = X86::COND_INVALID;
+ if (IsCmpZero && !isDefConvertible(*MI, NoSignFlag)) {
+ // Scan forward from the def until we find a flag-setting use of SrcReg or
+ // reach the compare instruction.
+ for (MachineBasicBlock::iterator J = MI;; ++J) {
+ // Do we have a convertible instruction?
+ NewCC = isUseDefConvertible(*J);
+ if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() &&
+ J->getOperand(1).getReg() == SrcReg) {
+ assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!");
+ ShouldUpdateCC = true; // Update CC later on.
+ // This is not a def of SrcReg, but still a def of EFLAGS. Keep going
+ // with the new def.
+ Def = J;
+ MI = &*Def;
+ break;
+ }
+
+ if (J == I)
+ return false;
+ }
+ }
+
+ // We are searching for an earlier instruction that can make CmpInstr
+ // redundant and that instruction will be saved in Sub.
+ MachineInstr *Sub = nullptr;
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+ // We iterate backward, starting from the instruction before CmpInstr, and
+ // stop when we reach the definition of a source register or the start of the BB.
+ // RI points to the instruction before CmpInstr.
+ // If the definition is in this basic block, RE points to the definition;
+ // otherwise, RE is the rend of the basic block.
+ MachineBasicBlock::reverse_iterator
+ RI = ++I.getReverse(),
+ RE = CmpInstr.getParent() == MI->getParent()
+ ? Def.getReverse() /* points to MI */
+ : CmpInstr.getParent()->rend();
+ MachineInstr *Movr0Inst = nullptr;
+ for (; RI != RE; ++RI) {
+ MachineInstr &Instr = *RI;
+ // Check whether CmpInstr can be made redundant by the current instruction.
+ if (!IsCmpZero && isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask,
+ CmpValue, Instr)) {
+ Sub = &Instr;
+ break;
+ }
+
+ if (Instr.modifiesRegister(X86::EFLAGS, TRI) ||
+ Instr.readsRegister(X86::EFLAGS, TRI)) {
+ // This instruction modifies or uses EFLAGS.
+
+ // MOV32r0 etc. are implemented with xor, which clobbers the condition codes.
+ // They are safe to move up if their definition of EFLAGS is dead and no
+ // earlier instruction reads or writes EFLAGS.
+ if (!Movr0Inst && Instr.getOpcode() == X86::MOV32r0 &&
+ Instr.registerDefIsDead(X86::EFLAGS, TRI)) {
+ Movr0Inst = &Instr;
+ continue;
+ }
+
+ // We can't remove CmpInstr.
+ return false;
+ }
+ }
+
+ // Return false if no candidates exist.
+ if (!IsCmpZero && !Sub)
+ return false;
+
+ bool IsSwapped =
+ (SrcReg2 != 0 && Sub && Sub->getOperand(1).getReg() == SrcReg2 &&
+ Sub->getOperand(2).getReg() == SrcReg);
+
+ // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
+ // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
+ // If we are done with the basic block, we need to check whether EFLAGS is
+ // live-out.
+ bool IsSafe = false;
+ SmallVector<std::pair<MachineInstr*, X86::CondCode>, 4> OpsToUpdate;
+ MachineBasicBlock::iterator E = CmpInstr.getParent()->end();
+ for (++I; I != E; ++I) {
+ const MachineInstr &Instr = *I;
+ bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
+ bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
+ // We should check the usage if this instruction uses and updates EFLAGS.
+ if (!UseEFLAGS && ModifyEFLAGS) {
+ // It is safe to remove CmpInstr if EFLAGS is updated again.
+ IsSafe = true;
+ break;
+ }
+ if (!UseEFLAGS && !ModifyEFLAGS)
+ continue;
+
+ // EFLAGS is used by this instruction.
+ X86::CondCode OldCC = X86::COND_INVALID;
+ if (IsCmpZero || IsSwapped) {
+ // We decode the condition code from opcode.
+ if (Instr.isBranch())
+ OldCC = X86::getCondFromBranch(Instr);
+ else {
+ OldCC = X86::getCondFromSETCC(Instr);
+ if (OldCC == X86::COND_INVALID)
+ OldCC = X86::getCondFromCMov(Instr);
+ }
+ if (OldCC == X86::COND_INVALID) return false;
+ }
+ X86::CondCode ReplacementCC = X86::COND_INVALID;
+ if (IsCmpZero) {
+ switch (OldCC) {
+ default: break;
+ case X86::COND_A: case X86::COND_AE:
+ case X86::COND_B: case X86::COND_BE:
+ case X86::COND_G: case X86::COND_GE:
+ case X86::COND_L: case X86::COND_LE:
+ case X86::COND_O: case X86::COND_NO:
+ // CF and OF are used, we can't perform this optimization.
+ return false;
+ case X86::COND_S: case X86::COND_NS:
+ // If SF is used, but the instruction doesn't update the SF, then we
+ // can't do the optimization.
+ if (NoSignFlag)
+ return false;
+ break;
+ }
+
+ // If we're updating the condition code check if we have to reverse the
+ // condition.
+ if (ShouldUpdateCC)
+ switch (OldCC) {
+ default:
+ return false;
+ case X86::COND_E:
+ ReplacementCC = NewCC;
+ break;
+ case X86::COND_NE:
+ ReplacementCC = GetOppositeBranchCondition(NewCC);
+ break;
+ }
+ } else if (IsSwapped) {
+ // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
+ // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
+ // We swap the condition code and synthesize the new opcode.
+ ReplacementCC = getSwappedCondition(OldCC);
+ if (ReplacementCC == X86::COND_INVALID) return false;
+ }
+
+ if ((ShouldUpdateCC || IsSwapped) && ReplacementCC != OldCC) {
+ // Push the MachineInstr to OpsToUpdate.
+ // If it is safe to remove CmpInstr, the condition code of these
+ // instructions will be modified.
+ OpsToUpdate.push_back(std::make_pair(&*I, ReplacementCC));
+ }
+ if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
+ // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
+ IsSafe = true;
+ break;
+ }
+ }
+
+ // If EFLAGS is neither killed nor re-defined, we should check whether it is
+ // live-out. If it is live-out, do not optimize.
+ if ((IsCmpZero || IsSwapped) && !IsSafe) {
+ MachineBasicBlock *MBB = CmpInstr.getParent();
+ for (MachineBasicBlock *Successor : MBB->successors())
+ if (Successor->isLiveIn(X86::EFLAGS))
+ return false;
+ }
+
+ // The instruction to be updated is either Sub or MI.
+ Sub = IsCmpZero ? MI : Sub;
+ // Move Movr0Inst to the appropriate place before Sub.
+ if (Movr0Inst) {
+ // Look backwards until we find a def that doesn't use the current EFLAGS.
+ Def = Sub;
+ MachineBasicBlock::reverse_iterator InsertI = Def.getReverse(),
+ InsertE = Sub->getParent()->rend();
+ for (; InsertI != InsertE; ++InsertI) {
+ MachineInstr *Instr = &*InsertI;
+ if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
+ Instr->modifiesRegister(X86::EFLAGS, TRI)) {
+ Sub->getParent()->remove(Movr0Inst);
+ Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
+ Movr0Inst);
+ break;
+ }
+ }
+ if (InsertI == InsertE)
+ return false;
+ }
+
+ // Make sure Sub instruction defines EFLAGS and mark the def live.
+ MachineOperand *FlagDef = Sub->findRegisterDefOperand(X86::EFLAGS);
+ assert(FlagDef && "Unable to locate a def EFLAGS operand");
+ FlagDef->setIsDead(false);
+
+ CmpInstr.eraseFromParent();
+
+ // Modify the condition code of instructions in OpsToUpdate.
+ for (auto &Op : OpsToUpdate) {
+ Op.first->getOperand(Op.first->getDesc().getNumOperands() - 1)
+ .setImm(Op.second);
+ }
+ return true;
+}
+
+/// Try to remove the load by folding it into a register operand at the use.
+/// We fold the load if it defines a virtual register, the virtual register is
+/// used once in the same BB, and the instructions in between do not load or
+/// store and have no side effects.
+MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
+ const MachineRegisterInfo *MRI,
+ Register &FoldAsLoadDefReg,
+ MachineInstr *&DefMI) const {
+ // Check whether we can move DefMI here.
+ DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
+ assert(DefMI);
+ bool SawStore = false;
+ if (!DefMI->isSafeToMove(nullptr, SawStore))
+ return nullptr;
+
+ // Collect information about virtual register operands of MI.
+ SmallVector<unsigned, 1> SrcOperandIds;
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (Reg != FoldAsLoadDefReg)
+ continue;
+ // Do not fold if we have a subreg use or a def.
+ if (MO.getSubReg() || MO.isDef())
+ return nullptr;
+ SrcOperandIds.push_back(i);
+ }
+ if (SrcOperandIds.empty())
+ return nullptr;
+
+ // Check whether we can fold the def into the uses in SrcOperandIds.
+ if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) {
+ FoldAsLoadDefReg = 0;
+ return FoldMI;
+ }
+
+ return nullptr;
+}
+
+/// Expand a single-def pseudo instruction to a two-addr
+/// instruction with two undef reads of the register being defined.
+/// This is used for mapping:
+/// %xmm4 = V_SET0
+/// to:
+/// %xmm4 = PXORrr undef %xmm4, undef %xmm4
+///
+static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
+ const MCInstrDesc &Desc) {
+ assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
+ Register Reg = MIB.getReg(0);
+ MIB->setDesc(Desc);
+
+ // MachineInstr::addOperand() will insert explicit operands before any
+ // implicit operands.
+ MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
+ // But we don't trust that.
+ assert(MIB.getReg(1) == Reg &&
+ MIB.getReg(2) == Reg && "Misplaced operand");
+ return true;
+}
+
+/// Expand a single-def pseudo instruction to a two-addr
+/// instruction with two %k0 reads.
+/// This is used for mapping:
+/// %k4 = K_SET1
+/// to:
+/// %k4 = KXNORrr %k0, %k0
+static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc,
+ Register Reg) {
+ assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
+ MIB->setDesc(Desc);
+ MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
+ return true;
+}
+
+static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
+ bool MinusOne) {
+ MachineBasicBlock &MBB = *MIB->getParent();
+ DebugLoc DL = MIB->getDebugLoc();
+ Register Reg = MIB.getReg(0);
+
+ // Insert the XOR.
+ BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
+
+ // Turn the pseudo into an INC or DEC.
+ MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r));
+ MIB.addReg(Reg);
+
+ return true;
+}
+
+static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
+ const TargetInstrInfo &TII,
+ const X86Subtarget &Subtarget) {
+ MachineBasicBlock &MBB = *MIB->getParent();
+ DebugLoc DL = MIB->getDebugLoc();
+ int64_t Imm = MIB->getOperand(1).getImm();
+ assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
+ MachineBasicBlock::iterator I = MIB.getInstr();
+
+ int StackAdjustment;
+
+ if (Subtarget.is64Bit()) {
+ assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
+ MIB->getOpcode() == X86::MOV32ImmSExti8);
+
+ // Can't use push/pop lowering if the function might write to the red zone.
+ X86MachineFunctionInfo *X86FI =
+ MBB.getParent()->getInfo<X86MachineFunctionInfo>();
+ if (X86FI->getUsesRedZone()) {
+ MIB->setDesc(TII.get(MIB->getOpcode() ==
+ X86::MOV32ImmSExti8 ? X86::MOV32ri : X86::MOV64ri));
+ return true;
+ }
+
+ // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
+ // widen the register if necessary.
+ StackAdjustment = 8;
+ BuildMI(MBB, I, DL, TII.get(X86::PUSH64i8)).addImm(Imm);
+ MIB->setDesc(TII.get(X86::POP64r));
+ MIB->getOperand(0)
+ .setReg(getX86SubSuperRegister(MIB.getReg(0), 64));
+ } else {
+ assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
+ StackAdjustment = 4;
+ BuildMI(MBB, I, DL, TII.get(X86::PUSH32i8)).addImm(Imm);
+ MIB->setDesc(TII.get(X86::POP32r));
+ }
+ MIB->RemoveOperand(1);
+ MIB->addImplicitDefUseOperands(*MBB.getParent());
+
+ // Build CFI if necessary.
+ MachineFunction &MF = *MBB.getParent();
+ const X86FrameLowering *TFL = Subtarget.getFrameLowering();
+ bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+ bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves();
+ bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
+ if (EmitCFI) {
+ TFL->BuildCFI(MBB, I, DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
+ TFL->BuildCFI(MBB, std::next(I), DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
+ }
+
+ return true;
+}
+
+// LoadStackGuard has so far only been implemented for 64-bit MachO. A
+// different code sequence is needed for other targets.
+static void expandLoadStackGuard(MachineInstrBuilder &MIB,
+ const TargetInstrInfo &TII) {
+ MachineBasicBlock &MBB = *MIB->getParent();
+ DebugLoc DL = MIB->getDebugLoc();
+ Register Reg = MIB.getReg(0);
+ const GlobalValue *GV =
+ cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
+ auto Flags = MachineMemOperand::MOLoad |
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant;
+ MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
+ MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, Align(8));
+ MachineBasicBlock::iterator I = MIB.getInstr();
+
+ BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1)
+ .addReg(0).addGlobalAddress(GV, 0, X86II::MO_GOTPCREL).addReg(0)
+ .addMemOperand(MMO);
+ MIB->setDebugLoc(DL);
+ MIB->setDesc(TII.get(X86::MOV64rm));
+ MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
+}
+
+static bool expandXorFP(MachineInstrBuilder &MIB, const TargetInstrInfo &TII) {
+ MachineBasicBlock &MBB = *MIB->getParent();
+ MachineFunction &MF = *MBB.getParent();
+ const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ unsigned XorOp =
+ MIB->getOpcode() == X86::XOR64_FP ? X86::XOR64rr : X86::XOR32rr;
+ MIB->setDesc(TII.get(XorOp));
+ MIB.addReg(TRI->getFrameRegister(MF), RegState::Undef);
+ return true;
+}
+
+// This is used to handle spills for 128/256-bit registers when we have AVX512,
+// but not VLX. If the spill uses an extended register, we need an instruction
+// that loads the lower 128/256 bits but is available with only AVX512F.
+static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
+ const TargetRegisterInfo *TRI,
+ const MCInstrDesc &LoadDesc,
+ const MCInstrDesc &BroadcastDesc,
+ unsigned SubIdx) {
+ Register DestReg = MIB.getReg(0);
+ // Check if DestReg is XMM16-31 or YMM16-31.
+ if (TRI->getEncodingValue(DestReg) < 16) {
+ // We can use a normal VEX encoded load.
+ MIB->setDesc(LoadDesc);
+ } else {
+ // Use a 128/256-bit VBROADCAST instruction.
+ MIB->setDesc(BroadcastDesc);
+ // Change the destination to a 512-bit register.
+ DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
+ MIB->getOperand(0).setReg(DestReg);
+ }
+ return true;
+}
+
+// This is used to handle spills for 128/256-bit registers when we have AVX512,
+// but not VLX. If the spill uses an extended register, we need an instruction
+// that stores the lower 128/256 bits but is available with only AVX512F.
+static bool expandNOVLXStore(MachineInstrBuilder &MIB,
+ const TargetRegisterInfo *TRI,
+ const MCInstrDesc &StoreDesc,
+ const MCInstrDesc &ExtractDesc,
+ unsigned SubIdx) {
+ Register SrcReg = MIB.getReg(X86::AddrNumOperands);
+ // Check if SrcReg is XMM16-31 or YMM16-31.
+ if (TRI->getEncodingValue(SrcReg) < 16) {
+ // We can use a normal VEX encoded store.
+ MIB->setDesc(StoreDesc);
+ } else {
+ // Use a VEXTRACTF instruction.
+ MIB->setDesc(ExtractDesc);
+ // Change the source to a 512-bit register.
+ SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
+ MIB->getOperand(X86::AddrNumOperands).setReg(SrcReg);
+ MIB.addImm(0x0); // Append immediate to extract from the lower bits.
+ }
+
+ return true;
+}
+
+static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
+ MIB->setDesc(Desc);
+ int64_t ShiftAmt = MIB->getOperand(2).getImm();
+ // Temporarily remove the immediate so we can add another source register.
+ MIB->RemoveOperand(2);
+ // Add the register. Don't copy the kill flag if there is one.
+ MIB.addReg(MIB.getReg(1),
+ getUndefRegState(MIB->getOperand(1).isUndef()));
+ // Add back the immediate.
+ MIB.addImm(ShiftAmt);
+ return true;
+}
+
+bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ bool HasAVX = Subtarget.hasAVX();
+ MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
+ switch (MI.getOpcode()) {
+ case X86::MOV32r0:
+ return Expand2AddrUndef(MIB, get(X86::XOR32rr));
+ case X86::MOV32r1:
+ return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
+ case X86::MOV32r_1:
+ return expandMOV32r1(MIB, *this, /*MinusOne=*/ true);
+ case X86::MOV32ImmSExti8:
+ case X86::MOV64ImmSExti8:
+ return ExpandMOVImmSExti8(MIB, *this, Subtarget);
+ case X86::SETB_C32r:
+ return Expand2AddrUndef(MIB, get(X86::SBB32rr));
+ case X86::SETB_C64r:
+ return Expand2AddrUndef(MIB, get(X86::SBB64rr));
+ case X86::MMX_SET0:
+ return Expand2AddrUndef(MIB, get(X86::MMX_PXORirr));
+ case X86::V_SET0:
+ case X86::FsFLD0SS:
+ case X86::FsFLD0SD:
+ case X86::FsFLD0F128:
+ return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
+ case X86::AVX_SET0: {
+ assert(HasAVX && "AVX not supported");
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ Register SrcReg = MIB.getReg(0);
+ Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
+ MIB->getOperand(0).setReg(XReg);
+ Expand2AddrUndef(MIB, get(X86::VXORPSrr));
+ MIB.addReg(SrcReg, RegState::ImplicitDefine);
+ return true;
+ }
+ case X86::AVX512_128_SET0:
+ case X86::AVX512_FsFLD0SS:
+ case X86::AVX512_FsFLD0SD:
+ case X86::AVX512_FsFLD0F128: {
+ bool HasVLX = Subtarget.hasVLX();
+ Register SrcReg = MIB.getReg(0);
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
+ return Expand2AddrUndef(MIB,
+ get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
+ // Extended register without VLX. Use a larger XOR.
+ SrcReg =
+ TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
+ MIB->getOperand(0).setReg(SrcReg);
+ return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
+ }
+ case X86::AVX512_256_SET0:
+ case X86::AVX512_512_SET0: {
+ bool HasVLX = Subtarget.hasVLX();
+ Register SrcReg = MIB.getReg(0);
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) {
+ Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
+ MIB->getOperand(0).setReg(XReg);
+ Expand2AddrUndef(MIB,
+ get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
+ MIB.addReg(SrcReg, RegState::ImplicitDefine);
+ return true;
+ }
+ if (MI.getOpcode() == X86::AVX512_256_SET0) {
+ // No VLX so we must reference a zmm.
+ unsigned ZReg =
+ TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
+ MIB->getOperand(0).setReg(ZReg);
+ }
+ return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
+ }
+ case X86::V_SETALLONES:
+ return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
+ case X86::AVX2_SETALLONES:
+ return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
+ case X86::AVX1_SETALLONES: {
+ Register Reg = MIB.getReg(0);
+ // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
+ MIB->setDesc(get(X86::VCMPPSYrri));
+ MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
+ return true;
+ }
+ case X86::AVX512_512_SETALLONES: {
+ Register Reg = MIB.getReg(0);
+ MIB->setDesc(get(X86::VPTERNLOGDZrri));
+ // VPTERNLOGD needs 3 register inputs and an immediate.
+ // 0xff will return 1s for any input.
+ MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef).addImm(0xff);
+ return true;
+ }
+ case X86::AVX512_512_SEXT_MASK_32:
+ case X86::AVX512_512_SEXT_MASK_64: {
+ Register Reg = MIB.getReg(0);
+ Register MaskReg = MIB.getReg(1);
+ unsigned MaskState = getRegState(MIB->getOperand(1));
+ unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ?
+ X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz;
+ MI.RemoveOperand(1);
+ MIB->setDesc(get(Opc));
+ // VPTERNLOG needs 3 register inputs and an immediate.
+ // 0xff will return 1s for any input.
+ MIB.addReg(Reg, RegState::Undef).addReg(MaskReg, MaskState)
+ .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xff);
+ return true;
+ }
+ case X86::VMOVAPSZ128rm_NOVLX:
+ return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
+ get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
+ case X86::VMOVUPSZ128rm_NOVLX:
+ return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm),
+ get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
+ case X86::VMOVAPSZ256rm_NOVLX:
+ return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm),
+ get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
+ case X86::VMOVUPSZ256rm_NOVLX:
+ return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm),
+ get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
+ case X86::VMOVAPSZ128mr_NOVLX:
+ return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr),
+ get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
+ case X86::VMOVUPSZ128mr_NOVLX:
+ return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr),
+ get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
+ case X86::VMOVAPSZ256mr_NOVLX:
+ return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr),
+ get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
+ case X86::VMOVUPSZ256mr_NOVLX:
+ return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
+ get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
+ case X86::MOV32ri64: {
+ Register Reg = MIB.getReg(0);
+ Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit);
+ MI.setDesc(get(X86::MOV32ri));
+ MIB->getOperand(0).setReg(Reg32);
+ MIB.addReg(Reg, RegState::ImplicitDefine);
+ return true;
+ }
+
+ // KNL does not recognize dependency-breaking idioms for mask registers,
+ // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1.
+ // Using %k0 as the undef input register is a performance heuristic based
+ // on the assumption that %k0 is used less frequently than the other mask
+ // registers, since it is not usable as a write mask.
+ // FIXME: A more advanced approach would be to choose the best input mask
+ // register based on context.
+ case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0);
+ case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0);
+ case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0);
+ case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0);
+ case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0);
+ case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0);
+ case TargetOpcode::LOAD_STACK_GUARD:
+ expandLoadStackGuard(MIB, *this);
+ return true;
+ case X86::XOR64_FP:
+ case X86::XOR32_FP:
+ return expandXorFP(MIB, *this);
+ case X86::SHLDROT32ri: return expandSHXDROT(MIB, get(X86::SHLD32rri8));
+ case X86::SHLDROT64ri: return expandSHXDROT(MIB, get(X86::SHLD64rri8));
+ case X86::SHRDROT32ri: return expandSHXDROT(MIB, get(X86::SHRD32rri8));
+ case X86::SHRDROT64ri: return expandSHXDROT(MIB, get(X86::SHRD64rri8));
+ case X86::ADD8rr_DB: MIB->setDesc(get(X86::OR8rr)); break;
+ case X86::ADD16rr_DB: MIB->setDesc(get(X86::OR16rr)); break;
+ case X86::ADD32rr_DB: MIB->setDesc(get(X86::OR32rr)); break;
+ case X86::ADD64rr_DB: MIB->setDesc(get(X86::OR64rr)); break;
+ case X86::ADD8ri_DB: MIB->setDesc(get(X86::OR8ri)); break;
+ case X86::ADD16ri_DB: MIB->setDesc(get(X86::OR16ri)); break;
+ case X86::ADD32ri_DB: MIB->setDesc(get(X86::OR32ri)); break;
+ case X86::ADD64ri32_DB: MIB->setDesc(get(X86::OR64ri32)); break;
+ case X86::ADD16ri8_DB: MIB->setDesc(get(X86::OR16ri8)); break;
+ case X86::ADD32ri8_DB: MIB->setDesc(get(X86::OR32ri8)); break;
+ case X86::ADD64ri8_DB: MIB->setDesc(get(X86::OR64ri8)); break;
+ }
+ return false;
+}
+
+/// Return true for all instructions that only update
+/// the first 32 or 64 bits of the destination register and leave the rest
+/// unmodified. This can be used to avoid folding loads if the instructions
+/// only update part of the destination register, and the non-updated part is
+/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
+/// instructions breaks the partial register dependency and can improve
+/// performance. e.g.:
+///
+/// movss (%rdi), %xmm0
+/// cvtss2sd %xmm0, %xmm0
+///
+/// Instead of
+/// cvtss2sd (%rdi), %xmm0
+///
+/// FIXME: This should be turned into a TSFlags.
+///
+static bool hasPartialRegUpdate(unsigned Opcode,
+ const X86Subtarget &Subtarget,
+ bool ForLoadFold = false) {
+ switch (Opcode) {
+ case X86::CVTSI2SSrr:
+ case X86::CVTSI2SSrm:
+ case X86::CVTSI642SSrr:
+ case X86::CVTSI642SSrm:
+ case X86::CVTSI2SDrr:
+ case X86::CVTSI2SDrm:
+ case X86::CVTSI642SDrr:
+ case X86::CVTSI642SDrm:
+ // Load folding won't affect the undef register update since the input is
+ // a GPR.
+ return !ForLoadFold;
+ case X86::CVTSD2SSrr:
+ case X86::CVTSD2SSrm:
+ case X86::CVTSS2SDrr:
+ case X86::CVTSS2SDrm:
+ case X86::MOVHPDrm:
+ case X86::MOVHPSrm:
+ case X86::MOVLPDrm:
+ case X86::MOVLPSrm:
+ case X86::RCPSSr:
+ case X86::RCPSSm:
+ case X86::RCPSSr_Int:
+ case X86::RCPSSm_Int:
+ case X86::ROUNDSDr:
+ case X86::ROUNDSDm:
+ case X86::ROUNDSSr:
+ case X86::ROUNDSSm:
+ case X86::RSQRTSSr:
+ case X86::RSQRTSSm:
+ case X86::RSQRTSSr_Int:
+ case X86::RSQRTSSm_Int:
+ case X86::SQRTSSr:
+ case X86::SQRTSSm:
+ case X86::SQRTSSr_Int:
+ case X86::SQRTSSm_Int:
+ case X86::SQRTSDr:
+ case X86::SQRTSDm:
+ case X86::SQRTSDr_Int:
+ case X86::SQRTSDm_Int:
+ return true;
+ // GPR
+ case X86::POPCNT32rm:
+ case X86::POPCNT32rr:
+ case X86::POPCNT64rm:
+ case X86::POPCNT64rr:
+ return Subtarget.hasPOPCNTFalseDeps();
+ case X86::LZCNT32rm:
+ case X86::LZCNT32rr:
+ case X86::LZCNT64rm:
+ case X86::LZCNT64rr:
+ case X86::TZCNT32rm:
+ case X86::TZCNT32rr:
+ case X86::TZCNT64rm:
+ case X86::TZCNT64rr:
+ return Subtarget.hasLZCNTFalseDeps();
+ }
+
+ return false;
+}
+
+/// Inform the BreakFalseDeps pass how many idle
+/// instructions we would like before a partial register update.
+unsigned X86InstrInfo::getPartialRegUpdateClearance(
+ const MachineInstr &MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const {
+ if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode(), Subtarget))
+ return 0;
+
+ // If MI is marked as reading Reg, the partial register update is wanted.
+ const MachineOperand &MO = MI.getOperand(0);
+ Register Reg = MO.getReg();
+ if (Reg.isVirtual()) {
+ if (MO.readsReg() || MI.readsVirtualRegister(Reg))
+ return 0;
+ } else {
+ if (MI.readsRegister(Reg, TRI))
+ return 0;
+ }
+
+ // If any instructions in the clearance range are reading Reg, insert a
+ // dependency breaking instruction, which is inexpensive and is likely to
+ // be hidden in other instructions' cycles.
+ return PartialRegUpdateClearance;
+}
+
+// Return true for any instruction that copies the high bits of the first source
+// operand into the unused high bits of the destination operand.
+// Also returns true for instructions that have two inputs where one may
+// be undef and we want it to use the same register as the other input.
+static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
+ bool ForLoadFold = false) {
+ // Set the OpNum parameter to the first source operand.
+ switch (Opcode) {
+ case X86::MMX_PUNPCKHBWirr:
+ case X86::MMX_PUNPCKHWDirr:
+ case X86::MMX_PUNPCKHDQirr:
+ case X86::MMX_PUNPCKLBWirr:
+ case X86::MMX_PUNPCKLWDirr:
+ case X86::MMX_PUNPCKLDQirr:
+ case X86::MOVHLPSrr:
+ case X86::PACKSSWBrr:
+ case X86::PACKUSWBrr:
+ case X86::PACKSSDWrr:
+ case X86::PACKUSDWrr:
+ case X86::PUNPCKHBWrr:
+ case X86::PUNPCKLBWrr:
+ case X86::PUNPCKHWDrr:
+ case X86::PUNPCKLWDrr:
+ case X86::PUNPCKHDQrr:
+ case X86::PUNPCKLDQrr:
+ case X86::PUNPCKHQDQrr:
+ case X86::PUNPCKLQDQrr:
+ case X86::SHUFPDrri:
+ case X86::SHUFPSrri:
+ // These instructions are sometimes used with an undef first or second
+ // source. Return true here so BreakFalseDeps will assign this source to the
+ // same register as the first source to avoid a false dependency.
+ // Operand 1 of these instructions is tied, so they are listed separately
+ // from their VEX counterparts.
+ return OpNum == 2 && !ForLoadFold;
+
+ case X86::VMOVLHPSrr:
+ case X86::VMOVLHPSZrr:
+ case X86::VPACKSSWBrr:
+ case X86::VPACKUSWBrr:
+ case X86::VPACKSSDWrr:
+ case X86::VPACKUSDWrr:
+ case X86::VPACKSSWBZ128rr:
+ case X86::VPACKUSWBZ128rr:
+ case X86::VPACKSSDWZ128rr:
+ case X86::VPACKUSDWZ128rr:
+ case X86::VPERM2F128rr:
+ case X86::VPERM2I128rr:
+ case X86::VSHUFF32X4Z256rri:
+ case X86::VSHUFF32X4Zrri:
+ case X86::VSHUFF64X2Z256rri:
+ case X86::VSHUFF64X2Zrri:
+ case X86::VSHUFI32X4Z256rri:
+ case X86::VSHUFI32X4Zrri:
+ case X86::VSHUFI64X2Z256rri:
+ case X86::VSHUFI64X2Zrri:
+ case X86::VPUNPCKHBWrr:
+ case X86::VPUNPCKLBWrr:
+ case X86::VPUNPCKHBWYrr:
+ case X86::VPUNPCKLBWYrr:
+ case X86::VPUNPCKHBWZ128rr:
+ case X86::VPUNPCKLBWZ128rr:
+ case X86::VPUNPCKHBWZ256rr:
+ case X86::VPUNPCKLBWZ256rr:
+ case X86::VPUNPCKHBWZrr:
+ case X86::VPUNPCKLBWZrr:
+ case X86::VPUNPCKHWDrr:
+ case X86::VPUNPCKLWDrr:
+ case X86::VPUNPCKHWDYrr:
+ case X86::VPUNPCKLWDYrr:
+ case X86::VPUNPCKHWDZ128rr:
+ case X86::VPUNPCKLWDZ128rr:
+ case X86::VPUNPCKHWDZ256rr:
+ case X86::VPUNPCKLWDZ256rr:
+ case X86::VPUNPCKHWDZrr:
+ case X86::VPUNPCKLWDZrr:
+ case X86::VPUNPCKHDQrr:
+ case X86::VPUNPCKLDQrr:
+ case X86::VPUNPCKHDQYrr:
+ case X86::VPUNPCKLDQYrr:
+ case X86::VPUNPCKHDQZ128rr:
+ case X86::VPUNPCKLDQZ128rr:
+ case X86::VPUNPCKHDQZ256rr:
+ case X86::VPUNPCKLDQZ256rr:
+ case X86::VPUNPCKHDQZrr:
+ case X86::VPUNPCKLDQZrr:
+ case X86::VPUNPCKHQDQrr:
+ case X86::VPUNPCKLQDQrr:
+ case X86::VPUNPCKHQDQYrr:
+ case X86::VPUNPCKLQDQYrr:
+ case X86::VPUNPCKHQDQZ128rr:
+ case X86::VPUNPCKLQDQZ128rr:
+ case X86::VPUNPCKHQDQZ256rr:
+ case X86::VPUNPCKLQDQZ256rr:
+ case X86::VPUNPCKHQDQZrr:
+ case X86::VPUNPCKLQDQZrr:
+ // These instructions are sometimes used with an undef first or second
+ // source. Return true here so BreakFalseDeps will assign this source to the
+ // same register as the first source to avoid a false dependency.
+ return (OpNum == 1 || OpNum == 2) && !ForLoadFold;
+
+ case X86::VCVTSI2SSrr:
+ case X86::VCVTSI2SSrm:
+ case X86::VCVTSI2SSrr_Int:
+ case X86::VCVTSI2SSrm_Int:
+ case X86::VCVTSI642SSrr:
+ case X86::VCVTSI642SSrm:
+ case X86::VCVTSI642SSrr_Int:
+ case X86::VCVTSI642SSrm_Int:
+ case X86::VCVTSI2SDrr:
+ case X86::VCVTSI2SDrm:
+ case X86::VCVTSI2SDrr_Int:
+ case X86::VCVTSI2SDrm_Int:
+ case X86::VCVTSI642SDrr:
+ case X86::VCVTSI642SDrm:
+ case X86::VCVTSI642SDrr_Int:
+ case X86::VCVTSI642SDrm_Int:
+ // AVX-512
+ case X86::VCVTSI2SSZrr:
+ case X86::VCVTSI2SSZrm:
+ case X86::VCVTSI2SSZrr_Int:
+ case X86::VCVTSI2SSZrrb_Int:
+ case X86::VCVTSI2SSZrm_Int:
+ case X86::VCVTSI642SSZrr:
+ case X86::VCVTSI642SSZrm:
+ case X86::VCVTSI642SSZrr_Int:
+ case X86::VCVTSI642SSZrrb_Int:
+ case X86::VCVTSI642SSZrm_Int:
+ case X86::VCVTSI2SDZrr:
+ case X86::VCVTSI2SDZrm:
+ case X86::VCVTSI2SDZrr_Int:
+ case X86::VCVTSI2SDZrm_Int:
+ case X86::VCVTSI642SDZrr:
+ case X86::VCVTSI642SDZrm:
+ case X86::VCVTSI642SDZrr_Int:
+ case X86::VCVTSI642SDZrrb_Int:
+ case X86::VCVTSI642SDZrm_Int:
+ case X86::VCVTUSI2SSZrr:
+ case X86::VCVTUSI2SSZrm:
+ case X86::VCVTUSI2SSZrr_Int:
+ case X86::VCVTUSI2SSZrrb_Int:
+ case X86::VCVTUSI2SSZrm_Int:
+ case X86::VCVTUSI642SSZrr:
+ case X86::VCVTUSI642SSZrm:
+ case X86::VCVTUSI642SSZrr_Int:
+ case X86::VCVTUSI642SSZrrb_Int:
+ case X86::VCVTUSI642SSZrm_Int:
+ case X86::VCVTUSI2SDZrr:
+ case X86::VCVTUSI2SDZrm:
+ case X86::VCVTUSI2SDZrr_Int:
+ case X86::VCVTUSI2SDZrm_Int:
+ case X86::VCVTUSI642SDZrr:
+ case X86::VCVTUSI642SDZrm:
+ case X86::VCVTUSI642SDZrr_Int:
+ case X86::VCVTUSI642SDZrrb_Int:
+ case X86::VCVTUSI642SDZrm_Int:
+ // Load folding won't affect the undef register update since the input is
+ // a GPR.
+ return OpNum == 1 && !ForLoadFold;
+ case X86::VCVTSD2SSrr:
+ case X86::VCVTSD2SSrm:
+ case X86::VCVTSD2SSrr_Int:
+ case X86::VCVTSD2SSrm_Int:
+ case X86::VCVTSS2SDrr:
+ case X86::VCVTSS2SDrm:
+ case X86::VCVTSS2SDrr_Int:
+ case X86::VCVTSS2SDrm_Int:
+ case X86::VRCPSSr:
+ case X86::VRCPSSr_Int:
+ case X86::VRCPSSm:
+ case X86::VRCPSSm_Int:
+ case X86::VROUNDSDr:
+ case X86::VROUNDSDm:
+ case X86::VROUNDSDr_Int:
+ case X86::VROUNDSDm_Int:
+ case X86::VROUNDSSr:
+ case X86::VROUNDSSm:
+ case X86::VROUNDSSr_Int:
+ case X86::VROUNDSSm_Int:
+ case X86::VRSQRTSSr:
+ case X86::VRSQRTSSr_Int:
+ case X86::VRSQRTSSm:
+ case X86::VRSQRTSSm_Int:
+ case X86::VSQRTSSr:
+ case X86::VSQRTSSr_Int:
+ case X86::VSQRTSSm:
+ case X86::VSQRTSSm_Int:
+ case X86::VSQRTSDr:
+ case X86::VSQRTSDr_Int:
+ case X86::VSQRTSDm:
+ case X86::VSQRTSDm_Int:
+ // AVX-512
+ case X86::VCVTSD2SSZrr:
+ case X86::VCVTSD2SSZrr_Int:
+ case X86::VCVTSD2SSZrrb_Int:
+ case X86::VCVTSD2SSZrm:
+ case X86::VCVTSD2SSZrm_Int:
+ case X86::VCVTSS2SDZrr:
+ case X86::VCVTSS2SDZrr_Int:
+ case X86::VCVTSS2SDZrrb_Int:
+ case X86::VCVTSS2SDZrm:
+ case X86::VCVTSS2SDZrm_Int:
+ case X86::VGETEXPSDZr:
+ case X86::VGETEXPSDZrb:
+ case X86::VGETEXPSDZm:
+ case X86::VGETEXPSSZr:
+ case X86::VGETEXPSSZrb:
+ case X86::VGETEXPSSZm:
+ case X86::VGETMANTSDZrri:
+ case X86::VGETMANTSDZrrib:
+ case X86::VGETMANTSDZrmi:
+ case X86::VGETMANTSSZrri:
+ case X86::VGETMANTSSZrrib:
+ case X86::VGETMANTSSZrmi:
+ case X86::VRNDSCALESDZr:
+ case X86::VRNDSCALESDZr_Int:
+ case X86::VRNDSCALESDZrb_Int:
+ case X86::VRNDSCALESDZm:
+ case X86::VRNDSCALESDZm_Int:
+ case X86::VRNDSCALESSZr:
+ case X86::VRNDSCALESSZr_Int:
+ case X86::VRNDSCALESSZrb_Int:
+ case X86::VRNDSCALESSZm:
+ case X86::VRNDSCALESSZm_Int:
+ case X86::VRCP14SDZrr:
+ case X86::VRCP14SDZrm:
+ case X86::VRCP14SSZrr:
+ case X86::VRCP14SSZrm:
+ case X86::VRCP28SDZr:
+ case X86::VRCP28SDZrb:
+ case X86::VRCP28SDZm:
+ case X86::VRCP28SSZr:
+ case X86::VRCP28SSZrb:
+ case X86::VRCP28SSZm:
+ case X86::VREDUCESSZrmi:
+ case X86::VREDUCESSZrri:
+ case X86::VREDUCESSZrrib:
+ case X86::VRSQRT14SDZrr:
+ case X86::VRSQRT14SDZrm:
+ case X86::VRSQRT14SSZrr:
+ case X86::VRSQRT14SSZrm:
+ case X86::VRSQRT28SDZr:
+ case X86::VRSQRT28SDZrb:
+ case X86::VRSQRT28SDZm:
+ case X86::VRSQRT28SSZr:
+ case X86::VRSQRT28SSZrb:
+ case X86::VRSQRT28SSZm:
+ case X86::VSQRTSSZr:
+ case X86::VSQRTSSZr_Int:
+ case X86::VSQRTSSZrb_Int:
+ case X86::VSQRTSSZm:
+ case X86::VSQRTSSZm_Int:
+ case X86::VSQRTSDZr:
+ case X86::VSQRTSDZr_Int:
+ case X86::VSQRTSDZrb_Int:
+ case X86::VSQRTSDZm:
+ case X86::VSQRTSDZm_Int:
+ return OpNum == 1;
+ case X86::VMOVSSZrrk:
+ case X86::VMOVSDZrrk:
+ return OpNum == 3 && !ForLoadFold;
+ case X86::VMOVSSZrrkz:
+ case X86::VMOVSDZrrkz:
+ return OpNum == 2 && !ForLoadFold;
+ }
+
+ return false;
+}
+
+/// Inform the BreakFalseDeps pass how many idle instructions we would like
+/// before certain undef register reads.
+///
+/// This catches the VCVTSI2SD family of instructions:
+///
+/// vcvtsi2sdq %rax, undef %xmm0, %xmm14
+///
+/// We should be careful *not* to catch VXOR idioms which are presumably
+/// handled specially in the pipeline:
+///
+/// vxorps undef %xmm1, undef %xmm1, %xmm1
+///
+/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
+/// high bits that are passed-through are not live.
+unsigned
+X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const {
+ const MachineOperand &MO = MI.getOperand(OpNum);
+ if (Register::isPhysicalRegister(MO.getReg()) &&
+ hasUndefRegUpdate(MI.getOpcode(), OpNum))
+ return UndefRegClearance;
+
+ return 0;
+}
+
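+// Break a false dependency on Reg by inserting a dependency-breaking zero
+// idiom right before MI: XORPS/VXORPS for XMM registers (or the XMM
+// sub-register of a YMM), and XOR32rr for GR32 or the low 32 bits of a GR64.
+// The idiom only reads and writes Reg itself, so later partial or undef
+// reads no longer depend on whatever last wrote Reg.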
+void X86InstrInfo::breakPartialRegDependency(
+ MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
+ Register Reg = MI.getOperand(OpNum).getReg();
+ // If MI kills this register, the false dependence is already broken.
+ if (MI.killsRegister(Reg, TRI))
+ return;
+
+ if (X86::VR128RegClass.contains(Reg)) {
+ // These instructions are all floating point domain, so xorps is the best
+ // choice.
+ unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr;
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
+ MI.addRegisterKilled(Reg, TRI, true);
+ } else if (X86::VR256RegClass.contains(Reg)) {
+ // Use vxorps to clear the full ymm register.
+ // It wants to read and write the xmm sub-register.
+ Register XReg = TRI->getSubReg(Reg, X86::sub_xmm);
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg)
+ .addReg(XReg, RegState::Undef)
+ .addReg(XReg, RegState::Undef)
+ .addReg(Reg, RegState::ImplicitDefine);
+ MI.addRegisterKilled(Reg, TRI, true);
+ } else if (X86::GR64RegClass.contains(Reg)) {
+ // Use XOR32rr because it has a shorter encoding and also zeroes the
+ // upper 32 bits.
+ Register XReg = TRI->getSubReg(Reg, X86::sub_32bit);
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), XReg)
+ .addReg(XReg, RegState::Undef)
+ .addReg(XReg, RegState::Undef)
+ .addReg(Reg, RegState::ImplicitDefine);
+ MI.addRegisterKilled(Reg, TRI, true);
+ } else if (X86::GR32RegClass.contains(Reg)) {
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
+ MI.addRegisterKilled(Reg, TRI, true);
+ }
+}
+
+static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
+ int PtrOffset = 0) {
+ unsigned NumAddrOps = MOs.size();
+
+ if (NumAddrOps < 4) {
+ // FrameIndex only - add an immediate offset (whether it's zero or not).
+ for (unsigned i = 0; i != NumAddrOps; ++i)
+ MIB.add(MOs[i]);
+ addOffset(MIB, PtrOffset);
+ } else {
+ // General Memory Addressing - we need to add any offset to an existing
+ // offset.
+ assert(MOs.size() == 5 && "Unexpected memory operand list length");
+ for (unsigned i = 0; i != NumAddrOps; ++i) {
+ const MachineOperand &MO = MOs[i];
+ if (i == 3 && PtrOffset != 0) {
+ MIB.addDisp(MO, PtrOffset);
+ } else {
+ MIB.add(MO);
+ }
+ }
+ }
+}
+
+static void updateOperandRegConstraints(MachineFunction &MF,
+ MachineInstr &NewMI,
+ const TargetInstrInfo &TII) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+
+ for (int Idx : llvm::seq<int>(0, NewMI.getNumOperands())) {
+ MachineOperand &MO = NewMI.getOperand(Idx);
+ // We only need to update constraints on virtual register operands.
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg.isVirtual())
+ continue;
+
+ auto *NewRC = MRI.constrainRegClass(
+ Reg, TII.getRegClass(NewMI.getDesc(), Idx, &TRI, MF));
+ if (!NewRC) {
+ LLVM_DEBUG(
+ dbgs() << "WARNING: Unable to update register constraint for operand "
+ << Idx << " of instruction:\n";
+ NewMI.dump(); dbgs() << "\n");
+ }
+ }
+}
+
+static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
+ ArrayRef<MachineOperand> MOs,
+ MachineBasicBlock::iterator InsertPt,
+ MachineInstr &MI,
+ const TargetInstrInfo &TII) {
+ // Create the base instruction with the memory operand as the first part.
+ // Omit the implicit operands, something BuildMI can't do.
+ MachineInstr *NewMI =
+ MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
+ MachineInstrBuilder MIB(MF, NewMI);
+ addOperands(MIB, MOs);
+
+ // Loop over the remaining explicit operands (past the two folded ones)
+ // and copy them over.
+ unsigned NumOps = MI.getDesc().getNumOperands() - 2;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ MachineOperand &MO = MI.getOperand(i + 2);
+ MIB.add(MO);
+ }
+ for (unsigned i = NumOps + 2, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ MIB.add(MO);
+ }
+
+ updateOperandRegConstraints(MF, *NewMI, TII);
+
+ MachineBasicBlock *MBB = InsertPt->getParent();
+ MBB->insert(InsertPt, NewMI);
+
+ return MIB;
+}
+
+static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
+ unsigned OpNo, ArrayRef<MachineOperand> MOs,
+ MachineBasicBlock::iterator InsertPt,
+ MachineInstr &MI, const TargetInstrInfo &TII,
+ int PtrOffset = 0) {
+ // Omit the implicit operands, something BuildMI can't do.
+ MachineInstr *NewMI =
+ MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
+ MachineInstrBuilder MIB(MF, NewMI);
+
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (i == OpNo) {
+ assert(MO.isReg() && "Expected to fold into reg operand!");
+ addOperands(MIB, MOs, PtrOffset);
+ } else {
+ MIB.add(MO);
+ }
+ }
+
+ updateOperandRegConstraints(MF, *NewMI, TII);
+
+ // Copy the NoFPExcept flag from the instruction we're fusing.
+ if (MI.getFlag(MachineInstr::MIFlag::NoFPExcept))
+ NewMI->setFlag(MachineInstr::MIFlag::NoFPExcept);
+
+ MachineBasicBlock *MBB = InsertPt->getParent();
+ MBB->insert(InsertPt, NewMI);
+
+ return MIB;
+}
+
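+/// Build an instruction of the given memory-form opcode whose operands are
+/// the supplied address followed by an immediate zero. This is used when
+/// folding the definition of a MOV32r0 into a store, e.g. (illustrative
+/// only):
+///
+///   xorl %eax, %eax          ; MOV32r0
+///   movl %eax, <slot>
+///
+/// becomes
+///
+///   movl $0, <slot>          ; MOV32mi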
+static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
+ ArrayRef<MachineOperand> MOs,
+ MachineBasicBlock::iterator InsertPt,
+ MachineInstr &MI) {
+ MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
+ MI.getDebugLoc(), TII.get(Opcode));
+ addOperands(MIB, MOs);
+ return MIB.addImm(0);
+}
+
+MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
+ MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
+ ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
+ unsigned Size, Align Alignment) const {
+ switch (MI.getOpcode()) {
+ case X86::INSERTPSrr:
+ case X86::VINSERTPSrr:
+ case X86::VINSERTPSZrr:
+ // Attempt to convert a full-vector load of the inserted operand into a
+ // folded load of just the selected float.
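+ // For example (illustrative only), with %xmm1 reloaded from <slot>,
+ //
+ //   insertps $0x50, %xmm1, %xmm0   ; SrcIdx = 1, DstIdx = 1, ZMask = 0
+ //
+ // can become
+ //
+ //   insertps $0x10, 4(<slot>), %xmm0
+ //
+ // i.e. load only the selected float at offset SrcIdx * 4 and drop SrcIdx
+ // from the immediate.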
+ if (OpNum == 2) {
+ unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
+ unsigned ZMask = Imm & 15;
+ unsigned DstIdx = (Imm >> 4) & 3;
+ unsigned SrcIdx = (Imm >> 6) & 3;
+
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
+ unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
+ if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(4)) {
+ int PtrOffset = SrcIdx * 4;
+ unsigned NewImm = (DstIdx << 4) | ZMask;
+ unsigned NewOpCode =
+ (MI.getOpcode() == X86::VINSERTPSZrr) ? X86::VINSERTPSZrm :
+ (MI.getOpcode() == X86::VINSERTPSrr) ? X86::VINSERTPSrm :
+ X86::INSERTPSrm;
+ MachineInstr *NewMI =
+ FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
+ NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
+ return NewMI;
+ }
+ }
+ break;
+ case X86::MOVHLPSrr:
+ case X86::VMOVHLPSrr:
+ case X86::VMOVHLPSZrr:
+ // Move the upper 64 bits of the second operand into the lower 64 bits of
+ // the destination. To fold the load, adjust the pointer to the upper half
+ // and use (V)MOVLPS.
+ // TODO: In most cases AVX doesn't have an 8-byte alignment requirement.
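+ // For example (illustrative only), with %xmm1 reloaded from <slot>,
+ //
+ //   movhlps %xmm1, %xmm0
+ //
+ // can become
+ //
+ //   movlps 8(<slot>), %xmm0
+ //
+ // which reads the upper 64 bits directly into the lower half of %xmm0.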
+ if (OpNum == 2) {
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
+ unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
+ if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) {
+ unsigned NewOpCode =
+ (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm :
+ (MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm :
+ X86::MOVLPSrm;
+ MachineInstr *NewMI =
+ FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8);
+ return NewMI;
+ }
+ }
+ break;
+ case X86::UNPCKLPDrr:
+ // If we won't be able to fold this to the memory form of UNPCKL, use
+ // MOVHPD instead. This is done as a custom fold because we can't have the
+ // same instruction in the load table twice.
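+ // For example (illustrative only), with %xmm1 reloaded from an
+ // under-aligned <slot>,
+ //
+ //   unpcklpd %xmm1, %xmm0
+ //
+ // can become
+ //
+ //   movhpd (<slot>), %xmm0
+ //
+ // which reads only the 64 bits unpcklpd would have used and has no
+ // 16-byte alignment requirement.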
+ if (OpNum == 2) {
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
+ unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
+ if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment < Align(16)) {
+ MachineInstr *NewMI =
+ FuseInst(MF, X86::MOVHPDrm, OpNum, MOs, InsertPt, MI, *this);
+ return NewMI;
+ }
+ }
+ break;
+ }
+
+ return nullptr;
+}
+
+static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF,
+ MachineInstr &MI) {
+ if (!hasUndefRegUpdate(MI.getOpcode(), 1, /*ForLoadFold*/true) ||
+ !MI.getOperand(1).isReg())
+ return false;
+
+ // There are two cases we need to handle, depending on where in the
+ // pipeline the folding attempt is being made:
+ // - The register has the undef flag set.
+ // - The register is produced by an IMPLICIT_DEF instruction.
+
+ if (MI.getOperand(1).isUndef())
+ return true;
+
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ MachineInstr *VRegDef = RegInfo.getUniqueVRegDef(MI.getOperand(1).getReg());
+ return VRegDef && VRegDef->isImplicitDef();
+}
+
+MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
+ MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
+ ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
+ unsigned Size, Align Alignment, bool AllowCommute) const {
+ bool isSlowTwoMemOps = Subtarget.slowTwoMemOps();
+ bool isTwoAddrFold = false;
+
+ // For CPUs that favor the register form of a call or push,
+ // do not fold loads into calls or pushes, unless optimizing for size
+ // aggressively.
+ if (isSlowTwoMemOps && !MF.getFunction().hasMinSize() &&
+ (MI.getOpcode() == X86::CALL32r || MI.getOpcode() == X86::CALL64r ||
+ MI.getOpcode() == X86::PUSH16r || MI.getOpcode() == X86::PUSH32r ||
+ MI.getOpcode() == X86::PUSH64r))
+ return nullptr;
+
+ // Avoid partial and undef register update stalls unless optimizing for size.
+ if (!MF.getFunction().hasOptSize() &&
+ (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/true) ||
+ shouldPreventUndefRegUpdateMemFold(MF, MI)))
+ return nullptr;
+
+ unsigned NumOps = MI.getDesc().getNumOperands();
+ bool isTwoAddr =
+ NumOps > 1 && MI.getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
+
+ // FIXME: AsmPrinter doesn't know how to handle
+ // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
+ if (MI.getOpcode() == X86::ADD32ri &&
+ MI.getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
+ return nullptr;
+
+ // GOTTPOFF relocation loads can only be folded into add instructions.
+ // FIXME: Need to exclude other relocations that only support specific
+ // instructions.
+ if (MOs.size() == X86::AddrNumOperands &&
+ MOs[X86::AddrDisp].getTargetFlags() == X86II::MO_GOTTPOFF &&
+ MI.getOpcode() != X86::ADD64rr)
+ return nullptr;
+
+ MachineInstr *NewMI = nullptr;
+
+ // Attempt to fold any custom cases we have.
+ if (MachineInstr *CustomMI = foldMemoryOperandCustom(
+ MF, MI, OpNum, MOs, InsertPt, Size, Alignment))
+ return CustomMI;
+
+ const X86MemoryFoldTableEntry *I = nullptr;
+
+ // Folding a memory location into the two-address part of a two-address
+ // instruction is different from folding it anywhere else: it requires
+ // replacing *both* tied registers with the memory location.
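+ // For example (illustrative only), with %eax spilled to <slot>, the tied
+ // def/use pair of
+ //
+ //   addl %ecx, %eax        ; ADD32rr
+ //
+ // folds to the read-modify-write memory form
+ //
+ //   addl %ecx, <slot>      ; ADD32mr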
+ if (isTwoAddr && NumOps >= 2 && OpNum < 2 && MI.getOperand(0).isReg() &&
+ MI.getOperand(1).isReg() &&
+ MI.getOperand(0).getReg() == MI.getOperand(1).getReg()) {
+ I = lookupTwoAddrFoldTable(MI.getOpcode());
+ isTwoAddrFold = true;
+ } else {
+ if (OpNum == 0) {
+ if (MI.getOpcode() == X86::MOV32r0) {
+ NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI);
+ if (NewMI)
+ return NewMI;
+ }
+ }
+
+ I = lookupFoldTable(MI.getOpcode(), OpNum);
+ }
+
+ if (I != nullptr) {
+ unsigned Opcode = I->DstOp;
+ bool FoldedLoad =
+ isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_LOAD) || OpNum > 0;
+ bool FoldedStore =
+ isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_STORE);
+ MaybeAlign MinAlign =
+ decodeMaybeAlign((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT);
+ if (MinAlign && Alignment < *MinAlign)
+ return nullptr;
+ bool NarrowToMOV32rm = false;
+ if (Size) {
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum,
+ &RI, MF);
+ unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
+ // Check if it's safe to fold the load. If the size of the object is
+ // narrower than the load width, then it's not.
+ // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
+ if (FoldedLoad && Size < RCSize) {
+ // If this is a 64-bit load but the spill slot is only 32 bits wide, we
+ // can do a 32-bit load, which is implicitly zero-extended. This is likely
+ // due to live-interval analysis rematerializing a load from a stack slot.
+ if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
+ return nullptr;
+ if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
+ return nullptr;
+ Opcode = X86::MOV32rm;
+ NarrowToMOV32rm = true;
+ }
+ // For stores, make sure the size of the object is equal to the size of
+ // the store. If the object is larger, the extra bits would be garbage. If
+ // the object is smaller we might overwrite another object or fault.
+ if (FoldedStore && Size != RCSize)
+ return nullptr;
+ }
+
+ if (isTwoAddrFold)
+ NewMI = FuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this);
+ else
+ NewMI = FuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
+
+ if (NarrowToMOV32rm) {
+ // This is the special case where we use a MOV32rm to load a 32-bit value
+ // and zero-extend the top bits; change the destination register to a
+ // 32-bit one.
+ Register DstReg = NewMI->getOperand(0).getReg();
+ if (DstReg.isPhysical())
+ NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
+ else
+ NewMI->getOperand(0).setSubReg(X86::sub_32bit);
+ }
+ return NewMI;
+ }
+
+ // If the instruction and target operand are commutable, commute the
+ // instruction and try again.
+ if (AllowCommute) {
+ unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex;
+ if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
+ bool HasDef = MI.getDesc().getNumDefs();
+ Register Reg0 = HasDef ? MI.getOperand(0).getReg() : Register();
+ Register Reg1 = MI.getOperand(CommuteOpIdx1).getReg();
+ Register Reg2 = MI.getOperand(CommuteOpIdx2).getReg();
+ bool Tied1 =
+ 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
+ bool Tied2 =
+ 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
+
+ // If either of the commutable operands is tied to the destination, then
+ // we cannot commute and fold.
+ if ((HasDef && Reg0 == Reg1 && Tied1) ||
+ (HasDef && Reg0 == Reg2 && Tied2))
+ return nullptr;
+
+ MachineInstr *CommutedMI =
+ commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
+ if (!CommutedMI) {
+ // Unable to commute.
+ return nullptr;
+ }
+ if (CommutedMI != &MI) {
+ // New instruction. We can't fold from this.
+ CommutedMI->eraseFromParent();
+ return nullptr;
+ }
+
+ // Attempt to fold with the commuted version of the instruction.
+ NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, Size,
+ Alignment, /*AllowCommute=*/false);
+ if (NewMI)
+ return NewMI;
+
+ // Folding failed again - undo the commute before returning.
+ MachineInstr *UncommutedMI =
+ commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
+ if (!UncommutedMI) {
+ // Unable to commute.
+ return nullptr;
+ }
+ if (UncommutedMI != &MI) {
+ // New instruction. It doesn't need to be kept.
+ UncommutedMI->eraseFromParent();
+ return nullptr;
+ }
+
+ // Return here to prevent a duplicate fuse-failure report.
+ return nullptr;
+ }
+ }
+
+ // No fusion
+ if (PrintFailedFusing && !MI.isCopy())
+ dbgs() << "We failed to fuse operand " << OpNum << " in " << MI;
+ return nullptr;
+}
+
+MachineInstr *
+X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+ ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt,
+ int FrameIndex, LiveIntervals *LIS,
+ VirtRegMap *VRM) const {
+ // Check switch flag
+ if (NoFusing)
+ return nullptr;
+
+ // Avoid partial and undef register update stalls unless optimizing for size.
+ if (!MF.getFunction().hasOptSize() &&
+ (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/true) ||
+ shouldPreventUndefRegUpdateMemFold(MF, MI)))
+ return nullptr;
+
+ // Don't fold subreg spills, or reloads that use a high subreg.
+ for (auto Op : Ops) {
+ MachineOperand &MO = MI.getOperand(Op);
+ auto SubReg = MO.getSubReg();
+ if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi))
+ return nullptr;
+ }
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ unsigned Size = MFI.getObjectSize(FrameIndex);
+ Align Alignment = MFI.getObjectAlign(FrameIndex);
+ // If the function stack isn't realigned we don't want to fold instructions
+ // that need increased alignment.
+ if (!RI.needsStackRealignment(MF))
+ Alignment =
+ std::min(Alignment, Subtarget.getFrameLowering()->getStackAlign());
+ if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+ unsigned NewOpc = 0;
+ unsigned RCSize = 0;
+ switch (MI.getOpcode()) {
+ default: return nullptr;
+ case X86::TEST8rr: NewOpc = X86::CMP8ri; RCSize = 1; break;
+ case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break;
+ case X86::TEST32rr: NewOpc = X86::CMP32ri8; RCSize = 4; break;
+ case X86::TEST64rr: NewOpc = X86::CMP64ri8; RCSize = 8; break;
+ }
+ // Check if it's safe to fold the load. If the size of the object is
+ // narrower than the load width, then it's not.
+ if (Size < RCSize)
+ return nullptr;
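+ // Folding both register uses means the flags depend only on the spilled
+ // value itself, so rewrite the test as a compare against zero, e.g.
+ // (illustrative only):
+ //
+ //   testl %eax, %eax       ; %eax spilled to <slot>
+ //
+ // becomes
+ //
+ //   cmpl $0, <slot>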
+ // Change to CMPXXri r, 0 first.
+ MI.setDesc(get(NewOpc));
+ MI.getOperand(1).ChangeToImmediate(0);
+ } else if (Ops.size() != 1)
+ return nullptr;
+
+ return foldMemoryOperandImpl(MF, MI, Ops[0],
+ MachineOperand::CreateFI(FrameIndex), InsertPt,
+ Size, Alignment, /*AllowCommute=*/true);
+}
+
+/// Check if \p LoadMI is a partial register load that we can't fold into \p MI
+/// because the latter uses contents that wouldn't be defined in the folded
+/// version. For instance, this transformation isn't legal:
+/// movss (%rdi), %xmm0
+/// addps %xmm0, %xmm0
+/// ->
+/// addps (%rdi), %xmm0
+///
+/// But this one is:
+/// movss (%rdi), %xmm0
+/// addss %xmm0, %xmm0
+/// ->
+/// addss (%rdi), %xmm0
+///
+static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
+ const MachineInstr &UserMI,
+ const MachineFunction &MF) {
+ unsigned Opc = LoadMI.getOpcode();
+ unsigned UserOpc = UserMI.getOpcode();
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const TargetRegisterClass *RC =
+ MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg());
+ unsigned RegSize = TRI.getRegSizeInBits(*RC);
+
+ if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm ||
+ Opc == X86::MOVSSrm_alt || Opc == X86::VMOVSSrm_alt ||
+ Opc == X86::VMOVSSZrm_alt) &&
+ RegSize > 32) {
+ // These instructions only load 32 bits, so we can't fold them if the
+ // destination register is wider than 32 bits (4 bytes) and the user
+ // instruction isn't a scalar (SS) instruction.
+ switch (UserOpc) {
+ case X86::CVTSS2SDrr_Int:
+ case X86::VCVTSS2SDrr_Int:
+ case X86::VCVTSS2SDZrr_Int:
+ case X86::VCVTSS2SDZrr_Intk:
+ case X86::VCVTSS2SDZrr_Intkz:
+ case X86::CVTSS2SIrr_Int: case X86::CVTSS2SI64rr_Int:
+ case X86::VCVTSS2SIrr_Int: case X86::VCVTSS2SI64rr_Int:
+ case X86::VCVTSS2SIZrr_Int: case X86::VCVTSS2SI64Zrr_Int:
+ case X86::CVTTSS2SIrr_Int: case X86::CVTTSS2SI64rr_Int:
+ case X86::VCVTTSS2SIrr_Int: case X86::VCVTTSS2SI64rr_Int:
+ case X86::VCVTTSS2SIZrr_Int: case X86::VCVTTSS2SI64Zrr_Int:
+ case X86::VCVTSS2USIZrr_Int: case X86::VCVTSS2USI64Zrr_Int:
+ case X86::VCVTTSS2USIZrr_Int: case X86::VCVTTSS2USI64Zrr_Int:
+ case X86::RCPSSr_Int: case X86::VRCPSSr_Int:
+ case X86::RSQRTSSr_Int: case X86::VRSQRTSSr_Int:
+ case X86::ROUNDSSr_Int: case X86::VROUNDSSr_Int:
+ case X86::COMISSrr_Int: case X86::VCOMISSrr_Int: case X86::VCOMISSZrr_Int:
+ case X86::UCOMISSrr_Int:case X86::VUCOMISSrr_Int:case X86::VUCOMISSZrr_Int:
+ case X86::ADDSSrr_Int: case X86::VADDSSrr_Int: case X86::VADDSSZrr_Int:
+ case X86::CMPSSrr_Int: case X86::VCMPSSrr_Int: case X86::VCMPSSZrr_Int:
+ case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::VDIVSSZrr_Int:
+ case X86::MAXSSrr_Int: case X86::VMAXSSrr_Int: case X86::VMAXSSZrr_Int:
+ case X86::MINSSrr_Int: case X86::VMINSSrr_Int: case X86::VMINSSZrr_Int:
+ case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int:
+ case X86::SQRTSSr_Int: case X86::VSQRTSSr_Int: case X86::VSQRTSSZr_Int:
+ case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int:
+ case X86::VADDSSZrr_Intk: case X86::VADDSSZrr_Intkz:
+ case X86::VCMPSSZrr_Intk:
+ case X86::VDIVSSZrr_Intk: case X86::VDIVSSZrr_Intkz:
+ case X86::VMAXSSZrr_Intk: case X86::VMAXSSZrr_Intkz:
+ case X86::VMINSSZrr_Intk: case X86::VMINSSZrr_Intkz:
+ case X86::VMULSSZrr_Intk: case X86::VMULSSZrr_Intkz:
+ case X86::VSQRTSSZr_Intk: case X86::VSQRTSSZr_Intkz:
+ case X86::VSUBSSZrr_Intk: case X86::VSUBSSZrr_Intkz:
+ case X86::VFMADDSS4rr_Int: case X86::VFNMADDSS4rr_Int:
+ case X86::VFMSUBSS4rr_Int: case X86::VFNMSUBSS4rr_Int:
+ case X86::VFMADD132SSr_Int: case X86::VFNMADD132SSr_Int:
+ case X86::VFMADD213SSr_Int: case X86::VFNMADD213SSr_Int:
+ case X86::VFMADD231SSr_Int: case X86::VFNMADD231SSr_Int:
+ case X86::VFMSUB132SSr_Int: case X86::VFNMSUB132SSr_Int:
+ case X86::VFMSUB213SSr_Int: case X86::VFNMSUB213SSr_Int:
+ case X86::VFMSUB231SSr_Int: case X86::VFNMSUB231SSr_Int:
+ case X86::VFMADD132SSZr_Int: case X86::VFNMADD132SSZr_Int:
+ case X86::VFMADD213SSZr_Int: case X86::VFNMADD213SSZr_Int:
+ case X86::VFMADD231SSZr_Int: case X86::VFNMADD231SSZr_Int:
+ case X86::VFMSUB132SSZr_Int: case X86::VFNMSUB132SSZr_Int:
+ case X86::VFMSUB213SSZr_Int: case X86::VFNMSUB213SSZr_Int:
+ case X86::VFMSUB231SSZr_Int: case X86::VFNMSUB231SSZr_Int:
+ case X86::VFMADD132SSZr_Intk: case X86::VFNMADD132SSZr_Intk:
+ case X86::VFMADD213SSZr_Intk: case X86::VFNMADD213SSZr_Intk:
+ case X86::VFMADD231SSZr_Intk: case X86::VFNMADD231SSZr_Intk:
+ case X86::VFMSUB132SSZr_Intk: case X86::VFNMSUB132SSZr_Intk:
+ case X86::VFMSUB213SSZr_Intk: case X86::VFNMSUB213SSZr_Intk:
+ case X86::VFMSUB231SSZr_Intk: case X86::VFNMSUB231SSZr_Intk:
+ case X86::VFMADD132SSZr_Intkz: case X86::VFNMADD132SSZr_Intkz:
+ case X86::VFMADD213SSZr_Intkz: case X86::VFNMADD213SSZr_Intkz:
+ case X86::VFMADD231SSZr_Intkz: case X86::VFNMADD231SSZr_Intkz:
+ case X86::VFMSUB132SSZr_Intkz: case X86::VFNMSUB132SSZr_Intkz:
+ case X86::VFMSUB213SSZr_Intkz: case X86::VFNMSUB213SSZr_Intkz:
+ case X86::VFMSUB231SSZr_Intkz: case X86::VFNMSUB231SSZr_Intkz:
+ case X86::VFIXUPIMMSSZrri:
+ case X86::VFIXUPIMMSSZrrik:
+ case X86::VFIXUPIMMSSZrrikz:
+ case X86::VFPCLASSSSZrr:
+ case X86::VFPCLASSSSZrrk:
+ case X86::VGETEXPSSZr:
+ case X86::VGETEXPSSZrk:
+ case X86::VGETEXPSSZrkz:
+ case X86::VGETMANTSSZrri:
+ case X86::VGETMANTSSZrrik:
+ case X86::VGETMANTSSZrrikz:
+ case X86::VRANGESSZrri:
+ case X86::VRANGESSZrrik:
+ case X86::VRANGESSZrrikz:
+ case X86::VRCP14SSZrr:
+ case X86::VRCP14SSZrrk:
+ case X86::VRCP14SSZrrkz:
+ case X86::VRCP28SSZr:
+ case X86::VRCP28SSZrk:
+ case X86::VRCP28SSZrkz:
+ case X86::VREDUCESSZrri:
+ case X86::VREDUCESSZrrik:
+ case X86::VREDUCESSZrrikz:
+ case X86::VRNDSCALESSZr_Int:
+ case X86::VRNDSCALESSZr_Intk:
+ case X86::VRNDSCALESSZr_Intkz:
+ case X86::VRSQRT14SSZrr:
+ case X86::VRSQRT14SSZrrk:
+ case X86::VRSQRT14SSZrrkz:
+ case X86::VRSQRT28SSZr:
+ case X86::VRSQRT28SSZrk:
+ case X86::VRSQRT28SSZrkz:
+ case X86::VSCALEFSSZrr:
+ case X86::VSCALEFSSZrrk:
+ case X86::VSCALEFSSZrrkz:
+ return false;
+ default:
+ return true;
+ }
+ }
+
+ if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm ||
+ Opc == X86::MOVSDrm_alt || Opc == X86::VMOVSDrm_alt ||
+ Opc == X86::VMOVSDZrm_alt) &&
+ RegSize > 64) {
+ // These instructions only load 64 bits, so we can't fold them if the
+ // destination register is wider than 64 bits (8 bytes) and the user
+ // instruction isn't a scalar (SD) instruction.
+ switch (UserOpc) {
+ case X86::CVTSD2SSrr_Int:
+ case X86::VCVTSD2SSrr_Int:
+ case X86::VCVTSD2SSZrr_Int:
+ case X86::VCVTSD2SSZrr_Intk:
+ case X86::VCVTSD2SSZrr_Intkz:
+ case X86::CVTSD2SIrr_Int: case X86::CVTSD2SI64rr_Int:
+ case X86::VCVTSD2SIrr_Int: case X86::VCVTSD2SI64rr_Int:
+ case X86::VCVTSD2SIZrr_Int: case X86::VCVTSD2SI64Zrr_Int:
+ case X86::CVTTSD2SIrr_Int: case X86::CVTTSD2SI64rr_Int:
+ case X86::VCVTTSD2SIrr_Int: case X86::VCVTTSD2SI64rr_Int:
+ case X86::VCVTTSD2SIZrr_Int: case X86::VCVTTSD2SI64Zrr_Int:
+ case X86::VCVTSD2USIZrr_Int: case X86::VCVTSD2USI64Zrr_Int:
+ case X86::VCVTTSD2USIZrr_Int: case X86::VCVTTSD2USI64Zrr_Int:
+ case X86::ROUNDSDr_Int: case X86::VROUNDSDr_Int:
+ case X86::COMISDrr_Int: case X86::VCOMISDrr_Int: case X86::VCOMISDZrr_Int:
+ case X86::UCOMISDrr_Int:case X86::VUCOMISDrr_Int:case X86::VUCOMISDZrr_Int:
+ case X86::ADDSDrr_Int: case X86::VADDSDrr_Int: case X86::VADDSDZrr_Int:
+ case X86::CMPSDrr_Int: case X86::VCMPSDrr_Int: case X86::VCMPSDZrr_Int:
+ case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::VDIVSDZrr_Int:
+ case X86::MAXSDrr_Int: case X86::VMAXSDrr_Int: case X86::VMAXSDZrr_Int:
+ case X86::MINSDrr_Int: case X86::VMINSDrr_Int: case X86::VMINSDZrr_Int:
+ case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int:
+ case X86::SQRTSDr_Int: case X86::VSQRTSDr_Int: case X86::VSQRTSDZr_Int:
+ case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int:
+ case X86::VADDSDZrr_Intk: case X86::VADDSDZrr_Intkz:
+ case X86::VCMPSDZrr_Intk:
+ case X86::VDIVSDZrr_Intk: case X86::VDIVSDZrr_Intkz:
+ case X86::VMAXSDZrr_Intk: case X86::VMAXSDZrr_Intkz:
+ case X86::VMINSDZrr_Intk: case X86::VMINSDZrr_Intkz:
+ case X86::VMULSDZrr_Intk: case X86::VMULSDZrr_Intkz:
+ case X86::VSQRTSDZr_Intk: case X86::VSQRTSDZr_Intkz:
+ case X86::VSUBSDZrr_Intk: case X86::VSUBSDZrr_Intkz:
+ case X86::VFMADDSD4rr_Int: case X86::VFNMADDSD4rr_Int:
+ case X86::VFMSUBSD4rr_Int: case X86::VFNMSUBSD4rr_Int:
+ case X86::VFMADD132SDr_Int: case X86::VFNMADD132SDr_Int:
+ case X86::VFMADD213SDr_Int: case X86::VFNMADD213SDr_Int:
+ case X86::VFMADD231SDr_Int: case X86::VFNMADD231SDr_Int:
+ case X86::VFMSUB132SDr_Int: case X86::VFNMSUB132SDr_Int:
+ case X86::VFMSUB213SDr_Int: case X86::VFNMSUB213SDr_Int:
+ case X86::VFMSUB231SDr_Int: case X86::VFNMSUB231SDr_Int:
+ case X86::VFMADD132SDZr_Int: case X86::VFNMADD132SDZr_Int:
+ case X86::VFMADD213SDZr_Int: case X86::VFNMADD213SDZr_Int:
+ case X86::VFMADD231SDZr_Int: case X86::VFNMADD231SDZr_Int:
+ case X86::VFMSUB132SDZr_Int: case X86::VFNMSUB132SDZr_Int:
+ case X86::VFMSUB213SDZr_Int: case X86::VFNMSUB213SDZr_Int:
+ case X86::VFMSUB231SDZr_Int: case X86::VFNMSUB231SDZr_Int:
+ case X86::VFMADD132SDZr_Intk: case X86::VFNMADD132SDZr_Intk:
+ case X86::VFMADD213SDZr_Intk: case X86::VFNMADD213SDZr_Intk:
+ case X86::VFMADD231SDZr_Intk: case X86::VFNMADD231SDZr_Intk:
+ case X86::VFMSUB132SDZr_Intk: case X86::VFNMSUB132SDZr_Intk:
+ case X86::VFMSUB213SDZr_Intk: case X86::VFNMSUB213SDZr_Intk:
+ case X86::VFMSUB231SDZr_Intk: case X86::VFNMSUB231SDZr_Intk:
+ case X86::VFMADD132SDZr_Intkz: case X86::VFNMADD132SDZr_Intkz:
+ case X86::VFMADD213SDZr_Intkz: case X86::VFNMADD213SDZr_Intkz:
+ case X86::VFMADD231SDZr_Intkz: case X86::VFNMADD231SDZr_Intkz:
+ case X86::VFMSUB132SDZr_Intkz: case X86::VFNMSUB132SDZr_Intkz:
+ case X86::VFMSUB213SDZr_Intkz: case X86::VFNMSUB213SDZr_Intkz:
+ case X86::VFMSUB231SDZr_Intkz: case X86::VFNMSUB231SDZr_Intkz:
+ case X86::VFIXUPIMMSDZrri:
+ case X86::VFIXUPIMMSDZrrik:
+ case X86::VFIXUPIMMSDZrrikz:
+ case X86::VFPCLASSSDZrr:
+ case X86::VFPCLASSSDZrrk:
+ case X86::VGETEXPSDZr:
+ case X86::VGETEXPSDZrk:
+ case X86::VGETEXPSDZrkz:
+ case X86::VGETMANTSDZrri:
+ case X86::VGETMANTSDZrrik:
+ case X86::VGETMANTSDZrrikz:
+ case X86::VRANGESDZrri:
+ case X86::VRANGESDZrrik:
+ case X86::VRANGESDZrrikz:
+ case X86::VRCP14SDZrr:
+ case X86::VRCP14SDZrrk:
+ case X86::VRCP14SDZrrkz:
+ case X86::VRCP28SDZr:
+ case X86::VRCP28SDZrk:
+ case X86::VRCP28SDZrkz:
+ case X86::VREDUCESDZrri:
+ case X86::VREDUCESDZrrik:
+ case X86::VREDUCESDZrrikz:
+ case X86::VRNDSCALESDZr_Int:
+ case X86::VRNDSCALESDZr_Intk:
+ case X86::VRNDSCALESDZr_Intkz:
+ case X86::VRSQRT14SDZrr:
+ case X86::VRSQRT14SDZrrk:
+ case X86::VRSQRT14SDZrrkz:
+ case X86::VRSQRT28SDZr:
+ case X86::VRSQRT28SDZrk:
+ case X86::VRSQRT28SDZrkz:
+ case X86::VSCALEFSDZrr:
+ case X86::VSCALEFSDZrrk:
+ case X86::VSCALEFSDZrrkz:
+ return false;
+ default:
+ return true;
+ }
+ }
+
+ return false;
+}
+
+MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
+ MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
+ LiveIntervals *LIS) const {
+
+ // TODO: Support the case where LoadMI loads a wide register, but MI
+ // only uses a subreg.
+ for (auto Op : Ops) {
+ if (MI.getOperand(Op).getSubReg())
+ return nullptr;
+ }
+
+ // If loading from a FrameIndex, fold directly from the FrameIndex.
+ unsigned NumOps = LoadMI.getDesc().getNumOperands();
+ int FrameIndex;
+ if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
+ if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
+ return nullptr;
+ return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, LIS);
+ }
+
+ // Check switch flag
+ if (NoFusing) return nullptr;
+
+ // Avoid partial and undef register update stalls unless optimizing for size.
+ if (!MF.getFunction().hasOptSize() &&
+ (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/true) ||
+ shouldPreventUndefRegUpdateMemFold(MF, MI)))
+ return nullptr;
+
+ // Determine the alignment of the load.
+ Align Alignment;
+ if (LoadMI.hasOneMemOperand())
+ Alignment = (*LoadMI.memoperands_begin())->getAlign();
+ else
+ switch (LoadMI.getOpcode()) {
+ case X86::AVX512_512_SET0:
+ case X86::AVX512_512_SETALLONES:
+ Alignment = Align(64);
+ break;
+ case X86::AVX2_SETALLONES:
+ case X86::AVX1_SETALLONES:
+ case X86::AVX_SET0:
+ case X86::AVX512_256_SET0:
+ Alignment = Align(32);
+ break;
+ case X86::V_SET0:
+ case X86::V_SETALLONES:
+ case X86::AVX512_128_SET0:
+ case X86::FsFLD0F128:
+ case X86::AVX512_FsFLD0F128:
+ Alignment = Align(16);
+ break;
+ case X86::MMX_SET0:
+ case X86::FsFLD0SD:
+ case X86::AVX512_FsFLD0SD:
+ Alignment = Align(8);
+ break;
+ case X86::FsFLD0SS:
+ case X86::AVX512_FsFLD0SS:
+ Alignment = Align(4);
+ break;
+ default:
+ return nullptr;
+ }
+ if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+ unsigned NewOpc = 0;
+ switch (MI.getOpcode()) {
+ default: return nullptr;
+ case X86::TEST8rr: NewOpc = X86::CMP8ri; break;
+ case X86::TEST16rr: NewOpc = X86::CMP16ri8; break;
+ case X86::TEST32rr: NewOpc = X86::CMP32ri8; break;
+ case X86::TEST64rr: NewOpc = X86::CMP64ri8; break;
+ }
+ // Change to CMPXXri r, 0 first.
+ MI.setDesc(get(NewOpc));
+ MI.getOperand(1).ChangeToImmediate(0);
+ } else if (Ops.size() != 1)
+ return nullptr;
+
+ // Make sure the subregisters match.
+ // Otherwise we risk changing the size of the load.
+ if (LoadMI.getOperand(0).getSubReg() != MI.getOperand(Ops[0]).getSubReg())
+ return nullptr;
+
+ SmallVector<MachineOperand,X86::AddrNumOperands> MOs;
+ switch (LoadMI.getOpcode()) {
+ case X86::MMX_SET0:
+ case X86::V_SET0:
+ case X86::V_SETALLONES:
+ case X86::AVX2_SETALLONES:
+ case X86::AVX1_SETALLONES:
+ case X86::AVX_SET0:
+ case X86::AVX512_128_SET0:
+ case X86::AVX512_256_SET0:
+ case X86::AVX512_512_SET0:
+ case X86::AVX512_512_SETALLONES:
+ case X86::FsFLD0SD:
+ case X86::AVX512_FsFLD0SD:
+ case X86::FsFLD0SS:
+ case X86::AVX512_FsFLD0SS:
+ case X86::FsFLD0F128:
+ case X86::AVX512_FsFLD0F128: {
+ // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
+ // Create a constant-pool entry and operands to load from it.
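+ // For example (illustrative only), instead of materializing the zero for
+ //
+ //   xorps %xmm1, %xmm1
+ //   addps %xmm1, %xmm0
+ //
+ // the zero vector is placed in the constant pool and the user is folded
+ // to something like
+ //
+ //   addps .LCPI0_0(%rip), %xmm0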
+
+ // The medium and large code models can't fold loads this way.
+ if (MF.getTarget().getCodeModel() != CodeModel::Small &&
+ MF.getTarget().getCodeModel() != CodeModel::Kernel)
+ return nullptr;
+
+ // x86-32 PIC requires a PIC base register for constant pools.
+ unsigned PICBase = 0;
+ if (MF.getTarget().isPositionIndependent()) {
+ if (Subtarget.is64Bit())
+ PICBase = X86::RIP;
+ else
+ // FIXME: PICBase = getGlobalBaseReg(&MF);
+ // This doesn't work for several reasons.
+ // 1. GlobalBaseReg may have been spilled.
+ // 2. It may not be live at MI.
+ return nullptr;
+ }
+
+ // Create a constant-pool entry.
+ MachineConstantPool &MCP = *MF.getConstantPool();
+ Type *Ty;
+ unsigned Opc = LoadMI.getOpcode();
+ if (Opc == X86::FsFLD0SS || Opc == X86::AVX512_FsFLD0SS)
+ Ty = Type::getFloatTy(MF.getFunction().getContext());
+ else if (Opc == X86::FsFLD0SD || Opc == X86::AVX512_FsFLD0SD)
+ Ty = Type::getDoubleTy(MF.getFunction().getContext());
+ else if (Opc == X86::FsFLD0F128 || Opc == X86::AVX512_FsFLD0F128)
+ Ty = Type::getFP128Ty(MF.getFunction().getContext());
+ else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
+ Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
+ 16);
+ else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
+ Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES)
+ Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
+ 8);
+ else if (Opc == X86::MMX_SET0)
+ Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
+ 2);
+ else
+ Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
+ 4);
+
+ bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES ||
+ Opc == X86::AVX512_512_SETALLONES ||
+ Opc == X86::AVX1_SETALLONES);
+ const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
+ Constant::getNullValue(Ty);
+ unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
+
+ // Create operands to load from the constant pool entry.
+ MOs.push_back(MachineOperand::CreateReg(PICBase, false));
+ MOs.push_back(MachineOperand::CreateImm(1));
+ MOs.push_back(MachineOperand::CreateReg(0, false));
+ MOs.push_back(MachineOperand::CreateCPI(CPI, 0));
+ MOs.push_back(MachineOperand::CreateReg(0, false));
+ break;
+ }
+ default: {
+ if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
+ return nullptr;
+
+ // Folding a normal load. Just copy the load's address operands.
+ MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands,
+ LoadMI.operands_begin() + NumOps);
+ break;
+ }
+ }
+ return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt,
+ /*Size=*/0, Alignment, /*AllowCommute=*/true);
+}
+
+static SmallVector<MachineMemOperand *, 2>
+extractLoadMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) {
+ SmallVector<MachineMemOperand *, 2> LoadMMOs;
+
+ for (MachineMemOperand *MMO : MMOs) {
+ if (!MMO->isLoad())
+ continue;
+
+ if (!MMO->isStore()) {
+ // Reuse the MMO.
+ LoadMMOs.push_back(MMO);
+ } else {
+ // Clone the MMO and unset the store flag.
+ LoadMMOs.push_back(MF.getMachineMemOperand(
+ MMO, MMO->getFlags() & ~MachineMemOperand::MOStore));
+ }
+ }
+
+ return LoadMMOs;
+}
+
+static SmallVector<MachineMemOperand *, 2>
+extractStoreMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) {
+ SmallVector<MachineMemOperand *, 2> StoreMMOs;
+
+ for (MachineMemOperand *MMO : MMOs) {
+ if (!MMO->isStore())
+ continue;
+
+ if (!MMO->isLoad()) {
+ // Reuse the MMO.
+ StoreMMOs.push_back(MMO);
+ } else {
+ // Clone the MMO and unset the load flag.
+ StoreMMOs.push_back(MF.getMachineMemOperand(
+ MMO, MMO->getFlags() & ~MachineMemOperand::MOLoad));
+ }
+ }
+
+ return StoreMMOs;
+}
+
+static unsigned getBroadcastOpcode(const X86MemoryFoldTableEntry *I,
+ const TargetRegisterClass *RC,
+ const X86Subtarget &STI) {
+ assert(STI.hasAVX512() && "Expected at least AVX512!");
+ unsigned SpillSize = STI.getRegisterInfo()->getSpillSize(*RC);
+ assert((SpillSize == 64 || STI.hasVLX()) &&
+ "Can't broadcast less than 64 bytes without AVX512VL!");
+
+ switch (I->Flags & TB_BCAST_MASK) {
+ default: llvm_unreachable("Unexpected broadcast type!");
+ case TB_BCAST_D:
+ switch (SpillSize) {
+ default: llvm_unreachable("Unknown spill size");
+ case 16: return X86::VPBROADCASTDZ128rm;
+ case 32: return X86::VPBROADCASTDZ256rm;
+ case 64: return X86::VPBROADCASTDZrm;
+ }
+ break;
+ case TB_BCAST_Q:
+ switch (SpillSize) {
+ default: llvm_unreachable("Unknown spill size");
+ case 16: return X86::VPBROADCASTQZ128rm;
+ case 32: return X86::VPBROADCASTQZ256rm;
+ case 64: return X86::VPBROADCASTQZrm;
+ }
+ break;
+ case TB_BCAST_SS:
+ switch (SpillSize) {
+ default: llvm_unreachable("Unknown spill size");
+ case 16: return X86::VBROADCASTSSZ128rm;
+ case 32: return X86::VBROADCASTSSZ256rm;
+ case 64: return X86::VBROADCASTSSZrm;
+ }
+ break;
+ case TB_BCAST_SD:
+ switch (SpillSize) {
+ default: llvm_unreachable("Unknown spill size");
+ case 16: return X86::VMOVDDUPZ128rm;
+ case 32: return X86::VBROADCASTSDZ256rm;
+ case 64: return X86::VBROADCASTSDZrm;
+ }
+ break;
+ }
+}
+
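+/// Unfold \p MI, an instruction with a folded memory operand, back into a
+/// register-form instruction plus an explicit load and/or store through
+/// register \p Reg. For example (illustrative only),
+///
+///   addl <mem>, %eax         ; ADD32rm
+///
+/// unfolds into
+///
+///   movl <mem>, %reg         ; reload into Reg
+///   addl %reg, %eax          ; ADD32rr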
+bool X86InstrInfo::unfoldMemoryOperand(
+ MachineFunction &MF, MachineInstr &MI, unsigned Reg, bool UnfoldLoad,
+ bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const {
+ const X86MemoryFoldTableEntry *I = lookupUnfoldTable(MI.getOpcode());
+ if (I == nullptr)
+ return false;
+ unsigned Opc = I->DstOp;
+ unsigned Index = I->Flags & TB_INDEX_MASK;
+ bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
+ bool FoldedStore = I->Flags & TB_FOLDED_STORE;
+ bool FoldedBCast = I->Flags & TB_FOLDED_BCAST;
+ if (UnfoldLoad && !FoldedLoad)
+ return false;
+ UnfoldLoad &= FoldedLoad;
+ if (UnfoldStore && !FoldedStore)
+ return false;
+ UnfoldStore &= FoldedStore;
+
+ const MCInstrDesc &MCID = get(Opc);
+
+ const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ // TODO: Check if 32-byte or greater accesses are slow too?
+ if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass &&
+ Subtarget.isUnalignedMem16Slow())
+ // Without memoperands, loadRegFromAddr and storeRegToStackSlot will
+ // conservatively assume the address is unaligned. That's bad for
+ // performance.
+ return false;
+ SmallVector<MachineOperand, X86::AddrNumOperands> AddrOps;
+ SmallVector<MachineOperand,2> BeforeOps;
+ SmallVector<MachineOperand,2> AfterOps;
+ SmallVector<MachineOperand,4> ImpOps;
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI.getOperand(i);
+ if (i >= Index && i < Index + X86::AddrNumOperands)
+ AddrOps.push_back(Op);
+ else if (Op.isReg() && Op.isImplicit())
+ ImpOps.push_back(Op);
+ else if (i < Index)
+ BeforeOps.push_back(Op);
+ else if (i > Index)
+ AfterOps.push_back(Op);
+ }
+
+ // Emit the load or broadcast instruction.
+ if (UnfoldLoad) {
+ auto MMOs = extractLoadMMOs(MI.memoperands(), MF);
+
+ unsigned Opc;
+ if (FoldedBCast) {
+ Opc = getBroadcastOpcode(I, RC, Subtarget);
+ } else {
+ unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
+ bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
+ Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget);
+ }
+
+ DebugLoc DL;
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), Reg);
+ for (unsigned i = 0, e = AddrOps.size(); i != e; ++i)
+ MIB.add(AddrOps[i]);
+ MIB.setMemRefs(MMOs);
+ NewMIs.push_back(MIB);
+
+ if (UnfoldStore) {
+ // Address operands cannot be marked isKill.
+ for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) {
+ MachineOperand &MO = NewMIs[0]->getOperand(i);
+ if (MO.isReg())
+ MO.setIsKill(false);
+ }
+ }
+ }
+
+ // Emit the data processing instruction.
+ MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI.getDebugLoc(), true);
+ MachineInstrBuilder MIB(MF, DataMI);
+
+ if (FoldedStore)
+ MIB.addReg(Reg, RegState::Define);
+ for (MachineOperand &BeforeOp : BeforeOps)
+ MIB.add(BeforeOp);
+ if (FoldedLoad)
+ MIB.addReg(Reg);
+ for (MachineOperand &AfterOp : AfterOps)
+ MIB.add(AfterOp);
+ for (MachineOperand &ImpOp : ImpOps) {
+ MIB.addReg(ImpOp.getReg(),
+ getDefRegState(ImpOp.isDef()) |
+ RegState::Implicit |
+ getKillRegState(ImpOp.isKill()) |
+ getDeadRegState(ImpOp.isDead()) |
+ getUndefRegState(ImpOp.isUndef()));
+ }
+ // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
+ switch (DataMI->getOpcode()) {
+ default: break;
+ case X86::CMP64ri32:
+ case X86::CMP64ri8:
+ case X86::CMP32ri:
+ case X86::CMP32ri8:
+ case X86::CMP16ri:
+ case X86::CMP16ri8:
+ case X86::CMP8ri: {
+ MachineOperand &MO0 = DataMI->getOperand(0);
+ MachineOperand &MO1 = DataMI->getOperand(1);
+ if (MO1.getImm() == 0) {
+ unsigned NewOpc;
+ switch (DataMI->getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::CMP64ri8:
+ case X86::CMP64ri32: NewOpc = X86::TEST64rr; break;
+ case X86::CMP32ri8:
+ case X86::CMP32ri: NewOpc = X86::TEST32rr; break;
+ case X86::CMP16ri8:
+ case X86::CMP16ri: NewOpc = X86::TEST16rr; break;
+ case X86::CMP8ri: NewOpc = X86::TEST8rr; break;
+ }
+ DataMI->setDesc(get(NewOpc));
+ MO1.ChangeToRegister(MO0.getReg(), false);
+ }
+ }
+ }
+ NewMIs.push_back(DataMI);
+
+ // Emit the store instruction.
+ if (UnfoldStore) {
+ const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF);
+ auto MMOs = extractStoreMMOs(MI.memoperands(), MF);
+ unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*DstRC), 16);
+ bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
+ unsigned Opc = getStoreRegOpcode(Reg, DstRC, isAligned, Subtarget);
+ DebugLoc DL;
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
+ for (unsigned i = 0, e = AddrOps.size(); i != e; ++i)
+ MIB.add(AddrOps[i]);
+ MIB.addReg(Reg, RegState::Kill);
+ MIB.setMemRefs(MMOs);
+ NewMIs.push_back(MIB);
+ }
+
+ return true;
+}
+
+bool
+X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+ SmallVectorImpl<SDNode*> &NewNodes) const {
+ if (!N->isMachineOpcode())
+ return false;
+
+ const X86MemoryFoldTableEntry *I = lookupUnfoldTable(N->getMachineOpcode());
+ if (I == nullptr)
+ return false;
+ unsigned Opc = I->DstOp;
+ unsigned Index = I->Flags & TB_INDEX_MASK;
+ bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
+ bool FoldedStore = I->Flags & TB_FOLDED_STORE;
+ bool FoldedBCast = I->Flags & TB_FOLDED_BCAST;
+ const MCInstrDesc &MCID = get(Opc);
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
+ unsigned NumDefs = MCID.NumDefs;
+ std::vector<SDValue> AddrOps;
+ std::vector<SDValue> BeforeOps;
+ std::vector<SDValue> AfterOps;
+ SDLoc dl(N);
+ unsigned NumOps = N->getNumOperands();
+ for (unsigned i = 0; i != NumOps-1; ++i) {
+ SDValue Op = N->getOperand(i);
+ if (i >= Index-NumDefs && i < Index-NumDefs + X86::AddrNumOperands)
+ AddrOps.push_back(Op);
+ else if (i < Index-NumDefs)
+ BeforeOps.push_back(Op);
+ else if (i > Index-NumDefs)
+ AfterOps.push_back(Op);
+ }
+ SDValue Chain = N->getOperand(NumOps-1);
+ AddrOps.push_back(Chain);
+
+ // Emit the load instruction.
+ SDNode *Load = nullptr;
+ if (FoldedLoad) {
+ EVT VT = *TRI.legalclasstypes_begin(*RC);
+ auto MMOs = extractLoadMMOs(cast<MachineSDNode>(N)->memoperands(), MF);
+ if (MMOs.empty() && RC == &X86::VR128RegClass &&
+ Subtarget.isUnalignedMem16Slow())
+ // Do not introduce a slow unaligned load.
+ return false;
+ // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
+ // memory access is slow above.
+
+ unsigned Opc;
+ if (FoldedBCast) {
+ Opc = getBroadcastOpcode(I, RC, Subtarget);
+ } else {
+ unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
+ bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
+ Opc = getLoadRegOpcode(0, RC, isAligned, Subtarget);
+ }
+
+ Load = DAG.getMachineNode(Opc, dl, VT, MVT::Other, AddrOps);
+ NewNodes.push_back(Load);
+
+ // Preserve memory reference information.
+ DAG.setNodeMemRefs(cast<MachineSDNode>(Load), MMOs);
+ }
+
+ // Emit the data processing instruction.
+ std::vector<EVT> VTs;
+ const TargetRegisterClass *DstRC = nullptr;
+ if (MCID.getNumDefs() > 0) {
+ DstRC = getRegClass(MCID, 0, &RI, MF);
+ VTs.push_back(*TRI.legalclasstypes_begin(*DstRC));
+ }
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+ EVT VT = N->getValueType(i);
+ if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs())
+ VTs.push_back(VT);
+ }
+ if (Load)
+ BeforeOps.push_back(SDValue(Load, 0));
+ llvm::append_range(BeforeOps, AfterOps);
+ // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
+ switch (Opc) {
+ default: break;
+ case X86::CMP64ri32:
+ case X86::CMP64ri8:
+ case X86::CMP32ri:
+ case X86::CMP32ri8:
+ case X86::CMP16ri:
+ case X86::CMP16ri8:
+ case X86::CMP8ri:
+ if (isNullConstant(BeforeOps[1])) {
+ switch (Opc) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::CMP64ri8:
+ case X86::CMP64ri32: Opc = X86::TEST64rr; break;
+ case X86::CMP32ri8:
+ case X86::CMP32ri: Opc = X86::TEST32rr; break;
+ case X86::CMP16ri8:
+ case X86::CMP16ri: Opc = X86::TEST16rr; break;
+ case X86::CMP8ri: Opc = X86::TEST8rr; break;
+ }
+ BeforeOps[1] = BeforeOps[0];
+ }
+ }
+ SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
+ NewNodes.push_back(NewNode);
+
+ // Emit the store instruction.
+ if (FoldedStore) {
+ AddrOps.pop_back();
+ AddrOps.push_back(SDValue(NewNode, 0));
+ AddrOps.push_back(Chain);
+ auto MMOs = extractStoreMMOs(cast<MachineSDNode>(N)->memoperands(), MF);
+ if (MMOs.empty() && RC == &X86::VR128RegClass &&
+ Subtarget.isUnalignedMem16Slow())
+ // Do not introduce a slow unaligned store.
+ return false;
+ // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
+ // memory access is slow above.
+ unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
+ bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
+ SDNode *Store =
+ DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget),
+ dl, MVT::Other, AddrOps);
+ NewNodes.push_back(Store);
+
+ // Preserve memory reference information.
+ DAG.setNodeMemRefs(cast<MachineSDNode>(Store), MMOs);
+ }
+
+ return true;
+}
+
+unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
+ bool UnfoldLoad, bool UnfoldStore,
+ unsigned *LoadRegIndex) const {
+ const X86MemoryFoldTableEntry *I = lookupUnfoldTable(Opc);
+ if (I == nullptr)
+ return 0;
+ bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
+ bool FoldedStore = I->Flags & TB_FOLDED_STORE;
+ if (UnfoldLoad && !FoldedLoad)
+ return 0;
+ if (UnfoldStore && !FoldedStore)
+ return 0;
+ if (LoadRegIndex)
+ *LoadRegIndex = I->Flags & TB_INDEX_MASK;
+ return I->DstOp;
+}
+
+bool
+X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+ int64_t &Offset1, int64_t &Offset2) const {
+ if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
+ return false;
+ unsigned Opc1 = Load1->getMachineOpcode();
+ unsigned Opc2 = Load2->getMachineOpcode();
+ switch (Opc1) {
+ default: return false;
+ case X86::MOV8rm:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::LD_Fp32m:
+ case X86::LD_Fp64m:
+ case X86::LD_Fp80m:
+ case X86::MOVSSrm:
+ case X86::MOVSSrm_alt:
+ case X86::MOVSDrm:
+ case X86::MOVSDrm_alt:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ case X86::MOVAPSrm:
+ case X86::MOVUPSrm:
+ case X86::MOVAPDrm:
+ case X86::MOVUPDrm:
+ case X86::MOVDQArm:
+ case X86::MOVDQUrm:
+ // AVX load instructions
+ case X86::VMOVSSrm:
+ case X86::VMOVSSrm_alt:
+ case X86::VMOVSDrm:
+ case X86::VMOVSDrm_alt:
+ case X86::VMOVAPSrm:
+ case X86::VMOVUPSrm:
+ case X86::VMOVAPDrm:
+ case X86::VMOVUPDrm:
+ case X86::VMOVDQArm:
+ case X86::VMOVDQUrm:
+ case X86::VMOVAPSYrm:
+ case X86::VMOVUPSYrm:
+ case X86::VMOVAPDYrm:
+ case X86::VMOVUPDYrm:
+ case X86::VMOVDQAYrm:
+ case X86::VMOVDQUYrm:
+ // AVX512 load instructions
+ case X86::VMOVSSZrm:
+ case X86::VMOVSSZrm_alt:
+ case X86::VMOVSDZrm:
+ case X86::VMOVSDZrm_alt:
+ case X86::VMOVAPSZ128rm:
+ case X86::VMOVUPSZ128rm:
+ case X86::VMOVAPSZ128rm_NOVLX:
+ case X86::VMOVUPSZ128rm_NOVLX:
+ case X86::VMOVAPDZ128rm:
+ case X86::VMOVUPDZ128rm:
+ case X86::VMOVDQU8Z128rm:
+ case X86::VMOVDQU16Z128rm:
+ case X86::VMOVDQA32Z128rm:
+ case X86::VMOVDQU32Z128rm:
+ case X86::VMOVDQA64Z128rm:
+ case X86::VMOVDQU64Z128rm:
+ case X86::VMOVAPSZ256rm:
+ case X86::VMOVUPSZ256rm:
+ case X86::VMOVAPSZ256rm_NOVLX:
+ case X86::VMOVUPSZ256rm_NOVLX:
+ case X86::VMOVAPDZ256rm:
+ case X86::VMOVUPDZ256rm:
+ case X86::VMOVDQU8Z256rm:
+ case X86::VMOVDQU16Z256rm:
+ case X86::VMOVDQA32Z256rm:
+ case X86::VMOVDQU32Z256rm:
+ case X86::VMOVDQA64Z256rm:
+ case X86::VMOVDQU64Z256rm:
+ case X86::VMOVAPSZrm:
+ case X86::VMOVUPSZrm:
+ case X86::VMOVAPDZrm:
+ case X86::VMOVUPDZrm:
+ case X86::VMOVDQU8Zrm:
+ case X86::VMOVDQU16Zrm:
+ case X86::VMOVDQA32Zrm:
+ case X86::VMOVDQU32Zrm:
+ case X86::VMOVDQA64Zrm:
+ case X86::VMOVDQU64Zrm:
+ case X86::KMOVBkm:
+ case X86::KMOVWkm:
+ case X86::KMOVDkm:
+ case X86::KMOVQkm:
+ break;
+ }
+ switch (Opc2) {
+ default: return false;
+ case X86::MOV8rm:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::LD_Fp32m:
+ case X86::LD_Fp64m:
+ case X86::LD_Fp80m:
+ case X86::MOVSSrm:
+ case X86::MOVSSrm_alt:
+ case X86::MOVSDrm:
+ case X86::MOVSDrm_alt:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ case X86::MOVAPSrm:
+ case X86::MOVUPSrm:
+ case X86::MOVAPDrm:
+ case X86::MOVUPDrm:
+ case X86::MOVDQArm:
+ case X86::MOVDQUrm:
+ // AVX load instructions
+ case X86::VMOVSSrm:
+ case X86::VMOVSSrm_alt:
+ case X86::VMOVSDrm:
+ case X86::VMOVSDrm_alt:
+ case X86::VMOVAPSrm:
+ case X86::VMOVUPSrm:
+ case X86::VMOVAPDrm:
+ case X86::VMOVUPDrm:
+ case X86::VMOVDQArm:
+ case X86::VMOVDQUrm:
+ case X86::VMOVAPSYrm:
+ case X86::VMOVUPSYrm:
+ case X86::VMOVAPDYrm:
+ case X86::VMOVUPDYrm:
+ case X86::VMOVDQAYrm:
+ case X86::VMOVDQUYrm:
+ // AVX512 load instructions
+ case X86::VMOVSSZrm:
+ case X86::VMOVSSZrm_alt:
+ case X86::VMOVSDZrm:
+ case X86::VMOVSDZrm_alt:
+ case X86::VMOVAPSZ128rm:
+ case X86::VMOVUPSZ128rm:
+ case X86::VMOVAPSZ128rm_NOVLX:
+ case X86::VMOVUPSZ128rm_NOVLX:
+ case X86::VMOVAPDZ128rm:
+ case X86::VMOVUPDZ128rm:
+ case X86::VMOVDQU8Z128rm:
+ case X86::VMOVDQU16Z128rm:
+ case X86::VMOVDQA32Z128rm:
+ case X86::VMOVDQU32Z128rm:
+ case X86::VMOVDQA64Z128rm:
+ case X86::VMOVDQU64Z128rm:
+ case X86::VMOVAPSZ256rm:
+ case X86::VMOVUPSZ256rm:
+ case X86::VMOVAPSZ256rm_NOVLX:
+ case X86::VMOVUPSZ256rm_NOVLX:
+ case X86::VMOVAPDZ256rm:
+ case X86::VMOVUPDZ256rm:
+ case X86::VMOVDQU8Z256rm:
+ case X86::VMOVDQU16Z256rm:
+ case X86::VMOVDQA32Z256rm:
+ case X86::VMOVDQU32Z256rm:
+ case X86::VMOVDQA64Z256rm:
+ case X86::VMOVDQU64Z256rm:
+ case X86::VMOVAPSZrm:
+ case X86::VMOVUPSZrm:
+ case X86::VMOVAPDZrm:
+ case X86::VMOVUPDZrm:
+ case X86::VMOVDQU8Zrm:
+ case X86::VMOVDQU16Zrm:
+ case X86::VMOVDQA32Zrm:
+ case X86::VMOVDQU32Zrm:
+ case X86::VMOVDQA64Zrm:
+ case X86::VMOVDQU64Zrm:
+ case X86::KMOVBkm:
+ case X86::KMOVWkm:
+ case X86::KMOVDkm:
+ case X86::KMOVQkm:
+ break;
+ }
+
+ // Lambda to check whether both loads have the same value at a given
+ // operand index.
+ auto HasSameOp = [&](int I) {
+ return Load1->getOperand(I) == Load2->getOperand(I);
+ };
+
+ // All operands except the displacement should match.
+ if (!HasSameOp(X86::AddrBaseReg) || !HasSameOp(X86::AddrScaleAmt) ||
+ !HasSameOp(X86::AddrIndexReg) || !HasSameOp(X86::AddrSegmentReg))
+ return false;
+
+ // The chain operand must be the same.
+ if (!HasSameOp(5))
+ return false;
+
+ // Now let's examine if the displacements are constants.
+ auto Disp1 = dyn_cast<ConstantSDNode>(Load1->getOperand(X86::AddrDisp));
+ auto Disp2 = dyn_cast<ConstantSDNode>(Load2->getOperand(X86::AddrDisp));
+ if (!Disp1 || !Disp2)
+ return false;
+
+ Offset1 = Disp1->getSExtValue();
+ Offset2 = Disp2->getSExtValue();
+ return true;
+}
+
+bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+ int64_t Offset1, int64_t Offset2,
+ unsigned NumLoads) const {
+ assert(Offset2 > Offset1);
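+ // Be conservative: don't cluster loads whose displacements are more than
+ // 64 8-byte slots apart.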
+ if ((Offset2 - Offset1) / 8 > 64)
+ return false;
+
+ unsigned Opc1 = Load1->getMachineOpcode();
+ unsigned Opc2 = Load2->getMachineOpcode();
+ if (Opc1 != Opc2)
+ return false; // FIXME: overly conservative?
+
+ switch (Opc1) {
+ default: break;
+ case X86::LD_Fp32m:
+ case X86::LD_Fp64m:
+ case X86::LD_Fp80m:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ return false;
+ }
+
+ EVT VT = Load1->getValueType(0);
+ switch (VT.getSimpleVT().SimpleTy) {
+ default:
+ // XMM registers. In 64-bit mode we can be a bit more aggressive since we
+ // have 16 of them to play with.
+ if (Subtarget.is64Bit()) {
+ if (NumLoads >= 3)
+ return false;
+ } else if (NumLoads) {
+ return false;
+ }
+ break;
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::i64:
+ case MVT::f32:
+ case MVT::f64:
+ if (NumLoads)
+ return false;
+ break;
+ }
+
+ return true;
+}
+
+bool X86InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const {
+
+ // ENDBR instructions should not be scheduled around.
+ unsigned Opcode = MI.getOpcode();
+ if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32)
+ return true;
+
+ return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);
+}
+
+bool X86InstrInfo::
+reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+ assert(Cond.size() == 1 && "Invalid X86 branch condition!");
+ X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
+ Cond[0].setImm(GetOppositeBranchCondition(CC));
+ return false;
+}
+
+bool X86InstrInfo::
+isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
+ // FIXME: Return false for x87 stack register classes for now. We can't
+ // allow any loads of these registers before FpGet_ST0_80.
+ return !(RC == &X86::CCRRegClass || RC == &X86::DFCCRRegClass ||
+ RC == &X86::RFP32RegClass || RC == &X86::RFP64RegClass ||
+ RC == &X86::RFP80RegClass);
+}
+
+/// Return a virtual register initialized with the global base register
+/// value. Output the instructions required to initialize the register in
+/// the function entry block, if necessary.
+///
+/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
+///
+unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
+ assert((!Subtarget.is64Bit() ||
+ MF->getTarget().getCodeModel() == CodeModel::Medium ||
+ MF->getTarget().getCodeModel() == CodeModel::Large) &&
+ "X86-64 PIC uses RIP relative addressing");
+
+ X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+ Register GlobalBaseReg = X86FI->getGlobalBaseReg();
+ if (GlobalBaseReg != 0)
+ return GlobalBaseReg;
+
+ // Create the register. The code to initialize it is inserted
+ // later, by the CGBR pass (below).
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ GlobalBaseReg = RegInfo.createVirtualRegister(
+ Subtarget.is64Bit() ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass);
+ X86FI->setGlobalBaseReg(GlobalBaseReg);
+ return GlobalBaseReg;
+}
+
+// These are the replaceable SSE instructions. Some of these have Int variants
+// that we don't include here. We don't want to replace instructions selected
+// by intrinsics.
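+// For example, a vector bitwise AND can be emitted as ANDPSrr (single
+// domain), ANDPDrr (double domain) or PANDrr (integer domain); the
+// execution-domain fixup uses a row of this table to pick whichever form
+// avoids a domain-crossing penalty.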
+static const uint16_t ReplaceableInstrs[][3] = {
+ //PackedSingle PackedDouble PackedInt
+ { X86::MOVAPSmr, X86::MOVAPDmr, X86::MOVDQAmr },
+ { X86::MOVAPSrm, X86::MOVAPDrm, X86::MOVDQArm },
+ { X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr },
+ { X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr },
+ { X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm },
+ { X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr },
+ { X86::MOVSDmr, X86::MOVSDmr, X86::MOVPQI2QImr },
+ { X86::MOVSSmr, X86::MOVSSmr, X86::MOVPDI2DImr },
+ { X86::MOVSDrm, X86::MOVSDrm, X86::MOVQI2PQIrm },
+ { X86::MOVSDrm_alt,X86::MOVSDrm_alt,X86::MOVQI2PQIrm },
+ { X86::MOVSSrm, X86::MOVSSrm, X86::MOVDI2PDIrm },
+ { X86::MOVSSrm_alt,X86::MOVSSrm_alt,X86::MOVDI2PDIrm },
+ { X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr },
+ { X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm },
+ { X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr },
+ { X86::ANDPSrm, X86::ANDPDrm, X86::PANDrm },
+ { X86::ANDPSrr, X86::ANDPDrr, X86::PANDrr },
+ { X86::ORPSrm, X86::ORPDrm, X86::PORrm },
+ { X86::ORPSrr, X86::ORPDrr, X86::PORrr },
+ { X86::XORPSrm, X86::XORPDrm, X86::PXORrm },
+ { X86::XORPSrr, X86::XORPDrr, X86::PXORrr },
+ { X86::UNPCKLPDrm, X86::UNPCKLPDrm, X86::PUNPCKLQDQrm },
+ { X86::MOVLHPSrr, X86::UNPCKLPDrr, X86::PUNPCKLQDQrr },
+ { X86::UNPCKHPDrm, X86::UNPCKHPDrm, X86::PUNPCKHQDQrm },
+ { X86::UNPCKHPDrr, X86::UNPCKHPDrr, X86::PUNPCKHQDQrr },
+ { X86::UNPCKLPSrm, X86::UNPCKLPSrm, X86::PUNPCKLDQrm },
+ { X86::UNPCKLPSrr, X86::UNPCKLPSrr, X86::PUNPCKLDQrr },
+ { X86::UNPCKHPSrm, X86::UNPCKHPSrm, X86::PUNPCKHDQrm },
+ { X86::UNPCKHPSrr, X86::UNPCKHPSrr, X86::PUNPCKHDQrr },
+ { X86::EXTRACTPSmr, X86::EXTRACTPSmr, X86::PEXTRDmr },
+ { X86::EXTRACTPSrr, X86::EXTRACTPSrr, X86::PEXTRDrr },
+ // AVX 128-bit support
+ { X86::VMOVAPSmr, X86::VMOVAPDmr, X86::VMOVDQAmr },
+ { X86::VMOVAPSrm, X86::VMOVAPDrm, X86::VMOVDQArm },
+ { X86::VMOVAPSrr, X86::VMOVAPDrr, X86::VMOVDQArr },
+ { X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr },
+ { X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm },
+ { X86::VMOVLPSmr, X86::VMOVLPDmr, X86::VMOVPQI2QImr },
+ { X86::VMOVSDmr, X86::VMOVSDmr, X86::VMOVPQI2QImr },
+ { X86::VMOVSSmr, X86::VMOVSSmr, X86::VMOVPDI2DImr },
+ { X86::VMOVSDrm, X86::VMOVSDrm, X86::VMOVQI2PQIrm },
+ { X86::VMOVSDrm_alt,X86::VMOVSDrm_alt,X86::VMOVQI2PQIrm },
+ { X86::VMOVSSrm, X86::VMOVSSrm, X86::VMOVDI2PDIrm },
+ { X86::VMOVSSrm_alt,X86::VMOVSSrm_alt,X86::VMOVDI2PDIrm },
+ { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr },
+ { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm },
+ { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr },
+ { X86::VANDPSrm, X86::VANDPDrm, X86::VPANDrm },
+ { X86::VANDPSrr, X86::VANDPDrr, X86::VPANDrr },
+ { X86::VORPSrm, X86::VORPDrm, X86::VPORrm },
+ { X86::VORPSrr, X86::VORPDrr, X86::VPORrr },
+ { X86::VXORPSrm, X86::VXORPDrm, X86::VPXORrm },
+ { X86::VXORPSrr, X86::VXORPDrr, X86::VPXORrr },
+ { X86::VUNPCKLPDrm, X86::VUNPCKLPDrm, X86::VPUNPCKLQDQrm },
+ { X86::VMOVLHPSrr, X86::VUNPCKLPDrr, X86::VPUNPCKLQDQrr },
+ { X86::VUNPCKHPDrm, X86::VUNPCKHPDrm, X86::VPUNPCKHQDQrm },
+ { X86::VUNPCKHPDrr, X86::VUNPCKHPDrr, X86::VPUNPCKHQDQrr },
+ { X86::VUNPCKLPSrm, X86::VUNPCKLPSrm, X86::VPUNPCKLDQrm },
+ { X86::VUNPCKLPSrr, X86::VUNPCKLPSrr, X86::VPUNPCKLDQrr },
+ { X86::VUNPCKHPSrm, X86::VUNPCKHPSrm, X86::VPUNPCKHDQrm },
+ { X86::VUNPCKHPSrr, X86::VUNPCKHPSrr, X86::VPUNPCKHDQrr },
+ { X86::VEXTRACTPSmr, X86::VEXTRACTPSmr, X86::VPEXTRDmr },
+ { X86::VEXTRACTPSrr, X86::VEXTRACTPSrr, X86::VPEXTRDrr },
+ // AVX 256-bit support
+ { X86::VMOVAPSYmr, X86::VMOVAPDYmr, X86::VMOVDQAYmr },
+ { X86::VMOVAPSYrm, X86::VMOVAPDYrm, X86::VMOVDQAYrm },
+ { X86::VMOVAPSYrr, X86::VMOVAPDYrr, X86::VMOVDQAYrr },
+ { X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr },
+ { X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm },
+ { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr },
+ { X86::VPERMPSYrm, X86::VPERMPSYrm, X86::VPERMDYrm },
+ { X86::VPERMPSYrr, X86::VPERMPSYrr, X86::VPERMDYrr },
+ { X86::VPERMPDYmi, X86::VPERMPDYmi, X86::VPERMQYmi },
+ { X86::VPERMPDYri, X86::VPERMPDYri, X86::VPERMQYri },
+ // AVX512 support
+ { X86::VMOVLPSZ128mr, X86::VMOVLPDZ128mr, X86::VMOVPQI2QIZmr },
+ { X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr },
+ { X86::VMOVNTPSZ256mr, X86::VMOVNTPDZ256mr, X86::VMOVNTDQZ256mr },
+ { X86::VMOVNTPSZmr, X86::VMOVNTPDZmr, X86::VMOVNTDQZmr },
+ { X86::VMOVSDZmr, X86::VMOVSDZmr, X86::VMOVPQI2QIZmr },
+ { X86::VMOVSSZmr, X86::VMOVSSZmr, X86::VMOVPDI2DIZmr },
+ { X86::VMOVSDZrm, X86::VMOVSDZrm, X86::VMOVQI2PQIZrm },
+ { X86::VMOVSDZrm_alt, X86::VMOVSDZrm_alt, X86::VMOVQI2PQIZrm },
+ { X86::VMOVSSZrm, X86::VMOVSSZrm, X86::VMOVDI2PDIZrm },
+ { X86::VMOVSSZrm_alt, X86::VMOVSSZrm_alt, X86::VMOVDI2PDIZrm },
+ { X86::VBROADCASTSSZ128rr,X86::VBROADCASTSSZ128rr,X86::VPBROADCASTDZ128rr },
+ { X86::VBROADCASTSSZ128rm,X86::VBROADCASTSSZ128rm,X86::VPBROADCASTDZ128rm },
+ { X86::VBROADCASTSSZ256rr,X86::VBROADCASTSSZ256rr,X86::VPBROADCASTDZ256rr },
+ { X86::VBROADCASTSSZ256rm,X86::VBROADCASTSSZ256rm,X86::VPBROADCASTDZ256rm },
+ { X86::VBROADCASTSSZrr, X86::VBROADCASTSSZrr, X86::VPBROADCASTDZrr },
+ { X86::VBROADCASTSSZrm, X86::VBROADCASTSSZrm, X86::VPBROADCASTDZrm },
+ { X86::VMOVDDUPZ128rr, X86::VMOVDDUPZ128rr, X86::VPBROADCASTQZ128rr },
+ { X86::VMOVDDUPZ128rm, X86::VMOVDDUPZ128rm, X86::VPBROADCASTQZ128rm },
+ { X86::VBROADCASTSDZ256rr,X86::VBROADCASTSDZ256rr,X86::VPBROADCASTQZ256rr },
+ { X86::VBROADCASTSDZ256rm,X86::VBROADCASTSDZ256rm,X86::VPBROADCASTQZ256rm },
+ { X86::VBROADCASTSDZrr, X86::VBROADCASTSDZrr, X86::VPBROADCASTQZrr },
+ { X86::VBROADCASTSDZrm, X86::VBROADCASTSDZrm, X86::VPBROADCASTQZrm },
+ { X86::VINSERTF32x4Zrr, X86::VINSERTF32x4Zrr, X86::VINSERTI32x4Zrr },
+ { X86::VINSERTF32x4Zrm, X86::VINSERTF32x4Zrm, X86::VINSERTI32x4Zrm },
+ { X86::VINSERTF32x8Zrr, X86::VINSERTF32x8Zrr, X86::VINSERTI32x8Zrr },
+ { X86::VINSERTF32x8Zrm, X86::VINSERTF32x8Zrm, X86::VINSERTI32x8Zrm },
+ { X86::VINSERTF64x2Zrr, X86::VINSERTF64x2Zrr, X86::VINSERTI64x2Zrr },
+ { X86::VINSERTF64x2Zrm, X86::VINSERTF64x2Zrm, X86::VINSERTI64x2Zrm },
+ { X86::VINSERTF64x4Zrr, X86::VINSERTF64x4Zrr, X86::VINSERTI64x4Zrr },
+ { X86::VINSERTF64x4Zrm, X86::VINSERTF64x4Zrm, X86::VINSERTI64x4Zrm },
+ { X86::VINSERTF32x4Z256rr,X86::VINSERTF32x4Z256rr,X86::VINSERTI32x4Z256rr },
+ { X86::VINSERTF32x4Z256rm,X86::VINSERTF32x4Z256rm,X86::VINSERTI32x4Z256rm },
+ { X86::VINSERTF64x2Z256rr,X86::VINSERTF64x2Z256rr,X86::VINSERTI64x2Z256rr },
+ { X86::VINSERTF64x2Z256rm,X86::VINSERTF64x2Z256rm,X86::VINSERTI64x2Z256rm },
+ { X86::VEXTRACTF32x4Zrr, X86::VEXTRACTF32x4Zrr, X86::VEXTRACTI32x4Zrr },
+ { X86::VEXTRACTF32x4Zmr, X86::VEXTRACTF32x4Zmr, X86::VEXTRACTI32x4Zmr },
+ { X86::VEXTRACTF32x8Zrr, X86::VEXTRACTF32x8Zrr, X86::VEXTRACTI32x8Zrr },
+ { X86::VEXTRACTF32x8Zmr, X86::VEXTRACTF32x8Zmr, X86::VEXTRACTI32x8Zmr },
+ { X86::VEXTRACTF64x2Zrr, X86::VEXTRACTF64x2Zrr, X86::VEXTRACTI64x2Zrr },
+ { X86::VEXTRACTF64x2Zmr, X86::VEXTRACTF64x2Zmr, X86::VEXTRACTI64x2Zmr },
+ { X86::VEXTRACTF64x4Zrr, X86::VEXTRACTF64x4Zrr, X86::VEXTRACTI64x4Zrr },
+ { X86::VEXTRACTF64x4Zmr, X86::VEXTRACTF64x4Zmr, X86::VEXTRACTI64x4Zmr },
+ { X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTI32x4Z256rr },
+ { X86::VEXTRACTF32x4Z256mr,X86::VEXTRACTF32x4Z256mr,X86::VEXTRACTI32x4Z256mr },
+ { X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTI64x2Z256rr },
+ { X86::VEXTRACTF64x2Z256mr,X86::VEXTRACTF64x2Z256mr,X86::VEXTRACTI64x2Z256mr },
+ { X86::VPERMILPSmi, X86::VPERMILPSmi, X86::VPSHUFDmi },
+ { X86::VPERMILPSri, X86::VPERMILPSri, X86::VPSHUFDri },
+ { X86::VPERMILPSZ128mi, X86::VPERMILPSZ128mi, X86::VPSHUFDZ128mi },
+ { X86::VPERMILPSZ128ri, X86::VPERMILPSZ128ri, X86::VPSHUFDZ128ri },
+ { X86::VPERMILPSZ256mi, X86::VPERMILPSZ256mi, X86::VPSHUFDZ256mi },
+ { X86::VPERMILPSZ256ri, X86::VPERMILPSZ256ri, X86::VPSHUFDZ256ri },
+ { X86::VPERMILPSZmi, X86::VPERMILPSZmi, X86::VPSHUFDZmi },
+ { X86::VPERMILPSZri, X86::VPERMILPSZri, X86::VPSHUFDZri },
+ { X86::VPERMPSZ256rm, X86::VPERMPSZ256rm, X86::VPERMDZ256rm },
+ { X86::VPERMPSZ256rr, X86::VPERMPSZ256rr, X86::VPERMDZ256rr },
+ { X86::VPERMPDZ256mi, X86::VPERMPDZ256mi, X86::VPERMQZ256mi },
+ { X86::VPERMPDZ256ri, X86::VPERMPDZ256ri, X86::VPERMQZ256ri },
+ { X86::VPERMPDZ256rm, X86::VPERMPDZ256rm, X86::VPERMQZ256rm },
+ { X86::VPERMPDZ256rr, X86::VPERMPDZ256rr, X86::VPERMQZ256rr },
+ { X86::VPERMPSZrm, X86::VPERMPSZrm, X86::VPERMDZrm },
+ { X86::VPERMPSZrr, X86::VPERMPSZrr, X86::VPERMDZrr },
+ { X86::VPERMPDZmi, X86::VPERMPDZmi, X86::VPERMQZmi },
+ { X86::VPERMPDZri, X86::VPERMPDZri, X86::VPERMQZri },
+ { X86::VPERMPDZrm, X86::VPERMPDZrm, X86::VPERMQZrm },
+ { X86::VPERMPDZrr, X86::VPERMPDZrr, X86::VPERMQZrr },
+ { X86::VUNPCKLPDZ256rm, X86::VUNPCKLPDZ256rm, X86::VPUNPCKLQDQZ256rm },
+ { X86::VUNPCKLPDZ256rr, X86::VUNPCKLPDZ256rr, X86::VPUNPCKLQDQZ256rr },
+ { X86::VUNPCKHPDZ256rm, X86::VUNPCKHPDZ256rm, X86::VPUNPCKHQDQZ256rm },
+ { X86::VUNPCKHPDZ256rr, X86::VUNPCKHPDZ256rr, X86::VPUNPCKHQDQZ256rr },
+ { X86::VUNPCKLPSZ256rm, X86::VUNPCKLPSZ256rm, X86::VPUNPCKLDQZ256rm },
+ { X86::VUNPCKLPSZ256rr, X86::VUNPCKLPSZ256rr, X86::VPUNPCKLDQZ256rr },
+ { X86::VUNPCKHPSZ256rm, X86::VUNPCKHPSZ256rm, X86::VPUNPCKHDQZ256rm },
+ { X86::VUNPCKHPSZ256rr, X86::VUNPCKHPSZ256rr, X86::VPUNPCKHDQZ256rr },
+ { X86::VUNPCKLPDZ128rm, X86::VUNPCKLPDZ128rm, X86::VPUNPCKLQDQZ128rm },
+ { X86::VMOVLHPSZrr, X86::VUNPCKLPDZ128rr, X86::VPUNPCKLQDQZ128rr },
+ { X86::VUNPCKHPDZ128rm, X86::VUNPCKHPDZ128rm, X86::VPUNPCKHQDQZ128rm },
+ { X86::VUNPCKHPDZ128rr, X86::VUNPCKHPDZ128rr, X86::VPUNPCKHQDQZ128rr },
+ { X86::VUNPCKLPSZ128rm, X86::VUNPCKLPSZ128rm, X86::VPUNPCKLDQZ128rm },
+ { X86::VUNPCKLPSZ128rr, X86::VUNPCKLPSZ128rr, X86::VPUNPCKLDQZ128rr },
+ { X86::VUNPCKHPSZ128rm, X86::VUNPCKHPSZ128rm, X86::VPUNPCKHDQZ128rm },
+ { X86::VUNPCKHPSZ128rr, X86::VUNPCKHPSZ128rr, X86::VPUNPCKHDQZ128rr },
+ { X86::VUNPCKLPDZrm, X86::VUNPCKLPDZrm, X86::VPUNPCKLQDQZrm },
+ { X86::VUNPCKLPDZrr, X86::VUNPCKLPDZrr, X86::VPUNPCKLQDQZrr },
+ { X86::VUNPCKHPDZrm, X86::VUNPCKHPDZrm, X86::VPUNPCKHQDQZrm },
+ { X86::VUNPCKHPDZrr, X86::VUNPCKHPDZrr, X86::VPUNPCKHQDQZrr },
+ { X86::VUNPCKLPSZrm, X86::VUNPCKLPSZrm, X86::VPUNPCKLDQZrm },
+ { X86::VUNPCKLPSZrr, X86::VUNPCKLPSZrr, X86::VPUNPCKLDQZrr },
+ { X86::VUNPCKHPSZrm, X86::VUNPCKHPSZrm, X86::VPUNPCKHDQZrm },
+ { X86::VUNPCKHPSZrr, X86::VUNPCKHPSZrr, X86::VPUNPCKHDQZrr },
+ { X86::VEXTRACTPSZmr, X86::VEXTRACTPSZmr, X86::VPEXTRDZmr },
+ { X86::VEXTRACTPSZrr, X86::VEXTRACTPSZrr, X86::VPEXTRDZrr },
+};
+
+static const uint16_t ReplaceableInstrsAVX2[][3] = {
+ //PackedSingle PackedDouble PackedInt
+ { X86::VANDNPSYrm, X86::VANDNPDYrm, X86::VPANDNYrm },
+ { X86::VANDNPSYrr, X86::VANDNPDYrr, X86::VPANDNYrr },
+ { X86::VANDPSYrm, X86::VANDPDYrm, X86::VPANDYrm },
+ { X86::VANDPSYrr, X86::VANDPDYrr, X86::VPANDYrr },
+ { X86::VORPSYrm, X86::VORPDYrm, X86::VPORYrm },
+ { X86::VORPSYrr, X86::VORPDYrr, X86::VPORYrr },
+ { X86::VXORPSYrm, X86::VXORPDYrm, X86::VPXORYrm },
+ { X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORYrr },
+ { X86::VPERM2F128rm, X86::VPERM2F128rm, X86::VPERM2I128rm },
+ { X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr },
+ { X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm},
+ { X86::VBROADCASTSSrr, X86::VBROADCASTSSrr, X86::VPBROADCASTDrr},
+ { X86::VMOVDDUPrm, X86::VMOVDDUPrm, X86::VPBROADCASTQrm},
+ { X86::VMOVDDUPrr, X86::VMOVDDUPrr, X86::VPBROADCASTQrr},
+ { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr},
+ { X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm},
+ { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr},
+ { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm},
+ { X86::VBROADCASTF128, X86::VBROADCASTF128, X86::VBROADCASTI128 },
+ { X86::VBLENDPSYrri, X86::VBLENDPSYrri, X86::VPBLENDDYrri },
+ { X86::VBLENDPSYrmi, X86::VBLENDPSYrmi, X86::VPBLENDDYrmi },
+ { X86::VPERMILPSYmi, X86::VPERMILPSYmi, X86::VPSHUFDYmi },
+ { X86::VPERMILPSYri, X86::VPERMILPSYri, X86::VPSHUFDYri },
+ { X86::VUNPCKLPDYrm, X86::VUNPCKLPDYrm, X86::VPUNPCKLQDQYrm },
+ { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrr, X86::VPUNPCKLQDQYrr },
+ { X86::VUNPCKHPDYrm, X86::VUNPCKHPDYrm, X86::VPUNPCKHQDQYrm },
+ { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrr, X86::VPUNPCKHQDQYrr },
+ { X86::VUNPCKLPSYrm, X86::VUNPCKLPSYrm, X86::VPUNPCKLDQYrm },
+ { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrr, X86::VPUNPCKLDQYrr },
+ { X86::VUNPCKHPSYrm, X86::VUNPCKHPSYrm, X86::VPUNPCKHDQYrm },
+ { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrr, X86::VPUNPCKHDQYrr },
+};
+
+static const uint16_t ReplaceableInstrsFP[][3] = {
+  //PackedSingle         PackedDouble        (third column: INSTRUCTION_LIST_END, no PackedInt form)
+ { X86::MOVLPSrm, X86::MOVLPDrm, X86::INSTRUCTION_LIST_END },
+ { X86::MOVHPSrm, X86::MOVHPDrm, X86::INSTRUCTION_LIST_END },
+ { X86::MOVHPSmr, X86::MOVHPDmr, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVLPSrm, X86::VMOVLPDrm, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVHPSrm, X86::VMOVHPDrm, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVHPSmr, X86::VMOVHPDmr, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVLPSZ128rm, X86::VMOVLPDZ128rm, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVHPSZ128rm, X86::VMOVHPDZ128rm, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVHPSZ128mr, X86::VMOVHPDZ128mr, X86::INSTRUCTION_LIST_END },
+};
+
+static const uint16_t ReplaceableInstrsAVX2InsertExtract[][3] = {
+ //PackedSingle PackedDouble PackedInt
+ { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr },
+ { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr },
+ { X86::VINSERTF128rm, X86::VINSERTF128rm, X86::VINSERTI128rm },
+ { X86::VINSERTF128rr, X86::VINSERTF128rr, X86::VINSERTI128rr },
+};
+
+static const uint16_t ReplaceableInstrsAVX512[][4] = {
+ // Two integer columns for 64-bit and 32-bit elements.
+ //PackedSingle PackedDouble PackedInt PackedInt
+ { X86::VMOVAPSZ128mr, X86::VMOVAPDZ128mr, X86::VMOVDQA64Z128mr, X86::VMOVDQA32Z128mr },
+ { X86::VMOVAPSZ128rm, X86::VMOVAPDZ128rm, X86::VMOVDQA64Z128rm, X86::VMOVDQA32Z128rm },
+ { X86::VMOVAPSZ128rr, X86::VMOVAPDZ128rr, X86::VMOVDQA64Z128rr, X86::VMOVDQA32Z128rr },
+ { X86::VMOVUPSZ128mr, X86::VMOVUPDZ128mr, X86::VMOVDQU64Z128mr, X86::VMOVDQU32Z128mr },
+ { X86::VMOVUPSZ128rm, X86::VMOVUPDZ128rm, X86::VMOVDQU64Z128rm, X86::VMOVDQU32Z128rm },
+ { X86::VMOVAPSZ256mr, X86::VMOVAPDZ256mr, X86::VMOVDQA64Z256mr, X86::VMOVDQA32Z256mr },
+ { X86::VMOVAPSZ256rm, X86::VMOVAPDZ256rm, X86::VMOVDQA64Z256rm, X86::VMOVDQA32Z256rm },
+ { X86::VMOVAPSZ256rr, X86::VMOVAPDZ256rr, X86::VMOVDQA64Z256rr, X86::VMOVDQA32Z256rr },
+ { X86::VMOVUPSZ256mr, X86::VMOVUPDZ256mr, X86::VMOVDQU64Z256mr, X86::VMOVDQU32Z256mr },
+ { X86::VMOVUPSZ256rm, X86::VMOVUPDZ256rm, X86::VMOVDQU64Z256rm, X86::VMOVDQU32Z256rm },
+ { X86::VMOVAPSZmr, X86::VMOVAPDZmr, X86::VMOVDQA64Zmr, X86::VMOVDQA32Zmr },
+ { X86::VMOVAPSZrm, X86::VMOVAPDZrm, X86::VMOVDQA64Zrm, X86::VMOVDQA32Zrm },
+ { X86::VMOVAPSZrr, X86::VMOVAPDZrr, X86::VMOVDQA64Zrr, X86::VMOVDQA32Zrr },
+ { X86::VMOVUPSZmr, X86::VMOVUPDZmr, X86::VMOVDQU64Zmr, X86::VMOVDQU32Zmr },
+ { X86::VMOVUPSZrm, X86::VMOVUPDZrm, X86::VMOVDQU64Zrm, X86::VMOVDQU32Zrm },
+};
+
+static const uint16_t ReplaceableInstrsAVX512DQ[][4] = {
+ // Two integer columns for 64-bit and 32-bit elements.
+ //PackedSingle PackedDouble PackedInt PackedInt
+ { X86::VANDNPSZ128rm, X86::VANDNPDZ128rm, X86::VPANDNQZ128rm, X86::VPANDNDZ128rm },
+ { X86::VANDNPSZ128rr, X86::VANDNPDZ128rr, X86::VPANDNQZ128rr, X86::VPANDNDZ128rr },
+ { X86::VANDPSZ128rm, X86::VANDPDZ128rm, X86::VPANDQZ128rm, X86::VPANDDZ128rm },
+ { X86::VANDPSZ128rr, X86::VANDPDZ128rr, X86::VPANDQZ128rr, X86::VPANDDZ128rr },
+ { X86::VORPSZ128rm, X86::VORPDZ128rm, X86::VPORQZ128rm, X86::VPORDZ128rm },
+ { X86::VORPSZ128rr, X86::VORPDZ128rr, X86::VPORQZ128rr, X86::VPORDZ128rr },
+ { X86::VXORPSZ128rm, X86::VXORPDZ128rm, X86::VPXORQZ128rm, X86::VPXORDZ128rm },
+ { X86::VXORPSZ128rr, X86::VXORPDZ128rr, X86::VPXORQZ128rr, X86::VPXORDZ128rr },
+ { X86::VANDNPSZ256rm, X86::VANDNPDZ256rm, X86::VPANDNQZ256rm, X86::VPANDNDZ256rm },
+ { X86::VANDNPSZ256rr, X86::VANDNPDZ256rr, X86::VPANDNQZ256rr, X86::VPANDNDZ256rr },
+ { X86::VANDPSZ256rm, X86::VANDPDZ256rm, X86::VPANDQZ256rm, X86::VPANDDZ256rm },
+ { X86::VANDPSZ256rr, X86::VANDPDZ256rr, X86::VPANDQZ256rr, X86::VPANDDZ256rr },
+ { X86::VORPSZ256rm, X86::VORPDZ256rm, X86::VPORQZ256rm, X86::VPORDZ256rm },
+ { X86::VORPSZ256rr, X86::VORPDZ256rr, X86::VPORQZ256rr, X86::VPORDZ256rr },
+ { X86::VXORPSZ256rm, X86::VXORPDZ256rm, X86::VPXORQZ256rm, X86::VPXORDZ256rm },
+ { X86::VXORPSZ256rr, X86::VXORPDZ256rr, X86::VPXORQZ256rr, X86::VPXORDZ256rr },
+ { X86::VANDNPSZrm, X86::VANDNPDZrm, X86::VPANDNQZrm, X86::VPANDNDZrm },
+ { X86::VANDNPSZrr, X86::VANDNPDZrr, X86::VPANDNQZrr, X86::VPANDNDZrr },
+ { X86::VANDPSZrm, X86::VANDPDZrm, X86::VPANDQZrm, X86::VPANDDZrm },
+ { X86::VANDPSZrr, X86::VANDPDZrr, X86::VPANDQZrr, X86::VPANDDZrr },
+ { X86::VORPSZrm, X86::VORPDZrm, X86::VPORQZrm, X86::VPORDZrm },
+ { X86::VORPSZrr, X86::VORPDZrr, X86::VPORQZrr, X86::VPORDZrr },
+ { X86::VXORPSZrm, X86::VXORPDZrm, X86::VPXORQZrm, X86::VPXORDZrm },
+ { X86::VXORPSZrr, X86::VXORPDZrr, X86::VPXORQZrr, X86::VPXORDZrr },
+};
+
+static const uint16_t ReplaceableInstrsAVX512DQMasked[][4] = {
+ // Two integer columns for 64-bit and 32-bit elements.
+ //PackedSingle PackedDouble
+ //PackedInt PackedInt
+ { X86::VANDNPSZ128rmk, X86::VANDNPDZ128rmk,
+ X86::VPANDNQZ128rmk, X86::VPANDNDZ128rmk },
+ { X86::VANDNPSZ128rmkz, X86::VANDNPDZ128rmkz,
+ X86::VPANDNQZ128rmkz, X86::VPANDNDZ128rmkz },
+ { X86::VANDNPSZ128rrk, X86::VANDNPDZ128rrk,
+ X86::VPANDNQZ128rrk, X86::VPANDNDZ128rrk },
+ { X86::VANDNPSZ128rrkz, X86::VANDNPDZ128rrkz,
+ X86::VPANDNQZ128rrkz, X86::VPANDNDZ128rrkz },
+ { X86::VANDPSZ128rmk, X86::VANDPDZ128rmk,
+ X86::VPANDQZ128rmk, X86::VPANDDZ128rmk },
+ { X86::VANDPSZ128rmkz, X86::VANDPDZ128rmkz,
+ X86::VPANDQZ128rmkz, X86::VPANDDZ128rmkz },
+ { X86::VANDPSZ128rrk, X86::VANDPDZ128rrk,
+ X86::VPANDQZ128rrk, X86::VPANDDZ128rrk },
+ { X86::VANDPSZ128rrkz, X86::VANDPDZ128rrkz,
+ X86::VPANDQZ128rrkz, X86::VPANDDZ128rrkz },
+ { X86::VORPSZ128rmk, X86::VORPDZ128rmk,
+ X86::VPORQZ128rmk, X86::VPORDZ128rmk },
+ { X86::VORPSZ128rmkz, X86::VORPDZ128rmkz,
+ X86::VPORQZ128rmkz, X86::VPORDZ128rmkz },
+ { X86::VORPSZ128rrk, X86::VORPDZ128rrk,
+ X86::VPORQZ128rrk, X86::VPORDZ128rrk },
+ { X86::VORPSZ128rrkz, X86::VORPDZ128rrkz,
+ X86::VPORQZ128rrkz, X86::VPORDZ128rrkz },
+ { X86::VXORPSZ128rmk, X86::VXORPDZ128rmk,
+ X86::VPXORQZ128rmk, X86::VPXORDZ128rmk },
+ { X86::VXORPSZ128rmkz, X86::VXORPDZ128rmkz,
+ X86::VPXORQZ128rmkz, X86::VPXORDZ128rmkz },
+ { X86::VXORPSZ128rrk, X86::VXORPDZ128rrk,
+ X86::VPXORQZ128rrk, X86::VPXORDZ128rrk },
+ { X86::VXORPSZ128rrkz, X86::VXORPDZ128rrkz,
+ X86::VPXORQZ128rrkz, X86::VPXORDZ128rrkz },
+ { X86::VANDNPSZ256rmk, X86::VANDNPDZ256rmk,
+ X86::VPANDNQZ256rmk, X86::VPANDNDZ256rmk },
+ { X86::VANDNPSZ256rmkz, X86::VANDNPDZ256rmkz,
+ X86::VPANDNQZ256rmkz, X86::VPANDNDZ256rmkz },
+ { X86::VANDNPSZ256rrk, X86::VANDNPDZ256rrk,
+ X86::VPANDNQZ256rrk, X86::VPANDNDZ256rrk },
+ { X86::VANDNPSZ256rrkz, X86::VANDNPDZ256rrkz,
+ X86::VPANDNQZ256rrkz, X86::VPANDNDZ256rrkz },
+ { X86::VANDPSZ256rmk, X86::VANDPDZ256rmk,
+ X86::VPANDQZ256rmk, X86::VPANDDZ256rmk },
+ { X86::VANDPSZ256rmkz, X86::VANDPDZ256rmkz,
+ X86::VPANDQZ256rmkz, X86::VPANDDZ256rmkz },
+ { X86::VANDPSZ256rrk, X86::VANDPDZ256rrk,
+ X86::VPANDQZ256rrk, X86::VPANDDZ256rrk },
+ { X86::VANDPSZ256rrkz, X86::VANDPDZ256rrkz,
+ X86::VPANDQZ256rrkz, X86::VPANDDZ256rrkz },
+ { X86::VORPSZ256rmk, X86::VORPDZ256rmk,
+ X86::VPORQZ256rmk, X86::VPORDZ256rmk },
+ { X86::VORPSZ256rmkz, X86::VORPDZ256rmkz,
+ X86::VPORQZ256rmkz, X86::VPORDZ256rmkz },
+ { X86::VORPSZ256rrk, X86::VORPDZ256rrk,
+ X86::VPORQZ256rrk, X86::VPORDZ256rrk },
+ { X86::VORPSZ256rrkz, X86::VORPDZ256rrkz,
+ X86::VPORQZ256rrkz, X86::VPORDZ256rrkz },
+ { X86::VXORPSZ256rmk, X86::VXORPDZ256rmk,
+ X86::VPXORQZ256rmk, X86::VPXORDZ256rmk },
+ { X86::VXORPSZ256rmkz, X86::VXORPDZ256rmkz,
+ X86::VPXORQZ256rmkz, X86::VPXORDZ256rmkz },
+ { X86::VXORPSZ256rrk, X86::VXORPDZ256rrk,
+ X86::VPXORQZ256rrk, X86::VPXORDZ256rrk },
+ { X86::VXORPSZ256rrkz, X86::VXORPDZ256rrkz,
+ X86::VPXORQZ256rrkz, X86::VPXORDZ256rrkz },
+ { X86::VANDNPSZrmk, X86::VANDNPDZrmk,
+ X86::VPANDNQZrmk, X86::VPANDNDZrmk },
+ { X86::VANDNPSZrmkz, X86::VANDNPDZrmkz,
+ X86::VPANDNQZrmkz, X86::VPANDNDZrmkz },
+ { X86::VANDNPSZrrk, X86::VANDNPDZrrk,
+ X86::VPANDNQZrrk, X86::VPANDNDZrrk },
+ { X86::VANDNPSZrrkz, X86::VANDNPDZrrkz,
+ X86::VPANDNQZrrkz, X86::VPANDNDZrrkz },
+ { X86::VANDPSZrmk, X86::VANDPDZrmk,
+ X86::VPANDQZrmk, X86::VPANDDZrmk },
+ { X86::VANDPSZrmkz, X86::VANDPDZrmkz,
+ X86::VPANDQZrmkz, X86::VPANDDZrmkz },
+ { X86::VANDPSZrrk, X86::VANDPDZrrk,
+ X86::VPANDQZrrk, X86::VPANDDZrrk },
+ { X86::VANDPSZrrkz, X86::VANDPDZrrkz,
+ X86::VPANDQZrrkz, X86::VPANDDZrrkz },
+ { X86::VORPSZrmk, X86::VORPDZrmk,
+ X86::VPORQZrmk, X86::VPORDZrmk },
+ { X86::VORPSZrmkz, X86::VORPDZrmkz,
+ X86::VPORQZrmkz, X86::VPORDZrmkz },
+ { X86::VORPSZrrk, X86::VORPDZrrk,
+ X86::VPORQZrrk, X86::VPORDZrrk },
+ { X86::VORPSZrrkz, X86::VORPDZrrkz,
+ X86::VPORQZrrkz, X86::VPORDZrrkz },
+ { X86::VXORPSZrmk, X86::VXORPDZrmk,
+ X86::VPXORQZrmk, X86::VPXORDZrmk },
+ { X86::VXORPSZrmkz, X86::VXORPDZrmkz,
+ X86::VPXORQZrmkz, X86::VPXORDZrmkz },
+ { X86::VXORPSZrrk, X86::VXORPDZrrk,
+ X86::VPXORQZrrk, X86::VPXORDZrrk },
+ { X86::VXORPSZrrkz, X86::VXORPDZrrkz,
+ X86::VPXORQZrrkz, X86::VPXORDZrrkz },
+ // Broadcast loads can be handled the same as masked operations to avoid
+ // changing element size.
+ { X86::VANDNPSZ128rmb, X86::VANDNPDZ128rmb,
+ X86::VPANDNQZ128rmb, X86::VPANDNDZ128rmb },
+ { X86::VANDPSZ128rmb, X86::VANDPDZ128rmb,
+ X86::VPANDQZ128rmb, X86::VPANDDZ128rmb },
+ { X86::VORPSZ128rmb, X86::VORPDZ128rmb,
+ X86::VPORQZ128rmb, X86::VPORDZ128rmb },
+ { X86::VXORPSZ128rmb, X86::VXORPDZ128rmb,
+ X86::VPXORQZ128rmb, X86::VPXORDZ128rmb },
+ { X86::VANDNPSZ256rmb, X86::VANDNPDZ256rmb,
+ X86::VPANDNQZ256rmb, X86::VPANDNDZ256rmb },
+ { X86::VANDPSZ256rmb, X86::VANDPDZ256rmb,
+ X86::VPANDQZ256rmb, X86::VPANDDZ256rmb },
+ { X86::VORPSZ256rmb, X86::VORPDZ256rmb,
+ X86::VPORQZ256rmb, X86::VPORDZ256rmb },
+ { X86::VXORPSZ256rmb, X86::VXORPDZ256rmb,
+ X86::VPXORQZ256rmb, X86::VPXORDZ256rmb },
+ { X86::VANDNPSZrmb, X86::VANDNPDZrmb,
+ X86::VPANDNQZrmb, X86::VPANDNDZrmb },
+ { X86::VANDPSZrmb, X86::VANDPDZrmb,
+ X86::VPANDQZrmb, X86::VPANDDZrmb },
+ { X86::VORPSZrmb, X86::VORPDZrmb,
+ X86::VPORQZrmb, X86::VPORDZrmb },
+ { X86::VXORPSZrmb, X86::VXORPDZrmb,
+ X86::VPXORQZrmb, X86::VPXORDZrmb },
+ { X86::VANDNPSZ128rmbk, X86::VANDNPDZ128rmbk,
+ X86::VPANDNQZ128rmbk, X86::VPANDNDZ128rmbk },
+ { X86::VANDPSZ128rmbk, X86::VANDPDZ128rmbk,
+ X86::VPANDQZ128rmbk, X86::VPANDDZ128rmbk },
+ { X86::VORPSZ128rmbk, X86::VORPDZ128rmbk,
+ X86::VPORQZ128rmbk, X86::VPORDZ128rmbk },
+ { X86::VXORPSZ128rmbk, X86::VXORPDZ128rmbk,
+ X86::VPXORQZ128rmbk, X86::VPXORDZ128rmbk },
+ { X86::VANDNPSZ256rmbk, X86::VANDNPDZ256rmbk,
+ X86::VPANDNQZ256rmbk, X86::VPANDNDZ256rmbk },
+ { X86::VANDPSZ256rmbk, X86::VANDPDZ256rmbk,
+ X86::VPANDQZ256rmbk, X86::VPANDDZ256rmbk },
+ { X86::VORPSZ256rmbk, X86::VORPDZ256rmbk,
+ X86::VPORQZ256rmbk, X86::VPORDZ256rmbk },
+ { X86::VXORPSZ256rmbk, X86::VXORPDZ256rmbk,
+ X86::VPXORQZ256rmbk, X86::VPXORDZ256rmbk },
+ { X86::VANDNPSZrmbk, X86::VANDNPDZrmbk,
+ X86::VPANDNQZrmbk, X86::VPANDNDZrmbk },
+ { X86::VANDPSZrmbk, X86::VANDPDZrmbk,
+ X86::VPANDQZrmbk, X86::VPANDDZrmbk },
+ { X86::VORPSZrmbk, X86::VORPDZrmbk,
+ X86::VPORQZrmbk, X86::VPORDZrmbk },
+ { X86::VXORPSZrmbk, X86::VXORPDZrmbk,
+ X86::VPXORQZrmbk, X86::VPXORDZrmbk },
+ { X86::VANDNPSZ128rmbkz,X86::VANDNPDZ128rmbkz,
+ X86::VPANDNQZ128rmbkz,X86::VPANDNDZ128rmbkz},
+ { X86::VANDPSZ128rmbkz, X86::VANDPDZ128rmbkz,
+ X86::VPANDQZ128rmbkz, X86::VPANDDZ128rmbkz },
+ { X86::VORPSZ128rmbkz, X86::VORPDZ128rmbkz,
+ X86::VPORQZ128rmbkz, X86::VPORDZ128rmbkz },
+ { X86::VXORPSZ128rmbkz, X86::VXORPDZ128rmbkz,
+ X86::VPXORQZ128rmbkz, X86::VPXORDZ128rmbkz },
+ { X86::VANDNPSZ256rmbkz,X86::VANDNPDZ256rmbkz,
+ X86::VPANDNQZ256rmbkz,X86::VPANDNDZ256rmbkz},
+ { X86::VANDPSZ256rmbkz, X86::VANDPDZ256rmbkz,
+ X86::VPANDQZ256rmbkz, X86::VPANDDZ256rmbkz },
+ { X86::VORPSZ256rmbkz, X86::VORPDZ256rmbkz,
+ X86::VPORQZ256rmbkz, X86::VPORDZ256rmbkz },
+ { X86::VXORPSZ256rmbkz, X86::VXORPDZ256rmbkz,
+ X86::VPXORQZ256rmbkz, X86::VPXORDZ256rmbkz },
+ { X86::VANDNPSZrmbkz, X86::VANDNPDZrmbkz,
+ X86::VPANDNQZrmbkz, X86::VPANDNDZrmbkz },
+ { X86::VANDPSZrmbkz, X86::VANDPDZrmbkz,
+ X86::VPANDQZrmbkz, X86::VPANDDZrmbkz },
+ { X86::VORPSZrmbkz, X86::VORPDZrmbkz,
+ X86::VPORQZrmbkz, X86::VPORDZrmbkz },
+ { X86::VXORPSZrmbkz, X86::VXORPDZrmbkz,
+ X86::VPXORQZrmbkz, X86::VPXORDZrmbkz },
+};
+
+// NOTE: These should only be used by the custom domain methods.
+static const uint16_t ReplaceableBlendInstrs[][3] = {
+ //PackedSingle PackedDouble PackedInt
+ { X86::BLENDPSrmi, X86::BLENDPDrmi, X86::PBLENDWrmi },
+ { X86::BLENDPSrri, X86::BLENDPDrri, X86::PBLENDWrri },
+ { X86::VBLENDPSrmi, X86::VBLENDPDrmi, X86::VPBLENDWrmi },
+ { X86::VBLENDPSrri, X86::VBLENDPDrri, X86::VPBLENDWrri },
+ { X86::VBLENDPSYrmi, X86::VBLENDPDYrmi, X86::VPBLENDWYrmi },
+ { X86::VBLENDPSYrri, X86::VBLENDPDYrri, X86::VPBLENDWYrri },
+};
+static const uint16_t ReplaceableBlendAVX2Instrs[][3] = {
+ //PackedSingle PackedDouble PackedInt
+ { X86::VBLENDPSrmi, X86::VBLENDPDrmi, X86::VPBLENDDrmi },
+ { X86::VBLENDPSrri, X86::VBLENDPDrri, X86::VPBLENDDrri },
+ { X86::VBLENDPSYrmi, X86::VBLENDPDYrmi, X86::VPBLENDDYrmi },
+ { X86::VBLENDPSYrri, X86::VBLENDPDYrri, X86::VPBLENDDYrri },
+};
+
+// Special table for changing EVEX logic instructions to VEX.
+// TODO: Should we run EVEX->VEX earlier?
+static const uint16_t ReplaceableCustomAVX512LogicInstrs[][4] = {
+ // Two integer columns for 64-bit and 32-bit elements.
+ //PackedSingle PackedDouble PackedInt PackedInt
+ { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNQZ128rm, X86::VPANDNDZ128rm },
+ { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNQZ128rr, X86::VPANDNDZ128rr },
+ { X86::VANDPSrm, X86::VANDPDrm, X86::VPANDQZ128rm, X86::VPANDDZ128rm },
+ { X86::VANDPSrr, X86::VANDPDrr, X86::VPANDQZ128rr, X86::VPANDDZ128rr },
+ { X86::VORPSrm, X86::VORPDrm, X86::VPORQZ128rm, X86::VPORDZ128rm },
+ { X86::VORPSrr, X86::VORPDrr, X86::VPORQZ128rr, X86::VPORDZ128rr },
+ { X86::VXORPSrm, X86::VXORPDrm, X86::VPXORQZ128rm, X86::VPXORDZ128rm },
+ { X86::VXORPSrr, X86::VXORPDrr, X86::VPXORQZ128rr, X86::VPXORDZ128rr },
+ { X86::VANDNPSYrm, X86::VANDNPDYrm, X86::VPANDNQZ256rm, X86::VPANDNDZ256rm },
+ { X86::VANDNPSYrr, X86::VANDNPDYrr, X86::VPANDNQZ256rr, X86::VPANDNDZ256rr },
+ { X86::VANDPSYrm, X86::VANDPDYrm, X86::VPANDQZ256rm, X86::VPANDDZ256rm },
+ { X86::VANDPSYrr, X86::VANDPDYrr, X86::VPANDQZ256rr, X86::VPANDDZ256rr },
+ { X86::VORPSYrm, X86::VORPDYrm, X86::VPORQZ256rm, X86::VPORDZ256rm },
+ { X86::VORPSYrr, X86::VORPDYrr, X86::VPORQZ256rr, X86::VPORDZ256rr },
+ { X86::VXORPSYrm, X86::VXORPDYrm, X86::VPXORQZ256rm, X86::VPXORDZ256rm },
+ { X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORQZ256rr, X86::VPXORDZ256rr },
+};
+
+// FIXME: Some shuffle and unpack instructions have equivalents in different
+// domains, but they require a bit more work than just switching opcodes.
+
+static const uint16_t *lookup(unsigned opcode, unsigned domain,
+ ArrayRef<uint16_t[3]> Table) {
+ for (const uint16_t (&Row)[3] : Table)
+ if (Row[domain-1] == opcode)
+ return Row;
+ return nullptr;
+}
+
+static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
+ ArrayRef<uint16_t[4]> Table) {
+ // If this is the integer domain make sure to check both integer columns.
+ for (const uint16_t (&Row)[4] : Table)
+ if (Row[domain-1] == opcode || (domain == 3 && Row[3] == opcode))
+ return Row;
+ return nullptr;
+}
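+
+// For example (a sketch of how these tables are consumed by
+// setExecutionDomain() below): converting ANDPS to the integer domain finds
+// the row whose PackedSingle column matches and indexes it with the 1-based
+// target domain:
+//
+//   if (const uint16_t *Row = lookup(X86::ANDPSrr, /*domain=*/1,
+//                                    ReplaceableInstrs))
+//     unsigned NewOpc = Row[3 - 1];   // PackedInt column -> X86::PANDrr
+//
+// lookupAVX512() behaves the same but also matches the second integer column
+// (Row[3]) when the instruction is already in the integer domain.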
+
+// Helper to attempt to widen/narrow blend masks.
+static bool AdjustBlendMask(unsigned OldMask, unsigned OldWidth,
+ unsigned NewWidth, unsigned *pNewMask = nullptr) {
+ assert(((OldWidth % NewWidth) == 0 || (NewWidth % OldWidth) == 0) &&
+ "Illegal blend mask scale");
+ unsigned NewMask = 0;
+
+ if ((OldWidth % NewWidth) == 0) {
+ unsigned Scale = OldWidth / NewWidth;
+ unsigned SubMask = (1u << Scale) - 1;
+ for (unsigned i = 0; i != NewWidth; ++i) {
+ unsigned Sub = (OldMask >> (i * Scale)) & SubMask;
+ if (Sub == SubMask)
+ NewMask |= (1u << i);
+ else if (Sub != 0x0)
+ return false;
+ }
+ } else {
+ unsigned Scale = NewWidth / OldWidth;
+ unsigned SubMask = (1u << Scale) - 1;
+ for (unsigned i = 0; i != OldWidth; ++i) {
+ if (OldMask & (1 << i)) {
+ NewMask |= (SubMask << (i * Scale));
+ }
+ }
+ }
+
+ if (pNewMask)
+ *pNewMask = NewMask;
+ return true;
+}
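+
+// A worked example of the rescaling above (illustrative values only):
+//
+//   AdjustBlendMask(0b11001100, /*OldWidth=*/8, /*NewWidth=*/4, &M); // M=0b1010
+//   AdjustBlendMask(0b01001100, /*OldWidth=*/8, /*NewWidth=*/4);
+//     // returns false: one 2-bit group is only half selected
+//   AdjustBlendMask(0b10, /*OldWidth=*/2, /*NewWidth=*/4, &M);       // M=0b1100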
+
+uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ unsigned NumOperands = MI.getDesc().getNumOperands();
+
+ auto GetBlendDomains = [&](unsigned ImmWidth, bool Is256) {
+ uint16_t validDomains = 0;
+ if (MI.getOperand(NumOperands - 1).isImm()) {
+ unsigned Imm = MI.getOperand(NumOperands - 1).getImm();
+ if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4))
+ validDomains |= 0x2; // PackedSingle
+ if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2))
+ validDomains |= 0x4; // PackedDouble
+ if (!Is256 || Subtarget.hasAVX2())
+ validDomains |= 0x8; // PackedInt
+ }
+ return validDomains;
+ };
+
+ switch (Opcode) {
+ case X86::BLENDPDrmi:
+ case X86::BLENDPDrri:
+ case X86::VBLENDPDrmi:
+ case X86::VBLENDPDrri:
+ return GetBlendDomains(2, false);
+ case X86::VBLENDPDYrmi:
+ case X86::VBLENDPDYrri:
+ return GetBlendDomains(4, true);
+ case X86::BLENDPSrmi:
+ case X86::BLENDPSrri:
+ case X86::VBLENDPSrmi:
+ case X86::VBLENDPSrri:
+ case X86::VPBLENDDrmi:
+ case X86::VPBLENDDrri:
+ return GetBlendDomains(4, false);
+ case X86::VBLENDPSYrmi:
+ case X86::VBLENDPSYrri:
+ case X86::VPBLENDDYrmi:
+ case X86::VPBLENDDYrri:
+ return GetBlendDomains(8, true);
+ case X86::PBLENDWrmi:
+ case X86::PBLENDWrri:
+ case X86::VPBLENDWrmi:
+ case X86::VPBLENDWrri:
+ // Treat VPBLENDWY as a 128-bit vector as it repeats the lo/hi masks.
+ case X86::VPBLENDWYrmi:
+ case X86::VPBLENDWYrri:
+ return GetBlendDomains(8, false);
+ case X86::VPANDDZ128rr: case X86::VPANDDZ128rm:
+ case X86::VPANDDZ256rr: case X86::VPANDDZ256rm:
+ case X86::VPANDQZ128rr: case X86::VPANDQZ128rm:
+ case X86::VPANDQZ256rr: case X86::VPANDQZ256rm:
+ case X86::VPANDNDZ128rr: case X86::VPANDNDZ128rm:
+ case X86::VPANDNDZ256rr: case X86::VPANDNDZ256rm:
+ case X86::VPANDNQZ128rr: case X86::VPANDNQZ128rm:
+ case X86::VPANDNQZ256rr: case X86::VPANDNQZ256rm:
+ case X86::VPORDZ128rr: case X86::VPORDZ128rm:
+ case X86::VPORDZ256rr: case X86::VPORDZ256rm:
+ case X86::VPORQZ128rr: case X86::VPORQZ128rm:
+ case X86::VPORQZ256rr: case X86::VPORQZ256rm:
+ case X86::VPXORDZ128rr: case X86::VPXORDZ128rm:
+ case X86::VPXORDZ256rr: case X86::VPXORDZ256rm:
+ case X86::VPXORQZ128rr: case X86::VPXORQZ128rm:
+ case X86::VPXORQZ256rr: case X86::VPXORQZ256rm:
+ // If we don't have DQI see if we can still switch from an EVEX integer
+ // instruction to a VEX floating point instruction.
+ if (Subtarget.hasDQI())
+ return 0;
+
+ if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16)
+ return 0;
+ if (RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16)
+ return 0;
+ // Register forms will have 3 operands. Memory form will have more.
+ if (NumOperands == 3 &&
+ RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16)
+ return 0;
+
+ // All domains are valid.
+ return 0xe;
+ case X86::MOVHLPSrr:
+ // We can swap domains when both inputs are the same register.
+ // FIXME: This doesn't catch all the cases we would like. If the input
+    // register isn't KILLed by the instruction, the two-address instruction
+ // pass puts a COPY on one input. The other input uses the original
+ // register. This prevents the same physical register from being used by
+ // both inputs.
+ if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg() &&
+ MI.getOperand(0).getSubReg() == 0 &&
+ MI.getOperand(1).getSubReg() == 0 &&
+ MI.getOperand(2).getSubReg() == 0)
+ return 0x6;
+ return 0;
+ case X86::SHUFPDrri:
+ return 0x6;
+ }
+ return 0;
+}
+
+bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
+ unsigned Domain) const {
+ assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
+ uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+ assert(dom && "Not an SSE instruction");
+
+ unsigned Opcode = MI.getOpcode();
+ unsigned NumOperands = MI.getDesc().getNumOperands();
+
+ auto SetBlendDomain = [&](unsigned ImmWidth, bool Is256) {
+ if (MI.getOperand(NumOperands - 1).isImm()) {
+ unsigned Imm = MI.getOperand(NumOperands - 1).getImm() & 255;
+ Imm = (ImmWidth == 16 ? ((Imm << 8) | Imm) : Imm);
+ unsigned NewImm = Imm;
+
+ const uint16_t *table = lookup(Opcode, dom, ReplaceableBlendInstrs);
+ if (!table)
+ table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
+
+ if (Domain == 1) { // PackedSingle
+ AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
+ } else if (Domain == 2) { // PackedDouble
+ AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2, &NewImm);
+ } else if (Domain == 3) { // PackedInt
+ if (Subtarget.hasAVX2()) {
+ // If we are already VPBLENDW use that, else use VPBLENDD.
+ if ((ImmWidth / (Is256 ? 2 : 1)) != 8) {
+ table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
+ AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
+ }
+ } else {
+ assert(!Is256 && "128-bit vector expected");
+ AdjustBlendMask(Imm, ImmWidth, 8, &NewImm);
+ }
+ }
+
+ assert(table && table[Domain - 1] && "Unknown domain op");
+ MI.setDesc(get(table[Domain - 1]));
+ MI.getOperand(NumOperands - 1).setImm(NewImm & 255);
+ }
+ return true;
+ };
+
+ switch (Opcode) {
+ case X86::BLENDPDrmi:
+ case X86::BLENDPDrri:
+ case X86::VBLENDPDrmi:
+ case X86::VBLENDPDrri:
+ return SetBlendDomain(2, false);
+ case X86::VBLENDPDYrmi:
+ case X86::VBLENDPDYrri:
+ return SetBlendDomain(4, true);
+ case X86::BLENDPSrmi:
+ case X86::BLENDPSrri:
+ case X86::VBLENDPSrmi:
+ case X86::VBLENDPSrri:
+ case X86::VPBLENDDrmi:
+ case X86::VPBLENDDrri:
+ return SetBlendDomain(4, false);
+ case X86::VBLENDPSYrmi:
+ case X86::VBLENDPSYrri:
+ case X86::VPBLENDDYrmi:
+ case X86::VPBLENDDYrri:
+ return SetBlendDomain(8, true);
+ case X86::PBLENDWrmi:
+ case X86::PBLENDWrri:
+ case X86::VPBLENDWrmi:
+ case X86::VPBLENDWrri:
+ return SetBlendDomain(8, false);
+ case X86::VPBLENDWYrmi:
+ case X86::VPBLENDWYrri:
+ return SetBlendDomain(16, true);
+ case X86::VPANDDZ128rr: case X86::VPANDDZ128rm:
+ case X86::VPANDDZ256rr: case X86::VPANDDZ256rm:
+ case X86::VPANDQZ128rr: case X86::VPANDQZ128rm:
+ case X86::VPANDQZ256rr: case X86::VPANDQZ256rm:
+ case X86::VPANDNDZ128rr: case X86::VPANDNDZ128rm:
+ case X86::VPANDNDZ256rr: case X86::VPANDNDZ256rm:
+ case X86::VPANDNQZ128rr: case X86::VPANDNQZ128rm:
+ case X86::VPANDNQZ256rr: case X86::VPANDNQZ256rm:
+ case X86::VPORDZ128rr: case X86::VPORDZ128rm:
+ case X86::VPORDZ256rr: case X86::VPORDZ256rm:
+ case X86::VPORQZ128rr: case X86::VPORQZ128rm:
+ case X86::VPORQZ256rr: case X86::VPORQZ256rm:
+ case X86::VPXORDZ128rr: case X86::VPXORDZ128rm:
+ case X86::VPXORDZ256rr: case X86::VPXORDZ256rm:
+ case X86::VPXORQZ128rr: case X86::VPXORQZ128rm:
+ case X86::VPXORQZ256rr: case X86::VPXORQZ256rm: {
+ // Without DQI, convert EVEX instructions to VEX instructions.
+ if (Subtarget.hasDQI())
+ return false;
+
+ const uint16_t *table = lookupAVX512(MI.getOpcode(), dom,
+ ReplaceableCustomAVX512LogicInstrs);
+ assert(table && "Instruction not found in table?");
+ // Don't change integer Q instructions to D instructions and
+    // use D instructions if we started with a PS instruction.
+ if (Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
+ Domain = 4;
+ MI.setDesc(get(table[Domain - 1]));
+ return true;
+ }
+ case X86::UNPCKHPDrr:
+ case X86::MOVHLPSrr:
+    // We just need to commute the instruction, which will switch the domains.
+ if (Domain != dom && Domain != 3 &&
+ MI.getOperand(1).getReg() == MI.getOperand(2).getReg() &&
+ MI.getOperand(0).getSubReg() == 0 &&
+ MI.getOperand(1).getSubReg() == 0 &&
+ MI.getOperand(2).getSubReg() == 0) {
+ commuteInstruction(MI, false);
+ return true;
+ }
+ // We must always return true for MOVHLPSrr.
+ if (Opcode == X86::MOVHLPSrr)
+ return true;
+ break;
+ case X86::SHUFPDrri: {
+ if (Domain == 1) {
+ unsigned Imm = MI.getOperand(3).getImm();
+ unsigned NewImm = 0x44;
+ if (Imm & 1) NewImm |= 0x0a;
+ if (Imm & 2) NewImm |= 0xa0;
+ MI.getOperand(3).setImm(NewImm);
+ MI.setDesc(get(X86::SHUFPSrri));
+ }
+ return true;
+ }
+ }
+ return false;
+}
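+
+// Worked example for the SHUFPDrri case above (mirrors the immediate rewrite
+// in the code): the double-precision selector Imm = 0b11, i.e. "high half of
+// src1, high half of src2", becomes the SHUFPS selector
+// 0x44 | 0x0a | 0xa0 = 0xEE, whose four 2-bit fields are {2,3,2,3} and pick
+// the same 64-bit halves a 32-bit lane pair at a time.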
+
+std::pair<uint16_t, uint16_t>
+X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
+ uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+ unsigned opcode = MI.getOpcode();
+ uint16_t validDomains = 0;
+ if (domain) {
+ // Attempt to match for custom instructions.
+ validDomains = getExecutionDomainCustom(MI);
+ if (validDomains)
+ return std::make_pair(domain, validDomains);
+
+ if (lookup(opcode, domain, ReplaceableInstrs)) {
+ validDomains = 0xe;
+ } else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
+ validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
+ } else if (lookup(opcode, domain, ReplaceableInstrsFP)) {
+ validDomains = 0x6;
+ } else if (lookup(opcode, domain, ReplaceableInstrsAVX2InsertExtract)) {
+      // Insert/extract instructions should only affect the domain if AVX2
+ // is enabled.
+ if (!Subtarget.hasAVX2())
+ return std::make_pair(0, 0);
+ validDomains = 0xe;
+ } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512)) {
+ validDomains = 0xe;
+ } else if (Subtarget.hasDQI() && lookupAVX512(opcode, domain,
+ ReplaceableInstrsAVX512DQ)) {
+ validDomains = 0xe;
+ } else if (Subtarget.hasDQI()) {
+ if (const uint16_t *table = lookupAVX512(opcode, domain,
+ ReplaceableInstrsAVX512DQMasked)) {
+ if (domain == 1 || (domain == 3 && table[3] == opcode))
+ validDomains = 0xa;
+ else
+ validDomains = 0xc;
+ }
+ }
+ }
+ return std::make_pair(domain, validDomains);
+}
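+
+// Sketch of the returned encoding (pseudo-notation; the bit values match the
+// ones used in GetBlendDomains above): the first element is the current
+// domain (1 = PackedSingle, 2 = PackedDouble, 3 = PackedInt) and the second
+// is a bitmask of the domains the instruction may be moved to, e.g.
+//
+//   getExecutionDomain(ANDPSrr)  -> {1, 0xe}  // PS; may become ANDPD or PAND
+//   getExecutionDomain(VORPSYrr) -> {1, Subtarget.hasAVX2() ? 0xe : 0x6}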
+
+void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
+ assert(Domain>0 && Domain<4 && "Invalid execution domain");
+ uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+ assert(dom && "Not an SSE instruction");
+
+ // Attempt to match for custom instructions.
+ if (setExecutionDomainCustom(MI, Domain))
+ return;
+
+ const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs);
+ if (!table) { // try the other table
+ assert((Subtarget.hasAVX2() || Domain < 3) &&
+ "256-bit vector operations only available in AVX2");
+ table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2);
+ }
+ if (!table) { // try the FP table
+ table = lookup(MI.getOpcode(), dom, ReplaceableInstrsFP);
+ assert((!table || Domain < 3) &&
+ "Can only select PackedSingle or PackedDouble");
+ }
+ if (!table) { // try the other table
+ assert(Subtarget.hasAVX2() &&
+ "256-bit insert/extract only available in AVX2");
+ table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2InsertExtract);
+ }
+ if (!table) { // try the AVX512 table
+ assert(Subtarget.hasAVX512() && "Requires AVX-512");
+ table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512);
+ // Don't change integer Q instructions to D instructions.
+ if (table && Domain == 3 && table[3] == MI.getOpcode())
+ Domain = 4;
+ }
+ if (!table) { // try the AVX512DQ table
+ assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
+ table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQ);
+ // Don't change integer Q instructions to D instructions and
+ // use D instructions if we started with a PS instruction.
+ if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
+ Domain = 4;
+ }
+ if (!table) { // try the AVX512DQMasked table
+ assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
+ table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQMasked);
+ if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
+ Domain = 4;
+ }
+ assert(table && "Cannot change domain");
+ MI.setDesc(get(table[Domain - 1]));
+}
+
+/// Set \p NopInst to the noop instruction to use for a noop.
+void X86InstrInfo::getNoop(MCInst &NopInst) const {
+ NopInst.setOpcode(X86::NOOP);
+}
+
+bool X86InstrInfo::isHighLatencyDef(int opc) const {
+ switch (opc) {
+ default: return false;
+ case X86::DIVPDrm:
+ case X86::DIVPDrr:
+ case X86::DIVPSrm:
+ case X86::DIVPSrr:
+ case X86::DIVSDrm:
+ case X86::DIVSDrm_Int:
+ case X86::DIVSDrr:
+ case X86::DIVSDrr_Int:
+ case X86::DIVSSrm:
+ case X86::DIVSSrm_Int:
+ case X86::DIVSSrr:
+ case X86::DIVSSrr_Int:
+ case X86::SQRTPDm:
+ case X86::SQRTPDr:
+ case X86::SQRTPSm:
+ case X86::SQRTPSr:
+ case X86::SQRTSDm:
+ case X86::SQRTSDm_Int:
+ case X86::SQRTSDr:
+ case X86::SQRTSDr_Int:
+ case X86::SQRTSSm:
+ case X86::SQRTSSm_Int:
+ case X86::SQRTSSr:
+ case X86::SQRTSSr_Int:
+ // AVX instructions with high latency
+ case X86::VDIVPDrm:
+ case X86::VDIVPDrr:
+ case X86::VDIVPDYrm:
+ case X86::VDIVPDYrr:
+ case X86::VDIVPSrm:
+ case X86::VDIVPSrr:
+ case X86::VDIVPSYrm:
+ case X86::VDIVPSYrr:
+ case X86::VDIVSDrm:
+ case X86::VDIVSDrm_Int:
+ case X86::VDIVSDrr:
+ case X86::VDIVSDrr_Int:
+ case X86::VDIVSSrm:
+ case X86::VDIVSSrm_Int:
+ case X86::VDIVSSrr:
+ case X86::VDIVSSrr_Int:
+ case X86::VSQRTPDm:
+ case X86::VSQRTPDr:
+ case X86::VSQRTPDYm:
+ case X86::VSQRTPDYr:
+ case X86::VSQRTPSm:
+ case X86::VSQRTPSr:
+ case X86::VSQRTPSYm:
+ case X86::VSQRTPSYr:
+ case X86::VSQRTSDm:
+ case X86::VSQRTSDm_Int:
+ case X86::VSQRTSDr:
+ case X86::VSQRTSDr_Int:
+ case X86::VSQRTSSm:
+ case X86::VSQRTSSm_Int:
+ case X86::VSQRTSSr:
+ case X86::VSQRTSSr_Int:
+ // AVX512 instructions with high latency
+ case X86::VDIVPDZ128rm:
+ case X86::VDIVPDZ128rmb:
+ case X86::VDIVPDZ128rmbk:
+ case X86::VDIVPDZ128rmbkz:
+ case X86::VDIVPDZ128rmk:
+ case X86::VDIVPDZ128rmkz:
+ case X86::VDIVPDZ128rr:
+ case X86::VDIVPDZ128rrk:
+ case X86::VDIVPDZ128rrkz:
+ case X86::VDIVPDZ256rm:
+ case X86::VDIVPDZ256rmb:
+ case X86::VDIVPDZ256rmbk:
+ case X86::VDIVPDZ256rmbkz:
+ case X86::VDIVPDZ256rmk:
+ case X86::VDIVPDZ256rmkz:
+ case X86::VDIVPDZ256rr:
+ case X86::VDIVPDZ256rrk:
+ case X86::VDIVPDZ256rrkz:
+ case X86::VDIVPDZrrb:
+ case X86::VDIVPDZrrbk:
+ case X86::VDIVPDZrrbkz:
+ case X86::VDIVPDZrm:
+ case X86::VDIVPDZrmb:
+ case X86::VDIVPDZrmbk:
+ case X86::VDIVPDZrmbkz:
+ case X86::VDIVPDZrmk:
+ case X86::VDIVPDZrmkz:
+ case X86::VDIVPDZrr:
+ case X86::VDIVPDZrrk:
+ case X86::VDIVPDZrrkz:
+ case X86::VDIVPSZ128rm:
+ case X86::VDIVPSZ128rmb:
+ case X86::VDIVPSZ128rmbk:
+ case X86::VDIVPSZ128rmbkz:
+ case X86::VDIVPSZ128rmk:
+ case X86::VDIVPSZ128rmkz:
+ case X86::VDIVPSZ128rr:
+ case X86::VDIVPSZ128rrk:
+ case X86::VDIVPSZ128rrkz:
+ case X86::VDIVPSZ256rm:
+ case X86::VDIVPSZ256rmb:
+ case X86::VDIVPSZ256rmbk:
+ case X86::VDIVPSZ256rmbkz:
+ case X86::VDIVPSZ256rmk:
+ case X86::VDIVPSZ256rmkz:
+ case X86::VDIVPSZ256rr:
+ case X86::VDIVPSZ256rrk:
+ case X86::VDIVPSZ256rrkz:
+ case X86::VDIVPSZrrb:
+ case X86::VDIVPSZrrbk:
+ case X86::VDIVPSZrrbkz:
+ case X86::VDIVPSZrm:
+ case X86::VDIVPSZrmb:
+ case X86::VDIVPSZrmbk:
+ case X86::VDIVPSZrmbkz:
+ case X86::VDIVPSZrmk:
+ case X86::VDIVPSZrmkz:
+ case X86::VDIVPSZrr:
+ case X86::VDIVPSZrrk:
+ case X86::VDIVPSZrrkz:
+ case X86::VDIVSDZrm:
+ case X86::VDIVSDZrr:
+ case X86::VDIVSDZrm_Int:
+ case X86::VDIVSDZrm_Intk:
+ case X86::VDIVSDZrm_Intkz:
+ case X86::VDIVSDZrr_Int:
+ case X86::VDIVSDZrr_Intk:
+ case X86::VDIVSDZrr_Intkz:
+ case X86::VDIVSDZrrb_Int:
+ case X86::VDIVSDZrrb_Intk:
+ case X86::VDIVSDZrrb_Intkz:
+ case X86::VDIVSSZrm:
+ case X86::VDIVSSZrr:
+ case X86::VDIVSSZrm_Int:
+ case X86::VDIVSSZrm_Intk:
+ case X86::VDIVSSZrm_Intkz:
+ case X86::VDIVSSZrr_Int:
+ case X86::VDIVSSZrr_Intk:
+ case X86::VDIVSSZrr_Intkz:
+ case X86::VDIVSSZrrb_Int:
+ case X86::VDIVSSZrrb_Intk:
+ case X86::VDIVSSZrrb_Intkz:
+ case X86::VSQRTPDZ128m:
+ case X86::VSQRTPDZ128mb:
+ case X86::VSQRTPDZ128mbk:
+ case X86::VSQRTPDZ128mbkz:
+ case X86::VSQRTPDZ128mk:
+ case X86::VSQRTPDZ128mkz:
+ case X86::VSQRTPDZ128r:
+ case X86::VSQRTPDZ128rk:
+ case X86::VSQRTPDZ128rkz:
+ case X86::VSQRTPDZ256m:
+ case X86::VSQRTPDZ256mb:
+ case X86::VSQRTPDZ256mbk:
+ case X86::VSQRTPDZ256mbkz:
+ case X86::VSQRTPDZ256mk:
+ case X86::VSQRTPDZ256mkz:
+ case X86::VSQRTPDZ256r:
+ case X86::VSQRTPDZ256rk:
+ case X86::VSQRTPDZ256rkz:
+ case X86::VSQRTPDZm:
+ case X86::VSQRTPDZmb:
+ case X86::VSQRTPDZmbk:
+ case X86::VSQRTPDZmbkz:
+ case X86::VSQRTPDZmk:
+ case X86::VSQRTPDZmkz:
+ case X86::VSQRTPDZr:
+ case X86::VSQRTPDZrb:
+ case X86::VSQRTPDZrbk:
+ case X86::VSQRTPDZrbkz:
+ case X86::VSQRTPDZrk:
+ case X86::VSQRTPDZrkz:
+ case X86::VSQRTPSZ128m:
+ case X86::VSQRTPSZ128mb:
+ case X86::VSQRTPSZ128mbk:
+ case X86::VSQRTPSZ128mbkz:
+ case X86::VSQRTPSZ128mk:
+ case X86::VSQRTPSZ128mkz:
+ case X86::VSQRTPSZ128r:
+ case X86::VSQRTPSZ128rk:
+ case X86::VSQRTPSZ128rkz:
+ case X86::VSQRTPSZ256m:
+ case X86::VSQRTPSZ256mb:
+ case X86::VSQRTPSZ256mbk:
+ case X86::VSQRTPSZ256mbkz:
+ case X86::VSQRTPSZ256mk:
+ case X86::VSQRTPSZ256mkz:
+ case X86::VSQRTPSZ256r:
+ case X86::VSQRTPSZ256rk:
+ case X86::VSQRTPSZ256rkz:
+ case X86::VSQRTPSZm:
+ case X86::VSQRTPSZmb:
+ case X86::VSQRTPSZmbk:
+ case X86::VSQRTPSZmbkz:
+ case X86::VSQRTPSZmk:
+ case X86::VSQRTPSZmkz:
+ case X86::VSQRTPSZr:
+ case X86::VSQRTPSZrb:
+ case X86::VSQRTPSZrbk:
+ case X86::VSQRTPSZrbkz:
+ case X86::VSQRTPSZrk:
+ case X86::VSQRTPSZrkz:
+ case X86::VSQRTSDZm:
+ case X86::VSQRTSDZm_Int:
+ case X86::VSQRTSDZm_Intk:
+ case X86::VSQRTSDZm_Intkz:
+ case X86::VSQRTSDZr:
+ case X86::VSQRTSDZr_Int:
+ case X86::VSQRTSDZr_Intk:
+ case X86::VSQRTSDZr_Intkz:
+ case X86::VSQRTSDZrb_Int:
+ case X86::VSQRTSDZrb_Intk:
+ case X86::VSQRTSDZrb_Intkz:
+ case X86::VSQRTSSZm:
+ case X86::VSQRTSSZm_Int:
+ case X86::VSQRTSSZm_Intk:
+ case X86::VSQRTSSZm_Intkz:
+ case X86::VSQRTSSZr:
+ case X86::VSQRTSSZr_Int:
+ case X86::VSQRTSSZr_Intk:
+ case X86::VSQRTSSZr_Intkz:
+ case X86::VSQRTSSZrb_Int:
+ case X86::VSQRTSSZrb_Intk:
+ case X86::VSQRTSSZrb_Intkz:
+
+ case X86::VGATHERDPDYrm:
+ case X86::VGATHERDPDZ128rm:
+ case X86::VGATHERDPDZ256rm:
+ case X86::VGATHERDPDZrm:
+ case X86::VGATHERDPDrm:
+ case X86::VGATHERDPSYrm:
+ case X86::VGATHERDPSZ128rm:
+ case X86::VGATHERDPSZ256rm:
+ case X86::VGATHERDPSZrm:
+ case X86::VGATHERDPSrm:
+ case X86::VGATHERPF0DPDm:
+ case X86::VGATHERPF0DPSm:
+ case X86::VGATHERPF0QPDm:
+ case X86::VGATHERPF0QPSm:
+ case X86::VGATHERPF1DPDm:
+ case X86::VGATHERPF1DPSm:
+ case X86::VGATHERPF1QPDm:
+ case X86::VGATHERPF1QPSm:
+ case X86::VGATHERQPDYrm:
+ case X86::VGATHERQPDZ128rm:
+ case X86::VGATHERQPDZ256rm:
+ case X86::VGATHERQPDZrm:
+ case X86::VGATHERQPDrm:
+ case X86::VGATHERQPSYrm:
+ case X86::VGATHERQPSZ128rm:
+ case X86::VGATHERQPSZ256rm:
+ case X86::VGATHERQPSZrm:
+ case X86::VGATHERQPSrm:
+ case X86::VPGATHERDDYrm:
+ case X86::VPGATHERDDZ128rm:
+ case X86::VPGATHERDDZ256rm:
+ case X86::VPGATHERDDZrm:
+ case X86::VPGATHERDDrm:
+ case X86::VPGATHERDQYrm:
+ case X86::VPGATHERDQZ128rm:
+ case X86::VPGATHERDQZ256rm:
+ case X86::VPGATHERDQZrm:
+ case X86::VPGATHERDQrm:
+ case X86::VPGATHERQDYrm:
+ case X86::VPGATHERQDZ128rm:
+ case X86::VPGATHERQDZ256rm:
+ case X86::VPGATHERQDZrm:
+ case X86::VPGATHERQDrm:
+ case X86::VPGATHERQQYrm:
+ case X86::VPGATHERQQZ128rm:
+ case X86::VPGATHERQQZ256rm:
+ case X86::VPGATHERQQZrm:
+ case X86::VPGATHERQQrm:
+ case X86::VSCATTERDPDZ128mr:
+ case X86::VSCATTERDPDZ256mr:
+ case X86::VSCATTERDPDZmr:
+ case X86::VSCATTERDPSZ128mr:
+ case X86::VSCATTERDPSZ256mr:
+ case X86::VSCATTERDPSZmr:
+ case X86::VSCATTERPF0DPDm:
+ case X86::VSCATTERPF0DPSm:
+ case X86::VSCATTERPF0QPDm:
+ case X86::VSCATTERPF0QPSm:
+ case X86::VSCATTERPF1DPDm:
+ case X86::VSCATTERPF1DPSm:
+ case X86::VSCATTERPF1QPDm:
+ case X86::VSCATTERPF1QPSm:
+ case X86::VSCATTERQPDZ128mr:
+ case X86::VSCATTERQPDZ256mr:
+ case X86::VSCATTERQPDZmr:
+ case X86::VSCATTERQPSZ128mr:
+ case X86::VSCATTERQPSZ256mr:
+ case X86::VSCATTERQPSZmr:
+ case X86::VPSCATTERDDZ128mr:
+ case X86::VPSCATTERDDZ256mr:
+ case X86::VPSCATTERDDZmr:
+ case X86::VPSCATTERDQZ128mr:
+ case X86::VPSCATTERDQZ256mr:
+ case X86::VPSCATTERDQZmr:
+ case X86::VPSCATTERQDZ128mr:
+ case X86::VPSCATTERQDZ256mr:
+ case X86::VPSCATTERQDZmr:
+ case X86::VPSCATTERQQZ128mr:
+ case X86::VPSCATTERQQZ256mr:
+ case X86::VPSCATTERQQZmr:
+ return true;
+ }
+}
+
+bool X86InstrInfo::hasHighOperandLatency(const TargetSchedModel &SchedModel,
+ const MachineRegisterInfo *MRI,
+ const MachineInstr &DefMI,
+ unsigned DefIdx,
+ const MachineInstr &UseMI,
+ unsigned UseIdx) const {
+ return isHighLatencyDef(DefMI.getOpcode());
+}
+
+bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst,
+ const MachineBasicBlock *MBB) const {
+ assert(Inst.getNumExplicitOperands() == 3 && Inst.getNumExplicitDefs() == 1 &&
+ Inst.getNumDefs() <= 2 && "Reassociation needs binary operators");
+
+ // Integer binary math/logic instructions have a third source operand:
+ // the EFLAGS register. That operand must be both defined here and never
+  // used; i.e., it must be dead. If the EFLAGS operand is live, then we
+  // cannot change anything because rearranging the operands could affect other
+ // instructions that depend on the exact status flags (zero, sign, etc.)
+ // that are set by using these particular operands with this operation.
+ const MachineOperand *FlagDef = Inst.findRegisterDefOperand(X86::EFLAGS);
+ assert((Inst.getNumDefs() == 1 || FlagDef) &&
+ "Implicit def isn't flags?");
+ if (FlagDef && !FlagDef->isDead())
+ return false;
+
+ return TargetInstrInfo::hasReassociableOperands(Inst, MBB);
+}
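+
+// Example of the EFLAGS requirement above (MIR-style pseudo-notation):
+//
+//   %2:gr32 = AND32rr %0, %1, implicit-def dead $eflags
+//
+// is a reassociation candidate, while the same instruction with a live
+// $eflags definition (for instance, one consumed by a following JCC) is
+// rejected.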
+
+// TODO: There are many more machine instruction opcodes to match:
+// 1. Other data types (integer, vectors)
+// 2. Other math / logic operations (xor, or)
+// 3. Other forms of the same operation (intrinsics and other variants)
+bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
+ switch (Inst.getOpcode()) {
+ case X86::AND8rr:
+ case X86::AND16rr:
+ case X86::AND32rr:
+ case X86::AND64rr:
+ case X86::OR8rr:
+ case X86::OR16rr:
+ case X86::OR32rr:
+ case X86::OR64rr:
+ case X86::XOR8rr:
+ case X86::XOR16rr:
+ case X86::XOR32rr:
+ case X86::XOR64rr:
+ case X86::IMUL16rr:
+ case X86::IMUL32rr:
+ case X86::IMUL64rr:
+ case X86::PANDrr:
+ case X86::PORrr:
+ case X86::PXORrr:
+ case X86::ANDPDrr:
+ case X86::ANDPSrr:
+ case X86::ORPDrr:
+ case X86::ORPSrr:
+ case X86::XORPDrr:
+ case X86::XORPSrr:
+ case X86::PADDBrr:
+ case X86::PADDWrr:
+ case X86::PADDDrr:
+ case X86::PADDQrr:
+ case X86::PMULLWrr:
+ case X86::PMULLDrr:
+ case X86::PMAXSBrr:
+ case X86::PMAXSDrr:
+ case X86::PMAXSWrr:
+ case X86::PMAXUBrr:
+ case X86::PMAXUDrr:
+ case X86::PMAXUWrr:
+ case X86::PMINSBrr:
+ case X86::PMINSDrr:
+ case X86::PMINSWrr:
+ case X86::PMINUBrr:
+ case X86::PMINUDrr:
+ case X86::PMINUWrr:
+ case X86::VPANDrr:
+ case X86::VPANDYrr:
+ case X86::VPANDDZ128rr:
+ case X86::VPANDDZ256rr:
+ case X86::VPANDDZrr:
+ case X86::VPANDQZ128rr:
+ case X86::VPANDQZ256rr:
+ case X86::VPANDQZrr:
+ case X86::VPORrr:
+ case X86::VPORYrr:
+ case X86::VPORDZ128rr:
+ case X86::VPORDZ256rr:
+ case X86::VPORDZrr:
+ case X86::VPORQZ128rr:
+ case X86::VPORQZ256rr:
+ case X86::VPORQZrr:
+ case X86::VPXORrr:
+ case X86::VPXORYrr:
+ case X86::VPXORDZ128rr:
+ case X86::VPXORDZ256rr:
+ case X86::VPXORDZrr:
+ case X86::VPXORQZ128rr:
+ case X86::VPXORQZ256rr:
+ case X86::VPXORQZrr:
+ case X86::VANDPDrr:
+ case X86::VANDPSrr:
+ case X86::VANDPDYrr:
+ case X86::VANDPSYrr:
+ case X86::VANDPDZ128rr:
+ case X86::VANDPSZ128rr:
+ case X86::VANDPDZ256rr:
+ case X86::VANDPSZ256rr:
+ case X86::VANDPDZrr:
+ case X86::VANDPSZrr:
+ case X86::VORPDrr:
+ case X86::VORPSrr:
+ case X86::VORPDYrr:
+ case X86::VORPSYrr:
+ case X86::VORPDZ128rr:
+ case X86::VORPSZ128rr:
+ case X86::VORPDZ256rr:
+ case X86::VORPSZ256rr:
+ case X86::VORPDZrr:
+ case X86::VORPSZrr:
+ case X86::VXORPDrr:
+ case X86::VXORPSrr:
+ case X86::VXORPDYrr:
+ case X86::VXORPSYrr:
+ case X86::VXORPDZ128rr:
+ case X86::VXORPSZ128rr:
+ case X86::VXORPDZ256rr:
+ case X86::VXORPSZ256rr:
+ case X86::VXORPDZrr:
+ case X86::VXORPSZrr:
+ case X86::KADDBrr:
+ case X86::KADDWrr:
+ case X86::KADDDrr:
+ case X86::KADDQrr:
+ case X86::KANDBrr:
+ case X86::KANDWrr:
+ case X86::KANDDrr:
+ case X86::KANDQrr:
+ case X86::KORBrr:
+ case X86::KORWrr:
+ case X86::KORDrr:
+ case X86::KORQrr:
+ case X86::KXORBrr:
+ case X86::KXORWrr:
+ case X86::KXORDrr:
+ case X86::KXORQrr:
+ case X86::VPADDBrr:
+ case X86::VPADDWrr:
+ case X86::VPADDDrr:
+ case X86::VPADDQrr:
+ case X86::VPADDBYrr:
+ case X86::VPADDWYrr:
+ case X86::VPADDDYrr:
+ case X86::VPADDQYrr:
+ case X86::VPADDBZ128rr:
+ case X86::VPADDWZ128rr:
+ case X86::VPADDDZ128rr:
+ case X86::VPADDQZ128rr:
+ case X86::VPADDBZ256rr:
+ case X86::VPADDWZ256rr:
+ case X86::VPADDDZ256rr:
+ case X86::VPADDQZ256rr:
+ case X86::VPADDBZrr:
+ case X86::VPADDWZrr:
+ case X86::VPADDDZrr:
+ case X86::VPADDQZrr:
+ case X86::VPMULLWrr:
+ case X86::VPMULLWYrr:
+ case X86::VPMULLWZ128rr:
+ case X86::VPMULLWZ256rr:
+ case X86::VPMULLWZrr:
+ case X86::VPMULLDrr:
+ case X86::VPMULLDYrr:
+ case X86::VPMULLDZ128rr:
+ case X86::VPMULLDZ256rr:
+ case X86::VPMULLDZrr:
+ case X86::VPMULLQZ128rr:
+ case X86::VPMULLQZ256rr:
+ case X86::VPMULLQZrr:
+ case X86::VPMAXSBrr:
+ case X86::VPMAXSBYrr:
+ case X86::VPMAXSBZ128rr:
+ case X86::VPMAXSBZ256rr:
+ case X86::VPMAXSBZrr:
+ case X86::VPMAXSDrr:
+ case X86::VPMAXSDYrr:
+ case X86::VPMAXSDZ128rr:
+ case X86::VPMAXSDZ256rr:
+ case X86::VPMAXSDZrr:
+ case X86::VPMAXSQZ128rr:
+ case X86::VPMAXSQZ256rr:
+ case X86::VPMAXSQZrr:
+ case X86::VPMAXSWrr:
+ case X86::VPMAXSWYrr:
+ case X86::VPMAXSWZ128rr:
+ case X86::VPMAXSWZ256rr:
+ case X86::VPMAXSWZrr:
+ case X86::VPMAXUBrr:
+ case X86::VPMAXUBYrr:
+ case X86::VPMAXUBZ128rr:
+ case X86::VPMAXUBZ256rr:
+ case X86::VPMAXUBZrr:
+ case X86::VPMAXUDrr:
+ case X86::VPMAXUDYrr:
+ case X86::VPMAXUDZ128rr:
+ case X86::VPMAXUDZ256rr:
+ case X86::VPMAXUDZrr:
+ case X86::VPMAXUQZ128rr:
+ case X86::VPMAXUQZ256rr:
+ case X86::VPMAXUQZrr:
+ case X86::VPMAXUWrr:
+ case X86::VPMAXUWYrr:
+ case X86::VPMAXUWZ128rr:
+ case X86::VPMAXUWZ256rr:
+ case X86::VPMAXUWZrr:
+ case X86::VPMINSBrr:
+ case X86::VPMINSBYrr:
+ case X86::VPMINSBZ128rr:
+ case X86::VPMINSBZ256rr:
+ case X86::VPMINSBZrr:
+ case X86::VPMINSDrr:
+ case X86::VPMINSDYrr:
+ case X86::VPMINSDZ128rr:
+ case X86::VPMINSDZ256rr:
+ case X86::VPMINSDZrr:
+ case X86::VPMINSQZ128rr:
+ case X86::VPMINSQZ256rr:
+ case X86::VPMINSQZrr:
+ case X86::VPMINSWrr:
+ case X86::VPMINSWYrr:
+ case X86::VPMINSWZ128rr:
+ case X86::VPMINSWZ256rr:
+ case X86::VPMINSWZrr:
+ case X86::VPMINUBrr:
+ case X86::VPMINUBYrr:
+ case X86::VPMINUBZ128rr:
+ case X86::VPMINUBZ256rr:
+ case X86::VPMINUBZrr:
+ case X86::VPMINUDrr:
+ case X86::VPMINUDYrr:
+ case X86::VPMINUDZ128rr:
+ case X86::VPMINUDZ256rr:
+ case X86::VPMINUDZrr:
+ case X86::VPMINUQZ128rr:
+ case X86::VPMINUQZ256rr:
+ case X86::VPMINUQZrr:
+ case X86::VPMINUWrr:
+ case X86::VPMINUWYrr:
+ case X86::VPMINUWZ128rr:
+ case X86::VPMINUWZ256rr:
+ case X86::VPMINUWZrr:
+ // Normal min/max instructions are not commutative because of NaN and signed
+ // zero semantics, but these are. Thus, there's no need to check for global
+ // relaxed math; the instructions themselves have the properties we need.
+ case X86::MAXCPDrr:
+ case X86::MAXCPSrr:
+ case X86::MAXCSDrr:
+ case X86::MAXCSSrr:
+ case X86::MINCPDrr:
+ case X86::MINCPSrr:
+ case X86::MINCSDrr:
+ case X86::MINCSSrr:
+ case X86::VMAXCPDrr:
+ case X86::VMAXCPSrr:
+ case X86::VMAXCPDYrr:
+ case X86::VMAXCPSYrr:
+ case X86::VMAXCPDZ128rr:
+ case X86::VMAXCPSZ128rr:
+ case X86::VMAXCPDZ256rr:
+ case X86::VMAXCPSZ256rr:
+ case X86::VMAXCPDZrr:
+ case X86::VMAXCPSZrr:
+ case X86::VMAXCSDrr:
+ case X86::VMAXCSSrr:
+ case X86::VMAXCSDZrr:
+ case X86::VMAXCSSZrr:
+ case X86::VMINCPDrr:
+ case X86::VMINCPSrr:
+ case X86::VMINCPDYrr:
+ case X86::VMINCPSYrr:
+ case X86::VMINCPDZ128rr:
+ case X86::VMINCPSZ128rr:
+ case X86::VMINCPDZ256rr:
+ case X86::VMINCPSZ256rr:
+ case X86::VMINCPDZrr:
+ case X86::VMINCPSZrr:
+ case X86::VMINCSDrr:
+ case X86::VMINCSSrr:
+ case X86::VMINCSDZrr:
+ case X86::VMINCSSZrr:
+ return true;
+ case X86::ADDPDrr:
+ case X86::ADDPSrr:
+ case X86::ADDSDrr:
+ case X86::ADDSSrr:
+ case X86::MULPDrr:
+ case X86::MULPSrr:
+ case X86::MULSDrr:
+ case X86::MULSSrr:
+ case X86::VADDPDrr:
+ case X86::VADDPSrr:
+ case X86::VADDPDYrr:
+ case X86::VADDPSYrr:
+ case X86::VADDPDZ128rr:
+ case X86::VADDPSZ128rr:
+ case X86::VADDPDZ256rr:
+ case X86::VADDPSZ256rr:
+ case X86::VADDPDZrr:
+ case X86::VADDPSZrr:
+ case X86::VADDSDrr:
+ case X86::VADDSSrr:
+ case X86::VADDSDZrr:
+ case X86::VADDSSZrr:
+ case X86::VMULPDrr:
+ case X86::VMULPSrr:
+ case X86::VMULPDYrr:
+ case X86::VMULPSYrr:
+ case X86::VMULPDZ128rr:
+ case X86::VMULPSZ128rr:
+ case X86::VMULPDZ256rr:
+ case X86::VMULPSZ256rr:
+ case X86::VMULPDZrr:
+ case X86::VMULPSZrr:
+ case X86::VMULSDrr:
+ case X86::VMULSSrr:
+ case X86::VMULSDZrr:
+ case X86::VMULSSZrr:
+ return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
+ Inst.getFlag(MachineInstr::MIFlag::FmNsz);
+ default:
+ return false;
+ }
+}
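+
+// Note on the FP cases above (a sketch, not part of the original change):
+// FmReassoc and FmNsz are the machine-level counterparts of the IR fast-math
+// flags, so an input such as
+//
+//   %r = fadd reassoc nsz float %a, %b
+//
+// is expected to select to an (V)ADDSSrr that passes this check, whereas a
+// strict fadd keeps its evaluation order.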
+
+/// If \p DescribedReg overlaps with the MOVrr instruction's destination
+/// register then, if possible, describe the value in terms of the source
+/// register.
+static Optional<ParamLoadedValue>
+describeMOVrrLoadedValue(const MachineInstr &MI, Register DescribedReg,
+ const TargetRegisterInfo *TRI) {
+ Register DestReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+
+ auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
+
+ // If the described register is the destination, just return the source.
+ if (DestReg == DescribedReg)
+ return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
+
+ // If the described register is a sub-register of the destination register,
+ // then pick out the source register's corresponding sub-register.
+ if (unsigned SubRegIdx = TRI->getSubRegIndex(DestReg, DescribedReg)) {
+ Register SrcSubReg = TRI->getSubReg(SrcReg, SubRegIdx);
+ return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
+ }
+
+ // The remaining case to consider is when the described register is a
+  // super-register of the destination register. MOV8rr and MOV16rr do not
+ // write to any of the other bytes in the register, meaning that we'd have to
+ // describe the value using a combination of the source register and the
+ // non-overlapping bits in the described register, which is not currently
+ // possible.
+ if (MI.getOpcode() == X86::MOV8rr || MI.getOpcode() == X86::MOV16rr ||
+ !TRI->isSuperRegister(DestReg, DescribedReg))
+ return None;
+
+ assert(MI.getOpcode() == X86::MOV32rr && "Unexpected super-register case");
+ return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
+}
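+
+// Example for the sub-register path above (pseudo-notation): asking for the
+// value of $ecx across "$rcx = MOV64rr $rdx" yields $edx with an empty
+// DIExpression, because the sub-register index of $ecx within $rcx maps to
+// the corresponding sub-register of the source.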
+
+Optional<ParamLoadedValue>
+X86InstrInfo::describeLoadedValue(const MachineInstr &MI, Register Reg) const {
+ const MachineOperand *Op = nullptr;
+ DIExpression *Expr = nullptr;
+
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+ switch (MI.getOpcode()) {
+ case X86::LEA32r:
+ case X86::LEA64r:
+ case X86::LEA64_32r: {
+ // We may need to describe a 64-bit parameter with a 32-bit LEA.
+ if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
+ return None;
+
+    // Operand 4 could be a global address. For now we do not support
+    // such a situation.
+ if (!MI.getOperand(4).isImm() || !MI.getOperand(2).isImm())
+ return None;
+
+ const MachineOperand &Op1 = MI.getOperand(1);
+ const MachineOperand &Op2 = MI.getOperand(3);
+ assert(Op2.isReg() && (Op2.getReg() == X86::NoRegister ||
+ Register::isPhysicalRegister(Op2.getReg())));
+
+ // Omit situations like:
+ // %rsi = lea %rsi, 4, ...
+ if ((Op1.isReg() && Op1.getReg() == MI.getOperand(0).getReg()) ||
+ Op2.getReg() == MI.getOperand(0).getReg())
+ return None;
+ else if ((Op1.isReg() && Op1.getReg() != X86::NoRegister &&
+ TRI->regsOverlap(Op1.getReg(), MI.getOperand(0).getReg())) ||
+ (Op2.getReg() != X86::NoRegister &&
+ TRI->regsOverlap(Op2.getReg(), MI.getOperand(0).getReg())))
+ return None;
+
+ int64_t Coef = MI.getOperand(2).getImm();
+ int64_t Offset = MI.getOperand(4).getImm();
+ SmallVector<uint64_t, 8> Ops;
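+    // Illustrative sketch of the expression built below (assuming the usual
+    // LEA operand layout of base, scale, index, displacement): for
+    //   $rax = LEA64r $rdi, 4, $rsi, 8, $noreg
+    // the loaded value would be described as $rdi with an expression along
+    // the lines of
+    //   DW_OP_breg<rsi> 0, DW_OP_constu 4, DW_OP_mul, DW_OP_plus,
+    //   DW_OP_plus_uconst 8
+    // i.e. $rdi + 4 * $rsi + 8.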
+
+ if ((Op1.isReg() && Op1.getReg() != X86::NoRegister)) {
+ Op = &Op1;
+ } else if (Op1.isFI())
+ Op = &Op1;
+
+ if (Op && Op->isReg() && Op->getReg() == Op2.getReg() && Coef > 0) {
+ Ops.push_back(dwarf::DW_OP_constu);
+ Ops.push_back(Coef + 1);
+ Ops.push_back(dwarf::DW_OP_mul);
+ } else {
+ if (Op && Op2.getReg() != X86::NoRegister) {
+ int dwarfReg = TRI->getDwarfRegNum(Op2.getReg(), false);
+ if (dwarfReg < 0)
+ return None;
+ else if (dwarfReg < 32) {
+ Ops.push_back(dwarf::DW_OP_breg0 + dwarfReg);
+ Ops.push_back(0);
+ } else {
+ Ops.push_back(dwarf::DW_OP_bregx);
+ Ops.push_back(dwarfReg);
+ Ops.push_back(0);
+ }
+ } else if (!Op) {
+ assert(Op2.getReg() != X86::NoRegister);
+ Op = &Op2;
+ }
+
+ if (Coef > 1) {
+ assert(Op2.getReg() != X86::NoRegister);
+ Ops.push_back(dwarf::DW_OP_constu);
+ Ops.push_back(Coef);
+ Ops.push_back(dwarf::DW_OP_mul);
+ }
+
+ if (((Op1.isReg() && Op1.getReg() != X86::NoRegister) || Op1.isFI()) &&
+ Op2.getReg() != X86::NoRegister) {
+ Ops.push_back(dwarf::DW_OP_plus);
+ }
+ }
+
+ DIExpression::appendOffset(Ops, Offset);
+ Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), Ops);
+
+    return ParamLoadedValue(*Op, Expr);
+ }
+ case X86::MOV8ri:
+ case X86::MOV16ri:
+ // TODO: Handle MOV8ri and MOV16ri.
+ return None;
+ case X86::MOV32ri:
+ case X86::MOV64ri:
+ case X86::MOV64ri32:
+ // MOV32ri may be used for producing zero-extended 32-bit immediates in
+ // 64-bit parameters, so we need to consider super-registers.
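+    // E.g. "$edi = MOV32ri 42" can also describe a 64-bit $rdi parameter as
+    // 42, since a 32-bit write implicitly zeroes the upper half of $rdi.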
+ if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
+ return None;
+ return ParamLoadedValue(MI.getOperand(1), Expr);
+ case X86::MOV8rr:
+ case X86::MOV16rr:
+ case X86::MOV32rr:
+ case X86::MOV64rr:
+ return describeMOVrrLoadedValue(MI, Reg, TRI);
+ case X86::XOR32rr: {
+ // 64-bit parameters are zero-materialized using XOR32rr, so also consider
+ // super-registers.
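+    // E.g. "$eax = XOR32rr $eax, $eax" lets a 64-bit $rax parameter be
+    // described as the constant 0.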
+ if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
+ return None;
+ if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg())
+ return ParamLoadedValue(MachineOperand::CreateImm(0), Expr);
+ return None;
+ }
+ case X86::MOVSX64rr32: {
+ // We may need to describe the lower 32 bits of the MOVSX; for example, in
+ // cases like this:
+ //
+ // $ebx = [...]
+ // $rdi = MOVSX64rr32 $ebx
+ // $esi = MOV32rr $edi
+ if (!TRI->isSubRegisterEq(MI.getOperand(0).getReg(), Reg))
+ return None;
+
+ Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
+
+ // If the described register is the destination register we need to
+ // sign-extend the source register from 32 bits. The other case we handle
+ // is when the described register is the 32-bit sub-register of the
+    // destination register, in which case we just need to return the source
+ // register.
+ if (Reg == MI.getOperand(0).getReg())
+ Expr = DIExpression::appendExt(Expr, 32, 64, true);
+ else
+ assert(X86MCRegisterClasses[X86::GR32RegClassID].contains(Reg) &&
+ "Unhandled sub-register case for MOVSX64rr32");
+
+ return ParamLoadedValue(MI.getOperand(1), Expr);
+ }
+ default:
+ assert(!MI.isMoveImmediate() && "Unexpected MoveImm instruction");
+ return TargetInstrInfo::describeLoadedValue(MI, Reg);
+ }
+}
+
+/// This is an architecture-specific helper function of reassociateOps.
+/// Set special operand attributes for new instructions after reassociation.
+void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1,
+ MachineInstr &OldMI2,
+ MachineInstr &NewMI1,
+ MachineInstr &NewMI2) const {
+ // Propagate FP flags from the original instructions.
+ // But clear poison-generating flags because those may not be valid now.
+ // TODO: There should be a helper function for copying only fast-math-flags.
+ uint16_t IntersectedFlags = OldMI1.getFlags() & OldMI2.getFlags();
+ NewMI1.setFlags(IntersectedFlags);
+ NewMI1.clearFlag(MachineInstr::MIFlag::NoSWrap);
+ NewMI1.clearFlag(MachineInstr::MIFlag::NoUWrap);
+ NewMI1.clearFlag(MachineInstr::MIFlag::IsExact);
+
+ NewMI2.setFlags(IntersectedFlags);
+ NewMI2.clearFlag(MachineInstr::MIFlag::NoSWrap);
+ NewMI2.clearFlag(MachineInstr::MIFlag::NoUWrap);
+ NewMI2.clearFlag(MachineInstr::MIFlag::IsExact);
+
+ // Integer instructions may define an implicit EFLAGS dest register operand.
+ MachineOperand *OldFlagDef1 = OldMI1.findRegisterDefOperand(X86::EFLAGS);
+ MachineOperand *OldFlagDef2 = OldMI2.findRegisterDefOperand(X86::EFLAGS);
+
+ assert(!OldFlagDef1 == !OldFlagDef2 &&
+ "Unexpected instruction type for reassociation");
+
+ if (!OldFlagDef1 || !OldFlagDef2)
+ return;
+
+ assert(OldFlagDef1->isDead() && OldFlagDef2->isDead() &&
+ "Must have dead EFLAGS operand in reassociable instruction");
+
+ MachineOperand *NewFlagDef1 = NewMI1.findRegisterDefOperand(X86::EFLAGS);
+ MachineOperand *NewFlagDef2 = NewMI2.findRegisterDefOperand(X86::EFLAGS);
+
+ assert(NewFlagDef1 && NewFlagDef2 &&
+ "Unexpected operand in reassociable instruction");
+
+ // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations
+ // of this pass or other passes. The EFLAGS operands must be dead in these new
+ // instructions because the EFLAGS operands in the original instructions must
+ // be dead in order for reassociation to occur.
+ NewFlagDef1->setIsDead();
+ NewFlagDef2->setIsDead();
+}
+
+std::pair<unsigned, unsigned>
+X86InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+ return std::make_pair(TF, 0u);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+ using namespace X86II;
+ static const std::pair<unsigned, const char *> TargetFlags[] = {
+ {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"},
+ {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"},
+ {MO_GOT, "x86-got"},
+ {MO_GOTOFF, "x86-gotoff"},
+ {MO_GOTPCREL, "x86-gotpcrel"},
+ {MO_PLT, "x86-plt"},
+ {MO_TLSGD, "x86-tlsgd"},
+ {MO_TLSLD, "x86-tlsld"},
+ {MO_TLSLDM, "x86-tlsldm"},
+ {MO_GOTTPOFF, "x86-gottpoff"},
+ {MO_INDNTPOFF, "x86-indntpoff"},
+ {MO_TPOFF, "x86-tpoff"},
+ {MO_DTPOFF, "x86-dtpoff"},
+ {MO_NTPOFF, "x86-ntpoff"},
+ {MO_GOTNTPOFF, "x86-gotntpoff"},
+ {MO_DLLIMPORT, "x86-dllimport"},
+ {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"},
+ {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"},
+ {MO_TLVP, "x86-tlvp"},
+ {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"},
+ {MO_SECREL, "x86-secrel"},
+ {MO_COFFSTUB, "x86-coffstub"}};
+ return makeArrayRef(TargetFlags);
+}
+
+namespace {
+ /// Create Global Base Reg pass. This initializes the PIC
+ /// global base register for x86-32.
+ struct CGBR : public MachineFunctionPass {
+ static char ID;
+ CGBR() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ const X86TargetMachine *TM =
+ static_cast<const X86TargetMachine *>(&MF.getTarget());
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+
+ // Don't do anything in the 64-bit small and kernel code models. They use
+ // RIP-relative addressing for everything.
+ if (STI.is64Bit() && (TM->getCodeModel() == CodeModel::Small ||
+ TM->getCodeModel() == CodeModel::Kernel))
+ return false;
+
+ // Only emit a global base reg in PIC mode.
+ if (!TM->isPositionIndependent())
+ return false;
+
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ Register GlobalBaseReg = X86FI->getGlobalBaseReg();
+
+ // If we didn't need a GlobalBaseReg, don't insert code.
+ if (GlobalBaseReg == 0)
+ return false;
+
+      // Insert the code that sets GlobalBaseReg into the function's first MBB.
+ MachineBasicBlock &FirstMBB = MF.front();
+ MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+ DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ const X86InstrInfo *TII = STI.getInstrInfo();
+
+ Register PC;
+ if (STI.isPICStyleGOT())
+ PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+ else
+ PC = GlobalBaseReg;
+
+ if (STI.is64Bit()) {
+ if (TM->getCodeModel() == CodeModel::Medium) {
+ // In the medium code model, use a RIP-relative LEA to materialize the
+ // GOT.
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PC)
+ .addReg(X86::RIP)
+ .addImm(0)
+ .addReg(0)
+ .addExternalSymbol("_GLOBAL_OFFSET_TABLE_")
+ .addReg(0);
+ } else if (TM->getCodeModel() == CodeModel::Large) {
+ // In the large code model, we are aiming for this code, though the
+ // register allocation may vary:
+ // leaq .LN$pb(%rip), %rax
+ // movq $_GLOBAL_OFFSET_TABLE_ - .LN$pb, %rcx
+ // addq %rcx, %rax
+ // RAX now holds address of _GLOBAL_OFFSET_TABLE_.
+ Register PBReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
+ Register GOTReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PBReg)
+ .addReg(X86::RIP)
+ .addImm(0)
+ .addReg(0)
+ .addSym(MF.getPICBaseSymbol())
+ .addReg(0);
+ std::prev(MBBI)->setPreInstrSymbol(MF, MF.getPICBaseSymbol());
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOV64ri), GOTReg)
+ .addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
+ X86II::MO_PIC_BASE_OFFSET);
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD64rr), PC)
+ .addReg(PBReg, RegState::Kill)
+ .addReg(GOTReg, RegState::Kill);
+ } else {
+ llvm_unreachable("unexpected code model");
+ }
+ } else {
+        // The operand of MovePCtoStack is completely ignored by the asm
+        // printer; it's only used in JIT code emission as a displacement to
+        // the PC.
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
+
+        // If we're using vanilla 'GOT' PIC style, we should use relative
+        // addressing not to the PC, but to the external _GLOBAL_OFFSET_TABLE_.
+ if (STI.isPICStyleGOT()) {
+ // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel],
+ // %some_register
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
+ .addReg(PC)
+ .addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
+ X86II::MO_GOT_ABSOLUTE_ADDRESS);
+ }
+ }
+
+ return true;
+ }
+
+ StringRef getPassName() const override {
+ return "X86 PIC Global Base Reg Initialization";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+} // namespace
+
+char CGBR::ID = 0;
+FunctionPass*
+llvm::createX86GlobalBaseRegPass() { return new CGBR(); }
+
+namespace {
+ struct LDTLSCleanup : public MachineFunctionPass {
+ static char ID;
+ LDTLSCleanup() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
+ if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
+        // No point folding accesses if there aren't at least two.
+ return false;
+ }
+
+ MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+ return VisitNode(DT->getRootNode(), 0);
+ }
+
+ // Visit the dominator subtree rooted at Node in pre-order.
+ // If TLSBaseAddrReg is non-null, then use that to replace any
+ // TLS_base_addr instructions. Otherwise, create the register
+ // when the first such instruction is seen, and then use it
+ // as we encounter more instructions.
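+    // E.g. if a block containing a TLS_base_addr64 dominates another block
+    // with such a call, the dominated call is replaced with a COPY from the
+    // register that cached the first call's result (ReplaceTLSBaseAddrCall).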
+ bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
+ MachineBasicBlock *BB = Node->getBlock();
+ bool Changed = false;
+
+ // Traverse the current block.
+ for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
+ ++I) {
+ switch (I->getOpcode()) {
+ case X86::TLS_base_addr32:
+ case X86::TLS_base_addr64:
+ if (TLSBaseAddrReg)
+ I = ReplaceTLSBaseAddrCall(*I, TLSBaseAddrReg);
+ else
+ I = SetRegister(*I, &TLSBaseAddrReg);
+ Changed = true;
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Visit the children of this block in the dominator tree.
+ for (auto I = Node->begin(), E = Node->end(); I != E; ++I) {
+ Changed |= VisitNode(*I, TLSBaseAddrReg);
+ }
+
+ return Changed;
+ }
+
+ // Replace the TLS_base_addr instruction I with a copy from
+ // TLSBaseAddrReg, returning the new instruction.
+ MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr &I,
+ unsigned TLSBaseAddrReg) {
+ MachineFunction *MF = I.getParent()->getParent();
+ const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
+ const bool is64Bit = STI.is64Bit();
+ const X86InstrInfo *TII = STI.getInstrInfo();
+
+ // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
+ MachineInstr *Copy =
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII->get(TargetOpcode::COPY), is64Bit ? X86::RAX : X86::EAX)
+ .addReg(TLSBaseAddrReg);
+
+ // Erase the TLS_base_addr instruction.
+ I.eraseFromParent();
+
+ return Copy;
+ }
+
+ // Create a virtual register in *TLSBaseAddrReg, and populate it by
+ // inserting a copy instruction after I. Returns the new instruction.
+ MachineInstr *SetRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) {
+ MachineFunction *MF = I.getParent()->getParent();
+ const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
+ const bool is64Bit = STI.is64Bit();
+ const X86InstrInfo *TII = STI.getInstrInfo();
+
+ // Create a virtual register for the TLS base address.
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ *TLSBaseAddrReg = RegInfo.createVirtualRegister(is64Bit
+ ? &X86::GR64RegClass
+ : &X86::GR32RegClass);
+
+ // Insert a copy from RAX/EAX to TLSBaseAddrReg.
+ MachineInstr *Next = I.getNextNode();
+ MachineInstr *Copy =
+ BuildMI(*I.getParent(), Next, I.getDebugLoc(),
+ TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
+ .addReg(is64Bit ? X86::RAX : X86::EAX);
+
+ return Copy;
+ }
+
+ StringRef getPassName() const override {
+ return "Local Dynamic TLS Access Clean-up";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+}
+
+char LDTLSCleanup::ID = 0;
+FunctionPass*
+llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); }
+
+/// Constants defining how certain sequences should be outlined.
+///
+/// \p MachineOutlinerDefault implies that the function is called with a call
+/// instruction, and a return must be emitted for the outlined function frame.
+///
+/// That is,
+///
+/// I1 OUTLINED_FUNCTION:
+/// I2 --> call OUTLINED_FUNCTION I1
+/// I3 I2
+/// I3
+/// ret
+///
+/// * Call construction overhead: 1 (call instruction)
+/// * Frame construction overhead: 1 (return instruction)
+///
+/// \p MachineOutlinerTailCall implies that the function is being tail called.
+/// A jump is emitted instead of a call, and the return is already present in
+/// the outlined sequence. That is,
+///
+/// I1 OUTLINED_FUNCTION:
+/// I2 --> jmp OUTLINED_FUNCTION I1
+/// ret I2
+/// ret
+///
+/// * Call construction overhead: 1 (jump instruction)
+/// * Frame construction overhead: 0 (don't need to return)
+///
+enum MachineOutlinerClass {
+ MachineOutlinerDefault,
+ MachineOutlinerTailCall
+};
+
+outliner::OutlinedFunction X86InstrInfo::getOutliningCandidateInfo(
+ std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
+ unsigned SequenceSize =
+ std::accumulate(RepeatedSequenceLocs[0].front(),
+ std::next(RepeatedSequenceLocs[0].back()), 0,
+ [](unsigned Sum, const MachineInstr &MI) {
+ // FIXME: x86 doesn't implement getInstSizeInBytes, so
+ // we can't tell the cost. Just assume each instruction
+ // is one byte.
+ if (MI.isDebugInstr() || MI.isKill())
+ return Sum;
+ return Sum + 1;
+ });
+
+ // We check to see if CFI Instructions are present, and if they are
+ // we find the number of CFI Instructions in the candidates.
+ unsigned CFICount = 0;
+ MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front();
+ for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx();
+ Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) {
+ const std::vector<MCCFIInstruction> &CFIInstructions =
+ RepeatedSequenceLocs[0].getMF()->getFrameInstructions();
+ if (MBBI->isCFIInstruction()) {
+ unsigned CFIIndex = MBBI->getOperand(0).getCFIIndex();
+ MCCFIInstruction CFI = CFIInstructions[CFIIndex];
+ CFICount++;
+ }
+ MBBI++;
+ }
+
+ // We compare the number of found CFI Instructions to the number of CFI
+ // instructions in the parent function for each candidate. We must check this
+ // since if we outline one of the CFI instructions in a function, we have to
+ // outline them all for correctness. If we do not, the address offsets will be
+ // incorrect between the two sections of the program.
+ for (outliner::Candidate &C : RepeatedSequenceLocs) {
+ std::vector<MCCFIInstruction> CFIInstructions =
+ C.getMF()->getFrameInstructions();
+
+ if (CFICount > 0 && CFICount != CFIInstructions.size())
+ return outliner::OutlinedFunction();
+ }
+
+ // FIXME: Use real size in bytes for call and ret instructions.
+ if (RepeatedSequenceLocs[0].back()->isTerminator()) {
+ for (outliner::Candidate &C : RepeatedSequenceLocs)
+ C.setCallInfo(MachineOutlinerTailCall, 1);
+
+ return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
+ 0, // Number of bytes to emit frame.
+ MachineOutlinerTailCall // Type of frame.
+ );
+ }
+
+ if (CFICount > 0)
+ return outliner::OutlinedFunction();
+
+ for (outliner::Candidate &C : RepeatedSequenceLocs)
+ C.setCallInfo(MachineOutlinerDefault, 1);
+
+ return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, 1,
+ MachineOutlinerDefault);
+}
+
+bool X86InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF,
+ bool OutlineFromLinkOnceODRs) const {
+ const Function &F = MF.getFunction();
+
+ // Does the function use a red zone? If it does, then we can't risk messing
+ // with the stack.
+ if (Subtarget.getFrameLowering()->has128ByteRedZone(MF)) {
+ // It could have a red zone. If it does, then we don't want to touch it.
+ const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ if (!X86FI || X86FI->getUsesRedZone())
+ return false;
+ }
+
+ // If we *don't* want to outline from things that could potentially be deduped
+ // then return false.
+ if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
+ return false;
+
+ // This function is viable for outlining, so return true.
+ return true;
+}
+
+outliner::InstrType
+X86InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, unsigned Flags) const {
+ MachineInstr &MI = *MIT;
+ // Don't allow debug values to impact outlining type.
+ if (MI.isDebugInstr() || MI.isIndirectDebugValue())
+ return outliner::InstrType::Invisible;
+
+ // At this point, KILL instructions don't really tell us much so we can go
+ // ahead and skip over them.
+ if (MI.isKill())
+ return outliner::InstrType::Invisible;
+
+ // Is this a tail call? If yes, we can outline as a tail call.
+ if (isTailCall(MI))
+ return outliner::InstrType::Legal;
+
+ // Is this the terminator of a basic block?
+ if (MI.isTerminator() || MI.isReturn()) {
+
+ // Does its parent have any successors in its MachineFunction?
+ if (MI.getParent()->succ_empty())
+ return outliner::InstrType::Legal;
+
+ // It does, so we can't tail call it.
+ return outliner::InstrType::Illegal;
+ }
+
+ // Don't outline anything that modifies or reads from the stack pointer.
+ //
+ // FIXME: There are instructions which are being manually built without
+ // explicit uses/defs so we also have to check the MCInstrDesc. We should be
+ // able to remove the extra checks once those are fixed up. For example,
+ // sometimes we might get something like %rax = POP64r 1. This won't be
+ // caught by modifiesRegister or readsRegister even though the instruction
+ // really ought to be formed so that modifiesRegister/readsRegister would
+ // catch it.
+ if (MI.modifiesRegister(X86::RSP, &RI) || MI.readsRegister(X86::RSP, &RI) ||
+ MI.getDesc().hasImplicitUseOfPhysReg(X86::RSP) ||
+ MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP))
+ return outliner::InstrType::Illegal;
+
+ // Outlined calls change the instruction pointer, so don't read from it.
+ if (MI.readsRegister(X86::RIP, &RI) ||
+ MI.getDesc().hasImplicitUseOfPhysReg(X86::RIP) ||
+ MI.getDesc().hasImplicitDefOfPhysReg(X86::RIP))
+ return outliner::InstrType::Illegal;
+
+ // Positions can't safely be outlined.
+ if (MI.isPosition())
+ return outliner::InstrType::Illegal;
+
+ // Make sure none of the operands of this instruction do anything tricky.
+ for (const MachineOperand &MOP : MI.operands())
+ if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
+ MOP.isTargetIndex())
+ return outliner::InstrType::Illegal;
+
+ return outliner::InstrType::Legal;
+}
+
+void X86InstrInfo::buildOutlinedFrame(MachineBasicBlock &MBB,
+ MachineFunction &MF,
+ const outliner::OutlinedFunction &OF)
+ const {
+ // If we're a tail call, we already have a return, so don't do anything.
+ if (OF.FrameConstructionID == MachineOutlinerTailCall)
+ return;
+
+ // We're a normal call, so our sequence doesn't have a return instruction.
+ // Add it in.
+ MachineInstr *retq = BuildMI(MF, DebugLoc(), get(X86::RETQ));
+ MBB.insert(MBB.end(), retq);
+}
+
+MachineBasicBlock::iterator
+X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &It,
+ MachineFunction &MF,
+ const outliner::Candidate &C) const {
+ // Is it a tail call?
+ if (C.CallConstructionID == MachineOutlinerTailCall) {
+ // Yes, just insert a JMP.
+ It = MBB.insert(It,
+ BuildMI(MF, DebugLoc(), get(X86::TAILJMPd64))
+ .addGlobalAddress(M.getNamedValue(MF.getName())));
+ } else {
+ // No, insert a call.
+ It = MBB.insert(It,
+ BuildMI(MF, DebugLoc(), get(X86::CALL64pcrel32))
+ .addGlobalAddress(M.getNamedValue(MF.getName())));
+ }
+
+ return It;
+}
+
+#define GET_INSTRINFO_HELPERS
+#include "X86GenInstrInfo.inc"
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h
new file mode 100644
index 000000000000..d7d2370c6f67
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h
@@ -0,0 +1,634 @@
+//===-- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86INSTRINFO_H
+#define LLVM_LIB_TARGET_X86_X86INSTRINFO_H
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86InstrFMA3Info.h"
+#include "X86RegisterInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include <vector>
+
+#define GET_INSTRINFO_HEADER
+#include "X86GenInstrInfo.inc"
+
+namespace llvm {
+class X86Subtarget;
+
+namespace X86 {
+
+enum AsmComments {
+ // For instr that was compressed from EVEX to VEX.
+ AC_EVEX_2_VEX = MachineInstr::TAsmComments
+};
+
+/// Return a pair of condition code for the given predicate and whether
+/// the instruction operands should be swapped to match the condition code.
+std::pair<CondCode, bool> getX86ConditionCode(CmpInst::Predicate Predicate);
+
+/// Return a setcc opcode based on whether it has a memory operand.
+unsigned getSETOpc(bool HasMemoryOperand = false);
+
+/// Return a cmov opcode for the given register size in bytes and operand type.
+unsigned getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand = false);
+
+// Turn jCC instruction into condition code.
+CondCode getCondFromBranch(const MachineInstr &MI);
+
+// Turn setCC instruction into condition code.
+CondCode getCondFromSETCC(const MachineInstr &MI);
+
+// Turn CMov instruction into condition code.
+CondCode getCondFromCMov(const MachineInstr &MI);
+
+/// GetOppositeBranchCondition - Return the inverse of the specified cond,
+/// e.g. turning COND_E to COND_NE.
+CondCode GetOppositeBranchCondition(CondCode CC);
+
+/// Get the VPCMP immediate for the given condition.
+unsigned getVPCMPImmForCond(ISD::CondCode CC);
+
+/// Get the VPCMP immediate if the opcodes are swapped.
+unsigned getSwappedVPCMPImm(unsigned Imm);
+
+/// Get the VPCOM immediate if the opcodes are swapped.
+unsigned getSwappedVPCOMImm(unsigned Imm);
+
+/// Get the VCMP immediate if the opcodes are swapped.
+unsigned getSwappedVCMPImm(unsigned Imm);
+
+} // namespace X86
+
+/// isGlobalStubReference - Return true if the specified TargetFlag operand is
+/// a reference to a stub for a global, not the global itself.
+inline static bool isGlobalStubReference(unsigned char TargetFlag) {
+ switch (TargetFlag) {
+ case X86II::MO_DLLIMPORT: // dllimport stub.
+ case X86II::MO_GOTPCREL: // rip-relative GOT reference.
+ case X86II::MO_GOT: // normal GOT reference.
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Normal $non_lazy_ptr ref.
+ case X86II::MO_DARWIN_NONLAZY: // Normal $non_lazy_ptr ref.
+ case X86II::MO_COFFSTUB: // COFF .refptr stub.
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// isGlobalRelativeToPICBase - Return true if the specified global value
+/// reference is relative to a 32-bit PIC base (X86ISD::GlobalBaseReg). If this
+/// is true, the addressing mode has the PIC base register added in (e.g. EBX).
+inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) {
+ switch (TargetFlag) {
+ case X86II::MO_GOTOFF: // isPICStyleGOT: local global.
+ case X86II::MO_GOT: // isPICStyleGOT: other global.
+ case X86II::MO_PIC_BASE_OFFSET: // Darwin local global.
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Darwin/32 external global.
+ case X86II::MO_TLVP: // ??? Pretty sure..
+ return true;
+ default:
+ return false;
+ }
+}
+
+inline static bool isScale(const MachineOperand &MO) {
+ return MO.isImm() && (MO.getImm() == 1 || MO.getImm() == 2 ||
+ MO.getImm() == 4 || MO.getImm() == 8);
+}
+
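+// Note: an X86 memory reference is a sequence of five operands starting at
+// index Op: base register, scale amount, index register, displacement and
+// segment register (the X86::Addr* operand indices). isLeaMem and isMem
+// below check that the operands have this shape.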
+inline static bool isLeaMem(const MachineInstr &MI, unsigned Op) {
+ if (MI.getOperand(Op).isFI())
+ return true;
+ return Op + X86::AddrSegmentReg <= MI.getNumOperands() &&
+ MI.getOperand(Op + X86::AddrBaseReg).isReg() &&
+ isScale(MI.getOperand(Op + X86::AddrScaleAmt)) &&
+ MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
+ (MI.getOperand(Op + X86::AddrDisp).isImm() ||
+ MI.getOperand(Op + X86::AddrDisp).isGlobal() ||
+ MI.getOperand(Op + X86::AddrDisp).isCPI() ||
+ MI.getOperand(Op + X86::AddrDisp).isJTI());
+}
+
+inline static bool isMem(const MachineInstr &MI, unsigned Op) {
+ if (MI.getOperand(Op).isFI())
+ return true;
+ return Op + X86::AddrNumOperands <= MI.getNumOperands() &&
+ MI.getOperand(Op + X86::AddrSegmentReg).isReg() && isLeaMem(MI, Op);
+}
+
+class X86InstrInfo final : public X86GenInstrInfo {
+ X86Subtarget &Subtarget;
+ const X86RegisterInfo RI;
+
+ virtual void anchor();
+
+ bool AnalyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ SmallVectorImpl<MachineInstr *> &CondBranches,
+ bool AllowModify) const;
+
+public:
+ explicit X86InstrInfo(X86Subtarget &STI);
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ const X86RegisterInfo &getRegisterInfo() const { return RI; }
+
+ /// Returns the stack pointer adjustment that happens inside the frame
+ /// setup..destroy sequence (e.g. by pushes, or inside the callee).
+ int64_t getFrameAdjustment(const MachineInstr &I) const {
+ assert(isFrameInstr(I));
+ if (isFrameSetup(I))
+ return I.getOperand(2).getImm();
+ return I.getOperand(1).getImm();
+ }
+
+ /// Sets the stack pointer adjustment made inside the frame made up by this
+ /// instruction.
+ void setFrameAdjustment(MachineInstr &I, int64_t V) const {
+ assert(isFrameInstr(I));
+ if (isFrameSetup(I))
+ I.getOperand(2).setImm(V);
+ else
+ I.getOperand(1).setImm(V);
+ }
+
+ /// getSPAdjust - This returns the stack pointer adjustment made by
+ /// this instruction. For x86, we need to handle more complex call
+ /// sequences involving PUSHes.
+ int getSPAdjust(const MachineInstr &MI) const override;
+
+ /// isCoalescableExtInstr - Return true if the instruction is a "coalescable"
+ /// extension instruction. That is, it's like a copy where it's legal for the
+ /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns
+ /// true, then it's expected the pre-extension value is available as a subreg
+ /// of the result register. This also returns the sub-register index in
+ /// SubIdx.
+ bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg,
+ Register &DstReg, unsigned &SubIdx) const override;
+
+ /// Returns true if the instruction has no behavior (specified or otherwise)
+ /// that is based on the value of any of its register operands
+ ///
+ /// Instructions are considered data invariant even if they set EFLAGS.
+ ///
+ /// A classical example of something that is inherently not data invariant is
+ /// an indirect jump -- the destination is loaded into icache based on the
+ /// bits set in the jump destination register.
+ ///
+ /// FIXME: This should become part of our instruction tables.
+ static bool isDataInvariant(MachineInstr &MI);
+
+ /// Returns true if the instruction has no behavior (specified or otherwise)
+ /// that is based on the value loaded from memory or the value of any
+ /// non-address register operands.
+ ///
+ /// For example, if the latency of the instruction is dependent on the
+ /// particular bits set in any of the registers *or* any of the bits loaded
+ /// from memory.
+ ///
+ /// Instructions are considered data invariant even if they set EFLAGS.
+ ///
+ /// A classical example of something that is inherently not data invariant is
+ /// an indirect jump -- the destination is loaded into icache based on the
+ /// bits set in the jump destination register.
+ ///
+ /// FIXME: This should become part of our instruction tables.
+ static bool isDataInvariantLoad(MachineInstr &MI);
+
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex,
+ unsigned &MemBytes) const override;
+ /// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
+ /// stack locations as well. This uses a heuristic so it isn't
+ /// reliable for correctness.
+ unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex,
+ unsigned &MemBytes) const override;
+ /// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
+ /// stack locations as well. This uses a heuristic so it isn't
+ /// reliable for correctness.
+ unsigned isStoreToStackSlotPostFE(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
+ AAResults *AA) const override;
+ void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ Register DestReg, unsigned SubIdx,
+ const MachineInstr &Orig,
+ const TargetRegisterInfo &TRI) const override;
+
+ /// Given an operand within a MachineInstr, insert preceding code to put it
+ /// into the right format for a particular kind of LEA instruction. This may
+ /// involve using an appropriate super-register instead (with an implicit use
+ /// of the original) or creating a new virtual register and inserting COPY
+ /// instructions to get the data into the right class.
+ ///
+ /// Reference parameters are set to indicate how caller should add this
+ /// operand to the LEA instruction.
+ bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
+ unsigned LEAOpcode, bool AllowSP, Register &NewSrc,
+ bool &isKill, MachineOperand &ImplicitOp,
+ LiveVariables *LV) const;
+
+ /// convertToThreeAddress - This method must be implemented by targets that
+ /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
+ /// may be able to convert a two-address instruction into a true
+ /// three-address instruction on demand. This allows the X86 target (for
+ /// example) to convert ADD and SHL instructions into LEA instructions if they
+ /// would require register copies due to two-addressness.
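+  ///
+  /// For example (a sketch), a two-address "$eax = ADD32rr $eax, $ebx" may be
+  /// turned into "$ecx = LEA32r $eax, 1, $ebx, 0, $noreg", which no longer
+  /// ties the destination to the first source operand.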
+ ///
+ /// This method returns a null pointer if the transformation cannot be
+ /// performed, otherwise it returns the new instruction.
+ ///
+ MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineInstr &MI,
+ LiveVariables *LV) const override;
+
+ /// Returns true iff the routine could find two commutable operands in the
+ /// given machine instruction.
+ /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
+ /// input values can be re-defined in this method only if the input values
+ /// are not pre-defined, which is designated by the special value
+ /// 'CommuteAnyOperandIndex' assigned to it.
+  /// If both indices are pre-defined and refer to some operands, then the
+ /// method simply returns true if the corresponding operands are commutable
+ /// and returns false otherwise.
+ ///
+ /// For example, calling this method this way:
+ /// unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
+ /// findCommutedOpIndices(MI, Op1, Op2);
+ /// can be interpreted as a query asking to find an operand that would be
+ /// commutable with the operand#1.
+ bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const override;
+
+ /// Returns an adjusted FMA opcode that must be used in FMA instruction that
+ /// performs the same computations as the given \p MI but which has the
+ /// operands \p SrcOpIdx1 and \p SrcOpIdx2 commuted.
+ /// It may return 0 if it is unsafe to commute the operands.
+ /// Note that a machine instruction (instead of its opcode) is passed as the
+ /// first parameter to make it possible to analyze the instruction's uses and
+ /// commute the first operand of FMA even when it seems unsafe when you look
+ /// at the opcode. For example, it is Ok to commute the first operand of
+ /// VFMADD*SD_Int, if ONLY the lowest 64-bit element of the result is used.
+ ///
+ /// The returned FMA opcode may differ from the opcode in the given \p MI.
+ /// For example, commuting the operands #1 and #3 in the following FMA
+ /// FMA213 #1, #2, #3
+ /// results into instruction with adjusted opcode:
+ /// FMA231 #3, #2, #1
+ unsigned
+ getFMA3OpcodeToCommuteOperands(const MachineInstr &MI, unsigned SrcOpIdx1,
+ unsigned SrcOpIdx2,
+ const X86InstrFMA3Group &FMA3Group) const;
+
+ // Branch analysis.
+ bool isUnconditionalTailCall(const MachineInstr &MI) const override;
+ bool canMakeTailCallConditional(SmallVectorImpl<MachineOperand> &Cond,
+ const MachineInstr &TailCall) const override;
+ void replaceBranchWithTailCall(MachineBasicBlock &MBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ const MachineInstr &TailCall) const override;
+
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
+
+ Optional<ExtAddrMode>
+ getAddrModeFromMemoryOp(const MachineInstr &MemI,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg,
+ int64_t &ImmVal) const override;
+
+ bool preservesZeroValueInReg(const MachineInstr *MI,
+ const Register NullValueReg,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool getMemOperandsWithOffsetWidth(
+ const MachineInstr &LdSt,
+ SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset,
+ bool &OffsetIsScalable, unsigned &Width,
+ const TargetRegisterInfo *TRI) const override;
+ bool analyzeBranchPredicate(MachineBasicBlock &MBB,
+ TargetInstrInfo::MachineBranchPredicate &MBP,
+ bool AllowModify = false) const override;
+
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override;
+ bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
+ Register, Register, Register, int &, int &,
+ int &) const override;
+ void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, Register DstReg,
+ ArrayRef<MachineOperand> Cond, Register TrueReg,
+ Register FalseReg) const override;
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg,
+ bool KillSrc) const override;
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, Register SrcReg,
+ bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, Register DestReg,
+ int FrameIndex, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+ /// Check whether the target can fold a load that feeds a subreg operand
+ /// (or a subreg operand that feeds a store).
+ bool isSubregFoldable() const override { return true; }
+
+ /// foldMemoryOperand - If this target supports it, fold a load or store of
+ /// the specified stack slot into the specified machine instruction for the
+ /// specified operand(s). If this is possible, the target should perform the
+ /// folding and return true, otherwise it should return false. If it folds
+ /// the instruction, it is likely that the MachineInstruction the iterator
+ /// references has been changed.
+ MachineInstr *
+ foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+ ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, int FrameIndex,
+ LiveIntervals *LIS = nullptr,
+ VirtRegMap *VRM = nullptr) const override;
+
+ /// foldMemoryOperand - Same as the previous version except it allows folding
+ /// of any load and store from / to any address, not just from a specific
+ /// stack slot.
+ MachineInstr *foldMemoryOperandImpl(
+ MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
+ LiveIntervals *LIS = nullptr) const override;
+
+ /// unfoldMemoryOperand - Separate a single instruction which folded a load or
+  /// a store or a load and a store into two or more instructions. If this is
+ /// possible, returns true as well as the new instructions by reference.
+ bool
+ unfoldMemoryOperand(MachineFunction &MF, MachineInstr &MI, unsigned Reg,
+ bool UnfoldLoad, bool UnfoldStore,
+ SmallVectorImpl<MachineInstr *> &NewMIs) const override;
+
+ bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+ SmallVectorImpl<SDNode *> &NewNodes) const override;
+
+ /// getOpcodeAfterMemoryUnfold - Returns the opcode of the would be new
+ /// instruction after load / store are unfolded from an instruction of the
+ /// specified opcode. It returns zero if the specified unfolding is not
+ /// possible. If LoadRegIndex is non-null, it is filled in with the operand
+ /// index of the operand which will hold the register holding the loaded
+ /// value.
+ unsigned
+ getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad, bool UnfoldStore,
+ unsigned *LoadRegIndex = nullptr) const override;
+
+ /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler
+ /// to determine if two loads are loading from the same base address. It
+ /// should only return true if the base pointers are the same and the
+  /// only difference between the two addresses is the offset. It also returns
+ /// the offsets by reference.
+ bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1,
+ int64_t &Offset2) const override;
+
+ /// isSchedulingBoundary - Overrides the isSchedulingBoundary from
+  /// CodeGen/TargetInstrInfo.cpp to make it capable of identifying ENDBR
+  /// instructions and preventing them from being re-scheduled.
+ bool isSchedulingBoundary(const MachineInstr &MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const override;
+
+  /// shouldScheduleLoadsNear - This is used by the pre-regalloc scheduler to
+  /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads
+  /// should be scheduled together. On some targets if two loads are loading from
+ /// addresses in the same cache line, it's better if they are scheduled
+ /// together. This function takes two integers that represent the load offsets
+ /// from the common base address. It returns true if it decides it's desirable
+ /// to schedule the two loads together. "NumLoads" is the number of loads that
+ /// have already been scheduled after Load1.
+ bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, int64_t Offset1,
+ int64_t Offset2,
+ unsigned NumLoads) const override;
+
+ void getNoop(MCInst &NopInst) const override;
+
+ bool
+ reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+ /// isSafeToMoveRegClassDefs - Return true if it's safe to move a machine
+ /// instruction that defines the specified register class.
+ bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
+
+ /// True if MI has a condition code def, e.g. EFLAGS, that is
+ /// not marked dead.
+ bool hasLiveCondCodeDef(MachineInstr &MI) const;
+
+ /// getGlobalBaseReg - Return a virtual register initialized with the
+  /// global base register value. Output instructions required to
+ /// initialize the register in the function entry block, if necessary.
+ ///
+ unsigned getGlobalBaseReg(MachineFunction *MF) const;
+
+ std::pair<uint16_t, uint16_t>
+ getExecutionDomain(const MachineInstr &MI) const override;
+
+ uint16_t getExecutionDomainCustom(const MachineInstr &MI) const;
+
+ void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override;
+
+ bool setExecutionDomainCustom(MachineInstr &MI, unsigned Domain) const;
+
+ unsigned
+ getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const override;
+ unsigned getUndefRegClearance(const MachineInstr &MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const override;
+ void breakPartialRegDependency(MachineInstr &MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const override;
+
+ MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+ unsigned OpNum,
+ ArrayRef<MachineOperand> MOs,
+ MachineBasicBlock::iterator InsertPt,
+ unsigned Size, Align Alignment,
+ bool AllowCommute) const;
+
+ bool isHighLatencyDef(int opc) const override;
+
+ bool hasHighOperandLatency(const TargetSchedModel &SchedModel,
+ const MachineRegisterInfo *MRI,
+ const MachineInstr &DefMI, unsigned DefIdx,
+ const MachineInstr &UseMI,
+ unsigned UseIdx) const override;
+
+ bool useMachineCombiner() const override { return true; }
+
+ bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
+
+ bool hasReassociableOperands(const MachineInstr &Inst,
+ const MachineBasicBlock *MBB) const override;
+
+ void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2,
+ MachineInstr &NewMI1,
+ MachineInstr &NewMI2) const override;
+
+ /// analyzeCompare - For a comparison instruction, return the source registers
+  /// in SrcReg and SrcReg2 if it has two register operands, and the value it
+ /// compares against in CmpValue. Return true if the comparison instruction
+ /// can be analyzed.
+ bool analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int &CmpMask,
+ int &CmpValue) const override;
+
+ /// optimizeCompareInstr - Check if there exists an earlier instruction that
+ /// operates on the same source operands and sets flags in the same way as
+ /// Compare; remove Compare if possible.
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
+ Register SrcReg2, int CmpMask, int CmpValue,
+ const MachineRegisterInfo *MRI) const override;
+
+ /// optimizeLoadInstr - Try to remove the load by folding it to a register
+ /// operand at the use. We fold the load instructions if and only if the
+ /// def and use are in the same BB. We only look at one load and see
+ /// whether it can be folded into MI. FoldAsLoadDefReg is the virtual register
+ /// defined by the load we are trying to fold. DefMI returns the machine
+ /// instruction that defines FoldAsLoadDefReg, and the function returns
+ /// the machine instruction generated due to folding.
+ MachineInstr *optimizeLoadInstr(MachineInstr &MI,
+ const MachineRegisterInfo *MRI,
+ Register &FoldAsLoadDefReg,
+ MachineInstr *&DefMI) const override;
+
+ std::pair<unsigned, unsigned>
+ decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableDirectMachineOperandTargetFlags() const override;
+
+ virtual outliner::OutlinedFunction getOutliningCandidateInfo(
+ std::vector<outliner::Candidate> &RepeatedSequenceLocs) const override;
+
+ bool isFunctionSafeToOutlineFrom(MachineFunction &MF,
+ bool OutlineFromLinkOnceODRs) const override;
+
+ outliner::InstrType
+ getOutliningType(MachineBasicBlock::iterator &MIT, unsigned Flags) const override;
+
+ void buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF,
+ const outliner::OutlinedFunction &OF) const override;
+
+ MachineBasicBlock::iterator
+ insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &It, MachineFunction &MF,
+ const outliner::Candidate &C) const override;
+
+#define GET_INSTRINFO_HELPER_DECLS
+#include "X86GenInstrInfo.inc"
+
+ static bool hasLockPrefix(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & X86II::LOCK;
+ }
+
+ Optional<ParamLoadedValue> describeLoadedValue(const MachineInstr &MI,
+ Register Reg) const override;
+
+protected:
+ /// Commutes the operands in the given instruction by changing the operands
+ /// order and/or changing the instruction's opcode and/or the immediate value
+ /// operand.
+ ///
+ /// The arguments 'CommuteOpIdx1' and 'CommuteOpIdx2' specify the operands
+ /// to be commuted.
+ ///
+ /// Do not call this method for a non-commutable instruction or
+ /// non-commutable operands.
+ /// Even though the instruction is commutable, the method may still
+  /// fail to commute the operands; a null pointer is returned in such cases.
+ MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+ unsigned CommuteOpIdx1,
+ unsigned CommuteOpIdx2) const override;
+
+  /// If the specific machine instruction is an instruction that moves/copies
+  /// a value from one register to another register, return the destination
+  /// and source registers as machine operands.
+ Optional<DestSourcePair>
+ isCopyInstrImpl(const MachineInstr &MI) const override;
+
+private:
+ /// This is a helper for convertToThreeAddress for 8 and 16-bit instructions.
+ /// We use 32-bit LEA to form 3-address code by promoting to a 32-bit
+  /// super-register and then truncating back down to an 8/16-bit sub-register.
+ MachineInstr *convertToThreeAddressWithLEA(unsigned MIOpc,
+ MachineFunction::iterator &MFI,
+ MachineInstr &MI,
+ LiveVariables *LV,
+ bool Is8BitOp) const;
+
+ /// Handles memory folding for special case instructions, for instance those
+ /// requiring custom manipulation of the address.
+ MachineInstr *foldMemoryOperandCustom(MachineFunction &MF, MachineInstr &MI,
+ unsigned OpNum,
+ ArrayRef<MachineOperand> MOs,
+ MachineBasicBlock::iterator InsertPt,
+ unsigned Size, Align Alignment) const;
+
+ /// isFrameOperand - Return true and the FrameIndex if the specified
+ /// operand and follow operands form a reference to the stack frame.
+ bool isFrameOperand(const MachineInstr &MI, unsigned int Op,
+ int &FrameIndex) const;
+
+ /// Returns true iff the routine could find two commutable operands in the
+ /// given machine instruction with 3 vector inputs.
+ /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
+ /// input values can be re-defined in this method only if the input values
+ /// are not pre-defined, which is designated by the special value
+ /// 'CommuteAnyOperandIndex' assigned to it.
+  /// If both indices are pre-defined and refer to some operands, then the
+ /// method simply returns true if the corresponding operands are commutable
+ /// and returns false otherwise.
+ ///
+ /// For example, calling this method this way:
+ /// unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
+ /// findThreeSrcCommutedOpIndices(MI, Op1, Op2);
+ /// can be interpreted as a query asking to find an operand that would be
+ /// commutable with the operand#1.
+ ///
+ /// If IsIntrinsic is set, operand 1 will be ignored for commuting.
+ bool findThreeSrcCommutedOpIndices(const MachineInstr &MI,
+ unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2,
+ bool IsIntrinsic = false) const;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td
new file mode 100644
index 000000000000..b006d1d9aa3a
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td
@@ -0,0 +1,3740 @@
+//===-- X86InstrInfo.td - Main X86 Instruction Definition --*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instruction set, defining the instructions, and
+// properties of the instructions which are needed for code generation, machine
+// code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 specific DAG Nodes.
+//
+
+def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>,
+ SDTCisSameAs<1, 2>]>;
+def SDTX86FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisFP<1>,
+ SDTCisSameAs<1, 2>]>;
+
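+// RES = cmov LHS, RHS, COND (i8), EFLAGS (i32)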
+def SDTX86Cmov : SDTypeProfile<1, 4,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
+
+// Unary and binary operator instructions that set EFLAGS as a side-effect.
+def SDTUnaryArithWithFlags : SDTypeProfile<2, 1,
+ [SDTCisSameAs<0, 2>,
+ SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
+ [SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+// SDTBinaryArithWithFlagsInOut - RES1, EFLAGS = op LHS, RHS, EFLAGS
+def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
+ [SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>,
+ SDTCisVT<1, i32>,
+ SDTCisVT<4, i32>]>;
+// RES1, RES2, FLAGS = op LHS, RHS
+def SDT2ResultBinaryArithWithFlags : SDTypeProfile<3, 2,
+ [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>, SDTCisVT<1, i32>]>;
+def SDTX86BrCond : SDTypeProfile<0, 3,
+ [SDTCisVT<0, OtherVT>,
+ SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+
+def SDTX86SetCC : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i8>,
+ SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+def SDTX86SetCC_C : SDTypeProfile<1, 2,
+ [SDTCisInt<0>,
+ SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+
+def SDTX86sahf : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i8>]>;
+
+def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+def SDTX86rdpkru : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+def SDTX86wrpkru : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+ SDTCisVT<2, i32>]>;
+
+def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,
+ SDTCisVT<2, i8>]>;
+def SDTX86cas8pair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def SDTX86cas16pair : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i64>]>;
+
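+// EFLAGS (i32) = locked binary op applied to [ptr] with an integer operand.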
+def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+ SDTCisPtrTy<1>,
+ SDTCisInt<2>]>;
+
+def SDTLockUnaryArithWithFlags : SDTypeProfile<1, 1, [SDTCisVT<0, i32>,
+ SDTCisPtrTy<1>]>;
+
+def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
+
+def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
+def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
+
+def SDT_X86Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+
+def SDT_X86NtBrind : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+
+def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>,
+ SDTCisVT<1, iPTR>,
+ SDTCisVT<2, iPTR>]>;
+
+def SDT_X86VAARG : SDTypeProfile<1, -1, [SDTCisPtrTy<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, i32>,
+ SDTCisVT<3, i8>,
+ SDTCisVT<4, i32>]>;
+
+def SDTX86RepStr : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
+
+def SDTX86Void : SDTypeProfile<0, 0, []>;
+
+def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+
+def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TLSBASEADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86WIN_ALLOCA : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
+
+def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
+
+def SDT_X86PROBED_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
+
+def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
+
+def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>;
+
+def SDT_X86ENQCMD : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+ SDTCisPtrTy<1>, SDTCisSameAs<1, 2>]>;
+
+def SDT_X86AESENCDECKL : SDTypeProfile<2, 2, [SDTCisVT<0, v2i64>,
+ SDTCisVT<1, i32>,
+ SDTCisVT<2, v2i64>,
+ SDTCisPtrTy<3>]>;
+
+def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER,
+ [SDNPHasChain,SDNPSideEffect]>;
+def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,
+ [SDNPHasChain]>;
+
+
+def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>;
+def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>;
+def X86fshl : SDNode<"X86ISD::FSHL", SDTIntShiftDOp>;
+def X86fshr : SDNode<"X86ISD::FSHR", SDTIntShiftDOp>;
+
+def X86cmp : SDNode<"X86ISD::CMP" , SDTX86CmpTest>;
+def X86fcmp : SDNode<"X86ISD::FCMP", SDTX86FCmp>;
+def X86strict_fcmp : SDNode<"X86ISD::STRICT_FCMP", SDTX86FCmp, [SDNPHasChain]>;
+def X86strict_fcmps : SDNode<"X86ISD::STRICT_FCMPS", SDTX86FCmp, [SDNPHasChain]>;
+def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>;
+
+def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>;
+def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond,
+ [SDNPHasChain]>;
+def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>;
+def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>;
+
+def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def X86rdseed : SDNode<"X86ISD::RDSEED", SDTX86rdrand,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def X86rdpkru : SDNode<"X86ISD::RDPKRU", SDTX86rdpkru,
+ [SDNPHasChain, SDNPSideEffect]>;
+def X86wrpkru : SDNode<"X86ISD::WRPKRU", SDTX86wrpkru,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86cas8pair,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86cas16pair,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+
+def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def X86iret : SDNode<"X86ISD::IRET", SDTX86Ret,
+ [SDNPHasChain, SDNPOptInGlue]>;
+
+def X86vastart_save_xmm_regs :
+ SDNode<"X86ISD::VASTART_SAVE_XMM_REGS",
+ SDT_X86VASTART_SAVE_XMM_REGS,
+ [SDNPHasChain, SDNPVariadic]>;
+def X86vaarg64 :
+ SDNode<"X86ISD::VAARG_64", SDT_X86VAARG,
+ [SDNPHasChain, SDNPMayLoad, SDNPMayStore,
+ SDNPMemOperand]>;
+def X86vaargx32 :
+ SDNode<"X86ISD::VAARG_X32", SDT_X86VAARG,
+ [SDNPHasChain, SDNPMayLoad, SDNPMayStore,
+ SDNPMemOperand]>;
+def X86callseq_start :
+ SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart,
+ [SDNPHasChain, SDNPOutGlue]>;
+def X86callseq_end :
+ SDNode<"ISD::CALLSEQ_END", SDT_X86CallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86call : SDNode<"X86ISD::CALL", SDT_X86Call,
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+ SDNPVariadic]>;
+
+def X86NoTrackCall : SDNode<"X86ISD::NT_CALL", SDT_X86Call,
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+ SDNPVariadic]>;
+def X86NoTrackBrind : SDNode<"X86ISD::NT_BRIND", SDT_X86NtBrind,
+ [SDNPHasChain]>;
+
+def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore]>;
+def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+ SDNPMayLoad]>;
+
+def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>;
+def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>;
+
+def X86RecoverFrameAlloc : SDNode<"ISD::LOCAL_RECOVER",
+ SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
+ SDTCisInt<1>]>>;
+
+def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
+ [SDNPHasChain]>;
+
+def X86eh_sjlj_setjmp : SDNode<"X86ISD::EH_SJLJ_SETJMP",
+ SDTypeProfile<1, 1, [SDTCisInt<0>,
+ SDTCisPtrTy<1>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+def X86eh_sjlj_longjmp : SDNode<"X86ISD::EH_SJLJ_LONGJMP",
+ SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+def X86eh_sjlj_setup_dispatch : SDNode<"X86ISD::EH_SJLJ_SETUP_DISPATCH",
+ SDTypeProfile<0, 0, []>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def X86add_flag : SDNode<"X86ISD::ADD", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86sub_flag : SDNode<"X86ISD::SUB", SDTBinaryArithWithFlags>;
+def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86umul_flag : SDNode<"X86ISD::UMUL", SDT2ResultBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86adc_flag : SDNode<"X86ISD::ADC", SDTBinaryArithWithFlagsInOut>;
+def X86sbb_flag : SDNode<"X86ISD::SBB", SDTBinaryArithWithFlagsInOut>;
+
+def X86or_flag : SDNode<"X86ISD::OR", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86xor_flag : SDNode<"X86ISD::XOR", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+
+def X86lock_add : SDNode<"X86ISD::LADD", SDTLockBinaryArithWithFlags,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+ SDNPMemOperand]>;
+def X86lock_sub : SDNode<"X86ISD::LSUB", SDTLockBinaryArithWithFlags,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+ SDNPMemOperand]>;
+def X86lock_or : SDNode<"X86ISD::LOR", SDTLockBinaryArithWithFlags,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+ SDNPMemOperand]>;
+def X86lock_xor : SDNode<"X86ISD::LXOR", SDTLockBinaryArithWithFlags,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+ SDNPMemOperand]>;
+def X86lock_and : SDNode<"X86ISD::LAND", SDTLockBinaryArithWithFlags,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+ SDNPMemOperand]>;
+
+def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>;
+def X86bextri : SDNode<"X86ISD::BEXTRI", SDTIntBinOp>;
+
+def X86bzhi : SDNode<"X86ISD::BZHI", SDTIntBinOp>;
+
+def X86pdep : SDNode<"X86ISD::PDEP", SDTIntBinOp>;
+def X86pext : SDNode<"X86ISD::PEXT", SDTIntBinOp>;
+
+def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
+
+def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA,
+ [SDNPHasChain, SDNPOutGlue]>;
+
+def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA,
+ [SDNPHasChain]>;
+
+def X86ProbedAlloca : SDNode<"X86ISD::PROBED_ALLOCA", SDT_X86PROBED_ALLOCA,
+ [SDNPHasChain]>;
+
+def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86lwpins : SDNode<"X86ISD::LWPINS",
+ SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisInt<1>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>]>,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPSideEffect]>;
+
+def X86umwait : SDNode<"X86ISD::UMWAIT",
+ SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisInt<1>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def X86tpause : SDNode<"X86ISD::TPAUSE",
+ SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisInt<1>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def X86enqcmd : SDNode<"X86ISD::ENQCMD", SDT_X86ENQCMD,
+ [SDNPHasChain, SDNPSideEffect]>;
+def X86enqcmds : SDNode<"X86ISD::ENQCMDS", SDT_X86ENQCMD,
+ [SDNPHasChain, SDNPSideEffect]>;
+def X86testui : SDNode<"X86ISD::TESTUI",
+ SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def X86aesenc128kl : SDNode<"X86ISD::AESENC128KL", SDT_X86AESENCDECKL,
+ [SDNPHasChain, SDNPMayLoad, SDNPSideEffect,
+ SDNPMemOperand]>;
+def X86aesdec128kl : SDNode<"X86ISD::AESDEC128KL", SDT_X86AESENCDECKL,
+ [SDNPHasChain, SDNPMayLoad, SDNPSideEffect,
+ SDNPMemOperand]>;
+def X86aesenc256kl : SDNode<"X86ISD::AESENC256KL", SDT_X86AESENCDECKL,
+ [SDNPHasChain, SDNPMayLoad, SDNPSideEffect,
+ SDNPMemOperand]>;
+def X86aesdec256kl : SDNode<"X86ISD::AESDEC256KL", SDT_X86AESENCDECKL,
+ [SDNPHasChain, SDNPMayLoad, SDNPSideEffect,
+ SDNPMemOperand]>;
+
+//===----------------------------------------------------------------------===//
+// X86 Operand Definitions.
+//
+
+// A version of ptr_rc which excludes SP, ESP, and RSP. This is used for
+// the index operand of an address, to conform to x86 encoding restrictions.
+def ptr_rc_nosp : PointerLikeRegClass<1>;
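The restriction follows from the instruction encoding: in the SIB byte, the index-field value that would otherwise name (E/R)SP is reserved to mean "no index register". A minimal standalone C++ sketch of that decoding rule (illustrative only, ignoring REX.X; not LLVM code):

    #include <cstdint>
    #include <optional>

    // Decode the index register number from a SIB byte. Index value 0b100 is
    // the slot (E/R)SP would occupy, and the hardware reserves it for "no
    // index" -- which is why ptr_rc_nosp excludes SP, ESP, and RSP.
    std::optional<unsigned> sibIndexReg(uint8_t sib) {
      unsigned index = (sib >> 3) & 0b111; // bits [5:3] of the SIB byte
      if (index == 0b100)
        return std::nullopt;               // no index register encodable here
      return index;
    }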
+
+// *mem - Operand definitions for the funky X86 addressing-mode operands.
+//
+def X86MemAsmOperand : AsmOperandClass {
+ let Name = "Mem";
+}
+let RenderMethod = "addMemOperands", SuperClasses = [X86MemAsmOperand] in {
+ def X86Mem8AsmOperand : AsmOperandClass { let Name = "Mem8"; }
+ def X86Mem16AsmOperand : AsmOperandClass { let Name = "Mem16"; }
+ def X86Mem32AsmOperand : AsmOperandClass { let Name = "Mem32"; }
+ def X86Mem64AsmOperand : AsmOperandClass { let Name = "Mem64"; }
+ def X86Mem80AsmOperand : AsmOperandClass { let Name = "Mem80"; }
+ def X86Mem128AsmOperand : AsmOperandClass { let Name = "Mem128"; }
+ def X86Mem256AsmOperand : AsmOperandClass { let Name = "Mem256"; }
+ def X86Mem512AsmOperand : AsmOperandClass { let Name = "Mem512"; }
+ // Gather mem operands
+ def X86Mem64_RC128Operand : AsmOperandClass { let Name = "Mem64_RC128"; }
+ def X86Mem128_RC128Operand : AsmOperandClass { let Name = "Mem128_RC128"; }
+ def X86Mem256_RC128Operand : AsmOperandClass { let Name = "Mem256_RC128"; }
+ def X86Mem128_RC256Operand : AsmOperandClass { let Name = "Mem128_RC256"; }
+ def X86Mem256_RC256Operand : AsmOperandClass { let Name = "Mem256_RC256"; }
+
+ def X86Mem64_RC128XOperand : AsmOperandClass { let Name = "Mem64_RC128X"; }
+ def X86Mem128_RC128XOperand : AsmOperandClass { let Name = "Mem128_RC128X"; }
+ def X86Mem256_RC128XOperand : AsmOperandClass { let Name = "Mem256_RC128X"; }
+ def X86Mem128_RC256XOperand : AsmOperandClass { let Name = "Mem128_RC256X"; }
+ def X86Mem256_RC256XOperand : AsmOperandClass { let Name = "Mem256_RC256X"; }
+ def X86Mem512_RC256XOperand : AsmOperandClass { let Name = "Mem512_RC256X"; }
+ def X86Mem256_RC512Operand : AsmOperandClass { let Name = "Mem256_RC512"; }
+ def X86Mem512_RC512Operand : AsmOperandClass { let Name = "Mem512_RC512"; }
+
+ def X86SibMemOperand : AsmOperandClass { let Name = "SibMem"; }
+}
+
+def X86AbsMemAsmOperand : AsmOperandClass {
+ let Name = "AbsMem";
+ let SuperClasses = [X86MemAsmOperand];
+}
+
+class X86MemOperand<string printMethod,
+ AsmOperandClass parserMatchClass = X86MemAsmOperand> : Operand<iPTR> {
+ let PrintMethod = printMethod;
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG);
+ let ParserMatchClass = parserMatchClass;
+ let OperandType = "OPERAND_MEMORY";
+}
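As the MIOperandInfo above shows, every X86 memory operand is carried as the same five sub-operands, in order: base register, scale immediate, index register, displacement, and segment register. A rough standalone sketch of that decomposition (the struct and field names are illustrative, not LLVM's own types):

    #include <cstdint>
    #include <string>

    struct X86Mem5 {
      std::string base;    // ptr_rc       e.g. "rbx"
      uint8_t     scale;   // i8imm        1, 2, 4 or 8
      std::string index;   // ptr_rc_nosp  e.g. "rcx" (never SP/ESP/RSP)
      int32_t     disp;    // i32imm       signed displacement
      std::string segment; // SEGMENT_REG  e.g. "fs", empty for the default
    };

    // AT&T syntax %fs:16(%rbx,%rcx,4) decomposes as:
    X86Mem5 example = {"rbx", 4, "rcx", 16, "fs"};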
+
+// Gather mem operands
+class X86VMemOperand<RegisterClass RC, string printMethod,
+ AsmOperandClass parserMatchClass>
+ : X86MemOperand<printMethod, parserMatchClass> {
+ let MIOperandInfo = (ops ptr_rc, i8imm, RC, i32imm, SEGMENT_REG);
+}
+
+def anymem : X86MemOperand<"printMemReference">;
+def X86any_fcmp : PatFrags<(ops node:$lhs, node:$rhs),
+ [(X86strict_fcmp node:$lhs, node:$rhs),
+ (X86fcmp node:$lhs, node:$rhs)]>;
+
+// FIXME: Right now we allow any size during parsing, but we might want to
+// restrict to only unsized memory.
+def opaquemem : X86MemOperand<"printMemReference">;
+
+def sibmem: X86MemOperand<"printMemReference", X86SibMemOperand>;
+
+def i8mem : X86MemOperand<"printbytemem", X86Mem8AsmOperand>;
+def i16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>;
+def i32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>;
+def i64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>;
+def i128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand>;
+def i256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand>;
+def i512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand>;
+def f32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>;
+def f64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>;
+def f80mem : X86MemOperand<"printtbytemem", X86Mem80AsmOperand>;
+def f128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand>;
+def f256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand>;
+def f512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand>;
+
+// Gather mem operands
+def vx64mem : X86VMemOperand<VR128, "printqwordmem", X86Mem64_RC128Operand>;
+def vx128mem : X86VMemOperand<VR128, "printxmmwordmem", X86Mem128_RC128Operand>;
+def vx256mem : X86VMemOperand<VR128, "printymmwordmem", X86Mem256_RC128Operand>;
+def vy128mem : X86VMemOperand<VR256, "printxmmwordmem", X86Mem128_RC256Operand>;
+def vy256mem : X86VMemOperand<VR256, "printymmwordmem", X86Mem256_RC256Operand>;
+
+def vx64xmem : X86VMemOperand<VR128X, "printqwordmem", X86Mem64_RC128XOperand>;
+def vx128xmem : X86VMemOperand<VR128X, "printxmmwordmem", X86Mem128_RC128XOperand>;
+def vx256xmem : X86VMemOperand<VR128X, "printymmwordmem", X86Mem256_RC128XOperand>;
+def vy128xmem : X86VMemOperand<VR256X, "printxmmwordmem", X86Mem128_RC256XOperand>;
+def vy256xmem : X86VMemOperand<VR256X, "printymmwordmem", X86Mem256_RC256XOperand>;
+def vy512xmem : X86VMemOperand<VR256X, "printzmmwordmem", X86Mem512_RC256XOperand>;
+def vz256mem : X86VMemOperand<VR512, "printymmwordmem", X86Mem256_RC512Operand>;
+def vz512mem : X86VMemOperand<VR512, "printzmmwordmem", X86Mem512_RC512Operand>;
+
+// A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead
+// of a plain GPR, so that it doesn't potentially require a REX prefix.
+def ptr_rc_norex : PointerLikeRegClass<2>;
+def ptr_rc_norex_nosp : PointerLikeRegClass<3>;
+
+def i8mem_NOREX : Operand<iPTR> {
+ let PrintMethod = "printbytemem";
+ let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm,
+ SEGMENT_REG);
+ let ParserMatchClass = X86Mem8AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
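The NOREX variant exists because of an encoding conflict: R8-R15 (and SPL/BPL/SIL/DIL) can only be named with a REX prefix, while a REX prefix makes the high-byte registers AH/BH/CH/DH unreachable, so the two cannot meet in one instruction. A standalone C++ sketch of that rule (register spellings and helper names are illustrative, not LLVM code):

    #include <set>
    #include <string>

    bool needsRexPrefix(const std::string &reg) {
      static const std::set<std::string> rexOnly = {
          "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
          "spl", "bpl", "sil", "dil"};
      return rexOnly.count(reg) != 0;
    }

    bool isHighByteReg(const std::string &reg) {
      return reg == "ah" || reg == "bh" || reg == "ch" || reg == "dh";
    }

    // e.g. "mov %ah, (%r8)" is unencodable: %r8 demands REX, %ah forbids it.
    bool encodable(const std::string &dataReg, const std::string &baseReg) {
      return !(isHighByteReg(dataReg) && needsRexPrefix(baseReg));
    }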
+
+// GPRs available for tailcall.
+// It represents GR32_TC, GR64_TC or GR64_TCW64.
+def ptr_rc_tailcall : PointerLikeRegClass<4>;
+
+// Special i32mem for addresses of load-folding tail calls. These are not
+// allowed to use callee-saved registers since they must be scheduled
+// after the callee-saved registers are popped.
+def i32mem_TC : Operand<i32> {
+ let PrintMethod = "printdwordmem";
+ let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall,
+ i32imm, SEGMENT_REG);
+ let ParserMatchClass = X86Mem32AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+// Special i64mem for addresses of load-folding tail calls. These are not
+// allowed to use callee-saved registers since they must be scheduled
+// after the callee-saved registers are popped.
+def i64mem_TC : Operand<i64> {
+ let PrintMethod = "printqwordmem";
+ let MIOperandInfo = (ops ptr_rc_tailcall, i8imm,
+ ptr_rc_tailcall, i32imm, SEGMENT_REG);
+ let ParserMatchClass = X86Mem64AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+// Special parser to detect 16-bit mode to select 16-bit displacement.
+def X86AbsMem16AsmOperand : AsmOperandClass {
+ let Name = "AbsMem16";
+ let RenderMethod = "addAbsMemOperands";
+ let SuperClasses = [X86AbsMemAsmOperand];
+}
+
+// Branch targets print as pc-relative values.
+class BranchTargetOperand<ValueType ty> : Operand<ty> {
+ let OperandType = "OPERAND_PCREL";
+ let PrintMethod = "printPCRelImm";
+ let ParserMatchClass = X86AbsMemAsmOperand;
+}
+
+def i32imm_brtarget : BranchTargetOperand<i32>;
+def i16imm_brtarget : BranchTargetOperand<i16>;
+
+// 64 bits, but only 32 bits are significant, and those bits are treated as
+// pc-relative.
+def i64i32imm_brtarget : BranchTargetOperand<i64>;
+
+def brtarget : BranchTargetOperand<OtherVT>;
+def brtarget8 : BranchTargetOperand<OtherVT>;
+def brtarget16 : BranchTargetOperand<OtherVT> {
+ let ParserMatchClass = X86AbsMem16AsmOperand;
+}
+def brtarget32 : BranchTargetOperand<OtherVT>;
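Pc-relative here means the encoded displacement is measured from the end of the branch instruction, i.e. the address of the instruction that follows it. A minimal standalone sketch of the resolution (illustrative, not LLVM's fixup code):

    #include <cstdint>

    // target = address of the next instruction + sign-extended displacement
    uint64_t branchTarget(uint64_t branchAddr, unsigned branchLen, int32_t rel) {
      return branchAddr + branchLen + static_cast<int64_t>(rel);
    }

    // Example: a 5-byte "jmp rel32" at 0x1000 with rel32 == 0x20 lands at 0x1025.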
+
+let RenderMethod = "addSrcIdxOperands" in {
+ def X86SrcIdx8Operand : AsmOperandClass {
+ let Name = "SrcIdx8";
+ let SuperClasses = [X86Mem8AsmOperand];
+ }
+ def X86SrcIdx16Operand : AsmOperandClass {
+ let Name = "SrcIdx16";
+ let SuperClasses = [X86Mem16AsmOperand];
+ }
+ def X86SrcIdx32Operand : AsmOperandClass {
+ let Name = "SrcIdx32";
+ let SuperClasses = [X86Mem32AsmOperand];
+ }
+ def X86SrcIdx64Operand : AsmOperandClass {
+ let Name = "SrcIdx64";
+ let SuperClasses = [X86Mem64AsmOperand];
+ }
+} // RenderMethod = "addSrcIdxOperands"
+
+let RenderMethod = "addDstIdxOperands" in {
+ def X86DstIdx8Operand : AsmOperandClass {
+ let Name = "DstIdx8";
+ let SuperClasses = [X86Mem8AsmOperand];
+ }
+ def X86DstIdx16Operand : AsmOperandClass {
+ let Name = "DstIdx16";
+ let SuperClasses = [X86Mem16AsmOperand];
+ }
+ def X86DstIdx32Operand : AsmOperandClass {
+ let Name = "DstIdx32";
+ let SuperClasses = [X86Mem32AsmOperand];
+ }
+ def X86DstIdx64Operand : AsmOperandClass {
+ let Name = "DstIdx64";
+ let SuperClasses = [X86Mem64AsmOperand];
+ }
+} // RenderMethod = "addDstIdxOperands"
+
+let RenderMethod = "addMemOffsOperands" in {
+ def X86MemOffs16_8AsmOperand : AsmOperandClass {
+ let Name = "MemOffs16_8";
+ let SuperClasses = [X86Mem8AsmOperand];
+ }
+ def X86MemOffs16_16AsmOperand : AsmOperandClass {
+ let Name = "MemOffs16_16";
+ let SuperClasses = [X86Mem16AsmOperand];
+ }
+ def X86MemOffs16_32AsmOperand : AsmOperandClass {
+ let Name = "MemOffs16_32";
+ let SuperClasses = [X86Mem32AsmOperand];
+ }
+ def X86MemOffs32_8AsmOperand : AsmOperandClass {
+ let Name = "MemOffs32_8";
+ let SuperClasses = [X86Mem8AsmOperand];
+ }
+ def X86MemOffs32_16AsmOperand : AsmOperandClass {
+ let Name = "MemOffs32_16";
+ let SuperClasses = [X86Mem16AsmOperand];
+ }
+ def X86MemOffs32_32AsmOperand : AsmOperandClass {
+ let Name = "MemOffs32_32";
+ let SuperClasses = [X86Mem32AsmOperand];
+ }
+ def X86MemOffs32_64AsmOperand : AsmOperandClass {
+ let Name = "MemOffs32_64";
+ let SuperClasses = [X86Mem64AsmOperand];
+ }
+ def X86MemOffs64_8AsmOperand : AsmOperandClass {
+ let Name = "MemOffs64_8";
+ let SuperClasses = [X86Mem8AsmOperand];
+ }
+ def X86MemOffs64_16AsmOperand : AsmOperandClass {
+ let Name = "MemOffs64_16";
+ let SuperClasses = [X86Mem16AsmOperand];
+ }
+ def X86MemOffs64_32AsmOperand : AsmOperandClass {
+ let Name = "MemOffs64_32";
+ let SuperClasses = [X86Mem32AsmOperand];
+ }
+ def X86MemOffs64_64AsmOperand : AsmOperandClass {
+ let Name = "MemOffs64_64";
+ let SuperClasses = [X86Mem64AsmOperand];
+ }
+} // RenderMethod = "addMemOffsOperands"
+
+class X86SrcIdxOperand<string printMethod, AsmOperandClass parserMatchClass>
+ : X86MemOperand<printMethod, parserMatchClass> {
+ let MIOperandInfo = (ops ptr_rc, SEGMENT_REG);
+}
+
+class X86DstIdxOperand<string printMethod, AsmOperandClass parserMatchClass>
+ : X86MemOperand<printMethod, parserMatchClass> {
+ let MIOperandInfo = (ops ptr_rc);
+}
+
+def srcidx8 : X86SrcIdxOperand<"printSrcIdx8", X86SrcIdx8Operand>;
+def srcidx16 : X86SrcIdxOperand<"printSrcIdx16", X86SrcIdx16Operand>;
+def srcidx32 : X86SrcIdxOperand<"printSrcIdx32", X86SrcIdx32Operand>;
+def srcidx64 : X86SrcIdxOperand<"printSrcIdx64", X86SrcIdx64Operand>;
+def dstidx8 : X86DstIdxOperand<"printDstIdx8", X86DstIdx8Operand>;
+def dstidx16 : X86DstIdxOperand<"printDstIdx16", X86DstIdx16Operand>;
+def dstidx32 : X86DstIdxOperand<"printDstIdx32", X86DstIdx32Operand>;
+def dstidx64 : X86DstIdxOperand<"printDstIdx64", X86DstIdx64Operand>;
+
+class X86MemOffsOperand<Operand immOperand, string printMethod,
+ AsmOperandClass parserMatchClass>
+ : X86MemOperand<printMethod, parserMatchClass> {
+ let MIOperandInfo = (ops immOperand, SEGMENT_REG);
+}
+
+def offset16_8 : X86MemOffsOperand<i16imm, "printMemOffs8",
+ X86MemOffs16_8AsmOperand>;
+def offset16_16 : X86MemOffsOperand<i16imm, "printMemOffs16",
+ X86MemOffs16_16AsmOperand>;
+def offset16_32 : X86MemOffsOperand<i16imm, "printMemOffs32",
+ X86MemOffs16_32AsmOperand>;
+def offset32_8 : X86MemOffsOperand<i32imm, "printMemOffs8",
+ X86MemOffs32_8AsmOperand>;
+def offset32_16 : X86MemOffsOperand<i32imm, "printMemOffs16",
+ X86MemOffs32_16AsmOperand>;
+def offset32_32 : X86MemOffsOperand<i32imm, "printMemOffs32",
+ X86MemOffs32_32AsmOperand>;
+def offset32_64 : X86MemOffsOperand<i32imm, "printMemOffs64",
+ X86MemOffs32_64AsmOperand>;
+def offset64_8 : X86MemOffsOperand<i64imm, "printMemOffs8",
+ X86MemOffs64_8AsmOperand>;
+def offset64_16 : X86MemOffsOperand<i64imm, "printMemOffs16",
+ X86MemOffs64_16AsmOperand>;
+def offset64_32 : X86MemOffsOperand<i64imm, "printMemOffs32",
+ X86MemOffs64_32AsmOperand>;
+def offset64_64 : X86MemOffsOperand<i64imm, "printMemOffs64",
+ X86MemOffs64_64AsmOperand>;
+
+def ccode : Operand<i8> {
+ let PrintMethod = "printCondCode";
+ let OperandNamespace = "X86";
+ let OperandType = "OPERAND_COND_CODE";
+}
+
+class ImmSExtAsmOperandClass : AsmOperandClass {
+ let SuperClasses = [ImmAsmOperand];
+ let RenderMethod = "addImmOperands";
+}
+
+def X86GR32orGR64AsmOperand : AsmOperandClass {
+ let Name = "GR32orGR64";
+}
+def GR32orGR64 : RegisterOperand<GR32> {
+ let ParserMatchClass = X86GR32orGR64AsmOperand;
+}
+
+def X86GR16orGR32orGR64AsmOperand : AsmOperandClass {
+ let Name = "GR16orGR32orGR64";
+}
+def GR16orGR32orGR64 : RegisterOperand<GR16> {
+ let ParserMatchClass = X86GR16orGR32orGR64AsmOperand;
+}
+
+def AVX512RCOperand : AsmOperandClass {
+ let Name = "AVX512RC";
+}
+def AVX512RC : Operand<i32> {
+ let PrintMethod = "printRoundingControl";
+ let OperandNamespace = "X86";
+ let OperandType = "OPERAND_ROUNDING_CONTROL";
+ let ParserMatchClass = AVX512RCOperand;
+}
+
+// Sign-extended immediate classes. We don't need to define the full lattice
+// here because there is no instruction with an ambiguity between ImmSExti64i32
+// and ImmSExti32i8.
+//
+// The strange ranges come from the fact that the assembler always works with
+// 64-bit immediates, but for a 16-bit target value we want to accept both "-1"
+// (which the assembler sees as -1ULL) and "0xFFFF" (-1 in 16 bits).
+
+// [0, 0x7FFFFFFF] |
+// [0xFFFFFFFF80000000, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti64i32AsmOperand : ImmSExtAsmOperandClass {
+ let Name = "ImmSExti64i32";
+}
+
+// [0, 0x0000007F] | [0x000000000000FF80, 0x000000000000FFFF] |
+// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti16i8AsmOperand : ImmSExtAsmOperandClass {
+ let Name = "ImmSExti16i8";
+ let SuperClasses = [ImmSExti64i32AsmOperand];
+}
+
+// [0, 0x0000007F] | [0x00000000FFFFFF80, 0x00000000FFFFFFFF] |
+// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti32i8AsmOperand : ImmSExtAsmOperandClass {
+ let Name = "ImmSExti32i8";
+}
+
+// [0, 0x0000007F] |
+// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass {
+ let Name = "ImmSExti64i8";
+ let SuperClasses = [ImmSExti16i8AsmOperand, ImmSExti32i8AsmOperand,
+ ImmSExti64i32AsmOperand];
+}
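Concretely, the ranges above say a 64-bit immediate is acceptable for ImmSExti64i32 exactly when it is the sign extension of some 32-bit value, and ImmSExti16i8 additionally admits the 16-bit two's-complement spellings of small negatives. A standalone C++ sketch of those two checks (mirroring what llvm::isInt would compute; the function names are illustrative, not LLVM's):

    #include <cstdint>

    bool fitsSExti64i32(uint64_t imm) {
      return imm <= 0x7FFFFFFFull || imm >= 0xFFFFFFFF80000000ull;
    }

    bool fitsSExti16i8(uint64_t imm) {
      return imm <= 0x7Full ||                          // small positives
             (imm >= 0xFF80ull && imm <= 0xFFFFull) ||  // 16-bit spellings
             imm >= 0xFFFFFFFFFFFFFF80ull;              // 64-bit sign-extended
    }

    // So both "-1" (0xFFFFFFFFFFFFFFFF) and "0xFFFF" match ImmSExti16i8, which
    // is exactly the ambiguity the comment above describes.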
+
+// 4-bit immediate used by some XOP instructions
+// [0, 0xF]
+def ImmUnsignedi4AsmOperand : AsmOperandClass {
+ let Name = "ImmUnsignedi4";
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = "InvalidImmUnsignedi4";
+}
+
+// Unsigned immediate used by SSE/AVX instructions
+// [0, 0xFF]
+// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmUnsignedi8AsmOperand : AsmOperandClass {
+ let Name = "ImmUnsignedi8";
+ let RenderMethod = "addImmOperands";
+}
+
+// A couple of more-descriptive operand definitions.
+// 16-bits but only 8 bits are significant.
+def i16i8imm : Operand<i16> {
+ let ParserMatchClass = ImmSExti16i8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+// 32-bits but only 8 bits are significant.
+def i32i8imm : Operand<i32> {
+ let ParserMatchClass = ImmSExti32i8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 64-bits but only 32 bits are significant.
+def i64i32imm : Operand<i64> {
+ let ParserMatchClass = ImmSExti64i32AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 64-bits but only 8 bits are significant.
+def i64i8imm : Operand<i64> {
+ let ParserMatchClass = ImmSExti64i8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// Unsigned 4-bit immediate used by some XOP instructions.
+def u4imm : Operand<i8> {
+ let PrintMethod = "printU8Imm";
+ let ParserMatchClass = ImmUnsignedi4AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// Unsigned 8-bit immediate used by SSE/AVX instructions.
+def u8imm : Operand<i8> {
+ let PrintMethod = "printU8Imm";
+ let ParserMatchClass = ImmUnsignedi8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 16-bit immediate but only 8-bits are significant and they are unsigned.
+// Used by BT instructions.
+def i16u8imm : Operand<i16> {
+ let PrintMethod = "printU8Imm";
+ let ParserMatchClass = ImmUnsignedi8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 32-bit immediate but only 8-bits are significant and they are unsigned.
+// Used by some SSE/AVX instructions that use intrinsics.
+def i32u8imm : Operand<i32> {
+ let PrintMethod = "printU8Imm";
+ let ParserMatchClass = ImmUnsignedi8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 64-bit immediate but only 8-bits are significant and they are unsigned.
+// Used by BT instructions.
+def i64u8imm : Operand<i64> {
+ let PrintMethod = "printU8Imm";
+ let ParserMatchClass = ImmUnsignedi8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def lea64_32mem : Operand<i32> {
+ let PrintMethod = "printMemReference";
+ let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG);
+ let ParserMatchClass = X86MemAsmOperand;
+}
+
+// Memory operands that use 64-bit pointers in both ILP32 and LP64.
+def lea64mem : Operand<i64> {
+ let PrintMethod = "printMemReference";
+ let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG);
+ let ParserMatchClass = X86MemAsmOperand;
+}
+
+let RenderMethod = "addMaskPairOperands" in {
+ def VK1PairAsmOperand : AsmOperandClass { let Name = "VK1Pair"; }
+ def VK2PairAsmOperand : AsmOperandClass { let Name = "VK2Pair"; }
+ def VK4PairAsmOperand : AsmOperandClass { let Name = "VK4Pair"; }
+ def VK8PairAsmOperand : AsmOperandClass { let Name = "VK8Pair"; }
+ def VK16PairAsmOperand : AsmOperandClass { let Name = "VK16Pair"; }
+}
+
+def VK1Pair : RegisterOperand<VK1PAIR, "printVKPair"> {
+ let ParserMatchClass = VK1PairAsmOperand;
+}
+
+def VK2Pair : RegisterOperand<VK2PAIR, "printVKPair"> {
+ let ParserMatchClass = VK2PairAsmOperand;
+}
+
+def VK4Pair : RegisterOperand<VK4PAIR, "printVKPair"> {
+ let ParserMatchClass = VK4PairAsmOperand;
+}
+
+def VK8Pair : RegisterOperand<VK8PAIR, "printVKPair"> {
+ let ParserMatchClass = VK8PairAsmOperand;
+}
+
+def VK16Pair : RegisterOperand<VK16PAIR, "printVKPair"> {
+ let ParserMatchClass = VK16PairAsmOperand;
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Complex Pattern Definitions.
+//
+
+// Define X86-specific addressing mode.
+def addr : ComplexPattern<iPTR, 5, "selectAddr", [], [SDNPWantParent]>;
+def lea32addr : ComplexPattern<i32, 5, "selectLEAAddr",
+ [add, sub, mul, X86mul_imm, shl, or, frameindex],
+ []>;
+// In 64-bit mode, 32-bit LEAs can use RIP-relative addressing.
+def lea64_32addr : ComplexPattern<i32, 5, "selectLEA64_32Addr",
+ [add, sub, mul, X86mul_imm, shl, or,
+ frameindex, X86WrapperRIP],
+ []>;
+
+def tls32addr : ComplexPattern<i32, 5, "selectTLSADDRAddr",
+ [tglobaltlsaddr], []>;
+
+def tls32baseaddr : ComplexPattern<i32, 5, "selectTLSADDRAddr",
+ [tglobaltlsaddr], []>;
+
+def lea64addr : ComplexPattern<i64, 5, "selectLEAAddr",
+ [add, sub, mul, X86mul_imm, shl, or, frameindex,
+ X86WrapperRIP], []>;
+
+def tls64addr : ComplexPattern<i64, 5, "selectTLSADDRAddr",
+ [tglobaltlsaddr], []>;
+
+def tls64baseaddr : ComplexPattern<i64, 5, "selectTLSADDRAddr",
+ [tglobaltlsaddr], []>;
+
+def vectoraddr : ComplexPattern<iPTR, 5, "selectVectorAddr", [],[SDNPWantParent]>;
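The lea*addr complex patterns above all fold their operands into the one shape the instruction can encode: base + index * scale + displacement, with the scale limited to 1, 2, 4 or 8 (so shl by 1-3 and mul by 2/4/8 become the scale field). A minimal standalone sketch of that computation (illustrative only, not LLVM code):

    #include <cstdint>

    // LEA computes an address-style value without touching EFLAGS.
    uint64_t leaValue(uint64_t base, uint64_t index, unsigned scale, int32_t disp) {
      // scale is assumed to be 1, 2, 4 or 8 -- the only encodable values.
      return base + index * scale + static_cast<int64_t>(disp);
    }

    // e.g. "x * 5 + 7" can be selected as leaValue(x, x, 4, 7): one instruction,
    // flags preserved.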
+
+// A relocatable immediate is an operand that can be relocated by the linker to
+// an immediate, such as a regular symbol in non-PIC code.
+def relocImm : ComplexPattern<iAny, 1, "selectRelocImm",
+ [X86Wrapper], [], 0>;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Predicate Definitions.
+def TruePredicate : Predicate<"true">;
+
+def HasCMov : Predicate<"Subtarget->hasCMov()">;
+def NoCMov : Predicate<"!Subtarget->hasCMov()">;
+
+def HasMMX : Predicate<"Subtarget->hasMMX()">;
+def Has3DNow : Predicate<"Subtarget->has3DNow()">;
+def Has3DNowA : Predicate<"Subtarget->has3DNowA()">;
+def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
+def UseSSE1 : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">;
+def HasSSE2 : Predicate<"Subtarget->hasSSE2()">;
+def UseSSE2 : Predicate<"Subtarget->hasSSE2() && !Subtarget->hasAVX()">;
+def HasSSE3 : Predicate<"Subtarget->hasSSE3()">;
+def UseSSE3 : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">;
+def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">;
+def UseSSSE3 : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">;
+def HasSSE41 : Predicate<"Subtarget->hasSSE41()">;
+def NoSSE41 : Predicate<"!Subtarget->hasSSE41()">;
+def UseSSE41 : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">;
+def HasSSE42 : Predicate<"Subtarget->hasSSE42()">;
+def UseSSE42 : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">;
+def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">;
+def NoAVX : Predicate<"!Subtarget->hasAVX()">;
+def HasAVX : Predicate<"Subtarget->hasAVX()">;
+def HasAVX2 : Predicate<"Subtarget->hasAVX2()">;
+def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
+def HasAVX512 : Predicate<"Subtarget->hasAVX512()">;
+def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">;
+def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">;
+def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">;
+def HasCDI : Predicate<"Subtarget->hasCDI()">;
+def HasVPOPCNTDQ : Predicate<"Subtarget->hasVPOPCNTDQ()">;
+def HasPFI : Predicate<"Subtarget->hasPFI()">;
+def HasERI : Predicate<"Subtarget->hasERI()">;
+def HasDQI : Predicate<"Subtarget->hasDQI()">;
+def NoDQI : Predicate<"!Subtarget->hasDQI()">;
+def HasBWI : Predicate<"Subtarget->hasBWI()">;
+def NoBWI : Predicate<"!Subtarget->hasBWI()">;
+def HasVLX : Predicate<"Subtarget->hasVLX()">;
+def NoVLX : Predicate<"!Subtarget->hasVLX()">;
+def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">;
+def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">;
+def PKU : Predicate<"Subtarget->hasPKU()">;
+def HasVNNI : Predicate<"Subtarget->hasVNNI()">;
+def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">;
+def HasBF16 : Predicate<"Subtarget->hasBF16()">;
+def HasAVXVNNI    : Predicate<"Subtarget->hasAVXVNNI()">;
+def NoVLX_Or_NoVNNI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVNNI()">;
+
+def HasBITALG : Predicate<"Subtarget->hasBITALG()">;
+def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
+def HasAES : Predicate<"Subtarget->hasAES()">;
+def HasVAES : Predicate<"Subtarget->hasVAES()">;
+def NoVLX_Or_NoVAES : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVAES()">;
+def HasFXSR : Predicate<"Subtarget->hasFXSR()">;
+def HasXSAVE : Predicate<"Subtarget->hasXSAVE()">;
+def HasXSAVEOPT : Predicate<"Subtarget->hasXSAVEOPT()">;
+def HasXSAVEC : Predicate<"Subtarget->hasXSAVEC()">;
+def HasXSAVES : Predicate<"Subtarget->hasXSAVES()">;
+def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">;
+def NoVLX_Or_NoVPCLMULQDQ :
+ Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVPCLMULQDQ()">;
+def HasVPCLMULQDQ : Predicate<"Subtarget->hasVPCLMULQDQ()">;
+def HasGFNI : Predicate<"Subtarget->hasGFNI()">;
+def HasFMA : Predicate<"Subtarget->hasFMA()">;
+def HasFMA4 : Predicate<"Subtarget->hasFMA4()">;
+def NoFMA4 : Predicate<"!Subtarget->hasFMA4()">;
+def HasXOP : Predicate<"Subtarget->hasXOP()">;
+def HasTBM : Predicate<"Subtarget->hasTBM()">;
+def NoTBM : Predicate<"!Subtarget->hasTBM()">;
+def HasLWP : Predicate<"Subtarget->hasLWP()">;
+def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">;
+def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">;
+def HasF16C : Predicate<"Subtarget->hasF16C()">;
+def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">;
+def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">;
+def HasBMI : Predicate<"Subtarget->hasBMI()">;
+def HasBMI2 : Predicate<"Subtarget->hasBMI2()">;
+def NoBMI2 : Predicate<"!Subtarget->hasBMI2()">;
+def HasVBMI : Predicate<"Subtarget->hasVBMI()">;
+def HasVBMI2 : Predicate<"Subtarget->hasVBMI2()">;
+def HasIFMA : Predicate<"Subtarget->hasIFMA()">;
+def HasRTM : Predicate<"Subtarget->hasRTM()">;
+def HasADX : Predicate<"Subtarget->hasADX()">;
+def HasSHA : Predicate<"Subtarget->hasSHA()">;
+def HasSGX : Predicate<"Subtarget->hasSGX()">;
+def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">;
+def HasSSEPrefetch : Predicate<"Subtarget->hasSSEPrefetch()">;
+def NoSSEPrefetch : Predicate<"!Subtarget->hasSSEPrefetch()">;
+def HasPrefetchW : Predicate<"Subtarget->hasPrefetchW()">;
+def HasPREFETCHWT1 : Predicate<"Subtarget->hasPREFETCHWT1()">;
+def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">;
+def HasMWAITX : Predicate<"Subtarget->hasMWAITX()">;
+def HasCLZERO : Predicate<"Subtarget->hasCLZERO()">;
+def HasCLDEMOTE : Predicate<"Subtarget->hasCLDEMOTE()">;
+def HasMOVDIRI : Predicate<"Subtarget->hasMOVDIRI()">;
+def HasMOVDIR64B : Predicate<"Subtarget->hasMOVDIR64B()">;
+def HasPTWRITE : Predicate<"Subtarget->hasPTWRITE()">;
+def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
+def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
+def HasSHSTK : Predicate<"Subtarget->hasSHSTK()">;
+def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">;
+def HasCLWB : Predicate<"Subtarget->hasCLWB()">;
+def HasWBNOINVD : Predicate<"Subtarget->hasWBNOINVD()">;
+def HasRDPID : Predicate<"Subtarget->hasRDPID()">;
+def HasWAITPKG : Predicate<"Subtarget->hasWAITPKG()">;
+def HasINVPCID : Predicate<"Subtarget->hasINVPCID()">;
+def HasCmpxchg8b : Predicate<"Subtarget->hasCmpxchg8b()">;
+def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
+def HasPCONFIG : Predicate<"Subtarget->hasPCONFIG()">;
+def HasENQCMD : Predicate<"Subtarget->hasENQCMD()">;
+def HasKL : Predicate<"Subtarget->hasKL()">;
+def HasWIDEKL : Predicate<"Subtarget->hasWIDEKL()">;
+def HasHRESET : Predicate<"Subtarget->hasHRESET()">;
+def HasSERIALIZE : Predicate<"Subtarget->hasSERIALIZE()">;
+def HasTSXLDTRK : Predicate<"Subtarget->hasTSXLDTRK()">;
+def HasAMXTILE : Predicate<"Subtarget->hasAMXTILE()">;
+def HasAMXBF16 : Predicate<"Subtarget->hasAMXBF16()">;
+def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">;
+def HasUINTR : Predicate<"Subtarget->hasUINTR()">;
+def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
+ AssemblerPredicate<(all_of (not Mode64Bit)), "Not 64-bit mode">;
+def In64BitMode : Predicate<"Subtarget->is64Bit()">,
+ AssemblerPredicate<(all_of Mode64Bit), "64-bit mode">;
+def IsLP64 : Predicate<"Subtarget->isTarget64BitLP64()">;
+def NotLP64 : Predicate<"!Subtarget->isTarget64BitLP64()">;
+def In16BitMode : Predicate<"Subtarget->is16Bit()">,
+ AssemblerPredicate<(all_of Mode16Bit), "16-bit mode">;
+def Not16BitMode : Predicate<"!Subtarget->is16Bit()">,
+ AssemblerPredicate<(all_of (not Mode16Bit)), "Not 16-bit mode">;
+def In32BitMode : Predicate<"Subtarget->is32Bit()">,
+ AssemblerPredicate<(all_of Mode32Bit), "32-bit mode">;
+def IsWin64 : Predicate<"Subtarget->isTargetWin64()">;
+def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">;
+def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||"
+ "Subtarget->getFrameLowering()->hasFP(*MF)"> {
+ let RecomputePerFunction = 1;
+}
+def IsPS4 : Predicate<"Subtarget->isTargetPS4()">;
+def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">;
+def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
+def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">;
+def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">;
+def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">;
+def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
+ "TM.getCodeModel() == CodeModel::Kernel">;
+def IsNotPIC : Predicate<"!TM.isPositionIndependent()">;
+
+// We could compute these on a per-module basis, but doing so requires accessing
+// the Function object through the <Target>Subtarget, and objections were raised
+// to that (see post-commit review comments for r301750).
+let RecomputePerFunction = 1 in {
+ def OptForSize : Predicate<"shouldOptForSize(MF)">;
+ def OptForMinSize : Predicate<"MF->getFunction().hasMinSize()">;
+ def OptForSpeed : Predicate<"!shouldOptForSize(MF)">;
+ def UseIncDec : Predicate<"!Subtarget->slowIncDec() || "
+ "shouldOptForSize(MF)">;
+ def NoSSE41_Or_OptForSize : Predicate<"shouldOptForSize(MF) || "
+ "!Subtarget->hasSSE41()">;
+}
+
+def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">;
+def FavorMemIndirectCall : Predicate<"!Subtarget->slowTwoMemOps()">;
+def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
+def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
+def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
+def HasERMSB : Predicate<"Subtarget->hasERMSB()">;
+def HasFSRM : Predicate<"Subtarget->hasFSRM()">;
+def HasMFence : Predicate<"Subtarget->hasMFence()">;
+def UseIndirectThunkCalls : Predicate<"Subtarget->useIndirectThunkCalls()">;
+def NotUseIndirectThunkCalls : Predicate<"!Subtarget->useIndirectThunkCalls()">;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+include "X86InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments.
+//
+
+// X86-specific condition codes. These correspond to CondCode in
+// X86InstrInfo.h and must be kept in sync.
+def X86_COND_O : PatLeaf<(i8 0)>;
+def X86_COND_NO : PatLeaf<(i8 1)>;
+def X86_COND_B : PatLeaf<(i8 2)>; // alt. COND_C
+def X86_COND_AE : PatLeaf<(i8 3)>; // alt. COND_NC
+def X86_COND_E : PatLeaf<(i8 4)>; // alt. COND_Z
+def X86_COND_NE : PatLeaf<(i8 5)>; // alt. COND_NZ
+def X86_COND_BE : PatLeaf<(i8 6)>; // alt. COND_NA
+def X86_COND_A : PatLeaf<(i8 7)>; // alt. COND_NBE
+def X86_COND_S : PatLeaf<(i8 8)>;
+def X86_COND_NS : PatLeaf<(i8 9)>;
+def X86_COND_P : PatLeaf<(i8 10)>; // alt. COND_PE
+def X86_COND_NP : PatLeaf<(i8 11)>; // alt. COND_PO
+def X86_COND_L : PatLeaf<(i8 12)>; // alt. COND_NGE
+def X86_COND_GE : PatLeaf<(i8 13)>; // alt. COND_NL
+def X86_COND_LE : PatLeaf<(i8 14)>; // alt. COND_NG
+def X86_COND_G : PatLeaf<(i8 15)>; // alt. COND_NLE
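These 4-bit values are the hardware condition-code encodings, and they are folded directly into the opcodes of the conditional-instruction families (Jcc is 0F 8x, SETcc is 0F 9x, CMOVcc is 0F 4x after the 0F escape). A minimal sketch of that mapping (illustrative helper names, not LLVM code):

    #include <cstdint>

    // Opcode byte that follows the 0F escape for each family.
    uint8_t jccOpcode(uint8_t cc)   { return 0x80 + cc; }
    uint8_t setccOpcode(uint8_t cc) { return 0x90 + cc; }
    uint8_t cmovOpcode(uint8_t cc)  { return 0x40 + cc; }

    // Example: X86_COND_E is 4, so JE is 0F 84, SETE is 0F 94, CMOVE is 0F 44.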
+
+def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>;
+def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>;
+def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>;
+def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>;
+def i64timmSExt32 : TImmLeaf<i64, [{ return isInt<32>(Imm); }]>;
+
+def i16relocImmSExt8 : PatLeaf<(i16 relocImm), [{
+ return isSExtAbsoluteSymbolRef(8, N);
+}]>;
+def i32relocImmSExt8 : PatLeaf<(i32 relocImm), [{
+ return isSExtAbsoluteSymbolRef(8, N);
+}]>;
+def i64relocImmSExt8 : PatLeaf<(i64 relocImm), [{
+ return isSExtAbsoluteSymbolRef(8, N);
+}]>;
+def i64relocImmSExt32 : PatLeaf<(i64 relocImm), [{
+ return isSExtAbsoluteSymbolRef(32, N);
+}]>;
+
+// If we have multiple users of an immediate, it's much smaller to reuse
+// the register, rather than encode the immediate in every instruction.
+// This has the risk of increasing register pressure from stretched live
+// ranges; however, the immediates should be trivial for the RA to
+// rematerialize in the event of high register pressure.
+// TODO : This is currently enabled for stores and binary ops. There are more
+// cases for which this can be enabled, though this catches the bulk of the
+// issues.
+// TODO2 : This should really also be enabled under O2, but there's currently
+// an issue with RA where we don't pull the constants into their users
+// when we rematerialize them. I'll follow up on enabling O2 after we fix that
+// issue.
+// TODO3 : This is currently limited to single basic blocks (DAG creation
+// pulls block immediates to the top and merges them if necessary).
+// Eventually, it would be nice to allow ConstantHoisting to merge constants
+// globally for potentially added savings.
+//
+def imm_su : PatLeaf<(imm), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i64immSExt32_su : PatLeaf<(i64immSExt32), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+
+def relocImm8_su : PatLeaf<(i8 relocImm), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def relocImm16_su : PatLeaf<(i16 relocImm), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def relocImm32_su : PatLeaf<(i32 relocImm), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+
+def i16relocImmSExt8_su : PatLeaf<(i16relocImmSExt8), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i32relocImmSExt8_su : PatLeaf<(i32relocImmSExt8), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i64relocImmSExt8_su : PatLeaf<(i64relocImmSExt8), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i64relocImmSExt32_su : PatLeaf<(i64relocImmSExt32), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+
+def i16immSExt8_su : PatLeaf<(i16immSExt8), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i32immSExt8_su : PatLeaf<(i32immSExt8), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i64immSExt8_su : PatLeaf<(i64immSExt8), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+
+// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+// unsigned field.
+def i64immZExt32 : ImmLeaf<i64, [{ return isUInt<32>(Imm); }]>;
+
+def i64immZExt32SExt8 : ImmLeaf<i64, [{
+ return isUInt<32>(Imm) && isInt<8>(static_cast<int32_t>(Imm));
+}]>;
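In other words, the value must fit in an unsigned 32-bit field, and its low 32 bits, read as a signed number, must fit in 8 bits. A standalone sketch of that combined check with a worked example (mirrors isUInt<32>/isInt<8>, assuming the usual two's-complement narrowing; not the LLVM helpers themselves):

    #include <cstdint>

    bool zext32SExt8(uint64_t imm) {
      if (imm > 0xFFFFFFFFull)                 // must zero-extend from 32 bits
        return false;
      int32_t low = static_cast<int32_t>(imm); // reinterpret the low 32 bits
      return low >= -128 && low <= 127;        // must sign-extend from 8 bits
    }

    // zext32SExt8(0x00000000FFFFFF80) == true   (low 32 bits read as -128)
    // zext32SExt8(0xFFFFFFFFFFFFFF80) == false  (does not fit 32 unsigned bits)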
+
+// Helper fragments for loads.
+
+// It's safe to fold a zextload/extload from i1 into a regular i8 load. The
+// upper bits are guaranteed to be zero, and we were going to emit a MOV8rm
+// which might get folded during peephole anyway.
+def loadi8 : PatFrag<(ops node:$ptr), (i8 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ return ExtType == ISD::NON_EXTLOAD || ExtType == ISD::EXTLOAD ||
+ ExtType == ISD::ZEXTLOAD;
+}]>;
+
+// It's always safe to treat an anyext i16 load as an i32 load if the i16 is
+// known to be 32-bit aligned or better. Ditto for i8 to i16.
+def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD)
+ return true;
+ if (ExtType == ISD::EXTLOAD && EnablePromoteAnyextLoad)
+ return LD->getAlignment() >= 2 && LD->isSimple();
+ return false;
+}]>;
+
+def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD)
+ return true;
+ if (ExtType == ISD::EXTLOAD && EnablePromoteAnyextLoad)
+ return LD->getAlignment() >= 4 && LD->isSimple();
+ return false;
+}]>;
+
+def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>;
+def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>;
+def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>;
+def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>;
+def loadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr))>;
+def alignedloadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr)), [{
+ LoadSDNode *Ld = cast<LoadSDNode>(N);
+ return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();
+}]>;
+def memopf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr)), [{
+ LoadSDNode *Ld = cast<LoadSDNode>(N);
+ return Subtarget->hasSSEUnalignedMem() ||
+ Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();
+}]>;
+
+def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>;
+def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
+def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>;
+def sextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>;
+def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>;
+def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>;
+
+def zextloadi8i1 : PatFrag<(ops node:$ptr), (i8 (zextloadi1 node:$ptr))>;
+def zextloadi16i1 : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>;
+def zextloadi32i1 : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>;
+def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
+def zextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>;
+def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>;
+def zextloadi64i1 : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>;
+def zextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>;
+def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>;
+def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>;
+
+def extloadi8i1 : PatFrag<(ops node:$ptr), (i8 (extloadi1 node:$ptr))>;
+def extloadi16i1 : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>;
+def extloadi32i1 : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>;
+def extloadi16i8 : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>;
+def extloadi32i8 : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>;
+def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
+def extloadi64i1 : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>;
+def extloadi64i8 : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>;
+def extloadi64i16 : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>;
+
+// We can treat an i8/i16 extending load to i64 as a 32-bit load if it's known
+// to be 4-byte aligned or better.
+def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType != ISD::EXTLOAD)
+ return false;
+ if (LD->getMemoryVT() == MVT::i32)
+ return true;
+
+ return LD->getAlignment() >= 4 && LD->isSimple();
+}]>;
+
+
+// An 'and' node with a single use.
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
+ return N->hasOneUse();
+}]>;
+// An 'srl' node with a single use.
+def srl_su : PatFrag<(ops node:$lhs, node:$rhs), (srl node:$lhs, node:$rhs), [{
+ return N->hasOneUse();
+}]>;
+// A 'trunc' node with a single use.
+def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
+ return N->hasOneUse();
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction list.
+//
+
+// Nop
+let hasSideEffects = 0, SchedRW = [WriteNop] in {
+ def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", []>;
+ def NOOPW : I<0x1f, MRMXm, (outs), (ins i16mem:$zero),
+ "nop{w}\t$zero", []>, TB, OpSize16, NotMemoryFoldable;
+ def NOOPL : I<0x1f, MRMXm, (outs), (ins i32mem:$zero),
+ "nop{l}\t$zero", []>, TB, OpSize32, NotMemoryFoldable;
+ def NOOPQ : RI<0x1f, MRMXm, (outs), (ins i64mem:$zero),
+ "nop{q}\t$zero", []>, TB, NotMemoryFoldable,
+ Requires<[In64BitMode]>;
+  // Also allow register forms so we can assemble/disassemble them
+ def NOOPWr : I<0x1f, MRMXr, (outs), (ins GR16:$zero),
+ "nop{w}\t$zero", []>, TB, OpSize16, NotMemoryFoldable;
+ def NOOPLr : I<0x1f, MRMXr, (outs), (ins GR32:$zero),
+ "nop{l}\t$zero", []>, TB, OpSize32, NotMemoryFoldable;
+ def NOOPQr : RI<0x1f, MRMXr, (outs), (ins GR64:$zero),
+ "nop{q}\t$zero", []>, TB, NotMemoryFoldable,
+ Requires<[In64BitMode]>;
+}
+
+
+// Constructing a stack frame.
+def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl),
+ "enter\t$len, $lvl", []>, Sched<[WriteMicrocoded]>;
+
+let SchedRW = [WriteALU] in {
+let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, hasSideEffects=0 in
+def LEAVE : I<0xC9, RawFrm, (outs), (ins), "leave", []>,
+ Requires<[Not64BitMode]>;
+
+let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, hasSideEffects = 0 in
+def LEAVE64 : I<0xC9, RawFrm, (outs), (ins), "leave", []>,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//
+
+let isBarrier = 1, hasSideEffects = 1, usesCustomInserter = 1,
+ SchedRW = [WriteSystem] in
+ def Int_eh_sjlj_setup_dispatch
+ : PseudoI<(outs), (ins), [(X86eh_sjlj_setup_dispatch)]>;
+
+let Defs = [ESP], Uses = [ESP], hasSideEffects=0 in {
+let mayLoad = 1, SchedRW = [WriteLoad] in {
+def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>,
+ OpSize16;
+def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>,
+ OpSize32, Requires<[Not64BitMode]>;
+// Long form for the disassembler.
+let isCodeGenOnly = 1, ForceDisassemble = 1 in {
+def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>,
+ OpSize16, NotMemoryFoldable;
+def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>,
+ OpSize32, Requires<[Not64BitMode]>, NotMemoryFoldable;
+} // isCodeGenOnly = 1, ForceDisassemble = 1
+} // mayLoad, SchedRW
+let mayStore = 1, mayLoad = 1, SchedRW = [WriteCopy] in {
+def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", []>,
+ OpSize16;
+def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", []>,
+ OpSize32, Requires<[Not64BitMode]>;
+} // mayStore, mayLoad, SchedRW
+
+let mayStore = 1, SchedRW = [WriteStore] in {
+def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>,
+ OpSize16;
+def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>,
+ OpSize32, Requires<[Not64BitMode]>;
+// Long form for the disassembler.
+let isCodeGenOnly = 1, ForceDisassemble = 1 in {
+def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>,
+ OpSize16, NotMemoryFoldable;
+def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>,
+ OpSize32, Requires<[Not64BitMode]>, NotMemoryFoldable;
+} // isCodeGenOnly = 1, ForceDisassemble = 1
+
+def PUSH16i8 : Ii8<0x6a, RawFrm, (outs), (ins i16i8imm:$imm),
+ "push{w}\t$imm", []>, OpSize16;
+def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
+ "push{w}\t$imm", []>, OpSize16;
+
+def PUSH32i8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm),
+ "push{l}\t$imm", []>, OpSize32,
+ Requires<[Not64BitMode]>;
+def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm),
+ "push{l}\t$imm", []>, OpSize32,
+ Requires<[Not64BitMode]>;
+} // mayStore, SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteCopy] in {
+def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src", []>,
+ OpSize16;
+def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src", []>,
+ OpSize32, Requires<[Not64BitMode]>;
+} // mayLoad, mayStore, SchedRW
+
+}
+
+let mayLoad = 1, mayStore = 1, usesCustomInserter = 1,
+ SchedRW = [WriteRMW], Defs = [ESP] in {
+ let Uses = [ESP] in
+ def RDFLAGS32 : PseudoI<(outs GR32:$dst), (ins),
+ [(set GR32:$dst, (int_x86_flags_read_u32))]>,
+ Requires<[Not64BitMode]>;
+
+ let Uses = [RSP] in
+ def RDFLAGS64 : PseudoI<(outs GR64:$dst), (ins),
+ [(set GR64:$dst, (int_x86_flags_read_u64))]>,
+ Requires<[In64BitMode]>;
+}
+
+let mayLoad = 1, mayStore = 1, usesCustomInserter = 1,
+ SchedRW = [WriteRMW] in {
+ let Defs = [ESP, EFLAGS, DF], Uses = [ESP] in
+ def WRFLAGS32 : PseudoI<(outs), (ins GR32:$src),
+ [(int_x86_flags_write_u32 GR32:$src)]>,
+ Requires<[Not64BitMode]>;
+
+ let Defs = [RSP, EFLAGS, DF], Uses = [RSP] in
+ def WRFLAGS64 : PseudoI<(outs), (ins GR64:$src),
+ [(int_x86_flags_write_u64 GR64:$src)]>,
+ Requires<[In64BitMode]>;
+}
+
+let Defs = [ESP, EFLAGS, DF], Uses = [ESP], mayLoad = 1, hasSideEffects=0,
+ SchedRW = [WriteLoad] in {
+def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", []>, OpSize16;
+def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", []>, OpSize32,
+ Requires<[Not64BitMode]>;
+}
+
+let Defs = [ESP], Uses = [ESP, EFLAGS, DF], mayStore = 1, hasSideEffects=0,
+ SchedRW = [WriteStore] in {
+def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", []>, OpSize16;
+def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", []>, OpSize32,
+ Requires<[Not64BitMode]>;
+}
+
+let Defs = [RSP], Uses = [RSP], hasSideEffects=0 in {
+let mayLoad = 1, SchedRW = [WriteLoad] in {
+def POP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", []>,
+ OpSize32, Requires<[In64BitMode]>;
+// Long form for the disassembler.
+let isCodeGenOnly = 1, ForceDisassemble = 1 in {
+def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", []>,
+ OpSize32, Requires<[In64BitMode]>, NotMemoryFoldable;
+} // isCodeGenOnly = 1, ForceDisassemble = 1
+} // mayLoad, SchedRW
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteCopy] in
+def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", []>,
+ OpSize32, Requires<[In64BitMode]>;
+let mayStore = 1, SchedRW = [WriteStore] in {
+def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", []>,
+ OpSize32, Requires<[In64BitMode]>;
+// Long form for the disassembler.
+let isCodeGenOnly = 1, ForceDisassemble = 1 in {
+def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", []>,
+ OpSize32, Requires<[In64BitMode]>, NotMemoryFoldable;
+} // isCodeGenOnly = 1, ForceDisassemble = 1
+} // mayStore, SchedRW
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteCopy] in {
+def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", []>,
+ OpSize32, Requires<[In64BitMode]>;
+} // mayLoad, mayStore, SchedRW
+}
+
+let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1,
+ SchedRW = [WriteStore] in {
+def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm),
+ "push{q}\t$imm", []>, OpSize32,
+ Requires<[In64BitMode]>;
+def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm),
+ "push{q}\t$imm", []>, OpSize32,
+ Requires<[In64BitMode]>;
+}
+
+let Defs = [RSP, EFLAGS, DF], Uses = [RSP], mayLoad = 1, hasSideEffects=0 in
+def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", []>,
+ OpSize32, Requires<[In64BitMode]>, Sched<[WriteLoad]>;
+let Defs = [RSP], Uses = [RSP, EFLAGS, DF], mayStore = 1, hasSideEffects=0 in
+def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", []>,
+ OpSize32, Requires<[In64BitMode]>, Sched<[WriteStore]>;
+
+let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP],
+ mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteLoad] in {
+def POPA32 : I<0x61, RawFrm, (outs), (ins), "popal", []>,
+ OpSize32, Requires<[Not64BitMode]>;
+def POPA16 : I<0x61, RawFrm, (outs), (ins), "popaw", []>,
+ OpSize16, Requires<[Not64BitMode]>;
+}
+let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP],
+ mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
+def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pushal", []>,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHA16 : I<0x60, RawFrm, (outs), (ins), "pushaw", []>,
+ OpSize16, Requires<[Not64BitMode]>;
+}
+
+let Constraints = "$src = $dst", SchedRW = [WriteBSWAP32] in {
+// This instruction is a consequence of BSWAP32r observing operand size. The
+// encoding is valid, but the behavior is undefined.
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+def BSWAP16r_BAD : I<0xC8, AddRegFrm, (outs GR16:$dst), (ins GR16:$src),
+ "bswap{w}\t$dst", []>, OpSize16, TB;
+// GR32 = bswap GR32
+def BSWAP32r : I<0xC8, AddRegFrm, (outs GR32:$dst), (ins GR32:$src),
+ "bswap{l}\t$dst",
+ [(set GR32:$dst, (bswap GR32:$src))]>, OpSize32, TB;
+
+let SchedRW = [WriteBSWAP64] in
+def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
+ "bswap{q}\t$dst",
+ [(set GR64:$dst, (bswap GR64:$src))]>, TB;
+} // Constraints = "$src = $dst", SchedRW
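For reference, BSWAP reverses the bytes of its operand; the (bswap GR32:$src) pattern above corresponds to a full 32-bit byte reversal, and the 16-bit encoding is only kept for the disassembler because its result is undefined. A standalone sketch of the 32-bit operation (illustrative, not LLVM code):

    #include <cstdint>

    uint32_t bswap32(uint32_t v) {
      return (v >> 24) | ((v >> 8) & 0x0000FF00u) |
             ((v << 8) & 0x00FF0000u) | (v << 24);
    }

    // bswap32(0x11223344) == 0x44332211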
+
+// Bit scan instructions.
+let Defs = [EFLAGS] in {
+def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "bsf{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))]>,
+ PS, OpSize16, Sched<[WriteBSF]>;
+def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "bsf{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))]>,
+ PS, OpSize16, Sched<[WriteBSFLd]>;
+def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "bsf{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))]>,
+ PS, OpSize32, Sched<[WriteBSF]>;
+def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "bsf{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))]>,
+ PS, OpSize32, Sched<[WriteBSFLd]>;
+def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "bsf{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))]>,
+ PS, Sched<[WriteBSF]>;
+def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "bsf{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))]>,
+ PS, Sched<[WriteBSFLd]>;
+
+def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "bsr{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))]>,
+ PS, OpSize16, Sched<[WriteBSR]>;
+def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "bsr{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))]>,
+ PS, OpSize16, Sched<[WriteBSRLd]>;
+def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "bsr{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))]>,
+ PS, OpSize32, Sched<[WriteBSR]>;
+def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "bsr{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))]>,
+ PS, OpSize32, Sched<[WriteBSRLd]>;
+def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "bsr{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))]>,
+ PS, Sched<[WriteBSR]>;
+def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "bsr{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))]>,
+ PS, Sched<[WriteBSRLd]>;
+} // Defs = [EFLAGS]
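The X86bsf/X86bsr nodes modeled above return the index of the lowest/highest set bit and also produce EFLAGS (ZF is set, and the result is undefined, when the source is zero). A standalone sketch of the values they compute for a non-zero input (illustrative, not LLVM code):

    #include <cstdint>

    unsigned bsf32(uint32_t v) {        // index of the lowest set bit, v != 0
      unsigned i = 0;
      while (!(v & 1u)) { v >>= 1; ++i; }
      return i;
    }

    unsigned bsr32(uint32_t v) {        // index of the highest set bit, v != 0
      unsigned i = 31;
      while (!(v & 0x80000000u)) { v <<= 1; --i; }
      return i;
    }

    // bsf32(0x12) == 1, bsr32(0x12) == 4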
+
+let SchedRW = [WriteMicrocoded] in {
+let Defs = [EDI,ESI], Uses = [EDI,ESI,DF] in {
+def MOVSB : I<0xA4, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src),
+ "movsb\t{$src, $dst|$dst, $src}", []>;
+def MOVSW : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src),
+ "movsw\t{$src, $dst|$dst, $src}", []>, OpSize16;
+def MOVSL : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src),
+ "movs{l|d}\t{$src, $dst|$dst, $src}", []>, OpSize32;
+def MOVSQ : RI<0xA5, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src),
+ "movsq\t{$src, $dst|$dst, $src}", []>,
+ Requires<[In64BitMode]>;
+}
+
+let Defs = [EDI], Uses = [AL,EDI,DF] in
+def STOSB : I<0xAA, RawFrmDst, (outs), (ins dstidx8:$dst),
+ "stosb\t{%al, $dst|$dst, al}", []>;
+let Defs = [EDI], Uses = [AX,EDI,DF] in
+def STOSW : I<0xAB, RawFrmDst, (outs), (ins dstidx16:$dst),
+ "stosw\t{%ax, $dst|$dst, ax}", []>, OpSize16;
+let Defs = [EDI], Uses = [EAX,EDI,DF] in
+def STOSL : I<0xAB, RawFrmDst, (outs), (ins dstidx32:$dst),
+ "stos{l|d}\t{%eax, $dst|$dst, eax}", []>, OpSize32;
+let Defs = [RDI], Uses = [RAX,RDI,DF] in
+def STOSQ : RI<0xAB, RawFrmDst, (outs), (ins dstidx64:$dst),
+ "stosq\t{%rax, $dst|$dst, rax}", []>,
+ Requires<[In64BitMode]>;
+
+let Defs = [EDI,EFLAGS], Uses = [AL,EDI,DF] in
+def SCASB : I<0xAE, RawFrmDst, (outs), (ins dstidx8:$dst),
+ "scasb\t{$dst, %al|al, $dst}", []>;
+let Defs = [EDI,EFLAGS], Uses = [AX,EDI,DF] in
+def SCASW : I<0xAF, RawFrmDst, (outs), (ins dstidx16:$dst),
+ "scasw\t{$dst, %ax|ax, $dst}", []>, OpSize16;
+let Defs = [EDI,EFLAGS], Uses = [EAX,EDI,DF] in
+def SCASL : I<0xAF, RawFrmDst, (outs), (ins dstidx32:$dst),
+ "scas{l|d}\t{$dst, %eax|eax, $dst}", []>, OpSize32;
+let Defs = [EDI,EFLAGS], Uses = [RAX,EDI,DF] in
+def SCASQ : RI<0xAF, RawFrmDst, (outs), (ins dstidx64:$dst),
+ "scasq\t{$dst, %rax|rax, $dst}", []>,
+ Requires<[In64BitMode]>;
+
+let Defs = [EDI,ESI,EFLAGS], Uses = [EDI,ESI,DF] in {
+def CMPSB : I<0xA6, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src),
+ "cmpsb\t{$dst, $src|$src, $dst}", []>;
+def CMPSW : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src),
+ "cmpsw\t{$dst, $src|$src, $dst}", []>, OpSize16;
+def CMPSL : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src),
+ "cmps{l|d}\t{$dst, $src|$src, $dst}", []>, OpSize32;
+def CMPSQ : RI<0xA7, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src),
+ "cmpsq\t{$dst, $src|$src, $dst}", []>,
+ Requires<[In64BitMode]>;
+}
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Move Instructions.
+//
+let SchedRW = [WriteMove] in {
+let hasSideEffects = 0, isMoveReg = 1 in {
+def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}", []>;
+def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16;
+def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32;
+def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", []>;
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
+def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(set GR8:$dst, imm:$src)]>;
+def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, imm:$src)]>, OpSize16;
+def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, imm:$src)]>, OpSize32;
+def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, i64immSExt32:$src)]>;
+}
+let isReMaterializable = 1, isMoveImm = 1 in {
+def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
+ "movabs{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, imm:$src)]>;
+}
+
+// Longer forms that use a ModR/M byte. Needed for the disassembler.
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+def MOV8ri_alt : Ii8 <0xC6, MRM0r, (outs GR8 :$dst), (ins i8imm :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}", []>,
+ FoldGenData<"MOV8ri">;
+def MOV16ri_alt : Ii16<0xC7, MRM0r, (outs GR16:$dst), (ins i16imm:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16,
+ FoldGenData<"MOV16ri">;
+def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32,
+ FoldGenData<"MOV32ri">;
+}
+} // SchedRW
+
+let SchedRW = [WriteStore] in {
+def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(store (i8 imm_su:$src), addr:$dst)]>;
+def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(store (i16 imm_su:$src), addr:$dst)]>, OpSize16;
+def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(store (i32 imm_su:$src), addr:$dst)]>, OpSize32;
+def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(store i64immSExt32_su:$src, addr:$dst)]>,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+def : Pat<(i32 relocImm:$src), (MOV32ri relocImm:$src)>;
+def : Pat<(i64 relocImm:$src), (MOV64ri relocImm:$src)>;
+
+def : Pat<(store (i8 relocImm8_su:$src), addr:$dst),
+ (MOV8mi addr:$dst, relocImm8_su:$src)>;
+def : Pat<(store (i16 relocImm16_su:$src), addr:$dst),
+ (MOV16mi addr:$dst, relocImm16_su:$src)>;
+def : Pat<(store (i32 relocImm32_su:$src), addr:$dst),
+ (MOV32mi addr:$dst, relocImm32_su:$src)>;
+def : Pat<(store (i64 i64relocImmSExt32_su:$src), addr:$dst),
+ (MOV64mi32 addr:$dst, i64immSExt32_su:$src)>;
+
+let hasSideEffects = 0 in {
+
+/// Memory offset versions of moves. The immediate is an offset from the
+/// segment base, sized according to the address mode (2, 4, or 8 bytes).
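+/// For example, the AdSize32 form MOV8ao32 below encodes as opcode A0 followed
+/// by a 4-byte little-endian offset; in 32-bit mode, A0 00 10 00 00 loads AL
+/// from ds:0x1000.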
+let SchedRW = [WriteALU] in {
+let mayLoad = 1 in {
+let Defs = [AL] in
+def MOV8ao32 : Ii32<0xA0, RawFrmMemOffs, (outs), (ins offset32_8:$src),
+ "mov{b}\t{$src, %al|al, $src}", []>,
+ AdSize32;
+let Defs = [AX] in
+def MOV16ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_16:$src),
+ "mov{w}\t{$src, %ax|ax, $src}", []>,
+ OpSize16, AdSize32;
+let Defs = [EAX] in
+def MOV32ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_32:$src),
+ "mov{l}\t{$src, %eax|eax, $src}", []>,
+ OpSize32, AdSize32;
+let Defs = [RAX] in
+def MOV64ao32 : RIi32<0xA1, RawFrmMemOffs, (outs), (ins offset32_64:$src),
+ "mov{q}\t{$src, %rax|rax, $src}", []>,
+ AdSize32;
+
+let Defs = [AL] in
+def MOV8ao16 : Ii16<0xA0, RawFrmMemOffs, (outs), (ins offset16_8:$src),
+ "mov{b}\t{$src, %al|al, $src}", []>, AdSize16;
+let Defs = [AX] in
+def MOV16ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_16:$src),
+ "mov{w}\t{$src, %ax|ax, $src}", []>,
+ OpSize16, AdSize16;
+let Defs = [EAX] in
+def MOV32ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_32:$src),
+ "mov{l}\t{$src, %eax|eax, $src}", []>,
+ AdSize16, OpSize32;
+} // mayLoad
+let mayStore = 1 in {
+let Uses = [AL] in
+def MOV8o32a : Ii32<0xA2, RawFrmMemOffs, (outs), (ins offset32_8:$dst),
+ "mov{b}\t{%al, $dst|$dst, al}", []>, AdSize32;
+let Uses = [AX] in
+def MOV16o32a : Ii32<0xA3, RawFrmMemOffs, (outs), (ins offset32_16:$dst),
+ "mov{w}\t{%ax, $dst|$dst, ax}", []>,
+ OpSize16, AdSize32;
+let Uses = [EAX] in
+def MOV32o32a : Ii32<0xA3, RawFrmMemOffs, (outs), (ins offset32_32:$dst),
+ "mov{l}\t{%eax, $dst|$dst, eax}", []>,
+ OpSize32, AdSize32;
+let Uses = [RAX] in
+def MOV64o32a : RIi32<0xA3, RawFrmMemOffs, (outs), (ins offset32_64:$dst),
+ "mov{q}\t{%rax, $dst|$dst, rax}", []>,
+ AdSize32;
+
+let Uses = [AL] in
+def MOV8o16a : Ii16<0xA2, RawFrmMemOffs, (outs), (ins offset16_8:$dst),
+ "mov{b}\t{%al, $dst|$dst, al}", []>, AdSize16;
+let Uses = [AX] in
+def MOV16o16a : Ii16<0xA3, RawFrmMemOffs, (outs), (ins offset16_16:$dst),
+ "mov{w}\t{%ax, $dst|$dst, ax}", []>,
+ OpSize16, AdSize16;
+let Uses = [EAX] in
+def MOV32o16a : Ii16<0xA3, RawFrmMemOffs, (outs), (ins offset16_32:$dst),
+ "mov{l}\t{%eax, $dst|$dst, eax}", []>,
+ OpSize32, AdSize16;
+} // mayStore
+
+// These forms all have full 64-bit absolute addresses in their instructions
+// and use the movabs mnemonic to indicate this specific form.
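+// For example, MOV8ao64 encodes as opcode A0 followed by the full 8-byte
+// little-endian address; these moffs forms are the only movs whose memory
+// operand is a 64-bit absolute address embedded directly in the instruction.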
+let mayLoad = 1 in {
+let Defs = [AL] in
+def MOV8ao64 : Ii64<0xA0, RawFrmMemOffs, (outs), (ins offset64_8:$src),
+ "movabs{b}\t{$src, %al|al, $src}", []>,
+ AdSize64;
+let Defs = [AX] in
+def MOV16ao64 : Ii64<0xA1, RawFrmMemOffs, (outs), (ins offset64_16:$src),
+ "movabs{w}\t{$src, %ax|ax, $src}", []>,
+ OpSize16, AdSize64;
+let Defs = [EAX] in
+def MOV32ao64 : Ii64<0xA1, RawFrmMemOffs, (outs), (ins offset64_32:$src),
+ "movabs{l}\t{$src, %eax|eax, $src}", []>,
+ OpSize32, AdSize64;
+let Defs = [RAX] in
+def MOV64ao64 : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64_64:$src),
+ "movabs{q}\t{$src, %rax|rax, $src}", []>,
+ AdSize64;
+} // mayLoad
+
+let mayStore = 1 in {
+let Uses = [AL] in
+def MOV8o64a : Ii64<0xA2, RawFrmMemOffs, (outs), (ins offset64_8:$dst),
+ "movabs{b}\t{%al, $dst|$dst, al}", []>,
+ AdSize64;
+let Uses = [AX] in
+def MOV16o64a : Ii64<0xA3, RawFrmMemOffs, (outs), (ins offset64_16:$dst),
+ "movabs{w}\t{%ax, $dst|$dst, ax}", []>,
+ OpSize16, AdSize64;
+let Uses = [EAX] in
+def MOV32o64a : Ii64<0xA3, RawFrmMemOffs, (outs), (ins offset64_32:$dst),
+ "movabs{l}\t{%eax, $dst|$dst, eax}", []>,
+ OpSize32, AdSize64;
+let Uses = [RAX] in
+def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs), (ins offset64_64:$dst),
+ "movabs{q}\t{%rax, $dst|$dst, rax}", []>,
+ AdSize64;
+} // mayStore
+} // SchedRW
+} // hasSideEffects = 0
+
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+ SchedRW = [WriteMove], isMoveReg = 1 in {
+def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
+ "mov{b}\t{$src, $dst|$dst, $src}", []>,
+ FoldGenData<"MOV8rr">;
+def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16,
+ FoldGenData<"MOV16rr">;
+def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32,
+ FoldGenData<"MOV32rr">;
+def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", []>,
+ FoldGenData<"MOV64rr">;
+}
+
+// Reversed version with ".s" suffix for GAS compatibility.
+def : InstAlias<"mov{b}.s\t{$src, $dst|$dst, $src}",
+ (MOV8rr_REV GR8:$dst, GR8:$src), 0>;
+def : InstAlias<"mov{w}.s\t{$src, $dst|$dst, $src}",
+ (MOV16rr_REV GR16:$dst, GR16:$src), 0>;
+def : InstAlias<"mov{l}.s\t{$src, $dst|$dst, $src}",
+ (MOV32rr_REV GR32:$dst, GR32:$src), 0>;
+def : InstAlias<"mov{q}.s\t{$src, $dst|$dst, $src}",
+ (MOV64rr_REV GR64:$dst, GR64:$src), 0>;
+def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}",
+ (MOV8rr_REV GR8:$dst, GR8:$src), 0, "att">;
+def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}",
+ (MOV16rr_REV GR16:$dst, GR16:$src), 0, "att">;
+def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}",
+ (MOV32rr_REV GR32:$dst, GR32:$src), 0, "att">;
+def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}",
+ (MOV64rr_REV GR64:$dst, GR64:$src), 0, "att">;
+
+let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
+def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(set GR8:$dst, (loadi8 addr:$src))]>;
+def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (loadi16 addr:$src))]>, OpSize16;
+def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (loadi32 addr:$src))]>, OpSize32;
+def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (load addr:$src))]>;
+}
+
+let SchedRW = [WriteStore] in {
+def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(store GR8:$src, addr:$dst)]>;
+def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(store GR16:$src, addr:$dst)]>, OpSize16;
+def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(store GR32:$src, addr:$dst)]>, OpSize32;
+def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(store GR64:$src, addr:$dst)]>;
+} // SchedRW
+
+// Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so
+// that they can be used for copying and storing h registers, which can't be
+// encoded when a REX prefix is present.
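+// For example, "mov %ah, %sil" cannot be encoded: %sil is only reachable with a
+// REX prefix, and AH/BH/CH/DH are not encodable in any REX-prefixed instruction.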
+let isCodeGenOnly = 1 in {
+let hasSideEffects = 0, isMoveReg = 1 in
+def MOV8rr_NOREX : I<0x88, MRMDestReg,
+ (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteMove]>;
+let mayStore = 1, hasSideEffects = 0 in
+def MOV8mr_NOREX : I<0x88, MRMDestMem,
+ (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteStore]>;
+let mayLoad = 1, hasSideEffects = 0,
+ canFoldAsLoad = 1, isReMaterializable = 1 in
+def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
+ (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteLoad]>;
+}
+
+
+// Condition code (flag) transfer ops: LAHF/SAHF copy the status flags to/from AH.
+let SchedRW = [WriteLAHFSAHF] in {
+let Defs = [EFLAGS], Uses = [AH], hasSideEffects = 0 in
+def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", []>, // flags = AH
+ Requires<[HasLAHFSAHF]>;
+let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in
+def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>, // AH = flags
+ Requires<[HasLAHFSAHF]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Bit test instructions: BT, BTS, BTR, BTC.
+
+let Defs = [EFLAGS] in {
+let SchedRW = [WriteBitTest] in {
+def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))]>,
+ OpSize16, TB, NotMemoryFoldable;
+def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))]>,
+ OpSize32, TB, NotMemoryFoldable;
+def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))]>, TB,
+ NotMemoryFoldable;
+} // SchedRW
+
+// Unlike the register+register form, the memory+register form of the bt
+// instruction does not ignore the high bits of the index. From ISel's
+// perspective, this is pretty bizarre. Make these instructions disassembly-only
+// for now. They are also slow on modern CPUs, which is another reason to avoid
+// generating them.
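+// For example, "btl %eax, (%rdi)" with EAX == 100 tests bit 4 of the byte at
+// 12(%rdi), whereas the register form "btl %eax, %ebx" reduces the index
+// modulo 32 and tests bit 4 of EBX.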
+
+let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteBitTestRegLd] in {
+ def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ []>, OpSize16, TB, NotMemoryFoldable;
+ def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ []>, OpSize32, TB, NotMemoryFoldable;
+ def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ []>, TB, NotMemoryFoldable;
+}
+
+let SchedRW = [WriteBitTest] in {
+def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16u8imm:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR16:$src1, imm:$src2))]>,
+ OpSize16, TB;
+def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32u8imm:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR32:$src1, imm:$src2))]>,
+ OpSize32, TB;
+def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64u8imm:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR64:$src1, imm:$src2))]>, TB;
+} // SchedRW
+
+// Note that these immediate forms aren't slow: the slowness above only applies
+// when the other operand is a register. With an immediate, bt is still fast.
+let SchedRW = [WriteBitTestImmLd] in {
+def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16u8imm:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt (loadi16 addr:$src1),
+ imm:$src2))]>,
+ OpSize16, TB;
+def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32u8imm:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt (loadi32 addr:$src1),
+ imm:$src2))]>,
+ OpSize32, TB;
+def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64u8imm:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt (loadi64 addr:$src1),
+ imm:$src2))]>, TB,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+let hasSideEffects = 0 in {
+let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in {
+def BTC16rr : I<0xBB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "btc{w}\t{$src2, $src1|$src1, $src2}", []>,
+ OpSize16, TB, NotMemoryFoldable;
+def BTC32rr : I<0xBB, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "btc{l}\t{$src2, $src1|$src1, $src2}", []>,
+ OpSize32, TB, NotMemoryFoldable;
+def BTC64rr : RI<0xBB, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
+ NotMemoryFoldable;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteBitTestSetRegRMW] in {
+def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+ "btc{w}\t{$src2, $src1|$src1, $src2}", []>,
+ OpSize16, TB, NotMemoryFoldable;
+def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+ "btc{l}\t{$src2, $src1|$src1, $src2}", []>,
+ OpSize32, TB, NotMemoryFoldable;
+def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
+ NotMemoryFoldable;
+}
+
+let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in {
+def BTC16ri8 : Ii8<0xBA, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i16u8imm:$src2),
+ "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
+def BTC32ri8 : Ii8<0xBA, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i32u8imm:$src2),
+ "btc{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
+def BTC64ri8 : RIi8<0xBA, MRM7r, (outs GR64:$dst), (ins GR64:$src1, i64u8imm:$src2),
+ "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteBitTestSetImmRMW] in {
+def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16u8imm:$src2),
+ "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
+def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32u8imm:$src2),
+ "btc{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
+def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64u8imm:$src2),
+ "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
+ Requires<[In64BitMode]>;
+}
+
+let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in {
+def BTR16rr : I<0xB3, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "btr{w}\t{$src2, $src1|$src1, $src2}", []>,
+ OpSize16, TB, NotMemoryFoldable;
+def BTR32rr : I<0xB3, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "btr{l}\t{$src2, $src1|$src1, $src2}", []>,
+ OpSize32, TB, NotMemoryFoldable;
+def BTR64rr : RI<0xB3, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
+ NotMemoryFoldable;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteBitTestSetRegRMW] in {
+def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+ "btr{w}\t{$src2, $src1|$src1, $src2}", []>,
+ OpSize16, TB, NotMemoryFoldable;
+def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+ "btr{l}\t{$src2, $src1|$src1, $src2}", []>,
+ OpSize32, TB, NotMemoryFoldable;
+def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
+ NotMemoryFoldable;
+}
+
+let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in {
+def BTR16ri8 : Ii8<0xBA, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i16u8imm:$src2),
+ "btr{w}\t{$src2, $src1|$src1, $src2}", []>,
+ OpSize16, TB;
+def BTR32ri8 : Ii8<0xBA, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i32u8imm:$src2),
+ "btr{l}\t{$src2, $src1|$src1, $src2}", []>,
+ OpSize32, TB;
+def BTR64ri8 : RIi8<0xBA, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64u8imm:$src2),
+ "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteBitTestSetImmRMW] in {
+def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16u8imm:$src2),
+ "btr{w}\t{$src2, $src1|$src1, $src2}", []>,
+ OpSize16, TB;
+def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32u8imm:$src2),
+ "btr{l}\t{$src2, $src1|$src1, $src2}", []>,
+ OpSize32, TB;
+def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64u8imm:$src2),
+ "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
+ Requires<[In64BitMode]>;
+}
+
+let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in {
+def BTS16rr : I<0xAB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "bts{w}\t{$src2, $src1|$src1, $src2}", []>,
+ OpSize16, TB, NotMemoryFoldable;
+def BTS32rr : I<0xAB, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "bts{l}\t{$src2, $src1|$src1, $src2}", []>,
+ OpSize32, TB, NotMemoryFoldable;
+def BTS64rr : RI<0xAB, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
+ NotMemoryFoldable;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteBitTestSetRegRMW] in {
+def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+ "bts{w}\t{$src2, $src1|$src1, $src2}", []>,
+ OpSize16, TB, NotMemoryFoldable;
+def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+ "bts{l}\t{$src2, $src1|$src1, $src2}", []>,
+ OpSize32, TB, NotMemoryFoldable;
+def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
+ NotMemoryFoldable;
+}
+
+let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in {
+def BTS16ri8 : Ii8<0xBA, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16u8imm:$src2),
+ "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
+def BTS32ri8 : Ii8<0xBA, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32u8imm:$src2),
+ "bts{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
+def BTS64ri8 : RIi8<0xBA, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i64u8imm:$src2),
+ "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteBitTestSetImmRMW] in {
+def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16u8imm:$src2),
+ "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
+def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32u8imm:$src2),
+ "bts{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
+def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64u8imm:$src2),
+ "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
+ Requires<[In64BitMode]>;
+}
+} // hasSideEffects = 0
+} // Defs = [EFLAGS]
+
+
+//===----------------------------------------------------------------------===//
+// Atomic support
+//
+
+// Atomic swap. These are just normal xchg instructions; because a memory
+// operand is referenced, the processor guarantees the exchange is atomic.
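+// For example, "xchg %al, (%rdi)" is atomic without an explicit lock prefix;
+// the CPU asserts LOCK automatically whenever xchg references memory.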
+multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag> {
+ let Constraints = "$val = $dst", SchedRW = [WriteALULd, WriteRMW] in {
+ def NAME#8rm : I<opc8, MRMSrcMem, (outs GR8:$dst),
+ (ins GR8:$val, i8mem:$ptr),
+ !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR8:$dst,
+ (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))]>;
+ def NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$val, i16mem:$ptr),
+ !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR16:$dst,
+ (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))]>,
+ OpSize16;
+ def NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$val, i32mem:$ptr),
+ !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR32:$dst,
+ (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))]>,
+ OpSize32;
+ def NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$val, i64mem:$ptr),
+ !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR64:$dst,
+ (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))]>;
+ }
+}
+
+defm XCHG : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap">, NotMemoryFoldable;
+
+// Swap between registers.
+let SchedRW = [WriteXCHG] in {
+let Constraints = "$src1 = $dst1, $src2 = $dst2", hasSideEffects = 0 in {
+def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst1, GR8:$dst2),
+ (ins GR8:$src1, GR8:$src2),
+ "xchg{b}\t{$src2, $src1|$src1, $src2}", []>, NotMemoryFoldable;
+def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst1, GR16:$dst2),
+ (ins GR16:$src1, GR16:$src2),
+ "xchg{w}\t{$src2, $src1|$src1, $src2}", []>,
+ OpSize16, NotMemoryFoldable;
+def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst1, GR32:$dst2),
+ (ins GR32:$src1, GR32:$src2),
+ "xchg{l}\t{$src2, $src1|$src1, $src2}", []>,
+ OpSize32, NotMemoryFoldable;
+def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst1, GR64:$dst2),
+                  (ins GR64:$src1, GR64:$src2),
+ "xchg{q}\t{$src2, $src1|$src1, $src2}", []>, NotMemoryFoldable;
+}
+
+// Swap between EAX and other registers.
+let Constraints = "$src = $dst", hasSideEffects = 0 in {
+let Uses = [AX], Defs = [AX] in
+def XCHG16ar : I<0x90, AddRegFrm, (outs GR16:$dst), (ins GR16:$src),
+ "xchg{w}\t{$src, %ax|ax, $src}", []>, OpSize16;
+let Uses = [EAX], Defs = [EAX] in
+def XCHG32ar : I<0x90, AddRegFrm, (outs GR32:$dst), (ins GR32:$src),
+ "xchg{l}\t{$src, %eax|eax, $src}", []>, OpSize32;
+let Uses = [RAX], Defs = [RAX] in
+def XCHG64ar : RI<0x90, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
+ "xchg{q}\t{$src, %rax|rax, $src}", []>;
+}
+} // SchedRW
+
+let hasSideEffects = 0, Constraints = "$src1 = $dst1, $src2 = $dst2",
+ Defs = [EFLAGS], SchedRW = [WriteXCHG] in {
+def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst1, GR8:$dst2),
+ (ins GR8:$src1, GR8:$src2),
+ "xadd{b}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst1, GR16:$dst2),
+ (ins GR16:$src1, GR16:$src2),
+ "xadd{w}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize16;
+def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst1, GR32:$dst2),
+ (ins GR32:$src1, GR32:$src2),
+ "xadd{l}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize32;
+def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst1, GR64:$dst2),
+ (ins GR64:$src1, GR64:$src2),
+ "xadd{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, hasSideEffects = 0, Constraints = "$val = $dst",
+ Defs = [EFLAGS], SchedRW = [WriteALULd, WriteRMW] in {
+def XADD8rm : I<0xC0, MRMSrcMem, (outs GR8:$dst),
+ (ins GR8:$val, i8mem:$ptr),
+ "xadd{b}\t{$val, $ptr|$ptr, $val}", []>, TB;
+def XADD16rm : I<0xC1, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$val, i16mem:$ptr),
+ "xadd{w}\t{$val, $ptr|$ptr, $val}", []>, TB,
+ OpSize16;
+def XADD32rm : I<0xC1, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$val, i32mem:$ptr),
+ "xadd{l}\t{$val, $ptr|$ptr, $val}", []>, TB,
+ OpSize32;
+def XADD64rm : RI<0xC1, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$val, i64mem:$ptr),
+ "xadd{q}\t{$val, $ptr|$ptr, $val}", []>, TB;
+
+}
+
+let SchedRW = [WriteCMPXCHG], hasSideEffects = 0 in {
+let Defs = [AL, EFLAGS], Uses = [AL] in
+def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
+ "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB,
+ NotMemoryFoldable;
+let Defs = [AX, EFLAGS], Uses = [AX] in
+def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+ "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16,
+ NotMemoryFoldable;
+let Defs = [EAX, EFLAGS], Uses = [EAX] in
+def CMPXCHG32rr : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+ "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32,
+ NotMemoryFoldable;
+let Defs = [RAX, EFLAGS], Uses = [RAX] in
+def CMPXCHG64rr : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+ "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB,
+ NotMemoryFoldable;
+} // SchedRW, hasSideEffects
+
+let SchedRW = [WriteCMPXCHGRMW], mayLoad = 1, mayStore = 1,
+ hasSideEffects = 0 in {
+let Defs = [AL, EFLAGS], Uses = [AL] in
+def CMPXCHG8rm : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
+ "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB,
+ NotMemoryFoldable;
+let Defs = [AX, EFLAGS], Uses = [AX] in
+def CMPXCHG16rm : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16,
+ NotMemoryFoldable;
+let Defs = [EAX, EFLAGS], Uses = [EAX] in
+def CMPXCHG32rm : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32,
+ NotMemoryFoldable;
+let Defs = [RAX, EFLAGS], Uses = [RAX] in
+def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB,
+ NotMemoryFoldable;
+
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in
+def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
+ "cmpxchg8b\t$dst", []>, TB, Requires<[HasCmpxchg8b]>;
+
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
+// NOTE: In64BitMode check needed for the AssemblerPredicate.
+def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
+ "cmpxchg16b\t$dst", []>,
+ TB, Requires<[HasCmpxchg16b,In64BitMode]>;
+} // SchedRW, mayLoad, mayStore, hasSideEffects
+
+
+// Lock instruction prefix
+let SchedRW = [WriteMicrocoded] in
+def LOCK_PREFIX : I<0xF0, PrefixByte, (outs), (ins), "lock", []>;
+
+let SchedRW = [WriteNop] in {
+
+// Rex64 instruction prefix
+def REX64_PREFIX : I<0x48, PrefixByte, (outs), (ins), "rex64", []>,
+ Requires<[In64BitMode]>;
+
+// Data16 instruction prefix
+def DATA16_PREFIX : I<0x66, PrefixByte, (outs), (ins), "data16", []>;
+} // SchedRW
+
+// Repeat string operation instruction prefixes
+let Defs = [ECX], Uses = [ECX,DF], SchedRW = [WriteMicrocoded] in {
+// Repeat (used with INS, OUTS, MOVS, LODS and STOS)
+def REP_PREFIX : I<0xF3, PrefixByte, (outs), (ins), "rep", []>;
+// Repeat while not equal (used with CMPS and SCAS)
+def REPNE_PREFIX : I<0xF2, PrefixByte, (outs), (ins), "repne", []>;
+}
+
+// String manipulation instructions
+let SchedRW = [WriteMicrocoded] in {
+let Defs = [AL,ESI], Uses = [ESI,DF] in
+def LODSB : I<0xAC, RawFrmSrc, (outs), (ins srcidx8:$src),
+ "lodsb\t{$src, %al|al, $src}", []>;
+let Defs = [AX,ESI], Uses = [ESI,DF] in
+def LODSW : I<0xAD, RawFrmSrc, (outs), (ins srcidx16:$src),
+ "lodsw\t{$src, %ax|ax, $src}", []>, OpSize16;
+let Defs = [EAX,ESI], Uses = [ESI,DF] in
+def LODSL : I<0xAD, RawFrmSrc, (outs), (ins srcidx32:$src),
+ "lods{l|d}\t{$src, %eax|eax, $src}", []>, OpSize32;
+let Defs = [RAX,ESI], Uses = [ESI,DF] in
+def LODSQ : RI<0xAD, RawFrmSrc, (outs), (ins srcidx64:$src),
+ "lodsq\t{$src, %rax|rax, $src}", []>,
+ Requires<[In64BitMode]>;
+}
+
+let SchedRW = [WriteSystem] in {
+let Defs = [ESI], Uses = [DX,ESI,DF] in {
+def OUTSB : I<0x6E, RawFrmSrc, (outs), (ins srcidx8:$src),
+ "outsb\t{$src, %dx|dx, $src}", []>;
+def OUTSW : I<0x6F, RawFrmSrc, (outs), (ins srcidx16:$src),
+ "outsw\t{$src, %dx|dx, $src}", []>, OpSize16;
+def OUTSL : I<0x6F, RawFrmSrc, (outs), (ins srcidx32:$src),
+ "outs{l|d}\t{$src, %dx|dx, $src}", []>, OpSize32;
+}
+
+let Defs = [EDI], Uses = [DX,EDI,DF] in {
+def INSB : I<0x6C, RawFrmDst, (outs), (ins dstidx8:$dst),
+ "insb\t{%dx, $dst|$dst, dx}", []>;
+def INSW : I<0x6D, RawFrmDst, (outs), (ins dstidx16:$dst),
+ "insw\t{%dx, $dst|$dst, dx}", []>, OpSize16;
+def INSL : I<0x6D, RawFrmDst, (outs), (ins dstidx32:$dst),
+ "ins{l|d}\t{%dx, $dst|$dst, dx}", []>, OpSize32;
+}
+}
+
+// EFLAGS management instructions.
+let SchedRW = [WriteALU], Defs = [EFLAGS], Uses = [EFLAGS] in {
+def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", []>;
+def STC : I<0xF9, RawFrm, (outs), (ins), "stc", []>;
+def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", []>;
+}
+
+// DF management instructions.
+let SchedRW = [WriteALU], Defs = [DF] in {
+def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", []>;
+def STD : I<0xFD, RawFrm, (outs), (ins), "std", []>;
+}
+
+// Table lookup instructions
+let Uses = [AL,EBX], Defs = [AL], hasSideEffects = 0, mayLoad = 1 in
+def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", []>, Sched<[WriteLoad]>;
+
+let SchedRW = [WriteMicrocoded] in {
+// ASCII Adjust After Addition
+let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in
+def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", []>,
+ Requires<[Not64BitMode]>;
+
+// ASCII Adjust AX Before Division
+let Uses = [AX], Defs = [AX,EFLAGS], hasSideEffects = 0 in
+def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src),
+ "aad\t$src", []>, Requires<[Not64BitMode]>;
+
+// ASCII Adjust AX After Multiply
+let Uses = [AL], Defs = [AX,EFLAGS], hasSideEffects = 0 in
+def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src),
+ "aam\t$src", []>, Requires<[Not64BitMode]>;
+
+// ASCII Adjust AL After Subtraction - sets AF and CF on a decimal borrow.
+let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in
+def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", []>,
+ Requires<[Not64BitMode]>;
+
+// Decimal Adjust AL after Addition
+let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in
+def DAA : I<0x27, RawFrm, (outs), (ins), "daa", []>,
+ Requires<[Not64BitMode]>;
+
+// Decimal Adjust AL after Subtraction
+let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in
+def DAS : I<0x2F, RawFrm, (outs), (ins), "das", []>,
+ Requires<[Not64BitMode]>;
+} // SchedRW
+
+let SchedRW = [WriteSystem] in {
+// Check Array Index Against Bounds
+// Note: "bound" does not have reversed operands in AT&T syntax.
+def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "bound\t$dst, $src", []>, OpSize16,
+ Requires<[Not64BitMode]>;
+def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "bound\t$dst, $src", []>, OpSize32,
+ Requires<[Not64BitMode]>;
+
+// Adjust RPL Field of Segment Selector
+def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+ "arpl\t{$src, $dst|$dst, $src}", []>,
+ Requires<[Not64BitMode]>, NotMemoryFoldable;
+let mayStore = 1 in
+def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "arpl\t{$src, $dst|$dst, $src}", []>,
+ Requires<[Not64BitMode]>, NotMemoryFoldable;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// MOVBE Instructions
+//
+let Predicates = [HasMOVBE] in {
+ let SchedRW = [WriteALULd] in {
+ def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "movbe{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (bswap (loadi16 addr:$src)))]>,
+ OpSize16, T8PS;
+ def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "movbe{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (bswap (loadi32 addr:$src)))]>,
+ OpSize32, T8PS;
+ def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "movbe{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bswap (loadi64 addr:$src)))]>,
+ T8PS;
+ }
+ let SchedRW = [WriteStore] in {
+ def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "movbe{w}\t{$src, $dst|$dst, $src}",
+ [(store (bswap GR16:$src), addr:$dst)]>,
+ OpSize16, T8PS;
+ def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "movbe{l}\t{$src, $dst|$dst, $src}",
+ [(store (bswap GR32:$src), addr:$dst)]>,
+ OpSize32, T8PS;
+ def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "movbe{q}\t{$src, $dst|$dst, $src}",
+ [(store (bswap GR64:$src), addr:$dst)]>,
+ T8PS;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// RDRAND Instruction
+//
+let Predicates = [HasRDRAND], Defs = [EFLAGS], SchedRW = [WriteSystem] in {
+ def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins),
+ "rdrand{w}\t$dst", [(set GR16:$dst, EFLAGS, (X86rdrand))]>,
+ OpSize16, PS;
+ def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins),
+ "rdrand{l}\t$dst", [(set GR32:$dst, EFLAGS, (X86rdrand))]>,
+ OpSize32, PS;
+ def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins),
+ "rdrand{q}\t$dst", [(set GR64:$dst, EFLAGS, (X86rdrand))]>,
+ PS;
+}
+
+//===----------------------------------------------------------------------===//
+// RDSEED Instruction
+//
+let Predicates = [HasRDSEED], Defs = [EFLAGS], SchedRW = [WriteSystem] in {
+ def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins), "rdseed{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize16, PS;
+ def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins), "rdseed{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86rdseed))]>, OpSize32, PS;
+ def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins), "rdseed{q}\t$dst",
+ [(set GR64:$dst, EFLAGS, (X86rdseed))]>, PS;
+}
+
+//===----------------------------------------------------------------------===//
+// LZCNT Instruction
+//
+let Predicates = [HasLZCNT], Defs = [EFLAGS] in {
+ def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "lzcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)]>,
+ XS, OpSize16, Sched<[WriteLZCNT]>;
+ def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "lzcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (ctlz (loadi16 addr:$src))),
+ (implicit EFLAGS)]>, XS, OpSize16, Sched<[WriteLZCNTLd]>;
+
+ def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "lzcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>,
+ XS, OpSize32, Sched<[WriteLZCNT]>;
+ def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "lzcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (ctlz (loadi32 addr:$src))),
+ (implicit EFLAGS)]>, XS, OpSize32, Sched<[WriteLZCNTLd]>;
+
+ def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "lzcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)]>,
+ XS, Sched<[WriteLZCNT]>;
+ def LZCNT64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "lzcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (ctlz (loadi64 addr:$src))),
+ (implicit EFLAGS)]>, XS, Sched<[WriteLZCNTLd]>;
+}
+
+//===----------------------------------------------------------------------===//
+// BMI Instructions
+//
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+ def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "tzcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)]>,
+ XS, OpSize16, Sched<[WriteTZCNT]>;
+ def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "tzcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (cttz (loadi16 addr:$src))),
+ (implicit EFLAGS)]>, XS, OpSize16, Sched<[WriteTZCNTLd]>;
+
+ def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "tzcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>,
+ XS, OpSize32, Sched<[WriteTZCNT]>;
+ def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "tzcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (cttz (loadi32 addr:$src))),
+ (implicit EFLAGS)]>, XS, OpSize32, Sched<[WriteTZCNTLd]>;
+
+ def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "tzcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)]>,
+ XS, Sched<[WriteTZCNT]>;
+ def TZCNT64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "tzcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (cttz (loadi64 addr:$src))),
+ (implicit EFLAGS)]>, XS, Sched<[WriteTZCNTLd]>;
+}
+
+multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM,
+ RegisterClass RC, X86MemOperand x86memop,
+ X86FoldableSchedWrite sched> {
+let hasSideEffects = 0 in {
+ def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src),
+ !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>,
+ T8PS, VEX_4V, Sched<[sched]>;
+ let mayLoad = 1 in
+ def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>,
+ T8PS, VEX_4V, Sched<[sched.Folded]>;
+}
+}
+
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+ defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem, WriteBLS>;
+ defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem, WriteBLS>, VEX_W;
+ defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem, WriteBLS>;
+ defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem, WriteBLS>, VEX_W;
+ defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem, WriteBLS>;
+ defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem, WriteBLS>, VEX_W;
+}
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments to auto-generate BMI instructions.
+//===----------------------------------------------------------------------===//
+
+def or_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs),
+ (X86or_flag node:$lhs, node:$rhs), [{
+ return hasNoCarryFlagUses(SDValue(N, 1));
+}]>;
+
+def xor_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs),
+ (X86xor_flag node:$lhs, node:$rhs), [{
+ return hasNoCarryFlagUses(SDValue(N, 1));
+}]>;
+
+def and_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs),
+ (X86and_flag node:$lhs, node:$rhs), [{
+ return hasNoCarryFlagUses(SDValue(N, 1));
+}]>;
+
+let Predicates = [HasBMI] in {
+ // FIXME: patterns for the load versions are not implemented
+ def : Pat<(and GR32:$src, (add GR32:$src, -1)),
+ (BLSR32rr GR32:$src)>;
+ def : Pat<(and GR64:$src, (add GR64:$src, -1)),
+ (BLSR64rr GR64:$src)>;
+
+ def : Pat<(xor GR32:$src, (add GR32:$src, -1)),
+ (BLSMSK32rr GR32:$src)>;
+ def : Pat<(xor GR64:$src, (add GR64:$src, -1)),
+ (BLSMSK64rr GR64:$src)>;
+
+ def : Pat<(and GR32:$src, (ineg GR32:$src)),
+ (BLSI32rr GR32:$src)>;
+ def : Pat<(and GR64:$src, (ineg GR64:$src)),
+ (BLSI64rr GR64:$src)>;
+
+ // Versions to match flag producing ops.
+ def : Pat<(and_flag_nocf GR32:$src, (add GR32:$src, -1)),
+ (BLSR32rr GR32:$src)>;
+ def : Pat<(and_flag_nocf GR64:$src, (add GR64:$src, -1)),
+ (BLSR64rr GR64:$src)>;
+
+ def : Pat<(xor_flag_nocf GR32:$src, (add GR32:$src, -1)),
+ (BLSMSK32rr GR32:$src)>;
+ def : Pat<(xor_flag_nocf GR64:$src, (add GR64:$src, -1)),
+ (BLSMSK64rr GR64:$src)>;
+
+ def : Pat<(and_flag_nocf GR32:$src, (ineg GR32:$src)),
+ (BLSI32rr GR32:$src)>;
+ def : Pat<(and_flag_nocf GR64:$src, (ineg GR64:$src)),
+ (BLSI64rr GR64:$src)>;
+}
+
+multiclass bmi_bextr<bits<8> opc, string mnemonic, RegisterClass RC,
+ X86MemOperand x86memop, SDNode OpNode,
+ PatFrag ld_frag, X86FoldableSchedWrite Sched> {
+ def rr : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (OpNode RC:$src1, RC:$src2)), (implicit EFLAGS)]>,
+ T8PS, VEX, Sched<[Sched]>;
+ def rm : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (OpNode (ld_frag addr:$src1), RC:$src2)),
+ (implicit EFLAGS)]>, T8PS, VEX,
+ Sched<[Sched.Folded,
+ // x86memop:$src1
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // RC:$src2
+ Sched.ReadAfterFold]>;
+}
+
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+ defm BEXTR32 : bmi_bextr<0xF7, "bextr{l}", GR32, i32mem,
+ X86bextr, loadi32, WriteBEXTR>;
+ defm BEXTR64 : bmi_bextr<0xF7, "bextr{q}", GR64, i64mem,
+ X86bextr, loadi64, WriteBEXTR>, VEX_W;
+}
+
+multiclass bmi_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
+ X86MemOperand x86memop, Intrinsic Int,
+ PatFrag ld_frag, X86FoldableSchedWrite Sched> {
+ def rr : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)]>,
+ T8PS, VEX, Sched<[Sched]>;
+ def rm : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)),
+ (implicit EFLAGS)]>, T8PS, VEX,
+ Sched<[Sched.Folded,
+ // x86memop:$src1
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // RC:$src2
+ Sched.ReadAfterFold]>;
+}
+
+let Predicates = [HasBMI2], Defs = [EFLAGS] in {
+ defm BZHI32 : bmi_bzhi<0xF5, "bzhi{l}", GR32, i32mem,
+ X86bzhi, loadi32, WriteBZHI>;
+ defm BZHI64 : bmi_bzhi<0xF5, "bzhi{q}", GR64, i64mem,
+ X86bzhi, loadi64, WriteBZHI>, VEX_W;
+}
+
+def CountTrailingOnes : SDNodeXForm<imm, [{
+ // Count the trailing ones in the immediate.
+ return getI8Imm(countTrailingOnes(N->getZExtValue()), SDLoc(N));
+}]>;
+
+def BEXTRMaskXForm : SDNodeXForm<imm, [{
+ unsigned Length = countTrailingOnes(N->getZExtValue());
+ return getI32Imm(Length << 8, SDLoc(N));
+}]>;
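+// For example, the mask 0x3ffffffff has 34 trailing ones, so BEXTRMaskXForm
+// produces the control word 34 << 8 = 0x2200 (start 0, length 34), and the
+// selected bextr keeps exactly the bits the original 'and' would have kept.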
+
+def AndMask64 : ImmLeaf<i64, [{
+ return isMask_64(Imm) && !isUInt<32>(Imm);
+}]>;
+
+// Use BEXTR for 64-bit 'and' with large immediate 'mask'.
+let Predicates = [HasBMI, NoBMI2, NoTBM] in {
+ def : Pat<(and GR64:$src, AndMask64:$mask),
+ (BEXTR64rr GR64:$src,
+ (SUBREG_TO_REG (i64 0),
+ (MOV32ri (BEXTRMaskXForm imm:$mask)), sub_32bit))>;
+ def : Pat<(and (loadi64 addr:$src), AndMask64:$mask),
+ (BEXTR64rm addr:$src,
+ (SUBREG_TO_REG (i64 0),
+ (MOV32ri (BEXTRMaskXForm imm:$mask)), sub_32bit))>;
+}
+
+// Use BZHI for 64-bit 'and' with large immediate 'mask'.
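+// For example, the mask 0x3ffffffff has 34 trailing ones, so the selected bzhi
+// receives an index of 34 and clears bits 63:34, matching the original 'and'.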
+let Predicates = [HasBMI2, NoTBM] in {
+ def : Pat<(and GR64:$src, AndMask64:$mask),
+ (BZHI64rr GR64:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>;
+ def : Pat<(and (loadi64 addr:$src), AndMask64:$mask),
+ (BZHI64rm addr:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>;
+}
+
+multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
+ X86MemOperand x86memop, SDNode OpNode,
+ PatFrag ld_frag> {
+ def rr : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (OpNode RC:$src1, RC:$src2))]>,
+ VEX_4V, Sched<[WriteALU]>;
+ def rm : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (OpNode RC:$src1, (ld_frag addr:$src2)))]>,
+ VEX_4V, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>;
+}
+
+let Predicates = [HasBMI2] in {
+ defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem,
+ X86pdep, loadi32>, T8XD;
+ defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem,
+ X86pdep, loadi64>, T8XD, VEX_W;
+ defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem,
+ X86pext, loadi32>, T8XS;
+ defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem,
+ X86pext, loadi64>, T8XS, VEX_W;
+}
+
+//===----------------------------------------------------------------------===//
+// TBM Instructions
+//
+let Predicates = [HasTBM], Defs = [EFLAGS] in {
+
+multiclass tbm_bextri<bits<8> opc, RegisterClass RC, string OpcodeStr,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ SDNode OpNode, Operand immtype,
+ SDPatternOperator immoperator,
+ X86FoldableSchedWrite Sched> {
+ def ri : Ii32<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, immtype:$cntl),
+ !strconcat(OpcodeStr,
+ "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
+ [(set RC:$dst, (OpNode RC:$src1, immoperator:$cntl))]>,
+ XOP, XOPA, Sched<[Sched]>;
+ def mi : Ii32<opc, MRMSrcMem, (outs RC:$dst),
+ (ins x86memop:$src1, immtype:$cntl),
+ !strconcat(OpcodeStr,
+ "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
+ [(set RC:$dst, (OpNode (ld_frag addr:$src1), immoperator:$cntl))]>,
+ XOP, XOPA, Sched<[Sched.Folded]>;
+}
+
+defm BEXTRI32 : tbm_bextri<0x10, GR32, "bextr{l}", i32mem, loadi32,
+ X86bextri, i32imm, timm, WriteBEXTR>;
+let ImmT = Imm32S in
+defm BEXTRI64 : tbm_bextri<0x10, GR64, "bextr{q}", i64mem, loadi64,
+ X86bextri, i64i32imm,
+ i64timmSExt32, WriteBEXTR>, VEX_W;
+
+multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem,
+ RegisterClass RC, string OpcodeStr,
+ X86MemOperand x86memop, X86FoldableSchedWrite Sched> {
+let hasSideEffects = 0 in {
+ def rr : I<opc, FormReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), []>,
+ XOP_4V, XOP9, Sched<[Sched]>;
+ let mayLoad = 1 in
+ def rm : I<opc, FormMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), []>,
+ XOP_4V, XOP9, Sched<[Sched.Folded]>;
+}
+}
+
+multiclass tbm_binary_intr<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite Sched,
+ Format FormReg, Format FormMem> {
+ defm NAME#32 : tbm_binary_rm<opc, FormReg, FormMem, GR32, OpcodeStr#"{l}",
+ i32mem, Sched>;
+ defm NAME#64 : tbm_binary_rm<opc, FormReg, FormMem, GR64, OpcodeStr#"{q}",
+ i64mem, Sched>, VEX_W;
+}
+
+defm BLCFILL : tbm_binary_intr<0x01, "blcfill", WriteALU, MRM1r, MRM1m>;
+defm BLCI : tbm_binary_intr<0x02, "blci", WriteALU, MRM6r, MRM6m>;
+defm BLCIC : tbm_binary_intr<0x01, "blcic", WriteALU, MRM5r, MRM5m>;
+defm BLCMSK : tbm_binary_intr<0x02, "blcmsk", WriteALU, MRM1r, MRM1m>;
+defm BLCS : tbm_binary_intr<0x01, "blcs", WriteALU, MRM3r, MRM3m>;
+defm BLSFILL : tbm_binary_intr<0x01, "blsfill", WriteALU, MRM2r, MRM2m>;
+defm BLSIC : tbm_binary_intr<0x01, "blsic", WriteALU, MRM6r, MRM6m>;
+defm T1MSKC : tbm_binary_intr<0x01, "t1mskc", WriteALU, MRM7r, MRM7m>;
+defm TZMSK : tbm_binary_intr<0x01, "tzmsk", WriteALU, MRM4r, MRM4m>;
+} // HasTBM, EFLAGS
+
+// Use BEXTRI for 64-bit 'and' with large immediate 'mask'.
+let Predicates = [HasTBM] in {
+ def : Pat<(and GR64:$src, AndMask64:$mask),
+ (BEXTRI64ri GR64:$src, (BEXTRMaskXForm imm:$mask))>;
+
+ def : Pat<(and (loadi64 addr:$src), AndMask64:$mask),
+ (BEXTRI64mi addr:$src, (BEXTRMaskXForm imm:$mask))>;
+}
+
+//===----------------------------------------------------------------------===//
+// Lightweight Profiling Instructions
+
+let Predicates = [HasLWP], SchedRW = [WriteSystem] in {
+
+def LLWPCB : I<0x12, MRM0r, (outs), (ins GR32:$src), "llwpcb\t$src",
+ [(int_x86_llwpcb GR32:$src)]>, XOP, XOP9;
+def SLWPCB : I<0x12, MRM1r, (outs GR32:$dst), (ins), "slwpcb\t$dst",
+ [(set GR32:$dst, (int_x86_slwpcb))]>, XOP, XOP9;
+
+def LLWPCB64 : I<0x12, MRM0r, (outs), (ins GR64:$src), "llwpcb\t$src",
+ [(int_x86_llwpcb GR64:$src)]>, XOP, XOP9, VEX_W;
+def SLWPCB64 : I<0x12, MRM1r, (outs GR64:$dst), (ins), "slwpcb\t$dst",
+ [(set GR64:$dst, (int_x86_slwpcb))]>, XOP, XOP9, VEX_W;
+
+multiclass lwpins_intr<RegisterClass RC> {
+ def rri : Ii32<0x12, MRM0r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl),
+ "lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
+ [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, timm:$cntl))]>,
+ XOP_4V, XOPA;
+ let mayLoad = 1 in
+ def rmi : Ii32<0x12, MRM0m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl),
+ "lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
+ [(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), timm:$cntl))]>,
+ XOP_4V, XOPA;
+}
+
+let Defs = [EFLAGS] in {
+ defm LWPINS32 : lwpins_intr<GR32>;
+ defm LWPINS64 : lwpins_intr<GR64>, VEX_W;
+} // EFLAGS
+
+multiclass lwpval_intr<RegisterClass RC, Intrinsic Int> {
+ def rri : Ii32<0x12, MRM1r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl),
+ "lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
+ [(Int RC:$src0, GR32:$src1, timm:$cntl)]>, XOP_4V, XOPA;
+ let mayLoad = 1 in
+ def rmi : Ii32<0x12, MRM1m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl),
+ "lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
+ [(Int RC:$src0, (loadi32 addr:$src1), timm:$cntl)]>,
+ XOP_4V, XOPA;
+}
+
+defm LWPVAL32 : lwpval_intr<GR32, int_x86_lwpval32>;
+defm LWPVAL64 : lwpval_intr<GR64, int_x86_lwpval64>, VEX_W;
+
+} // HasLWP, SchedRW
+
+//===----------------------------------------------------------------------===//
+// MONITORX/MWAITX Instructions
+//
+let SchedRW = [ WriteSystem ] in {
+ let Uses = [ EAX, ECX, EDX ] in
+ def MONITORX32rrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", []>,
+ TB, Requires<[ HasMWAITX, Not64BitMode ]>;
+ let Uses = [ RAX, ECX, EDX ] in
+ def MONITORX64rrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", []>,
+ TB, Requires<[ HasMWAITX, In64BitMode ]>;
+
+ let Uses = [ ECX, EAX, EBX ] in {
+ def MWAITXrrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx",
+ []>, TB, Requires<[ HasMWAITX ]>;
+ }
+} // SchedRW
+
+def : InstAlias<"mwaitx\t{%eax, %ecx, %ebx|ebx, ecx, eax}", (MWAITXrrr)>,
+ Requires<[ Not64BitMode ]>;
+def : InstAlias<"mwaitx\t{%rax, %rcx, %rbx|rbx, rcx, rax}", (MWAITXrrr)>,
+ Requires<[ In64BitMode ]>;
+
+def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORX32rrr)>,
+ Requires<[ Not64BitMode ]>;
+def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORX64rrr)>,
+ Requires<[ In64BitMode ]>;
+
+//===----------------------------------------------------------------------===//
+// WAITPKG Instructions
+//
+let SchedRW = [WriteSystem] in {
+ def UMONITOR16 : I<0xAE, MRM6r, (outs), (ins GR16:$src),
+ "umonitor\t$src", [(int_x86_umonitor GR16:$src)]>,
+ XS, AdSize16, Requires<[HasWAITPKG, Not64BitMode]>;
+ def UMONITOR32 : I<0xAE, MRM6r, (outs), (ins GR32:$src),
+ "umonitor\t$src", [(int_x86_umonitor GR32:$src)]>,
+ XS, AdSize32, Requires<[HasWAITPKG]>;
+ def UMONITOR64 : I<0xAE, MRM6r, (outs), (ins GR64:$src),
+ "umonitor\t$src", [(int_x86_umonitor GR64:$src)]>,
+ XS, AdSize64, Requires<[HasWAITPKG, In64BitMode]>;
+ let Uses = [EAX, EDX], Defs = [EFLAGS] in {
+ def UMWAIT : I<0xAE, MRM6r,
+ (outs), (ins GR32orGR64:$src), "umwait\t$src",
+ [(set EFLAGS, (X86umwait GR32orGR64:$src, EDX, EAX))]>,
+ XD, Requires<[HasWAITPKG]>;
+ def TPAUSE : I<0xAE, MRM6r,
+ (outs), (ins GR32orGR64:$src), "tpause\t$src",
+ [(set EFLAGS, (X86tpause GR32orGR64:$src, EDX, EAX))]>,
+ PD, Requires<[HasWAITPKG]>, NotMemoryFoldable;
+ }
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// MOVDIRI - Move doubleword/quadword as direct store
+//
+let SchedRW = [WriteStore] in {
+def MOVDIRI32 : I<0xF9, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "movdiri\t{$src, $dst|$dst, $src}",
+ [(int_x86_directstore32 addr:$dst, GR32:$src)]>,
+ T8PS, Requires<[HasMOVDIRI]>;
+def MOVDIRI64 : RI<0xF9, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "movdiri\t{$src, $dst|$dst, $src}",
+ [(int_x86_directstore64 addr:$dst, GR64:$src)]>,
+ T8PS, Requires<[In64BitMode, HasMOVDIRI]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// MOVDIR64B - Move 64 bytes as direct store
+//
+let SchedRW = [WriteStore] in {
+def MOVDIR64B16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem:$src),
+ "movdir64b\t{$src, $dst|$dst, $src}", []>,
+ T8PD, AdSize16, Requires<[HasMOVDIR64B, Not64BitMode]>;
+def MOVDIR64B32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem:$src),
+ "movdir64b\t{$src, $dst|$dst, $src}",
+ [(int_x86_movdir64b GR32:$dst, addr:$src)]>,
+ T8PD, AdSize32, Requires<[HasMOVDIR64B]>;
+def MOVDIR64B64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src),
+ "movdir64b\t{$src, $dst|$dst, $src}",
+ [(int_x86_movdir64b GR64:$dst, addr:$src)]>,
+ T8PD, AdSize64, Requires<[HasMOVDIR64B, In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// ENQCMD/S - Enqueue 64-byte command as user with 64-byte write atomicity
+//
+let SchedRW = [WriteStore], Defs = [EFLAGS] in {
+ def ENQCMD16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem:$src),
+ "enqcmd\t{$src, $dst|$dst, $src}",
+ [(set EFLAGS, (X86enqcmd GR16:$dst, addr:$src))]>,
+ T8XD, AdSize16, Requires<[HasENQCMD, Not64BitMode]>;
+ def ENQCMD32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem:$src),
+ "enqcmd\t{$src, $dst|$dst, $src}",
+ [(set EFLAGS, (X86enqcmd GR32:$dst, addr:$src))]>,
+ T8XD, AdSize32, Requires<[HasENQCMD]>;
+ def ENQCMD64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src),
+ "enqcmd\t{$src, $dst|$dst, $src}",
+ [(set EFLAGS, (X86enqcmd GR64:$dst, addr:$src))]>,
+ T8XD, AdSize64, Requires<[HasENQCMD, In64BitMode]>;
+
+ def ENQCMDS16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem:$src),
+ "enqcmds\t{$src, $dst|$dst, $src}",
+ [(set EFLAGS, (X86enqcmds GR16:$dst, addr:$src))]>,
+ T8XS, AdSize16, Requires<[HasENQCMD, Not64BitMode]>;
+ def ENQCMDS32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem:$src),
+ "enqcmds\t{$src, $dst|$dst, $src}",
+ [(set EFLAGS, (X86enqcmds GR32:$dst, addr:$src))]>,
+ T8XS, AdSize32, Requires<[HasENQCMD]>;
+ def ENQCMDS64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src),
+ "enqcmds\t{$src, $dst|$dst, $src}",
+ [(set EFLAGS, (X86enqcmds GR64:$dst, addr:$src))]>,
+ T8XS, AdSize64, Requires<[HasENQCMD, In64BitMode]>;
+}
+
+//===----------------------------------------------------------------------===//
+// CLZERO Instruction
+//
+let SchedRW = [WriteLoad] in {
+ let Uses = [EAX] in
+ def CLZERO32r : I<0x01, MRM_FC, (outs), (ins), "clzero", []>,
+ TB, Requires<[HasCLZERO, Not64BitMode]>;
+ let Uses = [RAX] in
+ def CLZERO64r : I<0x01, MRM_FC, (outs), (ins), "clzero", []>,
+ TB, Requires<[HasCLZERO, In64BitMode]>;
+} // SchedRW
+
+def : InstAlias<"clzero\t{%eax|eax}", (CLZERO32r)>, Requires<[Not64BitMode]>;
+def : InstAlias<"clzero\t{%rax|rax}", (CLZERO64r)>, Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// INVLPGB Instruction
+// OPCODE 0F 01 FE
+//
+let SchedRW = [WriteSystem] in {
+ let Uses = [EAX, EDX] in
+ def INVLPGB32 : I<0x01, MRM_FE, (outs), (ins),
+                    "invlpgb", []>,
+ PS, Requires<[Not64BitMode]>;
+ let Uses = [RAX, EDX] in
+ def INVLPGB64 : I<0x01, MRM_FE, (outs), (ins),
+ "invlpgb", []>,
+ PS, Requires<[In64BitMode]>;
+} // SchedRW
+
+def : InstAlias<"invlpgb\t{%eax, %edx|eax, edx}", (INVLPGB32)>, Requires<[Not64BitMode]>;
+def : InstAlias<"invlpgb\t{%rax, %edx|rax, edx}", (INVLPGB64)>, Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// TLBSYNC Instruction
+// OPCODE 0F 01 FF
+//
+let SchedRW = [WriteSystem] in {
+ def TLBSYNC : I<0x01, MRM_FF, (outs), (ins),
+ "tlbsync", []>,
+ PS, Requires<[]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// HRESET Instruction
+//
+let Uses = [EAX], SchedRW = [WriteSystem] in
+ def HRESET : Ii8<0xF0, MRM_C0, (outs), (ins i32u8imm:$imm), "hreset\t$imm", []>,
+ Requires<[HasHRESET]>, TAXS;
+
+//===----------------------------------------------------------------------===//
+// SERIALIZE Instruction
+//
+def SERIALIZE : I<0x01, MRM_E8, (outs), (ins), "serialize",
+ [(int_x86_serialize)]>, PS,
+ Requires<[HasSERIALIZE]>;
+
+//===----------------------------------------------------------------------===//
+// TSXLDTRK - TSX Suspend Load Address Tracking
+//
+let Predicates = [HasTSXLDTRK] in {
+ def XSUSLDTRK : I<0x01, MRM_E8, (outs), (ins), "xsusldtrk",
+ [(int_x86_xsusldtrk)]>, XD;
+ def XRESLDTRK : I<0x01, MRM_E9, (outs), (ins), "xresldtrk",
+ [(int_x86_xresldtrk)]>, XD;
+}
+
+//===----------------------------------------------------------------------===//
+// UINTR Instructions
+//
+let Predicates = [HasUINTR, In64BitMode] in {
+ def UIRET : I<0x01, MRM_EC, (outs), (ins), "uiret",
+ []>, XS;
+ def CLUI : I<0x01, MRM_EE, (outs), (ins), "clui",
+ [(int_x86_clui)]>, XS;
+ def STUI : I<0x01, MRM_EF, (outs), (ins), "stui",
+ [(int_x86_stui)]>, XS;
+
+ def SENDUIPI : I<0xC7, MRM6r, (outs), (ins GR64:$arg), "senduipi\t$arg",
+ [(int_x86_senduipi GR64:$arg)]>, XS;
+
+ let Defs = [EFLAGS] in
+ def TESTUI : I<0x01, MRM_ED, (outs), (ins), "testui",
+ [(set EFLAGS, (X86testui))]>, XS;
+}
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments to auto generate TBM instructions.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasTBM] in {
+ // FIXME: patterns for the load versions are not implemented
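+  // As a worked example, for x = 0b0100_0111 the BLCFILL pattern below matches
+  // x & (x + 1) = 0b0100_0000, i.e. the trailing one bits are cleared.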
+ def : Pat<(and GR32:$src, (add GR32:$src, 1)),
+ (BLCFILL32rr GR32:$src)>;
+ def : Pat<(and GR64:$src, (add GR64:$src, 1)),
+ (BLCFILL64rr GR64:$src)>;
+
+ def : Pat<(or GR32:$src, (not (add GR32:$src, 1))),
+ (BLCI32rr GR32:$src)>;
+ def : Pat<(or GR64:$src, (not (add GR64:$src, 1))),
+ (BLCI64rr GR64:$src)>;
+
+ // Extra patterns because opt can optimize the above patterns to this.
+ def : Pat<(or GR32:$src, (sub -2, GR32:$src)),
+ (BLCI32rr GR32:$src)>;
+ def : Pat<(or GR64:$src, (sub -2, GR64:$src)),
+ (BLCI64rr GR64:$src)>;
+
+ def : Pat<(and (not GR32:$src), (add GR32:$src, 1)),
+ (BLCIC32rr GR32:$src)>;
+ def : Pat<(and (not GR64:$src), (add GR64:$src, 1)),
+ (BLCIC64rr GR64:$src)>;
+
+ def : Pat<(xor GR32:$src, (add GR32:$src, 1)),
+ (BLCMSK32rr GR32:$src)>;
+ def : Pat<(xor GR64:$src, (add GR64:$src, 1)),
+ (BLCMSK64rr GR64:$src)>;
+
+ def : Pat<(or GR32:$src, (add GR32:$src, 1)),
+ (BLCS32rr GR32:$src)>;
+ def : Pat<(or GR64:$src, (add GR64:$src, 1)),
+ (BLCS64rr GR64:$src)>;
+
+ def : Pat<(or GR32:$src, (add GR32:$src, -1)),
+ (BLSFILL32rr GR32:$src)>;
+ def : Pat<(or GR64:$src, (add GR64:$src, -1)),
+ (BLSFILL64rr GR64:$src)>;
+
+ def : Pat<(or (not GR32:$src), (add GR32:$src, -1)),
+ (BLSIC32rr GR32:$src)>;
+ def : Pat<(or (not GR64:$src), (add GR64:$src, -1)),
+ (BLSIC64rr GR64:$src)>;
+
+ def : Pat<(or (not GR32:$src), (add GR32:$src, 1)),
+ (T1MSKC32rr GR32:$src)>;
+ def : Pat<(or (not GR64:$src), (add GR64:$src, 1)),
+ (T1MSKC64rr GR64:$src)>;
+
+ def : Pat<(and (not GR32:$src), (add GR32:$src, -1)),
+ (TZMSK32rr GR32:$src)>;
+ def : Pat<(and (not GR64:$src), (add GR64:$src, -1)),
+ (TZMSK64rr GR64:$src)>;
+
+ // Patterns to match flag producing ops.
+ def : Pat<(and_flag_nocf GR32:$src, (add GR32:$src, 1)),
+ (BLCFILL32rr GR32:$src)>;
+ def : Pat<(and_flag_nocf GR64:$src, (add GR64:$src, 1)),
+ (BLCFILL64rr GR64:$src)>;
+
+ def : Pat<(or_flag_nocf GR32:$src, (not (add GR32:$src, 1))),
+ (BLCI32rr GR32:$src)>;
+ def : Pat<(or_flag_nocf GR64:$src, (not (add GR64:$src, 1))),
+ (BLCI64rr GR64:$src)>;
+
+ // Extra patterns because opt can optimize the above patterns to this.
+ def : Pat<(or_flag_nocf GR32:$src, (sub -2, GR32:$src)),
+ (BLCI32rr GR32:$src)>;
+ def : Pat<(or_flag_nocf GR64:$src, (sub -2, GR64:$src)),
+ (BLCI64rr GR64:$src)>;
+
+ def : Pat<(and_flag_nocf (not GR32:$src), (add GR32:$src, 1)),
+ (BLCIC32rr GR32:$src)>;
+ def : Pat<(and_flag_nocf (not GR64:$src), (add GR64:$src, 1)),
+ (BLCIC64rr GR64:$src)>;
+
+ def : Pat<(xor_flag_nocf GR32:$src, (add GR32:$src, 1)),
+ (BLCMSK32rr GR32:$src)>;
+ def : Pat<(xor_flag_nocf GR64:$src, (add GR64:$src, 1)),
+ (BLCMSK64rr GR64:$src)>;
+
+ def : Pat<(or_flag_nocf GR32:$src, (add GR32:$src, 1)),
+ (BLCS32rr GR32:$src)>;
+ def : Pat<(or_flag_nocf GR64:$src, (add GR64:$src, 1)),
+ (BLCS64rr GR64:$src)>;
+
+ def : Pat<(or_flag_nocf GR32:$src, (add GR32:$src, -1)),
+ (BLSFILL32rr GR32:$src)>;
+ def : Pat<(or_flag_nocf GR64:$src, (add GR64:$src, -1)),
+ (BLSFILL64rr GR64:$src)>;
+
+ def : Pat<(or_flag_nocf (not GR32:$src), (add GR32:$src, -1)),
+ (BLSIC32rr GR32:$src)>;
+ def : Pat<(or_flag_nocf (not GR64:$src), (add GR64:$src, -1)),
+ (BLSIC64rr GR64:$src)>;
+
+ def : Pat<(or_flag_nocf (not GR32:$src), (add GR32:$src, 1)),
+ (T1MSKC32rr GR32:$src)>;
+ def : Pat<(or_flag_nocf (not GR64:$src), (add GR64:$src, 1)),
+ (T1MSKC64rr GR64:$src)>;
+
+ def : Pat<(and_flag_nocf (not GR32:$src), (add GR32:$src, -1)),
+ (TZMSK32rr GR32:$src)>;
+ def : Pat<(and_flag_nocf (not GR64:$src), (add GR64:$src, -1)),
+ (TZMSK64rr GR64:$src)>;
+} // HasTBM
+
+//===----------------------------------------------------------------------===//
+// Memory Instructions
+//
+
+let Predicates = [HasCLFLUSHOPT], SchedRW = [WriteLoad] in
+def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
+ "clflushopt\t$src", [(int_x86_clflushopt addr:$src)]>, PD;
+
+let Predicates = [HasCLWB], SchedRW = [WriteLoad] in
+def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src",
+ [(int_x86_clwb addr:$src)]>, PD, NotMemoryFoldable;
+
+let Predicates = [HasCLDEMOTE], SchedRW = [WriteLoad] in
+def CLDEMOTE : I<0x1C, MRM0m, (outs), (ins i8mem:$src), "cldemote\t$src",
+ [(int_x86_cldemote addr:$src)]>, PS;
+
+//===----------------------------------------------------------------------===//
+// Subsystems.
+//===----------------------------------------------------------------------===//
+
+include "X86InstrArithmetic.td"
+include "X86InstrCMovSetCC.td"
+include "X86InstrExtension.td"
+include "X86InstrControl.td"
+include "X86InstrShiftRotate.td"
+
+// X87 Floating Point Stack.
+include "X86InstrFPStack.td"
+
+// SIMD support (SSE, MMX and AVX)
+include "X86InstrFragmentsSIMD.td"
+
+// FMA - Fused Multiply-Add support (requires FMA)
+include "X86InstrFMA.td"
+
+// XOP
+include "X86InstrXOP.td"
+
+// SSE, MMX and 3DNow! vector support.
+include "X86InstrSSE.td"
+include "X86InstrAVX512.td"
+include "X86InstrMMX.td"
+include "X86Instr3DNow.td"
+
+// MPX instructions
+include "X86InstrMPX.td"
+
+include "X86InstrVMX.td"
+include "X86InstrSVM.td"
+include "X86InstrSNP.td"
+
+include "X86InstrTSX.td"
+include "X86InstrSGX.td"
+
+include "X86InstrTDX.td"
+
+// Key Locker instructions
+include "X86InstrKL.td"
+
+// AMX instructions
+include "X86InstrAMX.td"
+
+// System instructions.
+include "X86InstrSystem.td"
+
+// Compiler Pseudo Instructions and Pat Patterns
+include "X86InstrCompiler.td"
+include "X86InstrVecCompiler.td"
+
+//===----------------------------------------------------------------------===//
+// Assembler Mnemonic Aliases
+//===----------------------------------------------------------------------===//
+
+def : MnemonicAlias<"call", "callw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"call", "calll", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"call", "callq", "att">, Requires<[In64BitMode]>;
+
+def : MnemonicAlias<"cbw", "cbtw", "att">;
+def : MnemonicAlias<"cwde", "cwtl", "att">;
+def : MnemonicAlias<"cwd", "cwtd", "att">;
+def : MnemonicAlias<"cdq", "cltd", "att">;
+def : MnemonicAlias<"cdqe", "cltq", "att">;
+def : MnemonicAlias<"cqo", "cqto", "att">;
+
+// In 64-bit mode lret maps to lretl; it is not ambiguous with lretq.
+def : MnemonicAlias<"lret", "lretw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"lret", "lretl", "att">, Requires<[Not16BitMode]>;
+
+def : MnemonicAlias<"leavel", "leave", "att">, Requires<[Not64BitMode]>;
+def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>;
+
+def : MnemonicAlias<"loopz", "loope">;
+def : MnemonicAlias<"loopnz", "loopne">;
+
+def : MnemonicAlias<"pop", "popw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"pop", "popl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pop", "popq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"popf", "popfw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"popf", "popfl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"popf", "popfq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"popf", "popfq", "intel">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"popfd", "popfl", "att">;
+def : MnemonicAlias<"popfw", "popf", "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"popfw", "popf", "intel">, Requires<[In64BitMode]>;
+
+// FIXME: This is wrong for "push reg". "push %bx" should turn into pushw in
+// all modes. However: "push (addr)" and "push $42" should default to
+// pushl/pushq depending on the current mode. Similar for "pop %bx"
+def : MnemonicAlias<"push", "pushw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"push", "pushl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"push", "pushq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"pushf", "pushfw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"pushf", "pushfl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pushf", "pushfq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"pushf", "pushfq", "intel">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"pushfd", "pushfl", "att">;
+def : MnemonicAlias<"pushfw", "pushf", "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pushfw", "pushf", "intel">, Requires<[In64BitMode]>;
+
+def : MnemonicAlias<"popad", "popal", "intel">, Requires<[Not64BitMode]>;
+def : MnemonicAlias<"pushad", "pushal", "intel">, Requires<[Not64BitMode]>;
+def : MnemonicAlias<"popa", "popaw", "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"pusha", "pushaw", "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"popa", "popal", "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pusha", "pushal", "intel">, Requires<[In32BitMode]>;
+
+def : MnemonicAlias<"popa", "popaw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"pusha", "pushaw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"popa", "popal", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pusha", "pushal", "att">, Requires<[In32BitMode]>;
+
+def : MnemonicAlias<"repe", "rep">;
+def : MnemonicAlias<"repz", "rep">;
+def : MnemonicAlias<"repnz", "repne">;
+
+def : MnemonicAlias<"ret", "retw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"ret", "retl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"ret", "retq", "att">, Requires<[In64BitMode]>;
+
+// Apply 'ret' behavior to 'retn'
+def : MnemonicAlias<"retn", "retw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"retn", "retl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"retn", "retq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"retn", "ret", "intel">;
+
+def : MnemonicAlias<"sal", "shl", "intel">;
+def : MnemonicAlias<"salb", "shlb", "att">;
+def : MnemonicAlias<"salw", "shlw", "att">;
+def : MnemonicAlias<"sall", "shll", "att">;
+def : MnemonicAlias<"salq", "shlq", "att">;
+
+def : MnemonicAlias<"smovb", "movsb", "att">;
+def : MnemonicAlias<"smovw", "movsw", "att">;
+def : MnemonicAlias<"smovl", "movsl", "att">;
+def : MnemonicAlias<"smovq", "movsq", "att">;
+
+def : MnemonicAlias<"ud2a", "ud2", "att">;
+def : MnemonicAlias<"ud2bw", "ud1w", "att">;
+def : MnemonicAlias<"ud2bl", "ud1l", "att">;
+def : MnemonicAlias<"ud2bq", "ud1q", "att">;
+def : MnemonicAlias<"verrw", "verr", "att">;
+
+// MS recognizes 'xacquire'/'xrelease' as 'acquire'/'release'
+def : MnemonicAlias<"acquire", "xacquire", "intel">;
+def : MnemonicAlias<"release", "xrelease", "intel">;
+
+// System instruction aliases.
+def : MnemonicAlias<"iret", "iretw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"iret", "iretl", "att">, Requires<[Not16BitMode]>;
+def : MnemonicAlias<"sysret", "sysretl", "att">;
+def : MnemonicAlias<"sysexit", "sysexitl", "att">;
+
+def : MnemonicAlias<"lgdt", "lgdtw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"lgdt", "lgdtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"lgdt", "lgdtq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"lidt", "lidtw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"lidt", "lidtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"lidt", "lidtq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"sidt", "sidtw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"sidt", "sidtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sidt", "sidtq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"lgdt", "lgdtw", "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"lgdt", "lgdtd", "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"lidt", "lidtw", "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"lidt", "lidtd", "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtw", "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtd", "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sidt", "sidtw", "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"sidt", "sidtd", "intel">, Requires<[In32BitMode]>;
+
+
+// Floating point stack aliases.
+def : MnemonicAlias<"fcmovz", "fcmove", "att">;
+def : MnemonicAlias<"fcmova", "fcmovnbe", "att">;
+def : MnemonicAlias<"fcmovnae", "fcmovb", "att">;
+def : MnemonicAlias<"fcmovna", "fcmovbe", "att">;
+def : MnemonicAlias<"fcmovae", "fcmovnb", "att">;
+def : MnemonicAlias<"fcomip", "fcompi">;
+def : MnemonicAlias<"fildq", "fildll", "att">;
+def : MnemonicAlias<"fistpq", "fistpll", "att">;
+def : MnemonicAlias<"fisttpq", "fisttpll", "att">;
+def : MnemonicAlias<"fldcww", "fldcw", "att">;
+def : MnemonicAlias<"fnstcww", "fnstcw", "att">;
+def : MnemonicAlias<"fnstsww", "fnstsw", "att">;
+def : MnemonicAlias<"fucomip", "fucompi">;
+def : MnemonicAlias<"fwait", "wait">;
+
+def : MnemonicAlias<"fxsaveq", "fxsave64", "att">;
+def : MnemonicAlias<"fxrstorq", "fxrstor64", "att">;
+def : MnemonicAlias<"xsaveq", "xsave64", "att">;
+def : MnemonicAlias<"xrstorq", "xrstor64", "att">;
+def : MnemonicAlias<"xsaveoptq", "xsaveopt64", "att">;
+def : MnemonicAlias<"xrstorsq", "xrstors64", "att">;
+def : MnemonicAlias<"xsavecq", "xsavec64", "att">;
+def : MnemonicAlias<"xsavesq", "xsaves64", "att">;
+
+class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond,
+ string VariantName>
+ : MnemonicAlias<!strconcat(Prefix, OldCond, Suffix),
+ !strconcat(Prefix, NewCond, Suffix), VariantName>;
+
+/// IntegerCondCodeMnemonicAlias - This multiclass defines a bunch of
+/// MnemonicAlias's that canonicalize the condition code in a mnemonic, for
+/// example "setz" -> "sete".
+multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix,
+ string V = ""> {
+ def C : CondCodeAlias<Prefix, Suffix, "c", "b", V>; // setc -> setb
+ def Z : CondCodeAlias<Prefix, Suffix, "z" , "e", V>; // setz -> sete
+ def NA : CondCodeAlias<Prefix, Suffix, "na", "be", V>; // setna -> setbe
+ def NB : CondCodeAlias<Prefix, Suffix, "nb", "ae", V>; // setnb -> setae
+ def NC : CondCodeAlias<Prefix, Suffix, "nc", "ae", V>; // setnc -> setae
+ def NG : CondCodeAlias<Prefix, Suffix, "ng", "le", V>; // setng -> setle
+ def NL : CondCodeAlias<Prefix, Suffix, "nl", "ge", V>; // setnl -> setge
+ def NZ : CondCodeAlias<Prefix, Suffix, "nz", "ne", V>; // setnz -> setne
+ def PE : CondCodeAlias<Prefix, Suffix, "pe", "p", V>; // setpe -> setp
+ def PO : CondCodeAlias<Prefix, Suffix, "po", "np", V>; // setpo -> setnp
+
+ def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b", V>; // setnae -> setb
+ def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a", V>; // setnbe -> seta
+ def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l", V>; // setnge -> setl
+ def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g", V>; // setnle -> setg
+}
+
+// Aliases for set<CC>
+defm : IntegerCondCodeMnemonicAlias<"set", "">;
+// Aliases for j<CC>
+defm : IntegerCondCodeMnemonicAlias<"j", "">;
+// Aliases for cmov<CC>{w,l,q}
+defm : IntegerCondCodeMnemonicAlias<"cmov", "w", "att">;
+defm : IntegerCondCodeMnemonicAlias<"cmov", "l", "att">;
+defm : IntegerCondCodeMnemonicAlias<"cmov", "q", "att">;
+// No size suffix for intel-style asm.
+defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">;
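+// e.g. "jnz" is accepted by the assembler as "jne", and in AT&T syntax
+// "cmovzl %ecx, %eax" is accepted as "cmovel %ecx, %eax".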
+
+
+//===----------------------------------------------------------------------===//
+// Assembler Instruction Aliases
+//===----------------------------------------------------------------------===//
+
+// aad/aam default to base 10 if no operand is specified.
+def : InstAlias<"aad", (AAD8i8 10)>, Requires<[Not64BitMode]>;
+def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>;
+
+// Disambiguate the mem/imm form of bt-without-a-suffix as btl.
+// Likewise for btc/btr/bts.
+def : InstAlias<"bt\t{$imm, $mem|$mem, $imm}",
+ (BT32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">;
+def : InstAlias<"btc\t{$imm, $mem|$mem, $imm}",
+ (BTC32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">;
+def : InstAlias<"btr\t{$imm, $mem|$mem, $imm}",
+ (BTR32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">;
+def : InstAlias<"bts\t{$imm, $mem|$mem, $imm}",
+ (BTS32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">;
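+// e.g. "bt $2, (%rdi)" with no suffix is assembled as "btl $2, (%rdi)".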
+
+// clr aliases.
+def : InstAlias<"clr{b}\t$reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>;
+def : InstAlias<"clr{w}\t$reg", (XOR16rr GR16:$reg, GR16:$reg), 0>;
+def : InstAlias<"clr{l}\t$reg", (XOR32rr GR32:$reg, GR32:$reg), 0>;
+def : InstAlias<"clr{q}\t$reg", (XOR64rr GR64:$reg, GR64:$reg), 0>;
+
+// lods aliases. Accept the destination being omitted because it's implicit
+// in the mnemonic, or the mnemonic suffix being omitted because it's implicit
+// in the destination.
+def : InstAlias<"lodsb\t$src", (LODSB srcidx8:$src), 0>;
+def : InstAlias<"lodsw\t$src", (LODSW srcidx16:$src), 0>;
+def : InstAlias<"lods{l|d}\t$src", (LODSL srcidx32:$src), 0>;
+def : InstAlias<"lodsq\t$src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"lods\t{$src, %al|al, $src}", (LODSB srcidx8:$src), 0>;
+def : InstAlias<"lods\t{$src, %ax|ax, $src}", (LODSW srcidx16:$src), 0>;
+def : InstAlias<"lods\t{$src, %eax|eax, $src}", (LODSL srcidx32:$src), 0>;
+def : InstAlias<"lods\t{$src, %rax|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"lods\t$src", (LODSB srcidx8:$src), 0, "intel">;
+def : InstAlias<"lods\t$src", (LODSW srcidx16:$src), 0, "intel">;
+def : InstAlias<"lods\t$src", (LODSL srcidx32:$src), 0, "intel">;
+def : InstAlias<"lods\t$src", (LODSQ srcidx64:$src), 0, "intel">, Requires<[In64BitMode]>;
+
+
+// stos aliases. Accept the source being omitted because it's implicit in
+// the mnemonic, or the mnemonic suffix being omitted because it's implicit
+// in the source.
+def : InstAlias<"stosb\t$dst", (STOSB dstidx8:$dst), 0>;
+def : InstAlias<"stosw\t$dst", (STOSW dstidx16:$dst), 0>;
+def : InstAlias<"stos{l|d}\t$dst", (STOSL dstidx32:$dst), 0>;
+def : InstAlias<"stosq\t$dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"stos\t{%al, $dst|$dst, al}", (STOSB dstidx8:$dst), 0>;
+def : InstAlias<"stos\t{%ax, $dst|$dst, ax}", (STOSW dstidx16:$dst), 0>;
+def : InstAlias<"stos\t{%eax, $dst|$dst, eax}", (STOSL dstidx32:$dst), 0>;
+def : InstAlias<"stos\t{%rax, $dst|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"stos\t$dst", (STOSB dstidx8:$dst), 0, "intel">;
+def : InstAlias<"stos\t$dst", (STOSW dstidx16:$dst), 0, "intel">;
+def : InstAlias<"stos\t$dst", (STOSL dstidx32:$dst), 0, "intel">;
+def : InstAlias<"stos\t$dst", (STOSQ dstidx64:$dst), 0, "intel">, Requires<[In64BitMode]>;
+
+
+// scas aliases. Accept the destination being omitted because it's implicit
+// in the mnemonic, or the mnemonic suffix being omitted because it's implicit
+// in the destination.
+def : InstAlias<"scasb\t$dst", (SCASB dstidx8:$dst), 0>;
+def : InstAlias<"scasw\t$dst", (SCASW dstidx16:$dst), 0>;
+def : InstAlias<"scas{l|d}\t$dst", (SCASL dstidx32:$dst), 0>;
+def : InstAlias<"scasq\t$dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"scas\t{$dst, %al|al, $dst}", (SCASB dstidx8:$dst), 0>;
+def : InstAlias<"scas\t{$dst, %ax|ax, $dst}", (SCASW dstidx16:$dst), 0>;
+def : InstAlias<"scas\t{$dst, %eax|eax, $dst}", (SCASL dstidx32:$dst), 0>;
+def : InstAlias<"scas\t{$dst, %rax|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"scas\t$dst", (SCASB dstidx8:$dst), 0, "intel">;
+def : InstAlias<"scas\t$dst", (SCASW dstidx16:$dst), 0, "intel">;
+def : InstAlias<"scas\t$dst", (SCASL dstidx32:$dst), 0, "intel">;
+def : InstAlias<"scas\t$dst", (SCASQ dstidx64:$dst), 0, "intel">, Requires<[In64BitMode]>;
+
+// cmps aliases. Accept the mnemonic suffix being omitted because it's
+// implicit in the destination.
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSB dstidx8:$dst, srcidx8:$src), 0, "intel">;
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSW dstidx16:$dst, srcidx16:$src), 0, "intel">;
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSL dstidx32:$dst, srcidx32:$src), 0, "intel">;
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSQ dstidx64:$dst, srcidx64:$src), 0, "intel">, Requires<[In64BitMode]>;
+
+// movs aliases. Accept the mnemonic suffix being omitted because it's
+// implicit in the destination.
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSB dstidx8:$dst, srcidx8:$src), 0, "intel">;
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSW dstidx16:$dst, srcidx16:$src), 0, "intel">;
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSL dstidx32:$dst, srcidx32:$src), 0, "intel">;
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSQ dstidx64:$dst, srcidx64:$src), 0, "intel">, Requires<[In64BitMode]>;
+
+// div and idiv aliases for explicit A register.
+def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r GR8 :$src)>;
+def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16r GR16:$src)>;
+def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32r GR32:$src)>;
+def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64r GR64:$src)>;
+def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8m i8mem :$src)>;
+def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16m i16mem:$src)>;
+def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32m i32mem:$src)>;
+def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64m i64mem:$src)>;
+def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8r GR8 :$src)>;
+def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16r GR16:$src)>;
+def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32r GR32:$src)>;
+def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64r GR64:$src)>;
+def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8m i8mem :$src)>;
+def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16m i16mem:$src)>;
+def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32m i32mem:$src)>;
+def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>;
+
+
+
+// Various unary fpstack operations default to operating on ST1.
+// For example, "fxch" -> "fxch %st(1)"
+def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>;
+def: InstAlias<"fadd", (ADD_FPrST0 ST1), 0>;
+def : InstAlias<"fsub{|r}p", (SUBR_FPrST0 ST1), 0>;
+def : InstAlias<"fsub{r|}p", (SUB_FPrST0 ST1), 0>;
+def : InstAlias<"fmul", (MUL_FPrST0 ST1), 0>;
+def : InstAlias<"fmulp", (MUL_FPrST0 ST1), 0>;
+def : InstAlias<"fdiv{|r}p", (DIVR_FPrST0 ST1), 0>;
+def : InstAlias<"fdiv{r|}p", (DIV_FPrST0 ST1), 0>;
+def : InstAlias<"fxch", (XCH_F ST1), 0>;
+def : InstAlias<"fcom", (COM_FST0r ST1), 0>;
+def : InstAlias<"fcomp", (COMP_FST0r ST1), 0>;
+def : InstAlias<"fcomi", (COM_FIr ST1), 0>;
+def : InstAlias<"fcompi", (COM_FIPr ST1), 0>;
+def : InstAlias<"fucom", (UCOM_Fr ST1), 0>;
+def : InstAlias<"fucomp", (UCOM_FPr ST1), 0>;
+def : InstAlias<"fucomi", (UCOM_FIr ST1), 0>;
+def : InstAlias<"fucompi", (UCOM_FIPr ST1), 0>;
+
+// Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op.
+// For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate
+// instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with
+// gas.
+multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> {
+ def : InstAlias<!strconcat(Mnemonic, "\t$op"),
+ (Inst RSTi:$op), EmitAlias>;
+ def : InstAlias<!strconcat(Mnemonic, "\t{%st, %st|st, st}"),
+ (Inst ST0), EmitAlias>;
+}
+
+defm : FpUnaryAlias<"fadd", ADD_FST0r, 0>;
+defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>;
+defm : FpUnaryAlias<"fsub", SUB_FST0r, 0>;
+defm : FpUnaryAlias<"fsub{|r}p", SUBR_FPrST0, 0>;
+defm : FpUnaryAlias<"fsubr", SUBR_FST0r, 0>;
+defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0, 0>;
+defm : FpUnaryAlias<"fmul", MUL_FST0r, 0>;
+defm : FpUnaryAlias<"fmulp", MUL_FPrST0, 0>;
+defm : FpUnaryAlias<"fdiv", DIV_FST0r, 0>;
+defm : FpUnaryAlias<"fdiv{|r}p", DIVR_FPrST0, 0>;
+defm : FpUnaryAlias<"fdivr", DIVR_FST0r, 0>;
+defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0, 0>;
+defm : FpUnaryAlias<"fcomi", COM_FIr, 0>;
+defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>;
+defm : FpUnaryAlias<"fcompi", COM_FIPr, 0>;
+defm : FpUnaryAlias<"fucompi", UCOM_FIPr, 0>;
+
+
+// Handle "f{mulp,addp} $op, %st(0)" the same as "f{mulp,addp} $op", since they
+// commute. We also allow fdiv[r]p/fsubrp even though they don't commute,
+// solely because gas supports it.
+def : InstAlias<"faddp\t{$op, %st|st, $op}", (ADD_FPrST0 RSTi:$op), 0>;
+def : InstAlias<"fmulp\t{$op, %st|st, $op}", (MUL_FPrST0 RSTi:$op), 0>;
+def : InstAlias<"fsub{|r}p\t{$op, %st|st, $op}", (SUBR_FPrST0 RSTi:$op), 0>;
+def : InstAlias<"fsub{r|}p\t{$op, %st|st, $op}", (SUB_FPrST0 RSTi:$op), 0>;
+def : InstAlias<"fdiv{|r}p\t{$op, %st|st, $op}", (DIVR_FPrST0 RSTi:$op), 0>;
+def : InstAlias<"fdiv{r|}p\t{$op, %st|st, $op}", (DIV_FPrST0 RSTi:$op), 0>;
+
+def : InstAlias<"fnstsw" , (FNSTSW16r), 0>;
+
+// lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but
+// this is compatible with what GAS does.
+def : InstAlias<"lcall\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[In32BitMode]>;
+def : InstAlias<"ljmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[In32BitMode]>;
+def : InstAlias<"lcall\t{*}$dst", (FARCALL32m opaquemem:$dst), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"ljmp\t{*}$dst", (FARJMP32m opaquemem:$dst), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"lcall\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"ljmp\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"lcall\t{*}$dst", (FARCALL16m opaquemem:$dst), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"ljmp\t{*}$dst", (FARJMP16m opaquemem:$dst), 0>, Requires<[In16BitMode]>;
+
+def : InstAlias<"jmp\t{*}$dst", (JMP64m i64mem:$dst), 0, "att">, Requires<[In64BitMode]>;
+def : InstAlias<"jmp\t{*}$dst", (JMP32m i32mem:$dst), 0, "att">, Requires<[In32BitMode]>;
+def : InstAlias<"jmp\t{*}$dst", (JMP16m i16mem:$dst), 0, "att">, Requires<[In16BitMode]>;
+
+
+// "imul <imm>, B" is an alias for "imul <imm>, B, B".
+def : InstAlias<"imul{w}\t{$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>;
+def : InstAlias<"imul{w}\t{$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>;
+def : InstAlias<"imul{l}\t{$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>;
+def : InstAlias<"imul{l}\t{$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>;
+def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>;
+def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>;
+
+// ins aliases. Accept the mnemonic suffix being omitted because it's implicit
+// in the destination.
+def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSB dstidx8:$dst), 0, "intel">;
+def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSW dstidx16:$dst), 0, "intel">;
+def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSL dstidx32:$dst), 0, "intel">;
+
+// outs aliases. Accept the mnemonic suffix being omitted because it's implicit
+// in the source.
+def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSB srcidx8:$src), 0, "intel">;
+def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSW srcidx16:$src), 0, "intel">;
+def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSL srcidx32:$src), 0, "intel">;
+
+// inb %dx -> inb %al, %dx
+def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>;
+def : InstAlias<"inw\t{%dx|dx}", (IN16rr), 0>;
+def : InstAlias<"inl\t{%dx|dx}", (IN32rr), 0>;
+def : InstAlias<"inb\t$port", (IN8ri u8imm:$port), 0>;
+def : InstAlias<"inw\t$port", (IN16ri u8imm:$port), 0>;
+def : InstAlias<"inl\t$port", (IN32ri u8imm:$port), 0>;
+
+
+// jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp
+def : InstAlias<"call\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
+def : InstAlias<"jmp\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
+def : InstAlias<"call\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[In32BitMode]>;
+def : InstAlias<"jmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[In32BitMode]>;
+def : InstAlias<"callw\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
+def : InstAlias<"jmpw\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
+def : InstAlias<"calll\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
+def : InstAlias<"jmpl\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
+
+// Match 'movq <largeimm>, <reg>' as an alias for movabsq.
+def : InstAlias<"mov{q}\t{$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>;
+
+// Match 'movd GR64, MMX' as an alias for movq to be compatible with gas,
+// which supports this due to an old AMD documentation bug when 64-bit mode was
+// created.
+def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
+ (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>;
+def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
+ (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>;
+
+// movsx aliases
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0, "att">;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0, "att">;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0, "att">;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0, "att">;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0, "att">;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0, "att">;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0, "att">;
+
+// movzx aliases
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0, "att">;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0, "att">;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0, "att">;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0, "att">;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr8 GR64:$dst, GR8:$src), 0, "att">;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr16 GR64:$dst, GR16:$src), 0, "att">;
+// Note: No GR32->GR64 movzx form.
+
+// outb %dx -> outb %al, %dx
+def : InstAlias<"outb\t{%dx|dx}", (OUT8rr), 0>;
+def : InstAlias<"outw\t{%dx|dx}", (OUT16rr), 0>;
+def : InstAlias<"outl\t{%dx|dx}", (OUT32rr), 0>;
+def : InstAlias<"outb\t$port", (OUT8ir u8imm:$port), 0>;
+def : InstAlias<"outw\t$port", (OUT16ir u8imm:$port), 0>;
+def : InstAlias<"outl\t$port", (OUT32ir u8imm:$port), 0>;
+
+// 'sldt <mem>' can be encoded with either sldtw or sldtq with the same
+// effect (both store to a 16-bit mem). Force to sldtw to avoid ambiguity
+// errors, since its encoding is the most compact.
+def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem), 0>;
+
+// shld/shrd op,op -> shld op, op, CL
+def : InstAlias<"shld{w}\t{$r2, $r1|$r1, $r2}", (SHLD16rrCL GR16:$r1, GR16:$r2), 0>;
+def : InstAlias<"shld{l}\t{$r2, $r1|$r1, $r2}", (SHLD32rrCL GR32:$r1, GR32:$r2), 0>;
+def : InstAlias<"shld{q}\t{$r2, $r1|$r1, $r2}", (SHLD64rrCL GR64:$r1, GR64:$r2), 0>;
+def : InstAlias<"shrd{w}\t{$r2, $r1|$r1, $r2}", (SHRD16rrCL GR16:$r1, GR16:$r2), 0>;
+def : InstAlias<"shrd{l}\t{$r2, $r1|$r1, $r2}", (SHRD32rrCL GR32:$r1, GR32:$r2), 0>;
+def : InstAlias<"shrd{q}\t{$r2, $r1|$r1, $r2}", (SHRD64rrCL GR64:$r1, GR64:$r2), 0>;
+
+def : InstAlias<"shld{w}\t{$reg, $mem|$mem, $reg}", (SHLD16mrCL i16mem:$mem, GR16:$reg), 0>;
+def : InstAlias<"shld{l}\t{$reg, $mem|$mem, $reg}", (SHLD32mrCL i32mem:$mem, GR32:$reg), 0>;
+def : InstAlias<"shld{q}\t{$reg, $mem|$mem, $reg}", (SHLD64mrCL i64mem:$mem, GR64:$reg), 0>;
+def : InstAlias<"shrd{w}\t{$reg, $mem|$mem, $reg}", (SHRD16mrCL i16mem:$mem, GR16:$reg), 0>;
+def : InstAlias<"shrd{l}\t{$reg, $mem|$mem, $reg}", (SHRD32mrCL i32mem:$mem, GR32:$reg), 0>;
+def : InstAlias<"shrd{q}\t{$reg, $mem|$mem, $reg}", (SHRD64mrCL i64mem:$mem, GR64:$reg), 0>;
+
+/* FIXME: This is disabled because the asm matcher is currently incapable of
+ * matching a fixed immediate like $1.
+// "shl X, $1" is an alias for "shl X".
+multiclass ShiftRotateByOneAlias<string Mnemonic, string Opc> {
+ def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "8r1")) GR8:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "16r1")) GR16:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "32r1")) GR32:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "64r1")) GR64:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "8m1")) i8mem:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "16m1")) i16mem:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "32m1")) i32mem:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"),
+ (!cast<Instruction>(!strconcat(Opc, "64m1")) i64mem:$op)>;
+}
+
+defm : ShiftRotateByOneAlias<"rcl", "RCL">;
+defm : ShiftRotateByOneAlias<"rcr", "RCR">;
+defm : ShiftRotateByOneAlias<"rol", "ROL">;
+defm : ShiftRotateByOneAlias<"ror", "ROR">;
+FIXME */
+
+// test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms.
+def : InstAlias<"test{b}\t{$mem, $val|$val, $mem}",
+ (TEST8mr i8mem :$mem, GR8 :$val), 0>;
+def : InstAlias<"test{w}\t{$mem, $val|$val, $mem}",
+ (TEST16mr i16mem:$mem, GR16:$val), 0>;
+def : InstAlias<"test{l}\t{$mem, $val|$val, $mem}",
+ (TEST32mr i32mem:$mem, GR32:$val), 0>;
+def : InstAlias<"test{q}\t{$mem, $val|$val, $mem}",
+ (TEST64mr i64mem:$mem, GR64:$val), 0>;
+
+// xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms.
+def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}",
+ (XCHG8rm GR8 :$val, i8mem :$mem), 0>;
+def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}",
+ (XCHG16rm GR16:$val, i16mem:$mem), 0>;
+def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}",
+ (XCHG32rm GR32:$val, i32mem:$mem), 0>;
+def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}",
+ (XCHG64rm GR64:$val, i64mem:$mem), 0>;
+
+// xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms.
+def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src), 0>;
+def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar GR32:$src), 0>;
+def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src), 0>;
+
+// In 64-bit mode, xchg %eax, %eax can't be encoded with the 0x90 opcode we
+// would get by default because that encoding is defined as NOP. But
+// xchg %eax, %eax zeroes the upper 32 bits of RAX, so alias it to the longer
+// encoding.
+def : InstAlias<"xchg{l}\t{%eax, %eax|eax, eax}",
+ (XCHG32rr EAX, EAX), 0>, Requires<[In64BitMode]>;
+
+// xchg %rax, %rax is a nop in x86-64 and can be encoded as such. Without this
+// we emit an unneeded REX.w prefix.
+def : InstAlias<"xchg{q}\t{%rax, %rax|rax, rax}", (NOOP), 0>;
+
+// These aliases exist to get the parser to prioritize matching 8-bit
+// immediate encodings over matching the implicit ax/eax/rax encodings. By
+// explicitly mentioning the A register here, these entries will be ordered
+// first due to the more explicit immediate type.
+def : InstAlias<"adc{w}\t{$imm, %ax|ax, $imm}", (ADC16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"add{w}\t{$imm, %ax|ax, $imm}", (ADD16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"and{w}\t{$imm, %ax|ax, $imm}", (AND16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"cmp{w}\t{$imm, %ax|ax, $imm}", (CMP16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"or{w}\t{$imm, %ax|ax, $imm}", (OR16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"sbb{w}\t{$imm, %ax|ax, $imm}", (SBB16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"sub{w}\t{$imm, %ax|ax, $imm}", (SUB16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"xor{w}\t{$imm, %ax|ax, $imm}", (XOR16ri8 AX, i16i8imm:$imm), 0>;
+
+def : InstAlias<"adc{l}\t{$imm, %eax|eax, $imm}", (ADC32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"add{l}\t{$imm, %eax|eax, $imm}", (ADD32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"and{l}\t{$imm, %eax|eax, $imm}", (AND32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"cmp{l}\t{$imm, %eax|eax, $imm}", (CMP32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"or{l}\t{$imm, %eax|eax, $imm}", (OR32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"sbb{l}\t{$imm, %eax|eax, $imm}", (SBB32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"sub{l}\t{$imm, %eax|eax, $imm}", (SUB32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"xor{l}\t{$imm, %eax|eax, $imm}", (XOR32ri8 EAX, i32i8imm:$imm), 0>;
+
+def : InstAlias<"adc{q}\t{$imm, %rax|rax, $imm}", (ADC64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"add{q}\t{$imm, %rax|rax, $imm}", (ADD64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"and{q}\t{$imm, %rax|rax, $imm}", (AND64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"cmp{q}\t{$imm, %rax|rax, $imm}", (CMP64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"or{q}\t{$imm, %rax|rax, $imm}", (OR64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"sbb{q}\t{$imm, %rax|rax, $imm}", (SBB64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"sub{q}\t{$imm, %rax|rax, $imm}", (SUB64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"xor{q}\t{$imm, %rax|rax, $imm}", (XOR64ri8 RAX, i64i8imm:$imm), 0>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrKL.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrKL.td
new file mode 100644
index 000000000000..b91e563a15f3
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrKL.td
@@ -0,0 +1,86 @@
+//===---------------------------*-tablegen-*-------------------------------===//
+//===------------- X86InstrKL.td - KL Instruction Set Extension -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel key locker
+// instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Key Locker instructions
+
+let SchedRW = [WriteSystem], Predicates = [HasKL] in {
+ let Uses = [XMM0, EAX], Defs = [EFLAGS] in {
+ def LOADIWKEY : I<0xDC, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "loadiwkey\t{$src2, $src1|$src1, $src2}",
+ [(int_x86_loadiwkey XMM0, VR128:$src1, VR128:$src2, EAX)]>, T8XS,
+ NotMemoryFoldable;
+ }
+
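+  // The key to wrap is taken from XMM0 (XMM0 and XMM1 for the 256-bit form);
+  // the resulting handle and the scratch outputs are the XMM registers listed
+  // in Defs.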
+ let Uses = [XMM0], Defs = [XMM0, XMM1, XMM2, XMM4, XMM5, XMM6, EFLAGS] in {
+ def ENCODEKEY128 : I<0xFA, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "encodekey128\t{$src, $dst|$dst, $src}", []>, T8XS,
+ NotMemoryFoldable;
+ }
+
+ let Uses = [XMM0, XMM1], Defs = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, EFLAGS] in {
+ def ENCODEKEY256 : I<0xFB, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "encodekey256\t{$src, $dst|$dst, $src}", []>, T8XS,
+ NotMemoryFoldable;
+ }
+
+ let Constraints = "$src1 = $dst",
+ Defs = [EFLAGS] in {
+ def AESENC128KL : I<0xDC, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
+ "aesenc128kl\t{$src2, $src1|$src1, $src2}",
+ [(set VR128:$dst, EFLAGS,
+ (X86aesenc128kl VR128:$src1, addr:$src2))]>, T8XS,
+ NotMemoryFoldable;
+
+ def AESDEC128KL : I<0xDD, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
+ "aesdec128kl\t{$src2, $src1|$src1, $src2}",
+ [(set VR128:$dst, EFLAGS,
+ (X86aesdec128kl VR128:$src1, addr:$src2))]>, T8XS,
+ NotMemoryFoldable;
+
+ def AESENC256KL : I<0xDE, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
+ "aesenc256kl\t{$src2, $src1|$src1, $src2}",
+ [(set VR128:$dst, EFLAGS,
+ (X86aesenc256kl VR128:$src1, addr:$src2))]>, T8XS,
+ NotMemoryFoldable;
+
+ def AESDEC256KL : I<0xDF, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
+ "aesdec256kl\t{$src2, $src1|$src1, $src2}",
+ [(set VR128:$dst, EFLAGS,
+ (X86aesdec256kl VR128:$src1, addr:$src2))]>, T8XS,
+ NotMemoryFoldable;
+ }
+
+} // SchedRW, Predicates
+
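+// The wide forms read a key handle from the memory operand and encrypt or
+// decrypt eight blocks in place in XMM0-XMM7.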
+let SchedRW = [WriteSystem], Predicates = [HasWIDEKL] in {
+ let Uses = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7],
+ Defs = [EFLAGS, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7],
+ mayLoad = 1 in {
+ def AESENCWIDE128KL : I<0xD8, MRM0m, (outs), (ins opaquemem:$src),
+ "aesencwide128kl\t$src", []>, T8XS,
+ NotMemoryFoldable;
+ def AESDECWIDE128KL : I<0xD8, MRM1m, (outs), (ins opaquemem:$src),
+ "aesdecwide128kl\t$src", []>, T8XS,
+ NotMemoryFoldable;
+ def AESENCWIDE256KL : I<0xD8, MRM2m, (outs), (ins opaquemem:$src),
+ "aesencwide256kl\t$src", []>, T8XS,
+ NotMemoryFoldable;
+ def AESDECWIDE256KL : I<0xD8, MRM3m, (outs), (ins opaquemem:$src),
+ "aesdecwide256kl\t$src", []>, T8XS,
+ NotMemoryFoldable;
+ }
+
+} // SchedRW, Predicates
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrMMX.td
new file mode 100644
index 000000000000..bb3e6df3bf3e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrMMX.td
@@ -0,0 +1,582 @@
+//===-- X86InstrMMX.td - Describe the MMX Instruction Set --*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 MMX instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+// All instructions that use MMX should be in this file, even if they also use
+// SSE.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MMX Multiclasses
+//===----------------------------------------------------------------------===//
+
+// Alias instruction that maps zero vector to pxor mmx.
+// This is expanded by ExpandPostRAPseudos to a pxor.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-zeros value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasMMX] in {
+def MMX_SET0 : I<0, Pseudo, (outs VR64:$dst), (ins), "",
+ [(set VR64:$dst, (x86mmx (MMX_X86movw2d (i32 0))))]>;
+}
+
+let Constraints = "$src1 = $dst" in {
+ // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic.
+ multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+ X86FoldableSchedWrite sched, bit Commutable = 0,
+ X86MemOperand OType = i64mem> {
+ def irr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>,
+ Sched<[sched]> {
+ let isCommutable = Commutable;
+ }
+ def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, OType:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1, (load_mmx addr:$src2)))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+
+ multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
+ string OpcodeStr, Intrinsic IntId,
+ Intrinsic IntId2, X86FoldableSchedWrite sched,
+ X86FoldableSchedWrite schedImm> {
+ def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>,
+ Sched<[sched]>;
+ def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1, (load_mmx addr:$src2)))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst),
+ (ins VR64:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId2 VR64:$src1, timm:$src2))]>,
+ Sched<[schedImm]>;
+ }
+}
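+// e.g. "defm MMX_PADDB : MMXI_binop_rm_int<...>" below expands to the
+// register-register form MMX_PADDBirr and the folded-load form MMX_PADDBirm.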
+
+/// Unary MMX instructions requiring SSSE3.
+multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, X86FoldableSchedWrite sched> {
+ def rr : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst, (IntId64 VR64:$src))]>,
+ Sched<[sched]>;
+
+ def rm : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst, (IntId64 (load_mmx addr:$src)))]>,
+ Sched<[sched.Folded]>;
+}
+
+/// Binary MMX instructions requiring SSSE3.
+let ImmT = NoImm, Constraints = "$src1 = $dst" in {
+multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, X86FoldableSchedWrite sched,
+ bit Commutable = 0> {
+ let isCommutable = Commutable in
+ def rr : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]>,
+ Sched<[sched]>;
+ def rm : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst,
+ (IntId64 VR64:$src1, (load_mmx addr:$src2)))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+}
+
+/// PALIGN MMX instructions (require SSSE3).
+multiclass ssse3_palign_mm<string asm, Intrinsic IntId,
+ X86FoldableSchedWrite sched> {
+ def rri : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2, u8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 timm:$src3)))]>,
+ Sched<[sched]>;
+ def rmi : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2, u8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR64:$dst, (IntId VR64:$src1, (load_mmx addr:$src2),
+ (i8 timm:$src3)))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+ Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag,
+ string asm, X86FoldableSchedWrite sched, Domain d> {
+ def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+ [(set DstRC:$dst, (Int SrcRC:$src))], d>,
+ Sched<[sched]>;
+ def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+ [(set DstRC:$dst, (Int (ld_frag addr:$src)))], d>,
+ Sched<[sched.Folded]>;
+}
+
+multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
+ RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
+ PatFrag ld_frag, string asm, Domain d> {
+ def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst),
+ (ins DstRC:$src1, SrcRC:$src2), asm,
+ [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], d>,
+ Sched<[WriteCvtI2PS]>;
+ def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst),
+ (ins DstRC:$src1, x86memop:$src2), asm,
+ [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], d>,
+ Sched<[WriteCvtI2PS.Folded]>;
+}
+
+//===----------------------------------------------------------------------===//
+// MMX EMMS Instruction
+//===----------------------------------------------------------------------===//
+
+let SchedRW = [WriteEMMS],
+ Defs = [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7] in
+def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", [(int_x86_mmx_emms)]>;
+
+//===----------------------------------------------------------------------===//
+// MMX Scalar Instructions
+//===----------------------------------------------------------------------===//
+
+// Data Transfer Instructions
+def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (x86mmx (MMX_X86movw2d GR32:$src)))]>,
+ Sched<[WriteVecMoveFromGpr]>;
+def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (x86mmx (MMX_X86movw2d (loadi32 addr:$src))))]>,
+ Sched<[WriteVecLoad]>;
+
+let mayStore = 1 in
+def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
+ "movd\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteVecStore]>;
+
+def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst,
+ (MMX_X86movd2w (x86mmx VR64:$src)))]>,
+ Sched<[WriteVecMoveToGpr]>, FoldGenData<"MMX_MOVD64rr">;
+
+let isBitcast = 1 in
+def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (bitconvert GR64:$src))]>,
+ Sched<[WriteVecMoveFromGpr]>;
+
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
+def MMX_MOVD64to64rm : MMXRI<0x6E, MRMSrcMem, (outs VR64:$dst),
+ (ins i64mem:$src), "movq\t{$src, $dst|$dst, $src}",
+ []>, Sched<[SchedWriteVecMoveLS.MMX.RM]>;
+
+let isBitcast = 1 in {
+def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg,
+ (outs GR64:$dst), (ins VR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bitconvert VR64:$src))]>,
+ Sched<[WriteVecMoveToGpr]>;
+let SchedRW = [WriteVecMove], hasSideEffects = 0, isMoveReg = 1 in {
+def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+ "movq\t{$src, $dst|$dst, $src}", []>;
+let isCodeGenOnly = 1, ForceDisassemble = 1 in
+def MMX_MOVQ64rr_REV : MMXI<0x7F, MRMDestReg, (outs VR64:$dst), (ins VR64:$src),
+ "movq\t{$src, $dst|$dst, $src}", []>,
+ FoldGenData<"MMX_MOVQ64rr">;
+} // SchedRW, hasSideEffects, isMoveReg
+} // isBitcast
+
+def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
+ (MMX_MOVQ64rr_REV VR64:$dst, VR64:$src), 0>;
+
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
+def MMX_MOVD64from64rm : MMXRI<0x7E, MRMDestMem,
+ (outs), (ins i64mem:$dst, VR64:$src),
+ "movq\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.MMX.MR]>;
+
+let SchedRW = [SchedWriteVecMoveLS.MMX.RM] in {
+let canFoldAsLoad = 1 in
+def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (load_mmx addr:$src))]>;
+} // SchedRW
+
+let SchedRW = [SchedWriteVecMoveLS.MMX.MR] in
+def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (x86mmx VR64:$src), addr:$dst)]>;
+
+def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
+ [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>;
+def MMX_X86movq2dq : SDNode<"X86ISD::MOVQ2DQ", SDTypeProfile<1, 1,
+ [SDTCisVT<0, v2i64>, SDTCisVT<1, x86mmx>]>>;
+
+let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
+def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+ (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (x86mmx (MMX_X86movdq2q VR128:$src)))]>;
+
+def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
+ (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (MMX_X86movq2dq VR64:$src)))]>;
+
+let isCodeGenOnly = 1, hasSideEffects = 1 in {
+def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
+ (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
+ []>;
+
+def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+ (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}",
+ []>;
+}
+} // SchedRW
+
+let Predicates = [HasMMX, HasSSE1] in
+def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
+ "movntq\t{$src, $dst|$dst, $src}",
+ [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)]>,
+ Sched<[SchedWriteVecMoveLSNT.MMX.MR]>;
+
+// Arithmetic Instructions
+defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b,
+ SchedWriteVecALU.MMX>;
+defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w,
+ SchedWriteVecALU.MMX>;
+defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d,
+ SchedWriteVecALU.MMX>;
+// -- Addition
+defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b,
+ SchedWriteVecALU.MMX, 1>;
+defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w,
+ SchedWriteVecALU.MMX, 1>;
+defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d,
+ SchedWriteVecALU.MMX, 1>;
+let Predicates = [HasMMX, HasSSE2] in
+defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q,
+ SchedWriteVecALU.MMX, 1>;
+defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b,
+ SchedWriteVecALU.MMX, 1>;
+defm MMX_PADDSW : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w,
+ SchedWriteVecALU.MMX, 1>;
+
+defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b,
+ SchedWriteVecALU.MMX, 1>;
+defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w,
+ SchedWriteVecALU.MMX, 1>;
+
+defm MMX_PHADDW : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w,
+ SchedWritePHAdd.MMX>;
+defm MMX_PHADDD : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d,
+ SchedWritePHAdd.MMX>;
+defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw,
+ SchedWritePHAdd.MMX>;
+
+// -- Subtraction
+defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b,
+ SchedWriteVecALU.MMX>;
+defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w,
+ SchedWriteVecALU.MMX>;
+defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d,
+ SchedWriteVecALU.MMX>;
+let Predicates = [HasMMX, HasSSE2] in
+defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q,
+ SchedWriteVecALU.MMX>;
+
+defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b,
+ SchedWriteVecALU.MMX>;
+defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w,
+ SchedWriteVecALU.MMX>;
+
+defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b,
+ SchedWriteVecALU.MMX>;
+defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w,
+ SchedWriteVecALU.MMX>;
+
+defm MMX_PHSUBW : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w,
+ SchedWritePHAdd.MMX>;
+defm MMX_PHSUBD : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d,
+ SchedWritePHAdd.MMX>;
+defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw,
+ SchedWritePHAdd.MMX>;
+
+// -- Multiplication
+defm MMX_PMULLW : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w,
+ SchedWriteVecIMul.MMX, 1>;
+
+defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w,
+ SchedWriteVecIMul.MMX, 1>;
+let Predicates = [HasMMX, HasSSE1] in
+defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w,
+ SchedWriteVecIMul.MMX, 1>;
+let Predicates = [HasMMX, HasSSE2] in
+defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq,
+ SchedWriteVecIMul.MMX, 1>;
+defm MMX_PMULHRSW : SS3I_binop_rm_int_mm<0x0B, "pmulhrsw",
+ int_x86_ssse3_pmul_hr_sw,
+ SchedWriteVecIMul.MMX, 1>;
+
+// -- Miscellanea
+defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd,
+ SchedWriteVecIMul.MMX, 1>;
+
+defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw",
+ int_x86_ssse3_pmadd_ub_sw,
+ SchedWriteVecIMul.MMX>;
+let Predicates = [HasMMX, HasSSE1] in {
+defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b,
+ SchedWriteVecALU.MMX, 1>;
+defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w,
+ SchedWriteVecALU.MMX, 1>;
+
+defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b,
+ SchedWriteVecALU.MMX, 1>;
+defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w,
+ SchedWriteVecALU.MMX, 1>;
+
+defm MMX_PMAXUB : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b,
+ SchedWriteVecALU.MMX, 1>;
+defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w,
+ SchedWriteVecALU.MMX, 1>;
+
+defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw,
+ SchedWritePSADBW.MMX, 1>;
+}
+
+defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b,
+ SchedWriteVecALU.MMX>;
+defm MMX_PSIGNW : SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w,
+ SchedWriteVecALU.MMX>;
+defm MMX_PSIGND : SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d,
+ SchedWriteVecALU.MMX>;
+let Constraints = "$src1 = $dst" in
+ defm MMX_PALIGNR : ssse3_palign_mm<"palignr", int_x86_mmx_palignr_b,
+ SchedWriteShuffle.MMX>;
+
+// Logical Instructions
+defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand,
+ SchedWriteVecLogic.MMX, 1>;
+defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por,
+ SchedWriteVecLogic.MMX, 1>;
+defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor,
+ SchedWriteVecLogic.MMX, 1>;
+defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn,
+ SchedWriteVecLogic.MMX>;
+
+// Shift Instructions
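+// Each shift comes in two forms: a register/memory form that takes the shift
+// count from an MMX register or 64-bit memory operand, and an immediate form
+// encoded with the 0x71-0x73 opcodes plus a /2, /4 or /6 ModRM extension
+// (MRM2r, MRM4r, MRM6r below).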
+defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
+ int_x86_mmx_psrl_w, int_x86_mmx_psrli_w,
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
+defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
+ int_x86_mmx_psrl_d, int_x86_mmx_psrli_d,
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
+defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
+ int_x86_mmx_psrl_q, int_x86_mmx_psrli_q,
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
+
+defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
+ int_x86_mmx_psll_w, int_x86_mmx_pslli_w,
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
+defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
+ int_x86_mmx_psll_d, int_x86_mmx_pslli_d,
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
+defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
+ int_x86_mmx_psll_q, int_x86_mmx_pslli_q,
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
+
+defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
+ int_x86_mmx_psra_w, int_x86_mmx_psrai_w,
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
+defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
+ int_x86_mmx_psra_d, int_x86_mmx_psrai_d,
+ SchedWriteVecShift.MMX,
+ SchedWriteVecShiftImm.MMX>;
+
+// Comparison Instructions
+defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b,
+ SchedWriteVecALU.MMX>;
+defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w,
+ SchedWriteVecALU.MMX>;
+defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d,
+ SchedWriteVecALU.MMX>;
+
+defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b,
+ SchedWriteVecALU.MMX>;
+defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w,
+ SchedWriteVecALU.MMX>;
+defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d,
+ SchedWriteVecALU.MMX>;
+
+// -- Unpack Instructions
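+// The high unpacks take a full 64-bit source (mm/m64); the low unpacks read
+// only 32 bits from memory (mm/m32), hence the i32mem operands below.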
+defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw",
+ int_x86_mmx_punpckhbw,
+ SchedWriteShuffle.MMX>;
+defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd",
+ int_x86_mmx_punpckhwd,
+ SchedWriteShuffle.MMX>;
+defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq",
+ int_x86_mmx_punpckhdq,
+ SchedWriteShuffle.MMX>;
+defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw",
+ int_x86_mmx_punpcklbw,
+ SchedWriteShuffle.MMX,
+ 0, i32mem>;
+defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd",
+ int_x86_mmx_punpcklwd,
+ SchedWriteShuffle.MMX,
+ 0, i32mem>;
+defm MMX_PUNPCKLDQ : MMXI_binop_rm_int<0x62, "punpckldq",
+ int_x86_mmx_punpckldq,
+ SchedWriteShuffle.MMX,
+ 0, i32mem>;
+
+// -- Pack Instructions
+defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb,
+ SchedWriteShuffle.MMX>;
+defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw,
+ SchedWriteShuffle.MMX>;
+defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb,
+ SchedWriteShuffle.MMX>;
+
+// -- Shuffle Instructions
+defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b,
+ SchedWriteVarShuffle.MMX>;
+
+let Predicates = [HasMMX, HasSSE1] in {
+def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, u8imm:$src2),
+ "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR64:$dst,
+ (int_x86_sse_pshuf_w VR64:$src1, timm:$src2))]>,
+ Sched<[SchedWriteShuffle.MMX]>;
+def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
+ (outs VR64:$dst), (ins i64mem:$src1, u8imm:$src2),
+ "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR64:$dst,
+ (int_x86_sse_pshuf_w (load_mmx addr:$src1),
+ timm:$src2))]>,
+ Sched<[SchedWriteShuffle.MMX.Folded]>;
+}
+
+// -- Conversion Instructions
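+// CVTPS2PI/CVTTPS2PI convert two packed single-precision values from the low
+// 64 bits of an XMM register or memory to two packed 32-bit integers in an
+// MMX register; CVTPD2PI/CVTTPD2PI do the same for two packed doubles from a
+// full 128-bit source. The CVTT forms truncate instead of using the current
+// rounding mode. CVTPI2PS/CVTPI2PD go the other way, from two packed 32-bit
+// integers to packed single/double precision.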
+defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi,
+ f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}",
+ WriteCvtPS2I, SSEPackedSingle>, PS, SIMD_EXC;
+defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi,
+ f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}",
+ WriteCvtPD2I, SSEPackedDouble>, PD, SIMD_EXC;
+defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttps2pi,
+ f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}",
+ WriteCvtPS2I, SSEPackedSingle>, PS, SIMD_EXC;
+defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi,
+ f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}",
+ WriteCvtPD2I, SSEPackedDouble>, PD, SIMD_EXC;
+defm MMX_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd,
+ i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}",
+ WriteCvtI2PD, SSEPackedDouble>, PD;
+let Constraints = "$src1 = $dst" in {
+ defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128,
+ int_x86_sse_cvtpi2ps,
+ i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
+ SSEPackedSingle>, PS, SIMD_EXC;
+}
+
+// Extract / Insert
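+// PEXTRW extracts the 16-bit word selected by the immediate into a GPR;
+// PINSRW inserts the low word of a GPR (or a 16-bit memory operand) into the
+// word position selected by the immediate.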
+let Predicates = [HasMMX, HasSSE1] in
+def MMX_PEXTRWrr: MMXIi8<0xC5, MRMSrcReg,
+ (outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2),
+ "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1,
+ timm:$src2))]>,
+ Sched<[WriteVecExtract]>;
+let Constraints = "$src1 = $dst" in {
+let Predicates = [HasMMX, HasSSE1] in {
+ def MMX_PINSRWrr : MMXIi8<0xC4, MRMSrcReg,
+ (outs VR64:$dst),
+ (ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
+ GR32orGR64:$src2, timm:$src3))]>,
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
+
+ def MMX_PINSRWrm : MMXIi8<0xC4, MRMSrcMem,
+ (outs VR64:$dst),
+ (ins VR64:$src1, i16mem:$src2, i32u8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
+ (i32 (anyext (loadi16 addr:$src2))),
+ timm:$src3))]>,
+ Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
+}
+}
+
+// Mask creation
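+// PMOVMSKB packs the sign bits of the eight bytes of the MMX register into
+// the low 8 bits of the destination GPR and zeroes the rest.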
+let Predicates = [HasMMX, HasSSE1] in
+def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+ (ins VR64:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}",
+ [(set GR32orGR64:$dst,
+ (int_x86_mmx_pmovmskb VR64:$src))]>,
+ Sched<[WriteMMXMOVMSK]>;
+
+// Misc.
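+// MASKMOVQ stores the bytes of $src whose corresponding mask bytes have the
+// sign bit set to the address in EDI/RDI (implicit), using a non-temporal
+// hint; hence the Uses of EDI/RDI below.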
+let SchedRW = [SchedWriteShuffle.MMX] in {
+let Uses = [EDI], Predicates = [HasMMX, HasSSE1,Not64BitMode] in
+def MMX_MASKMOVQ : MMXI32<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
+ "maskmovq\t{$mask, $src|$src, $mask}",
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)]>;
+let Uses = [RDI], Predicates = [HasMMX, HasSSE1,In64BitMode] in
+def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
+ "maskmovq\t{$mask, $src|$src, $mask}",
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)]>;
+}
+
+// 64-bit bit convert.
+let Predicates = [HasMMX, HasSSE2] in {
+def : Pat<(f64 (bitconvert (x86mmx VR64:$src))),
+ (MMX_MOVQ2FR64rr VR64:$src)>;
+def : Pat<(x86mmx (bitconvert (f64 FR64:$src))),
+ (MMX_MOVFR642Qrr FR64:$src)>;
+def : Pat<(x86mmx (MMX_X86movdq2q
+ (bc_v2i64 (v4i32 (X86cvtp2Int (v4f32 VR128:$src)))))),
+ (MMX_CVTPS2PIirr VR128:$src)>;
+def : Pat<(x86mmx (MMX_X86movdq2q
+ (bc_v2i64 (v4i32 (X86cvttp2si (v4f32 VR128:$src)))))),
+ (MMX_CVTTPS2PIirr VR128:$src)>;
+def : Pat<(x86mmx (MMX_X86movdq2q
+ (bc_v2i64 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
+ (MMX_CVTPD2PIirr VR128:$src)>;
+def : Pat<(x86mmx (MMX_X86movdq2q
+ (bc_v2i64 (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
+ (MMX_CVTTPD2PIirr VR128:$src)>;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrMPX.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrMPX.td
new file mode 100644
index 000000000000..44ba071947c2
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrMPX.td
@@ -0,0 +1,77 @@
+//===-- X86InstrMPX.td - MPX Instruction Set ---------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 MPX instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+// FIXME: Investigate a better scheduler class if MPX is ever used inside LLVM.
+let SchedRW = [WriteSystem] in {
+
+multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> {
+ def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src),
+ OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
+ Requires<[Not64BitMode]>;
+ def 64rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src),
+ OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
+ Requires<[In64BitMode]>;
+}
+
+defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS;
+
+multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> {
+ def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, anymem:$src2),
+ OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
+ Requires<[Not64BitMode]>;
+ def 64rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, anymem:$src2),
+ OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
+ Requires<[In64BitMode]>;
+
+ def 32rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR32:$src2),
+ OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
+ Requires<[Not64BitMode]>;
+ def 64rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR64:$src2),
+ OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
+ Requires<[In64BitMode]>;
+}
+defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS, NotMemoryFoldable;
+defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD, NotMemoryFoldable;
+defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD, NotMemoryFoldable;
+
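+// BNDMOV copies a pair of bounds. The memory form is 64 bits (two 32-bit
+// bounds) outside 64-bit mode and 128 bits (two 64-bit bounds) in 64-bit
+// mode, hence the i64mem/i128mem operands below.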
+def BNDMOVrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ NotMemoryFoldable;
+let mayLoad = 1 in {
+def BNDMOV32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[Not64BitMode]>, NotMemoryFoldable;
+def BNDMOV64rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[In64BitMode]>, NotMemoryFoldable;
+}
+let isCodeGenOnly = 1, ForceDisassemble = 1 in
+def BNDMOVrr_REV : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ NotMemoryFoldable;
+let mayStore = 1 in {
+def BNDMOV32mr : I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[Not64BitMode]>, NotMemoryFoldable;
+def BNDMOV64mr : I<0x1B, MRMDestMem, (outs), (ins i128mem:$dst, BNDR:$src),
+ "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ Requires<[In64BitMode]>, NotMemoryFoldable;
+
+def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins anymem:$dst, BNDR:$src),
+ "bndstx\t{$src, $dst|$dst, $src}", []>, PS;
+}
+let mayLoad = 1 in
+def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src),
+ "bndldx\t{$src, $dst|$dst, $src}", []>, PS;
+} // SchedRW
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSGX.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSGX.td
new file mode 100644
index 000000000000..6439f717accb
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSGX.td
@@ -0,0 +1,29 @@
+//===-- X86InstrSGX.td - SGX Instruction Set Extension -----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel SGX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SGX instructions
+
+let SchedRW = [WriteSystem], Predicates = [HasSGX] in {
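+// For all three instructions the leaf function number is taken from EAX;
+// additional leaf-specific parameters are passed in RBX, RCX and RDX.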
+// ENCLS - Execute an Enclave System Function of Specified Leaf Number
+def ENCLS : I<0x01, MRM_CF, (outs), (ins),
+ "encls", []>, PS;
+
+// ENCLU - Execute an Enclave User Function of Specified Leaf Number
+def ENCLU : I<0x01, MRM_D7, (outs), (ins),
+ "enclu", []>, PS;
+
+// ENCLV - Execute an Enclave VMM Function of Specified Leaf Number
+def ENCLV : I<0x01, MRM_C0, (outs), (ins),
+ "enclv", []>, PS;
+} // SchedRW
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSNP.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSNP.td
new file mode 100644
index 000000000000..de59f3fe2750
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSNP.td
@@ -0,0 +1,47 @@
+//===-- X86InstrSNP.td - SNP Instruction Set Extension -----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the AMD Secure Nested
+// Paging (SNP) instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SNP instructions
+
+let SchedRW = [WriteSystem] in {
+// F3 0F 01 FF
+let Uses = [RAX] in
+def PSMASH: I<0x01, MRM_FF, (outs), (ins), "psmash", []>, XS,
+ Requires<[In64BitMode]>;
+
+// F2 0F 01 FF
+let Uses = [RAX] in
+def PVALIDATE64: I<0x01, MRM_FF, (outs), (ins), "pvalidate",[]>,
+ XD, Requires<[In64BitMode]>;
+
+let Uses = [EAX] in
+def PVALIDATE32: I<0x01, MRM_FF, (outs), (ins), "pvalidate",[]>,
+ XD, Requires<[Not64BitMode]>;
+
+// F2 0F 01 FE
+let Uses = [RAX] in
+def RMPUPDATE: I<0x01, MRM_FE, (outs), (ins), "rmpupdate", []>, XD,
+ Requires<[In64BitMode]>;
+
+// F3 0F 01 FE
+let Uses = [RAX] in
+def RMPADJUST: I<0x01, MRM_FE, (outs), (ins), "rmpadjust", []>, XS,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
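+// Aliases so the assembler also accepts the forms that spell out the
+// implicit %rax/%eax operand.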
+def : InstAlias<"psmash\t{%rax|rax}", (PSMASH)>, Requires<[In64BitMode]>;
+def : InstAlias<"pvalidate\t{%rax|rax}", (PVALIDATE64)>, Requires<[In64BitMode]>;
+def : InstAlias<"pvalidate\t{%eax|eax}", (PVALIDATE32)>, Requires<[Not64BitMode]>;
+def : InstAlias<"rmpupdate\t{%rax|rax}", (RMPUPDATE)>, Requires<[In64BitMode]>;
+def : InstAlias<"rmpadjust\t{%rax|rax}", (RMPADJUST)>, Requires<[In64BitMode]>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td
new file mode 100644
index 000000000000..7cf555748c46
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td
@@ -0,0 +1,7995 @@
+//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 SSE instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 Instructions Classes
+//===----------------------------------------------------------------------===//
+
+/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
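+/// Is2Addr = 1 produces the two-operand SSE assembly string; Is2Addr = 0
+/// produces the three-operand AVX string.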
+multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ RegisterClass RC, X86MemOperand x86memop,
+ Domain d, X86FoldableSchedWrite sched,
+ bit Is2Addr = 1> {
+let isCodeGenOnly = 1 in {
+ let isCommutable = 1 in {
+ def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
+ Sched<[sched]>;
+ }
+ def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+}
+
+/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
+multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode, RegisterClass RC,
+ ValueType VT, string asm, Operand memopr,
+ PatFrags mem_frags, Domain d,
+ X86FoldableSchedWrite sched, bit Is2Addr = 1> {
+let hasSideEffects = 0 in {
+ def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
+ Sched<[sched]>;
+ let mayLoad = 1 in
+ def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (VT (OpNode RC:$src1, (mem_frags addr:$src2))))], d>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+}
+
+/// sse12_fp_packed - SSE 1 & 2 packed instructions class
+multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ RegisterClass RC, ValueType vt,
+ X86MemOperand x86memop, PatFrag mem_frag,
+ Domain d, X86FoldableSchedWrite sched,
+ bit Is2Addr = 1> {
+ let isCommutable = 1 in
+ def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
+ Sched<[sched]>;
+ let mayLoad = 1 in
+ def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
+ d>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
+multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
+ string OpcodeStr, X86MemOperand x86memop,
+ X86FoldableSchedWrite sched,
+ list<dag> pat_rr, list<dag> pat_rm,
+ bit Is2Addr = 1> {
+ let isCommutable = 1, hasSideEffects = 0 in
+ def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ pat_rr, d>,
+ Sched<[sched]>;
+ let hasSideEffects = 0, mayLoad = 1 in
+ def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ pat_rm, d>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+
+// Alias instructions that map fld0 to xorps for SSE or vxorps for AVX.
+// This is expanded by ExpandPostRAPseudos.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, SchedRW = [WriteZero] in {
+ def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
+ [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
+ def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
+ [(set FR64:$dst, fp64imm0)]>, Requires<[HasSSE2, NoAVX512]>;
+ def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+ [(set VR128:$dst, fp128imm0)]>, Requires<[HasSSE1, NoAVX512]>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX & SSE - Zero/One Vectors
+//===----------------------------------------------------------------------===//
+
+// Alias instruction that maps zero vector to pxor / xorp* for SSE.
+// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
+// swizzled by ExecutionDomainFix to pxor.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-zeros value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
+def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+ [(set VR128:$dst, (v4f32 immAllZerosV))]>;
+}
+
+let Predicates = [NoAVX512] in {
+def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
+def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
+def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
+def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
+def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
+}
+
+
+// The same as above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
+// and doesn't need it because on Sandy Bridge the register is set to zero
+// at the rename stage without using any execution unit, so SET0PSY
+// and SET0PDY can be used for vector int instructions without penalty.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
+def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+ [(set VR256:$dst, (v8i32 immAllZerosV))]>;
+}
+
+let Predicates = [NoAVX512] in {
+def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
+def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
+def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
+def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
+def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
+}
+
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-ones value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, SchedRW = [WriteZero] in {
+ def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+ [(set VR128:$dst, (v4i32 immAllOnesV))]>;
+ let Predicates = [HasAVX1Only, OptForMinSize] in {
+ def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
+ [(set VR256:$dst, (v8i32 immAllOnesV))]>;
+ }
+ let Predicates = [HasAVX2] in
+ def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+ [(set VR256:$dst, (v8i32 immAllOnesV))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move FP Scalar Instructions
+//
+// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
+// register copies because it's a partial register update; register-to-register
+// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
+// that the insert be implementable in terms of a copy, and, as just mentioned,
+// we don't use movss/movsd for copies.
+//===----------------------------------------------------------------------===//
+
+multiclass sse12_move_rr<SDNode OpNode, ValueType vt,
+ X86MemOperand x86memop, string base_opc,
+ string asm_opr, Domain d, string Name> {
+ let isCommutable = 1 in
+ def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(base_opc, asm_opr),
+ [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
+ Sched<[SchedWriteFShuffle.XMM]>;
+
+ // For the disassembler
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(base_opc, asm_opr), []>,
+ Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
+}
+
+multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
+ X86MemOperand x86memop, string OpcodeStr,
+ Domain d, string Name, Predicate pred> {
+ // AVX
+ let Predicates = [UseAVX, OptForSize] in
+ defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
+ "V"#Name>,
+ VEX_4V, VEX_LIG, VEX_WIG;
+
+ def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(store RC:$src, addr:$dst)], d>,
+ VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;
+ // SSE1 & 2
+ let Constraints = "$src1 = $dst" in {
+ let Predicates = [pred, NoSSE41_Or_OptForSize] in
+ defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
+ "\t{$src2, $dst|$dst, $src2}", d, Name>;
+ }
+
+ def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(store RC:$src, addr:$dst)], d>,
+ Sched<[WriteFStore]>;
+
+ def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (!cast<Instruction>("V"#NAME#"rr_REV")
+ VR128:$dst, VR128:$src1, VR128:$src2), 0>;
+ def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
+ (!cast<Instruction>(NAME#"rr_REV")
+ VR128:$dst, VR128:$src2), 0>;
+}
+
+// Loading from memory automatically zeroes the upper bits.
+multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
+ PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
+ Domain d> {
+ def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
+ VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
+ def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
+ Sched<[WriteFLoad]>;
+
+ // _alt version uses FR32/FR64 register class.
+ let isCodeGenOnly = 1 in {
+ def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (mem_pat addr:$src))], d>,
+ VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
+ def NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (mem_pat addr:$src))], d>,
+ Sched<[WriteFLoad]>;
+ }
+}
+
+defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
+ SSEPackedSingle, "MOVSS", UseSSE1>, XS;
+defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
+ SSEPackedDouble, "MOVSD", UseSSE2>, XD;
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+ defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
+ SSEPackedSingle>, XS;
+ defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
+ SSEPackedDouble>, XD;
+}
+
+// Patterns
+let Predicates = [UseAVX] in {
+ def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (VMOVSSrm addr:$src)>;
+ def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ (VMOVSDrm addr:$src)>;
+
+  // Represent the same patterns as above, but in the form they appear for
+  // 256-bit types.
+ def : Pat<(v8f32 (X86vzload32 addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
+ def : Pat<(v4f64 (X86vzload64 addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
+}
+
+let Predicates = [UseAVX, OptForSize] in {
+  // Move scalar to XMM zero-extended: zero a VR128, then do a
+  // MOVSS to the lower bits.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+ (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+ (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
+
+ // Move low f32 and clear high bits.
+ def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
+ (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
+ def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
+ (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
+}
+
+let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
+// Move scalar to XMM zero-extended: zero a VR128, then do a
+// MOVSS to the lower bits.
+def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+ (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
+def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+ (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
+}
+
+let Predicates = [UseSSE2] in
+def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ (MOVSDrm addr:$src)>;
+
+let Predicates = [UseSSE1] in
+def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (MOVSSrm addr:$src)>;
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ string asm, Domain d,
+ X86SchedWriteMoveLS sched> {
+let hasSideEffects = 0, isMoveReg = 1 in
+ def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
+ Sched<[sched.RR]>;
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+ def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (ld_frag addr:$src))], d>,
+ Sched<[sched.RM]>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
+ SSEPackedSingle, SchedWriteFMoveLS.XMM>,
+ PS, VEX, VEX_WIG;
+defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
+ SSEPackedDouble, SchedWriteFMoveLS.XMM>,
+ PD, VEX, VEX_WIG;
+defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
+ SSEPackedSingle, SchedWriteFMoveLS.XMM>,
+ PS, VEX, VEX_WIG;
+defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
+ SSEPackedDouble, SchedWriteFMoveLS.XMM>,
+ PD, VEX, VEX_WIG;
+
+defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
+ SSEPackedSingle, SchedWriteFMoveLS.YMM>,
+ PS, VEX, VEX_L, VEX_WIG;
+defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
+ SSEPackedDouble, SchedWriteFMoveLS.YMM>,
+ PD, VEX, VEX_L, VEX_WIG;
+defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
+ SSEPackedSingle, SchedWriteFMoveLS.YMM>,
+ PS, VEX, VEX_L, VEX_WIG;
+defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
+ SSEPackedDouble, SchedWriteFMoveLS.YMM>,
+ PD, VEX, VEX_L, VEX_WIG;
+}
+
+let Predicates = [UseSSE1] in {
+defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
+ SSEPackedSingle, SchedWriteFMoveLS.XMM>,
+ PS;
+defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
+ SSEPackedSingle, SchedWriteFMoveLS.XMM>,
+ PS;
+}
+let Predicates = [UseSSE2] in {
+defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
+ SSEPackedDouble, SchedWriteFMoveLS.XMM>,
+ PD;
+defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
+ SSEPackedDouble, SchedWriteFMoveLS.XMM>,
+ PD;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
+def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
+ VEX, VEX_WIG;
+def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
+ VEX, VEX_WIG;
+def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(store (v4f32 VR128:$src), addr:$dst)]>,
+ VEX, VEX_WIG;
+def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(store (v2f64 VR128:$src), addr:$dst)]>,
+ VEX, VEX_WIG;
+} // SchedRW
+
+let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
+def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
+ VEX, VEX_L, VEX_WIG;
+def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
+ VEX, VEX_L, VEX_WIG;
+def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(store (v8f32 VR256:$src), addr:$dst)]>,
+ VEX, VEX_L, VEX_WIG;
+def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(store (v4f64 VR256:$src), addr:$dst)]>,
+ VEX, VEX_L, VEX_WIG;
+} // SchedRW
+} // Predicate
+
+// For disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+ isMoveReg = 1 in {
+let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
+ def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
+ def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
+ def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
+ def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
+} // SchedRW
+
+let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
+ def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ "movaps\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
+ def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ "movapd\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
+ def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ "movups\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
+ def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ "movupd\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
+} // SchedRW
+} // Predicate
+
+// Reversed version with ".s" suffix for GAS compatibility.
+def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
+ (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
+ (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
+ (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
+ (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
+ (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
+def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
+ (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
+def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
+ (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
+def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
+ (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;
+
+let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
+def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
+def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
+def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(store (v4f32 VR128:$src), addr:$dst)]>;
+def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(store (v2f64 VR128:$src), addr:$dst)]>;
+} // SchedRW
+
+// For disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+ isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
+ def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}", []>,
+ FoldGenData<"MOVAPSrr">;
+ def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}", []>,
+ FoldGenData<"MOVAPDrr">;
+ def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}", []>,
+ FoldGenData<"MOVUPSrr">;
+ def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}", []>,
+ FoldGenData<"MOVUPDrr">;
+}
+
+// Reversed version with ".s" suffix for GAS compatibility.
+def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
+ (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
+ (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
+ (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
+ (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
+
+let Predicates = [HasAVX, NoVLX] in {
+  // 256-bit loads/stores need to use floating-point load/store in case we
+  // don't have AVX2. Execution domain fixing will convert to integer if AVX2
+  // is available and changing the domain is beneficial.
+ def : Pat<(alignedloadv4i64 addr:$src),
+ (VMOVAPSYrm addr:$src)>;
+ def : Pat<(alignedloadv8i32 addr:$src),
+ (VMOVAPSYrm addr:$src)>;
+ def : Pat<(alignedloadv16i16 addr:$src),
+ (VMOVAPSYrm addr:$src)>;
+ def : Pat<(alignedloadv32i8 addr:$src),
+ (VMOVAPSYrm addr:$src)>;
+ def : Pat<(loadv4i64 addr:$src),
+ (VMOVUPSYrm addr:$src)>;
+ def : Pat<(loadv8i32 addr:$src),
+ (VMOVUPSYrm addr:$src)>;
+ def : Pat<(loadv16i16 addr:$src),
+ (VMOVUPSYrm addr:$src)>;
+ def : Pat<(loadv32i8 addr:$src),
+ (VMOVUPSYrm addr:$src)>;
+
+ def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v4i64 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v8i32 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v16i16 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v32i8 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+}
+
+// Use movaps / movups for SSE integer load / store (one byte shorter).
+// The instructions selected below are then converted to MOVDQA/MOVDQU
+// during the SSE domain pass.
+let Predicates = [UseSSE1] in {
+ def : Pat<(alignedloadv2i64 addr:$src),
+ (MOVAPSrm addr:$src)>;
+ def : Pat<(alignedloadv4i32 addr:$src),
+ (MOVAPSrm addr:$src)>;
+ def : Pat<(alignedloadv8i16 addr:$src),
+ (MOVAPSrm addr:$src)>;
+ def : Pat<(alignedloadv16i8 addr:$src),
+ (MOVAPSrm addr:$src)>;
+ def : Pat<(loadv2i64 addr:$src),
+ (MOVUPSrm addr:$src)>;
+ def : Pat<(loadv4i32 addr:$src),
+ (MOVUPSrm addr:$src)>;
+ def : Pat<(loadv8i16 addr:$src),
+ (MOVUPSrm addr:$src)>;
+ def : Pat<(loadv16i8 addr:$src),
+ (MOVUPSrm addr:$src)>;
+
+ def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v2i64 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse12_mov_hilo_packed_base<bits<8> opc, SDNode pdnode,
+ string base_opc, string asm_opr> {
+  // No pattern, as they need to be special-cased between high and low.
+ let hasSideEffects = 0, mayLoad = 1 in
+ def PSrm : PI<opc, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ !strconcat(base_opc, "s", asm_opr),
+ [], SSEPackedSingle>, PS,
+ Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
+
+ def PDrm : PI<opc, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ !strconcat(base_opc, "d", asm_opr),
+ [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))))],
+ SSEPackedDouble>, PD,
+ Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
+}
+
+multiclass sse12_mov_hilo_packed<bits<8> opc, SDPatternOperator pdnode,
+ string base_opc> {
+ let Predicates = [UseAVX] in
+ defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
+ VEX_4V, VEX_WIG;
+
+ let Constraints = "$src1 = $dst" in
+ defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
+ "\t{$src2, $dst|$dst, $src2}">;
+}
+
+defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;
+
+let SchedRW = [WriteFStore] in {
+let Predicates = [UseAVX] in {
+let mayStore = 1, hasSideEffects = 0 in
+def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movlps\t{$src, $dst|$dst, $src}",
+ []>,
+ VEX, VEX_WIG;
+def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movlpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt (v2f64 VR128:$src),
+ (iPTR 0))), addr:$dst)]>,
+ VEX, VEX_WIG;
+}// UseAVX
+let mayStore = 1, hasSideEffects = 0 in
+def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movlps\t{$src, $dst|$dst, $src}",
+ []>;
+def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movlpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt (v2f64 VR128:$src),
+ (iPTR 0))), addr:$dst)]>;
+} // SchedRW
+
+let Predicates = [UseSSE1] in {
+  // This pattern helps select MOVLPS on SSE1-only targets. With SSE2 we'll
+  // end up with a movsd or blend instead of shufp.
+  // No need for an aligned load; we're only loading 64 bits.
+ def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)), VR128:$src1,
+ (i8 -28)),
+ (MOVLPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
+ (MOVLPSrm VR128:$src1, addr:$src2)>;
+
+ def : Pat<(v4f32 (X86vzload64 addr:$src)),
+ (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
+ def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
+ (MOVLPSmr addr:$dst, VR128:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Hi packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;
+
+let SchedRW = [WriteFStore] in {
+// v2f64 extract element 1 is always custom lowered to unpack high to low
+// and extract element 0, so the non-store version isn't too horrible.
+let Predicates = [UseAVX] in {
+let mayStore = 1, hasSideEffects = 0 in
+def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movhps\t{$src, $dst|$dst, $src}",
+ []>, VEX, VEX_WIG;
+def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movhpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt
+ (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
+ (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
+} // UseAVX
+let mayStore = 1, hasSideEffects = 0 in
+def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movhps\t{$src, $dst|$dst, $src}",
+ []>;
+def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movhpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (extractelt
+ (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
+ (iPTR 0))), addr:$dst)]>;
+} // SchedRW
+
+let Predicates = [UseAVX] in {
+ // MOVHPD patterns
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
+ (VMOVHPDrm VR128:$src1, addr:$src2)>;
+
+ def : Pat<(store (f64 (extractelt
+ (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
+ (iPTR 0))), addr:$dst),
+ (VMOVHPDmr addr:$dst, VR128:$src)>;
+
+ // MOVLPD patterns
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
+ (VMOVLPDrm VR128:$src1, addr:$src2)>;
+}
+
+let Predicates = [UseSSE1] in {
+  // This pattern helps select MOVHPS on SSE1-only targets. With SSE2 we'll
+  // end up with a movsd or blend instead of shufp.
+  // No need for an aligned load; we're only loading 64 bits.
+ def : Pat<(X86Movlhps VR128:$src1, (v4f32 (simple_load addr:$src2))),
+ (MOVHPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
+ (MOVHPSrm VR128:$src1, addr:$src2)>;
+
+ def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
+ addr:$dst),
+ (MOVHPSmr addr:$dst, VR128:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+ // MOVHPD patterns
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
+ (MOVHPDrm VR128:$src1, addr:$src2)>;
+
+ def : Pat<(store (f64 (extractelt
+ (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
+ (iPTR 0))), addr:$dst),
+ (MOVHPDmr addr:$dst, VR128:$src)>;
+
+ // MOVLPD patterns
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>;
+}
+
+let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
+ // Use MOVLPD to load into the low bits from a full vector unless we can use
+ // BLENDPD.
+ def : Pat<(X86Movsd VR128:$src1, (v2f64 (simple_load addr:$src2))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [UseAVX] in {
+ def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
+ VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
+ let isCommutable = 1 in
+ def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
+ VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
+ NotMemoryFoldable;
+}
+let Constraints = "$src1 = $dst" in {
+ def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ "movlhps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
+ Sched<[SchedWriteFShuffle.XMM]>;
+ let isCommutable = 1 in
+ def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ "movhlps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
+ Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Conversion Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+ SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
+ string asm, string mem, X86FoldableSchedWrite sched,
+ Domain d,
+ SchedRead Int2Fpu = ReadDefault> {
+ let ExeDomain = d in {
+ def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
+ Sched<[sched, Int2Fpu]>;
+ def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
+ mem#"\t{$src, $dst|$dst, $src}",
+ [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
+ Sched<[sched.Folded]>;
+ }
+}
+
+multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
+ ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
+ string asm, Domain d, X86FoldableSchedWrite sched> {
+let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
+ def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
+ [(set RC:$dst, (DstTy (any_sint_to_fp (SrcTy RC:$src))))], d>,
+ Sched<[sched]>;
+ let mayLoad = 1 in
+ def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
+ [(set RC:$dst, (DstTy (any_sint_to_fp
+ (SrcTy (ld_frag addr:$src)))))], d>,
+ Sched<[sched.Folded]>;
+}
+}
+
+multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+ X86MemOperand x86memop, string asm, string mem,
+ X86FoldableSchedWrite sched, Domain d> {
+let hasSideEffects = 0, Predicates = [UseAVX], ExeDomain = d in {
+ def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ Sched<[sched, ReadDefault, ReadInt2Fpu]>;
+ let mayLoad = 1 in
+ def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
+ (ins DstRC:$src1, x86memop:$src),
+ asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+} // hasSideEffects = 0
+}
+
+let isCodeGenOnly = 1, Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
+defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
+ "cvttss2si", "cvttss2si",
+ WriteCvtSS2I, SSEPackedSingle>,
+ XS, VEX, VEX_LIG;
+defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
+ "cvttss2si", "cvttss2si",
+ WriteCvtSS2I, SSEPackedSingle>,
+ XS, VEX, VEX_W, VEX_LIG;
+defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
+ "cvttsd2si", "cvttsd2si",
+ WriteCvtSD2I, SSEPackedDouble>,
+ XD, VEX, VEX_LIG;
+defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
+ "cvttsd2si", "cvttsd2si",
+ WriteCvtSD2I, SSEPackedDouble>,
+ XD, VEX, VEX_W, VEX_LIG;
+
+defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
+ "cvtss2si", "cvtss2si",
+ WriteCvtSS2I, SSEPackedSingle>,
+ XS, VEX, VEX_LIG;
+defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
+ "cvtss2si", "cvtss2si",
+ WriteCvtSS2I, SSEPackedSingle>,
+ XS, VEX, VEX_W, VEX_LIG;
+defm VCVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
+ "cvtsd2si", "cvtsd2si",
+ WriteCvtSD2I, SSEPackedDouble>,
+ XD, VEX, VEX_LIG;
+defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
+ "cvtsd2si", "cvtsd2si",
+ WriteCvtSD2I, SSEPackedDouble>,
+ XD, VEX, VEX_W, VEX_LIG;
+}
+
+// The assembler can recognize rr 64-bit instructions by seeing an rxx
+// register, but the same isn't true when only using memory operands, so we
+// provide other assembly "l" and "q" forms to address this explicitly
+// where appropriate to do so.
+let isCodeGenOnly = 1 in {
+defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
+ WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
+ VEX_LIG, SIMD_EXC;
+defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
+ WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
+ VEX_W, VEX_LIG, SIMD_EXC;
+defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
+ WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
+ VEX_LIG;
+defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
+ WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
+ VEX_W, VEX_LIG, SIMD_EXC;
+} // isCodeGenOnly = 1
+
+let Predicates = [UseAVX] in {
+ def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))),
+ (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+ def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))),
+ (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+ def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))),
+ (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+ def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))),
+ (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+
+ def : Pat<(f32 (any_sint_to_fp GR32:$src)),
+ (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
+ def : Pat<(f32 (any_sint_to_fp GR64:$src)),
+ (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+ def : Pat<(f64 (any_sint_to_fp GR32:$src)),
+ (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
+ def : Pat<(f64 (any_sint_to_fp GR64:$src)),
+ (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+
+ def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64rr FR32:$src)>;
+ def : Pat<(i64 (lrint (loadf32 addr:$src))), (VCVTSS2SI64rm addr:$src)>;
+
+ def : Pat<(i64 (lrint FR64:$src)), (VCVTSD2SI64rr FR64:$src)>;
+ def : Pat<(i64 (lrint (loadf64 addr:$src))), (VCVTSD2SI64rm addr:$src)>;
+}
+
+let isCodeGenOnly = 1 in {
+defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
+ "cvttss2si", "cvttss2si",
+ WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
+defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
+ "cvttss2si", "cvttss2si",
+ WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
+defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
+ "cvttsd2si", "cvttsd2si",
+ WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
+defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
+ "cvttsd2si", "cvttsd2si",
+ WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;
+
+defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
+ "cvtss2si", "cvtss2si",
+ WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
+defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
+ "cvtss2si", "cvtss2si",
+ WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
+defm CVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
+ "cvtsd2si", "cvtsd2si",
+ WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
+defm CVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
+ "cvtsd2si", "cvtsd2si",
+ WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;
+
+defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32,
+ "cvtsi2ss", "cvtsi2ss{l}",
+ WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, SIMD_EXC;
+defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp, i64mem, loadi64,
+ "cvtsi2ss", "cvtsi2ss{q}",
+ WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, REX_W, SIMD_EXC;
+defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp, i32mem, loadi32,
+ "cvtsi2sd", "cvtsi2sd{l}",
+ WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD;
+defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64,
+ "cvtsi2sd", "cvtsi2sd{q}",
+ WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD, REX_W, SIMD_EXC;
+} // isCodeGenOnly = 1
+
+let Predicates = [UseSSE1] in {
+ def : Pat<(i64 (lrint FR32:$src)), (CVTSS2SI64rr FR32:$src)>;
+ def : Pat<(i64 (lrint (loadf32 addr:$src))), (CVTSS2SI64rm addr:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+ def : Pat<(i64 (lrint FR64:$src)), (CVTSD2SI64rr FR64:$src)>;
+ def : Pat<(i64 (lrint (loadf64 addr:$src))), (CVTSD2SI64rm addr:$src)>;
+}
+
+// Conversion Instruction Intrinsics - Match intrinsics which expect MM
+// and/or XMM operand(s).
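+// For example, int _mm_cvtsd_si32(__m128d) takes the whole vector as its
+// source, so the *_Int forms below keep a VR128 operand rather than FR64.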
+
+multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+ ValueType DstVT, ValueType SrcVT, SDNode OpNode,
+ Operand memop, PatFrags mem_frags, string asm,
+ X86FoldableSchedWrite sched, Domain d> {
+let ExeDomain = d in {
+ def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
+ Sched<[sched]>;
+ def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst, (DstVT (OpNode (SrcVT (mem_frags addr:$src)))))]>,
+ Sched<[sched.Folded]>;
+}
+}
+
+multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
+ RegisterClass DstRC, X86MemOperand x86memop,
+ string asm, string mem, X86FoldableSchedWrite sched,
+ Domain d, bit Is2Addr = 1> {
+let hasSideEffects = 0, ExeDomain = d in {
+ def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
+ let mayLoad = 1 in
+ def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
+ (ins DstRC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ asm#"{"#mem#"}\t{$src2, $dst|$dst, $src2}",
+ asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+}
+
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
+let Predicates = [UseAVX] in {
+defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
+ X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
+ WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
+defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
+ X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
+ WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_W, VEX_LIG;
+}
+defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
+ sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
+ SSEPackedDouble>, XD;
+defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
+ sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
+ SSEPackedDouble>, XD, REX_W;
+}
+
+let Predicates = [UseAVX] in {
+defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle, 0>,
+ XS, VEX_4V, VEX_LIG, SIMD_EXC;
+defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle, 0>,
+ XS, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
+defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble, 0>,
+ XD, VEX_4V, VEX_LIG;
+defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble, 0>,
+ XD, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
+}
+let Constraints = "$src1 = $dst" in {
+ defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle>,
+ XS, SIMD_EXC;
+ defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle>,
+ XS, REX_W, SIMD_EXC;
+ defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble>,
+ XD;
+ defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble>,
+ XD, REX_W, SIMD_EXC;
+}
+
+def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
+def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
+def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
+def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
+
+def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
+def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
+
+def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
+ (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">;
+def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
+ (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">;
+def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
+ (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">;
+def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
+ (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">;
+
+def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
+ (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">;
+def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
+ (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">;
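+// With a memory operand and no size suffix, the aliases above resolve to the
+// 32-bit (i32mem) source forms.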
+
+/// SSE 1 Only
+
+// Aliases for intrinsics
+let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
+defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
+ ssmem, sse_load_f32, "cvttss2si",
+ WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
+defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
+ X86cvtts2Int, ssmem, sse_load_f32,
+ "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
+ XS, VEX, VEX_LIG, VEX_W;
+defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
+ sdmem, sse_load_f64, "cvttsd2si",
+ WriteCvtSS2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
+defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
+ X86cvtts2Int, sdmem, sse_load_f64,
+ "cvttsd2si", WriteCvtSS2I, SSEPackedDouble>,
+ XD, VEX, VEX_LIG, VEX_W;
+}
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
+defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
+ ssmem, sse_load_f32, "cvttss2si",
+ WriteCvtSS2I, SSEPackedSingle>, XS;
+defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
+ X86cvtts2Int, ssmem, sse_load_f32,
+ "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
+ XS, REX_W;
+defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
+ sdmem, sse_load_f64, "cvttsd2si",
+ WriteCvtSD2I, SSEPackedDouble>, XD;
+defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
+ X86cvtts2Int, sdmem, sse_load_f64,
+ "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
+ XD, REX_W;
+}
+
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
+
+def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
+def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
+def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
+def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
+
+let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
+defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
+ ssmem, sse_load_f32, "cvtss2si",
+ WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
+defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
+ ssmem, sse_load_f32, "cvtss2si",
+ WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_W, VEX_LIG;
+}
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
+defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
+ ssmem, sse_load_f32, "cvtss2si",
+ WriteCvtSS2I, SSEPackedSingle>, XS;
+defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
+ ssmem, sse_load_f32, "cvtss2si",
+ WriteCvtSS2I, SSEPackedSingle>, XS, REX_W;
+
+defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
+ "vcvtdq2ps\t{$src, $dst|$dst, $src}",
+ SSEPackedSingle, WriteCvtI2PS>,
+ PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
+defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
+ "vcvtdq2ps\t{$src, $dst|$dst, $src}",
+ SSEPackedSingle, WriteCvtI2PSY>,
+ PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;
+
+defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
+ "cvtdq2ps\t{$src, $dst|$dst, $src}",
+ SSEPackedSingle, WriteCvtI2PS>,
+ PS, Requires<[UseSSE2]>;
+}
+
+// AVX aliases
+def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
+def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
+def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
+def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
+
+// SSE aliases
+def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
+def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
+def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
+def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
+
+/// SSE 2 Only
+
+// Convert scalar double to scalar single
+let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX],
+ ExeDomain = SSEPackedSingle in {
+def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
+ (ins FR32:$src1, FR64:$src2),
+ "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ VEX_4V, VEX_LIG, VEX_WIG,
+ Sched<[WriteCvtSD2SS]>, SIMD_EXC;
+let mayLoad = 1 in
+def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
+ (ins FR32:$src1, f64mem:$src2),
+ "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ XD, VEX_4V, VEX_LIG, VEX_WIG,
+ Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
+}
+
+def : Pat<(f32 (any_fpround FR64:$src)),
+ (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
+ Requires<[UseAVX]>;
+
+let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
+def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
+ "cvtsd2ss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (any_fpround FR64:$src))]>,
+ Sched<[WriteCvtSD2SS]>, SIMD_EXC;
+def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
+ "cvtsd2ss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (any_fpround (loadf64 addr:$src)))]>,
+ XD, Requires<[UseSSE2, OptForSize]>,
+ Sched<[WriteCvtSD2SS.Folded]>, SIMD_EXC;
+}
+
+let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = SSEPackedSingle in {
+def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
+ XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
+ Sched<[WriteCvtSD2SS]>;
+def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
+ "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
+ XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
+ Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
+let Constraints = "$src1 = $dst" in {
+def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
+ XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
+def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
+ "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
+ XD, Requires<[UseSSE2]>,
+ Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
+}
+}
+
+// Convert scalar single to scalar double
+// SSE2 instructions with XS prefix
+let isCodeGenOnly = 1, hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
+def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
+ (ins FR64:$src1, FR32:$src2),
+ "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ XS, VEX_4V, VEX_LIG, VEX_WIG,
+ Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC;
+let mayLoad = 1 in
+def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
+ (ins FR64:$src1, f32mem:$src2),
+ "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ XS, VEX_4V, VEX_LIG, VEX_WIG,
+ Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
+ Requires<[UseAVX, OptForSize]>, SIMD_EXC;
+} // isCodeGenOnly = 1, hasSideEffects = 0
+
+def : Pat<(f64 (any_fpextend FR32:$src)),
+ (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
+def : Pat<(any_fpextend (loadf32 addr:$src)),
+ (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;
+
+let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
+def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
+ "cvtss2sd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (any_fpextend FR32:$src))]>,
+ XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC;
+def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
+ "cvtss2sd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (any_fpextend (loadf32 addr:$src)))]>,
+ XS, Requires<[UseSSE2, OptForSize]>,
+ Sched<[WriteCvtSS2SD.Folded]>, SIMD_EXC;
+} // isCodeGenOnly = 1
+
+let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1,
+ ExeDomain = SSEPackedSingle in {
+def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, XS, VEX_4V, VEX_LIG, VEX_WIG,
+ Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
+let mayLoad = 1 in
+def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
+ "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>,
+ Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
+let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
+def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "cvtss2sd\t{$src2, $dst|$dst, $src2}",
+ []>, XS, Requires<[UseSSE2]>,
+ Sched<[WriteCvtSS2SD]>;
+let mayLoad = 1 in
+def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
+ "cvtss2sd\t{$src2, $dst|$dst, $src2}",
+ []>, XS, Requires<[UseSSE2]>,
+ Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
+}
+} // hasSideEffects = 0
+
+// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
+// (v)cvtss2sd intrinsic sequences from clang, which would otherwise produce
+// unnecessary vmovs{s,d} instructions.
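+// For example, _mm_cvtsi32_ss(a, b) typically reaches instruction selection
+// shaped roughly as
+//   (X86Movss a, (scalar_to_vector (sint_to_fp b)))
+// and the patterns below select a single (V)CVTSI2SSrr_Int for it instead of
+// a cvtsi2ss into a temporary followed by a movss.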
+let Predicates = [UseAVX] in {
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector
+ (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
+ (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector
+ (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
+ (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
+ (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
+ (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
+ (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
+ (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
+ (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
+ (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
+ (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
+ (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
+} // Predicates = [UseAVX]
+
+let Predicates = [UseSSE2] in {
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector
+ (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
+ (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector
+ (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
+ (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
+ (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
+ (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
+ (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
+ (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
+} // Predicates = [UseSSE2]
+
+let Predicates = [UseSSE1] in {
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
+ (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
+ (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
+ (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
+ (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
+} // Predicates = [UseSSE1]
+
+let Predicates = [HasAVX, NoVLX] in {
+// Convert packed single/double fp to doubleword
+def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
+ VEX, Sched<[WriteCvtPS2I]>, VEX_WIG, SIMD_EXC;
+def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
+ VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG, SIMD_EXC;
+def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG, SIMD_EXC;
+def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG, SIMD_EXC;
+}
+def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
+ Sched<[WriteCvtPS2I]>, SIMD_EXC;
+def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
+ Sched<[WriteCvtPS2ILd]>, SIMD_EXC;
+
+
+// Convert Packed Double FP to Packed DW Integers
+let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
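+// For example, in AT&T syntax (operands purely illustrative):
+//   vcvtpd2dqx (%rax), %xmm0   (128-bit memory source)
+//   vcvtpd2dqy (%rax), %xmm0   (256-bit memory source)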
+def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "vcvtpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
+ VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
+
+// XMM only
+def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
+ Sched<[WriteCvtPD2ILd]>, VEX_WIG;
+
+// YMM only
+def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+ "vcvtpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
+def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+ "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
+}
+
+def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
+ (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
+ (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
+
+def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
+ Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
+def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
+ Sched<[WriteCvtPD2I]>, SIMD_EXC;
+
+// Convert with truncation packed single/double fp to doubleword
+// SSE2 packed instructions with XS prefix
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
+let Predicates = [HasAVX, NoVLX] in {
+def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
+ VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
+def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86any_cvttp2si (loadv4f32 addr:$src))))]>,
+ VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
+def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (v8i32 (X86any_cvttp2si (v8f32 VR256:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
+def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (v8i32 (X86any_cvttp2si (loadv8f32 addr:$src))))]>,
+ VEX, VEX_L,
+ Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
+}
+
+def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
+ Sched<[WriteCvtPS2I]>;
+def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86any_cvttp2si (memopv4f32 addr:$src))))]>,
+ Sched<[WriteCvtPS2ILd]>;
+}
+
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
+let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
+// XMM only
+def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
+ VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
+def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))))]>,
+ VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;
+
+// YMM only
+def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86any_cvttp2si (v4f64 VR256:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
+def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+ "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86any_cvttp2si (loadv4f64 addr:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
+} // Predicates = [HasAVX, NoVLX]
+
+def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
+ (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
+ (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
+
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v4i32 (any_fp_to_sint (v4f64 VR256:$src))),
+ (VCVTTPD2DQYrr VR256:$src)>;
+ def : Pat<(v4i32 (any_fp_to_sint (loadv4f64 addr:$src))),
+ (VCVTTPD2DQYrm addr:$src)>;
+}
+
+def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
+ Sched<[WriteCvtPD2I]>, SIMD_EXC;
+def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86any_cvttp2si (memopv2f64 addr:$src))))]>,
+ Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
+
+// Convert packed single to packed double
+let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
+ // SSE2 instructions without OpSize prefix
+def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "vcvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
+ PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
+def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ "vcvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
+ PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
+def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+ "vcvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>,
+ PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
+def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
+ "vcvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
+ PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
+}
+
+let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in {
+def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
+ PS, Sched<[WriteCvtPS2PD]>;
+def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ "cvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
+ PS, Sched<[WriteCvtPS2PD.Folded]>;
+}
+
+// Convert Packed DW Integers to Packed Double FP
+let Predicates = [HasAVX, NoVLX] in {
+let hasSideEffects = 0, mayLoad = 1 in
+def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (X86any_VSintToFP
+ (bc_v4i32
+ (v2i64 (scalar_to_vector
+ (loadi64 addr:$src)))))))]>,
+ VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
+def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
+ VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
+def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (v4f64 (any_sint_to_fp (loadv4i32 addr:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
+ VEX_WIG;
+def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (v4f64 (any_sint_to_fp (v4i32 VR128:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
+}
+
+let hasSideEffects = 0, mayLoad = 1 in
+def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "cvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (X86any_VSintToFP
+ (bc_v4i32
+ (v2i64 (scalar_to_vector
+ (loadi64 addr:$src)))))))]>,
+ Sched<[WriteCvtI2PDLd]>;
+def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
+ Sched<[WriteCvtI2PD]>;
+
+// AVX patterns: fold a zero-extending 64-bit load into vcvtdq2pd
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
+ (VCVTDQ2PDrm addr:$src)>;
+} // Predicates = [HasAVX, NoVLX]
+
+// SSE2 patterns: fold a zero-extending 64-bit load into cvtdq2pd
+let Predicates = [UseSSE2] in {
+ def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
+ (CVTDQ2PDrm addr:$src)>;
+} // Predicates = [UseSSE2]
+
+// Convert packed double to packed single
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
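+// Both the 128-bit and 256-bit sources produce an XMM result, so with a
+// memory operand only the x/y suffix (vcvtpd2psx vs. vcvtpd2psy in AT&T
+// syntax) tells the assembler which source width is meant.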
+let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
+// XMM only
+def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>,
+ VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
+def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (X86any_vfpround (loadv2f64 addr:$src)))]>,
+ VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;
+
+def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (X86any_vfpround VR256:$src))]>,
+ VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
+def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+ "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (X86any_vfpround (loadv4f64 addr:$src)))]>,
+ VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
+} // Predicates = [HasAVX, NoVLX]
+
+def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
+ (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
+ (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;
+
+def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>,
+ Sched<[WriteCvtPD2PS]>, SIMD_EXC;
+def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (X86any_vfpround (memopv2f64 addr:$src)))]>,
+ Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC;
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Compare Instructions
+//===----------------------------------------------------------------------===//
+
+// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
+multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
+ Operand memop, SDNode OpNode, ValueType VT,
+ PatFrag ld_frag, string asm,
+ X86FoldableSchedWrite sched,
+ PatFrags mem_frags> {
+ def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$cc), asm,
+ [(set VR128:$dst, (OpNode (VT VR128:$src1),
+ VR128:$src2, timm:$cc))]>,
+ Sched<[sched]>, SIMD_EXC;
+ let mayLoad = 1 in
+ def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, memop:$src2, u8imm:$cc), asm,
+ [(set VR128:$dst, (OpNode (VT VR128:$src1),
+ (mem_frags addr:$src2), timm:$cc))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+
+ let isCodeGenOnly = 1 in {
+ let isCommutable = 1 in
+ def rr : SIi8<0xC2, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
+ [(set RC:$dst, (OpNode RC:$src1, RC:$src2, timm:$cc))]>,
+ Sched<[sched]>, SIMD_EXC;
+ def rm : SIi8<0xC2, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
+ [(set RC:$dst, (OpNode RC:$src1,
+ (ld_frag addr:$src2), timm:$cc))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+ }
+}
+
+let ExeDomain = SSEPackedSingle in
+defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
+ "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
+ XS, VEX_4V, VEX_LIG, VEX_WIG;
+let ExeDomain = SSEPackedDouble in
+defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
+ "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
+ XD, VEX_4V, VEX_LIG, VEX_WIG;
+
+let Constraints = "$src1 = $dst" in {
+ let ExeDomain = SSEPackedSingle in
+ defm CMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
+ "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+ SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
+ let ExeDomain = SSEPackedDouble in
+ defm CMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
+ "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+ SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
+}
+
+// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
+multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
+ ValueType vt, X86MemOperand x86memop,
+ PatFrag ld_frag, string OpcodeStr, Domain d,
+ X86FoldableSchedWrite sched = WriteFComX> {
+ let ExeDomain = d in {
+ def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
+ Sched<[sched]>, SIMD_EXC;
+ let mayLoad = 1 in
+ def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode (vt RC:$src1),
+ (ld_frag addr:$src2)))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+}
+}
+
+// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
+multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
+ ValueType vt, Operand memop,
+ PatFrags mem_frags, string OpcodeStr,
+ Domain d,
+ X86FoldableSchedWrite sched = WriteFComX> {
+let ExeDomain = d in {
+ def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
+ Sched<[sched]>, SIMD_EXC;
+let mayLoad = 1 in
+ def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode (vt RC:$src1),
+ (mem_frags addr:$src2)))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+}
+}
+
+let Defs = [EFLAGS] in {
+ defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
+ "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
+ defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
+ "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
+ defm VCOMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
+ "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
+ defm VCOMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
+ "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
+
+ let isCodeGenOnly = 1 in {
+ defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
+ sse_load_f32, "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
+ defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
+ sse_load_f64, "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
+
+ defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
+ sse_load_f32, "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
+ defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
+ sse_load_f64, "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
+ }
+ defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
+ "ucomiss", SSEPackedSingle>, PS;
+ defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
+ "ucomisd", SSEPackedDouble>, PD;
+ defm COMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
+ "comiss", SSEPackedSingle>, PS;
+ defm COMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
+ "comisd", SSEPackedDouble>, PD;
+
+ let isCodeGenOnly = 1 in {
+ defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
+ sse_load_f32, "ucomiss", SSEPackedSingle>, PS;
+ defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
+ sse_load_f64, "ucomisd", SSEPackedDouble>, PD;
+
+ defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
+ sse_load_f32, "comiss", SSEPackedSingle>, PS;
+ defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
+ sse_load_f64, "comisd", SSEPackedDouble>, PD;
+ }
+} // Defs = [EFLAGS]
+
+// sse12_cmp_packed - sse 1 & 2 compare packed instructions
+multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
+ ValueType VT, string asm,
+ X86FoldableSchedWrite sched,
+ Domain d, PatFrag ld_frag> {
+ let isCommutable = 1 in
+ def rri : PIi8<0xC2, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
+ [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
+ Sched<[sched]>, SIMD_EXC;
+ def rmi : PIi8<0xC2, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
+ [(set RC:$dst,
+ (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+}
+
+defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
+ "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
+defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
+ "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
+defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
+ "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
+defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
+ "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
+let Constraints = "$src1 = $dst" in {
+ defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
+ "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+ SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
+ defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
+ "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+ SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
+}
+
+def CommutableCMPCC : PatLeaf<(timm), [{
+ uint64_t Imm = N->getZExtValue() & 0x7;
+ return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
+}]>;
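+// Only the low three bits of the immediate are tested above; the accepted
+// values correspond to EQ (0x00), UNORD (0x03), NEQ (0x04) and ORD (0x07),
+// all of which are symmetric in their operands, so the compare operands can
+// be swapped to fold a load on the left-hand side.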
+
+// Patterns to select compares with loads in the first operand.
+let Predicates = [HasAVX] in {
+ def : Pat<(v4f64 (X86any_cmpp (loadv4f64 addr:$src2), VR256:$src1,
+ CommutableCMPCC:$cc)),
+ (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>;
+
+ def : Pat<(v8f32 (X86any_cmpp (loadv8f32 addr:$src2), VR256:$src1,
+ CommutableCMPCC:$cc)),
+ (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>;
+
+ def : Pat<(v2f64 (X86any_cmpp (loadv2f64 addr:$src2), VR128:$src1,
+ CommutableCMPCC:$cc)),
+ (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
+
+ def : Pat<(v4f32 (X86any_cmpp (loadv4f32 addr:$src2), VR128:$src1,
+ CommutableCMPCC:$cc)),
+ (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
+
+ def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
+ CommutableCMPCC:$cc)),
+ (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
+
+ def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
+ CommutableCMPCC:$cc)),
+ (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
+}
+
+let Predicates = [UseSSE2] in {
+ def : Pat<(v2f64 (X86any_cmpp (memopv2f64 addr:$src2), VR128:$src1,
+ CommutableCMPCC:$cc)),
+ (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
+
+ def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
+ CommutableCMPCC:$cc)),
+ (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
+}
+
+let Predicates = [UseSSE1] in {
+ def : Pat<(v4f32 (X86any_cmpp (memopv4f32 addr:$src2), VR128:$src1,
+ CommutableCMPCC:$cc)),
+ (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
+
+ def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
+ CommutableCMPCC:$cc)),
+ (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Shuffle Instructions
+//===----------------------------------------------------------------------===//
+
+/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
+multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
+ ValueType vt, string asm, PatFrag mem_frag,
+ X86FoldableSchedWrite sched, Domain d,
+ bit IsCommutable = 0> {
+ def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
+ [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
+ (i8 timm:$src3))))], d>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ let isCommutable = IsCommutable in
+ def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
+ [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
+ (i8 timm:$src3))))], d>,
+ Sched<[sched]>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+ defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
+ "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
+ PS, VEX_4V, VEX_WIG;
+ defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
+ "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
+ PS, VEX_4V, VEX_L, VEX_WIG;
+ defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
+ "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
+ PD, VEX_4V, VEX_WIG;
+ defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
+ "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
+ PD, VEX_4V, VEX_L, VEX_WIG;
+}
+let Constraints = "$src1 = $dst" in {
+ defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
+ "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
+ defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
+ "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Unpack FP Instructions
+//===----------------------------------------------------------------------===//
+
+/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
+multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
+ PatFrag mem_frag, RegisterClass RC,
+ X86MemOperand x86memop, string asm,
+ X86FoldableSchedWrite sched, Domain d,
+ bit IsCommutable = 0> {
+ let isCommutable = IsCommutable in
+ def rr : PI<opc, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ asm, [(set RC:$dst,
+ (vt (OpNode RC:$src1, RC:$src2)))], d>,
+ Sched<[sched]>;
+ def rm : PI<opc, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ asm, [(set RC:$dst,
+ (vt (OpNode RC:$src1,
+ (mem_frag addr:$src2))))], d>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
+ VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
+defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
+ VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
+defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
+ VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
+defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
+ VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
+
+defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
+ VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
+defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
+ VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
+defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
+ VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
+defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
+ VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
+}// Predicates = [HasAVX, NoVLX]
+
+let Constraints = "$src1 = $dst" in {
+ defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
+ VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
+ SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
+ defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
+ VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
+ SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
+ defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
+ VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
+ SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
+ defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
+ VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
+ SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
+} // Constraints = "$src1 = $dst"
+
+let Predicates = [HasAVX1Only] in {
+ def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
+ (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
+ (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
+ (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
+ (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
+
+ def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
+ (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
+ (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
+ (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
+ (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
+}
+
+let Predicates = [UseSSE2] in {
+ // Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
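+  // unpcklpd only reads the low element of its second operand, so a 64-bit
+  // movhpd load of that element is equivalent and carries no alignment
+  // requirement.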
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+ (v2f64 (simple_load addr:$src2)))),
+ (MOVHPDrm VR128:$src1, addr:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Extract Floating-Point Sign mask
+//===----------------------------------------------------------------------===//
+
+/// sse12_extr_sign_mask - sse 1 & 2 extract floating-point sign mask
+multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
+ string asm, Domain d> {
+ def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
+ Sched<[WriteFMOVMSK]>;
+}
+
+let Predicates = [HasAVX] in {
+ defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
+ SSEPackedSingle>, PS, VEX, VEX_WIG;
+ defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
+ SSEPackedDouble>, PD, VEX, VEX_WIG;
+ defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
+ SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
+ defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
+ SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;
+
+  // Also support integer VTs to avoid an int->fp bitcast in the DAG.
+ def : Pat<(X86movmsk (v4i32 VR128:$src)),
+ (VMOVMSKPSrr VR128:$src)>;
+ def : Pat<(X86movmsk (v2i64 VR128:$src)),
+ (VMOVMSKPDrr VR128:$src)>;
+ def : Pat<(X86movmsk (v8i32 VR256:$src)),
+ (VMOVMSKPSYrr VR256:$src)>;
+ def : Pat<(X86movmsk (v4i64 VR256:$src)),
+ (VMOVMSKPDYrr VR256:$src)>;
+}
+
+defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
+ SSEPackedSingle>, PS;
+defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
+ SSEPackedDouble>, PD;
+
+let Predicates = [UseSSE2] in {
+  // Also support integer VTs to avoid an int->fp bitcast in the DAG.
+ def : Pat<(X86movmsk (v4i32 VR128:$src)),
+ (MOVMSKPSrr VR128:$src)>;
+ def : Pat<(X86movmsk (v2i64 VR128:$src)),
+ (MOVMSKPDrr VR128:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Logical Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+
+/// PDI_binop_rm - Simple SSE2 binary operator.
+multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, X86FoldableSchedWrite sched,
+ bit IsCommutable, bit Is2Addr> {
+ let isCommutable = IsCommutable in
+ def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
+ Sched<[sched]>;
+ def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+} // ExeDomain = SSEPackedInt
+
+multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
+ ValueType OpVT128, ValueType OpVT256,
+ X86SchedWriteWidths sched, bit IsCommutable,
+ Predicate prd> {
+let Predicates = [HasAVX, prd] in
+ defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
+ VR128, load, i128mem, sched.XMM,
+ IsCommutable, 0>, VEX_4V, VEX_WIG;
+
+let Constraints = "$src1 = $dst" in
+ defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
+ memop, i128mem, sched.XMM, IsCommutable, 1>;
+
+let Predicates = [HasAVX2, prd] in
+ defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
+ OpVT256, VR256, load, i256mem, sched.YMM,
+ IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
+}
+
+// These are ordered here for pattern ordering requirements with the fp versions
+
+defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
+ SchedWriteVecLogic, 1, NoVLX>;
+defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
+ SchedWriteVecLogic, 1, NoVLX>;
+defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
+ SchedWriteVecLogic, 1, NoVLX>;
+defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
+ SchedWriteVecLogic, 0, NoVLX>;
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Logical Instructions
+//===----------------------------------------------------------------------===//
+
+/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
+///
+/// There are no patterns here because isel prefers integer versions for SSE2
+/// and later. There are SSE1 v4f32 patterns later.
+multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, X86SchedWriteWidths sched> {
+ let Predicates = [HasAVX, NoVLX] in {
+ defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
+ !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
+ [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;
+
+ defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
+ !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
+ [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;
+
+ defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
+ !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
+ [], [], 0>, PS, VEX_4V, VEX_WIG;
+
+ defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
+ !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
+ [], [], 0>, PD, VEX_4V, VEX_WIG;
+ }
+
+ let Constraints = "$src1 = $dst" in {
+ defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
+ !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
+ [], []>, PS;
+
+ defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
+ !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
+ [], []>, PD;
+ }
+}
+
+defm AND : sse12_fp_packed_logical<0x54, "and", and, SchedWriteFLogic>;
+defm OR : sse12_fp_packed_logical<0x56, "or", or, SchedWriteFLogic>;
+defm XOR : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>;
+let isCommutable = 0 in
+ defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>;
+
+let Predicates = [HasAVX2, NoVLX] in {
+ def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
+ (VPANDYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
+ (VPANDYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
+ (VPANDYrr VR256:$src1, VR256:$src2)>;
+
+ def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
+ (VPORYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
+ (VPORYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
+ (VPORYrr VR256:$src1, VR256:$src2)>;
+
+ def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
+ (VPXORYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
+ (VPXORYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
+ (VPXORYrr VR256:$src1, VR256:$src2)>;
+
+ def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
+ (VPANDNYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
+ (VPANDNYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
+ (VPANDNYrr VR256:$src1, VR256:$src2)>;
+
+ def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
+ (VPANDYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
+ (VPANDYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
+ (VPANDYrm VR256:$src1, addr:$src2)>;
+
+ def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
+ (VPORYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
+ (VPORYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
+ (VPORYrm VR256:$src1, addr:$src2)>;
+
+ def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
+ (VPXORYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
+ (VPXORYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
+ (VPXORYrm VR256:$src1, addr:$src2)>;
+
+ def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
+ (VPANDNYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
+ (VPANDNYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
+ (VPANDNYrm VR256:$src1, addr:$src2)>;
+}
+
+// If only AVX1 is supported, we need to handle integer operations with
+// floating point instructions since the integer versions aren't available.
+let Predicates = [HasAVX1Only] in {
+ def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
+ (VANDPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
+ (VANDPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
+ (VANDPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
+ (VANDPSYrr VR256:$src1, VR256:$src2)>;
+
+ def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
+ (VORPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
+ (VORPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
+ (VORPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
+ (VORPSYrr VR256:$src1, VR256:$src2)>;
+
+ def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
+ (VXORPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
+ (VXORPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
+ (VXORPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
+ (VXORPSYrr VR256:$src1, VR256:$src2)>;
+
+ def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
+ (VANDNPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
+ (VANDNPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
+ (VANDNPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
+ (VANDNPSYrr VR256:$src1, VR256:$src2)>;
+
+ def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
+ (VANDPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
+ (VANDPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
+ (VANDPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
+ (VANDPSYrm VR256:$src1, addr:$src2)>;
+
+ def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
+ (VORPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
+ (VORPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
+ (VORPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
+ (VORPSYrm VR256:$src1, addr:$src2)>;
+
+ def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
+ (VXORPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
+ (VXORPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
+ (VXORPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
+ (VXORPSYrm VR256:$src1, addr:$src2)>;
+
+ def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
+ (VANDNPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
+ (VANDNPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
+ (VANDNPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
+ (VANDNPSYrm VR256:$src1, addr:$src2)>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
+ (VPANDrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
+ (VPANDrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
+ (VPANDrr VR128:$src1, VR128:$src2)>;
+
+ def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
+ (VPORrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
+ (VPORrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
+ (VPORrr VR128:$src1, VR128:$src2)>;
+
+ def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
+ (VPXORrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
+ (VPXORrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
+ (VPXORrr VR128:$src1, VR128:$src2)>;
+
+ def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
+ (VPANDNrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
+ (VPANDNrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
+ (VPANDNrr VR128:$src1, VR128:$src2)>;
+
+ def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)),
+ (VPANDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)),
+ (VPANDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)),
+ (VPANDrm VR128:$src1, addr:$src2)>;
+
+ def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)),
+ (VPORrm VR128:$src1, addr:$src2)>;
+ def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)),
+ (VPORrm VR128:$src1, addr:$src2)>;
+ def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)),
+ (VPORrm VR128:$src1, addr:$src2)>;
+
+ def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)),
+ (VPXORrm VR128:$src1, addr:$src2)>;
+ def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)),
+ (VPXORrm VR128:$src1, addr:$src2)>;
+ def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)),
+ (VPXORrm VR128:$src1, addr:$src2)>;
+
+ def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)),
+ (VPANDNrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)),
+ (VPANDNrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)),
+ (VPANDNrm VR128:$src1, addr:$src2)>;
+}
+
+let Predicates = [UseSSE2] in {
+ def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
+ (PANDrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
+ (PANDrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
+ (PANDrr VR128:$src1, VR128:$src2)>;
+
+ def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
+ (PORrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
+ (PORrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
+ (PORrr VR128:$src1, VR128:$src2)>;
+
+ def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
+ (PXORrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
+ (PXORrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
+ (PXORrr VR128:$src1, VR128:$src2)>;
+
+ def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
+ (PANDNrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
+ (PANDNrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
+ (PANDNrr VR128:$src1, VR128:$src2)>;
+
+ def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)),
+ (PANDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)),
+ (PANDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)),
+ (PANDrm VR128:$src1, addr:$src2)>;
+
+ def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)),
+ (PORrm VR128:$src1, addr:$src2)>;
+ def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)),
+ (PORrm VR128:$src1, addr:$src2)>;
+ def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)),
+ (PORrm VR128:$src1, addr:$src2)>;
+
+ def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)),
+ (PXORrm VR128:$src1, addr:$src2)>;
+ def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)),
+ (PXORrm VR128:$src1, addr:$src2)>;
+ def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)),
+ (PXORrm VR128:$src1, addr:$src2)>;
+
+ def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)),
+ (PANDNrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)),
+ (PANDNrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)),
+ (PANDNrm VR128:$src1, addr:$src2)>;
+}
+
+// Patterns for packed operations when we don't have integer type available.
+def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
+ (ANDPSrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
+ (ORPSrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
+ (XORPSrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
+ (ANDNPSrr VR128:$src1, VR128:$src2)>;
+
+def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
+ (ANDPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
+ (ORPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
+ (XORPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
+ (ANDNPSrm VR128:$src1, addr:$src2)>;
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Arithmetic Instructions
+//===----------------------------------------------------------------------===//
+
+/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
+/// vector forms.
+///
+/// In addition, we have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a scalar)
+/// and leaves the top elements unmodified (therefore these cannot be commuted).
+///
+/// These three forms can each be reg+reg or reg+mem.
+///
+
+/// FIXME: once all 256-bit intrinsics are matched, clean up and refactor the
+/// classes below.
+multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, X86SchedWriteSizes sched> {
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
+ let Predicates = [HasAVX, NoVLX] in {
+ defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+ VR128, v4f32, f128mem, loadv4f32,
+ SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG;
+ defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+ VR128, v2f64, f128mem, loadv2f64,
+ SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG;
+
+ defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
+ OpNode, VR256, v8f32, f256mem, loadv8f32,
+ SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
+ defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
+ OpNode, VR256, v4f64, f256mem, loadv4f64,
+ SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
+ }
+
+ let Constraints = "$src1 = $dst" in {
+ defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
+ v4f32, f128mem, memopv4f32, SSEPackedSingle,
+ sched.PS.XMM>, PS;
+ defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
+ v2f64, f128mem, memopv2f64, SSEPackedDouble,
+ sched.PD.XMM>, PD;
+ }
+}
+}
+
+multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteSizes sched> {
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
+ defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
+ OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
+ XS, VEX_4V, VEX_LIG, VEX_WIG;
+ defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
+ OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
+ XD, VEX_4V, VEX_LIG, VEX_WIG;
+
+ let Constraints = "$src1 = $dst" in {
+ defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
+ OpNode, FR32, f32mem, SSEPackedSingle,
+ sched.PS.Scl>, XS;
+ defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
+ OpNode, FR64, f64mem, SSEPackedDouble,
+ sched.PD.Scl>, XD;
+ }
+}
+}
+
+multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode,
+ X86SchedWriteSizes sched> {
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
+ defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
+ !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
+ SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
+ defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
+ !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
+ SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;
+
+ let Constraints = "$src1 = $dst" in {
+ defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
+ !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
+ SSEPackedSingle, sched.PS.Scl>, XS;
+ defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
+ !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
+ SSEPackedDouble, sched.PD.Scl>, XD;
+ }
+}
+}
+
+// Binary Arithmetic instructions
+defm ADD : basic_sse12_fp_binop_p<0x58, "add", any_fadd, SchedWriteFAddSizes>,
+ basic_sse12_fp_binop_s<0x58, "add", any_fadd, SchedWriteFAddSizes>,
+ basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
+defm MUL : basic_sse12_fp_binop_p<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
+ basic_sse12_fp_binop_s<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
+ basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
+let isCommutable = 0 in {
+ defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
+ basic_sse12_fp_binop_s<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
+ basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
+ defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
+ basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
+ basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
+ defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
+ basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
+ basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
+ defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
+ basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
+ basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
+}
+
+let isCodeGenOnly = 1 in {
+ defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
+ basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
+ defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
+ basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
+}
+
+// Patterns used to select SSE scalar fp arithmetic instructions from
+// either:
+//
+// (1) a scalar fp operation followed by a blend
+//
+// The effect is that the backend no longer emits unnecessary vector
+// insert instructions immediately after SSE scalar fp instructions
+// like addss or mulss.
+//
+// For example, given the following code:
+// __m128 foo(__m128 A, __m128 B) {
+// A[0] += B[0];
+// return A;
+// }
+//
+// Previously we generated:
+// addss %xmm0, %xmm1
+// movss %xmm1, %xmm0
+//
+// We now generate:
+// addss %xmm1, %xmm0
+//
+// (2) a vector packed single/double fp operation followed by a vector insert
+//
+// The effect is that the backend converts the packed fp instruction
+// followed by a vector insert into a single SSE scalar fp instruction.
+//
+// For example, given the following code:
+// __m128 foo(__m128 A, __m128 B) {
+// __m128 C = A + B;
+//     return (__m128) {C[0], A[1], A[2], A[3]};
+// }
+//
+// Previously we generated:
+// addps %xmm0, %xmm1
+// movss %xmm1, %xmm0
+//
+// We now generate:
+// addss %xmm1, %xmm0
+
+// TODO: Some canonicalization in lowering would reduce the number of patterns
+// we have to try to match.
+multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
+ ValueType VT, ValueType EltTy,
+ RegisterClass RC, PatFrag ld_frag,
+ Predicate BasePredicate> {
+ let Predicates = [BasePredicate] in {
+ // extracted scalar math op with insert via movss/movsd
+ def : Pat<(VT (Move (VT VR128:$dst),
+ (VT (scalar_to_vector
+ (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+ RC:$src))))),
+ (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
+ (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
+ def : Pat<(VT (Move (VT VR128:$dst),
+ (VT (scalar_to_vector
+ (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+ (ld_frag addr:$src)))))),
+ (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
+ }
+
+ // Repeat for AVX versions of the instructions.
+ let Predicates = [UseAVX] in {
+ // extracted scalar math op with insert via movss/movsd
+ def : Pat<(VT (Move (VT VR128:$dst),
+ (VT (scalar_to_vector
+ (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+ RC:$src))))),
+ (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
+ (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
+ def : Pat<(VT (Move (VT VR128:$dst),
+ (VT (scalar_to_vector
+ (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+ (ld_frag addr:$src)))))),
+ (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
+ }
+}
+
+defm : scalar_math_patterns<any_fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<any_fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<any_fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<any_fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+
+defm : scalar_math_patterns<any_fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<any_fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<any_fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<any_fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+
+/// Unop Arithmetic
+/// In addition, we have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// We also have a special variant for the full-vector intrinsic form.
+
+/// sse_fp_unop_s - SSE1 unops in scalar form
+/// For the non-AVX defs, we need $src1 to be tied to $dst because
+/// the HW instructions are two-operand (destructive).
+multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ ValueType ScalarVT, X86MemOperand x86memop,
+ Operand intmemop, SDNode OpNode, Domain d,
+ X86FoldableSchedWrite sched, Predicate target> {
+ let isCodeGenOnly = 1, hasSideEffects = 0 in {
+ def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
+ !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
+ [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>,
+ Requires<[target]>;
+ let mayLoad = 1 in
+ def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
+ !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
+ [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
+ Sched<[sched.Folded]>,
+ Requires<[target, OptForSize]>;
+ }
+
+ let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in {
+ def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
+ Sched<[sched]>;
+ let mayLoad = 1 in
+ def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+
+}
+
+multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt,
+ PatFrags mem_frags, Intrinsic Intr,
+ Predicate target, string Suffix> {
+ let Predicates = [target] in {
+ // These are unary operations, but they are modeled as having 2 source operands
+ // because the high elements of the destination are unchanged in SSE.
+ def : Pat<(Intr VR128:$src),
+ (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;
+ }
+ // We don't want to fold scalar loads into these instructions unless
+ // optimizing for size. This is because the folded instruction will have a
+ // partial register update, while the unfolded sequence will not, e.g.
+ // movss mem, %xmm0
+ // rcpss %xmm0, %xmm0
+ // which has a clobber before the rcp, vs.
+ // rcpss mem, %xmm0
+ let Predicates = [target, OptForSize] in {
+ def : Pat<(Intr (mem_frags addr:$src2)),
+ (!cast<Instruction>(NAME#m_Int)
+ (vt (IMPLICIT_DEF)), addr:$src2)>;
+ }
+}
+
+multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, PatFrags mem_frags,
+ Intrinsic Intr, Predicate target> {
+ let Predicates = [target] in {
+ def : Pat<(Intr VR128:$src),
+ (!cast<Instruction>(NAME#r_Int) VR128:$src,
+ VR128:$src)>;
+ }
+ let Predicates = [target, OptForSize] in {
+ def : Pat<(Intr (mem_frags addr:$src2)),
+ (!cast<Instruction>(NAME#m_Int)
+ (vt (IMPLICIT_DEF)), addr:$src2)>;
+ }
+}
+
+multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ ValueType ScalarVT, X86MemOperand x86memop,
+ Operand intmemop, SDNode OpNode, Domain d,
+ X86FoldableSchedWrite sched, Predicate target> {
+ let isCodeGenOnly = 1, hasSideEffects = 0 in {
+ def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [], d>, Sched<[sched]>;
+ let mayLoad = 1 in
+ def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+ let hasSideEffects = 0, ExeDomain = d in {
+ def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[sched]>;
+ let mayLoad = 1 in
+ def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, intmemop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+
+ // We don't want to fold scalar loads into these instructions unless
+ // optimizing for size. This is because the folded instruction will have a
+ // partial register update, while the unfolded sequence will not, e.g.
+ // vmovss mem, %xmm0
+ // vrcpss %xmm0, %xmm0, %xmm0
+ // which has a clobber before the rcp, vs.
+ // vrcpss mem, %xmm0, %xmm0
+ // TODO: In theory, we could fold the load, and avoid the stall caused by
+ // the partial register store, either in BreakFalseDeps or with smarter RA.
+ let Predicates = [target] in {
+ def : Pat<(OpNode RC:$src), (!cast<Instruction>(NAME#r)
+ (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
+ }
+ let Predicates = [target, OptForSize] in {
+ def : Pat<(ScalarVT (OpNode (load addr:$src))),
+ (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
+ addr:$src)>;
+ }
+}
+
+/// sse1_fp_unop_p - SSE1 unops in packed form.
+multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched, list<Predicate> prds> {
+let Predicates = prds in {
+ def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
+ VEX, Sched<[sched.XMM]>, VEX_WIG;
+ def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>,
+ VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
+ def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
+ VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
+ def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>,
+ VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
+}
+
+ def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
+ Sched<[sched.XMM]>;
+ def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
+ Sched<[sched.XMM.Folded]>;
+}
+
+/// sse2_fp_unop_p - SSE2 unops in vector forms.
+multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, X86SchedWriteWidths sched> {
+let Predicates = [HasAVX, NoVLX] in {
+ def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat("v", OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
+ VEX, Sched<[sched.XMM]>, VEX_WIG;
+ def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat("v", OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>,
+ VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
+ def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ !strconcat("v", OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
+ VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
+ def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ !strconcat("v", OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>,
+ VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
+}
+
+ def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
+ Sched<[sched.XMM]>;
+ def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
+ Sched<[sched.XMM.Folded]>;
+}
+
+multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched, Predicate AVXTarget> {
+ defm SS : sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
+ !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
+ UseSSE1, "SS">, XS;
+ defm V#NAME#SS : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
+ !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
+ AVXTarget>,
+ XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
+}
+
+multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched, Predicate AVXTarget> {
+ defm SS : sse_fp_unop_s<opc, OpcodeStr#ss, FR32, f32, f32mem,
+ ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
+ defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr#ss, FR32, f32,
+ f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
+ XS, VEX_4V, VEX_LIG, VEX_WIG;
+}
+
+multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched, Predicate AVXTarget> {
+ defm SD : sse_fp_unop_s<opc, OpcodeStr#sd, FR64, f64, f64mem,
+ sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
+ defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr#sd, FR64, f64,
+ f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
+ XD, VEX_4V, VEX_LIG, VEX_WIG;
+}
+
+// Square root.
+defm SQRT : sse1_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, UseAVX>,
+ sse1_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
+ sse2_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64, UseAVX>,
+ sse2_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64>, SIMD_EXC;
+
+// Reciprocal approximations. Note that these typically require refinement
+// in order to obtain suitable precision.
+defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
+ sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
+ sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;
+defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
+ sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
+ sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;
+
+// There are no f64 versions of the reciprocal approximation instructions.
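+//
+// As an illustration only (not part of the patterns in this file), a single
+// Newton-Raphson refinement step for RSQRTPS, written with the C intrinsics,
+// looks roughly like:
+//   __m128 rsqrt_refined(__m128 a) {
+//     __m128 e  = _mm_rsqrt_ps(a);                 // ~12-bit estimate
+//     __m128 e3 = _mm_mul_ps(_mm_mul_ps(e, e), e); // e^3
+//     // one N-R step: e' = 1.5*e - 0.5*a*e^3
+//     return _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(1.5f), e),
+//                       _mm_mul_ps(_mm_set1_ps(0.5f), _mm_mul_ps(a, e3)));
+//   }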
+
+multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
+ ValueType VT, Predicate BasePredicate> {
+ let Predicates = [BasePredicate] in {
+ def : Pat<(VT (Move VT:$dst, (scalar_to_vector
+ (OpNode (extractelt VT:$src, 0))))),
+ (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+ }
+
+ // Repeat for AVX versions of the instructions.
+ let Predicates = [UseAVX] in {
+ def : Pat<(VT (Move VT:$dst, (scalar_to_vector
+ (OpNode (extractelt VT:$src, 0))))),
+ (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+ }
+}
+
+defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
+defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;
+
+multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
+ SDNode Move, ValueType VT,
+ Predicate BasePredicate> {
+ let Predicates = [BasePredicate] in {
+ def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
+ (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+ }
+
+ // Repeat for AVX versions of the instructions.
+ let Predicates = [HasAVX] in {
+ def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
+ (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+ }
+}
+
+defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
+ v4f32, UseSSE1>;
+defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
+ v4f32, UseSSE1>;
+
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Non-temporal stores
+//===----------------------------------------------------------------------===//
+
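+// Illustration (assumption, not asserted by this file): these stores are what
+// the streaming-store intrinsics such as _mm_stream_ps / _mm_stream_si128
+// lower to. They bypass the cache hierarchy, so a store fence is normally
+// issued before other agents consume the data, e.g.:
+//   void copy16_nt(float *dst /* 16-byte aligned */, const float *src) {
+//     _mm_stream_ps(dst, _mm_loadu_ps(src));
+//     _mm_sfence();
+//   }
+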
+let AddedComplexity = 400 in { // Prefer non-temporal versions
+let Predicates = [HasAVX, NoVLX] in {
+let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
+def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
+ (ins f128mem:$dst, VR128:$src),
+ "movntps\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v4f32 VR128:$src),
+ addr:$dst)]>, VEX, VEX_WIG;
+def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
+ (ins f128mem:$dst, VR128:$src),
+ "movntpd\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v2f64 VR128:$src),
+ addr:$dst)]>, VEX, VEX_WIG;
+} // SchedRW
+
+let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in {
+def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
+ (ins f256mem:$dst, VR256:$src),
+ "movntps\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v8f32 VR256:$src),
+ addr:$dst)]>, VEX, VEX_L, VEX_WIG;
+def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
+ (ins f256mem:$dst, VR256:$src),
+ "movntpd\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v4f64 VR256:$src),
+ addr:$dst)]>, VEX, VEX_L, VEX_WIG;
+} // SchedRW
+
+let ExeDomain = SSEPackedInt in {
+def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
+ (ins i128mem:$dst, VR128:$src),
+ "movntdq\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v2i64 VR128:$src),
+ addr:$dst)]>, VEX, VEX_WIG,
+ Sched<[SchedWriteVecMoveLSNT.XMM.MR]>;
+def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
+ (ins i256mem:$dst, VR256:$src),
+ "movntdq\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v4i64 VR256:$src),
+ addr:$dst)]>, VEX, VEX_L, VEX_WIG,
+ Sched<[SchedWriteVecMoveLSNT.YMM.MR]>;
+} // ExeDomain
+} // Predicates
+
+let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
+def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntps\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
+def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntpd\t{$src, $dst|$dst, $src}",
+                    [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>;
+} // SchedRW
+
+let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in
+def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntdq\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>;
+
+let SchedRW = [WriteStoreNT] in {
+// There is no AVX form for the instructions below this point.
+def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "movnti{l}\t{$src, $dst|$dst, $src}",
+ [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
+ PS, Requires<[HasSSE2]>;
+def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "movnti{q}\t{$src, $dst|$dst, $src}",
+ [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
+ PS, Requires<[HasSSE2]>;
+} // SchedRW = [WriteStoreNT]
+
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
+ (VMOVNTDQYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
+ (VMOVNTDQYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
+ (VMOVNTDQYmr addr:$dst, VR256:$src)>;
+
+ def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
+ (VMOVNTDQmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
+ (VMOVNTDQmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
+ (VMOVNTDQmr addr:$dst, VR128:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+ def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
+ (MOVNTDQmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
+ (MOVNTDQmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
+ (MOVNTDQmr addr:$dst, VR128:$src)>;
+}
+
+} // AddedComplexity
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Prefetch and memory fence
+//===----------------------------------------------------------------------===//
+
+// Prefetch intrinsic.
+let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
+def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src),
+ "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB;
+def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src),
+ "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB;
+def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src),
+ "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB;
+def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
+ "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB;
+}
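+
+// Illustrative note (not derived from this file): these forms are normally
+// reached via _mm_prefetch / __builtin_prefetch, whose locality hint selects
+// the variant, e.g. _MM_HINT_T0 -> prefetcht0 and _MM_HINT_NTA -> prefetchnta:
+//   void warm(const void *p) {
+//     _mm_prefetch((const char *)p, _MM_HINT_T0); // read hint, keep in all caches
+//   }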
+
+// FIXME: How should flush instruction be modeled?
+let SchedRW = [WriteLoad] in {
+// Flush cache
+def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
+ "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
+ PS, Requires<[HasSSE2]>;
+}
+
+let SchedRW = [WriteNop] in {
+// Pause. This "instruction" is encoded as "rep; nop", so even though it
+// was introduced with SSE2, it is backward compatible: older CPUs simply
+// execute it as a NOP.
+def PAUSE : I<0x90, RawFrm, (outs), (ins),
+ "pause", [(int_x86_sse2_pause)]>, OBXS;
+}
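+
+// Illustrative use (not from this file): spin-wait loops typically emit PAUSE
+// through the _mm_pause() intrinsic to reduce power and the penalty of leaving
+// the loop on a memory-order violation:
+//   while (!ready) _mm_pause();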
+
+let SchedRW = [WriteFence] in {
+// Load, store, and memory fence
+// TODO: As with mfence, we may want to ease the availability of sfence/lfence
+// to include any 64-bit target.
+def SFENCE : I<0xAE, MRM7X, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
+ PS, Requires<[HasSSE1]>;
+def LFENCE : I<0xAE, MRM5X, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
+ PS, Requires<[HasSSE2]>;
+def MFENCE : I<0xAE, MRM6X, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
+ PS, Requires<[HasMFence]>;
+} // SchedRW
+
+def : Pat<(X86MFence), (MFENCE)>;
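+
+// For reference (illustration only), these are the instructions behind the
+// C-level fence intrinsics:
+//   _mm_sfence();  // order earlier (incl. non-temporal) stores before later ones
+//   _mm_lfence();  // serialize earlier loads
+//   _mm_mfence();  // full load/store fence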
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Load/Store XCSR register
+//===----------------------------------------------------------------------===//
+
+let mayLoad=1, hasSideEffects=1 in
+def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
+ "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
+ VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
+let mayStore=1, hasSideEffects=1 in
+def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
+ "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
+ VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;
+
+let mayLoad=1, hasSideEffects=1 in
+def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
+ "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
+ PS, Sched<[WriteLDMXCSR]>;
+let mayStore=1, hasSideEffects=1 in
+def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
+ "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
+ PS, Sched<[WriteSTMXCSR]>;
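+
+// Illustration only (not taken from this file): these back the MXCSR access
+// intrinsics, e.g. enabling flush-to-zero:
+//   _mm_setcsr(_mm_getcsr() | 0x8000 /* _MM_FLUSH_ZERO_ON, assumed value */);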
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+
+let hasSideEffects = 0 in {
+def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
+def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
+def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
+def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ "movdqu\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
+}
+
+// For Disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.XMM.RR]>,
+ VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
+def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.YMM.RR]>,
+ VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
+def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.XMM.RR]>,
+ VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
+def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
+ "movdqu\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.YMM.RR]>,
+ VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">;
+}
+
+let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
+ hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
+def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>,
+ Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
+def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.YMM.RM]>,
+ VEX, VEX_L, VEX_WIG;
+def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "vmovdqu\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (loadv2i64 addr:$src))]>,
+ Sched<[SchedWriteVecMoveLS.XMM.RM]>,
+ XS, VEX, VEX_WIG;
+def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+ "vmovdqu\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.YMM.RM]>,
+ XS, VEX, VEX_L, VEX_WIG;
+}
+
+let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
+def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
+ (ins i128mem:$dst, VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v2i64 VR128:$src), addr:$dst)]>,
+ Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG;
+def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
+ (ins i256mem:$dst, VR256:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG;
+def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "vmovdqu\t{$src, $dst|$dst, $src}",
+ [(store (v2i64 VR128:$src), addr:$dst)]>,
+ Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG;
+def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
+ "vmovdqu\t{$src, $dst|$dst, $src}",[]>,
+ Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG;
+}
+
+let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
+let hasSideEffects = 0 in {
+def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", []>;
+
+def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}", []>,
+ XS, Requires<[UseSSE2]>;
+}
+
+// For Disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", []>,
+ FoldGenData<"MOVDQArr">;
+
+def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}", []>,
+ XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">;
+}
+} // SchedRW
+
+let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
+ hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in {
+def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
+def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
+ XS, Requires<[UseSSE2]>;
+}
+
+let mayStore = 1, hasSideEffects = 0,
+ SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
+def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
+def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
+ XS, Requires<[UseSSE2]>;
+}
+
+} // ExeDomain = SSEPackedInt
+
+// Reversed version with ".s" suffix for GAS compatibility.
+def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
+ (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
+ (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>;
+def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
+ (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
+ (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>;
+
+// Reversed version with ".s" suffix for GAS compatibility.
+def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}",
+ (MOVDQArr_REV VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
+ (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
+
+let Predicates = [HasAVX, NoVLX] in {
+ // Additional patterns for other integer sizes.
+ def : Pat<(alignedloadv4i32 addr:$src),
+ (VMOVDQArm addr:$src)>;
+ def : Pat<(alignedloadv8i16 addr:$src),
+ (VMOVDQArm addr:$src)>;
+ def : Pat<(alignedloadv16i8 addr:$src),
+ (VMOVDQArm addr:$src)>;
+ def : Pat<(loadv4i32 addr:$src),
+ (VMOVDQUrm addr:$src)>;
+ def : Pat<(loadv8i16 addr:$src),
+ (VMOVDQUrm addr:$src)>;
+ def : Pat<(loadv16i8 addr:$src),
+ (VMOVDQUrm addr:$src)>;
+
+ def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+ (VMOVDQAmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+ (VMOVDQAmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+ (VMOVDQAmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+ (VMOVDQUmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+ (VMOVDQUmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+ (VMOVDQUmr addr:$dst, VR128:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Arithmetic Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+
+/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
+multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType DstVT, ValueType SrcVT, RegisterClass RC,
+ PatFrag memop_frag, X86MemOperand x86memop,
+ X86FoldableSchedWrite sched, bit Is2Addr = 1> {
+ let isCommutable = 1 in
+ def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
+ Sched<[sched]>;
+ def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
+ (memop_frag addr:$src2))))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+} // ExeDomain = SSEPackedInt
+
+defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
+defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
+defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
+ SchedWriteVecALU, 1, NoVLX>;
+defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
+ SchedWriteVecALU, 1, NoVLX>;
+defm PADDSB : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8,
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
+defm PADDSW : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16,
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
+defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8,
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
+defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16,
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
+defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
+ SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
+defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
+ SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
+defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
+ SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
+defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
+ SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
+defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
+ SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
+defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
+ SchedWriteVecALU, 0, NoVLX>;
+defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
+ SchedWriteVecALU, 0, NoVLX>;
+defm PSUBSB : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8,
+ SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
+defm PSUBSW : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16,
+ SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
+defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8,
+ SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
+defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16,
+ SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
+defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
+defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
+defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
+defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
+defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
+defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
+ SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
+defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
+ SchedWriteVecIMul, 1, NoVLX>;
+
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
+defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
+ load, i128mem, SchedWriteVecIMul.XMM, 0>,
+ VEX_4V, VEX_WIG;
+
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
+defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
+ VR256, load, i256mem, SchedWriteVecIMul.YMM,
+ 0>, VEX_4V, VEX_L, VEX_WIG;
+let Constraints = "$src1 = $dst" in
+defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
+ memop, i128mem, SchedWriteVecIMul.XMM>;
+
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
+defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
+ load, i128mem, SchedWritePSADBW.XMM, 0>,
+ VEX_4V, VEX_WIG;
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
+defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
+ load, i256mem, SchedWritePSADBW.YMM, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+let Constraints = "$src1 = $dst" in
+defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
+ memop, i128mem, SchedWritePSADBW.XMM>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Logical Instructions
+//===---------------------------------------------------------------------===//
+
+multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
+ string OpcodeStr, SDNode OpNode,
+ SDNode OpNode2, RegisterClass RC,
+ X86FoldableSchedWrite sched,
+ X86FoldableSchedWrite schedImm,
+ ValueType DstVT, ValueType SrcVT,
+ PatFrag ld_frag, bit Is2Addr = 1> {
+ // src2 is always 128-bit
+ def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>,
+ Sched<[sched]>;
+ def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode RC:$src1,
+ (SrcVT (ld_frag addr:$src2)))))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
+ (ins RC:$src1, u8imm:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>,
+ Sched<[schedImm]>;
+}
+
+multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
+ string OpcodeStr, SDNode OpNode,
+ SDNode OpNode2, ValueType DstVT128,
+ ValueType DstVT256, ValueType SrcVT,
+ X86SchedWriteWidths sched,
+ X86SchedWriteWidths schedImm, Predicate prd> {
+let Predicates = [HasAVX, prd] in
+ defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
+ OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
+ DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG;
+let Predicates = [HasAVX2, prd] in
+ defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
+ OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
+ DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L,
+ VEX_WIG;
+let Constraints = "$src1 = $dst" in
+ defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
+ VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
+ memop>;
+}
+
+multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
+ SDNode OpNode, RegisterClass RC, ValueType VT,
+ X86FoldableSchedWrite sched, bit Is2Addr = 1> {
+ def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>,
+ Sched<[sched]>;
+}
+
+multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
+ SDNode OpNode, X86SchedWriteWidths sched> {
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
+ defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
+ VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG;
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
+ defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
+ VR256, v32i8, sched.YMM, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+let Constraints = "$src1 = $dst" in
+ defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
+ sched.XMM>;
+}
+
+let ExeDomain = SSEPackedInt in {
+ defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
+ v8i16, v16i16, v8i16, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
+ defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
+ v4i32, v8i32, v4i32, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX>;
+ defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
+ v2i64, v4i64, v2i64, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX>;
+
+ defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
+ v8i16, v16i16, v8i16, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
+ defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
+ v4i32, v8i32, v4i32, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX>;
+ defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
+ v2i64, v4i64, v2i64, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX>;
+
+ defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
+ v8i16, v16i16, v8i16, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
+ defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
+ v4i32, v8i32, v4i32, SchedWriteVecShift,
+ SchedWriteVecShiftImm, NoVLX>;
+
+ defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq,
+ SchedWriteShuffle>;
+ defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq,
+ SchedWriteShuffle>;
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Comparison Instructions
+//===---------------------------------------------------------------------===//
+
+defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
+ SchedWriteVecALU, 1, TruePredicate>;
+defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
+ SchedWriteVecALU, 1, TruePredicate>;
+defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
+ SchedWriteVecALU, 1, TruePredicate>;
+defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
+ SchedWriteVecALU, 0, TruePredicate>;
+defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
+ SchedWriteVecALU, 0, TruePredicate>;
+defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
+ SchedWriteVecALU, 0, TruePredicate>;
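+// As a rough reading of the definitions above: each compare writes an
+// all-ones element where the predicate holds and zero otherwise, e.g.
+//   pcmpeqb: [1, 2, 3, ...] == [1, 0, 3, ...]  ->  [0xFF, 0x00, 0xFF, ...]
+// The trailing 1/0 argument is presumably the commutability bit (equality
+// commutes, signed greater-than does not).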
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Shuffle Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
+ SDNode OpNode, X86SchedWriteWidths sched,
+ Predicate prd> {
+let Predicates = [HasAVX, prd] in {
+ def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ !strconcat("v", OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
+ VEX, Sched<[sched.XMM]>, VEX_WIG;
+ def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1, u8imm:$src2),
+ !strconcat("v", OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (load addr:$src1),
+ (i8 timm:$src2))))]>, VEX,
+ Sched<[sched.XMM.Folded]>, VEX_WIG;
+}
+
+let Predicates = [HasAVX2, prd] in {
+ def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, u8imm:$src2),
+ !strconcat("v", OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>,
+ VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
+ def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
+ (ins i256mem:$src1, u8imm:$src2),
+ !strconcat("v", OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (vt256 (OpNode (load addr:$src1),
+ (i8 timm:$src2))))]>, VEX, VEX_L,
+ Sched<[sched.YMM.Folded]>, VEX_WIG;
+}
+
+let Predicates = [UseSSE2] in {
+ def ri : Ii8<0x70, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
+ Sched<[sched.XMM]>;
+ def mi : Ii8<0x70, MRMSrcMem,
+ (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (memop addr:$src1),
+ (i8 timm:$src2))))]>,
+ Sched<[sched.XMM.Folded]>;
+}
+}
+} // ExeDomain = SSEPackedInt
+
+defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd,
+ SchedWriteShuffle, NoVLX>, PD;
+defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
+ SchedWriteShuffle, NoVLX_Or_NoBWI>, XS;
+defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
+ SchedWriteShuffle, NoVLX_Or_NoBWI>, XD;
+
+//===---------------------------------------------------------------------===//
+// Packed Integer Pack Instructions (SSE & AVX)
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
+ ValueType ArgVT, SDNode OpNode, RegisterClass RC,
+ X86MemOperand x86memop, X86FoldableSchedWrite sched,
+ PatFrag ld_frag, bit Is2Addr = 1> {
+ def rr : PDI<opc, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst,
+ (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
+ Sched<[sched]>;
+ def rm : PDI<opc, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst,
+ (OutVT (OpNode (ArgVT RC:$src1),
+ (ld_frag addr:$src2))))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
+ ValueType ArgVT, SDNode OpNode, RegisterClass RC,
+ X86MemOperand x86memop, X86FoldableSchedWrite sched,
+ PatFrag ld_frag, bit Is2Addr = 1> {
+ def rr : SS48I<opc, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst,
+ (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
+ Sched<[sched]>;
+ def rm : SS48I<opc, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst,
+ (OutVT (OpNode (ArgVT RC:$src1),
+ (ld_frag addr:$src2))))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+ defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
+ i128mem, SchedWriteShuffle.XMM, load, 0>,
+ VEX_4V, VEX_WIG;
+ defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
+ i128mem, SchedWriteShuffle.XMM, load, 0>,
+ VEX_4V, VEX_WIG;
+
+ defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
+ i128mem, SchedWriteShuffle.XMM, load, 0>,
+ VEX_4V, VEX_WIG;
+ defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
+ i128mem, SchedWriteShuffle.XMM, load, 0>,
+ VEX_4V;
+}
+
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+ defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
+ i256mem, SchedWriteShuffle.YMM, load, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
+ i256mem, SchedWriteShuffle.YMM, load, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+
+ defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
+ i256mem, SchedWriteShuffle.YMM, load, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
+ i256mem, SchedWriteShuffle.YMM, load, 0>,
+ VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+ defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
+ i128mem, SchedWriteShuffle.XMM, memop>;
+ defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
+ i128mem, SchedWriteShuffle.XMM, memop>;
+
+ defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
+ i128mem, SchedWriteShuffle.XMM, memop>;
+
+ defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
+ i128mem, SchedWriteShuffle.XMM, memop>;
+}
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Unpack Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
+ SDNode OpNode, RegisterClass RC, X86MemOperand x86memop,
+ X86FoldableSchedWrite sched, PatFrag ld_frag,
+ bit Is2Addr = 1> {
+ def rr : PDI<opc, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
+ Sched<[sched]>;
+ def rm : PDI<opc, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+ defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
+ i128mem, SchedWriteShuffle.XMM, load, 0>,
+ VEX_4V, VEX_WIG;
+ defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
+ i128mem, SchedWriteShuffle.XMM, load, 0>,
+ VEX_4V, VEX_WIG;
+ defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
+ i128mem, SchedWriteShuffle.XMM, load, 0>,
+ VEX_4V, VEX_WIG;
+ defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
+ i128mem, SchedWriteShuffle.XMM, load, 0>,
+ VEX_4V, VEX_WIG;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+ defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
+ i128mem, SchedWriteShuffle.XMM, load, 0>,
+ VEX_4V, VEX_WIG;
+ defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
+ i128mem, SchedWriteShuffle.XMM, load, 0>,
+ VEX_4V, VEX_WIG;
+ defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
+ i128mem, SchedWriteShuffle.XMM, load, 0>,
+ VEX_4V, VEX_WIG;
+ defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
+ i128mem, SchedWriteShuffle.XMM, load, 0>,
+ VEX_4V, VEX_WIG;
+}
+
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+ defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
+ i256mem, SchedWriteShuffle.YMM, load, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
+ i256mem, SchedWriteShuffle.YMM, load, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
+ i256mem, SchedWriteShuffle.YMM, load, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
+ i256mem, SchedWriteShuffle.YMM, load, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+}
+
+let Predicates = [HasAVX2, NoVLX] in {
+ defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
+ i256mem, SchedWriteShuffle.YMM, load, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
+ i256mem, SchedWriteShuffle.YMM, load, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
+ i256mem, SchedWriteShuffle.YMM, load, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
+ i256mem, SchedWriteShuffle.YMM, load, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+}
+
+let Constraints = "$src1 = $dst" in {
+ defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
+ i128mem, SchedWriteShuffle.XMM, memop>;
+ defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
+ i128mem, SchedWriteShuffle.XMM, memop>;
+ defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
+ i128mem, SchedWriteShuffle.XMM, memop>;
+ defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
+ i128mem, SchedWriteShuffle.XMM, memop>;
+
+ defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
+ i128mem, SchedWriteShuffle.XMM, memop>;
+ defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
+ i128mem, SchedWriteShuffle.XMM, memop>;
+ defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
+ i128mem, SchedWriteShuffle.XMM, memop>;
+ defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
+ i128mem, SchedWriteShuffle.XMM, memop>;
+}
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Extract and Insert
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_pinsrw<bit Is2Addr = 1> {
+ def rr : Ii8<0xC4, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1,
+ GR32orGR64:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (X86pinsrw VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
+ def rm : Ii8<0xC4, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1,
+ i16mem:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
+ timm:$src3))]>,
+ Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
+}
+
+// Extract
+let Predicates = [HasAVX, NoBWI] in
+def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
+ (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
+ "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
+ timm:$src2))]>,
+ PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>;
+def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
+ (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
+ "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
+ timm:$src2))]>,
+ Sched<[WriteVecExtract]>;
+
+// Insert
+let Predicates = [HasAVX, NoBWI] in
+defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG;
+
+let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
+defm PINSRW : sse2_pinsrw, PD;
+
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Mask Creation
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+
+def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+ (ins VR128:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}",
+ [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
+ Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG;
+
+let Predicates = [HasAVX2] in {
+def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+ (ins VR256:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}",
+ [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
+ Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG;
+}
+
+def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}",
+ [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
+ Sched<[WriteVecMOVMSK]>;
+
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Conditional Store
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
+let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
+def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
+ (ins VR128:$src, VR128:$mask),
+ "maskmovdqu\t{$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
+ VEX, VEX_WIG;
+let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
+def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
+ (ins VR128:$src, VR128:$mask),
+ "maskmovdqu\t{$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
+ VEX, VEX_WIG;
+
+let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
+def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+ "maskmovdqu\t{$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
+let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
+def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+ "maskmovdqu\t{$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
+
+} // ExeDomain = SSEPackedInt
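+// Rough sketch of the behaviour behind the intrinsic above, and the reason
+// EDI/RDI appear in Uses: the store address is implicit, and byte i of $src is
+// written to [edi + i] (or [rdi + i]) only when bit 7 of mask byte i is set.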
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Move Doubleword/Quadword
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+// Move Int Doubleword to Packed Double Int
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector GR32:$src)))]>,
+ VEX, Sched<[WriteVecMoveFromGpr]>;
+def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
+ VEX, Sched<[WriteVecLoad]>;
+def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector GR64:$src)))]>,
+ VEX, Sched<[WriteVecMoveFromGpr]>;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
+def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}", []>,
+ VEX, Sched<[WriteVecLoad]>;
+let isCodeGenOnly = 1 in
+def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert GR64:$src))]>,
+ VEX, Sched<[WriteVecMoveFromGpr]>;
+
+def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector GR32:$src)))]>,
+ Sched<[WriteVecMoveFromGpr]>;
+def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
+ Sched<[WriteVecLoad]>;
+def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector GR64:$src)))]>,
+ Sched<[WriteVecMoveFromGpr]>;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
+def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteVecLoad]>;
+let isCodeGenOnly = 1 in
+def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert GR64:$src))]>,
+ Sched<[WriteVecMoveFromGpr]>;
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// Move Int Doubleword to Single Scalar
+//
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+ def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert GR32:$src))]>,
+ VEX, Sched<[WriteVecMoveFromGpr]>;
+
+ def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert GR32:$src))]>,
+ Sched<[WriteVecMoveFromGpr]>;
+
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+//===---------------------------------------------------------------------===//
+// Move Packed Doubleword Int first element to Doubleword Int
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
+ (iPTR 0)))]>, VEX,
+ Sched<[WriteVecMoveToGpr]>;
+def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs),
+ (ins i32mem:$dst, VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (extractelt (v4i32 VR128:$src),
+ (iPTR 0))), addr:$dst)]>,
+ VEX, Sched<[WriteVecStore]>;
+def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
+ (iPTR 0)))]>,
+ Sched<[WriteVecMoveToGpr]>;
+def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (extractelt (v4i32 VR128:$src),
+ (iPTR 0))), addr:$dst)]>,
+ Sched<[WriteVecStore]>;
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// Move Packed Quadword Int first element to Quadword Int
+//
+let ExeDomain = SSEPackedInt in {
+let SchedRW = [WriteVecMoveToGpr] in {
+def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
+ (iPTR 0)))]>,
+ VEX;
+
+def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
+ (iPTR 0)))]>;
+} //SchedRW
+
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
+def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
+ (ins i64mem:$dst, VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}", []>,
+ VEX, Sched<[WriteVecStore]>;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
+def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteVecStore]>;
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// Bitcast FR64 <-> GR64
+//
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+ def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bitconvert FR64:$src))]>,
+ VEX, Sched<[WriteVecMoveToGpr]>;
+
+ def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bitconvert FR64:$src))]>,
+ Sched<[WriteVecMoveToGpr]>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+//===---------------------------------------------------------------------===//
+// Move Scalar Single to Double Int
+//
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+ def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (bitconvert FR32:$src))]>,
+ VEX, Sched<[WriteVecMoveToGpr]>;
+ def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (bitconvert FR32:$src))]>,
+ Sched<[WriteVecMoveToGpr]>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+let Predicates = [UseAVX] in {
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+ (VMOVDI2PDIrr GR32:$src)>;
+
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+ (VMOV64toPQIrr GR64:$src)>;
+
+  // AVX 128-bit movd/movq instructions zero the remaining bits of the 128-bit
+  // destination register. They also write zeros in the high part of the
+  // containing 256-bit register.
+ def : Pat<(v4i32 (X86vzload32 addr:$src)),
+ (VMOVDI2PDIrm addr:$src)>;
+ def : Pat<(v8i32 (X86vzload32 addr:$src)),
+ (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
+}
+
+let Predicates = [UseSSE2] in {
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+ (MOVDI2PDIrr GR32:$src)>;
+
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+ (MOV64toPQIrr GR64:$src)>;
+ def : Pat<(v4i32 (X86vzload32 addr:$src)),
+ (MOVDI2PDIrm addr:$src)>;
+}
+
+// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead
+// of "movq" due to a MacOS parsing limitation. We add these aliases so that
+// old assembly still parses.
+def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
+ (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
+def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
+ (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
+// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
+ (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
+ (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
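+// A rough illustration of the aliases above (AT&T syntax), assuming the usual
+// InstAlias rule that an emit priority of 0 means "accept when parsing, never
+// use when printing":
+//   movd  %rax, %xmm0   ; accepted, matched to MOV64toPQIrr
+//   movq  %rax, %xmm0   ; what the printer emits for that instruction
+//   vmovd %rax, %xmm0   ; likewise accepted, printed back as vmovq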
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Move Quadword
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+// Move Quadword Int to Packed Quadword Int
+//
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
+def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
+ VEX, Requires<[UseAVX]>, VEX_WIG;
+def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
+ XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
+} // ExeDomain, SchedRW
+
+//===---------------------------------------------------------------------===//
+// Move Packed Quadword Int to Quadword Int
+//
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
+def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (extractelt (v2i64 VR128:$src),
+ (iPTR 0))), addr:$dst)]>,
+ VEX, VEX_WIG;
+def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (extractelt (v2i64 VR128:$src),
+ (iPTR 0))), addr:$dst)]>;
+} // ExeDomain, SchedRW
+
+// For disassembler only
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+ SchedRW = [SchedWriteVecLogic.XMM] in {
+def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG;
+def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}", []>;
+}
+
+def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
+ (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
+def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
+ (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
+
+let Predicates = [UseAVX] in {
+ def : Pat<(v2i64 (X86vzload64 addr:$src)),
+ (VMOVQI2PQIrm addr:$src)>;
+ def : Pat<(v4i64 (X86vzload64 addr:$src)),
+ (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
+
+ def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
+ (VMOVPQI2QImr addr:$dst, VR128:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+ def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>;
+
+ def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
+ (MOVPQI2QImr addr:$dst, VR128:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
+// Move from XMM to XMM and clear the upper 64 bits. Note: there is a bug in
+// the IA-32 documentation; movq xmm1, xmm2 does clear the high bits.
+//
+let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
+def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
+ XS, VEX, Requires<[UseAVX]>, VEX_WIG;
+def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
+ XS, Requires<[UseSSE2]>;
+} // ExeDomain, SchedRW
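+// Sketch of the behaviour the X86vzmovl pattern above models:
+//   movq %xmm1, %xmm0   ==>   xmm0[63:0] = xmm1[63:0], xmm0[127:64] = 0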
+
+let Predicates = [UseAVX] in {
+ def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+ (VMOVZPQILo2PQIrr VR128:$src)>;
+}
+let Predicates = [UseSSE2] in {
+ def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+ (MOVZPQILo2PQIrr VR128:$src)>;
+}
+
+let Predicates = [UseAVX] in {
+ def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2f64 (VMOVZPQILo2PQIrr
+ (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
+ sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2i64 (VMOVZPQILo2PQIrr
+ (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
+ sub_xmm)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
+//===---------------------------------------------------------------------===//
+
+multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
+ ValueType vt, RegisterClass RC, PatFrag mem_frag,
+ X86MemOperand x86memop, X86FoldableSchedWrite sched> {
+def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (vt (OpNode RC:$src)))]>,
+ Sched<[sched]>;
+def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
+ Sched<[sched.Folded]>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+ defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
+ v4f32, VR128, loadv4f32, f128mem,
+ SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
+ defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
+ v4f32, VR128, loadv4f32, f128mem,
+ SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
+ defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
+ v8f32, VR256, loadv8f32, f256mem,
+ SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
+ defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
+ v8f32, VR256, loadv8f32, f256mem,
+ SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
+}
+defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
+ memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
+defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
+ memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
+
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v4i32 (X86Movshdup VR128:$src)),
+ (VMOVSHDUPrr VR128:$src)>;
+ def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
+ (VMOVSHDUPrm addr:$src)>;
+ def : Pat<(v4i32 (X86Movsldup VR128:$src)),
+ (VMOVSLDUPrr VR128:$src)>;
+ def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
+ (VMOVSLDUPrm addr:$src)>;
+ def : Pat<(v8i32 (X86Movshdup VR256:$src)),
+ (VMOVSHDUPYrr VR256:$src)>;
+ def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
+ (VMOVSHDUPYrm addr:$src)>;
+ def : Pat<(v8i32 (X86Movsldup VR256:$src)),
+ (VMOVSLDUPYrr VR256:$src)>;
+ def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
+ (VMOVSLDUPYrm addr:$src)>;
+}
+
+let Predicates = [UseSSE3] in {
+ def : Pat<(v4i32 (X86Movshdup VR128:$src)),
+ (MOVSHDUPrr VR128:$src)>;
+ def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
+ (MOVSHDUPrm addr:$src)>;
+ def : Pat<(v4i32 (X86Movsldup VR128:$src)),
+ (MOVSLDUPrr VR128:$src)>;
+ def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
+ (MOVSLDUPrm addr:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Replicate Double FP - MOVDDUP
+//===---------------------------------------------------------------------===//
+
+multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
+def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
+ Sched<[sched.XMM]>;
+def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (v2f64 (X86Movddup
+ (scalar_to_vector (loadf64 addr:$src)))))]>,
+ Sched<[sched.XMM.Folded]>;
+}
+
+// FIXME: Merge with above classes when there are patterns for the ymm version
+multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
+def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
+ Sched<[sched.YMM]>;
+def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst,
+ (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
+ Sched<[sched.YMM.Folded]>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+ defm VMOVDDUP : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
+ VEX, VEX_WIG;
+ defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
+ VEX, VEX_L, VEX_WIG;
+}
+
+defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
+
+
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
+ (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+}
+
+let Predicates = [UseSSE3] in {
+ def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
+ (MOVDDUPrm addr:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Move Unaligned Integer
+//===---------------------------------------------------------------------===//
+
+let Predicates = [HasAVX] in {
+ def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "vlddqu\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
+ Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
+ def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+ "vlddqu\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
+ Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG;
+} // Predicates
+
+def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "lddqu\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
+ Sched<[SchedWriteVecMoveLS.XMM.RM]>;
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Arithmetic
+//===---------------------------------------------------------------------===//
+
+multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
+ X86MemOperand x86memop, X86FoldableSchedWrite sched,
+ PatFrag ld_frag, bit Is2Addr = 1> {
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
+ def rr : I<0xD0, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
+ Sched<[sched]>;
+ def rm : I<0xD0, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+}
+
+let Predicates = [HasAVX] in {
+ let ExeDomain = SSEPackedSingle in {
+ defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
+ SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
+ XD, VEX_4V, VEX_WIG;
+ defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
+ SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
+ XD, VEX_4V, VEX_L, VEX_WIG;
+ }
+ let ExeDomain = SSEPackedDouble in {
+ defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
+ SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
+ PD, VEX_4V, VEX_WIG;
+ defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
+ SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
+ PD, VEX_4V, VEX_L, VEX_WIG;
+ }
+}
+let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
+ let ExeDomain = SSEPackedSingle in
+ defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
+ SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD;
+ let ExeDomain = SSEPackedDouble in
+ defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
+ SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 Instructions
+//===---------------------------------------------------------------------===//
+
+// Horizontal ops
+multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
+ X86MemOperand x86memop, SDNode OpNode,
+ X86FoldableSchedWrite sched, PatFrag ld_frag,
+ bit Is2Addr = 1> {
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
+ def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
+ Sched<[sched]>;
+
+ def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+}
+multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
+ X86MemOperand x86memop, SDNode OpNode,
+ X86FoldableSchedWrite sched, PatFrag ld_frag,
+ bit Is2Addr = 1> {
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
+ def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
+ Sched<[sched]>;
+
+ def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+}
+
+let Predicates = [HasAVX] in {
+ let ExeDomain = SSEPackedSingle in {
+ defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
+ X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
+ defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
+ X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
+ defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
+ X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
+ defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
+ X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
+ }
+ let ExeDomain = SSEPackedDouble in {
+ defm VHADDPD : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
+ X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
+ defm VHSUBPD : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
+ X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
+ defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
+ X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
+ defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
+ X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
+ }
+}
+
+let Constraints = "$src1 = $dst" in {
+ let ExeDomain = SSEPackedSingle in {
+ defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
+ WriteFHAdd, memopv4f32>;
+ defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
+ WriteFHAdd, memopv4f32>;
+ }
+ let ExeDomain = SSEPackedDouble in {
+ defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
+ WriteFHAdd, memopv2f64>;
+ defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
+ WriteFHAdd, memopv2f64>;
+ }
+}
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Packed Absolute Instructions
+//===---------------------------------------------------------------------===//
+
+/// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
+multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
+ SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
+ def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
+ Sched<[sched.XMM]>;
+
+ def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (vt (OpNode (ld_frag addr:$src))))]>,
+ Sched<[sched.XMM.Folded]>;
+}
+
+/// SS3I_unop_rm_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
+multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
+ SDNode OpNode, X86SchedWriteWidths sched> {
+ def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
+ Sched<[sched.YMM]>;
+
+ def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins i256mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst,
+ (vt (OpNode (load addr:$src))))]>,
+ Sched<[sched.YMM.Folded]>;
+}
+
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+ defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
+ load>, VEX, VEX_WIG;
+ defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
+ load>, VEX, VEX_WIG;
+}
+let Predicates = [HasAVX, NoVLX] in {
+ defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
+ load>, VEX, VEX_WIG;
+}
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+ defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
+ VEX, VEX_L, VEX_WIG;
+ defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
+ VEX, VEX_L, VEX_WIG;
+}
+let Predicates = [HasAVX2, NoVLX] in {
+ defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
+ VEX, VEX_L, VEX_WIG;
+}
+
+defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
+ memop>;
+defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
+ memop>;
+defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
+ memop>;
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Packed Binary Operator Instructions
+//===---------------------------------------------------------------------===//
+
+/// SS3I_binop_rm - Simple SSSE3 bin op
+multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType DstVT, ValueType OpVT, RegisterClass RC,
+ PatFrag memop_frag, X86MemOperand x86memop,
+ X86FoldableSchedWrite sched, bit Is2Addr = 1> {
+ let isCommutable = 1 in
+ def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
+ Sched<[sched]>;
+ def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst,
+ (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
+multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId128, X86FoldableSchedWrite sched,
+ PatFrag ld_frag, bit Is2Addr = 1> {
+ let isCommutable = 1 in
+ def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ Sched<[sched]>;
+ def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId256,
+ X86FoldableSchedWrite sched> {
+ let isCommutable = 1 in
+ def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
+ Sched<[sched]>;
+ def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (IntId256 VR256:$src1, (load addr:$src2)))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+let isCommutable = 0 in {
+ defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
+ VR128, load, i128mem,
+ SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
+ defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
+ v16i8, VR128, load, i128mem,
+ SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
+}
+defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
+ VR128, load, i128mem,
+ SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
+}
+
+let ImmT = NoImm, Predicates = [HasAVX] in {
+let isCommutable = 0 in {
+ defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
+ load, i128mem,
+ SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
+ defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
+ load, i128mem,
+ SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
+ defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
+ load, i128mem,
+ SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
+ defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
+ load, i128mem,
+ SchedWritePHAdd.XMM, 0>, VEX_4V;
+ defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
+ int_x86_ssse3_psign_b_128,
+ SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
+ defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw",
+ int_x86_ssse3_psign_w_128,
+ SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
+ defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
+ int_x86_ssse3_psign_d_128,
+ SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
+ defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
+ int_x86_ssse3_phadd_sw_128,
+ SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
+ defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
+ int_x86_ssse3_phsub_sw_128,
+ SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
+}
+}
+
+let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+let isCommutable = 0 in {
+ defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
+ VR256, load, i256mem,
+ SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
+ defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
+ v32i8, VR256, load, i256mem,
+ SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
+}
+defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
+ VR256, load, i256mem,
+ SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
+}
+
+let ImmT = NoImm, Predicates = [HasAVX2] in {
+let isCommutable = 0 in {
+ defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
+ VR256, load, i256mem,
+ SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
+ defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
+ load, i256mem,
+ SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
+ defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
+ VR256, load, i256mem,
+ SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
+ defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
+ load, i256mem,
+ SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L;
+ defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
+ SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
+ defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
+ SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
+ defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
+ SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
+ defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
+ int_x86_avx2_phadd_sw,
+ SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
+ defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
+ int_x86_avx2_phsub_sw,
+ SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
+}
+}
+
+// None of these have i8 immediate fields.
+let ImmT = NoImm, Constraints = "$src1 = $dst" in {
+let isCommutable = 0 in {
+ defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
+ memop, i128mem, SchedWritePHAdd.XMM>;
+ defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
+ memop, i128mem, SchedWritePHAdd.XMM>;
+ defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
+ memop, i128mem, SchedWritePHAdd.XMM>;
+ defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
+ memop, i128mem, SchedWritePHAdd.XMM>;
+ defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
+ SchedWriteVecALU.XMM, memop>;
+ defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
+ SchedWriteVecALU.XMM, memop>;
+ defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
+ SchedWriteVecALU.XMM, memop>;
+ defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
+ memop, i128mem, SchedWriteVarShuffle.XMM>;
+ defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
+ int_x86_ssse3_phadd_sw_128,
+ SchedWritePHAdd.XMM, memop>;
+ defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
+ int_x86_ssse3_phsub_sw_128,
+ SchedWritePHAdd.XMM, memop>;
+ defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
+ v16i8, VR128, memop, i128mem,
+ SchedWriteVecIMul.XMM>;
+}
+defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
+ VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Packed Align Instruction Patterns
+//===---------------------------------------------------------------------===//
+
+multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
+ PatFrag memop_frag, X86MemOperand x86memop,
+ X86FoldableSchedWrite sched, bit Is2Addr = 1> {
+ let hasSideEffects = 0 in {
+ def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>,
+ Sched<[sched]>;
+ let mayLoad = 1 in
+ def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set RC:$dst, (VT (X86PAlignr RC:$src1,
+ (memop_frag addr:$src2),
+ (i8 timm:$src3))))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
+ defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
+ SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
+ defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
+ SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
+let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
+ defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
+ SchedWriteShuffle.XMM>;
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Thread synchronization
+//===---------------------------------------------------------------------===//
+
+let SchedRW = [WriteSystem] in {
+let Uses = [EAX, ECX, EDX] in
+def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
+ TB, Requires<[HasSSE3, Not64BitMode]>;
+let Uses = [RAX, ECX, EDX] in
+def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
+ TB, Requires<[HasSSE3, In64BitMode]>;
+
+let Uses = [ECX, EAX] in
+def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
+ [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
+} // SchedRW
+
+def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
+def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
+
+def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
+ Requires<[Not64BitMode]>;
+def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
+ Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Packed Move with Sign/Zero Extend
+// NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
+//===----------------------------------------------------------------------===//
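+// Roughly, that NOTE means any-extending nodes (the counterpart of the
+// sext_invec/zext_invec fragments used further down) have already been
+// rewritten into the zero-extending form by the time these patterns are
+// matched, so only sign- and zero-extend patterns are listed below. This is a
+// simplified sketch, not a precise description of the DAG preprocessing.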
+
+multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
+ RegisterClass OutRC, RegisterClass InRC,
+ X86FoldableSchedWrite sched> {
+ def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
+ Sched<[sched]>;
+
+ def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
+ Sched<[sched.Folded]>;
+}
+
+multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
+ X86MemOperand MemOp, X86MemOperand MemYOp,
+ Predicate prd> {
+ defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
+ SchedWriteShuffle.XMM>;
+ let Predicates = [HasAVX, prd] in
+ defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
+ VR128, VR128, SchedWriteShuffle.XMM>,
+ VEX, VEX_WIG;
+ let Predicates = [HasAVX2, prd] in
+ defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
+ VR256, VR128, WriteShuffle256>,
+ VEX, VEX_L, VEX_WIG;
+}
+
+multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
+ X86MemOperand MemYOp, Predicate prd> {
+ defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
+ MemOp, MemYOp, prd>;
+ defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
+ !strconcat("pmovzx", OpcodeStr),
+ MemOp, MemYOp, prd>;
+}
+
+defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
+defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
+defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;
+
+defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
+defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;
+
+defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
+
+// AVX2 Patterns
+multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
+ SDNode ExtOp, SDNode InVecOp> {
+ // Register-Register patterns
+ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+ def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
+ (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
+ }
+ let Predicates = [HasAVX2, NoVLX] in {
+ def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
+ (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
+ def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
+ (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
+
+ def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
+ (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
+ def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
+ (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
+
+ def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
+ (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
+ }
+
+ // Simple Register-Memory patterns
+ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+ def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+ (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+
+ def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
+ (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+ }
+
+ let Predicates = [HasAVX2, NoVLX] in {
+ def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+ (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+ def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+ (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+
+ def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+ (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+ def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+ (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+
+ def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
+ (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+ }
+
+ // AVX2 Register-Memory patterns
+ let Predicates = [HasAVX2, NoVLX] in {
+ def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
+ (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+
+ def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+ def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+ def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
+ (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+
+ def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
+ (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+
+ def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+ def : Pat<(v4i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload32 addr:$src))))),
+ (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+
+ def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+ def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+ def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
+ (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+ }
+}
+
+defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
+defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;
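+
+// Note on the patterns above: ExtOp (sext/zext) extends every element of its
+// source vector, while InVecOp (sext_invec/zext_invec) extends only the low
+// elements of a wider-than-needed source. For example, with VPMOVSX:
+//   (v4i64 (sext       (v4i32 X)))  - extends all four dwords of X
+//   (v4i64 (sext_invec (v16i8 X)))  - extends only bytes 0..3 of X; the
+//                                     remaining bytes are ignored.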
+
+// SSE4.1/AVX patterns.
+multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
+ SDNode ExtOp> {
+ let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+ def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
+ (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
+ }
+ let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
+ (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
+ def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
+ (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;
+
+ def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
+ (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
+ def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
+ (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;
+
+ def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
+ (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
+ }
+ let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+ def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+ (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+ }
+ let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+ (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+ def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+ (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+
+ def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+ (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+ def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+ (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+
+ def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
+ (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+ }
+ let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+ def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+ def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+ def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
+ (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+ def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
+ (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+ }
+ let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
+ (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
+ (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+
+ def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
+ (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+
+ def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
+ (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
+ (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+
+ def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
+ (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
+ (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+
+ def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
+ (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
+ (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+ }
+}
+
+defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
+defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;
+
+let Predicates = [UseSSE41] in {
+ defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
+ defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Extract Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS41I_extract8 - SSE 4.1 extract 8 bits to a 32-bit reg or 8-bit mem
+multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
+ timm:$src2))]>,
+ Sched<[WriteVecExtract]>;
+ let hasSideEffects = 0, mayStore = 1 in
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), timm:$src2))),
+ addr:$dst)]>, Sched<[WriteVecExtractSt]>;
+}
+
+let Predicates = [HasAVX, NoBWI] in
+ defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG;
+
+defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
+
+
+/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
+multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+ Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;
+
+ let hasSideEffects = 0, mayStore = 1 in
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), timm:$src2))),
+ addr:$dst)]>, Sched<[WriteVecExtractSt]>;
+}
+
+let Predicates = [HasAVX, NoBWI] in
+ defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG;
+
+defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
+
+
+/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
+multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32:$dst,
+ (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
+ Sched<[WriteVecExtract]>;
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
+ addr:$dst)]>, Sched<[WriteVecExtractSt]>;
+}
+
+let Predicates = [HasAVX, NoDQI] in
+ defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
+
+defm PEXTRD : SS41I_extract32<0x16, "pextrd">;
+
+/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
+multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR64:$dst,
+ (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
+ Sched<[WriteVecExtract]>;
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
+ addr:$dst)]>, Sched<[WriteVecExtractSt]>;
+}
+
+let Predicates = [HasAVX, NoDQI] in
+ defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
+
+defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W;
+
+/// SS41I_extractf32 - SSE 4.1 extract a 32-bit fp value to an int reg or
+/// memory destination
+multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32orGR64:$dst,
+ (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
+ Sched<[WriteVecExtract]>;
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
+ addr:$dst)]>, Sched<[WriteVecExtractSt]>;
+}
+
+let ExeDomain = SSEPackedSingle in {
+ let Predicates = [UseAVX] in
+ defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
+ defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Insert Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (X86pinsrb VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), timm:$src3))]>,
+ Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
+}
+
+let Predicates = [HasAVX, NoBWI] in
+ defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG;
+let Constraints = "$src1 = $dst" in
+ defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
+
+multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, GR32:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>,
+ Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
+}
+
+let Predicates = [HasAVX, NoDQI] in
+ defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
+let Constraints = "$src1 = $dst" in
+ defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
+
+multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, GR64:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>,
+ Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
+}
+
+let Predicates = [HasAVX, NoDQI] in
+ defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
+let Constraints = "$src1 = $dst" in
+ defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
+
+// insertps has a few different modes. The first two below are optimized
+// inserts that won't zero arbitrary elements in the destination vector. The
+// next one matches the intrinsic and could zero arbitrary elements in the
+// target vector.
+multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
+ let isCommutable = 1 in
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>,
+ Sched<[SchedWriteFShuffle.XMM]>;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(asm,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (X86insertps VR128:$src1,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
+ timm:$src3))]>,
+ Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
+}
+
+let ExeDomain = SSEPackedSingle in {
+ let Predicates = [UseAVX] in
+ defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
+ VEX_4V, VEX_WIG;
+ let Constraints = "$src1 = $dst" in
+ defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
+}
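+
+// For reference, the insertps immediate in the register form is interpreted
+// as: imm[7:6] selects the source element, imm[5:4] selects the destination
+// element, and imm[3:0] is a zero mask applied to the result. E.g. imm 0x4A
+// (0b01'00'1010) copies element 1 of $src2 into element 0 of the destination
+// and zeroes destination elements 1 and 3. With a memory operand the loaded
+// f32 is the source and imm[7:6] is not used.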
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Round Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
+ X86MemOperand x86memop, RegisterClass RC,
+ ValueType VT, PatFrag mem_frag, SDNode OpNode,
+ X86FoldableSchedWrite sched> {
+  // Vector intrinsic operation, reg.
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
+ def r : SS4AIi8<opc, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
+ Sched<[sched]>;
+
+ // Vector intrinsic operation, mem
+ def m : SS4AIi8<opc, MRMSrcMem,
+ (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
+ Sched<[sched.Folded]>;
+}
+}
+
+multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr, X86FoldableSchedWrite sched> {
+let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
+ def SSr : SS4AIi8<opcss, MRMSrcReg,
+ (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, Sched<[sched]>;
+
+ let mayLoad = 1 in
+ def SSm : SS4AIi8<opcss, MRMSrcMem,
+ (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
+} // ExeDomain = SSEPackedSingle, hasSideEffects = 0
+
+let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
+ def SDr : SS4AIi8<opcsd, MRMSrcReg,
+ (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, Sched<[sched]>;
+
+ let mayLoad = 1 in
+ def SDm : SS4AIi8<opcsd, MRMSrcMem,
+ (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
+} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
+}
+
+multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr, X86FoldableSchedWrite sched> {
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
+let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
+ def SSr : SS4AIi8<opcss, MRMSrcReg,
+ (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[sched]>;
+
+ let mayLoad = 1 in
+ def SSm : SS4AIi8<opcss, MRMSrcMem,
+ (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
+} // ExeDomain = SSEPackedSingle, hasSideEffects = 0
+
+let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
+ def SDr : SS4AIi8<opcsd, MRMSrcReg,
+ (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[sched]>;
+
+ let mayLoad = 1 in
+ def SDm : SS4AIi8<opcsd, MRMSrcMem,
+ (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
+} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
+}
+}
+
+multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr, X86FoldableSchedWrite sched,
+ ValueType VT32, ValueType VT64,
+ SDNode OpNode, bit Is2Addr = 1> {
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
+let ExeDomain = SSEPackedSingle in {
+ def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
+ Sched<[sched]>;
+
+ def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (OpNode VR128:$src1, (sse_load_f32 addr:$src2), timm:$src3))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+} // ExeDomain = SSEPackedSingle
+
+let ExeDomain = SSEPackedDouble in {
+ def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
+ Sched<[sched]>;
+
+ def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set VR128:$dst,
+ (OpNode VR128:$src1, (sse_load_f64 addr:$src2), timm:$src3))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+} // ExeDomain = SSEPackedDouble
+}
+}
+
+// FP round - roundss, roundps, roundsd, roundpd
+let Predicates = [HasAVX, NoVLX] in {
+ let ExeDomain = SSEPackedSingle, Uses = [MXCSR], mayRaiseFPException = 1 in {
+ // Intrinsic form
+ defm VROUNDPS : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
+ loadv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>,
+ VEX, VEX_WIG;
+ defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
+ loadv8f32, X86any_VRndScale, SchedWriteFRnd.YMM>,
+ VEX, VEX_L, VEX_WIG;
+ }
+
+ let ExeDomain = SSEPackedDouble, Uses = [MXCSR], mayRaiseFPException = 1 in {
+ defm VROUNDPD : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
+ loadv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>,
+ VEX, VEX_WIG;
+ defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
+ loadv4f64, X86any_VRndScale, SchedWriteFRnd.YMM>,
+ VEX, VEX_L, VEX_WIG;
+ }
+}
+let Predicates = [UseAVX] in {
+ defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
+ v4f32, v2f64, X86RndScales, 0>,
+ VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
+ defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
+ VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
+}
+
+let Predicates = [UseAVX] in {
+ def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
+ (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
+ def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
+}
+
+let Predicates = [UseAVX, OptForSize] in {
+ def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
+ (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
+ def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
+ (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
+}
+
+let ExeDomain = SSEPackedSingle in
+defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
+ memopv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>;
+let ExeDomain = SSEPackedDouble in
+defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
+ memopv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>;
+
+defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;
+
+let Constraints = "$src1 = $dst" in
+defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
+ v4f32, v2f64, X86RndScales>;
+
+let Predicates = [UseSSE41] in {
+ def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
+ (ROUNDSSr FR32:$src1, timm:$src2)>;
+ def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
+ (ROUNDSDr FR64:$src1, timm:$src2)>;
+}
+
+let Predicates = [UseSSE41, OptForSize] in {
+ def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
+ (ROUNDSSm addr:$src1, timm:$src2)>;
+ def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
+ (ROUNDSDm addr:$src1, timm:$src2)>;
+}
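+
+// For reference, the round* immediate (the timm operand above) encodes:
+// imm[1:0] = rounding mode (00 nearest-even, 01 toward -inf, 10 toward +inf,
+// 11 toward zero), imm[2] = use MXCSR.RC instead of imm[1:0] when set, and
+// imm[3] = suppress the precision (inexact) exception when set. E.g. imm 0x9
+// (0b1001) rounds toward -inf without raising the precision exception, which
+// is how floor() is typically lowered.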
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Packed Bit Test
+//===----------------------------------------------------------------------===//
+
+// ptest instruction: X86ISelLowering lowers to this, primarily from the Intel
+// intrinsic that corresponds to it.
+let Defs = [EFLAGS], Predicates = [HasAVX] in {
+def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "vptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
+ Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
+def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+ "vptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
+ Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>,
+ VEX, VEX_WIG;
+
+def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
+ "vptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
+ Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
+def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
+ "vptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
+ Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>,
+ VEX, VEX_L, VEX_WIG;
+}
+
+let Defs = [EFLAGS] in {
+def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "ptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
+ Sched<[SchedWriteVecTest.XMM]>;
+def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+ "ptest\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
+ Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
+}
+
+// The bit test instructions below are AVX only
+multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
+ X86FoldableSchedWrite sched> {
+ def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
+ Sched<[sched]>, VEX;
+ def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
+}
+
+let Defs = [EFLAGS], Predicates = [HasAVX] in {
+let ExeDomain = SSEPackedSingle in {
+defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
+ SchedWriteFTest.XMM>;
+defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
+ SchedWriteFTest.YMM>, VEX_L;
+}
+let ExeDomain = SSEPackedDouble in {
+defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
+ SchedWriteFTest.XMM>;
+defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
+ SchedWriteFTest.YMM>, VEX_L;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Misc Instructions
+//===----------------------------------------------------------------------===//
+
+let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
+ def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "popcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
+ Sched<[WritePOPCNT]>, OpSize16, XS;
+ def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "popcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
+ (implicit EFLAGS)]>,
+ Sched<[WritePOPCNT.Folded]>, OpSize16, XS;
+
+ def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "popcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
+ Sched<[WritePOPCNT]>, OpSize32, XS;
+
+ def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "popcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
+ (implicit EFLAGS)]>,
+ Sched<[WritePOPCNT.Folded]>, OpSize32, XS;
+
+ def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "popcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
+ Sched<[WritePOPCNT]>, XS;
+ def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "popcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
+ (implicit EFLAGS)]>,
+ Sched<[WritePOPCNT.Folded]>, XS;
+}
+
+// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
+multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, PatFrag ld_frag,
+ X86FoldableSchedWrite Sched> {
+ def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
+ Sched<[Sched]>;
+ def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (v8i16 (OpNode (ld_frag addr:$src))))]>,
+ Sched<[Sched.Folded]>;
+}
+
+// PHMIN has the same profile as PSAD, thus we use the same scheduling
+// model, although the naming is misleading.
+let Predicates = [HasAVX] in
+defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
+ X86phminpos, load,
+ WritePHMINPOS>, VEX, VEX_WIG;
+defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
+ X86phminpos, memop,
+ WritePHMINPOS>;
+
+/// SS48I_binop_rm - Simple SSE41 binary operator.
+multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, X86FoldableSchedWrite sched,
+ bit Is2Addr = 1> {
+ let isCommutable = 1 in
+ def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
+ Sched<[sched]>;
+ def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+ defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
+ load, i128mem, SchedWriteVecALU.XMM, 0>,
+ VEX_4V, VEX_WIG;
+ defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
+ load, i128mem, SchedWriteVecALU.XMM, 0>,
+ VEX_4V, VEX_WIG;
+ defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
+ load, i128mem, SchedWriteVecALU.XMM, 0>,
+ VEX_4V, VEX_WIG;
+ defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
+ load, i128mem, SchedWriteVecALU.XMM, 0>,
+ VEX_4V, VEX_WIG;
+ defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
+ load, i128mem, SchedWriteVecIMul.XMM, 0>,
+ VEX_4V, VEX_WIG;
+}
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+ defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
+ load, i128mem, SchedWriteVecALU.XMM, 0>,
+ VEX_4V, VEX_WIG;
+ defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
+ load, i128mem, SchedWriteVecALU.XMM, 0>,
+ VEX_4V, VEX_WIG;
+ defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
+ load, i128mem, SchedWriteVecALU.XMM, 0>,
+ VEX_4V, VEX_WIG;
+ defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
+ load, i128mem, SchedWriteVecALU.XMM, 0>,
+ VEX_4V, VEX_WIG;
+}
+
+let Predicates = [HasAVX2, NoVLX] in {
+ defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
+ load, i256mem, SchedWriteVecALU.YMM, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
+ load, i256mem, SchedWriteVecALU.YMM, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
+ load, i256mem, SchedWriteVecALU.YMM, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
+ load, i256mem, SchedWriteVecALU.YMM, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
+ load, i256mem, SchedWriteVecIMul.YMM, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+}
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+ defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
+ load, i256mem, SchedWriteVecALU.YMM, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
+ load, i256mem, SchedWriteVecALU.YMM, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
+ load, i256mem, SchedWriteVecALU.YMM, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
+ load, i256mem, SchedWriteVecALU.YMM, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+}
+
+let Constraints = "$src1 = $dst" in {
+ defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
+ memop, i128mem, SchedWriteVecALU.XMM, 1>;
+ defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
+ memop, i128mem, SchedWriteVecALU.XMM, 1>;
+ defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
+ memop, i128mem, SchedWriteVecALU.XMM, 1>;
+ defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
+ memop, i128mem, SchedWriteVecALU.XMM, 1>;
+ defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
+ memop, i128mem, SchedWriteVecALU.XMM, 1>;
+ defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
+ memop, i128mem, SchedWriteVecALU.XMM, 1>;
+ defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
+ memop, i128mem, SchedWriteVecALU.XMM, 1>;
+ defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
+ memop, i128mem, SchedWriteVecALU.XMM, 1>;
+ defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
+ memop, i128mem, SchedWriteVecIMul.XMM, 1>;
+}
+
+let Predicates = [HasAVX, NoVLX] in
+ defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
+ load, i128mem, SchedWritePMULLD.XMM, 0>,
+ VEX_4V, VEX_WIG;
+let Predicates = [HasAVX] in
+ defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
+ load, i128mem, SchedWriteVecALU.XMM, 0>,
+ VEX_4V, VEX_WIG;
+
+let Predicates = [HasAVX2, NoVLX] in
+ defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
+ load, i256mem, SchedWritePMULLD.YMM, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+let Predicates = [HasAVX2] in
+ defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
+ load, i256mem, SchedWriteVecALU.YMM, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+
+let Constraints = "$src1 = $dst" in {
+ defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
+ memop, i128mem, SchedWritePMULLD.XMM, 1>;
+ defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
+ memop, i128mem, SchedWriteVecALU.XMM, 1>;
+}
+
+/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
+multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, bit Is2Addr,
+ X86FoldableSchedWrite sched> {
+ let isCommutable = 1 in
+ def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
+ Sched<[sched]>;
+ def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set RC:$dst,
+ (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
+multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, bit Is2Addr,
+ X86FoldableSchedWrite sched> {
+ let isCommutable = 1 in
+ def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
+ Sched<[sched]>;
+ def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+def BlendCommuteImm2 : SDNodeXForm<timm, [{
+ uint8_t Imm = N->getZExtValue() & 0x03;
+ return getI8Imm(Imm ^ 0x03, SDLoc(N));
+}]>;
+
+def BlendCommuteImm4 : SDNodeXForm<timm, [{
+ uint8_t Imm = N->getZExtValue() & 0x0f;
+ return getI8Imm(Imm ^ 0x0f, SDLoc(N));
+}]>;
+
+def BlendCommuteImm8 : SDNodeXForm<timm, [{
+ uint8_t Imm = N->getZExtValue() & 0xff;
+ return getI8Imm(Imm ^ 0xff, SDLoc(N));
+}]>;
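+
+// These transforms implement operand commutation for immediate blends: bit i
+// of the immediate selects element i from the second source (and from the
+// first source when clear), so swapping the two sources is equivalent to
+// inverting the used immediate bits. E.g. blendps with imm 0b0101 on (A, B)
+// produces the same result as blendps with imm 0b1010 on (B, A), which is why
+// BlendCommuteImm4 XORs the immediate with 0xf.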
+
+// Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
+def BlendScaleImm4 : SDNodeXForm<timm, [{
+ uint8_t Imm = N->getZExtValue();
+ uint8_t NewImm = 0;
+ for (unsigned i = 0; i != 4; ++i) {
+ if (Imm & (1 << i))
+ NewImm |= 0x3 << (i * 2);
+ }
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
+
+// Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
+def BlendScaleImm2 : SDNodeXForm<timm, [{
+ uint8_t Imm = N->getZExtValue();
+ uint8_t NewImm = 0;
+ for (unsigned i = 0; i != 2; ++i) {
+ if (Imm & (1 << i))
+ NewImm |= 0xf << (i * 4);
+ }
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
+
+// Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
+def BlendScaleImm2to4 : SDNodeXForm<timm, [{
+ uint8_t Imm = N->getZExtValue();
+ uint8_t NewImm = 0;
+ for (unsigned i = 0; i != 2; ++i) {
+ if (Imm & (1 << i))
+ NewImm |= 0x3 << (i * 2);
+ }
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
+
+// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
+def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{
+ uint8_t Imm = N->getZExtValue();
+ uint8_t NewImm = 0;
+ for (unsigned i = 0; i != 4; ++i) {
+ if (Imm & (1 << i))
+ NewImm |= 0x3 << (i * 2);
+ }
+ return getI8Imm(NewImm ^ 0xff, SDLoc(N));
+}]>;
+
+// Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
+def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{
+ uint8_t Imm = N->getZExtValue();
+ uint8_t NewImm = 0;
+ for (unsigned i = 0; i != 2; ++i) {
+ if (Imm & (1 << i))
+ NewImm |= 0xf << (i * 4);
+ }
+ return getI8Imm(NewImm ^ 0xff, SDLoc(N));
+}]>;
+
+// Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
+def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{
+ uint8_t Imm = N->getZExtValue();
+ uint8_t NewImm = 0;
+ for (unsigned i = 0; i != 2; ++i) {
+ if (Imm & (1 << i))
+ NewImm |= 0x3 << (i * 2);
+ }
+ return getI8Imm(NewImm ^ 0xf, SDLoc(N));
+}]>;
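+
+// Worked example for the scaling transforms: a v4i32/v4f32 blend immediate of
+// 0b0101 (take elements 0 and 2 from the second source) becomes the pblendw
+// immediate 0b00110011 (0x33), since each 32-bit element covers two 16-bit
+// words. The *Commute* variants additionally invert the result (e.g.
+// 0x33 ^ 0xff = 0xcc) so the same rewrite applies when the operands are
+// swapped to fold a load from the first source.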
+
+let Predicates = [HasAVX] in {
+ let isCommutable = 0 in {
+ defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
+ VR128, load, i128mem, 0,
+ SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
+ }
+
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
+ let ExeDomain = SSEPackedSingle in
+ defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
+ VR128, load, f128mem, 0,
+ SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
+ let ExeDomain = SSEPackedDouble in
+ defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
+ VR128, load, f128mem, 0,
+ SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
+ let ExeDomain = SSEPackedSingle in
+ defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
+ VR256, load, i256mem, 0,
+ SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
+}
+}
+
+let Predicates = [HasAVX2] in {
+ let isCommutable = 0 in {
+ defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
+ VR256, load, i256mem, 0,
+ SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
+ }
+}
+
+let Constraints = "$src1 = $dst" in {
+ let isCommutable = 0 in {
+ defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
+ VR128, memop, i128mem, 1,
+ SchedWriteMPSAD.XMM>;
+ }
+
+ let ExeDomain = SSEPackedSingle in
+ defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
+ VR128, memop, f128mem, 1,
+ SchedWriteDPPS.XMM>, SIMD_EXC;
+ let ExeDomain = SSEPackedDouble in
+ defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
+ VR128, memop, f128mem, 1,
+ SchedWriteDPPD.XMM>, SIMD_EXC;
+}
+
+/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
+multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, bit Is2Addr, Domain d,
+ X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
+let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
+ let isCommutable = 1 in
+ def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
+ Sched<[sched]>;
+ def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+ // Pattern to commute if load is in first source.
+ def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)),
+ (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
+ (commuteXForm timm:$src3))>;
+}
+
+let Predicates = [HasAVX] in {
+ defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
+ VR128, load, f128mem, 0, SSEPackedSingle,
+ SchedWriteFBlend.XMM, BlendCommuteImm4>,
+ VEX_4V, VEX_WIG;
+ defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
+ VR256, load, f256mem, 0, SSEPackedSingle,
+ SchedWriteFBlend.YMM, BlendCommuteImm8>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
+ VR128, load, f128mem, 0, SSEPackedDouble,
+ SchedWriteFBlend.XMM, BlendCommuteImm2>,
+ VEX_4V, VEX_WIG;
+ defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
+ VR256, load, f256mem, 0, SSEPackedDouble,
+ SchedWriteFBlend.YMM, BlendCommuteImm4>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
+ VR128, load, i128mem, 0, SSEPackedInt,
+ SchedWriteBlend.XMM, BlendCommuteImm8>,
+ VEX_4V, VEX_WIG;
+}
+
+let Predicates = [HasAVX2] in {
+ defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
+ VR256, load, i256mem, 0, SSEPackedInt,
+ SchedWriteBlend.YMM, BlendCommuteImm8>,
+ VEX_4V, VEX_L, VEX_WIG;
+}
+
+// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
+// ExecutionDomainFixPass will clean up domains later on.
+let Predicates = [HasAVX1Only] in {
+def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
+ (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>;
+def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
+ (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>;
+def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
+ (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>;
+
+// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
+// it from becoming movsd via commuting under optsize.
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
+ (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
+ (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
+def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
+ (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
+
+def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3),
+ (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>;
+def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3),
+ (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>;
+def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3),
+ (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>;
+
+// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
+// it from becoming movss via commuting under optsize.
+def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
+ (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3),
+ (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3),
+ (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
+}
+
+defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
+ VR128, memop, f128mem, 1, SSEPackedSingle,
+ SchedWriteFBlend.XMM, BlendCommuteImm4>;
+defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
+ VR128, memop, f128mem, 1, SSEPackedDouble,
+ SchedWriteFBlend.XMM, BlendCommuteImm2>;
+defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
+ VR128, memop, i128mem, 1, SSEPackedInt,
+ SchedWriteBlend.XMM, BlendCommuteImm8>;
+
+let Predicates = [UseSSE41] in {
+// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
+// it from becoming movss via commuting under optsize.
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
+ (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3),
+ (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
+def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3),
+ (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
+
+def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
+ (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3),
+ (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3),
+ (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
+}
+
+// For insertion into the zero index (low half) of a 256-bit vector, it is
+// more efficient to generate a blend with immediate instead of an insert*128.
+let Predicates = [HasAVX] in {
+def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
+ (VBLENDPDYrri VR256:$src1,
+ (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0x3)>;
+def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
+ (VBLENDPSYrri VR256:$src1,
+ (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
+
+def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
+ (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
+ VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
+def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
+ (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
+ VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
+}
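+
+// In the patterns above a set immediate bit selects from the second
+// instruction operand: 0x3 and 0xf take the low 128 bits from the widened XMM
+// register (the second operand of the rri forms), while 0xc and 0xf0 take the
+// high 128 bits from the 256-bit memory operand of the rmi forms. Either way
+// the result has the inserted value in its low half.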
+
+/// SS41I_quaternary_avx - AVX SSE 4.1 with 4 operands
+multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, ValueType VT,
+ PatFrag mem_frag, SDNode OpNode,
+ X86FoldableSchedWrite sched> {
+ def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
+ SSEPackedInt>, TAPD, VEX_4V,
+ Sched<[sched]>;
+
+ def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpNode RC:$src3, (mem_frag addr:$src2),
+ RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
+ Sched<[sched.Folded, sched.ReadAfterFold,
+ // x86memop:$src2
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // RC::$src3
+ sched.ReadAfterFold]>;
+}
+
+let Predicates = [HasAVX] in {
+let ExeDomain = SSEPackedDouble in {
+defm VBLENDVPD : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
+ v2f64, loadv2f64, X86Blendv,
+ SchedWriteFVarBlend.XMM>;
+defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
+ v4f64, loadv4f64, X86Blendv,
+ SchedWriteFVarBlend.YMM>, VEX_L;
+} // ExeDomain = SSEPackedDouble
+let ExeDomain = SSEPackedSingle in {
+defm VBLENDVPS : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
+ v4f32, loadv4f32, X86Blendv,
+ SchedWriteFVarBlend.XMM>;
+defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
+ v8f32, loadv8f32, X86Blendv,
+ SchedWriteFVarBlend.YMM>, VEX_L;
+} // ExeDomain = SSEPackedSingle
+defm VPBLENDVB : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
+ v16i8, loadv16i8, X86Blendv,
+ SchedWriteVarBlend.XMM>;
+}
+
+let Predicates = [HasAVX2] in {
+defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
+ v32i8, loadv32i8, X86Blendv,
+ SchedWriteVarBlend.YMM>, VEX_L;
+}
+
+let Predicates = [HasAVX] in {
+ def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
+ (v4i32 VR128:$src2))),
+ (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+ def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
+ (v2i64 VR128:$src2))),
+ (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+ def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
+ (v8i32 VR256:$src2))),
+ (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+ def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
+ (v4i64 VR256:$src2))),
+ (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+}
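+
+// Note: the blendv* instructions select each element based on the sign bit of
+// the corresponding mask element, so the integer forms of X86Blendv can reuse
+// the FP instructions here; only the sign bit of each lane is observed.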
+
+// Prefer a movss or movsd over a blendps when optimizing for size. These were
+// changed to use blends because blends have better throughput on Sandy Bridge
+// and Haswell, but movs[s/d] are 1-2 bytes shorter.
+let Predicates = [HasAVX, OptForSpeed] in {
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+ (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+ (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
+
+ def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+ (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
+ def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
+ (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
+ def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
+ (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
+
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
+ (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
+ def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
+ (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
+
+ // Move low f32 and clear high bits.
+ def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
+ (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
+ (i8 1))), sub_xmm)>;
+ def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
+ (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
+ (i8 3))), sub_xmm)>;
+}
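+
+// In the patterns above, blending against V_SET0 implements "move the low
+// element and zero the rest": e.g. (VBLENDPSrri (V_SET0), $src, 1) keeps
+// element 0 from $src and zeroes elements 1-3, and the pblendw immediate 3
+// covers the same low 32 bits as two 16-bit words.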
+
+// Prefer a movss or movsd over a blendps when optimizing for size. These were
+// changed to use blends because blends have better throughput on Sandy Bridge
+// and Haswell, but movs[s/d] are 1-2 bytes shorter.
+let Predicates = [UseSSE41, OptForSpeed] in {
+ // With SSE41 we can use blends for these patterns.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+ (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+ (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
+
+ def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+ (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
+ def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
+ (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
+ def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
+ (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
+
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
+ (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
+ def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
+ (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
+}
+
+
+/// SS41I_ternary - SSE 4.1 ternary operator
+let Uses = [XMM0], Constraints = "$src1 = $dst" in {
+ multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
+ PatFrag mem_frag, X86MemOperand x86memop,
+ SDNode OpNode, X86FoldableSchedWrite sched> {
+ def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr,
+ "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
+ [(set VR128:$dst,
+ (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
+ Sched<[sched]>;
+
+ def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr,
+ "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
+ [(set VR128:$dst,
+ (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+
+let ExeDomain = SSEPackedDouble in
+defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
+ X86Blendv, SchedWriteFVarBlend.XMM>;
+let ExeDomain = SSEPackedSingle in
+defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
+ X86Blendv, SchedWriteFVarBlend.XMM>;
+defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
+ X86Blendv, SchedWriteVarBlend.XMM>;
+
+// Aliases with the implicit xmm0 argument
+def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
+ (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
+def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
+ (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
+def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
+ (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
+def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
+ (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
+def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
+ (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
+def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
+ (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
+
+let Predicates = [UseSSE41] in {
+ def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
+ (v4i32 VR128:$src2))),
+ (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
+ def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
+ (v2i64 VR128:$src2))),
+ (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
+}
+
+let AddedComplexity = 400 in { // Prefer non-temporal versions
+
+let Predicates = [HasAVX, NoVLX] in
+def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG;
+let Predicates = [HasAVX2, NoVLX] in
+def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+ "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG;
+def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movntdqa\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;
+
+let Predicates = [HasAVX2, NoVLX] in {
+ def : Pat<(v8f32 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAYrm addr:$src)>;
+ def : Pat<(v4f64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAYrm addr:$src)>;
+ def : Pat<(v4i64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAYrm addr:$src)>;
+ def : Pat<(v8i32 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAYrm addr:$src)>;
+ def : Pat<(v16i16 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAYrm addr:$src)>;
+ def : Pat<(v32i8 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAYrm addr:$src)>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v4f32 (alignednontemporalload addr:$src)),
+ (VMOVNTDQArm addr:$src)>;
+ def : Pat<(v2f64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQArm addr:$src)>;
+ def : Pat<(v2i64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQArm addr:$src)>;
+ def : Pat<(v4i32 (alignednontemporalload addr:$src)),
+ (VMOVNTDQArm addr:$src)>;
+ def : Pat<(v8i16 (alignednontemporalload addr:$src)),
+ (VMOVNTDQArm addr:$src)>;
+ def : Pat<(v16i8 (alignednontemporalload addr:$src)),
+ (VMOVNTDQArm addr:$src)>;
+}
+
+let Predicates = [UseSSE41] in {
+ def : Pat<(v4f32 (alignednontemporalload addr:$src)),
+ (MOVNTDQArm addr:$src)>;
+ def : Pat<(v2f64 (alignednontemporalload addr:$src)),
+ (MOVNTDQArm addr:$src)>;
+ def : Pat<(v2i64 (alignednontemporalload addr:$src)),
+ (MOVNTDQArm addr:$src)>;
+ def : Pat<(v4i32 (alignednontemporalload addr:$src)),
+ (MOVNTDQArm addr:$src)>;
+ def : Pat<(v8i16 (alignednontemporalload addr:$src)),
+ (MOVNTDQArm addr:$src)>;
+ def : Pat<(v16i8 (alignednontemporalload addr:$src)),
+ (MOVNTDQArm addr:$src)>;
+}
+
+} // AddedComplexity
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 - Compare Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS42I_binop_rm - Simple SSE 4.2 binary operator
+multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, X86FoldableSchedWrite sched,
+ bit Is2Addr = 1> {
+ def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
+ Sched<[sched]>;
+ def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+let Predicates = [HasAVX] in
+ defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
+ load, i128mem, SchedWriteVecALU.XMM, 0>,
+ VEX_4V, VEX_WIG;
+
+let Predicates = [HasAVX2] in
+ defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
+ load, i256mem, SchedWriteVecALU.YMM, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+
+let Constraints = "$src1 = $dst" in
+ defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
+ memop, i128mem, SchedWriteVecALU.XMM>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 - String/text Processing Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass pcmpistrm_SS42AI<string asm> {
+ def rr : SS42AI<0x62, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, Sched<[WritePCmpIStrM]>;
+ let mayLoad = 1 in
+ def rm : SS42AI<0x62, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
+}
+
+let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
+ let Predicates = [HasAVX] in
+ defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
+ defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm">;
+}
+
+multiclass SS42AI_pcmpestrm<string asm> {
+ def rr : SS42AI<0x60, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src3, u8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, Sched<[WritePCmpEStrM]>;
+ let mayLoad = 1 in
+ def rm : SS42AI<0x60, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
+}
+
+let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
+ let Predicates = [HasAVX] in
+ defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
+ defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">;
+}
+
+multiclass SS42AI_pcmpistri<string asm> {
+ def rr : SS42AI<0x63, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, Sched<[WritePCmpIStrI]>;
+ let mayLoad = 1 in
+ def rm : SS42AI<0x63, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
+}
+
+let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
+ let Predicates = [HasAVX] in
+ defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
+ defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
+}
+
+multiclass SS42AI_pcmpestri<string asm> {
+ def rr : SS42AI<0x61, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src3, u8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, Sched<[WritePCmpEStrI]>;
+ let mayLoad = 1 in
+ def rm : SS42AI<0x61, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
+}
+
+let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
+ let Predicates = [HasAVX] in
+ defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
+ defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 - CRC Instructions
+//===----------------------------------------------------------------------===//
+
+// No CRC instructions have AVX equivalents
+
+// CRC intrinsic instructions.
+// This set of instructions comes only in r and m forms; the only difference
+// between variants is the size of r and m.
+class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
+ RegisterClass RCIn, SDPatternOperator Int> :
+ SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
+ !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
+ [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
+ Sched<[WriteCRC32]>;
+
+class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
+ X86MemOperand x86memop, SDPatternOperator Int> :
+ SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
+ !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
+ [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
+ Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;
+
+let Constraints = "$src1 = $dst" in {
+ def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
+ int_x86_sse42_crc32_32_8>;
+ def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
+ int_x86_sse42_crc32_32_8>;
+ def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
+ int_x86_sse42_crc32_32_16>, OpSize16;
+ def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
+ int_x86_sse42_crc32_32_16>, OpSize16;
+ def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
+ int_x86_sse42_crc32_32_32>, OpSize32;
+ def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
+ int_x86_sse42_crc32_32_32>, OpSize32;
+ def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
+ int_x86_sse42_crc32_64_64>, REX_W;
+ def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
+ int_x86_sse42_crc32_64_64>, REX_W;
+ let hasSideEffects = 0 in {
+ let mayLoad = 1 in
+ def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
+ null_frag>, REX_W;
+ def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
+ null_frag>, REX_W;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// SHA-NI Instructions
+//===----------------------------------------------------------------------===//
+
+// FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
+multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
+ X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
+ def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !if(UsesXMM0,
+ !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
+ [!if(UsesXMM0,
+ (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
+ (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
+ T8PS, Sched<[sched]>;
+
+ def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !if(UsesXMM0,
+ !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
+ [!if(UsesXMM0,
+ (set VR128:$dst, (IntId VR128:$src1,
+ (memop addr:$src2), XMM0)),
+ (set VR128:$dst, (IntId VR128:$src1,
+ (memop addr:$src2))))]>, T8PS,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
+ def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
+ (i8 timm:$src3)))]>, TAPS,
+ Sched<[SchedWriteVecIMul.XMM]>;
+ def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (int_x86_sha1rnds4 VR128:$src1,
+ (memop addr:$src2),
+ (i8 timm:$src3)))]>, TAPS,
+ Sched<[SchedWriteVecIMul.XMM.Folded,
+ SchedWriteVecIMul.XMM.ReadAfterFold]>;
+
+ defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
+ SchedWriteVecIMul.XMM>;
+ defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
+ SchedWriteVecIMul.XMM>;
+ defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
+ SchedWriteVecIMul.XMM>;
+
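+ // sha256rnds2 reads a third source (the pre-computed WK values) implicitly
+ // from XMM0.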
+ let Uses=[XMM0] in
+ defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
+ SchedWriteVecIMul.XMM, 1>;
+
+ defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
+ SchedWriteVecIMul.XMM>;
+ defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
+ SchedWriteVecIMul.XMM>;
+}
+
+// Aliases with the implicit %xmm0 argument
+def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
+ (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
+def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
+ (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;
+
+//===----------------------------------------------------------------------===//
+// AES-NI Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId, PatFrag ld_frag,
+ bit Is2Addr = 0, RegisterClass RC = VR128,
+ X86MemOperand MemOp = i128mem> {
+ let AsmString = OpcodeStr#
+ !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
+ def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2), "",
+ [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
+ Sched<[WriteAESDecEnc]>;
+ def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, MemOp:$src2), "",
+ [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
+ Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>;
+ }
+}
+
+// Perform One Round of an AES Encryption/Decryption Flow
+let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
+ defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc",
+ int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG;
+ defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
+ int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG;
+ defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec",
+ int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG;
+ defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
+ int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG;
+}
+
+let Predicates = [NoVLX, HasVAES] in {
+ defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc",
+ int_x86_aesni_aesenc_256, load, 0, VR256,
+ i256mem>, VEX_4V, VEX_L, VEX_WIG;
+ defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast",
+ int_x86_aesni_aesenclast_256, load, 0, VR256,
+ i256mem>, VEX_4V, VEX_L, VEX_WIG;
+ defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec",
+ int_x86_aesni_aesdec_256, load, 0, VR256,
+ i256mem>, VEX_4V, VEX_L, VEX_WIG;
+ defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast",
+ int_x86_aesni_aesdeclast_256, load, 0, VR256,
+ i256mem>, VEX_4V, VEX_L, VEX_WIG;
+}
+
+let Constraints = "$src1 = $dst" in {
+ defm AESENC : AESI_binop_rm_int<0xDC, "aesenc",
+ int_x86_aesni_aesenc, memop, 1>;
+ defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
+ int_x86_aesni_aesenclast, memop, 1>;
+ defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec",
+ int_x86_aesni_aesdec, memop, 1>;
+ defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
+ int_x86_aesni_aesdeclast, memop, 1>;
+}
+
+// Perform the AES InvMixColumn Transformation
+let Predicates = [HasAVX, HasAES] in {
+ def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1),
+ "vaesimc\t{$src1, $dst|$dst, $src1}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
+ VEX, VEX_WIG;
+ def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1),
+ "vaesimc\t{$src1, $dst|$dst, $src1}",
+ [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>,
+ Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
+}
+def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1),
+ "aesimc\t{$src1, $dst|$dst, $src1}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
+def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1),
+ "aesimc\t{$src1, $dst|$dst, $src1}",
+ [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>,
+ Sched<[WriteAESIMC.Folded]>;
+
+// AES Round Key Generation Assist
+let Predicates = [HasAVX, HasAES] in {
+ def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
+ Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
+ def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1, u8imm:$src2),
+ "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aeskeygenassist (load addr:$src1), timm:$src2))]>,
+ Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
+}
+def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
+ Sched<[WriteAESKeyGen]>;
+def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1, u8imm:$src2),
+ "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aeskeygenassist (memop addr:$src1), timm:$src2))]>,
+ Sched<[WriteAESKeyGen.Folded]>;
+
+//===----------------------------------------------------------------------===//
+// PCLMUL Instructions
+//===----------------------------------------------------------------------===//
+
+// Immediate transform to help with commuting.
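+// pclmulqdq's immediate selects one quadword from each source (bit 0 for the
+// first operand, bit 4 for the second), so swapping the two nibbles is the
+// same as swapping the operands.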
+def PCLMULCommuteImm : SDNodeXForm<timm, [{
+ uint8_t Imm = N->getZExtValue();
+ return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
+}]>;
+
+// SSE carry-less Multiplication instructions
+let Predicates = [NoAVX, HasPCLMUL] in {
+ let Constraints = "$src1 = $dst" in {
+ let isCommutable = 1 in
+ def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>,
+ Sched<[WriteCLMul]>;
+
+ def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
+ timm:$src3))]>,
+ Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
+ } // Constraints = "$src1 = $dst"
+
+ def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
+ (i8 timm:$src3)),
+ (PCLMULQDQrm VR128:$src1, addr:$src2,
+ (PCLMULCommuteImm timm:$src3))>;
+} // Predicates = [NoAVX, HasPCLMUL]
+
+// SSE aliases
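+// In the pclmul<a><b>dq mnemonics the first hq/lq selector picks the quadword
+// of the first source (immediate bit 0) and the second selector picks the
+// quadword of the second source (bit 4).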
+foreach HI = ["hq","lq"] in
+foreach LO = ["hq","lq"] in {
+ def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
+ (PCLMULQDQrr VR128:$dst, VR128:$src,
+ !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
+ def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
+ (PCLMULQDQrm VR128:$dst, i128mem:$src,
+ !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
+}
+
+// AVX carry-less Multiplication instructions
+multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
+ PatFrag LdFrag, Intrinsic IntId> {
+ let isCommutable = 1 in
+ def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u8imm:$src3),
+ "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set RC:$dst,
+ (IntId RC:$src1, RC:$src2, timm:$src3))]>,
+ Sched<[WriteCLMul]>;
+
+ def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, MemOp:$src2, u8imm:$src3),
+ "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set RC:$dst,
+ (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>,
+ Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
+
+ // We can commute a load in the first operand by swapping the sources and
+ // rotating the immediate.
+ def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)),
+ (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
+ (PCLMULCommuteImm timm:$src3))>;
+}
+
+let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
+defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load,
+ int_x86_pclmulqdq>, VEX_4V, VEX_WIG;
+
+let Predicates = [NoVLX, HasVPCLMULQDQ] in
+defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
+ int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG;
+
+multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
+ X86MemOperand MemOp, string Hi, string Lo> {
+ def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2,
+ !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
+ def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2,
+ !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
+}
+
+multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
+ X86MemOperand MemOp> {
+ defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">;
+ defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">;
+ defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">;
+ defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">;
+}
+
+// AVX aliases
+defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>;
+defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>;
+
+//===----------------------------------------------------------------------===//
+// SSE4A Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasSSE4A] in {
+
+let ExeDomain = SSEPackedInt in {
+let Constraints = "$src = $dst" in {
+def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
+ (ins VR128:$src, u8imm:$len, u8imm:$idx),
+ "extrq\t{$idx, $len, $src|$src, $len, $idx}",
+ [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len,
+ timm:$idx))]>,
+ PD, Sched<[SchedWriteVecALU.XMM]>;
+def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src, VR128:$mask),
+ "extrq\t{$mask, $src|$src, $mask}",
+ [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
+ VR128:$mask))]>,
+ PD, Sched<[SchedWriteVecALU.XMM]>;
+
+def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
+ "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
+ [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
+ timm:$len, timm:$idx))]>,
+ XD, Sched<[SchedWriteVecALU.XMM]>;
+def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src, VR128:$mask),
+ "insertq\t{$mask, $src|$src, $mask}",
+ [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
+ VR128:$mask))]>,
+ XD, Sched<[SchedWriteVecALU.XMM]>;
+}
+} // ExeDomain = SSEPackedInt
+
+// Non-temporal (unaligned) scalar stores.
+let AddedComplexity = 400 in { // Prefer non-temporal versions
+let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
+def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
+ "movntss\t{$src, $dst|$dst, $src}", []>, XS;
+
+def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movntsd\t{$src, $dst|$dst, $src}", []>, XD;
+} // SchedRW
+
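+// movntss/movntsd store the low scalar element of an XMM register, so the
+// FR32/FR64 value is first copied into VR128.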
+def : Pat<(nontemporalstore FR32:$src, addr:$dst),
+ (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
+
+def : Pat<(nontemporalstore FR64:$src, addr:$dst),
+ (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
+
+} // AddedComplexity
+} // HasSSE4A
+
+//===----------------------------------------------------------------------===//
+// AVX Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VBROADCAST - Load from memory and broadcast to all elements of the
+// destination operand
+//
+class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, ValueType VT,
+ PatFrag bcast_frag, SchedWrite Sched> :
+ AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (VT (bcast_frag addr:$src)))]>,
+ Sched<[Sched]>, VEX;
+
+// AVX2 adds register forms
+class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
+ AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
+ Sched<[Sched]>, VEX;
+
+let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
+ def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
+ f32mem, v4f32, X86VBroadcastld32,
+ SchedWriteFShuffle.XMM.Folded>;
+ def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
+ f32mem, v8f32, X86VBroadcastld32,
+ SchedWriteFShuffle.XMM.Folded>, VEX_L;
+}
+let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
+def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
+ v4f64, X86VBroadcastld64,
+ SchedWriteFShuffle.XMM.Folded>, VEX_L;
+
+let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
+ def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
+ v4f32, v4f32, SchedWriteFShuffle.XMM>;
+ def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
+ v8f32, v4f32, WriteFShuffle256>, VEX_L;
+}
+let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
+def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
+ v4f64, v2f64, WriteFShuffle256>, VEX_L;
+
+//===----------------------------------------------------------------------===//
+// VBROADCAST*128 - Load from memory and broadcast a 128-bit vector to both
+// halves of a 256-bit vector.
+//
+let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
+def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
+ (ins i128mem:$src),
+ "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
+ Sched<[WriteShuffleLd]>, VEX, VEX_L;
+
+let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
+ ExeDomain = SSEPackedSingle in
+def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
+ (ins f128mem:$src),
+ "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
+ Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
+
+let Predicates = [HasAVX, NoVLX] in {
+def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF128 addr:$src)>;
+def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF128 addr:$src)>;
+// NOTE: We're using FP instructions here, but execution domain fixing can
+// convert to integer when profitable.
+def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF128 addr:$src)>;
+def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF128 addr:$src)>;
+def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF128 addr:$src)>;
+def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF128 addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// VINSERTF128 - Insert packed floating-point values
+//
+let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
+def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR128:$src2, u8imm:$src3),
+ "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
+let mayLoad = 1 in
+def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
+ "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
+}
+
+// To create a 256-bit all-ones value, we should produce VCMPTRUEPS
+// with a YMM register containing zero.
+// FIXME: Avoid producing vxorps to clear the fake inputs.
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
+}
+
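+// vinsert128_insert and INSERT_get_vinsert128_imm (defined elsewhere) match an
+// insertion of a 128-bit subvector and convert the element index into the 0/1
+// lane immediate expected by VINSERTF128/VINSERTI128.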
+multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
+ PatFrag memop_frag> {
+ def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
+ (iPTR imm)),
+ (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+ def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
+ (From (memop_frag addr:$src2)),
+ (iPTR imm)),
+ (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+ defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>;
+ defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>;
+}
+
+let Predicates = [HasAVX1Only] in {
+ defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64, loadv2i64>;
+ defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv4i32>;
+ defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv8i16>;
+ defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv16i8>;
+}
+
+//===----------------------------------------------------------------------===//
+// VEXTRACTF128 - Extract packed floating-point values
+//
+let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
+def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
+ (ins VR256:$src1, u8imm:$src2),
+ "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
+let mayStore = 1 in
+def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
+ (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
+ "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
+}
+
+multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
+ def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+ (To (!cast<Instruction>(InstrStr#rr)
+ (From VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+ def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+}
+
+// AVX1 patterns
+let Predicates = [HasAVX, NoVLX] in {
+ defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
+ defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
+}
+
+let Predicates = [HasAVX1Only] in {
+ defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>;
+ defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>;
+ defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
+ defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
+}
+
+//===----------------------------------------------------------------------===//
+// VMASKMOV - Conditional SIMD Packed Loads and Stores
+//
+multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
+ Intrinsic IntLd, Intrinsic IntLd256,
+ Intrinsic IntSt, Intrinsic IntSt256,
+ X86SchedWriteMaskMove schedX,
+ X86SchedWriteMaskMove schedY> {
+ def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
+ VEX_4V, Sched<[schedX.RM]>;
+ def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f256mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
+ VEX_4V, VEX_L, Sched<[schedY.RM]>;
+ def mr : AVX8I<opc_mr, MRMDestMem, (outs),
+ (ins f128mem:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
+ VEX_4V, Sched<[schedX.MR]>;
+ def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
+ (ins f256mem:$dst, VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
+ VEX_4V, VEX_L, Sched<[schedY.MR]>;
+}
+
+let ExeDomain = SSEPackedSingle in
+defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
+ int_x86_avx_maskload_ps,
+ int_x86_avx_maskload_ps_256,
+ int_x86_avx_maskstore_ps,
+ int_x86_avx_maskstore_ps_256,
+ WriteFMaskMove32, WriteFMaskMove32Y>;
+let ExeDomain = SSEPackedDouble in
+defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
+ int_x86_avx_maskload_pd,
+ int_x86_avx_maskload_pd_256,
+ int_x86_avx_maskstore_pd,
+ int_x86_avx_maskstore_pd_256,
+ WriteFMaskMove64, WriteFMaskMove64Y>;
+
+//===----------------------------------------------------------------------===//
+// AVX_VNNI
+//===----------------------------------------------------------------------===//
+let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst" in
+multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ bit IsCommutable> {
+ let isCommutable = IsCommutable in
+ def rr : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (v4i32 (OpNode VR128:$src1,
+ VR128:$src2, VR128:$src3)))]>,
+ VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+
+ def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i128mem:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2,
+ (loadv4i32 addr:$src3))))]>,
+ VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+
+ let isCommutable = IsCommutable in
+ def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst, (v8i32 (OpNode VR256:$src1,
+ VR256:$src2, VR256:$src3)))]>,
+ VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>;
+
+ def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, i256mem:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2,
+ (loadv8i32 addr:$src3))))]>,
+ VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>;
+}
+
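+// These VEX forms share mnemonics with the EVEX-encoded AVX512_VNNI
+// instructions; ExplicitVEXPrefix lets the assembler select them with the
+// {vex} pseudo prefix.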
+defm VPDPBUSD : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>, ExplicitVEXPrefix;
+defm VPDPBUSDS : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>, ExplicitVEXPrefix;
+defm VPDPWSSD : avx_vnni_rm<0x52, "vpdpwssd", X86Vpdpwssd, 1>, ExplicitVEXPrefix;
+defm VPDPWSSDS : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>, ExplicitVEXPrefix;
+
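+// Only match vpmaddwd when its result has a single use, so that an add of the
+// result can be folded into vpdpwssd without also keeping the standalone
+// vpmaddwd.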
+def X86vpmaddwd_su : PatFrag<(ops node:$lhs, node:$rhs),
+ (X86vpmaddwd node:$lhs, node:$rhs), [{
+ return N->hasOneUse();
+}]>;
+
+let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI] in {
+ def : Pat<(v8i32 (add VR256:$src1,
+ (X86vpmaddwd_su VR256:$src2, VR256:$src3))),
+ (VPDPWSSDYrr VR256:$src1, VR256:$src2, VR256:$src3)>;
+ def : Pat<(v8i32 (add VR256:$src1,
+ (X86vpmaddwd_su VR256:$src2, (load addr:$src3)))),
+ (VPDPWSSDYrm VR256:$src1, VR256:$src2, addr:$src3)>;
+ def : Pat<(v4i32 (add VR128:$src1,
+ (X86vpmaddwd_su VR128:$src2, VR128:$src3))),
+ (VPDPWSSDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+ def : Pat<(v4i32 (add VR128:$src1,
+ (X86vpmaddwd_su VR128:$src2, (load addr:$src3)))),
+ (VPDPWSSDrm VR128:$src1, VR128:$src2, addr:$src3)>;
+}
+
+//===----------------------------------------------------------------------===//
+// VPERMIL - Permute Single and Double Floating-Point Values
+//
+
+multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
+ RegisterClass RC, X86MemOperand x86memop_f,
+ X86MemOperand x86memop_i,
+ ValueType f_vt, ValueType i_vt,
+ X86FoldableSchedWrite sched,
+ X86FoldableSchedWrite varsched> {
+ let Predicates = [HasAVX, NoVLX] in {
+ def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
+ Sched<[varsched]>;
+ def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop_i:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
+ (i_vt (load addr:$src2)))))]>, VEX_4V,
+ Sched<[varsched.Folded, sched.ReadAfterFold]>;
+
+ def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX,
+ Sched<[sched]>;
+ def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
+ (ins x86memop_f:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX,
+ Sched<[sched.Folded]>;
+ } // Predicates = [HasAVX, NoVLX]
+}
+
+let ExeDomain = SSEPackedSingle in {
+ defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
+ v4f32, v4i32, SchedWriteFShuffle.XMM,
+ SchedWriteFVarShuffle.XMM>;
+ defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
+ v8f32, v8i32, SchedWriteFShuffle.YMM,
+ SchedWriteFVarShuffle.YMM>, VEX_L;
+}
+let ExeDomain = SSEPackedDouble in {
+ defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
+ v2f64, v2i64, SchedWriteFShuffle.XMM,
+ SchedWriteFVarShuffle.XMM>;
+ defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
+ v4f64, v4i64, SchedWriteFShuffle.YMM,
+ SchedWriteFVarShuffle.YMM>, VEX_L;
+}
+
+//===----------------------------------------------------------------------===//
+// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
+//
+
+let ExeDomain = SSEPackedSingle in {
+let isCommutable = 1 in
+def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, u8imm:$src3),
+ "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
+ VEX_4V, VEX_L, Sched<[WriteFShuffle256]>;
+def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
+ "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
+ VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
+}
+
+// Immediate transform to help with commuting.
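+// vperm2f128/vperm2i128 use imm[1:0] to pick the 128-bit lane for the low half
+// of the result and imm[5:4] for the high half; bit 1 and bit 5 choose between
+// the two sources, so XORing with 0x22 accounts for swapped operands.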
+def Perm2XCommuteImm : SDNodeXForm<timm, [{
+ return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
+}]>;
+
+multiclass vperm2x128_lowering<string InstrStr, ValueType VT, PatFrag memop_frag> {
+ def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
+ (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR256:$src2, timm:$imm)>;
+ def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2), (i8 timm:$imm))),
+ (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, timm:$imm)>;
+ // Pattern with load in other operand.
+ def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1, (i8 timm:$imm))),
+ (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
+ (Perm2XCommuteImm timm:$imm))>;
+}
+
+let Predicates = [HasAVX] in {
+ defm : vperm2x128_lowering<"VPERM2F128", v4f64, loadv4f64>;
+ defm : vperm2x128_lowering<"VPERM2F128", v8f32, loadv8f32>;
+}
+
+let Predicates = [HasAVX1Only] in {
+ defm : vperm2x128_lowering<"VPERM2F128", v4i64, loadv4i64>;
+ defm : vperm2x128_lowering<"VPERM2F128", v8i32, loadv8i32>;
+ defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>;
+ defm : vperm2x128_lowering<"VPERM2F128", v32i8, loadv32i8>;
+}
+
+//===----------------------------------------------------------------------===//
+// VZERO - Zero YMM registers
+// Note: These instructions do not affect YMM16-YMM31.
+//
+
+let SchedRW = [WriteSystem] in {
+let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
+ YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
+ // Zero All YMM registers
+ def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
+ [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L,
+ Requires<[HasAVX]>, VEX_WIG;
+
+ // Zero Upper bits of YMM registers
+ def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
+ [(int_x86_avx_vzeroupper)]>, PS, VEX,
+ Requires<[HasAVX]>, VEX_WIG;
+} // Defs
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Half precision conversion instructions
+//
+
+multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
+ X86FoldableSchedWrite sched> {
+ def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
+ "vcvtph2ps\t{$src, $dst|$dst, $src}",
+ [(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>,
+ T8PD, VEX, Sched<[sched]>;
+ let hasSideEffects = 0, mayLoad = 1 in
+ def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ "vcvtph2ps\t{$src, $dst|$dst, $src}",
+ []>, T8PD, VEX, Sched<[sched.Folded]>;
+}
+
+multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
+ SchedWrite RR, SchedWrite MR> {
+ def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
+ (ins RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (X86any_cvtps2ph RC:$src1, timm:$src2))]>,
+ TAPD, VEX, Sched<[RR]>;
+ let hasSideEffects = 0, mayStore = 1 in
+ def mr : Ii8<0x1D, MRMDestMem, (outs),
+ (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ TAPD, VEX, Sched<[MR]>;
+}
+
+let Predicates = [HasF16C, NoVLX] in {
+ defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>, SIMD_EXC;
+ defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L, SIMD_EXC;
+ defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
+ WriteCvtPS2PHSt>, SIMD_EXC;
+ defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
+ WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC;
+
+ // Pattern match vcvtph2ps of a scalar i64 load.
+ def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
+ (VCVTPH2PSrm addr:$src)>;
+ def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16
+ (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (VCVTPH2PSrm addr:$src)>;
+ def : Pat<(v8f32 (X86any_cvtph2ps (loadv8i16 addr:$src))),
+ (VCVTPH2PSYrm addr:$src)>;
+
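+ // A 128-bit vcvtps2ph only defines the low 64 bits of its result, so a
+ // store of that quadword (as f64 or i64) can use the memory form directly.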
+ def : Pat<(store (f64 (extractelt
+ (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
+ (iPTR 0))), addr:$dst),
+ (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
+ def : Pat<(store (i64 (extractelt
+ (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
+ (iPTR 0))), addr:$dst),
+ (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
+ def : Pat<(store (v8i16 (X86any_cvtps2ph VR256:$src1, timm:$src2)), addr:$dst),
+ (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX2 Instructions
+//===----------------------------------------------------------------------===//
+
+/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
+multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, X86FoldableSchedWrite sched,
+ RegisterClass RC,
+ X86MemOperand x86memop, SDNodeXForm commuteXForm> {
+ let isCommutable = 1 in
+ def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
+ Sched<[sched]>, VEX_4V;
+ def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;
+
+ // Pattern to commute if load is in first source.
+ def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)),
+ (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
+ (commuteXForm timm:$src3))>;
+}
+
+let Predicates = [HasAVX2] in {
+defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
+ SchedWriteBlend.XMM, VR128, i128mem,
+ BlendCommuteImm4>;
+defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
+ SchedWriteBlend.YMM, VR256, i256mem,
+ BlendCommuteImm8>, VEX_L;
+
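+// vpblendd's immediate has one bit per dword, so qword-granularity blend
+// immediates are rescaled (and, for a commuted load, also inverted) by the
+// BlendScale*Imm transforms before being reused here.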
+def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
+ (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
+ (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
+ (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
+
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
+ (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
+ (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>;
+def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
+ (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>;
+}
+
+// For insertion into the zero index (low half) of a 256-bit vector, it is
+// more efficient to generate a blend with immediate instead of an insert*128.
+// NOTE: We're using FP instructions here, but execution domain fixing should
+// take care of using integer instructions when profitable.
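+// A blend immediate of 0xf takes the low four dwords from the second (widened
+// XMM) operand; 0xf0 instead keeps the low half from the widened register and
+// takes the upper half from the loaded 256-bit value.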
+let Predicates = [HasAVX] in {
+def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
+ (VBLENDPSYrri VR256:$src1,
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
+ (VBLENDPSYrri VR256:$src1,
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
+ (VBLENDPSYrri VR256:$src1,
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
+ (VBLENDPSYrri VR256:$src1,
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
+
+def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
+ (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
+def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
+ (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
+def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
+ (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
+def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
+ (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
+}
+
+//===----------------------------------------------------------------------===//
+// VPBROADCAST - Load from memory and broadcast to all elements of the
+// destination operand
+//
+multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
+ X86MemOperand x86memop, PatFrag bcast_frag,
+ ValueType OpVT128, ValueType OpVT256, Predicate prd> {
+ let Predicates = [HasAVX2, prd] in {
+ def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
+ Sched<[SchedWriteShuffle.XMM]>, VEX;
+ def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (OpVT128 (bcast_frag addr:$src)))]>,
+ Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
+ def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst,
+ (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
+ Sched<[WriteShuffle256]>, VEX, VEX_L;
+ def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst,
+ (OpVT256 (bcast_frag addr:$src)))]>,
+ Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;
+
+ // Provide a pattern for broadcasting from the same register class that
+ // automatically performs the subregister extract.
+ def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
+ (!cast<Instruction>(NAME#"Yrr")
+ (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
+ }
+}
+
+defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8,
+ v16i8, v32i8, NoVLX_Or_NoBWI>;
+defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16,
+ v8i16, v16i16, NoVLX_Or_NoBWI>;
+defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32,
+ v4i32, v8i32, NoVLX>;
+defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64,
+ v2i64, v4i64, NoVLX>;
+
+let Predicates = [HasAVX2, NoVLX] in {
+ // Provide a fallback in case the load node used in the patterns above
+ // has additional users, which prevents those patterns from being selected.
+ def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+ (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
+ def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+ (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
+ def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+ (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
+}
+
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+ def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
+ (VPBROADCASTBrr (VMOVDI2PDIrr
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR8:$src, sub_8bit))))>;
+ def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
+ (VPBROADCASTBYrr (VMOVDI2PDIrr
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR8:$src, sub_8bit))))>;
+
+ def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
+ (VPBROADCASTWrr (VMOVDI2PDIrr
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR16:$src, sub_16bit))))>;
+ def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
+ (VPBROADCASTWYrr (VMOVDI2PDIrr
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR16:$src, sub_16bit))))>;
+}
+let Predicates = [HasAVX2, NoVLX] in {
+ def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
+ (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>;
+ def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
+ (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>;
+ def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
+ (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>;
+ def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
+ (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>;
+}
+
+// AVX1 broadcast patterns
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)),
+ (VBROADCASTSSYrm addr:$src)>;
+def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)),
+ (VBROADCASTSDYrm addr:$src)>;
+def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)),
+ (VBROADCASTSSrm addr:$src)>;
+}
+
+// Provide a fallback in case the load node used in the patterns above
+// has additional users, which prevents those patterns from being selected.
+let Predicates = [HasAVX, NoVLX] in {
+ // 128bit broadcasts:
+ def : Pat<(v2f64 (X86VBroadcast f64:$src)),
+ (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
+ def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)),
+ (VMOVDDUPrm addr:$src)>;
+
+ def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
+ (VMOVDDUPrr VR128:$src)>;
+}
+
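+// Without AVX2 there is no register-source broadcast instruction, so splat
+// within the low 128 bits (vpermilps/vpshufd/vmovddup) and, for 256-bit
+// results, duplicate that half with vinsertf128.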
+let Predicates = [HasAVX1Only] in {
+ def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+ (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
+ def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
+ (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
+ (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
+ def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
+ (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
+ (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
+
+ def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
+ (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
+ def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm),
+ (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>;
+ def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
+ (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm),
+ (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>;
+
+ def : Pat<(v2i64 (X86VBroadcast i64:$src)),
+ (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
+ def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
+ (VMOVDDUPrm addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// VPERM - Permute instructions
+//
+
+multiclass avx2_perm<bits<8> opc, string OpcodeStr,
+ ValueType OpVT, X86FoldableSchedWrite Sched,
+ X86MemOperand memOp> {
+ let Predicates = [HasAVX2, NoVLX] in {
+ def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
+ Sched<[Sched]>, VEX_4V, VEX_L;
+ def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, memOp:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OpVT (X86VPermv VR256:$src1,
+ (load addr:$src2))))]>,
+ Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
+ }
+}
+
+defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>;
+let ExeDomain = SSEPackedSingle in
+defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>;
+
+multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
+ ValueType OpVT, X86FoldableSchedWrite Sched,
+ X86MemOperand memOp> {
+ let Predicates = [HasAVX2, NoVLX] in {
+ def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>,
+ Sched<[Sched]>, VEX, VEX_L;
+ def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins memOp:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OpVT (X86VPermi (mem_frag addr:$src1),
+ (i8 timm:$src2))))]>,
+ Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
+ }
+}
+
+defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
+ WriteShuffle256, i256mem>, VEX_W;
+let ExeDomain = SSEPackedDouble in
+defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
+ WriteFShuffle256, f256mem>, VEX_W;
+
+//===----------------------------------------------------------------------===//
+// VPERM2I128 - Permute Integer vector Values in 128-bit chunks
+//
+let isCommutable = 1 in
+def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, u8imm:$src3),
+ "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
+ Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
+def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
+ "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
+ Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
+
+let Predicates = [HasAVX2] in {
+ defm : vperm2x128_lowering<"VPERM2I128", v4i64, loadv4i64>;
+ defm : vperm2x128_lowering<"VPERM2I128", v8i32, loadv8i32>;
+ defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>;
+ defm : vperm2x128_lowering<"VPERM2I128", v32i8, loadv32i8>;
+}
+
+//===----------------------------------------------------------------------===//
+// VINSERTI128 - Insert packed integer values
+//
+let hasSideEffects = 0 in {
+def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR128:$src2, u8imm:$src3),
+ "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
+let mayLoad = 1 in
+def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
+ "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
+}
+
+let Predicates = [HasAVX2, NoVLX] in {
+ defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64, loadv2i64>;
+ defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv4i32>;
+ defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv8i16>;
+ defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv16i8>;
+}
+
+//===----------------------------------------------------------------------===//
+// VEXTRACTI128 - Extract packed integer values
+//
+def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
+ (ins VR256:$src1, u8imm:$src2),
+ "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ Sched<[WriteShuffle256]>, VEX, VEX_L;
+let hasSideEffects = 0, mayStore = 1 in
+def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
+ (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
+ "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;
+
+let Predicates = [HasAVX2, NoVLX] in {
+ defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
+ defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>;
+ defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
+ defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
+}
+
+//===----------------------------------------------------------------------===//
+// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
+//
+multiclass avx2_pmovmask<string OpcodeStr,
+ Intrinsic IntLd128, Intrinsic IntLd256,
+ Intrinsic IntSt128, Intrinsic IntSt256,
+ X86SchedWriteMaskMove schedX,
+ X86SchedWriteMaskMove schedY> {
+ def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
+ VEX_4V, Sched<[schedX.RM]>;
+ def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
+ VEX_4V, VEX_L, Sched<[schedY.RM]>;
+ def mr : AVX28I<0x8e, MRMDestMem, (outs),
+ (ins i128mem:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
+ VEX_4V, Sched<[schedX.MR]>;
+ def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
+ (ins i256mem:$dst, VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
+ VEX_4V, VEX_L, Sched<[schedY.MR]>;
+}
+
+defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
+ int_x86_avx2_maskload_d,
+ int_x86_avx2_maskload_d_256,
+ int_x86_avx2_maskstore_d,
+ int_x86_avx2_maskstore_d_256,
+ WriteVecMaskMove32, WriteVecMaskMove32Y>;
+defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
+ int_x86_avx2_maskload_q,
+ int_x86_avx2_maskload_q_256,
+ int_x86_avx2_maskstore_q,
+ int_x86_avx2_maskstore_q_256,
+ WriteVecMaskMove64, WriteVecMaskMove64Y>, VEX_W;
+
+multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
+ ValueType MaskVT> {
+ // masked store
+ def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
+ (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
+ // masked load
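+ // (vmaskmov loads zero the masked-off elements, so only an undef or
+ // all-zeros passthru value can be matched)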
+ def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
+ (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
+ def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
+ (VT immAllZerosV))),
+ (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
+}
+let Predicates = [HasAVX] in {
+ defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32>;
+ defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64>;
+ defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>;
+ defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>;
+}
+let Predicates = [HasAVX1Only] in {
+  // i32/i64 masked load/store is not available; use the ps/pd forms instead.
+ defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>;
+ defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>;
+ defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32>;
+ defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64>;
+}
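+// A sketch of what the patterns above and below select (assuming the generic
+// masked_load/masked_store nodes reach isel unchanged): on an AVX1-only
+// target a (v8i32 (masked_load ...)) is matched to VMASKMOVPSYrm, while an
+// AVX2 target matches the same node to VPMASKMOVDYrm.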
+let Predicates = [HasAVX2] in {
+ defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>;
+ defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>;
+ defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32>;
+ defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64>;
+}
+
+//===----------------------------------------------------------------------===//
+// Variable Bit Shifts
+//
+multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType vt128, ValueType vt256> {
+ def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
+ VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
+ def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode VR128:$src1,
+ (vt128 (load addr:$src2)))))]>,
+ VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
+ SchedWriteVarVecShift.XMM.ReadAfterFold]>;
+ def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
+ VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
+ def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (vt256 (OpNode VR256:$src1,
+ (vt256 (load addr:$src2)))))]>,
+ VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
+ SchedWriteVarVecShift.YMM.ReadAfterFold]>;
+}
+
+let Predicates = [HasAVX2, NoVLX] in {
+ defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
+ defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>, VEX_W;
+ defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
+ defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>, VEX_W;
+ defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
+}
+
+//===----------------------------------------------------------------------===//
+// VGATHER - GATHER Operations
+
+// FIXME: Improve scheduling of gather instructions.
+multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
+ ValueType VTy, RegisterClass RC256,
+ X86MemOperand memop128, X86MemOperand memop256,
+ ValueType MTx = VTx, ValueType MTy = VTy> {
+let mayLoad = 1, hasSideEffects = 0 in {
+ def rm : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
+ (ins VR128:$src1, memop128:$src2, VR128:$mask),
+ !strconcat(OpcodeStr,
+ "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
+ []>, VEX, Sched<[WriteLoad]>;
+ def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
+ (ins RC256:$src1, memop256:$src2, RC256:$mask),
+ !strconcat(OpcodeStr,
+ "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
+ []>, VEX, VEX_L, Sched<[WriteLoad]>;
+}
+}
+
+let Predicates = [HasAVX2] in {
+ let mayLoad = 1, hasSideEffects = 0, Constraints
+ = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
+ in {
+ defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64,
+ VR256, vx128mem, vx256mem>, VEX_W;
+ defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64,
+ VR256, vx128mem, vy256mem>, VEX_W;
+ defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32,
+ VR256, vx128mem, vy256mem>;
+ defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32,
+ VR128, vx64mem, vy128mem>;
+
+ let ExeDomain = SSEPackedDouble in {
+ defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64,
+ VR256, vx128mem, vx256mem, v2i64, v4i64>, VEX_W;
+ defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64,
+ VR256, vx128mem, vy256mem, v2i64, v4i64>, VEX_W;
+ }
+
+ let ExeDomain = SSEPackedSingle in {
+ defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32,
+ VR256, vx128mem, vy256mem, v4i32, v8i32>;
+ defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32,
+ VR128, vx64mem, vy128mem, v4i32, v4i32>;
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// GFNI instructions
+//===----------------------------------------------------------------------===//
+
+multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
+ RegisterClass RC, PatFrag MemOpFrag,
+ X86MemOperand X86MemOp, bit Is2Addr = 0> {
+ let ExeDomain = SSEPackedInt,
+ AsmString = !if(Is2Addr,
+ OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
+ let isCommutable = 1 in
+ def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
+ [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
+ Sched<[SchedWriteVecALU.XMM]>, T8PD;
+
+ def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
+ [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
+ (MemOpFrag addr:$src2))))]>,
+ Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>, T8PD;
+ }
+}
+
+multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
+ SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
+ X86MemOperand X86MemOp, bit Is2Addr = 0> {
+ let AsmString = !if(Is2Addr,
+ OpStr#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ OpStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
+ def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u8imm:$src3), "",
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
+ SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>;
+ def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
+ [(set RC:$dst, (OpVT (OpNode RC:$src1,
+ (MemOpFrag addr:$src2),
+ timm:$src3)))], SSEPackedInt>,
+ Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>;
+ }
+}
+
+multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
+ let Constraints = "$src1 = $dst",
+ Predicates = [HasGFNI, UseSSE2] in
+ defm NAME : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
+ VR128, load, i128mem, 1>;
+ let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
+ defm V#NAME : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, VR128,
+ load, i128mem>, VEX_4V, VEX_W;
+ defm V#NAME#Y : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, VR256,
+ load, i256mem>, VEX_4V, VEX_L, VEX_W;
+ }
+}
+
+// GF2P8MULB
+let Constraints = "$src1 = $dst",
+ Predicates = [HasGFNI, UseSSE2] in
+defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
+ i128mem, 1>;
+let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
+ defm VGF2P8MULB : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
+ i128mem>, VEX_4V;
+ defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
+ i256mem>, VEX_4V, VEX_L;
+}
+// GF2P8AFFINEINVQB, GF2P8AFFINEQB
+let isCommutable = 0 in {
+ defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
+ X86GF2P8affineinvqb>, TAPD;
+ defm GF2P8AFFINEQB : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
+ X86GF2P8affineqb>, TAPD;
+}
+
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSVM.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSVM.td
new file mode 100644
index 000000000000..d8f70b016c7b
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSVM.td
@@ -0,0 +1,72 @@
+//===-- X86InstrSVM.td - SVM Instruction Set Extension -----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the AMD SVM instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SVM instructions
+
+let SchedRW = [WriteSystem] in {
+// 0F 01 D9
+def VMMCALL : I<0x01, MRM_D9, (outs), (ins), "vmmcall", []>, TB;
+
+// 0F 01 DC
+def STGI : I<0x01, MRM_DC, (outs), (ins), "stgi", []>, TB;
+
+// 0F 01 DD
+def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB;
+
+// 0F 01 DE
+let Uses = [EAX] in
+def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit", []>, TB;
+
+// 0F 01 D8
+let Uses = [EAX] in
+def VMRUN32 : I<0x01, MRM_D8, (outs), (ins), "vmrun", []>, TB,
+ Requires<[Not64BitMode]>;
+let Uses = [RAX] in
+def VMRUN64 : I<0x01, MRM_D8, (outs), (ins), "vmrun", []>, TB,
+ Requires<[In64BitMode]>;
+
+// 0F 01 DA
+let Uses = [EAX] in
+def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins), "vmload", []>, TB,
+ Requires<[Not64BitMode]>;
+let Uses = [RAX] in
+def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins), "vmload", []>, TB,
+ Requires<[In64BitMode]>;
+
+// 0F 01 DB
+let Uses = [EAX] in
+def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins), "vmsave", []>, TB,
+ Requires<[Not64BitMode]>;
+let Uses = [RAX] in
+def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins), "vmsave", []>, TB,
+ Requires<[In64BitMode]>;
+
+// 0F 01 DF
+let Uses = [EAX, ECX] in
+def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins),
+ "invlpga", []>, TB, Requires<[Not64BitMode]>;
+let Uses = [RAX, ECX] in
+def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins),
+ "invlpga", []>, TB, Requires<[In64BitMode]>;
+} // SchedRW
+
+def : InstAlias<"skinit\t{%eax|eax}", (SKINIT), 0>;
+def : InstAlias<"vmrun\t{%eax|eax}", (VMRUN32), 0>, Requires<[Not64BitMode]>;
+def : InstAlias<"vmrun\t{%rax|rax}", (VMRUN64), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"vmload\t{%eax|eax}", (VMLOAD32), 0>, Requires<[Not64BitMode]>;
+def : InstAlias<"vmload\t{%rax|rax}", (VMLOAD64), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"vmsave\t{%eax|eax}", (VMSAVE32), 0>, Requires<[Not64BitMode]>;
+def : InstAlias<"vmsave\t{%rax|rax}", (VMSAVE64), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"invlpga\t{%eax, %ecx|eax, ecx}", (INVLPGA32), 0>, Requires<[Not64BitMode]>;
+def : InstAlias<"invlpga\t{%rax, %ecx|rax, ecx}", (INVLPGA64), 0>, Requires<[In64BitMode]>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrShiftRotate.td
new file mode 100644
index 000000000000..823ff78b9903
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrShiftRotate.td
@@ -0,0 +1,1033 @@
+//===-- X86InstrShiftRotate.td - Shift and Rotate Instrs ---*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the shift and rotate instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// FIXME: Someone needs to smear multipattern goodness all over this file.
+
+let Defs = [EFLAGS] in {
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL], SchedRW = [WriteShiftCL] in {
+def SHL8rCL : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "shl{b}\t{%cl, $dst|$dst, cl}",
+ [(set GR8:$dst, (shl GR8:$src1, CL))]>;
+def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
+ "shl{w}\t{%cl, $dst|$dst, cl}",
+ [(set GR16:$dst, (shl GR16:$src1, CL))]>, OpSize16;
+def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
+ "shl{l}\t{%cl, $dst|$dst, cl}",
+ [(set GR32:$dst, (shl GR32:$src1, CL))]>, OpSize32;
+def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
+ "shl{q}\t{%cl, $dst|$dst, cl}",
+ [(set GR64:$dst, (shl GR64:$src1, CL))]>;
+} // Uses = [CL], SchedRW
+
+let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
+def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
+ "shl{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))]>;
+
+def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
+ "shl{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))]>,
+ OpSize16;
+def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
+ "shl{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))]>,
+ OpSize32;
+def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst),
+ (ins GR64:$src1, u8imm:$src2),
+ "shl{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))]>;
+} // isConvertibleToThreeAddress = 1
+
+// NOTE: These shift-by-one register forms carry no patterns because
+// 'add reg,reg' is cheaper; shift-by-one is handled by a separate Pat instead.
+let hasSideEffects = 0 in {
+def SHL8r1 : I<0xD0, MRM4r, (outs GR8:$dst), (ins GR8:$src1),
+ "shl{b}\t$dst", []>;
+def SHL16r1 : I<0xD1, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
+ "shl{w}\t$dst", []>, OpSize16;
+def SHL32r1 : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
+ "shl{l}\t$dst", []>, OpSize32;
+def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
+ "shl{q}\t$dst", []>;
+} // hasSideEffects = 0
+} // Constraints = "$src = $dst", SchedRW
+
+// FIXME: Why do we need an explicit "Uses = [CL]" when the instr has a pattern
+// using CL?
+let Uses = [CL], SchedRW = [WriteShiftCLLd, WriteRMW] in {
+def SHL8mCL : I<0xD2, MRM4m, (outs), (ins i8mem :$dst),
+ "shl{b}\t{%cl, $dst|$dst, cl}",
+ [(store (shl (loadi8 addr:$dst), CL), addr:$dst)]>;
+def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst),
+ "shl{w}\t{%cl, $dst|$dst, cl}",
+ [(store (shl (loadi16 addr:$dst), CL), addr:$dst)]>,
+ OpSize16;
+def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst),
+ "shl{l}\t{%cl, $dst|$dst, cl}",
+ [(store (shl (loadi32 addr:$dst), CL), addr:$dst)]>,
+ OpSize32;
+def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst),
+ "shl{q}\t{%cl, $dst|$dst, cl}",
+ [(store (shl (loadi64 addr:$dst), CL), addr:$dst)]>,
+ Requires<[In64BitMode]>;
+}
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, u8imm:$src),
+ "shl{b}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, u8imm:$src),
+ "shl{w}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize16;
+def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, u8imm:$src),
+ "shl{l}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize32;
+def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, u8imm:$src),
+ "shl{q}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
+
+// Shift by 1
+def SHL8m1 : I<0xD0, MRM4m, (outs), (ins i8mem :$dst),
+ "shl{b}\t$dst",
+ [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+def SHL16m1 : I<0xD1, MRM4m, (outs), (ins i16mem:$dst),
+ "shl{w}\t$dst",
+ [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize16;
+def SHL32m1 : I<0xD1, MRM4m, (outs), (ins i32mem:$dst),
+ "shl{l}\t$dst",
+ [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize32;
+def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst),
+ "shl{q}\t$dst",
+ [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL], SchedRW = [WriteShiftCL] in {
+def SHR8rCL : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "shr{b}\t{%cl, $dst|$dst, cl}",
+ [(set GR8:$dst, (srl GR8:$src1, CL))]>;
+def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
+ "shr{w}\t{%cl, $dst|$dst, cl}",
+ [(set GR16:$dst, (srl GR16:$src1, CL))]>, OpSize16;
+def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
+ "shr{l}\t{%cl, $dst|$dst, cl}",
+ [(set GR32:$dst, (srl GR32:$src1, CL))]>, OpSize32;
+def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
+ "shr{q}\t{%cl, $dst|$dst, cl}",
+ [(set GR64:$dst, (srl GR64:$src1, CL))]>;
+}
+
+def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$src2),
+ "shr{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))]>;
+def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
+ "shr{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))]>,
+ OpSize16;
+def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
+ "shr{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))]>,
+ OpSize32;
+def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$src2),
+ "shr{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))]>;
+
+// Shift right by 1
+def SHR8r1 : I<0xD0, MRM5r, (outs GR8:$dst), (ins GR8:$src1),
+ "shr{b}\t$dst",
+ [(set GR8:$dst, (srl GR8:$src1, (i8 1)))]>;
+def SHR16r1 : I<0xD1, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
+ "shr{w}\t$dst",
+ [(set GR16:$dst, (srl GR16:$src1, (i8 1)))]>, OpSize16;
+def SHR32r1 : I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
+ "shr{l}\t$dst",
+ [(set GR32:$dst, (srl GR32:$src1, (i8 1)))]>, OpSize32;
+def SHR64r1 : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
+ "shr{q}\t$dst",
+ [(set GR64:$dst, (srl GR64:$src1, (i8 1)))]>;
+} // Constraints = "$src = $dst", SchedRW
+
+
+let Uses = [CL], SchedRW = [WriteShiftCLLd, WriteRMW] in {
+def SHR8mCL : I<0xD2, MRM5m, (outs), (ins i8mem :$dst),
+ "shr{b}\t{%cl, $dst|$dst, cl}",
+ [(store (srl (loadi8 addr:$dst), CL), addr:$dst)]>;
+def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst),
+ "shr{w}\t{%cl, $dst|$dst, cl}",
+ [(store (srl (loadi16 addr:$dst), CL), addr:$dst)]>,
+ OpSize16;
+def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst),
+ "shr{l}\t{%cl, $dst|$dst, cl}",
+ [(store (srl (loadi32 addr:$dst), CL), addr:$dst)]>,
+ OpSize32;
+def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst),
+ "shr{q}\t{%cl, $dst|$dst, cl}",
+ [(store (srl (loadi64 addr:$dst), CL), addr:$dst)]>,
+ Requires<[In64BitMode]>;
+}
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, u8imm:$src),
+ "shr{b}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, u8imm:$src),
+ "shr{w}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize16;
+def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, u8imm:$src),
+ "shr{l}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize32;
+def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, u8imm:$src),
+ "shr{q}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
+
+// Shift by 1
+def SHR8m1 : I<0xD0, MRM5m, (outs), (ins i8mem :$dst),
+ "shr{b}\t$dst",
+ [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+def SHR16m1 : I<0xD1, MRM5m, (outs), (ins i16mem:$dst),
+ "shr{w}\t$dst",
+ [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize16;
+def SHR32m1 : I<0xD1, MRM5m, (outs), (ins i32mem:$dst),
+ "shr{l}\t$dst",
+ [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize32;
+def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst),
+ "shr{q}\t$dst",
+ [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL], SchedRW = [WriteShiftCL] in {
+def SAR8rCL : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "sar{b}\t{%cl, $dst|$dst, cl}",
+ [(set GR8:$dst, (sra GR8:$src1, CL))]>;
+def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
+ "sar{w}\t{%cl, $dst|$dst, cl}",
+ [(set GR16:$dst, (sra GR16:$src1, CL))]>,
+ OpSize16;
+def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
+ "sar{l}\t{%cl, $dst|$dst, cl}",
+ [(set GR32:$dst, (sra GR32:$src1, CL))]>,
+ OpSize32;
+def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
+ "sar{q}\t{%cl, $dst|$dst, cl}",
+ [(set GR64:$dst, (sra GR64:$src1, CL))]>;
+}
+
+def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
+ "sar{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))]>;
+def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
+ "sar{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))]>,
+ OpSize16;
+def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
+ "sar{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))]>,
+ OpSize32;
+def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst),
+ (ins GR64:$src1, u8imm:$src2),
+ "sar{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))]>;
+
+// Shift by 1
+def SAR8r1 : I<0xD0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "sar{b}\t$dst",
+ [(set GR8:$dst, (sra GR8:$src1, (i8 1)))]>;
+def SAR16r1 : I<0xD1, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
+ "sar{w}\t$dst",
+ [(set GR16:$dst, (sra GR16:$src1, (i8 1)))]>, OpSize16;
+def SAR32r1 : I<0xD1, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
+ "sar{l}\t$dst",
+ [(set GR32:$dst, (sra GR32:$src1, (i8 1)))]>, OpSize32;
+def SAR64r1 : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
+ "sar{q}\t$dst",
+ [(set GR64:$dst, (sra GR64:$src1, (i8 1)))]>;
+} // Constraints = "$src = $dst", SchedRW
+
+
+let Uses = [CL], SchedRW = [WriteShiftCLLd, WriteRMW] in {
+def SAR8mCL : I<0xD2, MRM7m, (outs), (ins i8mem :$dst),
+ "sar{b}\t{%cl, $dst|$dst, cl}",
+ [(store (sra (loadi8 addr:$dst), CL), addr:$dst)]>;
+def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst),
+ "sar{w}\t{%cl, $dst|$dst, cl}",
+ [(store (sra (loadi16 addr:$dst), CL), addr:$dst)]>,
+ OpSize16;
+def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst),
+ "sar{l}\t{%cl, $dst|$dst, cl}",
+ [(store (sra (loadi32 addr:$dst), CL), addr:$dst)]>,
+ OpSize32;
+def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst),
+ "sar{q}\t{%cl, $dst|$dst, cl}",
+ [(store (sra (loadi64 addr:$dst), CL), addr:$dst)]>,
+ Requires<[In64BitMode]>;
+}
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, u8imm:$src),
+ "sar{b}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, u8imm:$src),
+ "sar{w}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize16;
+def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, u8imm:$src),
+ "sar{l}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize32;
+def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, u8imm:$src),
+ "sar{q}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
+
+// Shift by 1
+def SAR8m1 : I<0xD0, MRM7m, (outs), (ins i8mem :$dst),
+ "sar{b}\t$dst",
+ [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+def SAR16m1 : I<0xD1, MRM7m, (outs), (ins i16mem:$dst),
+ "sar{w}\t$dst",
+ [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize16;
+def SAR32m1 : I<0xD1, MRM7m, (outs), (ins i32mem:$dst),
+ "sar{l}\t$dst",
+ [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize32;
+def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst),
+ "sar{q}\t$dst",
+ [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Rotate instructions
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0 in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteRotate] in {
+
+let Uses = [CL, EFLAGS], SchedRW = [WriteRotateCL] in {
+def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
+ "rcl{b}\t{%cl, $dst|$dst, cl}", []>;
+def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+ "rcl{w}\t{%cl, $dst|$dst, cl}", []>, OpSize16;
+def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+ "rcl{l}\t{%cl, $dst|$dst, cl}", []>, OpSize32;
+def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
+ "rcl{q}\t{%cl, $dst|$dst, cl}", []>;
+} // Uses = [CL, EFLAGS]
+
+let Uses = [EFLAGS] in {
+def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
+ "rcl{b}\t$dst", []>;
+def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt),
+ "rcl{b}\t{$cnt, $dst|$dst, $cnt}", []>;
+def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+ "rcl{w}\t$dst", []>, OpSize16;
+def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt),
+ "rcl{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize16;
+def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+ "rcl{l}\t$dst", []>, OpSize32;
+def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt),
+ "rcl{l}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize32;
+def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
+ "rcl{q}\t$dst", []>;
+def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt),
+ "rcl{q}\t{$cnt, $dst|$dst, $cnt}", []>;
+} // Uses = [EFLAGS]
+
+let Uses = [CL, EFLAGS], SchedRW = [WriteRotateCL] in {
+def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
+ "rcr{b}\t{%cl, $dst|$dst, cl}", []>;
+def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+ "rcr{w}\t{%cl, $dst|$dst, cl}", []>, OpSize16;
+def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+ "rcr{l}\t{%cl, $dst|$dst, cl}", []>, OpSize32;
+def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
+ "rcr{q}\t{%cl, $dst|$dst, cl}", []>;
+} // Uses = [CL, EFLAGS]
+
+let Uses = [EFLAGS] in {
+def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
+ "rcr{b}\t$dst", []>;
+def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt),
+ "rcr{b}\t{$cnt, $dst|$dst, $cnt}", []>;
+def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+ "rcr{w}\t$dst", []>, OpSize16;
+def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt),
+ "rcr{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize16;
+def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+ "rcr{l}\t$dst", []>, OpSize32;
+def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt),
+ "rcr{l}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize32;
+def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
+ "rcr{q}\t$dst", []>;
+def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt),
+ "rcr{q}\t{$cnt, $dst|$dst, $cnt}", []>;
+} // Uses = [EFLAGS]
+
+} // Constraints = "$src = $dst"
+
+let SchedRW = [WriteRotateLd, WriteRMW], mayStore = 1 in {
+let Uses = [EFLAGS] in {
+def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst),
+ "rcl{b}\t$dst", []>;
+def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, u8imm:$cnt),
+ "rcl{b}\t{$cnt, $dst|$dst, $cnt}", []>;
+def RCL16m1 : I<0xD1, MRM2m, (outs), (ins i16mem:$dst),
+ "rcl{w}\t$dst", []>, OpSize16;
+def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, u8imm:$cnt),
+ "rcl{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize16;
+def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst),
+ "rcl{l}\t$dst", []>, OpSize32;
+def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, u8imm:$cnt),
+ "rcl{l}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize32;
+def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst),
+ "rcl{q}\t$dst", []>, Requires<[In64BitMode]>;
+def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, u8imm:$cnt),
+ "rcl{q}\t{$cnt, $dst|$dst, $cnt}", []>,
+ Requires<[In64BitMode]>;
+
+def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst),
+ "rcr{b}\t$dst", []>;
+def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, u8imm:$cnt),
+ "rcr{b}\t{$cnt, $dst|$dst, $cnt}", []>;
+def RCR16m1 : I<0xD1, MRM3m, (outs), (ins i16mem:$dst),
+ "rcr{w}\t$dst", []>, OpSize16;
+def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, u8imm:$cnt),
+ "rcr{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize16;
+def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst),
+ "rcr{l}\t$dst", []>, OpSize32;
+def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, u8imm:$cnt),
+ "rcr{l}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize32;
+def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst),
+ "rcr{q}\t$dst", []>, Requires<[In64BitMode]>;
+def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, u8imm:$cnt),
+ "rcr{q}\t{$cnt, $dst|$dst, $cnt}", []>,
+ Requires<[In64BitMode]>;
+} // Uses = [EFLAGS]
+
+let Uses = [CL, EFLAGS], SchedRW = [WriteRotateCLLd, WriteRMW] in {
+def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst),
+ "rcl{b}\t{%cl, $dst|$dst, cl}", []>;
+def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst),
+ "rcl{w}\t{%cl, $dst|$dst, cl}", []>, OpSize16;
+def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst),
+ "rcl{l}\t{%cl, $dst|$dst, cl}", []>, OpSize32;
+def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst),
+ "rcl{q}\t{%cl, $dst|$dst, cl}", []>,
+ Requires<[In64BitMode]>;
+
+def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst),
+ "rcr{b}\t{%cl, $dst|$dst, cl}", []>;
+def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst),
+ "rcr{w}\t{%cl, $dst|$dst, cl}", []>, OpSize16;
+def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst),
+ "rcr{l}\t{%cl, $dst|$dst, cl}", []>, OpSize32;
+def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst),
+ "rcr{q}\t{%cl, $dst|$dst, cl}", []>,
+ Requires<[In64BitMode]>;
+} // Uses = [CL, EFLAGS]
+} // SchedRW
+} // hasSideEffects = 0
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteRotate] in {
+// FIXME: provide shorter instructions when imm8 == 1
+let Uses = [CL], SchedRW = [WriteRotateCL] in {
+def ROL8rCL : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "rol{b}\t{%cl, $dst|$dst, cl}",
+ [(set GR8:$dst, (rotl GR8:$src1, CL))]>;
+def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+ "rol{w}\t{%cl, $dst|$dst, cl}",
+ [(set GR16:$dst, (rotl GR16:$src1, CL))]>, OpSize16;
+def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+ "rol{l}\t{%cl, $dst|$dst, cl}",
+ [(set GR32:$dst, (rotl GR32:$src1, CL))]>, OpSize32;
+def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
+ "rol{q}\t{%cl, $dst|$dst, cl}",
+ [(set GR64:$dst, (rotl GR64:$src1, CL))]>;
+}
+
+def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
+ "rol{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))]>;
+def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
+ "rol{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))]>,
+ OpSize16;
+def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
+ "rol{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))]>,
+ OpSize32;
+def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst),
+ (ins GR64:$src1, u8imm:$src2),
+ "rol{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))]>;
+
+// Rotate by 1
+def ROL8r1 : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "rol{b}\t$dst",
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))]>;
+def ROL16r1 : I<0xD1, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+ "rol{w}\t$dst",
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))]>, OpSize16;
+def ROL32r1 : I<0xD1, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+ "rol{l}\t$dst",
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))]>, OpSize32;
+def ROL64r1 : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
+ "rol{q}\t$dst",
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))]>;
+} // Constraints = "$src = $dst", SchedRW
+
+let Uses = [CL], SchedRW = [WriteRotateCLLd, WriteRMW] in {
+def ROL8mCL : I<0xD2, MRM0m, (outs), (ins i8mem :$dst),
+ "rol{b}\t{%cl, $dst|$dst, cl}",
+ [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)]>;
+def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst),
+ "rol{w}\t{%cl, $dst|$dst, cl}",
+ [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize16;
+def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst),
+ "rol{l}\t{%cl, $dst|$dst, cl}",
+ [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)]>, OpSize32;
+def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst),
+ "rol{q}\t{%cl, $dst|$dst, cl}",
+ [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)]>,
+ Requires<[In64BitMode]>;
+}
+
+let SchedRW = [WriteRotateLd, WriteRMW] in {
+def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, u8imm:$src1),
+ "rol{b}\t{$src1, $dst|$dst, $src1}",
+ [(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)]>;
+def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, u8imm:$src1),
+ "rol{w}\t{$src1, $dst|$dst, $src1}",
+ [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)]>,
+ OpSize16;
+def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, u8imm:$src1),
+ "rol{l}\t{$src1, $dst|$dst, $src1}",
+ [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)]>,
+ OpSize32;
+def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, u8imm:$src1),
+ "rol{q}\t{$src1, $dst|$dst, $src1}",
+ [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
+
+// Rotate by 1
+def ROL8m1 : I<0xD0, MRM0m, (outs), (ins i8mem :$dst),
+ "rol{b}\t$dst",
+ [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+def ROL16m1 : I<0xD1, MRM0m, (outs), (ins i16mem:$dst),
+ "rol{w}\t$dst",
+ [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize16;
+def ROL32m1 : I<0xD1, MRM0m, (outs), (ins i32mem:$dst),
+ "rol{l}\t$dst",
+ [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize32;
+def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst),
+ "rol{q}\t$dst",
+ [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteRotate] in {
+let Uses = [CL], SchedRW = [WriteRotateCL] in {
+def ROR8rCL : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "ror{b}\t{%cl, $dst|$dst, cl}",
+ [(set GR8:$dst, (rotr GR8:$src1, CL))]>;
+def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+ "ror{w}\t{%cl, $dst|$dst, cl}",
+ [(set GR16:$dst, (rotr GR16:$src1, CL))]>, OpSize16;
+def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+ "ror{l}\t{%cl, $dst|$dst, cl}",
+ [(set GR32:$dst, (rotr GR32:$src1, CL))]>, OpSize32;
+def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
+ "ror{q}\t{%cl, $dst|$dst, cl}",
+ [(set GR64:$dst, (rotr GR64:$src1, CL))]>;
+}
+
+def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
+ "ror{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))]>;
+def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
+ "ror{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))]>,
+ OpSize16;
+def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
+ "ror{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))]>,
+ OpSize32;
+def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst),
+ (ins GR64:$src1, u8imm:$src2),
+ "ror{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))]>;
+
+// Rotate by 1
+def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "ror{b}\t$dst",
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))]>;
+def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+ "ror{w}\t$dst",
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))]>, OpSize16;
+def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+ "ror{l}\t$dst",
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))]>, OpSize32;
+def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
+ "ror{q}\t$dst",
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))]>;
+} // Constraints = "$src = $dst", SchedRW
+
+let Uses = [CL], SchedRW = [WriteRotateCLLd, WriteRMW] in {
+def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst),
+ "ror{b}\t{%cl, $dst|$dst, cl}",
+ [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)]>;
+def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst),
+ "ror{w}\t{%cl, $dst|$dst, cl}",
+ [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize16;
+def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst),
+ "ror{l}\t{%cl, $dst|$dst, cl}",
+ [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)]>, OpSize32;
+def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst),
+ "ror{q}\t{%cl, $dst|$dst, cl}",
+ [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)]>,
+ Requires<[In64BitMode]>;
+}
+
+let SchedRW = [WriteRotateLd, WriteRMW] in {
+def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, u8imm:$src),
+ "ror{b}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, u8imm:$src),
+ "ror{w}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize16;
+def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, u8imm:$src),
+ "ror{l}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize32;
+def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src),
+ "ror{q}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
+
+// Rotate by 1
+def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst),
+ "ror{b}\t$dst",
+ [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst),
+ "ror{w}\t$dst",
+ [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize16;
+def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
+ "ror{l}\t$dst",
+ [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize32;
+def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
+ "ror{q}\t$dst",
+ [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)]>,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+
+//===----------------------------------------------------------------------===//
+// Double shift instructions (generalizations of rotate)
+//===----------------------------------------------------------------------===//
+
+let Constraints = "$src1 = $dst" in {
+
+let Uses = [CL], SchedRW = [WriteSHDrrcl] in {
+def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(set GR16:$dst, (X86fshl GR16:$src1, GR16:$src2, CL))]>,
+ TB, OpSize16;
+def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(set GR16:$dst, (X86fshr GR16:$src2, GR16:$src1, CL))]>,
+ TB, OpSize16;
+def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(set GR32:$dst, (fshl GR32:$src1, GR32:$src2, CL))]>,
+ TB, OpSize32;
+def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(set GR32:$dst, (fshr GR32:$src2, GR32:$src1, CL))]>,
+ TB, OpSize32;
+def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(set GR64:$dst, (fshl GR64:$src1, GR64:$src2, CL))]>,
+ TB;
+def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(set GR64:$dst, (fshr GR64:$src2, GR64:$src1, CL))]>,
+ TB;
+} // SchedRW
+
+let isCommutable = 1, SchedRW = [WriteSHDrri] in { // These instructions commute to each other.
+def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
+ (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2, u8imm:$src3),
+ "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR16:$dst, (X86fshl GR16:$src1, GR16:$src2,
+ (i8 imm:$src3)))]>,
+ TB, OpSize16;
+def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
+ (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2, u8imm:$src3),
+ "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR16:$dst, (X86fshr GR16:$src2, GR16:$src1,
+ (i8 imm:$src3)))]>,
+ TB, OpSize16;
+def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
+ (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2, u8imm:$src3),
+ "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR32:$dst, (fshl GR32:$src1, GR32:$src2,
+ (i8 imm:$src3)))]>,
+ TB, OpSize32;
+def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
+ (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2, u8imm:$src3),
+ "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR32:$dst, (fshr GR32:$src2, GR32:$src1,
+ (i8 imm:$src3)))]>,
+ TB, OpSize32;
+def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
+ (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2, u8imm:$src3),
+ "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR64:$dst, (fshl GR64:$src1, GR64:$src2,
+ (i8 imm:$src3)))]>,
+ TB;
+def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
+ (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2, u8imm:$src3),
+ "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR64:$dst, (fshr GR64:$src2, GR64:$src1,
+ (i8 imm:$src3)))]>,
+ TB;
+} // SchedRW
+} // Constraints = "$src = $dst"
+
+let Uses = [CL], SchedRW = [WriteSHDmrcl] in {
+def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(store (X86fshl (loadi16 addr:$dst), GR16:$src2, CL),
+ addr:$dst)]>, TB, OpSize16;
+def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(store (X86fshr GR16:$src2, (loadi16 addr:$dst), CL),
+ addr:$dst)]>, TB, OpSize16;
+
+def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(store (fshl (loadi32 addr:$dst), GR32:$src2, CL),
+ addr:$dst)]>, TB, OpSize32;
+def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(store (fshr GR32:$src2, (loadi32 addr:$dst), CL),
+ addr:$dst)]>, TB, OpSize32;
+
+def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(store (fshl (loadi64 addr:$dst), GR64:$src2, CL),
+ addr:$dst)]>, TB;
+def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+ [(store (fshr GR64:$src2, (loadi64 addr:$dst), CL),
+ addr:$dst)]>, TB;
+} // SchedRW
+
+let SchedRW = [WriteSHDmri] in {
+def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
+ (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3),
+ "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86fshl (loadi16 addr:$dst), GR16:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB, OpSize16;
+def SHRD16mri8 : Ii8<0xAC, MRMDestMem,
+ (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3),
+ "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86fshr GR16:$src2, (loadi16 addr:$dst),
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB, OpSize16;
+
+def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
+ (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3),
+ "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (fshl (loadi32 addr:$dst), GR32:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB, OpSize32;
+def SHRD32mri8 : Ii8<0xAC, MRMDestMem,
+ (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3),
+ "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (fshr GR32:$src2, (loadi32 addr:$dst),
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB, OpSize32;
+
+def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
+ (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3),
+ "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (fshl (loadi64 addr:$dst), GR64:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB;
+def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
+ (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3),
+ "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (fshr GR64:$src2, (loadi64 addr:$dst),
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB;
+} // SchedRW
+
+} // Defs = [EFLAGS]
+
+// Use the opposite rotate if it allows us to use the rotate-by-1 instruction.
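+// (For example, rotating an 8-bit value left by 7 equals rotating it right by
+// 1, so ROR8r1 can be used; the rotate-by-1 forms (opcode D0/D1) are one byte
+// shorter than the rotate-by-imm8 forms (C0/C1), which need an imm8 byte.)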
+def : Pat<(rotl GR8:$src1, (i8 7)), (ROR8r1 GR8:$src1)>;
+def : Pat<(rotl GR16:$src1, (i8 15)), (ROR16r1 GR16:$src1)>;
+def : Pat<(rotl GR32:$src1, (i8 31)), (ROR32r1 GR32:$src1)>;
+def : Pat<(rotl GR64:$src1, (i8 63)), (ROR64r1 GR64:$src1)>;
+def : Pat<(rotr GR8:$src1, (i8 7)), (ROL8r1 GR8:$src1)>;
+def : Pat<(rotr GR16:$src1, (i8 15)), (ROL16r1 GR16:$src1)>;
+def : Pat<(rotr GR32:$src1, (i8 31)), (ROL32r1 GR32:$src1)>;
+def : Pat<(rotr GR64:$src1, (i8 63)), (ROL64r1 GR64:$src1)>;
+
+def : Pat<(store (rotl (loadi8 addr:$dst), (i8 7)), addr:$dst),
+ (ROR8m1 addr:$dst)>;
+def : Pat<(store (rotl (loadi16 addr:$dst), (i8 15)), addr:$dst),
+ (ROR16m1 addr:$dst)>;
+def : Pat<(store (rotl (loadi32 addr:$dst), (i8 31)), addr:$dst),
+ (ROR32m1 addr:$dst)>;
+def : Pat<(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst),
+ (ROR64m1 addr:$dst)>, Requires<[In64BitMode]>;
+
+def : Pat<(store (rotr (loadi8 addr:$dst), (i8 7)), addr:$dst),
+ (ROL8m1 addr:$dst)>;
+def : Pat<(store (rotr (loadi16 addr:$dst), (i8 15)), addr:$dst),
+ (ROL16m1 addr:$dst)>;
+def : Pat<(store (rotr (loadi32 addr:$dst), (i8 31)), addr:$dst),
+ (ROL32m1 addr:$dst)>;
+def : Pat<(store (rotr (loadi64 addr:$dst), (i8 63)), addr:$dst),
+ (ROL64m1 addr:$dst)>, Requires<[In64BitMode]>;
+
+// Sandy Bridge and newer Intel processors support faster rotates using
+// SHLD to avoid a partial flag update on the normal rotate instructions.
+// Use a pseudo so that TwoAddressInstructionPass and register allocation will
+// see this as a unary instruction.
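+// (Presumably the pseudo is expanded after register allocation into an SHLD
+// or SHRD with both source operands set to the same register, which performs
+// a rotate: shld r, r, imm == rol r, imm and shrd r, r, imm == ror r, imm.)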
+let Predicates = [HasFastSHLDRotate], AddedComplexity = 5,
+ Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteSHDrri],
+ Constraints = "$src1 = $dst" in {
+ def SHLDROT32ri : I<0, Pseudo, (outs GR32:$dst),
+ (ins GR32:$src1, u8imm:$shamt), "",
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$shamt)))]>;
+ def SHLDROT64ri : I<0, Pseudo, (outs GR64:$dst),
+ (ins GR64:$src1, u8imm:$shamt), "",
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$shamt)))]>;
+
+ def SHRDROT32ri : I<0, Pseudo, (outs GR32:$dst),
+ (ins GR32:$src1, u8imm:$shamt), "",
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$shamt)))]>;
+ def SHRDROT64ri : I<0, Pseudo, (outs GR64:$dst),
+ (ins GR64:$src1, u8imm:$shamt), "",
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$shamt)))]>;
+}
+
+def ROT32L2R_imm8 : SDNodeXForm<imm, [{
+ // Convert a ROTL shamt to a ROTR shamt on 32-bit integer.
+ return getI8Imm(32 - N->getZExtValue(), SDLoc(N));
+}]>;
+
+def ROT64L2R_imm8 : SDNodeXForm<imm, [{
+ // Convert a ROTL shamt to a ROTR shamt on 64-bit integer.
+ return getI8Imm(64 - N->getZExtValue(), SDLoc(N));
+}]>;
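+// For example, the RORX patterns further below rewrite (rotl GR32:$src, (i8 5))
+// as (RORX32ri $src, 27): a left rotate by 5 of a 32-bit value is the same as
+// a right rotate by 32 - 5 = 27.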
+
+// NOTE: We use WriteShift for these rotates as they avoid the stalls
+// of many of the older x86 rotate instructions.
+multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> {
+let hasSideEffects = 0 in {
+ def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, TAXD, VEX, Sched<[WriteShift]>;
+ let mayLoad = 1 in
+ def mi : Ii8<0xF0, MRMSrcMem, (outs RC:$dst),
+ (ins x86memop:$src1, u8imm:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, TAXD, VEX, Sched<[WriteShiftLd]>;
+}
+}
+
+multiclass bmi_shift<string asm, RegisterClass RC, X86MemOperand x86memop> {
+let hasSideEffects = 0 in {
+ def rr : I<0xF7, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+ VEX, Sched<[WriteShift]>;
+ let mayLoad = 1 in
+ def rm : I<0xF7, MRMSrcMem4VOp3,
+ (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+ VEX, Sched<[WriteShift.Folded,
+ // x86memop:$src1
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // RC:$src2
+ WriteShift.ReadAfterFold]>;
+}
+}
+
+let Predicates = [HasBMI2] in {
+ defm RORX32 : bmi_rotate<"rorx{l}", GR32, i32mem>;
+ defm RORX64 : bmi_rotate<"rorx{q}", GR64, i64mem>, VEX_W;
+ defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem>, T8XS;
+ defm SARX64 : bmi_shift<"sarx{q}", GR64, i64mem>, T8XS, VEX_W;
+ defm SHRX32 : bmi_shift<"shrx{l}", GR32, i32mem>, T8XD;
+ defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem>, T8XD, VEX_W;
+ defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem>, T8PD;
+ defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8PD, VEX_W;
+
+ // Prefer RORX which is non-destructive and doesn't update EFLAGS.
+ let AddedComplexity = 10 in {
+ def : Pat<(rotr GR32:$src, (i8 imm:$shamt)),
+ (RORX32ri GR32:$src, imm:$shamt)>;
+ def : Pat<(rotr GR64:$src, (i8 imm:$shamt)),
+ (RORX64ri GR64:$src, imm:$shamt)>;
+
+ def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
+ (RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>;
+ def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
+ (RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>;
+ }
+
+ def : Pat<(rotr (loadi32 addr:$src), (i8 imm:$shamt)),
+ (RORX32mi addr:$src, imm:$shamt)>;
+ def : Pat<(rotr (loadi64 addr:$src), (i8 imm:$shamt)),
+ (RORX64mi addr:$src, imm:$shamt)>;
+
+ def : Pat<(rotl (loadi32 addr:$src), (i8 imm:$shamt)),
+ (RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>;
+ def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)),
+ (RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>;
+
+ // Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with variable shift BUT not
+ // immediate shift, i.e. the following code is considered better
+ //
+ // mov %edi, %esi
+ // shl $imm, %esi
+ // ... %edi, ...
+ //
+ // than
+ //
+ // movb $imm, %sil
+ // shlx %sil, %edi, %esi
+ // ... %edi, ...
+ //
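+  // (A note on the patterns below: SARX/SHRX/SHLX take the shift amount in a
+  // full-width register, so the i8 amount is placed into the low 8 bits of an
+  // IMPLICIT_DEF via INSERT_SUBREG.)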
+ let AddedComplexity = 1 in {
+ def : Pat<(sra GR32:$src1, GR8:$src2),
+ (SARX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(sra GR64:$src1, GR8:$src2),
+ (SARX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(srl GR32:$src1, GR8:$src2),
+ (SHRX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(srl GR64:$src1, GR8:$src2),
+ (SHRX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(shl GR32:$src1, GR8:$src2),
+ (SHLX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(shl GR64:$src1, GR8:$src2),
+ (SHLX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ }
+
+ // We prefer to use
+ // mov (%ecx), %esi
+  //  shl $imm, %esi
+ //
+ // over
+ //
+ // movb $imm, %al
+ // shlx %al, (%ecx), %esi
+ //
+ // This priority is enforced by IsProfitableToFoldLoad.
+ def : Pat<(sra (loadi32 addr:$src1), GR8:$src2),
+ (SARX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(sra (loadi64 addr:$src1), GR8:$src2),
+ (SARX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(srl (loadi32 addr:$src1), GR8:$src2),
+ (SHRX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(srl (loadi64 addr:$src1), GR8:$src2),
+ (SHRX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(shl (loadi32 addr:$src1), GR8:$src2),
+ (SHLX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(shl (loadi64 addr:$src1), GR8:$src2),
+ (SHLX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+}
+
+def : Pat<(rotl GR8:$src1, (i8 relocImm:$src2)),
+ (ROL8ri GR8:$src1, relocImm:$src2)>;
+def : Pat<(rotl GR16:$src1, (i8 relocImm:$src2)),
+ (ROL16ri GR16:$src1, relocImm:$src2)>;
+def : Pat<(rotl GR32:$src1, (i8 relocImm:$src2)),
+ (ROL32ri GR32:$src1, relocImm:$src2)>;
+def : Pat<(rotl GR64:$src1, (i8 relocImm:$src2)),
+ (ROL64ri GR64:$src1, relocImm:$src2)>;
+
+def : Pat<(rotr GR8:$src1, (i8 relocImm:$src2)),
+ (ROR8ri GR8:$src1, relocImm:$src2)>;
+def : Pat<(rotr GR16:$src1, (i8 relocImm:$src2)),
+ (ROR16ri GR16:$src1, relocImm:$src2)>;
+def : Pat<(rotr GR32:$src1, (i8 relocImm:$src2)),
+ (ROR32ri GR32:$src1, relocImm:$src2)>;
+def : Pat<(rotr GR64:$src1, (i8 relocImm:$src2)),
+ (ROR64ri GR64:$src1, relocImm:$src2)>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td
new file mode 100644
index 000000000000..eb8740896e5d
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td
@@ -0,0 +1,755 @@
+//===-- X86InstrSystem.td - System Instructions ------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instructions that are generally used in
+// privileged modes. These are not typically used by the compiler, but are
+// supported for the assembler and disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+let SchedRW = [WriteSystem] in {
+let Defs = [RAX, RDX] in
+def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", []>, TB;
+
+let Defs = [RAX, RCX, RDX] in
+def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", []>, TB;
+
+// CPU flow control instructions
+
+let mayLoad = 1, mayStore = 0, hasSideEffects = 1, isTrap = 1 in {
+ def TRAP : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB;
+
+ def UD1Wm : I<0xB9, MRMSrcMem, (outs), (ins GR16:$src1, i16mem:$src2),
+ "ud1{w} {$src2, $src1|$src1, $src2}", []>, TB, OpSize16;
+ def UD1Lm : I<0xB9, MRMSrcMem, (outs), (ins GR32:$src1, i32mem:$src2),
+ "ud1{l} {$src2, $src1|$src1, $src2}", []>, TB, OpSize32;
+ def UD1Qm : RI<0xB9, MRMSrcMem, (outs), (ins GR64:$src1, i64mem:$src2),
+ "ud1{q} {$src2, $src1|$src1, $src2}", []>, TB;
+
+ def UD1Wr : I<0xB9, MRMSrcReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "ud1{w} {$src2, $src1|$src1, $src2}", []>, TB, OpSize16;
+ def UD1Lr : I<0xB9, MRMSrcReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "ud1{l} {$src2, $src1|$src1, $src2}", []>, TB, OpSize32;
+ def UD1Qr : RI<0xB9, MRMSrcReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "ud1{q} {$src2, $src1|$src1, $src2}", []>, TB;
+}
+
+def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", []>;
+def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", []>, TB;
+
+// Interrupt and SysCall Instructions.
+let Uses = [EFLAGS] in
+ def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>, Requires<[Not64BitMode]>;
+
+def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", [(int_x86_int (i8 3))]>;
+} // SchedRW
+
+def UBSAN_UD1 : PseudoI<(outs), (ins i32imm:$kind), [(ubsantrap (i32 timm:$kind))]>;
+// The long form of "int $3" turns into int3 as a size optimization.
+// FIXME: This doesn't work because InstAlias can't match immediate constants.
+//def : InstAlias<"int\t$3", (INT3)>;
+
+let SchedRW = [WriteSystem] in {
+
+def INT : Ii8<0xcd, RawFrm, (outs), (ins u8imm:$trap), "int\t$trap",
+ [(int_x86_int timm:$trap)]>;
+
+
+def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", []>, TB;
+def SYSRET : I<0x07, RawFrm, (outs), (ins), "sysret{l}", []>, TB;
+def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysretq", []>, TB,
+ Requires<[In64BitMode]>;
+
+def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", []>, TB;
+
+def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", []>, TB;
+def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexitq", []>, TB,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+def : Pat<(debugtrap),
+ (INT3)>, Requires<[NotPS4]>;
+def : Pat<(debugtrap),
+ (INT (i8 0x41))>, Requires<[IsPS4]>;
+
+//===----------------------------------------------------------------------===//
+// Input/Output Instructions.
+//
+let SchedRW = [WriteSystem] in {
+let Defs = [AL], Uses = [DX] in
+def IN8rr : I<0xEC, RawFrm, (outs), (ins), "in{b}\t{%dx, %al|al, dx}", []>;
+let Defs = [AX], Uses = [DX] in
+def IN16rr : I<0xED, RawFrm, (outs), (ins), "in{w}\t{%dx, %ax|ax, dx}", []>,
+ OpSize16;
+let Defs = [EAX], Uses = [DX] in
+def IN32rr : I<0xED, RawFrm, (outs), (ins), "in{l}\t{%dx, %eax|eax, dx}", []>,
+ OpSize32;
+
+let Defs = [AL] in
+def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins u8imm:$port),
+ "in{b}\t{$port, %al|al, $port}", []>;
+let Defs = [AX] in
+def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port),
+ "in{w}\t{$port, %ax|ax, $port}", []>, OpSize16;
+let Defs = [EAX] in
+def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port),
+ "in{l}\t{$port, %eax|eax, $port}", []>, OpSize32;
+
+let Uses = [DX, AL] in
+def OUT8rr : I<0xEE, RawFrm, (outs), (ins), "out{b}\t{%al, %dx|dx, al}", []>;
+let Uses = [DX, AX] in
+def OUT16rr : I<0xEF, RawFrm, (outs), (ins), "out{w}\t{%ax, %dx|dx, ax}", []>,
+ OpSize16;
+let Uses = [DX, EAX] in
+def OUT32rr : I<0xEF, RawFrm, (outs), (ins), "out{l}\t{%eax, %dx|dx, eax}", []>,
+ OpSize32;
+
+let Uses = [AL] in
+def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins u8imm:$port),
+ "out{b}\t{%al, $port|$port, al}", []>;
+let Uses = [AX] in
+def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port),
+ "out{w}\t{%ax, $port|$port, ax}", []>, OpSize16;
+let Uses = [EAX] in
+def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port),
+ "out{l}\t{%eax, $port|$port, eax}", []>, OpSize32;
+
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Moves to and from debug registers
+
+let SchedRW = [WriteSystem] in {
+def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, TB,
+ Requires<[Not64BitMode]>;
+def MOV64rd : I<0x21, MRMDestReg, (outs GR64:$dst), (ins DEBUG_REG:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", []>, TB,
+ Requires<[In64BitMode]>;
+
+def MOV32dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, TB,
+ Requires<[Not64BitMode]>;
+def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", []>, TB,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Moves to and from control registers
+
+let SchedRW = [WriteSystem] in {
+def MOV32rc : I<0x20, MRMDestReg, (outs GR32:$dst), (ins CONTROL_REG:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, TB,
+ Requires<[Not64BitMode]>;
+def MOV64rc : I<0x20, MRMDestReg, (outs GR64:$dst), (ins CONTROL_REG:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", []>, TB,
+ Requires<[In64BitMode]>;
+
+def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, TB,
+ Requires<[Not64BitMode]>;
+def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", []>, TB,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Segment override instruction prefixes
+
+let SchedRW = [WriteNop] in {
+def CS_PREFIX : I<0x2E, PrefixByte, (outs), (ins), "cs", []>;
+def SS_PREFIX : I<0x36, PrefixByte, (outs), (ins), "ss", []>;
+def DS_PREFIX : I<0x3E, PrefixByte, (outs), (ins), "ds", []>;
+def ES_PREFIX : I<0x26, PrefixByte, (outs), (ins), "es", []>;
+def FS_PREFIX : I<0x64, PrefixByte, (outs), (ins), "fs", []>;
+def GS_PREFIX : I<0x65, PrefixByte, (outs), (ins), "gs", []>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Address-size override prefixes.
+//
+
+let SchedRW = [WriteNop] in {
+def ADDR16_PREFIX : I<0x67, PrefixByte, (outs), (ins), "addr16", []>,
+ Requires<[In32BitMode]>;
+def ADDR32_PREFIX : I<0x67, PrefixByte, (outs), (ins), "addr32", []>,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Moves to and from segment registers.
+//
+
+let SchedRW = [WriteMove] in {
+def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16;
+def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32;
+def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", []>;
+let mayStore = 1 in {
+def MOV16ms : I<0x8C, MRMDestMem, (outs), (ins i16mem:$dst, SEGMENT_REG:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", []>;
+}
+def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16;
+def MOV32sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32;
+def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in {
+def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", []>;
+}
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Segmentation support instructions.
+
+let SchedRW = [WriteSystem] in {
+def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", []>, TB;
+
+let mayLoad = 1 in
+def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "lar{w}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize16, NotMemoryFoldable;
+def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16orGR32orGR64:$src),
+ "lar{w}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize16, NotMemoryFoldable;
+
+let mayLoad = 1 in
+def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+ "lar{l}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize32, NotMemoryFoldable;
+def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR16orGR32orGR64:$src),
+ "lar{l}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize32, NotMemoryFoldable;
+let mayLoad = 1 in
+def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "lar{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable;
+def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR16orGR32orGR64:$src),
+ "lar{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable;
+
+let mayLoad = 1 in
+def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize16, NotMemoryFoldable;
+def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16orGR32orGR64:$src),
+ "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize16, NotMemoryFoldable;
+let mayLoad = 1 in
+def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+ "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize32, NotMemoryFoldable;
+def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR16orGR32orGR64:$src),
+ "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB,
+ OpSize32, NotMemoryFoldable;
+let mayLoad = 1 in
+def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable;
+def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR16orGR32orGR64:$src),
+ "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable;
+
+def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr", []>, TB;
+
+def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins),
+ "str{w}\t$dst", []>, TB, OpSize16;
+def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins),
+ "str{l}\t$dst", []>, TB, OpSize32;
+def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins),
+ "str{q}\t$dst", []>, TB;
+let mayStore = 1 in
+def STRm : I<0x00, MRM1m, (outs), (ins i16mem:$dst), "str{w}\t$dst", []>, TB;
+
+def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src), "ltr{w}\t$src", []>, TB, NotMemoryFoldable;
+let mayLoad = 1 in
+def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src), "ltr{w}\t$src", []>, TB, NotMemoryFoldable;
+
+def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins), "push{w}\t{%cs|cs}", []>,
+ OpSize16, Requires<[Not64BitMode]>;
+def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins), "push{l}\t{%cs|cs}", []>,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHSS16 : I<0x16, RawFrm, (outs), (ins), "push{w}\t{%ss|ss}", []>,
+ OpSize16, Requires<[Not64BitMode]>;
+def PUSHSS32 : I<0x16, RawFrm, (outs), (ins), "push{l}\t{%ss|ss}", []>,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins), "push{w}\t{%ds|ds}", []>,
+ OpSize16, Requires<[Not64BitMode]>;
+def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins), "push{l}\t{%ds|ds}", []>,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHES16 : I<0x06, RawFrm, (outs), (ins), "push{w}\t{%es|es}", []>,
+ OpSize16, Requires<[Not64BitMode]>;
+def PUSHES32 : I<0x06, RawFrm, (outs), (ins), "push{l}\t{%es|es}", []>,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins), "push{w}\t{%fs|fs}", []>,
+ OpSize16, TB;
+def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins), "push{l}\t{%fs|fs}", []>, TB,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins), "push{w}\t{%gs|gs}", []>,
+ OpSize16, TB;
+def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins), "push{l}\t{%gs|gs}", []>, TB,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins), "push{q}\t{%fs|fs}", []>, TB,
+ OpSize32, Requires<[In64BitMode]>;
+def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins), "push{q}\t{%gs|gs}", []>, TB,
+ OpSize32, Requires<[In64BitMode]>;
+
+// No "pop cs" instruction.
+def POPSS16 : I<0x17, RawFrm, (outs), (ins), "pop{w}\t{%ss|ss}", []>,
+ OpSize16, Requires<[Not64BitMode]>;
+def POPSS32 : I<0x17, RawFrm, (outs), (ins), "pop{l}\t{%ss|ss}", []>,
+ OpSize32, Requires<[Not64BitMode]>;
+
+def POPDS16 : I<0x1F, RawFrm, (outs), (ins), "pop{w}\t{%ds|ds}", []>,
+ OpSize16, Requires<[Not64BitMode]>;
+def POPDS32 : I<0x1F, RawFrm, (outs), (ins), "pop{l}\t{%ds|ds}", []>,
+ OpSize32, Requires<[Not64BitMode]>;
+
+def POPES16 : I<0x07, RawFrm, (outs), (ins), "pop{w}\t{%es|es}", []>,
+ OpSize16, Requires<[Not64BitMode]>;
+def POPES32 : I<0x07, RawFrm, (outs), (ins), "pop{l}\t{%es|es}", []>,
+ OpSize32, Requires<[Not64BitMode]>;
+
+def POPFS16 : I<0xa1, RawFrm, (outs), (ins), "pop{w}\t{%fs|fs}", []>,
+ OpSize16, TB;
+def POPFS32 : I<0xa1, RawFrm, (outs), (ins), "pop{l}\t{%fs|fs}", []>, TB,
+ OpSize32, Requires<[Not64BitMode]>;
+def POPFS64 : I<0xa1, RawFrm, (outs), (ins), "pop{q}\t{%fs|fs}", []>, TB,
+ OpSize32, Requires<[In64BitMode]>;
+
+def POPGS16 : I<0xa9, RawFrm, (outs), (ins), "pop{w}\t{%gs|gs}", []>,
+ OpSize16, TB;
+def POPGS32 : I<0xa9, RawFrm, (outs), (ins), "pop{l}\t{%gs|gs}", []>, TB,
+ OpSize32, Requires<[Not64BitMode]>;
+def POPGS64 : I<0xa9, RawFrm, (outs), (ins), "pop{q}\t{%gs|gs}", []>, TB,
+ OpSize32, Requires<[In64BitMode]>;
+
+def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaquemem:$src),
+ "lds{w}\t{$src, $dst|$dst, $src}", []>, OpSize16,
+ Requires<[Not64BitMode]>;
+def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaquemem:$src),
+ "lds{l}\t{$src, $dst|$dst, $src}", []>, OpSize32,
+ Requires<[Not64BitMode]>;
+
+def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaquemem:$src),
+ "lss{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16;
+def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaquemem:$src),
+ "lss{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32;
+def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaquemem:$src),
+ "lss{q}\t{$src, $dst|$dst, $src}", []>, TB;
+
+def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaquemem:$src),
+ "les{w}\t{$src, $dst|$dst, $src}", []>, OpSize16,
+ Requires<[Not64BitMode]>;
+def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaquemem:$src),
+ "les{l}\t{$src, $dst|$dst, $src}", []>, OpSize32,
+ Requires<[Not64BitMode]>;
+
+def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaquemem:$src),
+ "lfs{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16;
+def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaquemem:$src),
+ "lfs{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32;
+def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaquemem:$src),
+ "lfs{q}\t{$src, $dst|$dst, $src}", []>, TB;
+
+def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaquemem:$src),
+ "lgs{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16;
+def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaquemem:$src),
+ "lgs{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32;
+
+def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaquemem:$src),
+ "lgs{q}\t{$src, $dst|$dst, $src}", []>, TB;
+
+def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg), "verr\t$seg", []>, TB, NotMemoryFoldable;
+def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg), "verw\t$seg", []>, TB, NotMemoryFoldable;
+let mayLoad = 1 in {
+def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg), "verr\t$seg", []>, TB, NotMemoryFoldable;
+def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg), "verw\t$seg", []>, TB, NotMemoryFoldable;
+}
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Descriptor-table support instructions
+
+let SchedRW = [WriteSystem] in {
+def SGDT16m : I<0x01, MRM0m, (outs), (ins opaquemem:$dst),
+ "sgdtw\t$dst", []>, TB, OpSize16, Requires<[Not64BitMode]>;
+def SGDT32m : I<0x01, MRM0m, (outs), (ins opaquemem:$dst),
+ "sgdt{l|d}\t$dst", []>, OpSize32, TB, Requires <[Not64BitMode]>;
+def SGDT64m : I<0x01, MRM0m, (outs), (ins opaquemem:$dst),
+ "sgdt{q}\t$dst", []>, TB, Requires <[In64BitMode]>;
+def SIDT16m : I<0x01, MRM1m, (outs), (ins opaquemem:$dst),
+ "sidtw\t$dst", []>, TB, OpSize16, Requires<[Not64BitMode]>;
+def SIDT32m : I<0x01, MRM1m, (outs), (ins opaquemem:$dst),
+ "sidt{l|d}\t$dst", []>, OpSize32, TB, Requires <[Not64BitMode]>;
+def SIDT64m : I<0x01, MRM1m, (outs), (ins opaquemem:$dst),
+ "sidt{q}\t$dst", []>, TB, Requires <[In64BitMode]>;
+def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins),
+ "sldt{w}\t$dst", []>, TB, OpSize16;
+let mayStore = 1 in
+def SLDT16m : I<0x00, MRM0m, (outs), (ins i16mem:$dst),
+ "sldt{w}\t$dst", []>, TB;
+def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins),
+ "sldt{l}\t$dst", []>, OpSize32, TB;
+
+// LLDT is not interpreted specially in 64-bit mode because there is no sign
+// extension.
+def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins),
+ "sldt{q}\t$dst", []>, TB, Requires<[In64BitMode]>;
+
+def LGDT16m : I<0x01, MRM2m, (outs), (ins opaquemem:$src),
+ "lgdtw\t$src", []>, TB, OpSize16, Requires<[Not64BitMode]>;
+def LGDT32m : I<0x01, MRM2m, (outs), (ins opaquemem:$src),
+ "lgdt{l|d}\t$src", []>, OpSize32, TB, Requires<[Not64BitMode]>;
+def LGDT64m : I<0x01, MRM2m, (outs), (ins opaquemem:$src),
+ "lgdt{q}\t$src", []>, TB, Requires<[In64BitMode]>;
+def LIDT16m : I<0x01, MRM3m, (outs), (ins opaquemem:$src),
+ "lidtw\t$src", []>, TB, OpSize16, Requires<[Not64BitMode]>;
+def LIDT32m : I<0x01, MRM3m, (outs), (ins opaquemem:$src),
+ "lidt{l|d}\t$src", []>, OpSize32, TB, Requires<[Not64BitMode]>;
+def LIDT64m : I<0x01, MRM3m, (outs), (ins opaquemem:$src),
+ "lidt{q}\t$src", []>, TB, Requires<[In64BitMode]>;
+def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src),
+ "lldt{w}\t$src", []>, TB, NotMemoryFoldable;
+let mayLoad = 1 in
+def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src),
+ "lldt{w}\t$src", []>, TB, NotMemoryFoldable;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Specialized register support
+let SchedRW = [WriteSystem] in {
+let Uses = [EAX, ECX, EDX] in
+def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", []>, TB;
+let Defs = [EAX, EDX], Uses = [ECX] in
+def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", []>, TB;
+
+let Defs = [RAX, RDX], Uses = [ECX] in
+def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", []>, TB;
+
+def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins),
+ "smsw{w}\t$dst", []>, OpSize16, TB;
+def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins),
+ "smsw{l}\t$dst", []>, OpSize32, TB;
+// no m form encodable; use SMSW16m
+def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins),
+ "smsw{q}\t$dst", []>, TB;
+
+// For memory operands, there is only a 16-bit form
+def SMSW16m : I<0x01, MRM4m, (outs), (ins i16mem:$dst),
+ "smsw{w}\t$dst", []>, TB;
+
+def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src),
+ "lmsw{w}\t$src", []>, TB, NotMemoryFoldable;
+let mayLoad = 1 in
+def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src),
+ "lmsw{w}\t$src", []>, TB, NotMemoryFoldable;
+
+let Defs = [EAX, EBX, ECX, EDX], Uses = [EAX, ECX] in
+ def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", []>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Cache instructions
+let SchedRW = [WriteSystem] in {
+def INVD : I<0x08, RawFrm, (outs), (ins), "invd", []>, TB;
+def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [(int_x86_wbinvd)]>, PS;
+
+// wbnoinvd is like wbinvd, except without invalidation
+// encoding: like wbinvd + an 0xF3 prefix
+def WBNOINVD : I<0x09, RawFrm, (outs), (ins), "wbnoinvd",
+ [(int_x86_wbnoinvd)]>, XS,
+ Requires<[HasWBNOINVD]>;
+} // SchedRW
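Both cache instructions carry intrinsic patterns, so they are reachable without inline assembly. A hedged sketch, assuming the __builtin_ia32_wbinvd/__builtin_ia32_wbnoinvd builtin names that int_x86_wbinvd/int_x86_wbnoinvd are conventionally exposed under (worth verifying against the headers in use); both are privileged, so this only makes sense in kernel-level code:

    /* Kernel mode only: both instructions #GP at CPL > 0. */
    static inline void cache_writeback_invalidate(void) {
      __builtin_ia32_wbinvd();     /* WBINVD: write back and invalidate all caches */
    }

    static inline void cache_writeback_only(void) {
      __builtin_ia32_wbnoinvd();   /* WBNOINVD: write back without invalidating; needs -mwbnoinvd */
    }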
+
+//===----------------------------------------------------------------------===//
+// CET instructions
+// Use with caution; availability is not predicated on features.
+let SchedRW = [WriteSystem] in {
+ let Uses = [SSP] in {
+ let Defs = [SSP] in {
+ def INCSSPD : I<0xAE, MRM5r, (outs), (ins GR32:$src), "incsspd\t$src",
+ [(int_x86_incsspd GR32:$src)]>, XS;
+ def INCSSPQ : RI<0xAE, MRM5r, (outs), (ins GR64:$src), "incsspq\t$src",
+ [(int_x86_incsspq GR64:$src)]>, XS;
+ } // Defs SSP
+
+ let Constraints = "$src = $dst" in {
+ def RDSSPD : I<0x1E, MRM1r, (outs GR32:$dst), (ins GR32:$src),
+ "rdsspd\t$dst",
+ [(set GR32:$dst, (int_x86_rdsspd GR32:$src))]>, XS;
+ def RDSSPQ : RI<0x1E, MRM1r, (outs GR64:$dst), (ins GR64:$src),
+ "rdsspq\t$dst",
+ [(set GR64:$dst, (int_x86_rdsspq GR64:$src))]>, XS;
+ }
+
+ let Defs = [SSP] in {
+ def SAVEPREVSSP : I<0x01, MRM_EA, (outs), (ins), "saveprevssp",
+ [(int_x86_saveprevssp)]>, XS;
+ def RSTORSSP : I<0x01, MRM5m, (outs), (ins i32mem:$src),
+ "rstorssp\t$src",
+ [(int_x86_rstorssp addr:$src)]>, XS;
+ } // Defs SSP
+ } // Uses SSP
+
+ def WRSSD : I<0xF6, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "wrssd\t{$src, $dst|$dst, $src}",
+ [(int_x86_wrssd GR32:$src, addr:$dst)]>, T8PS;
+ def WRSSQ : RI<0xF6, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "wrssq\t{$src, $dst|$dst, $src}",
+ [(int_x86_wrssq GR64:$src, addr:$dst)]>, T8PS;
+ def WRUSSD : I<0xF5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "wrussd\t{$src, $dst|$dst, $src}",
+ [(int_x86_wrussd GR32:$src, addr:$dst)]>, T8PD;
+ def WRUSSQ : RI<0xF5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "wrussq\t{$src, $dst|$dst, $src}",
+ [(int_x86_wrussq GR64:$src, addr:$dst)]>, T8PD;
+
+ let Defs = [SSP] in {
+ let Uses = [SSP] in {
+ def SETSSBSY : I<0x01, MRM_E8, (outs), (ins), "setssbsy",
+ [(int_x86_setssbsy)]>, XS;
+ } // Uses SSP
+
+ def CLRSSBSY : I<0xAE, MRM6m, (outs), (ins i32mem:$src),
+ "clrssbsy\t$src",
+ [(int_x86_clrssbsy addr:$src)]>, XS;
+ } // Defs SSP
+} // SchedRW
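The Constraints = "$src = $dst" on RDSSPD/RDSSPQ mirrors the hardware behaviour: when shadow stacks are not enabled the instruction is a NOP that leaves the destination untouched, so the intrinsic takes the previous register value as an input and callers conventionally pass 0 to detect that case. A sketch, assuming the _get_ssp/_inc_ssp convenience helpers from <immintrin.h> and a -mshstk build (both names are assumptions to check against the local headers):

    #include <immintrin.h>

    /* Returns the current shadow-stack pointer, or 0 when shadow stacks are
       inactive (RDSSP leaves the zero input in place in that case). */
    static inline unsigned long long current_ssp(void) {
      return _get_ssp();
    }

    static inline void unwind_shadow_stack(unsigned int frames) {
      _inc_ssp(frames);   /* INCSSPD/INCSSPQ: pop 'frames' shadow-stack entries */
    }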
+
+let SchedRW = [WriteSystem] in {
+ def ENDBR64 : I<0x1E, MRM_FA, (outs), (ins), "endbr64", []>, XS;
+ def ENDBR32 : I<0x1E, MRM_FB, (outs), (ins), "endbr32", []>, XS;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// XSAVE instructions
+let SchedRW = [WriteSystem] in {
+let Predicates = [HasXSAVE] in {
+let Defs = [EDX, EAX], Uses = [ECX] in
+ def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, PS;
+
+let Uses = [EDX, EAX, ECX] in
+ def XSETBV : I<0x01, MRM_D1, (outs), (ins),
+ "xsetbv",
+ [(int_x86_xsetbv ECX, EDX, EAX)]>, PS;
+
+} // HasXSAVE
+
+let Uses = [EDX, EAX] in {
+def XSAVE : I<0xAE, MRM4m, (outs), (ins opaquemem:$dst),
+ "xsave\t$dst",
+ [(int_x86_xsave addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE]>;
+def XSAVE64 : RI<0xAE, MRM4m, (outs), (ins opaquemem:$dst),
+ "xsave64\t$dst",
+ [(int_x86_xsave64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE, In64BitMode]>;
+def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaquemem:$dst),
+ "xrstor\t$dst",
+ [(int_x86_xrstor addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE]>;
+def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaquemem:$dst),
+ "xrstor64\t$dst",
+ [(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE, In64BitMode]>;
+def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaquemem:$dst),
+ "xsaveopt\t$dst",
+ [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEOPT]>;
+def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaquemem:$dst),
+ "xsaveopt64\t$dst",
+ [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEOPT, In64BitMode]>;
+def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaquemem:$dst),
+ "xsavec\t$dst",
+ [(int_x86_xsavec addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEC]>;
+def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaquemem:$dst),
+ "xsavec64\t$dst",
+ [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEC, In64BitMode]>;
+def XSAVES : I<0xC7, MRM5m, (outs), (ins opaquemem:$dst),
+ "xsaves\t$dst",
+ [(int_x86_xsaves addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVES]>;
+def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaquemem:$dst),
+ "xsaves64\t$dst",
+                 [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVES, In64BitMode]>;
+def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaquemem:$dst),
+ "xrstors\t$dst",
+ [(int_x86_xrstors addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVES]>;
+def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaquemem:$dst),
+ "xrstors64\t$dst",
+ [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVES, In64BitMode]>;
+} // Uses
+} // SchedRW
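Every pattern in this block passes the requested-feature bitmap implicitly in EDX:EAX, which is why the intrinsic calls list EDX, EAX after the address: the C-level helpers take one 64-bit mask and the compiler splits it across the two registers. A sketch assuming the standard <immintrin.h> names (-mxsave) and a sufficiently large, 64-byte-aligned save area:

    #include <immintrin.h>
    #include <stdint.h>

    /* Size is a placeholder; query CPUID leaf 0x0D for the real requirement. */
    static _Alignas(64) unsigned char area[4096];

    void save_x87_sse_avx(void) {
      uint64_t mask = _xgetbv(0) & 0x7;   /* XCR0 -> x87 | SSE | AVX components */
      _xsave(area, mask);                 /* XSAVE with EDX:EAX = mask */
    }

    void restore_x87_sse_avx(void) {
      _xrstor(area, 0x7);                 /* XRSTOR with the same component mask */
    }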
+
+//===----------------------------------------------------------------------===//
+// VIA PadLock crypto instructions
+let Defs = [RAX, RDI], Uses = [RDX, RDI], SchedRW = [WriteSystem] in
+ def XSTORE : I<0xa7, MRM_C0, (outs), (ins), "xstore", []>, TB, REP;
+
+def : InstAlias<"xstorerng", (XSTORE)>;
+
+let SchedRW = [WriteSystem] in {
+let Defs = [RSI, RDI], Uses = [RBX, RDX, RSI, RDI] in {
+ def XCRYPTECB : I<0xa7, MRM_C8, (outs), (ins), "xcryptecb", []>, TB, REP;
+ def XCRYPTCBC : I<0xa7, MRM_D0, (outs), (ins), "xcryptcbc", []>, TB, REP;
+ def XCRYPTCTR : I<0xa7, MRM_D8, (outs), (ins), "xcryptctr", []>, TB, REP;
+ def XCRYPTCFB : I<0xa7, MRM_E0, (outs), (ins), "xcryptcfb", []>, TB, REP;
+ def XCRYPTOFB : I<0xa7, MRM_E8, (outs), (ins), "xcryptofb", []>, TB, REP;
+}
+
+let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in {
+ def XSHA1 : I<0xa6, MRM_C8, (outs), (ins), "xsha1", []>, TB, REP;
+ def XSHA256 : I<0xa6, MRM_D0, (outs), (ins), "xsha256", []>, TB, REP;
+}
+let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in
+ def MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB, REP;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// PKU - enable protection key
+let SchedRW = [WriteSystem] in {
+let Defs = [EAX, EDX], Uses = [ECX] in
+ def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru",
+ [(set EAX, (X86rdpkru ECX)), (implicit EDX)]>, PS;
+let Uses = [EAX, ECX, EDX] in
+ def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru",
+ [(X86wrpkru EAX, EDX, ECX)]>, PS;
+} // SchedRW
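The register contract is fixed: RDPKRU expects ECX = 0 and returns PKRU in EAX (zeroing EDX), while WRPKRU takes the new value in EAX with ECX = EDX = 0, which is exactly what the Uses/Defs and the X86rdpkru/X86wrpkru patterns describe. A sketch assuming the usual <immintrin.h> helpers _rdpkru_u32/_wrpkru and -mpku:

    #include <immintrin.h>

    /* Revoke both read and write access for protection key 1. */
    void deny_pkey1(void) {
      unsigned int pkru = _rdpkru_u32();   /* RDPKRU: ECX = 0, result in EAX */
      pkru |= 3u << (2 * 1);               /* AD|WD bits for key 1 */
      _wrpkru(pkru);                       /* WRPKRU: EAX = new PKRU, ECX = EDX = 0 */
    }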
+
+//===----------------------------------------------------------------------===//
+// FS/GS Base Instructions
+let Predicates = [HasFSGSBase, In64BitMode], SchedRW = [WriteSystem] in {
+ def RDFSBASE : I<0xAE, MRM0r, (outs GR32:$dst), (ins),
+ "rdfsbase{l}\t$dst",
+ [(set GR32:$dst, (int_x86_rdfsbase_32))]>, XS;
+ def RDFSBASE64 : RI<0xAE, MRM0r, (outs GR64:$dst), (ins),
+ "rdfsbase{q}\t$dst",
+ [(set GR64:$dst, (int_x86_rdfsbase_64))]>, XS;
+ def RDGSBASE : I<0xAE, MRM1r, (outs GR32:$dst), (ins),
+ "rdgsbase{l}\t$dst",
+ [(set GR32:$dst, (int_x86_rdgsbase_32))]>, XS;
+ def RDGSBASE64 : RI<0xAE, MRM1r, (outs GR64:$dst), (ins),
+ "rdgsbase{q}\t$dst",
+ [(set GR64:$dst, (int_x86_rdgsbase_64))]>, XS;
+ def WRFSBASE : I<0xAE, MRM2r, (outs), (ins GR32:$src),
+ "wrfsbase{l}\t$src",
+ [(int_x86_wrfsbase_32 GR32:$src)]>, XS;
+ def WRFSBASE64 : RI<0xAE, MRM2r, (outs), (ins GR64:$src),
+ "wrfsbase{q}\t$src",
+ [(int_x86_wrfsbase_64 GR64:$src)]>, XS;
+ def WRGSBASE : I<0xAE, MRM3r, (outs), (ins GR32:$src),
+ "wrgsbase{l}\t$src",
+ [(int_x86_wrgsbase_32 GR32:$src)]>, XS;
+ def WRGSBASE64 : RI<0xAE, MRM3r, (outs), (ins GR64:$src),
+ "wrgsbase{q}\t$src",
+ [(int_x86_wrgsbase_64 GR64:$src)]>, XS;
+}
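These definitions map one-to-one onto the FSGSBASE intrinsics, which let user code read or write the FS/GS base directly instead of going through MSRs or a system call, provided the OS has set CR4.FSGSBASE. A sketch assuming the standard <immintrin.h> names and -mfsgsbase:

    #include <immintrin.h>
    #include <stdint.h>

    /* Point GS at a new per-thread block and return the previous base.
       Faults with #UD if the kernel has not enabled CR4.FSGSBASE. */
    uint64_t switch_gs_base(uint64_t new_base) {
      uint64_t old_base = _readgsbase_u64();   /* RDGSBASE */
      _writegsbase_u64(new_base);              /* WRGSBASE */
      return old_base;
    }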
+
+//===----------------------------------------------------------------------===//
+// INVPCID Instruction
+let SchedRW = [WriteSystem] in {
+def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+ "invpcid\t{$src2, $src1|$src1, $src2}",
+ [(int_x86_invpcid GR32:$src1, addr:$src2)]>, T8PD,
+ Requires<[Not64BitMode, HasINVPCID]>;
+def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+ "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ Requires<[In64BitMode, HasINVPCID]>;
+} // SchedRW
+
+let Predicates = [In64BitMode, HasINVPCID] in {
+ // The instruction can only use a 64 bit register as the register argument
+ // in 64 bit mode, while the intrinsic only accepts a 32 bit argument
+ // corresponding to it.
+  // The accepted values for now are 0,1,2,3 anyway (see Intel SDM -- INVPCID
+  // type), so it doesn't hurt us that one can't supply a 64-bit value here.
+ def : Pat<(int_x86_invpcid GR32:$src1, addr:$src2),
+ (INVPCID64
+ (SUBREG_TO_REG (i64 0), (MOV32rr GR32:$src1), sub_32bit),
+ addr:$src2)>;
+}
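The Pat above is what lets the 32-bit intrinsic operand feed INVPCID64's 64-bit register operand: a 32-bit register move (which zero-extends on x86-64) followed by SUBREG_TO_REG. From C the usual entry point is _invpcid from <immintrin.h> (-minvpcid); the instruction itself is CPL0-only, so this is a kernel-style sketch:

    #include <immintrin.h>
    #include <stdint.h>

    /* INVPCID type 1: invalidate non-global mappings for one PCID.
       Descriptor: PCID in bits 11:0, linear address ignored for this type. */
    struct invpcid_desc { uint64_t pcid; uint64_t linear_address; };

    void flush_pcid(uint16_t pcid) {
      struct invpcid_desc d = { pcid & 0xFFFu, 0 };
      _invpcid(1, &d);   /* type in a GPR, descriptor is the 128-bit memory operand */
    }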
+
+
+//===----------------------------------------------------------------------===//
+// SMAP Instruction
+let Defs = [EFLAGS], SchedRW = [WriteSystem] in {
+ def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, PS;
+ def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, PS;
+}
+
+//===----------------------------------------------------------------------===//
+// SMX Instruction
+let SchedRW = [WriteSystem] in {
+let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX] in {
+ def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, PS;
+} // Uses, Defs
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// TS flag control instruction.
+let SchedRW = [WriteSystem] in {
+def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", []>, TB;
+}
+
+//===----------------------------------------------------------------------===//
+// IF (inside EFLAGS) management instructions.
+let SchedRW = [WriteSystem], Uses = [EFLAGS], Defs = [EFLAGS] in {
+def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", []>;
+def STI : I<0xFB, RawFrm, (outs), (ins), "sti", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// RDPID Instruction
+let SchedRW = [WriteSystem] in {
+def RDPID32 : I<0xC7, MRM7r, (outs GR32:$dst), (ins),
+ "rdpid\t$dst", [(set GR32:$dst, (int_x86_rdpid))]>, XS,
+ Requires<[Not64BitMode, HasRDPID]>;
+def RDPID64 : I<0xC7, MRM7r, (outs GR64:$dst), (ins), "rdpid\t$dst", []>, XS,
+ Requires<[In64BitMode, HasRDPID]>;
+} // SchedRW
+
+let Predicates = [In64BitMode, HasRDPID] in {
+  // The instruction can only output a 64-bit register in 64-bit mode, so we
+  // have to compensate by extracting the low 32 bits for the intrinsic.
+ def : Pat<(int_x86_rdpid),
+ (EXTRACT_SUBREG (RDPID64), sub_32bit)>;
+}
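That is, in 64-bit mode RDPID only has a 64-bit destination form, so the 32-bit result is taken from the low subregister. RDPID simply returns IA32_TSC_AUX; a sketch assuming the _rdpid_u32 helper from <immintrin.h> (-mrdpid):

    #include <immintrin.h>

    /* Cheap "which CPU am I on" hint: operating systems conventionally store
       the processor (and node) id in IA32_TSC_AUX, which RDPID reads directly. */
    unsigned int current_cpu_hint(void) {
      return _rdpid_u32();
    }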
+
+
+//===----------------------------------------------------------------------===//
+// PTWRITE Instruction - Write Data to a Processor Trace Packet
+let SchedRW = [WriteSystem] in {
+def PTWRITEm: I<0xAE, MRM4m, (outs), (ins i32mem:$dst),
+ "ptwrite{l}\t$dst", [(int_x86_ptwrite32 (loadi32 addr:$dst))]>, XS,
+ Requires<[HasPTWRITE]>;
+def PTWRITE64m : RI<0xAE, MRM4m, (outs), (ins i64mem:$dst),
+ "ptwrite{q}\t$dst", [(int_x86_ptwrite64 (loadi64 addr:$dst))]>, XS,
+ Requires<[In64BitMode, HasPTWRITE]>;
+
+def PTWRITEr : I<0xAE, MRM4r, (outs), (ins GR32:$dst),
+ "ptwrite{l}\t$dst", [(int_x86_ptwrite32 GR32:$dst)]>, XS,
+ Requires<[HasPTWRITE]>;
+def PTWRITE64r : RI<0xAE, MRM4r, (outs), (ins GR64:$dst),
+ "ptwrite{q}\t$dst", [(int_x86_ptwrite64 GR64:$dst)]>, XS,
+ Requires<[In64BitMode, HasPTWRITE]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Platform Configuration instruction
+
+// From ISA docs:
+// "This instruction is used to execute functions for configuring platform
+// features.
+// EAX: Leaf function to be invoked.
+// RBX/RCX/RDX: Leaf-specific purpose."
+// "Successful execution of the leaf clears RAX (set to zero) and ZF, CF, PF,
+// AF, OF, and SF are cleared. In case of failure, the failure reason is
+// indicated in RAX with ZF set to 1 and CF, PF, AF, OF, and SF are cleared."
+// Thus all these mentioned registers are considered clobbered.
+
+let SchedRW = [WriteSystem] in {
+let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX, RDX, EFLAGS] in
+ def PCONFIG : I<0x01, MRM_C5, (outs), (ins), "pconfig", []>, PS,
+ Requires<[HasPCONFIG]>;
+} // SchedRW
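With no intrinsic pattern, PCONFIG is only reachable through the assembler, and the Uses/Defs lists above translate directly into the constraints such an asm statement needs. A hedged sketch, assuming an assembler that already knows the pconfig mnemonic; leaf 0 (MKTME key programming) is used purely as an illustration:

    #include <stdint.h>

    /* Returns 0 on success, otherwise the failure code the leaf left in RAX. */
    static inline uint64_t pconfig_leaf(uint64_t leaf, uint64_t rbx_in,
                                        uint64_t rcx_in, uint64_t rdx_in) {
      uint64_t rax = leaf;
      asm volatile("pconfig"
                   : "+a"(rax), "+b"(rbx_in), "+c"(rcx_in), "+d"(rdx_in)
                   :
                   : "cc", "memory");   /* EFLAGS clobbered, leaf may touch memory */
      return rax;
    }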
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrTDX.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrTDX.td
new file mode 100644
index 000000000000..8d7cd6082095
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrTDX.td
@@ -0,0 +1,39 @@
+//===-- X86InstrTDX.td - TDX Instruction Set Extension ------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel TDX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TDX instructions
+
+// 64-bit only instructions
+let SchedRW = [WriteSystem], Predicates = [In64BitMode] in {
+// SEAMCALL - Call to SEAM VMX-root Operation Module
+def SEAMCALL : I<0x01, MRM_CF, (outs), (ins),
+ "seamcall", []>, PD;
+
+// SEAMRET - Return to Legacy VMX-root Operation
+def SEAMRET : I<0x01, MRM_CD, (outs), (ins),
+ "seamret", []>, PD;
+
+// SEAMOPS - SEAM Operations
+def SEAMOPS : I<0x01, MRM_CE, (outs), (ins),
+ "seamops", []>, PD;
+
+} // SchedRW
+
+// common instructions
+let SchedRW = [WriteSystem] in {
+// TDCALL - Call SEAM Module Functions
+def TDCALL : I<0x01, MRM_CC, (outs), (ins),
+ "tdcall", []>, PD;
+
+} // SchedRW
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrTSX.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrTSX.td
new file mode 100644
index 000000000000..28563eeb4484
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrTSX.td
@@ -0,0 +1,59 @@
+//===-- X86InstrTSX.td - TSX Instruction Set Extension -----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel TSX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TSX instructions
+
+def X86xtest: SDNode<"X86ISD::XTEST", SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+let SchedRW = [WriteSystem] in {
+
+let usesCustomInserter = 1 in
+def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins),
+ "# XBEGIN", [(set GR32:$dst, (int_x86_xbegin))]>,
+ Requires<[HasRTM]>;
+
+let isBranch = 1, isTerminator = 1, Defs = [EAX] in {
+def XBEGIN_2 : Ii16PCRel<0xc7, MRM_F8, (outs), (ins brtarget16:$dst),
+ "xbegin\t$dst", []>, OpSize16;
+def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget32:$dst),
+ "xbegin\t$dst", []>, OpSize32;
+}
+
+// Pseudo instruction to fake the definition of EAX on the fallback code path.
+let isPseudo = 1, Defs = [EAX] in {
+def XABORT_DEF : I<0, Pseudo, (outs), (ins), "# XABORT DEF", []>;
+}
+
+def XEND : I<0x01, MRM_D5, (outs), (ins),
+ "xend", [(int_x86_xend)]>, PS, Requires<[HasRTM]>;
+
+let Defs = [EFLAGS] in
+def XTEST : I<0x01, MRM_D6, (outs), (ins),
+ "xtest", [(set EFLAGS, (X86xtest))]>, PS, Requires<[HasRTM]>;
+
+def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
+ "xabort\t$imm",
+ [(int_x86_xabort timm:$imm)]>, Requires<[HasRTM]>;
+} // SchedRW
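XBEGIN is modelled as a custom-inserted pseudo because the abort path materializes its status in EAX, which is also why XABORT_DEF exists to fake that definition on the fallback block. From C this is the RTM intrinsic family; a sketch assuming <immintrin.h> and -mrtm:

    #include <immintrin.h>

    /* Try to bump *counter inside a transaction; report failure on abort.
       _xbegin() returns _XBEGIN_STARTED on the transactional path and the
       abort status (the value XBEGIN leaves in EAX) on the fallback path. */
    int transactional_increment(long *counter) {
      unsigned int status = _xbegin();
      if (status == _XBEGIN_STARTED) {
        ++*counter;
        _xend();        /* XEND commits the transaction */
        return 1;
      }
      return 0;         /* aborted; 'status' holds the abort code */
    }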
+
+// HLE prefixes
+let SchedRW = [WriteSystem] in {
+
+let isAsmParserOnly = 1 in {
+def XACQUIRE_PREFIX : I<0xF2, PrefixByte, (outs), (ins), "xacquire", []>;
+def XRELEASE_PREFIX : I<0xF3, PrefixByte, (outs), (ins), "xrelease", []>;
+}
+
+} // SchedRW
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrVMX.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrVMX.td
new file mode 100644
index 000000000000..d204a33358ea
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrVMX.td
@@ -0,0 +1,87 @@
+//===-- X86InstrVMX.td - VMX Instruction Set Extension -----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel VMX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VMX instructions
+
+let SchedRW = [WriteSystem] in {
+// 66 0F 38 80
+def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+ "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ Requires<[Not64BitMode]>;
+def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+ "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ Requires<[In64BitMode]>;
+
+// 66 0F 38 81
+def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+ "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ Requires<[Not64BitMode]>;
+def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+ "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ Requires<[In64BitMode]>;
+
+// 0F 01 C1
+def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB;
+def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
+ "vmclear\t$vmcs", []>, PD;
+
+// 0F 01 D4
+def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, PS;
+
+// 0F 01 C2
+def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB;
+
+// 0F 01 C3
+def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", []>, TB;
+def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
+ "vmptrld\t$vmcs", []>, PS;
+def VMPTRSTm : I<0xC7, MRM7m, (outs), (ins i64mem:$vmcs),
+ "vmptrst\t$vmcs", []>, PS;
+def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+ "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>,
+ NotMemoryFoldable;
+def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+ "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>,
+ NotMemoryFoldable;
+
+let mayStore = 1 in {
+def VMREAD64mr : I<0x78, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>,
+ NotMemoryFoldable;
+def VMREAD32mr : I<0x78, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>,
+ NotMemoryFoldable;
+} // mayStore
+
+def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>,
+ NotMemoryFoldable;
+def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>,
+ NotMemoryFoldable;
+
+let mayLoad = 1 in {
+def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>,
+ NotMemoryFoldable;
+def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>,
+ NotMemoryFoldable;
+} // mayLoad
+
+// 0F 01 C4
+def VMXOFF : I<0x01, MRM_C4, (outs), (ins), "vmxoff", []>, TB;
+def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon),
+ "vmxon\t$vmxon", []>, XS;
+} // SchedRW
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrVecCompiler.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrVecCompiler.td
new file mode 100644
index 000000000000..e98843bd3ae3
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrVecCompiler.td
@@ -0,0 +1,459 @@
+//===- X86InstrVecCompiler.td - Vector Compiler Patterns ---*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the various vector pseudo instructions used by the
+// compiler, as well as Pat patterns used during instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Non-instruction patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [NoAVX512] in {
+ // A vector extract of the first f32/f64 position is a subregister copy
+ def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
+ (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
+ def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
+ (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;
+}
+
+let Predicates = [HasAVX512] in {
+ // A vector extract of the first f32/f64 position is a subregister copy
+ def : Pat<(f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))),
+ (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X)>;
+ def : Pat<(f64 (extractelt (v2f64 VR128X:$src), (iPTR 0))),
+ (COPY_TO_REGCLASS (v2f64 VR128X:$src), FR64X)>;
+}
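In source terms these patterns are why reading element 0 of a vector is free: the scalar already lives in the low part of the XMM register, so only a register-class copy is needed and no extract instruction is emitted. A small illustration with SSE intrinsics:

    #include <immintrin.h>

    /* The lane-0 read is folded away; only the add remains. */
    float first_of_sum(__m128 a, __m128 b) {
      __m128 sum = _mm_add_ps(a, b);
      return _mm_cvtss_f32(sum);   /* extractelt ..., 0 -> subregister copy */
    }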
+
+let Predicates = [NoVLX] in {
+ // Implicitly promote a 32-bit scalar to a vector.
+ def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
+ (COPY_TO_REGCLASS FR32:$src, VR128)>;
+ // Implicitly promote a 64-bit scalar to a vector.
+ def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
+ (COPY_TO_REGCLASS FR64:$src, VR128)>;
+}
+
+let Predicates = [HasVLX] in {
+ // Implicitly promote a 32-bit scalar to a vector.
+ def : Pat<(v4f32 (scalar_to_vector FR32X:$src)),
+ (COPY_TO_REGCLASS FR32X:$src, VR128X)>;
+ // Implicitly promote a 64-bit scalar to a vector.
+ def : Pat<(v2f64 (scalar_to_vector FR64X:$src)),
+ (COPY_TO_REGCLASS FR64X:$src, VR128X)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Subvector tricks
+//===----------------------------------------------------------------------===//
+
+// Patterns for insert_subvector/extract_subvector to/from index=0
+multiclass subvector_subreg_lowering<RegisterClass subRC, ValueType subVT,
+ RegisterClass RC, ValueType VT,
+ SubRegIndex subIdx> {
+ def : Pat<(subVT (extract_subvector (VT RC:$src), (iPTR 0))),
+ (subVT (EXTRACT_SUBREG RC:$src, subIdx))>;
+
+ def : Pat<(VT (insert_subvector undef, subRC:$src, (iPTR 0))),
+ (VT (INSERT_SUBREG (IMPLICIT_DEF), subRC:$src, subIdx))>;
+}
+
+// A 128-bit subvector extract from the first 256-bit vector position is a
+// subregister copy that needs no instruction. Likewise, a 128-bit subvector
+// insert to the first 256-bit vector position is a subregister copy that needs
+// no instruction.
+defm : subvector_subreg_lowering<VR128, v4i32, VR256, v8i32, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v4f32, VR256, v8f32, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v2i64, VR256, v4i64, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v2f64, VR256, v4f64, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v8i16, VR256, v16i16, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v16i8, VR256, v32i8, sub_xmm>;
+
+// A 128-bit subvector extract from the first 512-bit vector position is a
+// subregister copy that needs no instruction. Likewise, a 128-bit subvector
+// insert to the first 512-bit vector position is a subregister copy that needs
+// no instruction.
+defm : subvector_subreg_lowering<VR128, v4i32, VR512, v16i32, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v4f32, VR512, v16f32, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v2i64, VR512, v8i64, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v2f64, VR512, v8f64, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v8i16, VR512, v32i16, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v16i8, VR512, v64i8, sub_xmm>;
+
+// A 256-bit subvector extract from the first 512-bit vector position is a
+// subregister copy that needs no instruction. Likewise, a 256-bit subvector
+// insert to the first 512-bit vector position is a subregister copy that needs
+// no instruction.
+defm : subvector_subreg_lowering<VR256, v8i32, VR512, v16i32, sub_ymm>;
+defm : subvector_subreg_lowering<VR256, v8f32, VR512, v16f32, sub_ymm>;
+defm : subvector_subreg_lowering<VR256, v4i64, VR512, v8i64, sub_ymm>;
+defm : subvector_subreg_lowering<VR256, v4f64, VR512, v8f64, sub_ymm>;
+defm : subvector_subreg_lowering<VR256, v16i16, VR512, v32i16, sub_ymm>;
+defm : subvector_subreg_lowering<VR256, v32i8, VR512, v64i8, sub_ymm>;
+
+
+// If we're inserting into an all zeros vector, just use a plain move which
+// will zero the upper bits. A post-isel hook will take care of removing
+// any moves that we can prove are unnecessary.
+multiclass subvec_zero_lowering<string MoveStr,
+ RegisterClass RC, ValueType DstTy,
+ ValueType SrcTy, ValueType ZeroTy,
+ SubRegIndex SubIdx> {
+ def : Pat<(DstTy (insert_subvector immAllZerosV,
+ (SrcTy RC:$src), (iPTR 0))),
+ (SUBREG_TO_REG (i64 0),
+ (SrcTy (!cast<Instruction>("VMOV"#MoveStr#"rr") RC:$src)), SubIdx)>;
+}
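The effect of this multiclass (instantiated below for the various register classes) is visible through the zero-extending cast intrinsics: widening a 128-bit vector into a zeroed 256-bit or 512-bit one becomes a single VEX/EVEX move, whose hardware-defined zeroing of the upper lanes replaces an explicit insert into a materialized zero vector. A sketch assuming AVX and the _mm256_zextsi128_si256 helper:

    #include <immintrin.h>

    /* With the patterns above this typically compiles to a lone
       "vmovdqa %xmm0, %xmm0" (upper lanes zeroed by the VEX move),
       not a vinsertf128/vinserti128 into a zeroed ymm. */
    __m256i widen_with_zero_upper(__m128i x) {
      return _mm256_zextsi128_si256(x);
    }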
+
+let Predicates = [HasAVX, NoVLX] in {
+ defm : subvec_zero_lowering<"APD", VR128, v4f64, v2f64, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"APS", VR128, v8f32, v4f32, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v4i64, v2i64, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v8i32, v4i32, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v16i16, v8i16, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v32i8, v16i8, v8i32, sub_xmm>;
+}
+
+let Predicates = [HasVLX] in {
+ defm : subvec_zero_lowering<"APDZ128", VR128X, v4f64, v2f64, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"APSZ128", VR128X, v8f32, v4f32, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v4i64, v2i64, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v8i32, v4i32, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v16i16, v8i16, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v32i8, v16i8, v8i32, sub_xmm>;
+
+ defm : subvec_zero_lowering<"APDZ128", VR128X, v8f64, v2f64, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"APSZ128", VR128X, v16f32, v4f32, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v8i64, v2i64, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v16i32, v4i32, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v32i16, v8i16, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v64i8, v16i8, v16i32, sub_xmm>;
+
+ defm : subvec_zero_lowering<"APDZ256", VR256X, v8f64, v4f64, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"APSZ256", VR256X, v16f32, v8f32, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQA64Z256", VR256X, v8i64, v4i64, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQA64Z256", VR256X, v16i32, v8i32, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQA64Z256", VR256X, v32i16, v16i16, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQA64Z256", VR256X, v64i8, v32i8, v16i32, sub_ymm>;
+}
+
+let Predicates = [HasAVX512, NoVLX] in {
+ defm : subvec_zero_lowering<"APD", VR128, v8f64, v2f64, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"APS", VR128, v16f32, v4f32, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v8i64, v2i64, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v16i32, v4i32, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v32i16, v8i16, v16i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v64i8, v16i8, v16i32, sub_xmm>;
+
+ defm : subvec_zero_lowering<"APDY", VR256, v8f64, v4f64, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"APSY", VR256, v16f32, v8f32, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQAY", VR256, v8i64, v4i64, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQAY", VR256, v16i32, v8i32, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQAY", VR256, v32i16, v16i16, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQAY", VR256, v64i8, v32i8, v16i32, sub_ymm>;
+}
+
+class maskzeroupper<ValueType vt, RegisterClass RC> :
+ PatLeaf<(vt RC:$src), [{
+ return isMaskZeroExtended(N);
+ }]>;
+
+def maskzeroupperv1i1 : maskzeroupper<v1i1, VK1>;
+def maskzeroupperv2i1 : maskzeroupper<v2i1, VK2>;
+def maskzeroupperv4i1 : maskzeroupper<v4i1, VK4>;
+def maskzeroupperv8i1 : maskzeroupper<v8i1, VK8>;
+def maskzeroupperv16i1 : maskzeroupper<v16i1, VK16>;
+def maskzeroupperv32i1 : maskzeroupper<v32i1, VK32>;
+
+// The patterns determine if we can depend on the upper bits of a mask register
+// being zeroed by the previous operation so that we can skip explicit
+// zeroing.
+let Predicates = [HasBWI] in {
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ maskzeroupperv1i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK1:$src, VK32)>;
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ maskzeroupperv8i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK8:$src, VK32)>;
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ maskzeroupperv16i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK16:$src, VK32)>;
+
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ maskzeroupperv1i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK1:$src, VK64)>;
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ maskzeroupperv8i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK8:$src, VK64)>;
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ maskzeroupperv16i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK16:$src, VK64)>;
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ maskzeroupperv32i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK32:$src, VK64)>;
+}
+
+let Predicates = [HasAVX512] in {
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ maskzeroupperv1i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK1:$src, VK16)>;
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ maskzeroupperv8i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK8:$src, VK16)>;
+}
+
+let Predicates = [HasDQI] in {
+ def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
+ maskzeroupperv1i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK1:$src, VK8)>;
+}
+
+let Predicates = [HasVLX, HasDQI] in {
+ def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
+ maskzeroupperv2i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK2:$src, VK8)>;
+ def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
+ maskzeroupperv4i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK4:$src, VK8)>;
+}
+
+let Predicates = [HasVLX] in {
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ maskzeroupperv2i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK2:$src, VK16)>;
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ maskzeroupperv4i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK4:$src, VK16)>;
+}
+
+let Predicates = [HasBWI, HasVLX] in {
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ maskzeroupperv2i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK2:$src, VK32)>;
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ maskzeroupperv4i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK4:$src, VK32)>;
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ maskzeroupperv2i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK2:$src, VK64)>;
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ maskzeroupperv4i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK4:$src, VK64)>;
+}
+
+// If the bits are not zero we have to fall back to explicitly zeroing by
+// using shifts.
+let Predicates = [HasAVX512] in {
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ (v1i1 VK1:$mask), (iPTR 0))),
+ (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK1:$mask, VK16),
+ (i8 15)), (i8 15))>;
+
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ (v2i1 VK2:$mask), (iPTR 0))),
+ (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK2:$mask, VK16),
+ (i8 14)), (i8 14))>;
+
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ (v4i1 VK4:$mask), (iPTR 0))),
+ (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK4:$mask, VK16),
+ (i8 12)), (i8 12))>;
+}
+
+let Predicates = [HasAVX512, NoDQI] in {
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ (v8i1 VK8:$mask), (iPTR 0))),
+ (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK8:$mask, VK16),
+ (i8 8)), (i8 8))>;
+}
+
+let Predicates = [HasDQI] in {
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ (v8i1 VK8:$mask), (iPTR 0))),
+ (COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK16)>;
+
+ def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
+ (v1i1 VK1:$mask), (iPTR 0))),
+ (KSHIFTRBri (KSHIFTLBri (COPY_TO_REGCLASS VK1:$mask, VK8),
+ (i8 7)), (i8 7))>;
+ def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
+ (v2i1 VK2:$mask), (iPTR 0))),
+ (KSHIFTRBri (KSHIFTLBri (COPY_TO_REGCLASS VK2:$mask, VK8),
+ (i8 6)), (i8 6))>;
+ def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
+ (v4i1 VK4:$mask), (iPTR 0))),
+ (KSHIFTRBri (KSHIFTLBri (COPY_TO_REGCLASS VK4:$mask, VK8),
+ (i8 4)), (i8 4))>;
+}
+
+let Predicates = [HasBWI] in {
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ (v16i1 VK16:$mask), (iPTR 0))),
+ (COPY_TO_REGCLASS (KMOVWkk VK16:$mask), VK32)>;
+
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ (v16i1 VK16:$mask), (iPTR 0))),
+ (COPY_TO_REGCLASS (KMOVWkk VK16:$mask), VK64)>;
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ (v32i1 VK32:$mask), (iPTR 0))),
+ (COPY_TO_REGCLASS (KMOVDkk VK32:$mask), VK64)>;
+}
+
+let Predicates = [HasBWI, NoDQI] in {
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ (v8i1 VK8:$mask), (iPTR 0))),
+ (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK8:$mask, VK32),
+ (i8 24)), (i8 24))>;
+
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ (v8i1 VK8:$mask), (iPTR 0))),
+ (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK8:$mask, VK64),
+ (i8 56)), (i8 56))>;
+}
+
+let Predicates = [HasBWI, HasDQI] in {
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ (v8i1 VK8:$mask), (iPTR 0))),
+ (COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK32)>;
+
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ (v8i1 VK8:$mask), (iPTR 0))),
+ (COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK64)>;
+}
+
+let Predicates = [HasBWI] in {
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ (v1i1 VK1:$mask), (iPTR 0))),
+ (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK1:$mask, VK32),
+ (i8 31)), (i8 31))>;
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ (v2i1 VK2:$mask), (iPTR 0))),
+ (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK2:$mask, VK32),
+ (i8 30)), (i8 30))>;
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ (v4i1 VK4:$mask), (iPTR 0))),
+ (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK4:$mask, VK32),
+ (i8 28)), (i8 28))>;
+
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ (v1i1 VK1:$mask), (iPTR 0))),
+ (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK1:$mask, VK64),
+ (i8 63)), (i8 63))>;
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ (v2i1 VK2:$mask), (iPTR 0))),
+ (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK2:$mask, VK64),
+ (i8 62)), (i8 62))>;
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ (v4i1 VK4:$mask), (iPTR 0))),
+ (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK4:$mask, VK64),
+ (i8 60)), (i8 60))>;
+}
+
+//===----------------------------------------------------------------------===//
+// Extra selection patterns for f128, f128mem
+
+// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2.
+let Predicates = [NoAVX] in {
+def : Pat<(alignedstore (f128 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+def : Pat<(store (f128 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+
+def : Pat<(alignedloadf128 addr:$src),
+ (MOVAPSrm addr:$src)>;
+def : Pat<(loadf128 addr:$src),
+ (MOVUPSrm addr:$src)>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+def : Pat<(alignedstore (f128 VR128:$src), addr:$dst),
+ (VMOVAPSmr addr:$dst, VR128:$src)>;
+def : Pat<(store (f128 VR128:$src), addr:$dst),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+
+def : Pat<(alignedloadf128 addr:$src),
+ (VMOVAPSrm addr:$src)>;
+def : Pat<(loadf128 addr:$src),
+ (VMOVUPSrm addr:$src)>;
+}
+
+let Predicates = [HasVLX] in {
+def : Pat<(alignedstore (f128 VR128X:$src), addr:$dst),
+ (VMOVAPSZ128mr addr:$dst, VR128X:$src)>;
+def : Pat<(store (f128 VR128X:$src), addr:$dst),
+ (VMOVUPSZ128mr addr:$dst, VR128X:$src)>;
+
+def : Pat<(alignedloadf128 addr:$src),
+ (VMOVAPSZ128rm addr:$src)>;
+def : Pat<(loadf128 addr:$src),
+ (VMOVUPSZ128rm addr:$src)>;
+}
+
+let Predicates = [UseSSE1] in {
+// andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2
+def : Pat<(f128 (X86fand VR128:$src1, (memopf128 addr:$src2))),
+ (ANDPSrm VR128:$src1, f128mem:$src2)>;
+
+def : Pat<(f128 (X86fand VR128:$src1, VR128:$src2)),
+ (ANDPSrr VR128:$src1, VR128:$src2)>;
+
+def : Pat<(f128 (X86for VR128:$src1, (memopf128 addr:$src2))),
+ (ORPSrm VR128:$src1, f128mem:$src2)>;
+
+def : Pat<(f128 (X86for VR128:$src1, VR128:$src2)),
+ (ORPSrr VR128:$src1, VR128:$src2)>;
+
+def : Pat<(f128 (X86fxor VR128:$src1, (memopf128 addr:$src2))),
+ (XORPSrm VR128:$src1, f128mem:$src2)>;
+
+def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)),
+ (XORPSrr VR128:$src1, VR128:$src2)>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+// andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2
+def : Pat<(f128 (X86fand VR128:$src1, (loadf128 addr:$src2))),
+ (VANDPSrm VR128:$src1, f128mem:$src2)>;
+
+def : Pat<(f128 (X86fand VR128:$src1, VR128:$src2)),
+ (VANDPSrr VR128:$src1, VR128:$src2)>;
+
+def : Pat<(f128 (X86for VR128:$src1, (loadf128 addr:$src2))),
+ (VORPSrm VR128:$src1, f128mem:$src2)>;
+
+def : Pat<(f128 (X86for VR128:$src1, VR128:$src2)),
+ (VORPSrr VR128:$src1, VR128:$src2)>;
+
+def : Pat<(f128 (X86fxor VR128:$src1, (loadf128 addr:$src2))),
+ (VXORPSrm VR128:$src1, f128mem:$src2)>;
+
+def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)),
+ (VXORPSrr VR128:$src1, VR128:$src2)>;
+}
+
+let Predicates = [HasVLX] in {
+// andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2
+def : Pat<(f128 (X86fand VR128X:$src1, (loadf128 addr:$src2))),
+ (VANDPSZ128rm VR128X:$src1, f128mem:$src2)>;
+
+def : Pat<(f128 (X86fand VR128X:$src1, VR128X:$src2)),
+ (VANDPSZ128rr VR128X:$src1, VR128X:$src2)>;
+
+def : Pat<(f128 (X86for VR128X:$src1, (loadf128 addr:$src2))),
+ (VORPSZ128rm VR128X:$src1, f128mem:$src2)>;
+
+def : Pat<(f128 (X86for VR128X:$src1, VR128X:$src2)),
+ (VORPSZ128rr VR128X:$src1, VR128X:$src2)>;
+
+def : Pat<(f128 (X86fxor VR128X:$src1, (loadf128 addr:$src2))),
+ (VXORPSZ128rm VR128X:$src1, f128mem:$src2)>;
+
+def : Pat<(f128 (X86fxor VR128X:$src1, VR128X:$src2)),
+ (VXORPSZ128rr VR128X:$src1, VR128X:$src2)>;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrXOP.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrXOP.td
new file mode 100644
index 000000000000..a5976b7d2d74
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrXOP.td
@@ -0,0 +1,473 @@
+//===-- X86InstrXOP.td - XOP Instruction Set ---------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes XOP (eXtended OPerations)
+//
+//===----------------------------------------------------------------------===//
+
+multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+ def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[SchedWritePHAdd.XMM]>;
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int (load addr:$src)))]>, XOP,
+ Sched<[SchedWritePHAdd.XMM.Folded, SchedWritePHAdd.XMM.ReadAfterFold]>;
+}
+
+let ExeDomain = SSEPackedInt in {
+ defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd>;
+ defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq>;
+ defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw>;
+ defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq>;
+ defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd>;
+ defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq>;
+ defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd>;
+ defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq>;
+ defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw>;
+ defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq>;
+ defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd>;
+ defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq>;
+ defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw>;
+ defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq>;
+ defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd>;
+}
+
+// Two-operand instructions whose memory form uses a scalar load
+multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
+ Operand memop, PatFrags mem_frags,
+ X86FoldableSchedWrite sched> {
+ def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[sched]>;
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int (mem_frags addr:$src)))]>, XOP,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
+ X86FoldableSchedWrite sched> {
+ def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[sched]>;
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int (load addr:$src)))]>, XOP,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
+ X86FoldableSchedWrite sched> {
+ def Yrr : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (Int VR256:$src))]>, XOP, VEX_L, Sched<[sched]>;
+ def Yrm : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (Int (load addr:$src)))]>, XOP, VEX_L,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+let ExeDomain = SSEPackedSingle in {
+ defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
+ ssmem, sse_load_f32, SchedWriteFRnd.Scl>;
+ defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps,
+ SchedWriteFRnd.XMM>;
+ defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256,
+ SchedWriteFRnd.YMM>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+ defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
+ sdmem, sse_load_f64, SchedWriteFRnd.Scl>;
+ defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd,
+ SchedWriteFRnd.XMM>;
+ defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256,
+ SchedWriteFRnd.YMM>;
+}
+
+multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType vt128, X86FoldableSchedWrite sched> {
+ def rr : IXOP<opc, MRMSrcReg4VOp3, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2))))]>,
+ XOP, Sched<[sched]>;
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1),
+ (vt128 (load addr:$src2)))))]>,
+ XOP_4V, VEX_W, Sched<[sched.Folded, sched.ReadAfterFold]>;
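+ // Note (informational): the rm and mr forms describe the same operation;
+ // the W bit (VEX_W on the rm form above, absent on the mr form below)
+ // selects whether the second or the first source operand comes from
+ // memory, which is why both folding directions can be encoded.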
+ def mr : IXOP<opc, MRMSrcMem4VOp3, (outs VR128:$dst),
+ (ins i128mem:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 (load addr:$src1)),
+ (vt128 VR128:$src2))))]>,
+ XOP, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ // For disassembler
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rr_REV : IXOP<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>,
+ XOP_4V, VEX_W, Sched<[sched]>, FoldGenData<NAME#rr>;
+}
+
+let ExeDomain = SSEPackedInt in {
+ defm VPROTB : xop3op<0x90, "vprotb", rotl, v16i8, SchedWriteVarVecShift.XMM>;
+ defm VPROTD : xop3op<0x92, "vprotd", rotl, v4i32, SchedWriteVarVecShift.XMM>;
+ defm VPROTQ : xop3op<0x93, "vprotq", rotl, v2i64, SchedWriteVarVecShift.XMM>;
+ defm VPROTW : xop3op<0x91, "vprotw", rotl, v8i16, SchedWriteVarVecShift.XMM>;
+ defm VPSHAB : xop3op<0x98, "vpshab", X86vpsha, v16i8, SchedWriteVarVecShift.XMM>;
+ defm VPSHAD : xop3op<0x9A, "vpshad", X86vpsha, v4i32, SchedWriteVarVecShift.XMM>;
+ defm VPSHAQ : xop3op<0x9B, "vpshaq", X86vpsha, v2i64, SchedWriteVarVecShift.XMM>;
+ defm VPSHAW : xop3op<0x99, "vpshaw", X86vpsha, v8i16, SchedWriteVarVecShift.XMM>;
+ defm VPSHLB : xop3op<0x94, "vpshlb", X86vpshl, v16i8, SchedWriteVarVecShift.XMM>;
+ defm VPSHLD : xop3op<0x96, "vpshld", X86vpshl, v4i32, SchedWriteVarVecShift.XMM>;
+ defm VPSHLQ : xop3op<0x97, "vpshlq", X86vpshl, v2i64, SchedWriteVarVecShift.XMM>;
+ defm VPSHLW : xop3op<0x95, "vpshlw", X86vpshl, v8i16, SchedWriteVarVecShift.XMM>;
+}
+
+multiclass xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType vt128, X86FoldableSchedWrite sched> {
+ def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1), timm:$src2)))]>,
+ XOP, Sched<[sched]>;
+ def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 (load addr:$src1)), timm:$src2)))]>,
+ XOP, Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+let ExeDomain = SSEPackedInt in {
+ defm VPROTB : xop3opimm<0xC0, "vprotb", X86vrotli, v16i8,
+ SchedWriteVecShiftImm.XMM>;
+ defm VPROTD : xop3opimm<0xC2, "vprotd", X86vrotli, v4i32,
+ SchedWriteVecShiftImm.XMM>;
+ defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vrotli, v2i64,
+ SchedWriteVecShiftImm.XMM>;
+ defm VPROTW : xop3opimm<0xC1, "vprotw", X86vrotli, v8i16,
+ SchedWriteVecShiftImm.XMM>;
+}
+
+// Instruction where second source can be memory, but third must be register
+multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int,
+ X86FoldableSchedWrite sched> {
+ let isCommutable = 1 in
+ def rr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP_4V,
+ Sched<[sched]>;
+ def rm : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, (load addr:$src2),
+ VR128:$src3))]>, XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+let ExeDomain = SSEPackedInt in {
+ defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd",
+ int_x86_xop_vpmadcswd, SchedWriteVecIMul.XMM>;
+ defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd",
+ int_x86_xop_vpmadcsswd, SchedWriteVecIMul.XMM>;
+ defm VPMACSWW : xop4opm2<0x95, "vpmacsww",
+ int_x86_xop_vpmacsww, SchedWriteVecIMul.XMM>;
+ defm VPMACSWD : xop4opm2<0x96, "vpmacswd",
+ int_x86_xop_vpmacswd, SchedWriteVecIMul.XMM>;
+ defm VPMACSSWW : xop4opm2<0x85, "vpmacssww",
+ int_x86_xop_vpmacssww, SchedWriteVecIMul.XMM>;
+ defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd",
+ int_x86_xop_vpmacsswd, SchedWriteVecIMul.XMM>;
+ defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql",
+ int_x86_xop_vpmacssdql, SchedWritePMULLD.XMM>;
+ defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh",
+ int_x86_xop_vpmacssdqh, SchedWritePMULLD.XMM>;
+ defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd",
+ int_x86_xop_vpmacssdd, SchedWritePMULLD.XMM>;
+ defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql",
+ int_x86_xop_vpmacsdql, SchedWritePMULLD.XMM>;
+ defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh",
+ int_x86_xop_vpmacsdqh, SchedWritePMULLD.XMM>;
+ defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd",
+ int_x86_xop_vpmacsdd, SchedWritePMULLD.XMM>;
+}
+
+// IFMA patterns - for cases where we can safely ignore the overflow bits from
+// the multiply or easily match with existing intrinsics.
+let Predicates = [HasXOP] in {
+ def : Pat<(v8i16 (add (mul (v8i16 VR128:$src1), (v8i16 VR128:$src2)),
+ (v8i16 VR128:$src3))),
+ (VPMACSWWrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+ def : Pat<(v4i32 (add (mul (v4i32 VR128:$src1), (v4i32 VR128:$src2)),
+ (v4i32 VR128:$src3))),
+ (VPMACSDDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+ def : Pat<(v2i64 (add (X86pmuldq (bc_v2i64 (X86PShufd (v4i32 VR128:$src1), (i8 -11))),
+ (bc_v2i64 (X86PShufd (v4i32 VR128:$src2), (i8 -11)))),
+ (v2i64 VR128:$src3))),
+ (VPMACSDQHrr VR128:$src1, VR128:$src2, VR128:$src3)>;
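+ // Note: the (i8 -11) immediate above is 0xF5 (binary 11'11'01'01), i.e. a
+ // PSHUFD selecting elements {1,1,3,3}; feeding that into X86pmuldq
+ // multiplies the odd (high) dword lanes, which matches what vpmacsdqh
+ // computes.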
+ def : Pat<(v2i64 (add (X86pmuldq (v2i64 VR128:$src1), (v2i64 VR128:$src2)),
+ (v2i64 VR128:$src3))),
+ (VPMACSDQLrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+ def : Pat<(v4i32 (add (X86vpmaddwd (v8i16 VR128:$src1), (v8i16 VR128:$src2)),
+ (v4i32 VR128:$src3))),
+ (VPMADCSWDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+}
+
+// Transform to swap the VPCOM comparison immediate so that a memory operand
+// can be matched in the first source operand.
+def CommuteVPCOMCC : SDNodeXForm<imm, [{
+ uint8_t Imm = N->getZExtValue() & 0x7;
+ Imm = X86::getSwappedVPCOMImm(Imm);
+ return getI8Imm(Imm, SDLoc(N));
+}]>;
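+// For reference (relying on the predicate encoding used by
+// X86::getSwappedVPCOMImm): the 3-bit VPCOM immediate is 0=LT, 1=LE, 2=GT,
+// 3=GE, 4=EQ, 5=NE, 6=FALSE, 7=TRUE, so commuting the operands swaps LT<->GT
+// and LE<->GE and leaves the symmetric predicates unchanged.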
+
+// Instruction where second source can be memory, third must be imm8
+multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128,
+ X86FoldableSchedWrite sched> {
+ let ExeDomain = SSEPackedInt in { // SSE integer instructions
+ let isCommutable = 1 in
+ def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$cc),
+ !strconcat("vpcom", Suffix,
+ "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
+ timm:$cc)))]>,
+ XOP_4V, Sched<[sched]>;
+ def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$cc),
+ !strconcat("vpcom", Suffix,
+ "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1),
+ (vt128 (load addr:$src2)),
+ timm:$cc)))]>,
+ XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+
+ def : Pat<(OpNode (load addr:$src2),
+ (vt128 VR128:$src1), timm:$cc),
+ (!cast<Instruction>(NAME#"mi") VR128:$src1, addr:$src2,
+ (CommuteVPCOMCC timm:$cc))>;
+}
+
+defm VPCOMB : xopvpcom<0xCC, "b", X86vpcom, v16i8, SchedWriteVecALU.XMM>;
+defm VPCOMW : xopvpcom<0xCD, "w", X86vpcom, v8i16, SchedWriteVecALU.XMM>;
+defm VPCOMD : xopvpcom<0xCE, "d", X86vpcom, v4i32, SchedWriteVecALU.XMM>;
+defm VPCOMQ : xopvpcom<0xCF, "q", X86vpcom, v2i64, SchedWriteVecALU.XMM>;
+defm VPCOMUB : xopvpcom<0xEC, "ub", X86vpcomu, v16i8, SchedWriteVecALU.XMM>;
+defm VPCOMUW : xopvpcom<0xED, "uw", X86vpcomu, v8i16, SchedWriteVecALU.XMM>;
+defm VPCOMUD : xopvpcom<0xEE, "ud", X86vpcomu, v4i32, SchedWriteVecALU.XMM>;
+defm VPCOMUQ : xopvpcom<0xEF, "uq", X86vpcomu, v2i64, SchedWriteVecALU.XMM>;
+
+multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType vt128, X86FoldableSchedWrite sched> {
+ def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
+ (vt128 VR128:$src3))))]>,
+ XOP_4V, Sched<[sched]>;
+ def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i128mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
+ (vt128 (load addr:$src3)))))]>,
+ XOP_4V, VEX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
+ def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (v16i8 (OpNode (vt128 VR128:$src1), (vt128 (load addr:$src2)),
+ (vt128 VR128:$src3))))]>,
+ XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold,
+ // i128mem:$src2
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // VR128:$src3
+ sched.ReadAfterFold]>;
+ // For disassembler
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, XOP_4V, VEX_W, Sched<[sched]>, FoldGenData<NAME#rrr>;
+}
+
+let ExeDomain = SSEPackedInt in {
+ defm VPPERM : xop4op<0xA3, "vpperm", X86vpperm, v16i8,
+ SchedWriteVarShuffle.XMM>;
+}
+
+// Instruction where either second or third source can be memory
+multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, ValueType VT,
+ X86FoldableSchedWrite sched> {
+ def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
+ (X86andnp RC:$src3, RC:$src2))))]>, XOP_4V,
+ Sched<[sched]>;
+ // FIXME: We can't write a pattern for this in tablegen.
+ let hasSideEffects = 0, mayLoad = 1 in
+ def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>,
+ XOP_4V, VEX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
+ def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
+ (X86andnp RC:$src3, (load addr:$src2)))))]>,
+ XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold,
+ // x86memop:$src2
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // RC:$src3
+ sched.ReadAfterFold]>;
+ // For disassembler
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, XOP_4V, VEX_W, Sched<[sched]>, FoldGenData<NAME#rrr>;
+}
+
+let ExeDomain = SSEPackedInt in {
+ defm VPCMOV : xop4op_int<0xA2, "vpcmov", VR128, i128mem, v2i64,
+ SchedWriteShuffle.XMM>;
+ defm VPCMOVY : xop4op_int<0xA2, "vpcmov", VR256, i256mem, v4i64,
+ SchedWriteShuffle.YMM>, VEX_L;
+}
+
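+// VPCMOV is a per-bit select: each result bit comes from $src1 where the
+// corresponding bit of $src3 is set and from $src2 where it is clear, i.e.
+// dst = (src1 & src3) | (src2 & ~src3), which is the
+// (or (and ...), (X86andnp ...)) form matched below for every integer
+// element type.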
+let Predicates = [HasXOP] in {
+ def : Pat<(v16i8 (or (and VR128:$src3, VR128:$src1),
+ (X86andnp VR128:$src3, VR128:$src2))),
+ (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+ def : Pat<(v8i16 (or (and VR128:$src3, VR128:$src1),
+ (X86andnp VR128:$src3, VR128:$src2))),
+ (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+ def : Pat<(v4i32 (or (and VR128:$src3, VR128:$src1),
+ (X86andnp VR128:$src3, VR128:$src2))),
+ (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+
+ def : Pat<(or (and VR128:$src3, VR128:$src1),
+ (X86andnp VR128:$src3, (loadv16i8 addr:$src2))),
+ (VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>;
+ def : Pat<(or (and VR128:$src3, VR128:$src1),
+ (X86andnp VR128:$src3, (loadv8i16 addr:$src2))),
+ (VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>;
+ def : Pat<(or (and VR128:$src3, VR128:$src1),
+ (X86andnp VR128:$src3, (loadv4i32 addr:$src2))),
+ (VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>;
+
+ def : Pat<(v32i8 (or (and VR256:$src3, VR256:$src1),
+ (X86andnp VR256:$src3, VR256:$src2))),
+ (VPCMOVYrrr VR256:$src1, VR256:$src2, VR256:$src3)>;
+ def : Pat<(v16i16 (or (and VR256:$src3, VR256:$src1),
+ (X86andnp VR256:$src3, VR256:$src2))),
+ (VPCMOVYrrr VR256:$src1, VR256:$src2, VR256:$src3)>;
+ def : Pat<(v8i32 (or (and VR256:$src3, VR256:$src1),
+ (X86andnp VR256:$src3, VR256:$src2))),
+ (VPCMOVYrrr VR256:$src1, VR256:$src2, VR256:$src3)>;
+
+ def : Pat<(or (and VR256:$src3, VR256:$src1),
+ (X86andnp VR256:$src3, (loadv32i8 addr:$src2))),
+ (VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>;
+ def : Pat<(or (and VR256:$src3, VR256:$src1),
+ (X86andnp VR256:$src3, (loadv16i16 addr:$src2))),
+ (VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>;
+ def : Pat<(or (and VR256:$src3, VR256:$src1),
+ (X86andnp VR256:$src3, (loadv8i32 addr:$src2))),
+ (VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>;
+}
+
+multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand intmemop, X86MemOperand fpmemop,
+ ValueType VT, PatFrag FPLdFrag, PatFrag IntLdFrag,
+ X86FoldableSchedWrite sched> {
+ def rr : IXOP5<Opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3, u4imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ [(set RC:$dst,
+ (VT (X86vpermil2 RC:$src1, RC:$src2, RC:$src3, (i8 timm:$src4))))]>,
+ Sched<[sched]>;
+ def rm : IXOP5<Opc, MRMSrcMemOp4, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, intmemop:$src3, u4imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ [(set RC:$dst,
+ (VT (X86vpermil2 RC:$src1, RC:$src2, (IntLdFrag addr:$src3),
+ (i8 timm:$src4))))]>, VEX_W,
+ Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
+ def mr : IXOP5<Opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, fpmemop:$src2, RC:$src3, u4imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ [(set RC:$dst,
+ (VT (X86vpermil2 RC:$src1, (FPLdFrag addr:$src2),
+ RC:$src3, (i8 timm:$src4))))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold,
+ // fpmemop:$src2
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ // RC:$src3
+ sched.ReadAfterFold]>;
+ // For disassembler
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rr_REV : IXOP5<Opc, MRMSrcRegOp4, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3, u4imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ []>, VEX_W, Sched<[sched]>, FoldGenData<NAME#rr>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+ defm VPERMIL2PD : xop_vpermil2<0x49, "vpermil2pd", VR128, i128mem, f128mem,
+ v2f64, loadv2f64, loadv2i64,
+ SchedWriteFVarShuffle.XMM>;
+ defm VPERMIL2PDY : xop_vpermil2<0x49, "vpermil2pd", VR256, i256mem, f256mem,
+ v4f64, loadv4f64, loadv4i64,
+ SchedWriteFVarShuffle.YMM>, VEX_L;
+}
+
+let ExeDomain = SSEPackedSingle in {
+ defm VPERMIL2PS : xop_vpermil2<0x48, "vpermil2ps", VR128, i128mem, f128mem,
+ v4f32, loadv4f32, loadv4i32,
+ SchedWriteFVarShuffle.XMM>;
+ defm VPERMIL2PSY : xop_vpermil2<0x48, "vpermil2ps", VR256, i256mem, f256mem,
+ v8f32, loadv8f32, loadv8i32,
+ SchedWriteFVarShuffle.YMM>, VEX_L;
+}
+
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstructionSelector.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InstructionSelector.cpp
new file mode 100644
index 000000000000..ff531713037c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstructionSelector.cpp
@@ -0,0 +1,1693 @@
+//===- X86InstructionSelector.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the InstructionSelector class for
+/// X86.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86RegisterBankInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LowLevelTypeImpl.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <tuple>
+
+#define DEBUG_TYPE "X86-isel"
+
+using namespace llvm;
+
+namespace {
+
+#define GET_GLOBALISEL_PREDICATE_BITSET
+#include "X86GenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATE_BITSET
+
+class X86InstructionSelector : public InstructionSelector {
+public:
+ X86InstructionSelector(const X86TargetMachine &TM, const X86Subtarget &STI,
+ const X86RegisterBankInfo &RBI);
+
+ bool select(MachineInstr &I) override;
+ static const char *getName() { return DEBUG_TYPE; }
+
+private:
+ /// tblgen-erated 'select' implementation, used as the initial selector for
+ /// the patterns that don't require complex C++.
+ bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
+
+ // TODO: remove after supported by Tablegen-erated instruction selection.
+ unsigned getLoadStoreOp(const LLT &Ty, const RegisterBank &RB, unsigned Opc,
+ Align Alignment) const;
+
+ bool selectLoadStoreOp(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectFrameIndexOrGep(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectConstant(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectTruncOrPtrToInt(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectZext(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectAnyext(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectCmp(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectFCmp(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectUadde(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF);
+ bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF);
+ bool selectInsert(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectExtract(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectCondBranch(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectTurnIntoCOPY(MachineInstr &I, MachineRegisterInfo &MRI,
+ const unsigned DstReg,
+ const TargetRegisterClass *DstRC,
+ const unsigned SrcReg,
+ const TargetRegisterClass *SrcRC) const;
+ bool materializeFP(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectImplicitDefOrPHI(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectDivRem(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectIntrinsicWSideEffects(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+
+ // emit insert subreg instruction and insert it before MachineInstr &I
+ bool emitInsertSubreg(unsigned DstReg, unsigned SrcReg, MachineInstr &I,
+ MachineRegisterInfo &MRI, MachineFunction &MF) const;
+ // emit extract subreg instruction and insert it before MachineInstr &I
+ bool emitExtractSubreg(unsigned DstReg, unsigned SrcReg, MachineInstr &I,
+ MachineRegisterInfo &MRI, MachineFunction &MF) const;
+
+ const TargetRegisterClass *getRegClass(LLT Ty, const RegisterBank &RB) const;
+ const TargetRegisterClass *getRegClass(LLT Ty, unsigned Reg,
+ MachineRegisterInfo &MRI) const;
+
+ const X86TargetMachine &TM;
+ const X86Subtarget &STI;
+ const X86InstrInfo &TII;
+ const X86RegisterInfo &TRI;
+ const X86RegisterBankInfo &RBI;
+
+#define GET_GLOBALISEL_PREDICATES_DECL
+#include "X86GenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_DECL
+
+#define GET_GLOBALISEL_TEMPORARIES_DECL
+#include "X86GenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_DECL
+};
+
+} // end anonymous namespace
+
+#define GET_GLOBALISEL_IMPL
+#include "X86GenGlobalISel.inc"
+#undef GET_GLOBALISEL_IMPL
+
+X86InstructionSelector::X86InstructionSelector(const X86TargetMachine &TM,
+ const X86Subtarget &STI,
+ const X86RegisterBankInfo &RBI)
+ : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()),
+ TRI(*STI.getRegisterInfo()), RBI(RBI),
+#define GET_GLOBALISEL_PREDICATES_INIT
+#include "X86GenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_INIT
+#define GET_GLOBALISEL_TEMPORARIES_INIT
+#include "X86GenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_INIT
+{
+}
+
+// FIXME: This should be target-independent, inferred from the types declared
+// for each class in the bank.
+const TargetRegisterClass *
+X86InstructionSelector::getRegClass(LLT Ty, const RegisterBank &RB) const {
+ if (RB.getID() == X86::GPRRegBankID) {
+ if (Ty.getSizeInBits() <= 8)
+ return &X86::GR8RegClass;
+ if (Ty.getSizeInBits() == 16)
+ return &X86::GR16RegClass;
+ if (Ty.getSizeInBits() == 32)
+ return &X86::GR32RegClass;
+ if (Ty.getSizeInBits() == 64)
+ return &X86::GR64RegClass;
+ }
+ if (RB.getID() == X86::VECRRegBankID) {
+ if (Ty.getSizeInBits() == 32)
+ return STI.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
+ if (Ty.getSizeInBits() == 64)
+ return STI.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
+ if (Ty.getSizeInBits() == 128)
+ return STI.hasAVX512() ? &X86::VR128XRegClass : &X86::VR128RegClass;
+ if (Ty.getSizeInBits() == 256)
+ return STI.hasAVX512() ? &X86::VR256XRegClass : &X86::VR256RegClass;
+ if (Ty.getSizeInBits() == 512)
+ return &X86::VR512RegClass;
+ }
+
+ llvm_unreachable("Unknown RegBank!");
+}
+
+const TargetRegisterClass *
+X86InstructionSelector::getRegClass(LLT Ty, unsigned Reg,
+ MachineRegisterInfo &MRI) const {
+ const RegisterBank &RegBank = *RBI.getRegBank(Reg, MRI, TRI);
+ return getRegClass(Ty, RegBank);
+}
+
+static unsigned getSubRegIndex(const TargetRegisterClass *RC) {
+ unsigned SubIdx = X86::NoSubRegister;
+ if (RC == &X86::GR32RegClass) {
+ SubIdx = X86::sub_32bit;
+ } else if (RC == &X86::GR16RegClass) {
+ SubIdx = X86::sub_16bit;
+ } else if (RC == &X86::GR8RegClass) {
+ SubIdx = X86::sub_8bit;
+ }
+
+ return SubIdx;
+}
+
+static const TargetRegisterClass *getRegClassFromGRPhysReg(Register Reg) {
+ assert(Reg.isPhysical());
+ if (X86::GR64RegClass.contains(Reg))
+ return &X86::GR64RegClass;
+ if (X86::GR32RegClass.contains(Reg))
+ return &X86::GR32RegClass;
+ if (X86::GR16RegClass.contains(Reg))
+ return &X86::GR16RegClass;
+ if (X86::GR8RegClass.contains(Reg))
+ return &X86::GR8RegClass;
+
+ llvm_unreachable("Unknown RegClass for PhysReg!");
+}
+
+// Set X86 Opcode and constrain DestReg.
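+// For example (a sketch of the widening case below, register names are
+// illustrative): ABI lowering can produce
+//   %0:gpr(s32) = ...
+//   $rax = COPY %0
+// where the physical destination is wider than the source; in that case a
+//   %ext:gr64 = SUBREG_TO_REG 0, %0, %subreg.sub_32bit
+// is inserted and the COPY is rewritten to copy %ext instead.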
+bool X86InstructionSelector::selectCopy(MachineInstr &I,
+ MachineRegisterInfo &MRI) const {
+ Register DstReg = I.getOperand(0).getReg();
+ const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
+ const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
+
+ Register SrcReg = I.getOperand(1).getReg();
+ const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
+ const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
+
+ if (DstReg.isPhysical()) {
+ assert(I.isCopy() && "Generic operators do not allow physical registers");
+
+ if (DstSize > SrcSize && SrcRegBank.getID() == X86::GPRRegBankID &&
+ DstRegBank.getID() == X86::GPRRegBankID) {
+
+ const TargetRegisterClass *SrcRC =
+ getRegClass(MRI.getType(SrcReg), SrcRegBank);
+ const TargetRegisterClass *DstRC = getRegClassFromGRPhysReg(DstReg);
+
+ if (SrcRC != DstRC) {
+ // This case can be generated by ABI lowering; perform an anyext.
+ Register ExtSrc = MRI.createVirtualRegister(DstRC);
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::SUBREG_TO_REG))
+ .addDef(ExtSrc)
+ .addImm(0)
+ .addReg(SrcReg)
+ .addImm(getSubRegIndex(SrcRC));
+
+ I.getOperand(1).setReg(ExtSrc);
+ }
+ }
+
+ return true;
+ }
+
+ assert((!SrcReg.isPhysical() || I.isCopy()) &&
+ "No phys reg on generic operators");
+ assert((DstSize == SrcSize ||
+ // Copies are a means to set up the initial types; the number of
+ // bits may not exactly match.
+ (SrcReg.isPhysical() &&
+ DstSize <= RBI.getSizeInBits(SrcReg, MRI, TRI))) &&
+ "Copy with different width?!");
+
+ const TargetRegisterClass *DstRC =
+ getRegClass(MRI.getType(DstReg), DstRegBank);
+
+ if (SrcRegBank.getID() == X86::GPRRegBankID &&
+ DstRegBank.getID() == X86::GPRRegBankID && SrcSize > DstSize &&
+ SrcReg.isPhysical()) {
+ // Change the physical register to perform the truncate.
+
+ const TargetRegisterClass *SrcRC = getRegClassFromGRPhysReg(SrcReg);
+
+ if (DstRC != SrcRC) {
+ I.getOperand(1).setSubReg(getSubRegIndex(DstRC));
+ I.getOperand(1).substPhysReg(SrcReg, TRI);
+ }
+ }
+
+ // No need to constrain SrcReg. It will get constrained when
+ // we hit another of its uses or its defs.
+ // Copies do not have constraints.
+ const TargetRegisterClass *OldRC = MRI.getRegClassOrNull(DstReg);
+ if (!OldRC || !DstRC->hasSubClassEq(OldRC)) {
+ if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
+ return false;
+ }
+ }
+ I.setDesc(TII.get(X86::COPY));
+ return true;
+}
+
+bool X86InstructionSelector::select(MachineInstr &I) {
+ assert(I.getParent() && "Instruction should be in a basic block!");
+ assert(I.getParent()->getParent() && "Instruction should be in a function!");
+
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ unsigned Opcode = I.getOpcode();
+ if (!isPreISelGenericOpcode(Opcode)) {
+ // Certain non-generic instructions also need some special handling.
+
+ if (Opcode == TargetOpcode::LOAD_STACK_GUARD)
+ return false;
+
+ if (I.isCopy())
+ return selectCopy(I, MRI);
+
+ return true;
+ }
+
+ assert(I.getNumOperands() == I.getNumExplicitOperands() &&
+ "Generic instruction has unexpected implicit operands\n");
+
+ if (selectImpl(I, *CoverageInfo))
+ return true;
+
+ LLVM_DEBUG(dbgs() << " C++ instruction selection: "; I.print(dbgs()));
+
+ // TODO: This should be implemented by tblgen.
+ switch (I.getOpcode()) {
+ default:
+ return false;
+ case TargetOpcode::G_STORE:
+ case TargetOpcode::G_LOAD:
+ return selectLoadStoreOp(I, MRI, MF);
+ case TargetOpcode::G_PTR_ADD:
+ case TargetOpcode::G_FRAME_INDEX:
+ return selectFrameIndexOrGep(I, MRI, MF);
+ case TargetOpcode::G_GLOBAL_VALUE:
+ return selectGlobalValue(I, MRI, MF);
+ case TargetOpcode::G_CONSTANT:
+ return selectConstant(I, MRI, MF);
+ case TargetOpcode::G_FCONSTANT:
+ return materializeFP(I, MRI, MF);
+ case TargetOpcode::G_PTRTOINT:
+ case TargetOpcode::G_TRUNC:
+ return selectTruncOrPtrToInt(I, MRI, MF);
+ case TargetOpcode::G_INTTOPTR:
+ return selectCopy(I, MRI);
+ case TargetOpcode::G_ZEXT:
+ return selectZext(I, MRI, MF);
+ case TargetOpcode::G_ANYEXT:
+ return selectAnyext(I, MRI, MF);
+ case TargetOpcode::G_ICMP:
+ return selectCmp(I, MRI, MF);
+ case TargetOpcode::G_FCMP:
+ return selectFCmp(I, MRI, MF);
+ case TargetOpcode::G_UADDE:
+ return selectUadde(I, MRI, MF);
+ case TargetOpcode::G_UNMERGE_VALUES:
+ return selectUnmergeValues(I, MRI, MF);
+ case TargetOpcode::G_MERGE_VALUES:
+ case TargetOpcode::G_CONCAT_VECTORS:
+ return selectMergeValues(I, MRI, MF);
+ case TargetOpcode::G_EXTRACT:
+ return selectExtract(I, MRI, MF);
+ case TargetOpcode::G_INSERT:
+ return selectInsert(I, MRI, MF);
+ case TargetOpcode::G_BRCOND:
+ return selectCondBranch(I, MRI, MF);
+ case TargetOpcode::G_IMPLICIT_DEF:
+ case TargetOpcode::G_PHI:
+ return selectImplicitDefOrPHI(I, MRI);
+ case TargetOpcode::G_SDIV:
+ case TargetOpcode::G_UDIV:
+ case TargetOpcode::G_SREM:
+ case TargetOpcode::G_UREM:
+ return selectDivRem(I, MRI, MF);
+ case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
+ return selectIntrinsicWSideEffects(I, MRI, MF);
+ }
+
+ return false;
+}
+
+unsigned X86InstructionSelector::getLoadStoreOp(const LLT &Ty,
+ const RegisterBank &RB,
+ unsigned Opc,
+ Align Alignment) const {
+ bool Isload = (Opc == TargetOpcode::G_LOAD);
+ bool HasAVX = STI.hasAVX();
+ bool HasAVX512 = STI.hasAVX512();
+ bool HasVLX = STI.hasVLX();
+
+ if (Ty == LLT::scalar(8)) {
+ if (X86::GPRRegBankID == RB.getID())
+ return Isload ? X86::MOV8rm : X86::MOV8mr;
+ } else if (Ty == LLT::scalar(16)) {
+ if (X86::GPRRegBankID == RB.getID())
+ return Isload ? X86::MOV16rm : X86::MOV16mr;
+ } else if (Ty == LLT::scalar(32) || Ty == LLT::pointer(0, 32)) {
+ if (X86::GPRRegBankID == RB.getID())
+ return Isload ? X86::MOV32rm : X86::MOV32mr;
+ if (X86::VECRRegBankID == RB.getID())
+ return Isload ? (HasAVX512 ? X86::VMOVSSZrm_alt :
+ HasAVX ? X86::VMOVSSrm_alt :
+ X86::MOVSSrm_alt)
+ : (HasAVX512 ? X86::VMOVSSZmr :
+ HasAVX ? X86::VMOVSSmr :
+ X86::MOVSSmr);
+ } else if (Ty == LLT::scalar(64) || Ty == LLT::pointer(0, 64)) {
+ if (X86::GPRRegBankID == RB.getID())
+ return Isload ? X86::MOV64rm : X86::MOV64mr;
+ if (X86::VECRRegBankID == RB.getID())
+ return Isload ? (HasAVX512 ? X86::VMOVSDZrm_alt :
+ HasAVX ? X86::VMOVSDrm_alt :
+ X86::MOVSDrm_alt)
+ : (HasAVX512 ? X86::VMOVSDZmr :
+ HasAVX ? X86::VMOVSDmr :
+ X86::MOVSDmr);
+ } else if (Ty.isVector() && Ty.getSizeInBits() == 128) {
+ if (Alignment >= Align(16))
+ return Isload ? (HasVLX ? X86::VMOVAPSZ128rm
+ : HasAVX512
+ ? X86::VMOVAPSZ128rm_NOVLX
+ : HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm)
+ : (HasVLX ? X86::VMOVAPSZ128mr
+ : HasAVX512
+ ? X86::VMOVAPSZ128mr_NOVLX
+ : HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr);
+ else
+ return Isload ? (HasVLX ? X86::VMOVUPSZ128rm
+ : HasAVX512
+ ? X86::VMOVUPSZ128rm_NOVLX
+ : HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm)
+ : (HasVLX ? X86::VMOVUPSZ128mr
+ : HasAVX512
+ ? X86::VMOVUPSZ128mr_NOVLX
+ : HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
+ } else if (Ty.isVector() && Ty.getSizeInBits() == 256) {
+ if (Alignment >= Align(32))
+ return Isload ? (HasVLX ? X86::VMOVAPSZ256rm
+ : HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX
+ : X86::VMOVAPSYrm)
+ : (HasVLX ? X86::VMOVAPSZ256mr
+ : HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX
+ : X86::VMOVAPSYmr);
+ else
+ return Isload ? (HasVLX ? X86::VMOVUPSZ256rm
+ : HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX
+ : X86::VMOVUPSYrm)
+ : (HasVLX ? X86::VMOVUPSZ256mr
+ : HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX
+ : X86::VMOVUPSYmr);
+ } else if (Ty.isVector() && Ty.getSizeInBits() == 512) {
+ if (Alignment >= Align(64))
+ return Isload ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
+ else
+ return Isload ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
+ }
+ return Opc;
+}
+
+// Fill in an address from the given instruction.
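+// For example (a sketch, vreg names illustrative): for
+//   %ptr:gpr(p0) = G_PTR_ADD %base, %cst   ; %cst = G_CONSTANT i64 16
+// this fills AM with Base.Reg = %base and Disp = 16, a G_FRAME_INDEX becomes
+// a FrameIndexBase address, and anything else just uses the pointer vreg
+// itself as the base register.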
+static void X86SelectAddress(const MachineInstr &I,
+ const MachineRegisterInfo &MRI,
+ X86AddressMode &AM) {
+ assert(I.getOperand(0).isReg() && "unsupported operand.");
+ assert(MRI.getType(I.getOperand(0).getReg()).isPointer() &&
+ "unsupported type.");
+
+ if (I.getOpcode() == TargetOpcode::G_PTR_ADD) {
+ if (auto COff = getConstantVRegSExtVal(I.getOperand(2).getReg(), MRI)) {
+ int64_t Imm = *COff;
+ if (isInt<32>(Imm)) { // Check for displacement overflow.
+ AM.Disp = static_cast<int32_t>(Imm);
+ AM.Base.Reg = I.getOperand(1).getReg();
+ return;
+ }
+ }
+ } else if (I.getOpcode() == TargetOpcode::G_FRAME_INDEX) {
+ AM.Base.FrameIndex = I.getOperand(1).getIndex();
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ return;
+ }
+
+ // Default behavior.
+ AM.Base.Reg = I.getOperand(0).getReg();
+}
+
+bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ unsigned Opc = I.getOpcode();
+
+ assert((Opc == TargetOpcode::G_STORE || Opc == TargetOpcode::G_LOAD) &&
+ "unexpected instruction");
+
+ const Register DefReg = I.getOperand(0).getReg();
+ LLT Ty = MRI.getType(DefReg);
+ const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
+
+ assert(I.hasOneMemOperand());
+ auto &MemOp = **I.memoperands_begin();
+ if (MemOp.isAtomic()) {
+ // Note: for unordered operations, we rely on the fact the appropriate MMO
+ // is already on the instruction we're mutating, and thus we don't need to
+ // make any changes. So long as we select an opcode which is capable of
+ // loading or storing the appropriate size atomically, the rest of the
+ // backend is required to respect the MMO state.
+ if (!MemOp.isUnordered()) {
+ LLVM_DEBUG(dbgs() << "Atomic ordering not supported yet\n");
+ return false;
+ }
+ if (MemOp.getAlign() < Ty.getSizeInBits() / 8) {
+ LLVM_DEBUG(dbgs() << "Unaligned atomics not supported yet\n");
+ return false;
+ }
+ }
+
+ unsigned NewOpc = getLoadStoreOp(Ty, RB, Opc, MemOp.getAlign());
+ if (NewOpc == Opc)
+ return false;
+
+ X86AddressMode AM;
+ X86SelectAddress(*MRI.getVRegDef(I.getOperand(1).getReg()), MRI, AM);
+
+ I.setDesc(TII.get(NewOpc));
+ MachineInstrBuilder MIB(MF, I);
+ if (Opc == TargetOpcode::G_LOAD) {
+ I.RemoveOperand(1);
+ addFullAddress(MIB, AM);
+ } else {
+ // G_STORE is (VAL, Addr); the X86 store instruction expects (Addr, VAL).
+ I.RemoveOperand(1);
+ I.RemoveOperand(0);
+ addFullAddress(MIB, AM).addUse(DefReg);
+ }
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
+static unsigned getLeaOP(LLT Ty, const X86Subtarget &STI) {
+ if (Ty == LLT::pointer(0, 64))
+ return X86::LEA64r;
+ else if (Ty == LLT::pointer(0, 32))
+ return STI.isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r;
+ else
+ llvm_unreachable("Can't get LEA opcode. Unsupported type.");
+}
+
+bool X86InstructionSelector::selectFrameIndexOrGep(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ unsigned Opc = I.getOpcode();
+
+ assert((Opc == TargetOpcode::G_FRAME_INDEX || Opc == TargetOpcode::G_PTR_ADD) &&
+ "unexpected instruction");
+
+ const Register DefReg = I.getOperand(0).getReg();
+ LLT Ty = MRI.getType(DefReg);
+
+ // Use LEA to calculate frame index and GEP
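+ // The LEA memory operands are appended in the order Base, Scale, Index,
+ // Disp, Segment: for G_FRAME_INDEX the frame index stays as the base and
+ // addOffset fills in the rest with a zero displacement; for G_PTR_ADD the
+ // first source stays as the base and the second source is moved into the
+ // index slot with scale 1.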
+ unsigned NewOpc = getLeaOP(Ty, STI);
+ I.setDesc(TII.get(NewOpc));
+ MachineInstrBuilder MIB(MF, I);
+
+ if (Opc == TargetOpcode::G_FRAME_INDEX) {
+ addOffset(MIB, 0);
+ } else {
+ MachineOperand &InxOp = I.getOperand(2);
+ I.addOperand(InxOp); // set IndexReg
+ InxOp.ChangeToImmediate(1); // set Scale
+ MIB.addImm(0).addReg(0);
+ }
+
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
+bool X86InstructionSelector::selectGlobalValue(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ assert((I.getOpcode() == TargetOpcode::G_GLOBAL_VALUE) &&
+ "unexpected instruction");
+
+ auto GV = I.getOperand(1).getGlobal();
+ if (GV->isThreadLocal()) {
+ return false; // TODO: we don't support TLS yet.
+ }
+
+ // Can't handle alternate code models yet.
+ if (TM.getCodeModel() != CodeModel::Small)
+ return false;
+
+ X86AddressMode AM;
+ AM.GV = GV;
+ AM.GVOpFlags = STI.classifyGlobalReference(GV);
+
+ // TODO: The ABI requires an extra load; not supported yet.
+ if (isGlobalStubReference(AM.GVOpFlags))
+ return false;
+
+ // TODO: This reference is relative to the PIC base; not supported yet.
+ if (isGlobalRelativeToPICBase(AM.GVOpFlags))
+ return false;
+
+ if (STI.isPICStyleRIPRel()) {
+ // Use rip-relative addressing.
+ assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+ AM.Base.Reg = X86::RIP;
+ }
+
+ const Register DefReg = I.getOperand(0).getReg();
+ LLT Ty = MRI.getType(DefReg);
+ unsigned NewOpc = getLeaOP(Ty, STI);
+
+ I.setDesc(TII.get(NewOpc));
+ MachineInstrBuilder MIB(MF, I);
+
+ I.RemoveOperand(1);
+ addFullAddress(MIB, AM);
+
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
+bool X86InstructionSelector::selectConstant(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ assert((I.getOpcode() == TargetOpcode::G_CONSTANT) &&
+ "unexpected instruction");
+
+ const Register DefReg = I.getOperand(0).getReg();
+ LLT Ty = MRI.getType(DefReg);
+
+ if (RBI.getRegBank(DefReg, MRI, TRI)->getID() != X86::GPRRegBankID)
+ return false;
+
+ uint64_t Val = 0;
+ if (I.getOperand(1).isCImm()) {
+ Val = I.getOperand(1).getCImm()->getZExtValue();
+ I.getOperand(1).ChangeToImmediate(Val);
+ } else if (I.getOperand(1).isImm()) {
+ Val = I.getOperand(1).getImm();
+ } else
+ llvm_unreachable("Unsupported operand type.");
+
+ unsigned NewOpc;
+ switch (Ty.getSizeInBits()) {
+ case 8:
+ NewOpc = X86::MOV8ri;
+ break;
+ case 16:
+ NewOpc = X86::MOV16ri;
+ break;
+ case 32:
+ NewOpc = X86::MOV32ri;
+ break;
+ case 64:
+ // TODO: in case isUInt<32>(Val), X86::MOV32ri can be used
+ if (isInt<32>(Val))
+ NewOpc = X86::MOV64ri32;
+ else
+ NewOpc = X86::MOV64ri;
+ break;
+ default:
+ llvm_unreachable("Can't select G_CONSTANT, unsupported type.");
+ }
+
+ I.setDesc(TII.get(NewOpc));
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
+// Helper function for selectTruncOrPtrToInt and selectAnyext.
+// Returns true if DstRC is a scalar floating-point register class and
+// SrcRC is a 128-bit vector register class.
+static bool canTurnIntoCOPY(const TargetRegisterClass *DstRC,
+ const TargetRegisterClass *SrcRC) {
+ return (DstRC == &X86::FR32RegClass || DstRC == &X86::FR32XRegClass ||
+ DstRC == &X86::FR64RegClass || DstRC == &X86::FR64XRegClass) &&
+ (SrcRC == &X86::VR128RegClass || SrcRC == &X86::VR128XRegClass);
+}
+
+bool X86InstructionSelector::selectTurnIntoCOPY(
+ MachineInstr &I, MachineRegisterInfo &MRI, const unsigned DstReg,
+ const TargetRegisterClass *DstRC, const unsigned SrcReg,
+ const TargetRegisterClass *SrcRC) const {
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
+ return false;
+ }
+ I.setDesc(TII.get(X86::COPY));
+ return true;
+}
+
+bool X86InstructionSelector::selectTruncOrPtrToInt(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ assert((I.getOpcode() == TargetOpcode::G_TRUNC ||
+ I.getOpcode() == TargetOpcode::G_PTRTOINT) &&
+ "unexpected instruction");
+
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register SrcReg = I.getOperand(1).getReg();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+
+ const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
+ const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
+
+ if (DstRB.getID() != SrcRB.getID()) {
+ LLVM_DEBUG(dbgs() << TII.getName(I.getOpcode())
+ << " input/output on different banks\n");
+ return false;
+ }
+
+ const TargetRegisterClass *DstRC = getRegClass(DstTy, DstRB);
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcRB);
+
+ if (!DstRC || !SrcRC)
+ return false;
+
+ // If this is a truncation of a value that lives in a vector register class
+ // and goes into a floating-point class, just replace it with a copy, as we
+ // are able to select it as a regular move.
+ if (canTurnIntoCOPY(DstRC, SrcRC))
+ return selectTurnIntoCOPY(I, MRI, DstReg, DstRC, SrcReg, SrcRC);
+
+ if (DstRB.getID() != X86::GPRRegBankID)
+ return false;
+
+ unsigned SubIdx;
+ if (DstRC == SrcRC) {
+ // Nothing to be done
+ SubIdx = X86::NoSubRegister;
+ } else if (DstRC == &X86::GR32RegClass) {
+ SubIdx = X86::sub_32bit;
+ } else if (DstRC == &X86::GR16RegClass) {
+ SubIdx = X86::sub_16bit;
+ } else if (DstRC == &X86::GR8RegClass) {
+ SubIdx = X86::sub_8bit;
+ } else {
+ return false;
+ }
+
+ SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubIdx);
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << "\n");
+ return false;
+ }
+
+ I.getOperand(1).setSubReg(SubIdx);
+
+ I.setDesc(TII.get(X86::COPY));
+ return true;
+}
+
+bool X86InstructionSelector::selectZext(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ assert((I.getOpcode() == TargetOpcode::G_ZEXT) && "unexpected instruction");
+
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register SrcReg = I.getOperand(1).getReg();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+
+ assert(!(SrcTy == LLT::scalar(8) && DstTy == LLT::scalar(16)) &&
+ "8=>16 Zext is handled by tablegen");
+ assert(!(SrcTy == LLT::scalar(8) && DstTy == LLT::scalar(32)) &&
+ "8=>32 Zext is handled by tablegen");
+ assert(!(SrcTy == LLT::scalar(16) && DstTy == LLT::scalar(32)) &&
+ "16=>32 Zext is handled by tablegen");
+ assert(!(SrcTy == LLT::scalar(8) && DstTy == LLT::scalar(64)) &&
+ "8=>64 Zext is handled by tablegen");
+ assert(!(SrcTy == LLT::scalar(16) && DstTy == LLT::scalar(64)) &&
+ "16=>64 Zext is handled by tablegen");
+ assert(!(SrcTy == LLT::scalar(32) && DstTy == LLT::scalar(64)) &&
+ "32=>64 Zext is handled by tablegen");
+
+ if (SrcTy != LLT::scalar(1))
+ return false;
+
+ unsigned AndOpc;
+ if (DstTy == LLT::scalar(8))
+ AndOpc = X86::AND8ri;
+ else if (DstTy == LLT::scalar(16))
+ AndOpc = X86::AND16ri8;
+ else if (DstTy == LLT::scalar(32))
+ AndOpc = X86::AND32ri8;
+ else if (DstTy == LLT::scalar(64))
+ AndOpc = X86::AND64ri8;
+ else
+ return false;
+
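+ // A sketch of the sequence emitted below for, e.g., G_ZEXT s1 -> s32
+ // (vreg names illustrative):
+ //   %imp:gr32  = IMPLICIT_DEF
+ //   %full:gr32 = INSERT_SUBREG %imp, %src, %subreg.sub_8bit
+ //   %dst:gr32  = AND32ri8 %full, 1
+ // For an s8 destination the INSERT_SUBREG step is skipped and the AND is
+ // applied to the source register directly.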
+ Register DefReg = SrcReg;
+ if (DstTy != LLT::scalar(8)) {
+ Register ImpDefReg =
+ MRI.createVirtualRegister(getRegClass(DstTy, DstReg, MRI));
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::IMPLICIT_DEF), ImpDefReg);
+
+ DefReg = MRI.createVirtualRegister(getRegClass(DstTy, DstReg, MRI));
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::INSERT_SUBREG), DefReg)
+ .addReg(ImpDefReg)
+ .addReg(SrcReg)
+ .addImm(X86::sub_8bit);
+ }
+
+ MachineInstr &AndInst =
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AndOpc), DstReg)
+ .addReg(DefReg)
+ .addImm(1);
+
+ constrainSelectedInstRegOperands(AndInst, TII, TRI, RBI);
+
+ I.eraseFromParent();
+ return true;
+}
+
+bool X86InstructionSelector::selectAnyext(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ assert((I.getOpcode() == TargetOpcode::G_ANYEXT) && "unexpected instruction");
+
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register SrcReg = I.getOperand(1).getReg();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+
+ const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
+ const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
+
+ assert(DstRB.getID() == SrcRB.getID() &&
+ "G_ANYEXT input/output on different banks\n");
+
+ assert(DstTy.getSizeInBits() > SrcTy.getSizeInBits() &&
+ "G_ANYEXT incorrect operand size");
+
+ const TargetRegisterClass *DstRC = getRegClass(DstTy, DstRB);
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcRB);
+
+ // If this is an ANY_EXT of a value that lives in a floating-point register
+ // class and goes into a vector class, just replace it with a copy, as we
+ // are able to select it as a regular move.
+ if (canTurnIntoCOPY(SrcRC, DstRC))
+ return selectTurnIntoCOPY(I, MRI, SrcReg, SrcRC, DstReg, DstRC);
+
+ if (DstRB.getID() != X86::GPRRegBankID)
+ return false;
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
+ return false;
+ }
+
+ if (SrcRC == DstRC) {
+ I.setDesc(TII.get(X86::COPY));
+ return true;
+ }
+
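+ // Otherwise widen with SUBREG_TO_REG; e.g. (a sketch, vreg names
+ // illustrative) for G_ANYEXT s32 -> s64 this emits
+ //   %dst:gr64 = SUBREG_TO_REG 0, %src:gr32, %subreg.sub_32bit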
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::SUBREG_TO_REG))
+ .addDef(DstReg)
+ .addImm(0)
+ .addReg(SrcReg)
+ .addImm(getSubRegIndex(SrcRC));
+
+ I.eraseFromParent();
+ return true;
+}
+
+bool X86InstructionSelector::selectCmp(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ assert((I.getOpcode() == TargetOpcode::G_ICMP) && "unexpected instruction");
+
+ X86::CondCode CC;
+ bool SwapArgs;
+ std::tie(CC, SwapArgs) = X86::getX86ConditionCode(
+ (CmpInst::Predicate)I.getOperand(1).getPredicate());
+
+ Register LHS = I.getOperand(2).getReg();
+ Register RHS = I.getOperand(3).getReg();
+
+ if (SwapArgs)
+ std::swap(LHS, RHS);
+
+ unsigned OpCmp;
+ LLT Ty = MRI.getType(LHS);
+
+ switch (Ty.getSizeInBits()) {
+ default:
+ return false;
+ case 8:
+ OpCmp = X86::CMP8rr;
+ break;
+ case 16:
+ OpCmp = X86::CMP16rr;
+ break;
+ case 32:
+ OpCmp = X86::CMP32rr;
+ break;
+ case 64:
+ OpCmp = X86::CMP64rr;
+ break;
+ }
+
+ MachineInstr &CmpInst =
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpCmp))
+ .addReg(LHS)
+ .addReg(RHS);
+
+ MachineInstr &SetInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(X86::SETCCr), I.getOperand(0).getReg()).addImm(CC);
+
+ constrainSelectedInstRegOperands(CmpInst, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(SetInst, TII, TRI, RBI);
+
+ I.eraseFromParent();
+ return true;
+}
+
+bool X86InstructionSelector::selectFCmp(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ assert((I.getOpcode() == TargetOpcode::G_FCMP) && "unexpected instruction");
+
+ Register LhsReg = I.getOperand(2).getReg();
+ Register RhsReg = I.getOperand(3).getReg();
+ CmpInst::Predicate Predicate =
+ (CmpInst::Predicate)I.getOperand(1).getPredicate();
+
+ // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
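+ // UCOMISS/UCOMISD set ZF=PF=CF=1 for an unordered result and ZF=1, PF=0 for
+ // an ordered-equal one, so OEQ is (ZF && !PF) -> SETE & SETNP and UNE is
+ // (!ZF || PF) -> SETNE | SETP, which is what the table below encodes.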
+ static const uint16_t SETFOpcTable[2][3] = {
+ {X86::COND_E, X86::COND_NP, X86::AND8rr},
+ {X86::COND_NE, X86::COND_P, X86::OR8rr}};
+ const uint16_t *SETFOpc = nullptr;
+ switch (Predicate) {
+ default:
+ break;
+ case CmpInst::FCMP_OEQ:
+ SETFOpc = &SETFOpcTable[0][0];
+ break;
+ case CmpInst::FCMP_UNE:
+ SETFOpc = &SETFOpcTable[1][0];
+ break;
+ }
+
+ // Compute the opcode for the CMP instruction.
+ unsigned OpCmp;
+ LLT Ty = MRI.getType(LhsReg);
+ switch (Ty.getSizeInBits()) {
+ default:
+ return false;
+ case 32:
+ OpCmp = X86::UCOMISSrr;
+ break;
+ case 64:
+ OpCmp = X86::UCOMISDrr;
+ break;
+ }
+
+ Register ResultReg = I.getOperand(0).getReg();
+ RBI.constrainGenericRegister(
+ ResultReg,
+ *getRegClass(LLT::scalar(8), *RBI.getRegBank(ResultReg, MRI, TRI)), MRI);
+ if (SETFOpc) {
+ MachineInstr &CmpInst =
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpCmp))
+ .addReg(LhsReg)
+ .addReg(RhsReg);
+
+ Register FlagReg1 = MRI.createVirtualRegister(&X86::GR8RegClass);
+ Register FlagReg2 = MRI.createVirtualRegister(&X86::GR8RegClass);
+ MachineInstr &Set1 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(X86::SETCCr), FlagReg1).addImm(SETFOpc[0]);
+ MachineInstr &Set2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(X86::SETCCr), FlagReg2).addImm(SETFOpc[1]);
+ MachineInstr &Set3 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(SETFOpc[2]), ResultReg)
+ .addReg(FlagReg1)
+ .addReg(FlagReg2);
+ constrainSelectedInstRegOperands(CmpInst, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(Set1, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(Set2, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(Set3, TII, TRI, RBI);
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ X86::CondCode CC;
+ bool SwapArgs;
+ std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
+ assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
+
+ if (SwapArgs)
+ std::swap(LhsReg, RhsReg);
+
+ // Emit a compare of LHS/RHS.
+ MachineInstr &CmpInst =
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpCmp))
+ .addReg(LhsReg)
+ .addReg(RhsReg);
+
+ MachineInstr &Set =
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::SETCCr), ResultReg).addImm(CC);
+ constrainSelectedInstRegOperands(CmpInst, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(Set, TII, TRI, RBI);
+ I.eraseFromParent();
+ return true;
+}
+
+bool X86InstructionSelector::selectUadde(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ assert((I.getOpcode() == TargetOpcode::G_UADDE) && "unexpected instruction");
+
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register CarryOutReg = I.getOperand(1).getReg();
+ const Register Op0Reg = I.getOperand(2).getReg();
+ const Register Op1Reg = I.getOperand(3).getReg();
+ Register CarryInReg = I.getOperand(4).getReg();
+
+ const LLT DstTy = MRI.getType(DstReg);
+
+ if (DstTy != LLT::scalar(32))
+ return false;
+
+ // Find the instruction that defines CarryIn.
+ MachineInstr *Def = MRI.getVRegDef(CarryInReg);
+ while (Def->getOpcode() == TargetOpcode::G_TRUNC) {
+ CarryInReg = Def->getOperand(1).getReg();
+ Def = MRI.getVRegDef(CarryInReg);
+ }
+
+ unsigned Opcode;
+ if (Def->getOpcode() == TargetOpcode::G_UADDE) {
+ // The carry was set by the previous ADD.
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY), X86::EFLAGS)
+ .addReg(CarryInReg);
+
+ if (!RBI.constrainGenericRegister(CarryInReg, X86::GR32RegClass, MRI))
+ return false;
+
+ Opcode = X86::ADC32rr;
+ } else if (auto val = getConstantVRegVal(CarryInReg, MRI)) {
+ // The carry is a constant; only 0 is supported.
+ if (*val != 0)
+ return false;
+
+ Opcode = X86::ADD32rr;
+ } else
+ return false;
+
+ MachineInstr &AddInst =
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode), DstReg)
+ .addReg(Op0Reg)
+ .addReg(Op1Reg);
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY), CarryOutReg)
+ .addReg(X86::EFLAGS);
+
+ if (!constrainSelectedInstRegOperands(AddInst, TII, TRI, RBI) ||
+ !RBI.constrainGenericRegister(CarryOutReg, X86::GR32RegClass, MRI))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+}
+
+bool X86InstructionSelector::selectExtract(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ assert((I.getOpcode() == TargetOpcode::G_EXTRACT) &&
+ "unexpected instruction");
+
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register SrcReg = I.getOperand(1).getReg();
+ int64_t Index = I.getOperand(2).getImm();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+
+ // Meanwhile, handle only vector types.
+ if (!DstTy.isVector())
+ return false;
+
+ if (Index % DstTy.getSizeInBits() != 0)
+ return false; // Not a subvector extract.
+
+ if (Index == 0) {
+ // Replace by extract subreg copy.
+ if (!emitExtractSubreg(DstReg, SrcReg, I, MRI, MF))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ bool HasAVX = STI.hasAVX();
+ bool HasAVX512 = STI.hasAVX512();
+ bool HasVLX = STI.hasVLX();
+
+ if (SrcTy.getSizeInBits() == 256 && DstTy.getSizeInBits() == 128) {
+ if (HasVLX)
+ I.setDesc(TII.get(X86::VEXTRACTF32x4Z256rr));
+ else if (HasAVX)
+ I.setDesc(TII.get(X86::VEXTRACTF128rr));
+ else
+ return false;
+ } else if (SrcTy.getSizeInBits() == 512 && HasAVX512) {
+ if (DstTy.getSizeInBits() == 128)
+ I.setDesc(TII.get(X86::VEXTRACTF32x4Zrr));
+ else if (DstTy.getSizeInBits() == 256)
+ I.setDesc(TII.get(X86::VEXTRACTF64x4Zrr));
+ else
+ return false;
+ } else
+ return false;
+
+ // Convert to X86 VEXTRACT immediate.
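+ // e.g. extracting a 128-bit destination at bit offset 256 of a 512-bit
+ // source becomes lane index 256 / 128 = 2.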
+ Index = Index / DstTy.getSizeInBits();
+ I.getOperand(2).setImm(Index);
+
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
+bool X86InstructionSelector::emitExtractSubreg(unsigned DstReg, unsigned SrcReg,
+ MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+ unsigned SubIdx = X86::NoSubRegister;
+
+ if (!DstTy.isVector() || !SrcTy.isVector())
+ return false;
+
+ assert(SrcTy.getSizeInBits() > DstTy.getSizeInBits() &&
+ "Incorrect Src/Dst register size");
+
+ if (DstTy.getSizeInBits() == 128)
+ SubIdx = X86::sub_xmm;
+ else if (DstTy.getSizeInBits() == 256)
+ SubIdx = X86::sub_ymm;
+ else
+ return false;
+
+ const TargetRegisterClass *DstRC = getRegClass(DstTy, DstReg, MRI);
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcReg, MRI);
+
+ SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubIdx);
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain EXTRACT_SUBREG\n");
+ return false;
+ }
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY), DstReg)
+ .addReg(SrcReg, 0, SubIdx);
+
+ return true;
+}
+
+bool X86InstructionSelector::emitInsertSubreg(unsigned DstReg, unsigned SrcReg,
+ MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+ unsigned SubIdx = X86::NoSubRegister;
+
+ // TODO: support scalar types
+ if (!DstTy.isVector() || !SrcTy.isVector())
+ return false;
+
+ assert(SrcTy.getSizeInBits() < DstTy.getSizeInBits() &&
+ "Incorrect Src/Dst register size");
+
+ if (SrcTy.getSizeInBits() == 128)
+ SubIdx = X86::sub_xmm;
+ else if (SrcTy.getSizeInBits() == 256)
+ SubIdx = X86::sub_ymm;
+ else
+ return false;
+
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcReg, MRI);
+ const TargetRegisterClass *DstRC = getRegClass(DstTy, DstReg, MRI);
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain INSERT_SUBREG\n");
+ return false;
+ }
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY))
+ .addReg(DstReg, RegState::DefineNoRead, SubIdx)
+ .addReg(SrcReg);
+
+ return true;
+}
+
+bool X86InstructionSelector::selectInsert(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ assert((I.getOpcode() == TargetOpcode::G_INSERT) && "unexpected instruction");
+
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register SrcReg = I.getOperand(1).getReg();
+ const Register InsertReg = I.getOperand(2).getReg();
+ int64_t Index = I.getOperand(3).getImm();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT InsertRegTy = MRI.getType(InsertReg);
+
+  // For now, handle only vector types.
+ if (!DstTy.isVector())
+ return false;
+
+ if (Index % InsertRegTy.getSizeInBits() != 0)
+    return false; // Not a subvector insert.
+
+ if (Index == 0 && MRI.getVRegDef(SrcReg)->isImplicitDef()) {
+ // Replace by subreg copy.
+ if (!emitInsertSubreg(DstReg, InsertReg, I, MRI, MF))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ bool HasAVX = STI.hasAVX();
+ bool HasAVX512 = STI.hasAVX512();
+ bool HasVLX = STI.hasVLX();
+
+ if (DstTy.getSizeInBits() == 256 && InsertRegTy.getSizeInBits() == 128) {
+ if (HasVLX)
+ I.setDesc(TII.get(X86::VINSERTF32x4Z256rr));
+ else if (HasAVX)
+ I.setDesc(TII.get(X86::VINSERTF128rr));
+ else
+ return false;
+ } else if (DstTy.getSizeInBits() == 512 && HasAVX512) {
+ if (InsertRegTy.getSizeInBits() == 128)
+ I.setDesc(TII.get(X86::VINSERTF32x4Zrr));
+ else if (InsertRegTy.getSizeInBits() == 256)
+ I.setDesc(TII.get(X86::VINSERTF64x4Zrr));
+ else
+ return false;
+ } else
+ return false;
+
+ // Convert to X86 VINSERT immediate.
+ Index = Index / InsertRegTy.getSizeInBits();
+
+ I.getOperand(3).setImm(Index);
+
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
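+
+// Illustrative example (hypothetical registers) for the G_INSERT selection
+// above, inserting a 128-bit vector into the upper half of a 256-bit vector
+// on AVX without VLX:
+//   %dst:vecr(<8 x s32>) = G_INSERT %src:vecr(<8 x s32>), %ins:vecr(<4 x s32>), 128
+// becomes
+//   %dst:vr256 = VINSERTF128rr %src:vr256, %ins:vr128, 1
+// An insert at index 0 into an implicit-def source is emitted as a sub_xmm
+// subregister COPY instead.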
+
+bool X86InstructionSelector::selectUnmergeValues(
+ MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) {
+ assert((I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES) &&
+ "unexpected instruction");
+
+ // Split to extracts.
+ unsigned NumDefs = I.getNumOperands() - 1;
+ Register SrcReg = I.getOperand(NumDefs).getReg();
+ unsigned DefSize = MRI.getType(I.getOperand(0).getReg()).getSizeInBits();
+
+ for (unsigned Idx = 0; Idx < NumDefs; ++Idx) {
+ MachineInstr &ExtrInst =
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::G_EXTRACT), I.getOperand(Idx).getReg())
+ .addReg(SrcReg)
+ .addImm(Idx * DefSize);
+
+ if (!select(ExtrInst))
+ return false;
+ }
+
+ I.eraseFromParent();
+ return true;
+}
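+
+// Illustrative example (hypothetical registers): unmerging a 256-bit vector
+// into two 128-bit halves,
+//   %lo:vecr(<4 x s32>), %hi:vecr(<4 x s32>) = G_UNMERGE_VALUES %src:vecr(<8 x s32>)
+// is rewritten into two G_EXTRACTs, which are then selected recursively:
+//   %lo:vecr(<4 x s32>) = G_EXTRACT %src:vecr(<8 x s32>), 0
+//   %hi:vecr(<4 x s32>) = G_EXTRACT %src:vecr(<8 x s32>), 128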
+
+bool X86InstructionSelector::selectMergeValues(
+ MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) {
+ assert((I.getOpcode() == TargetOpcode::G_MERGE_VALUES ||
+ I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS) &&
+ "unexpected instruction");
+
+ // Split to inserts.
+ Register DstReg = I.getOperand(0).getReg();
+ Register SrcReg0 = I.getOperand(1).getReg();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg0);
+ unsigned SrcSize = SrcTy.getSizeInBits();
+
+ const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
+
+ // For the first src use insertSubReg.
+ Register DefReg = MRI.createGenericVirtualRegister(DstTy);
+ MRI.setRegBank(DefReg, RegBank);
+ if (!emitInsertSubreg(DefReg, I.getOperand(1).getReg(), I, MRI, MF))
+ return false;
+
+ for (unsigned Idx = 2; Idx < I.getNumOperands(); ++Idx) {
+ Register Tmp = MRI.createGenericVirtualRegister(DstTy);
+ MRI.setRegBank(Tmp, RegBank);
+
+ MachineInstr &InsertInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::G_INSERT), Tmp)
+ .addReg(DefReg)
+ .addReg(I.getOperand(Idx).getReg())
+ .addImm((Idx - 1) * SrcSize);
+
+ DefReg = Tmp;
+
+ if (!select(InsertInst))
+ return false;
+ }
+
+ MachineInstr &CopyInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::COPY), DstReg)
+ .addReg(DefReg);
+
+ if (!select(CopyInst))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+}
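+
+// Illustrative example (hypothetical registers): concatenating two 128-bit
+// vectors into a 256-bit vector,
+//   %dst:vecr(<8 x s32>) = G_CONCAT_VECTORS %a:vecr(<4 x s32>), %b:vecr(<4 x s32>)
+// is rewritten into a subregister insert of %a into a fresh 256-bit register,
+// a G_INSERT of %b at bit offset 128, and a final COPY into %dst, all of which
+// are then selected recursively.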
+
+bool X86InstructionSelector::selectCondBranch(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ assert((I.getOpcode() == TargetOpcode::G_BRCOND) && "unexpected instruction");
+
+ const Register CondReg = I.getOperand(0).getReg();
+ MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
+
+ MachineInstr &TestInst =
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::TEST8ri))
+ .addReg(CondReg)
+ .addImm(1);
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::JCC_1))
+ .addMBB(DestMBB).addImm(X86::COND_NE);
+
+ constrainSelectedInstRegOperands(TestInst, TII, TRI, RBI);
+
+ I.eraseFromParent();
+ return true;
+}
+
+bool X86InstructionSelector::materializeFP(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ assert((I.getOpcode() == TargetOpcode::G_FCONSTANT) &&
+ "unexpected instruction");
+
+ // Can't handle alternate code models yet.
+ CodeModel::Model CM = TM.getCodeModel();
+ if (CM != CodeModel::Small && CM != CodeModel::Large)
+ return false;
+
+ const Register DstReg = I.getOperand(0).getReg();
+ const LLT DstTy = MRI.getType(DstReg);
+ const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
+ Align Alignment = Align(DstTy.getSizeInBytes());
+ const DebugLoc &DbgLoc = I.getDebugLoc();
+
+ unsigned Opc =
+ getLoadStoreOp(DstTy, RegBank, TargetOpcode::G_LOAD, Alignment);
+
+ // Create the load from the constant pool.
+ const ConstantFP *CFP = I.getOperand(1).getFPImm();
+ unsigned CPI = MF.getConstantPool()->getConstantPoolIndex(CFP, Alignment);
+ MachineInstr *LoadInst = nullptr;
+ unsigned char OpFlag = STI.classifyLocalReference(nullptr);
+
+ if (CM == CodeModel::Large && STI.is64Bit()) {
+ // Under X86-64 non-small code model, GV (and friends) are 64-bits, so
+ // they cannot be folded into immediate fields.
+
+ Register AddrReg = MRI.createVirtualRegister(&X86::GR64RegClass);
+ BuildMI(*I.getParent(), I, DbgLoc, TII.get(X86::MOV64ri), AddrReg)
+ .addConstantPoolIndex(CPI, 0, OpFlag);
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad,
+ MF.getDataLayout().getPointerSize(), Alignment);
+
+ LoadInst =
+ addDirectMem(BuildMI(*I.getParent(), I, DbgLoc, TII.get(Opc), DstReg),
+ AddrReg)
+ .addMemOperand(MMO);
+
+ } else if (CM == CodeModel::Small || !STI.is64Bit()) {
+ // Handle the case when globals fit in our immediate field.
+ // This is true for X86-32 always and X86-64 when in -mcmodel=small mode.
+
+ // x86-32 PIC requires a PIC base register for constant pools.
+ unsigned PICBase = 0;
+ if (OpFlag == X86II::MO_PIC_BASE_OFFSET || OpFlag == X86II::MO_GOTOFF) {
+ // PICBase can be allocated by TII.getGlobalBaseReg(&MF).
+      // In DAGISel, the code that initializes it is generated by the CGBR pass.
+      return false; // TODO: support this mode.
+ } else if (STI.is64Bit() && TM.getCodeModel() == CodeModel::Small)
+ PICBase = X86::RIP;
+
+ LoadInst = addConstantPoolReference(
+ BuildMI(*I.getParent(), I, DbgLoc, TII.get(Opc), DstReg), CPI, PICBase,
+ OpFlag);
+ } else
+ return false;
+
+ constrainSelectedInstRegOperands(*LoadInst, TII, TRI, RBI);
+ I.eraseFromParent();
+ return true;
+}
+
+bool X86InstructionSelector::selectImplicitDefOrPHI(
+ MachineInstr &I, MachineRegisterInfo &MRI) const {
+ assert((I.getOpcode() == TargetOpcode::G_IMPLICIT_DEF ||
+ I.getOpcode() == TargetOpcode::G_PHI) &&
+ "unexpected instruction");
+
+ Register DstReg = I.getOperand(0).getReg();
+
+ if (!MRI.getRegClassOrNull(DstReg)) {
+ const LLT DstTy = MRI.getType(DstReg);
+ const TargetRegisterClass *RC = getRegClass(DstTy, DstReg, MRI);
+
+ if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
+ return false;
+ }
+ }
+
+ if (I.getOpcode() == TargetOpcode::G_IMPLICIT_DEF)
+ I.setDesc(TII.get(X86::IMPLICIT_DEF));
+ else
+ I.setDesc(TII.get(X86::PHI));
+
+ return true;
+}
+
+bool X86InstructionSelector::selectDivRem(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ // The implementation of this function is taken from X86FastISel.
+ assert((I.getOpcode() == TargetOpcode::G_SDIV ||
+ I.getOpcode() == TargetOpcode::G_SREM ||
+ I.getOpcode() == TargetOpcode::G_UDIV ||
+ I.getOpcode() == TargetOpcode::G_UREM) &&
+ "unexpected instruction");
+
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register Op1Reg = I.getOperand(1).getReg();
+ const Register Op2Reg = I.getOperand(2).getReg();
+
+ const LLT RegTy = MRI.getType(DstReg);
+ assert(RegTy == MRI.getType(Op1Reg) && RegTy == MRI.getType(Op2Reg) &&
+ "Arguments and return value types must match");
+
+ const RegisterBank *RegRB = RBI.getRegBank(DstReg, MRI, TRI);
+ if (!RegRB || RegRB->getID() != X86::GPRRegBankID)
+ return false;
+
+ const static unsigned NumTypes = 4; // i8, i16, i32, i64
+ const static unsigned NumOps = 4; // SDiv, SRem, UDiv, URem
+ const static bool S = true; // IsSigned
+ const static bool U = false; // !IsSigned
+ const static unsigned Copy = TargetOpcode::COPY;
+ // For the X86 IDIV instruction, in most cases the dividend
+ // (numerator) must be in a specific register pair highreg:lowreg,
+ // producing the quotient in lowreg and the remainder in highreg.
+ // For most data types, to set up the instruction, the dividend is
+ // copied into lowreg, and lowreg is sign-extended into highreg. The
+ // exception is i8, where the dividend is defined as a single register rather
+ // than a register pair, and we therefore directly sign-extend the dividend
+ // into lowreg, instead of copying, and ignore the highreg.
+ const static struct DivRemEntry {
+ // The following portion depends only on the data type.
+ unsigned SizeInBits;
+ unsigned LowInReg; // low part of the register pair
+ unsigned HighInReg; // high part of the register pair
+ // The following portion depends on both the data type and the operation.
+ struct DivRemResult {
+ unsigned OpDivRem; // The specific DIV/IDIV opcode to use.
+ unsigned OpSignExtend; // Opcode for sign-extending lowreg into
+ // highreg, or copying a zero into highreg.
+ unsigned OpCopy; // Opcode for copying dividend into lowreg, or
+ // zero/sign-extending into lowreg for i8.
+ unsigned DivRemResultReg; // Register containing the desired result.
+ bool IsOpSigned; // Whether to use signed or unsigned form.
+ } ResultTable[NumOps];
+ } OpTable[NumTypes] = {
+ {8,
+ X86::AX,
+ 0,
+ {
+ {X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AL, S}, // SDiv
+ {X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AH, S}, // SRem
+ {X86::DIV8r, 0, X86::MOVZX16rr8, X86::AL, U}, // UDiv
+ {X86::DIV8r, 0, X86::MOVZX16rr8, X86::AH, U}, // URem
+ }}, // i8
+ {16,
+ X86::AX,
+ X86::DX,
+ {
+ {X86::IDIV16r, X86::CWD, Copy, X86::AX, S}, // SDiv
+ {X86::IDIV16r, X86::CWD, Copy, X86::DX, S}, // SRem
+ {X86::DIV16r, X86::MOV32r0, Copy, X86::AX, U}, // UDiv
+ {X86::DIV16r, X86::MOV32r0, Copy, X86::DX, U}, // URem
+ }}, // i16
+ {32,
+ X86::EAX,
+ X86::EDX,
+ {
+ {X86::IDIV32r, X86::CDQ, Copy, X86::EAX, S}, // SDiv
+ {X86::IDIV32r, X86::CDQ, Copy, X86::EDX, S}, // SRem
+ {X86::DIV32r, X86::MOV32r0, Copy, X86::EAX, U}, // UDiv
+ {X86::DIV32r, X86::MOV32r0, Copy, X86::EDX, U}, // URem
+ }}, // i32
+ {64,
+ X86::RAX,
+ X86::RDX,
+ {
+ {X86::IDIV64r, X86::CQO, Copy, X86::RAX, S}, // SDiv
+ {X86::IDIV64r, X86::CQO, Copy, X86::RDX, S}, // SRem
+ {X86::DIV64r, X86::MOV32r0, Copy, X86::RAX, U}, // UDiv
+ {X86::DIV64r, X86::MOV32r0, Copy, X86::RDX, U}, // URem
+ }}, // i64
+ };
+
+ auto OpEntryIt = llvm::find_if(OpTable, [RegTy](const DivRemEntry &El) {
+ return El.SizeInBits == RegTy.getSizeInBits();
+ });
+ if (OpEntryIt == std::end(OpTable))
+ return false;
+
+ unsigned OpIndex;
+ switch (I.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected div/rem opcode");
+ case TargetOpcode::G_SDIV:
+ OpIndex = 0;
+ break;
+ case TargetOpcode::G_SREM:
+ OpIndex = 1;
+ break;
+ case TargetOpcode::G_UDIV:
+ OpIndex = 2;
+ break;
+ case TargetOpcode::G_UREM:
+ OpIndex = 3;
+ break;
+ }
+
+ const DivRemEntry &TypeEntry = *OpEntryIt;
+ const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
+
+ const TargetRegisterClass *RegRC = getRegClass(RegTy, *RegRB);
+ if (!RBI.constrainGenericRegister(Op1Reg, *RegRC, MRI) ||
+ !RBI.constrainGenericRegister(Op2Reg, *RegRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *RegRC, MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
+ return false;
+ }
+
+ // Move op1 into low-order input register.
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpEntry.OpCopy),
+ TypeEntry.LowInReg)
+ .addReg(Op1Reg);
+ // Zero-extend or sign-extend into high-order input register.
+ if (OpEntry.OpSignExtend) {
+ if (OpEntry.IsOpSigned)
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(OpEntry.OpSignExtend));
+ else {
+ Register Zero32 = MRI.createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::MOV32r0),
+ Zero32);
+
+ // Copy the zero into the appropriate sub/super/identical physical
+ // register. Unfortunately the operations needed are not uniform enough
+ // to fit neatly into the table above.
+ if (RegTy.getSizeInBits() == 16) {
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Copy),
+ TypeEntry.HighInReg)
+ .addReg(Zero32, 0, X86::sub_16bit);
+ } else if (RegTy.getSizeInBits() == 32) {
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Copy),
+ TypeEntry.HighInReg)
+ .addReg(Zero32);
+ } else if (RegTy.getSizeInBits() == 64) {
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
+ .addImm(0)
+ .addReg(Zero32)
+ .addImm(X86::sub_32bit);
+ }
+ }
+ }
+ // Generate the DIV/IDIV instruction.
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpEntry.OpDivRem))
+ .addReg(Op2Reg);
+  // For i8 remainder, we can't reference AH directly, as we'll end
+  // up with bogus copies like %r9b = COPY %ah. Reference AX
+  // instead to prevent AH references in a REX-prefixed instruction.
+ //
+ // The current assumption of the fast register allocator is that isel
+ // won't generate explicit references to the GR8_NOREX registers. If
+ // the allocator and/or the backend get enhanced to be more robust in
+ // that regard, this can be, and should be, removed.
+  if ((I.getOpcode() == TargetOpcode::G_SREM ||
+       I.getOpcode() == TargetOpcode::G_UREM) &&
+ OpEntry.DivRemResultReg == X86::AH && STI.is64Bit()) {
+ Register SourceSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass);
+ Register ResultSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass);
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Copy), SourceSuperReg)
+ .addReg(X86::AX);
+
+ // Shift AX right by 8 bits instead of using AH.
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::SHR16ri),
+ ResultSuperReg)
+ .addReg(SourceSuperReg)
+ .addImm(8);
+
+ // Now reference the 8-bit subreg of the result.
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::SUBREG_TO_REG))
+ .addDef(DstReg)
+ .addImm(0)
+ .addReg(ResultSuperReg)
+ .addImm(X86::sub_8bit);
+ } else {
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY),
+ DstReg)
+ .addReg(OpEntry.DivRemResultReg);
+ }
+ I.eraseFromParent();
+ return true;
+}
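+
+// Illustrative example (hypothetical registers) of the lowering above for a
+// 32-bit signed remainder:
+//   %rem:gpr(s32) = G_SREM %a, %b
+// becomes roughly
+//   $eax = COPY %a
+//   CDQ                          ; sign-extend EAX into EDX
+//   IDIV32r %b                   ; quotient in EAX, remainder in EDX
+//   %rem:gr32 = COPY $edx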
+
+bool X86InstructionSelector::selectIntrinsicWSideEffects(
+ MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const {
+
+ assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS &&
+ "unexpected instruction");
+
+ if (I.getOperand(0).getIntrinsicID() != Intrinsic::trap)
+ return false;
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::TRAP));
+
+ I.eraseFromParent();
+ return true;
+}
+
+InstructionSelector *
+llvm::createX86InstructionSelector(const X86TargetMachine &TM,
+ X86Subtarget &Subtarget,
+ X86RegisterBankInfo &RBI) {
+ return new X86InstructionSelector(TM, Subtarget, RBI);
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp
new file mode 100644
index 000000000000..95655dd4723b
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -0,0 +1,848 @@
+//===- X86InterleavedAccess.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file contains the X86 implementation of the interleaved accesses
+/// optimization generating X86-specific instructions/intrinsics for
+/// interleaved access groups.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ISelLowering.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/MachineValueType.h"
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+
+using namespace llvm;
+
+namespace {
+
+/// This class holds necessary information to represent an interleaved
+/// access group and supports utilities to lower the group into
+/// X86-specific instructions/intrinsics.
+/// E.g. A group of interleaving access loads (Factor = 2; accessing every
+/// other element)
+/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
+/// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
+/// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7>
+class X86InterleavedAccessGroup {
+ /// Reference to the wide-load instruction of an interleaved access
+ /// group.
+ Instruction *const Inst;
+
+ /// Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
+ ArrayRef<ShuffleVectorInst *> Shuffles;
+
+ /// Reference to the starting index of each user-shuffle.
+ ArrayRef<unsigned> Indices;
+
+ /// Reference to the interleaving stride in terms of elements.
+ const unsigned Factor;
+
+ /// Reference to the underlying target.
+ const X86Subtarget &Subtarget;
+
+ const DataLayout &DL;
+
+ IRBuilder<> &Builder;
+
+ /// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
+ /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
+ void decompose(Instruction *Inst, unsigned NumSubVectors, FixedVectorType *T,
+ SmallVectorImpl<Instruction *> &DecomposedVectors);
+
+ /// Performs matrix transposition on a 4x4 matrix \p InputVectors and
+ /// returns the transposed-vectors in \p TransposedVectors.
+ /// E.g.
+ /// InputVectors:
+ /// In-V0 = p1, p2, p3, p4
+ /// In-V1 = q1, q2, q3, q4
+ /// In-V2 = r1, r2, r3, r4
+ /// In-V3 = s1, s2, s3, s4
+ /// OutputVectors:
+ /// Out-V0 = p1, q1, r1, s1
+ /// Out-V1 = p2, q2, r2, s2
+ /// Out-V2 = p3, q3, r3, s3
+  /// Out-V3 = p4, q4, r4, s4
+ void transpose_4x4(ArrayRef<Instruction *> InputVectors,
+ SmallVectorImpl<Value *> &TransposedMatrix);
+ void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
+ SmallVectorImpl<Value *> &TransposedMatrix,
+ unsigned NumSubVecElems);
+ void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
+ SmallVectorImpl<Value *> &TransposedMatrix);
+ void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
+ SmallVectorImpl<Value *> &TransposedMatrix,
+ unsigned NumSubVecElems);
+ void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
+ SmallVectorImpl<Value *> &TransposedMatrix,
+ unsigned NumSubVecElems);
+
+public:
+ /// In order to form an interleaved access group X86InterleavedAccessGroup
+ /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
+ /// \p Shuffs, reference to the first indices of each interleaved-vector
+ /// \p 'Ind' and the interleaving stride factor \p F. In order to generate
+ /// X86-specific instructions/intrinsics it also requires the underlying
+ /// target information \p STarget.
+ explicit X86InterleavedAccessGroup(Instruction *I,
+ ArrayRef<ShuffleVectorInst *> Shuffs,
+ ArrayRef<unsigned> Ind, const unsigned F,
+ const X86Subtarget &STarget,
+ IRBuilder<> &B)
+ : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
+ DL(Inst->getModule()->getDataLayout()), Builder(B) {}
+
+ /// Returns true if this interleaved access group can be lowered into
+ /// x86-specific instructions/intrinsics, false otherwise.
+ bool isSupported() const;
+
+ /// Lowers this interleaved access group into X86-specific
+ /// instructions/intrinsics.
+ bool lowerIntoOptimizedSequence();
+};
+
+} // end anonymous namespace
+
+bool X86InterleavedAccessGroup::isSupported() const {
+ VectorType *ShuffleVecTy = Shuffles[0]->getType();
+ Type *ShuffleEltTy = ShuffleVecTy->getElementType();
+ unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
+ unsigned WideInstSize;
+
+ // Currently, lowering is supported for the following vectors:
+ // Stride 4:
+ // 1. Store and load of 4-element vectors of 64 bits on AVX.
+ // 2. Store of 16/32-element vectors of 8 bits on AVX.
+ // Stride 3:
+ // 1. Load of 16/32-element vectors of 8 bits on AVX.
+ if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
+ return false;
+
+ if (isa<LoadInst>(Inst)) {
+ WideInstSize = DL.getTypeSizeInBits(Inst->getType());
+ if (cast<LoadInst>(Inst)->getPointerAddressSpace())
+ return false;
+ } else
+ WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());
+
+  // Check that the shuffle matches one of the supported element-size /
+  // total-width (WideInstSize) / stride combinations listed above.
+ if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
+ return true;
+
+ if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
+ (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
+ WideInstSize == 2048))
+ return true;
+
+ if (ShuffleElemSize == 8 && Factor == 3 &&
+ (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
+ return true;
+
+ return false;
+}
+
+void X86InterleavedAccessGroup::decompose(
+ Instruction *VecInst, unsigned NumSubVectors, FixedVectorType *SubVecTy,
+ SmallVectorImpl<Instruction *> &DecomposedVectors) {
+ assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
+ "Expected Load or Shuffle");
+
+ Type *VecWidth = VecInst->getType();
+ (void)VecWidth;
+ assert(VecWidth->isVectorTy() &&
+ DL.getTypeSizeInBits(VecWidth) >=
+ DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
+ "Invalid Inst-size!!!");
+
+ if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
+ Value *Op0 = SVI->getOperand(0);
+ Value *Op1 = SVI->getOperand(1);
+
+ // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
+ for (unsigned i = 0; i < NumSubVectors; ++i)
+ DecomposedVectors.push_back(
+ cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
+ Op0, Op1,
+ createSequentialMask(Indices[i], SubVecTy->getNumElements(),
+ 0))));
+ return;
+ }
+
+ // Decompose the load instruction.
+ LoadInst *LI = cast<LoadInst>(VecInst);
+ Type *VecBaseTy, *VecBasePtrTy;
+ Value *VecBasePtr;
+ unsigned int NumLoads = NumSubVectors;
+  // In the case of stride 3 with a vector of 32 elements, load the information
+ // in the following way:
+ // [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
+ unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
+ if (VecLength == 768 || VecLength == 1536) {
+ VecBaseTy = FixedVectorType::get(Type::getInt8Ty(LI->getContext()), 16);
+ VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
+ VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
+ NumLoads = NumSubVectors * (VecLength / 384);
+ } else {
+ VecBaseTy = SubVecTy;
+ VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
+ VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
+ }
+ // Generate N loads of T type.
+ assert(VecBaseTy->getPrimitiveSizeInBits().isKnownMultipleOf(8) &&
+ "VecBaseTy's size must be a multiple of 8");
+ const Align FirstAlignment = LI->getAlign();
+ const Align SubsequentAlignment = commonAlignment(
+ FirstAlignment, VecBaseTy->getPrimitiveSizeInBits().getFixedSize() / 8);
+ Align Alignment = FirstAlignment;
+ for (unsigned i = 0; i < NumLoads; i++) {
+ // TODO: Support inbounds GEP.
+ Value *NewBasePtr =
+ Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
+ Instruction *NewLoad =
+ Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, Alignment);
+ DecomposedVectors.push_back(NewLoad);
+ Alignment = SubsequentAlignment;
+ }
+}
+
+// Changes the scale of the vector type by halving the number of elements and
+// doubling the scalar size.
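+// For example (illustrative), scaleVectorType(MVT::v32i8) yields MVT::v16i16.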
+static MVT scaleVectorType(MVT VT) {
+ unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;
+ return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),
+ VT.getVectorNumElements() / 2);
+}
+
+static constexpr int Concat[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+
+// genShuffleBland - Creates a shuffle mask from two vectors. This function
+// only works on instructions whose lanes fit inside 256-bit registers. Based
+// on the mask 'Mask', it creates a new mask 'Out' by adding an offset to each
+// element. The offset amount is given by the two integers 'LowOffset' and
+// 'HighOffset', where 'LowOffset' applies to the first vector and 'HighOffset'
+// applies to the second vector.
+// |a0....a5,b0....b4,c0....c4|a16..a21,b16..b20,c16..c20|
+// |c5...c10,a5....a9,b5....b9|c21..c26,a22..a26,b21..b25|
+// |b10..b15,c11..c15,a10..a15|b26..b31,c27..c31,a27..a31|
+// For the sequence to work as a mirror of the load, we must consider the
+// element order as above.
+// This function combines two kinds of shuffles: the first is a vpshuf-style
+// shuffle and the second is a "blend"-style shuffle. By computing the shuffle
+// on a sequence of 16 elements (one lane) and adding the correct offset, we
+// create a vpshuf + blend sequence between the two input shuffles.
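+// For example (hypothetical values), with VT = v32i8 (NumOfElm = 32),
+// Mask = {0, 1, 2, 3}, LowOffset = 0 and HighOffset = 16, the resulting
+// 'Out' mask is {0, 1, 2, 3, 48, 49, 50, 51}.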
+static void genShuffleBland(MVT VT, ArrayRef<int> Mask,
+ SmallVectorImpl<int> &Out, int LowOffset,
+ int HighOffset) {
+ assert(VT.getSizeInBits() >= 256 &&
+ "This function doesn't accept width smaller then 256");
+ unsigned NumOfElm = VT.getVectorNumElements();
+ for (unsigned i = 0; i < Mask.size(); i++)
+ Out.push_back(Mask[i] + LowOffset);
+ for (unsigned i = 0; i < Mask.size(); i++)
+ Out.push_back(Mask[i] + HighOffset + NumOfElm);
+}
+
+// reorderSubVector returns the data to its original order; it is effectively
+// the inverse of concatSubVector.
+
+// For VecElems = 16
+// Invec[0] - |0| TransposedMatrix[0] - |0|
+// Invec[1] - |1| => TransposedMatrix[1] - |1|
+// Invec[2] - |2| TransposedMatrix[2] - |2|
+
+// For VecElems = 32
+// Invec[0] - |0|3| TransposedMatrix[0] - |0|1|
+// Invec[1] - |1|4| => TransposedMatrix[1] - |2|3|
+// Invec[2] - |2|5| TransposedMatrix[2] - |4|5|
+
+// For VecElems = 64
+// Invec[0] - |0|3|6|9 | TransposedMatrix[0] - |0|1|2 |3 |
+// Invec[1] - |1|4|7|10| => TransposedMatrix[1] - |4|5|6 |7 |
+// Invec[2] - |2|5|8|11| TransposedMatrix[2] - |8|9|10|11|
+
+static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
+ ArrayRef<Value *> Vec, ArrayRef<int> VPShuf,
+ unsigned VecElems, unsigned Stride,
+ IRBuilder<> &Builder) {
+
+ if (VecElems == 16) {
+ for (unsigned i = 0; i < Stride; i++)
+ TransposedMatrix[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);
+ return;
+ }
+
+ SmallVector<int, 32> OptimizeShuf;
+ Value *Temp[8];
+
+ for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
+ genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
+ (i + 1) / Stride * 16);
+ Temp[i / 2] = Builder.CreateShuffleVector(
+ Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
+ OptimizeShuf.clear();
+ }
+
+ if (VecElems == 32) {
+ std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
+ return;
+ } else
+ for (unsigned i = 0; i < Stride; i++)
+ TransposedMatrix[i] =
+ Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
+}
+
+void X86InterleavedAccessGroup::interleave8bitStride4VF8(
+ ArrayRef<Instruction *> Matrix,
+ SmallVectorImpl<Value *> &TransposedMatrix) {
+ // Assuming we start from the following vectors:
+ // Matrix[0]= c0 c1 c2 c3 c4 ... c7
+ // Matrix[1]= m0 m1 m2 m3 m4 ... m7
+ // Matrix[2]= y0 y1 y2 y3 y4 ... y7
+ // Matrix[3]= k0 k1 k2 k3 k4 ... k7
+
+ MVT VT = MVT::v8i16;
+ TransposedMatrix.resize(2);
+ SmallVector<int, 16> MaskLow;
+ SmallVector<int, 32> MaskLowTemp1, MaskLowWord;
+ SmallVector<int, 32> MaskHighTemp1, MaskHighWord;
+
+ for (unsigned i = 0; i < 8; ++i) {
+ MaskLow.push_back(i);
+ MaskLow.push_back(i + 8);
+ }
+
+ createUnpackShuffleMask(VT, MaskLowTemp1, true, false);
+ createUnpackShuffleMask(VT, MaskHighTemp1, false, false);
+ narrowShuffleMaskElts(2, MaskHighTemp1, MaskHighWord);
+ narrowShuffleMaskElts(2, MaskLowTemp1, MaskLowWord);
+ // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
+ // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
+ Value *IntrVec1Low =
+ Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
+ Value *IntrVec2Low =
+ Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
+
+ // TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
+ // TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7
+
+ TransposedMatrix[0] =
+ Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
+ TransposedMatrix[1] =
+ Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
+}
+
+void X86InterleavedAccessGroup::interleave8bitStride4(
+ ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
+ unsigned NumOfElm) {
+ // Example: Assuming we start from the following vectors:
+ // Matrix[0]= c0 c1 c2 c3 c4 ... c31
+ // Matrix[1]= m0 m1 m2 m3 m4 ... m31
+ // Matrix[2]= y0 y1 y2 y3 y4 ... y31
+ // Matrix[3]= k0 k1 k2 k3 k4 ... k31
+
+ MVT VT = MVT::getVectorVT(MVT::i8, NumOfElm);
+ MVT HalfVT = scaleVectorType(VT);
+
+ TransposedMatrix.resize(4);
+ SmallVector<int, 32> MaskHigh;
+ SmallVector<int, 32> MaskLow;
+ SmallVector<int, 32> LowHighMask[2];
+ SmallVector<int, 32> MaskHighTemp;
+ SmallVector<int, 32> MaskLowTemp;
+
+  // MaskLow and MaskHigh are built to match the vpunpcklbw and vpunpckhbw X86
+  // shuffle patterns.
+
+ createUnpackShuffleMask(VT, MaskLow, true, false);
+ createUnpackShuffleMask(VT, MaskHigh, false, false);
+
+  // MaskLowTemp and MaskHighTemp are built to match the vpunpckldw and
+  // vpunpckhdw X86 shuffle patterns.
+
+ createUnpackShuffleMask(HalfVT, MaskLowTemp, true, false);
+ createUnpackShuffleMask(HalfVT, MaskHighTemp, false, false);
+ narrowShuffleMaskElts(2, MaskLowTemp, LowHighMask[0]);
+ narrowShuffleMaskElts(2, MaskHighTemp, LowHighMask[1]);
+
+ // IntrVec1Low = c0 m0 c1 m1 ... c7 m7 | c16 m16 c17 m17 ... c23 m23
+ // IntrVec1High = c8 m8 c9 m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31
+ // IntrVec2Low = y0 k0 y1 k1 ... y7 k7 | y16 k16 y17 k17 ... y23 k23
+ // IntrVec2High = y8 k8 y9 k9 ... y15 k15 | y24 k24 y25 k25 ... y31 k31
+ Value *IntrVec[4];
+
+ IntrVec[0] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
+ IntrVec[1] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
+ IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
+ IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);
+
+ // cmyk4 cmyk5 cmyk6 cmyk7 | cmyk20 cmyk21 cmyk22 cmyk23
+ // cmyk12 cmyk13 cmyk14 cmyk15 | cmyk28 cmyk29 cmyk30 cmyk31
+ // cmyk0 cmyk1 cmyk2 cmyk3 | cmyk16 cmyk17 cmyk18 cmyk19
+ // cmyk8 cmyk9 cmyk10 cmyk11 | cmyk24 cmyk25 cmyk26 cmyk27
+
+ Value *VecOut[4];
+ for (int i = 0; i < 4; i++)
+ VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
+ LowHighMask[i % 2]);
+
+ // cmyk0 cmyk1 cmyk2 cmyk3 | cmyk4 cmyk5 cmyk6 cmyk7
+ // cmyk8 cmyk9 cmyk10 cmyk11 | cmyk12 cmyk13 cmyk14 cmyk15
+ // cmyk16 cmyk17 cmyk18 cmyk19 | cmyk20 cmyk21 cmyk22 cmyk23
+ // cmyk24 cmyk25 cmyk26 cmyk27 | cmyk28 cmyk29 cmyk30 cmyk31
+
+ if (VT == MVT::v16i8) {
+ std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
+ return;
+ }
+
+ reorderSubVector(VT, TransposedMatrix, VecOut, makeArrayRef(Concat, 16),
+ NumOfElm, 4, Builder);
+}
+
+// createShuffleStride returns a shuffle mask of size N.
+// The shuffle pattern is as follows:
+// {0, Stride%(VF/Lane), (2*Stride%(VF/Lane))...(VF*Stride/Lane)%(VF/Lane),
+// (VF/ Lane) ,(VF / Lane)+Stride%(VF/Lane),...,
+// (VF / Lane)+(VF*Stride/Lane)%(VF/Lane)}
+// Where Lane is the # of lanes in a register:
+// VectorSize = 128 => Lane = 1
+// VectorSize = 256 => Lane = 2
+// For example, the shuffle pattern for VF = 16 and register size 256 (lanes = 2) is
+// {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}
+static void createShuffleStride(MVT VT, int Stride,
+ SmallVectorImpl<int> &Mask) {
+ int VectorSize = VT.getSizeInBits();
+ int VF = VT.getVectorNumElements();
+ int LaneCount = std::max(VectorSize / 128, 1);
+ for (int Lane = 0; Lane < LaneCount; Lane++)
+ for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
+ Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
+}
+
+// setGroupSize sets 'SizeInfo' to the size (number of elements) of each group
+// inside a shuffle mask. A mask contains exactly 3 groups, where
+// each group is a monotonically increasing sequence with stride 3.
+// For example, shuffle mask {0,3,6,1,4,7,2,5} => {3,3,2}.
+static void setGroupSize(MVT VT, SmallVectorImpl<int> &SizeInfo) {
+ int VectorSize = VT.getSizeInBits();
+ int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
+ for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
+ int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
+ SizeInfo.push_back(GroupSize);
+ FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
+ }
+}
+
+// DecodePALIGNRMask returns the shuffle mask of the vpalignr instruction.
+// vpalignr operates per lane, where Lane is the # of lanes in a register:
+// VectorWide = 128 => Lane = 1
+// VectorWide = 256 => Lane = 2
+// For Lane = 1 shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
+// For Lane = 2 shuffle pattern is:
+// {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
+// The 'Imm' variable sets the offset amount. The result of the function is
+// stored in the 'ShuffleMask' vector, built as described above.
+// 'AlignDirection' is a boolean that indicates the direction of the alignment
+// (false - align to the "right" side, true - align to the "left" side).
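+// For example (illustrative), for VT = v16i8 (a single 128-bit lane), Imm = 5,
+// AlignDirection = true and Unary = false, ShuffleMask becomes
+// {5, 6, ..., 15, 16, 17, 18, 19, 20}, where indices >= 16 select elements
+// from the second source.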
+static void DecodePALIGNRMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask,
+ bool AlignDirection = true, bool Unary = false) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
+ unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);
+
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0; i != NumLaneElts; ++i) {
+ unsigned Base = i + Offset;
+ // if i+offset is out of this lane then we actually need the other source
+ // If Unary the other source is the first source.
+ if (Base >= NumLaneElts)
+ Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
+ ShuffleMask.push_back(Base + l);
+ }
+ }
+}
+
+// concatSubVector - The function rebuilds the data into the expected order.
+// An assumption about the shape of the matrix is made so that the
+// deinterleave works with lane-based instructions like 'vpalignr' or 'vpshufb'.
+// This function ensures that the data is laid out correctly for those lane
+// instructions. Each lane inside the vector is 128 bits wide.
+//
+// The 'InVec' argument contains the data in increasing order: InVec[0] holds
+// the first 128 bits of data. The number of lanes inside a vector depends on
+// 'VecElems'; in general, the formula is VecElems * ElementSize / 128. The
+// size of the 'InVec' array is determined by 'VecElems'.
+
+// For VecElems = 16
+// Invec[0] - |0| Vec[0] - |0|
+// Invec[1] - |1| => Vec[1] - |1|
+// Invec[2] - |2| Vec[2] - |2|
+
+// For VecElems = 32
+// Invec[0] - |0|1| Vec[0] - |0|3|
+// Invec[1] - |2|3| => Vec[1] - |1|4|
+// Invec[2] - |4|5| Vec[2] - |2|5|
+
+// For VecElems = 64
+// Invec[0] - |0|1|2 |3 | Vec[0] - |0|3|6|9 |
+// Invec[1] - |4|5|6 |7 | => Vec[1] - |1|4|7|10|
+// Invec[2] - |8|9|10|11| Vec[2] - |2|5|8|11|
+
+static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
+ unsigned VecElems, IRBuilder<> &Builder) {
+ if (VecElems == 16) {
+ for (int i = 0; i < 3; i++)
+ Vec[i] = InVec[i];
+ return;
+ }
+
+ for (unsigned j = 0; j < VecElems / 32; j++)
+ for (int i = 0; i < 3; i++)
+ Vec[i + j * 3] = Builder.CreateShuffleVector(
+ InVec[j * 6 + i], InVec[j * 6 + i + 3], makeArrayRef(Concat, 32));
+
+ if (VecElems == 32)
+ return;
+
+ for (int i = 0; i < 3; i++)
+ Vec[i] = Builder.CreateShuffleVector(Vec[i], Vec[i + 3], Concat);
+}
+
+void X86InterleavedAccessGroup::deinterleave8bitStride3(
+ ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
+ unsigned VecElems) {
+ // Example: Assuming we start from the following vectors:
+ // Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
+ // Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
+ // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7
+
+ TransposedMatrix.resize(3);
+ SmallVector<int, 32> VPShuf;
+ SmallVector<int, 32> VPAlign[2];
+ SmallVector<int, 32> VPAlign2;
+ SmallVector<int, 32> VPAlign3;
+ SmallVector<int, 3> GroupSize;
+ Value *Vec[6], *TempVector[3];
+
+ MVT VT = MVT::getVT(Shuffles[0]->getType());
+
+ createShuffleStride(VT, 3, VPShuf);
+ setGroupSize(VT, GroupSize);
+
+ for (int i = 0; i < 2; i++)
+ DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);
+
+ DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
+ DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);
+
+ concatSubVector(Vec, InVec, VecElems, Builder);
+ // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
+ // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
+ // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7
+
+ for (int i = 0; i < 3; i++)
+ Vec[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);
+
+ // TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
+ // TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
+ // TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7
+
+ for (int i = 0; i < 3; i++)
+ TempVector[i] =
+ Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);
+
+ // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
+ // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
+ // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7
+
+ for (int i = 0; i < 3; i++)
+ Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
+ VPAlign[1]);
+
+ // TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
+ // TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
+ // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7
+
+ Value *TempVec = Builder.CreateShuffleVector(Vec[1], VPAlign3);
+ TransposedMatrix[0] = Builder.CreateShuffleVector(Vec[0], VPAlign2);
+ TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
+ TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
+}
+
+// group2Shuffle reorders the shuffle stride back into contiguous order.
+// For example For VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13} =>
+// MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
+static void group2Shuffle(MVT VT, SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<int> &Output) {
+ int IndexGroup[3] = {0, 0, 0};
+ int Index = 0;
+ int VectorWidth = VT.getSizeInBits();
+ int VF = VT.getVectorNumElements();
+ // Find the index of the different groups.
+ int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
+ for (int i = 0; i < 3; i++) {
+ IndexGroup[(Index * 3) % (VF / Lane)] = Index;
+ Index += Mask[i];
+ }
+ // According to the index compute the convert mask.
+ for (int i = 0; i < VF / Lane; i++) {
+ Output.push_back(IndexGroup[i % 3]);
+ IndexGroup[i % 3]++;
+ }
+}
+
+void X86InterleavedAccessGroup::interleave8bitStride3(
+ ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
+ unsigned VecElems) {
+ // Example: Assuming we start from the following vectors:
+ // Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
+ // Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
+  // Matrix[2]= c0 c1 c2 c3 c4 c5 c6 c7
+
+ TransposedMatrix.resize(3);
+ SmallVector<int, 3> GroupSize;
+ SmallVector<int, 32> VPShuf;
+ SmallVector<int, 32> VPAlign[3];
+ SmallVector<int, 32> VPAlign2;
+ SmallVector<int, 32> VPAlign3;
+
+ Value *Vec[3], *TempVector[3];
+ MVT VT = MVT::getVectorVT(MVT::i8, VecElems);
+
+ setGroupSize(VT, GroupSize);
+
+ for (int i = 0; i < 3; i++)
+ DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);
+
+ DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true);
+ DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true);
+
+ // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
+ // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
+ // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7
+
+ Vec[0] = Builder.CreateShuffleVector(InVec[0], VPAlign2);
+ Vec[1] = Builder.CreateShuffleVector(InVec[1], VPAlign3);
+ Vec[2] = InVec[2];
+
+ // Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
+ // Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5
+ // Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7
+
+ for (int i = 0; i < 3; i++)
+ TempVector[i] =
+ Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);
+
+ // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
+ // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
+ // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7
+
+ for (int i = 0; i < 3; i++)
+ Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
+ VPAlign[2]);
+
+ // TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
+ // TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
+ // TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7
+
+ unsigned NumOfElm = VT.getVectorNumElements();
+ group2Shuffle(VT, GroupSize, VPShuf);
+ reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder);
+}
+
+void X86InterleavedAccessGroup::transpose_4x4(
+ ArrayRef<Instruction *> Matrix,
+ SmallVectorImpl<Value *> &TransposedMatrix) {
+ assert(Matrix.size() == 4 && "Invalid matrix size");
+ TransposedMatrix.resize(4);
+
+ // dst = src1[0,1],src2[0,1]
+ static constexpr int IntMask1[] = {0, 1, 4, 5};
+ ArrayRef<int> Mask = makeArrayRef(IntMask1, 4);
+ Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
+ Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
+
+ // dst = src1[2,3],src2[2,3]
+ static constexpr int IntMask2[] = {2, 3, 6, 7};
+ Mask = makeArrayRef(IntMask2, 4);
+ Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
+ Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
+
+ // dst = src1[0],src2[0],src1[2],src2[2]
+ static constexpr int IntMask3[] = {0, 4, 2, 6};
+ Mask = makeArrayRef(IntMask3, 4);
+ TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
+ TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
+
+ // dst = src1[1],src2[1],src1[3],src2[3]
+ static constexpr int IntMask4[] = {1, 5, 3, 7};
+ Mask = makeArrayRef(IntMask4, 4);
+ TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
+ TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
+}
+
+// Lowers this interleaved access group into X86-specific
+// instructions/intrinsics.
+bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
+ SmallVector<Instruction *, 4> DecomposedVectors;
+ SmallVector<Value *, 4> TransposedVectors;
+ auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->getType());
+
+ if (isa<LoadInst>(Inst)) {
+ // Try to generate target-sized register(/instruction).
+ decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
+
+ auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType());
+ unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor;
+ // Perform matrix-transposition in order to compute interleaved
+ // results by generating some sort of (optimized) target-specific
+ // instructions.
+
+ switch (NumSubVecElems) {
+ default:
+ return false;
+ case 4:
+ transpose_4x4(DecomposedVectors, TransposedVectors);
+ break;
+ case 8:
+ case 16:
+ case 32:
+ case 64:
+ deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
+ NumSubVecElems);
+ break;
+ }
+
+ // Now replace the unoptimized-interleaved-vectors with the
+ // transposed-interleaved vectors.
+ for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
+ Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);
+
+ return true;
+ }
+
+ Type *ShuffleEltTy = ShuffleTy->getElementType();
+ unsigned NumSubVecElems = ShuffleTy->getNumElements() / Factor;
+
+ // Lower the interleaved stores:
+ // 1. Decompose the interleaved wide shuffle into individual shuffle
+ // vectors.
+ decompose(Shuffles[0], Factor,
+ FixedVectorType::get(ShuffleEltTy, NumSubVecElems),
+ DecomposedVectors);
+
+ // 2. Transpose the interleaved-vectors into vectors of contiguous
+ // elements.
+ switch (NumSubVecElems) {
+ case 4:
+ transpose_4x4(DecomposedVectors, TransposedVectors);
+ break;
+ case 8:
+ interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
+ break;
+ case 16:
+ case 32:
+ case 64:
+ if (Factor == 4)
+ interleave8bitStride4(DecomposedVectors, TransposedVectors,
+ NumSubVecElems);
+ if (Factor == 3)
+ interleave8bitStride3(DecomposedVectors, TransposedVectors,
+ NumSubVecElems);
+ break;
+ default:
+ return false;
+ }
+
+ // 3. Concatenate the contiguous-vectors back into a wide vector.
+ Value *WideVec = concatenateVectors(Builder, TransposedVectors);
+
+ // 4. Generate a store instruction for wide-vec.
+ StoreInst *SI = cast<StoreInst>(Inst);
+ Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(), SI->getAlign());
+
+ return true;
+}
+
+// Lower interleaved load(s) into target-specific instructions/
+// intrinsics. Lowering sequence varies depending on the vector-types, factor,
+// number of shuffles and ISA.
+// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
+bool X86TargetLowering::lowerInterleavedLoad(
+ LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ ArrayRef<unsigned> Indices, unsigned Factor) const {
+ assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+ "Invalid interleave factor");
+ assert(!Shuffles.empty() && "Empty shufflevector input");
+ assert(Shuffles.size() == Indices.size() &&
+ "Unmatched number of shufflevectors and indices");
+
+ // Create an interleaved access group.
+ IRBuilder<> Builder(LI);
+ X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
+ Builder);
+
+ return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
+}
+
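+// Illustrative IR (hypothetical value names) for a stride-4 byte interleaved
+// store handled by the hook below. The wide shufflevector interleaves four
+// 16-element groups held in two concatenated operands, and the first Factor
+// mask elements (0, 16, 32, 48) become the starting Indices of the group:
+//   %interleaved = shufflevector <32 x i8> %v01, <32 x i8> %v23,
+//                  <64 x i32> <i32 0, i32 16, i32 32, i32 48,
+//                              i32 1, i32 17, i32 33, i32 49, ...>
+//   store <64 x i8> %interleaved, <64 x i8>* %ptr, align 1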
+bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
+ ShuffleVectorInst *SVI,
+ unsigned Factor) const {
+ assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+ "Invalid interleave factor");
+
+ assert(cast<FixedVectorType>(SVI->getType())->getNumElements() % Factor ==
+ 0 &&
+ "Invalid interleaved store");
+
+ // Holds the indices of SVI that correspond to the starting index of each
+ // interleaved shuffle.
+ SmallVector<unsigned, 4> Indices;
+ auto Mask = SVI->getShuffleMask();
+ for (unsigned i = 0; i < Factor; i++)
+ Indices.push_back(Mask[i]);
+
+ ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI);
+
+ // Create an interleaved access group.
+ IRBuilder<> Builder(SI);
+ X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
+ Builder);
+
+ return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86IntrinsicsInfo.h
new file mode 100644
index 000000000000..72ab3e9cf78d
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -0,0 +1,1177 @@
+//===-- X86IntrinsicsInfo.h - X86 Intrinsics ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the details for lowering X86 intrinsics
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H
+#define LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H
+
+#include "X86ISelLowering.h"
+#include "X86InstrInfo.h"
+#include "llvm/IR/IntrinsicsX86.h"
+
+namespace llvm {
+
+enum IntrinsicType : uint16_t {
+ CVTNEPS2BF16_MASK,
+ GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASSS,
+ INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP_IMM8,
+ INTR_TYPE_3OP_IMM8,
+ CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV, BEXTRI,
+ CVTPD2PS_MASK,
+ INTR_TYPE_1OP_SAE, INTR_TYPE_2OP_SAE,
+ INTR_TYPE_1OP_MASK_SAE, INTR_TYPE_2OP_MASK_SAE, INTR_TYPE_3OP_MASK_SAE,
+ INTR_TYPE_1OP_MASK, INTR_TYPE_2OP_MASK,
+ IFMA_OP, VPERM_2OP, INTR_TYPE_SCALAR_MASK, INTR_TYPE_SCALAR_MASK_SAE,
+ INTR_TYPE_SCALAR_MASK_RND,
+ INTR_TYPE_3OP_SCALAR_MASK_SAE,
+ COMPRESS_EXPAND_IN_REG,
+ TRUNCATE_TO_REG, CVTPS2PH_MASK, CVTPD2DQ_MASK, CVTQQ2PS_MASK,
+ TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
+ FIXUPIMM, FIXUPIMM_MASKZ, GATHER_AVX2,
+ ROUNDP, ROUNDS
+};
+
+struct IntrinsicData {
+
+ uint16_t Id;
+ IntrinsicType Type;
+ uint16_t Opc0;
+ uint16_t Opc1;
+
+ bool operator<(const IntrinsicData &RHS) const {
+ return Id < RHS.Id;
+ }
+ bool operator==(const IntrinsicData &RHS) const {
+ return RHS.Id == Id;
+ }
+ friend bool operator<(const IntrinsicData &LHS, unsigned Id) {
+ return LHS.Id < Id;
+ }
+};
+
+#define X86_INTRINSIC_DATA(id, type, op0, op1) \
+ { Intrinsic::x86_##id, type, op0, op1 }
+
+/*
+ * IntrinsicsWithChain - the table should be sorted by Intrinsic ID, in
+ * alphabetical order.
+ */
+static const IntrinsicData IntrinsicsWithChain[] = {
+ X86_INTRINSIC_DATA(avx2_gather_d_d, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_d_256, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_pd, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_pd_256, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_ps, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_ps_256, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_q, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_q_256, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_d, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_d_256, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_pd, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_pd_256, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_ps, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_ps_256, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_q, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_q_256, GATHER_AVX2, 0, 0),
+
+ X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div2_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div2_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div4_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div4_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div4_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div4_si, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div8_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div8_si, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv2_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv2_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv4_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv4_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv4_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv4_si, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, 0, 0),
+
+ X86_INTRINSIC_DATA(avx512_gatherpf_dpd_512, PREFETCH,
+ X86::VGATHERPF0DPDm, X86::VGATHERPF1DPDm),
+ X86_INTRINSIC_DATA(avx512_gatherpf_dps_512, PREFETCH,
+ X86::VGATHERPF0DPSm, X86::VGATHERPF1DPSm),
+ X86_INTRINSIC_DATA(avx512_gatherpf_qpd_512, PREFETCH,
+ X86::VGATHERPF0QPDm, X86::VGATHERPF1QPDm),
+ X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH,
+ X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm),
+
+ X86_INTRINSIC_DATA(avx512_mask_gather_dpd_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather_dpi_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather_dpq_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather_dps_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather_qpd_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather_qpi_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather_qpq_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather_qps_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div2_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div2_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div4_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div4_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div4_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div4_si, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div8_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div8_si, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv2_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv2_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv4_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv4_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv4_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv4_si, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv8_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv8_si, GATHER, 0, 0),
+
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_128, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_256, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_512, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_128, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_256, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_512, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_128, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_256, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_512, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_db_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_db_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_db_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_mem_128, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_mem_256, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_mem_512, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_mem_128, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_mem_256, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_mem_512, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_mem_128, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_mem_256, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_mem_512, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_db_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_db_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_db_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_mem_128, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_mem_256, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_mem_512, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_mem_128, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_mem_256, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_mem_512, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_mem_128, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_mem_256, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_mem_512, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+
+ X86_INTRINSIC_DATA(avx512_mask_scatter_dpd_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatter_dpi_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatter_dpq_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatter_dps_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatter_qpd_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatter_qpi_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatter_qpq_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatter_qps_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_si, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_si, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv2_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv2_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv4_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv4_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv4_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv4_si, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv8_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv8_si, SCATTER, 0, 0),
+
+ X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_dps_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_qpd_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_qpi_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_qpq_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_qps_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv2_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv2_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv4_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv4_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv4_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv4_si, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv8_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv8_si, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterpf_dpd_512, PREFETCH, X86::VSCATTERPF0DPDm,
+ X86::VSCATTERPF1DPDm),
+ X86_INTRINSIC_DATA(avx512_scatterpf_dps_512, PREFETCH, X86::VSCATTERPF0DPSm,
+ X86::VSCATTERPF1DPSm),
+ X86_INTRINSIC_DATA(avx512_scatterpf_qpd_512, PREFETCH, X86::VSCATTERPF0QPDm,
+ X86::VSCATTERPF1QPDm),
+ X86_INTRINSIC_DATA(avx512_scatterpf_qps_512, PREFETCH, X86::VSCATTERPF0QPSm,
+ X86::VSCATTERPF1QPSm),
+ X86_INTRINSIC_DATA(avx512_scattersiv2_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv2_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv4_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv4_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv4_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(rdpmc, RDPMC, X86::RDPMC, 0),
+ X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0),
+ X86_INTRINSIC_DATA(rdrand_32, RDRAND, X86ISD::RDRAND, 0),
+ X86_INTRINSIC_DATA(rdrand_64, RDRAND, X86ISD::RDRAND, 0),
+ X86_INTRINSIC_DATA(rdseed_16, RDSEED, X86ISD::RDSEED, 0),
+ X86_INTRINSIC_DATA(rdseed_32, RDSEED, X86ISD::RDSEED, 0),
+ X86_INTRINSIC_DATA(rdseed_64, RDSEED, X86ISD::RDSEED, 0),
+ X86_INTRINSIC_DATA(rdtsc, RDTSC, X86::RDTSC, 0),
+ X86_INTRINSIC_DATA(rdtscp, RDTSC, X86::RDTSCP, 0),
+ X86_INTRINSIC_DATA(xgetbv, XGETBV, X86::XGETBV, 0),
+ X86_INTRINSIC_DATA(xtest, XTEST, X86ISD::XTEST, 0),
+};
+
+/*
+ * Find intrinsic data by intrinsic ID (binary search over the sorted table).
+ */
+static const IntrinsicData* getIntrinsicWithChain(unsigned IntNo) {
+ const IntrinsicData *Data = std::lower_bound(std::begin(IntrinsicsWithChain),
+ std::end(IntrinsicsWithChain),
+ IntNo);
+ if (Data != std::end(IntrinsicsWithChain) && Data->Id == IntNo)
+ return Data;
+ return nullptr;
+}
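Editor's note, not part of the patch: the helper above depends on the table being sorted by intrinsic ID, because std::lower_bound is a binary search that compares each table entry against the raw ID. The standalone sketch below illustrates that same pattern under hypothetical names (DemoIntrinsicData, DemoTable, lookupDemo); it is not the actual IntrinsicData type or lookup from this file, only a minimal, compilable illustration of the technique.

// Standalone sketch of the sorted-table + std::lower_bound lookup pattern.
// All names here are hypothetical stand-ins for the real table machinery.
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <iterator>

namespace {

struct DemoIntrinsicData {
  unsigned Id;      // stand-in for the Intrinsic::ID enum value
  const char *Name; // payload; the real table stores lowering info instead
};

// Heterogeneous comparison std::lower_bound relies on: table element vs. raw ID.
bool operator<(const DemoIntrinsicData &LHS, unsigned Id) {
  return LHS.Id < Id;
}

// Must stay sorted by Id, mirroring the sorting requirement on the
// IntrinsicsWithChain / IntrinsicsWithoutChain tables.
const DemoIntrinsicData DemoTable[] = {
    {10, "rdpmc"}, {20, "rdrand_32"}, {30, "rdtsc"}, {40, "xtest"},
};

const DemoIntrinsicData *lookupDemo(unsigned IntNo) {
  const DemoIntrinsicData *Data =
      std::lower_bound(std::begin(DemoTable), std::end(DemoTable), IntNo);
  if (Data != std::end(DemoTable) && Data->Id == IntNo)
    return Data;
  return nullptr; // unknown ID: caller falls back to other lowering paths
}

} // namespace

int main() {
  if (const DemoIntrinsicData *D = lookupDemo(30))
    std::printf("found %s\n", D->Name); // prints "found rdtsc"
  assert(lookupDemo(31) == nullptr);    // IDs absent from the table miss cleanly
  return 0;
}

The usage point of the sketch: if an entry were placed out of order, lower_bound could stop short of it and the lookup would silently return nullptr, which is why the sorting requirement is called out in the table comments.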
+
+/*
+ * IntrinsicsWithoutChain - the table should be sorted by Intrinsic ID, i.e. in
+ * alphabetical order, so that the binary-search lookup stays correct.
+ */
+static const IntrinsicData IntrinsicsWithoutChain[] = {
+ X86_INTRINSIC_DATA(addcarry_32, ADX, X86ISD::ADC, X86ISD::ADD),
+ X86_INTRINSIC_DATA(addcarry_64, ADX, X86ISD::ADC, X86ISD::ADD),
+ X86_INTRINSIC_DATA(avx_addsub_pd_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
+ X86_INTRINSIC_DATA(avx_addsub_ps_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
+ X86_INTRINSIC_DATA(avx_blendv_pd_256, BLENDV, X86ISD::BLENDV, 0),
+ X86_INTRINSIC_DATA(avx_blendv_ps_256, BLENDV, X86ISD::BLENDV, 0),
+ X86_INTRINSIC_DATA(avx_cmp_pd_256, INTR_TYPE_3OP, X86ISD::CMPP, 0),
+ X86_INTRINSIC_DATA(avx_cmp_ps_256, INTR_TYPE_3OP, X86ISD::CMPP, 0),
+ X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,INTR_TYPE_1OP, X86ISD::VFPROUND, 0),
+ X86_INTRINSIC_DATA(avx_cvt_pd2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx_cvt_ps2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx_cvtt_pd2dq_256,INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
+ X86_INTRINSIC_DATA(avx_cvtt_ps2dq_256,INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
+ X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
+ X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
+ X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+ X86_INTRINSIC_DATA(avx_hsub_ps_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+ X86_INTRINSIC_DATA(avx_max_pd_256, INTR_TYPE_2OP, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx_max_ps_256, INTR_TYPE_2OP, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx_min_pd_256, INTR_TYPE_2OP, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx_min_ps_256, INTR_TYPE_2OP, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx_movmsk_pd_256, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+ X86_INTRINSIC_DATA(avx_movmsk_ps_256, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+ X86_INTRINSIC_DATA(avx_rcp_ps_256, INTR_TYPE_1OP, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx_round_pd_256, ROUNDP, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx_round_ps_256, ROUNDP, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx_rsqrt_ps_256, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx_vpermilvar_pd, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx_vpermilvar_ps, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx_vpermilvar_ps_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx2_packssdw, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx2_pblendvb, BLENDV, X86ISD::BLENDV, 0),
+ X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
+ X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
+ X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+ X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+ X86_INTRINSIC_DATA(avx2_pmadd_ub_sw, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
+ X86_INTRINSIC_DATA(avx2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
+ X86_INTRINSIC_DATA(avx2_pmovmskb, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+ X86_INTRINSIC_DATA(avx2_pmul_hr_sw, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
+ X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
+ X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
+ X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
+ X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
+ X86_INTRINSIC_DATA(avx2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx2_pslli_d, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx2_pslli_q, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx2_pslli_w, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx2_psllv_d, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
+ X86_INTRINSIC_DATA(avx2_psllv_d_256, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
+ X86_INTRINSIC_DATA(avx2_psllv_q, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
+ X86_INTRINSIC_DATA(avx2_psllv_q_256, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
+ X86_INTRINSIC_DATA(avx2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx2_psrai_d, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx2_psrai_w, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx2_psrav_d, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx2_psrav_d_256, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx2_psrl_d, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx2_psrl_q, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx2_psrl_w, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx2_psrli_d, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx2_psrli_q, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx2_psrli_w, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx2_psrlv_d, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx512_add_pd_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
+ X86_INTRINSIC_DATA(avx512_add_ps_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
+ X86_INTRINSIC_DATA(avx512_conflict_d_128, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_conflict_d_256, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_conflict_d_512, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_conflict_q_128, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_conflict_q_256, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_conflict_q_512, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_2OP, X86ISD::SCALAR_SINT_TO_FP, X86ISD::SCALAR_SINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_2OP, X86ISD::SCALAR_SINT_TO_FP, X86ISD::SCALAR_SINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_2OP, X86ISD::SCALAR_SINT_TO_FP, X86ISD::SCALAR_SINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_2OP, X86ISD::SCALAR_UINT_TO_FP, X86ISD::SCALAR_UINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_2OP, X86ISD::SCALAR_UINT_TO_FP, X86ISD::SCALAR_UINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_2OP, X86ISD::SCALAR_UINT_TO_FP, X86ISD::SCALAR_UINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_dbpsadbw_128, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0),
+ X86_INTRINSIC_DATA(avx512_dbpsadbw_256, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0),
+ X86_INTRINSIC_DATA(avx512_dbpsadbw_512, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0),
+ X86_INTRINSIC_DATA(avx512_div_pd_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND),
+ X86_INTRINSIC_DATA(avx512_div_ps_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND),
+ X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_SAE, X86ISD::EXP2, X86ISD::EXP2_SAE),
+ X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_SAE, X86ISD::EXP2, X86ISD::EXP2_SAE),
+ X86_INTRINSIC_DATA(avx512_fpclass_pd_128, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_fpclass_pd_256, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_fpclass_pd_512, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_fpclass_ps_128, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_fpclass_ps_256, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_fpclass_ps_512, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_kadd_b, INTR_TYPE_2OP, X86ISD::KADD, 0),
+ X86_INTRINSIC_DATA(avx512_kadd_d, INTR_TYPE_2OP, X86ISD::KADD, 0),
+ X86_INTRINSIC_DATA(avx512_kadd_q, INTR_TYPE_2OP, X86ISD::KADD, 0),
+ X86_INTRINSIC_DATA(avx512_kadd_w, INTR_TYPE_2OP, X86ISD::KADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FADDS, X86ISD::FADDS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FADDS, X86ISD::FADDS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPMM, X86ISD::CMPMM_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPMM, X86ISD::CMPMM_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC,
+ X86ISD::FSETCCM, X86ISD::FSETCCM_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC,
+ X86ISD::FSETCCM, X86ISD::FSETCCM_SAE),
+
+ X86_INTRINSIC_DATA(avx512_mask_compress, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, CVTPD2DQ_MASK,
+ X86ISD::CVTP2SI, X86ISD::MCVTP2SI),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps, CVTPD2PS_MASK,
+ X86ISD::VFPROUND, X86ISD::VMFPROUND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VFPROUND, X86ISD::VFPROUND_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_128, CVTPD2DQ_MASK,
+ X86ISD::CVTP2UI, X86ISD::MCVTP2UI),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK_SAE,
+ ISD::FP_EXTEND, X86ISD::VFPEXT_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, CVTQQ2PS_MASK,
+ X86ISD::CVTSI2P, X86ISD::MCVTSI2P),
+ X86_INTRINSIC_DATA(avx512_mask_cvtsd2ss_round, INTR_TYPE_SCALAR_MASK_RND,
+ X86ISD::VFPROUNDS, X86ISD::VFPROUNDS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::VFPEXTS, X86ISD::VFPEXTS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, CVTPD2DQ_MASK,
+ X86ISD::CVTTP2SI, X86ISD::MCVTTP2SI),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, CVTPD2DQ_MASK,
+ X86ISD::CVTTP2UI, X86ISD::MCVTTP2UI),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, CVTQQ2PS_MASK,
+ X86ISD::CVTUI2P, X86ISD::MCVTUI2P),
+ X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FDIVS, X86ISD::FDIVS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FDIVS, X86ISD::FDIVS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_expand, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_128, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_256, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_512, FIXUPIMM, X86ISD::VFIXUPIMM, X86ISD::VFIXUPIMM_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_128, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_256, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_512, FIXUPIMM, X86ISD::VFIXUPIMM, X86ISD::VFIXUPIMM_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_sd, FIXUPIMM, X86ISD::VFIXUPIMMS, X86ISD::VFIXUPIMMS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_ss, FIXUPIMM, X86ISD::VFIXUPIMMS, X86ISD::VFIXUPIMMS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_sd, FPCLASSS, X86ISD::VFPCLASSS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_ss, FPCLASSS, X86ISD::VFPCLASSS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK,
+ X86ISD::FGETEXP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_pd_256, INTR_TYPE_1OP_MASK,
+ X86ISD::FGETEXP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_pd_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::FGETEXP, X86ISD::FGETEXP_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_ps_128, INTR_TYPE_1OP_MASK,
+ X86ISD::FGETEXP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_ps_256, INTR_TYPE_1OP_MASK,
+ X86ISD::FGETEXP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::FGETEXP, X86ISD::FGETEXP_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_sd, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::FGETEXPS, X86ISD::FGETEXPS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::FGETEXPS, X86ISD::FGETEXPS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK_SAE,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK_SAE,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK_SAE,
+ X86ISD::VGETMANT, X86ISD::VGETMANT_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK_SAE,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK_SAE,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK_SAE,
+ X86ISD::VGETMANT, X86ISD::VGETMANT_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK_SAE,
+ X86ISD::VGETMANTS, X86ISD::VGETMANTS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_SAE,
+ X86ISD::VGETMANTS, X86ISD::VGETMANTS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::FMAXS, X86ISD::FMAXS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::FMAXS, X86ISD::FMAXS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::FMINS, X86ISD::FMINS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::FMINS, X86ISD::FMINS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FMULS, X86ISD::FMULS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FMULS, X86ISD::FMULS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, TRUNCATE_TO_REG,
+ X86ISD::VTRUNC, X86ISD::VMTRUNC),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, TRUNCATE_TO_REG,
+ X86ISD::VTRUNC, X86ISD::VMTRUNC),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_512, TRUNCATE_TO_REG,
+ ISD::TRUNCATE, X86ISD::VMTRUNC),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_128, TRUNCATE_TO_REG,
+ X86ISD::VTRUNC, X86ISD::VMTRUNC),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_256, TRUNCATE_TO_REG,
+ ISD::TRUNCATE, X86ISD::VMTRUNC),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_512, TRUNCATE_TO_REG,
+ ISD::TRUNCATE, X86ISD::VMTRUNC),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_128, TRUNCATE_TO_REG,
+ X86ISD::VTRUNC, X86ISD::VMTRUNC),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_256, TRUNCATE_TO_REG,
+ X86ISD::VTRUNC, X86ISD::VMTRUNC),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_512, TRUNCATE_TO_REG,
+ X86ISD::VTRUNC, X86ISD::VMTRUNC),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qd_128, TRUNCATE_TO_REG,
+ X86ISD::VTRUNC, X86ISD::VMTRUNC),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_128, TRUNCATE_TO_REG,
+ X86ISD::VTRUNC, X86ISD::VMTRUNC),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_256, TRUNCATE_TO_REG,
+ X86ISD::VTRUNC, X86ISD::VMTRUNC),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_512, TRUNCATE_TO_REG,
+ ISD::TRUNCATE, X86ISD::VMTRUNC),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_wb_128, TRUNCATE_TO_REG,
+ X86ISD::VTRUNC, X86ISD::VMTRUNC),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_db_128, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCS, X86ISD::VMTRUNCS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_db_256, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCS, X86ISD::VMTRUNCS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_db_512, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCS, X86ISD::VMTRUNCS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_128, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCS, X86ISD::VMTRUNCS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_256, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCS, X86ISD::VMTRUNCS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_512, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCS, X86ISD::VMTRUNCS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_128, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCS, X86ISD::VMTRUNCS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_256, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCS, X86ISD::VMTRUNCS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_512, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCS, X86ISD::VMTRUNCS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_128, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCS, X86ISD::VMTRUNCS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_128, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCS, X86ISD::VMTRUNCS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_256, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCS, X86ISD::VMTRUNCS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_512, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCS, X86ISD::VMTRUNCS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_128, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCS, X86ISD::VMTRUNCS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_db_128, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCUS, X86ISD::VMTRUNCUS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_db_256, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCUS, X86ISD::VMTRUNCUS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_db_512, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCUS, X86ISD::VMTRUNCUS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_128, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCUS, X86ISD::VMTRUNCUS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_256, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCUS, X86ISD::VMTRUNCUS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_512, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCUS, X86ISD::VMTRUNCUS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_128, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCUS, X86ISD::VMTRUNCUS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_256, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCUS, X86ISD::VMTRUNCUS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_512, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCUS, X86ISD::VMTRUNCUS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_128, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCUS, X86ISD::VMTRUNCUS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_128, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCUS, X86ISD::VMTRUNCUS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_256, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCUS, X86ISD::VMTRUNCUS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_512, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCUS, X86ISD::VMTRUNCUS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_128, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCUS, X86ISD::VMTRUNCUS),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, X86ISD::VRANGE_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, X86ISD::VRANGE_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, X86ISD::VREDUCE_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, X86ISD::VREDUCE_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK,
+ X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK,
+ X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK,
+ X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK,
+ X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_pd_512, INTR_TYPE_2OP_MASK,
+ X86ISD::SCALEF, X86ISD::SCALEF_RND),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_ps_128, INTR_TYPE_2OP_MASK,
+ X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_ps_256, INTR_TYPE_2OP_MASK,
+ X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_ps_512, INTR_TYPE_2OP_MASK,
+ X86ISD::SCALEF, X86ISD::SCALEF_RND),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_sd, INTR_TYPE_SCALAR_MASK,
+ X86ISD::SCALEFS, X86ISD::SCALEFS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK,
+ X86ISD::SCALEFS, X86ISD::SCALEFS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FSQRTS, X86ISD::FSQRTS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FSQRTS, X86ISD::FSQRTS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FSUBS, X86ISD::FSUBS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FSUBS, X86ISD::FSUBS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTPH2PS, X86ISD::CVTPH2PS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, CVTPS2PH_MASK,
+ X86ISD::CVTPS2PH, X86ISD::MCVTPS2PH),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, CVTPS2PH_MASK,
+ X86ISD::CVTPS2PH, X86ISD::MCVTPS2PH),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, CVTPS2PH_MASK,
+ X86ISD::CVTPS2PH, X86ISD::MCVTPS2PH),
+
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_512, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMM, X86ISD::VFIXUPIMM_SAE),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_128, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_256, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_512, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMM, X86ISD::VFIXUPIMM_SAE),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_sd, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMMS, X86ISD::VFIXUPIMMS_SAE),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ss, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMMS, X86ISD::VFIXUPIMMS_SAE),
+
+ X86_INTRINSIC_DATA(avx512_max_pd_512, INTR_TYPE_2OP_SAE, X86ISD::FMAX, X86ISD::FMAX_SAE),
+ X86_INTRINSIC_DATA(avx512_max_ps_512, INTR_TYPE_2OP_SAE, X86ISD::FMAX, X86ISD::FMAX_SAE),
+ X86_INTRINSIC_DATA(avx512_min_pd_512, INTR_TYPE_2OP_SAE, X86ISD::FMIN, X86ISD::FMIN_SAE),
+ X86_INTRINSIC_DATA(avx512_min_ps_512, INTR_TYPE_2OP_SAE, X86ISD::FMIN, X86ISD::FMIN_SAE),
+ X86_INTRINSIC_DATA(avx512_mul_pd_512, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND),
+ X86_INTRINSIC_DATA(avx512_mul_ps_512, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND),
+ X86_INTRINSIC_DATA(avx512_packssdw_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx512_pavg_b_512, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx512_pavg_w_512, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_df_256, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_df_512, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_di_256, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_di_512, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_hi_128, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_hi_256, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_hi_512, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_qi_128, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_qi_256, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_qi_512, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_sf_512, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_permvar_si_512, VPERM_2OP, X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_pmaddubs_w_512, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
+ X86_INTRINSIC_DATA(avx512_pmaddw_d_512, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
+ X86_INTRINSIC_DATA(avx512_pmul_hr_sw_512, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
+ X86_INTRINSIC_DATA(avx512_pmulh_w_512, INTR_TYPE_2OP, ISD::MULHS, 0),
+ X86_INTRINSIC_DATA(avx512_pmulhu_w_512, INTR_TYPE_2OP, ISD::MULHU, 0),
+ X86_INTRINSIC_DATA(avx512_pmultishift_qb_128, INTR_TYPE_2OP, X86ISD::MULTISHIFT, 0),
+ X86_INTRINSIC_DATA(avx512_pmultishift_qb_256, INTR_TYPE_2OP, X86ISD::MULTISHIFT, 0),
+ X86_INTRINSIC_DATA(avx512_pmultishift_qb_512, INTR_TYPE_2OP, X86ISD::MULTISHIFT, 0),
+ X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
+ X86_INTRINSIC_DATA(avx512_pshuf_b_512, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
+ X86_INTRINSIC_DATA(avx512_psll_d_512, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx512_psll_q_512, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx512_psll_w_512, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx512_pslli_d_512, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_pslli_q_512, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_pslli_w_512, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_d_512, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_q_512, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_w_128, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_w_256, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_w_512, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
+ X86_INTRINSIC_DATA(avx512_psra_d_512, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_psra_q_128, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_psra_q_256, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_psra_q_512, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_psra_w_512, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_psrai_d_512, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_psrai_q_128, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_psrai_q_256, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_psrai_q_512, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_psrai_w_512, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_psrav_d_512, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_psrav_q_128, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_psrav_q_256, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_psrav_q_512, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_psrav_w_128, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_psrav_w_256, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_psrav_w_512, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_psrl_d_512, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_psrl_q_512, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_psrl_w_512, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_psrli_d_512, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_psrli_q_512, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_psrli_w_512, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_d_512, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_q_512, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_d_128, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_d_256, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_d_512, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_q_128, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_q_256, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_q_512, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
+ X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_SAE, X86ISD::RCP28, X86ISD::RCP28_SAE),
+ X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_SAE, X86ISD::RCP28, X86ISD::RCP28_SAE),
+ X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_SAE, X86ISD::RCP28S, X86ISD::RCP28S_SAE),
+ X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_SAE, X86ISD::RCP28S, X86ISD::RCP28S_SAE),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_SAE, X86ISD::RSQRT28, X86ISD::RSQRT28_SAE),
+ X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_SAE, X86ISD::RSQRT28, X86ISD::RSQRT28_SAE),
+ X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_SAE, X86ISD::RSQRT28S, X86ISD::RSQRT28S_SAE),
+ X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_SAE, X86ISD::RSQRT28S, X86ISD::RSQRT28S_SAE),
+ X86_INTRINSIC_DATA(avx512_sitofp_round, INTR_TYPE_1OP, ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_sqrt_pd_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND),
+ X86_INTRINSIC_DATA(avx512_sqrt_ps_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND),
+ X86_INTRINSIC_DATA(avx512_sub_pd_512, INTR_TYPE_2OP, ISD::FSUB, X86ISD::FSUB_RND),
+ X86_INTRINSIC_DATA(avx512_sub_ps_512, INTR_TYPE_2OP, ISD::FSUB, X86ISD::FSUB_RND),
+ X86_INTRINSIC_DATA(avx512_uitofp_round, INTR_TYPE_1OP, ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
+ X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
+ X86_INTRINSIC_DATA(avx512_vcvtsd2si32, INTR_TYPE_1OP, X86ISD::CVTS2SI, X86ISD::CVTS2SI_RND),
+ X86_INTRINSIC_DATA(avx512_vcvtsd2si64, INTR_TYPE_1OP, X86ISD::CVTS2SI, X86ISD::CVTS2SI_RND),
+ X86_INTRINSIC_DATA(avx512_vcvtsd2usi32, INTR_TYPE_1OP, X86ISD::CVTS2UI, X86ISD::CVTS2UI_RND),
+ X86_INTRINSIC_DATA(avx512_vcvtsd2usi64, INTR_TYPE_1OP, X86ISD::CVTS2UI, X86ISD::CVTS2UI_RND),
+ X86_INTRINSIC_DATA(avx512_vcvtss2si32, INTR_TYPE_1OP, X86ISD::CVTS2SI, X86ISD::CVTS2SI_RND),
+ X86_INTRINSIC_DATA(avx512_vcvtss2si64, INTR_TYPE_1OP, X86ISD::CVTS2SI, X86ISD::CVTS2SI_RND),
+ X86_INTRINSIC_DATA(avx512_vcvtss2usi32, INTR_TYPE_1OP, X86ISD::CVTS2UI, X86ISD::CVTS2UI_RND),
+ X86_INTRINSIC_DATA(avx512_vcvtss2usi64, INTR_TYPE_1OP, X86ISD::CVTS2UI, X86ISD::CVTS2UI_RND),
+ X86_INTRINSIC_DATA(avx512_vfmadd_f32, INTR_TYPE_3OP, ISD::FMA, X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512_vfmadd_f64, INTR_TYPE_3OP, ISD::FMA, X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512_vfmadd_pd_512, INTR_TYPE_3OP, ISD::FMA, X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512_vfmadd_ps_512, INTR_TYPE_3OP, ISD::FMA, X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512_vfmaddsub_pd_512, INTR_TYPE_3OP, X86ISD::FMADDSUB,
+ X86ISD::FMADDSUB_RND),
+ X86_INTRINSIC_DATA(avx512_vfmaddsub_ps_512, INTR_TYPE_3OP, X86ISD::FMADDSUB,
+ X86ISD::FMADDSUB_RND),
+
+ X86_INTRINSIC_DATA(avx512_vpdpbusd_128, INTR_TYPE_3OP, X86ISD::VPDPBUSD, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpbusd_256, INTR_TYPE_3OP, X86ISD::VPDPBUSD, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpbusd_512, INTR_TYPE_3OP, X86ISD::VPDPBUSD, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpbusds_128, INTR_TYPE_3OP, X86ISD::VPDPBUSDS, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpbusds_256, INTR_TYPE_3OP, X86ISD::VPDPBUSDS, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpbusds_512, INTR_TYPE_3OP, X86ISD::VPDPBUSDS, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpwssd_128, INTR_TYPE_3OP, X86ISD::VPDPWSSD, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpwssd_256, INTR_TYPE_3OP, X86ISD::VPDPWSSD, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpwssd_512, INTR_TYPE_3OP, X86ISD::VPDPWSSD, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpwssds_128, INTR_TYPE_3OP, X86ISD::VPDPWSSDS, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpwssds_256, INTR_TYPE_3OP, X86ISD::VPDPWSSDS, 0),
+ X86_INTRINSIC_DATA(avx512_vpdpwssds_512, INTR_TYPE_3OP, X86ISD::VPDPWSSDS, 0),
+
+ X86_INTRINSIC_DATA(avx512_vpermi2var_d_128, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_d_256, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_d_512, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_hi_128, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_hi_256, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_hi_512, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_pd_128, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_pd_256, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_pd_512, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_ps_128, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_ps_256, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_ps_512, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_q_128, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_q_256, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_q_512, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_qi_128, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_qi_256, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermi2var_qi_512, INTR_TYPE_3OP, X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_vpermilvar_pd_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx512_vpermilvar_ps_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52h_uq_128 , IFMA_OP, X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52h_uq_256 , IFMA_OP, X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52h_uq_512 , IFMA_OP, X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_128 , IFMA_OP, X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_256 , IFMA_OP, X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_512 , IFMA_OP, X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_vpshufbitqmb_128, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0),
+ X86_INTRINSIC_DATA(avx512_vpshufbitqmb_256, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0),
+ X86_INTRINSIC_DATA(avx512_vpshufbitqmb_512, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0),
+ // bfloat16
+ X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_128, INTR_TYPE_2OP, X86ISD::CVTNE2PS2BF16, 0),
+ X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_256, INTR_TYPE_2OP, X86ISD::CVTNE2PS2BF16, 0),
+ X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_512, INTR_TYPE_2OP, X86ISD::CVTNE2PS2BF16, 0),
+ X86_INTRINSIC_DATA(avx512bf16_cvtneps2bf16_256, INTR_TYPE_1OP, X86ISD::CVTNEPS2BF16, 0),
+ X86_INTRINSIC_DATA(avx512bf16_cvtneps2bf16_512, INTR_TYPE_1OP, X86ISD::CVTNEPS2BF16, 0),
+ X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_128, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0),
+ X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_256, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0),
+ X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_512, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0),
+ X86_INTRINSIC_DATA(avx512bf16_mask_cvtneps2bf16_128, CVTNEPS2BF16_MASK, X86ISD::CVTNEPS2BF16, X86ISD::MCVTNEPS2BF16),
+ X86_INTRINSIC_DATA(bmi_bextr_32, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
+ X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
+ X86_INTRINSIC_DATA(bmi_bzhi_32, INTR_TYPE_2OP, X86ISD::BZHI, 0),
+ X86_INTRINSIC_DATA(bmi_bzhi_64, INTR_TYPE_2OP, X86ISD::BZHI, 0),
+ X86_INTRINSIC_DATA(bmi_pdep_32, INTR_TYPE_2OP, X86ISD::PDEP, 0),
+ X86_INTRINSIC_DATA(bmi_pdep_64, INTR_TYPE_2OP, X86ISD::PDEP, 0),
+ X86_INTRINSIC_DATA(bmi_pext_32, INTR_TYPE_2OP, X86ISD::PEXT, 0),
+ X86_INTRINSIC_DATA(bmi_pext_64, INTR_TYPE_2OP, X86ISD::PEXT, 0),
+ X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0),
+ X86_INTRINSIC_DATA(sse_cmp_ss, INTR_TYPE_3OP, X86ISD::FSETCC, 0),
+ X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
+ X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE),
+ X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT),
+ X86_INTRINSIC_DATA(sse_comile_ss, COMI, X86ISD::COMI, ISD::SETLE),
+ X86_INTRINSIC_DATA(sse_comilt_ss, COMI, X86ISD::COMI, ISD::SETLT),
+ X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE),
+ X86_INTRINSIC_DATA(sse_cvtss2si, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0),
+ X86_INTRINSIC_DATA(sse_cvtss2si64, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0),
+ X86_INTRINSIC_DATA(sse_cvttss2si, INTR_TYPE_1OP, X86ISD::CVTTS2SI, 0),
+ X86_INTRINSIC_DATA(sse_cvttss2si64, INTR_TYPE_1OP, X86ISD::CVTTS2SI, 0),
+ X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(sse_max_ss, INTR_TYPE_2OP, X86ISD::FMAXS, 0),
+ X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(sse_min_ss, INTR_TYPE_2OP, X86ISD::FMINS, 0),
+ X86_INTRINSIC_DATA(sse_movmsk_ps, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+ X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ),
+ X86_INTRINSIC_DATA(sse_ucomige_ss, COMI, X86ISD::UCOMI, ISD::SETGE),
+ X86_INTRINSIC_DATA(sse_ucomigt_ss, COMI, X86ISD::UCOMI, ISD::SETGT),
+ X86_INTRINSIC_DATA(sse_ucomile_ss, COMI, X86ISD::UCOMI, ISD::SETLE),
+ X86_INTRINSIC_DATA(sse_ucomilt_ss, COMI, X86ISD::UCOMI, ISD::SETLT),
+ X86_INTRINSIC_DATA(sse_ucomineq_ss, COMI, X86ISD::UCOMI, ISD::SETNE),
+ X86_INTRINSIC_DATA(sse2_cmp_pd, INTR_TYPE_3OP, X86ISD::CMPP, 0),
+ X86_INTRINSIC_DATA(sse2_cmp_sd, INTR_TYPE_3OP, X86ISD::FSETCC, 0),
+ X86_INTRINSIC_DATA(sse2_comieq_sd, COMI, X86ISD::COMI, ISD::SETEQ),
+ X86_INTRINSIC_DATA(sse2_comige_sd, COMI, X86ISD::COMI, ISD::SETGE),
+ X86_INTRINSIC_DATA(sse2_comigt_sd, COMI, X86ISD::COMI, ISD::SETGT),
+ X86_INTRINSIC_DATA(sse2_comile_sd, COMI, X86ISD::COMI, ISD::SETLE),
+ X86_INTRINSIC_DATA(sse2_comilt_sd, COMI, X86ISD::COMI, ISD::SETLT),
+ X86_INTRINSIC_DATA(sse2_comineq_sd, COMI, X86ISD::COMI, ISD::SETNE),
+ X86_INTRINSIC_DATA(sse2_cvtpd2dq, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(sse2_cvtpd2ps, INTR_TYPE_1OP, X86ISD::VFPROUND, 0),
+ X86_INTRINSIC_DATA(sse2_cvtps2dq, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(sse2_cvtsd2si, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0),
+ X86_INTRINSIC_DATA(sse2_cvtsd2si64, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0),
+ X86_INTRINSIC_DATA(sse2_cvtsd2ss, INTR_TYPE_2OP, X86ISD::VFPROUNDS, 0),
+ X86_INTRINSIC_DATA(sse2_cvttpd2dq, INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
+ X86_INTRINSIC_DATA(sse2_cvttps2dq, INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
+ X86_INTRINSIC_DATA(sse2_cvttsd2si, INTR_TYPE_1OP, X86ISD::CVTTS2SI, 0),
+ X86_INTRINSIC_DATA(sse2_cvttsd2si64, INTR_TYPE_1OP, X86ISD::CVTTS2SI, 0),
+ X86_INTRINSIC_DATA(sse2_max_pd, INTR_TYPE_2OP, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(sse2_max_sd, INTR_TYPE_2OP, X86ISD::FMAXS, 0),
+ X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(sse2_min_sd, INTR_TYPE_2OP, X86ISD::FMINS, 0),
+ X86_INTRINSIC_DATA(sse2_movmsk_pd, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+ X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
+ X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+ X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
+ X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
+ X86_INTRINSIC_DATA(sse2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
+ X86_INTRINSIC_DATA(sse2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(sse2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(sse2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(sse2_pslli_d, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(sse2_pslli_q, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(sse2_pslli_w, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(sse2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(sse2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(sse2_psrai_d, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(sse2_psrai_w, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(sse2_psrl_d, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(sse2_psrl_q, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(sse2_psrl_w, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(sse2_psrli_d, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(sse2_psrli_q, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(sse2_psrli_w, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(sse2_ucomieq_sd, COMI, X86ISD::UCOMI, ISD::SETEQ),
+ X86_INTRINSIC_DATA(sse2_ucomige_sd, COMI, X86ISD::UCOMI, ISD::SETGE),
+ X86_INTRINSIC_DATA(sse2_ucomigt_sd, COMI, X86ISD::UCOMI, ISD::SETGT),
+ X86_INTRINSIC_DATA(sse2_ucomile_sd, COMI, X86ISD::UCOMI, ISD::SETLE),
+ X86_INTRINSIC_DATA(sse2_ucomilt_sd, COMI, X86ISD::UCOMI, ISD::SETLT),
+ X86_INTRINSIC_DATA(sse2_ucomineq_sd, COMI, X86ISD::UCOMI, ISD::SETNE),
+ X86_INTRINSIC_DATA(sse3_addsub_pd, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
+ X86_INTRINSIC_DATA(sse3_addsub_ps, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
+ X86_INTRINSIC_DATA(sse3_hadd_pd, INTR_TYPE_2OP, X86ISD::FHADD, 0),
+ X86_INTRINSIC_DATA(sse3_hadd_ps, INTR_TYPE_2OP, X86ISD::FHADD, 0),
+ X86_INTRINSIC_DATA(sse3_hsub_pd, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+ X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+ X86_INTRINSIC_DATA(sse41_blendvpd, BLENDV, X86ISD::BLENDV, 0),
+ X86_INTRINSIC_DATA(sse41_blendvps, BLENDV, X86ISD::BLENDV, 0),
+ X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0),
+ X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(sse41_pblendvb, BLENDV, X86ISD::BLENDV, 0),
+ X86_INTRINSIC_DATA(sse41_phminposuw, INTR_TYPE_1OP, X86ISD::PHMINPOS, 0),
+ X86_INTRINSIC_DATA(sse41_round_pd, ROUNDP, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(sse41_round_ps, ROUNDP, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(sse41_round_sd, ROUNDS, X86ISD::VRNDSCALES, 0),
+ X86_INTRINSIC_DATA(sse41_round_ss, ROUNDS, X86ISD::VRNDSCALES, 0),
+ X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0),
+ X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP_IMM8, X86ISD::INSERTQI, 0),
+ X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
+ X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
+ X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+ X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+ X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw_128, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
+ X86_INTRINSIC_DATA(ssse3_pmul_hr_sw_128, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
+ X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
+ X86_INTRINSIC_DATA(subborrow_32, ADX, X86ISD::SBB, X86ISD::SUB),
+ X86_INTRINSIC_DATA(subborrow_64, ADX, X86ISD::SBB, X86ISD::SUB),
+ X86_INTRINSIC_DATA(tbm_bextri_u32, BEXTRI, X86ISD::BEXTRI, 0),
+ X86_INTRINSIC_DATA(tbm_bextri_u64, BEXTRI, X86ISD::BEXTRI, 0),
+ X86_INTRINSIC_DATA(vcvtps2ph_128, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
+ X86_INTRINSIC_DATA(vcvtps2ph_256, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
+
+ X86_INTRINSIC_DATA(vgf2p8affineinvqb_128, INTR_TYPE_3OP,
+ X86ISD::GF2P8AFFINEINVQB, 0),
+ X86_INTRINSIC_DATA(vgf2p8affineinvqb_256, INTR_TYPE_3OP,
+ X86ISD::GF2P8AFFINEINVQB, 0),
+ X86_INTRINSIC_DATA(vgf2p8affineinvqb_512, INTR_TYPE_3OP,
+ X86ISD::GF2P8AFFINEINVQB, 0),
+ X86_INTRINSIC_DATA(vgf2p8affineqb_128, INTR_TYPE_3OP,
+ X86ISD::GF2P8AFFINEQB, 0),
+ X86_INTRINSIC_DATA(vgf2p8affineqb_256, INTR_TYPE_3OP,
+ X86ISD::GF2P8AFFINEQB, 0),
+ X86_INTRINSIC_DATA(vgf2p8affineqb_512, INTR_TYPE_3OP,
+ X86ISD::GF2P8AFFINEQB, 0),
+ X86_INTRINSIC_DATA(vgf2p8mulb_128, INTR_TYPE_2OP,
+ X86ISD::GF2P8MULB, 0),
+ X86_INTRINSIC_DATA(vgf2p8mulb_256, INTR_TYPE_2OP,
+ X86ISD::GF2P8MULB, 0),
+ X86_INTRINSIC_DATA(vgf2p8mulb_512, INTR_TYPE_2OP,
+ X86ISD::GF2P8MULB, 0),
+
+ X86_INTRINSIC_DATA(xop_vpermil2pd, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0),
+ X86_INTRINSIC_DATA(xop_vpermil2pd_256, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0),
+ X86_INTRINSIC_DATA(xop_vpermil2ps, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0),
+ X86_INTRINSIC_DATA(xop_vpermil2ps_256, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0),
+ X86_INTRINSIC_DATA(xop_vpperm, INTR_TYPE_3OP, X86ISD::VPPERM, 0),
+ X86_INTRINSIC_DATA(xop_vpshab, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+ X86_INTRINSIC_DATA(xop_vpshad, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+ X86_INTRINSIC_DATA(xop_vpshaq, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+ X86_INTRINSIC_DATA(xop_vpshaw, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+ X86_INTRINSIC_DATA(xop_vpshlb, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
+ X86_INTRINSIC_DATA(xop_vpshld, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
+ X86_INTRINSIC_DATA(xop_vpshlq, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
+ X86_INTRINSIC_DATA(xop_vpshlw, INTR_TYPE_2OP, X86ISD::VPSHL, 0)
+};
+
+/*
+ * Retrieve the data for an intrinsic without a chain.
+ * Returns nullptr if the intrinsic is not defined in the table.
+ */
+static const IntrinsicData* getIntrinsicWithoutChain(unsigned IntNo) {
+ const IntrinsicData *Data = std::lower_bound(std::begin(IntrinsicsWithoutChain),
+ std::end(IntrinsicsWithoutChain),
+ IntNo);
+ if (Data != std::end(IntrinsicsWithoutChain) && Data->Id == IntNo)
+ return Data;
+ return nullptr;
+}
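+
+// Hedged usage sketch (illustrative, not part of this table): lowering code
+// is expected to probe the table and fall back to custom handling when the
+// lookup fails, e.g.
+//   if (const IntrinsicData *Data = getIntrinsicWithoutChain(IntNo))
+//     ... lower the intrinsic according to the table entry ...
+//   else
+//     ... handle the intrinsic with bespoke code ...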
+
+static void verifyIntrinsicTables() {
+ assert(llvm::is_sorted(IntrinsicsWithoutChain) &&
+ llvm::is_sorted(IntrinsicsWithChain) &&
+ "Intrinsic data tables should be sorted by Intrinsic ID");
+ assert((std::adjacent_find(std::begin(IntrinsicsWithoutChain),
+ std::end(IntrinsicsWithoutChain)) ==
+ std::end(IntrinsicsWithoutChain)) &&
+ (std::adjacent_find(std::begin(IntrinsicsWithChain),
+ std::end(IntrinsicsWithChain)) ==
+ std::end(IntrinsicsWithChain)) &&
+ "Intrinsic data tables should have unique entries");
+}
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.cpp
new file mode 100644
index 000000000000..1b371ac2a108
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.cpp
@@ -0,0 +1,526 @@
+//===- X86LegalizerInfo.cpp --------------------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the MachineLegalizer class for X86.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "X86LegalizerInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Type.h"
+
+using namespace llvm;
+using namespace TargetOpcode;
+using namespace LegalizeActions;
+
+/// FIXME: The following static functions are SizeChangeStrategy functions
+/// that are meant to temporarily mimic the behaviour of the old legalization
+/// based on doubling/halving non-legal types as closely as possible. This is
+/// not entirely possible, as legalizing only the types that are exactly a
+/// power of 2 times the size of the legal types would require specifying all
+/// those sizes explicitly.
+/// In practice, not specifying those isn't a problem, and the functions below
+/// should disappear quickly as we add support for legalizing more
+/// non-power-of-2 sized types.
+static void
+addAndInterleaveWithUnsupported(LegalizerInfo::SizeAndActionsVec &result,
+ const LegalizerInfo::SizeAndActionsVec &v) {
+ for (unsigned i = 0; i < v.size(); ++i) {
+ result.push_back(v[i]);
+ if (i + 1 < v[i].first && i + 1 < v.size() &&
+ v[i + 1].first != v[i].first + 1)
+ result.push_back({v[i].first + 1, Unsupported});
+ }
+}
+
+static LegalizerInfo::SizeAndActionsVec
+widen_1(const LegalizerInfo::SizeAndActionsVec &v) {
+ assert(v.size() >= 1);
+ assert(v[0].first > 1);
+ LegalizerInfo::SizeAndActionsVec result = {{1, WidenScalar},
+ {2, Unsupported}};
+ addAndInterleaveWithUnsupported(result, v);
+ auto Largest = result.back().first;
+ result.push_back({Largest + 1, Unsupported});
+ return result;
+}
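+
+// Illustrative example (not from the original source): with legal scalar
+// sizes of 8, 16 and 32 bits, i.e. v = {{8, Legal}, {16, Legal}, {32, Legal}},
+// widen_1 returns
+//   {{1, WidenScalar}, {2, Unsupported}, {8, Legal}, {9, Unsupported},
+//    {16, Legal}, {17, Unsupported}, {32, Legal}, {33, Unsupported}},
+// i.e. 1-bit scalars are widened and all other unlisted sizes are rejected.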
+
+X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
+ const X86TargetMachine &TM)
+ : Subtarget(STI), TM(TM) {
+
+ setLegalizerInfo32bit();
+ setLegalizerInfo64bit();
+ setLegalizerInfoSSE1();
+ setLegalizerInfoSSE2();
+ setLegalizerInfoSSE41();
+ setLegalizerInfoAVX();
+ setLegalizerInfoAVX2();
+ setLegalizerInfoAVX512();
+ setLegalizerInfoAVX512DQ();
+ setLegalizerInfoAVX512BW();
+
+ getActionDefinitionsBuilder(G_INTRINSIC_ROUNDEVEN)
+ .scalarize(0)
+ .minScalar(0, LLT::scalar(32))
+ .libcall();
+
+ setLegalizeScalarToDifferentSizeStrategy(G_PHI, 0, widen_1);
+ for (unsigned BinOp : {G_SUB, G_MUL, G_AND, G_OR, G_XOR})
+ setLegalizeScalarToDifferentSizeStrategy(BinOp, 0, widen_1);
+ for (unsigned MemOp : {G_LOAD, G_STORE})
+ setLegalizeScalarToDifferentSizeStrategy(MemOp, 0,
+ narrowToSmallerAndWidenToSmallest);
+ setLegalizeScalarToDifferentSizeStrategy(
+ G_PTR_ADD, 1, widenToLargerTypesUnsupportedOtherwise);
+ setLegalizeScalarToDifferentSizeStrategy(
+ G_CONSTANT, 0, widenToLargerTypesAndNarrowToLargest);
+
+ getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall();
+
+ computeTables();
+ verify(*STI.getInstrInfo());
+}
+
+bool X86LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
+ return true;
+}
+
+void X86LegalizerInfo::setLegalizerInfo32bit() {
+
+ const LLT p0 = LLT::pointer(0, TM.getPointerSizeInBits(0));
+ const LLT s1 = LLT::scalar(1);
+ const LLT s8 = LLT::scalar(8);
+ const LLT s16 = LLT::scalar(16);
+ const LLT s32 = LLT::scalar(32);
+ const LLT s64 = LLT::scalar(64);
+ const LLT s128 = LLT::scalar(128);
+
+ for (auto Ty : {p0, s1, s8, s16, s32})
+ setAction({G_IMPLICIT_DEF, Ty}, Legal);
+
+ for (auto Ty : {s8, s16, s32, p0})
+ setAction({G_PHI, Ty}, Legal);
+
+ for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
+ for (auto Ty : {s8, s16, s32})
+ setAction({BinOp, Ty}, Legal);
+
+ for (unsigned Op : {G_UADDE}) {
+ setAction({Op, s32}, Legal);
+ setAction({Op, 1, s1}, Legal);
+ }
+
+ for (unsigned MemOp : {G_LOAD, G_STORE}) {
+ for (auto Ty : {s8, s16, s32, p0})
+ setAction({MemOp, Ty}, Legal);
+
+ // And everything's fine in addrspace 0.
+ setAction({MemOp, 1, p0}, Legal);
+ }
+
+ // Pointer-handling
+ setAction({G_FRAME_INDEX, p0}, Legal);
+ setAction({G_GLOBAL_VALUE, p0}, Legal);
+
+ setAction({G_PTR_ADD, p0}, Legal);
+ setAction({G_PTR_ADD, 1, s32}, Legal);
+
+ if (!Subtarget.is64Bit()) {
+ getActionDefinitionsBuilder(G_PTRTOINT)
+ .legalForCartesianProduct({s1, s8, s16, s32}, {p0})
+ .maxScalar(0, s32)
+ .widenScalarToNextPow2(0, /*Min*/ 8);
+ getActionDefinitionsBuilder(G_INTTOPTR).legalFor({{p0, s32}});
+
+ // Shifts and SDIV
+ getActionDefinitionsBuilder(
+ {G_SDIV, G_SREM, G_UDIV, G_UREM})
+ .legalFor({s8, s16, s32})
+ .clampScalar(0, s8, s32);
+
+ getActionDefinitionsBuilder(
+ {G_SHL, G_LSHR, G_ASHR})
+ .legalFor({{s8, s8}, {s16, s8}, {s32, s8}})
+ .clampScalar(0, s8, s32)
+ .clampScalar(1, s8, s8);
+
+ // Comparison
+ getActionDefinitionsBuilder(G_ICMP)
+ .legalForCartesianProduct({s8}, {s8, s16, s32, p0})
+ .clampScalar(0, s8, s8);
+ }
+
+ // Control-flow
+ setAction({G_BRCOND, s1}, Legal);
+
+ // Constants
+ for (auto Ty : {s8, s16, s32, p0})
+ setAction({TargetOpcode::G_CONSTANT, Ty}, Legal);
+
+ // Extensions
+ for (auto Ty : {s8, s16, s32}) {
+ setAction({G_ZEXT, Ty}, Legal);
+ setAction({G_SEXT, Ty}, Legal);
+ setAction({G_ANYEXT, Ty}, Legal);
+ }
+ setAction({G_ANYEXT, s128}, Legal);
+ getActionDefinitionsBuilder(G_SEXT_INREG).lower();
+
+ // Merge/Unmerge
+ for (const auto &Ty : {s16, s32, s64}) {
+ setAction({G_MERGE_VALUES, Ty}, Legal);
+ setAction({G_UNMERGE_VALUES, 1, Ty}, Legal);
+ }
+ for (const auto &Ty : {s8, s16, s32}) {
+ setAction({G_MERGE_VALUES, 1, Ty}, Legal);
+ setAction({G_UNMERGE_VALUES, Ty}, Legal);
+ }
+}
+
+void X86LegalizerInfo::setLegalizerInfo64bit() {
+
+ if (!Subtarget.is64Bit())
+ return;
+
+ const LLT p0 = LLT::pointer(0, TM.getPointerSizeInBits(0));
+ const LLT s1 = LLT::scalar(1);
+ const LLT s8 = LLT::scalar(8);
+ const LLT s16 = LLT::scalar(16);
+ const LLT s32 = LLT::scalar(32);
+ const LLT s64 = LLT::scalar(64);
+ const LLT s128 = LLT::scalar(128);
+
+ setAction({G_IMPLICIT_DEF, s64}, Legal);
+ // This is needed because tryFoldImplicitDef will create this pattern:
+ // s128 = EXTEND (G_IMPLICIT_DEF s32/s64) -> s128 = G_IMPLICIT_DEF
+ setAction({G_IMPLICIT_DEF, s128}, Legal);
+
+ setAction({G_PHI, s64}, Legal);
+
+ for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
+ setAction({BinOp, s64}, Legal);
+
+ for (unsigned MemOp : {G_LOAD, G_STORE})
+ setAction({MemOp, s64}, Legal);
+
+ // Pointer-handling
+ setAction({G_PTR_ADD, 1, s64}, Legal);
+ getActionDefinitionsBuilder(G_PTRTOINT)
+ .legalForCartesianProduct({s1, s8, s16, s32, s64}, {p0})
+ .maxScalar(0, s64)
+ .widenScalarToNextPow2(0, /*Min*/ 8);
+ getActionDefinitionsBuilder(G_INTTOPTR).legalFor({{p0, s64}});
+
+ // Constants
+ setAction({TargetOpcode::G_CONSTANT, s64}, Legal);
+
+ // Extensions
+ for (unsigned extOp : {G_ZEXT, G_SEXT, G_ANYEXT}) {
+ setAction({extOp, s64}, Legal);
+ }
+
+ getActionDefinitionsBuilder(G_SITOFP)
+ .legalForCartesianProduct({s32, s64})
+ .clampScalar(1, s32, s64)
+ .widenScalarToNextPow2(1)
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(0);
+
+ getActionDefinitionsBuilder(G_FPTOSI)
+ .legalForCartesianProduct({s32, s64})
+ .clampScalar(1, s32, s64)
+ .widenScalarToNextPow2(0)
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(1);
+
+ // Comparison
+ getActionDefinitionsBuilder(G_ICMP)
+ .legalForCartesianProduct({s8}, {s8, s16, s32, s64, p0})
+ .clampScalar(0, s8, s8);
+
+ getActionDefinitionsBuilder(G_FCMP)
+ .legalForCartesianProduct({s8}, {s32, s64})
+ .clampScalar(0, s8, s8)
+ .clampScalar(1, s32, s64)
+ .widenScalarToNextPow2(1);
+
+ // Divisions
+ getActionDefinitionsBuilder(
+ {G_SDIV, G_SREM, G_UDIV, G_UREM})
+ .legalFor({s8, s16, s32, s64})
+ .clampScalar(0, s8, s64);
+
+ // Shifts
+ getActionDefinitionsBuilder(
+ {G_SHL, G_LSHR, G_ASHR})
+ .legalFor({{s8, s8}, {s16, s8}, {s32, s8}, {s64, s8}})
+ .clampScalar(0, s8, s64)
+ .clampScalar(1, s8, s8);
+
+ // Merge/Unmerge
+ setAction({G_MERGE_VALUES, s128}, Legal);
+ setAction({G_UNMERGE_VALUES, 1, s128}, Legal);
+ setAction({G_MERGE_VALUES, 1, s128}, Legal);
+ setAction({G_UNMERGE_VALUES, s128}, Legal);
+}
+
+void X86LegalizerInfo::setLegalizerInfoSSE1() {
+ if (!Subtarget.hasSSE1())
+ return;
+
+ const LLT s32 = LLT::scalar(32);
+ const LLT s64 = LLT::scalar(64);
+ const LLT v4s32 = LLT::vector(4, 32);
+ const LLT v2s64 = LLT::vector(2, 64);
+
+ for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV})
+ for (auto Ty : {s32, v4s32})
+ setAction({BinOp, Ty}, Legal);
+
+ for (unsigned MemOp : {G_LOAD, G_STORE})
+ for (auto Ty : {v4s32, v2s64})
+ setAction({MemOp, Ty}, Legal);
+
+ // Constants
+ setAction({TargetOpcode::G_FCONSTANT, s32}, Legal);
+
+ // Merge/Unmerge
+ for (const auto &Ty : {v4s32, v2s64}) {
+ setAction({G_CONCAT_VECTORS, Ty}, Legal);
+ setAction({G_UNMERGE_VALUES, 1, Ty}, Legal);
+ }
+ setAction({G_MERGE_VALUES, 1, s64}, Legal);
+ setAction({G_UNMERGE_VALUES, s64}, Legal);
+}
+
+void X86LegalizerInfo::setLegalizerInfoSSE2() {
+ if (!Subtarget.hasSSE2())
+ return;
+
+ const LLT s32 = LLT::scalar(32);
+ const LLT s64 = LLT::scalar(64);
+ const LLT v16s8 = LLT::vector(16, 8);
+ const LLT v8s16 = LLT::vector(8, 16);
+ const LLT v4s32 = LLT::vector(4, 32);
+ const LLT v2s64 = LLT::vector(2, 64);
+
+ const LLT v32s8 = LLT::vector(32, 8);
+ const LLT v16s16 = LLT::vector(16, 16);
+ const LLT v8s32 = LLT::vector(8, 32);
+ const LLT v4s64 = LLT::vector(4, 64);
+
+ for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV})
+ for (auto Ty : {s64, v2s64})
+ setAction({BinOp, Ty}, Legal);
+
+ for (unsigned BinOp : {G_ADD, G_SUB})
+ for (auto Ty : {v16s8, v8s16, v4s32, v2s64})
+ setAction({BinOp, Ty}, Legal);
+
+ setAction({G_MUL, v8s16}, Legal);
+
+ setAction({G_FPEXT, s64}, Legal);
+ setAction({G_FPEXT, 1, s32}, Legal);
+
+ setAction({G_FPTRUNC, s32}, Legal);
+ setAction({G_FPTRUNC, 1, s64}, Legal);
+
+ // Constants
+ setAction({TargetOpcode::G_FCONSTANT, s64}, Legal);
+
+ // Merge/Unmerge
+ for (const auto &Ty :
+ {v16s8, v32s8, v8s16, v16s16, v4s32, v8s32, v2s64, v4s64}) {
+ setAction({G_CONCAT_VECTORS, Ty}, Legal);
+ setAction({G_UNMERGE_VALUES, 1, Ty}, Legal);
+ }
+ for (const auto &Ty : {v16s8, v8s16, v4s32, v2s64}) {
+ setAction({G_CONCAT_VECTORS, 1, Ty}, Legal);
+ setAction({G_UNMERGE_VALUES, Ty}, Legal);
+ }
+}
+
+void X86LegalizerInfo::setLegalizerInfoSSE41() {
+ if (!Subtarget.hasSSE41())
+ return;
+
+ const LLT v4s32 = LLT::vector(4, 32);
+
+ setAction({G_MUL, v4s32}, Legal);
+}
+
+void X86LegalizerInfo::setLegalizerInfoAVX() {
+ if (!Subtarget.hasAVX())
+ return;
+
+ const LLT v16s8 = LLT::vector(16, 8);
+ const LLT v8s16 = LLT::vector(8, 16);
+ const LLT v4s32 = LLT::vector(4, 32);
+ const LLT v2s64 = LLT::vector(2, 64);
+
+ const LLT v32s8 = LLT::vector(32, 8);
+ const LLT v64s8 = LLT::vector(64, 8);
+ const LLT v16s16 = LLT::vector(16, 16);
+ const LLT v32s16 = LLT::vector(32, 16);
+ const LLT v8s32 = LLT::vector(8, 32);
+ const LLT v16s32 = LLT::vector(16, 32);
+ const LLT v4s64 = LLT::vector(4, 64);
+ const LLT v8s64 = LLT::vector(8, 64);
+
+ for (unsigned MemOp : {G_LOAD, G_STORE})
+ for (auto Ty : {v8s32, v4s64})
+ setAction({MemOp, Ty}, Legal);
+
+ for (auto Ty : {v32s8, v16s16, v8s32, v4s64}) {
+ setAction({G_INSERT, Ty}, Legal);
+ setAction({G_EXTRACT, 1, Ty}, Legal);
+ }
+ for (auto Ty : {v16s8, v8s16, v4s32, v2s64}) {
+ setAction({G_INSERT, 1, Ty}, Legal);
+ setAction({G_EXTRACT, Ty}, Legal);
+ }
+ // Merge/Unmerge
+ for (const auto &Ty :
+ {v32s8, v64s8, v16s16, v32s16, v8s32, v16s32, v4s64, v8s64}) {
+ setAction({G_CONCAT_VECTORS, Ty}, Legal);
+ setAction({G_UNMERGE_VALUES, 1, Ty}, Legal);
+ }
+ for (const auto &Ty :
+ {v16s8, v32s8, v8s16, v16s16, v4s32, v8s32, v2s64, v4s64}) {
+ setAction({G_CONCAT_VECTORS, 1, Ty}, Legal);
+ setAction({G_UNMERGE_VALUES, Ty}, Legal);
+ }
+}
+
+void X86LegalizerInfo::setLegalizerInfoAVX2() {
+ if (!Subtarget.hasAVX2())
+ return;
+
+ const LLT v32s8 = LLT::vector(32, 8);
+ const LLT v16s16 = LLT::vector(16, 16);
+ const LLT v8s32 = LLT::vector(8, 32);
+ const LLT v4s64 = LLT::vector(4, 64);
+
+ const LLT v64s8 = LLT::vector(64, 8);
+ const LLT v32s16 = LLT::vector(32, 16);
+ const LLT v16s32 = LLT::vector(16, 32);
+ const LLT v8s64 = LLT::vector(8, 64);
+
+ for (unsigned BinOp : {G_ADD, G_SUB})
+ for (auto Ty : {v32s8, v16s16, v8s32, v4s64})
+ setAction({BinOp, Ty}, Legal);
+
+ for (auto Ty : {v16s16, v8s32})
+ setAction({G_MUL, Ty}, Legal);
+
+ // Merge/Unmerge
+ for (const auto &Ty : {v64s8, v32s16, v16s32, v8s64}) {
+ setAction({G_CONCAT_VECTORS, Ty}, Legal);
+ setAction({G_UNMERGE_VALUES, 1, Ty}, Legal);
+ }
+ for (const auto &Ty : {v32s8, v16s16, v8s32, v4s64}) {
+ setAction({G_CONCAT_VECTORS, 1, Ty}, Legal);
+ setAction({G_UNMERGE_VALUES, Ty}, Legal);
+ }
+}
+
+void X86LegalizerInfo::setLegalizerInfoAVX512() {
+ if (!Subtarget.hasAVX512())
+ return;
+
+ const LLT v16s8 = LLT::vector(16, 8);
+ const LLT v8s16 = LLT::vector(8, 16);
+ const LLT v4s32 = LLT::vector(4, 32);
+ const LLT v2s64 = LLT::vector(2, 64);
+
+ const LLT v32s8 = LLT::vector(32, 8);
+ const LLT v16s16 = LLT::vector(16, 16);
+ const LLT v8s32 = LLT::vector(8, 32);
+ const LLT v4s64 = LLT::vector(4, 64);
+
+ const LLT v64s8 = LLT::vector(64, 8);
+ const LLT v32s16 = LLT::vector(32, 16);
+ const LLT v16s32 = LLT::vector(16, 32);
+ const LLT v8s64 = LLT::vector(8, 64);
+
+ for (unsigned BinOp : {G_ADD, G_SUB})
+ for (auto Ty : {v16s32, v8s64})
+ setAction({BinOp, Ty}, Legal);
+
+ setAction({G_MUL, v16s32}, Legal);
+
+ for (unsigned MemOp : {G_LOAD, G_STORE})
+ for (auto Ty : {v16s32, v8s64})
+ setAction({MemOp, Ty}, Legal);
+
+ for (auto Ty : {v64s8, v32s16, v16s32, v8s64}) {
+ setAction({G_INSERT, Ty}, Legal);
+ setAction({G_EXTRACT, 1, Ty}, Legal);
+ }
+ for (auto Ty : {v32s8, v16s16, v8s32, v4s64, v16s8, v8s16, v4s32, v2s64}) {
+ setAction({G_INSERT, 1, Ty}, Legal);
+ setAction({G_EXTRACT, Ty}, Legal);
+ }
+
+ /************ VLX *******************/
+ if (!Subtarget.hasVLX())
+ return;
+
+ for (auto Ty : {v4s32, v8s32})
+ setAction({G_MUL, Ty}, Legal);
+}
+
+void X86LegalizerInfo::setLegalizerInfoAVX512DQ() {
+ if (!(Subtarget.hasAVX512() && Subtarget.hasDQI()))
+ return;
+
+ const LLT v8s64 = LLT::vector(8, 64);
+
+ setAction({G_MUL, v8s64}, Legal);
+
+ /************ VLX *******************/
+ if (!Subtarget.hasVLX())
+ return;
+
+ const LLT v2s64 = LLT::vector(2, 64);
+ const LLT v4s64 = LLT::vector(4, 64);
+
+ for (auto Ty : {v2s64, v4s64})
+ setAction({G_MUL, Ty}, Legal);
+}
+
+void X86LegalizerInfo::setLegalizerInfoAVX512BW() {
+ if (!(Subtarget.hasAVX512() && Subtarget.hasBWI()))
+ return;
+
+ const LLT v64s8 = LLT::vector(64, 8);
+ const LLT v32s16 = LLT::vector(32, 16);
+
+ for (unsigned BinOp : {G_ADD, G_SUB})
+ for (auto Ty : {v64s8, v32s16})
+ setAction({BinOp, Ty}, Legal);
+
+ setAction({G_MUL, v32s16}, Legal);
+
+ /************ VLX *******************/
+ if (!Subtarget.hasVLX())
+ return;
+
+ const LLT v8s16 = LLT::vector(8, 16);
+ const LLT v16s16 = LLT::vector(16, 16);
+
+ for (auto Ty : {v8s16, v16s16})
+ setAction({G_MUL, Ty}, Legal);
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.h
new file mode 100644
index 000000000000..72d25096d72b
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.h
@@ -0,0 +1,51 @@
+//===- X86LegalizerInfo.h ----------------------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the MachineLegalizer class for X86.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86MACHINELEGALIZER_H
+#define LLVM_LIB_TARGET_X86_X86MACHINELEGALIZER_H
+
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+
+namespace llvm {
+
+class X86Subtarget;
+class X86TargetMachine;
+
+/// This class provides the legalization rules for the X86 target.
+class X86LegalizerInfo : public LegalizerInfo {
+private:
+ /// Keep a reference to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget &Subtarget;
+ const X86TargetMachine &TM;
+
+public:
+ X86LegalizerInfo(const X86Subtarget &STI, const X86TargetMachine &TM);
+
+ bool legalizeIntrinsic(LegalizerHelper &Helper,
+ MachineInstr &MI) const override;
+
+private:
+ void setLegalizerInfo32bit();
+ void setLegalizerInfo64bit();
+ void setLegalizerInfoSSE1();
+ void setLegalizerInfoSSE2();
+ void setLegalizerInfoSSE41();
+ void setLegalizerInfoAVX();
+ void setLegalizerInfoAVX2();
+ void setLegalizerInfoAVX512();
+ void setLegalizerInfoAVX512DQ();
+ void setLegalizerInfoAVX512BW();
+};
+} // namespace llvm
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
new file mode 100644
index 000000000000..810fee052b5a
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
@@ -0,0 +1,816 @@
+//==-- X86LoadValueInjectionLoadHardening.cpp - LVI load hardening for x86 --=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Description: This pass finds Load Value Injection (LVI) gadgets consisting
+/// of a load from memory (i.e., SOURCE), and any operation that may transmit
+/// the value loaded from memory over a covert channel, or use the value loaded
+/// from memory to determine a branch/call target (i.e., SINK). After finding
+/// all such gadgets in a given function, the pass minimally inserts LFENCE
+/// instructions in such a manner that the following property is satisfied: for
+/// all SOURCE+SINK pairs, all paths in the CFG from SOURCE to SINK contain at
+/// least one LFENCE instruction. The algorithm that implements this minimal
+/// insertion is influenced by an academic paper that minimally inserts memory
+/// fences for high-performance concurrent programs:
+/// http://www.cs.ucr.edu/~lesani/companion/oopsla15/OOPSLA15.pdf
+/// The algorithm implemented in this pass is as follows:
+/// 1. Build a condensed CFG (i.e., a GadgetGraph) consisting only of the
+/// following components:
+/// - SOURCE instructions (also includes function arguments)
+/// - SINK instructions
+/// - Basic block entry points
+/// - Basic block terminators
+/// - LFENCE instructions
+/// 2. Analyze the GadgetGraph to determine which SOURCE+SINK pairs (i.e.,
+/// gadgets) are already mitigated by existing LFENCEs. If all gadgets have been
+/// mitigated, go to step 6.
+/// 3. Use a heuristic or plugin to approximate minimal LFENCE insertion.
+/// 4. Insert one LFENCE along each CFG edge that was cut in step 3.
+/// 5. Go to step 2.
+/// 6. If any LFENCEs were inserted, return `true` from runOnMachineFunction()
+/// to tell LLVM that the function was modified.
+///
+//===----------------------------------------------------------------------===//
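+//
+// Illustrative gadget (a hedged sketch, not taken from this file):
+//   movq (%rdi), %rax      # SOURCE: value loaded from memory
+//   jmpq *%rax             # SINK: the loaded value picks the branch target
+// The pass guarantees that every CFG path from the SOURCE to the SINK contains
+// an LFENCE, e.g. by inserting one between the two instructions:
+//   movq (%rdi), %rax
+//   lfence
+//   jmpq *%rax
+//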
+
+#include "ImmutableGraph.h"
+#include "X86.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RDFGraph.h"
+#include "llvm/CodeGen/RDFLiveness.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/DOTGraphTraits.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define PASS_KEY "x86-lvi-load"
+#define DEBUG_TYPE PASS_KEY
+
+STATISTIC(NumFences, "Number of LFENCEs inserted for LVI mitigation");
+STATISTIC(NumFunctionsConsidered, "Number of functions analyzed");
+STATISTIC(NumFunctionsMitigated, "Number of functions for which mitigations "
+ "were deployed");
+STATISTIC(NumGadgets, "Number of LVI gadgets detected during analysis");
+
+static cl::opt<std::string> OptimizePluginPath(
+ PASS_KEY "-opt-plugin",
+ cl::desc("Specify a plugin to optimize LFENCE insertion"), cl::Hidden);
+
+static cl::opt<bool> NoConditionalBranches(
+ PASS_KEY "-no-cbranch",
+ cl::desc("Don't treat conditional branches as disclosure gadgets. This "
+ "may improve performance, at the cost of security."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> EmitDot(
+ PASS_KEY "-dot",
+ cl::desc(
+ "For each function, emit a dot graph depicting potential LVI gadgets"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> EmitDotOnly(
+ PASS_KEY "-dot-only",
+ cl::desc("For each function, emit a dot graph depicting potential LVI "
+ "gadgets, and do not insert any fences"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> EmitDotVerify(
+ PASS_KEY "-dot-verify",
+ cl::desc("For each function, emit a dot graph to stdout depicting "
+ "potential LVI gadgets, used for testing purposes only"),
+ cl::init(false), cl::Hidden);
+
+static llvm::sys::DynamicLibrary OptimizeDL;
+typedef int (*OptimizeCutT)(unsigned int *Nodes, unsigned int NodesSize,
+ unsigned int *Edges, int *EdgeValues,
+ int *CutEdges /* out */, unsigned int EdgesSize);
+static OptimizeCutT OptimizeCut = nullptr;
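+
+// Note on the plugin interface (derived from hardenLoadsWithPlugin below):
+// the graph is handed to the plugin as a CSR-style adjacency list. Nodes[i]
+// holds the index of node i's first edge and Nodes[NodesSize] terminates the
+// array with EdgesSize; Edges[j] holds the destination node of edge j and
+// EdgeValues[j] its weight (gadget edges use the value -1). The plugin reports
+// its chosen cut by setting CutEdges[j] to a nonzero value.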
+
+namespace {
+
+struct MachineGadgetGraph : ImmutableGraph<MachineInstr *, int> {
+ static constexpr int GadgetEdgeSentinel = -1;
+ static constexpr MachineInstr *const ArgNodeSentinel = nullptr;
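+ // Edge values double as weights: CFG edges carry the loop depth of the
+ // basic block they belong to (see getGadgetGraph), while gadget edges carry
+ // the GadgetEdgeSentinel value above.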
+
+ using GraphT = ImmutableGraph<MachineInstr *, int>;
+ using Node = typename GraphT::Node;
+ using Edge = typename GraphT::Edge;
+ using size_type = typename GraphT::size_type;
+ MachineGadgetGraph(std::unique_ptr<Node[]> Nodes,
+ std::unique_ptr<Edge[]> Edges, size_type NodesSize,
+ size_type EdgesSize, int NumFences = 0, int NumGadgets = 0)
+ : GraphT(std::move(Nodes), std::move(Edges), NodesSize, EdgesSize),
+ NumFences(NumFences), NumGadgets(NumGadgets) {}
+ static inline bool isCFGEdge(const Edge &E) {
+ return E.getValue() != GadgetEdgeSentinel;
+ }
+ static inline bool isGadgetEdge(const Edge &E) {
+ return E.getValue() == GadgetEdgeSentinel;
+ }
+ int NumFences;
+ int NumGadgets;
+};
+
+class X86LoadValueInjectionLoadHardeningPass : public MachineFunctionPass {
+public:
+ X86LoadValueInjectionLoadHardeningPass() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override {
+ return "X86 Load Value Injection (LVI) Load Hardening";
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ static char ID;
+
+private:
+ using GraphBuilder = ImmutableGraphBuilder<MachineGadgetGraph>;
+ using Edge = MachineGadgetGraph::Edge;
+ using Node = MachineGadgetGraph::Node;
+ using EdgeSet = MachineGadgetGraph::EdgeSet;
+ using NodeSet = MachineGadgetGraph::NodeSet;
+
+ const X86Subtarget *STI;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+
+ std::unique_ptr<MachineGadgetGraph>
+ getGadgetGraph(MachineFunction &MF, const MachineLoopInfo &MLI,
+ const MachineDominatorTree &MDT,
+ const MachineDominanceFrontier &MDF) const;
+ int hardenLoadsWithPlugin(MachineFunction &MF,
+ std::unique_ptr<MachineGadgetGraph> Graph) const;
+ int hardenLoadsWithHeuristic(MachineFunction &MF,
+ std::unique_ptr<MachineGadgetGraph> Graph) const;
+ int elimMitigatedEdgesAndNodes(MachineGadgetGraph &G,
+ EdgeSet &ElimEdges /* in, out */,
+ NodeSet &ElimNodes /* in, out */) const;
+ std::unique_ptr<MachineGadgetGraph>
+ trimMitigatedEdges(std::unique_ptr<MachineGadgetGraph> Graph) const;
+ int insertFences(MachineFunction &MF, MachineGadgetGraph &G,
+ EdgeSet &CutEdges /* in, out */) const;
+ bool instrUsesRegToAccessMemory(const MachineInstr &I, unsigned Reg) const;
+ bool instrUsesRegToBranch(const MachineInstr &I, unsigned Reg) const;
+ inline bool isFence(const MachineInstr *MI) const {
+ return MI && (MI->getOpcode() == X86::LFENCE ||
+ (STI->useLVIControlFlowIntegrity() && MI->isCall()));
+ }
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+template <>
+struct GraphTraits<MachineGadgetGraph *>
+ : GraphTraits<ImmutableGraph<MachineInstr *, int> *> {};
+
+template <>
+struct DOTGraphTraits<MachineGadgetGraph *> : DefaultDOTGraphTraits {
+ using GraphType = MachineGadgetGraph;
+ using Traits = llvm::GraphTraits<GraphType *>;
+ using NodeRef = typename Traits::NodeRef;
+ using EdgeRef = typename Traits::EdgeRef;
+ using ChildIteratorType = typename Traits::ChildIteratorType;
+ using ChildEdgeIteratorType = typename Traits::ChildEdgeIteratorType;
+
+ DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
+
+ std::string getNodeLabel(NodeRef Node, GraphType *) {
+ if (Node->getValue() == MachineGadgetGraph::ArgNodeSentinel)
+ return "ARGS";
+
+ std::string Str;
+ raw_string_ostream OS(Str);
+ OS << *Node->getValue();
+ return OS.str();
+ }
+
+ static std::string getNodeAttributes(NodeRef Node, GraphType *) {
+ MachineInstr *MI = Node->getValue();
+ if (MI == MachineGadgetGraph::ArgNodeSentinel)
+ return "color = blue";
+ if (MI->getOpcode() == X86::LFENCE)
+ return "color = green";
+ return "";
+ }
+
+ static std::string getEdgeAttributes(NodeRef, ChildIteratorType E,
+ GraphType *) {
+ int EdgeVal = (*E.getCurrent()).getValue();
+ return EdgeVal >= 0 ? "label = " + std::to_string(EdgeVal)
+ : "color = red, style = \"dashed\"";
+ }
+};
+
+} // end namespace llvm
+
+constexpr MachineInstr *MachineGadgetGraph::ArgNodeSentinel;
+constexpr int MachineGadgetGraph::GadgetEdgeSentinel;
+
+char X86LoadValueInjectionLoadHardeningPass::ID = 0;
+
+void X86LoadValueInjectionLoadHardeningPass::getAnalysisUsage(
+ AnalysisUsage &AU) const {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominanceFrontier>();
+ AU.setPreservesCFG();
+}
+
+static void writeGadgetGraph(raw_ostream &OS, MachineFunction &MF,
+ MachineGadgetGraph *G) {
+ WriteGraph(OS, G, /*ShortNames*/ false,
+ "Speculative gadgets for \"" + MF.getName() + "\" function");
+}
+
+bool X86LoadValueInjectionLoadHardeningPass::runOnMachineFunction(
+ MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "***** " << getPassName() << " : " << MF.getName()
+ << " *****\n");
+ STI = &MF.getSubtarget<X86Subtarget>();
+ if (!STI->useLVILoadHardening())
+ return false;
+
+ // FIXME: support 32-bit
+ if (!STI->is64Bit())
+ report_fatal_error("LVI load hardening is only supported on 64-bit", false);
+
+ // Don't skip functions with the "optnone" attribute, but still participate in opt-bisect.
+ const Function &F = MF.getFunction();
+ if (!F.hasOptNone() && skipFunction(F))
+ return false;
+
+ ++NumFunctionsConsidered;
+ TII = STI->getInstrInfo();
+ TRI = STI->getRegisterInfo();
+ LLVM_DEBUG(dbgs() << "Building gadget graph...\n");
+ const auto &MLI = getAnalysis<MachineLoopInfo>();
+ const auto &MDT = getAnalysis<MachineDominatorTree>();
+ const auto &MDF = getAnalysis<MachineDominanceFrontier>();
+ std::unique_ptr<MachineGadgetGraph> Graph = getGadgetGraph(MF, MLI, MDT, MDF);
+ LLVM_DEBUG(dbgs() << "Building gadget graph... Done\n");
+ if (Graph == nullptr)
+ return false; // didn't find any gadgets
+
+ if (EmitDotVerify) {
+ writeGadgetGraph(outs(), MF, Graph.get());
+ return false;
+ }
+
+ if (EmitDot || EmitDotOnly) {
+ LLVM_DEBUG(dbgs() << "Emitting gadget graph...\n");
+ std::error_code FileError;
+ std::string FileName = "lvi.";
+ FileName += MF.getName();
+ FileName += ".dot";
+ raw_fd_ostream FileOut(FileName, FileError);
+ if (FileError)
+ errs() << FileError.message();
+ writeGadgetGraph(FileOut, MF, Graph.get());
+ FileOut.close();
+ LLVM_DEBUG(dbgs() << "Emitting gadget graph... Done\n");
+ if (EmitDotOnly)
+ return false;
+ }
+
+ int FencesInserted;
+ if (!OptimizePluginPath.empty()) {
+ if (!OptimizeDL.isValid()) {
+ std::string ErrorMsg;
+ OptimizeDL = llvm::sys::DynamicLibrary::getPermanentLibrary(
+ OptimizePluginPath.c_str(), &ErrorMsg);
+ if (!ErrorMsg.empty())
+ report_fatal_error("Failed to load opt plugin: \"" + ErrorMsg + '\"');
+ OptimizeCut = (OptimizeCutT)OptimizeDL.getAddressOfSymbol("optimize_cut");
+ if (!OptimizeCut)
+ report_fatal_error("Invalid optimization plugin");
+ }
+ FencesInserted = hardenLoadsWithPlugin(MF, std::move(Graph));
+ } else { // Use the default greedy heuristic
+ FencesInserted = hardenLoadsWithHeuristic(MF, std::move(Graph));
+ }
+
+ if (FencesInserted > 0)
+ ++NumFunctionsMitigated;
+ NumFences += FencesInserted;
+ return (FencesInserted > 0);
+}
+
+std::unique_ptr<MachineGadgetGraph>
+X86LoadValueInjectionLoadHardeningPass::getGadgetGraph(
+ MachineFunction &MF, const MachineLoopInfo &MLI,
+ const MachineDominatorTree &MDT,
+ const MachineDominanceFrontier &MDF) const {
+ using namespace rdf;
+
+ // Build the Register Dataflow Graph using the RDF framework
+ TargetOperandInfo TOI{*TII};
+ DataFlowGraph DFG{MF, *TII, *TRI, MDT, MDF, TOI};
+ DFG.build();
+ Liveness L{MF.getRegInfo(), DFG};
+ L.computePhiInfo();
+
+ GraphBuilder Builder;
+ using GraphIter = typename GraphBuilder::BuilderNodeRef;
+ DenseMap<MachineInstr *, GraphIter> NodeMap;
+ int FenceCount = 0, GadgetCount = 0;
+ auto MaybeAddNode = [&NodeMap, &Builder](MachineInstr *MI) {
+ auto Ref = NodeMap.find(MI);
+ if (Ref == NodeMap.end()) {
+ auto I = Builder.addVertex(MI);
+ NodeMap[MI] = I;
+ return std::pair<GraphIter, bool>{I, true};
+ }
+ return std::pair<GraphIter, bool>{Ref->getSecond(), false};
+ };
+
+ // The `Transmitters` map memoizes transmitters found for each def. If a def
+ // has not yet been analyzed, then it will not appear in the map. If a def
+ // has been analyzed and was determined not to have any transmitters, then
+ // its list of transmitters will be empty.
+ DenseMap<NodeId, std::vector<NodeId>> Transmitters;
+
+ // Analyze all machine instructions to find gadgets and LFENCEs, adding
+ // each interesting value to `Nodes`
+ auto AnalyzeDef = [&](NodeAddr<DefNode *> SourceDef) {
+ SmallSet<NodeId, 8> UsesVisited, DefsVisited;
+ std::function<void(NodeAddr<DefNode *>)> AnalyzeDefUseChain =
+ [&](NodeAddr<DefNode *> Def) {
+ if (Transmitters.find(Def.Id) != Transmitters.end())
+ return; // Already analyzed `Def`
+
+ // Use RDF to find all the uses of `Def`
+ rdf::NodeSet Uses;
+ RegisterRef DefReg = Def.Addr->getRegRef(DFG);
+ for (auto UseID : L.getAllReachedUses(DefReg, Def)) {
+ auto Use = DFG.addr<UseNode *>(UseID);
+ if (Use.Addr->getFlags() & NodeAttrs::PhiRef) { // phi node
+ NodeAddr<PhiNode *> Phi = Use.Addr->getOwner(DFG);
+ for (auto I : L.getRealUses(Phi.Id)) {
+ if (DFG.getPRI().alias(RegisterRef(I.first), DefReg)) {
+ for (auto UA : I.second)
+ Uses.emplace(UA.first);
+ }
+ }
+ } else { // not a phi node
+ Uses.emplace(UseID);
+ }
+ }
+
+ // For each use of `Def`, we want to know whether:
+ // (1) The use can leak the Def'ed value,
+ // (2) The use can further propagate the Def'ed value to more defs
+ for (auto UseID : Uses) {
+ if (!UsesVisited.insert(UseID).second)
+ continue; // Already visited this use of `Def`
+
+ auto Use = DFG.addr<UseNode *>(UseID);
+ assert(!(Use.Addr->getFlags() & NodeAttrs::PhiRef));
+ MachineOperand &UseMO = Use.Addr->getOp();
+ MachineInstr &UseMI = *UseMO.getParent();
+ assert(UseMO.isReg());
+
+ // We naively assume that an instruction propagates any loaded
+ // uses to all defs unless the instruction is a call, in which
+ // case all arguments will be treated as gadget sources during
+ // analysis of the callee function.
+ if (UseMI.isCall())
+ continue;
+
+ // Check whether this use can transmit (leak) its value.
+ if (instrUsesRegToAccessMemory(UseMI, UseMO.getReg()) ||
+ (!NoConditionalBranches &&
+ instrUsesRegToBranch(UseMI, UseMO.getReg()))) {
+ Transmitters[Def.Id].push_back(Use.Addr->getOwner(DFG).Id);
+ if (UseMI.mayLoad())
+ continue; // Found a transmitting load -- no need to continue
+ // traversing its defs (i.e., this load will become
+ // a new gadget source anyways).
+ }
+
+ // Check whether the use propagates to more defs.
+ NodeAddr<InstrNode *> Owner{Use.Addr->getOwner(DFG)};
+ rdf::NodeList AnalyzedChildDefs;
+ for (auto &ChildDef :
+ Owner.Addr->members_if(DataFlowGraph::IsDef, DFG)) {
+ if (!DefsVisited.insert(ChildDef.Id).second)
+ continue; // Already visited this def
+ if (Def.Addr->getAttrs() & NodeAttrs::Dead)
+ continue;
+ if (Def.Id == ChildDef.Id)
+ continue; // `Def` uses itself (e.g., increment loop counter)
+
+ AnalyzeDefUseChain(ChildDef);
+
+ // `Def` inherits all of its child defs' transmitters.
+ for (auto TransmitterId : Transmitters[ChildDef.Id])
+ Transmitters[Def.Id].push_back(TransmitterId);
+ }
+ }
+
+ // Note that this statement adds `Def.Id` to the map if no
+ // transmitters were found for `Def`.
+ auto &DefTransmitters = Transmitters[Def.Id];
+
+ // Remove duplicate transmitters
+ llvm::sort(DefTransmitters);
+ DefTransmitters.erase(
+ std::unique(DefTransmitters.begin(), DefTransmitters.end()),
+ DefTransmitters.end());
+ };
+
+ // Find all of the transmitters
+ AnalyzeDefUseChain(SourceDef);
+ auto &SourceDefTransmitters = Transmitters[SourceDef.Id];
+ if (SourceDefTransmitters.empty())
+ return; // No transmitters for `SourceDef`
+
+ MachineInstr *Source = SourceDef.Addr->getFlags() & NodeAttrs::PhiRef
+ ? MachineGadgetGraph::ArgNodeSentinel
+ : SourceDef.Addr->getOp().getParent();
+ auto GadgetSource = MaybeAddNode(Source);
+ // Each transmitter is a sink for `SourceDef`.
+ for (auto TransmitterId : SourceDefTransmitters) {
+ MachineInstr *Sink = DFG.addr<StmtNode *>(TransmitterId).Addr->getCode();
+ auto GadgetSink = MaybeAddNode(Sink);
+ // Add the gadget edge to the graph.
+ Builder.addEdge(MachineGadgetGraph::GadgetEdgeSentinel,
+ GadgetSource.first, GadgetSink.first);
+ ++GadgetCount;
+ }
+ };
+
+ LLVM_DEBUG(dbgs() << "Analyzing def-use chains to find gadgets\n");
+ // Analyze function arguments
+ NodeAddr<BlockNode *> EntryBlock = DFG.getFunc().Addr->getEntryBlock(DFG);
+ for (NodeAddr<PhiNode *> ArgPhi :
+ EntryBlock.Addr->members_if(DataFlowGraph::IsPhi, DFG)) {
+ NodeList Defs = ArgPhi.Addr->members_if(DataFlowGraph::IsDef, DFG);
+ llvm::for_each(Defs, AnalyzeDef);
+ }
+ // Analyze every instruction in MF
+ for (NodeAddr<BlockNode *> BA : DFG.getFunc().Addr->members(DFG)) {
+ for (NodeAddr<StmtNode *> SA :
+ BA.Addr->members_if(DataFlowGraph::IsCode<NodeAttrs::Stmt>, DFG)) {
+ MachineInstr *MI = SA.Addr->getCode();
+ if (isFence(MI)) {
+ MaybeAddNode(MI);
+ ++FenceCount;
+ } else if (MI->mayLoad()) {
+ NodeList Defs = SA.Addr->members_if(DataFlowGraph::IsDef, DFG);
+ llvm::for_each(Defs, AnalyzeDef);
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << "Found " << FenceCount << " fences\n");
+ LLVM_DEBUG(dbgs() << "Found " << GadgetCount << " gadgets\n");
+ if (GadgetCount == 0)
+ return nullptr;
+ NumGadgets += GadgetCount;
+
+ // Traverse CFG to build the rest of the graph
+ SmallSet<MachineBasicBlock *, 8> BlocksVisited;
+ std::function<void(MachineBasicBlock *, GraphIter, unsigned)> TraverseCFG =
+ [&](MachineBasicBlock *MBB, GraphIter GI, unsigned ParentDepth) {
+ unsigned LoopDepth = MLI.getLoopDepth(MBB);
+ if (!MBB->empty()) {
+ // Always add the first instruction in each block
+ auto NI = MBB->begin();
+ auto BeginBB = MaybeAddNode(&*NI);
+ Builder.addEdge(ParentDepth, GI, BeginBB.first);
+ if (!BlocksVisited.insert(MBB).second)
+ return;
+
+ // Add any instructions within the block that are gadget components
+ GI = BeginBB.first;
+ while (++NI != MBB->end()) {
+ auto Ref = NodeMap.find(&*NI);
+ if (Ref != NodeMap.end()) {
+ Builder.addEdge(LoopDepth, GI, Ref->getSecond());
+ GI = Ref->getSecond();
+ }
+ }
+
+ // Always add the terminator instruction, if one exists
+ auto T = MBB->getFirstTerminator();
+ if (T != MBB->end()) {
+ auto EndBB = MaybeAddNode(&*T);
+ if (EndBB.second)
+ Builder.addEdge(LoopDepth, GI, EndBB.first);
+ GI = EndBB.first;
+ }
+ }
+ for (MachineBasicBlock *Succ : MBB->successors())
+ TraverseCFG(Succ, GI, LoopDepth);
+ };
+ // ArgNodeSentinel is a pseudo-instruction that represents MF args in the
+ // GadgetGraph
+ GraphIter ArgNode = MaybeAddNode(MachineGadgetGraph::ArgNodeSentinel).first;
+ TraverseCFG(&MF.front(), ArgNode, 0);
+ std::unique_ptr<MachineGadgetGraph> G{Builder.get(FenceCount, GadgetCount)};
+ LLVM_DEBUG(dbgs() << "Found " << G->nodes_size() << " nodes\n");
+ return G;
+}
+
+// Returns the number of remaining gadget edges that could not be eliminated
+int X86LoadValueInjectionLoadHardeningPass::elimMitigatedEdgesAndNodes(
+ MachineGadgetGraph &G, EdgeSet &ElimEdges /* in, out */,
+ NodeSet &ElimNodes /* in, out */) const {
+ if (G.NumFences > 0) {
+ // Eliminate fences and CFG edges that ingress and egress the fence, as
+ // they are trivially mitigated.
+ for (const Edge &E : G.edges()) {
+ const Node *Dest = E.getDest();
+ if (isFence(Dest->getValue())) {
+ ElimNodes.insert(*Dest);
+ ElimEdges.insert(E);
+ for (const Edge &DE : Dest->edges())
+ ElimEdges.insert(DE);
+ }
+ }
+ }
+
+ // Find and eliminate gadget edges that have been mitigated.
+ int MitigatedGadgets = 0, RemainingGadgets = 0;
+ NodeSet ReachableNodes{G};
+ for (const Node &RootN : G.nodes()) {
+ if (llvm::none_of(RootN.edges(), MachineGadgetGraph::isGadgetEdge))
+ continue; // skip this node if it isn't a gadget source
+
+ // Find all of the nodes that are CFG-reachable from RootN using DFS
+ ReachableNodes.clear();
+ std::function<void(const Node *, bool)> FindReachableNodes =
+ [&](const Node *N, bool FirstNode) {
+ if (!FirstNode)
+ ReachableNodes.insert(*N);
+ for (const Edge &E : N->edges()) {
+ const Node *Dest = E.getDest();
+ if (MachineGadgetGraph::isCFGEdge(E) && !ElimEdges.contains(E) &&
+ !ReachableNodes.contains(*Dest))
+ FindReachableNodes(Dest, false);
+ }
+ };
+ FindReachableNodes(&RootN, true);
+
+ // Any gadget whose sink is unreachable has been mitigated
+ for (const Edge &E : RootN.edges()) {
+ if (MachineGadgetGraph::isGadgetEdge(E)) {
+ if (ReachableNodes.contains(*E.getDest())) {
+ // This gadget's sink is reachable
+ ++RemainingGadgets;
+ } else { // This gadget's sink is unreachable, and therefore mitigated
+ ++MitigatedGadgets;
+ ElimEdges.insert(E);
+ }
+ }
+ }
+ }
+ return RemainingGadgets;
+}
+
+std::unique_ptr<MachineGadgetGraph>
+X86LoadValueInjectionLoadHardeningPass::trimMitigatedEdges(
+ std::unique_ptr<MachineGadgetGraph> Graph) const {
+ NodeSet ElimNodes{*Graph};
+ EdgeSet ElimEdges{*Graph};
+ int RemainingGadgets =
+ elimMitigatedEdgesAndNodes(*Graph, ElimEdges, ElimNodes);
+ if (ElimEdges.empty() && ElimNodes.empty()) {
+ Graph->NumFences = 0;
+ Graph->NumGadgets = RemainingGadgets;
+ } else {
+ Graph = GraphBuilder::trim(*Graph, ElimNodes, ElimEdges, 0 /* NumFences */,
+ RemainingGadgets);
+ }
+ return Graph;
+}
+
+int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithPlugin(
+ MachineFunction &MF, std::unique_ptr<MachineGadgetGraph> Graph) const {
+ int FencesInserted = 0;
+
+ do {
+ LLVM_DEBUG(dbgs() << "Eliminating mitigated paths...\n");
+ Graph = trimMitigatedEdges(std::move(Graph));
+ LLVM_DEBUG(dbgs() << "Eliminating mitigated paths... Done\n");
+ if (Graph->NumGadgets == 0)
+ break;
+
+ LLVM_DEBUG(dbgs() << "Cutting edges...\n");
+ EdgeSet CutEdges{*Graph};
+ auto Nodes = std::make_unique<unsigned int[]>(Graph->nodes_size() +
+ 1 /* terminator node */);
+ auto Edges = std::make_unique<unsigned int[]>(Graph->edges_size());
+ auto EdgeCuts = std::make_unique<int[]>(Graph->edges_size());
+ auto EdgeValues = std::make_unique<int[]>(Graph->edges_size());
+ for (const Node &N : Graph->nodes()) {
+ Nodes[Graph->getNodeIndex(N)] = Graph->getEdgeIndex(*N.edges_begin());
+ }
+ Nodes[Graph->nodes_size()] = Graph->edges_size(); // terminator node
+ for (const Edge &E : Graph->edges()) {
+ Edges[Graph->getEdgeIndex(E)] = Graph->getNodeIndex(*E.getDest());
+ EdgeValues[Graph->getEdgeIndex(E)] = E.getValue();
+ }
+ OptimizeCut(Nodes.get(), Graph->nodes_size(), Edges.get(), EdgeValues.get(),
+ EdgeCuts.get(), Graph->edges_size());
+ for (int I = 0; I < Graph->edges_size(); ++I)
+ if (EdgeCuts[I])
+ CutEdges.set(I);
+ LLVM_DEBUG(dbgs() << "Cutting edges... Done\n");
+ LLVM_DEBUG(dbgs() << "Cut " << CutEdges.count() << " edges\n");
+
+ LLVM_DEBUG(dbgs() << "Inserting LFENCEs...\n");
+ FencesInserted += insertFences(MF, *Graph, CutEdges);
+ LLVM_DEBUG(dbgs() << "Inserting LFENCEs... Done\n");
+ LLVM_DEBUG(dbgs() << "Inserted " << FencesInserted << " fences\n");
+
+ Graph = GraphBuilder::trim(*Graph, NodeSet{*Graph}, CutEdges);
+ } while (true);
+
+ return FencesInserted;
+}
+
+int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithHeuristic(
+ MachineFunction &MF, std::unique_ptr<MachineGadgetGraph> Graph) const {
+ // If `MF` does not have any fences, then no gadgets would have been
+ // mitigated at this point.
+ if (Graph->NumFences > 0) {
+ LLVM_DEBUG(dbgs() << "Eliminating mitigated paths...\n");
+ Graph = trimMitigatedEdges(std::move(Graph));
+ LLVM_DEBUG(dbgs() << "Eliminating mitigated paths... Done\n");
+ }
+
+ if (Graph->NumGadgets == 0)
+ return 0;
+
+ LLVM_DEBUG(dbgs() << "Cutting edges...\n");
+ EdgeSet CutEdges{*Graph};
+
+ // Begin by collecting all ingress CFG edges for each node
+ DenseMap<const Node *, SmallVector<const Edge *, 2>> IngressEdgeMap;
+ for (const Edge &E : Graph->edges())
+ if (MachineGadgetGraph::isCFGEdge(E))
+ IngressEdgeMap[E.getDest()].push_back(&E);
+
+ // For each gadget edge, make cuts that guarantee the gadget will be
+ // mitigated. A computationally efficient way to achieve this is to either:
+ // (a) cut all egress CFG edges from the gadget source, or
+ // (b) cut all ingress CFG edges to the gadget sink.
+ //
+ // Moreover, the algorithm tries not to make a cut into a loop by preferring
+ // to make a (b)-type cut if the gadget source resides at a greater loop depth
+ // than the gadget sink, or an (a)-type cut otherwise.
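+ //
+ // For instance (illustrative): if the gadget source sits inside a loop and
+ // the sink lies outside of it, cutting the sink's ingress edges (b) keeps
+ // the eventual LFENCE out of the loop body, whereas an (a)-type cut would
+ // fence every iteration.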
+ for (const Node &N : Graph->nodes()) {
+ for (const Edge &E : N.edges()) {
+ if (!MachineGadgetGraph::isGadgetEdge(E))
+ continue;
+
+ SmallVector<const Edge *, 2> EgressEdges;
+ SmallVector<const Edge *, 2> &IngressEdges = IngressEdgeMap[E.getDest()];
+ for (const Edge &EgressEdge : N.edges())
+ if (MachineGadgetGraph::isCFGEdge(EgressEdge))
+ EgressEdges.push_back(&EgressEdge);
+
+ int EgressCutCost = 0, IngressCutCost = 0;
+ for (const Edge *EgressEdge : EgressEdges)
+ if (!CutEdges.contains(*EgressEdge))
+ EgressCutCost += EgressEdge->getValue();
+ for (const Edge *IngressEdge : IngressEdges)
+ if (!CutEdges.contains(*IngressEdge))
+ IngressCutCost += IngressEdge->getValue();
+
+ auto &EdgesToCut =
+ IngressCutCost < EgressCutCost ? IngressEdges : EgressEdges;
+ for (const Edge *E : EdgesToCut)
+ CutEdges.insert(*E);
+ }
+ }
+ LLVM_DEBUG(dbgs() << "Cutting edges... Done\n");
+ LLVM_DEBUG(dbgs() << "Cut " << CutEdges.count() << " edges\n");
+
+ LLVM_DEBUG(dbgs() << "Inserting LFENCEs...\n");
+ int FencesInserted = insertFences(MF, *Graph, CutEdges);
+ LLVM_DEBUG(dbgs() << "Inserting LFENCEs... Done\n");
+ LLVM_DEBUG(dbgs() << "Inserted " << FencesInserted << " fences\n");
+
+ return FencesInserted;
+}
+
+int X86LoadValueInjectionLoadHardeningPass::insertFences(
+ MachineFunction &MF, MachineGadgetGraph &G,
+ EdgeSet &CutEdges /* in, out */) const {
+ int FencesInserted = 0;
+ for (const Node &N : G.nodes()) {
+ for (const Edge &E : N.edges()) {
+ if (CutEdges.contains(E)) {
+ MachineInstr *MI = N.getValue(), *Prev;
+ MachineBasicBlock *MBB; // Insert an LFENCE in this MBB
+ MachineBasicBlock::iterator InsertionPt; // ...at this point
+ if (MI == MachineGadgetGraph::ArgNodeSentinel) {
+ // insert LFENCE at beginning of entry block
+ MBB = &MF.front();
+ InsertionPt = MBB->begin();
+ Prev = nullptr;
+ } else if (MI->isBranch()) { // insert the LFENCE before the branch
+ MBB = MI->getParent();
+ InsertionPt = MI;
+ Prev = MI->getPrevNode();
+ // Remove all egress CFG edges from this branch because the inserted
+ // LFENCE prevents gadgets from crossing the branch.
+ for (const Edge &E : N.edges()) {
+ if (MachineGadgetGraph::isCFGEdge(E))
+ CutEdges.insert(E);
+ }
+ } else { // insert the LFENCE after the instruction
+ MBB = MI->getParent();
+ InsertionPt = MI->getNextNode() ? MI->getNextNode() : MBB->end();
+ Prev = InsertionPt == MBB->end()
+ ? (MBB->empty() ? nullptr : &MBB->back())
+ : InsertionPt->getPrevNode();
+ }
+ // Ensure this insertion is not redundant (two LFENCEs in sequence).
+ if ((InsertionPt == MBB->end() || !isFence(&*InsertionPt)) &&
+ (!Prev || !isFence(Prev))) {
+ BuildMI(*MBB, InsertionPt, DebugLoc(), TII->get(X86::LFENCE));
+ ++FencesInserted;
+ }
+ }
+ }
+ }
+ return FencesInserted;
+}
+
+bool X86LoadValueInjectionLoadHardeningPass::instrUsesRegToAccessMemory(
+ const MachineInstr &MI, unsigned Reg) const {
+ if (!MI.mayLoadOrStore() || MI.getOpcode() == X86::MFENCE ||
+ MI.getOpcode() == X86::SFENCE || MI.getOpcode() == X86::LFENCE)
+ return false;
+
+ // FIXME: This does not handle pseudo loading instructions like TCRETURN*.
+ const MCInstrDesc &Desc = MI.getDesc();
+ int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
+ if (MemRefBeginIdx < 0) {
+ LLVM_DEBUG(dbgs() << "Warning: unable to obtain memory operand for loading "
+ "instruction:\n";
+ MI.print(dbgs()); dbgs() << '\n';);
+ return false;
+ }
+ MemRefBeginIdx += X86II::getOperandBias(Desc);
+
+ const MachineOperand &BaseMO =
+ MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
+ const MachineOperand &IndexMO =
+ MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
+ return (BaseMO.isReg() && BaseMO.getReg() != X86::NoRegister &&
+ TRI->regsOverlap(BaseMO.getReg(), Reg)) ||
+ (IndexMO.isReg() && IndexMO.getReg() != X86::NoRegister &&
+ TRI->regsOverlap(IndexMO.getReg(), Reg));
+}
+
+bool X86LoadValueInjectionLoadHardeningPass::instrUsesRegToBranch(
+ const MachineInstr &MI, unsigned Reg) const {
+ if (!MI.isConditionalBranch())
+ return false;
+ for (const MachineOperand &Use : MI.uses())
+ if (Use.isReg() && Use.getReg() == Reg)
+ return true;
+ return false;
+}
+
+INITIALIZE_PASS_BEGIN(X86LoadValueInjectionLoadHardeningPass, PASS_KEY,
+ "X86 LVI load hardening", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
+INITIALIZE_PASS_END(X86LoadValueInjectionLoadHardeningPass, PASS_KEY,
+ "X86 LVI load hardening", false, false)
+
+FunctionPass *llvm::createX86LoadValueInjectionLoadHardeningPass() {
+ return new X86LoadValueInjectionLoadHardeningPass();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp
new file mode 100644
index 000000000000..7b6276c1d87e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp
@@ -0,0 +1,120 @@
+//===-- X86LoadValueInjectionRetHardening.cpp - LVI RET hardening for x86 --==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Description: Replaces every `ret` instruction with the sequence:
+/// ```
+/// pop <scratch-reg>
+/// lfence
+/// jmp *<scratch-reg>
+/// ```
+/// where `<scratch-reg>` is some available scratch register, according to the
+/// calling convention of the function being mitigated.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include <bitset>
+
+using namespace llvm;
+
+#define PASS_KEY "x86-lvi-ret"
+#define DEBUG_TYPE PASS_KEY
+
+STATISTIC(NumFences, "Number of LFENCEs inserted for LVI mitigation");
+STATISTIC(NumFunctionsConsidered, "Number of functions analyzed");
+STATISTIC(NumFunctionsMitigated, "Number of functions for which mitigations "
+ "were deployed");
+
+namespace {
+
+class X86LoadValueInjectionRetHardeningPass : public MachineFunctionPass {
+public:
+ X86LoadValueInjectionRetHardeningPass() : MachineFunctionPass(ID) {}
+ StringRef getPassName() const override {
+ return "X86 Load Value Injection (LVI) Ret-Hardening";
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ static char ID;
+};
+
+} // end anonymous namespace
+
+char X86LoadValueInjectionRetHardeningPass::ID = 0;
+
+bool X86LoadValueInjectionRetHardeningPass::runOnMachineFunction(
+ MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "***** " << getPassName() << " : " << MF.getName()
+ << " *****\n");
+ const X86Subtarget *Subtarget = &MF.getSubtarget<X86Subtarget>();
+ if (!Subtarget->useLVIControlFlowIntegrity() || !Subtarget->is64Bit())
+ return false; // FIXME: support 32-bit
+
+ // Don't skip functions with the "optnone" attribute, but do participate in
+ // opt-bisect.
+ const Function &F = MF.getFunction();
+ if (!F.hasOptNone() && skipFunction(F))
+ return false;
+
+ ++NumFunctionsConsidered;
+ const X86RegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const X86InstrInfo *TII = Subtarget->getInstrInfo();
+
+ bool Modified = false;
+ for (auto &MBB : MF) {
+ for (auto MBBI = MBB.begin(); MBBI != MBB.end(); ++MBBI) {
+ if (MBBI->getOpcode() != X86::RETQ)
+ continue;
+
+ unsigned ClobberReg = TRI->findDeadCallerSavedReg(MBB, MBBI);
+ if (ClobberReg != X86::NoRegister) {
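+ // A dead caller-saved register is available: pop the return address into
+ // it, fence to cut off transient execution, then jump through the register.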
+ BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::POP64r))
+ .addReg(ClobberReg, RegState::Define)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::LFENCE));
+ BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::JMP64r))
+ .addReg(ClobberReg);
+ MBB.erase(MBBI);
+ } else {
+ // In case there is no available scratch register, we can still read from
+ // RSP to assert that RSP points to a valid page. The write to RSP also
+ // verifies that the stack's write permissions are intact.
+ MachineInstr *Fence =
+ BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::LFENCE));
+ addRegOffset(BuildMI(MBB, Fence, DebugLoc(), TII->get(X86::SHL64mi)),
+ X86::RSP, false, 0)
+ .addImm(0)
+ ->addRegisterDead(X86::EFLAGS, TRI);
+ }
+
+ ++NumFences;
+ Modified = true;
+ break;
+ }
+ }
+
+ if (Modified)
+ ++NumFunctionsMitigated;
+ return Modified;
+}
+
+INITIALIZE_PASS(X86LoadValueInjectionRetHardeningPass, PASS_KEY,
+ "X86 LVI ret hardener", false, false)
+
+FunctionPass *llvm::createX86LoadValueInjectionRetHardeningPass() {
+ return new X86LoadValueInjectionRetHardeningPass();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86LowerAMXType.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86LowerAMXType.cpp
new file mode 100644
index 000000000000..85166decd8cd
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -0,0 +1,351 @@
+//===- X86LowerAMXType.cpp - Lower AMX type for load/store ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Pass to transform <256 x i32> load/store
+/// <256 x i32> is bitcast to x86_amx on X86, and the AMX instruction set only
+/// provides simple operations on x86_amx. Basic elementwise operations are not
+/// supported by AMX. Since x86_amx is bitcast from vector <256 x i32> and only
+/// AMX intrinsics can operate on the type, we need to transform load/store
+/// <256 x i32> instructions into AMX load/store intrinsics. If the bitcast
+/// cannot be combined with its load/store, we transform the bitcast into an
+/// AMX load/store paired with a <256 x i32> store/load.
+//
+//===----------------------------------------------------------------------===//
+//
+#include "X86.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "lower-amx-type"
+
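+// Create a <256 x i32> alloca in the function's entry block, aligned to the
+// preferred alignment of the x86_amx type, used to spill tile data to memory.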
+static AllocaInst *CreateAllocaInst(IRBuilder<> &Builder, BasicBlock *BB) {
+ Function &F = *BB->getParent();
+ Module *M = BB->getModule();
+ const DataLayout &DL = M->getDataLayout();
+
+ Type *V256I32Ty = VectorType::get(Builder.getInt32Ty(), 256, false);
+ LLVMContext &Ctx = Builder.getContext();
+ auto AllocaAlignment = DL.getPrefTypeAlign(Type::getX86_AMXTy(Ctx));
+ unsigned AllocaAS = DL.getAllocaAddrSpace();
+ AllocaInst *AllocaRes =
+ new AllocaInst(V256I32Ty, AllocaAS, "", &F.getEntryBlock().front());
+ AllocaRes->setAlignment(AllocaAlignment);
+ return AllocaRes;
+}
+
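+// Return the (row, col) shape operands describing the tile used at operand
+// OpNo of the given AMX intrinsic.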
+static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
+ Value *Row = nullptr, *Col = nullptr;
+ switch (II->getIntrinsicID()) {
+ default:
+ llvm_unreachable("Expect amx intrinsics");
+ case Intrinsic::x86_tileloadd64_internal:
+ case Intrinsic::x86_tilestored64_internal: {
+ Row = II->getArgOperand(0);
+ Col = II->getArgOperand(1);
+ break;
+ }
+ // a * b + c
+ // The shape depends on which operand of the intrinsic is being queried.
+ case Intrinsic::x86_tdpbssd_internal: {
+ switch (OpNo) {
+ case 3:
+ Row = II->getArgOperand(0);
+ Col = II->getArgOperand(1);
+ break;
+ case 4:
+ Row = II->getArgOperand(0);
+ Col = II->getArgOperand(2);
+ break;
+ case 5:
+ Row = II->getArgOperand(2);
+ Col = II->getArgOperand(1);
+ break;
+ }
+ break;
+ }
+ }
+
+ return std::make_pair(Row, Col);
+}
+
+// %src = load <256 x i32>, <256 x i32>* %addr, align 64
+// %2 = bitcast <256 x i32> %src to x86_amx
+// -->
+// %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
+// i8* %addr, i64 %stride64)
+static void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) {
+ Value *Row = nullptr, *Col = nullptr;
+ Use &U = *(Bitcast->use_begin());
+ unsigned OpNo = U.getOperandNo();
+ auto *II = cast<IntrinsicInst>(U.getUser());
+ std::tie(Row, Col) = getShape(II, OpNo);
+ IRBuilder<> Builder(Bitcast);
+ // Use the maximum column as the stride.
+ Value *Stride = Builder.getInt64(64);
+ Value *I8Ptr =
+ Builder.CreateBitCast(LD->getOperand(0), Builder.getInt8PtrTy());
+ std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};
+
+ Value *NewInst =
+ Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, None, Args);
+ Bitcast->replaceAllUsesWith(NewInst);
+}
+
+// %src = call x86_amx @llvm.x86.tileloadd64.internal(%row, %col, %addr,
+// %stride);
+// %13 = bitcast x86_amx %src to <256 x i32>
+// store <256 x i32> %13, <256 x i32>* %addr, align 64
+// -->
+// call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
+// %stride64, %13)
+static void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST) {
+
+ Value *Tile = Bitcast->getOperand(0);
+ auto *II = cast<IntrinsicInst>(Tile);
+ // Tile is output from AMX intrinsic. The first operand of the
+ // intrinsic is row, the second operand of the intrinsic is column.
+ Value *Row = II->getOperand(0);
+ Value *Col = II->getOperand(1);
+ IRBuilder<> Builder(ST);
+ // Use the maximum column as the stride. It must match the load stride.
+ Value *Stride = Builder.getInt64(64);
+ Value *I8Ptr =
+ Builder.CreateBitCast(ST->getOperand(1), Builder.getInt8PtrTy());
+ std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Tile};
+ Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
+ if (Bitcast->hasOneUse())
+ return;
+ // %13 = bitcast x86_amx %src to <256 x i32>
+ // store <256 x i32> %13, <256 x i32>* %addr, align 64
+ // %add = <256 x i32> %13, <256 x i32> %src2
+ // -->
+ // %13 = bitcast x86_amx %src to <256 x i32>
+ // call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
+ // %stride64, %13)
+ // %14 = load <256 x i32>, %addr
+ // %add = <256 x i32> %14, <256 x i32> %src2
+ Value *Vec = Builder.CreateLoad(Bitcast->getType(), ST->getOperand(1));
+ Bitcast->replaceAllUsesWith(Vec);
+}
+
+// Transform a bitcast that cannot be folded into a load/store by spilling
+// through a <256 x i32> alloca: store + tile load for vector -> x86_amx, or
+// tile store + load for x86_amx -> vector.
+static bool transformBitcast(BitCastInst *Bitcast) {
+ IRBuilder<> Builder(Bitcast);
+ AllocaInst *AllocaAddr;
+ Value *I8Ptr, *Stride;
+ auto *Src = Bitcast->getOperand(0);
+
+ auto Prepare = [&]() {
+ AllocaAddr = CreateAllocaInst(Builder, Bitcast->getParent());
+ I8Ptr = Builder.CreateBitCast(AllocaAddr, Builder.getInt8PtrTy());
+ Stride = Builder.getInt64(64);
+ };
+
+ if (Bitcast->getType()->isX86_AMXTy()) {
+ // %2 = bitcast <256 x i32> %src to x86_amx
+ // -->
+ // %addr = alloca <256 x i32>, align 64
+ // store <256 x i32> %src, <256 x i32>* %addr, align 64
+ // %addr2 = bitcast <256 x i32>* to i8*
+ // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
+ // i8* %addr2,
+ // i64 64)
+ Use &U = *(Bitcast->use_begin());
+ unsigned OpNo = U.getOperandNo();
+ auto *II = dyn_cast<IntrinsicInst>(U.getUser());
+ if (!II)
+ return false; // May be a bitcast from x86_amx to <256 x i32>.
+ Prepare();
+ Builder.CreateStore(Src, AllocaAddr);
+ // TODO: we can pick a constant operand for the shape.
+ Value *Row = nullptr, *Col = nullptr;
+ std::tie(Row, Col) = getShape(II, OpNo);
+ std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};
+ Value *NewInst = Builder.CreateIntrinsic(
+ Intrinsic::x86_tileloadd64_internal, None, Args);
+ Bitcast->replaceAllUsesWith(NewInst);
+ } else {
+ // %2 = bitcast x86_amx %src to <256 x i32>
+ // -->
+ // %addr = alloca <256 x i32>, align 64
+ // %addr2 = bitcast <256 x i32>* to i8*
+ // call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col,
+ // i8* %addr2, i64 %stride)
+ // %2 = load <256 x i32>, <256 x i32>* %addr, align 64
+ auto *II = dyn_cast<IntrinsicInst>(Src);
+ if (!II)
+ return false; // May be a bitcast from <256 x i32> to x86_amx.
+ Prepare();
+ Value *Row = II->getOperand(0);
+ Value *Col = II->getOperand(1);
+ std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Src};
+ Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
+ Value *NewInst = Builder.CreateLoad(Bitcast->getType(), AllocaAddr);
+ Bitcast->replaceAllUsesWith(NewInst);
+ }
+
+ return true;
+}
+
+namespace {
+class X86LowerAMXType {
+ Function &Func;
+
+public:
+ X86LowerAMXType(Function &F) : Func(F) {}
+ bool visit();
+};
+
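+// Walk each basic block in post order and rewrite bitcasts between
+// <256 x i32> and x86_amx into AMX tile load/store intrinsics, queueing the
+// replaced instructions for deletion.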
+bool X86LowerAMXType::visit() {
+ SmallVector<Instruction *, 8> DeadInsts;
+
+ for (BasicBlock *BB : post_order(&Func)) {
+ for (BasicBlock::reverse_iterator II = BB->rbegin(), IE = BB->rend();
+ II != IE;) {
+ Instruction &Inst = *II++;
+ auto *Bitcast = dyn_cast<BitCastInst>(&Inst);
+ if (!Bitcast)
+ continue;
+
+ Value *Src = Bitcast->getOperand(0);
+ if (Bitcast->getType()->isX86_AMXTy()) {
+ if (Bitcast->user_empty()) {
+ DeadInsts.push_back(Bitcast);
+ continue;
+ }
+ LoadInst *LD = dyn_cast<LoadInst>(Src);
+ if (!LD) {
+ if (transformBitcast(Bitcast))
+ DeadInsts.push_back(Bitcast);
+ continue;
+ }
+ // If the load has multiple users, the vector load is kept alongside the new
+ // AMX tile load (i.e. the memory is loaded twice).
+ // %src = load <256 x i32>, <256 x i32>* %addr, align 64
+ // %2 = bitcast <256 x i32> %src to x86_amx
+ // %add = add <256 x i32> %src, <256 x i32> %src2
+ // -->
+ // %src = load <256 x i32>, <256 x i32>* %addr, align 64
+ // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
+ // i8* %addr, i64 %stride64)
+ // %add = add <256 x i32> %src, <256 x i32> %src2
+
+ // If the load has a single user, the load will be eliminated in DAG ISel.
+ // %src = load <256 x i32>, <256 x i32>* %addr, align 64
+ // %2 = bitcast <256 x i32> %src to x86_amx
+ // -->
+ // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
+ // i8* %addr, i64 %stride64)
+ combineLoadBitcast(LD, Bitcast);
+ DeadInsts.push_back(Bitcast);
+ if (LD->hasOneUse())
+ DeadInsts.push_back(LD);
+ } else if (Src->getType()->isX86_AMXTy()) {
+ if (Bitcast->user_empty()) {
+ DeadInsts.push_back(Bitcast);
+ continue;
+ }
+ StoreInst *ST = nullptr;
+ for (auto UI = Bitcast->use_begin(), UE = Bitcast->use_end();
+ UI != UE;) {
+ Value *I = (UI++)->getUser();
+ ST = dyn_cast<StoreInst>(I);
+ if (ST)
+ break;
+ }
+ if (!ST) {
+ if (transformBitcast(Bitcast))
+ DeadInsts.push_back(Bitcast);
+ continue;
+ }
+ // If the bitcast (%13) has one use, combine the bitcast and store into an
+ // AMX store.
+ // %src = call x86_amx @llvm.x86.tileloadd64.internal(%row, %col, %addr,
+ // %stride);
+ // %13 = bitcast x86_amx %src to <256 x i32>
+ // store <256 x i32> %13, <256 x i32>* %addr, align 64
+ // -->
+ // call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
+ // %stride64, %13)
+ //
+ // If the bitcast (%13) has multiple uses, transform it as below.
+ // %13 = bitcast x86_amx %src to <256 x i32>
+ // store <256 x i32> %13, <256 x i32>* %addr, align 64
+ // %add = <256 x i32> %13, <256 x i32> %src2
+ // -->
+ // %13 = bitcast x86_amx %src to <256 x i32>
+ // call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
+ // %stride64, %13)
+ // %14 = load <256 x i32>, %addr
+ // %add = <256 x i32> %14, <256 x i32> %src2
+ //
+ combineBitcastStore(Bitcast, ST);
+ // Delete user first.
+ DeadInsts.push_back(ST);
+ DeadInsts.push_back(Bitcast);
+ }
+ }
+ }
+
+ bool C = !DeadInsts.empty();
+
+ for (auto *Inst : DeadInsts)
+ Inst->eraseFromParent();
+
+ return C;
+}
+} // anonymous namespace
+
+namespace {
+
+class X86LowerAMXTypeLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ X86LowerAMXTypeLegacyPass() : FunctionPass(ID) {
+ initializeX86LowerAMXTypeLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ X86LowerAMXType LAT(F);
+ bool C = LAT.visit();
+ return C;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+};
+
+} // anonymous namespace
+
+static const char PassName[] = "Lower AMX type for load/store";
+char X86LowerAMXTypeLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
+ false)
+INITIALIZE_PASS_END(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
+ false)
+
+FunctionPass *llvm::createX86LowerAMXTypePass() {
+ return new X86LowerAMXTypeLegacyPass();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp
new file mode 100644
index 000000000000..89fa3ae3a3f4
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -0,0 +1,2631 @@
+//===-- X86MCInstLower.cpp - Convert X86 MachineInstr to an MCInst --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower X86 MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86ATTInstPrinter.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86InstComments.h"
+#include "MCTargetDesc/X86ShuffleDecode.h"
+#include "MCTargetDesc/X86TargetStreamer.h"
+#include "X86AsmPrinter.h"
+#include "X86RegisterInfo.h"
+#include "X86ShuffleDecodeConstantPool.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+namespace {
+
+/// X86MCInstLower - This class is used to lower a MachineInstr into an MCInst.
+class X86MCInstLower {
+ MCContext &Ctx;
+ const MachineFunction &MF;
+ const TargetMachine &TM;
+ const MCAsmInfo &MAI;
+ X86AsmPrinter &AsmPrinter;
+
+public:
+ X86MCInstLower(const MachineFunction &MF, X86AsmPrinter &asmprinter);
+
+ Optional<MCOperand> LowerMachineOperand(const MachineInstr *MI,
+ const MachineOperand &MO) const;
+ void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+ MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const;
+ MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+private:
+ MachineModuleInfoMachO &getMachOMMI() const;
+};
+
+} // end anonymous namespace
+
+/// A RAII helper which defines a region of instructions which can't have
+/// padding added between them for correctness.
+struct NoAutoPaddingScope {
+ MCStreamer &OS;
+ const bool OldAllowAutoPadding;
+ NoAutoPaddingScope(MCStreamer &OS)
+ : OS(OS), OldAllowAutoPadding(OS.getAllowAutoPadding()) {
+ changeAndComment(false);
+ }
+ ~NoAutoPaddingScope() { changeAndComment(OldAllowAutoPadding); }
+ void changeAndComment(bool b) {
+ if (b == OS.getAllowAutoPadding())
+ return;
+ OS.setAllowAutoPadding(b);
+ if (b)
+ OS.emitRawComment("autopadding");
+ else
+ OS.emitRawComment("noautopadding");
+ }
+};
+
+// Emit a minimal sequence of nops spanning NumBytes bytes.
+static void emitX86Nops(MCStreamer &OS, unsigned NumBytes,
+ const X86Subtarget *Subtarget);
+
+void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst,
+ const MCSubtargetInfo &STI,
+ MCCodeEmitter *CodeEmitter) {
+ if (InShadow) {
+ SmallString<256> Code;
+ SmallVector<MCFixup, 4> Fixups;
+ raw_svector_ostream VecOS(Code);
+ CodeEmitter->encodeInstruction(Inst, VecOS, Fixups, STI);
+ CurrentShadowSize += Code.size();
+ if (CurrentShadowSize >= RequiredShadowSize)
+ InShadow = false; // The shadow is big enough. Stop counting.
+ }
+}
+
+void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding(
+ MCStreamer &OutStreamer, const MCSubtargetInfo &STI) {
+ if (InShadow && CurrentShadowSize < RequiredShadowSize) {
+ InShadow = false;
+ emitX86Nops(OutStreamer, RequiredShadowSize - CurrentShadowSize,
+ &MF->getSubtarget<X86Subtarget>());
+ }
+}
+
+void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) {
+ OutStreamer->emitInstruction(Inst, getSubtargetInfo());
+ SMShadowTracker.count(Inst, getSubtargetInfo(), CodeEmitter.get());
+}
+
+X86MCInstLower::X86MCInstLower(const MachineFunction &mf,
+ X86AsmPrinter &asmprinter)
+ : Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()), MAI(*TM.getMCAsmInfo()),
+ AsmPrinter(asmprinter) {}
+
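+// Return the Mach-O machine module info, used to manage Darwin
+// $non_lazy_ptr stub entries.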
+MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
+ return MF.getMMI().getObjFileInfo<MachineModuleInfoMachO>();
+}
+
+/// GetSymbolFromOperand - Lower an MO_GlobalAddress or MO_ExternalSymbol
+/// operand to an MCSymbol.
+MCSymbol *X86MCInstLower::GetSymbolFromOperand(const MachineOperand &MO) const {
+ const Triple &TT = TM.getTargetTriple();
+ if (MO.isGlobal() && TT.isOSBinFormatELF())
+ return AsmPrinter.getSymbolPreferLocal(*MO.getGlobal());
+
+ const DataLayout &DL = MF.getDataLayout();
+ assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) &&
+ "Isn't a symbol reference");
+
+ MCSymbol *Sym = nullptr;
+ SmallString<128> Name;
+ StringRef Suffix;
+
+ switch (MO.getTargetFlags()) {
+ case X86II::MO_DLLIMPORT:
+ // Handle dllimport linkage.
+ Name += "__imp_";
+ break;
+ case X86II::MO_COFFSTUB:
+ Name += ".refptr.";
+ break;
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+ Suffix = "$non_lazy_ptr";
+ break;
+ }
+
+ if (!Suffix.empty())
+ Name += DL.getPrivateGlobalPrefix();
+
+ if (MO.isGlobal()) {
+ const GlobalValue *GV = MO.getGlobal();
+ AsmPrinter.getNameWithPrefix(Name, GV);
+ } else if (MO.isSymbol()) {
+ Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL);
+ } else if (MO.isMBB()) {
+ assert(Suffix.empty());
+ Sym = MO.getMBB()->getSymbol();
+ }
+
+ Name += Suffix;
+ if (!Sym)
+ Sym = Ctx.getOrCreateSymbol(Name);
+
+ // If the target flags on the operand change the name of the symbol, do that
+ // before we return the symbol.
+ switch (MO.getTargetFlags()) {
+ default:
+ break;
+ case X86II::MO_COFFSTUB: {
+ MachineModuleInfoCOFF &MMICOFF =
+ MF.getMMI().getObjFileInfo<MachineModuleInfoCOFF>();
+ MachineModuleInfoImpl::StubValueTy &StubSym = MMICOFF.getGVStubEntry(Sym);
+ if (!StubSym.getPointer()) {
+ assert(MO.isGlobal() && "Extern symbol not handled yet");
+ StubSym = MachineModuleInfoImpl::StubValueTy(
+ AsmPrinter.getSymbol(MO.getGlobal()), true);
+ }
+ break;
+ }
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE: {
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ getMachOMMI().getGVStubEntry(Sym);
+ if (!StubSym.getPointer()) {
+ assert(MO.isGlobal() && "Extern symbol not handled yet");
+ StubSym = MachineModuleInfoImpl::StubValueTy(
+ AsmPrinter.getSymbol(MO.getGlobal()),
+ !MO.getGlobal()->hasInternalLinkage());
+ }
+ break;
+ }
+ }
+
+ return Sym;
+}
+
+MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+ MCSymbol *Sym) const {
+ // FIXME: We would like an efficient form for this, so we don't have to do a
+ // lot of extra uniquing.
+ const MCExpr *Expr = nullptr;
+ MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
+
+ switch (MO.getTargetFlags()) {
+ default:
+ llvm_unreachable("Unknown target flag on GV operand");
+ case X86II::MO_NO_FLAG: // No flag.
+ // These affect the name of the symbol, not any suffix.
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DLLIMPORT:
+ case X86II::MO_COFFSTUB:
+ break;
+
+ case X86II::MO_TLVP:
+ RefKind = MCSymbolRefExpr::VK_TLVP;
+ break;
+ case X86II::MO_TLVP_PIC_BASE:
+ Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
+ // Subtract the pic base.
+ Expr = MCBinaryExpr::createSub(
+ Expr, MCSymbolRefExpr::create(MF.getPICBaseSymbol(), Ctx), Ctx);
+ break;
+ case X86II::MO_SECREL:
+ RefKind = MCSymbolRefExpr::VK_SECREL;
+ break;
+ case X86II::MO_TLSGD:
+ RefKind = MCSymbolRefExpr::VK_TLSGD;
+ break;
+ case X86II::MO_TLSLD:
+ RefKind = MCSymbolRefExpr::VK_TLSLD;
+ break;
+ case X86II::MO_TLSLDM:
+ RefKind = MCSymbolRefExpr::VK_TLSLDM;
+ break;
+ case X86II::MO_GOTTPOFF:
+ RefKind = MCSymbolRefExpr::VK_GOTTPOFF;
+ break;
+ case X86II::MO_INDNTPOFF:
+ RefKind = MCSymbolRefExpr::VK_INDNTPOFF;
+ break;
+ case X86II::MO_TPOFF:
+ RefKind = MCSymbolRefExpr::VK_TPOFF;
+ break;
+ case X86II::MO_DTPOFF:
+ RefKind = MCSymbolRefExpr::VK_DTPOFF;
+ break;
+ case X86II::MO_NTPOFF:
+ RefKind = MCSymbolRefExpr::VK_NTPOFF;
+ break;
+ case X86II::MO_GOTNTPOFF:
+ RefKind = MCSymbolRefExpr::VK_GOTNTPOFF;
+ break;
+ case X86II::MO_GOTPCREL:
+ RefKind = MCSymbolRefExpr::VK_GOTPCREL;
+ break;
+ case X86II::MO_GOT:
+ RefKind = MCSymbolRefExpr::VK_GOT;
+ break;
+ case X86II::MO_GOTOFF:
+ RefKind = MCSymbolRefExpr::VK_GOTOFF;
+ break;
+ case X86II::MO_PLT:
+ RefKind = MCSymbolRefExpr::VK_PLT;
+ break;
+ case X86II::MO_ABS8:
+ RefKind = MCSymbolRefExpr::VK_X86_ABS8;
+ break;
+ case X86II::MO_PIC_BASE_OFFSET:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+ Expr = MCSymbolRefExpr::create(Sym, Ctx);
+ // Subtract the pic base.
+ Expr = MCBinaryExpr::createSub(
+ Expr, MCSymbolRefExpr::create(MF.getPICBaseSymbol(), Ctx), Ctx);
+ if (MO.isJTI()) {
+ assert(MAI.doesSetDirectiveSuppressReloc());
+ // If .set directive is supported, use it to reduce the number of
+ // relocations the assembler will generate for differences between
+ // local labels. This is only safe when the symbols are in the same
+ // section so we are restricting it to jumptable references.
+ MCSymbol *Label = Ctx.createTempSymbol();
+ AsmPrinter.OutStreamer->emitAssignment(Label, Expr);
+ Expr = MCSymbolRefExpr::create(Label, Ctx);
+ }
+ break;
+ }
+
+ if (!Expr)
+ Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx);
+
+ if (!MO.isJTI() && !MO.isMBB() && MO.getOffset())
+ Expr = MCBinaryExpr::createAdd(
+ Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
+ return MCOperand::createExpr(Expr);
+}
+
+/// Simplify FOO $imm, %{al,ax,eax,rax} to FOO $imm, for instructions with
+/// a short fixed-register form.
+static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) {
+ unsigned ImmOp = Inst.getNumOperands() - 1;
+ assert(Inst.getOperand(0).isReg() &&
+ (Inst.getOperand(ImmOp).isImm() || Inst.getOperand(ImmOp).isExpr()) &&
+ ((Inst.getNumOperands() == 3 && Inst.getOperand(1).isReg() &&
+ Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) ||
+ Inst.getNumOperands() == 2) &&
+ "Unexpected instruction!");
+
+ // Check whether the destination register can be fixed.
+ unsigned Reg = Inst.getOperand(0).getReg();
+ if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
+ return;
+
+ // If so, rewrite the instruction.
+ MCOperand Saved = Inst.getOperand(ImmOp);
+ Inst = MCInst();
+ Inst.setOpcode(Opcode);
+ Inst.addOperand(Saved);
+}
+
+/// If a movsx instruction has a shorter encoding for the used register,
+/// simplify the instruction to use it instead.
+static void SimplifyMOVSX(MCInst &Inst) {
+ unsigned NewOpcode = 0;
+ unsigned Op0 = Inst.getOperand(0).getReg(), Op1 = Inst.getOperand(1).getReg();
+ switch (Inst.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected instruction!");
+ case X86::MOVSX16rr8: // movsbw %al, %ax --> cbtw
+ if (Op0 == X86::AX && Op1 == X86::AL)
+ NewOpcode = X86::CBW;
+ break;
+ case X86::MOVSX32rr16: // movswl %ax, %eax --> cwtl
+ if (Op0 == X86::EAX && Op1 == X86::AX)
+ NewOpcode = X86::CWDE;
+ break;
+ case X86::MOVSX64rr32: // movslq %eax, %rax --> cltq
+ if (Op0 == X86::RAX && Op1 == X86::EAX)
+ NewOpcode = X86::CDQE;
+ break;
+ }
+
+ if (NewOpcode != 0) {
+ Inst = MCInst();
+ Inst.setOpcode(NewOpcode);
+ }
+}
+
+/// Simplify things like MOV32rm to MOV32o32a.
+static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
+ unsigned Opcode) {
+ // Don't make these simplifications in 64-bit mode; other assemblers don't
+ // perform them because they make the code larger.
+ if (Printer.getSubtarget().is64Bit())
+ return;
+
+ bool IsStore = Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg();
+ unsigned AddrBase = IsStore;
+ unsigned RegOp = IsStore ? 0 : 5;
+ unsigned AddrOp = AddrBase + 3;
+ assert(
+ Inst.getNumOperands() == 6 && Inst.getOperand(RegOp).isReg() &&
+ Inst.getOperand(AddrBase + X86::AddrBaseReg).isReg() &&
+ Inst.getOperand(AddrBase + X86::AddrScaleAmt).isImm() &&
+ Inst.getOperand(AddrBase + X86::AddrIndexReg).isReg() &&
+ Inst.getOperand(AddrBase + X86::AddrSegmentReg).isReg() &&
+ (Inst.getOperand(AddrOp).isExpr() || Inst.getOperand(AddrOp).isImm()) &&
+ "Unexpected instruction!");
+
+ // Check whether the destination register can be fixed.
+ unsigned Reg = Inst.getOperand(RegOp).getReg();
+ if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
+ return;
+
+ // Check whether this is an absolute address.
+ // FIXME: We know TLVP symbol refs aren't, but there should be a better way
+ // to do this here.
+ bool Absolute = true;
+ if (Inst.getOperand(AddrOp).isExpr()) {
+ const MCExpr *MCE = Inst.getOperand(AddrOp).getExpr();
+ if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(MCE))
+ if (SRE->getKind() == MCSymbolRefExpr::VK_TLVP)
+ Absolute = false;
+ }
+
+ if (Absolute &&
+ (Inst.getOperand(AddrBase + X86::AddrBaseReg).getReg() != 0 ||
+ Inst.getOperand(AddrBase + X86::AddrScaleAmt).getImm() != 1 ||
+ Inst.getOperand(AddrBase + X86::AddrIndexReg).getReg() != 0))
+ return;
+
+ // If so, rewrite the instruction.
+ MCOperand Saved = Inst.getOperand(AddrOp);
+ MCOperand Seg = Inst.getOperand(AddrBase + X86::AddrSegmentReg);
+ Inst = MCInst();
+ Inst.setOpcode(Opcode);
+ Inst.addOperand(Saved);
+ Inst.addOperand(Seg);
+}
+
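+// Pick the plain return opcode for the subtarget: RETQ in 64-bit mode,
+// RETL otherwise.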
+static unsigned getRetOpcode(const X86Subtarget &Subtarget) {
+ return Subtarget.is64Bit() ? X86::RETQ : X86::RETL;
+}
+
+Optional<MCOperand>
+X86MCInstLower::LowerMachineOperand(const MachineInstr *MI,
+ const MachineOperand &MO) const {
+ switch (MO.getType()) {
+ default:
+ MI->print(errs());
+ llvm_unreachable("unknown operand type");
+ case MachineOperand::MO_Register:
+ // Ignore all implicit register operands.
+ if (MO.isImplicit())
+ return None;
+ return MCOperand::createReg(MO.getReg());
+ case MachineOperand::MO_Immediate:
+ return MCOperand::createImm(MO.getImm());
+ case MachineOperand::MO_MachineBasicBlock:
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ExternalSymbol:
+ return LowerSymbolOperand(MO, GetSymbolFromOperand(MO));
+ case MachineOperand::MO_MCSymbol:
+ return LowerSymbolOperand(MO, MO.getMCSymbol());
+ case MachineOperand::MO_JumpTableIndex:
+ return LowerSymbolOperand(MO, AsmPrinter.GetJTISymbol(MO.getIndex()));
+ case MachineOperand::MO_ConstantPoolIndex:
+ return LowerSymbolOperand(MO, AsmPrinter.GetCPISymbol(MO.getIndex()));
+ case MachineOperand::MO_BlockAddress:
+ return LowerSymbolOperand(
+ MO, AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress()));
+ case MachineOperand::MO_RegisterMask:
+ // Ignore call clobbers.
+ return None;
+ }
+}
+
+// Replace TAILJMP opcodes with their equivalent opcodes that have encoding
+// information.
+static unsigned convertTailJumpOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ case X86::TAILJMPr:
+ Opcode = X86::JMP32r;
+ break;
+ case X86::TAILJMPm:
+ Opcode = X86::JMP32m;
+ break;
+ case X86::TAILJMPr64:
+ Opcode = X86::JMP64r;
+ break;
+ case X86::TAILJMPm64:
+ Opcode = X86::JMP64m;
+ break;
+ case X86::TAILJMPr64_REX:
+ Opcode = X86::JMP64r_REX;
+ break;
+ case X86::TAILJMPm64_REX:
+ Opcode = X86::JMP64m_REX;
+ break;
+ case X86::TAILJMPd:
+ case X86::TAILJMPd64:
+ Opcode = X86::JMP_1;
+ break;
+ case X86::TAILJMPd_CC:
+ case X86::TAILJMPd64_CC:
+ Opcode = X86::JCC_1;
+ break;
+ }
+
+ return Opcode;
+}
+
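+// Lower MI to OutMI: translate each MachineOperand to an MCOperand, then
+// rewrite pseudos and suboptimally encoded opcodes into their final MC forms.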
+void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+ OutMI.setOpcode(MI->getOpcode());
+
+ for (const MachineOperand &MO : MI->operands())
+ if (auto MaybeMCOp = LowerMachineOperand(MI, MO))
+ OutMI.addOperand(MaybeMCOp.getValue());
+
+ // Handle a few special cases to eliminate operand modifiers.
+ switch (OutMI.getOpcode()) {
+ case X86::LEA64_32r:
+ case X86::LEA64r:
+ case X86::LEA16r:
+ case X86::LEA32r:
+ // LEA should have a segment register, but it must be empty.
+ assert(OutMI.getNumOperands() == 1 + X86::AddrNumOperands &&
+ "Unexpected # of LEA operands");
+ assert(OutMI.getOperand(1 + X86::AddrSegmentReg).getReg() == 0 &&
+ "LEA has segment specified!");
+ break;
+
+ case X86::MULX32Hrr:
+ case X86::MULX32Hrm:
+ case X86::MULX64Hrr:
+ case X86::MULX64Hrm: {
+ // Turn into regular MULX by duplicating the destination.
+ unsigned NewOpc;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::MULX32Hrr: NewOpc = X86::MULX32rr; break;
+ case X86::MULX32Hrm: NewOpc = X86::MULX32rm; break;
+ case X86::MULX64Hrr: NewOpc = X86::MULX64rr; break;
+ case X86::MULX64Hrm: NewOpc = X86::MULX64rm; break;
+ }
+ OutMI.setOpcode(NewOpc);
+ // Duplicate the destination.
+ unsigned DestReg = OutMI.getOperand(0).getReg();
+ OutMI.insert(OutMI.begin(), MCOperand::createReg(DestReg));
+ break;
+ }
+
+ // Commute operands to get a smaller encoding by using VEX.R instead of VEX.B
+ // if one of the registers is extended, but the other isn't.
+ case X86::VMOVZPQILo2PQIrr:
+ case X86::VMOVAPDrr:
+ case X86::VMOVAPDYrr:
+ case X86::VMOVAPSrr:
+ case X86::VMOVAPSYrr:
+ case X86::VMOVDQArr:
+ case X86::VMOVDQAYrr:
+ case X86::VMOVDQUrr:
+ case X86::VMOVDQUYrr:
+ case X86::VMOVUPDrr:
+ case X86::VMOVUPDYrr:
+ case X86::VMOVUPSrr:
+ case X86::VMOVUPSYrr: {
+ if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
+ X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg())) {
+ unsigned NewOpc;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break;
+ case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break;
+ case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
+ case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break;
+ case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
+ case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break;
+ case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
+ case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break;
+ case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
+ case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break;
+ case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
+ case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break;
+ case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
+ }
+ OutMI.setOpcode(NewOpc);
+ }
+ break;
+ }
+ case X86::VMOVSDrr:
+ case X86::VMOVSSrr: {
+ if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
+ X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg())) {
+ unsigned NewOpc;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break;
+ case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break;
+ }
+ OutMI.setOpcode(NewOpc);
+ }
+ break;
+ }
+
+ case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rmik:
+ case X86::VPCMPBZ128rri: case X86::VPCMPBZ128rrik:
+ case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rmik:
+ case X86::VPCMPBZ256rri: case X86::VPCMPBZ256rrik:
+ case X86::VPCMPBZrmi: case X86::VPCMPBZrmik:
+ case X86::VPCMPBZrri: case X86::VPCMPBZrrik:
+ case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rmik:
+ case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk:
+ case X86::VPCMPDZ128rri: case X86::VPCMPDZ128rrik:
+ case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rmik:
+ case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk:
+ case X86::VPCMPDZ256rri: case X86::VPCMPDZ256rrik:
+ case X86::VPCMPDZrmi: case X86::VPCMPDZrmik:
+ case X86::VPCMPDZrmib: case X86::VPCMPDZrmibk:
+ case X86::VPCMPDZrri: case X86::VPCMPDZrrik:
+ case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rmik:
+ case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk:
+ case X86::VPCMPQZ128rri: case X86::VPCMPQZ128rrik:
+ case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rmik:
+ case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk:
+ case X86::VPCMPQZ256rri: case X86::VPCMPQZ256rrik:
+ case X86::VPCMPQZrmi: case X86::VPCMPQZrmik:
+ case X86::VPCMPQZrmib: case X86::VPCMPQZrmibk:
+ case X86::VPCMPQZrri: case X86::VPCMPQZrrik:
+ case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rmik:
+ case X86::VPCMPWZ128rri: case X86::VPCMPWZ128rrik:
+ case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rmik:
+ case X86::VPCMPWZ256rri: case X86::VPCMPWZ256rrik:
+ case X86::VPCMPWZrmi: case X86::VPCMPWZrmik:
+ case X86::VPCMPWZrri: case X86::VPCMPWZrrik: {
+ // Turn immediate 0 into the VPCMPEQ instruction.
+ if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 0) {
+ unsigned NewOpc;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPEQBZ128rm; break;
+ case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPEQBZ128rmk; break;
+ case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPEQBZ128rr; break;
+ case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPEQBZ128rrk; break;
+ case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPEQBZ256rm; break;
+ case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPEQBZ256rmk; break;
+ case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPEQBZ256rr; break;
+ case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPEQBZ256rrk; break;
+ case X86::VPCMPBZrmi: NewOpc = X86::VPCMPEQBZrm; break;
+ case X86::VPCMPBZrmik: NewOpc = X86::VPCMPEQBZrmk; break;
+ case X86::VPCMPBZrri: NewOpc = X86::VPCMPEQBZrr; break;
+ case X86::VPCMPBZrrik: NewOpc = X86::VPCMPEQBZrrk; break;
+ case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPEQDZ128rm; break;
+ case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPEQDZ128rmb; break;
+ case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPEQDZ128rmbk; break;
+ case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPEQDZ128rmk; break;
+ case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPEQDZ128rr; break;
+ case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPEQDZ128rrk; break;
+ case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPEQDZ256rm; break;
+ case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPEQDZ256rmb; break;
+ case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPEQDZ256rmbk; break;
+ case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPEQDZ256rmk; break;
+ case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPEQDZ256rr; break;
+ case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPEQDZ256rrk; break;
+ case X86::VPCMPDZrmi: NewOpc = X86::VPCMPEQDZrm; break;
+ case X86::VPCMPDZrmib: NewOpc = X86::VPCMPEQDZrmb; break;
+ case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPEQDZrmbk; break;
+ case X86::VPCMPDZrmik: NewOpc = X86::VPCMPEQDZrmk; break;
+ case X86::VPCMPDZrri: NewOpc = X86::VPCMPEQDZrr; break;
+ case X86::VPCMPDZrrik: NewOpc = X86::VPCMPEQDZrrk; break;
+ case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPEQQZ128rm; break;
+ case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPEQQZ128rmb; break;
+ case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPEQQZ128rmbk; break;
+ case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPEQQZ128rmk; break;
+ case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPEQQZ128rr; break;
+ case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPEQQZ128rrk; break;
+ case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPEQQZ256rm; break;
+ case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPEQQZ256rmb; break;
+ case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPEQQZ256rmbk; break;
+ case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPEQQZ256rmk; break;
+ case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPEQQZ256rr; break;
+ case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPEQQZ256rrk; break;
+ case X86::VPCMPQZrmi: NewOpc = X86::VPCMPEQQZrm; break;
+ case X86::VPCMPQZrmib: NewOpc = X86::VPCMPEQQZrmb; break;
+ case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPEQQZrmbk; break;
+ case X86::VPCMPQZrmik: NewOpc = X86::VPCMPEQQZrmk; break;
+ case X86::VPCMPQZrri: NewOpc = X86::VPCMPEQQZrr; break;
+ case X86::VPCMPQZrrik: NewOpc = X86::VPCMPEQQZrrk; break;
+ case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPEQWZ128rm; break;
+ case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPEQWZ128rmk; break;
+ case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPEQWZ128rr; break;
+ case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPEQWZ128rrk; break;
+ case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPEQWZ256rm; break;
+ case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPEQWZ256rmk; break;
+ case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPEQWZ256rr; break;
+ case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPEQWZ256rrk; break;
+ case X86::VPCMPWZrmi: NewOpc = X86::VPCMPEQWZrm; break;
+ case X86::VPCMPWZrmik: NewOpc = X86::VPCMPEQWZrmk; break;
+ case X86::VPCMPWZrri: NewOpc = X86::VPCMPEQWZrr; break;
+ case X86::VPCMPWZrrik: NewOpc = X86::VPCMPEQWZrrk; break;
+ }
+
+ OutMI.setOpcode(NewOpc);
+ OutMI.erase(&OutMI.getOperand(OutMI.getNumOperands() - 1));
+ break;
+ }
+
+ // Turn immediate 6 into the VPCMPGT instruction.
+ if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 6) {
+ unsigned NewOpc;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPGTBZ128rm; break;
+ case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPGTBZ128rmk; break;
+ case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPGTBZ128rr; break;
+ case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPGTBZ128rrk; break;
+ case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPGTBZ256rm; break;
+ case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPGTBZ256rmk; break;
+ case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPGTBZ256rr; break;
+ case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPGTBZ256rrk; break;
+ case X86::VPCMPBZrmi: NewOpc = X86::VPCMPGTBZrm; break;
+ case X86::VPCMPBZrmik: NewOpc = X86::VPCMPGTBZrmk; break;
+ case X86::VPCMPBZrri: NewOpc = X86::VPCMPGTBZrr; break;
+ case X86::VPCMPBZrrik: NewOpc = X86::VPCMPGTBZrrk; break;
+ case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPGTDZ128rm; break;
+ case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPGTDZ128rmb; break;
+ case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPGTDZ128rmbk; break;
+ case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPGTDZ128rmk; break;
+ case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPGTDZ128rr; break;
+ case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPGTDZ128rrk; break;
+ case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPGTDZ256rm; break;
+ case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPGTDZ256rmb; break;
+ case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPGTDZ256rmbk; break;
+ case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPGTDZ256rmk; break;
+ case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPGTDZ256rr; break;
+ case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPGTDZ256rrk; break;
+ case X86::VPCMPDZrmi: NewOpc = X86::VPCMPGTDZrm; break;
+ case X86::VPCMPDZrmib: NewOpc = X86::VPCMPGTDZrmb; break;
+ case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPGTDZrmbk; break;
+ case X86::VPCMPDZrmik: NewOpc = X86::VPCMPGTDZrmk; break;
+ case X86::VPCMPDZrri: NewOpc = X86::VPCMPGTDZrr; break;
+ case X86::VPCMPDZrrik: NewOpc = X86::VPCMPGTDZrrk; break;
+ case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPGTQZ128rm; break;
+ case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPGTQZ128rmb; break;
+ case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPGTQZ128rmbk; break;
+ case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPGTQZ128rmk; break;
+ case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPGTQZ128rr; break;
+ case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPGTQZ128rrk; break;
+ case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPGTQZ256rm; break;
+ case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPGTQZ256rmb; break;
+ case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPGTQZ256rmbk; break;
+ case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPGTQZ256rmk; break;
+ case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPGTQZ256rr; break;
+ case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPGTQZ256rrk; break;
+ case X86::VPCMPQZrmi: NewOpc = X86::VPCMPGTQZrm; break;
+ case X86::VPCMPQZrmib: NewOpc = X86::VPCMPGTQZrmb; break;
+ case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPGTQZrmbk; break;
+ case X86::VPCMPQZrmik: NewOpc = X86::VPCMPGTQZrmk; break;
+ case X86::VPCMPQZrri: NewOpc = X86::VPCMPGTQZrr; break;
+ case X86::VPCMPQZrrik: NewOpc = X86::VPCMPGTQZrrk; break;
+ case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPGTWZ128rm; break;
+ case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPGTWZ128rmk; break;
+ case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPGTWZ128rr; break;
+ case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPGTWZ128rrk; break;
+ case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPGTWZ256rm; break;
+ case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPGTWZ256rmk; break;
+ case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPGTWZ256rr; break;
+ case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPGTWZ256rrk; break;
+ case X86::VPCMPWZrmi: NewOpc = X86::VPCMPGTWZrm; break;
+ case X86::VPCMPWZrmik: NewOpc = X86::VPCMPGTWZrmk; break;
+ case X86::VPCMPWZrri: NewOpc = X86::VPCMPGTWZrr; break;
+ case X86::VPCMPWZrrik: NewOpc = X86::VPCMPGTWZrrk; break;
+ }
+
+ OutMI.setOpcode(NewOpc);
+ OutMI.erase(&OutMI.getOperand(OutMI.getNumOperands() - 1));
+ break;
+ }
+
+ break;
+ }
+
+ // CALL64r, CALL64pcrel32 - These instructions used to have
+ // register inputs modeled as normal uses instead of implicit uses. As such,
+ // we used to truncate off all but the first operand (the callee). This
+ // issue seems to have been fixed at some point. This assert verifies that.
+ case X86::CALL64r:
+ case X86::CALL64pcrel32:
+ assert(OutMI.getNumOperands() == 1 && "Unexpected number of operands!");
+ break;
+
+ case X86::EH_RETURN:
+ case X86::EH_RETURN64: {
+ OutMI = MCInst();
+ OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget()));
+ break;
+ }
+
+ case X86::CLEANUPRET: {
+ // Replace CLEANUPRET with the appropriate RET.
+ OutMI = MCInst();
+ OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget()));
+ break;
+ }
+
+ case X86::CATCHRET: {
+ // Replace CATCHRET with the appropriate RET.
+ const X86Subtarget &Subtarget = AsmPrinter.getSubtarget();
+ unsigned ReturnReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
+ OutMI = MCInst();
+ OutMI.setOpcode(getRetOpcode(Subtarget));
+ OutMI.addOperand(MCOperand::createReg(ReturnReg));
+ break;
+ }
+
+ // TAILJMPr*, TAILJMPd*, TAILJMPm* - Lower to the corresponding real jump
+ // instruction.
+ case X86::TAILJMPr:
+ case X86::TAILJMPr64:
+ case X86::TAILJMPr64_REX:
+ case X86::TAILJMPd:
+ case X86::TAILJMPd64:
+ assert(OutMI.getNumOperands() == 1 && "Unexpected number of operands!");
+ OutMI.setOpcode(convertTailJumpOpcode(OutMI.getOpcode()));
+ break;
+
+ case X86::TAILJMPd_CC:
+ case X86::TAILJMPd64_CC:
+ assert(OutMI.getNumOperands() == 2 && "Unexpected number of operands!");
+ OutMI.setOpcode(convertTailJumpOpcode(OutMI.getOpcode()));
+ break;
+
+ case X86::TAILJMPm:
+ case X86::TAILJMPm64:
+ case X86::TAILJMPm64_REX:
+ assert(OutMI.getNumOperands() == X86::AddrNumOperands &&
+ "Unexpected number of operands!");
+ OutMI.setOpcode(convertTailJumpOpcode(OutMI.getOpcode()));
+ break;
+
+ case X86::DEC16r:
+ case X86::DEC32r:
+ case X86::INC16r:
+ case X86::INC32r:
+ // If we aren't in 64-bit mode we can use the 1-byte inc/dec instructions.
+ if (!AsmPrinter.getSubtarget().is64Bit()) {
+ unsigned Opcode;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::DEC16r: Opcode = X86::DEC16r_alt; break;
+ case X86::DEC32r: Opcode = X86::DEC32r_alt; break;
+ case X86::INC16r: Opcode = X86::INC16r_alt; break;
+ case X86::INC32r: Opcode = X86::INC32r_alt; break;
+ }
+ OutMI.setOpcode(Opcode);
+ }
+ break;
+
+ // We don't currently select the correct instruction form for instructions
+ // which have a short %eax, etc. form. Handle this by custom lowering, for
+ // now.
+ //
+ // Note, we are currently not handling the following instructions:
+ // MOV64ao8, MOV64o8a
+ // XCHG16ar, XCHG32ar, XCHG64ar
+ case X86::MOV8mr_NOREX:
+ case X86::MOV8mr:
+ case X86::MOV8rm_NOREX:
+ case X86::MOV8rm:
+ case X86::MOV16mr:
+ case X86::MOV16rm:
+ case X86::MOV32mr:
+ case X86::MOV32rm: {
+ unsigned NewOpc;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::MOV8mr_NOREX:
+ case X86::MOV8mr: NewOpc = X86::MOV8o32a; break;
+ case X86::MOV8rm_NOREX:
+ case X86::MOV8rm: NewOpc = X86::MOV8ao32; break;
+ case X86::MOV16mr: NewOpc = X86::MOV16o32a; break;
+ case X86::MOV16rm: NewOpc = X86::MOV16ao32; break;
+ case X86::MOV32mr: NewOpc = X86::MOV32o32a; break;
+ case X86::MOV32rm: NewOpc = X86::MOV32ao32; break;
+ }
+ SimplifyShortMoveForm(AsmPrinter, OutMI, NewOpc);
+ break;
+ }
+
+ case X86::ADC8ri: case X86::ADC16ri: case X86::ADC32ri: case X86::ADC64ri32:
+ case X86::ADD8ri: case X86::ADD16ri: case X86::ADD32ri: case X86::ADD64ri32:
+ case X86::AND8ri: case X86::AND16ri: case X86::AND32ri: case X86::AND64ri32:
+ case X86::CMP8ri: case X86::CMP16ri: case X86::CMP32ri: case X86::CMP64ri32:
+ case X86::OR8ri: case X86::OR16ri: case X86::OR32ri: case X86::OR64ri32:
+ case X86::SBB8ri: case X86::SBB16ri: case X86::SBB32ri: case X86::SBB64ri32:
+ case X86::SUB8ri: case X86::SUB16ri: case X86::SUB32ri: case X86::SUB64ri32:
+ case X86::TEST8ri:case X86::TEST16ri:case X86::TEST32ri:case X86::TEST64ri32:
+ case X86::XOR8ri: case X86::XOR16ri: case X86::XOR32ri: case X86::XOR64ri32: {
+ unsigned NewOpc;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::ADC8ri: NewOpc = X86::ADC8i8; break;
+ case X86::ADC16ri: NewOpc = X86::ADC16i16; break;
+ case X86::ADC32ri: NewOpc = X86::ADC32i32; break;
+ case X86::ADC64ri32: NewOpc = X86::ADC64i32; break;
+ case X86::ADD8ri: NewOpc = X86::ADD8i8; break;
+ case X86::ADD16ri: NewOpc = X86::ADD16i16; break;
+ case X86::ADD32ri: NewOpc = X86::ADD32i32; break;
+ case X86::ADD64ri32: NewOpc = X86::ADD64i32; break;
+ case X86::AND8ri: NewOpc = X86::AND8i8; break;
+ case X86::AND16ri: NewOpc = X86::AND16i16; break;
+ case X86::AND32ri: NewOpc = X86::AND32i32; break;
+ case X86::AND64ri32: NewOpc = X86::AND64i32; break;
+ case X86::CMP8ri: NewOpc = X86::CMP8i8; break;
+ case X86::CMP16ri: NewOpc = X86::CMP16i16; break;
+ case X86::CMP32ri: NewOpc = X86::CMP32i32; break;
+ case X86::CMP64ri32: NewOpc = X86::CMP64i32; break;
+ case X86::OR8ri: NewOpc = X86::OR8i8; break;
+ case X86::OR16ri: NewOpc = X86::OR16i16; break;
+ case X86::OR32ri: NewOpc = X86::OR32i32; break;
+ case X86::OR64ri32: NewOpc = X86::OR64i32; break;
+ case X86::SBB8ri: NewOpc = X86::SBB8i8; break;
+ case X86::SBB16ri: NewOpc = X86::SBB16i16; break;
+ case X86::SBB32ri: NewOpc = X86::SBB32i32; break;
+ case X86::SBB64ri32: NewOpc = X86::SBB64i32; break;
+ case X86::SUB8ri: NewOpc = X86::SUB8i8; break;
+ case X86::SUB16ri: NewOpc = X86::SUB16i16; break;
+ case X86::SUB32ri: NewOpc = X86::SUB32i32; break;
+ case X86::SUB64ri32: NewOpc = X86::SUB64i32; break;
+ case X86::TEST8ri: NewOpc = X86::TEST8i8; break;
+ case X86::TEST16ri: NewOpc = X86::TEST16i16; break;
+ case X86::TEST32ri: NewOpc = X86::TEST32i32; break;
+ case X86::TEST64ri32: NewOpc = X86::TEST64i32; break;
+ case X86::XOR8ri: NewOpc = X86::XOR8i8; break;
+ case X86::XOR16ri: NewOpc = X86::XOR16i16; break;
+ case X86::XOR32ri: NewOpc = X86::XOR32i32; break;
+ case X86::XOR64ri32: NewOpc = X86::XOR64i32; break;
+ }
+ SimplifyShortImmForm(OutMI, NewOpc);
+ break;
+ }
+
+ // Try to shrink some forms of movsx.
+ case X86::MOVSX16rr8:
+ case X86::MOVSX32rr16:
+ case X86::MOVSX64rr32:
+ SimplifyMOVSX(OutMI);
+ break;
+
+ case X86::VCMPPDrri:
+ case X86::VCMPPDYrri:
+ case X86::VCMPPSrri:
+ case X86::VCMPPSYrri:
+ case X86::VCMPSDrr:
+ case X86::VCMPSSrr: {
+ // Swap the operands if it will enable a 2 byte VEX encoding.
+ // FIXME: Change the immediate to improve opportunities?
+ if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg()) &&
+ X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg())) {
+ unsigned Imm = MI->getOperand(3).getImm() & 0x7;
+ switch (Imm) {
+ default: break;
+ case 0x00: // EQUAL
+ case 0x03: // UNORDERED
+ case 0x04: // NOT EQUAL
+ case 0x07: // ORDERED
+ std::swap(OutMI.getOperand(1), OutMI.getOperand(2));
+ break;
+ }
+ }
+ break;
+ }
+
+ case X86::VMOVHLPSrr:
+ case X86::VUNPCKHPDrr:
+ // These are not truly commutable so hide them from the default case.
+ break;
+
+ default: {
+ // If the instruction is a commutable arithmetic instruction we might be
+ // able to commute the operands to get a 2 byte VEX prefix.
+ uint64_t TSFlags = MI->getDesc().TSFlags;
+ if (MI->getDesc().isCommutable() &&
+ (TSFlags & X86II::EncodingMask) == X86II::VEX &&
+ (TSFlags & X86II::OpMapMask) == X86II::TB &&
+ (TSFlags & X86II::FormMask) == X86II::MRMSrcReg &&
+ !(TSFlags & X86II::VEX_W) && (TSFlags & X86II::VEX_4V) &&
+ OutMI.getNumOperands() == 3) {
+ if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg()) &&
+ X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg()))
+ std::swap(OutMI.getOperand(1), OutMI.getOperand(2));
+ }
+ break;
+ }
+ }
+}
+
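+// Lower the TLS_addr*/TLS_base_addr* pseudos into the canonical General
+// Dynamic / Local Dynamic sequences calling __tls_get_addr (___tls_get_addr
+// in 32-bit mode), emitting the data16/rex64 prefix padding that linkers
+// expect when relaxing the GD sequence.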
+void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
+ const MachineInstr &MI) {
+ NoAutoPaddingScope NoPadScope(*OutStreamer);
+ bool Is64Bits = MI.getOpcode() != X86::TLS_addr32 &&
+ MI.getOpcode() != X86::TLS_base_addr32;
+ bool Is64BitsLP64 = MI.getOpcode() == X86::TLS_addr64 ||
+ MI.getOpcode() == X86::TLS_base_addr64;
+ MCContext &Ctx = OutStreamer->getContext();
+
+ MCSymbolRefExpr::VariantKind SRVK;
+ switch (MI.getOpcode()) {
+ case X86::TLS_addr32:
+ case X86::TLS_addr64:
+ case X86::TLS_addrX32:
+ SRVK = MCSymbolRefExpr::VK_TLSGD;
+ break;
+ case X86::TLS_base_addr32:
+ SRVK = MCSymbolRefExpr::VK_TLSLDM;
+ break;
+ case X86::TLS_base_addr64:
+ case X86::TLS_base_addrX32:
+ SRVK = MCSymbolRefExpr::VK_TLSLD;
+ break;
+ default:
+ llvm_unreachable("unexpected opcode");
+ }
+
+ const MCSymbolRefExpr *Sym = MCSymbolRefExpr::create(
+ MCInstLowering.GetSymbolFromOperand(MI.getOperand(3)), SRVK, Ctx);
+
+ // As of binutils 2.32, ld has a bogus TLS relaxation error when the GD/LD
+ // code sequence using R_X86_64_GOTPCREL (instead of R_X86_64_GOTPCRELX) is
+ // attempted to be relaxed to IE/LE (binutils PR24784). Work around the bug by
+ // only using GOT when GOTPCRELX is enabled.
+ // TODO Delete the workaround when GOTPCRELX becomes commonplace.
+ bool UseGot = MMI->getModule()->getRtLibUseGOT() &&
+ Ctx.getAsmInfo()->canRelaxRelocations();
+
+ if (Is64Bits) {
+ bool NeedsPadding = SRVK == MCSymbolRefExpr::VK_TLSGD;
+ if (NeedsPadding && Is64BitsLP64)
+ EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+ EmitAndCountInstruction(MCInstBuilder(X86::LEA64r)
+ .addReg(X86::RDI)
+ .addReg(X86::RIP)
+ .addImm(1)
+ .addReg(0)
+ .addExpr(Sym)
+ .addReg(0));
+ const MCSymbol *TlsGetAddr = Ctx.getOrCreateSymbol("__tls_get_addr");
+ if (NeedsPadding) {
+ if (!UseGot)
+ EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+ EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+ EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
+ }
+ if (UseGot) {
+ const MCExpr *Expr = MCSymbolRefExpr::create(
+ TlsGetAddr, MCSymbolRefExpr::VK_GOTPCREL, Ctx);
+ EmitAndCountInstruction(MCInstBuilder(X86::CALL64m)
+ .addReg(X86::RIP)
+ .addImm(1)
+ .addReg(0)
+ .addExpr(Expr)
+ .addReg(0));
+ } else {
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::CALL64pcrel32)
+ .addExpr(MCSymbolRefExpr::create(TlsGetAddr,
+ MCSymbolRefExpr::VK_PLT, Ctx)));
+ }
+ } else {
+ if (SRVK == MCSymbolRefExpr::VK_TLSGD && !UseGot) {
+ EmitAndCountInstruction(MCInstBuilder(X86::LEA32r)
+ .addReg(X86::EAX)
+ .addReg(0)
+ .addImm(1)
+ .addReg(X86::EBX)
+ .addExpr(Sym)
+ .addReg(0));
+ } else {
+ EmitAndCountInstruction(MCInstBuilder(X86::LEA32r)
+ .addReg(X86::EAX)
+ .addReg(X86::EBX)
+ .addImm(1)
+ .addReg(0)
+ .addExpr(Sym)
+ .addReg(0));
+ }
+
+ const MCSymbol *TlsGetAddr = Ctx.getOrCreateSymbol("___tls_get_addr");
+ if (UseGot) {
+ const MCExpr *Expr =
+ MCSymbolRefExpr::create(TlsGetAddr, MCSymbolRefExpr::VK_GOT, Ctx);
+ EmitAndCountInstruction(MCInstBuilder(X86::CALL32m)
+ .addReg(X86::EBX)
+ .addImm(1)
+ .addReg(0)
+ .addExpr(Expr)
+ .addReg(0));
+ } else {
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::CALLpcrel32)
+ .addExpr(MCSymbolRefExpr::create(TlsGetAddr,
+ MCSymbolRefExpr::VK_PLT, Ctx)));
+ }
+ }
+}
+
+/// Emit the largest nop instruction smaller than or equal to \p NumBytes
+/// bytes. Return the size of the nop emitted.
+static unsigned emitNop(MCStreamer &OS, unsigned NumBytes,
+ const X86Subtarget *Subtarget) {
+ // Determine the longest nop which can be efficiently decoded for the given
+ // target CPU. 15 bytes is the longest single NOP instruction, but some
+ // platforms can't decode the longest forms efficiently.
+ unsigned MaxNopLength = 1;
+ if (Subtarget->is64Bit()) {
+ // FIXME: We can use NOOPL on 32-bit targets with FeatureNOPL, but the
+ // IndexReg/BaseReg below need to be updated.
+ if (Subtarget->hasFeature(X86::FeatureFast7ByteNOP))
+ MaxNopLength = 7;
+ else if (Subtarget->hasFeature(X86::FeatureFast15ByteNOP))
+ MaxNopLength = 15;
+ else if (Subtarget->hasFeature(X86::FeatureFast11ByteNOP))
+ MaxNopLength = 11;
+ else
+ MaxNopLength = 10;
+ }
+ if (Subtarget->is32Bit())
+ MaxNopLength = 2;
+
+ // Cap a single nop emission at the profitable value for the target
+ NumBytes = std::min(NumBytes, MaxNopLength);
+
+ unsigned NopSize;
+ unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg;
+ IndexReg = Displacement = SegmentReg = 0;
+ BaseReg = X86::RAX;
+ ScaleVal = 1;
+ switch (NumBytes) {
+ case 0:
+ llvm_unreachable("Zero nops?");
+ break;
+ case 1:
+ NopSize = 1;
+ Opc = X86::NOOP;
+ break;
+ case 2:
+ NopSize = 2;
+ Opc = X86::XCHG16ar;
+ break;
+ case 3:
+ NopSize = 3;
+ Opc = X86::NOOPL;
+ break;
+ case 4:
+ NopSize = 4;
+ Opc = X86::NOOPL;
+ Displacement = 8;
+ break;
+ case 5:
+ NopSize = 5;
+ Opc = X86::NOOPL;
+ Displacement = 8;
+ IndexReg = X86::RAX;
+ break;
+ case 6:
+ NopSize = 6;
+ Opc = X86::NOOPW;
+ Displacement = 8;
+ IndexReg = X86::RAX;
+ break;
+ case 7:
+ NopSize = 7;
+ Opc = X86::NOOPL;
+ Displacement = 512;
+ break;
+ case 8:
+ NopSize = 8;
+ Opc = X86::NOOPL;
+ Displacement = 512;
+ IndexReg = X86::RAX;
+ break;
+ case 9:
+ NopSize = 9;
+ Opc = X86::NOOPW;
+ Displacement = 512;
+ IndexReg = X86::RAX;
+ break;
+ default:
+ NopSize = 10;
+ Opc = X86::NOOPW;
+ Displacement = 512;
+ IndexReg = X86::RAX;
+ SegmentReg = X86::CS;
+ break;
+ }
+
+ unsigned NumPrefixes = std::min(NumBytes - NopSize, 5U);
+ NopSize += NumPrefixes;
+ for (unsigned i = 0; i != NumPrefixes; ++i)
+ OS.emitBytes("\x66");
+
+ switch (Opc) {
+ default: llvm_unreachable("Unexpected opcode");
+ case X86::NOOP:
+ OS.emitInstruction(MCInstBuilder(Opc), *Subtarget);
+ break;
+ case X86::XCHG16ar:
+ OS.emitInstruction(MCInstBuilder(Opc).addReg(X86::AX).addReg(X86::AX),
+ *Subtarget);
+ break;
+ case X86::NOOPL:
+ case X86::NOOPW:
+ OS.emitInstruction(MCInstBuilder(Opc)
+ .addReg(BaseReg)
+ .addImm(ScaleVal)
+ .addReg(IndexReg)
+ .addImm(Displacement)
+ .addReg(SegmentReg),
+ *Subtarget);
+ break;
+ }
+ assert(NopSize <= NumBytes && "We overemitted?");
+ return NopSize;
+}
+
+/// Emit the optimal amount of multi-byte nops on X86.
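+/// For example, a request for 25 bytes of padding on a target whose longest
+/// profitable nop is 10 bytes is emitted as nops of 10, 10 and 5 bytes.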
+static void emitX86Nops(MCStreamer &OS, unsigned NumBytes,
+ const X86Subtarget *Subtarget) {
+ unsigned NopsToEmit = NumBytes;
+ (void)NopsToEmit;
+ while (NumBytes) {
+ NumBytes -= emitNop(OS, NumBytes, Subtarget);
+ assert(NopsToEmit >= NumBytes && "Emitted more than I asked for!");
+ }
+}
+
+void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+ assert(Subtarget->is64Bit() && "Statepoint currently only supports X86-64");
+
+ NoAutoPaddingScope NoPadScope(*OutStreamer);
+
+ StatepointOpers SOpers(&MI);
+ if (unsigned PatchBytes = SOpers.getNumPatchBytes()) {
+ emitX86Nops(*OutStreamer, PatchBytes, Subtarget);
+ } else {
+ // Lower call target and choose correct opcode
+ const MachineOperand &CallTarget = SOpers.getCallTarget();
+ MCOperand CallTargetMCOp;
+ unsigned CallOpcode;
+ switch (CallTarget.getType()) {
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ExternalSymbol:
+ CallTargetMCOp = MCIL.LowerSymbolOperand(
+ CallTarget, MCIL.GetSymbolFromOperand(CallTarget));
+ CallOpcode = X86::CALL64pcrel32;
+ // Currently, we only support relative addressing with statepoints.
+ // Otherwise, we'll need a scratch register to hold the target
+ // address. You'll fail asserts during load & relocation if this
+      // symbol is too far away. (TODO: support non-relative addressing)
+ break;
+ case MachineOperand::MO_Immediate:
+ CallTargetMCOp = MCOperand::createImm(CallTarget.getImm());
+ CallOpcode = X86::CALL64pcrel32;
+ // Currently, we only support relative addressing with statepoints.
+ // Otherwise, we'll need a scratch register to hold the target
+ // immediate. You'll fail asserts during load & relocation if this
+      // address is too far away. (TODO: support non-relative addressing)
+ break;
+ case MachineOperand::MO_Register:
+ // FIXME: Add retpoline support and remove this.
+ if (Subtarget->useIndirectThunkCalls())
+ report_fatal_error("Lowering register statepoints with thunks not "
+ "yet implemented.");
+ CallTargetMCOp = MCOperand::createReg(CallTarget.getReg());
+ CallOpcode = X86::CALL64r;
+ break;
+ default:
+ llvm_unreachable("Unsupported operand type in statepoint call target");
+ break;
+ }
+
+ // Emit call
+ MCInst CallInst;
+ CallInst.setOpcode(CallOpcode);
+ CallInst.addOperand(CallTargetMCOp);
+ OutStreamer->emitInstruction(CallInst, getSubtargetInfo());
+ }
+
+ // Record our statepoint node in the same section used by STACKMAP
+ // and PATCHPOINT
+ auto &Ctx = OutStreamer->getContext();
+ MCSymbol *MILabel = Ctx.createTempSymbol();
+ OutStreamer->emitLabel(MILabel);
+ SM.recordStatepoint(*MILabel, MI);
+}
+
+void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI,
+ X86MCInstLower &MCIL) {
+  // FAULTING_LOAD_OP <def>, <faulting type>, <MBB handler>,
+ // <opcode>, <operands>
+
+ NoAutoPaddingScope NoPadScope(*OutStreamer);
+
+ Register DefRegister = FaultingMI.getOperand(0).getReg();
+ FaultMaps::FaultKind FK =
+ static_cast<FaultMaps::FaultKind>(FaultingMI.getOperand(1).getImm());
+ MCSymbol *HandlerLabel = FaultingMI.getOperand(2).getMBB()->getSymbol();
+ unsigned Opcode = FaultingMI.getOperand(3).getImm();
+ unsigned OperandsBeginIdx = 4;
+
+ auto &Ctx = OutStreamer->getContext();
+ MCSymbol *FaultingLabel = Ctx.createTempSymbol();
+ OutStreamer->emitLabel(FaultingLabel);
+
+ assert(FK < FaultMaps::FaultKindMax && "Invalid Faulting Kind!");
+ FM.recordFaultingOp(FK, FaultingLabel, HandlerLabel);
+
+ MCInst MI;
+ MI.setOpcode(Opcode);
+
+ if (DefRegister != X86::NoRegister)
+ MI.addOperand(MCOperand::createReg(DefRegister));
+
+ for (auto I = FaultingMI.operands_begin() + OperandsBeginIdx,
+ E = FaultingMI.operands_end();
+ I != E; ++I)
+ if (auto MaybeOperand = MCIL.LowerMachineOperand(&FaultingMI, *I))
+ MI.addOperand(MaybeOperand.getValue());
+
+ OutStreamer->AddComment("on-fault: " + HandlerLabel->getName());
+ OutStreamer->emitInstruction(MI, getSubtargetInfo());
+}
+
+void X86AsmPrinter::LowerFENTRY_CALL(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+ bool Is64Bits = Subtarget->is64Bit();
+ MCContext &Ctx = OutStreamer->getContext();
+ MCSymbol *fentry = Ctx.getOrCreateSymbol("__fentry__");
+ const MCSymbolRefExpr *Op =
+ MCSymbolRefExpr::create(fentry, MCSymbolRefExpr::VK_None, Ctx);
+
+ EmitAndCountInstruction(
+ MCInstBuilder(Is64Bits ? X86::CALL64pcrel32 : X86::CALLpcrel32)
+ .addExpr(Op));
+}
+
+void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+ // PATCHABLE_OP minsize, opcode, operands
+
+ NoAutoPaddingScope NoPadScope(*OutStreamer);
+
+ unsigned MinSize = MI.getOperand(0).getImm();
+ unsigned Opcode = MI.getOperand(1).getImm();
+
+ MCInst MCI;
+ MCI.setOpcode(Opcode);
+ for (auto &MO : drop_begin(MI.operands(), 2))
+ if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
+ MCI.addOperand(MaybeOperand.getValue());
+
+ SmallString<256> Code;
+ SmallVector<MCFixup, 4> Fixups;
+ raw_svector_ostream VecOS(Code);
+ CodeEmitter->encodeInstruction(MCI, VecOS, Fixups, getSubtargetInfo());
+
+ if (Code.size() < MinSize) {
+ if (MinSize == 2 && Subtarget->is32Bit() &&
+ Subtarget->isTargetWindowsMSVC() &&
+ (Subtarget->getCPU().empty() || Subtarget->getCPU() == "pentium3")) {
+      // For compatibility reasons, when targeting MSVC, it is important to
+      // generate a 'legacy' NOP in the form of an 8B FF MOV EDI, EDI. Some tools
+ // rely specifically on this pattern to be able to patch a function.
+ // This is only for 32-bit targets, when using /arch:IA32 or /arch:SSE.
+ OutStreamer->emitInstruction(
+ MCInstBuilder(X86::MOV32rr_REV).addReg(X86::EDI).addReg(X86::EDI),
+ *Subtarget);
+ } else if (MinSize == 2 && Opcode == X86::PUSH64r) {
+ // This is an optimization that lets us get away without emitting a nop in
+ // many cases.
+ //
+ // NB! In some cases the encoding for PUSH64r (e.g. PUSH64r %r9) takes two
+ // bytes too, so the check on MinSize is important.
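+      //
+      // For example, "push %rbp" is normally the 1-byte 50+rd encoding, while
+      // the equivalent PUSH64rmr (FF /6) form is 2 bytes, which satisfies
+      // MinSize == 2 without a separate nop.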
+ MCI.setOpcode(X86::PUSH64rmr);
+ } else {
+ unsigned NopSize = emitNop(*OutStreamer, MinSize, Subtarget);
+ assert(NopSize == MinSize && "Could not implement MinSize!");
+ (void)NopSize;
+ }
+ }
+
+ OutStreamer->emitInstruction(MCI, getSubtargetInfo());
+}
+
+// Lower a stackmap of the form:
+// <id>, <shadowBytes>, ...
+void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) {
+ SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
+
+ auto &Ctx = OutStreamer->getContext();
+ MCSymbol *MILabel = Ctx.createTempSymbol();
+ OutStreamer->emitLabel(MILabel);
+
+ SM.recordStackMap(*MILabel, MI);
+ unsigned NumShadowBytes = MI.getOperand(1).getImm();
+ SMShadowTracker.reset(NumShadowBytes);
+}
+
+// Lower a patchpoint of the form:
+// [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ...
+void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+ assert(Subtarget->is64Bit() && "Patchpoint currently only supports X86-64");
+
+ SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
+
+ NoAutoPaddingScope NoPadScope(*OutStreamer);
+
+ auto &Ctx = OutStreamer->getContext();
+ MCSymbol *MILabel = Ctx.createTempSymbol();
+ OutStreamer->emitLabel(MILabel);
+ SM.recordPatchPoint(*MILabel, MI);
+
+ PatchPointOpers opers(&MI);
+ unsigned ScratchIdx = opers.getNextScratchIdx();
+ unsigned EncodedBytes = 0;
+ const MachineOperand &CalleeMO = opers.getCallTarget();
+
+ // Check for null target. If target is non-null (i.e. is non-zero or is
+ // symbolic) then emit a call.
+ if (!(CalleeMO.isImm() && !CalleeMO.getImm())) {
+ MCOperand CalleeMCOp;
+ switch (CalleeMO.getType()) {
+ default:
+ /// FIXME: Add a verifier check for bad callee types.
+ llvm_unreachable("Unrecognized callee operand type.");
+ case MachineOperand::MO_Immediate:
+ if (CalleeMO.getImm())
+ CalleeMCOp = MCOperand::createImm(CalleeMO.getImm());
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ case MachineOperand::MO_GlobalAddress:
+ CalleeMCOp = MCIL.LowerSymbolOperand(CalleeMO,
+ MCIL.GetSymbolFromOperand(CalleeMO));
+ break;
+ }
+
+ // Emit MOV to materialize the target address and the CALL to target.
+ // This is encoded with 12-13 bytes, depending on which register is used.
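+    // (The movabs of the 64-bit address is 10 bytes; the indirect call is 2
+    // bytes, or 3 bytes when an extra REX prefix is needed for r8-r15.)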
+ Register ScratchReg = MI.getOperand(ScratchIdx).getReg();
+ if (X86II::isX86_64ExtendedReg(ScratchReg))
+ EncodedBytes = 13;
+ else
+ EncodedBytes = 12;
+
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::MOV64ri).addReg(ScratchReg).addOperand(CalleeMCOp));
+ // FIXME: Add retpoline support and remove this.
+ if (Subtarget->useIndirectThunkCalls())
+ report_fatal_error(
+ "Lowering patchpoint with thunks not yet implemented.");
+ EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg));
+ }
+
+ // Emit padding.
+ unsigned NumBytes = opers.getNumPatchBytes();
+ assert(NumBytes >= EncodedBytes &&
+ "Patchpoint can't request size less than the length of a call.");
+
+ emitX86Nops(*OutStreamer, NumBytes - EncodedBytes, Subtarget);
+}
+
+void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+ assert(Subtarget->is64Bit() && "XRay custom events only supports X86-64");
+
+ NoAutoPaddingScope NoPadScope(*OutStreamer);
+
+ // We want to emit the following pattern, which follows the x86 calling
+ // convention to prepare for the trampoline call to be patched in.
+ //
+ // .p2align 1, ...
+ // .Lxray_event_sled_N:
+ // jmp +N // jump across the instrumentation sled
+ // ... // set up arguments in register
+ // callq __xray_CustomEvent@plt // force dependency to symbol
+ // ...
+ // <jump here>
+ //
+ // After patching, it would look something like:
+ //
+ // nopw (2-byte nop)
+ // ...
+  //    callq __xray_CustomEvent  // already lowered
+ // ...
+ //
+ // ---
+ // First we emit the label and the jump.
+ auto CurSled = OutContext.createTempSymbol("xray_event_sled_", true);
+ OutStreamer->AddComment("# XRay Custom Event Log");
+ OutStreamer->emitCodeAlignment(2);
+ OutStreamer->emitLabel(CurSled);
+
+ // Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
+ // an operand (computed as an offset from the jmp instruction).
+  // FIXME: Find another less hacky way to force the relative jump.
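+  //
+  // The 0x0f displacement skips the 15-byte sled body emitted below: 4 bytes
+  // of push+mov (or nops) per argument, the 5-byte call, and one pop (or nop)
+  // per argument to restore.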
+ OutStreamer->emitBinaryData("\xeb\x0f");
+
+  // The default C calling convention (SystemV) will place the two arguments
+  // into %rdi and %rsi -- so we only work with those.
+ const Register DestRegs[] = {X86::RDI, X86::RSI};
+ bool UsedMask[] = {false, false};
+ // Filled out in loop.
+ Register SrcRegs[] = {0, 0};
+
+ // Then we put the operands in the %rdi and %rsi registers. We spill the
+  // values in the registers before we clobber them, and mark them as used in
+  // UsedMask. In case the arguments are already in the correct register, we
+  // emit nops appropriately sized to keep the sled the same size in every
+  // situation.
+ for (unsigned I = 0; I < MI.getNumOperands(); ++I)
+ if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I))) {
+ assert(Op->isReg() && "Only support arguments in registers");
+ SrcRegs[I] = getX86SubSuperRegister(Op->getReg(), 64);
+ if (SrcRegs[I] != DestRegs[I]) {
+ UsedMask[I] = true;
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::PUSH64r).addReg(DestRegs[I]));
+ } else {
+ emitX86Nops(*OutStreamer, 4, Subtarget);
+ }
+ }
+
+ // Now that the register values are stashed, mov arguments into place.
+ // FIXME: This doesn't work if one of the later SrcRegs is equal to an
+  // earlier DestReg. We will have already overwritten the register before
+ // we can copy from it.
+ for (unsigned I = 0; I < MI.getNumOperands(); ++I)
+ if (SrcRegs[I] != DestRegs[I])
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::MOV64rr).addReg(DestRegs[I]).addReg(SrcRegs[I]));
+
+ // We emit a hard dependency on the __xray_CustomEvent symbol, which is the
+ // name of the trampoline to be implemented by the XRay runtime.
+ auto TSym = OutContext.getOrCreateSymbol("__xray_CustomEvent");
+ MachineOperand TOp = MachineOperand::CreateMCSymbol(TSym);
+ if (isPositionIndependent())
+ TOp.setTargetFlags(X86II::MO_PLT);
+
+ // Emit the call instruction.
+ EmitAndCountInstruction(MCInstBuilder(X86::CALL64pcrel32)
+ .addOperand(MCIL.LowerSymbolOperand(TOp, TSym)));
+
+ // Restore caller-saved and used registers.
+ for (unsigned I = sizeof UsedMask; I-- > 0;)
+ if (UsedMask[I])
+ EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(DestRegs[I]));
+ else
+ emitX86Nops(*OutStreamer, 1, Subtarget);
+
+ OutStreamer->AddComment("xray custom event end.");
+
+ // Record the sled version. Version 0 of this sled was spelled differently, so
+ // we let the runtime handle the different offsets we're using. Version 2
+ // changed the absolute address to a PC-relative address.
+ recordSled(CurSled, MI, SledKind::CUSTOM_EVENT, 2);
+}
+
+void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+ assert(Subtarget->is64Bit() && "XRay typed events only supports X86-64");
+
+ NoAutoPaddingScope NoPadScope(*OutStreamer);
+
+ // We want to emit the following pattern, which follows the x86 calling
+ // convention to prepare for the trampoline call to be patched in.
+ //
+ // .p2align 1, ...
+ // .Lxray_event_sled_N:
+ // jmp +N // jump across the instrumentation sled
+ // ... // set up arguments in register
+ // callq __xray_TypedEvent@plt // force dependency to symbol
+ // ...
+ // <jump here>
+ //
+ // After patching, it would look something like:
+ //
+ // nopw (2-byte nop)
+ // ...
+  //    callq __xray_TypedEvent  // already lowered
+ // ...
+ //
+ // ---
+ // First we emit the label and the jump.
+ auto CurSled = OutContext.createTempSymbol("xray_typed_event_sled_", true);
+ OutStreamer->AddComment("# XRay Typed Event Log");
+ OutStreamer->emitCodeAlignment(2);
+ OutStreamer->emitLabel(CurSled);
+
+ // Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
+ // an operand (computed as an offset from the jmp instruction).
+  // FIXME: Find another less hacky way to force the relative jump.
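+  //
+  // The 0x14 displacement skips the 20-byte sled body emitted below: 4 bytes
+  // of push+mov (or nops) for each of the three arguments, the 5-byte call,
+  // and one pop (or nop) per argument to restore.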
+ OutStreamer->emitBinaryData("\xeb\x14");
+
+  // Depending on the calling convention, the three event operands may arrive
+  // in %rcx, %rdx and %r8, in which case we move them into the SystemV
+  // argument registers below. If we were called with the SystemV convention,
+  // they are already in place and no translation is needed.
+ const Register DestRegs[] = {X86::RDI, X86::RSI, X86::RDX};
+ bool UsedMask[] = {false, false, false};
+
+ // Will fill out src regs in the loop.
+ Register SrcRegs[] = {0, 0, 0};
+
+ // Then we put the operands in the SystemV registers. We spill the values in
+ // the registers before we clobber them, and mark them as used in UsedMask.
+ // In case the arguments are already in the correct register, we emit nops
+ // appropriately sized to keep the sled the same size in every situation.
+ for (unsigned I = 0; I < MI.getNumOperands(); ++I)
+ if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I))) {
+      // TODO: Is register-only support adequate?
+ assert(Op->isReg() && "Only supports arguments in registers");
+ SrcRegs[I] = getX86SubSuperRegister(Op->getReg(), 64);
+ if (SrcRegs[I] != DestRegs[I]) {
+ UsedMask[I] = true;
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::PUSH64r).addReg(DestRegs[I]));
+ } else {
+ emitX86Nops(*OutStreamer, 4, Subtarget);
+ }
+ }
+
+ // In the above loop we only stash all of the destination registers or emit
+  // nops if the arguments are already in the right place. Doing the actual
+  // moves is postponed until after all the registers are stashed so nothing
+  // is clobbered. We've already added nops to account for the size of mov and
+ // push if the register is in the right place, so we only have to worry about
+ // emitting movs.
+ // FIXME: This doesn't work if one of the later SrcRegs is equal to an
+  // earlier DestReg. We will have already overwritten the register before
+ // we can copy from it.
+ for (unsigned I = 0; I < MI.getNumOperands(); ++I)
+ if (UsedMask[I])
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::MOV64rr).addReg(DestRegs[I]).addReg(SrcRegs[I]));
+
+ // We emit a hard dependency on the __xray_TypedEvent symbol, which is the
+ // name of the trampoline to be implemented by the XRay runtime.
+ auto TSym = OutContext.getOrCreateSymbol("__xray_TypedEvent");
+ MachineOperand TOp = MachineOperand::CreateMCSymbol(TSym);
+ if (isPositionIndependent())
+ TOp.setTargetFlags(X86II::MO_PLT);
+
+ // Emit the call instruction.
+ EmitAndCountInstruction(MCInstBuilder(X86::CALL64pcrel32)
+ .addOperand(MCIL.LowerSymbolOperand(TOp, TSym)));
+
+ // Restore caller-saved and used registers.
+ for (unsigned I = sizeof UsedMask; I-- > 0;)
+ if (UsedMask[I])
+ EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(DestRegs[I]));
+ else
+ emitX86Nops(*OutStreamer, 1, Subtarget);
+
+ OutStreamer->AddComment("xray typed event end.");
+
+ // Record the sled version.
+ recordSled(CurSled, MI, SledKind::TYPED_EVENT, 2);
+}
+
+void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+
+ NoAutoPaddingScope NoPadScope(*OutStreamer);
+
+ const Function &F = MF->getFunction();
+ if (F.hasFnAttribute("patchable-function-entry")) {
+ unsigned Num;
+ if (F.getFnAttribute("patchable-function-entry")
+ .getValueAsString()
+ .getAsInteger(10, Num))
+ return;
+ emitX86Nops(*OutStreamer, Num, Subtarget);
+ return;
+ }
+ // We want to emit the following pattern:
+ //
+ // .p2align 1, ...
+ // .Lxray_sled_N:
+ // jmp .tmpN
+ // # 9 bytes worth of noops
+ //
+ // We need the 9 bytes because at runtime, we'd be patching over the full 11
+ // bytes with the following pattern:
+ //
+ // mov %r10, <function id, 32-bit> // 6 bytes
+ // call <relative offset, 32-bits> // 5 bytes
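+  //
+  // The unpatched sled below occupies the same 11 bytes: the 2-byte jmp plus
+  // the 9 bytes of nops.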
+ //
+ auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
+ OutStreamer->emitCodeAlignment(2);
+ OutStreamer->emitLabel(CurSled);
+
+ // Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
+ // an operand (computed as an offset from the jmp instruction).
+  // FIXME: Find another less hacky way to force the relative jump.
+ OutStreamer->emitBytes("\xeb\x09");
+ emitX86Nops(*OutStreamer, 9, Subtarget);
+ recordSled(CurSled, MI, SledKind::FUNCTION_ENTER, 2);
+}
+
+void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+ NoAutoPaddingScope NoPadScope(*OutStreamer);
+
+ // Since PATCHABLE_RET takes the opcode of the return statement as an
+ // argument, we use that to emit the correct form of the RET that we want.
+ // i.e. when we see this:
+ //
+ // PATCHABLE_RET X86::RET ...
+ //
+ // We should emit the RET followed by sleds.
+ //
+ // .p2align 1, ...
+ // .Lxray_sled_N:
+ // ret # or equivalent instruction
+ // # 10 bytes worth of noops
+ //
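+  //
+  // With a plain 1-byte ret this gives the same 11-byte patch window as the
+  // entry sled.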
+ // This just makes sure that the alignment for the next instruction is 2.
+ auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
+ OutStreamer->emitCodeAlignment(2);
+ OutStreamer->emitLabel(CurSled);
+ unsigned OpCode = MI.getOperand(0).getImm();
+ MCInst Ret;
+ Ret.setOpcode(OpCode);
+ for (auto &MO : drop_begin(MI.operands()))
+ if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
+ Ret.addOperand(MaybeOperand.getValue());
+ OutStreamer->emitInstruction(Ret, getSubtargetInfo());
+ emitX86Nops(*OutStreamer, 10, Subtarget);
+ recordSled(CurSled, MI, SledKind::FUNCTION_EXIT, 2);
+}
+
+void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+ NoAutoPaddingScope NoPadScope(*OutStreamer);
+
+ // Like PATCHABLE_RET, we have the actual instruction in the operands to this
+ // instruction so we lower that particular instruction and its operands.
+ // Unlike PATCHABLE_RET though, we put the sled before the JMP, much like how
+ // we do it for PATCHABLE_FUNCTION_ENTER. The sled should be very similar to
+ // the PATCHABLE_FUNCTION_ENTER case, followed by the lowering of the actual
+ // tail call much like how we have it in PATCHABLE_RET.
+ auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
+ OutStreamer->emitCodeAlignment(2);
+ OutStreamer->emitLabel(CurSled);
+ auto Target = OutContext.createTempSymbol();
+
+ // Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
+ // an operand (computed as an offset from the jmp instruction).
+  // FIXME: Find another less hacky way to force the relative jump.
+ OutStreamer->emitBytes("\xeb\x09");
+ emitX86Nops(*OutStreamer, 9, Subtarget);
+ OutStreamer->emitLabel(Target);
+ recordSled(CurSled, MI, SledKind::TAIL_CALL, 2);
+
+ unsigned OpCode = MI.getOperand(0).getImm();
+ OpCode = convertTailJumpOpcode(OpCode);
+ MCInst TC;
+ TC.setOpcode(OpCode);
+
+ // Before emitting the instruction, add a comment to indicate that this is
+ // indeed a tail call.
+ OutStreamer->AddComment("TAILCALL");
+ for (auto &MO : drop_begin(MI.operands()))
+ if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
+ TC.addOperand(MaybeOperand.getValue());
+ OutStreamer->emitInstruction(TC, getSubtargetInfo());
+}
+
+// Returns instruction preceding MBBI in MachineFunction.
+// If MBBI is the first instruction of the first basic block, returns null.
+static MachineBasicBlock::const_iterator
+PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) {
+ const MachineBasicBlock *MBB = MBBI->getParent();
+ while (MBBI == MBB->begin()) {
+ if (MBB == &MBB->getParent()->front())
+ return MachineBasicBlock::const_iterator();
+ MBB = MBB->getPrevNode();
+ MBBI = MBB->end();
+ }
+ --MBBI;
+ return MBBI;
+}
+
+static const Constant *getConstantFromPool(const MachineInstr &MI,
+ const MachineOperand &Op) {
+ if (!Op.isCPI() || Op.getOffset() != 0)
+ return nullptr;
+
+ ArrayRef<MachineConstantPoolEntry> Constants =
+ MI.getParent()->getParent()->getConstantPool()->getConstants();
+ const MachineConstantPoolEntry &ConstantEntry = Constants[Op.getIndex()];
+
+ // Bail if this is a machine constant pool entry, we won't be able to dig out
+ // anything useful.
+ if (ConstantEntry.isMachineConstantPoolEntry())
+ return nullptr;
+
+ return ConstantEntry.Val.ConstVal;
+}
+
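+// Produce a human-readable shuffle-mask comment for the destination operand.
+// For example, a 4-element mask {0, 1, zero, 3} reading from a single xmm
+// source would print as "xmm0 = xmm1[0,1],zero,xmm1[3]".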
+static std::string getShuffleComment(const MachineInstr *MI, unsigned SrcOp1Idx,
+ unsigned SrcOp2Idx, ArrayRef<int> Mask) {
+ std::string Comment;
+
+ // Compute the name for a register. This is really goofy because we have
+ // multiple instruction printers that could (in theory) use different
+ // names. Fortunately most people use the ATT style (outside of Windows)
+ // and they actually agree on register naming here. Ultimately, this is
+  // a comment, and so it's OK if it isn't perfect.
+ auto GetRegisterName = [](unsigned RegNum) -> StringRef {
+ return X86ATTInstPrinter::getRegisterName(RegNum);
+ };
+
+ const MachineOperand &DstOp = MI->getOperand(0);
+ const MachineOperand &SrcOp1 = MI->getOperand(SrcOp1Idx);
+ const MachineOperand &SrcOp2 = MI->getOperand(SrcOp2Idx);
+
+ StringRef DstName = DstOp.isReg() ? GetRegisterName(DstOp.getReg()) : "mem";
+ StringRef Src1Name =
+ SrcOp1.isReg() ? GetRegisterName(SrcOp1.getReg()) : "mem";
+ StringRef Src2Name =
+ SrcOp2.isReg() ? GetRegisterName(SrcOp2.getReg()) : "mem";
+
+ // One source operand, fix the mask to print all elements in one span.
+ SmallVector<int, 8> ShuffleMask(Mask.begin(), Mask.end());
+ if (Src1Name == Src2Name)
+ for (int i = 0, e = ShuffleMask.size(); i != e; ++i)
+ if (ShuffleMask[i] >= e)
+ ShuffleMask[i] -= e;
+
+ raw_string_ostream CS(Comment);
+ CS << DstName;
+
+  // Handle AVX512 MASK/MASKZ write mask comments.
+ // MASK: zmmX {%kY}
+ // MASKZ: zmmX {%kY} {z}
+ if (SrcOp1Idx > 1) {
+ assert((SrcOp1Idx == 2 || SrcOp1Idx == 3) && "Unexpected writemask");
+
+ const MachineOperand &WriteMaskOp = MI->getOperand(SrcOp1Idx - 1);
+ if (WriteMaskOp.isReg()) {
+ CS << " {%" << GetRegisterName(WriteMaskOp.getReg()) << "}";
+
+ if (SrcOp1Idx == 2) {
+ CS << " {z}";
+ }
+ }
+ }
+
+ CS << " = ";
+
+ for (int i = 0, e = ShuffleMask.size(); i != e; ++i) {
+ if (i != 0)
+ CS << ",";
+ if (ShuffleMask[i] == SM_SentinelZero) {
+ CS << "zero";
+ continue;
+ }
+
+ // Otherwise, it must come from src1 or src2. Print the span of elements
+ // that comes from this src.
+ bool isSrc1 = ShuffleMask[i] < (int)e;
+ CS << (isSrc1 ? Src1Name : Src2Name) << '[';
+
+ bool IsFirst = true;
+ while (i != e && ShuffleMask[i] != SM_SentinelZero &&
+ (ShuffleMask[i] < (int)e) == isSrc1) {
+ if (!IsFirst)
+ CS << ',';
+ else
+ IsFirst = false;
+ if (ShuffleMask[i] == SM_SentinelUndef)
+ CS << "u";
+ else
+ CS << ShuffleMask[i] % (int)e;
+ ++i;
+ }
+ CS << ']';
+ --i; // For loop increments element #.
+ }
+ CS.flush();
+
+ return Comment;
+}
+
+static void printConstant(const APInt &Val, raw_ostream &CS) {
+ if (Val.getBitWidth() <= 64) {
+ CS << Val.getZExtValue();
+ } else {
+ // print multi-word constant as (w0,w1)
+ CS << "(";
+ for (int i = 0, N = Val.getNumWords(); i < N; ++i) {
+ if (i > 0)
+ CS << ",";
+ CS << Val.getRawData()[i];
+ }
+ CS << ")";
+ }
+}
+
+static void printConstant(const APFloat &Flt, raw_ostream &CS) {
+ SmallString<32> Str;
+  // Force scientific notation to distinguish from integers.
+ Flt.toString(Str, 0, 0);
+ CS << Str;
+}
+
+static void printConstant(const Constant *COp, raw_ostream &CS) {
+ if (isa<UndefValue>(COp)) {
+ CS << "u";
+ } else if (auto *CI = dyn_cast<ConstantInt>(COp)) {
+ printConstant(CI->getValue(), CS);
+ } else if (auto *CF = dyn_cast<ConstantFP>(COp)) {
+ printConstant(CF->getValueAPF(), CS);
+ } else {
+ CS << "?";
+ }
+}
+
+void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) {
+ assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
+ assert(getSubtarget().isOSWindows() && "SEH_ instruction Windows only");
+
+ // Use the .cv_fpo directives if we're emitting CodeView on 32-bit x86.
+ if (EmitFPOData) {
+ X86TargetStreamer *XTS =
+ static_cast<X86TargetStreamer *>(OutStreamer->getTargetStreamer());
+ switch (MI->getOpcode()) {
+ case X86::SEH_PushReg:
+ XTS->emitFPOPushReg(MI->getOperand(0).getImm());
+ break;
+ case X86::SEH_StackAlloc:
+ XTS->emitFPOStackAlloc(MI->getOperand(0).getImm());
+ break;
+ case X86::SEH_StackAlign:
+ XTS->emitFPOStackAlign(MI->getOperand(0).getImm());
+ break;
+ case X86::SEH_SetFrame:
+ assert(MI->getOperand(1).getImm() == 0 &&
+ ".cv_fpo_setframe takes no offset");
+ XTS->emitFPOSetFrame(MI->getOperand(0).getImm());
+ break;
+ case X86::SEH_EndPrologue:
+ XTS->emitFPOEndPrologue();
+ break;
+ case X86::SEH_SaveReg:
+ case X86::SEH_SaveXMM:
+ case X86::SEH_PushFrame:
+ llvm_unreachable("SEH_ directive incompatible with FPO");
+ break;
+ default:
+ llvm_unreachable("expected SEH_ instruction");
+ }
+ return;
+ }
+
+ // Otherwise, use the .seh_ directives for all other Windows platforms.
+ switch (MI->getOpcode()) {
+ case X86::SEH_PushReg:
+ OutStreamer->EmitWinCFIPushReg(MI->getOperand(0).getImm());
+ break;
+
+ case X86::SEH_SaveReg:
+ OutStreamer->EmitWinCFISaveReg(MI->getOperand(0).getImm(),
+ MI->getOperand(1).getImm());
+ break;
+
+ case X86::SEH_SaveXMM:
+ OutStreamer->EmitWinCFISaveXMM(MI->getOperand(0).getImm(),
+ MI->getOperand(1).getImm());
+ break;
+
+ case X86::SEH_StackAlloc:
+ OutStreamer->EmitWinCFIAllocStack(MI->getOperand(0).getImm());
+ break;
+
+ case X86::SEH_SetFrame:
+ OutStreamer->EmitWinCFISetFrame(MI->getOperand(0).getImm(),
+ MI->getOperand(1).getImm());
+ break;
+
+ case X86::SEH_PushFrame:
+ OutStreamer->EmitWinCFIPushFrame(MI->getOperand(0).getImm());
+ break;
+
+ case X86::SEH_EndPrologue:
+ OutStreamer->EmitWinCFIEndProlog();
+ break;
+
+ default:
+ llvm_unreachable("expected SEH_ instruction");
+ }
+}
+
+static unsigned getRegisterWidth(const MCOperandInfo &Info) {
+ if (Info.RegClass == X86::VR128RegClassID ||
+ Info.RegClass == X86::VR128XRegClassID)
+ return 128;
+ if (Info.RegClass == X86::VR256RegClassID ||
+ Info.RegClass == X86::VR256XRegClassID)
+ return 256;
+ if (Info.RegClass == X86::VR512RegClassID)
+ return 512;
+ llvm_unreachable("Unknown register class!");
+}
+
+static void addConstantComments(const MachineInstr *MI,
+ MCStreamer &OutStreamer) {
+ switch (MI->getOpcode()) {
+ // Lower PSHUFB and VPERMILP normally but add a comment if we can find
+ // a constant shuffle mask. We won't be able to do this at the MC layer
+ // because the mask isn't an immediate.
+ case X86::PSHUFBrm:
+ case X86::VPSHUFBrm:
+ case X86::VPSHUFBYrm:
+ case X86::VPSHUFBZ128rm:
+ case X86::VPSHUFBZ128rmk:
+ case X86::VPSHUFBZ128rmkz:
+ case X86::VPSHUFBZ256rm:
+ case X86::VPSHUFBZ256rmk:
+ case X86::VPSHUFBZ256rmkz:
+ case X86::VPSHUFBZrm:
+ case X86::VPSHUFBZrmk:
+ case X86::VPSHUFBZrmkz: {
+ unsigned SrcIdx = 1;
+ if (X86II::isKMasked(MI->getDesc().TSFlags)) {
+ // Skip mask operand.
+ ++SrcIdx;
+ if (X86II::isKMergeMasked(MI->getDesc().TSFlags)) {
+ // Skip passthru operand.
+ ++SrcIdx;
+ }
+ }
+ unsigned MaskIdx = SrcIdx + 1 + X86::AddrDisp;
+
+ assert(MI->getNumOperands() >= (SrcIdx + 1 + X86::AddrNumOperands) &&
+ "Unexpected number of operands!");
+
+ const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
+ if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+ unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
+ SmallVector<int, 64> Mask;
+ DecodePSHUFBMask(C, Width, Mask);
+ if (!Mask.empty())
+ OutStreamer.AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
+ }
+ break;
+ }
+
+ case X86::VPERMILPSrm:
+ case X86::VPERMILPSYrm:
+ case X86::VPERMILPSZ128rm:
+ case X86::VPERMILPSZ128rmk:
+ case X86::VPERMILPSZ128rmkz:
+ case X86::VPERMILPSZ256rm:
+ case X86::VPERMILPSZ256rmk:
+ case X86::VPERMILPSZ256rmkz:
+ case X86::VPERMILPSZrm:
+ case X86::VPERMILPSZrmk:
+ case X86::VPERMILPSZrmkz:
+ case X86::VPERMILPDrm:
+ case X86::VPERMILPDYrm:
+ case X86::VPERMILPDZ128rm:
+ case X86::VPERMILPDZ128rmk:
+ case X86::VPERMILPDZ128rmkz:
+ case X86::VPERMILPDZ256rm:
+ case X86::VPERMILPDZ256rmk:
+ case X86::VPERMILPDZ256rmkz:
+ case X86::VPERMILPDZrm:
+ case X86::VPERMILPDZrmk:
+ case X86::VPERMILPDZrmkz: {
+ unsigned ElSize;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VPERMILPSrm:
+ case X86::VPERMILPSYrm:
+ case X86::VPERMILPSZ128rm:
+ case X86::VPERMILPSZ256rm:
+ case X86::VPERMILPSZrm:
+ case X86::VPERMILPSZ128rmkz:
+ case X86::VPERMILPSZ256rmkz:
+ case X86::VPERMILPSZrmkz:
+ case X86::VPERMILPSZ128rmk:
+ case X86::VPERMILPSZ256rmk:
+ case X86::VPERMILPSZrmk:
+ ElSize = 32;
+ break;
+ case X86::VPERMILPDrm:
+ case X86::VPERMILPDYrm:
+ case X86::VPERMILPDZ128rm:
+ case X86::VPERMILPDZ256rm:
+ case X86::VPERMILPDZrm:
+ case X86::VPERMILPDZ128rmkz:
+ case X86::VPERMILPDZ256rmkz:
+ case X86::VPERMILPDZrmkz:
+ case X86::VPERMILPDZ128rmk:
+ case X86::VPERMILPDZ256rmk:
+ case X86::VPERMILPDZrmk:
+ ElSize = 64;
+ break;
+ }
+
+ unsigned SrcIdx = 1;
+ if (X86II::isKMasked(MI->getDesc().TSFlags)) {
+ // Skip mask operand.
+ ++SrcIdx;
+ if (X86II::isKMergeMasked(MI->getDesc().TSFlags)) {
+ // Skip passthru operand.
+ ++SrcIdx;
+ }
+ }
+ unsigned MaskIdx = SrcIdx + 1 + X86::AddrDisp;
+
+ assert(MI->getNumOperands() >= (SrcIdx + 1 + X86::AddrNumOperands) &&
+ "Unexpected number of operands!");
+
+ const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
+ if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+ unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
+ SmallVector<int, 16> Mask;
+ DecodeVPERMILPMask(C, ElSize, Width, Mask);
+ if (!Mask.empty())
+ OutStreamer.AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
+ }
+ break;
+ }
+
+ case X86::VPERMIL2PDrm:
+ case X86::VPERMIL2PSrm:
+ case X86::VPERMIL2PDYrm:
+ case X86::VPERMIL2PSYrm: {
+ assert(MI->getNumOperands() >= (3 + X86::AddrNumOperands + 1) &&
+ "Unexpected number of operands!");
+
+ const MachineOperand &CtrlOp = MI->getOperand(MI->getNumOperands() - 1);
+ if (!CtrlOp.isImm())
+ break;
+
+ unsigned ElSize;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VPERMIL2PSrm: case X86::VPERMIL2PSYrm: ElSize = 32; break;
+ case X86::VPERMIL2PDrm: case X86::VPERMIL2PDYrm: ElSize = 64; break;
+ }
+
+ const MachineOperand &MaskOp = MI->getOperand(3 + X86::AddrDisp);
+ if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+ unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
+ SmallVector<int, 16> Mask;
+ DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Width, Mask);
+ if (!Mask.empty())
+ OutStreamer.AddComment(getShuffleComment(MI, 1, 2, Mask));
+ }
+ break;
+ }
+
+ case X86::VPPERMrrm: {
+ assert(MI->getNumOperands() >= (3 + X86::AddrNumOperands) &&
+ "Unexpected number of operands!");
+
+ const MachineOperand &MaskOp = MI->getOperand(3 + X86::AddrDisp);
+ if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+ unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
+ SmallVector<int, 16> Mask;
+ DecodeVPPERMMask(C, Width, Mask);
+ if (!Mask.empty())
+ OutStreamer.AddComment(getShuffleComment(MI, 1, 2, Mask));
+ }
+ break;
+ }
+
+ case X86::MMX_MOVQ64rm: {
+ assert(MI->getNumOperands() == (1 + X86::AddrNumOperands) &&
+ "Unexpected number of operands!");
+ if (auto *C = getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) {
+ std::string Comment;
+ raw_string_ostream CS(Comment);
+ const MachineOperand &DstOp = MI->getOperand(0);
+ CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
+ if (auto *CF = dyn_cast<ConstantFP>(C)) {
+ CS << "0x" << CF->getValueAPF().bitcastToAPInt().toString(16, false);
+ OutStreamer.AddComment(CS.str());
+ }
+ }
+ break;
+ }
+
+#define MOV_CASE(Prefix, Suffix) \
+ case X86::Prefix##MOVAPD##Suffix##rm: \
+ case X86::Prefix##MOVAPS##Suffix##rm: \
+ case X86::Prefix##MOVUPD##Suffix##rm: \
+ case X86::Prefix##MOVUPS##Suffix##rm: \
+ case X86::Prefix##MOVDQA##Suffix##rm: \
+ case X86::Prefix##MOVDQU##Suffix##rm:
+
+#define MOV_AVX512_CASE(Suffix) \
+ case X86::VMOVDQA64##Suffix##rm: \
+ case X86::VMOVDQA32##Suffix##rm: \
+ case X86::VMOVDQU64##Suffix##rm: \
+ case X86::VMOVDQU32##Suffix##rm: \
+ case X86::VMOVDQU16##Suffix##rm: \
+ case X86::VMOVDQU8##Suffix##rm: \
+ case X86::VMOVAPS##Suffix##rm: \
+ case X86::VMOVAPD##Suffix##rm: \
+ case X86::VMOVUPS##Suffix##rm: \
+ case X86::VMOVUPD##Suffix##rm:
+
+#define CASE_ALL_MOV_RM() \
+ MOV_CASE(, ) /* SSE */ \
+ MOV_CASE(V, ) /* AVX-128 */ \
+ MOV_CASE(V, Y) /* AVX-256 */ \
+ MOV_AVX512_CASE(Z) \
+ MOV_AVX512_CASE(Z256) \
+ MOV_AVX512_CASE(Z128)
+
+ // For loads from a constant pool to a vector register, print the constant
+ // loaded.
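+  //
+  // For the subvector broadcasts handled below, the constant's elements are
+  // printed once per lane; e.g. VBROADCASTF128 prints the elements of its
+  // 128-bit constant twice, once for each 128-bit lane of the destination.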
+ CASE_ALL_MOV_RM()
+ case X86::VBROADCASTF128:
+ case X86::VBROADCASTI128:
+ case X86::VBROADCASTF32X4Z256rm:
+ case X86::VBROADCASTF32X4rm:
+ case X86::VBROADCASTF32X8rm:
+ case X86::VBROADCASTF64X2Z128rm:
+ case X86::VBROADCASTF64X2rm:
+ case X86::VBROADCASTF64X4rm:
+ case X86::VBROADCASTI32X4Z256rm:
+ case X86::VBROADCASTI32X4rm:
+ case X86::VBROADCASTI32X8rm:
+ case X86::VBROADCASTI64X2Z128rm:
+ case X86::VBROADCASTI64X2rm:
+ case X86::VBROADCASTI64X4rm:
+ assert(MI->getNumOperands() >= (1 + X86::AddrNumOperands) &&
+ "Unexpected number of operands!");
+ if (auto *C = getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) {
+ int NumLanes = 1;
+ // Override NumLanes for the broadcast instructions.
+ switch (MI->getOpcode()) {
+ case X86::VBROADCASTF128: NumLanes = 2; break;
+ case X86::VBROADCASTI128: NumLanes = 2; break;
+ case X86::VBROADCASTF32X4Z256rm: NumLanes = 2; break;
+ case X86::VBROADCASTF32X4rm: NumLanes = 4; break;
+ case X86::VBROADCASTF32X8rm: NumLanes = 2; break;
+ case X86::VBROADCASTF64X2Z128rm: NumLanes = 2; break;
+ case X86::VBROADCASTF64X2rm: NumLanes = 4; break;
+ case X86::VBROADCASTF64X4rm: NumLanes = 2; break;
+ case X86::VBROADCASTI32X4Z256rm: NumLanes = 2; break;
+ case X86::VBROADCASTI32X4rm: NumLanes = 4; break;
+ case X86::VBROADCASTI32X8rm: NumLanes = 2; break;
+ case X86::VBROADCASTI64X2Z128rm: NumLanes = 2; break;
+ case X86::VBROADCASTI64X2rm: NumLanes = 4; break;
+ case X86::VBROADCASTI64X4rm: NumLanes = 2; break;
+ }
+
+ std::string Comment;
+ raw_string_ostream CS(Comment);
+ const MachineOperand &DstOp = MI->getOperand(0);
+ CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
+ if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
+ CS << "[";
+ for (int l = 0; l != NumLanes; ++l) {
+ for (int i = 0, NumElements = CDS->getNumElements(); i < NumElements;
+ ++i) {
+ if (i != 0 || l != 0)
+ CS << ",";
+ if (CDS->getElementType()->isIntegerTy())
+ printConstant(CDS->getElementAsAPInt(i), CS);
+ else if (CDS->getElementType()->isHalfTy() ||
+ CDS->getElementType()->isFloatTy() ||
+ CDS->getElementType()->isDoubleTy())
+ printConstant(CDS->getElementAsAPFloat(i), CS);
+ else
+ CS << "?";
+ }
+ }
+ CS << "]";
+ OutStreamer.AddComment(CS.str());
+ } else if (auto *CV = dyn_cast<ConstantVector>(C)) {
+ CS << "<";
+ for (int l = 0; l != NumLanes; ++l) {
+ for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands;
+ ++i) {
+ if (i != 0 || l != 0)
+ CS << ",";
+ printConstant(CV->getOperand(i), CS);
+ }
+ }
+ CS << ">";
+ OutStreamer.AddComment(CS.str());
+ }
+ }
+ break;
+
+ case X86::MOVDDUPrm:
+ case X86::VMOVDDUPrm:
+ case X86::VMOVDDUPZ128rm:
+ case X86::VBROADCASTSSrm:
+ case X86::VBROADCASTSSYrm:
+ case X86::VBROADCASTSSZ128rm:
+ case X86::VBROADCASTSSZ256rm:
+ case X86::VBROADCASTSSZrm:
+ case X86::VBROADCASTSDYrm:
+ case X86::VBROADCASTSDZ256rm:
+ case X86::VBROADCASTSDZrm:
+ case X86::VPBROADCASTBrm:
+ case X86::VPBROADCASTBYrm:
+ case X86::VPBROADCASTBZ128rm:
+ case X86::VPBROADCASTBZ256rm:
+ case X86::VPBROADCASTBZrm:
+ case X86::VPBROADCASTDrm:
+ case X86::VPBROADCASTDYrm:
+ case X86::VPBROADCASTDZ128rm:
+ case X86::VPBROADCASTDZ256rm:
+ case X86::VPBROADCASTDZrm:
+ case X86::VPBROADCASTQrm:
+ case X86::VPBROADCASTQYrm:
+ case X86::VPBROADCASTQZ128rm:
+ case X86::VPBROADCASTQZ256rm:
+ case X86::VPBROADCASTQZrm:
+ case X86::VPBROADCASTWrm:
+ case X86::VPBROADCASTWYrm:
+ case X86::VPBROADCASTWZ128rm:
+ case X86::VPBROADCASTWZ256rm:
+ case X86::VPBROADCASTWZrm:
+ assert(MI->getNumOperands() >= (1 + X86::AddrNumOperands) &&
+ "Unexpected number of operands!");
+ if (auto *C = getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) {
+ int NumElts;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::MOVDDUPrm: NumElts = 2; break;
+ case X86::VMOVDDUPrm: NumElts = 2; break;
+ case X86::VMOVDDUPZ128rm: NumElts = 2; break;
+ case X86::VBROADCASTSSrm: NumElts = 4; break;
+ case X86::VBROADCASTSSYrm: NumElts = 8; break;
+ case X86::VBROADCASTSSZ128rm: NumElts = 4; break;
+ case X86::VBROADCASTSSZ256rm: NumElts = 8; break;
+ case X86::VBROADCASTSSZrm: NumElts = 16; break;
+ case X86::VBROADCASTSDYrm: NumElts = 4; break;
+ case X86::VBROADCASTSDZ256rm: NumElts = 4; break;
+ case X86::VBROADCASTSDZrm: NumElts = 8; break;
+ case X86::VPBROADCASTBrm: NumElts = 16; break;
+ case X86::VPBROADCASTBYrm: NumElts = 32; break;
+ case X86::VPBROADCASTBZ128rm: NumElts = 16; break;
+ case X86::VPBROADCASTBZ256rm: NumElts = 32; break;
+ case X86::VPBROADCASTBZrm: NumElts = 64; break;
+ case X86::VPBROADCASTDrm: NumElts = 4; break;
+ case X86::VPBROADCASTDYrm: NumElts = 8; break;
+ case X86::VPBROADCASTDZ128rm: NumElts = 4; break;
+ case X86::VPBROADCASTDZ256rm: NumElts = 8; break;
+ case X86::VPBROADCASTDZrm: NumElts = 16; break;
+ case X86::VPBROADCASTQrm: NumElts = 2; break;
+ case X86::VPBROADCASTQYrm: NumElts = 4; break;
+ case X86::VPBROADCASTQZ128rm: NumElts = 2; break;
+ case X86::VPBROADCASTQZ256rm: NumElts = 4; break;
+ case X86::VPBROADCASTQZrm: NumElts = 8; break;
+ case X86::VPBROADCASTWrm: NumElts = 8; break;
+ case X86::VPBROADCASTWYrm: NumElts = 16; break;
+ case X86::VPBROADCASTWZ128rm: NumElts = 8; break;
+ case X86::VPBROADCASTWZ256rm: NumElts = 16; break;
+ case X86::VPBROADCASTWZrm: NumElts = 32; break;
+ }
+
+ std::string Comment;
+ raw_string_ostream CS(Comment);
+ const MachineOperand &DstOp = MI->getOperand(0);
+ CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
+ CS << "[";
+ for (int i = 0; i != NumElts; ++i) {
+ if (i != 0)
+ CS << ",";
+ printConstant(C, CS);
+ }
+ CS << "]";
+ OutStreamer.AddComment(CS.str());
+ }
+ }
+}
+
+void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
+ X86MCInstLower MCInstLowering(*MF, *this);
+ const X86RegisterInfo *RI =
+ MF->getSubtarget<X86Subtarget>().getRegisterInfo();
+
+ // Add a comment about EVEX-2-VEX compression for AVX-512 instrs that
+ // are compressed from EVEX encoding to VEX encoding.
+ if (TM.Options.MCOptions.ShowMCEncoding) {
+ if (MI->getAsmPrinterFlags() & X86::AC_EVEX_2_VEX)
+ OutStreamer->AddComment("EVEX TO VEX Compression ", false);
+ }
+
+ // Add comments for values loaded from constant pool.
+ if (OutStreamer->isVerboseAsm())
+ addConstantComments(MI, *OutStreamer);
+
+ switch (MI->getOpcode()) {
+ case TargetOpcode::DBG_VALUE:
+ llvm_unreachable("Should be handled target independently");
+
+ // Emit nothing here but a comment if we can.
+ case X86::Int_MemBarrier:
+ OutStreamer->emitRawComment("MEMBARRIER");
+ return;
+
+ case X86::EH_RETURN:
+ case X86::EH_RETURN64: {
+ // Lower these as normal, but add some comments.
+ Register Reg = MI->getOperand(0).getReg();
+ OutStreamer->AddComment(StringRef("eh_return, addr: %") +
+ X86ATTInstPrinter::getRegisterName(Reg));
+ break;
+ }
+ case X86::CLEANUPRET: {
+ // Lower these as normal, but add some comments.
+ OutStreamer->AddComment("CLEANUPRET");
+ break;
+ }
+
+ case X86::CATCHRET: {
+ // Lower these as normal, but add some comments.
+ OutStreamer->AddComment("CATCHRET");
+ break;
+ }
+
+ case X86::ENDBR32:
+ case X86::ENDBR64: {
+ // CurrentPatchableFunctionEntrySym can be CurrentFnBegin only for
+ // -fpatchable-function-entry=N,0. The entry MBB is guaranteed to be
+ // non-empty. If MI is the initial ENDBR, place the
+ // __patchable_function_entries label after ENDBR.
+ if (CurrentPatchableFunctionEntrySym &&
+ CurrentPatchableFunctionEntrySym == CurrentFnBegin &&
+ MI == &MF->front().front()) {
+ MCInst Inst;
+ MCInstLowering.Lower(MI, Inst);
+ EmitAndCountInstruction(Inst);
+ CurrentPatchableFunctionEntrySym = createTempSymbol("patch");
+ OutStreamer->emitLabel(CurrentPatchableFunctionEntrySym);
+ return;
+ }
+ break;
+ }
+
+ case X86::TAILJMPr:
+ case X86::TAILJMPm:
+ case X86::TAILJMPd:
+ case X86::TAILJMPd_CC:
+ case X86::TAILJMPr64:
+ case X86::TAILJMPm64:
+ case X86::TAILJMPd64:
+ case X86::TAILJMPd64_CC:
+ case X86::TAILJMPr64_REX:
+ case X86::TAILJMPm64_REX:
+ // Lower these as normal, but add some comments.
+ OutStreamer->AddComment("TAILCALL");
+ break;
+
+ case X86::TLS_addr32:
+ case X86::TLS_addr64:
+ case X86::TLS_addrX32:
+ case X86::TLS_base_addr32:
+ case X86::TLS_base_addr64:
+ case X86::TLS_base_addrX32:
+ return LowerTlsAddr(MCInstLowering, *MI);
+
+ case X86::MOVPC32r: {
+ // This is a pseudo op for a two instruction sequence with a label, which
+ // looks like:
+ // call "L1$pb"
+ // "L1$pb":
+ // popl %esi
+
+ // Emit the call.
+ MCSymbol *PICBase = MF->getPICBaseSymbol();
+ // FIXME: We would like an efficient form for this, so we don't have to do a
+ // lot of extra uniquing.
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::CALLpcrel32)
+ .addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
+
+ const X86FrameLowering *FrameLowering =
+ MF->getSubtarget<X86Subtarget>().getFrameLowering();
+ bool hasFP = FrameLowering->hasFP(*MF);
+
+ // TODO: This is needed only if we require precise CFA.
+ bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() &&
+ !OutStreamer->getDwarfFrameInfos().back().End;
+
+ int stackGrowth = -RI->getSlotSize();
+
+ if (HasActiveDwarfFrame && !hasFP) {
+ OutStreamer->emitCFIAdjustCfaOffset(-stackGrowth);
+ }
+
+ // Emit the label.
+ OutStreamer->emitLabel(PICBase);
+
+ // popl $reg
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::POP32r).addReg(MI->getOperand(0).getReg()));
+
+ if (HasActiveDwarfFrame && !hasFP) {
+ OutStreamer->emitCFIAdjustCfaOffset(stackGrowth);
+ }
+ return;
+ }
+
+ case X86::ADD32ri: {
+ // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
+ if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
+ break;
+
+ // Okay, we have something like:
+ // EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL)
+
+ // For this, we want to print something like:
+ // MYGLOBAL + (. - PICBASE)
+ // However, we can't generate a ".", so just emit a new label here and refer
+ // to it.
+ MCSymbol *DotSym = OutContext.createTempSymbol();
+ OutStreamer->emitLabel(DotSym);
+
+ // Now that we have emitted the label, lower the complex operand expression.
+ MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));
+
+ const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
+ const MCExpr *PICBase =
+ MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext);
+ DotExpr = MCBinaryExpr::createSub(DotExpr, PICBase, OutContext);
+
+ DotExpr = MCBinaryExpr::createAdd(
+ MCSymbolRefExpr::create(OpSym, OutContext), DotExpr, OutContext);
+
+ EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addExpr(DotExpr));
+ return;
+ }
+ case TargetOpcode::STATEPOINT:
+ return LowerSTATEPOINT(*MI, MCInstLowering);
+
+ case TargetOpcode::FAULTING_OP:
+ return LowerFAULTING_OP(*MI, MCInstLowering);
+
+ case TargetOpcode::FENTRY_CALL:
+ return LowerFENTRY_CALL(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_OP:
+ return LowerPATCHABLE_OP(*MI, MCInstLowering);
+
+ case TargetOpcode::STACKMAP:
+ return LowerSTACKMAP(*MI);
+
+ case TargetOpcode::PATCHPOINT:
+ return LowerPATCHPOINT(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
+ return LowerPATCHABLE_FUNCTION_ENTER(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_RET:
+ return LowerPATCHABLE_RET(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_TAIL_CALL:
+ return LowerPATCHABLE_TAIL_CALL(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_EVENT_CALL:
+ return LowerPATCHABLE_EVENT_CALL(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
+ return LowerPATCHABLE_TYPED_EVENT_CALL(*MI, MCInstLowering);
+
+ case X86::MORESTACK_RET:
+ EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
+ return;
+
+ case X86::MORESTACK_RET_RESTORE_R10:
+ // Return, then restore R10.
+ EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::MOV64rr).addReg(X86::R10).addReg(X86::RAX));
+ return;
+
+ case X86::SEH_PushReg:
+ case X86::SEH_SaveReg:
+ case X86::SEH_SaveXMM:
+ case X86::SEH_StackAlloc:
+ case X86::SEH_StackAlign:
+ case X86::SEH_SetFrame:
+ case X86::SEH_PushFrame:
+ case X86::SEH_EndPrologue:
+ EmitSEHInstruction(MI);
+ return;
+
+ case X86::SEH_Epilogue: {
+ assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
+ MachineBasicBlock::const_iterator MBBI(MI);
+ // Check if preceded by a call and emit nop if so.
+ for (MBBI = PrevCrossBBInst(MBBI);
+ MBBI != MachineBasicBlock::const_iterator();
+ MBBI = PrevCrossBBInst(MBBI)) {
+ // Conservatively assume that pseudo instructions don't emit code and keep
+ // looking for a call. We may emit an unnecessary nop in some cases.
+ if (!MBBI->isPseudo()) {
+ if (MBBI->isCall())
+ EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+ break;
+ }
+ }
+ return;
+ }
+ case X86::UBSAN_UD1:
+ EmitAndCountInstruction(MCInstBuilder(X86::UD1Lm)
+ .addReg(X86::EAX)
+ .addReg(X86::EAX)
+ .addImm(1)
+ .addReg(X86::NoRegister)
+ .addImm(MI->getOperand(0).getImm())
+ .addReg(X86::NoRegister));
+ return;
+ }
+
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+
+ // Stackmap shadows cannot include branch targets, so we can count the bytes
+  // in a call towards the shadow, but must ensure that no thread returns
+  // into the stackmap shadow. The only way to achieve this is if the call
+ // is at the end of the shadow.
+ if (MI->isCall()) {
+    // Count the size of the call towards the shadow.
+ SMShadowTracker.count(TmpInst, getSubtargetInfo(), CodeEmitter.get());
+ // Then flush the shadow so that we fill with nops before the call, not
+ // after it.
+ SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
+ // Then emit the call
+ OutStreamer->emitInstruction(TmpInst, getSubtargetInfo());
+ return;
+ }
+
+ EmitAndCountInstruction(TmpInst);
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
new file mode 100644
index 000000000000..05f846bfb219
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
@@ -0,0 +1,30 @@
+//===-- X86MachineFunctionInfo.cpp - X86 machine function info ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+void X86MachineFunctionInfo::anchor() { }
+
+void X86MachineFunctionInfo::setRestoreBasePointer(const MachineFunction *MF) {
+ if (!RestoreBasePointerOffset) {
+ const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+ MF->getSubtarget().getRegisterInfo());
+ unsigned SlotSize = RegInfo->getSlotSize();
+ for (const MCPhysReg *CSR = MF->getRegInfo().getCalleeSavedRegs();
+ unsigned Reg = *CSR; ++CSR) {
+ if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
+ RestoreBasePointerOffset -= SlotSize;
+ }
+ }
+}
+
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86MachineFunctionInfo.h
new file mode 100644
index 000000000000..eedad952c3b9
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -0,0 +1,230 @@
+//===-- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares X86-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// X86MachineFunctionInfo - This class is derived from MachineFunctionInfo
+/// and contains private X86 target-specific information for each
+/// MachineFunction.
+class X86MachineFunctionInfo : public MachineFunctionInfo {
+ virtual void anchor();
+
+  /// ForceFramePointer - True if the function is required to use a frame
+  /// pointer for reasons other than it containing dynamic allocation or
+  /// FP elimination being turned off. For example, the Cygwin main function
+  /// contains stack pointer re-alignment code, which requires FP.
+ bool ForceFramePointer = false;
+
+  /// RestoreBasePointerOffset - Non-zero if the function has a base pointer
+  /// and makes a call to llvm.eh.sjlj.setjmp. When non-zero, the value is a
+ /// displacement from the frame pointer to a slot where the base pointer
+ /// is stashed.
+ signed char RestoreBasePointerOffset = 0;
+
+ /// WinEHXMMSlotInfo - Slot information of XMM registers in the stack frame
+ /// in bytes.
+ DenseMap<int, unsigned> WinEHXMMSlotInfo;
+
+ /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+ /// stack frame in bytes.
+ unsigned CalleeSavedFrameSize = 0;
+
+ /// BytesToPopOnReturn - Number of bytes function pops on return (in addition
+ /// to the space used by the return address).
+  /// Used on Windows platforms for stdcall & fastcall name decoration.
+ unsigned BytesToPopOnReturn = 0;
+
+ /// ReturnAddrIndex - FrameIndex for return slot.
+ int ReturnAddrIndex = 0;
+
+  /// FrameAddrIndex - FrameIndex for the frame address slot.
+ int FrameAddrIndex = 0;
+
+ /// TailCallReturnAddrDelta - The number of bytes by which return address
+ /// stack slot is moved as the result of tail call optimization.
+ int TailCallReturnAddrDelta = 0;
+
+ /// SRetReturnReg - Some subtargets require that sret lowering includes
+ /// returning the value of the returned struct in a register. This field
+ /// holds the virtual register into which the sret argument is passed.
+ Register SRetReturnReg;
+
+ /// GlobalBaseReg - keeps track of the virtual register initialized for
+ /// use as the global base register. This is used for PIC in some PIC
+ /// relocation models.
+ Register GlobalBaseReg;
+
+ /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+ int VarArgsFrameIndex = 0;
+ /// RegSaveFrameIndex - X86-64 vararg func register save area.
+ int RegSaveFrameIndex = 0;
+ /// VarArgsGPOffset - X86-64 vararg func int reg offset.
+ unsigned VarArgsGPOffset = 0;
+ /// VarArgsFPOffset - X86-64 vararg func fp reg offset.
+ unsigned VarArgsFPOffset = 0;
+ /// ArgumentStackSize - The number of bytes on stack consumed by the arguments
+ /// being passed on the stack.
+ unsigned ArgumentStackSize = 0;
+ /// NumLocalDynamics - Number of local-dynamic TLS accesses.
+ unsigned NumLocalDynamics = 0;
+ /// HasPushSequences - Keeps track of whether this function uses sequences
+ /// of pushes to pass function parameters.
+ bool HasPushSequences = false;
+
+ /// True if the function recovers from an SEH exception, and therefore needs
+ /// to spill and restore the frame pointer.
+ bool HasSEHFramePtrSave = false;
+
+ /// The frame index of a stack object containing the original frame pointer
+ /// used to address arguments in a function using a base pointer.
+ int SEHFramePtrSaveIndex = 0;
+
+ /// True if this function has a subset of CSRs that is handled explicitly via
+ /// copies.
+ bool IsSplitCSR = false;
+
+ /// True if this function uses the red zone.
+ bool UsesRedZone = false;
+
+ /// True if this function has WIN_ALLOCA instructions.
+ bool HasWinAlloca = false;
+
+ /// True if this function has any preallocated calls.
+ bool HasPreallocatedCall = false;
+
+ ValueMap<const Value *, size_t> PreallocatedIds;
+ SmallVector<size_t, 0> PreallocatedStackSizes;
+ SmallVector<SmallVector<size_t, 4>, 0> PreallocatedArgOffsets;
+
+private:
+ /// ForwardedMustTailRegParms - A list of virtual and physical registers
+ /// that must be forwarded to every musttail call.
+ SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms;
+
+public:
+ X86MachineFunctionInfo() = default;
+
+ explicit X86MachineFunctionInfo(MachineFunction &MF) {}
+
+ bool getForceFramePointer() const { return ForceFramePointer;}
+ void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
+
+ bool getHasPushSequences() const { return HasPushSequences; }
+ void setHasPushSequences(bool HasPush) { HasPushSequences = HasPush; }
+
+ bool getRestoreBasePointer() const { return RestoreBasePointerOffset!=0; }
+ void setRestoreBasePointer(const MachineFunction *MF);
+ int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; }
+
+ DenseMap<int, unsigned>& getWinEHXMMSlotInfo() { return WinEHXMMSlotInfo; }
+ const DenseMap<int, unsigned>& getWinEHXMMSlotInfo() const {
+ return WinEHXMMSlotInfo; }
+
+ unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+ void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+
+ unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
+ void setBytesToPopOnReturn (unsigned bytes) { BytesToPopOnReturn = bytes;}
+
+ int getRAIndex() const { return ReturnAddrIndex; }
+ void setRAIndex(int Index) { ReturnAddrIndex = Index; }
+
+ int getFAIndex() const { return FrameAddrIndex; }
+ void setFAIndex(int Index) { FrameAddrIndex = Index; }
+
+ int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; }
+ void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;}
+
+ Register getSRetReturnReg() const { return SRetReturnReg; }
+ void setSRetReturnReg(Register Reg) { SRetReturnReg = Reg; }
+
+ Register getGlobalBaseReg() const { return GlobalBaseReg; }
+ void setGlobalBaseReg(Register Reg) { GlobalBaseReg = Reg; }
+
+ int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+ void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; }
+
+ int getRegSaveFrameIndex() const { return RegSaveFrameIndex; }
+ void setRegSaveFrameIndex(int Idx) { RegSaveFrameIndex = Idx; }
+
+ unsigned getVarArgsGPOffset() const { return VarArgsGPOffset; }
+ void setVarArgsGPOffset(unsigned Offset) { VarArgsGPOffset = Offset; }
+
+ unsigned getVarArgsFPOffset() const { return VarArgsFPOffset; }
+ void setVarArgsFPOffset(unsigned Offset) { VarArgsFPOffset = Offset; }
+
+ unsigned getArgumentStackSize() const { return ArgumentStackSize; }
+ void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; }
+
+ unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
+ void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
+
+ bool getHasSEHFramePtrSave() const { return HasSEHFramePtrSave; }
+ void setHasSEHFramePtrSave(bool V) { HasSEHFramePtrSave = V; }
+
+ int getSEHFramePtrSaveIndex() const { return SEHFramePtrSaveIndex; }
+ void setSEHFramePtrSaveIndex(int Index) { SEHFramePtrSaveIndex = Index; }
+
+ SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() {
+ return ForwardedMustTailRegParms;
+ }
+
+ bool isSplitCSR() const { return IsSplitCSR; }
+ void setIsSplitCSR(bool s) { IsSplitCSR = s; }
+
+ bool getUsesRedZone() const { return UsesRedZone; }
+ void setUsesRedZone(bool V) { UsesRedZone = V; }
+
+ bool hasWinAlloca() const { return HasWinAlloca; }
+ void setHasWinAlloca(bool v) { HasWinAlloca = v; }
+
+ bool hasPreallocatedCall() const { return HasPreallocatedCall; }
+ void setHasPreallocatedCall(bool v) { HasPreallocatedCall = v; }
+
+ size_t getPreallocatedIdForCallSite(const Value *CS) {
+ auto Insert = PreallocatedIds.insert({CS, PreallocatedIds.size()});
+ if (Insert.second) {
+ PreallocatedStackSizes.push_back(0);
+ PreallocatedArgOffsets.emplace_back();
+ }
+ return Insert.first->second;
+ }
+
+ void setPreallocatedStackSize(size_t Id, size_t StackSize) {
+ PreallocatedStackSizes[Id] = StackSize;
+ }
+
+ size_t getPreallocatedStackSize(const size_t Id) {
+ assert(PreallocatedStackSizes[Id] != 0 && "stack size not set");
+ return PreallocatedStackSizes[Id];
+ }
+
+ void setPreallocatedArgOffsets(size_t Id, ArrayRef<size_t> AO) {
+ PreallocatedArgOffsets[Id].assign(AO.begin(), AO.end());
+ }
+
+ const ArrayRef<size_t> getPreallocatedArgOffsets(const size_t Id) {
+ assert(!PreallocatedArgOffsets[Id].empty() && "arg offsets not set");
+ return PreallocatedArgOffsets[Id];
+ }
+};
+
+} // End llvm namespace
+
+#endif
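
The preallocated-call bookkeeping above (getPreallocatedIdForCallSite and its two parallel vectors) hands out dense ids with the map-insert-reports-whether-new idiom. A minimal standalone sketch of that idiom, assuming nothing from LLVM and using plain std::unordered_map and std::vector in place of ValueMap and SmallVector:

    // Sketch only: models the id-assignment idiom, not the LLVM data structures.
    #include <cassert>
    #include <cstddef>
    #include <unordered_map>
    #include <vector>

    struct PreallocatedInfoModel {
      std::unordered_map<const void *, std::size_t> Ids; // call site -> dense id
      std::vector<std::size_t> StackSizes;               // indexed by id
      std::vector<std::vector<std::size_t>> ArgOffsets;  // indexed by id

      std::size_t getIdForCallSite(const void *CS) {
        auto Insert = Ids.insert({CS, Ids.size()});
        if (Insert.second) {         // first time this call site is seen
          StackSizes.push_back(0);   // keep the side tables the same length
          ArgOffsets.emplace_back();
        }
        return Insert.first->second; // same id on every later query
      }
    };

    int main() {
      PreallocatedInfoModel M;
      int A, B;
      std::size_t IdA = M.getIdForCallSite(&A);
      assert(IdA == 0 && M.getIdForCallSite(&A) == IdA); // stable id per site
      assert(M.getIdForCallSite(&B) == 1);               // next site, next id
      return 0;
    }
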
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.cpp
new file mode 100644
index 000000000000..425054cfdd92
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.cpp
@@ -0,0 +1,74 @@
+//===- X86MacroFusion.cpp - X86 Macro Fusion ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the X86 implementation of the DAG scheduling
+/// mutation to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MacroFusion.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MacroFusion.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+
+using namespace llvm;
+
+static X86::FirstMacroFusionInstKind classifyFirst(const MachineInstr &MI) {
+ return X86::classifyFirstOpcodeInMacroFusion(MI.getOpcode());
+}
+
+static X86::SecondMacroFusionInstKind classifySecond(const MachineInstr &MI) {
+ X86::CondCode CC = X86::getCondFromBranch(MI);
+ return X86::classifySecondCondCodeInMacroFusion(CC);
+}
+
+/// Check if the instr pair, FirstMI and SecondMI, should be fused
+/// together. Given SecondMI, when FirstMI is unspecified, then check if
+/// SecondMI may be part of a fused pair at all.
+static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
+ const TargetSubtargetInfo &TSI,
+ const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ const X86Subtarget &ST = static_cast<const X86Subtarget &>(TSI);
+
+ // Check if this processor supports any kind of fusion.
+ if (!(ST.hasBranchFusion() || ST.hasMacroFusion()))
+ return false;
+
+ const X86::SecondMacroFusionInstKind BranchKind = classifySecond(SecondMI);
+
+ if (BranchKind == X86::SecondMacroFusionInstKind::Invalid)
+ return false; // Second cannot be fused with anything.
+
+ if (FirstMI == nullptr)
+ return true; // We're only checking whether Second can be fused at all.
+
+ const X86::FirstMacroFusionInstKind TestKind = classifyFirst(*FirstMI);
+
+ if (ST.hasBranchFusion()) {
+ // Branch fusion can merge CMP and TEST with all conditional jumps.
+ return (TestKind == X86::FirstMacroFusionInstKind::Cmp ||
+ TestKind == X86::FirstMacroFusionInstKind::Test);
+ }
+
+ if (ST.hasMacroFusion()) {
+ return X86::isMacroFused(TestKind, BranchKind);
+ }
+
+ llvm_unreachable("unknown fusion type");
+}
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation>
+createX86MacroFusionDAGMutation() {
+ return createBranchMacroFusionDAGMutation(shouldScheduleAdjacent);
+}
+
+} // end namespace llvm
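
Once both instructions have been classified, the decision above reduces to a small table lookup. A simplified standalone model of that decision shape; the enum values and the fusion table here are illustrative approximations, not the real classification from X86BaseInfo.h:

    // Sketch only: approximates the branch-fusion vs. macro-fusion decision.
    #include <cstdio>

    enum class FirstKind { Test, Cmp, And, AddSub, IncDec, Invalid };
    enum class SecondKind { ELG, AB, SPO, Invalid }; // rough condition-code groups

    // Hypothetical stand-in for X86::isMacroFused: TEST/AND pair with any Jcc,
    // CMP/ADD/SUB with fewer groups, INC/DEC with fewer still (no carry update).
    static bool modelIsMacroFused(FirstKind F, SecondKind S) {
      switch (F) {
      case FirstKind::Test:
      case FirstKind::And:    return S != SecondKind::Invalid;
      case FirstKind::Cmp:
      case FirstKind::AddSub: return S == SecondKind::ELG || S == SecondKind::AB;
      case FirstKind::IncDec: return S == SecondKind::ELG;
      default:                return false;
      }
    }

    static bool modelShouldFuse(bool HasBranchFusion, bool HasMacroFusion,
                                FirstKind F, SecondKind S) {
      if (!HasBranchFusion && !HasMacroFusion)
        return false;                         // no fusion support at all
      if (S == SecondKind::Invalid)
        return false;                         // second can never fuse
      if (HasBranchFusion)                    // CMP/TEST + any Jcc only
        return F == FirstKind::Cmp || F == FirstKind::Test;
      return modelIsMacroFused(F, S);         // full macro-fusion table
    }

    int main() {
      std::printf("CMP+JE fuses: %d\n",
                  modelShouldFuse(false, true, FirstKind::Cmp, SecondKind::ELG));
      std::printf("INC+JA fuses: %d\n",
                  modelShouldFuse(false, true, FirstKind::IncDec, SecondKind::AB));
      return 0;
    }
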
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.h b/contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.h
new file mode 100644
index 000000000000..05388b275ca3
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.h
@@ -0,0 +1,31 @@
+//===- X86MacroFusion.h - X86 Macro Fusion --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the X86 definition of the DAG scheduling mutation
+/// to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86MACROFUSION_H
+#define LLVM_LIB_TARGET_X86_X86MACROFUSION_H
+
+#include <memory>
+
+namespace llvm {
+
+class ScheduleDAGMutation;
+
+/// Note that you have to add:
+/// DAG.addMutation(createX86MacroFusionDAGMutation());
+/// to X86PassConfig::createMachineScheduler() to have an effect.
+std::unique_ptr<ScheduleDAGMutation>
+createX86MacroFusionDAGMutation();
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
new file mode 100644
index 000000000000..c8899a85118e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -0,0 +1,723 @@
+//===- X86OptimizeLEAs.cpp - optimize usage of LEA instructions -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass that performs some optimizations with LEA
+// instructions in order to improve performance and code size.
+// Currently, it does two things:
+// 1) If there are two LEA instructions calculating addresses which only differ
+// by displacement inside a basic block, one of them is removed.
+// 2) Address calculations in load and store instructions are replaced by
+// existing LEA def registers where possible.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-optimize-LEAs"
+
+static cl::opt<bool>
+ DisableX86LEAOpt("disable-x86-lea-opt", cl::Hidden,
+ cl::desc("X86: Disable LEA optimizations."),
+ cl::init(false));
+
+STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions");
+STATISTIC(NumRedundantLEAs, "Number of redundant LEA instructions removed");
+
+/// Returns true if two machine operands are identical and they are not
+/// physical registers.
+static inline bool isIdenticalOp(const MachineOperand &MO1,
+ const MachineOperand &MO2);
+
+/// Returns true if two address displacement operands are of the same
+/// type and use the same symbol/index/address regardless of the offset.
+static bool isSimilarDispOp(const MachineOperand &MO1,
+ const MachineOperand &MO2);
+
+/// Returns true if the instruction is LEA.
+static inline bool isLEA(const MachineInstr &MI);
+
+namespace {
+
+/// A key based on instruction's memory operands.
+class MemOpKey {
+public:
+ MemOpKey(const MachineOperand *Base, const MachineOperand *Scale,
+ const MachineOperand *Index, const MachineOperand *Segment,
+ const MachineOperand *Disp)
+ : Disp(Disp) {
+ Operands[0] = Base;
+ Operands[1] = Scale;
+ Operands[2] = Index;
+ Operands[3] = Segment;
+ }
+
+ bool operator==(const MemOpKey &Other) const {
+ // Addresses' bases, scales, indices and segments must be identical.
+ for (int i = 0; i < 4; ++i)
+ if (!isIdenticalOp(*Operands[i], *Other.Operands[i]))
+ return false;
+
+ // Addresses' displacements don't have to be exactly the same. It only
+ // matters that they use the same symbol/index/address. Immediates' or
+ // offsets' differences will be taken care of during instruction
+ // substitution.
+ return isSimilarDispOp(*Disp, *Other.Disp);
+ }
+
+ // Address' base, scale, index and segment operands.
+ const MachineOperand *Operands[4];
+
+ // Address' displacement operand.
+ const MachineOperand *Disp;
+};
+
+} // end anonymous namespace
+
+/// Provide DenseMapInfo for MemOpKey.
+namespace llvm {
+
+template <> struct DenseMapInfo<MemOpKey> {
+ using PtrInfo = DenseMapInfo<const MachineOperand *>;
+
+ static inline MemOpKey getEmptyKey() {
+ return MemOpKey(PtrInfo::getEmptyKey(), PtrInfo::getEmptyKey(),
+ PtrInfo::getEmptyKey(), PtrInfo::getEmptyKey(),
+ PtrInfo::getEmptyKey());
+ }
+
+ static inline MemOpKey getTombstoneKey() {
+ return MemOpKey(PtrInfo::getTombstoneKey(), PtrInfo::getTombstoneKey(),
+ PtrInfo::getTombstoneKey(), PtrInfo::getTombstoneKey(),
+ PtrInfo::getTombstoneKey());
+ }
+
+ static unsigned getHashValue(const MemOpKey &Val) {
+ // Checking any field of MemOpKey is enough to determine if the key is
+ // empty or tombstone.
+ assert(Val.Disp != PtrInfo::getEmptyKey() && "Cannot hash the empty key");
+ assert(Val.Disp != PtrInfo::getTombstoneKey() &&
+ "Cannot hash the tombstone key");
+
+ hash_code Hash = hash_combine(*Val.Operands[0], *Val.Operands[1],
+ *Val.Operands[2], *Val.Operands[3]);
+
+ // If the address displacement is an immediate, it should not affect the
+    // hash so that memory operands which differ only by immediate displacement
+ // would have the same hash. If the address displacement is something else,
+ // we should reflect symbol/index/address in the hash.
+ switch (Val.Disp->getType()) {
+ case MachineOperand::MO_Immediate:
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_JumpTableIndex:
+ Hash = hash_combine(Hash, Val.Disp->getIndex());
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ Hash = hash_combine(Hash, Val.Disp->getSymbolName());
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ Hash = hash_combine(Hash, Val.Disp->getGlobal());
+ break;
+ case MachineOperand::MO_BlockAddress:
+ Hash = hash_combine(Hash, Val.Disp->getBlockAddress());
+ break;
+ case MachineOperand::MO_MCSymbol:
+ Hash = hash_combine(Hash, Val.Disp->getMCSymbol());
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ Hash = hash_combine(Hash, Val.Disp->getMBB());
+ break;
+ default:
+ llvm_unreachable("Invalid address displacement operand");
+ }
+
+ return (unsigned)Hash;
+ }
+
+ static bool isEqual(const MemOpKey &LHS, const MemOpKey &RHS) {
+ // Checking any field of MemOpKey is enough to determine if the key is
+ // empty or tombstone.
+ if (RHS.Disp == PtrInfo::getEmptyKey())
+ return LHS.Disp == PtrInfo::getEmptyKey();
+ if (RHS.Disp == PtrInfo::getTombstoneKey())
+ return LHS.Disp == PtrInfo::getTombstoneKey();
+ return LHS == RHS;
+ }
+};
+
+} // end namespace llvm
+
+/// Returns a hash table key based on memory operands of \p MI. The
+/// number of the first memory operand of \p MI is specified through \p N.
+static inline MemOpKey getMemOpKey(const MachineInstr &MI, unsigned N) {
+ assert((isLEA(MI) || MI.mayLoadOrStore()) &&
+ "The instruction must be a LEA, a load or a store");
+ return MemOpKey(&MI.getOperand(N + X86::AddrBaseReg),
+ &MI.getOperand(N + X86::AddrScaleAmt),
+ &MI.getOperand(N + X86::AddrIndexReg),
+ &MI.getOperand(N + X86::AddrSegmentReg),
+ &MI.getOperand(N + X86::AddrDisp));
+}
+
+static inline bool isIdenticalOp(const MachineOperand &MO1,
+ const MachineOperand &MO2) {
+ return MO1.isIdenticalTo(MO2) &&
+ (!MO1.isReg() || !Register::isPhysicalRegister(MO1.getReg()));
+}
+
+#ifndef NDEBUG
+static bool isValidDispOp(const MachineOperand &MO) {
+ return MO.isImm() || MO.isCPI() || MO.isJTI() || MO.isSymbol() ||
+ MO.isGlobal() || MO.isBlockAddress() || MO.isMCSymbol() || MO.isMBB();
+}
+#endif
+
+static bool isSimilarDispOp(const MachineOperand &MO1,
+ const MachineOperand &MO2) {
+ assert(isValidDispOp(MO1) && isValidDispOp(MO2) &&
+ "Address displacement operand is not valid");
+ return (MO1.isImm() && MO2.isImm()) ||
+ (MO1.isCPI() && MO2.isCPI() && MO1.getIndex() == MO2.getIndex()) ||
+ (MO1.isJTI() && MO2.isJTI() && MO1.getIndex() == MO2.getIndex()) ||
+ (MO1.isSymbol() && MO2.isSymbol() &&
+ MO1.getSymbolName() == MO2.getSymbolName()) ||
+ (MO1.isGlobal() && MO2.isGlobal() &&
+ MO1.getGlobal() == MO2.getGlobal()) ||
+ (MO1.isBlockAddress() && MO2.isBlockAddress() &&
+ MO1.getBlockAddress() == MO2.getBlockAddress()) ||
+ (MO1.isMCSymbol() && MO2.isMCSymbol() &&
+ MO1.getMCSymbol() == MO2.getMCSymbol()) ||
+ (MO1.isMBB() && MO2.isMBB() && MO1.getMBB() == MO2.getMBB());
+}
+
+static inline bool isLEA(const MachineInstr &MI) {
+ unsigned Opcode = MI.getOpcode();
+ return Opcode == X86::LEA16r || Opcode == X86::LEA32r ||
+ Opcode == X86::LEA64r || Opcode == X86::LEA64_32r;
+}
+
+namespace {
+
+class X86OptimizeLEAPass : public MachineFunctionPass {
+public:
+ X86OptimizeLEAPass() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override { return "X86 LEA Optimize"; }
+
+ /// Loop over all of the basic blocks, replacing address
+ /// calculations in load and store instructions, if it's already
+ /// been calculated by LEA. Also, remove redundant LEAs.
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ static char ID;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ using MemOpMap = DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>>;
+
+ /// Returns a distance between two instructions inside one basic block.
+  /// A negative result means that the instructions occur in reverse order.
+ int calcInstrDist(const MachineInstr &First, const MachineInstr &Last);
+
+ /// Choose the best \p LEA instruction from the \p List to replace
+ /// address calculation in \p MI instruction. Return the address displacement
+ /// and the distance between \p MI and the chosen \p BestLEA in
+ /// \p AddrDispShift and \p Dist.
+ bool chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List,
+ const MachineInstr &MI, MachineInstr *&BestLEA,
+ int64_t &AddrDispShift, int &Dist);
+
+ /// Returns the difference between addresses' displacements of \p MI1
+ /// and \p MI2. The numbers of the first memory operands for the instructions
+ /// are specified through \p N1 and \p N2.
+ int64_t getAddrDispShift(const MachineInstr &MI1, unsigned N1,
+ const MachineInstr &MI2, unsigned N2) const;
+
+ /// Returns true if the \p Last LEA instruction can be replaced by the
+ /// \p First. The difference between displacements of the addresses calculated
+ /// by these LEAs is returned in \p AddrDispShift. It'll be used for proper
+ /// replacement of the \p Last LEA's uses with the \p First's def register.
+ bool isReplaceable(const MachineInstr &First, const MachineInstr &Last,
+ int64_t &AddrDispShift) const;
+
+ /// Find all LEA instructions in the basic block. Also, assign position
+ /// numbers to all instructions in the basic block to speed up calculation of
+ /// distance between them.
+ void findLEAs(const MachineBasicBlock &MBB, MemOpMap &LEAs);
+
+ /// Removes redundant address calculations.
+ bool removeRedundantAddrCalc(MemOpMap &LEAs);
+
+ /// Replace debug value MI with a new debug value instruction using register
+ /// VReg with an appropriate offset and DIExpression to incorporate the
+ /// address displacement AddrDispShift. Return new debug value instruction.
+ MachineInstr *replaceDebugValue(MachineInstr &MI, unsigned VReg,
+ int64_t AddrDispShift);
+
+ /// Removes LEAs which calculate similar addresses.
+ bool removeRedundantLEAs(MemOpMap &LEAs);
+
+ DenseMap<const MachineInstr *, unsigned> InstrPos;
+
+ MachineRegisterInfo *MRI = nullptr;
+ const X86InstrInfo *TII = nullptr;
+ const X86RegisterInfo *TRI = nullptr;
+};
+
+} // end anonymous namespace
+
+char X86OptimizeLEAPass::ID = 0;
+
+FunctionPass *llvm::createX86OptimizeLEAs() { return new X86OptimizeLEAPass(); }
+INITIALIZE_PASS(X86OptimizeLEAPass, DEBUG_TYPE, "X86 optimize LEA pass", false,
+ false)
+
+int X86OptimizeLEAPass::calcInstrDist(const MachineInstr &First,
+ const MachineInstr &Last) {
+ // Both instructions must be in the same basic block and they must be
+ // presented in InstrPos.
+ assert(Last.getParent() == First.getParent() &&
+ "Instructions are in different basic blocks");
+ assert(InstrPos.find(&First) != InstrPos.end() &&
+ InstrPos.find(&Last) != InstrPos.end() &&
+ "Instructions' positions are undefined");
+
+ return InstrPos[&Last] - InstrPos[&First];
+}
+
+// Find the best LEA instruction in the List to replace address recalculation in
+// MI. Such LEA must meet these requirements:
+// 1) The address calculated by the LEA differs only by the displacement from
+// the address used in MI.
+// 2) The register class of the definition of the LEA is compatible with the
+// register class of the address base register of MI.
+// 3) Displacement of the new memory operand should fit in 1 byte if possible.
+// 4) The LEA should be as close to MI as possible, and prior to it if
+// possible.
+bool X86OptimizeLEAPass::chooseBestLEA(
+ const SmallVectorImpl<MachineInstr *> &List, const MachineInstr &MI,
+ MachineInstr *&BestLEA, int64_t &AddrDispShift, int &Dist) {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const MCInstrDesc &Desc = MI.getDesc();
+ int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags) +
+ X86II::getOperandBias(Desc);
+
+ BestLEA = nullptr;
+
+ // Loop over all LEA instructions.
+ for (auto DefMI : List) {
+ // Get new address displacement.
+ int64_t AddrDispShiftTemp = getAddrDispShift(MI, MemOpNo, *DefMI, 1);
+
+ // Make sure address displacement fits 4 bytes.
+ if (!isInt<32>(AddrDispShiftTemp))
+ continue;
+
+ // Check that LEA def register can be used as MI address base. Some
+ // instructions can use a limited set of registers as address base, for
+ // example MOV8mr_NOREX. We could constrain the register class of the LEA
+ // def to suit MI, however since this case is very rare and hard to
+ // reproduce in a test it's just more reliable to skip the LEA.
+ if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg, TRI, *MF) !=
+ MRI->getRegClass(DefMI->getOperand(0).getReg()))
+ continue;
+
+ // Choose the closest LEA instruction from the list, prior to MI if
+    // possible. Note that we take the resulting address displacement into
+    // account as well. Also note that the list is sorted by the order in which
+    // the LEAs occur, so the break condition is pretty simple.
+ int DistTemp = calcInstrDist(*DefMI, MI);
+ assert(DistTemp != 0 &&
+ "The distance between two different instructions cannot be zero");
+ if (DistTemp > 0 || BestLEA == nullptr) {
+ // Do not update return LEA, if the current one provides a displacement
+ // which fits in 1 byte, while the new candidate does not.
+ if (BestLEA != nullptr && !isInt<8>(AddrDispShiftTemp) &&
+ isInt<8>(AddrDispShift))
+ continue;
+
+ BestLEA = DefMI;
+ AddrDispShift = AddrDispShiftTemp;
+ Dist = DistTemp;
+ }
+
+ // FIXME: Maybe we should not always stop at the first LEA after MI.
+ if (DistTemp < 0)
+ break;
+ }
+
+ return BestLEA != nullptr;
+}
+
+// Get the difference between the addresses' displacements of the two
+// instructions \p MI1 and \p MI2. The numbers of the first memory operands are
+// passed through \p N1 and \p N2.
+int64_t X86OptimizeLEAPass::getAddrDispShift(const MachineInstr &MI1,
+ unsigned N1,
+ const MachineInstr &MI2,
+ unsigned N2) const {
+ const MachineOperand &Op1 = MI1.getOperand(N1 + X86::AddrDisp);
+ const MachineOperand &Op2 = MI2.getOperand(N2 + X86::AddrDisp);
+
+ assert(isSimilarDispOp(Op1, Op2) &&
+ "Address displacement operands are not compatible");
+
+ // After the assert above we can be sure that both operands are of the same
+ // valid type and use the same symbol/index/address, thus displacement shift
+ // calculation is rather simple.
+ if (Op1.isJTI())
+ return 0;
+ return Op1.isImm() ? Op1.getImm() - Op2.getImm()
+ : Op1.getOffset() - Op2.getOffset();
+}
+
+// Check that the Last LEA can be replaced by the First LEA. To be so,
+// these requirements must be met:
+// 1) Addresses calculated by LEAs differ only by displacement.
+// 2) Def registers of LEAs belong to the same class.
+// 3) All uses of the Last LEA def register are replaceable, thus the
+// register is used only as address base.
+bool X86OptimizeLEAPass::isReplaceable(const MachineInstr &First,
+ const MachineInstr &Last,
+ int64_t &AddrDispShift) const {
+ assert(isLEA(First) && isLEA(Last) &&
+ "The function works only with LEA instructions");
+
+ // Make sure that LEA def registers belong to the same class. There may be
+ // instructions (like MOV8mr_NOREX) which allow a limited set of registers to
+ // be used as their operands, so we must be sure that replacing one LEA
+ // with another won't lead to putting a wrong register in the instruction.
+ if (MRI->getRegClass(First.getOperand(0).getReg()) !=
+ MRI->getRegClass(Last.getOperand(0).getReg()))
+ return false;
+
+ // Get new address displacement.
+ AddrDispShift = getAddrDispShift(Last, 1, First, 1);
+
+ // Loop over all uses of the Last LEA to check that its def register is
+  // used only as an address base for memory accesses. If so, it can be
+  // replaced; otherwise it cannot.
+ for (auto &MO : MRI->use_nodbg_operands(Last.getOperand(0).getReg())) {
+ MachineInstr &MI = *MO.getParent();
+
+ // Get the number of the first memory operand.
+ const MCInstrDesc &Desc = MI.getDesc();
+ int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags);
+
+ // If the use instruction has no memory operand - the LEA is not
+ // replaceable.
+ if (MemOpNo < 0)
+ return false;
+
+ MemOpNo += X86II::getOperandBias(Desc);
+
+ // If the address base of the use instruction is not the LEA def register -
+ // the LEA is not replaceable.
+ if (!isIdenticalOp(MI.getOperand(MemOpNo + X86::AddrBaseReg), MO))
+ return false;
+
+ // If the LEA def register is used as any other operand of the use
+ // instruction - the LEA is not replaceable.
+ for (unsigned i = 0; i < MI.getNumOperands(); i++)
+ if (i != (unsigned)(MemOpNo + X86::AddrBaseReg) &&
+ isIdenticalOp(MI.getOperand(i), MO))
+ return false;
+
+ // Check that the new address displacement will fit 4 bytes.
+ if (MI.getOperand(MemOpNo + X86::AddrDisp).isImm() &&
+ !isInt<32>(MI.getOperand(MemOpNo + X86::AddrDisp).getImm() +
+ AddrDispShift))
+ return false;
+ }
+
+ return true;
+}
+
+void X86OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB,
+ MemOpMap &LEAs) {
+ unsigned Pos = 0;
+ for (auto &MI : MBB) {
+ // Assign the position number to the instruction. Note that we are going to
+    // move some instructions during the optimization; however, there will never
+    // be a need to move two instructions before any selected instruction. So, to
+    // avoid multiple position updates during moves, we simply increase the
+    // position counter by two, leaving a free slot for instructions that will be
+    // moved.
+ InstrPos[&MI] = Pos += 2;
+
+ if (isLEA(MI))
+ LEAs[getMemOpKey(MI, 1)].push_back(const_cast<MachineInstr *>(&MI));
+ }
+}
+
+// Try to find load and store instructions which recalculate addresses already
+// calculated by some LEA and replace their memory operands with its def
+// register.
+bool X86OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) {
+ bool Changed = false;
+
+ assert(!LEAs.empty());
+ MachineBasicBlock *MBB = (*LEAs.begin()->second.begin())->getParent();
+
+ // Process all instructions in basic block.
+ for (auto I = MBB->begin(), E = MBB->end(); I != E;) {
+ MachineInstr &MI = *I++;
+
+ // Instruction must be load or store.
+ if (!MI.mayLoadOrStore())
+ continue;
+
+ // Get the number of the first memory operand.
+ const MCInstrDesc &Desc = MI.getDesc();
+ int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags);
+
+ // If instruction has no memory operand - skip it.
+ if (MemOpNo < 0)
+ continue;
+
+ MemOpNo += X86II::getOperandBias(Desc);
+
+ // Do not call chooseBestLEA if there was no matching LEA
+ auto Insns = LEAs.find(getMemOpKey(MI, MemOpNo));
+ if (Insns == LEAs.end())
+ continue;
+
+ // Get the best LEA instruction to replace address calculation.
+ MachineInstr *DefMI;
+ int64_t AddrDispShift;
+ int Dist;
+ if (!chooseBestLEA(Insns->second, MI, DefMI, AddrDispShift, Dist))
+ continue;
+
+    // If the LEA occurs before the current instruction, we can freely replace
+    // the instruction. If the LEA occurs after it, we can lift the LEA above
+    // the instruction and then replace it. Since the LEA and the instruction
+    // have similar memory operands (and thus the same def instructions for
+    // those operands), we can always do that without worrying about using
+    // registers before their defs.
+ if (Dist < 0) {
+ DefMI->removeFromParent();
+ MBB->insert(MachineBasicBlock::iterator(&MI), DefMI);
+ InstrPos[DefMI] = InstrPos[&MI] - 1;
+
+ // Make sure the instructions' position numbers are sane.
+ assert(((InstrPos[DefMI] == 1 &&
+ MachineBasicBlock::iterator(DefMI) == MBB->begin()) ||
+ InstrPos[DefMI] >
+ InstrPos[&*std::prev(MachineBasicBlock::iterator(DefMI))]) &&
+ "Instruction positioning is broken");
+ }
+
+ // Since we can possibly extend register lifetime, clear kill flags.
+ MRI->clearKillFlags(DefMI->getOperand(0).getReg());
+
+ ++NumSubstLEAs;
+ LLVM_DEBUG(dbgs() << "OptimizeLEAs: Candidate to replace: "; MI.dump(););
+
+ // Change instruction operands.
+ MI.getOperand(MemOpNo + X86::AddrBaseReg)
+ .ChangeToRegister(DefMI->getOperand(0).getReg(), false);
+ MI.getOperand(MemOpNo + X86::AddrScaleAmt).ChangeToImmediate(1);
+ MI.getOperand(MemOpNo + X86::AddrIndexReg)
+ .ChangeToRegister(X86::NoRegister, false);
+ MI.getOperand(MemOpNo + X86::AddrDisp).ChangeToImmediate(AddrDispShift);
+ MI.getOperand(MemOpNo + X86::AddrSegmentReg)
+ .ChangeToRegister(X86::NoRegister, false);
+
+ LLVM_DEBUG(dbgs() << "OptimizeLEAs: Replaced by: "; MI.dump(););
+
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+MachineInstr *X86OptimizeLEAPass::replaceDebugValue(MachineInstr &MI,
+ unsigned VReg,
+ int64_t AddrDispShift) {
+ const DIExpression *Expr = MI.getDebugExpression();
+ if (AddrDispShift != 0)
+ Expr = DIExpression::prepend(Expr, DIExpression::StackValue, AddrDispShift);
+
+ // Replace DBG_VALUE instruction with modified version.
+ MachineBasicBlock *MBB = MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+ bool IsIndirect = MI.isIndirectDebugValue();
+ const MDNode *Var = MI.getDebugVariable();
+ if (IsIndirect)
+ assert(MI.getOperand(1).getImm() == 0 && "DBG_VALUE with nonzero offset");
+ return BuildMI(*MBB, MBB->erase(&MI), DL, TII->get(TargetOpcode::DBG_VALUE),
+ IsIndirect, VReg, Var, Expr);
+}
+
+// Try to find similar LEAs in the list and replace one with another.
+bool X86OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
+ bool Changed = false;
+
+ // Loop over all entries in the table.
+ for (auto &E : LEAs) {
+ auto &List = E.second;
+
+ // Loop over all LEA pairs.
+ auto I1 = List.begin();
+ while (I1 != List.end()) {
+ MachineInstr &First = **I1;
+ auto I2 = std::next(I1);
+ while (I2 != List.end()) {
+ MachineInstr &Last = **I2;
+ int64_t AddrDispShift;
+
+ // LEAs should be in occurrence order in the list, so we can freely
+ // replace later LEAs with earlier ones.
+ assert(calcInstrDist(First, Last) > 0 &&
+ "LEAs must be in occurrence order in the list");
+
+ // Check that the Last LEA instruction can be replaced by the First.
+ if (!isReplaceable(First, Last, AddrDispShift)) {
+ ++I2;
+ continue;
+ }
+
+ // Loop over all uses of the Last LEA and update their operands. Note
+ // that the correctness of this has already been checked in the
+ // isReplaceable function.
+ Register FirstVReg = First.getOperand(0).getReg();
+ Register LastVReg = Last.getOperand(0).getReg();
+ for (auto UI = MRI->use_begin(LastVReg), UE = MRI->use_end();
+ UI != UE;) {
+ MachineOperand &MO = *UI++;
+ MachineInstr &MI = *MO.getParent();
+
+ if (MI.isDebugValue()) {
+ // Replace DBG_VALUE instruction with modified version using the
+ // register from the replacing LEA and the address displacement
+ // between the LEA instructions.
+ replaceDebugValue(MI, FirstVReg, AddrDispShift);
+ continue;
+ }
+
+ // Get the number of the first memory operand.
+ const MCInstrDesc &Desc = MI.getDesc();
+ int MemOpNo =
+ X86II::getMemoryOperandNo(Desc.TSFlags) +
+ X86II::getOperandBias(Desc);
+
+ // Update address base.
+ MO.setReg(FirstVReg);
+
+ // Update address disp.
+ MachineOperand &Op = MI.getOperand(MemOpNo + X86::AddrDisp);
+ if (Op.isImm())
+ Op.setImm(Op.getImm() + AddrDispShift);
+ else if (!Op.isJTI())
+ Op.setOffset(Op.getOffset() + AddrDispShift);
+ }
+
+ // Since we can possibly extend register lifetime, clear kill flags.
+ MRI->clearKillFlags(FirstVReg);
+
+ ++NumRedundantLEAs;
+ LLVM_DEBUG(dbgs() << "OptimizeLEAs: Remove redundant LEA: ";
+ Last.dump(););
+
+ // By this moment, all of the Last LEA's uses must be replaced. So we
+ // can freely remove it.
+ assert(MRI->use_empty(LastVReg) &&
+ "The LEA's def register must have no uses");
+ Last.eraseFromParent();
+
+ // Erase removed LEA from the list.
+ I2 = List.erase(I2);
+
+ Changed = true;
+ }
+ ++I1;
+ }
+ }
+
+ return Changed;
+}
+
+bool X86OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
+ bool Changed = false;
+
+ if (DisableX86LEAOpt || skipFunction(MF.getFunction()))
+ return false;
+
+ MRI = &MF.getRegInfo();
+ TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+ TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
+ auto *PSI =
+ &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ auto *MBFI = (PSI && PSI->hasProfileSummary()) ?
+ &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+ nullptr;
+
+ // Process all basic blocks.
+ for (auto &MBB : MF) {
+ MemOpMap LEAs;
+ InstrPos.clear();
+
+ // Find all LEA instructions in basic block.
+ findLEAs(MBB, LEAs);
+
+ // If current basic block has no LEAs, move on to the next one.
+ if (LEAs.empty())
+ continue;
+
+ // Remove redundant LEA instructions.
+ Changed |= removeRedundantLEAs(LEAs);
+
+ // Remove redundant address calculations. Do it only for -Os/-Oz since only
+ // a code size gain is expected from this part of the pass.
+ bool OptForSize = MF.getFunction().hasOptSize() ||
+ llvm::shouldOptimizeForSize(&MBB, PSI, MBFI);
+ if (OptForSize)
+ Changed |= removeRedundantAddrCalc(LEAs);
+ }
+
+ return Changed;
+}
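
At its core the pass above is displacement bookkeeping: once two address computations are known to differ only in their constant displacement, users of the redundant one are repointed at the surviving LEA and their displacements are shifted by the difference, as long as the folded value still fits the 32-bit disp field. A standalone sketch of that arithmetic with made-up displacements:

    // Sketch only: models the displacement shift, not the MachineInstr rewriting.
    #include <cstdint>
    #include <cstdio>
    #include <limits>

    static bool fitsInInt32(std::int64_t V) {
      return V >= std::numeric_limits<std::int32_t>::min() &&
             V <= std::numeric_limits<std::int32_t>::max();
    }

    int main() {
      // lea1 = base + 8*index + 16   (kept)
      // lea2 = base + 8*index + 40   (redundant: same base/scale/index/segment)
      std::int64_t Disp1 = 16, Disp2 = 40;
      std::int64_t AddrDispShift = Disp2 - Disp1; // what getAddrDispShift returns

      // A load that used "lea2 + 4" becomes "lea1 + (4 + AddrDispShift)".
      std::int64_t OldDisp = 4;
      std::int64_t NewDisp = OldDisp + AddrDispShift;
      if (fitsInInt32(NewDisp))
        std::printf("rewrite load as [lea1 + %lld]\n", (long long)NewDisp); // 28
      return 0;
    }
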
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86PadShortFunction.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86PadShortFunction.cpp
new file mode 100644
index 000000000000..ec81b07f9e5f
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86PadShortFunction.cpp
@@ -0,0 +1,230 @@
+//===-------- X86PadShortFunction.cpp - pad short functions -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which will pad short functions to prevent
+// a stall if a function returns before the return address is ready. This
+// is needed for some Intel Atom processors.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-pad-short-functions"
+
+STATISTIC(NumBBsPadded, "Number of basic blocks padded");
+
+namespace {
+ struct VisitedBBInfo {
+ // HasReturn - Whether the BB contains a return instruction
+ bool HasReturn;
+
+ // Cycles - Number of cycles until return if HasReturn is true, otherwise
+ // number of cycles until end of the BB
+ unsigned int Cycles;
+
+ VisitedBBInfo() : HasReturn(false), Cycles(0) {}
+ VisitedBBInfo(bool HasReturn, unsigned int Cycles)
+ : HasReturn(HasReturn), Cycles(Cycles) {}
+ };
+
+ struct PadShortFunc : public MachineFunctionPass {
+ static char ID;
+    PadShortFunc() : MachineFunctionPass(ID), Threshold(4) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+ AU.addPreserved<LazyMachineBlockFrequencyInfoPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ StringRef getPassName() const override {
+ return "X86 Atom pad short functions";
+ }
+
+ private:
+ void findReturns(MachineBasicBlock *MBB,
+ unsigned int Cycles = 0);
+
+ bool cyclesUntilReturn(MachineBasicBlock *MBB,
+ unsigned int &Cycles);
+
+ void addPadding(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned int NOOPsToAdd);
+
+ const unsigned int Threshold;
+
+ // ReturnBBs - Maps basic blocks that return to the minimum number of
+ // cycles until the return, starting from the entry block.
+ DenseMap<MachineBasicBlock*, unsigned int> ReturnBBs;
+
+ // VisitedBBs - Cache of previously visited BBs.
+ DenseMap<MachineBasicBlock*, VisitedBBInfo> VisitedBBs;
+
+ TargetSchedModel TSM;
+ };
+
+ char PadShortFunc::ID = 0;
+}
+
+FunctionPass *llvm::createX86PadShortFunctions() {
+ return new PadShortFunc();
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, inserting
+/// NOOP instructions before early exits.
+bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ if (MF.getFunction().hasOptSize())
+ return false;
+
+ if (!MF.getSubtarget<X86Subtarget>().padShortFunctions())
+ return false;
+
+ TSM.init(&MF.getSubtarget());
+
+ auto *PSI =
+ &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ auto *MBFI = (PSI && PSI->hasProfileSummary()) ?
+ &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+ nullptr;
+
+ // Search through basic blocks and mark the ones that have early returns
+ ReturnBBs.clear();
+ VisitedBBs.clear();
+ findReturns(&MF.front());
+
+ bool MadeChange = false;
+
+ // Pad the identified basic blocks with NOOPs
+ for (DenseMap<MachineBasicBlock*, unsigned int>::iterator I = ReturnBBs.begin();
+ I != ReturnBBs.end(); ++I) {
+ MachineBasicBlock *MBB = I->first;
+ unsigned Cycles = I->second;
+
+ // Function::hasOptSize is already checked above.
+ bool OptForSize = llvm::shouldOptimizeForSize(MBB, PSI, MBFI);
+ if (OptForSize)
+ continue;
+
+ if (Cycles < Threshold) {
+ // BB ends in a return. Skip over any DBG_VALUE instructions
+ // trailing the terminator.
+ assert(MBB->size() > 0 &&
+ "Basic block should contain at least a RET but is empty");
+ MachineBasicBlock::iterator ReturnLoc = --MBB->end();
+
+ while (ReturnLoc->isDebugInstr())
+ --ReturnLoc;
+ assert(ReturnLoc->isReturn() && !ReturnLoc->isCall() &&
+ "Basic block does not end with RET");
+
+ addPadding(MBB, ReturnLoc, Threshold - Cycles);
+ NumBBsPadded++;
+ MadeChange = true;
+ }
+ }
+
+ return MadeChange;
+}
+
+/// findReturns - Starting at MBB, follow control flow and add all
+/// basic blocks that contain a return to ReturnBBs.
+void PadShortFunc::findReturns(MachineBasicBlock *MBB, unsigned int Cycles) {
+ // If this BB has a return, note how many cycles it takes to get there.
+ bool hasReturn = cyclesUntilReturn(MBB, Cycles);
+ if (Cycles >= Threshold)
+ return;
+
+ if (hasReturn) {
+ ReturnBBs[MBB] = std::max(ReturnBBs[MBB], Cycles);
+ return;
+ }
+
+ // Follow branches in BB and look for returns
+ for (MachineBasicBlock::succ_iterator I = MBB->succ_begin();
+ I != MBB->succ_end(); ++I) {
+ if (*I == MBB)
+ continue;
+ findReturns(*I, Cycles);
+ }
+}
+
+/// cyclesUntilReturn - return true if the MBB has a return instruction,
+/// and return false otherwise.
+/// Cycles will be incremented by the number of cycles taken to reach the
+/// return or the end of the BB, whichever occurs first.
+bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB,
+ unsigned int &Cycles) {
+ // Return cached result if BB was previously visited
+ DenseMap<MachineBasicBlock*, VisitedBBInfo>::iterator it
+ = VisitedBBs.find(MBB);
+ if (it != VisitedBBs.end()) {
+ VisitedBBInfo BBInfo = it->second;
+ Cycles += BBInfo.Cycles;
+ return BBInfo.HasReturn;
+ }
+
+ unsigned int CyclesToEnd = 0;
+
+ for (MachineInstr &MI : *MBB) {
+ // Mark basic blocks with a return instruction. Calls to other
+ // functions do not count because the called function will be padded,
+ // if necessary.
+ if (MI.isReturn() && !MI.isCall()) {
+ VisitedBBs[MBB] = VisitedBBInfo(true, CyclesToEnd);
+ Cycles += CyclesToEnd;
+ return true;
+ }
+
+ CyclesToEnd += TSM.computeInstrLatency(&MI);
+ }
+
+ VisitedBBs[MBB] = VisitedBBInfo(false, CyclesToEnd);
+ Cycles += CyclesToEnd;
+ return false;
+}
+
+/// addPadding - Add the given number of NOOP instructions to the function
+/// just prior to the return at MBBI
+void PadShortFunc::addPadding(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned int NOOPsToAdd) {
+ DebugLoc DL = MBBI->getDebugLoc();
+ unsigned IssueWidth = TSM.getIssueWidth();
+
+ for (unsigned i = 0, e = IssueWidth * NOOPsToAdd; i != e; ++i)
+ BuildMI(*MBB, MBBI, DL, TSM.getInstrInfo()->get(X86::NOOP));
+}
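
The amount of padding is simple arithmetic: every cycle missing before the return is covered by IssueWidth NOOPs. A standalone sketch with assumed numbers; the real threshold is the pass's constant and the issue width comes from the subtarget's scheduling model:

    // Sketch only: models the NOOP count computed by addPadding above.
    #include <cstdio>

    int main() {
      const unsigned Threshold = 4;   // cycles the pass wants before a RET
      const unsigned IssueWidth = 2;  // assumed 2-wide in-order Atom core
      unsigned CyclesToReturn = 1;    // as measured by cyclesUntilReturn()

      if (CyclesToReturn < Threshold) {
        unsigned Noops = IssueWidth * (Threshold - CyclesToReturn);
        std::printf("insert %u NOOPs before the return\n", Noops); // prints 6
      }
      return 0;
    }
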
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86PartialReduction.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86PartialReduction.cpp
new file mode 100644
index 000000000000..babd923e7496
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86PartialReduction.cpp
@@ -0,0 +1,487 @@
+//===-- X86PartialReduction.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass looks for add instructions used by a horizontal reduction to see
+// if we might be able to use pmaddwd or psadbw. Some cases of this require
+// cross basic block knowledge and can't be done in SelectionDAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "X86TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-partial-reduction"
+
+namespace {
+
+class X86PartialReduction : public FunctionPass {
+ const DataLayout *DL;
+ const X86Subtarget *ST;
+
+public:
+ static char ID; // Pass identification, replacement for typeid.
+
+ X86PartialReduction() : FunctionPass(ID) { }
+
+ bool runOnFunction(Function &Fn) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+
+ StringRef getPassName() const override {
+ return "X86 Partial Reduction";
+ }
+
+private:
+ bool tryMAddReplacement(Instruction *Op);
+ bool trySADReplacement(Instruction *Op);
+};
+}
+
+FunctionPass *llvm::createX86PartialReductionPass() {
+ return new X86PartialReduction();
+}
+
+char X86PartialReduction::ID = 0;
+
+INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE,
+ "X86 Partial Reduction", false, false)
+
+bool X86PartialReduction::tryMAddReplacement(Instruction *Op) {
+ if (!ST->hasSSE2())
+ return false;
+
+ // Need at least 8 elements.
+ if (cast<FixedVectorType>(Op->getType())->getNumElements() < 8)
+ return false;
+
+ // Element type should be i32.
+ if (!cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(32))
+ return false;
+
+ auto *Mul = dyn_cast<BinaryOperator>(Op);
+ if (!Mul || Mul->getOpcode() != Instruction::Mul)
+ return false;
+
+ Value *LHS = Mul->getOperand(0);
+ Value *RHS = Mul->getOperand(1);
+
+  // LHS and RHS should only be used once, or, if they are the same, only used
+  // twice. Only check this when SSE4.1 is enabled and we have zext/sext
+  // instructions; otherwise we use punpck to emulate zero extend in stages, and
+  // the trunc we need to do likely won't introduce new instructions in that case.
+ if (ST->hasSSE41()) {
+ if (LHS == RHS) {
+ if (!isa<Constant>(LHS) && !LHS->hasNUses(2))
+ return false;
+ } else {
+ if (!isa<Constant>(LHS) && !LHS->hasOneUse())
+ return false;
+ if (!isa<Constant>(RHS) && !RHS->hasOneUse())
+ return false;
+ }
+ }
+
+ auto CanShrinkOp = [&](Value *Op) {
+ auto IsFreeTruncation = [&](Value *Op) {
+ if (auto *Cast = dyn_cast<CastInst>(Op)) {
+ if (Cast->getParent() == Mul->getParent() &&
+ (Cast->getOpcode() == Instruction::SExt ||
+ Cast->getOpcode() == Instruction::ZExt) &&
+ Cast->getOperand(0)->getType()->getScalarSizeInBits() <= 16)
+ return true;
+ }
+
+ return isa<Constant>(Op);
+ };
+
+ // If the operation can be freely truncated and has enough sign bits we
+ // can shrink.
+ if (IsFreeTruncation(Op) &&
+ ComputeNumSignBits(Op, *DL, 0, nullptr, Mul) > 16)
+ return true;
+
+ // SelectionDAG has limited support for truncating through an add or sub if
+ // the inputs are freely truncatable.
+ if (auto *BO = dyn_cast<BinaryOperator>(Op)) {
+ if (BO->getParent() == Mul->getParent() &&
+ IsFreeTruncation(BO->getOperand(0)) &&
+ IsFreeTruncation(BO->getOperand(1)) &&
+ ComputeNumSignBits(Op, *DL, 0, nullptr, Mul) > 16)
+ return true;
+ }
+
+ return false;
+ };
+
+ // Both Ops need to be shrinkable.
+ if (!CanShrinkOp(LHS) && !CanShrinkOp(RHS))
+ return false;
+
+ IRBuilder<> Builder(Mul);
+
+ auto *MulTy = cast<FixedVectorType>(Op->getType());
+ unsigned NumElts = MulTy->getNumElements();
+
+ // Extract even elements and odd elements and add them together. This will
+ // be pattern matched by SelectionDAG to pmaddwd. This instruction will be
+ // half the original width.
+ SmallVector<int, 16> EvenMask(NumElts / 2);
+ SmallVector<int, 16> OddMask(NumElts / 2);
+ for (int i = 0, e = NumElts / 2; i != e; ++i) {
+ EvenMask[i] = i * 2;
+ OddMask[i] = i * 2 + 1;
+ }
+ // Creating a new mul so the replaceAllUsesWith below doesn't replace the
+ // uses in the shuffles we're creating.
+ Value *NewMul = Builder.CreateMul(Mul->getOperand(0), Mul->getOperand(1));
+ Value *EvenElts = Builder.CreateShuffleVector(NewMul, NewMul, EvenMask);
+ Value *OddElts = Builder.CreateShuffleVector(NewMul, NewMul, OddMask);
+ Value *MAdd = Builder.CreateAdd(EvenElts, OddElts);
+
+ // Concatenate zeroes to extend back to the original type.
+ SmallVector<int, 32> ConcatMask(NumElts);
+ std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+ Value *Zero = Constant::getNullValue(MAdd->getType());
+ Value *Concat = Builder.CreateShuffleVector(MAdd, Zero, ConcatMask);
+
+ Mul->replaceAllUsesWith(Concat);
+ Mul->eraseFromParent();
+
+ return true;
+}
+
+bool X86PartialReduction::trySADReplacement(Instruction *Op) {
+ if (!ST->hasSSE2())
+ return false;
+
+ // TODO: There's nothing special about i32, any integer type above i16 should
+ // work just as well.
+ if (!cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(32))
+ return false;
+
+ // Operand should be a select.
+ auto *SI = dyn_cast<SelectInst>(Op);
+ if (!SI)
+ return false;
+
+ // Select needs to implement absolute value.
+ Value *LHS, *RHS;
+ auto SPR = matchSelectPattern(SI, LHS, RHS);
+ if (SPR.Flavor != SPF_ABS)
+ return false;
+
+ // Need a subtract of two values.
+ auto *Sub = dyn_cast<BinaryOperator>(LHS);
+ if (!Sub || Sub->getOpcode() != Instruction::Sub)
+ return false;
+
+ // Look for zero extend from i8.
+ auto getZeroExtendedVal = [](Value *Op) -> Value * {
+ if (auto *ZExt = dyn_cast<ZExtInst>(Op))
+ if (cast<VectorType>(ZExt->getOperand(0)->getType())
+ ->getElementType()
+ ->isIntegerTy(8))
+ return ZExt->getOperand(0);
+
+ return nullptr;
+ };
+
+ // Both operands of the subtract should be extends from vXi8.
+ Value *Op0 = getZeroExtendedVal(Sub->getOperand(0));
+ Value *Op1 = getZeroExtendedVal(Sub->getOperand(1));
+ if (!Op0 || !Op1)
+ return false;
+
+ IRBuilder<> Builder(SI);
+
+ auto *OpTy = cast<FixedVectorType>(Op->getType());
+ unsigned NumElts = OpTy->getNumElements();
+
+ unsigned IntrinsicNumElts;
+ Intrinsic::ID IID;
+ if (ST->hasBWI() && NumElts >= 64) {
+ IID = Intrinsic::x86_avx512_psad_bw_512;
+ IntrinsicNumElts = 64;
+ } else if (ST->hasAVX2() && NumElts >= 32) {
+ IID = Intrinsic::x86_avx2_psad_bw;
+ IntrinsicNumElts = 32;
+ } else {
+ IID = Intrinsic::x86_sse2_psad_bw;
+ IntrinsicNumElts = 16;
+ }
+
+ Function *PSADBWFn = Intrinsic::getDeclaration(SI->getModule(), IID);
+
+ if (NumElts < 16) {
+ // Pad input with zeroes.
+ SmallVector<int, 32> ConcatMask(16);
+ for (unsigned i = 0; i != NumElts; ++i)
+ ConcatMask[i] = i;
+ for (unsigned i = NumElts; i != 16; ++i)
+ ConcatMask[i] = (i % NumElts) + NumElts;
+
+ Value *Zero = Constant::getNullValue(Op0->getType());
+ Op0 = Builder.CreateShuffleVector(Op0, Zero, ConcatMask);
+ Op1 = Builder.CreateShuffleVector(Op1, Zero, ConcatMask);
+ NumElts = 16;
+ }
+
+ // Intrinsics produce vXi64 and need to be casted to vXi32.
+ auto *I32Ty =
+ FixedVectorType::get(Builder.getInt32Ty(), IntrinsicNumElts / 4);
+
+ assert(NumElts % IntrinsicNumElts == 0 && "Unexpected number of elements!");
+ unsigned NumSplits = NumElts / IntrinsicNumElts;
+
+ // First collect the pieces we need.
+ SmallVector<Value *, 4> Ops(NumSplits);
+ for (unsigned i = 0; i != NumSplits; ++i) {
+ SmallVector<int, 64> ExtractMask(IntrinsicNumElts);
+ std::iota(ExtractMask.begin(), ExtractMask.end(), i * IntrinsicNumElts);
+ Value *ExtractOp0 = Builder.CreateShuffleVector(Op0, Op0, ExtractMask);
+ Value *ExtractOp1 = Builder.CreateShuffleVector(Op1, Op0, ExtractMask);
+ Ops[i] = Builder.CreateCall(PSADBWFn, {ExtractOp0, ExtractOp1});
+ Ops[i] = Builder.CreateBitCast(Ops[i], I32Ty);
+ }
+
+ assert(isPowerOf2_32(NumSplits) && "Expected power of 2 splits");
+ unsigned Stages = Log2_32(NumSplits);
+ for (unsigned s = Stages; s > 0; --s) {
+ unsigned NumConcatElts =
+ cast<FixedVectorType>(Ops[0]->getType())->getNumElements() * 2;
+ for (unsigned i = 0; i != 1U << (s - 1); ++i) {
+ SmallVector<int, 64> ConcatMask(NumConcatElts);
+ std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+ Ops[i] = Builder.CreateShuffleVector(Ops[i*2], Ops[i*2+1], ConcatMask);
+ }
+ }
+
+ // At this point the final value should be in Ops[0]. Now we need to adjust
+ // it to the final original type.
+ NumElts = cast<FixedVectorType>(OpTy)->getNumElements();
+ if (NumElts == 2) {
+ // Extract down to 2 elements.
+ Ops[0] = Builder.CreateShuffleVector(Ops[0], Ops[0], ArrayRef<int>{0, 1});
+ } else if (NumElts >= 8) {
+ SmallVector<int, 32> ConcatMask(NumElts);
+ unsigned SubElts =
+ cast<FixedVectorType>(Ops[0]->getType())->getNumElements();
+ for (unsigned i = 0; i != SubElts; ++i)
+ ConcatMask[i] = i;
+ for (unsigned i = SubElts; i != NumElts; ++i)
+ ConcatMask[i] = (i % SubElts) + SubElts;
+
+ Value *Zero = Constant::getNullValue(Ops[0]->getType());
+ Ops[0] = Builder.CreateShuffleVector(Ops[0], Zero, ConcatMask);
+ }
+
+ SI->replaceAllUsesWith(Ops[0]);
+ SI->eraseFromParent();
+
+ return true;
+}
+
+// Walk backwards from the ExtractElementInst and determine if it is the end of
+// a horizontal reduction. Return the input to the reduction if we find one.
+static Value *matchAddReduction(const ExtractElementInst &EE) {
+ // Make sure we're extracting index 0.
+ auto *Index = dyn_cast<ConstantInt>(EE.getIndexOperand());
+ if (!Index || !Index->isNullValue())
+ return nullptr;
+
+ const auto *BO = dyn_cast<BinaryOperator>(EE.getVectorOperand());
+ if (!BO || BO->getOpcode() != Instruction::Add || !BO->hasOneUse())
+ return nullptr;
+
+ unsigned NumElems = cast<FixedVectorType>(BO->getType())->getNumElements();
+ // Ensure the reduction size is a power of 2.
+ if (!isPowerOf2_32(NumElems))
+ return nullptr;
+
+ const Value *Op = BO;
+ unsigned Stages = Log2_32(NumElems);
+ for (unsigned i = 0; i != Stages; ++i) {
+ const auto *BO = dyn_cast<BinaryOperator>(Op);
+ if (!BO || BO->getOpcode() != Instruction::Add)
+ return nullptr;
+
+ // If this isn't the first add, then it should only have 2 users, the
+ // shuffle and another add which we checked in the previous iteration.
+ if (i != 0 && !BO->hasNUses(2))
+ return nullptr;
+
+ Value *LHS = BO->getOperand(0);
+ Value *RHS = BO->getOperand(1);
+
+ auto *Shuffle = dyn_cast<ShuffleVectorInst>(LHS);
+ if (Shuffle) {
+ Op = RHS;
+ } else {
+ Shuffle = dyn_cast<ShuffleVectorInst>(RHS);
+ Op = LHS;
+ }
+
+ // The first operand of the shuffle should be the same as the other operand
+ // of the bin op.
+ if (!Shuffle || Shuffle->getOperand(0) != Op)
+ return nullptr;
+
+ // Verify the shuffle has the expected (at this stage of the pyramid) mask.
+ unsigned MaskEnd = 1 << i;
+ for (unsigned Index = 0; Index < MaskEnd; ++Index)
+ if (Shuffle->getMaskValue(Index) != (int)(MaskEnd + Index))
+ return nullptr;
+ }
+
+ return const_cast<Value *>(Op);
+}
+
+// See if this BO is reachable from this Phi by walking forward through single
+// use BinaryOperators with the same opcode. If we get back to BO, we know we've
+// found a loop and it is safe to step through this Add to find more leaves.
+static bool isReachableFromPHI(PHINode *Phi, BinaryOperator *BO) {
+ // The PHI itself should only have one use.
+ if (!Phi->hasOneUse())
+ return false;
+
+ Instruction *U = cast<Instruction>(*Phi->user_begin());
+ if (U == BO)
+ return true;
+
+ while (U->hasOneUse() && U->getOpcode() == BO->getOpcode())
+ U = cast<Instruction>(*U->user_begin());
+
+ return U == BO;
+}
+
+// Collect all the leaves of the tree of adds that feeds into the horizontal
+// reduction. Root is the Value that is used by the horizontal reduction.
+// We look through single use phis, single use adds, or adds that are used by
+// a phi that forms a loop with the add.
+static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) {
+ SmallPtrSet<Value *, 8> Visited;
+ SmallVector<Value *, 8> Worklist;
+ Worklist.push_back(Root);
+
+ while (!Worklist.empty()) {
+ Value *V = Worklist.pop_back_val();
+ if (!Visited.insert(V).second)
+ continue;
+
+ if (auto *PN = dyn_cast<PHINode>(V)) {
+ // PHI node should have single use unless it is the root node, then it
+ // has 2 uses.
+ if (!PN->hasNUses(PN == Root ? 2 : 1))
+ break;
+
+ // Push incoming values to the worklist.
+ append_range(Worklist, PN->incoming_values());
+
+ continue;
+ }
+
+ if (auto *BO = dyn_cast<BinaryOperator>(V)) {
+ if (BO->getOpcode() == Instruction::Add) {
+ // Simple case. Single use, just push its operands to the worklist.
+ if (BO->hasNUses(BO == Root ? 2 : 1)) {
+ append_range(Worklist, BO->operands());
+ continue;
+ }
+
+ // If there is additional use, make sure it is an unvisited phi that
+ // gets us back to this node.
+ if (BO->hasNUses(BO == Root ? 3 : 2)) {
+ PHINode *PN = nullptr;
+ for (auto *U : Root->users())
+ if (auto *P = dyn_cast<PHINode>(U))
+ if (!Visited.count(P))
+ PN = P;
+
+ // If we didn't find a 2-input PHI then this isn't a case we can
+ // handle.
+ if (!PN || PN->getNumIncomingValues() != 2)
+ continue;
+
+ // Walk forward from this phi to see if it reaches back to this add.
+ if (!isReachableFromPHI(PN, BO))
+ continue;
+
+ // The phi forms a loop with this Add, push its operands.
+ append_range(Worklist, BO->operands());
+ }
+ }
+ }
+
+ // Not an add or phi, make it a leaf.
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ if (!V->hasNUses(I == Root ? 2 : 1))
+ continue;
+
+ // Add this as a leaf.
+ Leaves.push_back(I);
+ }
+ }
+}
+
+bool X86PartialReduction::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ auto &TM = TPC->getTM<X86TargetMachine>();
+ ST = TM.getSubtargetImpl(F);
+
+ DL = &F.getParent()->getDataLayout();
+
+ bool MadeChange = false;
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ auto *EE = dyn_cast<ExtractElementInst>(&I);
+ if (!EE)
+ continue;
+
+ // First find a reduction tree.
+ // FIXME: Do we need to handle other opcodes than Add?
+ Value *Root = matchAddReduction(*EE);
+ if (!Root)
+ continue;
+
+ SmallVector<Instruction *, 8> Leaves;
+ collectLeaves(Root, Leaves);
+
+ for (Instruction *I : Leaves) {
+ if (tryMAddReplacement(I)) {
+ MadeChange = true;
+ continue;
+ }
+
+ // Don't do SAD matching on the root node. SelectionDAG already
+ // has support for that and currently generates better code.
+ if (I != Root && trySADReplacement(I))
+ MadeChange = true;
+ }
+ }
+ }
+
+ return MadeChange;
+}
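
tryMAddReplacement above rewrites a widening multiply feeding an add reduction into an even/odd shuffle plus add, which SelectionDAG can then match to PMADDWD. A standalone scalar model of the lane arithmetic it sets up, using small made-up inputs:

    // Sketch only: scalar model of the even/odd split that maps to PMADDWD.
    #include <array>
    #include <cstdint>
    #include <cstdio>

    int main() {
      std::array<std::int16_t, 8> A = {1, 2, 3, 4, 5, 6, 7, 8};
      std::array<std::int16_t, 8> B = {1, 1, 2, 2, 3, 3, 4, 4};

      // Widened products, as the IR sees them after sext/zext + mul.
      std::array<std::int32_t, 8> Mul;
      for (int i = 0; i < 8; ++i)
        Mul[i] = std::int32_t(A[i]) * std::int32_t(B[i]);

      // Even/odd shuffle + add: each output lane is one PMADDWD 32-bit lane.
      std::array<std::int32_t, 4> MAdd;
      for (int i = 0; i < 4; ++i)
        MAdd[i] = Mul[2 * i] + Mul[2 * i + 1];

      for (std::int32_t V : MAdd)
        std::printf("%d ", V); // 3 14 33 60
      std::printf("\n");
      return 0;
    }
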
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86PfmCounters.td b/contrib/llvm-project/llvm/lib/Target/X86/X86PfmCounters.td
new file mode 100644
index 000000000000..833013fb69f3
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86PfmCounters.td
@@ -0,0 +1,235 @@
+//===-- X86PfmCounters.td - X86 Hardware Counters ----------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the available hardware counters for various subtargets.
+//
+//===----------------------------------------------------------------------===//
+
+def UnhaltedCoreCyclesPfmCounter : PfmCounter<"unhalted_core_cycles">;
+def UopsIssuedPfmCounter : PfmCounter<"uops_issued:any">;
+
+// No default counters on X86.
+def DefaultPfmCounters : ProcPfmCounters {}
+def : PfmCountersDefaultBinding<DefaultPfmCounters>;
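+
+// Each ProcPfmCounters record below names the libpfm4 events used to measure
+// a given CPU (for example by llvm-exegesis), and each PfmCountersBinding
+// ties a CPU name to such a record. As a purely illustrative sketch, a new
+// CPU reusing the Core counters would be bound with:
+//   def : PfmCountersBinding<"my-new-cpu", CorePfmCounters>;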
+
+// Intel X86 Counters.
+def PentiumPfmCounters : ProcPfmCounters {
+ let CycleCounter = PfmCounter<"cpu_clk_unhalted">;
+ let UopsCounter = PfmCounter<"uops_retired">;
+}
+def : PfmCountersBinding<"pentiumpro", PentiumPfmCounters>;
+def : PfmCountersBinding<"pentium2", PentiumPfmCounters>;
+def : PfmCountersBinding<"pentium3", PentiumPfmCounters>;
+def : PfmCountersBinding<"pentium3m", PentiumPfmCounters>;
+def : PfmCountersBinding<"pentium-m", PentiumPfmCounters>;
+
+def CorePfmCounters : ProcPfmCounters {
+ let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+ let UopsCounter = PfmCounter<"uops_retired:any">;
+}
+def : PfmCountersBinding<"yonah", CorePfmCounters>;
+def : PfmCountersBinding<"prescott", CorePfmCounters>;
+def : PfmCountersBinding<"core2", CorePfmCounters>;
+def : PfmCountersBinding<"penryn", CorePfmCounters>;
+def : PfmCountersBinding<"nehalem", CorePfmCounters>;
+def : PfmCountersBinding<"corei7", CorePfmCounters>;
+def : PfmCountersBinding<"westmere", CorePfmCounters>;
+
+def AtomPfmCounters : ProcPfmCounters {
+ let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+ let UopsCounter = PfmCounter<"uops_retired:any">;
+}
+def : PfmCountersBinding<"bonnell", AtomPfmCounters>;
+def : PfmCountersBinding<"atom", AtomPfmCounters>;
+
+def SLMPfmCounters : ProcPfmCounters {
+ let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+ let UopsCounter = PfmCounter<"uops_retired:any">;
+}
+def : PfmCountersBinding<"silvermont", SLMPfmCounters>;
+def : PfmCountersBinding<"goldmont", SLMPfmCounters>;
+def : PfmCountersBinding<"goldmont-plus", SLMPfmCounters>;
+def : PfmCountersBinding<"tremont", SLMPfmCounters>;
+
+def KnightPfmCounters : ProcPfmCounters {
+ let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+ let UopsCounter = PfmCounter<"uops_retired:all">;
+}
+def : PfmCountersBinding<"knl", KnightPfmCounters>;
+def : PfmCountersBinding<"knm", KnightPfmCounters>;
+
+def SandyBridgePfmCounters : ProcPfmCounters {
+ let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+ let UopsCounter = UopsIssuedPfmCounter;
+ let IssueCounters = [
+ PfmIssueCounter<"SBPort0", "uops_dispatched_port:port_0">,
+ PfmIssueCounter<"SBPort1", "uops_dispatched_port:port_1">,
+ PfmIssueCounter<"SBPort23", "uops_dispatched_port:port_2 + uops_dispatched_port:port_3">,
+ PfmIssueCounter<"SBPort4", "uops_dispatched_port:port_4">,
+ PfmIssueCounter<"SBPort5", "uops_dispatched_port:port_5">
+ ];
+}
+def : PfmCountersBinding<"sandybridge", SandyBridgePfmCounters>;
+def : PfmCountersBinding<"ivybridge", SandyBridgePfmCounters>;
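+
+// Note (explanatory comment, not from upstream): an issue counter string such
+// as "uops_dispatched_port:port_2 + uops_dispatched_port:port_3" is a libpfm4
+// event expression; the '+' means the two event counts are added together so
+// that one modelled resource (here the combined load ports 2 and 3) maps to a
+// single measurement.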
+
+def HaswellPfmCounters : ProcPfmCounters {
+ let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+ let UopsCounter = UopsIssuedPfmCounter;
+ let IssueCounters = [
+ PfmIssueCounter<"HWPort0", "uops_executed_port:port_0">,
+ PfmIssueCounter<"HWPort1", "uops_executed_port:port_1">,
+ PfmIssueCounter<"HWPort2", "uops_executed_port:port_2">,
+ PfmIssueCounter<"HWPort3", "uops_executed_port:port_3">,
+ PfmIssueCounter<"HWPort4", "uops_executed_port:port_4">,
+ PfmIssueCounter<"HWPort5", "uops_executed_port:port_5">,
+ PfmIssueCounter<"HWPort6", "uops_executed_port:port_6">,
+ PfmIssueCounter<"HWPort7", "uops_executed_port:port_7">
+ ];
+}
+def : PfmCountersBinding<"haswell", HaswellPfmCounters>;
+
+def BroadwellPfmCounters : ProcPfmCounters {
+ let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+ let UopsCounter = UopsIssuedPfmCounter;
+ let IssueCounters = [
+ PfmIssueCounter<"BWPort0", "uops_executed_port:port_0">,
+ PfmIssueCounter<"BWPort1", "uops_executed_port:port_1">,
+ PfmIssueCounter<"BWPort2", "uops_executed_port:port_2">,
+ PfmIssueCounter<"BWPort3", "uops_executed_port:port_3">,
+ PfmIssueCounter<"BWPort4", "uops_executed_port:port_4">,
+ PfmIssueCounter<"BWPort5", "uops_executed_port:port_5">,
+ PfmIssueCounter<"BWPort6", "uops_executed_port:port_6">,
+ PfmIssueCounter<"BWPort7", "uops_executed_port:port_7">
+ ];
+}
+def : PfmCountersBinding<"broadwell", BroadwellPfmCounters>;
+
+def SkylakeClientPfmCounters : ProcPfmCounters {
+ let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+ let UopsCounter = UopsIssuedPfmCounter;
+ let IssueCounters = [
+ PfmIssueCounter<"SKLPort0", "uops_dispatched_port:port_0">,
+ PfmIssueCounter<"SKLPort1", "uops_dispatched_port:port_1">,
+ PfmIssueCounter<"SKLPort2", "uops_dispatched_port:port_2">,
+ PfmIssueCounter<"SKLPort3", "uops_dispatched_port:port_3">,
+ PfmIssueCounter<"SKLPort4", "uops_dispatched_port:port_4">,
+ PfmIssueCounter<"SKLPort5", "uops_dispatched_port:port_5">,
+ PfmIssueCounter<"SKLPort6", "uops_dispatched_port:port_6">,
+ PfmIssueCounter<"SKLPort7", "uops_dispatched_port:port_7">
+ ];
+}
+def : PfmCountersBinding<"skylake", SkylakeClientPfmCounters>;
+
+def SkylakeServerPfmCounters : ProcPfmCounters {
+ let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+ let UopsCounter = UopsIssuedPfmCounter;
+ let IssueCounters = [
+ PfmIssueCounter<"SKXPort0", "uops_dispatched_port:port_0">,
+ PfmIssueCounter<"SKXPort1", "uops_dispatched_port:port_1">,
+ PfmIssueCounter<"SKXPort2", "uops_dispatched_port:port_2">,
+ PfmIssueCounter<"SKXPort3", "uops_dispatched_port:port_3">,
+ PfmIssueCounter<"SKXPort4", "uops_dispatched_port:port_4">,
+ PfmIssueCounter<"SKXPort5", "uops_dispatched_port:port_5">,
+ PfmIssueCounter<"SKXPort6", "uops_dispatched_port:port_6">,
+ PfmIssueCounter<"SKXPort7", "uops_dispatched_port:port_7">
+ ];
+}
+def : PfmCountersBinding<"skylake-avx512", SkylakeServerPfmCounters>;
+def : PfmCountersBinding<"cascadelake", SkylakeServerPfmCounters>;
+def : PfmCountersBinding<"cannonlake", SkylakeServerPfmCounters>;
+def : PfmCountersBinding<"icelake-client", SkylakeServerPfmCounters>;
+def : PfmCountersBinding<"icelake-server", SkylakeServerPfmCounters>;
+
+// AMD X86 Counters.
+// Set basic counters for AMD cpus that we know libpfm4 supports.
+def DefaultAMDPfmCounters : ProcPfmCounters {
+ let CycleCounter = PfmCounter<"cpu_clk_unhalted">;
+ let UopsCounter = PfmCounter<"retired_uops">;
+}
+def : PfmCountersBinding<"athlon", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"athlon-tbird", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"athlon-4", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"athlon-xp", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"athlon-mp", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"k8", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"opteron", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"athlon64", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"athlon-fx", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"k8-sse3", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"opteron-sse3", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"athlon64-sse3", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"amdfam10", DefaultAMDPfmCounters>;
+def : PfmCountersBinding<"barcelona", DefaultAMDPfmCounters>;
+
+def BdVer2PfmCounters : ProcPfmCounters {
+ let CycleCounter = PfmCounter<"cpu_clk_unhalted">;
+ let UopsCounter = PfmCounter<"retired_uops">;
+ let IssueCounters = [
+ PfmIssueCounter<"PdFPU0", "dispatched_fpu_ops:ops_pipe0 + dispatched_fpu_ops:ops_dual_pipe0">,
+ PfmIssueCounter<"PdFPU1", "dispatched_fpu_ops:ops_pipe1 + dispatched_fpu_ops:ops_dual_pipe1">,
+ PfmIssueCounter<"PdFPU2", "dispatched_fpu_ops:ops_pipe2 + dispatched_fpu_ops:ops_dual_pipe2">,
+ PfmIssueCounter<"PdFPU3", "dispatched_fpu_ops:ops_pipe3 + dispatched_fpu_ops:ops_dual_pipe3">
+ ];
+}
+def : PfmCountersBinding<"bdver1", BdVer2PfmCounters>;
+def : PfmCountersBinding<"bdver2", BdVer2PfmCounters>;
+
+def BdVer3PfmCounters : ProcPfmCounters {
+ let CycleCounter = PfmCounter<"cpu_clk_unhalted">;
+ let UopsCounter = PfmCounter<"retired_uops">;
+ let IssueCounters = [
+ PfmIssueCounter<"SrFPU0", "dispatched_fpu_ops:ops_pipe0 + dispatched_fpu_ops:ops_dual_pipe0">,
+ PfmIssueCounter<"SrFPU1", "dispatched_fpu_ops:ops_pipe1 + dispatched_fpu_ops:ops_dual_pipe1">,
+ PfmIssueCounter<"SrFPU2", "dispatched_fpu_ops:ops_pipe2 + dispatched_fpu_ops:ops_dual_pipe2">
+ ];
+}
+def : PfmCountersBinding<"bdver3", BdVer3PfmCounters>;
+def : PfmCountersBinding<"bdver4", BdVer3PfmCounters>;
+
+def BtVer1PfmCounters : ProcPfmCounters {
+ let CycleCounter = PfmCounter<"cpu_clk_unhalted">;
+ let UopsCounter = PfmCounter<"retired_uops">;
+ let IssueCounters = [
+ PfmIssueCounter<"BtFPU0", "dispatched_fpu:pipe0">,
+ PfmIssueCounter<"BtFPU1", "dispatched_fpu:pipe1">
+ ];
+}
+def : PfmCountersBinding<"btver1", BtVer1PfmCounters>;
+
+def BtVer2PfmCounters : ProcPfmCounters {
+ let CycleCounter = PfmCounter<"cpu_clk_unhalted">;
+ let UopsCounter = PfmCounter<"retired_uops">;
+ let IssueCounters = [
+ PfmIssueCounter<"JFPU0", "dispatched_fpu:pipe0">,
+ PfmIssueCounter<"JFPU1", "dispatched_fpu:pipe1">
+ ];
+}
+def : PfmCountersBinding<"btver2", BtVer2PfmCounters>;
+
+def ZnVer1PfmCounters : ProcPfmCounters {
+ let CycleCounter = PfmCounter<"cycles_not_in_halt">;
+ let UopsCounter = PfmCounter<"retired_uops">;
+ let IssueCounters = [
+ PfmIssueCounter<"ZnFPU0", "fpu_pipe_assignment:total0">,
+ PfmIssueCounter<"ZnFPU1", "fpu_pipe_assignment:total1">,
+ PfmIssueCounter<"ZnFPU2", "fpu_pipe_assignment:total2">,
+ PfmIssueCounter<"ZnFPU3", "fpu_pipe_assignment:total3">,
+ PfmIssueCounter<"ZnDivider", "div_op_count">
+ ];
+}
+def : PfmCountersBinding<"znver1", ZnVer1PfmCounters>;
+
+def ZnVer2PfmCounters : ProcPfmCounters {
+ let CycleCounter = PfmCounter<"cycles_not_in_halt">;
+ let UopsCounter = PfmCounter<"retired_uops">;
+ let IssueCounters = [
+ PfmIssueCounter<"Zn2AGU", "ls_dispatch:ld_dispatch + ls_dispatch:store_dispatch">,
+ PfmIssueCounter<"Zn2Divider", "div_op_count">
+ ];
+}
+def : PfmCountersBinding<"znver2", ZnVer2PfmCounters>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86PreTileConfig.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86PreTileConfig.cpp
new file mode 100644
index 000000000000..05ee6c6c8384
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -0,0 +1,265 @@
+//===-- X86PreTileConfig.cpp - Tile Register Configure --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Pass to pre-configure the shape of AMX registers.
+/// AMX registers need to be configured before use. The shape of an AMX
+/// register is encoded in the 1st and 2nd machine operands of the AMX pseudo
+/// instructions. The pldtilecfg instruction configures the tile registers and
+/// must dominate all AMX instructions. It produces a virtual cfg register,
+/// and that cfg register is used by all AMX instructions.
+/// This pass finds the common dominator of all AMX instructions and inserts
+/// the pldtilecfg instruction there. In addition, the cfg register that
+/// pldtilecfg produces is appended as the last operand of each AMX
+/// instruction. We use this scheme to model the def-use relationship between
+/// the AMX config instruction and the other AMX instructions. Below is an
+/// example.
+///
+/// ----B1----
+/// / \
+/// / \
+/// B2 B3
+/// %1:tile = PTILELOADDV %2:tile = PTILELOADDV
+///
+/// is transformed to
+///
+/// B1
+/// %25:tilecfg = PLDTILECFG
+/// / \
+/// / \
+/// %1:tile = PTILELOADDV %25 %2:tile = PTILELOADDV %25
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TileShapeInfo.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "tile-pre-config"
+
+namespace {
+
+class X86PreTileConfig : public MachineFunctionPass {
+ // context
+ MachineFunction *MF = nullptr;
+ const X86Subtarget *ST = nullptr;
+ const TargetRegisterInfo *TRI;
+ const TargetInstrInfo *TII;
+ MachineDominatorTree *DomTree = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+
+ MachineInstr *getTileConfigPoint();
+
+public:
+ X86PreTileConfig() : MachineFunctionPass(ID) {}
+
+ /// Return the pass name.
+ StringRef getPassName() const override {
+ return "Tile Register Pre-configure";
+ }
+
+ /// X86PreTileConfig analysis usage.
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ /// Perform register allocation.
+ bool runOnMachineFunction(MachineFunction &mf) override;
+
+ static char ID;
+};
+
+} // end anonymous namespace
+
+char X86PreTileConfig::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig",
+ "Tile Register Configure", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig",
+ "Tile Register Configure", false, false)
+
+void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
+ const TargetInstrInfo *TII,
+ MachineRegisterInfo *MRI,
+ const X86Subtarget *ST) {
+ auto *MBB = MI->getParent();
+
+ // FIXME: AMX should assume AVX512 enabled.
+ if (ST->hasAVX512()) {
+ // Zero stack slot.
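+    // The 64-byte tile configuration area is cleared up front so that any
+    // bytes not explicitly written later read as zero when ldtilecfg loads
+    // this slot.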
+ Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass);
+ BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VPXORDZrr), Zmm)
+ .addReg(Zmm, RegState::Undef)
+ .addReg(Zmm, RegState::Undef);
+ addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSZmr)),
+ FrameIdx)
+ .addReg(Zmm);
+ }
+
+  // Build the pseudo ldtilecfg instruction.
+ Register VReg = MRI->createVirtualRegister(&X86::TILECFGRegClass);
+
+ addFrameReference(
+ BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PLDTILECFG), VReg), FrameIdx);
+
+ return VReg;
+}
+
+static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Unexpected machine instruction on tile");
+ case X86::PTILELOADDV:
+ case X86::PTDPBSSDV:
+ case X86::PTILEZEROV:
+ MachineOperand &MO1 = const_cast<MachineOperand &>(MI.getOperand(1));
+ MachineOperand &MO2 = const_cast<MachineOperand &>(MI.getOperand(2));
+ ShapeT Shape(&MO1, &MO2, MRI);
+ return Shape;
+ }
+}
+
+MachineInstr *X86PreTileConfig::getTileConfigPoint() {
+ DenseMap<Register, ShapeT> PhysShapeInfo;
+ MachineBasicBlock *MBB = nullptr;
+ DenseSet<const MachineInstr *> MIs;
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ Register VirtReg = Register::index2VirtReg(i);
+ if (MRI->reg_nodbg_empty(VirtReg))
+ continue;
+ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+ if (RC.getID() != X86::TILERegClassID)
+ continue;
+
+    // Find the common dominator of all MIs that define a tile register.
+ for (const MachineOperand &MO : MRI->def_operands(VirtReg)) {
+ if (MO.isUndef())
+ continue;
+ const auto *MI = MO.getParent();
+      // Skip PHI or IMPLICIT_DEF instructions; there must be an input tile
+      // before a PHI instruction.
+ if (MI->isTransient())
+ continue;
+ if (!MBB)
+ MBB = const_cast<MachineBasicBlock *>(MI->getParent());
+ MBB = DomTree->findNearestCommonDominator(
+ MBB, const_cast<MachineBasicBlock *>(MI->getParent()));
+
+ // Collect the instructions that define shape.
+ ShapeT Shape = getShape(*MI, MRI);
+ std::array<MachineOperand *, 2> ShapeMOs = {Shape.getRow(),
+ Shape.getCol()};
+ for (auto *ShapeMO : ShapeMOs) {
+ Register ShapeReg = ShapeMO->getReg();
+ for (const MachineOperand &MO : MRI->def_operands(ShapeReg)) {
+ const auto *ShapeMI = MO.getParent();
+ MIs.insert(ShapeMI);
+ }
+ }
+ }
+ }
+ if (!MBB)
+ return nullptr;
+  // This pass runs before PHI elimination, so the machine function is
+  // still in SSA form.
+ assert(MRI->isSSA() && "Not SSA form in pre-tile config");
+ // Shape def should dominate tile config MBB.
+ // def s s1 s2
+ // / \ \ /
+ // / \ \ /
+ // conf s3=phi(s1,s2)
+ // |
+ // c
+ //
+ for (const auto *MI : MIs) {
+ const MachineBasicBlock *ShapeMBB = MI->getParent();
+ if (DomTree->dominates(ShapeMBB, MBB))
+ continue;
+ if (MI->isMoveImmediate())
+ continue;
+ report_fatal_error(MF->getName() + ": Failed to config tile register, "
+ "please define the shape earlier");
+ }
+
+  // ldtilecfg should be inserted after the MI that defines the shape.
+ MachineBasicBlock::reverse_instr_iterator I, E;
+ for (I = MBB->instr_rbegin(), E = MBB->instr_rend(); I != E; ++I) {
+ auto *MI = &*I;
+ if (MIs.count(MI) && (!MI->isMoveImmediate()))
+ break;
+ }
+ MachineBasicBlock::iterator MII;
+ if (I == E)
+ MII = MBB->getFirstNonPHI();
+ else {
+ MII = MachineBasicBlock::iterator(&*I);
+ MII++;
+ }
+ return &*MII;
+}
+
+static void addTileCFGUse(MachineFunction &MF, Register CFG) {
+ for (MachineBasicBlock &MBB : MF) {
+
+ // Traverse the basic block.
+ for (MachineInstr &MI : MBB) {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ default:
+ break;
+ case X86::PTILELOADDV:
+ case X86::PTILESTOREDV:
+ case X86::PTDPBSSDV:
+ case X86::PTILEZEROV:
+ unsigned NumOperands = MI.getNumOperands();
+ MI.RemoveOperand(NumOperands - 1);
+ MI.addOperand(MF, MachineOperand::CreateReg(CFG, false));
+ break;
+ }
+ }
+ }
+}
+
+bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) {
+ MF = &mf;
+ MRI = &mf.getRegInfo();
+ ST = &mf.getSubtarget<X86Subtarget>();
+ TRI = ST->getRegisterInfo();
+ TII = mf.getSubtarget().getInstrInfo();
+ DomTree = &getAnalysis<MachineDominatorTree>();
+
+ MachineInstr *MI = getTileConfigPoint();
+ if (!MI)
+ return false;
+ unsigned Size = ST->getTileConfigSize();
+ Align Alignment = ST->getTileConfigAlignment();
+ int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false);
+ Register CFG = buildConfigMI(MI, SS, TII, MRI, ST);
+ addTileCFGUse(mf, CFG);
+ return true;
+}
+
+FunctionPass *llvm::createX86PreTileConfigPass() {
+ return new X86PreTileConfig();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterBankInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterBankInfo.cpp
new file mode 100644
index 000000000000..9c076d2d6769
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterBankInfo.cpp
@@ -0,0 +1,315 @@
+//===- X86RegisterBankInfo.cpp -----------------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the RegisterBankInfo class for X86.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "X86RegisterBankInfo.h"
+#include "X86InstrInfo.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+
+#define GET_TARGET_REGBANK_IMPL
+#include "X86GenRegisterBank.inc"
+
+using namespace llvm;
+// This file will be TableGen'ed at some point.
+#define GET_TARGET_REGBANK_INFO_IMPL
+#include "X86GenRegisterBankInfo.def"
+
+X86RegisterBankInfo::X86RegisterBankInfo(const TargetRegisterInfo &TRI)
+ : X86GenRegisterBankInfo() {
+
+ // validate RegBank initialization.
+ const RegisterBank &RBGPR = getRegBank(X86::GPRRegBankID);
+ (void)RBGPR;
+  assert(&X86::GPRRegBank == &RBGPR && "Incorrect RegBanks initialization.");
+
+ // The GPR register bank is fully defined by all the registers in
+ // GR64 + its subclasses.
+ assert(RBGPR.covers(*TRI.getRegClass(X86::GR64RegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit");
+}
+
+const RegisterBank &
+X86RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
+ LLT) const {
+
+ if (X86::GR8RegClass.hasSubClassEq(&RC) ||
+ X86::GR16RegClass.hasSubClassEq(&RC) ||
+ X86::GR32RegClass.hasSubClassEq(&RC) ||
+ X86::GR64RegClass.hasSubClassEq(&RC) ||
+ X86::LOW32_ADDR_ACCESSRegClass.hasSubClassEq(&RC) ||
+ X86::LOW32_ADDR_ACCESS_RBPRegClass.hasSubClassEq(&RC))
+ return getRegBank(X86::GPRRegBankID);
+
+ if (X86::FR32XRegClass.hasSubClassEq(&RC) ||
+ X86::FR64XRegClass.hasSubClassEq(&RC) ||
+ X86::VR128XRegClass.hasSubClassEq(&RC) ||
+ X86::VR256XRegClass.hasSubClassEq(&RC) ||
+ X86::VR512RegClass.hasSubClassEq(&RC))
+ return getRegBank(X86::VECRRegBankID);
+
+ llvm_unreachable("Unsupported register kind yet.");
+}
+
+X86GenRegisterBankInfo::PartialMappingIdx
+X86GenRegisterBankInfo::getPartialMappingIdx(const LLT &Ty, bool isFP) {
+ if ((Ty.isScalar() && !isFP) || Ty.isPointer()) {
+ switch (Ty.getSizeInBits()) {
+ case 1:
+ case 8:
+ return PMI_GPR8;
+ case 16:
+ return PMI_GPR16;
+ case 32:
+ return PMI_GPR32;
+ case 64:
+ return PMI_GPR64;
+ case 128:
+ return PMI_VEC128;
+ default:
+ llvm_unreachable("Unsupported register size.");
+ }
+ } else if (Ty.isScalar()) {
+ switch (Ty.getSizeInBits()) {
+ case 32:
+ return PMI_FP32;
+ case 64:
+ return PMI_FP64;
+ case 128:
+ return PMI_VEC128;
+ default:
+ llvm_unreachable("Unsupported register size.");
+ }
+ } else {
+ switch (Ty.getSizeInBits()) {
+ case 128:
+ return PMI_VEC128;
+ case 256:
+ return PMI_VEC256;
+ case 512:
+ return PMI_VEC512;
+ default:
+ llvm_unreachable("Unsupported register size.");
+ }
+ }
+
+ return PMI_None;
+}
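+
+// Illustrative examples of the mapping above (explanatory comment only):
+//   s32 with isFP == false -> PMI_GPR32, s32 with isFP == true -> PMI_FP32,
+//   a 64-bit pointer -> PMI_GPR64, a 128-bit vector -> PMI_VEC128 and a
+//   256-bit vector -> PMI_VEC256.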
+
+void X86RegisterBankInfo::getInstrPartialMappingIdxs(
+ const MachineInstr &MI, const MachineRegisterInfo &MRI, const bool isFP,
+ SmallVectorImpl<PartialMappingIdx> &OpRegBankIdx) {
+
+ unsigned NumOperands = MI.getNumOperands();
+ for (unsigned Idx = 0; Idx < NumOperands; ++Idx) {
+ auto &MO = MI.getOperand(Idx);
+ if (!MO.isReg())
+ OpRegBankIdx[Idx] = PMI_None;
+ else
+ OpRegBankIdx[Idx] = getPartialMappingIdx(MRI.getType(MO.getReg()), isFP);
+ }
+}
+
+bool X86RegisterBankInfo::getInstrValueMapping(
+ const MachineInstr &MI,
+ const SmallVectorImpl<PartialMappingIdx> &OpRegBankIdx,
+ SmallVectorImpl<const ValueMapping *> &OpdsMapping) {
+
+ unsigned NumOperands = MI.getNumOperands();
+ for (unsigned Idx = 0; Idx < NumOperands; ++Idx) {
+ if (!MI.getOperand(Idx).isReg())
+ continue;
+
+ auto Mapping = getValueMapping(OpRegBankIdx[Idx], 1);
+ if (!Mapping->isValid())
+ return false;
+
+ OpdsMapping[Idx] = Mapping;
+ }
+ return true;
+}
+
+const RegisterBankInfo::InstructionMapping &
+X86RegisterBankInfo::getSameOperandsMapping(const MachineInstr &MI,
+ bool isFP) const {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ unsigned NumOperands = MI.getNumOperands();
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+
+ if (NumOperands != 3 || (Ty != MRI.getType(MI.getOperand(1).getReg())) ||
+ (Ty != MRI.getType(MI.getOperand(2).getReg())))
+ llvm_unreachable("Unsupported operand mapping yet.");
+
+ auto Mapping = getValueMapping(getPartialMappingIdx(Ty, isFP), 3);
+ return getInstructionMapping(DefaultMappingID, 1, Mapping, NumOperands);
+}
+
+const RegisterBankInfo::InstructionMapping &
+X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ unsigned Opc = MI.getOpcode();
+
+ // Try the default logic for non-generic instructions that are either copies
+ // or already have some operands assigned to banks.
+ if (!isPreISelGenericOpcode(Opc) || Opc == TargetOpcode::G_PHI) {
+ const InstructionMapping &Mapping = getInstrMappingImpl(MI);
+ if (Mapping.isValid())
+ return Mapping;
+ }
+
+ switch (Opc) {
+ case TargetOpcode::G_ADD:
+ case TargetOpcode::G_SUB:
+ case TargetOpcode::G_MUL:
+ return getSameOperandsMapping(MI, false);
+ case TargetOpcode::G_FADD:
+ case TargetOpcode::G_FSUB:
+ case TargetOpcode::G_FMUL:
+ case TargetOpcode::G_FDIV:
+ return getSameOperandsMapping(MI, true);
+ case TargetOpcode::G_SHL:
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_ASHR: {
+ unsigned NumOperands = MI.getNumOperands();
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+
+ auto Mapping = getValueMapping(getPartialMappingIdx(Ty, false), 3);
+ return getInstructionMapping(DefaultMappingID, 1, Mapping, NumOperands);
+
+ }
+ default:
+ break;
+ }
+
+ unsigned NumOperands = MI.getNumOperands();
+ SmallVector<PartialMappingIdx, 4> OpRegBankIdx(NumOperands);
+
+ switch (Opc) {
+ case TargetOpcode::G_FPEXT:
+ case TargetOpcode::G_FPTRUNC:
+ case TargetOpcode::G_FCONSTANT:
+    // Instructions having only floating-point operands (all scalars in VECRReg).
+ getInstrPartialMappingIdxs(MI, MRI, /* isFP */ true, OpRegBankIdx);
+ break;
+ case TargetOpcode::G_SITOFP:
+ case TargetOpcode::G_FPTOSI: {
+ // Some of the floating-point instructions have mixed GPR and FP operands:
+ // fine-tune the computed mapping.
+ auto &Op0 = MI.getOperand(0);
+ auto &Op1 = MI.getOperand(1);
+ const LLT Ty0 = MRI.getType(Op0.getReg());
+ const LLT Ty1 = MRI.getType(Op1.getReg());
+
+ bool FirstArgIsFP = Opc == TargetOpcode::G_SITOFP;
+ bool SecondArgIsFP = Opc == TargetOpcode::G_FPTOSI;
+ OpRegBankIdx[0] = getPartialMappingIdx(Ty0, /* isFP */ FirstArgIsFP);
+ OpRegBankIdx[1] = getPartialMappingIdx(Ty1, /* isFP */ SecondArgIsFP);
+ break;
+ }
+ case TargetOpcode::G_FCMP: {
+ LLT Ty1 = MRI.getType(MI.getOperand(2).getReg());
+ LLT Ty2 = MRI.getType(MI.getOperand(3).getReg());
+ (void)Ty2;
+ assert(Ty1.getSizeInBits() == Ty2.getSizeInBits() &&
+ "Mismatched operand sizes for G_FCMP");
+
+ unsigned Size = Ty1.getSizeInBits();
+ (void)Size;
+ assert((Size == 32 || Size == 64) && "Unsupported size for G_FCMP");
+
+ auto FpRegBank = getPartialMappingIdx(Ty1, /* isFP */ true);
+ OpRegBankIdx = {PMI_GPR8,
+ /* Predicate */ PMI_None, FpRegBank, FpRegBank};
+ break;
+ }
+ case TargetOpcode::G_TRUNC:
+ case TargetOpcode::G_ANYEXT: {
+ auto &Op0 = MI.getOperand(0);
+ auto &Op1 = MI.getOperand(1);
+ const LLT Ty0 = MRI.getType(Op0.getReg());
+ const LLT Ty1 = MRI.getType(Op1.getReg());
+
+ bool isFPTrunc = (Ty0.getSizeInBits() == 32 || Ty0.getSizeInBits() == 64) &&
+ Ty1.getSizeInBits() == 128 && Opc == TargetOpcode::G_TRUNC;
+ bool isFPAnyExt =
+ Ty0.getSizeInBits() == 128 &&
+ (Ty1.getSizeInBits() == 32 || Ty1.getSizeInBits() == 64) &&
+ Opc == TargetOpcode::G_ANYEXT;
+
+ getInstrPartialMappingIdxs(MI, MRI, /* isFP */ isFPTrunc || isFPAnyExt,
+ OpRegBankIdx);
+ } break;
+ default:
+ // Track the bank of each register, use NotFP mapping (all scalars in GPRs)
+ getInstrPartialMappingIdxs(MI, MRI, /* isFP */ false, OpRegBankIdx);
+ break;
+ }
+
+ // Finally construct the computed mapping.
+ SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands);
+ if (!getInstrValueMapping(MI, OpRegBankIdx, OpdsMapping))
+ return getInvalidInstructionMapping();
+
+ return getInstructionMapping(DefaultMappingID, /* Cost */ 1,
+ getOperandsMapping(OpdsMapping), NumOperands);
+}
+
+void X86RegisterBankInfo::applyMappingImpl(
+ const OperandsMapper &OpdMapper) const {
+ return applyDefaultMapping(OpdMapper);
+}
+
+RegisterBankInfo::InstructionMappings
+X86RegisterBankInfo::getInstrAlternativeMappings(const MachineInstr &MI) const {
+
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_STORE:
+ case TargetOpcode::G_IMPLICIT_DEF: {
+    // We are going to try to map 32/64-bit values to PMI_FP32/PMI_FP64.
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
+ if (Size != 32 && Size != 64)
+ break;
+
+ unsigned NumOperands = MI.getNumOperands();
+
+ // Track the bank of each register, use FP mapping (all scalars in VEC)
+ SmallVector<PartialMappingIdx, 4> OpRegBankIdx(NumOperands);
+ getInstrPartialMappingIdxs(MI, MRI, /* isFP */ true, OpRegBankIdx);
+
+ // Finally construct the computed mapping.
+ SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands);
+ if (!getInstrValueMapping(MI, OpRegBankIdx, OpdsMapping))
+ break;
+
+ const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
+ /*ID*/ 1, /*Cost*/ 1, getOperandsMapping(OpdsMapping), NumOperands);
+ InstructionMappings AltMappings;
+ AltMappings.push_back(&Mapping);
+ return AltMappings;
+ }
+ default:
+ break;
+ }
+ return RegisterBankInfo::getInstrAlternativeMappings(MI);
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterBankInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterBankInfo.h
new file mode 100644
index 000000000000..d5afd2cae761
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterBankInfo.h
@@ -0,0 +1,81 @@
+//===- X86RegisterBankInfo ---------------------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the RegisterBankInfo class for X86.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86REGISTERBANKINFO_H
+#define LLVM_LIB_TARGET_X86_X86REGISTERBANKINFO_H
+
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+
+#define GET_REGBANK_DECLARATIONS
+#include "X86GenRegisterBank.inc"
+
+namespace llvm {
+
+class LLT;
+
+class X86GenRegisterBankInfo : public RegisterBankInfo {
+protected:
+#define GET_TARGET_REGBANK_CLASS
+#include "X86GenRegisterBank.inc"
+#define GET_TARGET_REGBANK_INFO_CLASS
+#include "X86GenRegisterBankInfo.def"
+
+ static RegisterBankInfo::PartialMapping PartMappings[];
+ static RegisterBankInfo::ValueMapping ValMappings[];
+
+ static PartialMappingIdx getPartialMappingIdx(const LLT &Ty, bool isFP);
+ static const RegisterBankInfo::ValueMapping *
+ getValueMapping(PartialMappingIdx Idx, unsigned NumOperands);
+};
+
+class TargetRegisterInfo;
+
+/// This class provides the information for the target register banks.
+class X86RegisterBankInfo final : public X86GenRegisterBankInfo {
+private:
+ /// Get an instruction mapping.
+ /// \return An InstructionMappings with a statically allocated
+ /// OperandsMapping.
+ const InstructionMapping &getSameOperandsMapping(const MachineInstr &MI,
+ bool isFP) const;
+
+  /// Track the bank of each instruction operand (register).
+ static void
+ getInstrPartialMappingIdxs(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI, const bool isFP,
+ SmallVectorImpl<PartialMappingIdx> &OpRegBankIdx);
+
+ /// Construct the instruction ValueMapping from PartialMappingIdxs
+ /// \return true if mapping succeeded.
+ static bool
+ getInstrValueMapping(const MachineInstr &MI,
+ const SmallVectorImpl<PartialMappingIdx> &OpRegBankIdx,
+ SmallVectorImpl<const ValueMapping *> &OpdsMapping);
+
+public:
+ X86RegisterBankInfo(const TargetRegisterInfo &TRI);
+
+ const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC,
+ LLT) const override;
+
+ InstructionMappings
+ getInstrAlternativeMappings(const MachineInstr &MI) const override;
+
+ /// See RegisterBankInfo::applyMapping.
+ void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
+
+ const InstructionMapping &
+ getInstrMapping(const MachineInstr &MI) const override;
+};
+
+} // namespace llvm
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterBanks.td b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterBanks.td
new file mode 100644
index 000000000000..74c515850ab1
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterBanks.td
@@ -0,0 +1,16 @@
+//=- X86RegisterBanks.td - Describe the X86 Banks --------*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+/// General Purpose Registers: RAX, RCX,...
+def GPRRegBank : RegisterBank<"GPR", [GR64]>;
+
+/// Floating Point/Vector Registers
+def VECRRegBank : RegisterBank<"VECR", [VR512]>;
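+
+// Explanatory note (not from upstream): these two banks back the mapping in
+// X86RegisterBankInfo::getRegBankFromRegClass(), which places the
+// GR8/GR16/GR32/GR64 classes in GPRRegBank and the
+// FR32X/FR64X/VR128X/VR256X/VR512 classes in VECRRegBank.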
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.cpp
new file mode 100644
index 000000000000..d90b4e7bdc7e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -0,0 +1,935 @@
+//===-- X86RegisterInfo.cpp - X86 Register Information --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetRegisterInfo class.
+// This file is responsible for the frame pointer elimination optimization
+// on X86.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86RegisterInfo.h"
+#include "X86FrameLowering.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define GET_REGINFO_TARGET_DESC
+#include "X86GenRegisterInfo.inc"
+
+static cl::opt<bool>
+EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true),
+ cl::desc("Enable use of a base pointer for complex stack frames"));
+
+X86RegisterInfo::X86RegisterInfo(const Triple &TT)
+ : X86GenRegisterInfo((TT.isArch64Bit() ? X86::RIP : X86::EIP),
+ X86_MC::getDwarfRegFlavour(TT, false),
+ X86_MC::getDwarfRegFlavour(TT, true),
+ (TT.isArch64Bit() ? X86::RIP : X86::EIP)) {
+ X86_MC::initLLVMToSEHAndCVRegMapping(this);
+
+ // Cache some information.
+ Is64Bit = TT.isArch64Bit();
+ IsWin64 = Is64Bit && TT.isOSWindows();
+
+ // Use a callee-saved register as the base pointer. These registers must
+ // not conflict with any ABI requirements. For example, in 32-bit mode PIC
+ // requires GOT in the EBX register before function calls via PLT GOT pointer.
+ if (Is64Bit) {
+ SlotSize = 8;
+ // This matches the simplified 32-bit pointer code in the data layout
+ // computation.
+ // FIXME: Should use the data layout?
+ bool Use64BitReg = TT.getEnvironment() != Triple::GNUX32;
+ StackPtr = Use64BitReg ? X86::RSP : X86::ESP;
+ FramePtr = Use64BitReg ? X86::RBP : X86::EBP;
+ BasePtr = Use64BitReg ? X86::RBX : X86::EBX;
+ } else {
+ SlotSize = 4;
+ StackPtr = X86::ESP;
+ FramePtr = X86::EBP;
+ BasePtr = X86::ESI;
+ }
+}
+
+int
+X86RegisterInfo::getSEHRegNum(unsigned i) const {
+ return getEncodingValue(i);
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC,
+ unsigned Idx) const {
+ // The sub_8bit sub-register index is more constrained in 32-bit mode.
+ // It behaves just like the sub_8bit_hi index.
+ if (!Is64Bit && Idx == X86::sub_8bit)
+ Idx = X86::sub_8bit_hi;
+
+ // Forward to TableGen's default version.
+ return X86GenRegisterInfo::getSubClassWithSubReg(RC, Idx);
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
+ const TargetRegisterClass *B,
+ unsigned SubIdx) const {
+ // The sub_8bit sub-register index is more constrained in 32-bit mode.
+ if (!Is64Bit && SubIdx == X86::sub_8bit) {
+ A = X86GenRegisterInfo::getSubClassWithSubReg(A, X86::sub_8bit_hi);
+ if (!A)
+ return nullptr;
+ }
+ return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx);
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
+ const MachineFunction &MF) const {
+ // Don't allow super-classes of GR8_NOREX. This class is only used after
+ // extracting sub_8bit_hi sub-registers. The H sub-registers cannot be copied
+ // to the full GR8 register class in 64-bit mode, so we cannot allow the
+  // register class inflation.
+ //
+ // The GR8_NOREX class is always used in a way that won't be constrained to a
+ // sub-class, so sub-classes like GR8_ABCD_L are allowed to expand to the
+ // full GR8 class.
+ if (RC == &X86::GR8_NOREXRegClass)
+ return RC;
+
+ const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+
+ const TargetRegisterClass *Super = RC;
+ TargetRegisterClass::sc_iterator I = RC->getSuperClasses();
+ do {
+ switch (Super->getID()) {
+ case X86::FR32RegClassID:
+ case X86::FR64RegClassID:
+ // If AVX-512 isn't supported we should only inflate to these classes.
+ if (!Subtarget.hasAVX512() &&
+ getRegSizeInBits(*Super) == getRegSizeInBits(*RC))
+ return Super;
+ break;
+ case X86::VR128RegClassID:
+ case X86::VR256RegClassID:
+ // If VLX isn't supported we should only inflate to these classes.
+ if (!Subtarget.hasVLX() &&
+ getRegSizeInBits(*Super) == getRegSizeInBits(*RC))
+ return Super;
+ break;
+ case X86::VR128XRegClassID:
+ case X86::VR256XRegClassID:
+      // If VLX isn't supported we shouldn't inflate to these classes.
+ if (Subtarget.hasVLX() &&
+ getRegSizeInBits(*Super) == getRegSizeInBits(*RC))
+ return Super;
+ break;
+ case X86::FR32XRegClassID:
+ case X86::FR64XRegClassID:
+      // If AVX-512 isn't supported we shouldn't inflate to these classes.
+ if (Subtarget.hasAVX512() &&
+ getRegSizeInBits(*Super) == getRegSizeInBits(*RC))
+ return Super;
+ break;
+ case X86::GR8RegClassID:
+ case X86::GR16RegClassID:
+ case X86::GR32RegClassID:
+ case X86::GR64RegClassID:
+ case X86::RFP32RegClassID:
+ case X86::RFP64RegClassID:
+ case X86::RFP80RegClassID:
+ case X86::VR512_0_15RegClassID:
+ case X86::VR512RegClassID:
+ // Don't return a super-class that would shrink the spill size.
+ // That can happen with the vector and float classes.
+ if (getRegSizeInBits(*Super) == getRegSizeInBits(*RC))
+ return Super;
+ }
+ Super = *I++;
+ } while (Super);
+ return RC;
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind) const {
+ const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+ switch (Kind) {
+ default: llvm_unreachable("Unexpected Kind in getPointerRegClass!");
+ case 0: // Normal GPRs.
+ if (Subtarget.isTarget64BitLP64())
+ return &X86::GR64RegClass;
+    // If the target is 64-bit but we have been told to use 32-bit addresses,
+    // we can still use a 64-bit register as long as we know the high bits
+    // are zeros.
+ // Reflect that in the returned register class.
+ if (Is64Bit) {
+      // When the target also allows a 64-bit frame pointer and we do have a
+      // frame, it is fine to use it for address accesses as well.
+ const X86FrameLowering *TFI = getFrameLowering(MF);
+ return TFI->hasFP(MF) && TFI->Uses64BitFramePtr
+ ? &X86::LOW32_ADDR_ACCESS_RBPRegClass
+ : &X86::LOW32_ADDR_ACCESSRegClass;
+ }
+ return &X86::GR32RegClass;
+ case 1: // Normal GPRs except the stack pointer (for encoding reasons).
+ if (Subtarget.isTarget64BitLP64())
+ return &X86::GR64_NOSPRegClass;
+ // NOSP does not contain RIP, so no special case here.
+ return &X86::GR32_NOSPRegClass;
+ case 2: // NOREX GPRs.
+ if (Subtarget.isTarget64BitLP64())
+ return &X86::GR64_NOREXRegClass;
+ return &X86::GR32_NOREXRegClass;
+ case 3: // NOREX GPRs except the stack pointer (for encoding reasons).
+ if (Subtarget.isTarget64BitLP64())
+ return &X86::GR64_NOREX_NOSPRegClass;
+ // NOSP does not contain RIP, so no special case here.
+ return &X86::GR32_NOREX_NOSPRegClass;
+ case 4: // Available for tailcall (not callee-saved GPRs).
+ return getGPRsForTailCall(MF);
+ }
+}
+
+bool X86RegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+ unsigned DefSubReg,
+ const TargetRegisterClass *SrcRC,
+ unsigned SrcSubReg) const {
+ // Prevent rewriting a copy where the destination size is larger than the
+ // input size. See PR41619.
+  // FIXME: Should this be factored into the base implementation somehow?
+ if (DefRC->hasSuperClassEq(&X86::GR64RegClass) && DefSubReg == 0 &&
+ SrcRC->hasSuperClassEq(&X86::GR64RegClass) && SrcSubReg == X86::sub_32bit)
+ return false;
+
+ return TargetRegisterInfo::shouldRewriteCopySrc(DefRC, DefSubReg,
+ SrcRC, SrcSubReg);
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const {
+ const Function &F = MF.getFunction();
+ if (IsWin64 || (F.getCallingConv() == CallingConv::Win64))
+ return &X86::GR64_TCW64RegClass;
+ else if (Is64Bit)
+ return &X86::GR64_TCRegClass;
+
+ bool hasHipeCC = (F.getCallingConv() == CallingConv::HiPE);
+ if (hasHipeCC)
+ return &X86::GR32RegClass;
+ return &X86::GR32_TCRegClass;
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+ if (RC == &X86::CCRRegClass) {
+ if (Is64Bit)
+ return &X86::GR64RegClass;
+ else
+ return &X86::GR32RegClass;
+ }
+ return RC;
+}
+
+unsigned
+X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+ MachineFunction &MF) const {
+ const X86FrameLowering *TFI = getFrameLowering(MF);
+
+ unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0;
+ switch (RC->getID()) {
+ default:
+ return 0;
+ case X86::GR32RegClassID:
+ return 4 - FPDiff;
+ case X86::GR64RegClassID:
+ return 12 - FPDiff;
+ case X86::VR128RegClassID:
+ return Is64Bit ? 10 : 4;
+ case X86::VR64RegClassID:
+ return 4;
+ }
+}
+
+const MCPhysReg *
+X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ assert(MF && "MachineFunction required");
+
+ const X86Subtarget &Subtarget = MF->getSubtarget<X86Subtarget>();
+ const Function &F = MF->getFunction();
+ bool HasSSE = Subtarget.hasSSE1();
+ bool HasAVX = Subtarget.hasAVX();
+ bool HasAVX512 = Subtarget.hasAVX512();
+ bool CallsEHReturn = MF->callsEHReturn();
+
+ CallingConv::ID CC = F.getCallingConv();
+
+  // If the NoCallerSavedRegisters attribute is present, then we use the
+  // X86_INTR calling convention because it has the CSR list we need.
+ if (MF->getFunction().hasFnAttribute("no_caller_saved_registers"))
+ CC = CallingConv::X86_INTR;
+
+ switch (CC) {
+ case CallingConv::GHC:
+ case CallingConv::HiPE:
+ return CSR_NoRegs_SaveList;
+ case CallingConv::AnyReg:
+ if (HasAVX)
+ return CSR_64_AllRegs_AVX_SaveList;
+ return CSR_64_AllRegs_SaveList;
+ case CallingConv::PreserveMost:
+ return CSR_64_RT_MostRegs_SaveList;
+ case CallingConv::PreserveAll:
+ if (HasAVX)
+ return CSR_64_RT_AllRegs_AVX_SaveList;
+ return CSR_64_RT_AllRegs_SaveList;
+ case CallingConv::CXX_FAST_TLS:
+ if (Is64Bit)
+ return MF->getInfo<X86MachineFunctionInfo>()->isSplitCSR() ?
+ CSR_64_CXX_TLS_Darwin_PE_SaveList : CSR_64_TLS_Darwin_SaveList;
+ break;
+ case CallingConv::Intel_OCL_BI: {
+ if (HasAVX512 && IsWin64)
+ return CSR_Win64_Intel_OCL_BI_AVX512_SaveList;
+ if (HasAVX512 && Is64Bit)
+ return CSR_64_Intel_OCL_BI_AVX512_SaveList;
+ if (HasAVX && IsWin64)
+ return CSR_Win64_Intel_OCL_BI_AVX_SaveList;
+ if (HasAVX && Is64Bit)
+ return CSR_64_Intel_OCL_BI_AVX_SaveList;
+ if (!HasAVX && !IsWin64 && Is64Bit)
+ return CSR_64_Intel_OCL_BI_SaveList;
+ break;
+ }
+ case CallingConv::HHVM:
+ return CSR_64_HHVM_SaveList;
+ case CallingConv::X86_RegCall:
+ if (Is64Bit) {
+ if (IsWin64) {
+ return (HasSSE ? CSR_Win64_RegCall_SaveList :
+ CSR_Win64_RegCall_NoSSE_SaveList);
+ } else {
+ return (HasSSE ? CSR_SysV64_RegCall_SaveList :
+ CSR_SysV64_RegCall_NoSSE_SaveList);
+ }
+ } else {
+ return (HasSSE ? CSR_32_RegCall_SaveList :
+ CSR_32_RegCall_NoSSE_SaveList);
+ }
+ case CallingConv::CFGuard_Check:
+ assert(!Is64Bit && "CFGuard check mechanism only used on 32-bit X86");
+ return (HasSSE ? CSR_Win32_CFGuard_Check_SaveList
+ : CSR_Win32_CFGuard_Check_NoSSE_SaveList);
+ case CallingConv::Cold:
+ if (Is64Bit)
+ return CSR_64_MostRegs_SaveList;
+ break;
+ case CallingConv::Win64:
+ if (!HasSSE)
+ return CSR_Win64_NoSSE_SaveList;
+ return CSR_Win64_SaveList;
+ case CallingConv::X86_64_SysV:
+ if (CallsEHReturn)
+ return CSR_64EHRet_SaveList;
+ return CSR_64_SaveList;
+ case CallingConv::X86_INTR:
+ if (Is64Bit) {
+ if (HasAVX512)
+ return CSR_64_AllRegs_AVX512_SaveList;
+ if (HasAVX)
+ return CSR_64_AllRegs_AVX_SaveList;
+ if (HasSSE)
+ return CSR_64_AllRegs_SaveList;
+ return CSR_64_AllRegs_NoSSE_SaveList;
+ } else {
+ if (HasAVX512)
+ return CSR_32_AllRegs_AVX512_SaveList;
+ if (HasAVX)
+ return CSR_32_AllRegs_AVX_SaveList;
+ if (HasSSE)
+ return CSR_32_AllRegs_SSE_SaveList;
+ return CSR_32_AllRegs_SaveList;
+ }
+ default:
+ break;
+ }
+
+ if (Is64Bit) {
+ bool IsSwiftCC = Subtarget.getTargetLowering()->supportSwiftError() &&
+ F.getAttributes().hasAttrSomewhere(Attribute::SwiftError);
+ if (IsSwiftCC)
+ return IsWin64 ? CSR_Win64_SwiftError_SaveList
+ : CSR_64_SwiftError_SaveList;
+
+ if (IsWin64)
+ return HasSSE ? CSR_Win64_SaveList : CSR_Win64_NoSSE_SaveList;
+ if (CallsEHReturn)
+ return CSR_64EHRet_SaveList;
+ return CSR_64_SaveList;
+ }
+
+ return CallsEHReturn ? CSR_32EHRet_SaveList : CSR_32_SaveList;
+}
+
+const MCPhysReg *X86RegisterInfo::getCalleeSavedRegsViaCopy(
+ const MachineFunction *MF) const {
+ assert(MF && "Invalid MachineFunction pointer.");
+ if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
+ MF->getInfo<X86MachineFunctionInfo>()->isSplitCSR())
+ return CSR_64_CXX_TLS_Darwin_ViaCopy_SaveList;
+ return nullptr;
+}
+
+const uint32_t *
+X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const {
+ const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+ bool HasSSE = Subtarget.hasSSE1();
+ bool HasAVX = Subtarget.hasAVX();
+ bool HasAVX512 = Subtarget.hasAVX512();
+
+ switch (CC) {
+ case CallingConv::GHC:
+ case CallingConv::HiPE:
+ return CSR_NoRegs_RegMask;
+ case CallingConv::AnyReg:
+ if (HasAVX)
+ return CSR_64_AllRegs_AVX_RegMask;
+ return CSR_64_AllRegs_RegMask;
+ case CallingConv::PreserveMost:
+ return CSR_64_RT_MostRegs_RegMask;
+ case CallingConv::PreserveAll:
+ if (HasAVX)
+ return CSR_64_RT_AllRegs_AVX_RegMask;
+ return CSR_64_RT_AllRegs_RegMask;
+ case CallingConv::CXX_FAST_TLS:
+ if (Is64Bit)
+ return CSR_64_TLS_Darwin_RegMask;
+ break;
+ case CallingConv::Intel_OCL_BI: {
+ if (HasAVX512 && IsWin64)
+ return CSR_Win64_Intel_OCL_BI_AVX512_RegMask;
+ if (HasAVX512 && Is64Bit)
+ return CSR_64_Intel_OCL_BI_AVX512_RegMask;
+ if (HasAVX && IsWin64)
+ return CSR_Win64_Intel_OCL_BI_AVX_RegMask;
+ if (HasAVX && Is64Bit)
+ return CSR_64_Intel_OCL_BI_AVX_RegMask;
+ if (!HasAVX && !IsWin64 && Is64Bit)
+ return CSR_64_Intel_OCL_BI_RegMask;
+ break;
+ }
+ case CallingConv::HHVM:
+ return CSR_64_HHVM_RegMask;
+ case CallingConv::X86_RegCall:
+ if (Is64Bit) {
+ if (IsWin64) {
+ return (HasSSE ? CSR_Win64_RegCall_RegMask :
+ CSR_Win64_RegCall_NoSSE_RegMask);
+ } else {
+ return (HasSSE ? CSR_SysV64_RegCall_RegMask :
+ CSR_SysV64_RegCall_NoSSE_RegMask);
+ }
+ } else {
+ return (HasSSE ? CSR_32_RegCall_RegMask :
+ CSR_32_RegCall_NoSSE_RegMask);
+ }
+ case CallingConv::CFGuard_Check:
+ assert(!Is64Bit && "CFGuard check mechanism only used on 32-bit X86");
+ return (HasSSE ? CSR_Win32_CFGuard_Check_RegMask
+ : CSR_Win32_CFGuard_Check_NoSSE_RegMask);
+ case CallingConv::Cold:
+ if (Is64Bit)
+ return CSR_64_MostRegs_RegMask;
+ break;
+ case CallingConv::Win64:
+ return CSR_Win64_RegMask;
+ case CallingConv::X86_64_SysV:
+ return CSR_64_RegMask;
+ case CallingConv::X86_INTR:
+ if (Is64Bit) {
+ if (HasAVX512)
+ return CSR_64_AllRegs_AVX512_RegMask;
+ if (HasAVX)
+ return CSR_64_AllRegs_AVX_RegMask;
+ if (HasSSE)
+ return CSR_64_AllRegs_RegMask;
+ return CSR_64_AllRegs_NoSSE_RegMask;
+ } else {
+ if (HasAVX512)
+ return CSR_32_AllRegs_AVX512_RegMask;
+ if (HasAVX)
+ return CSR_32_AllRegs_AVX_RegMask;
+ if (HasSSE)
+ return CSR_32_AllRegs_SSE_RegMask;
+ return CSR_32_AllRegs_RegMask;
+ }
+ default:
+ break;
+ }
+
+ // Unlike getCalleeSavedRegs(), we don't have MMI so we can't check
+ // callsEHReturn().
+ if (Is64Bit) {
+ const Function &F = MF.getFunction();
+ bool IsSwiftCC = Subtarget.getTargetLowering()->supportSwiftError() &&
+ F.getAttributes().hasAttrSomewhere(Attribute::SwiftError);
+ if (IsSwiftCC)
+ return IsWin64 ? CSR_Win64_SwiftError_RegMask : CSR_64_SwiftError_RegMask;
+ return IsWin64 ? CSR_Win64_RegMask : CSR_64_RegMask;
+ }
+
+ return CSR_32_RegMask;
+}
+
+const uint32_t*
+X86RegisterInfo::getNoPreservedMask() const {
+ return CSR_NoRegs_RegMask;
+}
+
+const uint32_t *X86RegisterInfo::getDarwinTLSCallPreservedMask() const {
+ return CSR_64_TLS_Darwin_RegMask;
+}
+
+BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ const X86FrameLowering *TFI = getFrameLowering(MF);
+
+ // Set the floating point control register as reserved.
+ Reserved.set(X86::FPCW);
+
+ // Set the floating point status register as reserved.
+ Reserved.set(X86::FPSW);
+
+ // Set the SIMD floating point control register as reserved.
+ Reserved.set(X86::MXCSR);
+
+ // Set the stack-pointer register and its aliases as reserved.
+ for (const MCPhysReg &SubReg : subregs_inclusive(X86::RSP))
+ Reserved.set(SubReg);
+
+ // Set the Shadow Stack Pointer as reserved.
+ Reserved.set(X86::SSP);
+
+ // Set the instruction pointer register and its aliases as reserved.
+ for (const MCPhysReg &SubReg : subregs_inclusive(X86::RIP))
+ Reserved.set(SubReg);
+
+ // Set the frame-pointer register and its aliases as reserved if needed.
+ if (TFI->hasFP(MF)) {
+ for (const MCPhysReg &SubReg : subregs_inclusive(X86::RBP))
+ Reserved.set(SubReg);
+ }
+
+ // Set the base-pointer register and its aliases as reserved if needed.
+ if (hasBasePointer(MF)) {
+ CallingConv::ID CC = MF.getFunction().getCallingConv();
+ const uint32_t *RegMask = getCallPreservedMask(MF, CC);
+ if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister()))
+ report_fatal_error(
+        "Stack realignment in presence of dynamic allocas is not supported with "
+ "this calling convention.");
+
+ Register BasePtr = getX86SubSuperRegister(getBaseRegister(), 64);
+ for (const MCPhysReg &SubReg : subregs_inclusive(BasePtr))
+ Reserved.set(SubReg);
+ }
+
+ // Mark the segment registers as reserved.
+ Reserved.set(X86::CS);
+ Reserved.set(X86::SS);
+ Reserved.set(X86::DS);
+ Reserved.set(X86::ES);
+ Reserved.set(X86::FS);
+ Reserved.set(X86::GS);
+
+ // Mark the floating point stack registers as reserved.
+ for (unsigned n = 0; n != 8; ++n)
+ Reserved.set(X86::ST0 + n);
+
+ // Reserve the registers that only exist in 64-bit mode.
+ if (!Is64Bit) {
+    // These 8-bit registers are part of the x86-64 extension even though
+    // their super-registers are the old 32-bit ones.
+ Reserved.set(X86::SIL);
+ Reserved.set(X86::DIL);
+ Reserved.set(X86::BPL);
+ Reserved.set(X86::SPL);
+ Reserved.set(X86::SIH);
+ Reserved.set(X86::DIH);
+ Reserved.set(X86::BPH);
+ Reserved.set(X86::SPH);
+
+ for (unsigned n = 0; n != 8; ++n) {
+ // R8, R9, ...
+ for (MCRegAliasIterator AI(X86::R8 + n, this, true); AI.isValid(); ++AI)
+ Reserved.set(*AI);
+
+ // XMM8, XMM9, ...
+ for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI)
+ Reserved.set(*AI);
+ }
+ }
+ if (!Is64Bit || !MF.getSubtarget<X86Subtarget>().hasAVX512()) {
+ for (unsigned n = 16; n != 32; ++n) {
+ for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI)
+ Reserved.set(*AI);
+ }
+ }
+
+ assert(checkAllSuperRegsMarked(Reserved,
+ {X86::SIL, X86::DIL, X86::BPL, X86::SPL,
+ X86::SIH, X86::DIH, X86::BPH, X86::SPH}));
+ return Reserved;
+}
+
+void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const {
+ // Check if the EFLAGS register is marked as live-out. This shouldn't happen,
+ // because the calling convention defines the EFLAGS register as NOT
+ // preserved.
+ //
+  // Unfortunately, EFLAGS can show up as live-out after branch folding. Add
+  // an assert to track this, and clear the register afterwards to avoid
+  // unnecessary crashes during release builds.
+ assert(!(Mask[X86::EFLAGS / 32] & (1U << (X86::EFLAGS % 32))) &&
+ "EFLAGS are not live-out from a patchpoint.");
+
+ // Also clean other registers that don't need preserving (IP).
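+  // (Each uint32_t word of the mask covers 32 registers, so register Reg is
+  // cleared at bit Reg % 32 of word Reg / 32.)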
+ for (auto Reg : {X86::EFLAGS, X86::RIP, X86::EIP, X86::IP})
+ Mask[Reg / 32] &= ~(1U << (Reg % 32));
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+static bool CantUseSP(const MachineFrameInfo &MFI) {
+ return MFI.hasVarSizedObjects() || MFI.hasOpaqueSPAdjustment();
+}
+
+bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+ const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ if (X86FI->hasPreallocatedCall())
+ return true;
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ if (!EnableBasePointer)
+ return false;
+
+ // When we need stack realignment, we can't address the stack from the frame
+ // pointer. When we have dynamic allocas or stack-adjusting inline asm, we
+ // can't address variables from the stack pointer. MS inline asm can
+ // reference locals while also adjusting the stack pointer. When we can't
+ // use both the SP and the FP, we need a separate base pointer register.
+ bool CantUseFP = needsStackRealignment(MF);
+ return CantUseFP && CantUseSP(MFI);
+}
+
+bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
+ if (!TargetRegisterInfo::canRealignStack(MF))
+ return false;
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+
+ // Stack realignment requires a frame pointer. If we already started
+ // register allocation with frame pointer elimination, it is too late now.
+ if (!MRI->canReserveReg(FramePtr))
+ return false;
+
+  // If a base pointer is necessary, check that it isn't too late to reserve
+  // it.
+ if (CantUseSP(MFI))
+ return MRI->canReserveReg(BasePtr);
+ return true;
+}
+
+// tryOptimizeLEAtoMOV - helper function that tries to replace a LEA instruction
+// of the form 'lea (%esp), %ebx' --> 'mov %esp, %ebx'.
+// TODO: In this case we should be really trying first to entirely eliminate
+// this instruction which is a plain copy.
+static bool tryOptimizeLEAtoMOV(MachineBasicBlock::iterator II) {
+ MachineInstr &MI = *II;
+ unsigned Opc = II->getOpcode();
+ // Check if this is a LEA of the form 'lea (%esp), %ebx'
+ if ((Opc != X86::LEA32r && Opc != X86::LEA64r && Opc != X86::LEA64_32r) ||
+ MI.getOperand(2).getImm() != 1 ||
+ MI.getOperand(3).getReg() != X86::NoRegister ||
+ MI.getOperand(4).getImm() != 0 ||
+ MI.getOperand(5).getReg() != X86::NoRegister)
+ return false;
+ Register BasePtr = MI.getOperand(1).getReg();
+  // In X32 mode, ensure the base-pointer is a 32-bit operand, so the LEA
+  // will be replaced with a 32-bit operand MOV, which implicitly zeroes the
+  // upper 32 bits of the super-register.
+ if (Opc == X86::LEA64_32r)
+ BasePtr = getX86SubSuperRegister(BasePtr, 32);
+ Register NewDestReg = MI.getOperand(0).getReg();
+ const X86InstrInfo *TII =
+ MI.getParent()->getParent()->getSubtarget<X86Subtarget>().getInstrInfo();
+ TII->copyPhysReg(*MI.getParent(), II, MI.getDebugLoc(), NewDestReg, BasePtr,
+ MI.getOperand(1).isKill());
+ MI.eraseFromParent();
+ return true;
+}
+
+static bool isFuncletReturnInstr(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case X86::CATCHRET:
+ case X86::CLEANUPRET:
+ return true;
+ default:
+ return false;
+ }
+ llvm_unreachable("impossible");
+}
+
+void
+X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ bool IsEHFuncletEpilogue = MBBI == MBB.end() ? false
+ : isFuncletReturnInstr(*MBBI);
+ const X86FrameLowering *TFI = getFrameLowering(MF);
+ int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
+ // Determine base register and offset.
+ int FIOffset;
+ Register BasePtr;
+ if (MI.isReturn()) {
+ assert((!needsStackRealignment(MF) ||
+ MF.getFrameInfo().isFixedObjectIndex(FrameIndex)) &&
+ "Return instruction can only reference SP relative frame objects");
+ FIOffset =
+ TFI->getFrameIndexReferenceSP(MF, FrameIndex, BasePtr, 0).getFixed();
+ } else if (TFI->Is64Bit && (MBB.isEHFuncletEntry() || IsEHFuncletEpilogue)) {
+ FIOffset = TFI->getWin64EHFrameIndexRef(MF, FrameIndex, BasePtr);
+ } else {
+ FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, BasePtr).getFixed();
+ }
+
+ // LOCAL_ESCAPE uses a single offset, with no register. It only works in the
+ // simple FP case, and doesn't work with stack realignment. On 32-bit, the
+ // offset is from the traditional base pointer location. On 64-bit, the
+ // offset is from the SP at the end of the prologue, not the FP location. This
+ // matches the behavior of llvm.frameaddress.
+ unsigned Opc = MI.getOpcode();
+ if (Opc == TargetOpcode::LOCAL_ESCAPE) {
+ MachineOperand &FI = MI.getOperand(FIOperandNum);
+ FI.ChangeToImmediate(FIOffset);
+ return;
+ }
+
+ // For LEA64_32r, when BasePtr is 32 bits (X32), we can use the full-size
+ // 64-bit register as the source operand; the semantics are the same and the
+ // destination is 32 bits. This saves one byte per LEA since the 0x67 prefix
+ // is avoided. Don't change BasePtr since it is used later for stack
+ // adjustment.
+ Register MachineBasePtr = BasePtr;
+ if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr))
+ MachineBasePtr = getX86SubSuperRegister(BasePtr, 64);
+
+ // This must be part of a four operand memory reference. Replace the
+ // FrameIndex with base register. Add an offset to the offset.
+ MI.getOperand(FIOperandNum).ChangeToRegister(MachineBasePtr, false);
+
+ if (BasePtr == StackPtr)
+ FIOffset += SPAdj;
+
+ // The frame index format for stackmaps and patchpoints is different from the
+ // X86 format. It only has a FI and an offset.
+ if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) {
+ assert(BasePtr == FramePtr && "Expected the FP as base register");
+ int64_t Offset = MI.getOperand(FIOperandNum + 1).getImm() + FIOffset;
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+ return;
+ }
+
+ if (MI.getOperand(FIOperandNum+3).isImm()) {
+ // Offset is a 32-bit integer.
+ int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm());
+ int Offset = FIOffset + Imm;
+ assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) &&
+ "Requesting 64-bit offset in 32-bit immediate!");
+ if (Offset != 0 || !tryOptimizeLEAtoMOV(II))
+ MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset);
+ } else {
+ // Offset is symbolic. This is extremely rare.
+ uint64_t Offset = FIOffset +
+ (uint64_t)MI.getOperand(FIOperandNum+3).getOffset();
+ MI.getOperand(FIOperandNum + 3).setOffset(Offset);
+ }
+}
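+
+// Illustration (sketch with hypothetical offsets): after this rewrite, a
+// memory operand that referenced a frame index, e.g.
+//   $eax = MOV32rm %stack.0, 1, $noreg, 4, $noreg
+// instead uses the chosen base register, with the frame object's offset
+// folded into the displacement:
+//   $eax = MOV32rm $rbp, 1, $noreg, -12, $noreg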
+
+unsigned X86RegisterInfo::findDeadCallerSavedReg(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI) const {
+ const MachineFunction *MF = MBB.getParent();
+ if (MF->callsEHReturn())
+ return 0;
+
+ const TargetRegisterClass &AvailableRegs = *getGPRsForTailCall(*MF);
+
+ if (MBBI == MBB.end())
+ return 0;
+
+ switch (MBBI->getOpcode()) {
+ default:
+ return 0;
+ case TargetOpcode::PATCHABLE_RET:
+ case X86::RET:
+ case X86::RETL:
+ case X86::RETQ:
+ case X86::RETIL:
+ case X86::RETIQ:
+ case X86::TCRETURNdi:
+ case X86::TCRETURNri:
+ case X86::TCRETURNmi:
+ case X86::TCRETURNdi64:
+ case X86::TCRETURNri64:
+ case X86::TCRETURNmi64:
+ case X86::EH_RETURN:
+ case X86::EH_RETURN64: {
+ SmallSet<uint16_t, 8> Uses;
+ for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I) {
+ MachineOperand &MO = MBBI->getOperand(I);
+ if (!MO.isReg() || MO.isDef())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI)
+ Uses.insert(*AI);
+ }
+
+ for (auto CS : AvailableRegs)
+ if (!Uses.count(CS) && CS != X86::RIP && CS != X86::RSP && CS != X86::ESP)
+ return CS;
+ }
+ }
+
+ return 0;
+}
+
+Register X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ const X86FrameLowering *TFI = getFrameLowering(MF);
+ return TFI->hasFP(MF) ? FramePtr : StackPtr;
+}
+
+unsigned
+X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const {
+ const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+ Register FrameReg = getFrameRegister(MF);
+ if (Subtarget.isTarget64BitILP32())
+ FrameReg = getX86SubSuperRegister(FrameReg, 32);
+ return FrameReg;
+}
+
+unsigned
+X86RegisterInfo::getPtrSizedStackRegister(const MachineFunction &MF) const {
+ const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+ Register StackReg = getStackRegister();
+ if (Subtarget.isTarget64BitILP32())
+ StackReg = getX86SubSuperRegister(StackReg, 32);
+ return StackReg;
+}
+
+static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
+ const MachineRegisterInfo *MRI) {
+ if (VRM->hasShape(VirtReg))
+ return VRM->getShape(VirtReg);
+
+ const MachineOperand &Def = *MRI->def_begin(VirtReg);
+ MachineInstr *MI = const_cast<MachineInstr *>(Def.getParent());
+ unsigned OpCode = MI->getOpcode();
+ switch (OpCode) {
+ default:
+ llvm_unreachable("Unexpected machine instruction on tile register!");
+ break;
+ // We only collect the tile shape that is defined.
+ case X86::PTILELOADDV:
+ case X86::PTDPBSSDV:
+ case X86::PTILEZEROV:
+ MachineOperand &MO1 = MI->getOperand(1);
+ MachineOperand &MO2 = MI->getOperand(2);
+ ShapeT Shape(&MO1, &MO2, MRI);
+ VRM->assignVirt2Shape(VirtReg, Shape);
+ return Shape;
+ }
+}
+
+bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
+ ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF,
+ const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const {
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+ bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints(
+ VirtReg, Order, Hints, MF, VRM, Matrix);
+
+ if (RC.getID() != X86::TILERegClassID)
+ return BaseImplRetVal;
+
+ ShapeT VirtShape = getTileShape(VirtReg, const_cast<VirtRegMap *>(VRM), MRI);
+ auto AddHint = [&](MCPhysReg PhysReg) {
+ Register VReg = Matrix->getOneVReg(PhysReg);
+ if (VReg == MCRegister::NoRegister) { // Not allocated yet
+ Hints.push_back(PhysReg);
+ return;
+ }
+ ShapeT PhysShape = getTileShape(VReg, const_cast<VirtRegMap *>(VRM), MRI);
+ if (PhysShape == VirtShape)
+ Hints.push_back(PhysReg);
+ };
+
+ SmallSet<MCPhysReg, 4> CopyHints;
+ CopyHints.insert(Hints.begin(), Hints.end());
+ Hints.clear();
+ for (auto Hint : CopyHints) {
+ if (RC.contains(Hint) && !MRI->isReserved(Hint))
+ AddHint(Hint);
+ }
+ for (MCPhysReg PhysReg : Order) {
+ if (!CopyHints.count(PhysReg) && RC.contains(PhysReg) &&
+ !MRI->isReserved(PhysReg))
+ AddHint(PhysReg);
+ }
+
+#define DEBUG_TYPE "tile-hint"
+ LLVM_DEBUG({
+ dbgs() << "Hints for virtual register " << format_hex(VirtReg, 8) << "\n";
+ for (auto Hint : Hints) {
+ dbgs() << "tmm" << Hint << ",";
+ }
+ dbgs() << "\n";
+ });
+#undef DEBUG_TYPE
+
+ return true;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.h
new file mode 100644
index 000000000000..7fd10ddd1a15
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -0,0 +1,156 @@
+//===-- X86RegisterInfo.h - X86 Register Information Impl -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86REGISTERINFO_H
+#define LLVM_LIB_TARGET_X86_X86REGISTERINFO_H
+
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "X86GenRegisterInfo.inc"
+
+namespace llvm {
+ class Triple;
+
+class X86RegisterInfo final : public X86GenRegisterInfo {
+private:
+ /// Is64Bit - Is the target 64-bit.
+ ///
+ bool Is64Bit;
+
+ /// IsWin64 - Is the target one of the win64 flavours.
+ ///
+ bool IsWin64;
+
+ /// SlotSize - Stack slot size in bytes.
+ ///
+ unsigned SlotSize;
+
+ /// StackPtr - X86 physical register used as stack ptr.
+ ///
+ unsigned StackPtr;
+
+ /// FramePtr - X86 physical register used as frame ptr.
+ ///
+ unsigned FramePtr;
+
+ /// BasePtr - X86 physical register used as a base ptr in complex stack
+ /// frames. I.e., when we need a 3rd base, not just SP and FP, due to
+ /// variable size stack objects.
+ unsigned BasePtr;
+
+public:
+ explicit X86RegisterInfo(const Triple &TT);
+
+ // FIXME: This should be tablegen'd like getDwarfRegNum is
+ int getSEHRegNum(unsigned i) const;
+
+ /// getMatchingSuperRegClass - Return a subclass of the specified register
+ /// class A so that each register in it has a sub-register of the
+ /// specified sub-register index which is in the specified register class B.
+ const TargetRegisterClass *
+ getMatchingSuperRegClass(const TargetRegisterClass *A,
+ const TargetRegisterClass *B,
+ unsigned Idx) const override;
+
+ const TargetRegisterClass *
+ getSubClassWithSubReg(const TargetRegisterClass *RC,
+ unsigned Idx) const override;
+
+ const TargetRegisterClass *
+ getLargestLegalSuperClass(const TargetRegisterClass *RC,
+ const MachineFunction &MF) const override;
+
+ bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+ unsigned DefSubReg,
+ const TargetRegisterClass *SrcRC,
+ unsigned SrcSubReg) const override;
+
+ /// getPointerRegClass - Returns a TargetRegisterClass used for pointer
+ /// values.
+ const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind = 0) const override;
+
+ /// getCrossCopyRegClass - Returns a legal register class to copy a register
+ /// in the specified class to or from. Returns NULL if it is possible to copy
+ /// between two registers of the specified class.
+ const TargetRegisterClass *
+ getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
+
+ /// getGPRsForTailCall - Returns a register class with registers that can be
+ /// used in forming tail calls.
+ const TargetRegisterClass *
+ getGPRsForTailCall(const MachineFunction &MF) const;
+
+ unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+ MachineFunction &MF) const override;
+
+ /// getCalleeSavedRegs - Return a null-terminated list of all of the
+ /// callee-save registers on this target.
+ const MCPhysReg *
+ getCalleeSavedRegs(const MachineFunction* MF) const override;
+ const MCPhysReg *
+ getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID) const override;
+ const uint32_t *getNoPreservedMask() const override;
+
+ // Calls involved in thread-local variable lookup save more registers than
+ // normal calls, so they need a different mask to represent this.
+ const uint32_t *getDarwinTLSCallPreservedMask() const;
+
+ /// getReservedRegs - Returns a bitset indexed by physical register number
+ /// indicating if a register is a special register that has particular uses and
+ /// should be considered unavailable at all times, e.g. SP, RA. This is used
+ /// by the register scavenger to determine what registers are free.
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+ void adjustStackMapLiveOutMask(uint32_t *Mask) const override;
+
+ bool hasBasePointer(const MachineFunction &MF) const;
+
+ bool canRealignStack(const MachineFunction &MF) const override;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
+
+ /// findDeadCallerSavedReg - Return a caller-saved register that isn't live
+ /// when it reaches the "return" instruction. We can then pop a stack object
+ /// to this register without worrying about clobbering it.
+ unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI) const;
+
+ // Debug information queries.
+ Register getFrameRegister(const MachineFunction &MF) const override;
+ unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const;
+ unsigned getPtrSizedStackRegister(const MachineFunction &MF) const;
+ Register getStackRegister() const { return StackPtr; }
+ Register getBaseRegister() const { return BasePtr; }
+ /// Returns the physical register used as the frame pointer.
+ /// This will always return the frame pointer register, contrary to
+ /// getFrameRegister() which returns the "base pointer" in situations
+ /// involving a stack, frame and base pointer.
+ Register getFramePtr() const { return FramePtr; }
+ // FIXME: Move to FrameInfo
+ unsigned getSlotSize() const { return SlotSize; }
+
+ bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF, const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const override;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.td
new file mode 100644
index 000000000000..75cbd4e1cff1
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -0,0 +1,646 @@
+//===- X86RegisterInfo.td - Describe the X86 Register File --*- tablegen -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 Register file, defining the registers themselves,
+// aliases between the registers, and the register classes built out of the
+// registers.
+//
+//===----------------------------------------------------------------------===//
+
+class X86Reg<string n, bits<16> Enc, list<Register> subregs = []> : Register<n> {
+ let Namespace = "X86";
+ let HWEncoding = Enc;
+ let SubRegs = subregs;
+}
+
+// Subregister indices.
+let Namespace = "X86" in {
+ def sub_8bit : SubRegIndex<8>;
+ def sub_8bit_hi : SubRegIndex<8, 8>;
+ def sub_8bit_hi_phony : SubRegIndex<8, 8>;
+ def sub_16bit : SubRegIndex<16>;
+ def sub_16bit_hi : SubRegIndex<16, 16>;
+ def sub_32bit : SubRegIndex<32>;
+ def sub_xmm : SubRegIndex<128>;
+ def sub_ymm : SubRegIndex<256>;
+ def sub_mask_0 : SubRegIndex<-1>;
+ def sub_mask_1 : SubRegIndex<-1, -1>;
+}
+
+//===----------------------------------------------------------------------===//
+// Register definitions...
+//
+
+// In the register alias definitions below, we define which registers alias
+// which others. We only specify which registers the small registers alias,
+// because the register file generator is smart enough to figure out that
+// AL aliases AX if we tell it that AX aliased AL (for example).
+
+// Dwarf numbering is different for 32-bit and 64-bit, and there are
+// variations by target as well. Currently the first entry is for X86-64,
+// the second is for EH on X86-32/Darwin, and the third is a 'generic' one
+// (X86-32/Linux and debug information on X86-32/Darwin).
+
+// 8-bit registers
+// Low registers
+def AL : X86Reg<"al", 0>;
+def DL : X86Reg<"dl", 2>;
+def CL : X86Reg<"cl", 1>;
+def BL : X86Reg<"bl", 3>;
+
+// High registers. On x86-64, these cannot be used in any instruction
+// with a REX prefix.
+def AH : X86Reg<"ah", 4>;
+def DH : X86Reg<"dh", 6>;
+def CH : X86Reg<"ch", 5>;
+def BH : X86Reg<"bh", 7>;
+
+// X86-64 only, requires REX.
+let CostPerUse = 1 in {
+def SIL : X86Reg<"sil", 6>;
+def DIL : X86Reg<"dil", 7>;
+def BPL : X86Reg<"bpl", 5>;
+def SPL : X86Reg<"spl", 4>;
+def R8B : X86Reg<"r8b", 8>;
+def R9B : X86Reg<"r9b", 9>;
+def R10B : X86Reg<"r10b", 10>;
+def R11B : X86Reg<"r11b", 11>;
+def R12B : X86Reg<"r12b", 12>;
+def R13B : X86Reg<"r13b", 13>;
+def R14B : X86Reg<"r14b", 14>;
+def R15B : X86Reg<"r15b", 15>;
+}
+
+let isArtificial = 1 in {
+// High byte of the low 16 bits of the super-register:
+def SIH : X86Reg<"", -1>;
+def DIH : X86Reg<"", -1>;
+def BPH : X86Reg<"", -1>;
+def SPH : X86Reg<"", -1>;
+def R8BH : X86Reg<"", -1>;
+def R9BH : X86Reg<"", -1>;
+def R10BH : X86Reg<"", -1>;
+def R11BH : X86Reg<"", -1>;
+def R12BH : X86Reg<"", -1>;
+def R13BH : X86Reg<"", -1>;
+def R14BH : X86Reg<"", -1>;
+def R15BH : X86Reg<"", -1>;
+// High word of the low 32 bits of the super-register:
+def HAX : X86Reg<"", -1>;
+def HDX : X86Reg<"", -1>;
+def HCX : X86Reg<"", -1>;
+def HBX : X86Reg<"", -1>;
+def HSI : X86Reg<"", -1>;
+def HDI : X86Reg<"", -1>;
+def HBP : X86Reg<"", -1>;
+def HSP : X86Reg<"", -1>;
+def HIP : X86Reg<"", -1>;
+def R8WH : X86Reg<"", -1>;
+def R9WH : X86Reg<"", -1>;
+def R10WH : X86Reg<"", -1>;
+def R11WH : X86Reg<"", -1>;
+def R12WH : X86Reg<"", -1>;
+def R13WH : X86Reg<"", -1>;
+def R14WH : X86Reg<"", -1>;
+def R15WH : X86Reg<"", -1>;
+}
+
+// 16-bit registers
+let SubRegIndices = [sub_8bit, sub_8bit_hi], CoveredBySubRegs = 1 in {
+def AX : X86Reg<"ax", 0, [AL,AH]>;
+def DX : X86Reg<"dx", 2, [DL,DH]>;
+def CX : X86Reg<"cx", 1, [CL,CH]>;
+def BX : X86Reg<"bx", 3, [BL,BH]>;
+}
+let SubRegIndices = [sub_8bit, sub_8bit_hi_phony], CoveredBySubRegs = 1 in {
+def SI : X86Reg<"si", 6, [SIL,SIH]>;
+def DI : X86Reg<"di", 7, [DIL,DIH]>;
+def BP : X86Reg<"bp", 5, [BPL,BPH]>;
+def SP : X86Reg<"sp", 4, [SPL,SPH]>;
+}
+def IP : X86Reg<"ip", 0>;
+
+// X86-64 only, requires REX.
+let SubRegIndices = [sub_8bit, sub_8bit_hi_phony], CostPerUse = 1,
+ CoveredBySubRegs = 1 in {
+def R8W : X86Reg<"r8w", 8, [R8B,R8BH]>;
+def R9W : X86Reg<"r9w", 9, [R9B,R9BH]>;
+def R10W : X86Reg<"r10w", 10, [R10B,R10BH]>;
+def R11W : X86Reg<"r11w", 11, [R11B,R11BH]>;
+def R12W : X86Reg<"r12w", 12, [R12B,R12BH]>;
+def R13W : X86Reg<"r13w", 13, [R13B,R13BH]>;
+def R14W : X86Reg<"r14w", 14, [R14B,R14BH]>;
+def R15W : X86Reg<"r15w", 15, [R15B,R15BH]>;
+}
+
+// 32-bit registers
+let SubRegIndices = [sub_16bit, sub_16bit_hi], CoveredBySubRegs = 1 in {
+def EAX : X86Reg<"eax", 0, [AX, HAX]>, DwarfRegNum<[-2, 0, 0]>;
+def EDX : X86Reg<"edx", 2, [DX, HDX]>, DwarfRegNum<[-2, 2, 2]>;
+def ECX : X86Reg<"ecx", 1, [CX, HCX]>, DwarfRegNum<[-2, 1, 1]>;
+def EBX : X86Reg<"ebx", 3, [BX, HBX]>, DwarfRegNum<[-2, 3, 3]>;
+def ESI : X86Reg<"esi", 6, [SI, HSI]>, DwarfRegNum<[-2, 6, 6]>;
+def EDI : X86Reg<"edi", 7, [DI, HDI]>, DwarfRegNum<[-2, 7, 7]>;
+def EBP : X86Reg<"ebp", 5, [BP, HBP]>, DwarfRegNum<[-2, 4, 5]>;
+def ESP : X86Reg<"esp", 4, [SP, HSP]>, DwarfRegNum<[-2, 5, 4]>;
+def EIP : X86Reg<"eip", 0, [IP, HIP]>, DwarfRegNum<[-2, 8, 8]>;
+}
+
+// X86-64 only, requires REX
+let SubRegIndices = [sub_16bit, sub_16bit_hi], CostPerUse = 1,
+ CoveredBySubRegs = 1 in {
+def R8D : X86Reg<"r8d", 8, [R8W,R8WH]>;
+def R9D : X86Reg<"r9d", 9, [R9W,R9WH]>;
+def R10D : X86Reg<"r10d", 10, [R10W,R10WH]>;
+def R11D : X86Reg<"r11d", 11, [R11W,R11WH]>;
+def R12D : X86Reg<"r12d", 12, [R12W,R12WH]>;
+def R13D : X86Reg<"r13d", 13, [R13W,R13WH]>;
+def R14D : X86Reg<"r14d", 14, [R14W,R14WH]>;
+def R15D : X86Reg<"r15d", 15, [R15W,R15WH]>;
+}
+
+// 64-bit registers, X86-64 only
+let SubRegIndices = [sub_32bit] in {
+def RAX : X86Reg<"rax", 0, [EAX]>, DwarfRegNum<[0, -2, -2]>;
+def RDX : X86Reg<"rdx", 2, [EDX]>, DwarfRegNum<[1, -2, -2]>;
+def RCX : X86Reg<"rcx", 1, [ECX]>, DwarfRegNum<[2, -2, -2]>;
+def RBX : X86Reg<"rbx", 3, [EBX]>, DwarfRegNum<[3, -2, -2]>;
+def RSI : X86Reg<"rsi", 6, [ESI]>, DwarfRegNum<[4, -2, -2]>;
+def RDI : X86Reg<"rdi", 7, [EDI]>, DwarfRegNum<[5, -2, -2]>;
+def RBP : X86Reg<"rbp", 5, [EBP]>, DwarfRegNum<[6, -2, -2]>;
+def RSP : X86Reg<"rsp", 4, [ESP]>, DwarfRegNum<[7, -2, -2]>;
+
+// These also require REX.
+let CostPerUse = 1 in {
+def R8 : X86Reg<"r8", 8, [R8D]>, DwarfRegNum<[ 8, -2, -2]>;
+def R9 : X86Reg<"r9", 9, [R9D]>, DwarfRegNum<[ 9, -2, -2]>;
+def R10 : X86Reg<"r10", 10, [R10D]>, DwarfRegNum<[10, -2, -2]>;
+def R11 : X86Reg<"r11", 11, [R11D]>, DwarfRegNum<[11, -2, -2]>;
+def R12 : X86Reg<"r12", 12, [R12D]>, DwarfRegNum<[12, -2, -2]>;
+def R13 : X86Reg<"r13", 13, [R13D]>, DwarfRegNum<[13, -2, -2]>;
+def R14 : X86Reg<"r14", 14, [R14D]>, DwarfRegNum<[14, -2, -2]>;
+def R15 : X86Reg<"r15", 15, [R15D]>, DwarfRegNum<[15, -2, -2]>;
+def RIP : X86Reg<"rip", 0, [EIP]>, DwarfRegNum<[16, -2, -2]>;
+}}
+
+// MMX Registers. These are actually aliased to ST0 .. ST7
+def MM0 : X86Reg<"mm0", 0>, DwarfRegNum<[41, 29, 29]>;
+def MM1 : X86Reg<"mm1", 1>, DwarfRegNum<[42, 30, 30]>;
+def MM2 : X86Reg<"mm2", 2>, DwarfRegNum<[43, 31, 31]>;
+def MM3 : X86Reg<"mm3", 3>, DwarfRegNum<[44, 32, 32]>;
+def MM4 : X86Reg<"mm4", 4>, DwarfRegNum<[45, 33, 33]>;
+def MM5 : X86Reg<"mm5", 5>, DwarfRegNum<[46, 34, 34]>;
+def MM6 : X86Reg<"mm6", 6>, DwarfRegNum<[47, 35, 35]>;
+def MM7 : X86Reg<"mm7", 7>, DwarfRegNum<[48, 36, 36]>;
+
+// Pseudo Floating Point registers
+def FP0 : X86Reg<"fp0", 0>;
+def FP1 : X86Reg<"fp1", 0>;
+def FP2 : X86Reg<"fp2", 0>;
+def FP3 : X86Reg<"fp3", 0>;
+def FP4 : X86Reg<"fp4", 0>;
+def FP5 : X86Reg<"fp5", 0>;
+def FP6 : X86Reg<"fp6", 0>;
+def FP7 : X86Reg<"fp7", 0>;
+
+// XMM Registers, used by the various SSE instruction set extensions.
+def XMM0: X86Reg<"xmm0", 0>, DwarfRegNum<[17, 21, 21]>;
+def XMM1: X86Reg<"xmm1", 1>, DwarfRegNum<[18, 22, 22]>;
+def XMM2: X86Reg<"xmm2", 2>, DwarfRegNum<[19, 23, 23]>;
+def XMM3: X86Reg<"xmm3", 3>, DwarfRegNum<[20, 24, 24]>;
+def XMM4: X86Reg<"xmm4", 4>, DwarfRegNum<[21, 25, 25]>;
+def XMM5: X86Reg<"xmm5", 5>, DwarfRegNum<[22, 26, 26]>;
+def XMM6: X86Reg<"xmm6", 6>, DwarfRegNum<[23, 27, 27]>;
+def XMM7: X86Reg<"xmm7", 7>, DwarfRegNum<[24, 28, 28]>;
+
+// X86-64 only
+let CostPerUse = 1 in {
+def XMM8: X86Reg<"xmm8", 8>, DwarfRegNum<[25, -2, -2]>;
+def XMM9: X86Reg<"xmm9", 9>, DwarfRegNum<[26, -2, -2]>;
+def XMM10: X86Reg<"xmm10", 10>, DwarfRegNum<[27, -2, -2]>;
+def XMM11: X86Reg<"xmm11", 11>, DwarfRegNum<[28, -2, -2]>;
+def XMM12: X86Reg<"xmm12", 12>, DwarfRegNum<[29, -2, -2]>;
+def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>;
+def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>;
+def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>;
+
+def XMM16: X86Reg<"xmm16", 16>, DwarfRegNum<[67, -2, -2]>;
+def XMM17: X86Reg<"xmm17", 17>, DwarfRegNum<[68, -2, -2]>;
+def XMM18: X86Reg<"xmm18", 18>, DwarfRegNum<[69, -2, -2]>;
+def XMM19: X86Reg<"xmm19", 19>, DwarfRegNum<[70, -2, -2]>;
+def XMM20: X86Reg<"xmm20", 20>, DwarfRegNum<[71, -2, -2]>;
+def XMM21: X86Reg<"xmm21", 21>, DwarfRegNum<[72, -2, -2]>;
+def XMM22: X86Reg<"xmm22", 22>, DwarfRegNum<[73, -2, -2]>;
+def XMM23: X86Reg<"xmm23", 23>, DwarfRegNum<[74, -2, -2]>;
+def XMM24: X86Reg<"xmm24", 24>, DwarfRegNum<[75, -2, -2]>;
+def XMM25: X86Reg<"xmm25", 25>, DwarfRegNum<[76, -2, -2]>;
+def XMM26: X86Reg<"xmm26", 26>, DwarfRegNum<[77, -2, -2]>;
+def XMM27: X86Reg<"xmm27", 27>, DwarfRegNum<[78, -2, -2]>;
+def XMM28: X86Reg<"xmm28", 28>, DwarfRegNum<[79, -2, -2]>;
+def XMM29: X86Reg<"xmm29", 29>, DwarfRegNum<[80, -2, -2]>;
+def XMM30: X86Reg<"xmm30", 30>, DwarfRegNum<[81, -2, -2]>;
+def XMM31: X86Reg<"xmm31", 31>, DwarfRegNum<[82, -2, -2]>;
+
+} // CostPerUse
+
+// YMM0-15 registers, used by AVX instructions and
+// YMM16-31 registers, used by AVX-512 instructions.
+let SubRegIndices = [sub_xmm] in {
+ foreach Index = 0-31 in {
+ def YMM#Index : X86Reg<"ymm"#Index, Index, [!cast<X86Reg>("XMM"#Index)]>,
+ DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>;
+ }
+}
+
+// ZMM Registers, used by AVX-512 instructions.
+let SubRegIndices = [sub_ymm] in {
+ foreach Index = 0-31 in {
+ def ZMM#Index : X86Reg<"zmm"#Index, Index, [!cast<X86Reg>("YMM"#Index)]>,
+ DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>;
+ }
+}
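+
+// Illustration (sketch, not from the original sources): each iteration of the
+// two foreach loops above expands to an ordinary register definition, e.g.
+// for Index = 0:
+//   def YMM0 : X86Reg<"ymm0", 0, [XMM0]>, DwarfRegAlias<XMM0>;
+//   def ZMM0 : X86Reg<"zmm0", 0, [YMM0]>, DwarfRegAlias<XMM0>;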
+
+// Tile config registers.
+def TMMCFG: X86Reg<"tmmcfg", 0>;
+
+// Tile "registers".
+def TMM0: X86Reg<"tmm0", 0>;
+def TMM1: X86Reg<"tmm1", 1>;
+def TMM2: X86Reg<"tmm2", 2>;
+def TMM3: X86Reg<"tmm3", 3>;
+def TMM4: X86Reg<"tmm4", 4>;
+def TMM5: X86Reg<"tmm5", 5>;
+def TMM6: X86Reg<"tmm6", 6>;
+def TMM7: X86Reg<"tmm7", 7>;
+
+// Mask Registers, used by AVX-512 instructions.
+def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, 93, 93]>;
+def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, 94, 94]>;
+def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, 95, 95]>;
+def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, 96, 96]>;
+def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, 97, 97]>;
+def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, 98, 98]>;
+def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, 99, 99]>;
+def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, 100, 100]>;
+
+// Floating point stack registers. These don't map one-to-one to the FP
+// pseudo registers, but we still mark them as aliasing FP registers. That
+// way both kinds can be live without exceeding the stack depth. ST registers
+// are only live around inline assembly.
+def ST0 : X86Reg<"st", 0>, DwarfRegNum<[33, 12, 11]>;
+def ST1 : X86Reg<"st(1)", 1>, DwarfRegNum<[34, 13, 12]>;
+def ST2 : X86Reg<"st(2)", 2>, DwarfRegNum<[35, 14, 13]>;
+def ST3 : X86Reg<"st(3)", 3>, DwarfRegNum<[36, 15, 14]>;
+def ST4 : X86Reg<"st(4)", 4>, DwarfRegNum<[37, 16, 15]>;
+def ST5 : X86Reg<"st(5)", 5>, DwarfRegNum<[38, 17, 16]>;
+def ST6 : X86Reg<"st(6)", 6>, DwarfRegNum<[39, 18, 17]>;
+def ST7 : X86Reg<"st(7)", 7>, DwarfRegNum<[40, 19, 18]>;
+
+// Floating-point status word
+def FPSW : X86Reg<"fpsr", 0>;
+
+// Floating-point control word
+def FPCW : X86Reg<"fpcr", 0>;
+
+// SIMD Floating-point control register.
+// Note: We only model the "Uses" of the control bits: current rounding modes,
+// DAZ, FTZ and exception masks. We don't model the "Defs" of flag bits.
+def MXCSR : X86Reg<"mxcsr", 0>;
+
+// Status flags register.
+//
+// Note that some flags that are commonly thought of as part of the status
+// flags register are modeled separately. Typically this is due to instructions
+// reading and updating those flags independently of all the others. We don't
+// want to create false dependencies between these instructions and so we use
+// a separate register to model them.
+def EFLAGS : X86Reg<"flags", 0>;
+
+// The direction flag.
+def DF : X86Reg<"dirflag", 0>;
+
+
+// Segment registers
+def CS : X86Reg<"cs", 1>;
+def DS : X86Reg<"ds", 3>;
+def SS : X86Reg<"ss", 2>;
+def ES : X86Reg<"es", 0>;
+def FS : X86Reg<"fs", 4>;
+def GS : X86Reg<"gs", 5>;
+
+// Debug registers
+def DR0 : X86Reg<"dr0", 0>;
+def DR1 : X86Reg<"dr1", 1>;
+def DR2 : X86Reg<"dr2", 2>;
+def DR3 : X86Reg<"dr3", 3>;
+def DR4 : X86Reg<"dr4", 4>;
+def DR5 : X86Reg<"dr5", 5>;
+def DR6 : X86Reg<"dr6", 6>;
+def DR7 : X86Reg<"dr7", 7>;
+def DR8 : X86Reg<"dr8", 8>;
+def DR9 : X86Reg<"dr9", 9>;
+def DR10 : X86Reg<"dr10", 10>;
+def DR11 : X86Reg<"dr11", 11>;
+def DR12 : X86Reg<"dr12", 12>;
+def DR13 : X86Reg<"dr13", 13>;
+def DR14 : X86Reg<"dr14", 14>;
+def DR15 : X86Reg<"dr15", 15>;
+
+// Control registers
+def CR0 : X86Reg<"cr0", 0>;
+def CR1 : X86Reg<"cr1", 1>;
+def CR2 : X86Reg<"cr2", 2>;
+def CR3 : X86Reg<"cr3", 3>;
+def CR4 : X86Reg<"cr4", 4>;
+def CR5 : X86Reg<"cr5", 5>;
+def CR6 : X86Reg<"cr6", 6>;
+def CR7 : X86Reg<"cr7", 7>;
+def CR8 : X86Reg<"cr8", 8>;
+def CR9 : X86Reg<"cr9", 9>;
+def CR10 : X86Reg<"cr10", 10>;
+def CR11 : X86Reg<"cr11", 11>;
+def CR12 : X86Reg<"cr12", 12>;
+def CR13 : X86Reg<"cr13", 13>;
+def CR14 : X86Reg<"cr14", 14>;
+def CR15 : X86Reg<"cr15", 15>;
+
+// Pseudo index registers
+def EIZ : X86Reg<"eiz", 4>;
+def RIZ : X86Reg<"riz", 4>;
+
+// Bound registers, used in MPX instructions
+def BND0 : X86Reg<"bnd0", 0>;
+def BND1 : X86Reg<"bnd1", 1>;
+def BND2 : X86Reg<"bnd2", 2>;
+def BND3 : X86Reg<"bnd3", 3>;
+
+// CET registers - Shadow Stack Pointer
+def SSP : X86Reg<"ssp", 0>;
+
+//===----------------------------------------------------------------------===//
+// Register Class Definitions... now that we have all of the pieces, define the
+// top-level register classes. The order specified in the register list is
+// implicitly defined to be the register allocation order.
+//
+
+// List call-clobbered registers before callee-save registers. RBX, RBP (and
+// R12, R13, R14, and R15 for X86-64) are callee-save registers.
+// In 64-bit mode, there are 12 additional i8 registers, SIL, DIL, BPL, SPL, and
+// R8B, ... R15B.
+// Allocate R12 and R13 last, as these require an extra byte when
+// encoded in x86_64 instructions.
+// FIXME: Allow AH, CH, DH, BH to be used as general-purpose registers in
+// 64-bit mode. The main complication is that they cannot be encoded in an
+// instruction requiring a REX prefix, while SIL, DIL, BPL, R8D, etc.
+// require a REX prefix. For example, "addb %ah, %dil" and "movzbl %ah, %r8d"
+// cannot be encoded.
+def GR8 : RegisterClass<"X86", [i8], 8,
+ (add AL, CL, DL, AH, CH, DH, BL, BH, SIL, DIL, BPL, SPL,
+ R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B)> {
+ let AltOrders = [(sub GR8, AH, BH, CH, DH)];
+ let AltOrderSelect = [{
+ return MF.getSubtarget<X86Subtarget>().is64Bit();
+ }];
+}
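+
+// Illustration (sketch): with the AltOrder above, allocation in 64-bit mode
+// effectively draws from
+//   (add AL, CL, DL, BL, SIL, DIL, BPL, SPL,
+//        R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B)
+// since AH, CH, DH and BH cannot be encoded in instructions that require a
+// REX prefix.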
+
+let isAllocatable = 0 in
+def GRH8 : RegisterClass<"X86", [i8], 8,
+ (add SIH, DIH, BPH, SPH, R8BH, R9BH, R10BH, R11BH,
+ R12BH, R13BH, R14BH, R15BH)>;
+
+def GR16 : RegisterClass<"X86", [i16], 16,
+ (add AX, CX, DX, SI, DI, BX, BP, SP,
+ R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)>;
+
+let isAllocatable = 0 in
+def GRH16 : RegisterClass<"X86", [i16], 16,
+ (add HAX, HCX, HDX, HSI, HDI, HBX, HBP, HSP, HIP,
+ R8WH, R9WH, R10WH, R11WH, R12WH, R13WH, R14WH,
+ R15WH)>;
+
+def GR32 : RegisterClass<"X86", [i32], 32,
+ (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
+ R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)>;
+
+// GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since
+// RIP isn't really a register and it can't be used anywhere except in an
+// address, but it doesn't cause trouble.
+// FIXME: it *does* cause trouble - CheckBaseRegAndIndexReg() has extra
+// tests because of the inclusion of RIP in this register class.
+def GR64 : RegisterClass<"X86", [i64], 64,
+ (add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+ RBX, R14, R15, R12, R13, RBP, RSP, RIP)>;
+
+// Segment registers for use by MOV instructions (and others) that have a
+// segment register as one operand. Always contain a 16-bit segment
+// selector.
+def SEGMENT_REG : RegisterClass<"X86", [i16], 16, (add CS, DS, SS, ES, FS, GS)>;
+
+// Debug registers.
+def DEBUG_REG : RegisterClass<"X86", [i32], 32, (sequence "DR%u", 0, 15)>;
+
+// Control registers.
+def CONTROL_REG : RegisterClass<"X86", [i64], 64, (sequence "CR%u", 0, 15)>;
+
+// GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD, GR64_ABCD - Subclasses of
+// GR8, GR16, GR32, and GR64 which contain just the "a", "b", "c", and "d"
+// registers. On x86-32, GR16_ABCD and GR32_ABCD are classes for registers
+// that support 8-bit subreg operations. On x86-64, GR16_ABCD, GR32_ABCD,
+// and GR64_ABCD are classes for registers that support 8-bit h-register
+// operations.
+def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, BL)>;
+def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, (add AH, CH, DH, BH)>;
+def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)>;
+def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)>;
+def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)>;
+def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, ESP)>;
+def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI,
+ R8, R9, R11, RIP, RSP)>;
+def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX,
+ R8, R9, R10, R11,
+ RIP, RSP)>;
+
+// GR8_NOREX - GR8 registers which do not require a REX prefix.
+def GR8_NOREX : RegisterClass<"X86", [i8], 8,
+ (add AL, CL, DL, AH, CH, DH, BL, BH)> {
+ let AltOrders = [(sub GR8_NOREX, AH, BH, CH, DH)];
+ let AltOrderSelect = [{
+ return MF.getSubtarget<X86Subtarget>().is64Bit();
+ }];
+}
+// GR16_NOREX - GR16 registers which do not require a REX prefix.
+def GR16_NOREX : RegisterClass<"X86", [i16], 16,
+ (add AX, CX, DX, SI, DI, BX, BP, SP)>;
+// GR32_NOREX - GR32 registers which do not require a REX prefix.
+def GR32_NOREX : RegisterClass<"X86", [i32], 32,
+ (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)>;
+// GR64_NOREX - GR64 registers which do not require a REX prefix.
+def GR64_NOREX : RegisterClass<"X86", [i64], 64,
+ (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)>;
+
+// GR32_NOSP - GR32 registers except ESP.
+def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)>;
+
+// GR64_NOSP - GR64 registers except RSP (and RIP).
+def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)>;
+
+// GR32_NOREX_NOSP - GR32 registers which do not require a REX prefix except
+// ESP.
+def GR32_NOREX_NOSP : RegisterClass<"X86", [i32], 32,
+ (and GR32_NOREX, GR32_NOSP)>;
+
+// GR64_NOREX_NOSP - GR64_NOREX registers except RSP.
+def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64,
+ (and GR64_NOREX, GR64_NOSP)>;
+
+// Register classes used for ABIs that use 32-bit address accesses,
+// while using the whole x86_64 ISA.
+
+// In such cases, it is fine to use RIP as we are sure the 32 high
+// bits are not set. We do not need variants for NOSP as RIP is not
+// allowed there.
+// RIP is not spilled anywhere for now, so stick to 32-bit alignment
+// to save on memory space.
+// FIXME: We could allow all 64bit registers, but we would need
+// something to check that the 32 high bits are not set,
+// which we do not have right now.
+def LOW32_ADDR_ACCESS : RegisterClass<"X86", [i32], 32, (add GR32, RIP)>;
+
+// When RBP is used as a base pointer in a 32-bit address environment,
+// it is also safe to use the full register to access addresses.
+// Since RBP will never be spilled, stick to 32-bit alignment to save
+// on memory consumption.
+def LOW32_ADDR_ACCESS_RBP : RegisterClass<"X86", [i32], 32,
+ (add LOW32_ADDR_ACCESS, RBP)>;
+
+// A class to support the 'A' assembler constraint: [ER]AX then [ER]DX.
+def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)>;
+def GR64_AD : RegisterClass<"X86", [i64], 64, (add RAX, RDX)>;
+
+// Classes to support the 64-bit assembler constraint tied to a fixed
+// register in 32-bit mode. The second register is always the next in
+// the list. Wrap around causes an error.
+def GR32_DC : RegisterClass<"X86", [i32], 32, (add EDX, ECX)>;
+def GR32_CB : RegisterClass<"X86", [i32], 32, (add ECX, EBX)>;
+def GR32_BSI : RegisterClass<"X86", [i32], 32, (add EBX, ESI)>;
+def GR32_SIDI : RegisterClass<"X86", [i32], 32, (add ESI, EDI)>;
+def GR32_DIBP : RegisterClass<"X86", [i32], 32, (add EDI, EBP)>;
+def GR32_BPSP : RegisterClass<"X86", [i32], 32, (add EBP, ESP)>;
+
+// Scalar SSE2 floating point registers.
+def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>;
+
+def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>;
+
+
+// FIXME: This sets up the floating point register files as though they are f64
+// values, though they really are f80 values. This will cause us to spill
+// values as 64-bit quantities instead of 80-bit quantities, which is much much
+// faster on common hardware. In reality, this should be controlled by a
+// command line option or something.
+
+
+def RFP32 : RegisterClass<"X86",[f32], 32, (sequence "FP%u", 0, 6)>;
+def RFP64 : RegisterClass<"X86",[f64], 32, (add RFP32)>;
+def RFP80 : RegisterClass<"X86",[f80], 32, (add RFP32)>;
+
+// st(7) may not be allocatable.
+def RFP80_7 : RegisterClass<"X86",[f80], 32, (add FP7)> {
+ let isAllocatable = 0;
+}
+
+// Floating point stack registers (these are not allocatable by the
+// register allocator - the floating point stackifier is responsible
+// for transforming FPn allocations to STn registers)
+def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> {
+ let isAllocatable = 0;
+}
+
+// Helper to allow %st to print as %st(0) when it's encoded in the instruction.
+def RSTi : RegisterOperand<RST, "printSTiRegOperand">;
+
+// Generic vector registers: VR64 and VR128.
+// Ensure that float types are declared first - only float is legal on SSE1.
+def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>;
+def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128],
+ 128, (add FR32)>;
+def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
+ 256, (sequence "YMM%u", 0, 15)>;
+
+// Status flags registers.
+def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> {
+ let CopyCost = -1; // Don't allow copying of status registers.
+ let isAllocatable = 0;
+}
+def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> {
+ let CopyCost = -1; // Don't allow copying of status registers.
+ let isAllocatable = 0;
+}
+def DFCCR : RegisterClass<"X86", [i32], 32, (add DF)> {
+ let CopyCost = -1; // Don't allow copying of status registers.
+ let isAllocatable = 0;
+}
+
+// AVX-512 vector/mask registers.
+def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64],
+ 512, (sequence "ZMM%u", 0, 31)>;
+
+// Represents the lower 16 registers that have VEX/legacy encodable subregs.
+def VR512_0_15 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64],
+ 512, (sequence "ZMM%u", 0, 15)>;
+
+// Scalar AVX-512 floating point registers.
+def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>;
+
+def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>;
+
+// Extended VR128 and VR256 for AVX-512 instructions
+def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128],
+ 128, (add FR32X)>;
+def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
+ 256, (sequence "YMM%u", 0, 31)>;
+
+// Mask registers
+def VK1 : RegisterClass<"X86", [v1i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;}
+def VK2 : RegisterClass<"X86", [v2i1], 16, (add VK1)> {let Size = 16;}
+def VK4 : RegisterClass<"X86", [v4i1], 16, (add VK2)> {let Size = 16;}
+def VK8 : RegisterClass<"X86", [v8i1], 16, (add VK4)> {let Size = 16;}
+def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;}
+def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;}
+def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;}
+
+// Mask register pairs
+def KPAIRS : RegisterTuples<[sub_mask_0, sub_mask_1],
+ [(add K0, K2, K4, K6), (add K1, K3, K5, K7)]>;
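+
+// Illustration (sketch; tuple names assumed to follow the usual "_" joining):
+// RegisterTuples synthesizes one register per column of the lists above, so
+// KPAIRS provides the pairs K0_K1, K2_K3, K4_K5 and K6_K7, with sub_mask_0 and
+// sub_mask_1 selecting the even and odd element of each pair.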
+
+def VK1PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;}
+def VK2PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;}
+def VK4PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;}
+def VK8PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;}
+def VK16PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;}
+
+def VK1WM : RegisterClass<"X86", [v1i1], 16, (sub VK1, K0)> {let Size = 16;}
+def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;}
+def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;}
+def VK8WM : RegisterClass<"X86", [v8i1], 16, (sub VK8, K0)> {let Size = 16;}
+def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)> {let Size = 16;}
+def VK32WM : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;}
+def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
+
+// Bound registers
+def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;
+
+// Tiles
+let CopyCost = -1 in // Don't allow copying of tile registers
+def TILE : RegisterClass<"X86", [x86amx], 8192,
+ (sequence "TMM%u", 0, 7)> {let Size = 8192;}
+def TILECFG : RegisterClass<"X86", [untyped], 512, (add TMMCFG)> {
+ let CopyCost = -1; // Don't allow copying of tile config registers.
+ let isAllocatable = 1;
+ let Size = 512;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedBroadwell.td b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedBroadwell.td
new file mode 100644
index 000000000000..4aea7bc253bb
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -0,0 +1,1733 @@
+//=- X86SchedBroadwell.td - X86 Broadwell Scheduling ---------*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Broadwell to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def BroadwellModel : SchedMachineModel {
+ // All x86 instructions are modeled as a single micro-op, and BW can decode 4
+ // instructions per cycle.
+ let IssueWidth = 4;
+ let MicroOpBufferSize = 192; // Based on the reorder buffer.
+ let LoadLatency = 5;
+ let MispredictPenalty = 16;
+
+ // Based on the LSD (loop-stream detector) queue size and benchmarking data.
+ let LoopMicroOpBufferSize = 50;
+
+ // This flag is set to allow the scheduler to assign a default model to
+ // unrecognized opcodes.
+ let CompleteModel = 0;
+}
+
+let SchedModel = BroadwellModel in {
+
+// Broadwell can issue micro-ops to 8 different ports in one cycle.
+
+// Ports 0, 1, 5, and 6 handle all computation.
+// Port 4 gets the data half of stores. Store data can be available later than
+// the store address, but since we don't model the latency of stores, we can
+// ignore that.
+// Ports 2 and 3 are identical. They handle loads and the address half of
+// stores. Port 7 can handle address calculations.
+def BWPort0 : ProcResource<1>;
+def BWPort1 : ProcResource<1>;
+def BWPort2 : ProcResource<1>;
+def BWPort3 : ProcResource<1>;
+def BWPort4 : ProcResource<1>;
+def BWPort5 : ProcResource<1>;
+def BWPort6 : ProcResource<1>;
+def BWPort7 : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def BWPort01 : ProcResGroup<[BWPort0, BWPort1]>;
+def BWPort23 : ProcResGroup<[BWPort2, BWPort3]>;
+def BWPort237 : ProcResGroup<[BWPort2, BWPort3, BWPort7]>;
+def BWPort04 : ProcResGroup<[BWPort0, BWPort4]>;
+def BWPort05 : ProcResGroup<[BWPort0, BWPort5]>;
+def BWPort06 : ProcResGroup<[BWPort0, BWPort6]>;
+def BWPort15 : ProcResGroup<[BWPort1, BWPort5]>;
+def BWPort16 : ProcResGroup<[BWPort1, BWPort6]>;
+def BWPort56 : ProcResGroup<[BWPort5, BWPort6]>;
+def BWPort015 : ProcResGroup<[BWPort0, BWPort1, BWPort5]>;
+def BWPort056 : ProcResGroup<[BWPort0, BWPort5, BWPort6]>;
+def BWPort0156: ProcResGroup<[BWPort0, BWPort1, BWPort5, BWPort6]>;
+
+// 60 Entry Unified Scheduler
+def BWPortAny : ProcResGroup<[BWPort0, BWPort1, BWPort2, BWPort3, BWPort4,
+ BWPort5, BWPort6, BWPort7]> {
+ let BufferSize=60;
+}
+
+// Integer division issued on port 0.
+def BWDivider : ProcResource<1>;
+// FP division and sqrt on port 0.
+def BWFPDivider : ProcResource<1>;
+
+// Integer loads are 5 cycles, so ReadAfterLd registers needn't be available until 5
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 5>;
+
+// Vector loads are 5/5/6 cycles, so ReadAfterVec*Ld registers needn't be available
+// until 5/5/6 cycles after the memory operand.
+def : ReadAdvance<ReadAfterVecLd, 5>;
+def : ReadAdvance<ReadAfterVecXLd, 5>;
+def : ReadAdvance<ReadAfterVecYLd, 6>;
+
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass BWWriteResPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1,
+ int LoadLat = 5> {
+ // Register variant is using a single cycle on ExePort.
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+
+ // Memory variant also uses a cycle on port 2/3 and adds LoadLat cycles to
+ // the latency (default = 5).
+ def : WriteRes<SchedRW.Folded, !listconcat([BWPort23], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = !add(UOps, 1);
+ }
+}
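+
+// Illustration (sketch; the folded variant is assumed to use the usual
+// Write*Ld naming): a pair such as
+//   defm : BWWriteResPair<WriteALU, [BWPort0156], 1>;
+// expands to roughly
+//   def : WriteRes<WriteALU,   [BWPort0156]>           { let Latency = 1; }
+//   def : WriteRes<WriteALULd, [BWPort23, BWPort0156]> { let Latency = 6;
+//                                                        let NumMicroOps = 2; }
+// i.e. the folded-load form adds one port 2/3 micro-op and LoadLat (5) cycles
+// of latency.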
+
+// A folded store needs a cycle on port 4 for the store data, and an extra port
+// 2/3/7 cycle to recompute the address.
+def : WriteRes<WriteRMW, [BWPort237,BWPort4]>;
+
+// Arithmetic.
+defm : BWWriteResPair<WriteALU, [BWPort0156], 1>; // Simple integer ALU op.
+defm : BWWriteResPair<WriteADC, [BWPort06], 1>; // Integer ALU + flags op.
+
+// Integer multiplication.
+defm : BWWriteResPair<WriteIMul8, [BWPort1], 3>;
+defm : BWWriteResPair<WriteIMul16, [BWPort1,BWPort06,BWPort0156], 4, [1,1,2], 4>;
+defm : X86WriteRes<WriteIMul16Imm, [BWPort1,BWPort0156], 4, [1,1], 2>;
+defm : X86WriteRes<WriteIMul16ImmLd, [BWPort1,BWPort0156,BWPort23], 8, [1,1,1], 3>;
+defm : BWWriteResPair<WriteIMul16Reg, [BWPort1], 3>;
+defm : BWWriteResPair<WriteIMul32, [BWPort1,BWPort06,BWPort0156], 4, [1,1,1], 3>;
+defm : BWWriteResPair<WriteIMul32Imm, [BWPort1], 3>;
+defm : BWWriteResPair<WriteIMul32Reg, [BWPort1], 3>;
+defm : BWWriteResPair<WriteIMul64, [BWPort1,BWPort5], 4, [1,1], 2>;
+defm : BWWriteResPair<WriteIMul64Imm, [BWPort1], 3>;
+defm : BWWriteResPair<WriteIMul64Reg, [BWPort1], 3>;
+def : WriteRes<WriteIMulH, []> { let Latency = 3; }
+
+// TODO: Why isn't the BWDivider used consistently?
+defm : X86WriteRes<WriteDiv8, [BWPort0, BWDivider], 25, [1, 10], 1>;
+defm : X86WriteRes<WriteDiv16, [BWPort0,BWPort1,BWPort5,BWPort6,BWPort01,BWPort0156], 80, [7,7,3,3,1,11], 32>;
+defm : X86WriteRes<WriteDiv32, [BWPort0,BWPort1,BWPort5,BWPort6,BWPort01,BWPort0156], 80, [7,7,3,3,1,11], 32>;
+defm : X86WriteRes<WriteDiv64, [BWPort0,BWPort1,BWPort5,BWPort6,BWPort01,BWPort0156], 80, [7,7,3,3,1,11], 32>;
+defm : X86WriteRes<WriteDiv8Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 34, [2,2,2,1,1], 8>;
+defm : X86WriteRes<WriteDiv16Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 34, [2,2,2,1,1], 8>;
+defm : X86WriteRes<WriteDiv32Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 34, [2,2,2,1,1], 8>;
+defm : X86WriteRes<WriteDiv64Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 34, [2,2,2,1,1], 8>;
+
+defm : X86WriteRes<WriteIDiv8, [BWPort0, BWDivider], 25, [1,10], 1>;
+defm : X86WriteRes<WriteIDiv16, [BWPort0, BWDivider], 25, [1,10], 1>;
+defm : X86WriteRes<WriteIDiv32, [BWPort0, BWDivider], 25, [1,10], 1>;
+defm : X86WriteRes<WriteIDiv64, [BWPort0, BWDivider], 25, [1,10], 1>;
+defm : X86WriteRes<WriteIDiv8Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 35, [2,2,2,1,1], 8>;
+defm : X86WriteRes<WriteIDiv16Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 35, [2,2,2,1,1], 8>;
+defm : X86WriteRes<WriteIDiv32Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 35, [2,2,2,1,1], 8>;
+defm : X86WriteRes<WriteIDiv64Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 35, [2,2,2,1,1], 8>;
+
+defm : X86WriteRes<WriteCMPXCHG,[BWPort06, BWPort0156], 5, [2, 3], 5>;
+defm : X86WriteRes<WriteCMPXCHGRMW,[BWPort23, BWPort06, BWPort0156, BWPort237, BWPort4], 8, [1, 2, 1, 1, 1], 6>;
+defm : X86WriteRes<WriteBSWAP32, [BWPort15], 1, [1], 1>;
+defm : X86WriteRes<WriteBSWAP64, [BWPort06, BWPort15], 2, [1, 1], 2>;
+defm : X86WriteRes<WriteXCHG, [BWPort0156], 2, [3], 3>;
+
+defm : BWWriteResPair<WriteCRC32, [BWPort1], 3>;
+
+def : WriteRes<WriteLEA, [BWPort15]>; // LEA instructions can't fold loads.
+
+defm : BWWriteResPair<WriteCMOV, [BWPort06], 1>; // Conditional move.
+defm : X86WriteRes<WriteFCMOV, [BWPort1], 3, [1], 1>; // x87 conditional move.
+
+def : WriteRes<WriteSETCC, [BWPort06]>; // Setcc.
+def : WriteRes<WriteSETCCStore, [BWPort06,BWPort4,BWPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+defm : X86WriteRes<WriteLAHFSAHF, [BWPort06], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTest, [BWPort06], 1, [1], 1>; // Bit Test instrs
+defm : X86WriteRes<WriteBitTestImmLd, [BWPort06,BWPort23], 6, [1,1], 2>;
+defm : X86WriteRes<WriteBitTestRegLd, [BWPort0156,BWPort23], 6, [1,1], 2>;
+defm : X86WriteRes<WriteBitTestSet, [BWPort06], 1, [1], 1>; // Bit Test + Set instrs
+defm : X86WriteRes<WriteBitTestSetImmLd, [BWPort06,BWPort23], 5, [1,1], 3>;
+defm : X86WriteRes<WriteBitTestSetRegLd, [BWPort0156,BWPort23], 5, [1,1], 2>;
+
+// Bit counts.
+defm : BWWriteResPair<WriteBSF, [BWPort1], 3>;
+defm : BWWriteResPair<WriteBSR, [BWPort1], 3>;
+defm : BWWriteResPair<WriteLZCNT, [BWPort1], 3>;
+defm : BWWriteResPair<WriteTZCNT, [BWPort1], 3>;
+defm : BWWriteResPair<WritePOPCNT, [BWPort1], 3>;
+
+// Integer shifts and rotates.
+defm : BWWriteResPair<WriteShift, [BWPort06], 1>;
+defm : BWWriteResPair<WriteShiftCL, [BWPort06,BWPort0156], 3, [2,1], 3>;
+defm : BWWriteResPair<WriteRotate, [BWPort06], 1, [1], 1>;
+defm : BWWriteResPair<WriteRotateCL, [BWPort06,BWPort0156], 3, [2,1], 3>;
+
+// SHLD/SHRD.
+defm : X86WriteRes<WriteSHDrri, [BWPort1], 3, [1], 1>;
+defm : X86WriteRes<WriteSHDrrcl,[BWPort1,BWPort06,BWPort0156], 6, [1, 1, 2], 4>;
+defm : X86WriteRes<WriteSHDmri, [BWPort1,BWPort23,BWPort237,BWPort0156], 9, [1, 1, 1, 1], 4>;
+defm : X86WriteRes<WriteSHDmrcl,[BWPort1,BWPort23,BWPort237,BWPort06,BWPort0156], 11, [1, 1, 1, 1, 2], 6>;
+
+// BMI1 BEXTR/BLS, BMI2 BZHI
+defm : BWWriteResPair<WriteBEXTR, [BWPort06,BWPort15], 2, [1,1], 2>;
+defm : BWWriteResPair<WriteBLS, [BWPort15], 1>;
+defm : BWWriteResPair<WriteBZHI, [BWPort15], 1>;
+
+// Loads, stores, and moves, not folded with other operations.
+defm : X86WriteRes<WriteLoad, [BWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteStore, [BWPort237, BWPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteStoreNT, [BWPort237, BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteMove, [BWPort0156], 1, [1], 1>;
+
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+def : WriteRes<WriteZero, []>;
+
+// Treat misc copies as a move.
+def : InstRW<[WriteMove], (instrs COPY)>;
+
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+defm : BWWriteResPair<WriteJump, [BWPort06], 1>;
+
+// Floating point. This covers both scalar and vector operations.
+defm : X86WriteRes<WriteFLD0, [BWPort01], 1, [1], 1>;
+defm : X86WriteRes<WriteFLD1, [BWPort01], 1, [2], 2>;
+defm : X86WriteRes<WriteFLDC, [BWPort01], 1, [2], 2>;
+defm : X86WriteRes<WriteFLoad, [BWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteFLoadX, [BWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [BWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteFMaskedLoad, [BWPort23,BWPort5], 7, [1,2], 3>;
+defm : X86WriteRes<WriteFMaskedLoadY, [BWPort23,BWPort5], 8, [1,2], 3>;
+defm : X86WriteRes<WriteFStore, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreX, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreY, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNT, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTX, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTY, [BWPort237,BWPort4], 1, [1,1], 2>;
+
+defm : X86WriteRes<WriteFMaskedStore32, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMaskedStore64, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+
+defm : X86WriteRes<WriteFMove, [BWPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveX, [BWPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveY, [BWPort5], 1, [1], 1>;
+
+defm : BWWriteResPair<WriteFAdd, [BWPort1], 3, [1], 1, 5>; // Floating point add/sub.
+defm : BWWriteResPair<WriteFAddX, [BWPort1], 3, [1], 1, 5>; // Floating point add/sub (XMM).
+defm : BWWriteResPair<WriteFAddY, [BWPort1], 3, [1], 1, 6>; // Floating point add/sub (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFAddZ>;
+defm : BWWriteResPair<WriteFAdd64, [BWPort1], 3, [1], 1, 5>; // Floating point double add/sub.
+defm : BWWriteResPair<WriteFAdd64X, [BWPort1], 3, [1], 1, 5>; // Floating point double add/sub (XMM).
+defm : BWWriteResPair<WriteFAdd64Y, [BWPort1], 3, [1], 1, 6>; // Floating point double add/sub (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
+
+defm : BWWriteResPair<WriteFCmp, [BWPort1], 3, [1], 1, 5>; // Floating point compare.
+defm : BWWriteResPair<WriteFCmpX, [BWPort1], 3, [1], 1, 5>; // Floating point compare (XMM).
+defm : BWWriteResPair<WriteFCmpY, [BWPort1], 3, [1], 1, 6>; // Floating point compare (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFCmpZ>;
+defm : BWWriteResPair<WriteFCmp64, [BWPort1], 3, [1], 1, 5>; // Floating point double compare.
+defm : BWWriteResPair<WriteFCmp64X, [BWPort1], 3, [1], 1, 5>; // Floating point double compare (XMM).
+defm : BWWriteResPair<WriteFCmp64Y, [BWPort1], 3, [1], 1, 6>; // Floating point double compare (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
+
+defm : BWWriteResPair<WriteFCom, [BWPort1], 3>; // Floating point compare to flags (X87).
+defm : BWWriteResPair<WriteFComX, [BWPort1], 3>; // Floating point compare to flags (SSE).
+
+defm : BWWriteResPair<WriteFMul, [BWPort01], 3, [1], 1, 5>; // Floating point multiplication.
+defm : BWWriteResPair<WriteFMulX, [BWPort01], 3, [1], 1, 5>; // Floating point multiplication (XMM).
+defm : BWWriteResPair<WriteFMulY, [BWPort01], 3, [1], 1, 6>; // Floating point multiplication (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFMulZ>;
+defm : BWWriteResPair<WriteFMul64, [BWPort01], 3, [1], 1, 5>; // Floating point double multiplication.
+defm : BWWriteResPair<WriteFMul64X, [BWPort01], 3, [1], 1, 5>; // Floating point double multiplication (XMM).
+defm : BWWriteResPair<WriteFMul64Y, [BWPort01], 3, [1], 1, 6>; // Floating point double multiplication (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+
+//defm : BWWriteResPair<WriteFDiv, [BWPort0,BWFPDivider], 11, [1,3], 1, 5>; // Floating point division.
+defm : BWWriteResPair<WriteFDivX, [BWPort0,BWFPDivider], 11, [1,5], 1, 5>; // Floating point division (XMM).
+defm : BWWriteResPair<WriteFDivY, [BWPort0,BWPort015,BWFPDivider], 17, [2,1,10], 3, 6>; // Floating point division (YMM).
+defm : X86WriteResPairUnsupported<WriteFDivZ>;
+//defm : BWWriteResPair<WriteFDiv64, [BWPort0,BWFPDivider], 14, [1,8], 1, 5>; // Floating point division.
+defm : BWWriteResPair<WriteFDiv64X, [BWPort0,BWFPDivider], 14, [1,8], 1, 5>; // Floating point division (XMM).
+defm : BWWriteResPair<WriteFDiv64Y, [BWPort0,BWPort015,BWFPDivider], 23, [2,1,16], 3, 6>; // Floating point division (YMM).
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+
+defm : X86WriteRes<WriteFSqrt, [BWPort0,BWFPDivider], 11, [1,4], 1>; // Floating point square root.
+defm : X86WriteRes<WriteFSqrtLd, [BWPort0,BWPort23,BWFPDivider], 16, [1,1,7], 2>;
+defm : BWWriteResPair<WriteFSqrtX, [BWPort0,BWFPDivider], 11, [1,7], 1, 5>; // Floating point square root (XMM).
+defm : BWWriteResPair<WriteFSqrtY, [BWPort0,BWPort015,BWFPDivider], 21, [2,1,14], 3, 6>; // Floating point square root (YMM).
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
+defm : X86WriteRes<WriteFSqrt64, [BWPort0,BWFPDivider], 16, [1,8], 1>; // Floating point double square root.
+defm : X86WriteRes<WriteFSqrt64Ld, [BWPort0,BWPort23,BWFPDivider], 21, [1,1,14], 2>;
+defm : BWWriteResPair<WriteFSqrt64X, [BWPort0,BWFPDivider], 16, [1,14],1, 5>; // Floating point double square root (XMM).
+defm : BWWriteResPair<WriteFSqrt64Y, [BWPort0,BWPort015,BWFPDivider], 29, [2,1,28], 3, 6>; // Floating point double square root (YMM).
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
+defm : BWWriteResPair<WriteFSqrt80, [BWPort0,BWFPDivider], 23, [1,9]>; // Floating point long double square root.
+
+defm : BWWriteResPair<WriteFRcp, [BWPort0], 5, [1], 1, 5>; // Floating point reciprocal estimate.
+defm : BWWriteResPair<WriteFRcpX, [BWPort0], 5, [1], 1, 5>; // Floating point reciprocal estimate (XMM).
+defm : BWWriteResPair<WriteFRcpY, [BWPort0,BWPort015], 11, [2,1], 3, 6>; // Floating point reciprocal estimate (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+
+defm : BWWriteResPair<WriteFRsqrt, [BWPort0], 5, [1], 1, 5>; // Floating point reciprocal square root estimate.
+defm : BWWriteResPair<WriteFRsqrtX,[BWPort0], 5, [1], 1, 5>; // Floating point reciprocal square root estimate (XMM).
+defm : BWWriteResPair<WriteFRsqrtY,[BWPort0,BWPort015], 11, [2,1], 3, 6>; // Floating point reciprocal square root estimate (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+
+defm : BWWriteResPair<WriteFMA, [BWPort01], 5, [1], 1, 5>; // Fused Multiply Add.
+defm : BWWriteResPair<WriteFMAX, [BWPort01], 5, [1], 1, 5>; // Fused Multiply Add (XMM).
+defm : BWWriteResPair<WriteFMAY, [BWPort01], 5, [1], 1, 6>; // Fused Multiply Add (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
+defm : BWWriteResPair<WriteDPPD, [BWPort0,BWPort1,BWPort5], 9, [1,1,1], 3, 5>; // Floating point double dot product.
+defm : BWWriteResPair<WriteDPPS, [BWPort0,BWPort1,BWPort5], 14, [2,1,1], 4, 5>; // Floating point single dot product.
+defm : BWWriteResPair<WriteDPPSY, [BWPort0,BWPort1,BWPort5], 14, [2,1,1], 4, 6>; // Floating point single dot product (YMM).
+defm : X86WriteResPairUnsupported<WriteDPPSZ>;
+defm : BWWriteResPair<WriteFSign, [BWPort5], 1>; // Floating point fabs/fchs.
+defm : X86WriteRes<WriteFRnd, [BWPort23], 6, [1], 1>; // Floating point rounding.
+defm : X86WriteRes<WriteFRndY, [BWPort23], 6, [1], 1>; // Floating point rounding (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFRndZ>;
+defm : X86WriteRes<WriteFRndLd, [BWPort1,BWPort23], 11, [2,1], 3>;
+defm : X86WriteRes<WriteFRndYLd, [BWPort1,BWPort23], 12, [2,1], 3>;
+defm : BWWriteResPair<WriteFLogic, [BWPort5], 1, [1], 1, 5>; // Floating point and/or/xor logicals.
+defm : BWWriteResPair<WriteFLogicY, [BWPort5], 1, [1], 1, 6>; // Floating point and/or/xor logicals (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFLogicZ>;
+defm : BWWriteResPair<WriteFTest, [BWPort0], 1, [1], 1, 5>; // Floating point TEST instructions.
+defm : BWWriteResPair<WriteFTestY, [BWPort0], 1, [1], 1, 6>; // Floating point TEST instructions (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFTestZ>;
+defm : BWWriteResPair<WriteFShuffle, [BWPort5], 1, [1], 1, 5>; // Floating point vector shuffles.
+defm : BWWriteResPair<WriteFShuffleY, [BWPort5], 1, [1], 1, 6>; // Floating point vector shuffles (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
+defm : BWWriteResPair<WriteFVarShuffle, [BWPort5], 1, [1], 1, 5>; // Floating point vector variable shuffles.
+defm : BWWriteResPair<WriteFVarShuffleY, [BWPort5], 1, [1], 1, 6>; // Floating point vector variable shuffles.
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
+defm : BWWriteResPair<WriteFBlend, [BWPort015], 1, [1], 1, 5>; // Floating point vector blends.
+defm : BWWriteResPair<WriteFBlendY, [BWPort015], 1, [1], 1, 6>; // Floating point vector blends.
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+defm : BWWriteResPair<WriteFVarBlend, [BWPort5], 2, [2], 2, 5>; // Floating point vector variable blends.
+defm : BWWriteResPair<WriteFVarBlendY, [BWPort5], 2, [2], 2, 6>; // Floating point vector variable blends.
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
+
+// FMA Scheduling helper class.
+// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
+
+// Vector integer operations.
+defm : X86WriteRes<WriteVecLoad, [BWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [BWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [BWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT, [BWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [BWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [BWPort23,BWPort5], 7, [1,2], 3>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [BWPort23,BWPort5], 8, [1,2], 3>;
+defm : X86WriteRes<WriteVecStore, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreX, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreY, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNT, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNTY, [BWPort237,BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore32, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStore32Y, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStore64, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStore64Y, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMove, [BWPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [BWPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [BWPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveToGpr, [BWPort0], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [BWPort5], 1, [1], 1>;
+
+defm : X86WriteRes<WriteEMMS, [BWPort01,BWPort15,BWPort015,BWPort0156], 31, [8,1,21,1], 31>;
+
+defm : BWWriteResPair<WriteVecALU, [BWPort15], 1, [1], 1, 5>; // Vector integer ALU op, no logicals.
+defm : BWWriteResPair<WriteVecALUX, [BWPort15], 1, [1], 1, 5>; // Vector integer ALU op, no logicals.
+defm : BWWriteResPair<WriteVecALUY, [BWPort15], 1, [1], 1, 6>; // Vector integer ALU op, no logicals (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
+defm : BWWriteResPair<WriteVecLogic, [BWPort015], 1, [1], 1, 5>; // Vector integer and/or/xor.
+defm : BWWriteResPair<WriteVecLogicX,[BWPort015], 1, [1], 1, 5>; // Vector integer and/or/xor.
+defm : BWWriteResPair<WriteVecLogicY,[BWPort015], 1, [1], 1, 6>; // Vector integer and/or/xor (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
+defm : BWWriteResPair<WriteVecTest, [BWPort0,BWPort5], 2, [1,1], 2, 5>; // Vector integer TEST instructions.
+defm : BWWriteResPair<WriteVecTestY, [BWPort0,BWPort5], 4, [1,1], 2, 6>; // Vector integer TEST instructions (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+defm : BWWriteResPair<WriteVecIMul, [BWPort0], 5, [1], 1, 5>; // Vector integer multiply.
+defm : BWWriteResPair<WriteVecIMulX, [BWPort0], 5, [1], 1, 5>; // Vector integer multiply.
+defm : BWWriteResPair<WriteVecIMulY, [BWPort0], 5, [1], 1, 6>; // Vector integer multiply.
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
+defm : BWWriteResPair<WritePMULLD, [BWPort0], 10, [2], 2, 5>; // Vector PMULLD.
+defm : BWWriteResPair<WritePMULLDY, [BWPort0], 10, [2], 2, 6>; // Vector PMULLD (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WritePMULLDZ>;
+defm : BWWriteResPair<WriteShuffle, [BWPort5], 1, [1], 1, 5>; // Vector shuffles.
+defm : BWWriteResPair<WriteShuffleX, [BWPort5], 1, [1], 1, 5>; // Vector shuffles.
+defm : BWWriteResPair<WriteShuffleY, [BWPort5], 1, [1], 1, 6>; // Vector shuffles (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteShuffleZ>;
+defm : BWWriteResPair<WriteVarShuffle, [BWPort5], 1, [1], 1, 5>; // Vector variable shuffles.
+defm : BWWriteResPair<WriteVarShuffleX,[BWPort5], 1, [1], 1, 5>; // Vector variable shuffles.
+defm : BWWriteResPair<WriteVarShuffleY,[BWPort5], 1, [1], 1, 6>; // Vector variable shuffles (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+defm : BWWriteResPair<WriteBlend, [BWPort5], 1, [1], 1, 5>; // Vector blends.
+defm : BWWriteResPair<WriteBlendY, [BWPort5], 1, [1], 1, 6>; // Vector blends (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteBlendZ>;
+defm : BWWriteResPair<WriteVarBlend, [BWPort5], 2, [2], 2, 5>; // Vector variable blends.
+defm : BWWriteResPair<WriteVarBlendY, [BWPort5], 2, [2], 2, 6>; // Vector variable blends (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
+defm : BWWriteResPair<WriteMPSAD, [BWPort0, BWPort5], 7, [1, 2], 3, 5>; // Vector MPSAD.
+defm : BWWriteResPair<WriteMPSADY, [BWPort0, BWPort5], 7, [1, 2], 3, 6>; // Vector MPSAD.
+defm : X86WriteResPairUnsupported<WriteMPSADZ>;
+defm : BWWriteResPair<WritePSADBW, [BWPort0], 5, [1], 1, 5>; // Vector PSADBW.
+defm : BWWriteResPair<WritePSADBWX, [BWPort0], 5, [1], 1, 5>; // Vector PSADBW.
+defm : BWWriteResPair<WritePSADBWY, [BWPort0], 5, [1], 1, 6>; // Vector PSADBW (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WritePSADBWZ>;
+defm : BWWriteResPair<WritePHMINPOS, [BWPort0], 5>; // Vector PHMINPOS.
+
+// Vector integer shifts.
+defm : BWWriteResPair<WriteVecShift, [BWPort0], 1, [1], 1, 5>;
+defm : BWWriteResPair<WriteVecShiftX, [BWPort0,BWPort5], 2, [1,1], 2, 5>;
+defm : X86WriteRes<WriteVecShiftY, [BWPort0,BWPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftYLd, [BWPort0,BWPort23], 7, [1,1], 2>;
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
+
+defm : BWWriteResPair<WriteVecShiftImm, [BWPort0], 1, [1], 1, 5>;
+defm : BWWriteResPair<WriteVecShiftImmX, [BWPort0], 1, [1], 1, 5>; // Vector integer immediate shifts (XMM).
+defm : BWWriteResPair<WriteVecShiftImmY, [BWPort0], 1, [1], 1, 6>; // Vector integer immediate shifts (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+defm : BWWriteResPair<WriteVarVecShift, [BWPort0, BWPort5], 3, [2,1], 3, 5>; // Variable vector shifts.
+defm : BWWriteResPair<WriteVarVecShiftY, [BWPort0, BWPort5], 3, [2,1], 3, 6>; // Variable vector shifts (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+
+// Vector insert/extract operations.
+def : WriteRes<WriteVecInsert, [BWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteVecInsertLd, [BWPort5,BWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def : WriteRes<WriteVecExtract, [BWPort0,BWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def : WriteRes<WriteVecExtractSt, [BWPort4,BWPort5,BWPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+// Conversion between integer and float.
+defm : BWWriteResPair<WriteCvtSS2I, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPS2I, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPS2IY, [BWPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+defm : BWWriteResPair<WriteCvtSD2I, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPD2I, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPD2IY, [BWPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+
+defm : BWWriteResPair<WriteCvtI2SS, [BWPort1], 4>;
+defm : BWWriteResPair<WriteCvtI2PS, [BWPort1], 4>;
+defm : BWWriteResPair<WriteCvtI2PSY, [BWPort1], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+defm : BWWriteResPair<WriteCvtI2SD, [BWPort1], 4>;
+defm : BWWriteResPair<WriteCvtI2PD, [BWPort1], 4>;
+defm : BWWriteResPair<WriteCvtI2PDY, [BWPort1], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+
+defm : BWWriteResPair<WriteCvtSS2SD, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPS2PD, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPS2PDY, [BWPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
+defm : BWWriteResPair<WriteCvtSD2SS, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPD2PS, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPD2PSY, [BWPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
+
+defm : X86WriteRes<WriteCvtPH2PS, [BWPort0,BWPort5], 2, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSY, [BWPort0,BWPort5], 2, [1,1], 2>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZ>;
+defm : X86WriteRes<WriteCvtPH2PSLd, [BWPort0,BWPort23], 6, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSYLd, [BWPort0,BWPort23], 6, [1,1], 2>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZLd>;
+
+defm : X86WriteRes<WriteCvtPS2PH, [BWPort1,BWPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHY, [BWPort1,BWPort5], 6, [1,1], 2>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
+defm : X86WriteRes<WriteCvtPS2PHSt, [BWPort1,BWPort4,BWPort237], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteCvtPS2PHYSt, [BWPort1,BWPort4,BWPort237], 7, [1,1,1], 3>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
+
+// Strings instructions.
+
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [BWPort0]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrMLd, [BWPort0, BWPort23]> {
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [BWPort0, BWPort5, BWPort015, BWPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
+}
+def : WriteRes<WritePCmpEStrMLd, [BWPort0, BWPort5, BWPort23, BWPort015, BWPort0156]> {
+ let Latency = 24;
+ let NumMicroOps = 10;
+ let ResourceCycles = [4,3,1,1,1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [BWPort0]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrILd, [BWPort0, BWPort23]> {
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [BWPort0, BWPort5, BWPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [4,3,1];
+}
+def : WriteRes<WritePCmpEStrILd, [BWPort0, BWPort5, BWPort23, BWPort0156]> {
+ let Latency = 23;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
+}
+
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [BWPort0]> { let Latency = 3; }
+def : WriteRes<WriteVecMOVMSK, [BWPort0]> { let Latency = 3; }
+def : WriteRes<WriteVecMOVMSKY, [BWPort0]> { let Latency = 3; }
+def : WriteRes<WriteMMXMOVMSK, [BWPort0]> { let Latency = 1; }
+
+// AES instructions.
+def : WriteRes<WriteAESDecEnc, [BWPort5]> { // Decryption, encryption.
+ let Latency = 7;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def : WriteRes<WriteAESDecEncLd, [BWPort5, BWPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+
+def : WriteRes<WriteAESIMC, [BWPort5]> { // InvMixColumn.
+ let Latency = 14;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESIMCLd, [BWPort5, BWPort23]> {
+ let Latency = 19;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+
+def : WriteRes<WriteAESKeyGen, [BWPort0, BWPort5, BWPort015]> { // Key Generation.
+ let Latency = 29;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,7,2];
+}
+def : WriteRes<WriteAESKeyGenLd, [BWPort0, BWPort5, BWPort23, BWPort015]> {
+ let Latency = 33;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,7,1,1];
+}
+
+// Carry-less multiplication instructions.
+defm : BWWriteResPair<WriteCLMul, [BWPort0], 5>;
+
+// Catch-all for expensive system instructions.
+def : WriteRes<WriteSystem, [BWPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite;
+
+// AVX2.
+defm : BWWriteResPair<WriteFShuffle256, [BWPort5], 3, [1], 1, 6>; // Floating point 256-bit width vector shuffles.
+defm : BWWriteResPair<WriteFVarShuffle256, [BWPort5], 3, [1], 1, 6>; // Floating point 256-bit width vector variable shuffles.
+defm : BWWriteResPair<WriteShuffle256, [BWPort5], 3, [1], 1, 6>; // 256-bit width vector shuffles.
+defm : BWWriteResPair<WriteVarShuffle256, [BWPort5], 3, [1], 1, 6>; // 256-bit width vector variable shuffles.
+
+// Old microcoded instructions that nobody uses.
+def : WriteRes<WriteMicrocoded, [BWPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite;
+
+// Fence instructions.
+def : WriteRes<WriteFence, [BWPort23, BWPort4]>;
+
+// Load/store MXCSR.
+def : WriteRes<WriteLDMXCSR, [BWPort0,BWPort23,BWPort0156]> { let Latency = 7; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+def : WriteRes<WriteSTMXCSR, [BWPort4,BWPort5,BWPort237]> { let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+
+// Nop, not very useful except that it provides a model for nops!
+def : WriteRes<WriteNop, []>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : BWWriteResPair<WriteFHAdd, [BWPort1,BWPort5], 5, [1,2], 3, 5>;
+defm : BWWriteResPair<WriteFHAddY, [BWPort1,BWPort5], 5, [1,2], 3, 6>;
+defm : BWWriteResPair<WritePHAdd, [BWPort5,BWPort15], 3, [2,1], 3, 5>;
+defm : BWWriteResPair<WritePHAddX, [BWPort5,BWPort15], 3, [2,1], 3, 5>;
+defm : BWWriteResPair<WritePHAddY, [BWPort5,BWPort15], 3, [2,1], 3, 6>;
+
+// Remaining instrs.
+
+def BWWriteResGroup1 : SchedWriteRes<[BWPort0]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup1], (instregex "VPSLLVQ(Y?)rr",
+ "VPSRLVQ(Y?)rr")>;
+
+def BWWriteResGroup2 : SchedWriteRes<[BWPort1]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup2], (instregex "COM(P?)_FST0r",
+ "UCOM_F(P?)r")>;
+
+def BWWriteResGroup3 : SchedWriteRes<[BWPort5]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup3], (instrs MMX_MOVQ2DQrr)>;
+
+def BWWriteResGroup4 : SchedWriteRes<[BWPort6]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup4], (instregex "JMP(16|32|64)r")>;
+
+def BWWriteResGroup5 : SchedWriteRes<[BWPort01]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup5], (instrs FINCSTP, FNOP)>;
+
+def BWWriteResGroup6 : SchedWriteRes<[BWPort06]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup6], (instrs CDQ, CQO)>;
+
+def BWWriteResGroup7 : SchedWriteRes<[BWPort15]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup7], (instregex "ANDN(32|64)rr")>;
+
+def BWWriteResGroup8 : SchedWriteRes<[BWPort015]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup8], (instregex "VPBLENDD(Y?)rri")>;
+
+def BWWriteResGroup9 : SchedWriteRes<[BWPort0156]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup9], (instrs SGDT64m,
+ SIDT64m,
+ SMSW16m,
+ STRm,
+ SYSCALL)>;
+
+def BWWriteResGroup10 : SchedWriteRes<[BWPort4,BWPort237]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup10], (instrs FBSTPm)>;
+def: InstRW<[BWWriteResGroup10], (instregex "ST_FP(32|64|80)m")>;
+
+def BWWriteResGroup12 : SchedWriteRes<[BWPort01]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[BWWriteResGroup12], (instrs FDECSTP)>;
+
+def BWWriteResGroup14 : SchedWriteRes<[BWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[BWWriteResGroup14], (instrs LFENCE,
+ MFENCE,
+ WAIT,
+ XGETBV)>;
+
+def BWWriteResGroup15 : SchedWriteRes<[BWPort0,BWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup15], (instregex "(V?)CVTPS2PDrr",
+ "(V?)CVTSS2SDrr")>;
+
+def BWWriteResGroup16 : SchedWriteRes<[BWPort6,BWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup16], (instregex "CLFLUSH")>;
+
+def BWWriteResGroup17 : SchedWriteRes<[BWPort01,BWPort015]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup17], (instrs MMX_MOVDQ2Qrr)>;
+
+def BWWriteResGroup18 : SchedWriteRes<[BWPort237,BWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup18], (instrs SFENCE)>;
+
+def BWWriteResGroup20 : SchedWriteRes<[BWPort06,BWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup20], (instrs CWD,
+ JCXZ, JECXZ, JRCXZ,
+ ADC8i8, SBB8i8,
+ ADC16i16, SBB16i16,
+ ADC32i32, SBB32i32,
+ ADC64i32, SBB64i32)>;
+
+def BWWriteResGroup22 : SchedWriteRes<[BWPort4,BWPort6,BWPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup22], (instrs FNSTCW16m)>;
+
+def BWWriteResGroup24 : SchedWriteRes<[BWPort4,BWPort237,BWPort15]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup24], (instregex "MOVBE(16|32|64)mr")>;
+
+def BWWriteResGroup25 : SchedWriteRes<[BWPort4,BWPort237,BWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup25], (instrs PUSH16r, PUSH32r, PUSH64r, PUSH64i8,
+ STOSB, STOSL, STOSQ, STOSW)>;
+def: InstRW<[BWWriteResGroup25], (instregex "PUSH(16|32|64)rmr")>;
+
+def BWWriteResGroup27 : SchedWriteRes<[BWPort1]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup27], (instrs MMX_CVTPI2PSirr)>;
+def: InstRW<[BWWriteResGroup27], (instregex "P(DEP|EXT)(32|64)rr",
+ "(V?)CVTDQ2PS(Y?)rr")>;
+
+def BWWriteResGroup28 : SchedWriteRes<[BWPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup28], (instrs VPBROADCASTBrr,
+ VPBROADCASTWrr)>;
+
+def BWWriteResGroup33 : SchedWriteRes<[BWPort5,BWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[BWWriteResGroup33], (instrs MMX_PACKSSDWirr,
+ MMX_PACKSSWBirr,
+ MMX_PACKUSWBirr)>;
+
+def BWWriteResGroup34 : SchedWriteRes<[BWPort6,BWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[BWWriteResGroup34], (instregex "CLD")>;
+
+def BWWriteResGroup35 : SchedWriteRes<[BWPort06,BWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[BWWriteResGroup35], (instregex "RCL(8|16|32|64)r(1|i)",
+ "RCR(8|16|32|64)r(1|i)")>;
+
+def BWWriteResGroup37 : SchedWriteRes<[BWPort4,BWPort6,BWPort237,BWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[BWWriteResGroup37], (instregex "CALL(16|32|64)r")>;
+
+def BWWriteResGroup38 : SchedWriteRes<[BWPort4,BWPort237,BWPort06,BWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[BWWriteResGroup38], (instrs CALL64pcrel32)>;
+
+def BWWriteResGroup39 : SchedWriteRes<[BWPort0,BWPort1]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup39], (instregex "(V?)CVT(T?)SD2SI64rr",
+ "(V?)CVT(T?)SD2SIrr",
+ "(V?)CVT(T?)SS2SI64rr",
+ "(V?)CVT(T?)SS2SIrr")>;
+
+def BWWriteResGroup40 : SchedWriteRes<[BWPort0,BWPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup40], (instrs VCVTPS2PDYrr)>;
+
+def BWWriteResGroup41 : SchedWriteRes<[BWPort0,BWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup41], (instrs FNSTSW16r)>;
+
+def BWWriteResGroup42 : SchedWriteRes<[BWPort1,BWPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup42], (instrs MMX_CVTPI2PDirr)>;
+def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVT(T?)PD2PIirr",
+ "MMX_CVT(T?)PS2PIirr",
+ "(V?)CVTDQ2PDrr",
+ "(V?)CVTPD2PSrr",
+ "(V?)CVTSD2SSrr",
+ "(V?)CVTSI642SDrr",
+ "(V?)CVTSI2SDrr",
+ "(V?)CVTSI2SSrr",
+ "(V?)CVT(T?)PD2DQrr")>;
+
+def BWWriteResGroup43 : SchedWriteRes<[BWPort0,BWPort4,BWPort237]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup43], (instrs FNSTSWm)>;
+
+def BWWriteResGroup44 : SchedWriteRes<[BWPort1,BWPort4,BWPort237]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup44], (instregex "IST(T?)_FP(16|32|64)m",
+ "IST_F(16|32)m")>;
+
+def BWWriteResGroup45 : SchedWriteRes<[BWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [4];
+}
+def: InstRW<[BWWriteResGroup45], (instrs FNCLEX)>;
+
+def BWWriteResGroup46 : SchedWriteRes<[]> {
+ let Latency = 0;
+ let NumMicroOps = 4;
+ let ResourceCycles = [];
+}
+def: InstRW<[BWWriteResGroup46], (instrs VZEROUPPER)>;
+
+def BWWriteResGroup47 : SchedWriteRes<[BWPort0]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup47], (instregex "MUL_(FPrST0|FST0r|FrST0)")>;
+
+def BWWriteResGroup49 : SchedWriteRes<[BWPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup49], (instregex "MOVSX(16|32|64)rm(8|16|32)",
+ "MOVZX(16|32|64)rm(8|16)")>;
+def: InstRW<[BWWriteResGroup49], (instrs VBROADCASTSSrm,
+ VMOVDDUPrm, MOVDDUPrm,
+ VMOVSHDUPrm, MOVSHDUPrm,
+ VMOVSLDUPrm, MOVSLDUPrm,
+ VPBROADCASTDrm,
+ VPBROADCASTQrm)>;
+
+def BWWriteResGroup50 : SchedWriteRes<[BWPort1,BWPort5]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[BWWriteResGroup50], (instregex "(V?)CVTSI642SSrr")>;
+
+def BWWriteResGroup51 : SchedWriteRes<[BWPort1,BWPort6,BWPort06]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup51], (instregex "STR(16|32|64)r")>;
+
+def BWWriteResGroup54 : SchedWriteRes<[BWPort6,BWPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,4];
+}
+def: InstRW<[BWWriteResGroup54], (instrs PAUSE)>;
+
+def BWWriteResGroup55 : SchedWriteRes<[BWPort06,BWPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,4];
+}
+def: InstRW<[BWWriteResGroup55], (instrs XSETBV)>;
+
+def BWWriteResGroup57 : SchedWriteRes<[BWPort4,BWPort237,BWPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,4];
+}
+def: InstRW<[BWWriteResGroup57], (instregex "PUSHF(16|64)")>;
+
+def BWWriteResGroup58 : SchedWriteRes<[BWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup58], (instregex "LD_F(32|64|80)m")>;
+def: InstRW<[BWWriteResGroup58], (instrs VBROADCASTF128,
+ VBROADCASTI128,
+ VBROADCASTSDYrm,
+ VBROADCASTSSYrm,
+ VMOVDDUPYrm,
+ VMOVSHDUPYrm,
+ VMOVSLDUPYrm,
+ VPBROADCASTDYrm,
+ VPBROADCASTQYrm)>;
+
+def BWWriteResGroup59 : SchedWriteRes<[BWPort0,BWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup59], (instrs CVTPS2PDrm, VCVTPS2PDrm,
+ CVTSS2SDrm, VCVTSS2SDrm,
+ CVTSS2SDrm_Int, VCVTSS2SDrm_Int,
+ VPSLLVQrm,
+ VPSRLVQrm)>;
+
+def BWWriteResGroup60 : SchedWriteRes<[BWPort1,BWPort5]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup60], (instrs VCVTDQ2PDYrr,
+ VCVTPD2PSYrr,
+ VCVTPD2DQYrr,
+ VCVTTPD2DQYrr)>;
+
+def BWWriteResGroup62 : SchedWriteRes<[BWPort6,BWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup62], (instrs FARJMP64m)>;
+def: InstRW<[BWWriteResGroup62], (instregex "JMP(16|32|64)m")>;
+
+def BWWriteResGroup64 : SchedWriteRes<[BWPort23,BWPort15]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup64], (instregex "ANDN(32|64)rm",
+ "MOVBE(16|32|64)rm")>;
+
+def BWWriteResGroup65 : SchedWriteRes<[BWPort23,BWPort015]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup65], (instrs VINSERTF128rm,
+ VINSERTI128rm,
+ VPBLENDDrmi)>;
+
+def BWWriteResGroup66 : SchedWriteRes<[BWPort23,BWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup66], (instrs POP16r, POP32r, POP64r)>;
+def: InstRW<[BWWriteResGroup66], (instregex "POP(16|32|64)rmr")>;
+
+def BWWriteResGroup68 : SchedWriteRes<[BWPort1,BWPort6,BWPort06,BWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[BWWriteResGroup68], (instregex "SLDT(16|32|64)r")>;
+
+def BWWriteResGroup69 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[BWWriteResGroup69], (instregex "SAR(8|16|32|64)m(1|i)",
+ "SHL(8|16|32|64)m(1|i)",
+ "SHR(8|16|32|64)m(1|i)")>;
+
+def BWWriteResGroup70 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[BWWriteResGroup70], (instregex "POP(16|32|64)rmm",
+ "PUSH(16|32|64)rmm")>;
+
+def BWWriteResGroup71 : SchedWriteRes<[BWPort6,BWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,5];
+}
+def: InstRW<[BWWriteResGroup71], (instrs STD)>;
+
+def BWWriteResGroup73 : SchedWriteRes<[BWPort0,BWPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup73], (instrs VPSLLVQYrm,
+ VPSRLVQYrm)>;
+
+def BWWriteResGroup74 : SchedWriteRes<[BWPort1,BWPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup74], (instregex "FCOM(P?)(32|64)m")>;
+
+def BWWriteResGroup77 : SchedWriteRes<[BWPort23,BWPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup77], (instrs VPBLENDDYrmi)>;
+
+def BWWriteResGroup79 : SchedWriteRes<[BWPort5,BWPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[BWWriteResGroup79], (instrs MMX_PACKSSDWirm,
+ MMX_PACKSSWBirm,
+ MMX_PACKUSWBirm)>;
+
+def BWWriteResGroup80 : SchedWriteRes<[BWPort23,BWPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[BWWriteResGroup80], (instrs LEAVE, LEAVE64,
+ SCASB, SCASL, SCASQ, SCASW)>;
+
+def BWWriteResGroup82 : SchedWriteRes<[BWPort0,BWPort01,BWPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup82], (instrs FLDCW16m)>;
+
+def BWWriteResGroup84 : SchedWriteRes<[BWPort6,BWPort23,BWPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup84], (instrs LRETQ, RETQ)>;
+
+def BWWriteResGroup87 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[BWWriteResGroup87], (instregex "ROL(8|16|32|64)m(1|i)",
+ "ROR(8|16|32|64)m(1|i)")>;
+
+def BWWriteResGroup87_1 : SchedWriteRes<[BWPort06]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[BWWriteResGroup87_1], (instrs ROL8r1, ROL16r1, ROL32r1, ROL64r1,
+ ROR8r1, ROR16r1, ROR32r1, ROR64r1)>;
+
+def BWWriteResGroup88 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[BWWriteResGroup88], (instregex "XADD(8|16|32|64)rm")>;
+
+def BWWriteResGroup89 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,1,1];
+}
+def: InstRW<[BWWriteResGroup89], (instregex "CALL(16|32|64)m")>;
+def: InstRW<[BWWriteResGroup89], (instrs FARCALL64m)>;
+
+def BWWriteResGroup90 : SchedWriteRes<[BWPort6,BWPort06,BWPort15,BWPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 7;
+ let ResourceCycles = [2,2,1,2];
+}
+def: InstRW<[BWWriteResGroup90], (instrs LOOP)>;
+
+def BWWriteResGroup91 : SchedWriteRes<[BWPort1,BWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup91], (instrs MMX_CVTPI2PSirm,
+ CVTDQ2PSrm,
+ VCVTDQ2PSrm)>;
+def: InstRW<[BWWriteResGroup91], (instregex "P(DEP|EXT)(32|64)rm")>;
+
+def BWWriteResGroup92 : SchedWriteRes<[BWPort5,BWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup92], (instrs VPMOVSXBDYrm,
+ VPMOVSXBQYrm,
+ VPMOVSXBWYrm,
+ VPMOVSXDQYrm,
+ VPMOVSXWDYrm,
+ VPMOVSXWQYrm,
+ VPMOVZXWDYrm)>;
+
+def BWWriteResGroup97 : SchedWriteRes<[BWPort23,BWPort237,BWPort06,BWPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[BWWriteResGroup97], (instregex "RCL(8|16|32|64)m(1|i)",
+ "RCR(8|16|32|64)m(1|i)")>;
+
+def BWWriteResGroup99 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,3];
+}
+def: InstRW<[BWWriteResGroup99], (instregex "XCHG(8|16|32|64)rm")>;
+
+def BWWriteResGroup100 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06,BWPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,2,1];
+}
+def : SchedAlias<WriteADCRMW, BWWriteResGroup100>;
+def: InstRW<[BWWriteResGroup100], (instregex "ROL(8|16|32|64)mCL",
+ "ROR(8|16|32|64)mCL",
+ "SAR(8|16|32|64)mCL",
+ "SHL(8|16|32|64)mCL",
+ "SHR(8|16|32|64)mCL")>;
+
+def BWWriteResGroup101 : SchedWriteRes<[BWPort1,BWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup101], (instregex "(ADD|SUB|SUBR)_F(32|64)m",
+ "ILD_F(16|32|64)m")>;
+def: InstRW<[BWWriteResGroup101], (instrs VCVTPS2DQYrm,
+ VCVTTPS2DQYrm)>;
+
+def BWWriteResGroup105 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup105], (instregex "(V?)CVTSS2SI(64)?rm",
+ "(V?)CVT(T?)SD2SI64rm",
+ "(V?)CVT(T?)SD2SIrm",
+ "VCVTTSS2SI64rm",
+ "(V?)CVTTSS2SIrm")>;
+
+def BWWriteResGroup106 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup106], (instrs VCVTPS2PDYrm)>;
+
+def BWWriteResGroup107 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup107], (instrs CVTPD2PSrm,
+ CVTPD2DQrm,
+ CVTTPD2DQrm,
+ MMX_CVTPI2PDirm)>;
+def: InstRW<[BWWriteResGroup107], (instregex "MMX_CVT(T?)PD2PIirm",
+ "(V?)CVTDQ2PDrm",
+ "(V?)CVTSD2SSrm")>;
+
+def BWWriteResGroup108 : SchedWriteRes<[BWPort5,BWPort23,BWPort015]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTB(Y?)rm",
+ "VPBROADCASTW(Y?)rm")>;
+
+def BWWriteResGroup112 : SchedWriteRes<[BWPort23,BWPort06,BWPort0156]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,3];
+}
+def: InstRW<[BWWriteResGroup112], (instrs RDRAND16r, RDRAND32r, RDRAND64r)>;
+
+def BWWriteResGroup113 : SchedWriteRes<[BWPort1,BWPort6,BWPort23,BWPort0156]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[BWWriteResGroup113], (instregex "LAR(16|32|64)rm",
+ "LSL(16|32|64)rm")>;
+
+def BWWriteResGroup115 : SchedWriteRes<[BWPort0,BWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup115], (instregex "(V?)PCMPGTQrm")>;
+
+def BWWriteResGroup117 : SchedWriteRes<[BWPort1,BWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[BWWriteResGroup117], (instregex "FICOM(P?)(16|32)m")>;
+
+def BWWriteResGroup120 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[BWWriteResGroup120], (instregex "CVTTSS2SI64rm")>;
+
+def BWWriteResGroup122_1 : SchedWriteRes<[BWPort0,BWFPDivider]> {
+ let Latency = 11;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1,3]; // Really 2.5 cycle throughput
+}
+def : SchedAlias<WriteFDiv, BWWriteResGroup122_1>; // TODO - convert to ZnWriteResFpuPair
+
+def BWWriteResGroup123 : SchedWriteRes<[BWPort0,BWPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup123], (instregex "MUL_F(32|64)m")>;
+def: InstRW<[BWWriteResGroup123], (instrs VPCMPGTQYrm)>;
+
+def BWWriteResGroup128 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup128], (instrs VCVTDQ2PDYrm)>;
+
+def BWWriteResGroup131 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 7;
+ let ResourceCycles = [2,2,3];
+}
+def: InstRW<[BWWriteResGroup131], (instregex "RCL(16|32|64)rCL",
+ "RCR(16|32|64)rCL")>;
+
+def BWWriteResGroup132 : SchedWriteRes<[BWPort1,BWPort06,BWPort15,BWPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 9;
+ let ResourceCycles = [1,4,1,3];
+}
+def: InstRW<[BWWriteResGroup132], (instrs RCL8rCL)>;
+
+def BWWriteResGroup133 : SchedWriteRes<[BWPort06,BWPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,9];
+}
+def: InstRW<[BWWriteResGroup133], (instrs LOOPE)>;
+def: InstRW<[BWWriteResGroup133], (instrs LOOPNE)>;
+
+def BWWriteResGroup135 : SchedWriteRes<[BWPort1,BWPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[BWWriteResGroup135], (instregex "(ADD|SUB|SUBR)_FI(16|32)m")>;
+
+def BWWriteResGroup139_1 : SchedWriteRes<[BWPort0,BWFPDivider]> {
+ let Latency = 14;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1,4];
+}
+def : SchedAlias<WriteFDiv64, BWWriteResGroup139_1>; // TODO - convert to ZnWriteResFpuPair
+
+def BWWriteResGroup141 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> {
+ let Latency = 14;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup141], (instregex "MUL_FI(16|32)m")>;
+
+def BWWriteResGroup144 : SchedWriteRes<[BWPort1,BWPort6,BWPort23,BWPort0156]> {
+ let Latency = 14;
+ let NumMicroOps = 8;
+ let ResourceCycles = [2,2,1,3];
+}
+def: InstRW<[BWWriteResGroup144], (instregex "LAR(16|32|64)rr")>;
+
+def BWWriteResGroup145 : SchedWriteRes<[BWPort1,BWPort06,BWPort15,BWPort0156]> {
+ let Latency = 14;
+ let NumMicroOps = 10;
+ let ResourceCycles = [2,3,1,4];
+}
+def: InstRW<[BWWriteResGroup145], (instrs RCR8rCL)>;
+
+def BWWriteResGroup146 : SchedWriteRes<[BWPort0,BWPort1,BWPort6,BWPort0156]> {
+ let Latency = 14;
+ let NumMicroOps = 12;
+ let ResourceCycles = [2,1,4,5];
+}
+def: InstRW<[BWWriteResGroup146], (instrs XCH_F)>;
+
+def BWWriteResGroup147 : SchedWriteRes<[BWPort0]> {
+ let Latency = 15;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup147], (instregex "DIVR_(FPrST0|FST0r|FrST0)")>;
+
+def BWWriteResGroup149 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort06,BWPort15,BWPort0156]> {
+ let Latency = 15;
+ let NumMicroOps = 10;
+ let ResourceCycles = [1,1,1,4,1,2];
+}
+def: InstRW<[BWWriteResGroup149], (instregex "RCL(8|16|32|64)mCL")>;
+
+def BWWriteResGroup150 : SchedWriteRes<[BWPort0,BWPort23,BWFPDivider]> {
+ let Latency = 16;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1,5];
+}
+def : SchedAlias<WriteFDivLd, BWWriteResGroup150>; // TODO - convert to ZnWriteResFpuPair
+
+def BWWriteResGroup153 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06,BWPort15,BWPort0156]> {
+ let Latency = 16;
+ let NumMicroOps = 14;
+ let ResourceCycles = [1,1,1,4,2,5];
+}
+def: InstRW<[BWWriteResGroup153], (instrs CMPXCHG8B)>;
+
+def BWWriteResGroup154 : SchedWriteRes<[BWPort5,BWPort6]> {
+ let Latency = 8;
+ let NumMicroOps = 20;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup154], (instrs VZEROALL)>;
+
+def BWWriteResGroup159 : SchedWriteRes<[BWPort5,BWPort6,BWPort06,BWPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,5];
+}
+def: InstRW<[BWWriteResGroup159], (instrs CPUID)>;
+def: InstRW<[BWWriteResGroup159], (instrs RDTSC)>;
+
+def BWWriteResGroup160 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort06,BWPort15,BWPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,1,1,3,1,3];
+}
+def: InstRW<[BWWriteResGroup160], (instregex "RCR(8|16|32|64)mCL")>;
+
+def BWWriteResGroup161 : SchedWriteRes<[BWPort0,BWPort23,BWFPDivider]> {
+ let Latency = 19;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1,8];
+}
+def : SchedAlias<WriteFDiv64Ld, BWWriteResGroup161>; // TODO - convert to ZnWriteResFpuPair
+
+def BWWriteResGroup165 : SchedWriteRes<[BWPort0]> {
+ let Latency = 20;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup165], (instregex "DIV_(FPrST0|FST0r|FrST0)")>;
+
+def BWWriteResGroup167 : SchedWriteRes<[BWPort4,BWPort5,BWPort6,BWPort23,BWPort237,BWPort06,BWPort0156]> {
+ let Latency = 20;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,1,1,1,2];
+}
+def: InstRW<[BWWriteResGroup167], (instrs INSB, INSL, INSW)>;
+
+def BWWriteResGroup169 : SchedWriteRes<[BWPort0,BWPort23]> {
+ let Latency = 21;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup169], (instregex "DIV_F(32|64)m")>;
+
+def BWWriteResGroup171 : SchedWriteRes<[BWPort0,BWPort4,BWPort5,BWPort23,BWPort237,BWPort06,BWPort0156]> {
+ let Latency = 21;
+ let NumMicroOps = 19;
+ let ResourceCycles = [2,1,4,1,1,4,6];
+}
+def: InstRW<[BWWriteResGroup171], (instrs CMPXCHG16B)>;
+
+def BWWriteResGroup172 : SchedWriteRes<[BWPort6,BWPort23,BWPort0156]> {
+ let Latency = 22;
+ let NumMicroOps = 18;
+ let ResourceCycles = [1,1,16];
+}
+def: InstRW<[BWWriteResGroup172], (instrs POPF64)>;
+
+def BWWriteResGroup176 : SchedWriteRes<[BWPort6,BWPort23,BWPort0156]> {
+ let Latency = 23;
+ let NumMicroOps = 19;
+ let ResourceCycles = [3,1,15];
+}
+def: InstRW<[BWWriteResGroup176], (instregex "XRSTOR(64)?")>;
+
+def BWWriteResGroup177 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> {
+ let Latency = 24;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup177], (instregex "DIV_FI(16|32)m")>;
+
+def BWWriteResGroup180 : SchedWriteRes<[BWPort0,BWPort23]> {
+ let Latency = 26;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup180], (instregex "DIVR_F(32|64)m")>;
+
+def BWWriteResGroup182 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> {
+ let Latency = 29;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup182], (instregex "DIVR_FI(16|32)m")>;
+
+def BWWriteResGroup183_1 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
+ let Latency = 17;
+ let NumMicroOps = 7;
+ let ResourceCycles = [1,3,2,1];
+}
+def: InstRW<[BWWriteResGroup183_1], (instrs VGATHERDPDrm, VPGATHERDQrm,
+ VGATHERQPDrm, VPGATHERQQrm)>;
+
+def BWWriteResGroup183_2 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 9;
+ let ResourceCycles = [1,3,4,1];
+}
+def: InstRW<[BWWriteResGroup183_2], (instrs VGATHERDPDYrm, VPGATHERDQYrm,
+ VGATHERQPDYrm, VPGATHERQQYrm)>;
+
+def BWWriteResGroup183_3 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 9;
+ let ResourceCycles = [1,5,2,1];
+}
+def: InstRW<[BWWriteResGroup183_3], (instrs VGATHERQPSrm, VPGATHERQDrm)>;
+
+def BWWriteResGroup183_4 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 10;
+ let ResourceCycles = [1,4,4,1];
+}
+def: InstRW<[BWWriteResGroup183_4], (instrs VGATHERDPSrm, VPGATHERDDrm,
+ VGATHERQPSYrm, VPGATHERQDYrm)>;
+
+def BWWriteResGroup183_5 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
+ let Latency = 21;
+ let NumMicroOps = 14;
+ let ResourceCycles = [1,4,8,1];
+}
+def: InstRW<[BWWriteResGroup183_5], (instrs VGATHERDPSYrm, VPGATHERDDYrm)>;
+
+def BWWriteResGroup185 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort0156]> {
+ let Latency = 29;
+ let NumMicroOps = 27;
+ let ResourceCycles = [1,5,1,1,19];
+}
+def: InstRW<[BWWriteResGroup185], (instrs XSAVE64)>;
+
+def BWWriteResGroup186 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort0156]> {
+ let Latency = 30;
+ let NumMicroOps = 28;
+ let ResourceCycles = [1,6,1,1,19];
+}
+def: InstRW<[BWWriteResGroup186], (instrs XSAVE)>;
+def: InstRW<[BWWriteResGroup186], (instregex "XSAVEC", "XSAVES", "XSAVEOPT")>;
+
+def BWWriteResGroup191 : SchedWriteRes<[BWPort5,BWPort6,BWPort23,BWPort06,BWPort0156]> {
+ let Latency = 34;
+ let NumMicroOps = 23;
+ let ResourceCycles = [1,5,3,4,10];
+}
+def: InstRW<[BWWriteResGroup191], (instregex "IN(8|16|32)ri",
+ "IN(8|16|32)rr")>;
+
+def BWWriteResGroup194 : SchedWriteRes<[BWPort5,BWPort6,BWPort23,BWPort237,BWPort06,BWPort0156]> {
+ let Latency = 35;
+ let NumMicroOps = 23;
+ let ResourceCycles = [1,5,2,1,4,10];
+}
+def: InstRW<[BWWriteResGroup194], (instregex "OUT(8|16|32)ir",
+ "OUT(8|16|32)rr")>;
+
+def BWWriteResGroup196 : SchedWriteRes<[BWPort5,BWPort0156]> {
+ let Latency = 42;
+ let NumMicroOps = 22;
+ let ResourceCycles = [2,20];
+}
+def: InstRW<[BWWriteResGroup196], (instrs RDTSCP)>;
+
+def BWWriteResGroup197 : SchedWriteRes<[BWPort0,BWPort01,BWPort23,BWPort05,BWPort06,BWPort015,BWPort0156]> {
+ let Latency = 60;
+ let NumMicroOps = 64;
+ let ResourceCycles = [2,2,8,1,10,2,39];
+}
+def: InstRW<[BWWriteResGroup197], (instrs FLDENVm)>;
+
+def BWWriteResGroup198 : SchedWriteRes<[BWPort0,BWPort6,BWPort23,BWPort05,BWPort06,BWPort15,BWPort0156]> {
+ let Latency = 63;
+ let NumMicroOps = 88;
+ let ResourceCycles = [4,4,31,1,2,1,45];
+}
+def: InstRW<[BWWriteResGroup198], (instrs FXRSTOR64)>;
+
+def BWWriteResGroup199 : SchedWriteRes<[BWPort0,BWPort6,BWPort23,BWPort05,BWPort06,BWPort15,BWPort0156]> {
+ let Latency = 63;
+ let NumMicroOps = 90;
+ let ResourceCycles = [4,2,33,1,2,1,47];
+}
+def: InstRW<[BWWriteResGroup199], (instrs FXRSTOR)>;
+
+def BWWriteResGroup200 : SchedWriteRes<[BWPort5,BWPort01,BWPort0156]> {
+ let Latency = 75;
+ let NumMicroOps = 15;
+ let ResourceCycles = [6,3,6];
+}
+def: InstRW<[BWWriteResGroup200], (instrs FNINIT)>;
+
+def BWWriteResGroup202 : SchedWriteRes<[BWPort0,BWPort1,BWPort4,BWPort5,BWPort6,BWPort237,BWPort06,BWPort0156]> {
+ let Latency = 115;
+ let NumMicroOps = 100;
+ let ResourceCycles = [9,9,11,8,1,11,21,30];
+}
+def: InstRW<[BWWriteResGroup202], (instrs FSTENVm)>;
+
+def: InstRW<[WriteZero], (instrs CLC)>;
+
+
+// Instruction variants handled by the renamer. These might not need execution
+// ports in certain conditions.
+// See Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
+// section "Haswell and Broadwell Pipeline" > "Register allocation and
+// renaming".
+// These can be investigated with llvm-exegesis, e.g.
+// echo 'pxor %mm0, %mm0' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+// echo 'vxorpd %xmm0, %xmm0, %xmm1' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+
+def BWWriteZeroLatency : SchedWriteRes<[]> {
+ let Latency = 0;
+}
+
+def BWWriteZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteALU]>
+]>;
+def : InstRW<[BWWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
+ XOR32rr, XOR64rr)>;
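+// For illustration: when the two source registers are identical (e.g.
+// `xorl %eax, %eax`, i.e. XOR32rr with matching operands), the
+// ZeroIdiomPredicate fires and the BWWriteZeroLatency variant applies, so
+// the instruction is resolved at register rename and needs no execution
+// port; otherwise it falls back to the regular WriteALU resources.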
+
+def BWWriteFZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogic]>
+]>;
+def : InstRW<[BWWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr,
+ VXORPDrr)>;
+
+def BWWriteFZeroIdiomY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogicY]>
+]>;
+def : InstRW<[BWWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr)>;
+
+def BWWriteVZeroIdiomLogicX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicX]>
+]>;
+def : InstRW<[BWWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr)>;
+
+def BWWriteVZeroIdiomLogicY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicY]>
+]>;
+def : InstRW<[BWWriteVZeroIdiomLogicY], (instrs VPXORYrr)>;
+
+def BWWriteVZeroIdiomALUX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUX]>
+]>;
+def : InstRW<[BWWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
+ PSUBDrr, VPSUBDrr,
+ PSUBQrr, VPSUBQrr,
+ PSUBWrr, VPSUBWrr,
+ PCMPGTBrr, VPCMPGTBrr,
+ PCMPGTDrr, VPCMPGTDrr,
+ PCMPGTWrr, VPCMPGTWrr)>;
+
+def BWWriteVZeroIdiomALUY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUY]>
+]>;
+def : InstRW<[BWWriteVZeroIdiomALUY], (instrs VPSUBBYrr,
+ VPSUBDYrr,
+ VPSUBQYrr,
+ VPSUBWYrr,
+ VPCMPGTBYrr,
+ VPCMPGTDYrr,
+ VPCMPGTWYrr)>;
+
+def BWWritePCMPGTQ : SchedWriteRes<[BWPort0]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def BWWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [BWWritePCMPGTQ]>
+]>;
+def : InstRW<[BWWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr,
+ VPCMPGTQYrr)>;
+
+
+// CMOVs that use both Z and C flag require an extra uop.
+def BWWriteCMOVA_CMOVBErr : SchedWriteRes<[BWPort06,BWPort0156]> {
+ let Latency = 2;
+ let ResourceCycles = [1,1];
+ let NumMicroOps = 2;
+}
+
+def BWWriteCMOVA_CMOVBErm : SchedWriteRes<[BWPort23,BWPort06,BWPort0156]> {
+ let Latency = 7;
+ let ResourceCycles = [1,1,1];
+ let NumMicroOps = 3;
+}
+
+def BWCMOVA_CMOVBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArr_Or_CMOVBErr>, [BWWriteCMOVA_CMOVBErr]>,
+ SchedVar<NoSchedPred, [WriteCMOV]>
+]>;
+
+def BWCMOVA_CMOVBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArm_Or_CMOVBErm>, [BWWriteCMOVA_CMOVBErm]>,
+ SchedVar<NoSchedPred, [WriteCMOV.Folded]>
+]>;
+
+def : InstRW<[BWCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>;
+def : InstRW<[BWCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>;
+
+// SETCCs that use both Z and C flag require an extra uop.
+def BWWriteSETA_SETBEr : SchedWriteRes<[BWPort06,BWPort0156]> {
+ let Latency = 2;
+ let ResourceCycles = [1,1];
+ let NumMicroOps = 2;
+}
+
+def BWWriteSETA_SETBEm : SchedWriteRes<[BWPort4,BWPort237,BWPort06,BWPort0156]> {
+ let Latency = 3;
+ let ResourceCycles = [1,1,1,1];
+ let NumMicroOps = 4;
+}
+
+def BWSETA_SETBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAr_Or_SETBEr>, [BWWriteSETA_SETBEr]>,
+ SchedVar<NoSchedPred, [WriteSETCC]>
+]>;
+
+def BWSETA_SETBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAm_Or_SETBEm>, [BWWriteSETA_SETBEm]>,
+ SchedVar<NoSchedPred, [WriteSETCCStore]>
+]>;
+
+def : InstRW<[BWSETA_SETBErr], (instrs SETCCr)>;
+def : InstRW<[BWSETA_SETBErm], (instrs SETCCm)>;
+
+} // SchedModel
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedHaswell.td b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedHaswell.td
new file mode 100644
index 000000000000..746dbaeca189
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -0,0 +1,2008 @@
+//=- X86SchedHaswell.td - X86 Haswell Scheduling -------------*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Haswell to support instruction
+// scheduling and other instruction cost heuristics.
+//
+// Note that we define some instructions here that are not supported by Haswell,
+// but we still have to define them because KNL uses the HSW model.
+// They are currently tagged with a comment `Unsupported = 1`.
+// FIXME: Use Unsupported = 1 once KNL has its own model.
+//
+//===----------------------------------------------------------------------===//
+
+def HaswellModel : SchedMachineModel {
+ // All x86 instructions are modeled as a single micro-op, and HW can decode 4
+ // instructions per cycle.
+ let IssueWidth = 4;
+ let MicroOpBufferSize = 192; // Based on the reorder buffer.
+ let LoadLatency = 5;
+ let MispredictPenalty = 16;
+
+ // Based on the LSD (loop-stream detector) queue size and benchmarking data.
+ let LoopMicroOpBufferSize = 50;
+
+ // This flag is set to allow the scheduler to assign a default model to
+ // unrecognized opcodes.
+ let CompleteModel = 0;
+}
+
+let SchedModel = HaswellModel in {
+
+// Haswell can issue micro-ops to 8 different ports in one cycle.
+
+// Ports 0, 1, 5, and 6 handle all computation.
+// Port 4 gets the data half of stores. Store data can be available later than
+// the store address, but since we don't model the latency of stores, we can
+// ignore that.
+// Ports 2 and 3 are identical. They handle loads and the address half of
+// stores. Port 7 can handle address calculations.
+def HWPort0 : ProcResource<1>;
+def HWPort1 : ProcResource<1>;
+def HWPort2 : ProcResource<1>;
+def HWPort3 : ProcResource<1>;
+def HWPort4 : ProcResource<1>;
+def HWPort5 : ProcResource<1>;
+def HWPort6 : ProcResource<1>;
+def HWPort7 : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def HWPort01 : ProcResGroup<[HWPort0, HWPort1]>;
+def HWPort23 : ProcResGroup<[HWPort2, HWPort3]>;
+def HWPort237 : ProcResGroup<[HWPort2, HWPort3, HWPort7]>;
+def HWPort04 : ProcResGroup<[HWPort0, HWPort4]>;
+def HWPort05 : ProcResGroup<[HWPort0, HWPort5]>;
+def HWPort06 : ProcResGroup<[HWPort0, HWPort6]>;
+def HWPort15 : ProcResGroup<[HWPort1, HWPort5]>;
+def HWPort16 : ProcResGroup<[HWPort1, HWPort6]>;
+def HWPort56 : ProcResGroup<[HWPort5, HWPort6]>;
+def HWPort015 : ProcResGroup<[HWPort0, HWPort1, HWPort5]>;
+def HWPort056 : ProcResGroup<[HWPort0, HWPort5, HWPort6]>;
+def HWPort0156: ProcResGroup<[HWPort0, HWPort1, HWPort5, HWPort6]>;
+
+// 60 Entry Unified Scheduler
+def HWPortAny : ProcResGroup<[HWPort0, HWPort1, HWPort2, HWPort3, HWPort4,
+ HWPort5, HWPort6, HWPort7]> {
+ let BufferSize=60;
+}
+
+// Integer division issued on port 0.
+def HWDivider : ProcResource<1>;
+// FP division and sqrt on port 0.
+def HWFPDivider : ProcResource<1>;
+
+// Integer loads are 5 cycles, so ReadAfterLd registers needn't be available until 5
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 5>;
+
+// Vector loads are 5/6/7 cycles, so ReadAfterVec*Ld registers needn't be available
+// until 5/6/7 cycles after the memory operand.
+def : ReadAdvance<ReadAfterVecLd, 5>;
+def : ReadAdvance<ReadAfterVecXLd, 6>;
+def : ReadAdvance<ReadAfterVecYLd, 7>;
+
+def : ReadAdvance<ReadInt2Fpu, 0>;
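+
+// Illustrative example (not an upstream comment): a load-op form such as
+// `addl (%rdi), %eax` (ADD32rm) reads its register source through
+// ReadAfterLd, so that value is only required 5 cycles after the address
+// operands are ready; a producer finishing within that window does not add
+// to the critical path of the load-op.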
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass HWWriteResPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1,
+ int LoadLat = 5> {
+  // The register variant uses a single cycle on ExePort.
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+
+ // Memory variant also uses a cycle on port 2/3 and adds LoadLat cycles to
+ // the latency (default = 5).
+ def : WriteRes<SchedRW.Folded, !listconcat([HWPort23], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = !add(UOps, 1);
+ }
+}
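+
+// For example, the pair
+//   defm : HWWriteResPair<WriteALU, [HWPort0156], 1>;
+// (defined below) expands to a register variant (Latency = 1, one uop on
+// HWPort0156) and a folded-load variant (Latency = 1 + 5, with an extra uop
+// and cycle on HWPort23).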
+
+// A folded store needs a cycle on port 4 for the store data, and an extra port
+// 2/3/7 cycle to recompute the address.
+def : WriteRes<WriteRMW, [HWPort237,HWPort4]>;
+
+// Store_addr on 237.
+// Store_data on 4.
+defm : X86WriteRes<WriteStore, [HWPort237, HWPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteStoreNT, [HWPort237, HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteLoad, [HWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteMove, [HWPort0156], 1, [1], 1>;
+def : WriteRes<WriteZero, []>;
+
+// Arithmetic.
+defm : HWWriteResPair<WriteALU, [HWPort0156], 1>;
+defm : HWWriteResPair<WriteADC, [HWPort06, HWPort0156], 2, [1,1], 2>;
+
+// Integer multiplication.
+defm : HWWriteResPair<WriteIMul8, [HWPort1], 3>;
+defm : HWWriteResPair<WriteIMul16, [HWPort1,HWPort06,HWPort0156], 4, [1,1,2], 4>;
+defm : X86WriteRes<WriteIMul16Imm, [HWPort1,HWPort0156], 4, [1,1], 2>;
+defm : X86WriteRes<WriteIMul16ImmLd, [HWPort1,HWPort0156,HWPort23], 8, [1,1,1], 3>;
+defm : HWWriteResPair<WriteIMul16Reg, [HWPort1], 3>;
+defm : HWWriteResPair<WriteIMul32, [HWPort1,HWPort06,HWPort0156], 4, [1,1,1], 3>;
+defm : HWWriteResPair<WriteIMul32Imm, [HWPort1], 3>;
+defm : HWWriteResPair<WriteIMul32Reg, [HWPort1], 3>;
+defm : HWWriteResPair<WriteIMul64, [HWPort1,HWPort6], 4, [1,1], 2>;
+defm : HWWriteResPair<WriteIMul64Imm, [HWPort1], 3>;
+defm : HWWriteResPair<WriteIMul64Reg, [HWPort1], 3>;
+def : WriteRes<WriteIMulH, []> { let Latency = 3; }
+
+defm : X86WriteRes<WriteBSWAP32, [HWPort15], 1, [1], 1>;
+defm : X86WriteRes<WriteBSWAP64, [HWPort06, HWPort15], 2, [1,1], 2>;
+defm : X86WriteRes<WriteCMPXCHG,[HWPort06, HWPort0156], 5, [2,3], 5>;
+defm : X86WriteRes<WriteCMPXCHGRMW,[HWPort23,HWPort06,HWPort0156,HWPort237,HWPort4], 9, [1,2,1,1,1], 6>;
+defm : X86WriteRes<WriteXCHG, [HWPort0156], 2, [3], 3>;
+
+// Integer shifts and rotates.
+defm : HWWriteResPair<WriteShift, [HWPort06], 1>;
+defm : HWWriteResPair<WriteShiftCL, [HWPort06, HWPort0156], 3, [2,1], 3>;
+defm : HWWriteResPair<WriteRotate, [HWPort06], 1, [1], 1>;
+defm : HWWriteResPair<WriteRotateCL, [HWPort06, HWPort0156], 3, [2,1], 3>;
+
+// SHLD/SHRD.
+defm : X86WriteRes<WriteSHDrri, [HWPort1], 3, [1], 1>;
+defm : X86WriteRes<WriteSHDrrcl,[HWPort1, HWPort06, HWPort0156], 6, [1, 1, 2], 4>;
+defm : X86WriteRes<WriteSHDmri, [HWPort1, HWPort23, HWPort237, HWPort0156], 10, [1, 1, 1, 1], 4>;
+defm : X86WriteRes<WriteSHDmrcl,[HWPort1, HWPort23, HWPort237, HWPort06, HWPort0156], 12, [1, 1, 1, 1, 2], 6>;
+
+defm : HWWriteResPair<WriteJump, [HWPort06], 1>;
+defm : HWWriteResPair<WriteCRC32, [HWPort1], 3>;
+
+defm : HWWriteResPair<WriteCMOV, [HWPort06,HWPort0156], 2, [1,1], 2>; // Conditional move.
+defm : X86WriteRes<WriteFCMOV, [HWPort1], 3, [1], 1>; // x87 conditional move.
+def : WriteRes<WriteSETCC, [HWPort06]>; // Setcc.
+def : WriteRes<WriteSETCCStore, [HWPort06,HWPort4,HWPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+defm : X86WriteRes<WriteLAHFSAHF, [HWPort06], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTest, [HWPort06], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTestImmLd, [HWPort06,HWPort23], 6, [1,1], 2>;
+defm : X86WriteRes<WriteBitTestRegLd, [], 1, [], 10>;
+defm : X86WriteRes<WriteBitTestSet, [HWPort06], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTestSetImmLd, [HWPort06,HWPort23], 6, [1,1], 3>;
+//defm : X86WriteRes<WriteBitTestSetRegLd, [], 1, [], 11>;
+
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [HWPort15]>;
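+// For illustration (assumption, not taken from the model): a two-component form
+// like "leal (%rax,%rbx), %ecx" is covered by WriteLEA on ports 1/5, whereas a
+// three-component form such as "leal 8(%rax,%rbx,4), %ecx" takes the slower
+// port-1-only path that the comment above deliberately leaves unmodeled.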
+
+// Bit counts.
+defm : HWWriteResPair<WriteBSF, [HWPort1], 3>;
+defm : HWWriteResPair<WriteBSR, [HWPort1], 3>;
+defm : HWWriteResPair<WriteLZCNT, [HWPort1], 3>;
+defm : HWWriteResPair<WriteTZCNT, [HWPort1], 3>;
+defm : HWWriteResPair<WritePOPCNT, [HWPort1], 3>;
+
+// BMI1 BEXTR/BLS, BMI2 BZHI
+defm : HWWriteResPair<WriteBEXTR, [HWPort06,HWPort15], 2, [1,1], 2>;
+defm : HWWriteResPair<WriteBLS, [HWPort15], 1>;
+defm : HWWriteResPair<WriteBZHI, [HWPort15], 1>;
+
+// TODO: Why isn't the HWDivider used?
+defm : X86WriteRes<WriteDiv8, [HWPort0,HWPort1,HWPort5,HWPort6], 22, [], 9>;
+defm : X86WriteRes<WriteDiv16, [HWPort0,HWPort1,HWPort5,HWPort6,HWPort01,HWPort0156], 98, [7,7,3,3,1,11], 32>;
+defm : X86WriteRes<WriteDiv32, [HWPort0,HWPort1,HWPort5,HWPort6,HWPort01,HWPort0156], 98, [7,7,3,3,1,11], 32>;
+defm : X86WriteRes<WriteDiv64, [HWPort0,HWPort1,HWPort5,HWPort6,HWPort01,HWPort0156], 98, [7,7,3,3,1,11], 32>;
+defm : X86WriteRes<WriteDiv8Ld, [HWPort0,HWPort23,HWDivider], 29, [1,1,10], 2>;
+defm : X86WriteRes<WriteDiv16Ld, [HWPort0,HWPort23,HWDivider], 29, [1,1,10], 2>;
+defm : X86WriteRes<WriteDiv32Ld, [HWPort0,HWPort23,HWDivider], 29, [1,1,10], 2>;
+defm : X86WriteRes<WriteDiv64Ld, [HWPort0,HWPort23,HWDivider], 29, [1,1,10], 2>;
+
+defm : X86WriteRes<WriteIDiv8, [HWPort0,HWPort1,HWPort5,HWPort6], 23, [], 9>;
+defm : X86WriteRes<WriteIDiv16, [HWPort0,HWPort1,HWPort5,HWPort6,HWPort06,HWPort0156], 112, [4,2,4,8,14,34], 66>;
+defm : X86WriteRes<WriteIDiv32, [HWPort0,HWPort1,HWPort5,HWPort6,HWPort06,HWPort0156], 112, [4,2,4,8,14,34], 66>;
+defm : X86WriteRes<WriteIDiv64, [HWPort0,HWPort1,HWPort5,HWPort6,HWPort06,HWPort0156], 112, [4,2,4,8,14,34], 66>;
+defm : X86WriteRes<WriteIDiv8Ld, [HWPort0,HWPort23,HWDivider], 29, [1,1,10], 2>;
+defm : X86WriteRes<WriteIDiv16Ld, [HWPort0,HWPort23,HWDivider], 29, [1,1,10], 2>;
+defm : X86WriteRes<WriteIDiv32Ld, [HWPort0,HWPort23,HWDivider], 29, [1,1,10], 2>;
+defm : X86WriteRes<WriteIDiv64Ld, [HWPort0,HWPort23,HWDivider], 29, [1,1,10], 2>;
+
+// Scalar and vector floating point.
+defm : X86WriteRes<WriteFLD0, [HWPort01], 1, [1], 1>;
+defm : X86WriteRes<WriteFLD1, [HWPort01], 1, [2], 2>;
+defm : X86WriteRes<WriteFLDC, [HWPort01], 1, [2], 2>;
+defm : X86WriteRes<WriteFLoad, [HWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteFLoadX, [HWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [HWPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteFMaskedLoad, [HWPort23,HWPort5], 8, [1,2], 3>;
+defm : X86WriteRes<WriteFMaskedLoadY, [HWPort23,HWPort5], 9, [1,2], 3>;
+defm : X86WriteRes<WriteFStore, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreX, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreY, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNT, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTX, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTY, [HWPort237,HWPort4], 1, [1,1], 2>;
+
+defm : X86WriteRes<WriteFMaskedStore32, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMaskedStore64, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+
+defm : X86WriteRes<WriteFMove, [HWPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveX, [HWPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveY, [HWPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteEMMS, [HWPort01,HWPort15,HWPort015,HWPort0156], 31, [8,1,21,1], 31>;
+
+defm : HWWriteResPair<WriteFAdd, [HWPort1], 3, [1], 1, 5>;
+defm : HWWriteResPair<WriteFAddX, [HWPort1], 3, [1], 1, 6>;
+defm : HWWriteResPair<WriteFAddY, [HWPort1], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteFAddZ, [HWPort1], 3, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFAdd64, [HWPort1], 3, [1], 1, 5>;
+defm : HWWriteResPair<WriteFAdd64X, [HWPort1], 3, [1], 1, 6>;
+defm : HWWriteResPair<WriteFAdd64Y, [HWPort1], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteFAdd64Z, [HWPort1], 3, [1], 1, 7>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteFCmp, [HWPort1], 3, [1], 1, 5>;
+defm : HWWriteResPair<WriteFCmpX, [HWPort1], 3, [1], 1, 6>;
+defm : HWWriteResPair<WriteFCmpY, [HWPort1], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteFCmpZ, [HWPort1], 3, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFCmp64, [HWPort1], 3, [1], 1, 5>;
+defm : HWWriteResPair<WriteFCmp64X, [HWPort1], 3, [1], 1, 6>;
+defm : HWWriteResPair<WriteFCmp64Y, [HWPort1], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteFCmp64Z, [HWPort1], 3, [1], 1, 7>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteFCom, [HWPort1], 3>;
+defm : HWWriteResPair<WriteFComX, [HWPort1], 3>;
+
+defm : HWWriteResPair<WriteFMul, [HWPort01], 5, [1], 1, 5>;
+defm : HWWriteResPair<WriteFMulX, [HWPort01], 5, [1], 1, 6>;
+defm : HWWriteResPair<WriteFMulY, [HWPort01], 5, [1], 1, 7>;
+defm : HWWriteResPair<WriteFMulZ, [HWPort01], 5, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFMul64, [HWPort01], 5, [1], 1, 5>;
+defm : HWWriteResPair<WriteFMul64X, [HWPort01], 5, [1], 1, 6>;
+defm : HWWriteResPair<WriteFMul64Y, [HWPort01], 5, [1], 1, 7>;
+defm : HWWriteResPair<WriteFMul64Z, [HWPort01], 5, [1], 1, 7>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteFDiv, [HWPort0,HWFPDivider], 13, [1,7], 1, 5>;
+defm : HWWriteResPair<WriteFDivX, [HWPort0,HWFPDivider], 13, [1,7], 1, 6>;
+defm : HWWriteResPair<WriteFDivY, [HWPort0,HWPort15,HWFPDivider], 21, [2,1,14], 3, 7>;
+defm : HWWriteResPair<WriteFDivZ, [HWPort0,HWPort15,HWFPDivider], 21, [2,1,14], 3, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFDiv64, [HWPort0,HWFPDivider], 20, [1,14], 1, 5>;
+defm : HWWriteResPair<WriteFDiv64X, [HWPort0,HWFPDivider], 20, [1,14], 1, 6>;
+defm : HWWriteResPair<WriteFDiv64Y, [HWPort0,HWPort15,HWFPDivider], 35, [2,1,28], 3, 7>;
+defm : HWWriteResPair<WriteFDiv64Z, [HWPort0,HWPort15,HWFPDivider], 35, [2,1,28], 3, 7>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteFRcp, [HWPort0], 5, [1], 1, 5>;
+defm : HWWriteResPair<WriteFRcpX, [HWPort0], 5, [1], 1, 6>;
+defm : HWWriteResPair<WriteFRcpY, [HWPort0,HWPort015], 11, [2,1], 3, 7>;
+defm : HWWriteResPair<WriteFRcpZ, [HWPort0,HWPort015], 11, [2,1], 3, 7>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteFRsqrt, [HWPort0], 5, [1], 1, 5>;
+defm : HWWriteResPair<WriteFRsqrtX,[HWPort0], 5, [1], 1, 6>;
+defm : HWWriteResPair<WriteFRsqrtY,[HWPort0,HWPort015], 11, [2,1], 3, 7>;
+defm : HWWriteResPair<WriteFRsqrtZ,[HWPort0,HWPort015], 11, [2,1], 3, 7>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteFSqrt, [HWPort0,HWFPDivider], 11, [1,7], 1, 5>;
+defm : HWWriteResPair<WriteFSqrtX, [HWPort0,HWFPDivider], 11, [1,7], 1, 6>;
+defm : HWWriteResPair<WriteFSqrtY, [HWPort0,HWPort15,HWFPDivider], 21, [2,1,14], 3, 7>;
+defm : HWWriteResPair<WriteFSqrtZ, [HWPort0,HWPort15,HWFPDivider], 21, [2,1,14], 3, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFSqrt64, [HWPort0,HWFPDivider], 16, [1,14], 1, 5>;
+defm : HWWriteResPair<WriteFSqrt64X, [HWPort0,HWFPDivider], 16, [1,14], 1, 6>;
+defm : HWWriteResPair<WriteFSqrt64Y, [HWPort0,HWPort15,HWFPDivider], 35, [2,1,28], 3, 7>;
+defm : HWWriteResPair<WriteFSqrt64Z, [HWPort0,HWPort15,HWFPDivider], 35, [2,1,28], 3, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFSqrt80, [HWPort0,HWFPDivider], 23, [1,17]>;
+
+defm : HWWriteResPair<WriteFMA, [HWPort01], 5, [1], 1, 5>;
+defm : HWWriteResPair<WriteFMAX, [HWPort01], 5, [1], 1, 6>;
+defm : HWWriteResPair<WriteFMAY, [HWPort01], 5, [1], 1, 7>;
+defm : HWWriteResPair<WriteFMAZ, [HWPort01], 5, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteDPPD, [HWPort0,HWPort1,HWPort5], 9, [1,1,1], 3, 6>;
+defm : HWWriteResPair<WriteDPPS, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 6>;
+defm : HWWriteResPair<WriteDPPSY, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 7>;
+defm : HWWriteResPair<WriteDPPSZ, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFSign, [HWPort0], 1>;
+defm : X86WriteRes<WriteFRnd, [HWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteFRndY, [HWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteFRndZ, [HWPort23], 6, [1], 1>; // Unsupported = 1
+defm : X86WriteRes<WriteFRndLd, [HWPort1,HWPort23], 12, [2,1], 3>;
+defm : X86WriteRes<WriteFRndYLd, [HWPort1,HWPort23], 13, [2,1], 3>;
+defm : X86WriteRes<WriteFRndZLd, [HWPort1,HWPort23], 13, [2,1], 3>; // Unsupported = 1
+defm : HWWriteResPair<WriteFLogic, [HWPort5], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteFLogicY, [HWPort5], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteFLogicZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFTest, [HWPort0], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteFTestY, [HWPort0], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteFTestZ, [HWPort0], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFShuffle, [HWPort5], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteFShuffleY, [HWPort5], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteFShuffleZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFVarShuffle, [HWPort5], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteFVarShuffleY, [HWPort5], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteFVarShuffleZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFBlend, [HWPort015], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteFBlendY, [HWPort015], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteFBlendZ, [HWPort015], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFShuffle256, [HWPort5], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteFVarShuffle256, [HWPort5], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteFVarBlend, [HWPort5], 2, [2], 2, 6>;
+defm : HWWriteResPair<WriteFVarBlendY, [HWPort5], 2, [2], 2, 7>;
+defm : HWWriteResPair<WriteFVarBlendZ, [HWPort5], 2, [2], 2, 7>; // Unsupported = 1
+
+// Conversion between integer and float.
+defm : HWWriteResPair<WriteCvtSD2I, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPD2I, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPD2IY, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPD2IZ, [HWPort1], 3>; // Unsupported = 1
+defm : HWWriteResPair<WriteCvtSS2I, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPS2I, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPS2IY, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPS2IZ, [HWPort1], 3>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteCvtI2SD, [HWPort1], 4>;
+defm : HWWriteResPair<WriteCvtI2PD, [HWPort1], 4>;
+defm : HWWriteResPair<WriteCvtI2PDY, [HWPort1], 4>;
+defm : HWWriteResPair<WriteCvtI2PDZ, [HWPort1], 4>; // Unsupported = 1
+defm : HWWriteResPair<WriteCvtI2SS, [HWPort1], 4>;
+defm : HWWriteResPair<WriteCvtI2PS, [HWPort1], 4>;
+defm : HWWriteResPair<WriteCvtI2PSY, [HWPort1], 4>;
+defm : HWWriteResPair<WriteCvtI2PSZ, [HWPort1], 4>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteCvtSS2SD, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPS2PD, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPS2PDY, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPS2PDZ, [HWPort1], 3>; // Unsupported = 1
+defm : HWWriteResPair<WriteCvtSD2SS, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPD2PS, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPD2PSY, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtPD2PSZ, [HWPort1], 3>; // Unsupported = 1
+
+defm : X86WriteRes<WriteCvtPH2PS, [HWPort0,HWPort5], 2, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSY, [HWPort0,HWPort5], 2, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSZ, [HWPort0,HWPort5], 2, [1,1], 2>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtPH2PSLd, [HWPort0,HWPort23], 6, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSYLd, [HWPort0,HWPort23], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSZLd, [HWPort0,HWPort23], 7, [1,1], 2>; // Unsupported = 1
+
+defm : X86WriteRes<WriteCvtPS2PH, [HWPort1,HWPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHY, [HWPort1,HWPort5], 6, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHZ, [HWPort1,HWPort5], 6, [1,1], 2>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtPS2PHSt, [HWPort1,HWPort4,HWPort5,HWPort237], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteCvtPS2PHYSt, [HWPort1,HWPort4,HWPort5,HWPort237], 7, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteCvtPS2PHZSt, [HWPort1,HWPort4,HWPort5,HWPort237], 7, [1,1,1,1], 4>; // Unsupported = 1
+
+// Vector integer operations.
+defm : X86WriteRes<WriteVecLoad, [HWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [HWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [HWPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT, [HWPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [HWPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [HWPort23,HWPort5], 8, [1,2], 3>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [HWPort23,HWPort5], 9, [1,2], 3>;
+defm : X86WriteRes<WriteVecStore, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreX, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreY, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNT, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNTY, [HWPort237,HWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore32, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStore32Y, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStore64, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStore64Y, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMove, [HWPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [HWPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [HWPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveToGpr, [HWPort0], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [HWPort5], 1, [1], 1>;
+
+defm : HWWriteResPair<WriteVecLogic, [HWPort015], 1, [1], 1, 5>;
+defm : HWWriteResPair<WriteVecLogicX,[HWPort015], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteVecLogicY,[HWPort015], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteVecLogicZ,[HWPort015], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteVecTest, [HWPort0,HWPort5], 2, [1,1], 2, 6>;
+defm : HWWriteResPair<WriteVecTestY, [HWPort0,HWPort5], 4, [1,1], 2, 7>;
+defm : HWWriteResPair<WriteVecTestZ, [HWPort0,HWPort5], 4, [1,1], 2, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteVecALU, [HWPort15], 1, [1], 1, 5>;
+defm : HWWriteResPair<WriteVecALUX, [HWPort15], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteVecALUY, [HWPort15], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteVecALUZ, [HWPort15], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteVecIMul, [HWPort0], 5, [1], 1, 5>;
+defm : HWWriteResPair<WriteVecIMulX, [HWPort0], 5, [1], 1, 6>;
+defm : HWWriteResPair<WriteVecIMulY, [HWPort0], 5, [1], 1, 7>;
+defm : HWWriteResPair<WriteVecIMulZ, [HWPort0], 5, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WritePMULLD, [HWPort0], 10, [2], 2, 6>;
+defm : HWWriteResPair<WritePMULLDY, [HWPort0], 10, [2], 2, 7>;
+defm : HWWriteResPair<WritePMULLDZ, [HWPort0], 10, [2], 2, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteShuffle, [HWPort5], 1, [1], 1, 5>;
+defm : HWWriteResPair<WriteShuffleX, [HWPort5], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteShuffleY, [HWPort5], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteShuffleZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteVarShuffle, [HWPort5], 1, [1], 1, 5>;
+defm : HWWriteResPair<WriteVarShuffleX,[HWPort5], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteVarShuffleY,[HWPort5], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteVarShuffleZ,[HWPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteBlend, [HWPort5], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteBlendY, [HWPort5], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteBlendZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteShuffle256, [HWPort5], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteVarShuffle256, [HWPort5], 3, [1], 1, 7>;
+defm : HWWriteResPair<WriteVarBlend, [HWPort5], 2, [2], 2, 6>;
+defm : HWWriteResPair<WriteVarBlendY, [HWPort5], 2, [2], 2, 7>;
+defm : HWWriteResPair<WriteVarBlendZ, [HWPort5], 2, [2], 2, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteMPSAD, [HWPort0, HWPort5], 7, [1, 2], 3, 6>;
+defm : HWWriteResPair<WriteMPSADY, [HWPort0, HWPort5], 7, [1, 2], 3, 7>;
+defm : HWWriteResPair<WriteMPSADZ, [HWPort0, HWPort5], 7, [1, 2], 3, 7>; // Unsupported = 1
+defm : HWWriteResPair<WritePSADBW, [HWPort0], 5, [1], 1, 5>;
+defm : HWWriteResPair<WritePSADBWX, [HWPort0], 5, [1], 1, 6>;
+defm : HWWriteResPair<WritePSADBWY, [HWPort0], 5, [1], 1, 7>;
+defm : HWWriteResPair<WritePSADBWZ, [HWPort0], 5, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WritePHMINPOS, [HWPort0], 5, [1], 1, 6>;
+
+// Vector integer shifts.
+defm : HWWriteResPair<WriteVecShift, [HWPort0], 1, [1], 1, 5>;
+defm : HWWriteResPair<WriteVecShiftX, [HWPort0,HWPort5], 2, [1,1], 2, 6>;
+defm : X86WriteRes<WriteVecShiftY, [HWPort0,HWPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftZ, [HWPort0,HWPort5], 4, [1,1], 2>; // Unsupported = 1
+defm : X86WriteRes<WriteVecShiftYLd, [HWPort0,HWPort23], 8, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftZLd, [HWPort0,HWPort23], 8, [1,1], 2>; // Unsupported = 1
+
+defm : HWWriteResPair<WriteVecShiftImm, [HWPort0], 1, [1], 1, 5>;
+defm : HWWriteResPair<WriteVecShiftImmX, [HWPort0], 1, [1], 1, 6>;
+defm : HWWriteResPair<WriteVecShiftImmY, [HWPort0], 1, [1], 1, 7>;
+defm : HWWriteResPair<WriteVecShiftImmZ, [HWPort0], 1, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteVarVecShift, [HWPort0, HWPort5], 3, [2,1], 3, 6>;
+defm : HWWriteResPair<WriteVarVecShiftY, [HWPort0, HWPort5], 3, [2,1], 3, 7>;
+defm : HWWriteResPair<WriteVarVecShiftZ, [HWPort0, HWPort5], 3, [2,1], 3, 7>; // Unsupported = 1
+
+// Vector insert/extract operations.
+def : WriteRes<WriteVecInsert, [HWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteVecInsertLd, [HWPort5,HWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def: InstRW<[WriteVecInsertLd], (instregex "(V?)MOV(H|L)(PD|PS)rm")>;
+
+def : WriteRes<WriteVecExtract, [HWPort0,HWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def : WriteRes<WriteVecExtractSt, [HWPort4,HWPort5,HWPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+// String instructions.
+
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [HWPort0]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrMLd, [HWPort0, HWPort23]> {
+ let Latency = 17;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [HWPort0, HWPort5, HWPort015, HWPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
+}
+def : WriteRes<WritePCmpEStrMLd, [HWPort0, HWPort5, HWPort23, HWPort015, HWPort0156]> {
+ let Latency = 25;
+ let NumMicroOps = 10;
+ let ResourceCycles = [4,3,1,1,1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [HWPort0]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrILd, [HWPort0, HWPort23]> {
+ let Latency = 17;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [HWPort0, HWPort5, HWPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [4,3,1];
+}
+def : WriteRes<WritePCmpEStrILd, [HWPort0, HWPort5, HWPort23, HWPort0156]> {
+ let Latency = 24;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
+}
+
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [HWPort0]> { let Latency = 3; }
+def : WriteRes<WriteVecMOVMSK, [HWPort0]> { let Latency = 3; }
+def : WriteRes<WriteVecMOVMSKY, [HWPort0]> { let Latency = 3; }
+def : WriteRes<WriteMMXMOVMSK, [HWPort0]> { let Latency = 1; }
+
+// AES Instructions.
+def : WriteRes<WriteAESDecEnc, [HWPort5]> {
+ let Latency = 7;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def : WriteRes<WriteAESDecEncLd, [HWPort5, HWPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+
+def : WriteRes<WriteAESIMC, [HWPort5]> {
+ let Latency = 14;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESIMCLd, [HWPort5, HWPort23]> {
+ let Latency = 20;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+
+def : WriteRes<WriteAESKeyGen, [HWPort0,HWPort5,HWPort015]> {
+ let Latency = 29;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,7,2];
+}
+def : WriteRes<WriteAESKeyGenLd, [HWPort0,HWPort5,HWPort23,HWPort015]> {
+ let Latency = 34;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,7,1,1];
+}
+
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [HWPort0, HWPort5]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def : WriteRes<WriteCLMulLd, [HWPort0, HWPort5, HWPort23]> {
+ let Latency = 17;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+
+// Load/store MXCSR.
+def : WriteRes<WriteLDMXCSR, [HWPort0,HWPort23,HWPort0156]> { let Latency = 7; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+def : WriteRes<WriteSTMXCSR, [HWPort4,HWPort5,HWPort237]> { let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+
+def : WriteRes<WriteSystem, [HWPort0156]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [HWPort0156]> { let Latency = 100; }
+def : WriteRes<WriteFence, [HWPort23, HWPort4]>;
+def : WriteRes<WriteNop, []>;
+
+//================ Exceptions ================//
+
+//-- Specific Scheduling Models --//
+
+// Starting with P0.
+def HWWriteP0 : SchedWriteRes<[HWPort0]>;
+
+def HWWriteP01 : SchedWriteRes<[HWPort01]>;
+
+def HWWrite2P01 : SchedWriteRes<[HWPort01]> {
+ let NumMicroOps = 2;
+}
+def HWWrite3P01 : SchedWriteRes<[HWPort01]> {
+ let NumMicroOps = 3;
+}
+
+def HWWriteP0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> {
+ let NumMicroOps = 2;
+}
+
+def HWWrite2P0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> {
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+}
+
+// Starting with P1.
+def HWWriteP1 : SchedWriteRes<[HWPort1]>;
+
+def HWWrite2P1 : SchedWriteRes<[HWPort1]> {
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+
+// Notation:
+// - r: register.
+// - mm: 64-bit mmx register.
+// - x: 128-bit xmm register.
+// - (x)mm: mmx or xmm register.
+// - y: 256-bit ymm register.
+// - v: any vector register.
+// - m: memory.
+
+//=== Integer Instructions ===//
+//-- Move instructions --//
+
+// XLAT.
+def HWWriteXLAT : SchedWriteRes<[]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+def : InstRW<[HWWriteXLAT], (instrs XLAT)>;
+
+// PUSHA.
+def HWWritePushA : SchedWriteRes<[]> {
+ let NumMicroOps = 19;
+}
+def : InstRW<[HWWritePushA], (instregex "PUSHA(16|32)")>;
+
+// POPA.
+def HWWritePopA : SchedWriteRes<[]> {
+ let NumMicroOps = 18;
+}
+def : InstRW<[HWWritePopA], (instregex "POPA(16|32)")>;
+
+//-- Arithmetic instructions --//
+
+// BTR BTS BTC.
+// m,r.
+def HWWriteBTRSCmr : SchedWriteRes<[]> {
+ let NumMicroOps = 11;
+}
+def : SchedAlias<WriteBitTestSetRegRMW, HWWriteBTRSCmr>;
+
+//-- Control transfer instructions --//
+
+// RETI/LRETI.
+// i.
+def HWWriteRETI : SchedWriteRes<[HWPort23, HWPort6, HWPort015]> {
+ let NumMicroOps = 4;
+ let ResourceCycles = [1, 2, 1];
+}
+def : InstRW<[HWWriteRETI], (instregex "RETI(L|Q|W)", "LRETI(L|Q|W)")>;
+
+// BOUND.
+// r,m.
+def HWWriteBOUND : SchedWriteRes<[]> {
+ let NumMicroOps = 15;
+}
+def : InstRW<[HWWriteBOUND], (instregex "BOUNDS(16|32)rm")>;
+
+// INTO.
+def HWWriteINTO : SchedWriteRes<[]> {
+ let NumMicroOps = 4;
+}
+def : InstRW<[HWWriteINTO], (instrs INTO)>;
+
+//-- String instructions --//
+
+// LODSB/W.
+def : InstRW<[HWWrite2P0156_P23], (instregex "LODS(B|W)")>;
+
+// LODSD/Q.
+def : InstRW<[HWWriteP0156_P23], (instregex "LODS(L|Q)")>;
+
+// MOVS.
+def HWWriteMOVS : SchedWriteRes<[HWPort23, HWPort4, HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 5;
+ let ResourceCycles = [2, 1, 2];
+}
+def : InstRW<[HWWriteMOVS], (instrs MOVSB, MOVSL, MOVSQ, MOVSW)>;
+
+// CMPS.
+def HWWriteCMPS : SchedWriteRes<[HWPort23, HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 5;
+ let ResourceCycles = [2, 3];
+}
+def : InstRW<[HWWriteCMPS], (instregex "CMPS(B|L|Q|W)")>;
+
+//-- Other --//
+
+// RDPMC.
+def HWWriteRDPMC : SchedWriteRes<[]> {
+ let NumMicroOps = 34;
+}
+def : InstRW<[HWWriteRDPMC], (instrs RDPMC)>;
+
+// RDRAND.
+def HWWriteRDRAND : SchedWriteRes<[HWPort23, HWPort015]> {
+ let NumMicroOps = 17;
+ let ResourceCycles = [1, 16];
+}
+def : InstRW<[HWWriteRDRAND], (instrs RDRAND16r, RDRAND32r, RDRAND64r)>;
+
+//=== Floating Point x87 Instructions ===//
+//-- Move instructions --//
+
+// FLD.
+// r.
+def : InstRW<[HWWriteP01], (instrs LD_Frr)>;
+
+// FBLD.
+// m80.
+def HWWriteFBLD : SchedWriteRes<[]> {
+ let Latency = 47;
+ let NumMicroOps = 43;
+}
+def : InstRW<[HWWriteFBLD], (instrs FBLDm)>;
+
+// FST(P).
+// r.
+def : InstRW<[HWWriteP01], (instregex "ST_(F|FP)rr")>;
+
+// FFREE.
+def : InstRW<[HWWriteP01], (instregex "FFREE")>;
+
+// FNSAVE.
+def HWWriteFNSAVE : SchedWriteRes<[]> {
+ let NumMicroOps = 147;
+}
+def : InstRW<[HWWriteFNSAVE], (instrs FSAVEm)>;
+
+// FRSTOR.
+def HWWriteFRSTOR : SchedWriteRes<[]> {
+ let NumMicroOps = 90;
+}
+def : InstRW<[HWWriteFRSTOR], (instrs FRSTORm)>;
+
+//-- Arithmetic instructions --//
+
+// FCOMPP FUCOMPP.
+// r.
+def : InstRW<[HWWrite2P01], (instrs FCOMPP, UCOM_FPPr)>;
+
+// FCOMI(P) FUCOMI(P).
+// r.
+def : InstRW<[HWWrite3P01], (instrs COM_FIPr, COM_FIr, UCOM_FIPr, UCOM_FIr)>;
+
+// FTST.
+def : InstRW<[HWWriteP1], (instregex "TST_F")>;
+
+// FXAM.
+def : InstRW<[HWWrite2P1], (instrs FXAM)>;
+
+// FPREM.
+def HWWriteFPREM : SchedWriteRes<[]> {
+ let Latency = 19;
+ let NumMicroOps = 28;
+}
+def : InstRW<[HWWriteFPREM], (instrs FPREM)>;
+
+// FPREM1.
+def HWWriteFPREM1 : SchedWriteRes<[]> {
+ let Latency = 27;
+ let NumMicroOps = 41;
+}
+def : InstRW<[HWWriteFPREM1], (instrs FPREM1)>;
+
+// FRNDINT.
+def HWWriteFRNDINT : SchedWriteRes<[]> {
+ let Latency = 11;
+ let NumMicroOps = 17;
+}
+def : InstRW<[HWWriteFRNDINT], (instrs FRNDINT)>;
+
+//-- Math instructions --//
+
+// FSCALE.
+def HWWriteFSCALE : SchedWriteRes<[]> {
+ let Latency = 75; // 49-125
+ let NumMicroOps = 50; // 25-75
+}
+def : InstRW<[HWWriteFSCALE], (instrs FSCALE)>;
+
+// FXTRACT.
+def HWWriteFXTRACT : SchedWriteRes<[]> {
+ let Latency = 15;
+ let NumMicroOps = 17;
+}
+def : InstRW<[HWWriteFXTRACT], (instrs FXTRACT)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : HWWriteResPair<WriteFHAdd, [HWPort1, HWPort5], 5, [1,2], 3, 6>;
+defm : HWWriteResPair<WriteFHAddY, [HWPort1, HWPort5], 5, [1,2], 3, 7>;
+defm : HWWriteResPair<WritePHAdd, [HWPort5, HWPort15], 3, [2,1], 3, 5>;
+defm : HWWriteResPair<WritePHAddX, [HWPort5, HWPort15], 3, [2,1], 3, 6>;
+defm : HWWriteResPair<WritePHAddY, [HWPort5, HWPort15], 3, [2,1], 3, 7>;
+
+//=== Floating Point XMM and YMM Instructions ===//
+
+// Remaining instrs.
+
+def HWWriteResGroup0 : SchedWriteRes<[HWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup0], (instrs VBROADCASTSSrm)>;
+def: InstRW<[HWWriteResGroup0], (instregex "(V?)MOVSHDUPrm",
+ "(V?)MOVSLDUPrm",
+ "VPBROADCAST(D|Q)rm")>;
+
+def HWWriteResGroup0_1 : SchedWriteRes<[HWPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup0_1], (instrs VBROADCASTF128,
+ VBROADCASTI128,
+ VBROADCASTSDYrm,
+ VBROADCASTSSYrm,
+ VMOVDDUPYrm,
+ VMOVSHDUPYrm,
+ VMOVSLDUPYrm)>;
+def: InstRW<[HWWriteResGroup0_1], (instregex "LD_F(32|64|80)m",
+ "VPBROADCAST(D|Q)Yrm")>;
+
+def HWWriteResGroup0_2 : SchedWriteRes<[HWPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSX(16|32|64)rm(8|16|32)",
+ "MOVZX(16|32|64)rm(8|16)",
+ "(V?)MOVDDUPrm")>;
+
+def HWWriteResGroup1 : SchedWriteRes<[HWPort4,HWPort237]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup1], (instrs FBSTPm, VMPTRSTm)>;
+def: InstRW<[HWWriteResGroup1], (instregex "ST_FP(32|64|80)m")>;
+
+def HWWriteResGroup2 : SchedWriteRes<[HWPort0]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLVQ(Y?)rr",
+ "VPSRLVQ(Y?)rr")>;
+
+def HWWriteResGroup3 : SchedWriteRes<[HWPort1]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup3], (instregex "COM(P?)_FST0r",
+ "UCOM_F(P?)r")>;
+
+def HWWriteResGroup4 : SchedWriteRes<[HWPort5]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup4], (instrs MMX_MOVQ2DQrr)>;
+
+def HWWriteResGroup5 : SchedWriteRes<[HWPort6]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup5], (instregex "JMP(16|32|64)r")>;
+
+def HWWriteResGroup6 : SchedWriteRes<[HWPort01]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup6], (instrs FINCSTP, FNOP)>;
+
+def HWWriteResGroup7 : SchedWriteRes<[HWPort06]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup7], (instrs CDQ, CQO)>;
+
+def HWWriteResGroup8 : SchedWriteRes<[HWPort15]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup8], (instregex "ANDN(32|64)rr")>;
+
+def HWWriteResGroup9 : SchedWriteRes<[HWPort015]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup9], (instregex "VPBLENDD(Y?)rri")>;
+
+def HWWriteResGroup10 : SchedWriteRes<[HWPort0156]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup10], (instrs CBW, CWDE, CDQE,
+ CMC, STC,
+ SGDT64m,
+ SIDT64m,
+ SMSW16m,
+ STRm,
+ SYSCALL)>;
+
+def HWWriteResGroup11 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup11], (instregex "(V?)CVTPS2PDrm")>;
+
+def HWWriteResGroup11_1 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup11_1], (instrs VPSLLVQrm, VPSRLVQrm)>;
+def: InstRW<[HWWriteResGroup11_1], (instregex "(V?)CVTSS2SDrm")>;
+
+def HWWriteResGroup11_2 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup11_2], (instrs VPSLLVQYrm, VPSRLVQYrm)>;
+
+def HWWriteResGroup12 : SchedWriteRes<[HWPort1,HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup12], (instrs MMX_CVTPI2PSirm)>;
+def: InstRW<[HWWriteResGroup12], (instregex "P(DEP|EXT)(32|64)rm")>;
+
+def HWWriteResGroup13 : SchedWriteRes<[HWPort5,HWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup13], (instregex "(V?)PMOV(SX|ZX)BDrm",
+ "(V?)PMOV(SX|ZX)BQrm",
+ "(V?)PMOV(SX|ZX)BWrm",
+ "(V?)PMOV(SX|ZX)DQrm",
+ "(V?)PMOV(SX|ZX)WDrm",
+ "(V?)PMOV(SX|ZX)WQrm")>;
+
+def HWWriteResGroup13_1 : SchedWriteRes<[HWPort5,HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup13_1], (instrs VPMOVSXBDYrm,
+ VPMOVSXBQYrm,
+ VPMOVSXWQYrm)>;
+
+def HWWriteResGroup14 : SchedWriteRes<[HWPort6,HWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup14], (instrs FARJMP64m)>;
+def: InstRW<[HWWriteResGroup14], (instregex "JMP(16|32|64)m")>;
+
+def HWWriteResGroup16 : SchedWriteRes<[HWPort23,HWPort15]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup16], (instregex "ANDN(32|64)rm",
+ "MOVBE(16|32|64)rm")>;
+
+def HWWriteResGroup17 : SchedWriteRes<[HWPort23,HWPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup17], (instrs VINSERTF128rm,
+ VINSERTI128rm,
+ VPBLENDDrmi)>;
+
+def HWWriteResGroup17_2 : SchedWriteRes<[HWPort23,HWPort015]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup17_2], (instrs VPBLENDDYrmi)>;
+
+def HWWriteResGroup18 : SchedWriteRes<[HWPort23,HWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup18], (instrs POP16r, POP32r, POP64r)>;
+def: InstRW<[HWWriteResGroup18], (instregex "POP(16|32|64)rmr")>;
+
+def HWWriteResGroup19 : SchedWriteRes<[HWPort237,HWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup19], (instrs SFENCE)>;
+
+def HWWriteResGroup21 : SchedWriteRes<[HWPort4,HWPort6,HWPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup21], (instrs FNSTCW16m)>;
+
+def HWWriteResGroup23 : SchedWriteRes<[HWPort4,HWPort237,HWPort15]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup23], (instregex "MOVBE(32|64)mr")>;
+
+def HWWriteResGroup23_16 : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup23_16], (instrs MOVBE16mr)>;
+
+def HWWriteResGroup24 : SchedWriteRes<[HWPort4,HWPort237,HWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup24], (instrs PUSH16r, PUSH32r, PUSH64r, PUSH64i8,
+ STOSB, STOSL, STOSQ, STOSW)>;
+def: InstRW<[HWWriteResGroup24], (instregex "PUSH(16|32|64)rmr")>;
+
+def HWWriteResGroup25 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup25], (instregex "SAR(8|16|32|64)m(1|i)",
+ "SHL(8|16|32|64)m(1|i)",
+ "SHR(8|16|32|64)m(1|i)")>;
+
+def HWWriteResGroup26 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup26], (instregex "POP(16|32|64)rmm",
+ "PUSH(16|32|64)rmm")>;
+
+def HWWriteResGroup28 : SchedWriteRes<[HWPort01]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[HWWriteResGroup28], (instrs FDECSTP)>;
+
+def HWWriteResGroup30 : SchedWriteRes<[HWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[HWWriteResGroup30], (instrs LFENCE,
+ MFENCE,
+ WAIT,
+ XGETBV)>;
+
+def HWWriteResGroup31 : SchedWriteRes<[HWPort0,HWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup31], (instregex "(V?)CVTPS2PDrr",
+ "(V?)CVTSS2SDrr")>;
+
+def HWWriteResGroup32 : SchedWriteRes<[HWPort6,HWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup32], (instregex "CLFLUSH")>;
+
+def HWWriteResGroup33 : SchedWriteRes<[HWPort01,HWPort015]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup33], (instrs MMX_MOVDQ2Qrr)>;
+
+def HWWriteResGroup35 : SchedWriteRes<[HWPort06,HWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup35], (instrs CWD, JCXZ, JECXZ, JRCXZ)>;
+
+def HWWriteResGroup36_2 : SchedWriteRes<[HWPort5,HWPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup36_2], (instrs MMX_PACKSSDWirm,
+ MMX_PACKSSWBirm,
+ MMX_PACKUSWBirm)>;
+
+def HWWriteResGroup37 : SchedWriteRes<[HWPort23,HWPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[HWWriteResGroup37], (instrs LEAVE, LEAVE64,
+ SCASB, SCASL, SCASQ, SCASW)>;
+
+def HWWriteResGroup39 : SchedWriteRes<[HWPort0,HWPort01,HWPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup39], (instrs FLDCW16m)>;
+
+def HWWriteResGroup41 : SchedWriteRes<[HWPort6,HWPort23,HWPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup41], (instrs LRETQ, RETL, RETQ)>;
+
+def HWWriteResGroup44 : SchedWriteRes<[HWPort4,HWPort6,HWPort237,HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup44], (instregex "CALL(16|32|64)r")>;
+
+def HWWriteResGroup45 : SchedWriteRes<[HWPort4,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup45], (instrs CALL64pcrel32)>;
+
+def HWWriteResGroup46 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[HWWriteResGroup46], (instregex "ROL(8|16|32|64)m(1|i)",
+ "ROR(8|16|32|64)m(1|i)")>;
+
+def HWWriteResGroup46_1 : SchedWriteRes<[HWPort06]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[HWWriteResGroup46_1], (instrs ROL8r1, ROL16r1, ROL32r1, ROL64r1,
+ ROR8r1, ROR16r1, ROR32r1, ROR64r1)>;
+
+def HWWriteResGroup47 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[HWWriteResGroup47], (instregex "XADD(8|16|32|64)rm")>;
+
+def HWWriteResGroup48 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup48], (instregex "CALL(16|32|64)m")>;
+def: InstRW<[HWWriteResGroup48], (instrs FARCALL64m)>;
+
+def HWWriteResGroup50 : SchedWriteRes<[HWPort1]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup50], (instrs MMX_CVTPI2PSirr)>;
+def: InstRW<[HWWriteResGroup50], (instregex "P(DEP|EXT)(32|64)rr",
+ "(V?)CVTDQ2PS(Y?)rr")>;
+
+def HWWriteResGroup51 : SchedWriteRes<[HWPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCAST(B|W)rr")>;
+
+def HWWriteResGroup52 : SchedWriteRes<[HWPort1,HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup52], (instregex "(V?)CVTPS2DQrm",
+ "(V?)CVTTPS2DQrm")>;
+
+def HWWriteResGroup52_1 : SchedWriteRes<[HWPort1,HWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup52_1], (instregex "(ADD|SUB|SUBR)_F(32|64)m",
+ "ILD_F(16|32|64)m")>;
+def: InstRW<[HWWriteResGroup52_1], (instrs VCVTDQ2PSYrm,
+ VCVTPS2DQYrm,
+ VCVTTPS2DQYrm)>;
+
+def HWWriteResGroup53_1 : SchedWriteRes<[HWPort5,HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup53_1], (instrs VPMOVSXBWYrm,
+ VPMOVSXDQYrm,
+ VPMOVSXWDYrm,
+ VPMOVZXWDYrm)>;
+
+def HWWriteResGroup57 : SchedWriteRes<[HWPort5,HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup57], (instrs MMX_PACKSSDWirr,
+ MMX_PACKSSWBirr,
+ MMX_PACKUSWBirr)>;
+
+def HWWriteResGroup58 : SchedWriteRes<[HWPort6,HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[HWWriteResGroup58], (instregex "CLD")>;
+
+def HWWriteResGroup59 : SchedWriteRes<[HWPort06,HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[HWWriteResGroup59], (instregex "RCL(8|16|32|64)r(1|i)",
+ "RCR(8|16|32|64)r(1|i)")>;
+
+def HWWriteResGroup61 : SchedWriteRes<[HWPort0,HWPort4,HWPort237]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup61], (instrs FNSTSWm)>;
+
+def HWWriteResGroup62 : SchedWriteRes<[HWPort1,HWPort4,HWPort237]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup62], (instregex "IST(T?)_FP(16|32|64)m",
+ "IST_F(16|32)m")>;
+
+def HWWriteResGroup66 : SchedWriteRes<[HWPort23,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[HWWriteResGroup66], (instregex "RCL(8|16|32|64)m(1|i)",
+ "RCR(8|16|32|64)m(1|i)")>;
+
+def HWWriteResGroup68 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,3];
+}
+def: InstRW<[HWWriteResGroup68], (instregex "XCHG(8|16|32|64)rm")>;
+
+def HWWriteResGroup69 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,2,1];
+}
+def: InstRW<[HWWriteResGroup69], (instregex "ROL(8|16|32|64)mCL",
+ "ROR(8|16|32|64)mCL",
+ "SAR(8|16|32|64)mCL",
+ "SHL(8|16|32|64)mCL",
+ "SHR(8|16|32|64)mCL")>;
+def: SchedAlias<WriteADCRMW, HWWriteResGroup69>;
+
+def HWWriteResGroup70 : SchedWriteRes<[HWPort0,HWPort1]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup70], (instregex "(V?)CVT(T?)SD2SI(64)?rr",
+ "(V?)CVT(T?)SS2SI(64)?rr")>;
+
+def HWWriteResGroup71 : SchedWriteRes<[HWPort0,HWPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup71], (instrs VCVTPS2PDYrr)>;
+
+def HWWriteResGroup72 : SchedWriteRes<[HWPort0,HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup72], (instrs FNSTSW16r)>;
+
+def HWWriteResGroup73 : SchedWriteRes<[HWPort1,HWPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup73], (instrs MMX_CVTPI2PDirr,
+ MMX_CVTPD2PIirr,
+ MMX_CVTPS2PIirr,
+ MMX_CVTTPD2PIirr,
+ MMX_CVTTPS2PIirr)>;
+def: InstRW<[HWWriteResGroup73], (instregex "(V?)CVTDQ2PDrr",
+ "(V?)CVTPD2PSrr",
+ "(V?)CVTSD2SSrr",
+ "(V?)CVTSI(64)?2SDrr",
+ "(V?)CVTSI2SSrr",
+ "(V?)CVT(T?)PD2DQrr")>;
+
+def HWWriteResGroup75 : SchedWriteRes<[HWPort1,HWPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup75], (instregex "FICOM(P?)(16|32)m")>;
+
+def HWWriteResGroup76 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup76], (instregex "(V?)CVTSD2SI(64)?rm",
+ "(V?)CVTSS2SI(64)?rm",
+ "(V?)CVTTSD2SI(64)?rm",
+ "VCVTTSS2SI64rm",
+ "(V?)CVTTSS2SIrm")>;
+
+def HWWriteResGroup77 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup77], (instrs VCVTPS2PDYrm)>;
+
+def HWWriteResGroup78 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup78], (instrs CVTPD2PSrm,
+ CVTPD2DQrm,
+ CVTTPD2DQrm,
+ MMX_CVTPD2PIirm,
+ MMX_CVTTPD2PIirm,
+ CVTDQ2PDrm,
+ VCVTDQ2PDrm)>;
+
+def HWWriteResGroup78_1 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup78_1], (instrs MMX_CVTPI2PDirm,
+ CVTSD2SSrm, CVTSD2SSrm_Int,
+ VCVTSD2SSrm, VCVTSD2SSrm_Int)>;
+
+def HWWriteResGroup80 : SchedWriteRes<[HWPort5,HWPort23,HWPort015]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCAST(B|W)(Y?)rm")>;
+
+def HWWriteResGroup81 : SchedWriteRes<[HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [4];
+}
+def: InstRW<[HWWriteResGroup81], (instrs FNCLEX)>;
+
+def HWWriteResGroup82 : SchedWriteRes<[]> {
+ let Latency = 0;
+ let NumMicroOps = 4;
+ let ResourceCycles = [];
+}
+def: InstRW<[HWWriteResGroup82], (instrs VZEROUPPER)>;
+
+def HWWriteResGroup83 : SchedWriteRes<[HWPort1,HWPort6,HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
+}
+def: InstRW<[HWWriteResGroup83], (instregex "LAR(16|32|64)rr")>;
+
+def HWWriteResGroup87 : SchedWriteRes<[HWPort1,HWPort6,HWPort23,HWPort0156]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[HWWriteResGroup87], (instregex "LAR(16|32|64)rm",
+ "LSL(16|32|64)rm")>;
+
+def HWWriteResGroup88 : SchedWriteRes<[HWPort4,HWPort237,HWPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,4];
+}
+def: InstRW<[HWWriteResGroup88], (instregex "PUSHF(16|64)")>;
+
+def HWWriteResGroup89 : SchedWriteRes<[HWPort0]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup89], (instregex "MUL_(FPrST0|FST0r|FrST0)")>;
+
+def HWWriteResGroup91_2 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup91_2], (instregex "(V?)PCMPGTQrm")>;
+
+def HWWriteResGroup91_3 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup91_3], (instregex "MUL_F(32|64)m")>;
+def: InstRW<[HWWriteResGroup91_3], (instrs VPCMPGTQYrm)>;
+
+def HWWriteResGroup93 : SchedWriteRes<[HWPort1,HWPort5]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[HWWriteResGroup93], (instregex "(V?)CVTSI642SSrr")>;
+
+def HWWriteResGroup94 : SchedWriteRes<[HWPort1,HWPort6,HWPort06]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup94], (instregex "STR(16|32|64)r")>;
+
+def HWWriteResGroup97 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup97], (instregex "CVTTSS2SI64rm")>;
+
+def HWWriteResGroup99 : SchedWriteRes<[HWPort6,HWPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,4];
+}
+def: InstRW<[HWWriteResGroup99], (instrs PAUSE)>;
+
+def HWWriteResGroup100 : SchedWriteRes<[HWPort06,HWPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,4];
+}
+def: InstRW<[HWWriteResGroup100], (instrs XSETBV)>;
+
+def HWWriteResGroup102 : SchedWriteRes<[HWPort1,HWPort5]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup102], (instrs VCVTDQ2PDYrr,
+ VCVTPD2PSYrr,
+ VCVTPD2DQYrr,
+ VCVTTPD2DQYrr)>;
+
+def HWWriteResGroup103 : SchedWriteRes<[HWPort1,HWPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup103], (instregex "(ADD|SUB|SUBR)_FI(16|32)m")>;
+
+def HWWriteResGroup104 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup104], (instrs VCVTDQ2PDYrm)>;
+
+def HWWriteResGroup107 : SchedWriteRes<[HWPort1,HWPort6,HWPort06,HWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup107], (instregex "SLDT(16|32|64)r")>;
+
+def HWWriteResGroup108 : SchedWriteRes<[HWPort6,HWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,5];
+}
+def: InstRW<[HWWriteResGroup108], (instrs STD)>;
+
+def HWWriteResGroup114 : SchedWriteRes<[HWPort6,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 7;
+ let ResourceCycles = [2,2,1,2];
+}
+def: InstRW<[HWWriteResGroup114], (instrs LOOP)>;
+
+def HWWriteResGroup115 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> {
+ let Latency = 15;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup115], (instregex "MUL_FI(16|32)m")>;
+
+def HWWriteResGroup120 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 16;
+ let NumMicroOps = 10;
+ let ResourceCycles = [1,1,1,4,1,2];
+}
+def: InstRW<[HWWriteResGroup120], (instregex "RCL(8|16|32|64)mCL")>;
+
+def HWWriteResGroup129 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 7;
+ let ResourceCycles = [2,2,3];
+}
+def: InstRW<[HWWriteResGroup129], (instregex "RCL(16|32|64)rCL",
+ "RCR(16|32|64)rCL")>;
+
+def HWWriteResGroup130 : SchedWriteRes<[HWPort1,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 9;
+ let ResourceCycles = [1,4,1,3];
+}
+def: InstRW<[HWWriteResGroup130], (instrs RCL8rCL)>;
+
+def HWWriteResGroup131 : SchedWriteRes<[HWPort06,HWPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,9];
+}
+def: InstRW<[HWWriteResGroup131], (instrs LOOPE, LOOPNE)>;
+
+def HWWriteResGroup132 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 17;
+ let NumMicroOps = 14;
+ let ResourceCycles = [1,1,1,4,2,5];
+}
+def: InstRW<[HWWriteResGroup132], (instrs CMPXCHG8B)>;
+
+def HWWriteResGroup135 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,1,1,3,1,3];
+}
+def: InstRW<[HWWriteResGroup135], (instregex "RCR(8|16|32|64)mCL")>;
+
+def HWWriteResGroup142 : SchedWriteRes<[HWPort1,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 14;
+ let NumMicroOps = 10;
+ let ResourceCycles = [2,3,1,4];
+}
+def: InstRW<[HWWriteResGroup142], (instrs RCR8rCL)>;
+
+def HWWriteResGroup143 : SchedWriteRes<[HWPort23,HWPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 15;
+ let ResourceCycles = [1,14];
+}
+def: InstRW<[HWWriteResGroup143], (instrs POPF16)>;
+
+def HWWriteResGroup144 : SchedWriteRes<[HWPort4,HWPort5,HWPort6,HWPort23,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 21;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,1,1,1,2];
+}
+def: InstRW<[HWWriteResGroup144], (instrs INSB, INSL, INSW)>;
+
+def HWWriteResGroup145 : SchedWriteRes<[HWPort5, HWPort6]> {
+ let Latency = 8;
+ let NumMicroOps = 20;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup145], (instrs VZEROALL)>;
+
+def HWWriteResGroup146 : SchedWriteRes<[HWPort0,HWPort4,HWPort5,HWPort23,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 22;
+ let NumMicroOps = 19;
+ let ResourceCycles = [2,1,4,1,1,4,6];
+}
+def: InstRW<[HWWriteResGroup146], (instrs CMPXCHG16B)>;
+
+def HWWriteResGroup147 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort6,HWPort01,HWPort0156]> {
+ let Latency = 17;
+ let NumMicroOps = 15;
+ let ResourceCycles = [2,1,2,4,2,4];
+}
+def: InstRW<[HWWriteResGroup147], (instrs XCH_F)>;
+
+def HWWriteResGroup149 : SchedWriteRes<[HWPort5,HWPort6,HWPort06,HWPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,5];
+}
+def: InstRW<[HWWriteResGroup149], (instrs CPUID, RDTSC)>;
+
+def HWWriteResGroup151 : SchedWriteRes<[HWPort6,HWPort23,HWPort0156]> {
+ let Latency = 23;
+ let NumMicroOps = 19;
+ let ResourceCycles = [3,1,15];
+}
+def: InstRW<[HWWriteResGroup151], (instregex "XRSTOR(64)?")>;
+
+def HWWriteResGroup154 : SchedWriteRes<[HWPort0]> {
+ let Latency = 20;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup154], (instregex "DIV_(FPrST0|FST0r|FrST0)")>;
+
+def HWWriteResGroup155 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 27;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup155], (instregex "DIVR_F(32|64)m")>;
+
+def HWWriteResGroup156 : SchedWriteRes<[HWPort5,HWPort6,HWPort0156]> {
+ let Latency = 20;
+ let NumMicroOps = 10;
+ let ResourceCycles = [1,2,7];
+}
+def: InstRW<[HWWriteResGroup156], (instrs MWAITrr)>;
+
+def HWWriteResGroup161 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> {
+ let Latency = 30;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup161], (instregex "DIVR_FI(16|32)m")>;
+
+def HWWriteResGroup162 : SchedWriteRes<[HWPort0]> {
+ let Latency = 24;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup162], (instregex "DIVR_(FPrST0|FST0r|FrST0)")>;
+
+def HWWriteResGroup163 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 31;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup163], (instregex "DIV_F(32|64)m")>;
+
+def HWWriteResGroup164 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> {
+ let Latency = 30;
+ let NumMicroOps = 27;
+ let ResourceCycles = [1,5,1,1,19];
+}
+def: InstRW<[HWWriteResGroup164], (instrs XSAVE64)>;
+
+def HWWriteResGroup165 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> {
+ let Latency = 31;
+ let NumMicroOps = 28;
+ let ResourceCycles = [1,6,1,1,19];
+}
+def: InstRW<[HWWriteResGroup165], (instrs XSAVE)>;
+def: InstRW<[HWWriteResGroup165], (instregex "XSAVEC", "XSAVES", "XSAVEOPT")>;
+
+def HWWriteResGroup166 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> {
+ let Latency = 34;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup166], (instregex "DIV_FI(16|32)m")>;
+
+def HWWriteResGroup170 : SchedWriteRes<[HWPort5,HWPort6,HWPort23,HWPort06,HWPort0156]> {
+ let Latency = 35;
+ let NumMicroOps = 23;
+ let ResourceCycles = [1,5,3,4,10];
+}
+def: InstRW<[HWWriteResGroup170], (instregex "IN(8|16|32)ri",
+ "IN(8|16|32)rr")>;
+
+def HWWriteResGroup171 : SchedWriteRes<[HWPort5,HWPort6,HWPort23,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 36;
+ let NumMicroOps = 23;
+ let ResourceCycles = [1,5,2,1,4,10];
+}
+def: InstRW<[HWWriteResGroup171], (instregex "OUT(8|16|32)ir",
+ "OUT(8|16|32)rr")>;
+
+def HWWriteResGroup175 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort6,HWPort23,HWPort237,HWPort15,HWPort0156]> {
+ let Latency = 41;
+ let NumMicroOps = 18;
+ let ResourceCycles = [1,1,2,3,1,1,1,8];
+}
+def: InstRW<[HWWriteResGroup175], (instrs VMCLEARm)>;
+
+def HWWriteResGroup176 : SchedWriteRes<[HWPort5,HWPort0156]> {
+ let Latency = 42;
+ let NumMicroOps = 22;
+ let ResourceCycles = [2,20];
+}
+def: InstRW<[HWWriteResGroup176], (instrs RDTSCP)>;
+
+def HWWriteResGroup177 : SchedWriteRes<[HWPort0,HWPort01,HWPort23,HWPort05,HWPort06,HWPort015,HWPort0156]> {
+ let Latency = 61;
+ let NumMicroOps = 64;
+ let ResourceCycles = [2,2,8,1,10,2,39];
+}
+def: InstRW<[HWWriteResGroup177], (instrs FLDENVm)>;
+
+def HWWriteResGroup178 : SchedWriteRes<[HWPort0,HWPort6,HWPort23,HWPort05,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 64;
+ let NumMicroOps = 88;
+ let ResourceCycles = [4,4,31,1,2,1,45];
+}
+def: InstRW<[HWWriteResGroup178], (instrs FXRSTOR64)>;
+
+def HWWriteResGroup179 : SchedWriteRes<[HWPort0,HWPort6,HWPort23,HWPort05,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 64;
+ let NumMicroOps = 90;
+ let ResourceCycles = [4,2,33,1,2,1,47];
+}
+def: InstRW<[HWWriteResGroup179], (instrs FXRSTOR)>;
+
+def HWWriteResGroup180 : SchedWriteRes<[HWPort5,HWPort01,HWPort0156]> {
+ let Latency = 75;
+ let NumMicroOps = 15;
+ let ResourceCycles = [6,3,6];
+}
+def: InstRW<[HWWriteResGroup180], (instrs FNINIT)>;
+
+def HWWriteResGroup183 : SchedWriteRes<[HWPort0,HWPort1,HWPort4,HWPort5,HWPort6,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 115;
+ let NumMicroOps = 100;
+ let ResourceCycles = [9,9,11,8,1,11,21,30];
+}
+def: InstRW<[HWWriteResGroup183], (instrs FSTENVm)>;
+
+def HWWriteResGroup184 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> {
+ let Latency = 14;
+ let NumMicroOps = 12;
+ let ResourceCycles = [2,2,2,1,3,2];
+}
+def: InstRW<[HWWriteResGroup184], (instrs VGATHERDPDrm, VPGATHERDQrm)>;
+
+def HWWriteResGroup185 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> {
+ let Latency = 17;
+ let NumMicroOps = 20;
+ let ResourceCycles = [3,3,4,1,5,4];
+}
+def: InstRW<[HWWriteResGroup185], (instrs VGATHERDPDYrm, VPGATHERDQYrm)>;
+
+def HWWriteResGroup186 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> {
+ let Latency = 16;
+ let NumMicroOps = 20;
+ let ResourceCycles = [3,3,4,1,5,4];
+}
+def: InstRW<[HWWriteResGroup186], (instrs VGATHERDPSrm, VPGATHERDDrm)>;
+
+def HWWriteResGroup187 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> {
+ let Latency = 22;
+ let NumMicroOps = 34;
+ let ResourceCycles = [5,3,8,1,9,8];
+}
+def: InstRW<[HWWriteResGroup187], (instrs VGATHERDPSYrm, VPGATHERDDYrm)>;
+
+def HWWriteResGroup188 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> {
+ let Latency = 15;
+ let NumMicroOps = 14;
+ let ResourceCycles = [3,3,2,1,3,2];
+}
+def: InstRW<[HWWriteResGroup188], (instrs VGATHERQPDrm, VPGATHERQQrm)>;
+
+def HWWriteResGroup189 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> {
+ let Latency = 17;
+ let NumMicroOps = 22;
+ let ResourceCycles = [5,3,4,1,5,4];
+}
+def: InstRW<[HWWriteResGroup189], (instrs VGATHERQPDYrm, VPGATHERQQYrm,
+ VGATHERQPSYrm, VPGATHERQDYrm)>;
+
+def HWWriteResGroup190 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> {
+ let Latency = 16;
+ let NumMicroOps = 15;
+ let ResourceCycles = [3,3,2,1,4,2];
+}
+def: InstRW<[HWWriteResGroup190], (instrs VGATHERQPSrm, VPGATHERQDrm)>;
+
+def: InstRW<[WriteZero], (instrs CLC)>;
+
+
+// Instruction variants handled by the renamer. These might not need execution
+// ports in certain conditions.
+// See Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
+// section "Haswell and Broadwell Pipeline" > "Register allocation and
+// renaming".
+// These can be investigated with llvm-exegesis, e.g.
+// echo 'pxor %mm0, %mm0' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+// echo 'vxorpd %xmm0, %xmm0, %xmm1' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+
+def HWWriteZeroLatency : SchedWriteRes<[]> {
+ let Latency = 0;
+}
+
+def HWWriteZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteALU]>
+]>;
+def : InstRW<[HWWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
+ XOR32rr, XOR64rr)>;
+
+def HWWriteFZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogic]>
+]>;
+def : InstRW<[HWWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr,
+ VXORPDrr)>;
+
+def HWWriteFZeroIdiomY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogicY]>
+]>;
+def : InstRW<[HWWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr)>;
+
+def HWWriteVZeroIdiomLogicX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicX]>
+]>;
+def : InstRW<[HWWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr)>;
+
+def HWWriteVZeroIdiomLogicY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicY]>
+]>;
+def : InstRW<[HWWriteVZeroIdiomLogicY], (instrs VPXORYrr)>;
+
+def HWWriteVZeroIdiomALUX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUX]>
+]>;
+def : InstRW<[HWWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
+ PSUBDrr, VPSUBDrr,
+ PSUBQrr, VPSUBQrr,
+ PSUBWrr, VPSUBWrr,
+ PCMPGTBrr, VPCMPGTBrr,
+ PCMPGTDrr, VPCMPGTDrr,
+ PCMPGTWrr, VPCMPGTWrr)>;
+
+def HWWriteVZeroIdiomALUY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUY]>
+]>;
+def : InstRW<[HWWriteVZeroIdiomALUY], (instrs VPSUBBYrr,
+ VPSUBDYrr,
+ VPSUBQYrr,
+ VPSUBWYrr,
+ VPCMPGTBYrr,
+ VPCMPGTDYrr,
+ VPCMPGTWYrr)>;
+
+def HWWritePCMPGTQ : SchedWriteRes<[HWPort0]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def HWWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [HWWritePCMPGTQ]>
+]>;
+def : InstRW<[HWWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr,
+ VPCMPGTQYrr)>;
+
+
+// The 0x83 ADC/SBB opcodes have special support for immediate 0 to only require
+// a single uop. This does not apply to the GR8 encoding, and it only applies
+// to the 8-bit immediate, since using a larger immediate for 0 would be silly.
+// Unfortunately, this optimization does not apply to the AX/EAX/RAX short
+// encodings we convert to in MCInstLowering, so we exclude AX/EAX/RAX here
+// since we schedule before that point.
+// TODO: Should we disable using the short encodings on these CPUs?
+def HWFastADC0 : MCSchedPredicate<
+ CheckAll<[
+ CheckImmOperand<2, 0>, // Second MCOperand is Imm and has value 0.
+ CheckNot<CheckRegOperand<1, AX>>, // First MCOperand is not register AX
+ CheckNot<CheckRegOperand<1, EAX>>, // First MCOperand is not register EAX
+ CheckNot<CheckRegOperand<1, RAX>> // First MCOperand is not register RAX
+ ]>
+>;
+
+def HWWriteADC0 : SchedWriteRes<[HWPort06]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def HWWriteADC : SchedWriteVariant<[
+ SchedVar<HWFastADC0, [HWWriteADC0]>,
+ SchedVar<NoSchedPred, [WriteADC]>
+]>;
+
+def : InstRW<[HWWriteADC], (instrs ADC16ri8, ADC32ri8, ADC64ri8,
+ SBB16ri8, SBB32ri8, SBB64ri8)>;
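+// For example, "adcl $0, %ecx" (ADC32ri8 with a zero immediate and a non-EAX
+// destination) is expected to match HWFastADC0 and collapse to the single
+// HWPort06 uop above, while the same instruction writing EAX keeps the default
+// WriteADC resources because it is later converted to the short encoding.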
+
+// CMOVs that use both Z and C flag require an extra uop.
+def HWWriteCMOVA_CMOVBErr : SchedWriteRes<[HWPort06,HWPort0156]> {
+ let Latency = 3;
+ let ResourceCycles = [1,2];
+ let NumMicroOps = 3;
+}
+
+def HWWriteCMOVA_CMOVBErm : SchedWriteRes<[HWPort23,HWPort06,HWPort0156]> {
+ let Latency = 8;
+ let ResourceCycles = [1,1,2];
+ let NumMicroOps = 4;
+}
+
+def HWCMOVA_CMOVBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArr_Or_CMOVBErr>, [HWWriteCMOVA_CMOVBErr]>,
+ SchedVar<NoSchedPred, [WriteCMOV]>
+]>;
+
+def HWCMOVA_CMOVBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArm_Or_CMOVBErm>, [HWWriteCMOVA_CMOVBErm]>,
+ SchedVar<NoSchedPred, [WriteCMOV.Folded]>
+]>;
+
+def : InstRW<[HWCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>;
+def : InstRW<[HWCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>;
+
+// SETCCs that use both Z and C flag require an extra uop.
+def HWWriteSETA_SETBEr : SchedWriteRes<[HWPort06,HWPort0156]> {
+ let Latency = 2;
+ let ResourceCycles = [1,1];
+ let NumMicroOps = 2;
+}
+
+def HWWriteSETA_SETBEm : SchedWriteRes<[HWPort4,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 3;
+ let ResourceCycles = [1,1,1,1];
+ let NumMicroOps = 4;
+}
+
+def HWSETA_SETBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAr_Or_SETBEr>, [HWWriteSETA_SETBEr]>,
+ SchedVar<NoSchedPred, [WriteSETCC]>
+]>;
+
+def HWSETA_SETBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAm_Or_SETBEm>, [HWWriteSETA_SETBEm]>,
+ SchedVar<NoSchedPred, [WriteSETCCStore]>
+]>;
+
+def : InstRW<[HWSETA_SETBErr], (instrs SETCCr)>;
+def : InstRW<[HWSETA_SETBErm], (instrs SETCCm)>;
+
+} // SchedModel
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedPredicates.td b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedPredicates.td
new file mode 100644
index 000000000000..76001d382a27
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedPredicates.td
@@ -0,0 +1,143 @@
+//===-- X86SchedPredicates.td - X86 Scheduling Predicates --*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines scheduling predicate definitions that are common to
+// all X86 subtargets.
+//
+//===----------------------------------------------------------------------===//
+
+// A predicate used to identify dependency-breaking instructions that clear the
+// content of the destination register. Note that this predicate only checks if
+// input registers are the same. This predicate doesn't make any assumptions on
+// the expected instruction opcodes, because different processors may implement
+// different zero-idioms.
+def ZeroIdiomPredicate : CheckSameRegOperand<1, 2>;
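+// For example, XOR32rr and PXORrr carry their two sources in MCOperands 1 and
+// 2, so this matches "xorl %eax, %eax" or "pxor %xmm0, %xmm0", where both
+// sources name the same register.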
+
+// A predicate used to identify VPERM instructions that have bits 3 and 7 of
+// their mask set. On some processors, these VPERM instructions are zero-idioms.
+def ZeroIdiomVPERMPredicate : CheckAll<[
+ ZeroIdiomPredicate,
+ CheckImmOperand<3, 0x88>
+]>;
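+// For VPERM2F128/VPERM2I128, an immediate of 0x88 (MCOperand 3) has bits 3 and
+// 7 set, which zero the low and high destination lanes respectively, so the
+// result is zero regardless of the source registers.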
+
+// A predicate used to check if a LEA instruction uses all three source
+// operands: base, index, and offset.
+def IsThreeOperandsLEAPredicate: CheckAll<[
+ // isRegOperand(Base)
+ CheckIsRegOperand<1>,
+ CheckNot<CheckInvalidRegOperand<1>>,
+
+ // isRegOperand(Index)
+ CheckIsRegOperand<3>,
+ CheckNot<CheckInvalidRegOperand<3>>,
+
+ // hasLEAOffset(Offset)
+ CheckAny<[
+ CheckAll<[
+ CheckIsImmOperand<4>,
+ CheckNot<CheckZeroOperand<4>>
+ ]>,
+ CheckNonPortable<"MI.getOperand(4).isGlobal()">
+ ]>
+]>;
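+// With the operand layout of the LEA opcodes listed below (base at index 1,
+// index register at 3, displacement at 4), e.g. "leaq 8(%rax,%rbx,4), %rcx"
+// satisfies all three checks, while "leaq (%rax), %rcx" fails the index check.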
+
+def LEACases : MCOpcodeSwitchCase<
+ [LEA32r, LEA64r, LEA64_32r, LEA16r],
+ MCReturnStatement<IsThreeOperandsLEAPredicate>
+>;
+
+// Used to generate the body of a TII member function.
+def IsThreeOperandsLEABody :
+ MCOpcodeSwitchStatement<[LEACases], MCReturnStatement<FalsePred>>;
+
+// This predicate evaluates to true only if the input machine instruction is a
+// three-operand LEA. TableGen automatically generates a new method for it in
+// X86GenInstrInfo.
+def IsThreeOperandsLEAFn :
+ TIIPredicate<"isThreeOperandsLEA", IsThreeOperandsLEABody>;
+
+// A predicate to check for COND_A and COND_BE CMOVs which have an extra uop
+// on recent Intel CPUs.
+def IsCMOVArr_Or_CMOVBErr : CheckAny<[
+ CheckImmOperand_s<3, "X86::COND_A">,
+ CheckImmOperand_s<3, "X86::COND_BE">
+]>;
+
+def IsCMOVArm_Or_CMOVBErm : CheckAny<[
+ CheckImmOperand_s<7, "X86::COND_A">,
+ CheckImmOperand_s<7, "X86::COND_BE">
+]>;
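+// The condition code sits at a different index in the memory forms because the
+// folded memory reference expands to five MCOperands (base, scale, index,
+// displacement, segment): hence operand 3 for the rr forms vs. operand 7 for
+// the rm forms, and likewise 1 vs. 5 for the SETCC predicates below.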
+
+// A predicate to check for COND_A and COND_BE SETCCs which have an extra uop
+// on recent Intel CPUs.
+def IsSETAr_Or_SETBEr : CheckAny<[
+ CheckImmOperand_s<1, "X86::COND_A">,
+ CheckImmOperand_s<1, "X86::COND_BE">
+]>;
+
+def IsSETAm_Or_SETBEm : CheckAny<[
+ CheckImmOperand_s<5, "X86::COND_A">,
+ CheckImmOperand_s<5, "X86::COND_BE">
+]>;
+
+// A predicate used to check if an instruction has a LOCK prefix.
+def CheckLockPrefix : CheckFunctionPredicate<
+ "X86_MC::hasLockPrefix",
+ "X86InstrInfo::hasLockPrefix"
+>;
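+// Combined with the CheckOpcode groups below, this lets scheduling models
+// distinguish the atomic (LOCK-prefixed) compare-and-swap forms from the plain
+// memory forms.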
+
+def IsRegRegCompareAndSwap_8 : CheckOpcode<[ CMPXCHG8rr ]>;
+
+def IsRegMemCompareAndSwap_8 : CheckOpcode<[
+ LCMPXCHG8, CMPXCHG8rm
+]>;
+
+def IsRegRegCompareAndSwap_16_32_64 : CheckOpcode<[
+ CMPXCHG16rr, CMPXCHG32rr, CMPXCHG64rr
+]>;
+
+def IsRegMemCompareAndSwap_16_32_64 : CheckOpcode<[
+ CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm,
+ LCMPXCHG16, LCMPXCHG32, LCMPXCHG64,
+ LCMPXCHG8B, LCMPXCHG16B
+]>;
+
+def IsCompareAndSwap8B : CheckOpcode<[ CMPXCHG8B, LCMPXCHG8B ]>;
+def IsCompareAndSwap16B : CheckOpcode<[ CMPXCHG16B, LCMPXCHG16B ]>;
+
+def IsRegMemCompareAndSwap : CheckOpcode<
+ !listconcat(
+ IsRegMemCompareAndSwap_8.ValidOpcodes,
+ IsRegMemCompareAndSwap_16_32_64.ValidOpcodes
+ )>;
+
+def IsRegRegCompareAndSwap : CheckOpcode<
+ !listconcat(
+ IsRegRegCompareAndSwap_8.ValidOpcodes,
+ IsRegRegCompareAndSwap_16_32_64.ValidOpcodes
+ )>;
+
+def IsAtomicCompareAndSwap_8 : CheckAll<[
+ CheckLockPrefix,
+ IsRegMemCompareAndSwap_8
+]>;
+
+def IsAtomicCompareAndSwap : CheckAll<[
+ CheckLockPrefix,
+ IsRegMemCompareAndSwap
+]>;
+
+def IsAtomicCompareAndSwap8B : CheckAll<[
+ CheckLockPrefix,
+ IsCompareAndSwap8B
+]>;
+
+def IsAtomicCompareAndSwap16B : CheckAll<[
+ CheckLockPrefix,
+ IsCompareAndSwap16B
+]>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSandyBridge.td b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSandyBridge.td
new file mode 100644
index 000000000000..ac32f1b19990
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -0,0 +1,1226 @@
+//=- X86SchedSandyBridge.td - X86 Sandy Bridge Scheduling ----*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Sandy Bridge to support instruction
+// scheduling and other instruction cost heuristics.
+//
+// Note that we define some instructions here that are not supported by SNB,
+// but we still have to define them because SNB is the default subtarget for
+// X86. These instructions are tagged with a comment `Unsupported = 1`.
+//
+//===----------------------------------------------------------------------===//
+
+def SandyBridgeModel : SchedMachineModel {
+ // All x86 instructions are modeled as a single micro-op, and SB can decode 4
+ // instructions per cycle.
+ // FIXME: Identify instructions that aren't a single fused micro-op.
+ let IssueWidth = 4;
+ let MicroOpBufferSize = 168; // Based on the reorder buffer.
+ let LoadLatency = 5;
+ let MispredictPenalty = 16;
+
+ // Based on the LSD (loop-stream detector) queue size.
+ let LoopMicroOpBufferSize = 28;
+
+ // This flag is set to allow the scheduler to assign
+ // a default model to unrecognized opcodes.
+ let CompleteModel = 0;
+}
+
+let SchedModel = SandyBridgeModel in {
+
+// Sandy Bridge can issue micro-ops to 6 different ports in one cycle.
+
+// Ports 0, 1, and 5 handle all computation.
+def SBPort0 : ProcResource<1>;
+def SBPort1 : ProcResource<1>;
+def SBPort5 : ProcResource<1>;
+
+// Ports 2 and 3 are identical. They handle loads and the address half of
+// stores.
+def SBPort23 : ProcResource<2>;
+
+// Port 4 gets the data half of stores. Store data can be available later than
+// the store address, but since we don't model the latency of stores, we can
+// ignore that.
+def SBPort4 : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def SBPort01 : ProcResGroup<[SBPort0, SBPort1]>;
+def SBPort05 : ProcResGroup<[SBPort0, SBPort5]>;
+def SBPort15 : ProcResGroup<[SBPort1, SBPort5]>;
+def SBPort015 : ProcResGroup<[SBPort0, SBPort1, SBPort5]>;
+
+// 54 Entry Unified Scheduler
+def SBPortAny : ProcResGroup<[SBPort0, SBPort1, SBPort23, SBPort4, SBPort5]> {
+ let BufferSize=54;
+}
+
+// Integer division issued on port 0.
+def SBDivider : ProcResource<1>;
+// FP division and sqrt on port 0.
+def SBFPDivider : ProcResource<1>;
+
+// Integer loads are 5 cycles, so ReadAfterLd registers needn't be available until 5
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 5>;
+
+// Vector loads are 5/6/7 cycles, so ReadAfterVec*Ld registers needn't be available
+// until 5/6/7 cycles after the memory operand.
+def : ReadAdvance<ReadAfterVecLd, 5>;
+def : ReadAdvance<ReadAfterVecXLd, 6>;
+def : ReadAdvance<ReadAfterVecYLd, 7>;
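+// In other words, a producer feeding the register operand of a load-op
+// instruction only delays it if the producer's latency exceeds the
+// corresponding load latency.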
+
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass SBWriteResPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1,
+ int LoadLat = 5> {
+ // Register variant is using a single cycle on ExePort.
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+
+ // Memory variant also uses a cycle on port 2/3 and adds LoadLat cycles to
+ // the latency (default = 5).
+ def : WriteRes<SchedRW.Folded, !listconcat([SBPort23], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = !add(UOps, 1);
+ }
+}
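+// For example, the WriteALU pair defined below expands to a 1-cycle,
+// single-uop WriteRes on SBPort015, plus a folded-load variant (WriteALU.Folded)
+// that adds an SBPort23 cycle, one extra micro-op, and the default 5-cycle
+// load latency, for 6 cycles in total.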
+
+// A folded store needs a cycle on port 4 for the store data, and an extra port
+// 2/3 cycle to recompute the address.
+def : WriteRes<WriteRMW, [SBPort23,SBPort4]>;
+
+def : WriteRes<WriteStore, [SBPort23, SBPort4]>;
+def : WriteRes<WriteStoreNT, [SBPort23, SBPort4]>;
+def : WriteRes<WriteLoad, [SBPort23]> { let Latency = 5; }
+def : WriteRes<WriteMove, [SBPort015]>;
+def : WriteRes<WriteZero, []>;
+
+// Arithmetic.
+defm : SBWriteResPair<WriteALU, [SBPort015], 1>;
+defm : SBWriteResPair<WriteADC, [SBPort05,SBPort015], 2, [1,1], 2>;
+
+defm : SBWriteResPair<WriteIMul8, [SBPort1], 3>;
+defm : SBWriteResPair<WriteIMul16, [SBPort1,SBPort05,SBPort015], 4, [1,1,2], 4>;
+defm : X86WriteRes<WriteIMul16Imm, [SBPort1,SBPort015], 4, [1,1], 2>;
+defm : X86WriteRes<WriteIMul16ImmLd, [SBPort1,SBPort015,SBPort23], 8, [1,1,1], 3>;
+defm : SBWriteResPair<WriteIMul16Reg, [SBPort1], 3>;
+defm : SBWriteResPair<WriteIMul32, [SBPort1,SBPort05,SBPort015], 4, [1,1,1], 3>;
+defm : SBWriteResPair<WriteIMul32Imm, [SBPort1], 3>;
+defm : SBWriteResPair<WriteIMul32Reg, [SBPort1], 3>;
+defm : SBWriteResPair<WriteIMul64, [SBPort1,SBPort0], 4, [1,1], 2>;
+defm : SBWriteResPair<WriteIMul64Imm, [SBPort1], 3>;
+defm : SBWriteResPair<WriteIMul64Reg, [SBPort1], 3>;
+def : WriteRes<WriteIMulH, []> { let Latency = 3; }
+
+defm : X86WriteRes<WriteXCHG, [SBPort015], 2, [3], 3>;
+defm : X86WriteRes<WriteBSWAP32, [SBPort1], 1, [1], 1>;
+defm : X86WriteRes<WriteBSWAP64, [SBPort1, SBPort05], 2, [1,1], 2>;
+defm : X86WriteRes<WriteCMPXCHG, [SBPort05, SBPort015], 5, [1,3], 4>;
+defm : X86WriteRes<WriteCMPXCHGRMW,[SBPort015, SBPort5, SBPort23, SBPort4], 8, [1, 2, 2, 1], 6>;
+
+defm : SBWriteResPair<WriteDiv8, [SBPort0, SBDivider], 25, [1, 10]>;
+defm : SBWriteResPair<WriteDiv16, [SBPort0, SBDivider], 25, [1, 10]>;
+defm : SBWriteResPair<WriteDiv32, [SBPort0, SBDivider], 25, [1, 10]>;
+defm : SBWriteResPair<WriteDiv64, [SBPort0, SBDivider], 25, [1, 10]>;
+defm : SBWriteResPair<WriteIDiv8, [SBPort0, SBDivider], 25, [1, 10]>;
+defm : SBWriteResPair<WriteIDiv16, [SBPort0, SBDivider], 25, [1, 10]>;
+defm : SBWriteResPair<WriteIDiv32, [SBPort0, SBDivider], 25, [1, 10]>;
+defm : SBWriteResPair<WriteIDiv64, [SBPort0, SBDivider], 25, [1, 10]>;
+
+// SHLD/SHRD.
+defm : X86WriteRes<WriteSHDrri, [SBPort05, SBPort015], 2, [1, 1], 2>;
+defm : X86WriteRes<WriteSHDrrcl,[SBPort05, SBPort015], 4, [3, 1], 4>;
+defm : X86WriteRes<WriteSHDmri, [SBPort4,SBPort23,SBPort05,SBPort015], 8, [1, 2, 1, 1], 5>;
+defm : X86WriteRes<WriteSHDmrcl,[SBPort4,SBPort23,SBPort05,SBPort015], 10, [1, 2, 3, 1], 7>;
+
+defm : SBWriteResPair<WriteShift, [SBPort05], 1>;
+defm : SBWriteResPair<WriteShiftCL, [SBPort05], 3, [3], 3>;
+defm : SBWriteResPair<WriteRotate, [SBPort05], 2, [2], 2>;
+defm : SBWriteResPair<WriteRotateCL, [SBPort05], 3, [3], 3>;
+
+defm : SBWriteResPair<WriteJump, [SBPort5], 1>;
+defm : SBWriteResPair<WriteCRC32, [SBPort1], 3, [1], 1, 5>;
+
+defm : SBWriteResPair<WriteCMOV, [SBPort05,SBPort015], 2, [1,1], 2>; // Conditional move.
+defm : X86WriteRes<WriteFCMOV, [SBPort5,SBPort05], 3, [2,1], 3>; // x87 conditional move.
+def : WriteRes<WriteSETCC, [SBPort05]>; // Setcc.
+def : WriteRes<WriteSETCCStore, [SBPort05,SBPort4,SBPort23]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+defm : X86WriteRes<WriteLAHFSAHF, [SBPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTest, [SBPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTestImmLd, [SBPort05,SBPort23], 6, [1,1], 2>;
+//defm : X86WriteRes<WriteBitTestRegLd, [SBPort05,SBPort23], 6, [1,1], 2>;
+defm : X86WriteRes<WriteBitTestSet, [SBPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTestSetImmLd, [SBPort05,SBPort23], 6, [1,1], 3>;
+defm : X86WriteRes<WriteBitTestSetRegLd, [SBPort05,SBPort23,SBPort5,SBPort015], 8, [1,1,1,1], 5>;
+
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [SBPort01]>;
+
+// Bit counts.
+defm : SBWriteResPair<WriteBSF, [SBPort1], 3, [1], 1, 5>;
+defm : SBWriteResPair<WriteBSR, [SBPort1], 3, [1], 1, 5>;
+defm : SBWriteResPair<WriteLZCNT, [SBPort1], 3, [1], 1, 5>;
+defm : SBWriteResPair<WriteTZCNT, [SBPort1], 3, [1], 1, 5>;
+defm : SBWriteResPair<WritePOPCNT, [SBPort1], 3, [1], 1, 6>;
+
+// BMI1 BEXTR/BLS, BMI2 BZHI
+// NOTE: These don't exist on Sandy Bridge. Ports are guesses.
+defm : SBWriteResPair<WriteBEXTR, [SBPort05,SBPort1], 2, [1,1], 2>;
+defm : SBWriteResPair<WriteBLS, [SBPort015], 1>;
+defm : SBWriteResPair<WriteBZHI, [SBPort1], 1>;
+
+// Scalar and vector floating point.
+defm : X86WriteRes<WriteFLD0, [SBPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteFLD1, [SBPort0,SBPort5], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFLDC, [SBPort0,SBPort1], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFLoad, [SBPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteFLoadX, [SBPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [SBPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteFMaskedLoad, [SBPort23,SBPort05], 8, [1,2], 3>;
+defm : X86WriteRes<WriteFMaskedLoadY, [SBPort23,SBPort05], 9, [1,2], 3>;
+defm : X86WriteRes<WriteFStore, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteFStoreX, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteFStoreY, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteFStoreNT, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteFStoreNTX, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteFStoreNTY, [SBPort23,SBPort4], 1, [1,1], 1>;
+
+defm : X86WriteRes<WriteFMaskedStore32, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteFMaskedStore64, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+
+defm : X86WriteRes<WriteFMove, [SBPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveX, [SBPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveY, [SBPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteEMMS, [SBPort015], 31, [31], 31>;
+
+defm : SBWriteResPair<WriteFAdd, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFAddX, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFAddY, [SBPort1], 3, [1], 1, 7>;
+defm : SBWriteResPair<WriteFAddZ, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFAdd64, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFAdd64X, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFAdd64Y, [SBPort1], 3, [1], 1, 7>;
+defm : SBWriteResPair<WriteFAdd64Z, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteFCmp, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFCmpX, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFCmpY, [SBPort1], 3, [1], 1, 7>;
+defm : SBWriteResPair<WriteFCmpZ, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFCmp64, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFCmp64X, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFCmp64Y, [SBPort1], 3, [1], 1, 7>;
+defm : SBWriteResPair<WriteFCmp64Z, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteFCom, [SBPort1], 3>;
+defm : SBWriteResPair<WriteFComX, [SBPort1], 3>;
+
+defm : SBWriteResPair<WriteFMul, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFMulX, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFMulY, [SBPort0], 5, [1], 1, 7>;
+defm : SBWriteResPair<WriteFMulZ, [SBPort0], 5, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFMul64, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFMul64X, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFMul64Y, [SBPort0], 5, [1], 1, 7>;
+defm : SBWriteResPair<WriteFMul64Z, [SBPort0], 5, [1], 1, 7>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteFDiv, [SBPort0,SBFPDivider], 14, [1,14], 1, 6>;
+defm : SBWriteResPair<WriteFDivX, [SBPort0,SBFPDivider], 14, [1,14], 1, 6>;
+defm : SBWriteResPair<WriteFDivY, [SBPort0,SBPort05,SBFPDivider], 29, [2,1,28], 3, 7>;
+defm : SBWriteResPair<WriteFDivZ, [SBPort0,SBPort05,SBFPDivider], 29, [2,1,28], 3, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFDiv64, [SBPort0,SBFPDivider], 22, [1,22], 1, 6>;
+defm : SBWriteResPair<WriteFDiv64X, [SBPort0,SBFPDivider], 22, [1,22], 1, 6>;
+defm : SBWriteResPair<WriteFDiv64Y, [SBPort0,SBPort05,SBFPDivider], 45, [2,1,44], 3, 7>;
+defm : SBWriteResPair<WriteFDiv64Z, [SBPort0,SBPort05,SBFPDivider], 45, [2,1,44], 3, 7>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteFRcp, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFRcpX, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFRcpY, [SBPort0,SBPort05], 7, [2,1], 3, 7>;
+defm : SBWriteResPair<WriteFRcpZ, [SBPort0,SBPort05], 7, [2,1], 3, 7>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteFRsqrt, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFRsqrtX,[SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteFRsqrtY,[SBPort0,SBPort05], 7, [2,1], 3, 7>;
+defm : SBWriteResPair<WriteFRsqrtZ,[SBPort0,SBPort05], 7, [2,1], 3, 7>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteFSqrt, [SBPort0,SBFPDivider], 14, [1,14], 1, 6>;
+defm : SBWriteResPair<WriteFSqrtX, [SBPort0,SBFPDivider], 14, [1,14], 1, 6>;
+defm : SBWriteResPair<WriteFSqrtY, [SBPort0,SBPort05,SBFPDivider], 29, [2,1,28], 3, 7>;
+defm : SBWriteResPair<WriteFSqrtZ, [SBPort0,SBPort05,SBFPDivider], 29, [2,1,28], 3, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFSqrt64, [SBPort0,SBFPDivider], 21, [1,21], 1, 6>;
+defm : SBWriteResPair<WriteFSqrt64X, [SBPort0,SBFPDivider], 21, [1,21], 1, 6>;
+defm : SBWriteResPair<WriteFSqrt64Y, [SBPort0,SBPort05,SBFPDivider], 45, [2,1,44], 3, 7>;
+defm : SBWriteResPair<WriteFSqrt64Z, [SBPort0,SBPort05,SBFPDivider], 45, [2,1,44], 3, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFSqrt80, [SBPort0,SBFPDivider], 24, [1,24], 1, 6>;
+
+defm : SBWriteResPair<WriteDPPD, [SBPort0,SBPort1,SBPort5], 9, [1,1,1], 3, 6>;
+defm : SBWriteResPair<WriteDPPS, [SBPort0,SBPort1,SBPort5], 12, [1,2,1], 4, 6>;
+defm : SBWriteResPair<WriteDPPSY, [SBPort0,SBPort1,SBPort5], 12, [1,2,1], 4, 7>;
+defm : SBWriteResPair<WriteDPPSZ, [SBPort0,SBPort1,SBPort5], 12, [1,2,1], 4, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFSign, [SBPort5], 1>;
+defm : SBWriteResPair<WriteFRnd, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteFRndY, [SBPort1], 3, [1], 1, 7>;
+defm : SBWriteResPair<WriteFRndZ, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFLogic, [SBPort5], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteFLogicY, [SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteFLogicZ, [SBPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFTest, [SBPort0], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteFTestY, [SBPort0], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteFTestZ, [SBPort0], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFShuffle, [SBPort5], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteFShuffleY,[SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteFShuffleZ,[SBPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFVarShuffle, [SBPort5], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteFVarShuffleY,[SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteFVarShuffleZ,[SBPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFBlend, [SBPort05], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteFBlendY, [SBPort05], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteFBlendZ, [SBPort05], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteFVarBlend, [SBPort05], 2, [2], 2, 6>;
+defm : SBWriteResPair<WriteFVarBlendY,[SBPort05], 2, [2], 2, 7>;
+defm : SBWriteResPair<WriteFVarBlendZ,[SBPort05], 2, [2], 2, 7>; // Unsupported = 1
+
+// Conversion between integer and float.
+defm : SBWriteResPair<WriteCvtSS2I, [SBPort0,SBPort1], 5, [1,1], 2>;
+defm : SBWriteResPair<WriteCvtPS2I, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteCvtPS2IY, [SBPort1], 3, [1], 1, 7>;
+defm : SBWriteResPair<WriteCvtPS2IZ, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteCvtSD2I, [SBPort0,SBPort1], 5, [1,1], 2>;
+defm : SBWriteResPair<WriteCvtPD2I, [SBPort1,SBPort5], 4, [1,1], 2, 6>;
+defm : X86WriteRes<WriteCvtPD2IY, [SBPort1,SBPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPD2IZ, [SBPort1,SBPort5], 4, [1,1], 2>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtPD2IYLd, [SBPort1,SBPort5,SBPort23], 11, [1,1,1], 3>;
+defm : X86WriteRes<WriteCvtPD2IZLd, [SBPort1,SBPort5,SBPort23], 11, [1,1,1], 3>; // Unsupported = 1
+
+defm : X86WriteRes<WriteCvtI2SS, [SBPort1,SBPort5], 5, [1,2], 3>;
+defm : X86WriteRes<WriteCvtI2SSLd, [SBPort1,SBPort5,SBPort23], 10, [1,1,1], 3>;
+defm : SBWriteResPair<WriteCvtI2PS, [SBPort1], 3, [1], 1, 6>;
+defm : SBWriteResPair<WriteCvtI2PSY, [SBPort1], 3, [1], 1, 7>;
+defm : SBWriteResPair<WriteCvtI2PSZ, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtI2SD, [SBPort1,SBPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PD, [SBPort1,SBPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PDY, [SBPort1,SBPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PDZ, [SBPort1,SBPort5], 4, [1,1], 2>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtI2SDLd, [SBPort1,SBPort23], 9, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PDLd, [SBPort1,SBPort5,SBPort23], 10, [1,1,1], 3>;
+defm : X86WriteRes<WriteCvtI2PDYLd, [SBPort1,SBPort5,SBPort23], 10, [1,1,1], 3>;
+defm : X86WriteRes<WriteCvtI2PDZLd, [SBPort1,SBPort5,SBPort23], 10, [1,1,1], 3>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteCvtSS2SD, [SBPort0], 1, [1], 1, 6>;
+defm : X86WriteRes<WriteCvtPS2PD, [SBPort0,SBPort5], 2, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PDY, [SBPort0,SBPort5], 2, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PDZ, [SBPort0,SBPort5], 2, [1,1], 2>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtPS2PDLd, [SBPort0,SBPort23], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PDYLd, [SBPort0,SBPort23], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PDZLd, [SBPort0,SBPort23], 7, [1,1], 2>; // Unsupported = 1
+defm : SBWriteResPair<WriteCvtSD2SS, [SBPort1,SBPort5], 4, [1,1], 2, 6>;
+defm : SBWriteResPair<WriteCvtPD2PS, [SBPort1,SBPort5], 4, [1,1], 2, 6>;
+defm : SBWriteResPair<WriteCvtPD2PSY, [SBPort1,SBPort5], 4, [1,1], 2, 7>;
+defm : SBWriteResPair<WriteCvtPD2PSZ, [SBPort1,SBPort5], 4, [1,1], 2, 7>; // Unsupported = 1
+
+defm : SBWriteResPair<WriteCvtPH2PS, [SBPort1], 3>;
+defm : SBWriteResPair<WriteCvtPH2PSY, [SBPort1], 3>;
+defm : SBWriteResPair<WriteCvtPH2PSZ, [SBPort1], 3>; // Unsupported = 1
+
+defm : X86WriteRes<WriteCvtPS2PH, [SBPort1], 3, [1], 1>;
+defm : X86WriteRes<WriteCvtPS2PHY, [SBPort1], 3, [1], 1>;
+defm : X86WriteRes<WriteCvtPS2PHZ, [SBPort1], 3, [1], 1>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtPS2PHSt, [SBPort1, SBPort23, SBPort4], 4, [1,1,1], 1>;
+defm : X86WriteRes<WriteCvtPS2PHYSt, [SBPort1, SBPort23, SBPort4], 4, [1,1,1], 1>;
+defm : X86WriteRes<WriteCvtPS2PHZSt, [SBPort1, SBPort23, SBPort4], 4, [1,1,1], 1>; // Unsupported = 1
+
+// Vector integer operations.
+defm : X86WriteRes<WriteVecLoad, [SBPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [SBPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [SBPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT, [SBPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [SBPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [SBPort23,SBPort05], 8, [1,2], 3>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [SBPort23,SBPort05], 9, [1,2], 3>;
+defm : X86WriteRes<WriteVecStore, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteVecStoreX, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteVecStoreY, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteVecStoreNT, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteVecStoreNTY, [SBPort23,SBPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteVecMaskedStore32, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteVecMaskedStore32Y, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteVecMaskedStore64, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteVecMaskedStore64Y, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteVecMove, [SBPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [SBPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [SBPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveToGpr, [SBPort0], 2, [1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [SBPort5], 1, [1], 1>;
+
+defm : SBWriteResPair<WriteVecLogic, [SBPort015], 1, [1], 1, 5>;
+defm : SBWriteResPair<WriteVecLogicX,[SBPort015], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteVecLogicY,[SBPort015], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteVecLogicZ,[SBPort015], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteVecTest, [SBPort0,SBPort5], 2, [1,1], 2, 6>;
+defm : SBWriteResPair<WriteVecTestY, [SBPort0,SBPort5], 2, [1,1], 2, 7>;
+defm : SBWriteResPair<WriteVecTestZ, [SBPort0,SBPort5], 2, [1,1], 2, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteVecALU, [SBPort1], 3, [1], 1, 5>;
+defm : SBWriteResPair<WriteVecALUX, [SBPort15], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteVecALUY, [SBPort15], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteVecALUZ, [SBPort15], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteVecIMul, [SBPort0], 5, [1], 1, 5>;
+defm : SBWriteResPair<WriteVecIMulX, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WriteVecIMulY, [SBPort0], 5, [1], 1, 7>;
+defm : SBWriteResPair<WriteVecIMulZ, [SBPort0], 5, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WritePMULLD, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WritePMULLDY, [SBPort0], 5, [1], 1, 7>; // TODO this is probably wrong for 256/512-bit for the "generic" model
+defm : SBWriteResPair<WritePMULLDZ, [SBPort0], 5, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteShuffle, [SBPort5], 1, [1], 1, 5>;
+defm : SBWriteResPair<WriteShuffleX, [SBPort15], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteShuffleY, [SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteShuffleZ, [SBPort5], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteVarShuffle, [SBPort15], 1, [1], 1, 5>;
+defm : SBWriteResPair<WriteVarShuffleX, [SBPort15], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteVarShuffleY, [SBPort15], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteVarShuffleZ, [SBPort15], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteBlend, [SBPort15], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteBlendY, [SBPort15], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteBlendZ, [SBPort15], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteVarBlend, [SBPort15], 2, [2], 2, 6>;
+defm : SBWriteResPair<WriteVarBlendY,[SBPort15], 2, [2], 2, 7>;
+defm : SBWriteResPair<WriteVarBlendZ,[SBPort15], 2, [2], 2, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteMPSAD, [SBPort0, SBPort15], 7, [1,2], 3, 6>;
+defm : SBWriteResPair<WriteMPSADY, [SBPort0, SBPort15], 7, [1,2], 3, 7>;
+defm : SBWriteResPair<WriteMPSADZ, [SBPort0, SBPort15], 7, [1,2], 3, 7>; // Unsupported = 1
+defm : SBWriteResPair<WritePSADBW, [SBPort0], 5, [1], 1, 5>;
+defm : SBWriteResPair<WritePSADBWX, [SBPort0], 5, [1], 1, 6>;
+defm : SBWriteResPair<WritePSADBWY, [SBPort0], 5, [1], 1, 7>;
+defm : SBWriteResPair<WritePSADBWZ, [SBPort0], 5, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WritePHMINPOS, [SBPort0], 5, [1], 1, 6>;
+
+// Vector integer shifts.
+defm : SBWriteResPair<WriteVecShift, [SBPort5], 1, [1], 1, 5>;
+defm : SBWriteResPair<WriteVecShiftX, [SBPort0,SBPort15], 2, [1,1], 2, 6>;
+defm : SBWriteResPair<WriteVecShiftY, [SBPort0,SBPort15], 4, [1,1], 2, 7>;
+defm : SBWriteResPair<WriteVecShiftZ, [SBPort0,SBPort15], 4, [1,1], 2, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteVecShiftImm, [SBPort5], 1, [1], 1, 5>;
+defm : SBWriteResPair<WriteVecShiftImmX, [SBPort0], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteVecShiftImmY, [SBPort0], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteVecShiftImmZ, [SBPort0], 1, [1], 1, 7>; // Unsupported = 1
+defm : SBWriteResPair<WriteVarVecShift, [SBPort0], 1, [1], 1, 6>;
+defm : SBWriteResPair<WriteVarVecShiftY, [SBPort0], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteVarVecShiftZ, [SBPort0], 1, [1], 1, 7>; // Unsupported = 1
+
+// Vector insert/extract operations.
+def : WriteRes<WriteVecInsert, [SBPort5,SBPort15]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def : WriteRes<WriteVecInsertLd, [SBPort23,SBPort15]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+
+def : WriteRes<WriteVecExtract, [SBPort0,SBPort15]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def : WriteRes<WriteVecExtractSt, [SBPort4,SBPort23,SBPort15]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : SBWriteResPair<WriteFHAdd, [SBPort1,SBPort5], 5, [1,2], 3, 6>;
+defm : SBWriteResPair<WriteFHAddY, [SBPort1,SBPort5], 5, [1,2], 3, 7>;
+defm : SBWriteResPair<WriteFHAddZ, [SBPort1,SBPort5], 5, [1,2], 3, 7>; // Unsupported = 1
+defm : SBWriteResPair<WritePHAdd, [SBPort15], 3, [3], 3, 5>;
+defm : SBWriteResPair<WritePHAddX, [SBPort15], 3, [3], 3, 6>;
+defm : SBWriteResPair<WritePHAddY, [SBPort15], 3, [3], 3, 7>;
+defm : SBWriteResPair<WritePHAddZ, [SBPort15], 3, [3], 3, 7>; // Unsupported = 1
+
+////////////////////////////////////////////////////////////////////////////////
+// String instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [SBPort0]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrMLd, [SBPort0, SBPort23]> {
+ let Latency = 17;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [SBPort015]> {
+ let Latency = 11;
+ let ResourceCycles = [8];
+}
+def : WriteRes<WritePCmpEStrMLd, [SBPort015, SBPort23]> {
+ let Latency = 17;
+ let ResourceCycles = [7, 1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [SBPort0]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrILd, [SBPort0,SBPort23]> {
+ let Latency = 17;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [SBPort015]> {
+ let Latency = 4;
+ let ResourceCycles = [8];
+}
+def : WriteRes<WritePCmpEStrILd, [SBPort015, SBPort23]> {
+ let Latency = 10;
+ let ResourceCycles = [7, 1];
+}
+
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [SBPort0]> { let Latency = 2; }
+def : WriteRes<WriteVecMOVMSK, [SBPort0]> { let Latency = 2; }
+def : WriteRes<WriteVecMOVMSKY, [SBPort0]> { let Latency = 2; }
+def : WriteRes<WriteMMXMOVMSK, [SBPort0]> { let Latency = 1; }
+
+// AES Instructions.
+def : WriteRes<WriteAESDecEnc, [SBPort5,SBPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def : WriteRes<WriteAESDecEncLd, [SBPort5,SBPort23,SBPort015]> {
+ let Latency = 13;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+
+def : WriteRes<WriteAESIMC, [SBPort5]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESIMCLd, [SBPort5,SBPort23]> {
+ let Latency = 18;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+
+def : WriteRes<WriteAESKeyGen, [SBPort015]> {
+ let Latency = 8;
+ let ResourceCycles = [11];
+}
+def : WriteRes<WriteAESKeyGenLd, [SBPort015, SBPort23]> {
+ let Latency = 14;
+ let ResourceCycles = [10, 1];
+}
+
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [SBPort015]> {
+ let Latency = 14;
+ let ResourceCycles = [18];
+}
+def : WriteRes<WriteCLMulLd, [SBPort015, SBPort23]> {
+ let Latency = 20;
+ let ResourceCycles = [17, 1];
+}
+
+// Load/store MXCSR.
+// FIXME: This is probably wrong. Only STMXCSR should require Port4.
+def : WriteRes<WriteLDMXCSR, [SBPort0,SBPort4,SBPort5,SBPort23]> { let Latency = 5; let NumMicroOps = 4; let ResourceCycles = [1,1,1,1]; }
+def : WriteRes<WriteSTMXCSR, [SBPort0,SBPort4,SBPort5,SBPort23]> { let Latency = 5; let NumMicroOps = 4; let ResourceCycles = [1,1,1,1]; }
+
+def : WriteRes<WriteSystem, [SBPort015]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [SBPort015]> { let Latency = 100; }
+def : WriteRes<WriteFence, [SBPort23, SBPort4]>;
+def : WriteRes<WriteNop, []>;
+
+// AVX2/FMA is not supported on that architecture, but we should define the basic
+// scheduling resources anyway.
+defm : SBWriteResPair<WriteFShuffle256, [SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteFVarShuffle256, [SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteShuffle256, [SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteVarShuffle256, [SBPort5], 1, [1], 1, 7>;
+defm : SBWriteResPair<WriteFMA, [SBPort01], 5>;
+defm : SBWriteResPair<WriteFMAX, [SBPort01], 5>;
+defm : SBWriteResPair<WriteFMAY, [SBPort01], 5>;
+defm : SBWriteResPair<WriteFMAZ, [SBPort01], 5>; // Unsupported = 1
+
+// Remaining SNB instrs.
+
+def SBWriteResGroup1 : SchedWriteRes<[SBPort1]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup1], (instrs COMP_FST0r,
+ COM_FST0r,
+ UCOM_FPr,
+ UCOM_Fr)>;
+
+def SBWriteResGroup2 : SchedWriteRes<[SBPort5]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup2], (instrs FDECSTP, FINCSTP, FFREE, FFREEP, FNOP,
+ LD_Frr, ST_Frr, ST_FPrr)>;
+def: InstRW<[SBWriteResGroup2], (instrs LOOP, LOOPE, LOOPNE)>; // FIXME: This seems wrong compared to other Intel CPUs.
+def: InstRW<[SBWriteResGroup2], (instrs RETQ)>;
+
+def SBWriteResGroup4 : SchedWriteRes<[SBPort05]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup4], (instrs CDQ, CQO)>;
+
+def SBWriteResGroup5 : SchedWriteRes<[SBPort15]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup5], (instrs MMX_PABSBrr,
+ MMX_PABSDrr,
+ MMX_PABSWrr,
+ MMX_PADDQirr,
+ MMX_PALIGNRrri,
+ MMX_PSIGNBrr,
+ MMX_PSIGNDrr,
+ MMX_PSIGNWrr)>;
+
+def SBWriteResGroup11 : SchedWriteRes<[SBPort015]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SBWriteResGroup11], (instrs SCASB,
+ SCASL,
+ SCASQ,
+ SCASW)>;
+
+def SBWriteResGroup12 : SchedWriteRes<[SBPort0,SBPort1]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup12], (instregex "(V?)(U?)COMI(SD|SS)rr")>;
+
+def SBWriteResGroup15 : SchedWriteRes<[SBPort0,SBPort015]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup15], (instrs CWD,
+ FNSTSW16r)>;
+
+def SBWriteResGroup18 : SchedWriteRes<[SBPort5,SBPort015]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup18], (instrs JCXZ, JECXZ, JRCXZ,
+ MMX_MOVDQ2Qrr)>;
+
+def SBWriteResGroup21 : SchedWriteRes<[SBPort1]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup21], (instrs PUSHFS64)>;
+
+def SBWriteResGroup22 : SchedWriteRes<[SBPort0,SBPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup22], (instregex "(V?)EXTRACTPSrr")>;
+
+def SBWriteResGroup23 : SchedWriteRes<[SBPort05]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def: InstRW<[SBWriteResGroup23], (instregex "RCL(8|16|32|64)r1",
+ "RCR(8|16|32|64)r1")>;
+
+def SBWriteResGroup25_1 : SchedWriteRes<[SBPort23,SBPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SBWriteResGroup25_1], (instrs LEAVE, LEAVE64)>;
+
+def SBWriteResGroup26_2 : SchedWriteRes<[SBPort0,SBPort1,SBPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup26_2], (instrs COM_FIPr, COM_FIr, UCOM_FIPr, UCOM_FIr)>;
+
+def SBWriteResGroup29 : SchedWriteRes<[SBPort1,SBPort015]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup29], (instrs MOV64sr)>;
+
+def SBWriteResGroup29_2 : SchedWriteRes<[SBPort5,SBPort015]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[SBWriteResGroup29_2], (instrs PAUSE)>;
+
+def SBWriteResGroup31 : SchedWriteRes<[SBPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup31], (instregex "MOVSX(16|32|64)rm(8|16|32)",
+ "MOVZX(16|32|64)rm(8|16)")>;
+
+def SBWriteResGroup76 : SchedWriteRes<[SBPort05]> {
+ let Latency = 5;
+ let NumMicroOps = 8;
+ let ResourceCycles = [8];
+}
+def: InstRW<[SBWriteResGroup76], (instregex "RCL(8|16|32|64)r(i|CL)",
+ "RCR(8|16|32|64)r(i|CL)")>;
+
+def SBWriteResGroup33 : SchedWriteRes<[SBPort4,SBPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup33], (instregex "PUSH(16r|32r|64r|64i8)")>;
+
+def SBWriteResGroup35 : SchedWriteRes<[SBPort1,SBPort5]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SBWriteResGroup35], (instrs CLI)>;
+
+def SBWriteResGroup35_2 : SchedWriteRes<[SBPort1,SBPort4,SBPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup35_2], (instrs PUSHGS64)>;
+def: InstRW<[SBWriteResGroup35_2], (instregex "ISTT_FP(16|32|64)m")>;
+
+def SBWriteResGroup36 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup36], (instrs CALL64pcrel32)>;
+def: InstRW<[SBWriteResGroup36], (instregex "CALL(16|32|64)r",
+ "(V?)EXTRACTPSmr")>;
+
+def SBWriteResGroup40 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup40], (instrs STOSB, STOSL, STOSQ, STOSW)>;
+
+def SBWriteResGroup41 : SchedWriteRes<[SBPort5,SBPort015]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[SBWriteResGroup41], (instrs FNINIT)>;
+
+def SBWriteResGroup45 : SchedWriteRes<[SBPort0,SBPort4,SBPort23,SBPort15]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SBWriteResGroup45], (instregex "(V?)PEXTR(D|Q)mr",
+ "PUSHF(16|64)")>;
+
+def SBWriteResGroup46 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SBWriteResGroup46], (instregex "CLFLUSH")>;
+
+def SBWriteResGroup47 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[SBWriteResGroup47], (instregex "FXRSTOR")>;
+
+def SBWriteResGroup48 : SchedWriteRes<[SBPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup48], (instrs MMX_MOVD64from64rm,
+ VBROADCASTSSrm)>;
+def: InstRW<[SBWriteResGroup48], (instregex "POP(16|32|64)r",
+ "(V?)MOV64toPQIrm",
+ "(V?)MOVDDUPrm",
+ "(V?)MOVDI2PDIrm",
+ "(V?)MOVQI2PQIrm",
+ "(V?)MOVSDrm",
+ "(V?)MOVSHDUPrm",
+ "(V?)MOVSLDUPrm",
+ "(V?)MOVSSrm")>;
+
+def SBWriteResGroup49 : SchedWriteRes<[SBPort5,SBPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup49], (instrs MOV16sm)>;
+
+def SBWriteResGroup51 : SchedWriteRes<[SBPort23,SBPort15]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup51], (instrs MMX_PABSBrm,
+ MMX_PABSDrm,
+ MMX_PABSWrm,
+ MMX_PALIGNRrmi,
+ MMX_PSIGNBrm,
+ MMX_PSIGNDrm,
+ MMX_PSIGNWrm)>;
+
+def SBWriteResGroup52 : SchedWriteRes<[SBPort23,SBPort015]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup52], (instrs LODSL, LODSQ)>;
+
+def SBWriteResGroup53 : SchedWriteRes<[SBPort4,SBPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SBWriteResGroup53], (instregex "ST_F(32|64)m",
+ "ST_FP(32|64|80)m")>;
+
+def SBWriteResGroup54 : SchedWriteRes<[SBPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup54], (instrs VBROADCASTSDYrm,
+ VBROADCASTSSYrm,
+ VMOVDDUPYrm,
+ VMOVSHDUPYrm,
+ VMOVSLDUPYrm)>;
+
+def SBWriteResGroup58 : SchedWriteRes<[SBPort23,SBPort05]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup58], (instrs VINSERTF128rm)>;
+
+def SBWriteResGroup59 : SchedWriteRes<[SBPort23,SBPort15]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup59], (instrs MMX_PADDQirm)>;
+
+def SBWriteResGroup62 : SchedWriteRes<[SBPort5,SBPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SBWriteResGroup62], (instrs VERRm, VERWm)>;
+
+def SBWriteResGroup63 : SchedWriteRes<[SBPort23,SBPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SBWriteResGroup63], (instrs LODSB, LODSW)>;
+
+def SBWriteResGroup64 : SchedWriteRes<[SBPort5,SBPort01,SBPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup64], (instrs FARJMP64m)>;
+
+def SBWriteResGroup66 : SchedWriteRes<[SBPort0,SBPort4,SBPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
+}
+def: InstRW<[SBWriteResGroup66], (instrs FNSTSWm)>;
+
+def SBWriteResGroup67 : SchedWriteRes<[SBPort1,SBPort5,SBPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SBWriteResGroup67], (instregex "SLDT(16|32|64)r",
+ "STR(16|32|64)r")>;
+
+def SBWriteResGroup68 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
+}
+def: InstRW<[SBWriteResGroup68], (instrs FNSTCW16m)>;
+def: InstRW<[SBWriteResGroup68], (instregex "CALL(16|32|64)m")>;
+
+def SBWriteResGroup69 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SBWriteResGroup69], (instregex "SAR(8|16|32|64)m(1|i)",
+ "SHL(8|16|32|64)m(1|i)",
+ "SHR(8|16|32|64)m(1|i)")>;
+
+def SBWriteResGroup77 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup77], (instregex "(V?)(U?)COMI(SD|SS)rm")>;
+
+def SBWriteResGroup81 : SchedWriteRes<[SBPort4, SBPort23, SBPort015]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 2, 1];
+}
+def: InstRW<[SBWriteResGroup81], (instregex "CMPXCHG(8|16)B")>;
+
+def SBWriteResGroup83 : SchedWriteRes<[SBPort23,SBPort015]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [2,3];
+}
+def: InstRW<[SBWriteResGroup83], (instrs CMPSB,
+ CMPSL,
+ CMPSQ,
+ CMPSW)>;
+
+def SBWriteResGroup84 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,2];
+}
+def: InstRW<[SBWriteResGroup84], (instrs FLDCW16m)>;
+
+def SBWriteResGroup85 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,2];
+}
+def: InstRW<[SBWriteResGroup85], (instregex "ROL(8|16|32|64)m(1|i)",
+ "ROR(8|16|32|64)m(1|i)")>;
+
+def SBWriteResGroup86 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,2];
+}
+def: InstRW<[SBWriteResGroup86], (instrs MOVSB, MOVSL, MOVSQ, MOVSW)>;
+def: InstRW<[SBWriteResGroup86], (instregex "XADD(8|16|32|64)rm")>;
+
+def SBWriteResGroup87 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[SBWriteResGroup87], (instrs FARCALL64m)>;
+
+def SBWriteResGroup93 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup93], (instregex "CVT(T?)(SD|SS)2SI(64)?rm")>;
+
+def SBWriteResGroup95 : SchedWriteRes<[SBPort5,SBPort01,SBPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup95], (instregex "LD_F(32|64|80)m")>;
+
+def SBWriteResGroup97 : SchedWriteRes<[SBPort1,SBPort4,SBPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
+}
+def: InstRW<[SBWriteResGroup97], (instregex "IST_F(16|32)m",
+ "IST_FP(16|32|64)m")>;
+
+def SBWriteResGroup97_2 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,2,3];
+}
+def: InstRW<[SBWriteResGroup97_2], (instregex "ROL(8|16|32|64)mCL",
+ "ROR(8|16|32|64)mCL",
+ "SAR(8|16|32|64)mCL",
+ "SHL(8|16|32|64)mCL",
+ "SHR(8|16|32|64)mCL")>;
+
+def SBWriteResGroup98 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,2,3];
+}
+def: SchedAlias<WriteADCRMW, SBWriteResGroup98>;
+
+def SBWriteResGroup99 : SchedWriteRes<[SBPort4,SBPort23,SBPort05,SBPort015]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,2,2,1];
+}
+def: InstRW<[SBWriteResGroup99, ReadAfterLd], (instrs ADC8mr, ADC16mr, ADC32mr, ADC64mr,
+ SBB8mr, SBB16mr, SBB32mr, SBB64mr)>;
+
+def SBWriteResGroup100 : SchedWriteRes<[SBPort4,SBPort5,SBPort23,SBPort05,SBPort015]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,2,1,1];
+}
+def : SchedAlias<WriteBitTestRegLd, SBWriteResGroup100>; // TODO - this is incorrect - no RMW
+
+def SBWriteResGroup101 : SchedWriteRes<[SBPort1,SBPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup101], (instregex "(ADD|SUB|SUBR)_F(32|64)m",
+ "ILD_F(16|32|64)m")>;
+
+def SBWriteResGroup104 : SchedWriteRes<[SBPort0,SBPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup104], (instregex "(V?)PCMPGTQrm")>;
+
+def SBWriteResGroup106 : SchedWriteRes<[SBPort1,SBPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SBWriteResGroup106], (instregex "FICOM(P?)(16|32)m")>;
+
+def SBWriteResGroup108 : SchedWriteRes<[SBPort05,SBPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 11;
+ let ResourceCycles = [7,4];
+}
+def: InstRW<[SBWriteResGroup108], (instregex "RCL(8|16|32|64)m",
+ "RCR(8|16|32|64)m")>;
+
+def SBWriteResGroup111 : SchedWriteRes<[SBPort0,SBPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup111], (instregex "MUL_F(32|64)m")>;
+
+def SBWriteResGroup114 : SchedWriteRes<[SBPort1,SBPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SBWriteResGroup114], (instregex "(ADD|SUB|SUBR)_FI(16|32)m")>;
+
+def SBWriteResGroup119 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
+ let Latency = 15;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup119], (instregex "MUL_FI(16|32)m")>;
+
+def SBWriteResGroup130 : SchedWriteRes<[SBPort0,SBPort23]> {
+ let Latency = 31;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup130], (instregex "DIV(R?)_F(32|64)m")>;
+
+def SBWriteResGroup131 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
+ let Latency = 34;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup131], (instregex "DIV(R?)_FI(16|32)m")>;
+
+def SBWriteResGroupVzeroall : SchedWriteRes<[SBPort5]> {
+ let Latency = 9;
+ let NumMicroOps = 20;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SBWriteResGroupVzeroall], (instrs VZEROALL)>;
+
+def SBWriteResGroupVzeroupper : SchedWriteRes<[]> {
+ let Latency = 1;
+ let NumMicroOps = 4;
+ let ResourceCycles = [];
+}
+def: InstRW<[SBWriteResGroupVzeroupper], (instrs VZEROUPPER)>;
+
+def: InstRW<[WriteZero], (instrs CLC)>;
+
+// Instruction variants handled by the renamer. These might not need execution
+// ports in certain conditions.
+// See Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
+// section "Sandy Bridge and Ivy Bridge Pipeline" > "Register allocation and
+// renaming".
+// These can be investigated with llvm-exegesis, e.g.
+// echo 'pxor %mm0, %mm0' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+// echo 'vxorpd %xmm0, %xmm0, %xmm1' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+
+def SBWriteZeroLatency : SchedWriteRes<[]> {
+ let Latency = 0;
+}
+
+def SBWriteZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SBWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteALU]>
+]>;
+def : InstRW<[SBWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
+ XOR32rr, XOR64rr)>;
+
+def SBWriteFZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SBWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogic]>
+]>;
+def : InstRW<[SBWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr,
+ VXORPDrr)>;
+
+def SBWriteFZeroIdiomY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SBWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogicY]>
+]>;
+def : InstRW<[SBWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr)>;
+
+def SBWriteVZeroIdiomLogicX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SBWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicX]>
+]>;
+def : InstRW<[SBWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr)>;
+
+def SBWriteVZeroIdiomALUX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SBWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUX]>
+]>;
+def : InstRW<[SBWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
+ PSUBDrr, VPSUBDrr,
+ PSUBQrr, VPSUBQrr,
+ PSUBWrr, VPSUBWrr,
+ PCMPGTBrr, VPCMPGTBrr,
+ PCMPGTDrr, VPCMPGTDrr,
+ PCMPGTWrr, VPCMPGTWrr)>;
+
+def SBWritePCMPGTQ : SchedWriteRes<[SBPort0]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def SBWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SBWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [SBWritePCMPGTQ]>
+]>;
+def : InstRW<[SBWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr)>;
+
+// CMOVs that use both the Z and C flags require an extra uop.
+def SBWriteCMOVA_CMOVBErr : SchedWriteRes<[SBPort05,SBPort015]> {
+ let Latency = 3;
+ let ResourceCycles = [2,1];
+ let NumMicroOps = 3;
+}
+
+def SBWriteCMOVA_CMOVBErm : SchedWriteRes<[SBPort23,SBPort05,SBPort015]> {
+ let Latency = 8;
+ let ResourceCycles = [1,2,1];
+ let NumMicroOps = 4;
+}
+
+def SBCMOVA_CMOVBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArr_Or_CMOVBErr>, [SBWriteCMOVA_CMOVBErr]>,
+ SchedVar<NoSchedPred, [WriteCMOV]>
+]>;
+
+def SBCMOVA_CMOVBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArm_Or_CMOVBErm>, [SBWriteCMOVA_CMOVBErm]>,
+ SchedVar<NoSchedPred, [WriteCMOV.Folded]>
+]>;
+
+def : InstRW<[SBCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>;
+def : InstRW<[SBCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>;
+
+// SETCCs that use both the Z and C flags require an extra uop.
+def SBWriteSETA_SETBEr : SchedWriteRes<[SBPort05]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def SBWriteSETA_SETBEm : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
+ let Latency = 3;
+ let ResourceCycles = [1,1,2];
+ let NumMicroOps = 4;
+}
+
+def SBSETA_SETBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAr_Or_SETBEr>, [SBWriteSETA_SETBEr]>,
+ SchedVar<NoSchedPred, [WriteSETCC]>
+]>;
+
+def SBSETA_SETBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAm_Or_SETBEm>, [SBWriteSETA_SETBEm]>,
+ SchedVar<NoSchedPred, [WriteSETCCStore]>
+]>;
+
+def : InstRW<[SBSETA_SETBErr], (instrs SETCCr)>;
+def : InstRW<[SBSETA_SETBErm], (instrs SETCCm)>;
+
+} // SchedModel
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeClient.td
new file mode 100644
index 000000000000..0599564765da
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -0,0 +1,1894 @@
+//=- X86SchedSkylakeClient.td - X86 Skylake Client Scheduling -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Skylake Client to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def SkylakeClientModel : SchedMachineModel {
+ // All x86 instructions are modeled as a single micro-op, and Skylake can
+ // decode 6 instructions per cycle.
+ let IssueWidth = 6;
+ let MicroOpBufferSize = 224; // Based on the reorder buffer.
+ let LoadLatency = 5;
+ let MispredictPenalty = 14;
+
+ // Based on the LSD (loop-stream detector) queue size and benchmarking data.
+ let LoopMicroOpBufferSize = 50;
+
+ // This flag is set to allow the scheduler to assign a default model to
+ // unrecognized opcodes.
+ let CompleteModel = 0;
+}
+
+let SchedModel = SkylakeClientModel in {
+
+// Skylake Client can issue micro-ops to 8 different ports in one cycle.
+
+// Ports 0, 1, 5, and 6 handle all computation.
+// Port 4 gets the data half of stores. Store data can be available later than
+// the store address, but since we don't model the latency of stores, we can
+// ignore that.
+// Ports 2 and 3 are identical. They handle loads and the address half of
+// stores. Port 7 can handle address calculations.
+def SKLPort0 : ProcResource<1>;
+def SKLPort1 : ProcResource<1>;
+def SKLPort2 : ProcResource<1>;
+def SKLPort3 : ProcResource<1>;
+def SKLPort4 : ProcResource<1>;
+def SKLPort5 : ProcResource<1>;
+def SKLPort6 : ProcResource<1>;
+def SKLPort7 : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def SKLPort01 : ProcResGroup<[SKLPort0, SKLPort1]>;
+def SKLPort23 : ProcResGroup<[SKLPort2, SKLPort3]>;
+def SKLPort237 : ProcResGroup<[SKLPort2, SKLPort3, SKLPort7]>;
+def SKLPort04 : ProcResGroup<[SKLPort0, SKLPort4]>;
+def SKLPort05 : ProcResGroup<[SKLPort0, SKLPort5]>;
+def SKLPort06 : ProcResGroup<[SKLPort0, SKLPort6]>;
+def SKLPort15 : ProcResGroup<[SKLPort1, SKLPort5]>;
+def SKLPort16 : ProcResGroup<[SKLPort1, SKLPort6]>;
+def SKLPort56 : ProcResGroup<[SKLPort5, SKLPort6]>;
+def SKLPort015 : ProcResGroup<[SKLPort0, SKLPort1, SKLPort5]>;
+def SKLPort056 : ProcResGroup<[SKLPort0, SKLPort5, SKLPort6]>;
+def SKLPort0156: ProcResGroup<[SKLPort0, SKLPort1, SKLPort5, SKLPort6]>;
+
+def SKLDivider : ProcResource<1>; // Integer division issued on port 0.
+// FP division and sqrt on port 0.
+def SKLFPDivider : ProcResource<1>;
+
+// 60 Entry Unified Scheduler
+def SKLPortAny : ProcResGroup<[SKLPort0, SKLPort1, SKLPort2, SKLPort3, SKLPort4,
+ SKLPort5, SKLPort6, SKLPort7]> {
+ let BufferSize=60;
+}
+
+// Integer loads are 5 cycles, so ReadAfterLd registers needn't be available until 5
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 5>;
+
+// Vector loads are 5/6/7 cycles, so ReadAfterVec*Ld registers needn't be available
+// until 5/6/7 cycles after the memory operand.
+def : ReadAdvance<ReadAfterVecLd, 5>;
+def : ReadAdvance<ReadAfterVecXLd, 6>;
+def : ReadAdvance<ReadAfterVecYLd, 7>;
+
+def : ReadAdvance<ReadInt2Fpu, 0>;
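+
+// A worked example of the effect (a sketch; it assumes the register operand of
+// a reg-mem ALU instruction is tagged ReadAfterLd, as set up in
+// X86Schedule.td): the folded write of a 1-cycle ALU op gets Latency = 1 + 5
+// (LoadLatency) = 6, but because the register operand is read 5 cycles late, a
+// chain of such instructions dependent through that register still costs only
+// 6 - 5 = 1 cycle per link instead of 6.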
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass SKLWriteResPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1,
+ int LoadLat = 5> {
+ // The register variant uses a single cycle on ExePort.
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+
+ // Memory variant also uses a cycle on port 2/3 and adds LoadLat cycles to
+ // the latency (default = 5).
+ def : WriteRes<SchedRW.Folded, !listconcat([SKLPort23], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = !add(UOps, 1);
+ }
+}
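+
+// For illustration, a sketch of what one such pair expands to (assuming
+// WriteALU.Folded resolves to WriteALULd, as set up in X86Schedule.td):
+//
+//   defm : SKLWriteResPair<WriteALU, [SKLPort0156], 1>;
+//
+// is roughly equivalent to:
+//
+//   def : WriteRes<WriteALU, [SKLPort0156]> {
+//     let Latency = 1;
+//     let ResourceCycles = [1];
+//     let NumMicroOps = 1;
+//   }
+//   def : WriteRes<WriteALULd, [SKLPort23, SKLPort0156]> {
+//     let Latency = 6;             // 1 + the default LoadLat of 5.
+//     let ResourceCycles = [1, 1]; // One extra cycle on the load ports.
+//     let NumMicroOps = 2;
+//   }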
+
+// A folded store needs a cycle on port 4 for the store data, and an extra port
+// 2/3/7 cycle to recompute the address.
+def : WriteRes<WriteRMW, [SKLPort237,SKLPort4]>;
+
+// Arithmetic.
+defm : SKLWriteResPair<WriteALU, [SKLPort0156], 1>; // Simple integer ALU op.
+defm : SKLWriteResPair<WriteADC, [SKLPort06], 1>; // Integer ALU + flags op.
+
+// Integer multiplication.
+defm : SKLWriteResPair<WriteIMul8, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteIMul16, [SKLPort1,SKLPort06,SKLPort0156], 4, [1,1,2], 4>;
+defm : X86WriteRes<WriteIMul16Imm, [SKLPort1,SKLPort0156], 4, [1,1], 2>;
+defm : X86WriteRes<WriteIMul16ImmLd, [SKLPort1,SKLPort0156,SKLPort23], 8, [1,1,1], 3>;
+defm : SKLWriteResPair<WriteIMul16Reg, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteIMul32, [SKLPort1,SKLPort06,SKLPort0156], 4, [1,1,1], 3>;
+defm : SKLWriteResPair<WriteIMul32Imm, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteIMul32Reg, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteIMul64, [SKLPort1,SKLPort5], 4, [1,1], 2>;
+defm : SKLWriteResPair<WriteIMul64Imm, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteIMul64Reg, [SKLPort1], 3>;
+def : WriteRes<WriteIMulH, []> { let Latency = 3; }
+
+defm : X86WriteRes<WriteBSWAP32, [SKLPort15], 1, [1], 1>;
+defm : X86WriteRes<WriteBSWAP64, [SKLPort06, SKLPort15], 2, [1,1], 2>;
+defm : X86WriteRes<WriteCMPXCHG,[SKLPort06, SKLPort0156], 5, [2,3], 5>;
+defm : X86WriteRes<WriteCMPXCHGRMW,[SKLPort23,SKLPort06,SKLPort0156,SKLPort237,SKLPort4], 8, [1,2,1,1,1], 6>;
+defm : X86WriteRes<WriteXCHG, [SKLPort0156], 2, [3], 3>;
+
+// TODO: Why isn't the SKLDivider used?
+defm : SKLWriteResPair<WriteDiv8, [SKLPort0,SKLDivider], 25, [1,10], 1, 4>;
+defm : X86WriteRes<WriteDiv16, [SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort05,SKLPort0156], 76, [7,2,8,3,1,11], 32>;
+defm : X86WriteRes<WriteDiv32, [SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort05,SKLPort0156], 76, [7,2,8,3,1,11], 32>;
+defm : X86WriteRes<WriteDiv64, [SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort05,SKLPort0156], 76, [7,2,8,3,1,11], 32>;
+defm : X86WriteRes<WriteDiv16Ld, [SKLPort0,SKLPort23,SKLDivider], 29, [1,1,10], 2>;
+defm : X86WriteRes<WriteDiv32Ld, [SKLPort0,SKLPort23,SKLDivider], 29, [1,1,10], 2>;
+defm : X86WriteRes<WriteDiv64Ld, [SKLPort0,SKLPort23,SKLDivider], 29, [1,1,10], 2>;
+
+defm : X86WriteRes<WriteIDiv8, [SKLPort0,SKLDivider], 25, [1,10], 1>;
+defm : X86WriteRes<WriteIDiv16, [SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort06,SKLPort0156], 102, [4,2,4,8,14,34], 66>;
+defm : X86WriteRes<WriteIDiv32, [SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort06,SKLPort0156], 102, [4,2,4,8,14,34], 66>;
+defm : X86WriteRes<WriteIDiv64, [SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort06,SKLPort0156], 102, [4,2,4,8,14,34], 66>;
+defm : X86WriteRes<WriteIDiv8Ld, [SKLPort0,SKLPort5,SKLPort23,SKLPort0156], 28, [2,4,1,1], 8>;
+defm : X86WriteRes<WriteIDiv16Ld, [SKLPort0,SKLPort5,SKLPort23,SKLPort0156], 28, [2,4,1,1], 8>;
+defm : X86WriteRes<WriteIDiv32Ld, [SKLPort0,SKLPort5,SKLPort23,SKLPort0156], 28, [2,4,1,1], 8>;
+defm : X86WriteRes<WriteIDiv64Ld, [SKLPort0,SKLPort5,SKLPort23,SKLPort0156], 28, [2,4,1,1], 8>;
+
+defm : SKLWriteResPair<WriteCRC32, [SKLPort1], 3>;
+
+def : WriteRes<WriteLEA, [SKLPort15]>; // LEA instructions can't fold loads.
+
+defm : SKLWriteResPair<WriteCMOV, [SKLPort06], 1, [1], 1>; // Conditional move.
+defm : X86WriteRes<WriteFCMOV, [SKLPort1], 3, [1], 1>; // x87 conditional move.
+def : WriteRes<WriteSETCC, [SKLPort06]>; // Setcc.
+def : WriteRes<WriteSETCCStore, [SKLPort06,SKLPort4,SKLPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+defm : X86WriteRes<WriteLAHFSAHF, [SKLPort06], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTest, [SKLPort06], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTestImmLd, [SKLPort06,SKLPort23], 6, [1,1], 2>;
+defm : X86WriteRes<WriteBitTestRegLd, [SKLPort0156,SKLPort23], 6, [1,1], 2>;
+defm : X86WriteRes<WriteBitTestSet, [SKLPort06], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTestSetImmLd, [SKLPort06,SKLPort23], 5, [1,1], 3>;
+defm : X86WriteRes<WriteBitTestSetRegLd, [SKLPort0156,SKLPort23], 5, [1,1], 2>;
+
+// Bit counts.
+defm : SKLWriteResPair<WriteBSF, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteBSR, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteLZCNT, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteTZCNT, [SKLPort1], 3>;
+defm : SKLWriteResPair<WritePOPCNT, [SKLPort1], 3>;
+
+// Integer shifts and rotates.
+defm : SKLWriteResPair<WriteShift, [SKLPort06], 1>;
+defm : SKLWriteResPair<WriteShiftCL, [SKLPort06], 3, [3], 3>;
+defm : SKLWriteResPair<WriteRotate, [SKLPort06], 1, [1], 1>;
+defm : SKLWriteResPair<WriteRotateCL, [SKLPort06], 3, [3], 3>;
+
+// SHLD/SHRD.
+defm : X86WriteRes<WriteSHDrri, [SKLPort1], 3, [1], 1>;
+defm : X86WriteRes<WriteSHDrrcl,[SKLPort1,SKLPort06,SKLPort0156], 6, [1, 2, 1], 4>;
+defm : X86WriteRes<WriteSHDmri, [SKLPort1,SKLPort23,SKLPort237,SKLPort0156], 9, [1, 1, 1, 1], 4>;
+defm : X86WriteRes<WriteSHDmrcl,[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort0156], 11, [1, 1, 1, 2, 1], 6>;
+
+// BMI1 BEXTR/BLS, BMI2 BZHI
+defm : SKLWriteResPair<WriteBEXTR, [SKLPort06,SKLPort15], 2, [1,1], 2>;
+defm : SKLWriteResPair<WriteBLS, [SKLPort15], 1>;
+defm : SKLWriteResPair<WriteBZHI, [SKLPort15], 1>;
+
+// Loads, stores, and moves, not folded with other operations.
+defm : X86WriteRes<WriteLoad, [SKLPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteStore, [SKLPort237, SKLPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteStoreNT, [SKLPort237, SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteMove, [SKLPort0156], 1, [1], 1>;
+
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+def : WriteRes<WriteZero, []>;
+
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+defm : SKLWriteResPair<WriteJump, [SKLPort06], 1>;
+
+// Floating point. This covers both scalar and vector operations.
+defm : X86WriteRes<WriteFLD0, [SKLPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteFLD1, [SKLPort05], 1, [2], 2>;
+defm : X86WriteRes<WriteFLDC, [SKLPort05], 1, [2], 2>;
+defm : X86WriteRes<WriteFLoad, [SKLPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteFLoadX, [SKLPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [SKLPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteFMaskedLoad, [SKLPort23,SKLPort015], 7, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedLoadY, [SKLPort23,SKLPort015], 8, [1,1], 2>;
+defm : X86WriteRes<WriteFStore, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreX, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreY, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNT, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTX, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTY, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+
+defm : X86WriteRes<WriteFMaskedStore32, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore64, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+
+defm : X86WriteRes<WriteFMove, [SKLPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveX, [SKLPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveY, [SKLPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteEMMS, [SKLPort05,SKLPort0156], 10, [9,1], 10>;
+
+defm : SKLWriteResPair<WriteFAdd, [SKLPort01], 4, [1], 1, 5>; // Floating point add/sub.
+defm : SKLWriteResPair<WriteFAddX, [SKLPort01], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFAddY, [SKLPort01], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFAddZ>;
+defm : SKLWriteResPair<WriteFAdd64, [SKLPort01], 4, [1], 1, 5>; // Floating point double add/sub.
+defm : SKLWriteResPair<WriteFAdd64X, [SKLPort01], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFAdd64Y, [SKLPort01], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
+
+defm : SKLWriteResPair<WriteFCmp, [SKLPort01], 4, [1], 1, 5>; // Floating point compare.
+defm : SKLWriteResPair<WriteFCmpX, [SKLPort01], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFCmpY, [SKLPort01], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFCmpZ>;
+defm : SKLWriteResPair<WriteFCmp64, [SKLPort01], 4, [1], 1, 5>; // Floating point double compare.
+defm : SKLWriteResPair<WriteFCmp64X, [SKLPort01], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFCmp64Y, [SKLPort01], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
+
+defm : SKLWriteResPair<WriteFCom, [SKLPort0], 2>; // Floating point compare to flags (X87).
+defm : SKLWriteResPair<WriteFComX, [SKLPort0], 2>; // Floating point compare to flags (SSE).
+
+defm : SKLWriteResPair<WriteFMul, [SKLPort01], 4, [1], 1, 5>; // Floating point multiplication.
+defm : SKLWriteResPair<WriteFMulX, [SKLPort01], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFMulY, [SKLPort01], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFMulZ>;
+defm : SKLWriteResPair<WriteFMul64, [SKLPort01], 4, [1], 1, 5>; // Floating point double multiplication.
+defm : SKLWriteResPair<WriteFMul64X, [SKLPort01], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFMul64Y, [SKLPort01], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+
+defm : SKLWriteResPair<WriteFDiv, [SKLPort0,SKLFPDivider], 11, [1,3], 1, 5>; // Floating point division.
+//defm : SKLWriteResPair<WriteFDivX, [SKLPort0,SKLFPDivider], 11, [1,3], 1, 6>;
+defm : SKLWriteResPair<WriteFDivY, [SKLPort0,SKLFPDivider], 11, [1,5], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFDivZ>;
+//defm : SKLWriteResPair<WriteFDiv64, [SKLPort0,SKLFPDivider], 14, [1,3], 1, 5>; // Floating point double division.
+//defm : SKLWriteResPair<WriteFDiv64X, [SKLPort0,SKLFPDivider], 14, [1,3], 1, 6>;
+//defm : SKLWriteResPair<WriteFDiv64Y, [SKLPort0,SKLFPDivider], 14, [1,5], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+
+defm : SKLWriteResPair<WriteFSqrt, [SKLPort0,SKLFPDivider], 12, [1,3], 1, 5>; // Floating point square root.
+defm : SKLWriteResPair<WriteFSqrtX, [SKLPort0,SKLFPDivider], 12, [1,3], 1, 6>;
+defm : SKLWriteResPair<WriteFSqrtY, [SKLPort0,SKLFPDivider], 12, [1,6], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
+defm : SKLWriteResPair<WriteFSqrt64, [SKLPort0,SKLFPDivider], 18, [1,6], 1, 5>; // Floating point double square root.
+defm : SKLWriteResPair<WriteFSqrt64X, [SKLPort0,SKLFPDivider], 18, [1,6], 1, 6>;
+defm : SKLWriteResPair<WriteFSqrt64Y, [SKLPort0,SKLFPDivider], 18, [1,12],1, 7>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
+defm : SKLWriteResPair<WriteFSqrt80, [SKLPort0,SKLFPDivider], 21, [1,7]>; // Floating point long double square root.
+
+defm : SKLWriteResPair<WriteFRcp, [SKLPort0], 4, [1], 1, 5>; // Floating point reciprocal estimate.
+defm : SKLWriteResPair<WriteFRcpX, [SKLPort0], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFRcpY, [SKLPort0], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+
+defm : SKLWriteResPair<WriteFRsqrt, [SKLPort0], 4, [1], 1, 5>; // Floating point reciprocal square root estimate.
+defm : SKLWriteResPair<WriteFRsqrtX,[SKLPort0], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFRsqrtY,[SKLPort0], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+
+defm : SKLWriteResPair<WriteFMA, [SKLPort01], 4, [1], 1, 5>; // Fused Multiply Add.
+defm : SKLWriteResPair<WriteFMAX, [SKLPort01], 4, [1], 1, 6>;
+defm : SKLWriteResPair<WriteFMAY, [SKLPort01], 4, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
+defm : SKLWriteResPair<WriteDPPD, [SKLPort5,SKLPort01], 9, [1,2], 3, 6>; // Floating point double dot product.
+defm : SKLWriteResPair<WriteDPPS, [SKLPort5,SKLPort01], 13, [1,3], 4, 6>;
+defm : SKLWriteResPair<WriteDPPSY, [SKLPort5,SKLPort01], 13, [1,3], 4, 7>;
+defm : X86WriteResPairUnsupported<WriteDPPSZ>;
+defm : SKLWriteResPair<WriteFSign, [SKLPort0], 1>; // Floating point fabs/fchs.
+defm : SKLWriteResPair<WriteFRnd, [SKLPort01], 8, [2], 2, 6>; // Floating point rounding.
+defm : SKLWriteResPair<WriteFRndY, [SKLPort01], 8, [2], 2, 7>;
+defm : X86WriteResPairUnsupported<WriteFRndZ>;
+defm : SKLWriteResPair<WriteFLogic, [SKLPort015], 1, [1], 1, 6>; // Floating point and/or/xor logicals.
+defm : SKLWriteResPair<WriteFLogicY, [SKLPort015], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFLogicZ>;
+defm : SKLWriteResPair<WriteFTest, [SKLPort0], 2, [1], 1, 6>; // Floating point TEST instructions.
+defm : SKLWriteResPair<WriteFTestY, [SKLPort0], 2, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFTestZ>;
+defm : SKLWriteResPair<WriteFShuffle, [SKLPort5], 1, [1], 1, 6>; // Floating point vector shuffles.
+defm : SKLWriteResPair<WriteFShuffleY, [SKLPort5], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
+defm : SKLWriteResPair<WriteFVarShuffle, [SKLPort5], 1, [1], 1, 6>; // Floating point vector shuffles.
+defm : SKLWriteResPair<WriteFVarShuffleY, [SKLPort5], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
+defm : SKLWriteResPair<WriteFBlend, [SKLPort015], 1, [1], 1, 6>; // Floating point vector blends.
+defm : SKLWriteResPair<WriteFBlendY, [SKLPort015], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+defm : SKLWriteResPair<WriteFVarBlend, [SKLPort015], 2, [2], 2, 6>; // Fp vector variable blends.
+defm : SKLWriteResPair<WriteFVarBlendY,[SKLPort015], 2, [2], 2, 7>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
+
+// FMA Scheduling helper class.
+// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
+
+// Vector integer operations.
+defm : X86WriteRes<WriteVecLoad, [SKLPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [SKLPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [SKLPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT, [SKLPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [SKLPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [SKLPort23,SKLPort015], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [SKLPort23,SKLPort015], 8, [1,1], 2>;
+defm : X86WriteRes<WriteVecStore, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreX, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreY, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNT, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNTY, [SKLPort237,SKLPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore32, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore32Y, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore64, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore64Y, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMove, [SKLPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [SKLPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [SKLPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveToGpr, [SKLPort0], 2, [1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [SKLPort5], 1, [1], 1>;
+
+defm : SKLWriteResPair<WriteVecALU, [SKLPort05], 1, [1], 1, 5>; // Vector integer ALU op, no logicals.
+defm : SKLWriteResPair<WriteVecALUX, [SKLPort01], 1, [1], 1, 6>;
+defm : SKLWriteResPair<WriteVecALUY, [SKLPort01], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
+defm : SKLWriteResPair<WriteVecLogic, [SKLPort05], 1, [1], 1, 5>; // Vector integer and/or/xor.
+defm : SKLWriteResPair<WriteVecLogicX,[SKLPort015], 1, [1], 1, 6>;
+defm : SKLWriteResPair<WriteVecLogicY,[SKLPort015], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
+defm : SKLWriteResPair<WriteVecTest, [SKLPort0,SKLPort5], 3, [1,1], 2, 6>; // Vector integer TEST instructions.
+defm : SKLWriteResPair<WriteVecTestY, [SKLPort0,SKLPort5], 3, [1,1], 2, 7>;
+defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+defm : SKLWriteResPair<WriteVecIMul, [SKLPort0] , 5, [1], 1, 5>; // Vector integer multiply.
+defm : SKLWriteResPair<WriteVecIMulX, [SKLPort01], 5, [1], 1, 6>;
+defm : SKLWriteResPair<WriteVecIMulY, [SKLPort01], 5, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
+defm : SKLWriteResPair<WritePMULLD, [SKLPort01], 10, [2], 2, 6>; // Vector PMULLD.
+defm : SKLWriteResPair<WritePMULLDY, [SKLPort01], 10, [2], 2, 7>;
+defm : X86WriteResPairUnsupported<WritePMULLDZ>;
+defm : SKLWriteResPair<WriteShuffle, [SKLPort5], 1, [1], 1, 5>; // Vector shuffles.
+defm : SKLWriteResPair<WriteShuffleX, [SKLPort5], 1, [1], 1, 6>;
+defm : SKLWriteResPair<WriteShuffleY, [SKLPort5], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteShuffleZ>;
+defm : SKLWriteResPair<WriteVarShuffle, [SKLPort5], 1, [1], 1, 5>; // Vector shuffles.
+defm : SKLWriteResPair<WriteVarShuffleX, [SKLPort5], 1, [1], 1, 6>;
+defm : SKLWriteResPair<WriteVarShuffleY, [SKLPort5], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+defm : SKLWriteResPair<WriteBlend, [SKLPort5], 1, [1], 1, 6>; // Vector blends.
+defm : SKLWriteResPair<WriteBlendY, [SKLPort5], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteBlendZ>;
+defm : SKLWriteResPair<WriteVarBlend, [SKLPort015], 2, [2], 2, 6>; // Vector variable blends.
+defm : SKLWriteResPair<WriteVarBlendY, [SKLPort015], 2, [2], 2, 6>;
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
+defm : SKLWriteResPair<WriteMPSAD, [SKLPort5], 4, [2], 2, 6>; // Vector MPSAD.
+defm : SKLWriteResPair<WriteMPSADY, [SKLPort5], 4, [2], 2, 7>;
+defm : X86WriteResPairUnsupported<WriteMPSADZ>;
+defm : SKLWriteResPair<WritePSADBW, [SKLPort5], 3, [1], 1, 5>; // Vector PSADBW.
+defm : SKLWriteResPair<WritePSADBWX, [SKLPort5], 3, [1], 1, 6>;
+defm : SKLWriteResPair<WritePSADBWY, [SKLPort5], 3, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WritePSADBWZ>;
+defm : SKLWriteResPair<WritePHMINPOS, [SKLPort01], 4, [1], 1, 6>; // Vector PHMINPOS.
+
+// Vector integer shifts.
+defm : SKLWriteResPair<WriteVecShift, [SKLPort0], 1, [1], 1, 5>;
+defm : X86WriteRes<WriteVecShiftX, [SKLPort5,SKLPort01], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftY, [SKLPort5,SKLPort01], 4, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftXLd, [SKLPort01,SKLPort23], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftYLd, [SKLPort01,SKLPort23], 8, [1,1], 2>;
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
+
+defm : SKLWriteResPair<WriteVecShiftImm, [SKLPort0], 1, [1], 1, 5>; // Vector integer immediate shifts.
+defm : SKLWriteResPair<WriteVecShiftImmX, [SKLPort01], 1, [1], 1, 6>;
+defm : SKLWriteResPair<WriteVecShiftImmY, [SKLPort01], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+defm : SKLWriteResPair<WriteVarVecShift, [SKLPort01], 1, [1], 1, 6>; // Variable vector shifts.
+defm : SKLWriteResPair<WriteVarVecShiftY, [SKLPort01], 1, [1], 1, 7>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+
+// Vector insert/extract operations.
+def : WriteRes<WriteVecInsert, [SKLPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteVecInsertLd, [SKLPort5,SKLPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def: InstRW<[WriteVecInsertLd], (instregex "(V?)MOV(H|L)(PD|PS)rm")>;
+
+def : WriteRes<WriteVecExtract, [SKLPort0,SKLPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def : WriteRes<WriteVecExtractSt, [SKLPort4,SKLPort5,SKLPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+// Conversion between integer and float.
+defm : SKLWriteResPair<WriteCvtSS2I, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPS2I, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPS2IY, [SKLPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+defm : SKLWriteResPair<WriteCvtSD2I, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPD2I, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPD2IY, [SKLPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+
+defm : SKLWriteResPair<WriteCvtI2SS, [SKLPort1], 4>;
+defm : SKLWriteResPair<WriteCvtI2PS, [SKLPort1], 4>;
+defm : SKLWriteResPair<WriteCvtI2PSY, [SKLPort1], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+defm : SKLWriteResPair<WriteCvtI2SD, [SKLPort1], 4>;
+defm : SKLWriteResPair<WriteCvtI2PD, [SKLPort1], 4>;
+defm : SKLWriteResPair<WriteCvtI2PDY, [SKLPort1], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+
+defm : SKLWriteResPair<WriteCvtSS2SD, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPS2PD, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPS2PDY, [SKLPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
+defm : SKLWriteResPair<WriteCvtSD2SS, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPD2PS, [SKLPort1], 3>;
+defm : SKLWriteResPair<WriteCvtPD2PSY, [SKLPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
+
+defm : X86WriteRes<WriteCvtPH2PS, [SKLPort5,SKLPort015], 5, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSY, [SKLPort5,SKLPort01], 7, [1,1], 2>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZ>;
+defm : X86WriteRes<WriteCvtPH2PSLd, [SKLPort23,SKLPort01], 9, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSYLd, [SKLPort23,SKLPort01], 10, [1,1], 2>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZLd>;
+
+defm : X86WriteRes<WriteCvtPS2PH, [SKLPort5,SKLPort015], 5, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHY, [SKLPort5,SKLPort01], 7, [1,1], 2>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
+defm : X86WriteRes<WriteCvtPS2PHSt, [SKLPort4,SKLPort5,SKLPort237,SKLPort01], 6, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteCvtPS2PHYSt, [SKLPort4,SKLPort5,SKLPort237,SKLPort01], 8, [1,1,1,1], 4>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
+
+// String instructions.
+
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [SKLPort0]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrMLd, [SKLPort0, SKLPort23]> {
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [SKLPort0, SKLPort5, SKLPort015, SKLPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
+}
+def : WriteRes<WritePCmpEStrMLd, [SKLPort0, SKLPort5,SKLPort23, SKLPort015, SKLPort0156]> {
+ let Latency = 25;
+ let NumMicroOps = 10;
+ let ResourceCycles = [4,3,1,1,1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [SKLPort0]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrILd, [SKLPort0, SKLPort23]> {
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [SKLPort0, SKLPort5, SKLPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [4,3,1];
+}
+def : WriteRes<WritePCmpEStrILd, [SKLPort0, SKLPort5, SKLPort23, SKLPort0156]> {
+ let Latency = 24;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
+}
+
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [SKLPort0]> { let Latency = 2; }
+def : WriteRes<WriteVecMOVMSK, [SKLPort0]> { let Latency = 2; }
+def : WriteRes<WriteVecMOVMSKY, [SKLPort0]> { let Latency = 2; }
+def : WriteRes<WriteMMXMOVMSK, [SKLPort0]> { let Latency = 2; }
+
+// AES instructions.
+def : WriteRes<WriteAESDecEnc, [SKLPort0]> { // Decryption, encryption.
+ let Latency = 4;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def : WriteRes<WriteAESDecEncLd, [SKLPort0, SKLPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+
+def : WriteRes<WriteAESIMC, [SKLPort0]> { // InvMixColumn.
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESIMCLd, [SKLPort0, SKLPort23]> {
+ let Latency = 14;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+
+def : WriteRes<WriteAESKeyGen, [SKLPort0, SKLPort5, SKLPort015]> { // Key Generation.
+ let Latency = 20;
+ let NumMicroOps = 11;
+ let ResourceCycles = [3,6,2];
+}
+def : WriteRes<WriteAESKeyGenLd, [SKLPort0, SKLPort5, SKLPort23, SKLPort015]> {
+ let Latency = 25;
+ let NumMicroOps = 11;
+ let ResourceCycles = [3,6,1,1];
+}
+
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [SKLPort5]> {
+ let Latency = 6;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def : WriteRes<WriteCLMulLd, [SKLPort5, SKLPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+
+// Catch-all for expensive system instructions.
+def : WriteRes<WriteSystem, [SKLPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite;
+
+// AVX2.
+defm : SKLWriteResPair<WriteFShuffle256, [SKLPort5], 3, [1], 1, 7>; // Fp 256-bit width vector shuffles.
+defm : SKLWriteResPair<WriteFVarShuffle256, [SKLPort5], 3, [1], 1, 7>; // Fp 256-bit width vector variable shuffles.
+defm : SKLWriteResPair<WriteShuffle256, [SKLPort5], 3, [1], 1, 7>; // 256-bit width vector shuffles.
+defm : SKLWriteResPair<WriteVarShuffle256, [SKLPort5], 3, [1], 1, 7>; // 256-bit width vector variable shuffles.
+
+// Old microcoded instructions that nobody uses.
+def : WriteRes<WriteMicrocoded, [SKLPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite;
+
+// Fence instructions.
+def : WriteRes<WriteFence, [SKLPort23, SKLPort4]>;
+
+// Load/store MXCSR.
+def : WriteRes<WriteLDMXCSR, [SKLPort0,SKLPort23,SKLPort0156]> { let Latency = 7; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+def : WriteRes<WriteSTMXCSR, [SKLPort4,SKLPort5,SKLPort237]> { let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+
+// Nop, not very useful except that it provides a model for nops!
+def : WriteRes<WriteNop, []>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : SKLWriteResPair<WriteFHAdd, [SKLPort5,SKLPort01], 6, [2,1], 3, 6>;
+defm : SKLWriteResPair<WriteFHAddY, [SKLPort5,SKLPort01], 6, [2,1], 3, 7>;
+defm : SKLWriteResPair<WritePHAdd, [SKLPort5,SKLPort05], 3, [2,1], 3, 5>;
+defm : SKLWriteResPair<WritePHAddX, [SKLPort5,SKLPort015], 3, [2,1], 3, 6>;
+defm : SKLWriteResPair<WritePHAddY, [SKLPort5,SKLPort015], 3, [2,1], 3, 7>;
+
+// Remaining instrs.
+
+def SKLWriteResGroup1 : SchedWriteRes<[SKLPort0]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDS(B|W)irr",
+ "MMX_PADDUS(B|W)irr",
+ "MMX_PAVG(B|W)irr",
+ "MMX_PCMPEQ(B|D|W)irr",
+ "MMX_PCMPGT(B|D|W)irr",
+ "MMX_P(MAX|MIN)SWirr",
+ "MMX_P(MAX|MIN)UBirr",
+ "MMX_PSUBS(B|W)irr",
+ "MMX_PSUBUS(B|W)irr")>;
+
+def SKLWriteResGroup3 : SchedWriteRes<[SKLPort5]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup3], (instregex "COM(P?)_FST0r",
+ "UCOM_F(P?)r")>;
+
+def SKLWriteResGroup4 : SchedWriteRes<[SKLPort6]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup4], (instregex "JMP(16|32|64)r")>;
+
+def SKLWriteResGroup6 : SchedWriteRes<[SKLPort05]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup6], (instrs FINCSTP, FNOP)>;
+
+def SKLWriteResGroup7 : SchedWriteRes<[SKLPort06]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup7], (instrs CDQ, CQO, CLAC, STAC)>;
+
+def SKLWriteResGroup8 : SchedWriteRes<[SKLPort15]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup8], (instregex "ANDN(32|64)rr")>;
+
+def SKLWriteResGroup9 : SchedWriteRes<[SKLPort015]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup9], (instregex "(V?)PADD(B|D|Q|W)(Y?)rr",
+ "VPBLENDD(Y?)rri")>;
+
+def SKLWriteResGroup10 : SchedWriteRes<[SKLPort0156]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup10], (instrs CBW, CWDE, CDQE,
+ CMC, STC,
+ SGDT64m,
+ SIDT64m,
+ SMSW16m,
+ STRm,
+ SYSCALL)>;
+
+def SKLWriteResGroup11 : SchedWriteRes<[SKLPort4,SKLPort237]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup11], (instrs FBSTPm, VMPTRSTm)>;
+def: InstRW<[SKLWriteResGroup11], (instregex "ST_FP(32|64|80)m")>;
+
+def SKLWriteResGroup13 : SchedWriteRes<[SKLPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKLWriteResGroup13], (instrs MMX_MOVQ2DQrr)>;
+
+def SKLWriteResGroup14 : SchedWriteRes<[SKLPort05]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKLWriteResGroup14], (instrs FDECSTP,
+ MMX_MOVDQ2Qrr)>;
+
+def SKLWriteResGroup17 : SchedWriteRes<[SKLPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKLWriteResGroup17], (instrs LFENCE,
+ WAIT,
+ XGETBV)>;
+
+def SKLWriteResGroup20 : SchedWriteRes<[SKLPort6,SKLPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup20], (instregex "CLFLUSH")>;
+
+def SKLWriteResGroup21 : SchedWriteRes<[SKLPort237,SKLPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup21], (instrs SFENCE)>;
+
+def SKLWriteResGroup23 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup23], (instrs CWD,
+ JCXZ, JECXZ, JRCXZ,
+ ADC8i8, SBB8i8,
+ ADC16i16, SBB16i16,
+ ADC32i32, SBB32i32,
+ ADC64i32, SBB64i32)>;
+
+def SKLWriteResGroup25 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup25], (instrs FNSTCW16m)>;
+
+def SKLWriteResGroup27 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort15]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup27], (instregex "MOVBE(16|32|64)mr")>;
+
+def SKLWriteResGroup28 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup28], (instrs PUSH16r, PUSH32r, PUSH64r, PUSH64i8,
+ STOSB, STOSL, STOSQ, STOSW)>;
+def: InstRW<[SKLWriteResGroup28], (instregex "PUSH(16|32|64)rmr")>;
+
+def SKLWriteResGroup29 : SchedWriteRes<[SKLPort1]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup29], (instregex "PDEP(32|64)rr",
+ "PEXT(32|64)rr")>;
+
+def SKLWriteResGroup30 : SchedWriteRes<[SKLPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup30], (instregex "(ADD|SUB|SUBR)_(FPrST0|FST0r|FrST0)",
+ "VPBROADCAST(B|W)rr")>;
+
+def SKLWriteResGroup32 : SchedWriteRes<[SKLPort0,SKLPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup32], (instrs FNSTSW16r)>;
+
+def SKLWriteResGroup35 : SchedWriteRes<[SKLPort0,SKLPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKLWriteResGroup35], (instregex "MMX_PH(ADD|SUB)SWrr")>;
+
+def SKLWriteResGroup36 : SchedWriteRes<[SKLPort5,SKLPort01]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKLWriteResGroup36], (instregex "(V?)PHADDSW(Y?)rr",
+ "(V?)PHSUBSW(Y?)rr")>;
+
+def SKLWriteResGroup39 : SchedWriteRes<[SKLPort5,SKLPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKLWriteResGroup39], (instrs MMX_PACKSSDWirr,
+ MMX_PACKSSWBirr,
+ MMX_PACKUSWBirr)>;
+
+def SKLWriteResGroup40 : SchedWriteRes<[SKLPort6,SKLPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKLWriteResGroup40], (instregex "CLD")>;
+
+def SKLWriteResGroup41 : SchedWriteRes<[SKLPort237,SKLPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKLWriteResGroup41], (instrs MFENCE)>;
+
+def SKLWriteResGroup42 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKLWriteResGroup42], (instregex "RCL(8|16|32|64)r(1|i)",
+ "RCR(8|16|32|64)r(1|i)")>;
+
+def SKLWriteResGroup43 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort237]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup43], (instrs FNSTSWm)>;
+
+def SKLWriteResGroup45 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort237,SKLPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKLWriteResGroup45], (instregex "CALL(16|32|64)r")>;
+
+def SKLWriteResGroup46 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06,SKLPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKLWriteResGroup46], (instrs CALL64pcrel32)>;
+
+def SKLWriteResGroup47 : SchedWriteRes<[SKLPort0]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup47], (instregex "MUL_(FPrST0|FST0r|FrST0)")>;
+
+def SKLWriteResGroup48 : SchedWriteRes<[SKLPort01]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup48], (instregex "(V?)CVTDQ2PS(Y?)rr",
+ "(V?)CVT(T?)PS2DQ(Y?)rr")>;
+
+def SKLWriteResGroup53 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup53], (instregex "IST(T?)_FP(16|32|64)m",
+ "IST_F(16|32)m")>;
+
+def SKLWriteResGroup54 : SchedWriteRes<[SKLPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [4];
+}
+def: InstRW<[SKLWriteResGroup54], (instrs FNCLEX)>;
+
+def SKLWriteResGroup55 : SchedWriteRes<[SKLPort6,SKLPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[SKLWriteResGroup55], (instrs PAUSE)>;
+
+def SKLWriteResGroup56 : SchedWriteRes<[]> {
+ let Latency = 0;
+ let NumMicroOps = 4;
+ let ResourceCycles = [];
+}
+def: InstRW<[SKLWriteResGroup56], (instrs VZEROUPPER)>;
+
+def SKLWriteResGroup57 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
+}
+def: InstRW<[SKLWriteResGroup57], (instregex "LAR(16|32|64)rr")>;
+
+def SKLWriteResGroup58 : SchedWriteRes<[SKLPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup58], (instregex "MOVSX(16|32|64)rm(8|16|32)",
+ "MOVZX(16|32|64)rm(8|16)",
+ "(V?)MOVDDUPrm")>; // TODO: Should this be SKLWriteResGroup67?
+
+def SKLWriteResGroup59 : SchedWriteRes<[SKLPort0,SKLPort5]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup59], (instrs MMX_CVTPI2PDirr,
+ CVTDQ2PDrr,
+ VCVTDQ2PDrr)>;
+
+def SKLWriteResGroup60 : SchedWriteRes<[SKLPort5,SKLPort015]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup60], (instregex "MMX_CVT(T?)PD2PIirr",
+ "MMX_CVT(T?)PS2PIirr",
+ "(V?)CVT(T?)PD2DQrr",
+ "(V?)CVTPD2PSrr",
+ "(V?)CVTPS2PDrr",
+ "(V?)CVTSD2SSrr",
+ "(V?)CVTSI642SDrr",
+ "(V?)CVTSI2SDrr",
+ "(V?)CVTSI2SSrr",
+ "(V?)CVTSS2SDrr")>;
+
+def SKLWriteResGroup61 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort06]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup61], (instregex "STR(16|32|64)r")>;
+
+def SKLWriteResGroup63 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,4];
+}
+def: InstRW<[SKLWriteResGroup63], (instrs XSETBV)>;
+
+def SKLWriteResGroup65 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,4];
+}
+def: InstRW<[SKLWriteResGroup65], (instregex "PUSHF(16|64)")>;
+
+def SKLWriteResGroup67 : SchedWriteRes<[SKLPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup67], (instrs VBROADCASTSSrm,
+ VPBROADCASTDrm,
+ VPBROADCASTQrm)>;
+def: InstRW<[SKLWriteResGroup67], (instregex "(V?)MOVSHDUPrm",
+ "(V?)MOVSLDUPrm")>;
+
+def SKLWriteResGroup68 : SchedWriteRes<[SKLPort0]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKLWriteResGroup68], (instrs MMX_CVTPI2PSirr)>;
+
+def SKLWriteResGroup69 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup69], (instrs MMX_PADDSBirm,
+ MMX_PADDSWirm,
+ MMX_PADDUSBirm,
+ MMX_PADDUSWirm,
+ MMX_PAVGBirm,
+ MMX_PAVGWirm,
+ MMX_PCMPEQBirm,
+ MMX_PCMPEQDirm,
+ MMX_PCMPEQWirm,
+ MMX_PCMPGTBirm,
+ MMX_PCMPGTDirm,
+ MMX_PCMPGTWirm,
+ MMX_PMAXSWirm,
+ MMX_PMAXUBirm,
+ MMX_PMINSWirm,
+ MMX_PMINUBirm,
+ MMX_PSUBSBirm,
+ MMX_PSUBSWirm,
+ MMX_PSUBUSBirm,
+ MMX_PSUBUSWirm)>;
+
+def SKLWriteResGroup70 : SchedWriteRes<[SKLPort0,SKLPort01]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup70], (instregex "(V?)CVTSS2SI(64)?rr",
+ "(V?)CVT(T?)SD2SI(64)?rr")>;
+
+def SKLWriteResGroup72 : SchedWriteRes<[SKLPort6,SKLPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup72], (instrs FARJMP64m)>;
+def: InstRW<[SKLWriteResGroup72], (instregex "JMP(16|32|64)m")>;
+
+def SKLWriteResGroup75 : SchedWriteRes<[SKLPort23,SKLPort15]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup75], (instregex "ANDN(32|64)rm",
+ "MOVBE(16|32|64)rm")>;
+
+def SKLWriteResGroup76 : SchedWriteRes<[SKLPort23,SKLPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup76], (instrs POP16r, POP32r, POP64r)>;
+def: InstRW<[SKLWriteResGroup76], (instregex "POP(16|32|64)rmr")>;
+
+def SKLWriteResGroup78 : SchedWriteRes<[SKLPort5,SKLPort01]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKLWriteResGroup78], (instregex "(V?)CVTSI642SSrr")>;
+
+def SKLWriteResGroup80 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort06,SKLPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKLWriteResGroup80], (instregex "SLDT(16|32|64)r")>;
+
+def SKLWriteResGroup82 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKLWriteResGroup82], (instregex "SAR(8|16|32|64)m(1|i)",
+ "SHL(8|16|32|64)m(1|i)",
+ "SHR(8|16|32|64)m(1|i)")>;
+
+def SKLWriteResGroup83 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKLWriteResGroup83], (instregex "POP(16|32|64)rmm",
+ "PUSH(16|32|64)rmm")>;
+
+def SKLWriteResGroup84 : SchedWriteRes<[SKLPort6,SKLPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,5];
+}
+def: InstRW<[SKLWriteResGroup84], (instrs STD)>;
+
+def SKLWriteResGroup85 : SchedWriteRes<[SKLPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup85], (instregex "LD_F(32|64|80)m")>;
+def: InstRW<[SKLWriteResGroup85], (instrs VBROADCASTF128,
+ VBROADCASTI128,
+ VBROADCASTSDYrm,
+ VBROADCASTSSYrm,
+ VMOVDDUPYrm,
+ VMOVSHDUPYrm,
+ VMOVSLDUPYrm,
+ VPBROADCASTDYrm,
+ VPBROADCASTQYrm)>;
+
+def SKLWriteResGroup86 : SchedWriteRes<[SKLPort0,SKLPort5]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup86], (instrs VCVTDQ2PDYrr)>;
+
+def SKLWriteResGroup88 : SchedWriteRes<[SKLPort5,SKLPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup88], (instregex "(V?)PMOV(SX|ZX)BDrm",
+ "(V?)PMOV(SX|ZX)BQrm",
+ "(V?)PMOV(SX|ZX)BWrm",
+ "(V?)PMOV(SX|ZX)DQrm",
+ "(V?)PMOV(SX|ZX)WDrm",
+ "(V?)PMOV(SX|ZX)WQrm")>;
+
+def SKLWriteResGroup89 : SchedWriteRes<[SKLPort5,SKLPort01]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup89], (instrs VCVTPD2PSYrr,
+ VCVTPS2PDYrr,
+ VCVTPD2DQYrr,
+ VCVTTPD2DQYrr)>;
+
+def SKLWriteResGroup91 : SchedWriteRes<[SKLPort23,SKLPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup91], (instrs VINSERTF128rm,
+ VINSERTI128rm,
+ VPBLENDDrmi)>;
+def: InstRW<[SKLWriteResGroup91, ReadAfterVecXLd],
+ (instregex "(V?)PADD(B|D|Q|W)rm",
+ "(V?)PSUB(B|D|Q|W)rm")>;
+
+def SKLWriteResGroup92 : SchedWriteRes<[SKLPort5,SKLPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKLWriteResGroup92], (instrs MMX_PACKSSDWirm,
+ MMX_PACKSSWBirm,
+ MMX_PACKUSWBirm)>;
+
+def SKLWriteResGroup94 : SchedWriteRes<[SKLPort23,SKLPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKLWriteResGroup94], (instrs LEAVE, LEAVE64,
+ SCASB, SCASL, SCASQ, SCASW)>;
+
+def SKLWriteResGroup95 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort01]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup95], (instregex "(V?)CVTTSS2SI(64)?rr")>;
+
+def SKLWriteResGroup96 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort05]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup96], (instrs FLDCW16m)>;
+
+def SKLWriteResGroup98 : SchedWriteRes<[SKLPort6,SKLPort23,SKLPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup98], (instrs LRETQ, RETQ)>;
+
+def SKLWriteResGroup100 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[SKLWriteResGroup100], (instregex "ROL(8|16|32|64)m(1|i)",
+ "ROR(8|16|32|64)m(1|i)")>;
+
+def SKLWriteResGroup100_1 : SchedWriteRes<[SKLPort06]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKLWriteResGroup100_1], (instrs ROL8r1, ROL16r1, ROL32r1, ROL64r1,
+ ROR8r1, ROR16r1, ROR32r1, ROR64r1)>;
+
+def SKLWriteResGroup101 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[SKLWriteResGroup101], (instregex "XADD(8|16|32|64)rm")>;
+
+def SKLWriteResGroup102 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,1,1];
+}
+def: InstRW<[SKLWriteResGroup102], (instregex "CALL(16|32|64)m")>;
+def: InstRW<[SKLWriteResGroup102], (instrs FARCALL64m)>;
+
+def SKLWriteResGroup103 : SchedWriteRes<[SKLPort6,SKLPort06,SKLPort15,SKLPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 7;
+ let ResourceCycles = [1,3,1,2];
+}
+def: InstRW<[SKLWriteResGroup103], (instrs LOOP)>;
+
+def SKLWriteResGroup107 : SchedWriteRes<[SKLPort1,SKLPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup107], (instregex "PDEP(32|64)rm",
+ "PEXT(32|64)rm")>;
+
+def SKLWriteResGroup108 : SchedWriteRes<[SKLPort5,SKLPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup108], (instregex "FCOM(P?)(32|64)m")>;
+def: InstRW<[SKLWriteResGroup108], (instrs VPBROADCASTBYrm,
+ VPBROADCASTWYrm,
+ VPMOVSXBDYrm,
+ VPMOVSXBQYrm,
+ VPMOVSXWQYrm)>;
+
+def SKLWriteResGroup110 : SchedWriteRes<[SKLPort23,SKLPort015]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup110], (instrs VPBLENDDYrmi)>;
+def: InstRW<[SKLWriteResGroup110, ReadAfterVecYLd],
+ (instregex "VPADD(B|D|Q|W)Yrm",
+ "VPSUB(B|D|Q|W)Yrm")>;
+
+def SKLWriteResGroup112 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SKLWriteResGroup112], (instregex "MMX_PH(ADD|SUB)SWrm")>;
+
+def SKLWriteResGroup116 : SchedWriteRes<[SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[SKLWriteResGroup116], (instregex "RCL(8|16|32|64)m(1|i)",
+ "RCR(8|16|32|64)m(1|i)")>;
+
+def SKLWriteResGroup117 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> {
+ let Latency = 8;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,3];
+}
+def: InstRW<[SKLWriteResGroup117], (instregex "ROL(8|16|32|64)mCL",
+ "ROR(8|16|32|64)mCL",
+ "SAR(8|16|32|64)mCL",
+ "SHL(8|16|32|64)mCL",
+ "SHR(8|16|32|64)mCL")>;
+
+def SKLWriteResGroup119 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,2,1];
+}
+def: SchedAlias<WriteADCRMW, SKLWriteResGroup119>;
+
+def SKLWriteResGroup120 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup120], (instrs MMX_CVTPI2PSirm)>;
+
+def SKLWriteResGroup121 : SchedWriteRes<[SKLPort5,SKLPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup121], (instrs PCMPGTQrm,
+ VPCMPGTQrm,
+ VPMOVSXBWYrm,
+ VPMOVSXDQYrm,
+ VPMOVSXWDYrm,
+ VPMOVZXWDYrm)>;
+
+def SKLWriteResGroup123 : SchedWriteRes<[SKLPort23,SKLPort01]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup123], (instregex "MMX_CVT(T?)PS2PIirm",
+ "(V?)CVTPS2PDrm")>;
+
+def SKLWriteResGroup128 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKLWriteResGroup128], (instregex "(V?)PHADDSWrm",
+ "(V?)PHSUBSWrm")>;
+
+def SKLWriteResGroup131 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort23,SKLPort0156]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[SKLWriteResGroup131], (instregex "LAR(16|32|64)rm",
+ "LSL(16|32|64)rm")>;
+
+def SKLWriteResGroup133 : SchedWriteRes<[SKLPort5,SKLPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup133], (instregex "(ADD|SUB|SUBR)_F(32|64)m",
+ "ILD_F(16|32|64)m")>;
+def: InstRW<[SKLWriteResGroup133], (instrs VPCMPGTQYrm)>;
+
+def SKLWriteResGroup134 : SchedWriteRes<[SKLPort01,SKLPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup134], (instregex "(V?)CVTDQ2PSrm",
+ "(V?)CVTPS2DQrm",
+ "(V?)CVTSS2SDrm",
+ "(V?)CVTTPS2DQrm")>;
+
+def SKLWriteResGroup138 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup138], (instrs MMX_CVTPI2PDirm)>;
+
+def SKLWriteResGroup139 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort01]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup139], (instregex "(V?)CVTSD2SSrm")>;
+
+def SKLWriteResGroup140 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKLWriteResGroup140], (instrs VPHADDSWYrm,
+ VPHSUBSWYrm)>;
+
+def SKLWriteResGroup143 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
+ let Latency = 10;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,1,1,3];
+}
+def: InstRW<[SKLWriteResGroup143], (instregex "XCHG(8|16|32|64)rm")>;
+
+def SKLWriteResGroup145 : SchedWriteRes<[SKLPort0,SKLFPDivider]> {
+ let Latency = 11;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1,3];
+}
+def : SchedAlias<WriteFDivX, SKLWriteResGroup145>; // TODO - convert to ZnWriteResFpuPair
+
+def SKLWriteResGroup146 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup146], (instregex "MUL_F(32|64)m")>;
+
+def SKLWriteResGroup147 : SchedWriteRes<[SKLPort01,SKLPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup147], (instrs VCVTDQ2PSYrm,
+ VCVTPS2PDYrm,
+ VCVTPS2DQYrm,
+ VCVTTPS2DQYrm)>;
+
+def SKLWriteResGroup149 : SchedWriteRes<[SKLPort5,SKLPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKLWriteResGroup149], (instregex "FICOM(P?)(16|32)m")>;
+
+def SKLWriteResGroup150 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup150], (instregex "(V?)CVTDQ2PDrm")>;
+
+def SKLWriteResGroup151 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort01]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup151], (instregex "(V?)CVTSS2SI64rm",
+ "(V?)CVT(T?)SD2SI(64)?rm",
+ "VCVTTSS2SI64rm",
+ "(V?)CVT(T?)SS2SIrm")>;
+
+def SKLWriteResGroup152 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort01]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup152], (instrs CVTPD2PSrm,
+ CVTPD2DQrm,
+ CVTTPD2DQrm,
+ MMX_CVTPD2PIirm,
+ MMX_CVTTPD2PIirm)>;
+
+def SKLWriteResGroup154 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 7;
+ let ResourceCycles = [2,3,2];
+}
+def: InstRW<[SKLWriteResGroup154], (instregex "RCL(16|32|64)rCL",
+ "RCR(16|32|64)rCL")>;
+
+def SKLWriteResGroup155 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort15,SKLPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 9;
+ let ResourceCycles = [1,5,1,2];
+}
+def: InstRW<[SKLWriteResGroup155], (instrs RCL8rCL)>;
+
+def SKLWriteResGroup156 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,9];
+}
+def: InstRW<[SKLWriteResGroup156], (instrs LOOPE, LOOPNE)>;
+
+def SKLWriteResGroup160 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort01]> {
+ let Latency = 12;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKLWriteResGroup160], (instregex "CVTTSS2SI64rm")>;
+
+def SKLWriteResGroup162 : SchedWriteRes<[SKLPort5,SKLPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKLWriteResGroup162], (instregex "(ADD|SUB|SUBR)_FI(16|32)m")>;
+
+def SKLWriteResGroup163 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup163], (instrs VCVTDQ2PDYrm)>;
+
+def SKLWriteResGroup166 : SchedWriteRes<[SKLPort0,SKLFPDivider]> {
+ let Latency = 14;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1,3];
+}
+def : SchedAlias<WriteFDiv64, SKLWriteResGroup166>; // TODO - convert to ZnWriteResFpuPair
+def : SchedAlias<WriteFDiv64X, SKLWriteResGroup166>; // TODO - convert to ZnWriteResFpuPair
+
+def SKLWriteResGroup166_1 : SchedWriteRes<[SKLPort0,SKLFPDivider]> {
+ let Latency = 14;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1,5];
+}
+def : SchedAlias<WriteFDiv64Y, SKLWriteResGroup166_1>; // TODO - convert to ZnWriteResFpuPair
+
+def SKLWriteResGroup169 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
+ let Latency = 14;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup169], (instregex "MUL_FI(16|32)m")>;
+
+def SKLWriteResGroup170 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort15,SKLPort0156]> {
+ let Latency = 14;
+ let NumMicroOps = 10;
+ let ResourceCycles = [2,4,1,3];
+}
+def: InstRW<[SKLWriteResGroup170], (instrs RCR8rCL)>;
+
+def SKLWriteResGroup171 : SchedWriteRes<[SKLPort0]> {
+ let Latency = 15;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup171], (instregex "DIVR_(FPrST0|FST0r|FrST0)")>;
+
+def SKLWriteResGroup174 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort15,SKLPort0156]> {
+ let Latency = 15;
+ let NumMicroOps = 10;
+ let ResourceCycles = [1,1,1,5,1,1];
+}
+def: InstRW<[SKLWriteResGroup174], (instregex "RCL(8|16|32|64)mCL")>;
+
+def SKLWriteResGroup177 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06,SKLPort15,SKLPort0156]> {
+ let Latency = 16;
+ let NumMicroOps = 14;
+ let ResourceCycles = [1,1,1,4,2,5];
+}
+def: InstRW<[SKLWriteResGroup177], (instrs CMPXCHG8B)>;
+
+def SKLWriteResGroup178 : SchedWriteRes<[SKLPort0156]> {
+ let Latency = 16;
+ let NumMicroOps = 16;
+ let ResourceCycles = [16];
+}
+def: InstRW<[SKLWriteResGroup178], (instrs VZEROALL)>;
+
+def SKLWriteResGroup179 : SchedWriteRes<[SKLPort0,SKLPort23,SKLFPDivider]> {
+ let Latency = 17;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1,5];
+}
+def : SchedAlias<WriteFDivXLd, SKLWriteResGroup179>; // TODO - convert to ZnWriteResFpuPair
+
+def SKLWriteResGroup180 : SchedWriteRes<[SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort05,SKLPort0156]> {
+ let Latency = 17;
+ let NumMicroOps = 15;
+ let ResourceCycles = [2,1,2,4,2,4];
+}
+def: InstRW<[SKLWriteResGroup180], (instrs XCH_F)>;
+
+def SKLWriteResGroup184 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort06,SKLPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,5];
+}
+def: InstRW<[SKLWriteResGroup184], (instrs CPUID, RDTSC)>;
+
+def SKLWriteResGroup185 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort15,SKLPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,1,1,4,1,2];
+}
+def: InstRW<[SKLWriteResGroup185], (instregex "RCR(8|16|32|64)mCL")>;
+
+def SKLWriteResGroup186 : SchedWriteRes<[SKLPort0,SKLPort23,SKLFPDivider]> {
+ let Latency = 19;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1,4];
+}
+def : SchedAlias<WriteFDiv64Ld, SKLWriteResGroup186>; // TODO - convert to ZnWriteResFpuPair
+
+def SKLWriteResGroup189 : SchedWriteRes<[SKLPort0]> {
+ let Latency = 20;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup189], (instregex "DIV_(FPrST0|FST0r|FrST0)")>;
+
+def SKLWriteResGroup190 : SchedWriteRes<[SKLPort0,SKLPort23,SKLFPDivider]> {
+ let Latency = 20;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1,4];
+}
+def : SchedAlias<WriteFDiv64XLd, SKLWriteResGroup190>; // TODO - convert to ZnWriteResFpuPair
+
+def SKLWriteResGroup192 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort6,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
+ let Latency = 20;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,1,1,1,2];
+}
+def: InstRW<[SKLWriteResGroup192], (instrs INSB, INSL, INSW)>;
+
+def SKLWriteResGroup193 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort0156]> {
+ let Latency = 20;
+ let NumMicroOps = 10;
+ let ResourceCycles = [1,2,7];
+}
+def: InstRW<[SKLWriteResGroup193], (instrs MWAITrr)>;
+
+def SKLWriteResGroup195 : SchedWriteRes<[SKLPort0,SKLPort23,SKLFPDivider]> {
+ let Latency = 21;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1,8];
+}
+def : SchedAlias<WriteFDiv64YLd, SKLWriteResGroup195>; // TODO - convert to ZnWriteResFpuPair
+
+def SKLWriteResGroup196 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 22;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup196], (instregex "DIV_F(32|64)m")>;
+
+def SKLWriteResGroupVEX2 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> {
+ let Latency = 18;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[SKLWriteResGroupVEX2], (instrs VGATHERDPDrm, VPGATHERDQrm,
+ VGATHERQPDrm, VPGATHERQQrm,
+ VGATHERQPSrm, VPGATHERQDrm)>;
+
+def SKLWriteResGroupVEX4 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> {
+ let Latency = 20;
+  let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,4,1,1];
+}
+def: InstRW<[SKLWriteResGroupVEX4], (instrs VGATHERDPDYrm, VPGATHERDQYrm,
+ VGATHERDPSrm, VPGATHERDDrm,
+ VGATHERQPDYrm, VPGATHERQQYrm,
+ VGATHERQPSYrm, VPGATHERQDYrm)>;
+
+def SKLWriteResGroupVEX8 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> {
+ let Latency = 22;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,8,1,1];
+}
+def: InstRW<[SKLWriteResGroupVEX8], (instrs VGATHERDPSYrm, VPGATHERDDYrm)>;
+
+def SKLWriteResGroup198 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort5,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
+ let Latency = 23;
+ let NumMicroOps = 19;
+ let ResourceCycles = [2,1,4,1,1,4,6];
+}
+def: InstRW<[SKLWriteResGroup198], (instrs CMPXCHG16B)>;
+
+def SKLWriteResGroup202 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
+ let Latency = 25;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup202], (instregex "DIV_FI(16|32)m")>;
+
+def SKLWriteResGroup206 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 27;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup206], (instregex "DIVR_F(32|64)m")>;
+
+def SKLWriteResGroup208 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
+ let Latency = 30;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup208], (instregex "DIVR_FI(16|32)m")>;
+
+def SKLWriteResGroup209 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort23,SKLPort06,SKLPort0156]> {
+ let Latency = 35;
+ let NumMicroOps = 23;
+ let ResourceCycles = [1,5,3,4,10];
+}
+def: InstRW<[SKLWriteResGroup209], (instregex "IN(8|16|32)ri",
+ "IN(8|16|32)rr")>;
+
+def SKLWriteResGroup210 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
+ let Latency = 35;
+ let NumMicroOps = 23;
+ let ResourceCycles = [1,5,2,1,4,10];
+}
+def: InstRW<[SKLWriteResGroup210], (instregex "OUT(8|16|32)ir",
+ "OUT(8|16|32)rr")>;
+
+def SKLWriteResGroup211 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort23,SKLPort0156]> {
+ let Latency = 37;
+ let NumMicroOps = 31;
+ let ResourceCycles = [1,8,1,21];
+}
+def: InstRW<[SKLWriteResGroup211], (instregex "XRSTOR(64)?")>;
+
+def SKLWriteResGroup212 : SchedWriteRes<[SKLPort1,SKLPort4,SKLPort5,SKLPort6,SKLPort23,SKLPort237,SKLPort15,SKLPort0156]> {
+ let Latency = 40;
+ let NumMicroOps = 18;
+ let ResourceCycles = [1,1,2,3,1,1,1,8];
+}
+def: InstRW<[SKLWriteResGroup212], (instrs VMCLEARm)>;
+
+def SKLWriteResGroup213 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> {
+ let Latency = 41;
+ let NumMicroOps = 39;
+ let ResourceCycles = [1,10,1,1,26];
+}
+def: InstRW<[SKLWriteResGroup213], (instrs XSAVE64)>;
+
+def SKLWriteResGroup214 : SchedWriteRes<[SKLPort5,SKLPort0156]> {
+ let Latency = 42;
+ let NumMicroOps = 22;
+ let ResourceCycles = [2,20];
+}
+def: InstRW<[SKLWriteResGroup214], (instrs RDTSCP)>;
+
+def SKLWriteResGroup215 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> {
+ let Latency = 42;
+ let NumMicroOps = 40;
+ let ResourceCycles = [1,11,1,1,26];
+}
+def: InstRW<[SKLWriteResGroup215], (instrs XSAVE)>;
+def: InstRW<[SKLWriteResGroup215], (instregex "XSAVEC", "XSAVES")>;
+
+def SKLWriteResGroup216 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> {
+ let Latency = 46;
+ let NumMicroOps = 44;
+ let ResourceCycles = [1,11,1,1,30];
+}
+def: InstRW<[SKLWriteResGroup216], (instregex "XSAVEOPT")>;
+
+def SKLWriteResGroup217 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort05,SKLPort06,SKLPort0156]> {
+ let Latency = 62;
+ let NumMicroOps = 64;
+ let ResourceCycles = [2,8,5,10,39];
+}
+def: InstRW<[SKLWriteResGroup217], (instrs FLDENVm)>;
+
+def SKLWriteResGroup218 : SchedWriteRes<[SKLPort0,SKLPort6,SKLPort23,SKLPort05,SKLPort06,SKLPort15,SKLPort0156]> {
+ let Latency = 63;
+ let NumMicroOps = 88;
+ let ResourceCycles = [4,4,31,1,2,1,45];
+}
+def: InstRW<[SKLWriteResGroup218], (instrs FXRSTOR64)>;
+
+def SKLWriteResGroup219 : SchedWriteRes<[SKLPort0,SKLPort6,SKLPort23,SKLPort05,SKLPort06,SKLPort15,SKLPort0156]> {
+ let Latency = 63;
+ let NumMicroOps = 90;
+ let ResourceCycles = [4,2,33,1,2,1,47];
+}
+def: InstRW<[SKLWriteResGroup219], (instrs FXRSTOR)>;
+
+def SKLWriteResGroup220 : SchedWriteRes<[SKLPort5,SKLPort05,SKLPort0156]> {
+ let Latency = 75;
+ let NumMicroOps = 15;
+ let ResourceCycles = [6,3,6];
+}
+def: InstRW<[SKLWriteResGroup220], (instrs FNINIT)>;
+
+def SKLWriteResGroup223 : SchedWriteRes<[SKLPort0,SKLPort1,SKLPort4,SKLPort5,SKLPort6,SKLPort237,SKLPort06,SKLPort0156]> {
+ let Latency = 106;
+ let NumMicroOps = 100;
+ let ResourceCycles = [9,1,11,16,1,11,21,30];
+}
+def: InstRW<[SKLWriteResGroup223], (instrs FSTENVm)>;
+
+def: InstRW<[WriteZero], (instrs CLC)>;
+
+
+// Instruction variants handled by the renamer. These might not need execution
+// ports under certain conditions.
+// See Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
+// section "Skylake Pipeline" > "Register allocation and renaming".
+// These can be investigated with llvm-exegesis, e.g.
+// echo 'pxor %mm0, %mm0' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+// echo 'vxorpd %xmm0, %xmm0, %xmm1' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
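+// For example, `xorl %eax, %eax` (XOR32rr with identical source and
+// destination registers) is a zeroing idiom: it matches ZeroIdiomPredicate
+// below and is given zero latency, while an XOR of two distinct registers
+// falls back to the plain WriteALU cost.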
+
+def SKLWriteZeroLatency : SchedWriteRes<[]> {
+ let Latency = 0;
+}
+
+def SKLWriteZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteALU]>
+]>;
+def : InstRW<[SKLWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
+ XOR32rr, XOR64rr)>;
+
+def SKLWriteFZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogic]>
+]>;
+def : InstRW<[SKLWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr,
+ VXORPDrr)>;
+
+def SKLWriteFZeroIdiomY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogicY]>
+]>;
+def : InstRW<[SKLWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr)>;
+
+def SKLWriteVZeroIdiomLogicX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicX]>
+]>;
+def : InstRW<[SKLWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr)>;
+
+def SKLWriteVZeroIdiomLogicY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicY]>
+]>;
+def : InstRW<[SKLWriteVZeroIdiomLogicY], (instrs VPXORYrr)>;
+
+def SKLWriteVZeroIdiomALUX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUX]>
+]>;
+def : InstRW<[SKLWriteVZeroIdiomALUX], (instrs PCMPGTBrr, VPCMPGTBrr,
+ PCMPGTDrr, VPCMPGTDrr,
+ PCMPGTWrr, VPCMPGTWrr)>;
+
+def SKLWriteVZeroIdiomALUY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUY]>
+]>;
+def : InstRW<[SKLWriteVZeroIdiomALUY], (instrs VPCMPGTBYrr,
+ VPCMPGTDYrr,
+ VPCMPGTWYrr)>;
+
+def SKLWritePSUB : SchedWriteRes<[SKLPort015]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def SKLWriteVZeroIdiomPSUB : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [SKLWritePSUB]>
+]>;
+def : InstRW<[SKLWriteVZeroIdiomPSUB], (instrs PSUBBrr, VPSUBBrr,
+ PSUBDrr, VPSUBDrr,
+ PSUBQrr, VPSUBQrr,
+ PSUBWrr, VPSUBWrr,
+ VPSUBBYrr,
+ VPSUBDYrr,
+ VPSUBQYrr,
+ VPSUBWYrr)>;
+
+def SKLWritePCMPGTQ : SchedWriteRes<[SKLPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def SKLWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [SKLWritePCMPGTQ]>
+]>;
+def : InstRW<[SKLWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr,
+ VPCMPGTQYrr)>;
+
+
+// CMOVs that use both Z and C flag require an extra uop.
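+// That is CMOVA/CMOVNBE (taken when CF = 0 and ZF = 0) and CMOVBE/CMOVNA
+// (taken when CF = 1 or ZF = 1); the IsCMOVArr_Or_CMOVBErr and
+// IsCMOVArm_Or_CMOVBErm predicates below select exactly those condition
+// codes, and every other condition keeps the default WriteCMOV cost.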
+def SKLWriteCMOVA_CMOVBErr : SchedWriteRes<[SKLPort06]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def SKLWriteCMOVA_CMOVBErm : SchedWriteRes<[SKLPort23,SKLPort06]> {
+ let Latency = 7;
+ let ResourceCycles = [1,2];
+ let NumMicroOps = 3;
+}
+
+def SKLCMOVA_CMOVBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArr_Or_CMOVBErr>, [SKLWriteCMOVA_CMOVBErr]>,
+ SchedVar<NoSchedPred, [WriteCMOV]>
+]>;
+
+def SKLCMOVA_CMOVBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArm_Or_CMOVBErm>, [SKLWriteCMOVA_CMOVBErm]>,
+ SchedVar<NoSchedPred, [WriteCMOV.Folded]>
+]>;
+
+def : InstRW<[SKLCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>;
+def : InstRW<[SKLCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>;
+
+// SETCCs that use both Z and C flag require an extra uop.
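+// That is SETA/SETNBE and SETBE/SETNA, which read both CF and ZF. The
+// register form costs two port-06 uops; the store form below additionally
+// needs store-address and store-data uops on ports 2/3/7 and 4.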
+def SKLWriteSETA_SETBEr : SchedWriteRes<[SKLPort06]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def SKLWriteSETA_SETBEm : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06]> {
+ let Latency = 3;
+ let ResourceCycles = [1,1,2];
+ let NumMicroOps = 4;
+}
+
+def SKLSETA_SETBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAr_Or_SETBEr>, [SKLWriteSETA_SETBEr]>,
+ SchedVar<NoSchedPred, [WriteSETCC]>
+]>;
+
+def SKLSETA_SETBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAm_Or_SETBEm>, [SKLWriteSETA_SETBEm]>,
+ SchedVar<NoSchedPred, [WriteSETCCStore]>
+]>;
+
+def : InstRW<[SKLSETA_SETBErr], (instrs SETCCr)>;
+def : InstRW<[SKLSETA_SETBErm], (instrs SETCCm)>;
+
+} // SchedModel
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeServer.td
new file mode 100644
index 000000000000..7fc96d1eda89
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -0,0 +1,2618 @@
+//=- X86SchedSkylakeServer.td - X86 Skylake Server Scheduling -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Skylake Server to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def SkylakeServerModel : SchedMachineModel {
+  // All x86 instructions are modeled as a single micro-op, and Skylake can
+ // decode 6 instructions per cycle.
+ let IssueWidth = 6;
+ let MicroOpBufferSize = 224; // Based on the reorder buffer.
+ let LoadLatency = 5;
+ let MispredictPenalty = 14;
+
+ // Based on the LSD (loop-stream detector) queue size and benchmarking data.
+ let LoopMicroOpBufferSize = 50;
+
+ // This flag is set to allow the scheduler to assign a default model to
+ // unrecognized opcodes.
+ let CompleteModel = 0;
+}
+
+let SchedModel = SkylakeServerModel in {
+
+// Skylake Server can issue micro-ops to 8 different ports in one cycle.
+
+// Ports 0, 1, 5, and 6 handle all computation.
+// Port 4 gets the data half of stores. Store data can be available later than
+// the store address, but since we don't model the latency of stores, we can
+// ignore that.
+// Ports 2 and 3 are identical. They handle loads and the address half of
+// stores. Port 7 can handle address calculations.
+def SKXPort0 : ProcResource<1>;
+def SKXPort1 : ProcResource<1>;
+def SKXPort2 : ProcResource<1>;
+def SKXPort3 : ProcResource<1>;
+def SKXPort4 : ProcResource<1>;
+def SKXPort5 : ProcResource<1>;
+def SKXPort6 : ProcResource<1>;
+def SKXPort7 : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def SKXPort01 : ProcResGroup<[SKXPort0, SKXPort1]>;
+def SKXPort23 : ProcResGroup<[SKXPort2, SKXPort3]>;
+def SKXPort237 : ProcResGroup<[SKXPort2, SKXPort3, SKXPort7]>;
+def SKXPort04 : ProcResGroup<[SKXPort0, SKXPort4]>;
+def SKXPort05 : ProcResGroup<[SKXPort0, SKXPort5]>;
+def SKXPort06 : ProcResGroup<[SKXPort0, SKXPort6]>;
+def SKXPort15 : ProcResGroup<[SKXPort1, SKXPort5]>;
+def SKXPort16 : ProcResGroup<[SKXPort1, SKXPort6]>;
+def SKXPort56 : ProcResGroup<[SKXPort5, SKXPort6]>;
+def SKXPort015 : ProcResGroup<[SKXPort0, SKXPort1, SKXPort5]>;
+def SKXPort056 : ProcResGroup<[SKXPort0, SKXPort5, SKXPort6]>;
+def SKXPort0156: ProcResGroup<[SKXPort0, SKXPort1, SKXPort5, SKXPort6]>;
+
+def SKXDivider : ProcResource<1>; // Integer division issued on port 0.
+// FP division and sqrt on port 0.
+def SKXFPDivider : ProcResource<1>;
+
+// 60 Entry Unified Scheduler
+def SKXPortAny : ProcResGroup<[SKXPort0, SKXPort1, SKXPort2, SKXPort3, SKXPort4,
+ SKXPort5, SKXPort6, SKXPort7]> {
+ let BufferSize=60;
+}
+
+// Integer loads are 5 cycles, so ReadAfterLd registers needn't be available until 5
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 5>;
+
+// Vector loads are 5/6/7 cycles, so ReadAfterVec*Ld registers needn't be available
+// until 5/6/7 cycles after the memory operand.
+def : ReadAdvance<ReadAfterVecLd, 5>;
+def : ReadAdvance<ReadAfterVecXLd, 6>;
+def : ReadAdvance<ReadAfterVecYLd, 7>;
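+// In other words, an instruction with a folded load does not stall on a
+// register source that becomes ready within the ReadAdvance window, since the
+// load portion takes at least that many cycles anyway.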
+
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass SKXWriteResPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1,
+ int LoadLat = 5> {
+  // Register variant uses a single cycle on ExePorts.
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+
+ // Memory variant also uses a cycle on port 2/3 and adds LoadLat cycles to
+ // the latency (default = 5).
+ def : WriteRes<SchedRW.Folded, !listconcat([SKXPort23], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = !add(UOps, 1);
+ }
+}
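+
+// For illustration only (not an extra definition): a pair such as
+//   defm : SKXWriteResPair<WriteALU, [SKXPort0156], 1>;
+// expands, with the default Res/UOps/LoadLat arguments, to roughly:
+//   def : WriteRes<WriteALU, [SKXPort0156]> {
+//     let Latency = 1; let ResourceCycles = [1]; let NumMicroOps = 1;
+//   }
+//   def : WriteRes<WriteALU.Folded, [SKXPort23, SKXPort0156]> {
+//     let Latency = 6; let ResourceCycles = [1,1]; let NumMicroOps = 2;
+//   }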
+
+// A folded store needs a cycle on port 4 for the store data, and an extra port
+// 2/3/7 cycle to recompute the address.
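+// For example, a memory-destination ALU op such as `addl %eax, (%rcx)` pays
+// this store-address + store-data cost on top of its load and ALU uops.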
+def : WriteRes<WriteRMW, [SKXPort237,SKXPort4]>;
+
+// Arithmetic.
+defm : SKXWriteResPair<WriteALU, [SKXPort0156], 1>; // Simple integer ALU op.
+defm : SKXWriteResPair<WriteADC, [SKXPort06], 1>; // Integer ALU + flags op.
+
+// Integer multiplication.
+defm : SKXWriteResPair<WriteIMul8, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteIMul16, [SKXPort1,SKXPort06,SKXPort0156], 4, [1,1,2], 4>;
+defm : X86WriteRes<WriteIMul16Imm, [SKXPort1,SKXPort0156], 4, [1,1], 2>;
+defm : X86WriteRes<WriteIMul16ImmLd, [SKXPort1,SKXPort0156,SKXPort23], 8, [1,1,1], 3>;
+defm : X86WriteRes<WriteIMul16Reg, [SKXPort1], 3, [1], 1>;
+defm : X86WriteRes<WriteIMul16RegLd, [SKXPort1,SKXPort0156,SKXPort23], 8, [1,1,1], 3>;
+defm : SKXWriteResPair<WriteIMul32, [SKXPort1,SKXPort06,SKXPort0156], 4, [1,1,1], 3>;
+defm : SKXWriteResPair<WriteIMul32Imm, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteIMul32Reg, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteIMul64, [SKXPort1,SKXPort5], 4, [1,1], 2>;
+defm : SKXWriteResPair<WriteIMul64Imm, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteIMul64Reg, [SKXPort1], 3>;
+def : WriteRes<WriteIMulH, []> { let Latency = 3; }
+
+defm : X86WriteRes<WriteBSWAP32, [SKXPort15], 1, [1], 1>;
+defm : X86WriteRes<WriteBSWAP64, [SKXPort06, SKXPort15], 2, [1,1], 2>;
+defm : X86WriteRes<WriteCMPXCHG,[SKXPort06, SKXPort0156], 5, [2,3], 5>;
+defm : X86WriteRes<WriteCMPXCHGRMW,[SKXPort23,SKXPort06,SKXPort0156,SKXPort237,SKXPort4], 8, [1,2,1,1,1], 6>;
+defm : X86WriteRes<WriteXCHG, [SKXPort0156], 2, [3], 3>;
+
+// TODO: Why isn't the SKXDivider used?
+defm : SKXWriteResPair<WriteDiv8, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>;
+defm : X86WriteRes<WriteDiv16, [SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort05,SKXPort0156], 76, [7,2,8,3,1,11], 32>;
+defm : X86WriteRes<WriteDiv32, [SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort05,SKXPort0156], 76, [7,2,8,3,1,11], 32>;
+defm : X86WriteRes<WriteDiv64, [SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort05,SKXPort0156], 76, [7,2,8,3,1,11], 32>;
+defm : X86WriteRes<WriteDiv16Ld, [SKXPort0,SKXPort23,SKXDivider], 29, [1,1,10], 2>;
+defm : X86WriteRes<WriteDiv32Ld, [SKXPort0,SKXPort23,SKXDivider], 29, [1,1,10], 2>;
+defm : X86WriteRes<WriteDiv64Ld, [SKXPort0,SKXPort23,SKXDivider], 29, [1,1,10], 2>;
+
+defm : X86WriteRes<WriteIDiv8, [SKXPort0, SKXDivider], 25, [1,10], 1>;
+defm : X86WriteRes<WriteIDiv16, [SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort06,SKXPort0156], 102, [4,2,4,8,14,34], 66>;
+defm : X86WriteRes<WriteIDiv32, [SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort06,SKXPort0156], 102, [4,2,4,8,14,34], 66>;
+defm : X86WriteRes<WriteIDiv64, [SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort06,SKXPort0156], 102, [4,2,4,8,14,34], 66>;
+defm : X86WriteRes<WriteIDiv8Ld, [SKXPort0,SKXPort5,SKXPort23,SKXPort0156], 28, [2,4,1,1], 8>;
+defm : X86WriteRes<WriteIDiv16Ld, [SKXPort0,SKXPort5,SKXPort23,SKXPort0156], 28, [2,4,1,1], 8>;
+defm : X86WriteRes<WriteIDiv32Ld, [SKXPort0,SKXPort5,SKXPort23,SKXPort0156], 28, [2,4,1,1], 8>;
+defm : X86WriteRes<WriteIDiv64Ld, [SKXPort0,SKXPort5,SKXPort23,SKXPort0156], 28, [2,4,1,1], 8>;
+
+defm : SKXWriteResPair<WriteCRC32, [SKXPort1], 3>;
+
+def : WriteRes<WriteLEA, [SKXPort15]>; // LEA instructions can't fold loads.
+
+defm : SKXWriteResPair<WriteCMOV, [SKXPort06], 1, [1], 1>; // Conditional move.
+defm : X86WriteRes<WriteFCMOV, [SKXPort1], 3, [1], 1>; // x87 conditional move.
+def : WriteRes<WriteSETCC, [SKXPort06]>; // Setcc.
+def : WriteRes<WriteSETCCStore, [SKXPort06,SKXPort4,SKXPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+defm : X86WriteRes<WriteLAHFSAHF, [SKXPort06], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTest, [SKXPort06], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTestImmLd, [SKXPort06,SKXPort23], 6, [1,1], 2>;
+defm : X86WriteRes<WriteBitTestRegLd, [SKXPort0156,SKXPort23], 6, [1,1], 2>;
+defm : X86WriteRes<WriteBitTestSet, [SKXPort06], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTestSetImmLd, [SKXPort06,SKXPort23], 5, [1,1], 3>;
+defm : X86WriteRes<WriteBitTestSetRegLd, [SKXPort0156,SKXPort23], 5, [1,1], 2>;
+
+// Integer shifts and rotates.
+defm : SKXWriteResPair<WriteShift, [SKXPort06], 1>;
+defm : SKXWriteResPair<WriteShiftCL, [SKXPort06], 3, [3], 3>;
+defm : SKXWriteResPair<WriteRotate, [SKXPort06], 1, [1], 1>;
+defm : SKXWriteResPair<WriteRotateCL, [SKXPort06], 3, [3], 3>;
+
+// SHLD/SHRD.
+defm : X86WriteRes<WriteSHDrri, [SKXPort1], 3, [1], 1>;
+defm : X86WriteRes<WriteSHDrrcl,[SKXPort1,SKXPort06,SKXPort0156], 6, [1, 2, 1], 4>;
+defm : X86WriteRes<WriteSHDmri, [SKXPort1,SKXPort23,SKXPort237,SKXPort0156], 9, [1, 1, 1, 1], 4>;
+defm : X86WriteRes<WriteSHDmrcl,[SKXPort1,SKXPort23,SKXPort237,SKXPort06,SKXPort0156], 11, [1, 1, 1, 2, 1], 6>;
+
+// Bit counts.
+defm : SKXWriteResPair<WriteBSF, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteBSR, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteLZCNT, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteTZCNT, [SKXPort1], 3>;
+defm : SKXWriteResPair<WritePOPCNT, [SKXPort1], 3>;
+
+// BMI1 BEXTR/BLS, BMI2 BZHI
+defm : SKXWriteResPair<WriteBEXTR, [SKXPort06,SKXPort15], 2, [1,1], 2>;
+defm : SKXWriteResPair<WriteBLS, [SKXPort15], 1>;
+defm : SKXWriteResPair<WriteBZHI, [SKXPort15], 1>;
+
+// Loads, stores, and moves, not folded with other operations.
+defm : X86WriteRes<WriteLoad, [SKXPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteStore, [SKXPort237, SKXPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteStoreNT, [SKXPort237, SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteMove, [SKXPort0156], 1, [1], 1>;
+
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+def : WriteRes<WriteZero, []>;
+
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
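+// For example, `jmp *(%rax)` is an indirect branch whose load is folded into
+// the jump.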
+defm : SKXWriteResPair<WriteJump, [SKXPort06], 1>;
+
+// Floating point. This covers both scalar and vector operations.
+defm : X86WriteRes<WriteFLD0, [SKXPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteFLD1, [SKXPort05], 1, [2], 2>;
+defm : X86WriteRes<WriteFLDC, [SKXPort05], 1, [2], 2>;
+defm : X86WriteRes<WriteFLoad, [SKXPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteFLoadX, [SKXPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [SKXPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteFMaskedLoad, [SKXPort23,SKXPort015], 7, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedLoadY, [SKXPort23,SKXPort015], 8, [1,1], 2>;
+defm : X86WriteRes<WriteFStore, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreX, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreY, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNT, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTX, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTY, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+
+defm : X86WriteRes<WriteFMaskedStore32, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore64, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+
+defm : X86WriteRes<WriteFMove, [SKXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveX, [SKXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveY, [SKXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteEMMS, [SKXPort05,SKXPort0156], 10, [9,1], 10>;
+
+defm : SKXWriteResPair<WriteFAdd, [SKXPort01], 4, [1], 1, 5>; // Floating point add/sub.
+defm : SKXWriteResPair<WriteFAddX, [SKXPort01], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFAddY, [SKXPort01], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFAddZ, [SKXPort05], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFAdd64, [SKXPort01], 4, [1], 1, 5>; // Floating point double add/sub.
+defm : SKXWriteResPair<WriteFAdd64X, [SKXPort01], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFAdd64Y, [SKXPort01], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFAdd64Z, [SKXPort05], 4, [1], 1, 7>;
+
+defm : SKXWriteResPair<WriteFCmp, [SKXPort01], 4, [1], 1, 5>; // Floating point compare.
+defm : SKXWriteResPair<WriteFCmpX, [SKXPort01], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFCmpY, [SKXPort01], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFCmpZ, [SKXPort05], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFCmp64, [SKXPort01], 4, [1], 1, 5>; // Floating point double compare.
+defm : SKXWriteResPair<WriteFCmp64X, [SKXPort01], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFCmp64Y, [SKXPort01], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFCmp64Z, [SKXPort05], 4, [1], 1, 7>;
+
+defm : SKXWriteResPair<WriteFCom, [SKXPort0], 2>; // Floating point compare to flags (X87).
+defm : SKXWriteResPair<WriteFComX, [SKXPort0], 2>; // Floating point compare to flags (SSE).
+
+defm : SKXWriteResPair<WriteFMul, [SKXPort01], 4, [1], 1, 5>; // Floating point multiplication.
+defm : SKXWriteResPair<WriteFMulX, [SKXPort01], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFMulY, [SKXPort01], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFMulZ, [SKXPort05], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFMul64, [SKXPort01], 4, [1], 1, 5>; // Floating point double multiplication.
+defm : SKXWriteResPair<WriteFMul64X, [SKXPort01], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFMul64Y, [SKXPort01], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFMul64Z, [SKXPort05], 4, [1], 1, 7>;
+
+defm : SKXWriteResPair<WriteFDiv, [SKXPort0,SKXFPDivider], 11, [1,3], 1, 5>; // 10-14 cycles. // Floating point division.
+//defm : SKXWriteResPair<WriteFDivX, [SKXPort0,SKXFPDivider], 11, [1,3], 1, 6>; // 10-14 cycles.
+defm : SKXWriteResPair<WriteFDivY, [SKXPort0,SKXFPDivider], 11, [1,5], 1, 7>; // 10-14 cycles.
+defm : SKXWriteResPair<WriteFDivZ, [SKXPort0,SKXPort5,SKXFPDivider], 18, [2,1,10], 3, 7>; // 10-14 cycles.
+//defm : SKXWriteResPair<WriteFDiv64, [SKXPort0,SKXFPDivider], 14, [1,3], 1, 5>; // 10-14 cycles. // Floating point division.
+//defm : SKXWriteResPair<WriteFDiv64X, [SKXPort0,SKXFPDivider], 14, [1,3], 1, 6>; // 10-14 cycles.
+//defm : SKXWriteResPair<WriteFDiv64Y, [SKXPort0,SKXFPDivider], 14, [1,5], 1, 7>; // 10-14 cycles.
+defm : SKXWriteResPair<WriteFDiv64Z, [SKXPort0,SKXPort5,SKXFPDivider], 23, [2,1,16], 3, 7>; // 10-14 cycles.
+
+defm : SKXWriteResPair<WriteFSqrt, [SKXPort0,SKXFPDivider], 12, [1,3], 1, 5>; // Floating point square root.
+defm : SKXWriteResPair<WriteFSqrtX, [SKXPort0,SKXFPDivider], 12, [1,3], 1, 6>;
+defm : SKXWriteResPair<WriteFSqrtY, [SKXPort0,SKXFPDivider], 12, [1,6], 1, 7>;
+defm : SKXWriteResPair<WriteFSqrtZ, [SKXPort0,SKXPort5,SKXFPDivider], 20, [2,1,12], 3, 7>;
+defm : SKXWriteResPair<WriteFSqrt64, [SKXPort0,SKXFPDivider], 18, [1,6], 1, 5>; // Floating point double square root.
+defm : SKXWriteResPair<WriteFSqrt64X, [SKXPort0,SKXFPDivider], 18, [1,6], 1, 6>;
+defm : SKXWriteResPair<WriteFSqrt64Y, [SKXPort0,SKXFPDivider], 18, [1,12],1, 7>;
+defm : SKXWriteResPair<WriteFSqrt64Z, [SKXPort0,SKXPort5,SKXFPDivider], 32, [2,1,24], 3, 7>;
+defm : SKXWriteResPair<WriteFSqrt80, [SKXPort0,SKXFPDivider], 21, [1,7]>; // Floating point long double square root.
+
+defm : SKXWriteResPair<WriteFRcp, [SKXPort0], 4, [1], 1, 5>; // Floating point reciprocal estimate.
+defm : SKXWriteResPair<WriteFRcpX, [SKXPort0], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFRcpY, [SKXPort0], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFRcpZ, [SKXPort0,SKXPort5], 4, [2,1], 3, 7>;
+
+defm : SKXWriteResPair<WriteFRsqrt, [SKXPort0], 4, [1], 1, 5>; // Floating point reciprocal square root estimate.
+defm : SKXWriteResPair<WriteFRsqrtX,[SKXPort0], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFRsqrtY,[SKXPort0], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFRsqrtZ,[SKXPort0,SKXPort5], 9, [2,1], 3, 7>;
+
+defm : SKXWriteResPair<WriteFMA, [SKXPort01], 4, [1], 1, 5>; // Fused Multiply Add.
+defm : SKXWriteResPair<WriteFMAX, [SKXPort01], 4, [1], 1, 6>;
+defm : SKXWriteResPair<WriteFMAY, [SKXPort01], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFMAZ, [SKXPort05], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteDPPD, [SKXPort5,SKXPort015], 9, [1,2], 3, 6>; // Floating point double dot product.
+defm : SKXWriteResPair<WriteDPPS, [SKXPort5,SKXPort015], 13, [1,3], 4, 6>;
+defm : SKXWriteResPair<WriteDPPSY,[SKXPort5,SKXPort015], 13, [1,3], 4, 7>;
+defm : SKXWriteResPair<WriteDPPSZ,[SKXPort5,SKXPort015], 13, [1,3], 4, 7>;
+defm : SKXWriteResPair<WriteFSign, [SKXPort0], 1>; // Floating point fabs/fchs.
+defm : SKXWriteResPair<WriteFRnd, [SKXPort01], 8, [2], 2, 6>; // Floating point rounding.
+defm : SKXWriteResPair<WriteFRndY, [SKXPort01], 8, [2], 2, 7>;
+defm : SKXWriteResPair<WriteFRndZ, [SKXPort05], 8, [2], 2, 7>;
+defm : SKXWriteResPair<WriteFLogic, [SKXPort015], 1, [1], 1, 6>; // Floating point and/or/xor logicals.
+defm : SKXWriteResPair<WriteFLogicY, [SKXPort015], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFLogicZ, [SKXPort05], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFTest, [SKXPort0], 2, [1], 1, 6>; // Floating point TEST instructions.
+defm : SKXWriteResPair<WriteFTestY, [SKXPort0], 2, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFTestZ, [SKXPort0], 2, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFShuffle, [SKXPort5], 1, [1], 1, 6>; // Floating point vector shuffles.
+defm : SKXWriteResPair<WriteFShuffleY, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFShuffleZ, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFVarShuffle, [SKXPort5], 1, [1], 1, 6>; // Floating point vector variable shuffles.
+defm : SKXWriteResPair<WriteFVarShuffleY, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFVarShuffleZ, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFBlend, [SKXPort015], 1, [1], 1, 6>; // Floating point vector blends.
+defm : SKXWriteResPair<WriteFBlendY,[SKXPort015], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFBlendZ,[SKXPort015], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteFVarBlend, [SKXPort015], 2, [2], 2, 6>; // Fp vector variable blends.
+defm : SKXWriteResPair<WriteFVarBlendY,[SKXPort015], 2, [2], 2, 7>;
+defm : SKXWriteResPair<WriteFVarBlendZ,[SKXPort015], 2, [2], 2, 7>;
+
+// FMA Scheduling helper class.
+// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
+
+// Vector integer operations.
+defm : X86WriteRes<WriteVecLoad, [SKXPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [SKXPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [SKXPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT, [SKXPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [SKXPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [SKXPort23,SKXPort015], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [SKXPort23,SKXPort015], 8, [1,1], 2>;
+defm : X86WriteRes<WriteVecStore, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreX, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreY, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNT, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNTY, [SKXPort237,SKXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore32, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore32Y, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore64, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore64Y, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMove, [SKXPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [SKXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [SKXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveToGpr, [SKXPort0], 2, [1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [SKXPort5], 1, [1], 1>;
+
+defm : SKXWriteResPair<WriteVecALU, [SKXPort05], 1, [1], 1, 5>; // Vector integer ALU op, no logicals.
+defm : SKXWriteResPair<WriteVecALUX, [SKXPort01], 1, [1], 1, 6>;
+defm : SKXWriteResPair<WriteVecALUY, [SKXPort01], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVecALUZ, [SKXPort0], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVecLogic, [SKXPort05], 1, [1], 1, 5>; // Vector integer and/or/xor.
+defm : SKXWriteResPair<WriteVecLogicX,[SKXPort015], 1, [1], 1, 6>;
+defm : SKXWriteResPair<WriteVecLogicY,[SKXPort015], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVecLogicZ,[SKXPort05], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVecTest, [SKXPort0,SKXPort5], 3, [1,1], 2, 6>; // Vector integer TEST instructions.
+defm : SKXWriteResPair<WriteVecTestY, [SKXPort0,SKXPort5], 3, [1,1], 2, 7>;
+defm : SKXWriteResPair<WriteVecTestZ, [SKXPort0,SKXPort5], 3, [1,1], 2, 7>;
+defm : SKXWriteResPair<WriteVecIMul, [SKXPort0], 5, [1], 1, 5>; // Vector integer multiply.
+defm : SKXWriteResPair<WriteVecIMulX, [SKXPort01], 5, [1], 1, 6>;
+defm : SKXWriteResPair<WriteVecIMulY, [SKXPort01], 5, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVecIMulZ, [SKXPort05], 5, [1], 1, 7>;
+defm : SKXWriteResPair<WritePMULLD, [SKXPort01], 10, [2], 2, 6>; // Vector PMULLD.
+defm : SKXWriteResPair<WritePMULLDY, [SKXPort01], 10, [2], 2, 7>;
+defm : SKXWriteResPair<WritePMULLDZ, [SKXPort05], 10, [2], 2, 7>;
+defm : SKXWriteResPair<WriteShuffle, [SKXPort5], 1, [1], 1, 5>; // Vector shuffles.
+defm : SKXWriteResPair<WriteShuffleX, [SKXPort5], 1, [1], 1, 6>;
+defm : SKXWriteResPair<WriteShuffleY, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteShuffleZ, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVarShuffle, [SKXPort5], 1, [1], 1, 5>; // Vector variable shuffles.
+defm : SKXWriteResPair<WriteVarShuffleX, [SKXPort5], 1, [1], 1, 6>;
+defm : SKXWriteResPair<WriteVarShuffleY, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVarShuffleZ, [SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteBlend, [SKXPort5], 1, [1], 1, 6>; // Vector blends.
+defm : SKXWriteResPair<WriteBlendY,[SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteBlendZ,[SKXPort5], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVarBlend, [SKXPort015], 2, [2], 2, 6>; // Vector variable blends.
+defm : SKXWriteResPair<WriteVarBlendY,[SKXPort015], 2, [2], 2, 6>;
+defm : SKXWriteResPair<WriteVarBlendZ,[SKXPort05], 2, [1], 1, 6>;
+defm : SKXWriteResPair<WriteMPSAD, [SKXPort5], 4, [2], 2, 6>; // Vector MPSAD.
+defm : SKXWriteResPair<WriteMPSADY, [SKXPort5], 4, [2], 2, 7>;
+defm : SKXWriteResPair<WriteMPSADZ, [SKXPort5], 4, [2], 2, 7>;
+defm : SKXWriteResPair<WritePSADBW, [SKXPort5], 3, [1], 1, 5>; // Vector PSADBW.
+defm : SKXWriteResPair<WritePSADBWX, [SKXPort5], 3, [1], 1, 6>;
+defm : SKXWriteResPair<WritePSADBWY, [SKXPort5], 3, [1], 1, 7>;
+defm : SKXWriteResPair<WritePSADBWZ, [SKXPort5], 3, [1], 1, 7>;
+defm : SKXWriteResPair<WritePHMINPOS, [SKXPort0], 4, [1], 1, 6>; // Vector PHMINPOS.
+
+// Vector integer shifts.
+defm : SKXWriteResPair<WriteVecShift, [SKXPort0], 1, [1], 1, 5>;
+defm : X86WriteRes<WriteVecShiftX, [SKXPort5,SKXPort01], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftY, [SKXPort5,SKXPort01], 4, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftZ, [SKXPort5,SKXPort0], 4, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftXLd, [SKXPort01,SKXPort23], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftYLd, [SKXPort01,SKXPort23], 8, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftZLd, [SKXPort0,SKXPort23], 8, [1,1], 2>;
+
+defm : SKXWriteResPair<WriteVecShiftImm, [SKXPort0], 1, [1], 1, 5>;
+defm : SKXWriteResPair<WriteVecShiftImmX, [SKXPort01], 1, [1], 1, 6>; // Vector integer immediate shifts.
+defm : SKXWriteResPair<WriteVecShiftImmY, [SKXPort01], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVecShiftImmZ, [SKXPort0], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVarVecShift, [SKXPort01], 1, [1], 1, 6>; // Variable vector shifts.
+defm : SKXWriteResPair<WriteVarVecShiftY, [SKXPort01], 1, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVarVecShiftZ, [SKXPort0], 1, [1], 1, 7>;
+
+// Vector insert/extract operations.
+def : WriteRes<WriteVecInsert, [SKXPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteVecInsertLd, [SKXPort5,SKXPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def: InstRW<[WriteVecInsertLd], (instregex "(V?)MOV(H|L)(PD|PS)rm")>;
+
+def : WriteRes<WriteVecExtract, [SKXPort0,SKXPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def : WriteRes<WriteVecExtractSt, [SKXPort4,SKXPort5,SKXPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+// Conversion between integer and float.
+defm : SKXWriteResPair<WriteCvtSS2I, [SKXPort01], 6, [2], 2>; // Needs more work: DD vs DQ.
+defm : SKXWriteResPair<WriteCvtPS2I, [SKXPort01], 3>;
+defm : SKXWriteResPair<WriteCvtPS2IY, [SKXPort01], 3>;
+defm : SKXWriteResPair<WriteCvtPS2IZ, [SKXPort05], 3>;
+defm : SKXWriteResPair<WriteCvtSD2I, [SKXPort01], 6, [2], 2>;
+defm : SKXWriteResPair<WriteCvtPD2I, [SKXPort01], 3>;
+defm : SKXWriteResPair<WriteCvtPD2IY, [SKXPort01], 3>;
+defm : SKXWriteResPair<WriteCvtPD2IZ, [SKXPort05], 3>;
+
+defm : SKXWriteResPair<WriteCvtI2SS, [SKXPort1], 4>;
+defm : SKXWriteResPair<WriteCvtI2PS, [SKXPort01], 4>;
+defm : SKXWriteResPair<WriteCvtI2PSY, [SKXPort01], 4>;
+defm : SKXWriteResPair<WriteCvtI2PSZ, [SKXPort05], 4>; // Needs more work: DD vs DQ.
+defm : SKXWriteResPair<WriteCvtI2SD, [SKXPort1], 4>;
+defm : SKXWriteResPair<WriteCvtI2PD, [SKXPort01], 4>;
+defm : SKXWriteResPair<WriteCvtI2PDY, [SKXPort01], 4>;
+defm : SKXWriteResPair<WriteCvtI2PDZ, [SKXPort05], 4>;
+
+defm : SKXWriteResPair<WriteCvtSS2SD, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteCvtPS2PD, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteCvtPS2PDY, [SKXPort5,SKXPort01], 3, [1,1], 2>;
+defm : SKXWriteResPair<WriteCvtPS2PDZ, [SKXPort05], 3, [2], 2>;
+defm : SKXWriteResPair<WriteCvtSD2SS, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteCvtPD2PS, [SKXPort1], 3>;
+defm : SKXWriteResPair<WriteCvtPD2PSY, [SKXPort5,SKXPort01], 3, [1,1], 2>;
+defm : SKXWriteResPair<WriteCvtPD2PSZ, [SKXPort05], 3, [2], 2>;
+
+defm : X86WriteRes<WriteCvtPH2PS, [SKXPort5,SKXPort01], 5, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSY, [SKXPort5,SKXPort01], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSZ, [SKXPort5,SKXPort0], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSLd, [SKXPort23,SKXPort01], 9, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSYLd, [SKXPort23,SKXPort01], 10, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSZLd, [SKXPort23,SKXPort05], 10, [1,1], 2>;
+
+defm : X86WriteRes<WriteCvtPS2PH, [SKXPort5,SKXPort01], 5, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHY, [SKXPort5,SKXPort01], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHZ, [SKXPort5,SKXPort05], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHSt, [SKXPort4,SKXPort5,SKXPort237,SKXPort01], 6, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteCvtPS2PHYSt, [SKXPort4,SKXPort5,SKXPort237,SKXPort01], 8, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteCvtPS2PHZSt, [SKXPort4,SKXPort5,SKXPort237,SKXPort05], 8, [1,1,1,1], 4>;
+
+// Strings instructions.
+
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [SKXPort0]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrMLd, [SKXPort0, SKXPort23]> {
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [SKXPort0, SKXPort5, SKXPort015, SKXPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
+}
+def : WriteRes<WritePCmpEStrMLd, [SKXPort0, SKXPort5, SKXPort23, SKXPort015, SKXPort0156]> {
+ let Latency = 25;
+ let NumMicroOps = 10;
+ let ResourceCycles = [4,3,1,1,1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [SKXPort0]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrILd, [SKXPort0, SKXPort23]> {
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [SKXPort0,SKXPort5,SKXPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [4,3,1];
+}
+def : WriteRes<WritePCmpEStrILd, [SKXPort0, SKXPort5, SKXPort23, SKXPort0156]> {
+ let Latency = 24;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
+}
+
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [SKXPort0]> { let Latency = 2; }
+def : WriteRes<WriteVecMOVMSK, [SKXPort0]> { let Latency = 2; }
+def : WriteRes<WriteVecMOVMSKY, [SKXPort0]> { let Latency = 2; }
+def : WriteRes<WriteMMXMOVMSK, [SKXPort0]> { let Latency = 2; }
+
+// AES instructions.
+def : WriteRes<WriteAESDecEnc, [SKXPort0]> { // Decryption, encryption.
+ let Latency = 4;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def : WriteRes<WriteAESDecEncLd, [SKXPort0, SKXPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+
+def : WriteRes<WriteAESIMC, [SKXPort0]> { // InvMixColumn.
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESIMCLd, [SKXPort0, SKXPort23]> {
+ let Latency = 14;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+
+def : WriteRes<WriteAESKeyGen, [SKXPort0,SKXPort5,SKXPort015]> { // Key Generation.
+ let Latency = 20;
+ let NumMicroOps = 11;
+ let ResourceCycles = [3,6,2];
+}
+def : WriteRes<WriteAESKeyGenLd, [SKXPort0,SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 25;
+ let NumMicroOps = 11;
+ let ResourceCycles = [3,6,1,1];
+}
+
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [SKXPort5]> {
+ let Latency = 6;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def : WriteRes<WriteCLMulLd, [SKXPort5, SKXPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+
+// Catch-all for expensive system instructions.
+def : WriteRes<WriteSystem, [SKXPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite;
+
+// AVX2.
+defm : SKXWriteResPair<WriteFShuffle256, [SKXPort5], 3, [1], 1, 7>; // Fp 256-bit width vector shuffles.
+defm : SKXWriteResPair<WriteFVarShuffle256, [SKXPort5], 3, [1], 1, 7>; // Fp 256-bit width vector variable shuffles.
+defm : SKXWriteResPair<WriteShuffle256, [SKXPort5], 3, [1], 1, 7>; // 256-bit width vector shuffles.
+defm : SKXWriteResPair<WriteVarShuffle256, [SKXPort5], 3, [1], 1, 7>; // 256-bit width vector variable shuffles.
+
+// Old microcoded instructions that nobody uses.
+def : WriteRes<WriteMicrocoded, [SKXPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite;
+
+// Fence instructions.
+def : WriteRes<WriteFence, [SKXPort23, SKXPort4]>;
+
+// Load/store MXCSR.
+def : WriteRes<WriteLDMXCSR, [SKXPort0,SKXPort23,SKXPort0156]> { let Latency = 7; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+def : WriteRes<WriteSTMXCSR, [SKXPort4,SKXPort5,SKXPort237]> { let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+
+// Nop, not very useful except that it provides a model for nops!
+def : WriteRes<WriteNop, []>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : SKXWriteResPair<WriteFHAdd, [SKXPort5,SKXPort015], 6, [2,1], 3, 6>;
+defm : SKXWriteResPair<WriteFHAddY, [SKXPort5,SKXPort015], 6, [2,1], 3, 7>;
+defm : SKXWriteResPair<WritePHAdd, [SKXPort5,SKXPort05], 3, [2,1], 3, 5>;
+defm : SKXWriteResPair<WritePHAddX, [SKXPort5,SKXPort015], 3, [2,1], 3, 6>;
+defm : SKXWriteResPair<WritePHAddY, [SKXPort5,SKXPort015], 3, [2,1], 3, 7>;
+
+// Remaining instrs.
+
+def SKXWriteResGroup1 : SchedWriteRes<[SKXPort0]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup1], (instregex "KAND(B|D|Q|W)rr",
+ "KANDN(B|D|Q|W)rr",
+ "KMOV(B|D|Q|W)kk",
+ "KNOT(B|D|Q|W)rr",
+ "KOR(B|D|Q|W)rr",
+ "KXNOR(B|D|Q|W)rr",
+ "KXOR(B|D|Q|W)rr",
+ "KSET0(B|D|Q|W)", // Same as KXOR
+ "KSET1(B|D|Q|W)", // Same as KXNOR
+ "MMX_PADDS(B|W)irr",
+ "MMX_PADDUS(B|W)irr",
+ "MMX_PAVG(B|W)irr",
+ "MMX_PCMPEQ(B|D|W)irr",
+ "MMX_PCMPGT(B|D|W)irr",
+ "MMX_P(MAX|MIN)SWirr",
+ "MMX_P(MAX|MIN)UBirr",
+ "MMX_PSUBS(B|W)irr",
+ "MMX_PSUBUS(B|W)irr",
+ "VPMOVB2M(Z|Z128|Z256)rr",
+ "VPMOVD2M(Z|Z128|Z256)rr",
+ "VPMOVQ2M(Z|Z128|Z256)rr",
+ "VPMOVW2M(Z|Z128|Z256)rr")>;
+
+def SKXWriteResGroup3 : SchedWriteRes<[SKXPort5]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup3], (instregex "COM(P?)_FST0r",
+ "KMOV(B|D|Q|W)kr",
+ "UCOM_F(P?)r")>;
+
+def SKXWriteResGroup4 : SchedWriteRes<[SKXPort6]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup4], (instregex "JMP(16|32|64)r")>;
+
+def SKXWriteResGroup6 : SchedWriteRes<[SKXPort05]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup6], (instrs FINCSTP, FNOP)>;
+
+def SKXWriteResGroup7 : SchedWriteRes<[SKXPort06]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup7], (instrs CDQ, CQO, CLAC, STAC)>;
+
+def SKXWriteResGroup8 : SchedWriteRes<[SKXPort15]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup8], (instregex "ANDN(32|64)rr")>;
+
+def SKXWriteResGroup9 : SchedWriteRes<[SKXPort015]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPD(Z128|Z256)rr",
+ "VBLENDMPS(Z128|Z256)rr",
+ "VPADD(B|D|Q|W)(Y|Z|Z128|Z256)rr",
+ "(V?)PADD(B|D|Q|W)rr",
+ "VPBLENDD(Y?)rri",
+ "VPBLENDMB(Z128|Z256)rr",
+ "VPBLENDMD(Z128|Z256)rr",
+ "VPBLENDMQ(Z128|Z256)rr",
+ "VPBLENDMW(Z128|Z256)rr",
+ "VPSUB(B|D|Q|W)(Y|Z|Z128|Z256)rrk",
+ "VPTERNLOGD(Z|Z128|Z256)rri",
+ "VPTERNLOGQ(Z|Z128|Z256)rri")>;
+
+def SKXWriteResGroup10 : SchedWriteRes<[SKXPort0156]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup10], (instrs CBW, CWDE, CDQE,
+ CMC, STC,
+ SGDT64m,
+ SIDT64m,
+ SMSW16m,
+ STRm,
+ SYSCALL)>;
+
+def SKXWriteResGroup11 : SchedWriteRes<[SKXPort4,SKXPort237]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup11], (instrs FBSTPm, VMPTRSTm)>;
+def: InstRW<[SKXWriteResGroup11], (instregex "KMOV(B|D|Q|W)mk",
+ "ST_FP(32|64|80)m")>;
+
+def SKXWriteResGroup13 : SchedWriteRes<[SKXPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup13], (instrs MMX_MOVQ2DQrr)>;
+
+def SKXWriteResGroup14 : SchedWriteRes<[SKXPort05]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup14], (instrs FDECSTP,
+ MMX_MOVDQ2Qrr)>;
+
+def SKXWriteResGroup17 : SchedWriteRes<[SKXPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup17], (instrs LFENCE,
+ WAIT,
+ XGETBV)>;
+
+def SKXWriteResGroup20 : SchedWriteRes<[SKXPort6,SKXPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup20], (instregex "CLFLUSH")>;
+
+def SKXWriteResGroup21 : SchedWriteRes<[SKXPort237,SKXPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup21], (instrs SFENCE)>;
+
+def SKXWriteResGroup23 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup23], (instrs CWD,
+ JCXZ, JECXZ, JRCXZ,
+ ADC8i8, SBB8i8,
+ ADC16i16, SBB16i16,
+ ADC32i32, SBB32i32,
+ ADC64i32, SBB64i32)>;
+
+def SKXWriteResGroup25 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup25], (instrs FNSTCW16m)>;
+
+def SKXWriteResGroup27 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort15]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup27], (instregex "MOVBE(16|32|64)mr")>;
+
+def SKXWriteResGroup28 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup28], (instrs PUSH16r, PUSH32r, PUSH64r, PUSH64i8,
+ STOSB, STOSL, STOSQ, STOSW)>;
+def: InstRW<[SKXWriteResGroup28], (instregex "PUSH(16|32|64)rmr")>;
+
+def SKXWriteResGroup29 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort15]> {
+ let Latency = 2;
+ let NumMicroOps = 5;
+ let ResourceCycles = [2,2,1];
+}
+def: InstRW<[SKXWriteResGroup29], (instregex "VMOVDQU8Zmr(b?)")>;
+
+def SKXWriteResGroup30 : SchedWriteRes<[SKXPort0]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup30], (instregex "KMOV(B|D|Q|W)rk",
+ "KORTEST(B|D|Q|W)rr",
+ "KTEST(B|D|Q|W)rr")>;
+
+def SKXWriteResGroup31 : SchedWriteRes<[SKXPort1]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup31], (instregex "PDEP(32|64)rr",
+ "PEXT(32|64)rr")>;
+
+def SKXWriteResGroup32 : SchedWriteRes<[SKXPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup32], (instrs VPSADBWZrr)>; // TODO: 512-bit ops require ports 0/1 to be joined.
+def: InstRW<[SKXWriteResGroup32], (instregex "(ADD|SUB|SUBR)_(FPrST0|FST0r|FrST0)",
+ "VALIGND(Z|Z128|Z256)rri",
+ "VALIGNQ(Z|Z128|Z256)rri",
+ "VDBPSADBWZrri", // TODO: 512-bit ops require ports 0/1 to be joined.
+ "VPBROADCAST(B|W)rr",
+ "VP(MAX|MIN)(S|U)Q(Z|Z128|Z256)rr")>;
+
+def SKXWriteResGroup33 : SchedWriteRes<[SKXPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup33], (instregex "KADD(B|D|Q|W)rr",
+ "KSHIFTL(B|D|Q|W)ri",
+ "KSHIFTR(B|D|Q|W)ri",
+ "KUNPCK(BW|DQ|WD)rr",
+ "VCMPPD(Z|Z128|Z256)rri",
+ "VCMPPS(Z|Z128|Z256)rri",
+ "VCMP(SD|SS)Zrr",
+ "VFPCLASS(PD|PS)(Z|Z128|Z256)rr",
+ "VFPCLASS(SD|SS)Zrr",
+ "VPCMPB(Z|Z128|Z256)rri",
+ "VPCMPD(Z|Z128|Z256)rri",
+ "VPCMPEQ(B|D|Q|W)(Z|Z128|Z256)rr",
+ "VPCMPGT(B|D|Q|W)(Z|Z128|Z256)rr",
+ "VPCMPQ(Z|Z128|Z256)rri",
+ "VPCMPU(B|D|Q|W)(Z|Z128|Z256)rri",
+ "VPCMPW(Z|Z128|Z256)rri",
+ "VPTEST(N?)M(B|D|Q|W)(Z|Z128|Z256)rr")>;
+
+def SKXWriteResGroup34 : SchedWriteRes<[SKXPort0,SKXPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup34], (instrs FNSTSW16r)>;
+
+def SKXWriteResGroup37 : SchedWriteRes<[SKXPort0,SKXPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKXWriteResGroup37], (instregex "MMX_PH(ADD|SUB)SWrr")>;
+
+def SKXWriteResGroup38 : SchedWriteRes<[SKXPort5,SKXPort01]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup38], (instregex "(V?)PH(ADD|SUB)SW(Y?)rr")>;
+
+def SKXWriteResGroup41 : SchedWriteRes<[SKXPort5,SKXPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup41], (instrs MMX_PACKSSDWirr,
+ MMX_PACKSSWBirr,
+ MMX_PACKUSWBirr)>;
+
+def SKXWriteResGroup42 : SchedWriteRes<[SKXPort6,SKXPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKXWriteResGroup42], (instregex "CLD")>;
+
+def SKXWriteResGroup43 : SchedWriteRes<[SKXPort237,SKXPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKXWriteResGroup43], (instrs MFENCE)>;
+
+def SKXWriteResGroup44 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKXWriteResGroup44], (instregex "RCL(8|16|32|64)r(1|i)",
+ "RCR(8|16|32|64)r(1|i)")>;
+
+def SKXWriteResGroup45 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup45], (instrs FNSTSWm)>;
+
+def SKXWriteResGroup47 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort237,SKXPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKXWriteResGroup47], (instregex "CALL(16|32|64)r")>;
+
+def SKXWriteResGroup48 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06,SKXPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKXWriteResGroup48], (instrs CALL64pcrel32)>;
+
+def SKXWriteResGroup49 : SchedWriteRes<[SKXPort0]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup49], (instregex "MUL_(FPrST0|FST0r|FrST0)")>;
+
+def SKXWriteResGroup50 : SchedWriteRes<[SKXPort01]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PS(Y|Z128|Z256)rr",
+ "(V?)CVTDQ2PSrr",
+ "VCVTPD2QQ(Z128|Z256)rr",
+ "VCVTPD2UQQ(Z128|Z256)rr",
+ "VCVTPS2DQ(Y|Z128|Z256)rr",
+ "(V?)CVTPS2DQrr",
+ "VCVTPS2UDQ(Z128|Z256)rr",
+ "VCVTQQ2PD(Z128|Z256)rr",
+ "VCVTTPD2QQ(Z128|Z256)rr",
+ "VCVTTPD2UQQ(Z128|Z256)rr",
+ "VCVTTPS2DQ(Z128|Z256)rr",
+ "(V?)CVTTPS2DQrr",
+ "VCVTTPS2UDQ(Z128|Z256)rr",
+ "VCVTUDQ2PS(Z128|Z256)rr",
+ "VCVTUQQ2PD(Z128|Z256)rr")>;
+
+def SKXWriteResGroup50z : SchedWriteRes<[SKXPort05]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup50z], (instrs VCVTDQ2PSZrr,
+ VCVTPD2QQZrr,
+ VCVTPD2UQQZrr,
+ VCVTPS2DQZrr,
+ VCVTPS2UDQZrr,
+ VCVTQQ2PDZrr,
+ VCVTTPD2QQZrr,
+ VCVTTPD2UQQZrr,
+ VCVTTPS2DQZrr,
+ VCVTTPS2UDQZrr,
+ VCVTUDQ2PSZrr,
+ VCVTUQQ2PDZrr)>;
+
+def SKXWriteResGroup51 : SchedWriteRes<[SKXPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPD(Z|Z128|Z256)rr",
+ "VEXPANDPS(Z|Z128|Z256)rr",
+ "VPEXPANDD(Z|Z128|Z256)rr",
+ "VPEXPANDQ(Z|Z128|Z256)rr",
+ "VPMOVDB(Z|Z128|Z256)rr",
+ "VPMOVDW(Z|Z128|Z256)rr",
+ "VPMOVQB(Z|Z128|Z256)rr",
+ "VPMOVQW(Z|Z128|Z256)rr",
+ "VPMOVSDB(Z|Z128|Z256)rr",
+ "VPMOVSDW(Z|Z128|Z256)rr",
+ "VPMOVSQB(Z|Z128|Z256)rr",
+ "VPMOVSQD(Z|Z128|Z256)rr",
+ "VPMOVSQW(Z|Z128|Z256)rr",
+ "VPMOVSWB(Z|Z128|Z256)rr",
+ "VPMOVUSDB(Z|Z128|Z256)rr",
+ "VPMOVUSDW(Z|Z128|Z256)rr",
+ "VPMOVUSQB(Z|Z128|Z256)rr",
+ "VPMOVUSQD(Z|Z128|Z256)rr",
+ "VPMOVUSWB(Z|Z128|Z256)rr",
+ "VPMOVWB(Z|Z128|Z256)rr")>;
+
+def SKXWriteResGroup54 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup54], (instregex "IST(T?)_FP(16|32|64)m",
+ "IST_F(16|32)m",
+ "VPMOVQD(Z|Z128|Z256)mr(b?)")>;
+
+def SKXWriteResGroup55 : SchedWriteRes<[SKXPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [4];
+}
+def: InstRW<[SKXWriteResGroup55], (instrs FNCLEX)>;
+
+def SKXWriteResGroup56 : SchedWriteRes<[]> {
+ let Latency = 0;
+ let NumMicroOps = 4;
+ let ResourceCycles = [];
+}
+def: InstRW<[SKXWriteResGroup56], (instrs VZEROUPPER)>;
+
+def SKXWriteResGroup57 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
+}
+def: InstRW<[SKXWriteResGroup57], (instregex "LAR(16|32|64)rr")>;
+
+def SKXWriteResGroup58 : SchedWriteRes<[SKXPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup58], (instregex "MOVSX(16|32|64)rm(8|16|32)",
+ "MOVZX(16|32|64)rm(8|16)",
+ "(V?)MOVDDUPrm")>; // TODO: Should this be SKXWriteResGroup71?
+
+def SKXWriteResGroup61 : SchedWriteRes<[SKXPort5,SKXPort015]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVT(T?)PD2PIirr",
+ "MMX_CVT(T?)PS2PIirr",
+ "VCVTDQ2PDZ128rr",
+ "VCVTPD2DQZ128rr",
+ "(V?)CVT(T?)PD2DQrr",
+ "VCVTPD2PSZ128rr",
+ "(V?)CVTPD2PSrr",
+ "VCVTPD2UDQZ128rr",
+ "VCVTPS2PDZ128rr",
+ "(V?)CVTPS2PDrr",
+ "VCVTPS2QQZ128rr",
+ "VCVTPS2UQQZ128rr",
+ "VCVTQQ2PSZ128rr",
+ "(V?)CVTSD2SS(Z?)rr",
+ "(V?)CVTSI(64)?2SDrr",
+ "VCVTSI2SSZrr",
+ "(V?)CVTSI2SSrr",
+ "VCVTSI(64)?2SDZrr",
+ "VCVTSS2SDZrr",
+ "(V?)CVTSS2SDrr",
+ "VCVTTPD2DQZ128rr",
+ "VCVTTPD2UDQZ128rr",
+ "VCVTTPS2QQZ128rr",
+ "VCVTTPS2UQQZ128rr",
+ "VCVTUDQ2PDZ128rr",
+ "VCVTUQQ2PSZ128rr",
+ "VCVTUSI2SSZrr",
+ "VCVTUSI(64)?2SDZrr")>;
+
+def SKXWriteResGroup62 : SchedWriteRes<[SKXPort5,SKXPort015]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup62], (instregex "VPCONFLICTQZ128rr")>;
+
+def SKXWriteResGroup63 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort06]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup63], (instregex "STR(16|32|64)r")>;
+
+def SKXWriteResGroup65 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort015]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup65], (instregex "VCVTPS2PHZ128mr(b?)",
+ "VCVTPS2PHZ256mr(b?)",
+ "VCVTPS2PHZmr(b?)")>;
+
+def SKXWriteResGroup66 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDB(Z|Z128|Z256)mr(b?)",
+ "VPMOVDW(Z|Z128|Z256)mr(b?)",
+ "VPMOVQB(Z|Z128|Z256)mr(b?)",
+ "VPMOVQW(Z|Z128|Z256)mr(b?)",
+ "VPMOVSDB(Z|Z128|Z256)mr(b?)",
+ "VPMOVSDW(Z|Z128|Z256)mr(b?)",
+ "VPMOVSQB(Z|Z128|Z256)mr(b?)",
+ "VPMOVSQD(Z|Z128|Z256)mr(b?)",
+ "VPMOVSQW(Z|Z128|Z256)mr(b?)",
+ "VPMOVSWB(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSDB(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSDW(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSQB(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSQD(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSQW(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSWB(Z|Z128|Z256)mr(b?)",
+ "VPMOVWB(Z|Z128|Z256)mr(b?)")>;
+
+def SKXWriteResGroup67 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,4];
+}
+def: InstRW<[SKXWriteResGroup67], (instrs XSETBV)>;
+
+def SKXWriteResGroup69 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,4];
+}
+def: InstRW<[SKXWriteResGroup69], (instregex "PUSHF(16|64)")>;
+
+def SKXWriteResGroup71 : SchedWriteRes<[SKXPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup71], (instrs VBROADCASTSSrm,
+ VPBROADCASTDrm,
+ VPBROADCASTQrm,
+ VMOVSHDUPrm,
+ VMOVSLDUPrm,
+ MOVSHDUPrm,
+ MOVSLDUPrm)>;
+
+def SKXWriteResGroup72 : SchedWriteRes<[SKXPort5]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup72], (instrs MMX_CVTPI2PSirr)>;
+def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPD(Z|Z128|Z256)rr",
+ "VCOMPRESSPS(Z|Z128|Z256)rr",
+ "VPCOMPRESSD(Z|Z128|Z256)rr",
+ "VPCOMPRESSQ(Z|Z128|Z256)rr",
+ "VPERMW(Z|Z128|Z256)rr")>;
+
+def SKXWriteResGroup73 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup73], (instrs MMX_PADDSBirm,
+ MMX_PADDSWirm,
+ MMX_PADDUSBirm,
+ MMX_PADDUSWirm,
+ MMX_PAVGBirm,
+ MMX_PAVGWirm,
+ MMX_PCMPEQBirm,
+ MMX_PCMPEQDirm,
+ MMX_PCMPEQWirm,
+ MMX_PCMPGTBirm,
+ MMX_PCMPGTDirm,
+ MMX_PCMPGTWirm,
+ MMX_PMAXSWirm,
+ MMX_PMAXUBirm,
+ MMX_PMINSWirm,
+ MMX_PMINUBirm,
+ MMX_PSUBSBirm,
+ MMX_PSUBSWirm,
+ MMX_PSUBUSBirm,
+ MMX_PSUBUSWirm)>;
+
+def SKXWriteResGroup76 : SchedWriteRes<[SKXPort6,SKXPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup76], (instrs FARJMP64m)>;
+def: InstRW<[SKXWriteResGroup76], (instregex "JMP(16|32|64)m")>;
+
+def SKXWriteResGroup79 : SchedWriteRes<[SKXPort23,SKXPort15]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup79], (instregex "ANDN(32|64)rm",
+ "MOVBE(16|32|64)rm")>;
+
+def SKXWriteResGroup80 : SchedWriteRes<[SKXPort23,SKXPort015]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup80], (instregex "VMOV(64to|QI2)PQIZrm(b?)")>;
+def: InstRW<[SKXWriteResGroup80], (instrs VMOVDI2PDIZrm)>;
+
+def SKXWriteResGroup81 : SchedWriteRes<[SKXPort23,SKXPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup81], (instrs POP16r, POP32r, POP64r)>;
+def: InstRW<[SKXWriteResGroup81], (instregex "POP(16|32|64)rmr")>;
+
+def SKXWriteResGroup82 : SchedWriteRes<[SKXPort5,SKXPort015]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup82], (instregex "(V?)CVTSI642SSrr",
+ "VCVTSI642SSZrr",
+ "VCVTUSI642SSZrr")>;
+
+def SKXWriteResGroup84 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort06,SKXPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKXWriteResGroup84], (instregex "SLDT(16|32|64)r")>;
+
+def SKXWriteResGroup86 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKXWriteResGroup86], (instregex "SAR(8|16|32|64)m(1|i)",
+ "SHL(8|16|32|64)m(1|i)",
+ "SHR(8|16|32|64)m(1|i)")>;
+
+def SKXWriteResGroup87 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKXWriteResGroup87], (instregex "POP(16|32|64)rmm",
+ "PUSH(16|32|64)rmm")>;
+
+def SKXWriteResGroup88 : SchedWriteRes<[SKXPort6,SKXPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,5];
+}
+def: InstRW<[SKXWriteResGroup88], (instrs STD)>;
+
+def SKXWriteResGroup89 : SchedWriteRes<[SKXPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup89], (instregex "LD_F(32|64|80)m")>;
+def: InstRW<[SKXWriteResGroup89], (instrs VBROADCASTF128,
+ VBROADCASTI128,
+ VBROADCASTSDYrm,
+ VBROADCASTSSYrm,
+ VMOVDDUPYrm,
+ VMOVSHDUPYrm,
+ VMOVSLDUPYrm,
+ VPBROADCASTDYrm,
+ VPBROADCASTQYrm)>;
+
+def SKXWriteResGroup90 : SchedWriteRes<[SKXPort01,SKXPort5]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup90], (instrs VCVTDQ2PDYrr)>;
+
+def SKXWriteResGroup92 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup92], (instregex "VMOVSDZrm(b?)",
+ "VMOVSSZrm(b?)")>;
+
+def SKXWriteResGroup92a : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup92a], (instregex "(V?)PMOV(SX|ZX)BDrm",
+ "(V?)PMOV(SX|ZX)BQrm",
+ "(V?)PMOV(SX|ZX)BWrm",
+ "(V?)PMOV(SX|ZX)DQrm",
+ "(V?)PMOV(SX|ZX)WDrm",
+ "(V?)PMOV(SX|ZX)WQrm")>;
+
+def SKXWriteResGroup93 : SchedWriteRes<[SKXPort5,SKXPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTDQ2PDZ256rr",
+ "VCVTPD2DQ(Y|Z256)rr",
+ "VCVTPD2PS(Y|Z256)rr",
+ "VCVTPD2UDQZ256rr",
+ "VCVTPS2PD(Y|Z256)rr",
+ "VCVTPS2QQZ256rr",
+ "VCVTPS2UQQZ256rr",
+ "VCVTQQ2PSZ256rr",
+ "VCVTTPD2DQ(Y|Z256)rr",
+ "VCVTTPD2UDQZ256rr",
+ "VCVTTPS2QQZ256rr",
+ "VCVTTPS2UQQZ256rr",
+ "VCVTUDQ2PDZ256rr",
+ "VCVTUQQ2PSZ256rr")>;
+
+def SKXWriteResGroup93z : SchedWriteRes<[SKXPort5,SKXPort05]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup93z], (instrs VCVTDQ2PDZrr,
+ VCVTPD2DQZrr,
+ VCVTPD2PSZrr,
+ VCVTPD2UDQZrr,
+ VCVTPS2PDZrr,
+ VCVTPS2QQZrr,
+ VCVTPS2UQQZrr,
+ VCVTQQ2PSZrr,
+ VCVTTPD2DQZrr,
+ VCVTTPD2UDQZrr,
+ VCVTTPS2QQZrr,
+ VCVTTPS2UQQZrr,
+ VCVTUDQ2PDZrr,
+ VCVTUQQ2PSZrr)>;
+
+def SKXWriteResGroup95 : SchedWriteRes<[SKXPort23,SKXPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup95], (instrs VMOVNTDQAZ128rm,
+ VPBLENDDrmi)>;
+def: InstRW<[SKXWriteResGroup95, ReadAfterVecXLd],
+ (instregex "VBLENDMPDZ128rm(b?)",
+ "VBLENDMPSZ128rm(b?)",
+ "VBROADCASTI32X2Z128rm(b?)",
+ "VBROADCASTSSZ128rm(b?)",
+ "VINSERT(F|I)128rm",
+ "VMOVAPDZ128rm(b?)",
+ "VMOVAPSZ128rm(b?)",
+ "VMOVDDUPZ128rm(b?)",
+ "VMOVDQA32Z128rm(b?)",
+ "VMOVDQA64Z128rm(b?)",
+ "VMOVDQU16Z128rm(b?)",
+ "VMOVDQU32Z128rm(b?)",
+ "VMOVDQU64Z128rm(b?)",
+ "VMOVDQU8Z128rm(b?)",
+ "VMOVSHDUPZ128rm(b?)",
+ "VMOVSLDUPZ128rm(b?)",
+ "VMOVUPDZ128rm(b?)",
+ "VMOVUPSZ128rm(b?)",
+ "VPADD(B|D|Q|W)Z128rm(b?)",
+ "(V?)PADD(B|D|Q|W)rm",
+ "VPBLENDM(B|D|Q|W)Z128rm(b?)",
+ "VPBROADCASTDZ128rm(b?)",
+ "VPBROADCASTQZ128rm(b?)",
+ "VPSUB(B|D|Q|W)Z128rm(b?)",
+ "(V?)PSUB(B|D|Q|W)rm",
+ "VPTERNLOGDZ128rm(b?)i",
+ "VPTERNLOGQZ128rm(b?)i")>;
+
+def SKXWriteResGroup96 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup96], (instrs MMX_PACKSSDWirm,
+ MMX_PACKSSWBirm,
+ MMX_PACKUSWBirm)>;
+
+def SKXWriteResGroup97 : SchedWriteRes<[SKXPort5,SKXPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup97], (instregex "VPERMI2W128rr",
+ "VPERMI2W256rr",
+ "VPERMI2Wrr",
+ "VPERMT2W128rr",
+ "VPERMT2W256rr",
+ "VPERMT2Wrr")>;
+
+def SKXWriteResGroup99 : SchedWriteRes<[SKXPort23,SKXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKXWriteResGroup99], (instrs LEAVE, LEAVE64,
+ SCASB, SCASL, SCASQ, SCASW)>;
+
+def SKXWriteResGroup100 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup100], (instregex "VCVTSS2USI64Zrr",
+ "(V?)CVTSS2SI64(Z?)rr",
+ "(V?)CVTTSS2SI64(Z?)rr",
+ "VCVTTSS2USI64Zrr")>;
+
+def SKXWriteResGroup101 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort05]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup101], (instrs FLDCW16m)>;
+
+def SKXWriteResGroup103 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup103], (instregex "KMOV(B|D|Q|W)km")>;
+
+def SKXWriteResGroup104 : SchedWriteRes<[SKXPort6,SKXPort23,SKXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup104], (instrs LRETQ, RETQ)>;
+
+def SKXWriteResGroup106 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPD(Z|Z128|Z256)mr(b?)",
+ "VCOMPRESSPS(Z|Z128|Z256)mr(b?)",
+ "VPCOMPRESSD(Z|Z128|Z256)mr(b?)",
+ "VPCOMPRESSQ(Z|Z128|Z256)mr(b?)")>;
+
+def SKXWriteResGroup107 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[SKXWriteResGroup107], (instregex "ROL(8|16|32|64)m(1|i)",
+ "ROR(8|16|32|64)m(1|i)")>;
+
+def SKXWriteResGroup107_1 : SchedWriteRes<[SKXPort06]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup107_1], (instrs ROL8r1, ROL16r1, ROL32r1, ROL64r1,
+ ROR8r1, ROR16r1, ROR32r1, ROR64r1)>;
+
+def SKXWriteResGroup108 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[SKXWriteResGroup108], (instregex "XADD(8|16|32|64)rm")>;
+
+def SKXWriteResGroup109 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,1,1];
+}
+def: InstRW<[SKXWriteResGroup109], (instregex "CALL(16|32|64)m")>;
+def: InstRW<[SKXWriteResGroup109], (instrs FARCALL64m)>;
+
+def SKXWriteResGroup110 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237,SKXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 7;
+ let ResourceCycles = [1,2,2,2];
+}
+def: InstRW<[SKXWriteResGroup110], (instrs VPSCATTERDQZ128mr,
+ VPSCATTERQQZ128mr,
+ VSCATTERDPDZ128mr,
+ VSCATTERQPDZ128mr)>;
+
+def SKXWriteResGroup111 : SchedWriteRes<[SKXPort6,SKXPort06,SKXPort15,SKXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 7;
+ let ResourceCycles = [1,3,1,2];
+}
+def: InstRW<[SKXWriteResGroup111], (instrs LOOP)>;
+
+def SKXWriteResGroup112 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237,SKXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 11;
+ let ResourceCycles = [1,4,4,2];
+}
+def: InstRW<[SKXWriteResGroup112], (instrs VPSCATTERDQZ256mr,
+ VPSCATTERQQZ256mr,
+ VSCATTERDPDZ256mr,
+ VSCATTERQPDZ256mr)>;
+
+def SKXWriteResGroup113 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237,SKXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 19;
+ let ResourceCycles = [1,8,8,2];
+}
+def: InstRW<[SKXWriteResGroup113], (instrs VPSCATTERDQZmr,
+ VPSCATTERQQZmr,
+ VSCATTERDPDZmr,
+ VSCATTERQPDZmr)>;
+
+def SKXWriteResGroup114 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 36;
+ let ResourceCycles = [1,16,1,16,2];
+}
+def: InstRW<[SKXWriteResGroup114], (instrs VSCATTERDPSZmr)>;
+
+def SKXWriteResGroup118 : SchedWriteRes<[SKXPort1,SKXPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup118], (instregex "PDEP(32|64)rm",
+ "PEXT(32|64)rm")>;
+
+def SKXWriteResGroup119 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup119], (instregex "FCOM(P?)(32|64)m",
+ "VPBROADCASTB(Z|Z256)rm(b?)",
+ "VPBROADCASTW(Z|Z256)rm(b?)")>;
+def: InstRW<[SKXWriteResGroup119], (instrs VPBROADCASTBYrm,
+ VPBROADCASTWYrm,
+ VPMOVSXBDYrm,
+ VPMOVSXBQYrm,
+ VPMOVSXWQYrm)>;
+
+def SKXWriteResGroup121 : SchedWriteRes<[SKXPort23,SKXPort015]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup121], (instrs VMOVNTDQAZ256rm,
+ VPBLENDDYrmi)>;
+def: InstRW<[SKXWriteResGroup121, ReadAfterVecYLd],
+ (instregex "VBLENDMPD(Z|Z256)rm(b?)",
+ "VBLENDMPS(Z|Z256)rm(b?)",
+ "VBROADCASTF32X2Z256rm(b?)",
+ "VBROADCASTF32X2Zrm(b?)",
+ "VBROADCASTF32X4Z256rm(b?)",
+ "VBROADCASTF32X4rm(b?)",
+ "VBROADCASTF32X8rm(b?)",
+ "VBROADCASTF64X2Z128rm(b?)",
+ "VBROADCASTF64X2rm(b?)",
+ "VBROADCASTF64X4rm(b?)",
+ "VBROADCASTI32X2Z256rm(b?)",
+ "VBROADCASTI32X2Zrm(b?)",
+ "VBROADCASTI32X4Z256rm(b?)",
+ "VBROADCASTI32X4rm(b?)",
+ "VBROADCASTI32X8rm(b?)",
+ "VBROADCASTI64X2Z128rm(b?)",
+ "VBROADCASTI64X2rm(b?)",
+ "VBROADCASTI64X4rm(b?)",
+ "VBROADCASTSD(Z|Z256)rm(b?)",
+ "VBROADCASTSS(Z|Z256)rm(b?)",
+ "VINSERTF32x4(Z|Z256)rm(b?)",
+ "VINSERTF32x8Zrm(b?)",
+ "VINSERTF64x2(Z|Z256)rm(b?)",
+ "VINSERTF64x4Zrm(b?)",
+ "VINSERTI32x4(Z|Z256)rm(b?)",
+ "VINSERTI32x8Zrm(b?)",
+ "VINSERTI64x2(Z|Z256)rm(b?)",
+ "VINSERTI64x4Zrm(b?)",
+ "VMOVAPD(Z|Z256)rm(b?)",
+ "VMOVAPS(Z|Z256)rm(b?)",
+ "VMOVDDUP(Z|Z256)rm(b?)",
+ "VMOVDQA32(Z|Z256)rm(b?)",
+ "VMOVDQA64(Z|Z256)rm(b?)",
+ "VMOVDQU16(Z|Z256)rm(b?)",
+ "VMOVDQU32(Z|Z256)rm(b?)",
+ "VMOVDQU64(Z|Z256)rm(b?)",
+ "VMOVDQU8(Z|Z256)rm(b?)",
+ "VMOVSHDUP(Z|Z256)rm(b?)",
+ "VMOVSLDUP(Z|Z256)rm(b?)",
+ "VMOVUPD(Z|Z256)rm(b?)",
+ "VMOVUPS(Z|Z256)rm(b?)",
+ "VPADD(B|D|Q|W)Yrm",
+ "VPADD(B|D|Q|W)(Z|Z256)rm(b?)",
+ "VPBLENDM(B|D|Q|W)(Z|Z256)rm(b?)",
+ "VPBROADCASTD(Z|Z256)rm(b?)",
+ "VPBROADCASTQ(Z|Z256)rm(b?)",
+ "VPSUB(B|D|Q|W)Yrm",
+ "VPSUB(B|D|Q|W)(Z|Z256)rm(b?)",
+ "VPTERNLOGD(Z|Z256)rm(b?)i",
+ "VPTERNLOGQ(Z|Z256)rm(b?)i")>;
+
+def SKXWriteResGroup123 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SKXWriteResGroup123], (instregex "MMX_PH(ADD|SUB)SWrm")>;
+
+def SKXWriteResGroup127 : SchedWriteRes<[SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[SKXWriteResGroup127], (instregex "RCL(8|16|32|64)m(1|i)",
+ "RCR(8|16|32|64)m(1|i)")>;
+
+def SKXWriteResGroup128 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06]> {
+ let Latency = 8;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,3];
+}
+def: InstRW<[SKXWriteResGroup128], (instregex "ROL(8|16|32|64)mCL",
+ "ROR(8|16|32|64)mCL",
+ "SAR(8|16|32|64)mCL",
+ "SHL(8|16|32|64)mCL",
+ "SHR(8|16|32|64)mCL")>;
+
+def SKXWriteResGroup130 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,2,1];
+}
+def: SchedAlias<WriteADCRMW, SKXWriteResGroup130>;
+
+def SKXWriteResGroup131 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,2,1,2,2];
+}
+def: InstRW<[SKXWriteResGroup131], (instrs VPSCATTERQDZ128mr,
+ VPSCATTERQDZ256mr,
+ VSCATTERQPSZ128mr,
+ VSCATTERQPSZ256mr)>;
+
+def SKXWriteResGroup132 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 12;
+ let ResourceCycles = [1,4,1,4,2];
+}
+def: InstRW<[SKXWriteResGroup132], (instrs VPSCATTERDDZ128mr,
+ VSCATTERDPSZ128mr)>;
+
+def SKXWriteResGroup133 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 20;
+ let ResourceCycles = [1,8,1,8,2];
+}
+def: InstRW<[SKXWriteResGroup133], (instrs VPSCATTERDDZ256mr,
+ VSCATTERDPSZ256mr)>;
+
+def SKXWriteResGroup134 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 36;
+ let ResourceCycles = [1,16,1,16,2];
+}
+def: InstRW<[SKXWriteResGroup134], (instrs VPSCATTERDDZmr)>;
+
+def SKXWriteResGroup135 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup135], (instrs MMX_CVTPI2PSirm)>;
+
+def SKXWriteResGroup136 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup136], (instrs VPMOVSXBWYrm,
+ VPMOVSXDQYrm,
+ VPMOVSXWDYrm,
+ VPMOVZXWDYrm)>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VALIGN(D|Q)Z128rm(b?)i",
+ "VFPCLASSSDZrm(b?)",
+ "VFPCLASSSSZrm(b?)",
+ "(V?)PCMPGTQrm",
+ "VPERMI2D128rm(b?)",
+ "VPERMI2PD128rm(b?)",
+ "VPERMI2PS128rm(b?)",
+ "VPERMI2Q128rm(b?)",
+ "VPERMT2D128rm(b?)",
+ "VPERMT2PD128rm(b?)",
+ "VPERMT2PS128rm(b?)",
+ "VPERMT2Q128rm(b?)",
+ "VPMAXSQZ128rm(b?)",
+ "VPMAXUQZ128rm(b?)",
+ "VPMINSQZ128rm(b?)",
+ "VPMINUQZ128rm(b?)",
+ "VPMOVSXBDZ128rm(b?)",
+ "VPMOVSXBQZ128rm(b?)",
+ "VPMOVSXBWZ128rm(b?)",
+ "VPMOVSXDQZ128rm(b?)",
+ "VPMOVSXWDZ128rm(b?)",
+ "VPMOVSXWQZ128rm(b?)",
+ "VPMOVZXBDZ128rm(b?)",
+ "VPMOVZXBQZ128rm(b?)",
+ "VPMOVZXBWZ128rm(b?)",
+ "VPMOVZXDQZ128rm(b?)",
+ "VPMOVZXWDZ128rm(b?)",
+ "VPMOVZXWQZ128rm(b?)")>;
+
+def SKXWriteResGroup136_2 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup136_2], (instregex "VCMP(PD|PS)Z128rm(b?)i",
+ "VCMP(SD|SS)Zrm",
+ "VFPCLASSPDZ128rm(b?)",
+ "VFPCLASSPSZ128rm(b?)",
+ "VPCMPBZ128rmi(b?)",
+ "VPCMPDZ128rmi(b?)",
+ "VPCMPEQ(B|D|Q|W)Z128rm(b?)",
+ "VPCMPGT(B|D|Q|W)Z128rm(b?)",
+ "VPCMPQZ128rmi(b?)",
+ "VPCMPU(B|D|Q|W)Z128rmi(b?)",
+ "VPCMPWZ128rmi(b?)",
+ "VPTESTMBZ128rm(b?)",
+ "VPTESTMDZ128rm(b?)",
+ "VPTESTMQZ128rm(b?)",
+ "VPTESTMWZ128rm(b?)",
+ "VPTESTNMBZ128rm(b?)",
+ "VPTESTNMDZ128rm(b?)",
+ "VPTESTNMQZ128rm(b?)",
+ "VPTESTNMWZ128rm(b?)")>;
+
+def SKXWriteResGroup137 : SchedWriteRes<[SKXPort23,SKXPort015]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup137], (instregex "MMX_CVT(T?)PS2PIirm",
+ "(V?)CVTPS2PDrm")>;
+
+def SKXWriteResGroup143 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKXWriteResGroup143], (instregex "(V?)PHADDSWrm",
+ "(V?)PHSUBSWrm")>;
+
+def SKXWriteResGroup146 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort23,SKXPort0156]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[SKXWriteResGroup146], (instregex "LAR(16|32|64)rm",
+ "LSL(16|32|64)rm")>;
+
+def SKXWriteResGroup148 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup148], (instrs VPCMPGTQYrm)>;
+def: InstRW<[SKXWriteResGroup148], (instregex "(ADD|SUB|SUBR)_F(32|64)m",
+ "ILD_F(16|32|64)m",
+ "VALIGND(Z|Z256)rm(b?)i",
+ "VALIGNQ(Z|Z256)rm(b?)i",
+ "VPMAXSQ(Z|Z256)rm(b?)",
+ "VPMAXUQ(Z|Z256)rm(b?)",
+ "VPMINSQ(Z|Z256)rm(b?)",
+ "VPMINUQ(Z|Z256)rm(b?)")>;
+
+def SKXWriteResGroup148_2 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup148_2], (instregex "VCMPPD(Z|Z256)rm(b?)i",
+ "VCMPPS(Z|Z256)rm(b?)i",
+ "VFPCLASSPD(Z|Z256)rm(b?)",
+ "VFPCLASSPS(Z|Z256)rm(b?)",
+ "VPCMPB(Z|Z256)rmi(b?)",
+ "VPCMPD(Z|Z256)rmi(b?)",
+ "VPCMPEQB(Z|Z256)rm(b?)",
+ "VPCMPEQD(Z|Z256)rm(b?)",
+ "VPCMPEQQ(Z|Z256)rm(b?)",
+ "VPCMPEQW(Z|Z256)rm(b?)",
+ "VPCMPGTB(Z|Z256)rm(b?)",
+ "VPCMPGTD(Z|Z256)rm(b?)",
+ "VPCMPGTQ(Z|Z256)rm(b?)",
+ "VPCMPGTW(Z|Z256)rm(b?)",
+ "VPCMPQ(Z|Z256)rmi(b?)",
+ "VPCMPU(B|D|Q|W)Z256rmi(b?)",
+ "VPCMPU(B|D|Q|W)Zrmi(b?)",
+ "VPCMPW(Z|Z256)rmi(b?)",
+ "VPTESTM(B|D|Q|W)Z256rm(b?)",
+ "VPTESTM(B|D|Q|W)Zrm(b?)",
+ "VPTESTNM(B|D|Q|W)Z256rm(b?)",
+ "VPTESTNM(B|D|Q|W)Zrm(b?)")>;
+
+def SKXWriteResGroup149 : SchedWriteRes<[SKXPort23,SKXPort015]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTDQ2PDZ128rm(b?)",
+ "VCVTDQ2PSZ128rm(b?)",
+ "(V?)CVTDQ2PSrm",
+ "VCVTPD2QQZ128rm(b?)",
+ "VCVTPD2UQQZ128rm(b?)",
+ "VCVTPH2PSZ128rm(b?)",
+ "VCVTPS2DQZ128rm(b?)",
+ "(V?)CVTPS2DQrm",
+ "VCVTPS2PDZ128rm(b?)",
+ "VCVTPS2QQZ128rm(b?)",
+ "VCVTPS2UDQZ128rm(b?)",
+ "VCVTPS2UQQZ128rm(b?)",
+ "VCVTQQ2PDZ128rm(b?)",
+ "VCVTQQ2PSZ128rm(b?)",
+ "VCVTSS2SDZrm",
+ "(V?)CVTSS2SDrm",
+ "VCVTTPD2QQZ128rm(b?)",
+ "VCVTTPD2UQQZ128rm(b?)",
+ "VCVTTPS2DQZ128rm(b?)",
+ "(V?)CVTTPS2DQrm",
+ "VCVTTPS2QQZ128rm(b?)",
+ "VCVTTPS2UDQZ128rm(b?)",
+ "VCVTTPS2UQQZ128rm(b?)",
+ "VCVTUDQ2PDZ128rm(b?)",
+ "VCVTUDQ2PSZ128rm(b?)",
+ "VCVTUQQ2PDZ128rm(b?)",
+ "VCVTUQQ2PSZ128rm(b?)")>;
+
+def SKXWriteResGroup151 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup151], (instregex "VEXPANDPDZ128rm(b?)",
+ "VEXPANDPSZ128rm(b?)",
+ "VPEXPANDDZ128rm(b?)",
+ "VPEXPANDQZ128rm(b?)")>;
+
+def SKXWriteResGroup153 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup153], (instregex "(V?)CVTSD2SSrm")>;
+
+def SKXWriteResGroup154 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKXWriteResGroup154], (instrs VPHADDSWYrm,
+ VPHSUBSWYrm)>;
+
+def SKXWriteResGroup157 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
+ let Latency = 10;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,1,1,3];
+}
+def: InstRW<[SKXWriteResGroup157], (instregex "XCHG(8|16|32|64)rm")>;
+
+def SKXWriteResGroup159 : SchedWriteRes<[SKXPort0,SKXFPDivider]> {
+ let Latency = 11;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1,3];
+}
+def : SchedAlias<WriteFDivX, SKXWriteResGroup159>; // TODO - convert to ZnWriteResFpuPair
+
+def SKXWriteResGroup160 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup160], (instregex "MUL_F(32|64)m")>;
+
+def SKXWriteResGroup161 : SchedWriteRes<[SKXPort23,SKXPort015]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup161], (instrs VCVTDQ2PSYrm,
+ VCVTPS2PDYrm)>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2(PD|PS)(Z|Z256)rm(b?)",
+ "VCVTPH2PS(Z|Z256)rm(b?)",
+ "VCVTPS2PD(Z|Z256)rm(b?)",
+ "VCVTQQ2PD(Z|Z256)rm(b?)",
+ "VCVTQQ2PSZ256rm(b?)",
+ "VCVT(T?)PD2QQ(Z|Z256)rm(b?)",
+ "VCVT(T?)PD2UQQ(Z|Z256)rm(b?)",
+ "VCVT(T?)PS2DQYrm",
+ "VCVT(T?)PS2DQ(Z|Z256)rm(b?)",
+ "VCVT(T?)PS2QQZ256rm(b?)",
+ "VCVT(T?)PS2UDQ(Z|Z256)rm(b?)",
+ "VCVT(T?)PS2UQQZ256rm(b?)",
+ "VCVTUDQ2(PD|PS)(Z|Z256)rm(b?)",
+ "VCVTUQQ2PD(Z|Z256)rm(b?)",
+ "VCVTUQQ2PSZ256rm(b?)")>;
+
+def SKXWriteResGroup162 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup162], (instregex "FICOM(P?)(16|32)m",
+ "VEXPANDPD(Z|Z256)rm(b?)",
+ "VEXPANDPS(Z|Z256)rm(b?)",
+ "VPEXPANDD(Z|Z256)rm(b?)",
+ "VPEXPANDQ(Z|Z256)rm(b?)")>;
+
+def SKXWriteResGroup163 : SchedWriteRes<[SKXPort23,SKXPort015]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKXWriteResGroup163], (instregex "VCVTSD2SSZrm")>;
+
+def SKXWriteResGroup164 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup164], (instregex "(V?)CVTDQ2PDrm")>;
+
+def SKXWriteResGroup166 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup166], (instrs CVTPD2PSrm,
+ CVTPD2DQrm,
+ CVTTPD2DQrm,
+ MMX_CVTPD2PIirm,
+ MMX_CVTTPD2PIirm)>;
+
+def SKXWriteResGroup167 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 11;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKXWriteResGroup167], (instregex "VPCONFLICTQZ128rm(b?)")>;
+
+def SKXWriteResGroup169 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 7;
+ let ResourceCycles = [2,3,2];
+}
+def: InstRW<[SKXWriteResGroup169], (instregex "RCL(16|32|64)rCL",
+ "RCR(16|32|64)rCL")>;
+
+def SKXWriteResGroup170 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort15,SKXPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 9;
+ let ResourceCycles = [1,5,1,2];
+}
+def: InstRW<[SKXWriteResGroup170], (instrs RCL8rCL)>;
+
+def SKXWriteResGroup171 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,9];
+}
+def: InstRW<[SKXWriteResGroup171], (instrs LOOPE, LOOPNE)>;
+
+def SKXWriteResGroup174 : SchedWriteRes<[SKXPort01]> {
+ let Latency = 15;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def: InstRW<[SKXWriteResGroup174], (instregex "VPMULLQ(Z128|Z256)rr")>;
+
+def SKXWriteResGroup174z : SchedWriteRes<[SKXPort05]> {
+ let Latency = 15;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def: InstRW<[SKXWriteResGroup174z], (instregex "VPMULLQZrr")>;
+
+def SKXWriteResGroup175 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup175], (instregex "VPERMWZ128rm(b?)")>;
+
+def SKXWriteResGroup176 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> {
+ let Latency = 12;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup176], (instregex "VCVT(T?)SD2USIZrm(b?)",
+ "VCVT(T?)SS2USI64Zrm(b?)")>;
+
+def SKXWriteResGroup177 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 12;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup177], (instregex "VCVT(T?)PS2QQZrm(b?)",
+ "VCVT(T?)PS2UQQZrm(b?)")>;
+
+def SKXWriteResGroup179 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 12;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKXWriteResGroup179], (instregex "CVTTSS2SI64rm")>;
+
+def SKXWriteResGroup180 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup180], (instregex "(ADD|SUB|SUBR)_FI(16|32)m",
+ "VPERMWZ256rm(b?)",
+ "VPERMWZrm(b?)")>;
+
+def SKXWriteResGroup181 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup181], (instrs VCVTDQ2PDYrm)>;
+
+def SKXWriteResGroup183 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 13;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKXWriteResGroup183], (instregex "VPERMI2W128rm(b?)",
+ "VPERMT2W128rm(b?)")>;
+
+def SKXWriteResGroup184 : SchedWriteRes<[SKXPort0,SKXFPDivider]> {
+ let Latency = 14;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1,3];
+}
+def : SchedAlias<WriteFDiv64, SKXWriteResGroup184>; // TODO - convert to ZnWriteResFpuPair
+def : SchedAlias<WriteFDiv64X, SKXWriteResGroup184>; // TODO - convert to ZnWriteResFpuPair
+
+def SKXWriteResGroup184_1 : SchedWriteRes<[SKXPort0,SKXFPDivider]> {
+ let Latency = 14;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1,5];
+}
+def : SchedAlias<WriteFDiv64Y, SKXWriteResGroup184_1>; // TODO - convert to ZnWriteResFpuPair
+
+def SKXWriteResGroup187 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
+ let Latency = 14;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup187], (instregex "MUL_FI(16|32)m")>;
+
+def SKXWriteResGroup188 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 14;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup188], (instregex "VCVTPD2DQZrm(b?)",
+ "VCVTPD2PSZrm(b?)",
+ "VCVTPD2UDQZrm(b?)",
+ "VCVTQQ2PSZrm(b?)",
+ "VCVTTPD2DQZrm(b?)",
+ "VCVTTPD2UDQZrm(b?)",
+ "VCVTUQQ2PSZrm(b?)")>;
+
+def SKXWriteResGroup189 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 14;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKXWriteResGroup189], (instregex "VPERMI2W256rm(b?)",
+ "VPERMI2Wrm(b?)",
+ "VPERMT2W256rm(b?)",
+ "VPERMT2Wrm(b?)")>;
+
+def SKXWriteResGroup190 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort15,SKXPort0156]> {
+ let Latency = 14;
+ let NumMicroOps = 10;
+ let ResourceCycles = [2,4,1,3];
+}
+def: InstRW<[SKXWriteResGroup190], (instrs RCR8rCL)>;
+
+def SKXWriteResGroup191 : SchedWriteRes<[SKXPort0]> {
+ let Latency = 15;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup191], (instregex "DIVR_(FPrST0|FST0r|FrST0)")>;
+
+def SKXWriteResGroup194 : SchedWriteRes<[SKXPort1,SKXPort5,SKXPort01,SKXPort23,SKXPort015]> {
+ let Latency = 15;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,2,2,1,2];
+}
+def: InstRW<[SKXWriteResGroup194], (instregex "VPCONFLICTDZ128rm(b?)")>;
+
+def SKXWriteResGroup195 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort06,SKXPort15,SKXPort0156]> {
+ let Latency = 15;
+ let NumMicroOps = 10;
+ let ResourceCycles = [1,1,1,5,1,1];
+}
+def: InstRW<[SKXWriteResGroup195], (instregex "RCL(8|16|32|64)mCL")>;
+
+def SKXWriteResGroup199 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06,SKXPort15,SKXPort0156]> {
+ let Latency = 16;
+ let NumMicroOps = 14;
+ let ResourceCycles = [1,1,1,4,2,5];
+}
+def: InstRW<[SKXWriteResGroup199], (instrs CMPXCHG8B)>;
+
+def SKXWriteResGroup200 : SchedWriteRes<[SKXPort1, SKXPort05, SKXPort6]> {
+ let Latency = 12;
+ let NumMicroOps = 34;
+ let ResourceCycles = [1, 4, 5];
+}
+def: InstRW<[SKXWriteResGroup200], (instrs VZEROALL)>;
+
+def SKXWriteResGroup201 : SchedWriteRes<[SKXPort0,SKXPort23,SKXFPDivider]> {
+ let Latency = 17;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1,5];
+}
+def : SchedAlias<WriteFDivXLd, SKXWriteResGroup201>; // TODO - convert to ZnWriteResFpuPair
+
+def SKXWriteResGroup202 : SchedWriteRes<[SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort05,SKXPort0156]> {
+ let Latency = 17;
+ let NumMicroOps = 15;
+ let ResourceCycles = [2,1,2,4,2,4];
+}
+def: InstRW<[SKXWriteResGroup202], (instrs XCH_F)>;
+
+def SKXWriteResGroup205 : SchedWriteRes<[SKXPort23,SKXPort01]> {
+ let Latency = 21;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[SKXWriteResGroup205], (instregex "VPMULLQZ128rm(b?)")>;
+
+def SKXWriteResGroup207 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort06,SKXPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,5];
+}
+def: InstRW<[SKXWriteResGroup207], (instrs CPUID, RDTSC)>;
+
+def SKXWriteResGroup208 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort06,SKXPort15,SKXPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,1,1,4,1,2];
+}
+def: InstRW<[SKXWriteResGroup208], (instregex "RCR(8|16|32|64)mCL")>;
+
+def SKXWriteResGroup209 : SchedWriteRes<[SKXPort0,SKXPort23,SKXFPDivider]> {
+ let Latency = 19;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1,4];
+}
+def : SchedAlias<WriteFDiv64Ld, SKXWriteResGroup209>; // TODO - convert to ZnWriteResFpuPair
+
+def SKXWriteResGroup211 : SchedWriteRes<[SKXPort23,SKXPort01]> {
+ let Latency = 22;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[SKXWriteResGroup211], (instregex "VPMULLQZ256rm(b?)")>;
+
+def SKXWriteResGroup211_1 : SchedWriteRes<[SKXPort23,SKXPort05]> {
+ let Latency = 22;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[SKXWriteResGroup211_1], (instregex "VPMULLQZrm(b?)")>;
+
+def SKXWriteResGroup215 : SchedWriteRes<[SKXPort0]> {
+ let Latency = 20;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup215], (instregex "DIV_(FPrST0|FST0r|FrST0)")>;
+
+def SKXWriteResGroup216 : SchedWriteRes<[SKXPort0,SKXPort23,SKXFPDivider]> {
+ let Latency = 20;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1,4];
+}
+def : SchedAlias<WriteFDiv64XLd, SKXWriteResGroup216>; // TODO - convert to ZnWriteResFpuPair
+
+def SKXWriteGatherEVEX2 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
+ let Latency = 17;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[SKXWriteGatherEVEX2], (instrs VGATHERQPSZ128rm, VPGATHERQDZ128rm,
+ VGATHERDPDZ128rm, VPGATHERDQZ128rm,
+ VGATHERQPDZ128rm, VPGATHERQQZ128rm)>;
+
+def SKXWriteGatherEVEX4 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,4,1,1];
+}
+def: InstRW<[SKXWriteGatherEVEX4], (instrs VGATHERQPSZ256rm, VPGATHERQDZ256rm,
+ VGATHERQPDZ256rm, VPGATHERQQZ256rm,
+ VGATHERDPSZ128rm, VPGATHERDDZ128rm,
+ VGATHERDPDZ256rm, VPGATHERDQZ256rm)>;
+
+def SKXWriteGatherEVEX8 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
+ let Latency = 21;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,8,1,1];
+}
+def: InstRW<[SKXWriteGatherEVEX8], (instrs VGATHERDPSZ256rm, VPGATHERDDZ256rm,
+ VGATHERDPDZrm, VPGATHERDQZrm,
+ VGATHERQPDZrm, VPGATHERQQZrm,
+ VGATHERQPSZrm, VPGATHERQDZrm)>;
+
+def SKXWriteGatherEVEX16 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
+ let Latency = 25;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,16,1,1];
+}
+def: InstRW<[SKXWriteGatherEVEX16], (instrs VGATHERDPSZrm, VPGATHERDDZrm)>;
+
+def SKXWriteResGroup219 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort6,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
+ let Latency = 20;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,1,1,1,2];
+}
+def: InstRW<[SKXWriteResGroup219], (instrs INSB, INSL, INSW)>;
+
+def SKXWriteResGroup220 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort0156]> {
+ let Latency = 20;
+ let NumMicroOps = 10;
+ let ResourceCycles = [1,2,7];
+}
+def: InstRW<[SKXWriteResGroup220], (instrs MWAITrr)>;
+
+def SKXWriteResGroup222 : SchedWriteRes<[SKXPort0,SKXPort23,SKXFPDivider]> {
+ let Latency = 21;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1,8];
+}
+def : SchedAlias<WriteFDiv64YLd, SKXWriteResGroup222>; // TODO - convert to ZnWriteResFpuPair
+
+def SKXWriteResGroup223 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 22;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup223], (instregex "DIV_F(32|64)m")>;
+
+def SKXWriteResGroupVEX2 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> {
+ let Latency = 18;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[SKXWriteResGroupVEX2], (instrs VGATHERDPDrm, VPGATHERDQrm,
+ VGATHERQPDrm, VPGATHERQQrm,
+ VGATHERQPSrm, VPGATHERQDrm)>;
+
+def SKXWriteResGroupVEX4 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> {
+ let Latency = 20;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,4,1,1];
+}
+def: InstRW<[SKXWriteResGroupVEX4], (instrs VGATHERDPDYrm, VPGATHERDQYrm,
+ VGATHERDPSrm, VPGATHERDDrm,
+ VGATHERQPDYrm, VPGATHERQQYrm,
+ VGATHERQPSYrm, VPGATHERQDYrm)>;
+
+def SKXWriteResGroupVEX8 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> {
+ let Latency = 22;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,8,1,1];
+}
+def: InstRW<[SKXWriteResGroupVEX8], (instrs VGATHERDPSYrm, VPGATHERDDYrm)>;
+
+def SKXWriteResGroup225 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort015]> {
+ let Latency = 22;
+ let NumMicroOps = 14;
+ let ResourceCycles = [5,5,4];
+}
+def: InstRW<[SKXWriteResGroup225], (instregex "VPCONFLICTDZ128rr",
+ "VPCONFLICTQZ256rr")>;
+
+def SKXWriteResGroup228 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
+ let Latency = 23;
+ let NumMicroOps = 19;
+ let ResourceCycles = [2,1,4,1,1,4,6];
+}
+def: InstRW<[SKXWriteResGroup228], (instrs CMPXCHG16B)>;
+
+def SKXWriteResGroup233 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
+ let Latency = 25;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup233], (instregex "DIV_FI(16|32)m")>;
+
+def SKXWriteResGroup239 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 27;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup239], (instregex "DIVR_F(32|64)m")>;
+
+def SKXWriteResGroup242 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23,SKXPort015]> {
+ let Latency = 29;
+ let NumMicroOps = 15;
+ let ResourceCycles = [5,5,1,4];
+}
+def: InstRW<[SKXWriteResGroup242], (instregex "VPCONFLICTQZ256rm(b?)")>;
+
+def SKXWriteResGroup243 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
+ let Latency = 30;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup243], (instregex "DIVR_FI(16|32)m")>;
+
+def SKXWriteResGroup247 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort23,SKXPort06,SKXPort0156]> {
+ let Latency = 35;
+ let NumMicroOps = 23;
+ let ResourceCycles = [1,5,3,4,10];
+}
+def: InstRW<[SKXWriteResGroup247], (instregex "IN(8|16|32)ri",
+ "IN(8|16|32)rr")>;
+
+def SKXWriteResGroup248 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
+ let Latency = 35;
+ let NumMicroOps = 23;
+ let ResourceCycles = [1,5,2,1,4,10];
+}
+def: InstRW<[SKXWriteResGroup248], (instregex "OUT(8|16|32)ir",
+ "OUT(8|16|32)rr")>;
+
+def SKXWriteResGroup249 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort015]> {
+ let Latency = 37;
+ let NumMicroOps = 21;
+ let ResourceCycles = [9,7,5];
+}
+def: InstRW<[SKXWriteResGroup249], (instregex "VPCONFLICTDZ256rr",
+ "VPCONFLICTQZrr")>;
+
+def SKXWriteResGroup250 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort23,SKXPort0156]> {
+ let Latency = 37;
+ let NumMicroOps = 31;
+ let ResourceCycles = [1,8,1,21];
+}
+def: InstRW<[SKXWriteResGroup250], (instregex "XRSTOR(64)?")>;
+
+def SKXWriteResGroup252 : SchedWriteRes<[SKXPort1,SKXPort4,SKXPort5,SKXPort6,SKXPort23,SKXPort237,SKXPort15,SKXPort0156]> {
+ let Latency = 40;
+ let NumMicroOps = 18;
+ let ResourceCycles = [1,1,2,3,1,1,1,8];
+}
+def: InstRW<[SKXWriteResGroup252], (instrs VMCLEARm)>;
+
+def SKXWriteResGroup253 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort0156]> {
+ let Latency = 41;
+ let NumMicroOps = 39;
+ let ResourceCycles = [1,10,1,1,26];
+}
+def: InstRW<[SKXWriteResGroup253], (instrs XSAVE64)>;
+
+def SKXWriteResGroup254 : SchedWriteRes<[SKXPort5,SKXPort0156]> {
+ let Latency = 42;
+ let NumMicroOps = 22;
+ let ResourceCycles = [2,20];
+}
+def: InstRW<[SKXWriteResGroup254], (instrs RDTSCP)>;
+
+def SKXWriteResGroup255 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort0156]> {
+ let Latency = 42;
+ let NumMicroOps = 40;
+ let ResourceCycles = [1,11,1,1,26];
+}
+def: InstRW<[SKXWriteResGroup255], (instrs XSAVE)>;
+def: InstRW<[SKXWriteResGroup255], (instregex "XSAVEC", "XSAVES", "XSAVEOPT")>;
+
+def SKXWriteResGroup256 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23,SKXPort015]> {
+ let Latency = 44;
+ let NumMicroOps = 22;
+ let ResourceCycles = [9,7,1,5];
+}
+def: InstRW<[SKXWriteResGroup256], (instregex "VPCONFLICTDZ256rm(b?)",
+ "VPCONFLICTQZrm(b?)")>;
+
+def SKXWriteResGroup258 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort05,SKXPort06,SKXPort0156]> {
+ let Latency = 62;
+ let NumMicroOps = 64;
+ let ResourceCycles = [2,8,5,10,39];
+}
+def: InstRW<[SKXWriteResGroup258], (instrs FLDENVm)>;
+
+def SKXWriteResGroup259 : SchedWriteRes<[SKXPort0,SKXPort6,SKXPort23,SKXPort05,SKXPort06,SKXPort15,SKXPort0156]> {
+ let Latency = 63;
+ let NumMicroOps = 88;
+ let ResourceCycles = [4,4,31,1,2,1,45];
+}
+def: InstRW<[SKXWriteResGroup259], (instrs FXRSTOR64)>;
+
+def SKXWriteResGroup260 : SchedWriteRes<[SKXPort0,SKXPort6,SKXPort23,SKXPort05,SKXPort06,SKXPort15,SKXPort0156]> {
+ let Latency = 63;
+ let NumMicroOps = 90;
+ let ResourceCycles = [4,2,33,1,2,1,47];
+}
+def: InstRW<[SKXWriteResGroup260], (instrs FXRSTOR)>;
+
+def SKXWriteResGroup261 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort015]> {
+ let Latency = 67;
+ let NumMicroOps = 35;
+ let ResourceCycles = [17,11,7];
+}
+def: InstRW<[SKXWriteResGroup261], (instregex "VPCONFLICTDZrr")>;
+
+def SKXWriteResGroup262 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23,SKXPort015]> {
+ let Latency = 74;
+ let NumMicroOps = 36;
+ let ResourceCycles = [17,11,1,7];
+}
+def: InstRW<[SKXWriteResGroup262], (instregex "VPCONFLICTDZrm(b?)")>;
+
+def SKXWriteResGroup263 : SchedWriteRes<[SKXPort5,SKXPort05,SKXPort0156]> {
+ let Latency = 75;
+ let NumMicroOps = 15;
+ let ResourceCycles = [6,3,6];
+}
+def: InstRW<[SKXWriteResGroup263], (instrs FNINIT)>;
+
+def SKXWriteResGroup266 : SchedWriteRes<[SKXPort0,SKXPort1,SKXPort4,SKXPort5,SKXPort6,SKXPort237,SKXPort06,SKXPort0156]> {
+ let Latency = 106;
+ let NumMicroOps = 100;
+ let ResourceCycles = [9,1,11,16,1,11,21,30];
+}
+def: InstRW<[SKXWriteResGroup266], (instrs FSTENVm)>;
+
+def SKXWriteResGroup267 : SchedWriteRes<[SKXPort6,SKXPort0156]> {
+ let Latency = 140;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[SKXWriteResGroup267], (instrs PAUSE)>;
+
+def: InstRW<[WriteZero], (instrs CLC)>;
+
+
+// Instruction variants handled by the renamer. These might not need execution
+// ports in certain conditions.
+// See Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
+// section "Skylake Pipeline" > "Register allocation and renaming".
+// These can be investigated with llvm-exegesis, e.g.
+// echo 'pxor %mm0, %mm0' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+// echo 'vxorpd %xmm0, %xmm0, %xmm1' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+
+def SKXWriteZeroLatency : SchedWriteRes<[]> {
+ let Latency = 0;
+}
+
+def SKXWriteZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteALU]>
+]>;
+def : InstRW<[SKXWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
+ XOR32rr, XOR64rr)>;
+
+def SKXWriteFZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogic]>
+]>;
+def : InstRW<[SKXWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr,
+ XORPDrr, VXORPDrr,
+ VXORPSZ128rr,
+ VXORPDZ128rr)>;
+
+def SKXWriteFZeroIdiomY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogicY]>
+]>;
+def : InstRW<[SKXWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
+ VXORPSZ256rr, VXORPDZ256rr)>;
+
+def SKXWriteFZeroIdiomZ : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogicZ]>
+]>;
+def : InstRW<[SKXWriteFZeroIdiomZ], (instrs VXORPSZrr, VXORPDZrr)>;
+
+def SKXWriteVZeroIdiomLogicX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicX]>
+]>;
+def : InstRW<[SKXWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr,
+ VPXORDZ128rr, VPXORQZ128rr)>;
+
+def SKXWriteVZeroIdiomLogicY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicY]>
+]>;
+def : InstRW<[SKXWriteVZeroIdiomLogicY], (instrs VPXORYrr,
+ VPXORDZ256rr, VPXORQZ256rr)>;
+
+def SKXWriteVZeroIdiomLogicZ : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicZ]>
+]>;
+def : InstRW<[SKXWriteVZeroIdiomLogicZ], (instrs VPXORDZrr, VPXORQZrr)>;
+
+def SKXWriteVZeroIdiomALUX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUX]>
+]>;
+def : InstRW<[SKXWriteVZeroIdiomALUX], (instrs PCMPGTBrr, VPCMPGTBrr,
+ PCMPGTDrr, VPCMPGTDrr,
+ PCMPGTWrr, VPCMPGTWrr)>;
+
+def SKXWriteVZeroIdiomALUY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUY]>
+]>;
+def : InstRW<[SKXWriteVZeroIdiomALUY], (instrs VPCMPGTBYrr,
+ VPCMPGTDYrr,
+ VPCMPGTWYrr)>;
+
+def SKXWritePSUB : SchedWriteRes<[SKXPort015]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def SKXWriteVZeroIdiomPSUB : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [SKXWritePSUB]>
+]>;
+
+def : InstRW<[SKXWriteVZeroIdiomPSUB], (instrs PSUBBrr, VPSUBBrr, VPSUBBZ128rr,
+ PSUBDrr, VPSUBDrr, VPSUBDZ128rr,
+ PSUBQrr, VPSUBQrr, VPSUBQZ128rr,
+ PSUBWrr, VPSUBWrr, VPSUBWZ128rr,
+ VPSUBBYrr, VPSUBBZ256rr,
+ VPSUBDYrr, VPSUBDZ256rr,
+ VPSUBQYrr, VPSUBQZ256rr,
+ VPSUBWYrr, VPSUBWZ256rr,
+ VPSUBBZrr,
+ VPSUBDZrr,
+ VPSUBQZrr,
+ VPSUBWZrr)>;
+
+def SKXWritePCMPGTQ : SchedWriteRes<[SKXPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def SKXWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [SKXWritePCMPGTQ]>
+]>;
+def : InstRW<[SKXWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr,
+ VPCMPGTQYrr)>;
+
+
+// CMOVs that use both Z and C flag require an extra uop.
+def SKXWriteCMOVA_CMOVBErr : SchedWriteRes<[SKXPort06]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def SKXWriteCMOVA_CMOVBErm : SchedWriteRes<[SKXPort23,SKXPort06]> {
+ let Latency = 7;
+ let ResourceCycles = [1,2];
+ let NumMicroOps = 3;
+}
+
+def SKXCMOVA_CMOVBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArr_Or_CMOVBErr>, [SKXWriteCMOVA_CMOVBErr]>,
+ SchedVar<NoSchedPred, [WriteCMOV]>
+]>;
+
+def SKXCMOVA_CMOVBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArm_Or_CMOVBErm>, [SKXWriteCMOVA_CMOVBErm]>,
+ SchedVar<NoSchedPred, [WriteCMOV.Folded]>
+]>;
+
+def : InstRW<[SKXCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>;
+def : InstRW<[SKXCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>;
+
+// SETCCs that use both Z and C flag require an extra uop.
+def SKXWriteSETA_SETBEr : SchedWriteRes<[SKXPort06]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def SKXWriteSETA_SETBEm : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06]> {
+ let Latency = 3;
+ let ResourceCycles = [1,1,2];
+ let NumMicroOps = 4;
+}
+
+def SKXSETA_SETBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAr_Or_SETBEr>, [SKXWriteSETA_SETBEr]>,
+ SchedVar<NoSchedPred, [WriteSETCC]>
+]>;
+
+def SKXSETA_SETBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAm_Or_SETBEm>, [SKXWriteSETA_SETBEm]>,
+ SchedVar<NoSchedPred, [WriteSETCCStore]>
+]>;
+
+def : InstRW<[SKXSETA_SETBErr], (instrs SETCCr)>;
+def : InstRW<[SKXSETA_SETBErm], (instrs SETCCm)>;
+
+} // SchedModel
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86Schedule.td b/contrib/llvm-project/llvm/lib/Target/X86/X86Schedule.td
new file mode 100644
index 000000000000..f204d6622119
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86Schedule.td
@@ -0,0 +1,731 @@
+//===-- X86Schedule.td - X86 Scheduling Definitions --------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// InstrSchedModel annotations for out-of-order CPUs.
+
+// Instructions with folded loads need to read the memory operand immediately,
+// but other register operands don't have to be read until the load is ready.
+// These operands are marked with ReadAfterLd.
+def ReadAfterLd : SchedRead;
+def ReadAfterVecLd : SchedRead;
+def ReadAfterVecXLd : SchedRead;
+def ReadAfterVecYLd : SchedRead;
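+
+// As a minimal illustrative sketch of how a processor model attaches one of
+// these reads (the write name and regex below are borrowed from the
+// SkylakeServer entries earlier in this patch, not new definitions):
+//   def: InstRW<[SKXWriteResGroup95, ReadAfterVecXLd],
+//               (instregex "VPADD(B|D|Q|W)Z128rm(b?)")>;
+// The write models the folded load, while ReadAfterVecXLd lets the register
+// sources stay unread until that load is ready.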
+
+// Instructions that move data between general purpose registers and vector
+// registers may be subject to extra latency due to data bypass delays.
+// This SchedRead describes a bypass delay caused by data being moved from the
+// integer unit to the floating point unit.
+def ReadInt2Fpu : SchedRead;
+
+// Instructions with both a load and a store folded are modeled as a folded
+// load + WriteRMW.
+def WriteRMW : SchedWrite;
+
+// Helper to set SchedWrite ExePorts/Latency/ResourceCycles/NumMicroOps.
+multiclass X86WriteRes<SchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res, int UOps> {
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+}
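+
+// Illustrative sketch only (the write, port and numbers are stand-ins for
+// whatever a particular processor model defines): a model could instantiate
+// the helper as
+//   defm : X86WriteRes<WriteLoad, [SKXPort23], 5, [1], 1>;
+// which expands to a WriteRes on the load port with Latency = 5,
+// ResourceCycles = [1] and NumMicroOps = 1.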
+
+// Most instructions can fold loads, so almost every SchedWrite comes in two
+// variants: With and without a folded load.
+// An X86FoldableSchedWrite holds a reference to the corresponding SchedWrite
+// with a folded load.
+class X86FoldableSchedWrite : SchedWrite {
+ // The SchedWrite to use when a load is folded into the instruction.
+ SchedWrite Folded;
+ // The SchedRead to tag register operands that don't need to be ready
+ // until the folded load has completed.
+ SchedRead ReadAfterFold;
+}
+
+// Multiclass that produces a linked pair of SchedWrites.
+multiclass X86SchedWritePair<SchedRead ReadAfter = ReadAfterLd> {
+ // Register-Memory operation.
+ def Ld : SchedWrite;
+ // Register-Register operation.
+ def NAME : X86FoldableSchedWrite {
+ let Folded = !cast<SchedWrite>(NAME#"Ld");
+ let ReadAfterFold = ReadAfter;
+ }
+}
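+
+// For example, the declaration below
+//
+//   defm WriteALU : X86SchedWritePair;
+//
+// produces two records: a plain SchedWrite named WriteALULd for the
+// folded-load form, and an X86FoldableSchedWrite named WriteALU whose Folded
+// write is WriteALULd and whose ReadAfterFold read is ReadAfterLd.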
+
+// Helpers to mark SchedWrites as unsupported.
+multiclass X86WriteResUnsupported<SchedWrite SchedRW> {
+ let Unsupported = 1 in {
+ def : WriteRes<SchedRW, []>;
+ }
+}
+multiclass X86WriteResPairUnsupported<X86FoldableSchedWrite SchedRW> {
+ let Unsupported = 1 in {
+ def : WriteRes<SchedRW, []>;
+ def : WriteRes<SchedRW.Folded, []>;
+ }
+}
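+
+// CPU models use these helpers to mark writes their hardware cannot execute;
+// for instance, the Atom model in X86ScheduleAtom.td declares:
+//
+//   defm : X86WriteResPairUnsupported<WriteFMAZ>;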
+
+// Class that wraps an X86FoldableSchedWrite for each vector width.
+class X86SchedWriteWidths<X86FoldableSchedWrite sScl,
+ X86FoldableSchedWrite s128,
+ X86FoldableSchedWrite s256,
+ X86FoldableSchedWrite s512> {
+ X86FoldableSchedWrite Scl = sScl; // Scalar float/double operations.
+ X86FoldableSchedWrite MMX = sScl; // MMX operations.
+ X86FoldableSchedWrite XMM = s128; // XMM operations.
+ X86FoldableSchedWrite YMM = s256; // YMM operations.
+ X86FoldableSchedWrite ZMM = s512; // ZMM operations.
+}
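+
+// A width wrapper is instantiated once per operation (e.g. SchedWriteFAdd
+// below bundles WriteFAdd/WriteFAddX/WriteFAddY/WriteFAddZ), letting
+// instruction definitions pick a width generically. Illustrative sketch of a
+// hypothetical instruction pattern (not taken from this file):
+//
+//   ... Sched<[SchedWriteFAdd.XMM]> ...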
+
+// Class that wraps an X86SchedWriteWidths for each fp vector type.
+class X86SchedWriteSizes<X86SchedWriteWidths sPS,
+ X86SchedWriteWidths sPD> {
+ X86SchedWriteWidths PS = sPS;
+ X86SchedWriteWidths PD = sPD;
+}
+
+// Class that wraps a move/load/store triple for a vector width.
+class X86SchedWriteMoveLS<SchedWrite MoveRR,
+ SchedWrite LoadRM,
+ SchedWrite StoreMR> {
+ SchedWrite RR = MoveRR;
+ SchedWrite RM = LoadRM;
+ SchedWrite MR = StoreMR;
+}
+
+// Class that wraps masked load/store writes for a vector width.
+class X86SchedWriteMaskMove<SchedWrite LoadRM, SchedWrite StoreMR> {
+ SchedWrite RM = LoadRM;
+ SchedWrite MR = StoreMR;
+}
+
+// Class that wraps an X86SchedWriteMoveLS for each vector width.
+class X86SchedWriteMoveLSWidths<X86SchedWriteMoveLS sScl,
+ X86SchedWriteMoveLS s128,
+ X86SchedWriteMoveLS s256,
+ X86SchedWriteMoveLS s512> {
+ X86SchedWriteMoveLS Scl = sScl; // Scalar float/double operations.
+ X86SchedWriteMoveLS MMX = sScl; // MMX operations.
+ X86SchedWriteMoveLS XMM = s128; // XMM operations.
+ X86SchedWriteMoveLS YMM = s256; // YMM operations.
+ X86SchedWriteMoveLS ZMM = s512; // ZMM operations.
+}
+
+// Loads, stores, and moves, not folded with other operations.
+def WriteLoad : SchedWrite;
+def WriteStore : SchedWrite;
+def WriteStoreNT : SchedWrite;
+def WriteMove : SchedWrite;
+def WriteCopy : WriteSequence<[WriteLoad, WriteStore]>; // mem->mem copy
+
+// Arithmetic.
+defm WriteALU : X86SchedWritePair; // Simple integer ALU op.
+defm WriteADC : X86SchedWritePair; // Integer ALU + flags op.
+def WriteALURMW : WriteSequence<[WriteALULd, WriteRMW]>;
+def WriteADCRMW : WriteSequence<[WriteADCLd, WriteRMW]>;
+def WriteLEA : SchedWrite; // LEA instructions can't fold loads.
+
+// Integer multiplication
+defm WriteIMul8 : X86SchedWritePair; // Integer 8-bit multiplication.
+defm WriteIMul16 : X86SchedWritePair; // Integer 16-bit multiplication.
+defm WriteIMul16Imm : X86SchedWritePair; // Integer 16-bit multiplication by immediate.
+defm WriteIMul16Reg : X86SchedWritePair; // Integer 16-bit multiplication by register.
+defm WriteIMul32 : X86SchedWritePair; // Integer 32-bit multiplication.
+defm WriteIMul32Imm : X86SchedWritePair; // Integer 32-bit multiplication by immediate.
+defm WriteIMul32Reg : X86SchedWritePair; // Integer 32-bit multiplication by register.
+defm WriteIMul64 : X86SchedWritePair; // Integer 64-bit multiplication.
+defm WriteIMul64Imm : X86SchedWritePair; // Integer 64-bit multiplication by immediate.
+defm WriteIMul64Reg : X86SchedWritePair; // Integer 64-bit multiplication by register.
+def WriteIMulH : SchedWrite; // Integer multiplication, high part.
+
+def WriteBSWAP32 : SchedWrite; // Byte Order (Endianness) 32-bit Swap.
+def WriteBSWAP64 : SchedWrite; // Byte Order (Endianness) 64-bit Swap.
+defm WriteCMPXCHG : X86SchedWritePair; // Compare and set, compare and swap.
+def WriteCMPXCHGRMW : SchedWrite; // Compare and set, compare and swap.
+def WriteXCHG : SchedWrite; // Compare+Exchange - TODO RMW support.
+
+// Integer division.
+defm WriteDiv8 : X86SchedWritePair;
+defm WriteDiv16 : X86SchedWritePair;
+defm WriteDiv32 : X86SchedWritePair;
+defm WriteDiv64 : X86SchedWritePair;
+defm WriteIDiv8 : X86SchedWritePair;
+defm WriteIDiv16 : X86SchedWritePair;
+defm WriteIDiv32 : X86SchedWritePair;
+defm WriteIDiv64 : X86SchedWritePair;
+
+defm WriteBSF : X86SchedWritePair; // Bit scan forward.
+defm WriteBSR : X86SchedWritePair; // Bit scan reverse.
+defm WritePOPCNT : X86SchedWritePair; // Bit population count.
+defm WriteLZCNT : X86SchedWritePair; // Leading zero count.
+defm WriteTZCNT : X86SchedWritePair; // Trailing zero count.
+defm WriteCMOV : X86SchedWritePair; // Conditional move.
+def WriteFCMOV : SchedWrite; // X87 conditional move.
+def WriteSETCC : SchedWrite; // Set register based on condition code.
+def WriteSETCCStore : SchedWrite;
+def WriteLAHFSAHF : SchedWrite; // Load/Store flags in AH.
+
+def WriteBitTest : SchedWrite; // Bit Test
+def WriteBitTestImmLd : SchedWrite;
+def WriteBitTestRegLd : SchedWrite;
+
+def WriteBitTestSet : SchedWrite; // Bit Test + Set
+def WriteBitTestSetImmLd : SchedWrite;
+def WriteBitTestSetRegLd : SchedWrite;
+def WriteBitTestSetImmRMW : WriteSequence<[WriteBitTestSetImmLd, WriteRMW]>;
+def WriteBitTestSetRegRMW : WriteSequence<[WriteBitTestSetRegLd, WriteRMW]>;
+
+// Integer shifts and rotates.
+defm WriteShift : X86SchedWritePair;
+defm WriteShiftCL : X86SchedWritePair;
+defm WriteRotate : X86SchedWritePair;
+defm WriteRotateCL : X86SchedWritePair;
+
+// Double shift instructions.
+def WriteSHDrri : SchedWrite;
+def WriteSHDrrcl : SchedWrite;
+def WriteSHDmri : SchedWrite;
+def WriteSHDmrcl : SchedWrite;
+
+// BMI1 BEXTR/BLS, BMI2 BZHI
+defm WriteBEXTR : X86SchedWritePair;
+defm WriteBLS : X86SchedWritePair;
+defm WriteBZHI : X86SchedWritePair;
+
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+def WriteZero : SchedWrite;
+
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+defm WriteJump : X86SchedWritePair;
+
+// Floating point. This covers both scalar and vector operations.
+def WriteFLD0 : SchedWrite;
+def WriteFLD1 : SchedWrite;
+def WriteFLDC : SchedWrite;
+def WriteFLoad : SchedWrite;
+def WriteFLoadX : SchedWrite;
+def WriteFLoadY : SchedWrite;
+def WriteFMaskedLoad : SchedWrite;
+def WriteFMaskedLoadY : SchedWrite;
+def WriteFStore : SchedWrite;
+def WriteFStoreX : SchedWrite;
+def WriteFStoreY : SchedWrite;
+def WriteFStoreNT : SchedWrite;
+def WriteFStoreNTX : SchedWrite;
+def WriteFStoreNTY : SchedWrite;
+
+def WriteFMaskedStore32 : SchedWrite;
+def WriteFMaskedStore64 : SchedWrite;
+def WriteFMaskedStore32Y : SchedWrite;
+def WriteFMaskedStore64Y : SchedWrite;
+
+def WriteFMove : SchedWrite;
+def WriteFMoveX : SchedWrite;
+def WriteFMoveY : SchedWrite;
+
+defm WriteFAdd : X86SchedWritePair<ReadAfterVecLd>; // Floating point add/sub.
+defm WriteFAddX : X86SchedWritePair<ReadAfterVecXLd>; // Floating point add/sub (XMM).
+defm WriteFAddY : X86SchedWritePair<ReadAfterVecYLd>; // Floating point add/sub (YMM).
+defm WriteFAddZ : X86SchedWritePair<ReadAfterVecYLd>; // Floating point add/sub (ZMM).
+defm WriteFAdd64 : X86SchedWritePair<ReadAfterVecLd>; // Floating point double add/sub.
+defm WriteFAdd64X : X86SchedWritePair<ReadAfterVecXLd>; // Floating point double add/sub (XMM).
+defm WriteFAdd64Y : X86SchedWritePair<ReadAfterVecYLd>; // Floating point double add/sub (YMM).
+defm WriteFAdd64Z : X86SchedWritePair<ReadAfterVecYLd>; // Floating point double add/sub (ZMM).
+defm WriteFCmp : X86SchedWritePair<ReadAfterVecLd>; // Floating point compare.
+defm WriteFCmpX : X86SchedWritePair<ReadAfterVecXLd>; // Floating point compare (XMM).
+defm WriteFCmpY : X86SchedWritePair<ReadAfterVecYLd>; // Floating point compare (YMM).
+defm WriteFCmpZ : X86SchedWritePair<ReadAfterVecYLd>; // Floating point compare (ZMM).
+defm WriteFCmp64 : X86SchedWritePair<ReadAfterVecLd>; // Floating point double compare.
+defm WriteFCmp64X : X86SchedWritePair<ReadAfterVecXLd>; // Floating point double compare (XMM).
+defm WriteFCmp64Y : X86SchedWritePair<ReadAfterVecYLd>; // Floating point double compare (YMM).
+defm WriteFCmp64Z : X86SchedWritePair<ReadAfterVecYLd>; // Floating point double compare (ZMM).
+defm WriteFCom : X86SchedWritePair<ReadAfterVecLd>; // Floating point compare to flags (X87).
+defm WriteFComX : X86SchedWritePair<ReadAfterVecLd>; // Floating point compare to flags (SSE).
+defm WriteFMul : X86SchedWritePair<ReadAfterVecLd>; // Floating point multiplication.
+defm WriteFMulX : X86SchedWritePair<ReadAfterVecXLd>; // Floating point multiplication (XMM).
+defm WriteFMulY : X86SchedWritePair<ReadAfterVecYLd>; // Floating point multiplication (YMM).
+defm WriteFMulZ : X86SchedWritePair<ReadAfterVecYLd>; // Floating point multiplication (ZMM).
+defm WriteFMul64 : X86SchedWritePair<ReadAfterVecLd>; // Floating point double multiplication.
+defm WriteFMul64X : X86SchedWritePair<ReadAfterVecXLd>; // Floating point double multiplication (XMM).
+defm WriteFMul64Y : X86SchedWritePair<ReadAfterVecYLd>; // Floating point double multiplication (YMM).
+defm WriteFMul64Z : X86SchedWritePair<ReadAfterVecYLd>; // Floating point double multiplication (ZMM).
+defm WriteFDiv : X86SchedWritePair<ReadAfterVecLd>; // Floating point division.
+defm WriteFDivX : X86SchedWritePair<ReadAfterVecXLd>; // Floating point division (XMM).
+defm WriteFDivY : X86SchedWritePair<ReadAfterVecYLd>; // Floating point division (YMM).
+defm WriteFDivZ : X86SchedWritePair<ReadAfterVecYLd>; // Floating point division (ZMM).
+defm WriteFDiv64 : X86SchedWritePair<ReadAfterVecLd>; // Floating point double division.
+defm WriteFDiv64X : X86SchedWritePair<ReadAfterVecXLd>; // Floating point double division (XMM).
+defm WriteFDiv64Y : X86SchedWritePair<ReadAfterVecYLd>; // Floating point double division (YMM).
+defm WriteFDiv64Z : X86SchedWritePair<ReadAfterVecYLd>; // Floating point double division (ZMM).
+defm WriteFSqrt : X86SchedWritePair<ReadAfterVecLd>; // Floating point square root.
+defm WriteFSqrtX : X86SchedWritePair<ReadAfterVecXLd>; // Floating point square root (XMM).
+defm WriteFSqrtY : X86SchedWritePair<ReadAfterVecYLd>; // Floating point square root (YMM).
+defm WriteFSqrtZ : X86SchedWritePair<ReadAfterVecYLd>; // Floating point square root (ZMM).
+defm WriteFSqrt64 : X86SchedWritePair<ReadAfterVecLd>; // Floating point double square root.
+defm WriteFSqrt64X : X86SchedWritePair<ReadAfterVecXLd>; // Floating point double square root (XMM).
+defm WriteFSqrt64Y : X86SchedWritePair<ReadAfterVecYLd>; // Floating point double square root (YMM).
+defm WriteFSqrt64Z : X86SchedWritePair<ReadAfterVecYLd>; // Floating point double square root (ZMM).
+defm WriteFSqrt80 : X86SchedWritePair<ReadAfterVecLd>; // Floating point long double square root.
+defm WriteFRcp : X86SchedWritePair<ReadAfterVecLd>; // Floating point reciprocal estimate.
+defm WriteFRcpX : X86SchedWritePair<ReadAfterVecXLd>; // Floating point reciprocal estimate (XMM).
+defm WriteFRcpY : X86SchedWritePair<ReadAfterVecYLd>; // Floating point reciprocal estimate (YMM).
+defm WriteFRcpZ : X86SchedWritePair<ReadAfterVecYLd>; // Floating point reciprocal estimate (ZMM).
+defm WriteFRsqrt : X86SchedWritePair<ReadAfterVecLd>; // Floating point reciprocal square root estimate.
+defm WriteFRsqrtX: X86SchedWritePair<ReadAfterVecXLd>; // Floating point reciprocal square root estimate (XMM).
+defm WriteFRsqrtY: X86SchedWritePair<ReadAfterVecYLd>; // Floating point reciprocal square root estimate (YMM).
+defm WriteFRsqrtZ: X86SchedWritePair<ReadAfterVecYLd>; // Floating point reciprocal square root estimate (ZMM).
+defm WriteFMA : X86SchedWritePair<ReadAfterVecLd>; // Fused Multiply Add.
+defm WriteFMAX : X86SchedWritePair<ReadAfterVecXLd>; // Fused Multiply Add (XMM).
+defm WriteFMAY : X86SchedWritePair<ReadAfterVecYLd>; // Fused Multiply Add (YMM).
+defm WriteFMAZ : X86SchedWritePair<ReadAfterVecYLd>; // Fused Multiply Add (ZMM).
+defm WriteDPPD : X86SchedWritePair<ReadAfterVecXLd>; // Floating point double dot product.
+defm WriteDPPS : X86SchedWritePair<ReadAfterVecXLd>; // Floating point single dot product.
+defm WriteDPPSY : X86SchedWritePair<ReadAfterVecYLd>; // Floating point single dot product (YMM).
+defm WriteDPPSZ : X86SchedWritePair<ReadAfterVecYLd>; // Floating point single dot product (ZMM).
+defm WriteFSign : X86SchedWritePair<ReadAfterVecLd>; // Floating point fabs/fchs.
+defm WriteFRnd : X86SchedWritePair<ReadAfterVecXLd>; // Floating point rounding.
+defm WriteFRndY : X86SchedWritePair<ReadAfterVecYLd>; // Floating point rounding (YMM).
+defm WriteFRndZ : X86SchedWritePair<ReadAfterVecYLd>; // Floating point rounding (ZMM).
+defm WriteFLogic : X86SchedWritePair<ReadAfterVecXLd>; // Floating point and/or/xor logicals.
+defm WriteFLogicY : X86SchedWritePair<ReadAfterVecYLd>; // Floating point and/or/xor logicals (YMM).
+defm WriteFLogicZ : X86SchedWritePair<ReadAfterVecYLd>; // Floating point and/or/xor logicals (ZMM).
+defm WriteFTest : X86SchedWritePair<ReadAfterVecXLd>; // Floating point TEST instructions.
+defm WriteFTestY : X86SchedWritePair<ReadAfterVecYLd>; // Floating point TEST instructions (YMM).
+defm WriteFTestZ : X86SchedWritePair<ReadAfterVecYLd>; // Floating point TEST instructions (ZMM).
+defm WriteFShuffle : X86SchedWritePair<ReadAfterVecXLd>; // Floating point vector shuffles.
+defm WriteFShuffleY : X86SchedWritePair<ReadAfterVecYLd>; // Floating point vector shuffles (YMM).
+defm WriteFShuffleZ : X86SchedWritePair<ReadAfterVecYLd>; // Floating point vector shuffles (ZMM).
+defm WriteFVarShuffle : X86SchedWritePair<ReadAfterVecXLd>; // Floating point vector variable shuffles.
+defm WriteFVarShuffleY : X86SchedWritePair<ReadAfterVecYLd>; // Floating point vector variable shuffles (YMM).
+defm WriteFVarShuffleZ : X86SchedWritePair<ReadAfterVecYLd>; // Floating point vector variable shuffles (ZMM).
+defm WriteFBlend : X86SchedWritePair<ReadAfterVecXLd>; // Floating point vector blends.
+defm WriteFBlendY : X86SchedWritePair<ReadAfterVecYLd>; // Floating point vector blends (YMM).
+defm WriteFBlendZ : X86SchedWritePair<ReadAfterVecYLd>; // Floating point vector blends (ZMM).
+defm WriteFVarBlend : X86SchedWritePair<ReadAfterVecXLd>; // Fp vector variable blends.
+defm WriteFVarBlendY : X86SchedWritePair<ReadAfterVecYLd>; // Fp vector variable blends (YMM).
+defm WriteFVarBlendZ : X86SchedWritePair<ReadAfterVecYLd>; // Fp vector variable blends (ZMM).
+
+// FMA Scheduling helper class.
+class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
+
+// Horizontal Add/Sub (float and integer)
+defm WriteFHAdd : X86SchedWritePair<ReadAfterVecXLd>;
+defm WriteFHAddY : X86SchedWritePair<ReadAfterVecYLd>;
+defm WriteFHAddZ : X86SchedWritePair<ReadAfterVecYLd>;
+defm WritePHAdd : X86SchedWritePair<ReadAfterVecLd>;
+defm WritePHAddX : X86SchedWritePair<ReadAfterVecXLd>;
+defm WritePHAddY : X86SchedWritePair<ReadAfterVecYLd>;
+defm WritePHAddZ : X86SchedWritePair<ReadAfterVecYLd>;
+
+// Vector integer operations.
+def WriteVecLoad : SchedWrite;
+def WriteVecLoadX : SchedWrite;
+def WriteVecLoadY : SchedWrite;
+def WriteVecLoadNT : SchedWrite;
+def WriteVecLoadNTY : SchedWrite;
+def WriteVecMaskedLoad : SchedWrite;
+def WriteVecMaskedLoadY : SchedWrite;
+def WriteVecStore : SchedWrite;
+def WriteVecStoreX : SchedWrite;
+def WriteVecStoreY : SchedWrite;
+def WriteVecStoreNT : SchedWrite;
+def WriteVecStoreNTY : SchedWrite;
+def WriteVecMaskedStore32 : SchedWrite;
+def WriteVecMaskedStore64 : SchedWrite;
+def WriteVecMaskedStore32Y : SchedWrite;
+def WriteVecMaskedStore64Y : SchedWrite;
+def WriteVecMove : SchedWrite;
+def WriteVecMoveX : SchedWrite;
+def WriteVecMoveY : SchedWrite;
+def WriteVecMoveToGpr : SchedWrite;
+def WriteVecMoveFromGpr : SchedWrite;
+
+defm WriteVecALU : X86SchedWritePair<ReadAfterVecLd>; // Vector integer ALU op, no logicals.
+defm WriteVecALUX : X86SchedWritePair<ReadAfterVecXLd>; // Vector integer ALU op, no logicals (XMM).
+defm WriteVecALUY : X86SchedWritePair<ReadAfterVecYLd>; // Vector integer ALU op, no logicals (YMM).
+defm WriteVecALUZ : X86SchedWritePair<ReadAfterVecYLd>; // Vector integer ALU op, no logicals (ZMM).
+defm WriteVecLogic : X86SchedWritePair<ReadAfterVecLd>; // Vector integer and/or/xor logicals.
+defm WriteVecLogicX : X86SchedWritePair<ReadAfterVecXLd>; // Vector integer and/or/xor logicals (XMM).
+defm WriteVecLogicY : X86SchedWritePair<ReadAfterVecYLd>; // Vector integer and/or/xor logicals (YMM).
+defm WriteVecLogicZ : X86SchedWritePair<ReadAfterVecYLd>; // Vector integer and/or/xor logicals (ZMM).
+defm WriteVecTest : X86SchedWritePair<ReadAfterVecXLd>; // Vector integer TEST instructions.
+defm WriteVecTestY : X86SchedWritePair<ReadAfterVecYLd>; // Vector integer TEST instructions (YMM).
+defm WriteVecTestZ : X86SchedWritePair<ReadAfterVecYLd>; // Vector integer TEST instructions (ZMM).
+defm WriteVecShift : X86SchedWritePair<ReadAfterVecLd>; // Vector integer shifts (default).
+defm WriteVecShiftX : X86SchedWritePair<ReadAfterVecXLd>; // Vector integer shifts (XMM).
+defm WriteVecShiftY : X86SchedWritePair<ReadAfterVecYLd>; // Vector integer shifts (YMM).
+defm WriteVecShiftZ : X86SchedWritePair<ReadAfterVecYLd>; // Vector integer shifts (ZMM).
+defm WriteVecShiftImm : X86SchedWritePair<ReadAfterVecLd>; // Vector integer immediate shifts (default).
+defm WriteVecShiftImmX: X86SchedWritePair<ReadAfterVecXLd>; // Vector integer immediate shifts (XMM).
+defm WriteVecShiftImmY: X86SchedWritePair<ReadAfterVecYLd>; // Vector integer immediate shifts (YMM).
+defm WriteVecShiftImmZ: X86SchedWritePair<ReadAfterVecYLd>; // Vector integer immediate shifts (ZMM).
+defm WriteVecIMul : X86SchedWritePair<ReadAfterVecLd>; // Vector integer multiply (default).
+defm WriteVecIMulX : X86SchedWritePair<ReadAfterVecXLd>; // Vector integer multiply (XMM).
+defm WriteVecIMulY : X86SchedWritePair<ReadAfterVecYLd>; // Vector integer multiply (YMM).
+defm WriteVecIMulZ : X86SchedWritePair<ReadAfterVecYLd>; // Vector integer multiply (ZMM).
+defm WritePMULLD : X86SchedWritePair<ReadAfterVecXLd>; // Vector PMULLD.
+defm WritePMULLDY : X86SchedWritePair<ReadAfterVecYLd>; // Vector PMULLD (YMM).
+defm WritePMULLDZ : X86SchedWritePair<ReadAfterVecYLd>; // Vector PMULLD (ZMM).
+defm WriteShuffle : X86SchedWritePair<ReadAfterVecLd>; // Vector shuffles.
+defm WriteShuffleX : X86SchedWritePair<ReadAfterVecXLd>; // Vector shuffles (XMM).
+defm WriteShuffleY : X86SchedWritePair<ReadAfterVecYLd>; // Vector shuffles (YMM).
+defm WriteShuffleZ : X86SchedWritePair<ReadAfterVecYLd>; // Vector shuffles (ZMM).
+defm WriteVarShuffle : X86SchedWritePair<ReadAfterVecLd>; // Vector variable shuffles.
+defm WriteVarShuffleX : X86SchedWritePair<ReadAfterVecXLd>; // Vector variable shuffles (XMM).
+defm WriteVarShuffleY : X86SchedWritePair<ReadAfterVecYLd>; // Vector variable shuffles (YMM).
+defm WriteVarShuffleZ : X86SchedWritePair<ReadAfterVecYLd>; // Vector variable shuffles (ZMM).
+defm WriteBlend : X86SchedWritePair<ReadAfterVecXLd>; // Vector blends.
+defm WriteBlendY : X86SchedWritePair<ReadAfterVecYLd>; // Vector blends (YMM).
+defm WriteBlendZ : X86SchedWritePair<ReadAfterVecYLd>; // Vector blends (ZMM).
+defm WriteVarBlend : X86SchedWritePair<ReadAfterVecXLd>; // Vector variable blends.
+defm WriteVarBlendY : X86SchedWritePair<ReadAfterVecYLd>; // Vector variable blends (YMM).
+defm WriteVarBlendZ : X86SchedWritePair<ReadAfterVecYLd>; // Vector variable blends (ZMM).
+defm WritePSADBW : X86SchedWritePair<ReadAfterVecLd>; // Vector PSADBW.
+defm WritePSADBWX : X86SchedWritePair<ReadAfterVecXLd>; // Vector PSADBW (XMM).
+defm WritePSADBWY : X86SchedWritePair<ReadAfterVecYLd>; // Vector PSADBW (YMM).
+defm WritePSADBWZ : X86SchedWritePair<ReadAfterVecYLd>; // Vector PSADBW (ZMM).
+defm WriteMPSAD : X86SchedWritePair<ReadAfterVecXLd>; // Vector MPSAD.
+defm WriteMPSADY : X86SchedWritePair<ReadAfterVecYLd>; // Vector MPSAD (YMM).
+defm WriteMPSADZ : X86SchedWritePair<ReadAfterVecYLd>; // Vector MPSAD (ZMM).
+defm WritePHMINPOS : X86SchedWritePair<ReadAfterVecXLd>; // Vector PHMINPOS.
+
+// Vector insert/extract operations.
+defm WriteVecInsert : X86SchedWritePair; // Insert gpr to vector element.
+def WriteVecExtract : SchedWrite; // Extract vector element to gpr.
+def WriteVecExtractSt : SchedWrite; // Extract vector element and store.
+
+// MOVMSK operations.
+def WriteFMOVMSK : SchedWrite;
+def WriteVecMOVMSK : SchedWrite;
+def WriteVecMOVMSKY : SchedWrite;
+def WriteMMXMOVMSK : SchedWrite;
+
+// Conversion between integer and float.
+defm WriteCvtSD2I : X86SchedWritePair<ReadAfterVecLd>; // Double -> Integer.
+defm WriteCvtPD2I : X86SchedWritePair<ReadAfterVecXLd>; // Double -> Integer (XMM).
+defm WriteCvtPD2IY : X86SchedWritePair<ReadAfterVecYLd>; // Double -> Integer (YMM).
+defm WriteCvtPD2IZ : X86SchedWritePair<ReadAfterVecYLd>; // Double -> Integer (ZMM).
+
+defm WriteCvtSS2I : X86SchedWritePair<ReadAfterVecLd>; // Float -> Integer.
+defm WriteCvtPS2I : X86SchedWritePair<ReadAfterVecXLd>; // Float -> Integer (XMM).
+defm WriteCvtPS2IY : X86SchedWritePair<ReadAfterVecYLd>; // Float -> Integer (YMM).
+defm WriteCvtPS2IZ : X86SchedWritePair<ReadAfterVecYLd>; // Float -> Integer (ZMM).
+
+defm WriteCvtI2SD : X86SchedWritePair<ReadAfterVecLd>; // Integer -> Double.
+defm WriteCvtI2PD : X86SchedWritePair<ReadAfterVecXLd>; // Integer -> Double (XMM).
+defm WriteCvtI2PDY : X86SchedWritePair<ReadAfterVecYLd>; // Integer -> Double (YMM).
+defm WriteCvtI2PDZ : X86SchedWritePair<ReadAfterVecYLd>; // Integer -> Double (ZMM).
+
+defm WriteCvtI2SS : X86SchedWritePair<ReadAfterVecLd>; // Integer -> Float.
+defm WriteCvtI2PS : X86SchedWritePair<ReadAfterVecXLd>; // Integer -> Float (XMM).
+defm WriteCvtI2PSY : X86SchedWritePair<ReadAfterVecYLd>; // Integer -> Float (YMM).
+defm WriteCvtI2PSZ : X86SchedWritePair<ReadAfterVecYLd>; // Integer -> Float (ZMM).
+
+defm WriteCvtSS2SD : X86SchedWritePair<ReadAfterVecLd>; // Float -> Double size conversion.
+defm WriteCvtPS2PD : X86SchedWritePair<ReadAfterVecXLd>; // Float -> Double size conversion (XMM).
+defm WriteCvtPS2PDY : X86SchedWritePair<ReadAfterVecYLd>; // Float -> Double size conversion (YMM).
+defm WriteCvtPS2PDZ : X86SchedWritePair<ReadAfterVecYLd>; // Float -> Double size conversion (ZMM).
+
+defm WriteCvtSD2SS : X86SchedWritePair<ReadAfterVecLd>; // Double -> Float size conversion.
+defm WriteCvtPD2PS : X86SchedWritePair<ReadAfterVecXLd>; // Double -> Float size conversion (XMM).
+defm WriteCvtPD2PSY : X86SchedWritePair<ReadAfterVecYLd>; // Double -> Float size conversion (YMM).
+defm WriteCvtPD2PSZ : X86SchedWritePair<ReadAfterVecYLd>; // Double -> Float size conversion (ZMM).
+
+defm WriteCvtPH2PS : X86SchedWritePair<ReadAfterVecXLd>; // Half -> Float size conversion.
+defm WriteCvtPH2PSY : X86SchedWritePair<ReadAfterVecYLd>; // Half -> Float size conversion (YMM).
+defm WriteCvtPH2PSZ : X86SchedWritePair<ReadAfterVecYLd>; // Half -> Float size conversion (ZMM).
+
+def WriteCvtPS2PH : SchedWrite; // Float -> Half size conversion.
+def WriteCvtPS2PHY : SchedWrite; // Float -> Half size conversion (YMM).
+def WriteCvtPS2PHZ : SchedWrite; // Float -> Half size conversion (ZMM).
+def WriteCvtPS2PHSt : SchedWrite; // Float -> Half + store size conversion.
+def WriteCvtPS2PHYSt : SchedWrite; // Float -> Half + store size conversion (YMM).
+def WriteCvtPS2PHZSt : SchedWrite; // Float -> Half + store size conversion (ZMM).
+
+// CRC32 instruction.
+defm WriteCRC32 : X86SchedWritePair<ReadAfterLd>;
+
+// Strings instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+defm WritePCmpIStrM : X86SchedWritePair<ReadAfterVecXLd>;
+// Packed Compare Explicit Length Strings, Return Mask
+defm WritePCmpEStrM : X86SchedWritePair<ReadAfterVecXLd>;
+// Packed Compare Implicit Length Strings, Return Index
+defm WritePCmpIStrI : X86SchedWritePair<ReadAfterVecXLd>;
+// Packed Compare Explicit Length Strings, Return Index
+defm WritePCmpEStrI : X86SchedWritePair<ReadAfterVecXLd>;
+
+// AES instructions.
+defm WriteAESDecEnc : X86SchedWritePair<ReadAfterVecXLd>; // Decryption, encryption.
+defm WriteAESIMC : X86SchedWritePair<ReadAfterVecXLd>; // InvMixColumn.
+defm WriteAESKeyGen : X86SchedWritePair<ReadAfterVecXLd>; // Key Generation.
+
+// Carry-less multiplication instructions.
+defm WriteCLMul : X86SchedWritePair<ReadAfterVecXLd>;
+
+// EMMS/FEMMS
+def WriteEMMS : SchedWrite;
+
+// Load/store MXCSR
+def WriteLDMXCSR : SchedWrite;
+def WriteSTMXCSR : SchedWrite;
+
+// Catch-all for expensive system instructions.
+def WriteSystem : SchedWrite;
+
+// AVX2.
+defm WriteFShuffle256 : X86SchedWritePair<ReadAfterVecYLd>; // Fp 256-bit width vector shuffles.
+defm WriteFVarShuffle256 : X86SchedWritePair<ReadAfterVecYLd>; // Fp 256-bit width variable shuffles.
+defm WriteShuffle256 : X86SchedWritePair<ReadAfterVecYLd>; // 256-bit width vector shuffles.
+defm WriteVarShuffle256 : X86SchedWritePair<ReadAfterVecYLd>; // 256-bit width vector variable shuffles.
+defm WriteVarVecShift : X86SchedWritePair<ReadAfterVecXLd>; // Variable vector shifts.
+defm WriteVarVecShiftY : X86SchedWritePair<ReadAfterVecYLd>; // Variable vector shifts (YMM).
+defm WriteVarVecShiftZ : X86SchedWritePair<ReadAfterVecYLd>; // Variable vector shifts (ZMM).
+
+// Old microcoded instructions that nobody uses.
+def WriteMicrocoded : SchedWrite;
+
+// Fence instructions.
+def WriteFence : SchedWrite;
+
+// Nop: not very useful, except that it provides a model for nops!
+def WriteNop : SchedWrite;
+
+// Move/Load/Store wrappers.
+def WriteFMoveLS
+ : X86SchedWriteMoveLS<WriteFMove, WriteFLoad, WriteFStore>;
+def WriteFMoveLSX
+ : X86SchedWriteMoveLS<WriteFMoveX, WriteFLoadX, WriteFStoreX>;
+def WriteFMoveLSY
+ : X86SchedWriteMoveLS<WriteFMoveY, WriteFLoadY, WriteFStoreY>;
+def SchedWriteFMoveLS
+ : X86SchedWriteMoveLSWidths<WriteFMoveLS, WriteFMoveLSX,
+ WriteFMoveLSY, WriteFMoveLSY>;
+
+def WriteFMoveLSNT
+ : X86SchedWriteMoveLS<WriteFMove, WriteFLoad, WriteFStoreNT>;
+def WriteFMoveLSNTX
+ : X86SchedWriteMoveLS<WriteFMove, WriteFLoad, WriteFStoreNTX>;
+def WriteFMoveLSNTY
+ : X86SchedWriteMoveLS<WriteFMoveY, WriteFLoadY, WriteFStoreNTY>;
+def SchedWriteFMoveLSNT
+ : X86SchedWriteMoveLSWidths<WriteFMoveLSNT, WriteFMoveLSNTX,
+ WriteFMoveLSNTY, WriteFMoveLSNTY>;
+
+def WriteVecMoveLS
+ : X86SchedWriteMoveLS<WriteVecMove, WriteVecLoad, WriteVecStore>;
+def WriteVecMoveLSX
+ : X86SchedWriteMoveLS<WriteVecMoveX, WriteVecLoadX, WriteVecStoreX>;
+def WriteVecMoveLSY
+ : X86SchedWriteMoveLS<WriteVecMoveY, WriteVecLoadY, WriteVecStoreY>;
+def SchedWriteVecMoveLS
+ : X86SchedWriteMoveLSWidths<WriteVecMoveLS, WriteVecMoveLSX,
+ WriteVecMoveLSY, WriteVecMoveLSY>;
+
+def WriteVecMoveLSNT
+ : X86SchedWriteMoveLS<WriteVecMove, WriteVecLoadNT, WriteVecStoreNT>;
+def WriteVecMoveLSNTX
+ : X86SchedWriteMoveLS<WriteVecMoveX, WriteVecLoadNT, WriteVecStoreNT>;
+def WriteVecMoveLSNTY
+ : X86SchedWriteMoveLS<WriteVecMoveY, WriteVecLoadNTY, WriteVecStoreNTY>;
+def SchedWriteVecMoveLSNT
+ : X86SchedWriteMoveLSWidths<WriteVecMoveLSNT, WriteVecMoveLSNTX,
+ WriteVecMoveLSNTY, WriteVecMoveLSNTY>;
+
+// Conditional SIMD Packed Loads and Stores wrappers.
+def WriteFMaskMove32
+ : X86SchedWriteMaskMove<WriteFMaskedLoad, WriteFMaskedStore32>;
+def WriteFMaskMove64
+ : X86SchedWriteMaskMove<WriteFMaskedLoad, WriteFMaskedStore64>;
+def WriteFMaskMove32Y
+ : X86SchedWriteMaskMove<WriteFMaskedLoadY, WriteFMaskedStore32Y>;
+def WriteFMaskMove64Y
+ : X86SchedWriteMaskMove<WriteFMaskedLoadY, WriteFMaskedStore64Y>;
+def WriteVecMaskMove32
+ : X86SchedWriteMaskMove<WriteVecMaskedLoad, WriteVecMaskedStore32>;
+def WriteVecMaskMove64
+ : X86SchedWriteMaskMove<WriteVecMaskedLoad, WriteVecMaskedStore64>;
+def WriteVecMaskMove32Y
+ : X86SchedWriteMaskMove<WriteVecMaskedLoadY, WriteVecMaskedStore32Y>;
+def WriteVecMaskMove64Y
+ : X86SchedWriteMaskMove<WriteVecMaskedLoadY, WriteVecMaskedStore64Y>;
+
+// Vector width wrappers.
+def SchedWriteFAdd
+ : X86SchedWriteWidths<WriteFAdd, WriteFAddX, WriteFAddY, WriteFAddZ>;
+def SchedWriteFAdd64
+ : X86SchedWriteWidths<WriteFAdd64, WriteFAdd64X, WriteFAdd64Y, WriteFAdd64Z>;
+def SchedWriteFHAdd
+ : X86SchedWriteWidths<WriteFHAdd, WriteFHAdd, WriteFHAddY, WriteFHAddZ>;
+def SchedWriteFCmp
+ : X86SchedWriteWidths<WriteFCmp, WriteFCmpX, WriteFCmpY, WriteFCmpZ>;
+def SchedWriteFCmp64
+ : X86SchedWriteWidths<WriteFCmp64, WriteFCmp64X, WriteFCmp64Y, WriteFCmp64Z>;
+def SchedWriteFMul
+ : X86SchedWriteWidths<WriteFMul, WriteFMulX, WriteFMulY, WriteFMulZ>;
+def SchedWriteFMul64
+ : X86SchedWriteWidths<WriteFMul64, WriteFMul64X, WriteFMul64Y, WriteFMul64Z>;
+def SchedWriteFMA
+ : X86SchedWriteWidths<WriteFMA, WriteFMAX, WriteFMAY, WriteFMAZ>;
+def SchedWriteDPPD
+ : X86SchedWriteWidths<WriteDPPD, WriteDPPD, WriteDPPD, WriteDPPD>;
+def SchedWriteDPPS
+ : X86SchedWriteWidths<WriteDPPS, WriteDPPS, WriteDPPSY, WriteDPPSZ>;
+def SchedWriteFDiv
+ : X86SchedWriteWidths<WriteFDiv, WriteFDivX, WriteFDivY, WriteFDivZ>;
+def SchedWriteFDiv64
+ : X86SchedWriteWidths<WriteFDiv64, WriteFDiv64X, WriteFDiv64Y, WriteFDiv64Z>;
+def SchedWriteFSqrt
+ : X86SchedWriteWidths<WriteFSqrt, WriteFSqrtX,
+ WriteFSqrtY, WriteFSqrtZ>;
+def SchedWriteFSqrt64
+ : X86SchedWriteWidths<WriteFSqrt64, WriteFSqrt64X,
+ WriteFSqrt64Y, WriteFSqrt64Z>;
+def SchedWriteFRcp
+ : X86SchedWriteWidths<WriteFRcp, WriteFRcpX, WriteFRcpY, WriteFRcpZ>;
+def SchedWriteFRsqrt
+ : X86SchedWriteWidths<WriteFRsqrt, WriteFRsqrtX, WriteFRsqrtY, WriteFRsqrtZ>;
+def SchedWriteFRnd
+ : X86SchedWriteWidths<WriteFRnd, WriteFRnd, WriteFRndY, WriteFRndZ>;
+def SchedWriteFLogic
+ : X86SchedWriteWidths<WriteFLogic, WriteFLogic, WriteFLogicY, WriteFLogicZ>;
+def SchedWriteFTest
+ : X86SchedWriteWidths<WriteFTest, WriteFTest, WriteFTestY, WriteFTestZ>;
+
+def SchedWriteFShuffle
+ : X86SchedWriteWidths<WriteFShuffle, WriteFShuffle,
+ WriteFShuffleY, WriteFShuffleZ>;
+def SchedWriteFVarShuffle
+ : X86SchedWriteWidths<WriteFVarShuffle, WriteFVarShuffle,
+ WriteFVarShuffleY, WriteFVarShuffleZ>;
+def SchedWriteFBlend
+ : X86SchedWriteWidths<WriteFBlend, WriteFBlend, WriteFBlendY, WriteFBlendZ>;
+def SchedWriteFVarBlend
+ : X86SchedWriteWidths<WriteFVarBlend, WriteFVarBlend,
+ WriteFVarBlendY, WriteFVarBlendZ>;
+
+def SchedWriteCvtDQ2PD
+ : X86SchedWriteWidths<WriteCvtI2SD, WriteCvtI2PD,
+ WriteCvtI2PDY, WriteCvtI2PDZ>;
+def SchedWriteCvtDQ2PS
+ : X86SchedWriteWidths<WriteCvtI2SS, WriteCvtI2PS,
+ WriteCvtI2PSY, WriteCvtI2PSZ>;
+def SchedWriteCvtPD2DQ
+ : X86SchedWriteWidths<WriteCvtSD2I, WriteCvtPD2I,
+ WriteCvtPD2IY, WriteCvtPD2IZ>;
+def SchedWriteCvtPS2DQ
+ : X86SchedWriteWidths<WriteCvtSS2I, WriteCvtPS2I,
+ WriteCvtPS2IY, WriteCvtPS2IZ>;
+def SchedWriteCvtPS2PD
+ : X86SchedWriteWidths<WriteCvtSS2SD, WriteCvtPS2PD,
+ WriteCvtPS2PDY, WriteCvtPS2PDZ>;
+def SchedWriteCvtPD2PS
+ : X86SchedWriteWidths<WriteCvtSD2SS, WriteCvtPD2PS,
+ WriteCvtPD2PSY, WriteCvtPD2PSZ>;
+
+def SchedWriteVecALU
+ : X86SchedWriteWidths<WriteVecALU, WriteVecALUX, WriteVecALUY, WriteVecALUZ>;
+def SchedWritePHAdd
+ : X86SchedWriteWidths<WritePHAdd, WritePHAddX, WritePHAddY, WritePHAddZ>;
+def SchedWriteVecLogic
+ : X86SchedWriteWidths<WriteVecLogic, WriteVecLogicX,
+ WriteVecLogicY, WriteVecLogicZ>;
+def SchedWriteVecTest
+ : X86SchedWriteWidths<WriteVecTest, WriteVecTest,
+ WriteVecTestY, WriteVecTestZ>;
+def SchedWriteVecShift
+ : X86SchedWriteWidths<WriteVecShift, WriteVecShiftX,
+ WriteVecShiftY, WriteVecShiftZ>;
+def SchedWriteVecShiftImm
+ : X86SchedWriteWidths<WriteVecShiftImm, WriteVecShiftImmX,
+ WriteVecShiftImmY, WriteVecShiftImmZ>;
+def SchedWriteVarVecShift
+ : X86SchedWriteWidths<WriteVarVecShift, WriteVarVecShift,
+ WriteVarVecShiftY, WriteVarVecShiftZ>;
+def SchedWriteVecIMul
+ : X86SchedWriteWidths<WriteVecIMul, WriteVecIMulX,
+ WriteVecIMulY, WriteVecIMulZ>;
+def SchedWritePMULLD
+ : X86SchedWriteWidths<WritePMULLD, WritePMULLD,
+ WritePMULLDY, WritePMULLDZ>;
+def SchedWriteMPSAD
+ : X86SchedWriteWidths<WriteMPSAD, WriteMPSAD,
+ WriteMPSADY, WriteMPSADZ>;
+def SchedWritePSADBW
+ : X86SchedWriteWidths<WritePSADBW, WritePSADBWX,
+ WritePSADBWY, WritePSADBWZ>;
+
+def SchedWriteShuffle
+ : X86SchedWriteWidths<WriteShuffle, WriteShuffleX,
+ WriteShuffleY, WriteShuffleZ>;
+def SchedWriteVarShuffle
+ : X86SchedWriteWidths<WriteVarShuffle, WriteVarShuffleX,
+ WriteVarShuffleY, WriteVarShuffleZ>;
+def SchedWriteBlend
+ : X86SchedWriteWidths<WriteBlend, WriteBlend, WriteBlendY, WriteBlendZ>;
+def SchedWriteVarBlend
+ : X86SchedWriteWidths<WriteVarBlend, WriteVarBlend,
+ WriteVarBlendY, WriteVarBlendZ>;
+
+// Vector size wrappers.
+def SchedWriteFAddSizes
+ : X86SchedWriteSizes<SchedWriteFAdd, SchedWriteFAdd64>;
+def SchedWriteFCmpSizes
+ : X86SchedWriteSizes<SchedWriteFCmp, SchedWriteFCmp64>;
+def SchedWriteFMulSizes
+ : X86SchedWriteSizes<SchedWriteFMul, SchedWriteFMul64>;
+def SchedWriteFDivSizes
+ : X86SchedWriteSizes<SchedWriteFDiv, SchedWriteFDiv64>;
+def SchedWriteFSqrtSizes
+ : X86SchedWriteSizes<SchedWriteFSqrt, SchedWriteFSqrt64>;
+def SchedWriteFLogicSizes
+ : X86SchedWriteSizes<SchedWriteFLogic, SchedWriteFLogic>;
+def SchedWriteFShuffleSizes
+ : X86SchedWriteSizes<SchedWriteFShuffle, SchedWriteFShuffle>;
+
+//===----------------------------------------------------------------------===//
+// Generic Processor Scheduler Models.
+
+// IssueWidth is analogous to the number of decode units. Core and its
+// descendants, including Nehalem and SandyBridge, have 4 decoders.
+// Resources beyond the decoder operate on micro-ops and are buffered
+// so adjacent micro-ops don't directly compete.
+//
+// MicroOpBufferSize > 1 indicates that instructions with RAW dependencies
+// can be decoded in the same cycle. The value 32 is a somewhat arbitrary
+// but reasonable number of in-flight micro-ops.
+//
+// HighLatency=10 is optimistic. X86InstrInfo::isHighLatencyDef
+// indicates high latency opcodes. Alternatively, InstrItinData
+// entries may be included here to define specific operand
+// latencies. Since these latencies are not used for pipeline hazards,
+// they do not need to be exact.
+//
+// The GenericX86Model contains no instruction schedules
+// and disables PostRAScheduler.
+class GenericX86Model : SchedMachineModel {
+ let IssueWidth = 4;
+ let MicroOpBufferSize = 32;
+ let LoadLatency = 4;
+ let HighLatency = 10;
+ let PostRAScheduler = 0;
+ let CompleteModel = 0;
+}
+
+def GenericModel : GenericX86Model;
+
+// Define a model with the PostRAScheduler enabled.
+def GenericPostRAModel : GenericX86Model {
+ let PostRAScheduler = 1;
+}
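+
+// A subtarget attaches one of these models to a CPU in X86.td. Illustrative
+// sketch only (the feature list is a placeholder; the real CPU definitions
+// live in X86.td, not here):
+//
+//   def : ProcessorModel<"generic", GenericModel, [FeatureX87]>;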
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleAtom.td b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleAtom.td
new file mode 100644
index 000000000000..b90baf6c16b1
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleAtom.td
@@ -0,0 +1,908 @@
+//===- X86ScheduleAtom.td - X86 Atom Scheduling Definitions -*- tablegen -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the schedule class data for the in-order Intel Atom
+// (Saltwell-32nm/Bonnell-45nm) processors.
+//
+//===----------------------------------------------------------------------===//
+
+//
+// Scheduling information derived from the "Intel 64 and IA-32 Architectures
+// Optimization Reference Manual", Chapter 13, Section 4.
+
+// Atom machine model.
+def AtomModel : SchedMachineModel {
+ let IssueWidth = 2; // Allows 2 instructions per scheduling group.
+ let MicroOpBufferSize = 0; // In-order execution, always hide latency.
+  let LoadLatency = 3; // Expected cycles, may be overridden.
+  let HighLatency = 30; // Expected, may be overridden.
+
+ // On the Atom, the throughput for taken branches is 2 cycles. For small
+ // simple loops, expand by a small factor to hide the backedge cost.
+ let LoopMicroOpBufferSize = 10;
+ let PostRAScheduler = 1;
+ let CompleteModel = 0;
+}
+
+let SchedModel = AtomModel in {
+
+// Functional Units
+def AtomPort0 : ProcResource<1>; // ALU: ALU0, shift/rotate, load/store
+ // SIMD/FP: SIMD ALU, Shuffle,SIMD/FP multiply, divide
+def AtomPort1 : ProcResource<1>; // ALU: ALU1, bit processing, jump, and LEA
+ // SIMD/FP: SIMD ALU, FP Adder
+
+def AtomPort01 : ProcResGroup<[AtomPort0, AtomPort1]>;
+
+// Loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 3>;
+def : ReadAdvance<ReadAfterVecLd, 3>;
+def : ReadAdvance<ReadAfterVecXLd, 3>;
+def : ReadAdvance<ReadAfterVecYLd, 3>;
+
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when dispatched by the schedulers.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass AtomWriteResPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> RRPorts,
+ list<ProcResourceKind> RMPorts,
+ int RRLat = 1, int RMLat = 1,
+ list<int> RRRes = [1],
+ list<int> RMRes = [1]> {
+  // Register variant: issues to RRPorts with latency RRLat.
+ def : WriteRes<SchedRW, RRPorts> {
+ let Latency = RRLat;
+ let ResourceCycles = RRRes;
+ }
+
+  // Memory (folded-load) variant: issues to RMPorts with the (typically
+  // higher) latency RMLat.
+ def : WriteRes<SchedRW.Folded, RMPorts> {
+ let Latency = RMLat;
+ let ResourceCycles = RMRes;
+ }
+}
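+
+// For example, the FP multiply entry further below,
+//
+//   defm : AtomWriteResPair<WriteFMul, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>;
+//
+// expands into a WriteRes for WriteFMul and one for WriteFMulLd, each with a
+// latency of 4 and 4 resource cycles on AtomPort0.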
+
+// A folded store needs a cycle on Port0 for the store data.
+def : WriteRes<WriteRMW, [AtomPort0]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Arithmetic.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : AtomWriteResPair<WriteALU, [AtomPort01], [AtomPort0]>;
+defm : AtomWriteResPair<WriteADC, [AtomPort01], [AtomPort0]>;
+
+defm : AtomWriteResPair<WriteIMul8, [AtomPort01], [AtomPort01], 7, 7, [7], [7]>;
+defm : AtomWriteResPair<WriteIMul16, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+defm : AtomWriteResPair<WriteIMul16Imm, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : AtomWriteResPair<WriteIMul16Reg, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : AtomWriteResPair<WriteIMul32, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : AtomWriteResPair<WriteIMul32Imm, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteIMul32Reg, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteIMul64, [AtomPort01], [AtomPort01], 12, 12, [12], [12]>;
+defm : AtomWriteResPair<WriteIMul64Imm, [AtomPort01], [AtomPort01], 14, 14, [14], [14]>;
+defm : AtomWriteResPair<WriteIMul64Reg, [AtomPort01], [AtomPort01], 12, 12, [12], [12]>;
+defm : X86WriteResUnsupported<WriteIMulH>;
+
+defm : X86WriteRes<WriteXCHG, [AtomPort01], 2, [2], 1>;
+defm : X86WriteRes<WriteBSWAP32, [AtomPort0], 1, [1], 1>;
+defm : X86WriteRes<WriteBSWAP64, [AtomPort0], 1, [1], 1>;
+defm : AtomWriteResPair<WriteCMPXCHG, [AtomPort01], [AtomPort01], 15, 15, [15]>;
+defm : X86WriteRes<WriteCMPXCHGRMW, [AtomPort01, AtomPort0], 1, [1, 1], 1>;
+
+defm : AtomWriteResPair<WriteDiv8, [AtomPort01], [AtomPort01], 50, 68, [50], [68]>;
+defm : AtomWriteResPair<WriteDiv16, [AtomPort01], [AtomPort01], 50, 50, [50], [50]>;
+defm : AtomWriteResPair<WriteDiv32, [AtomPort01], [AtomPort01], 50, 50, [50], [50]>;
+defm : AtomWriteResPair<WriteDiv64, [AtomPort01], [AtomPort01],130,130,[130],[130]>;
+defm : AtomWriteResPair<WriteIDiv8, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>;
+defm : AtomWriteResPair<WriteIDiv16, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>;
+defm : AtomWriteResPair<WriteIDiv32, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>;
+defm : AtomWriteResPair<WriteIDiv64, [AtomPort01], [AtomPort01],130,130,[130],[130]>;
+
+defm : X86WriteResPairUnsupported<WriteCRC32>;
+
+defm : AtomWriteResPair<WriteCMOV, [AtomPort01], [AtomPort0]>;
+defm : X86WriteRes<WriteFCMOV, [AtomPort01], 9, [9], 1>; // x87 conditional move.
+
+def : WriteRes<WriteSETCC, [AtomPort01]>;
+def : WriteRes<WriteSETCCStore, [AtomPort01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteLAHFSAHF, [AtomPort01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+defm : X86WriteRes<WriteBitTest, [AtomPort1], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTestImmLd, [AtomPort0], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTestRegLd, [AtomPort01], 9, [9], 1>;
+defm : X86WriteRes<WriteBitTestSet, [AtomPort1], 1, [1], 1>;
+//defm : X86WriteRes<WriteBitTestSetImmLd, [AtomPort1], 1, [1], 1>;
+//defm : X86WriteRes<WriteBitTestSetRegLd, [AtomPort1], 1, [1], 1>;
+
+// This is for simple LEAs with one or two input operands.
+def : WriteRes<WriteLEA, [AtomPort1]>;
+
+// Bit counts.
+defm : AtomWriteResPair<WriteBSF, [AtomPort01], [AtomPort01], 16, 16, [16], [16]>;
+defm : AtomWriteResPair<WriteBSR, [AtomPort01], [AtomPort01], 16, 16, [16], [16]>;
+defm : X86WriteResPairUnsupported<WritePOPCNT>;
+defm : X86WriteResPairUnsupported<WriteLZCNT>;
+defm : X86WriteResPairUnsupported<WriteTZCNT>;
+
+// BMI1 BEXTR/BLS, BMI2 BZHI
+defm : X86WriteResPairUnsupported<WriteBEXTR>;
+defm : X86WriteResPairUnsupported<WriteBLS>;
+defm : X86WriteResPairUnsupported<WriteBZHI>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Integer shifts and rotates.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : AtomWriteResPair<WriteShift, [AtomPort0], [AtomPort0]>;
+defm : AtomWriteResPair<WriteShiftCL, [AtomPort0], [AtomPort0]>;
+defm : AtomWriteResPair<WriteRotate, [AtomPort0], [AtomPort0]>;
+defm : AtomWriteResPair<WriteRotateCL, [AtomPort0], [AtomPort0]>;
+
+defm : X86WriteRes<WriteSHDrri, [AtomPort01], 2, [2], 1>;
+defm : X86WriteRes<WriteSHDrrcl,[AtomPort01], 2, [2], 1>;
+defm : X86WriteRes<WriteSHDmri, [AtomPort01], 4, [4], 1>;
+defm : X86WriteRes<WriteSHDmrcl,[AtomPort01], 4, [4], 1>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Loads, stores, and moves, not folded with other operations.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteLoad, [AtomPort0]>;
+def : WriteRes<WriteStore, [AtomPort0]>;
+def : WriteRes<WriteStoreNT, [AtomPort0]>;
+def : WriteRes<WriteMove, [AtomPort01]>;
+
+// Treat misc copies as a move.
+def : InstRW<[WriteMove], (instrs COPY)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteZero, []>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : AtomWriteResPair<WriteJump, [AtomPort1], [AtomPort1]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Special case scheduling classes.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteSystem, [AtomPort01]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [AtomPort01]> { let Latency = 100; }
+def : WriteRes<WriteFence, [AtomPort0]>;
+
+// Nops don't have dependencies, so there's no actual latency, but we set this
+// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
+def : WriteRes<WriteNop, [AtomPort01]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Floating point. This covers both scalar and vector operations.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : X86WriteRes<WriteFLD0, [AtomPort01], 1, [1], 1>;
+defm : X86WriteRes<WriteFLD1, [AtomPort01], 6, [6], 1>;
+def : WriteRes<WriteFLoad, [AtomPort0]>;
+def : WriteRes<WriteFLoadX, [AtomPort0]>;
+defm : X86WriteResUnsupported<WriteFLoadY>;
+defm : X86WriteResUnsupported<WriteFMaskedLoad>;
+defm : X86WriteResUnsupported<WriteFMaskedLoadY>;
+
+def : WriteRes<WriteFStore, [AtomPort0]>;
+def : WriteRes<WriteFStoreX, [AtomPort0]>;
+defm : X86WriteResUnsupported<WriteFStoreY>;
+def : WriteRes<WriteFStoreNT, [AtomPort0]>;
+def : WriteRes<WriteFStoreNTX, [AtomPort0]>;
+defm : X86WriteResUnsupported<WriteFStoreNTY>;
+defm : X86WriteResUnsupported<WriteFMaskedStore32>;
+defm : X86WriteResUnsupported<WriteFMaskedStore32Y>;
+defm : X86WriteResUnsupported<WriteFMaskedStore64>;
+defm : X86WriteResUnsupported<WriteFMaskedStore64Y>;
+
+def : WriteRes<WriteFMove, [AtomPort01]>;
+def : WriteRes<WriteFMoveX, [AtomPort01]>;
+defm : X86WriteResUnsupported<WriteFMoveY>;
+
+defm : X86WriteRes<WriteEMMS, [AtomPort01], 5, [5], 1>;
+
+defm : AtomWriteResPair<WriteFAdd, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFAddX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : X86WriteResPairUnsupported<WriteFAddY>;
+defm : X86WriteResPairUnsupported<WriteFAddZ>;
+defm : AtomWriteResPair<WriteFAdd64, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFAdd64X, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : X86WriteResPairUnsupported<WriteFAdd64Y>;
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
+defm : AtomWriteResPair<WriteFCmp, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFCmpX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : X86WriteResPairUnsupported<WriteFCmpY>;
+defm : X86WriteResPairUnsupported<WriteFCmpZ>;
+defm : AtomWriteResPair<WriteFCmp64, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFCmp64X, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : X86WriteResPairUnsupported<WriteFCmp64Y>;
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
+defm : AtomWriteResPair<WriteFCom, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFComX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFMul, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>;
+defm : AtomWriteResPair<WriteFMulX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : X86WriteResPairUnsupported<WriteFMulY>;
+defm : X86WriteResPairUnsupported<WriteFMulZ>;
+defm : AtomWriteResPair<WriteFMul64, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFMul64X, [AtomPort01], [AtomPort01], 9, 10, [9], [10]>;
+defm : X86WriteResPairUnsupported<WriteFMul64Y>;
+defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+defm : AtomWriteResPair<WriteFRcp, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>;
+defm : AtomWriteResPair<WriteFRcpX, [AtomPort01], [AtomPort01], 9, 10, [9], [10]>;
+defm : X86WriteResPairUnsupported<WriteFRcpY>;
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+defm : AtomWriteResPair<WriteFRsqrt, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>;
+defm : AtomWriteResPair<WriteFRsqrtX, [AtomPort01], [AtomPort01], 9, 10, [9], [10]>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtY>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+defm : AtomWriteResPair<WriteFDiv, [AtomPort01], [AtomPort01], 34, 34, [34], [34]>;
+defm : AtomWriteResPair<WriteFDivX, [AtomPort01], [AtomPort01], 70, 70, [70], [70]>;
+defm : X86WriteResPairUnsupported<WriteFDivY>;
+defm : X86WriteResPairUnsupported<WriteFDivZ>;
+defm : AtomWriteResPair<WriteFDiv64, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>;
+defm : AtomWriteResPair<WriteFDiv64X, [AtomPort01], [AtomPort01],125,125,[125],[125]>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Y>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+defm : AtomWriteResPair<WriteFSqrt, [AtomPort01], [AtomPort01], 34, 34, [34], [34]>;
+defm : AtomWriteResPair<WriteFSqrtX, [AtomPort01], [AtomPort01], 70, 70, [70], [70]>;
+defm : X86WriteResPairUnsupported<WriteFSqrtY>;
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
+defm : AtomWriteResPair<WriteFSqrt64, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>;
+defm : AtomWriteResPair<WriteFSqrt64X, [AtomPort01], [AtomPort01],125,125,[125],[125]>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Y>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
+defm : AtomWriteResPair<WriteFSqrt80, [AtomPort01], [AtomPort01], 71, 71, [71], [71]>;
+defm : AtomWriteResPair<WriteFSign, [AtomPort1], [AtomPort1]>;
+defm : AtomWriteResPair<WriteFRnd, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : X86WriteResPairUnsupported<WriteFRndY>;
+defm : X86WriteResPairUnsupported<WriteFRndZ>;
+defm : AtomWriteResPair<WriteFLogic, [AtomPort01], [AtomPort0]>;
+defm : X86WriteResPairUnsupported<WriteFLogicY>;
+defm : X86WriteResPairUnsupported<WriteFLogicZ>;
+defm : AtomWriteResPair<WriteFTest, [AtomPort01], [AtomPort0]>;
+defm : X86WriteResPairUnsupported<WriteFTestY>;
+defm : X86WriteResPairUnsupported<WriteFTestZ>;
+defm : AtomWriteResPair<WriteFShuffle, [AtomPort0], [AtomPort0]>;
+defm : X86WriteResPairUnsupported<WriteFShuffleY>;
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffle>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffleY>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
+defm : X86WriteResPairUnsupported<WriteFMA>;
+defm : X86WriteResPairUnsupported<WriteFMAX>;
+defm : X86WriteResPairUnsupported<WriteFMAY>;
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
+defm : X86WriteResPairUnsupported<WriteDPPD>;
+defm : X86WriteResPairUnsupported<WriteDPPS>;
+defm : X86WriteResPairUnsupported<WriteDPPSY>;
+defm : X86WriteResPairUnsupported<WriteDPPSZ>;
+defm : X86WriteResPairUnsupported<WriteFBlend>;
+defm : X86WriteResPairUnsupported<WriteFBlendY>;
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+defm : X86WriteResPairUnsupported<WriteFVarBlend>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendY>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
+defm : X86WriteResPairUnsupported<WriteFShuffle256>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Conversions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : AtomWriteResPair<WriteCvtSS2I, [AtomPort01], [AtomPort01], 8, 9, [8], [9]>;
+defm : AtomWriteResPair<WriteCvtPS2I, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IY>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+defm : AtomWriteResPair<WriteCvtSD2I, [AtomPort01], [AtomPort01], 8, 9, [8], [9]>;
+defm : AtomWriteResPair<WriteCvtPD2I, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IY>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+
+defm : AtomWriteResPair<WriteCvtI2SS, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : AtomWriteResPair<WriteCvtI2PS, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSY>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+defm : AtomWriteResPair<WriteCvtI2SD, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : AtomWriteResPair<WriteCvtI2PD, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDY>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+
+defm : AtomWriteResPair<WriteCvtSS2SD, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : AtomWriteResPair<WriteCvtPS2PD, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2PDY>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
+defm : AtomWriteResPair<WriteCvtSD2SS, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : AtomWriteResPair<WriteCvtPD2PS, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2PSY>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
+
+defm : X86WriteResPairUnsupported<WriteCvtPH2PS>;
+defm : X86WriteResPairUnsupported<WriteCvtPH2PSY>;
+defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
+defm : X86WriteResUnsupported<WriteCvtPS2PH>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHSt>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHY>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHYSt>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Vector integer operations.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteVecLoad, [AtomPort0]>;
+def : WriteRes<WriteVecLoadX, [AtomPort0]>;
+defm : X86WriteResUnsupported<WriteVecLoadY>;
+def : WriteRes<WriteVecLoadNT, [AtomPort0]>;
+defm : X86WriteResUnsupported<WriteVecLoadNTY>;
+defm : X86WriteResUnsupported<WriteVecMaskedLoad>;
+defm : X86WriteResUnsupported<WriteVecMaskedLoadY>;
+
+def : WriteRes<WriteVecStore, [AtomPort0]>;
+def : WriteRes<WriteVecStoreX, [AtomPort0]>;
+defm : X86WriteResUnsupported<WriteVecStoreY>;
+def : WriteRes<WriteVecStoreNT, [AtomPort0]>;
+defm : X86WriteResUnsupported<WriteVecStoreNTY>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore32>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore64>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;
+
+def : WriteRes<WriteVecMove, [AtomPort0]>;
+def : WriteRes<WriteVecMoveX, [AtomPort01]>;
+defm : X86WriteResUnsupported<WriteVecMoveY>;
+defm : X86WriteRes<WriteVecMoveToGpr, [AtomPort0], 3, [3], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [AtomPort0], 1, [1], 1>;
+
+defm : AtomWriteResPair<WriteVecALU, [AtomPort01], [AtomPort0], 1, 1>;
+defm : AtomWriteResPair<WriteVecALUX, [AtomPort01], [AtomPort0], 1, 1>;
+defm : X86WriteResPairUnsupported<WriteVecALUY>;
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
+defm : AtomWriteResPair<WriteVecLogic, [AtomPort01], [AtomPort0], 1, 1>;
+defm : AtomWriteResPair<WriteVecLogicX, [AtomPort01], [AtomPort0], 1, 1>;
+defm : X86WriteResPairUnsupported<WriteVecLogicY>;
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
+defm : AtomWriteResPair<WriteVecTest, [AtomPort01], [AtomPort0], 1, 1>;
+defm : X86WriteResPairUnsupported<WriteVecTestY>;
+defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+defm : AtomWriteResPair<WriteVecShift, [AtomPort01], [AtomPort01], 2, 3, [2], [3]>;
+defm : AtomWriteResPair<WriteVecShiftX, [AtomPort01], [AtomPort01], 2, 3, [2], [3]>;
+defm : X86WriteResPairUnsupported<WriteVecShiftY>;
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
+defm : AtomWriteResPair<WriteVecShiftImm, [AtomPort01], [AtomPort01], 1, 1, [1], [1]>;
+defm : AtomWriteResPair<WriteVecShiftImmX, [AtomPort01], [AtomPort01], 1, 1, [1], [1]>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+defm : AtomWriteResPair<WriteVecIMul, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>;
+defm : AtomWriteResPair<WriteVecIMulX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : X86WriteResPairUnsupported<WriteVecIMulY>;
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
+defm : X86WriteResPairUnsupported<WritePMULLD>;
+defm : X86WriteResPairUnsupported<WritePMULLDY>;
+defm : X86WriteResPairUnsupported<WritePMULLDZ>;
+defm : X86WriteResPairUnsupported<WritePHMINPOS>;
+defm : X86WriteResPairUnsupported<WriteMPSAD>;
+defm : X86WriteResPairUnsupported<WriteMPSADY>;
+defm : X86WriteResPairUnsupported<WriteMPSADZ>;
+defm : AtomWriteResPair<WritePSADBW, [AtomPort01], [AtomPort01], 4, 4, [4], [4]>;
+defm : AtomWriteResPair<WritePSADBWX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : X86WriteResPairUnsupported<WritePSADBWY>;
+defm : X86WriteResPairUnsupported<WritePSADBWZ>;
+defm : AtomWriteResPair<WriteShuffle, [AtomPort0], [AtomPort0], 1, 1>;
+defm : AtomWriteResPair<WriteShuffleX, [AtomPort0], [AtomPort0], 1, 1>;
+defm : X86WriteResPairUnsupported<WriteShuffleY>;
+defm : X86WriteResPairUnsupported<WriteShuffleZ>;
+defm : AtomWriteResPair<WriteVarShuffle, [AtomPort0], [AtomPort0], 1, 1>;
+defm : AtomWriteResPair<WriteVarShuffleX, [AtomPort01], [AtomPort01], 4, 5, [4], [5]>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+defm : X86WriteResPairUnsupported<WriteBlend>;
+defm : X86WriteResPairUnsupported<WriteBlendY>;
+defm : X86WriteResPairUnsupported<WriteBlendZ>;
+defm : X86WriteResPairUnsupported<WriteVarBlend>;
+defm : X86WriteResPairUnsupported<WriteVarBlendY>;
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
+defm : X86WriteResPairUnsupported<WriteShuffle256>;
+defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
+defm : X86WriteResPairUnsupported<WriteVarVecShift>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Vector insert/extract operations.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : AtomWriteResPair<WriteVecInsert, [AtomPort0], [AtomPort0], 1, 1>;
+def : WriteRes<WriteVecExtract, [AtomPort0]>;
+def : WriteRes<WriteVecExtractSt, [AtomPort0]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// SSE42 String instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : X86WriteResPairUnsupported<WritePCmpIStrI>;
+defm : X86WriteResPairUnsupported<WritePCmpIStrM>;
+defm : X86WriteResPairUnsupported<WritePCmpEStrI>;
+defm : X86WriteResPairUnsupported<WritePCmpEStrM>;
+
+////////////////////////////////////////////////////////////////////////////////
+// MOVMSK Instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteFMOVMSK, [AtomPort0]> { let Latency = 3; let ResourceCycles = [3]; }
+def : WriteRes<WriteVecMOVMSK, [AtomPort0]> { let Latency = 3; let ResourceCycles = [3]; }
+defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
+def : WriteRes<WriteMMXMOVMSK, [AtomPort0]> { let Latency = 3; let ResourceCycles = [3]; }
+
+////////////////////////////////////////////////////////////////////////////////
+// AES instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : X86WriteResPairUnsupported<WriteAESIMC>;
+defm : X86WriteResPairUnsupported<WriteAESKeyGen>;
+defm : X86WriteResPairUnsupported<WriteAESDecEnc>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : AtomWriteResPair<WriteFHAdd, [AtomPort01], [AtomPort01], 8, 9, [8], [9]>;
+defm : AtomWriteResPair<WriteFHAddY, [AtomPort01], [AtomPort01], 8, 9, [8], [9]>;
+defm : AtomWriteResPair<WritePHAdd, [AtomPort01], [AtomPort01], 3, 4, [3], [4]>;
+defm : AtomWriteResPair<WritePHAddX, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+defm : AtomWriteResPair<WritePHAddY, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Carry-less multiplication instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : X86WriteResPairUnsupported<WriteCLMul>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Load/store MXCSR.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteLDMXCSR, [AtomPort01]> { let Latency = 5; let ResourceCycles = [5]; }
+def : WriteRes<WriteSTMXCSR, [AtomPort01]> { let Latency = 15; let ResourceCycles = [15]; }
+
+////////////////////////////////////////////////////////////////////////////////
+// Special Cases.
+////////////////////////////////////////////////////////////////////////////////
+
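+// Each AtomWriteP_N class below is named after the port(s) it uses (0, 1,
+// 0_1 for both, or 01 for either) followed by its latency N in cycles. Note
+// that every one of them also reserves its port(s) for the full latency
+// (ResourceCycles == Latency), which is consistent with these largely
+// microcoded instructions not being pipelined on Atom's in-order core.
+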
+// Port0
+def AtomWrite0_1 : SchedWriteRes<[AtomPort0]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+}
+def : InstRW<[AtomWrite0_1], (instrs FXAM, LD_Frr,
+ MOVSX64rr32)>;
+def : SchedAlias<WriteALURMW, AtomWrite0_1>;
+def : SchedAlias<WriteADCRMW, AtomWrite0_1>;
+def : InstRW<[AtomWrite0_1], (instregex "(RCL|RCR|ROL|ROR|SAR|SHL|SHR)(8|16|32|64)m",
+ "MOV(S|Z)X(32|64)rr(8|8_NOREX|16)")>;
+
+// Port1
+def AtomWrite1_1 : SchedWriteRes<[AtomPort1]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+}
+def : InstRW<[AtomWrite1_1], (instrs FCOMPP)>;
+def : InstRW<[AtomWrite1_1], (instregex "UCOM_F(P|PP)?r")>;
+
+def AtomWrite1_5 : SchedWriteRes<[AtomPort1]> {
+ let Latency = 5;
+ let ResourceCycles = [5];
+}
+def : InstRW<[AtomWrite1_5], (instrs MMX_CVTPI2PSirr, MMX_CVTPI2PSirm,
+ MMX_CVTPS2PIirr, MMX_CVTTPS2PIirr)>;
+
+// Port0 and Port1
+def AtomWrite0_1_1 : SchedWriteRes<[AtomPort0, AtomPort1]> {
+ let Latency = 1;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[AtomWrite0_1_1], (instrs POP32r, POP64r,
+ POP16rmr, POP32rmr, POP64rmr,
+ PUSH16r, PUSH32r, PUSH64r,
+ PUSHi16, PUSHi32,
+ PUSH16rmr, PUSH32rmr, PUSH64rmr,
+ PUSH16i8, PUSH32i8, PUSH64i8, PUSH64i32,
+ XCH_F)>;
+def : InstRW<[AtomWrite0_1_1], (instregex "RETI(L|Q|W)$",
+ "IRET(16|32|64)?")>;
+
+def AtomWrite0_1_5 : SchedWriteRes<[AtomPort0, AtomPort1]> {
+ let Latency = 5;
+ let ResourceCycles = [5, 5];
+}
+def : InstRW<[AtomWrite0_1_5], (instrs MMX_CVTPS2PIirm, MMX_CVTTPS2PIirm)>;
+def : InstRW<[AtomWrite0_1_5], (instregex "ILD_F(16|32|64)")>;
+
+// Port0 or Port1
+def AtomWrite01_1 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+}
+def : InstRW<[AtomWrite01_1], (instrs FDECSTP, FFREE, FFREEP, FINCSTP, WAIT,
+ LFENCE,
+ STOSB, STOSL, STOSQ, STOSW,
+ MOVSSrr, MOVSSrr_REV,
+ PSLLDQri, PSRLDQri)>;
+def : InstRW<[AtomWrite01_1], (instregex "MMX_PACK(SSDW|SSWB|USWB)irr",
+ "MMX_PUNPCKH(BW|DQ|WD)irr")>;
+
+def AtomWrite01_2 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : InstRW<[AtomWrite01_2], (instrs LEAVE, LEAVE64, POP16r,
+ PUSH16rmm, PUSH32rmm, PUSH64rmm,
+ LODSB, LODSL, LODSQ, LODSW,
+ SCASB, SCASL, SCASQ, SCASW)>;
+def : InstRW<[AtomWrite01_2], (instregex "PUSH(CS|DS|ES|FS|GS|SS)(16|32|64)",
+ "(ST|ISTT)_F(P)?(16|32|64)?(m|rr)",
+ "MMX_P(ADD|SUB)Qirr",
+ "MOV(S|Z)X16rr8",
+ "MOV(UPS|UPD|DQU)mr",
+ "MASKMOVDQU(64)?",
+ "P(ADD|SUB)Qrr")>;
+def : SchedAlias<WriteBitTestSetImmRMW, AtomWrite01_2>;
+
+def AtomWrite01_3 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 3;
+ let ResourceCycles = [3];
+}
+def : InstRW<[AtomWrite01_3], (instrs CLD, LDDQUrm,
+ CMPSB, CMPSL, CMPSQ, CMPSW,
+ MOVSB, MOVSL, MOVSQ, MOVSW,
+ POP16rmm, POP32rmm, POP64rmm)>;
+def : InstRW<[AtomWrite01_3], (instregex "XADD(8|16|32|64)rm",
+ "XCHG(8|16|32|64)rm",
+ "PH(ADD|SUB)Drr",
+ "MOV(S|Z)X16rm8",
+ "MMX_P(ADD|SUB)Qirm",
+ "MOV(UPS|UPD|DQU)rm",
+ "P(ADD|SUB)Qrm")>;
+
+def AtomWrite01_4 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 4;
+ let ResourceCycles = [4];
+}
+def : InstRW<[AtomWrite01_4], (instrs CBW, CWD, CWDE, CDQ, CDQE, CQO,
+ JCXZ, JECXZ, JRCXZ,
+ LD_F80m)>;
+def : InstRW<[AtomWrite01_4], (instregex "PH(ADD|SUB)Drm",
+ "(MMX_)?PEXTRWrr(_REV)?")>;
+
+def AtomWrite01_5 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 5;
+ let ResourceCycles = [5];
+}
+def : InstRW<[AtomWrite01_5], (instrs FLDCW16m, ST_FP80m)>;
+def : InstRW<[AtomWrite01_5], (instregex "MMX_PH(ADD|SUB)S?Wrr")>;
+
+def AtomWrite01_6 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 6;
+ let ResourceCycles = [6];
+}
+def : InstRW<[AtomWrite01_6], (instrs CMPXCHG8rm, INTO, XLAT,
+ SHLD16rrCL, SHRD16rrCL,
+ SHLD16rri8, SHRD16rri8,
+ SHLD16mrCL, SHRD16mrCL,
+ SHLD16mri8, SHRD16mri8)>;
+def : InstRW<[AtomWrite01_6], (instregex "IST_F(P)?(16|32|64)?m",
+ "MMX_PH(ADD|SUB)S?Wrm")>;
+
+def AtomWrite01_7 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 7;
+ let ResourceCycles = [7];
+}
+def : InstRW<[AtomWrite01_7], (instrs AAD8i8)>;
+
+def AtomWrite01_8 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 8;
+ let ResourceCycles = [8];
+}
+def : InstRW<[AtomWrite01_8], (instrs LOOPE,
+ PUSHA16, PUSHA32,
+ SHLD64rrCL, SHRD64rrCL,
+ FNSTCW16m)>;
+
+def AtomWrite01_9 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 9;
+ let ResourceCycles = [9];
+}
+def : InstRW<[AtomWrite01_9], (instrs POPA16, POPA32,
+ PUSHF16, PUSHF32, PUSHF64,
+ SHLD64mrCL, SHRD64mrCL,
+ SHLD64mri8, SHRD64mri8,
+ SHLD64rri8, SHRD64rri8,
+ CMPXCHG8rr)>;
+def : InstRW<[AtomWrite01_9], (instregex "(U)?COM_FI", "TST_F",
+ "(U)?COMIS(D|S)rr",
+ "CVT(T)?SS2SI64rr(_Int)?")>;
+
+def AtomWrite01_10 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 10;
+ let ResourceCycles = [10];
+}
+def : SchedAlias<WriteFLDC, AtomWrite01_10>;
+def : InstRW<[AtomWrite01_10], (instregex "(U)?COMIS(D|S)rm",
+ "CVT(T)?SS2SI64rm(_Int)?")>;
+
+def AtomWrite01_11 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 11;
+ let ResourceCycles = [11];
+}
+def : InstRW<[AtomWrite01_11], (instrs BOUNDS16rm, BOUNDS32rm)>;
+def : SchedAlias<WriteBitTestSetRegRMW, AtomWrite01_11>;
+
+def AtomWrite01_13 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 13;
+ let ResourceCycles = [13];
+}
+def : InstRW<[AtomWrite01_13], (instrs AAA, AAS)>;
+
+def AtomWrite01_14 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 14;
+ let ResourceCycles = [14];
+}
+def : InstRW<[AtomWrite01_14], (instrs CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm)>;
+
+def AtomWrite01_17 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 17;
+ let ResourceCycles = [17];
+}
+def : InstRW<[AtomWrite01_17], (instrs LOOPNE, PAUSE)>;
+
+def AtomWrite01_18 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 18;
+ let ResourceCycles = [18];
+}
+def : InstRW<[AtomWrite01_18], (instrs CMPXCHG8B, DAA, LOOP)>;
+
+def AtomWrite01_20 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 20;
+ let ResourceCycles = [20];
+}
+def : InstRW<[AtomWrite01_20], (instrs DAS)>;
+
+def AtomWrite01_21 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 21;
+ let ResourceCycles = [21];
+}
+def : InstRW<[AtomWrite01_21], (instrs AAM8i8, STD)>;
+
+def AtomWrite01_22 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 22;
+ let ResourceCycles = [22];
+}
+def : InstRW<[AtomWrite01_22], (instrs CMPXCHG16B)>;
+
+def AtomWrite01_23 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 23;
+ let ResourceCycles = [23];
+}
+def : InstRW<[AtomWrite01_23], (instrs ARPL16mr, ARPL16rr)>;
+
+def AtomWrite01_25 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 25;
+ let ResourceCycles = [25];
+}
+def : InstRW<[AtomWrite01_25], (instrs FNCLEX, FXTRACT)>;
+
+def AtomWrite01_26 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 26;
+ let ResourceCycles = [26];
+}
+def : InstRW<[AtomWrite01_26], (instrs POPF32, POPF64)>;
+
+def AtomWrite01_29 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 29;
+ let ResourceCycles = [29];
+}
+def : InstRW<[AtomWrite01_29], (instregex "POP(DS|ES|FS|GS)(16|32|64)")>;
+
+def AtomWrite01_30 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 30;
+ let ResourceCycles = [30];
+}
+def : InstRW<[AtomWrite01_30], (instrs RDTSC, RDTSCP)>;
+
+def AtomWrite01_32 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 32;
+ let ResourceCycles = [32];
+}
+def : InstRW<[AtomWrite01_32], (instrs ENTER, POPF16)>;
+
+def AtomWrite01_45 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 45;
+ let ResourceCycles = [45];
+}
+def : InstRW<[AtomWrite01_45], (instrs MONITOR32rrr, MONITOR64rrr)>;
+
+def AtomWrite01_46 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 46;
+ let ResourceCycles = [46];
+}
+def : InstRW<[AtomWrite01_46], (instrs FRNDINT, MWAITrr, RDPMC)>;
+
+def AtomWrite01_48 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 48;
+ let ResourceCycles = [48];
+}
+def : InstRW<[AtomWrite01_48], (instrs POPSS16, POPSS32)>;
+
+def AtomWrite01_55 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 55;
+ let ResourceCycles = [55];
+}
+def : InstRW<[AtomWrite01_55], (instrs FPREM)>;
+
+def AtomWrite01_59 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 59;
+ let ResourceCycles = [59];
+}
+def : InstRW<[AtomWrite01_59], (instrs INSB, INSL, INSW)>;
+
+def AtomWrite01_63 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 63;
+ let ResourceCycles = [63];
+}
+def : InstRW<[AtomWrite01_63], (instrs FNINIT)>;
+
+def AtomWrite01_68 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 68;
+ let ResourceCycles = [68];
+}
+def : InstRW<[AtomWrite01_68], (instrs OUT8rr, OUT16rr, OUT32rr)>;
+
+def AtomWrite01_71 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 71;
+ let ResourceCycles = [71];
+}
+def : InstRW<[AtomWrite01_71], (instrs FPREM1,
+ INVLPG, INVLPGA32, INVLPGA64)>;
+
+def AtomWrite01_72 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 72;
+ let ResourceCycles = [72];
+}
+def : InstRW<[AtomWrite01_72], (instrs OUT8ir, OUT16ir, OUT32ir)>;
+
+def AtomWrite01_74 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 74;
+ let ResourceCycles = [74];
+}
+def : InstRW<[AtomWrite01_74], (instrs OUTSB, OUTSL, OUTSW)>;
+
+def AtomWrite01_77 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 77;
+ let ResourceCycles = [77];
+}
+def : InstRW<[AtomWrite01_77], (instrs FSCALE)>;
+
+def AtomWrite01_78 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 78;
+ let ResourceCycles = [78];
+}
+def : InstRW<[AtomWrite01_78], (instrs RDMSR)>;
+
+def AtomWrite01_79 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 79;
+ let ResourceCycles = [79];
+}
+def : InstRW<[AtomWrite01_79], (instregex "RET(L|Q|W)?$",
+ "LRETI?(L|Q|W)")>;
+
+def AtomWrite01_92 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 92;
+ let ResourceCycles = [92];
+}
+def : InstRW<[AtomWrite01_92], (instrs IN8ri, IN16ri, IN32ri)>;
+
+def AtomWrite01_94 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 94;
+ let ResourceCycles = [94];
+}
+def : InstRW<[AtomWrite01_94], (instrs IN8rr, IN16rr, IN32rr)>;
+
+def AtomWrite01_99 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 99;
+ let ResourceCycles = [99];
+}
+def : InstRW<[AtomWrite01_99], (instrs F2XM1)>;
+
+def AtomWrite01_121 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 121;
+ let ResourceCycles = [121];
+}
+def : InstRW<[AtomWrite01_121], (instrs CPUID)>;
+
+def AtomWrite01_127 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 127;
+ let ResourceCycles = [127];
+}
+def : InstRW<[AtomWrite01_127], (instrs INT)>;
+
+def AtomWrite01_130 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 130;
+ let ResourceCycles = [130];
+}
+def : InstRW<[AtomWrite01_130], (instrs INT3)>;
+
+def AtomWrite01_140 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 140;
+ let ResourceCycles = [140];
+}
+def : InstRW<[AtomWrite01_140], (instrs FXSAVE, FXSAVE64)>;
+
+def AtomWrite01_141 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 141;
+ let ResourceCycles = [141];
+}
+def : InstRW<[AtomWrite01_141], (instrs FXRSTOR, FXRSTOR64)>;
+
+def AtomWrite01_146 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 146;
+ let ResourceCycles = [146];
+}
+def : InstRW<[AtomWrite01_146], (instrs FYL2X)>;
+
+def AtomWrite01_147 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 147;
+ let ResourceCycles = [147];
+}
+def : InstRW<[AtomWrite01_147], (instrs FYL2XP1)>;
+
+def AtomWrite01_168 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 168;
+ let ResourceCycles = [168];
+}
+def : InstRW<[AtomWrite01_168], (instrs FPTAN)>;
+
+def AtomWrite01_174 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 174;
+ let ResourceCycles = [174];
+}
+def : InstRW<[AtomWrite01_174], (instrs FSINCOS, FSIN, FCOS)>;
+
+def AtomWrite01_183 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 183;
+ let ResourceCycles = [183];
+}
+def : InstRW<[AtomWrite01_183], (instrs FPATAN)>;
+
+def AtomWrite01_202 : SchedWriteRes<[AtomPort01]> {
+ let Latency = 202;
+ let ResourceCycles = [202];
+}
+def : InstRW<[AtomWrite01_202], (instrs WRMSR)>;
+
+} // SchedModel
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBdVer2.td b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBdVer2.td
new file mode 100644
index 000000000000..0a201bc74a48
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBdVer2.td
@@ -0,0 +1,1462 @@
+//=- X86ScheduleBdVer2.td - X86 BdVer2 (Piledriver) Scheduling -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for AMD bdver2 (Piledriver) to support
+// instruction scheduling and other instruction cost heuristics.
+// Based on:
+// * AMD Software Optimization Guide for AMD Family 15h Processors.
+// https://support.amd.com/TechDocs/47414_15h_sw_opt_guide.pdf
+// * The microarchitecture of Intel, AMD and VIA CPUs, By Agner Fog
+// http://www.agner.org/optimize/microarchitecture.pdf
+// * https://www.realworldtech.com/bulldozer/
+// Yes, that is for Bulldozer aka bdver1, not Piledriver aka bdver2.
+//
+//===----------------------------------------------------------------------===//
+
+def BdVer2Model : SchedMachineModel {
+ let IssueWidth = 4; // Up to 4 IPC can be decoded, issued, retired.
+ let MicroOpBufferSize = 128; // RCU reorder buffer size, which is unconfirmed.
+ let LoopMicroOpBufferSize = -1; // There does not seem to be a loop buffer.
+ let LoadLatency = 4; // L1 data cache has a 4-cycle load-to-use latency.
+ let HighLatency = 25; // FIXME: any better choice?
+ let MispredictPenalty = 20; // Minimum branch misprediction penalty.
+
+ let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.
+
+ // FIXME: Incomplete. This flag is set to allow the scheduler to assign
+ // a default model to unrecognized opcodes.
+ let CompleteModel = 0;
+} // SchedMachineModel
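+
+// The latencies and resource cycles below can be spot-checked against this
+// model with llvm-mca, which consumes the scheduling information defined in
+// this file, e.g. (illustrative invocation; any bdver2-compatible assembly
+// snippet will do):
+//
+//   llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 snippet.s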
+
+let SchedModel = BdVer2Model in {
+
+
+//===----------------------------------------------------------------------===//
+// Pipes
+//===----------------------------------------------------------------------===//
+
+// There are a total of eight pipes.
+
+//===----------------------------------------------------------------------===//
+// Integer execution pipes
+//
+
+// Two EX (ALU) pipes.
+def PdEX0 : ProcResource<1>; // ALU, Integer Pipe0
+def PdEX1 : ProcResource<1>; // ALU, Integer Pipe1
+def PdEX01 : ProcResGroup<[PdEX0, PdEX1]>;
+
+// Two AGLU pipes, identical.
+def PdAGLU01 : ProcResource<2>; // AGU, Integer Pipe[23]
+
+//===----------------------------------------------------------------------===//
+// Floating point execution pipes
+//
+
+// Four FPU pipes.
+
+def PdFPU0 : ProcResource<1>; // Vector/FPU Pipe0
+def PdFPU1 : ProcResource<1>; // Vector/FPU Pipe1
+def PdFPU2 : ProcResource<1>; // Vector/FPU Pipe2
+def PdFPU3 : ProcResource<1>; // Vector/FPU Pipe3
+
+// FPU grouping
+def PdFPU01 : ProcResGroup<[PdFPU0, PdFPU1]>;
+def PdFPU23 : ProcResGroup<[PdFPU2, PdFPU3]>;
+
+
+//===----------------------------------------------------------------------===//
+// RCU
+//===----------------------------------------------------------------------===//
+
+// The Retire Control Unit on Piledriver can retire up to 4 macro-ops per cycle.
+// On the other hand, the RCU reorder buffer size for Piledriver does not
+// seem to be specified in any trustworthy source.
+// But as per https://www.realworldtech.com/bulldozer/6/, Bulldozer had an
+// RCU reorder buffer size of 128, so that is a good guess for now.
+def PdRCU : RetireControlUnit<128, 4>;
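+// (RetireControlUnit<buffer size, retire rate>, i.e. a 128-entry buffer
+// retiring up to 4 macro-ops per cycle, matching the description above.)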
+
+
+//===----------------------------------------------------------------------===//
+// Pipelines
+//===----------------------------------------------------------------------===//
+
+// There are a total of two pipelines, each with its own scheduler.
+
+//===----------------------------------------------------------------------===//
+// Integer Pipeline Scheduling
+//
+
+// There is one Integer Scheduler per core.
+
+// The integer physical register file has 96 64-bit registers.
+def PdIntegerPRF : RegisterFile<96, [GR64, CCR]>;
+
+// The unified Integer/Memory Scheduler has 40 entries.
+def PdEX : ProcResGroup<[PdEX0, PdEX1, PdAGLU01]> {
+ // Up to 4 IPC can be decoded, issued, retired.
+ let BufferSize = 40;
+}
+
+
+//===----------------------------------------------------------------------===//
+// FPU Pipeline Scheduling
+//
+
+// The FPU is shared between the two cores.
+
+// The FP physical register file has 160 128-bit registers.
+// Operations on 256-bit data types are cracked into two COPs.
+def PdFpuPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 2]>;
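+// (The trailing cost list maps each register class to the number of physical
+// registers a definition consumes: 1 for VR64/VR128, 2 for VR256, matching
+// the "cracked into two COPs" note above.)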
+
+// The unified FP Scheduler has 64 entries.
+def PdFPU : ProcResGroup<[PdFPU0, PdFPU1, PdFPU2, PdFPU3]> {
+ // Up to 4 IPC can be decoded, issued, retired.
+ let BufferSize = 64;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Functional units
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Load-Store Units
+//
+
+let Super = PdAGLU01 in
+def PdLoad : ProcResource<2> {
+ // For Piledriver, the load queue is 40 entries deep.
+ let BufferSize = 40;
+}
+
+def PdLoadQueue : LoadQueue<PdLoad>;
+
+let Super = PdAGLU01 in
+def PdStore : ProcResource<1> {
+ // For Piledriver, the store queue is 24 entries deep.
+ let BufferSize = 24;
+}
+
+def PdStoreQueue : StoreQueue<PdStore>;
+
+//===----------------------------------------------------------------------===//
+// Integer Execution Units
+//
+
+def PdDiv : ProcResource<1>; // PdEX0; unpipelined integer division
+def PdCount : ProcResource<1>; // PdEX0; POPCNT, LZCNT
+
+def PdMul : ProcResource<1>; // PdEX1; integer multiplication
+def PdBranch : ProcResource<1>; // PdEX1; JMP, fused branches
+
+//===----------------------------------------------------------------------===//
+// Floating-Point Units
+//
+
+// Two FMAC/FPFMA units.
+def PdFPFMA : ProcResource<2>; // PdFPU0, PdFPU1
+
+// One 128-bit integer multiply-accumulate unit.
+def PdFPMMA : ProcResource<1>; // PdFPU0
+
+// One fp conversion unit.
+def PdFPCVT : ProcResource<1>; // PdFPU0
+
+// One unit for shuffles, packs, permutes, shifts.
+def PdFPXBR : ProcResource<1>; // PdFPU1
+
+// Two 128-bit packed integer units.
+def PdFPMAL : ProcResource<2>; // PdFPU2, PdFPU3
+
+// One FP store unit.
+def PdFPSTO : ProcResource<1>; // PdFPU3
+
+
+//===----------------------------------------------------------------------===//
+// Basic helper classes.
+//===----------------------------------------------------------------------===//
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when dispatched by the schedulers.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass PdWriteRes<SchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 1> {
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+}
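+// For example, `defm : PdWriteRes<WriteBSWAP32, [PdEX01]>;` further below is
+// simply shorthand for
+//   def : WriteRes<WriteBSWAP32, [PdEX01]> {
+//     let Latency = 1; let ResourceCycles = []; let NumMicroOps = 1;
+//   }
+// with the defaults Lat = 1, Res = [] and UOps = 1 applied.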
+
+multiclass __pdWriteResPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat,
+ list<int> Res, int UOps,
+ int LoadLat, int LoadRes, int LoadUOps> {
+ defm : PdWriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
+
+ defm : PdWriteRes<SchedRW.Folded,
+ !listconcat([PdLoad], ExePorts),
+ !add(Lat, LoadLat),
+ !if(!and(!empty(Res), !eq(LoadRes, 1)),
+ [],
+ !listconcat([LoadRes],
+ !if(!empty(Res),
+ !listsplat(1, !size(ExePorts)),
+ Res))),
+ !add(UOps, LoadUOps)>;
+}
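+// For example, `defm : PdWriteResExPair<WriteALU, [PdEX01], 1, [2]>;` below
+// yields the register form WriteRes<WriteALU, [PdEX01]> with Latency = 1 and
+// ResourceCycles = [2], plus the load-folded (SchedRW.Folded) form on
+// [PdLoad, PdEX01] with Latency = 1 + 4, ResourceCycles = [3, 2] (the
+// LoadRes count prepended) and the same micro-op count.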
+
+multiclass PdWriteResExPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 1,
+ int LoadUOps = 0> {
+ defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+ /*LoadLat*/4, /*LoadRes*/3, LoadUOps>;
+}
+
+multiclass PdWriteResXMMPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 1,
+ int LoadUOps = 0> {
+ defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+ /*LoadLat*/5, /*LoadRes*/3, LoadUOps>;
+}
+
+multiclass PdWriteResYMMPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat,
+ list<int> Res = [], int UOps = 2,
+ int LoadUOps = 0> {
+ defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+ /*LoadLat*/5, /*LoadRes*/3, LoadUOps>;
+}
+
+//===----------------------------------------------------------------------===//
+// Here be dragons.
+//===----------------------------------------------------------------------===//
+
+// L1 data cache has a 4-cycle load-to-use latency, so ReadAfterLd registers
+// needn't be available until 4 cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 4>;
+
+// Vector loads are 5 cycles, so ReadAfterVec*Ld registers needn't be available
+// until 5 cycles after the memory operand.
+def : ReadAdvance<ReadAfterVecLd, 5>;
+def : ReadAdvance<ReadAfterVecXLd, 5>;
+def : ReadAdvance<ReadAfterVecYLd, 5>;
+
+// A transfer from the int domain to the ivec domain incurs an additional
+// latency of 8..10 cycles.
+// Reference: Agner, Microarchitecture, "AMD Bulldozer, Piledriver, Steamroller
+// and Excavator pipeline", "Data delay between different execution domains"
+def : ReadAdvance<ReadInt2Fpu, -10>;
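+// (A negative ReadAdvance effectively *adds* that many cycles to the
+// producer's latency as seen through this read, which is how the int-to-ivec
+// forwarding penalty above is modelled.)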
+
+// A folded store needs a cycle on the PdStore for the store data.
+def : WriteRes<WriteRMW, [PdStore]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Loads, stores, and moves, not folded with other operations.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteLoad, [PdLoad]> { let Latency = 5; let ResourceCycles = [2]; }
+def : WriteRes<WriteStore, [PdStore]>;
+def : WriteRes<WriteStoreNT, [PdStore]>;
+def : WriteRes<WriteMove, [PdEX01]> { let ResourceCycles = [2]; }
+
+// Load/store MXCSR.
+// FIXME: These are copied and pasted from WriteLoad/Store.
+def : WriteRes<WriteLDMXCSR, [PdLoad]> { let Latency = 5; }
+def : WriteRes<WriteSTMXCSR, [PdStore]> { let NumMicroOps = 2; let ResourceCycles = [18]; }
+
+// Treat misc copies as a move.
+def : InstRW<[WriteMove], (instrs COPY)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteZero, [/*No ExePorts*/]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResExPair<WriteJump, [PdEX1, PdBranch]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Special case scheduling classes.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteSystem, [PdEX01]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [PdEX01]> { let Latency = 100; }
+def : WriteRes<WriteFence, [PdStore]>;
+
+def PdWriteXLAT : SchedWriteRes<[PdEX01]> {
+ let Latency = 6;
+}
+def : InstRW<[PdWriteXLAT], (instrs XLAT)>;
+
+def PdWriteLARrr : SchedWriteRes<[PdEX01]> {
+ let Latency = 184;
+ let ResourceCycles = [375];
+ let NumMicroOps = 45;
+}
+def : InstRW<[PdWriteLARrr], (instregex "LAR(16|32|64)rr",
+ "LSL(16|32|64)rr")>;
+
+// Nops don't have dependencies, so there's no actual latency, but we set this
+// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
+def : WriteRes<WriteNop, [PdEX01]> { let ResourceCycles = [2]; }
+
+////////////////////////////////////////////////////////////////////////////////
+// Arithmetic.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResExPair<WriteALU, [PdEX01], 1, [2]>;
+
+def PdWriteALURMW : SchedWriteRes<[PdLoad, PdEX01, PdStore]> {
+ let Latency = 6;
+ let ResourceCycles = [3, 2, 1];
+ let NumMicroOps = 1;
+}
+def : SchedAlias<WriteALURMW, PdWriteALURMW>;
+
+def PdWriteLXADD : SchedWriteRes<[PdEX01]> {
+ let Latency = 6;
+ let ResourceCycles = [88];
+ let NumMicroOps = 4;
+}
+def : InstRW<[PdWriteLXADD], (instrs LXADD8, LXADD16, LXADD32, LXADD64)>;
+
+def PdWriteBMI1 : SchedWriteRes<[PdEX01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteBMI1],
+ (instrs BLCFILL32rr, BLCFILL64rr, BLCI32rr, BLCI64rr,
+ BLCIC32rr, BLCIC64rr, BLCMSK32rr, BLCMSK64rr,
+ BLCS32rr, BLCS64rr, BLSFILL32rr, BLSFILL64rr,
+ BLSIC32rr, BLSIC64rr, T1MSKC32rr, T1MSKC64rr,
+ TZMSK32rr, TZMSK64rr)>;
+
+def PdWriteBMI1m : SchedWriteRes<[PdLoad, PdEX01]> {
+ let Latency = 6;
+ let ResourceCycles = [3, 3];
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteBMI1m],
+ (instrs BLCFILL32rm, BLCFILL64rm, BLCI32rm, BLCI64rm,
+ BLCIC32rm, BLCIC64rm, BLCMSK32rm, BLCMSK64rm,
+ BLCS32rm, BLCS64rm, BLSFILL32rm, BLSFILL64rm,
+ BLSIC32rm, BLSIC64rm, T1MSKC32rm, T1MSKC64rm,
+ TZMSK32rm, TZMSK64rm)>;
+
+defm : PdWriteResExPair<WriteADC, [PdEX01], 1, [2]>;
+
+def PdWriteADCSBB64ri32 : SchedWriteRes<[PdEX01]> {
+ let ResourceCycles = [3];
+}
+def : InstRW<[PdWriteADCSBB64ri32], (instrs ADC64ri32, SBB64ri32)>;
+
+defm : PdWriteRes<WriteBSWAP32, [PdEX01]>;
+defm : PdWriteRes<WriteBSWAP64, [PdEX01]>;
+defm : PdWriteRes<WriteCMPXCHG, [PdEX1], 3, [3], 5>;
+defm : PdWriteRes<WriteCMPXCHGRMW, [PdEX1, PdStore, PdLoad], 3, [44, 1, 1], 2>;
+defm : PdWriteRes<WriteXCHG, [PdEX1], 1, [], 2>;
+
+def PdWriteCMPXCHG8rr : SchedWriteRes<[PdEX1]> {
+ let Latency = 3;
+ let ResourceCycles = [3];
+ let NumMicroOps = 3;
+}
+def : InstRW<[PdWriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;
+
+def PdWriteCMPXCHG8rm : SchedWriteRes<[PdEX1]> {
+ let Latency = 3;
+ let ResourceCycles = [23];
+ let NumMicroOps = 5;
+}
+def : InstRW<[PdWriteCMPXCHG8rm], (instrs CMPXCHG8rm)>;
+
+def PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm : SchedWriteRes<[PdEX1]> {
+ let Latency = 3;
+ let ResourceCycles = [21];
+ let NumMicroOps = 6;
+}
+def : InstRW<[PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm],
+ (instrs CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm)>;
+
+def PdWriteCMPXCHG8B : SchedWriteRes<[PdEX1]> {
+ let Latency = 3;
+ let ResourceCycles = [26];
+ let NumMicroOps = 18;
+}
+def : InstRW<[PdWriteCMPXCHG8B], (instrs CMPXCHG8B)>;
+
+def PdWriteCMPXCHG16B : SchedWriteRes<[PdEX1]> {
+ let Latency = 3;
+ let ResourceCycles = [69];
+ let NumMicroOps = 22;
+}
+def : InstRW<[PdWriteCMPXCHG16B], (instrs CMPXCHG16B)>;
+
+def PdWriteXADD : SchedWriteRes<[PdEX1]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteXADD], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr)>;
+
+def PdWriteXADDm : SchedWriteRes<[PdEX1]> {
+ let Latency = 6;
+ let ResourceCycles = [20];
+ let NumMicroOps = 4;
+}
+def : InstRW<[PdWriteXADDm], (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm)>;
+
+defm : PdWriteResExPair<WriteIMul8, [PdEX1, PdMul], 4, [1, 4]>;
+defm : PdWriteResExPair<WriteIMul16, [PdEX1, PdMul], 4, [1, 5], 2>;
+defm : PdWriteResExPair<WriteIMul16Imm, [PdEX1, PdMul], 5, [1, 5], 2>;
+defm : PdWriteResExPair<WriteIMul16Reg, [PdEX1, PdMul], 4, [1, 2]>;
+defm : PdWriteResExPair<WriteIMul32, [PdEX1, PdMul], 4, [1, 4]>;
+defm : PdWriteResExPair<WriteIMul32Imm, [PdEX1, PdMul], 4, [1, 2], 1, 1>;
+defm : PdWriteResExPair<WriteIMul32Reg, [PdEX1, PdMul], 4, [1, 2]>;
+defm : PdWriteResExPair<WriteIMul64, [PdEX1, PdMul], 6, [1, 6]>;
+defm : PdWriteResExPair<WriteIMul64Imm, [PdEX1, PdMul], 6, [1, 4], 1, 1>;
+defm : PdWriteResExPair<WriteIMul64Reg, [PdEX1, PdMul], 6, [1, 4]>;
+defm : X86WriteResUnsupported<WriteIMulH>; // BMI2 MULX
+
+defm : PdWriteResExPair<WriteDiv8, [PdEX1, PdDiv], 12, [1, 12]>;
+defm : PdWriteResExPair<WriteDiv16, [PdEX1, PdDiv], 15, [1, 15], 2>;
+defm : PdWriteResExPair<WriteDiv32, [PdEX1, PdDiv], 14, [1, 14], 2>;
+defm : PdWriteResExPair<WriteDiv64, [PdEX1, PdDiv], 14, [1, 14], 2>;
+
+defm : PdWriteResExPair<WriteIDiv8, [PdEX1, PdDiv], 12, [1, 12]>;
+defm : PdWriteResExPair<WriteIDiv16, [PdEX1, PdDiv], 15, [1, 17], 2>;
+defm : PdWriteResExPair<WriteIDiv32, [PdEX1, PdDiv], 14, [1, 25], 2>;
+defm : PdWriteResExPair<WriteIDiv64, [PdEX1, PdDiv], 14, [1, 14], 2>;
+
+defm : PdWriteResExPair<WriteCRC32, [PdEX01], 2, [4], 3>;
+
+def PdWriteCRC32r32r16 : SchedWriteRes<[PdEX01]> {
+ let Latency = 5;
+ let ResourceCycles = [10];
+ let NumMicroOps = 5;
+}
+def : InstRW<[PdWriteCRC32r32r16], (instrs CRC32r32r16)>;
+
+def PdWriteCRC32r32r32 : SchedWriteRes<[PdEX01]> {
+ let Latency = 6;
+ let ResourceCycles = [12];
+ let NumMicroOps = 7;
+}
+def : InstRW<[PdWriteCRC32r32r32], (instrs CRC32r32r32)>;
+
+def PdWriteCRC32r64r64 : SchedWriteRes<[PdEX01]> {
+ let Latency = 10;
+ let ResourceCycles = [17];
+ let NumMicroOps = 11;
+}
+def : InstRW<[PdWriteCRC32r64r64], (instrs CRC32r64r64)>;
+
+defm : PdWriteResExPair<WriteCMOV, [PdEX01]>; // Conditional move.
+
+def PdWriteCMOVm : SchedWriteRes<[PdLoad, PdEX01]> {
+ let Latency = 5;
+ let ResourceCycles = [3, 3];
+ let NumMicroOps = 2;
+}
+
+def PdWriteCMOVmVar : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_BE">>, [PdWriteCMOVm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_A">>, [PdWriteCMOVm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_L">>, [PdWriteCMOVm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_GE">>, [PdWriteCMOVm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_LE">>, [PdWriteCMOVm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_G">>, [PdWriteCMOVm]>,
+ SchedVar<NoSchedPred, [WriteCMOV.Folded]>
+]>;
+
+def : InstRW<[PdWriteCMOVmVar], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>;
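+// (The variant above inspects the condition-code immediate of the memory-form
+// CMOVs: the listed condition codes get the dedicated PdWriteCMOVm timing,
+// which charges extra load/ALU resource cycles and a second micro-op, while
+// all other condition codes fall back to the generic folded WriteCMOV.)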
+
+defm : PdWriteRes<WriteFCMOV, [PdFPU0, PdFPFMA]>; // x87 conditional move.
+
+def : WriteRes<WriteSETCC, [PdEX01]>; // Setcc.
+def : WriteRes<WriteSETCCStore, [PdEX01, PdStore]>;
+
+def PdWriteSETGEmSETGmSETLEmSETLm : SchedWriteRes<[PdEX01]> {
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def PdSETGEmSETGmSETLEmSETLm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_GE">>, [PdWriteSETGEmSETGmSETLEmSETLm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_G">>, [PdWriteSETGEmSETGmSETLEmSETLm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_LE">>, [PdWriteSETGEmSETGmSETLEmSETLm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_L">>, [PdWriteSETGEmSETGmSETLEmSETLm]>,
+ SchedVar<NoSchedPred, [WriteSETCCStore]>
+]>;
+def : InstRW<[PdSETGEmSETGmSETLEmSETLm], (instrs SETCCm)>;
+
+defm : PdWriteRes<WriteLAHFSAHF, [PdEX01], 2, [4], 2>;
+
+def PdWriteLAHF : SchedWriteRes<[PdEX01]> {
+ let Latency = 2;
+ let ResourceCycles = [4];
+ let NumMicroOps = 4;
+}
+def : InstRW<[PdWriteLAHF], (instrs LAHF)>;
+
+def PdWriteSAHF : SchedWriteRes<[PdEX01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteSAHF], (instrs SAHF)>;
+
+defm : PdWriteRes<WriteBitTest, [PdEX01], 1, [2], 1>;
+defm : PdWriteRes<WriteBitTestImmLd, [PdEX01, PdLoad], 5, [2, 3], 1>;
+defm : PdWriteRes<WriteBitTestRegLd, [PdEX01, PdLoad], 5, [7, 2], 7>;
+defm : PdWriteRes<WriteBitTestSet, [PdEX01], 2, [2], 2>;
+defm : PdWriteRes<WriteBitTestSetImmLd, [PdEX01, PdLoad], 6, [1, 1], 4>;
+defm : PdWriteRes<WriteBitTestSetRegLd, [PdEX01, PdLoad], 6, [1, 1], 10>;
+
+def PdWriteBTSIm : SchedWriteRes<[PdEX01, PdLoad]> {
+ let Latency = 7;
+ let ResourceCycles = [42, 1];
+ let NumMicroOps = 4;
+}
+def : SchedAlias<WriteBitTestSetImmRMW, PdWriteBTSIm>;
+def PdWriteBTSRm : SchedWriteRes<[PdEX01, PdLoad]> {
+ let Latency = 7;
+ let ResourceCycles = [44, 1];
+ let NumMicroOps = 10;
+}
+def : SchedAlias<WriteBitTestSetRegRMW, PdWriteBTSRm>;
+
+// This is for simple LEAs with one or two input operands.
+def : WriteRes<WriteLEA, [PdEX01]> { let ResourceCycles = [2]; }
+
+// This write is used for slow LEA instructions.
+def PdWrite3OpsLEA : SchedWriteRes<[PdEX01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+
+// On Piledriver, a slow LEA is either a 3Ops LEA (base, index, offset),
+// or an LEA with a `Scale` value different than 1.
+def PdSlowLEAPredicate : MCSchedPredicate<
+ CheckAny<[
+ // A 3-operand LEA (base, index, offset).
+ IsThreeOperandsLEAFn,
+ // An LEA with a "Scale" different than 1.
+ CheckAll<[
+ CheckIsImmOperand<2>,
+ CheckNot<CheckImmOperand<2, 1>>
+ ]>
+ ]>
+>;
+
+def PdWriteLEA : SchedWriteVariant<[
+ SchedVar<PdSlowLEAPredicate, [PdWrite3OpsLEA]>,
+ SchedVar<NoSchedPred, [WriteLEA]>
+]>;
+
+def : InstRW<[PdWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
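+// (For example, something like `lea 4(%rdi,%rsi,2), %rax` -- base, index,
+// displacement and a scale of 2 -- matches PdSlowLEAPredicate and takes the
+// PdWrite3OpsLEA path, whereas a plain `lea (%rdi,%rsi), %rax` stays on the
+// default WriteLEA timing.)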
+
+def PdWriteLEA16r : SchedWriteRes<[PdEX01]> {
+ let ResourceCycles = [3];
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteLEA16r], (instrs LEA16r)>;
+
+// Bit counts.
+defm : PdWriteResExPair<WriteBSF, [PdEX01], 3, [6], 6, 2>;
+defm : PdWriteResExPair<WriteBSR, [PdEX01], 4, [8], 7, 2>;
+defm : PdWriteResExPair<WritePOPCNT, [PdEX01], 4, [4]>;
+defm : PdWriteResExPair<WriteLZCNT, [PdEX0], 2, [2], 2>;
+defm : PdWriteResExPair<WriteTZCNT, [PdEX0], 2, [2], 2>;
+
+// BMI1 BEXTR, BMI2 BZHI
+defm : PdWriteResExPair<WriteBEXTR, [PdEX01], 2, [2], 2>;
+defm : PdWriteResExPair<WriteBLS, [PdEX01], 2, [2], 2>;
+defm : PdWriteResExPair<WriteBZHI, [PdEX01]>;
+
+def PdWriteBEXTRI : SchedWriteRes<[PdEX01]> {
+ let Latency = 2;
+ let ResourceCycles = [4];
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteBEXTRI], (instrs BEXTRI32ri, BEXTRI64ri)>;
+
+def PdWriteBEXTRIm : SchedWriteRes<[PdEX01]> {
+ let Latency = 2;
+ let ResourceCycles = [5];
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteBEXTRIm], (instrs BEXTRI32mi, BEXTRI64mi)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Integer shifts and rotates.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResExPair<WriteShift, [PdEX01], 1, [2]>;
+defm : PdWriteResExPair<WriteShiftCL, [PdEX01]>;
+defm : PdWriteResExPair<WriteRotate, [PdEX01], 1, [2]>;
+defm : PdWriteResExPair<WriteRotateCL, [PdEX01]>;
+
+def PdWriteRCL8rCL : SchedWriteRes<[PdEX01]> {
+ let Latency = 12;
+ let ResourceCycles = [24];
+ let NumMicroOps = 26;
+}
+def : InstRW<[PdWriteRCL8rCL], (instrs RCL8rCL)>;
+
+def PdWriteRCR8ri : SchedWriteRes<[PdEX01]> {
+ let Latency = 12;
+ let ResourceCycles = [23];
+ let NumMicroOps = 23;
+}
+def : InstRW<[PdWriteRCR8ri], (instrs RCR8ri)>;
+
+def PdWriteRCR8rCL : SchedWriteRes<[PdEX01]> {
+ let Latency = 11;
+ let ResourceCycles = [22];
+ let NumMicroOps = 24;
+}
+def : InstRW<[PdWriteRCR8rCL], (instrs RCR8rCL)>;
+
+def PdWriteRCL16rCL : SchedWriteRes<[PdEX01]> {
+ let Latency = 10;
+ let ResourceCycles = [20];
+ let NumMicroOps = 22;
+}
+def : InstRW<[PdWriteRCL16rCL], (instrs RCL16rCL)>;
+
+def PdWriteRCR16ri : SchedWriteRes<[PdEX01]> {
+ let Latency = 10;
+ let ResourceCycles = [19];
+ let NumMicroOps = 19;
+}
+def : InstRW<[PdWriteRCR16ri], (instrs RCR16ri)>;
+
+def PdWriteRCL3264rCL : SchedWriteRes<[PdEX01]> {
+ let Latency = 7;
+ let ResourceCycles = [14];
+ let NumMicroOps = 17;
+}
+def : InstRW<[PdWriteRCL3264rCL], (instrs RCL32rCL, RCL64rCL)>;
+
+def PdWriteRCR3264rCL : SchedWriteRes<[PdEX01]> {
+ let Latency = 7;
+ let ResourceCycles = [13];
+ let NumMicroOps = 16;
+}
+def : InstRW<[PdWriteRCR3264rCL], (instrs RCR32rCL, RCR64rCL)>;
+
+def PdWriteRCR32riRCR64ri : SchedWriteRes<[PdEX01]> {
+ let Latency = 7;
+ let ResourceCycles = [14];
+ let NumMicroOps = 15;
+}
+def : InstRW<[PdWriteRCR32riRCR64ri], (instrs RCR32ri, RCR64ri)>;
+
+
+def PdWriteRCR16rCL : SchedWriteRes<[PdEX01]> {
+ let Latency = 9;
+ let ResourceCycles = [18];
+ let NumMicroOps = 20;
+}
+def : InstRW<[PdWriteRCR16rCL], (instrs RCR16rCL)>;
+
+def PdWriteRCL16ri : SchedWriteRes<[PdEX01]> {
+ let Latency = 11;
+ let ResourceCycles = [21];
+ let NumMicroOps = 21;
+}
+def : InstRW<[PdWriteRCL16ri], (instrs RCL16ri)>;
+
+def PdWriteRCL3264ri : SchedWriteRes<[PdEX01]> {
+ let Latency = 8;
+ let ResourceCycles = [15];
+ let NumMicroOps = 16;
+}
+def : InstRW<[PdWriteRCL3264ri], (instrs RCL32ri, RCL64ri)>;
+
+def PdWriteRCL8ri : SchedWriteRes<[PdEX01]> {
+ let Latency = 13;
+ let ResourceCycles = [25];
+ let NumMicroOps = 25;
+}
+def : InstRW<[PdWriteRCL8ri], (instrs RCL8ri)>;
+
+// SHLD/SHRD.
+defm : PdWriteRes<WriteSHDrri, [PdEX01], 3, [6], 6>;
+defm : PdWriteRes<WriteSHDrrcl, [PdEX01], 3, [8], 7>;
+
+def PdWriteSHLD32rri8SHRD16rri8 : SchedWriteRes<[PdEX01]> {
+ let Latency = 3;
+ let ResourceCycles = [6];
+ let NumMicroOps = 6;
+}
+def : InstRW<[PdWriteSHLD32rri8SHRD16rri8], (instrs SHLD32rri8, SHRD16rri8)>;
+
+def PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL : SchedWriteRes<[PdEX01]> {
+ let Latency = 3;
+ let ResourceCycles = [6];
+ let NumMicroOps = 7;
+}
+def : InstRW<[PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL], (instrs SHLD16rrCL,
+ SHLD32rrCL,
+ SHRD32rrCL)>;
+
+defm : PdWriteRes<WriteSHDmri, [PdLoad, PdEX01], 4, [1, 22], 8>;
+defm : PdWriteRes<WriteSHDmrcl, [PdLoad, PdEX01], 4, [1, 22], 8>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Floating point. This covers both scalar and vector operations.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteRes<WriteFLD0, [PdFPU1, PdFPSTO], 3>;
+defm : PdWriteRes<WriteFLD1, [PdFPU1, PdFPSTO], 3>;
+defm : PdWriteRes<WriteFLDC, [PdFPU1, PdFPSTO], 3>;
+
+defm : PdWriteRes<WriteFLoad, [PdLoad, PdFPU01, PdFPFMA], 5, [3, 1, 3]>;
+defm : PdWriteRes<WriteFLoadX, [PdLoad, PdFPU01, PdFPFMA], 5, [3, 1, 3]>;
+defm : PdWriteRes<WriteFLoadY, [PdLoad, PdFPU01, PdFPFMA], 5, [3, 1, 3], 2>;
+
+defm : PdWriteRes<WriteFMaskedLoad, [PdLoad, PdFPU01, PdFPFMA], 6, [3, 1, 4]>;
+defm : PdWriteRes<WriteFMaskedLoadY, [PdLoad, PdFPU01, PdFPFMA], 6, [3, 2, 4], 2>;
+
+defm : PdWriteRes<WriteFStore, [PdStore, PdFPU23, PdFPSTO], 2, [1, 3, 1]>;
+defm : PdWriteRes<WriteFStoreX, [PdStore, PdFPU23, PdFPSTO], 1, [1, 3, 1]>;
+defm : PdWriteRes<WriteFStoreY, [PdStore, PdFPU23, PdFPSTO], 1, [1, 36, 2], 4>;
+
+def PdWriteMOVHPm : SchedWriteRes<[PdStore, PdFPU23, PdFPSTO]> {
+ let Latency = 2;
+ let ResourceCycles = [1, 3, 1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteMOVHPm], (instrs MOVHPDmr, MOVHPSmr, VMOVHPDmr, VMOVHPSmr)>;
+
+def PdWriteVMOVUPDYmrVMOVUPSYmr : SchedWriteRes<[PdStore, PdFPU1, PdFPSTO]> {
+ let NumMicroOps = 8;
+}
+def : InstRW<[PdWriteVMOVUPDYmrVMOVUPSYmr], (instrs VMOVUPDYmr, VMOVUPSYmr)>;
+
+defm : PdWriteRes<WriteFStoreNT, [PdStore, PdFPU1, PdFPSTO], 3>;
+defm : PdWriteRes<WriteFStoreNTX, [PdStore, PdFPU1, PdFPSTO], 3>;
+defm : PdWriteRes<WriteFStoreNTY, [PdStore, PdFPU1, PdFPSTO], 3, [2, 2, 2], 4>;
+
+defm : PdWriteRes<WriteFMaskedStore32, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>;
+defm : PdWriteRes<WriteFMaskedStore64, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>;
+defm : PdWriteRes<WriteFMaskedStore32Y, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>;
+defm : PdWriteRes<WriteFMaskedStore64Y, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>;
+
+defm : PdWriteRes<WriteFMove, [PdFPU01, PdFPFMA]>;
+defm : PdWriteRes<WriteFMoveX, [PdFPU01, PdFPFMA], 1, [1, 2]>;
+defm : PdWriteRes<WriteFMoveY, [PdFPU01, PdFPFMA], 2, [2, 2], 2>;
+
+defm : PdWriteRes<WriteEMMS, [PdFPU01, PdFPFMA], 2>;
+
+defm : PdWriteResXMMPair<WriteFAdd, [PdFPU0, PdFPFMA], 5>;
+defm : PdWriteResXMMPair<WriteFAddX, [PdFPU0, PdFPFMA], 5>;
+defm : PdWriteResYMMPair<WriteFAddY, [PdFPU0, PdFPFMA], 5, [1, 2]>;
+defm : X86WriteResPairUnsupported<WriteFAddZ>;
+
+def PdWriteX87Add: SchedWriteRes<[PdLoad, PdFPU0, PdFPFMA]> {
+ let Latency = 5;
+ let ResourceCycles = [3, 1, 10];
+}
+def : InstRW<[PdWriteX87Add], (instrs ADD_FI16m, ADD_FI32m, ADD_F32m, ADD_F64m,
+ SUB_FI16m, SUB_FI32m, SUB_F32m, SUB_F64m,
+ SUBR_FI16m, SUBR_FI32m, SUBR_F32m, SUBR_F64m)>;
+
+defm : PdWriteResXMMPair<WriteFAdd64, [PdFPU0, PdFPFMA], 5>;
+defm : PdWriteResXMMPair<WriteFAdd64X, [PdFPU0, PdFPFMA], 5>;
+defm : PdWriteResYMMPair<WriteFAdd64Y, [PdFPU0, PdFPFMA], 5, [1, 2]>;
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
+
+defm : PdWriteResXMMPair<WriteFCmp, [PdFPU0, PdFPFMA], 2>;
+defm : PdWriteResXMMPair<WriteFCmpX, [PdFPU0, PdFPFMA], 2>;
+defm : PdWriteResYMMPair<WriteFCmpY, [PdFPU0, PdFPFMA], 2, [1, 2]>;
+defm : X86WriteResPairUnsupported<WriteFCmpZ>;
+
+defm : PdWriteResXMMPair<WriteFCmp64, [PdFPU0, PdFPFMA], 2>;
+defm : PdWriteResXMMPair<WriteFCmp64X, [PdFPU0, PdFPFMA], 2>;
+defm : PdWriteResYMMPair<WriteFCmp64Y, [PdFPU0, PdFPFMA], 2, [1, 2]>;
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
+
+defm : PdWriteResXMMPair<WriteFCom, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
+defm : PdWriteResXMMPair<WriteFComX, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
+
+def PdWriteFCOMPm : SchedWriteRes<[PdFPU1, PdFPFMA]> {
+ let Latency = 6;
+}
+def : InstRW<[PdWriteFCOMPm], (instrs FCOM32m, FCOM64m, FCOMP32m, FCOMP64m)>;
+
+def PdWriteTST_F_UCOM_FPPr : SchedWriteRes<[PdFPU1, PdFPFMA]>;
+def : InstRW<[PdWriteTST_F_UCOM_FPPr], (instrs TST_F, UCOM_FPPr)>;
+
+defm : PdWriteResXMMPair<WriteFMul, [PdFPU1, PdFPFMA], 5>;
+defm : PdWriteResXMMPair<WriteFMulX, [PdFPU1, PdFPFMA], 5>;
+defm : PdWriteResYMMPair<WriteFMulY, [PdFPU1, PdFPFMA], 5, [1, 2]>;
+defm : X86WriteResPairUnsupported<WriteFMulZ>;
+
+def PdWriteX87Mul: SchedWriteRes<[PdLoad, PdFPU1, PdFPFMA]> {
+ let Latency = 5;
+ let ResourceCycles = [3, 1, 10];
+}
+def : InstRW<[PdWriteX87Mul], (instrs MUL_FI16m, MUL_FI32m, MUL_F32m, MUL_F64m)>;
+
+defm : PdWriteResXMMPair<WriteFMul64, [PdFPU1, PdFPFMA], 5>;
+defm : PdWriteResXMMPair<WriteFMul64X, [PdFPU1, PdFPFMA], 5>;
+defm : PdWriteResYMMPair<WriteFMul64Y, [PdFPU1, PdFPFMA], 5, [1, 2]>;
+defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+
+defm : PdWriteResXMMPair<WriteFMA, [PdFPU, PdFPFMA], 5, [1, 3]>;
+defm : PdWriteResXMMPair<WriteFMAX, [PdFPU, PdFPFMA], 5, [1, 3]>;
+defm : PdWriteResYMMPair<WriteFMAY, [PdFPU, PdFPFMA], 5, [1, 3]>;
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
+
+
+defm : PdWriteResXMMPair<WriteDPPD, [PdFPU1, PdFPFMA], 15, [1, 10], 15, 2>;
+
+defm : PdWriteResXMMPair<WriteDPPS, [PdFPU1, PdFPFMA], 25, [1, 14], 16, 2>;
+defm : PdWriteResYMMPair<WriteDPPSY, [PdFPU1, PdFPFMA], 27, [2, 25], /*or 29*/ 25, 4>;
+defm : X86WriteResPairUnsupported<WriteDPPSZ>;
+
+def PdWriteVDPPSrri : SchedWriteRes<[PdFPU1, PdFPFMA]> {
+ let Latency = 27;
+ let ResourceCycles = [1, 14];
+ let NumMicroOps = 17;
+}
+def : InstRW<[PdWriteVDPPSrri], (instrs VDPPSrri)>;
+
+defm : PdWriteResXMMPair<WriteFRcp, [PdFPU1, PdFPFMA], 5>;
+defm : PdWriteResXMMPair<WriteFRcpX, [PdFPU1, PdFPFMA], 5>;
+defm : PdWriteResYMMPair<WriteFRcpY, [PdFPU1, PdFPFMA], 5, [2, 1]>;
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+
+defm : PdWriteResXMMPair<WriteFRsqrt, [PdFPU1, PdFPFMA], 5, [1, 2]>;
+defm : PdWriteResXMMPair<WriteFRsqrtX, [PdFPU1, PdFPFMA], 5>;
+defm : PdWriteResYMMPair<WriteFRsqrtY, [PdFPU1, PdFPFMA], 5, [2, 2]>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+
+defm : PdWriteResXMMPair<WriteFDiv, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResXMMPair<WriteFDivX, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResYMMPair<WriteFDivY, [PdFPU1, PdFPFMA], 9, [2, 18]>;
+defm : X86WriteResPairUnsupported<WriteFDivZ>;
+
+def PdWriteX87Div: SchedWriteRes<[PdLoad, PdFPU0, PdFPFMA]> {
+ let Latency = 9;
+ let ResourceCycles = [3, 1, 18];
+}
+def : InstRW<[PdWriteX87Div], (instrs DIV_FI16m, DIV_FI32m,
+ DIVR_FI16m, DIVR_FI32m,
+ DIV_F32m, DIV_F64m,
+ DIVR_F32m, DIVR_F64m)>;
+
+defm : PdWriteResXMMPair<WriteFDiv64, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResXMMPair<WriteFDiv64X, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResYMMPair<WriteFDiv64Y, [PdFPU1, PdFPFMA], 9, [2, 18]>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+
+defm : PdWriteResXMMPair<WriteFSqrt, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResXMMPair<WriteFSqrtX, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResYMMPair<WriteFSqrtY, [PdFPU1, PdFPFMA], 9, [2, 18]>;
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
+
+defm : PdWriteResXMMPair<WriteFSqrt64, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResXMMPair<WriteFSqrt64X, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResYMMPair<WriteFSqrt64Y, [PdFPU1, PdFPFMA], 9, [2, 18]>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
+
+defm : PdWriteResXMMPair<WriteFSqrt80, [PdFPU1, PdFPFMA], 1, [1, 18]>;
+defm : PdWriteResXMMPair<WriteFSign, [PdFPU1, PdFPFMA], 1, [1, 4]>;
+
+defm : PdWriteResXMMPair<WriteFRnd, [PdFPU1, PdFPSTO], 4, []>;
+defm : PdWriteResYMMPair<WriteFRndY, [PdFPU1, PdFPSTO], 4, [2, 1], 2>;
+defm : X86WriteResPairUnsupported<WriteFRndZ>;
+
+def PdWriteVFRCZP : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+ let Latency = 10;
+ let ResourceCycles = [2, 1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteVFRCZP], (instrs VFRCZPDrr, VFRCZPSrr)>;
+
+def PdWriteVFRCZS : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+ let Latency = 10;
+ let ResourceCycles = [10, 1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteVFRCZS], (instrs VFRCZSDrr, VFRCZSSrr)>;
+
+def PdWriteVFRCZm : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+ let Latency = 15;
+ let ResourceCycles = [2, 1];
+ let NumMicroOps = 3;
+}
+def : InstRW<[PdWriteVFRCZm], (instrs VFRCZPDrm, VFRCZPSrm,
+ VFRCZSDrm, VFRCZSSrm)>;
+
+def PdWriteVFRCZY : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+ let Latency = 10;
+ let ResourceCycles = [3, 1];
+ let NumMicroOps = 4;
+}
+def : InstRW<[PdWriteVFRCZY], (instrs VFRCZPSYrr, VFRCZPDYrr)>;
+
+def PdWriteVFRCZYm : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+ let Latency = 15;
+ let ResourceCycles = [4, 1];
+ let NumMicroOps = 8;
+}
+def : InstRW<[PdWriteVFRCZYm], (instrs VFRCZPSYrm, VFRCZPDYrm)>;
+
+defm : PdWriteResXMMPair<WriteFLogic, [PdFPU01, PdFPFMA], 2, [1, 2]>;
+defm : PdWriteResYMMPair<WriteFLogicY, [PdFPU01, PdFPFMA], 2, [2, 2]>;
+defm : X86WriteResPairUnsupported<WriteFLogicZ>;
+
+defm : PdWriteResXMMPair<WriteFTest, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
+defm : PdWriteResYMMPair<WriteFTestY, [PdFPU01, PdFPFMA, PdEX0], 1, [4, 4, 1], 4, 2>;
+defm : X86WriteResPairUnsupported<WriteFTestZ>;
+
+defm : PdWriteResXMMPair<WriteFShuffle, [PdFPU01, PdFPFMA], 2, [1, 2]>;
+defm : PdWriteResYMMPair<WriteFShuffleY, [PdFPU01, PdFPFMA], 2, [2, 4], 2>;
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
+
+def PdWriteVBROADCASTF128 : SchedWriteRes<[PdFPU01, PdFPFMA]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 3];
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteVBROADCASTF128], (instrs VBROADCASTF128)>;
+
+defm : PdWriteResXMMPair<WriteFVarShuffle, [PdFPU01, PdFPFMA], 3, [1, 2]>;
+defm : PdWriteResYMMPair<WriteFVarShuffleY, [PdFPU01, PdFPFMA], 3, [2, 4], 2>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
+
+defm : PdWriteResXMMPair<WriteFBlend, [PdFPU01, PdFPFMA], 2, [1, 3]>;
+defm : PdWriteResYMMPair<WriteFBlendY, [PdFPU01, PdFPFMA], 2, [2, 3], 2>;
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+
+defm : PdWriteResXMMPair<WriteFVarBlend, [PdFPU01, PdFPFMA], 2, [1, 3]>;
+defm : PdWriteResYMMPair<WriteFVarBlendY, [PdFPU01, PdFPFMA], 2, [2, 4], 2>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
+
+defm : PdWriteResXMMPair<WriteFShuffle256, [PdFPU01, PdFPFMA], 2, [1, 3], 2>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
+
+def PdWriteVEXTRACTF128rr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
+ let Latency = 2;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[PdWriteVEXTRACTF128rr], (instrs VEXTRACTF128rr)>;
+
+def PdWriteVEXTRACTF128mr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 4];
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteVEXTRACTF128mr], (instrs VEXTRACTF128mr)>;
+
+def PdWriteVPERM2F128rr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
+ let Latency = 4;
+ let ResourceCycles = [1, 6];
+ let NumMicroOps = 8;
+}
+def : InstRW<[PdWriteVPERM2F128rr], (instrs VPERM2F128rr)>;
+
+def PdWriteVPERM2F128rm : SchedWriteRes<[PdFPU01, PdFPFMA]> {
+ let Latency = 8; // 4 + 4
+ let ResourceCycles = [1, 8];
+ let NumMicroOps = 10;
+}
+def : InstRW<[PdWriteVPERM2F128rm], (instrs VPERM2F128rm)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Conversions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResXMMPair<WriteCvtSS2I, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>;
+
+defm : PdWriteResXMMPair<WriteCvtPS2I, [PdFPU0, PdFPCVT, PdFPSTO], 4>;
+defm : PdWriteResYMMPair<WriteCvtPS2IY, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+
+defm : PdWriteResXMMPair<WriteCvtSD2I, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>;
+
+defm : PdWriteResXMMPair<WriteCvtPD2I, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>;
+defm : PdWriteResYMMPair<WriteCvtPD2IY, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+
+def PdWriteMMX_CVTTPD2PIirr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteMMX_CVTTPD2PIirr], (instrs MMX_CVTTPD2PIirr)>;
+
+// FIXME: f+3 ST, LD+STC latency
+defm : PdWriteResXMMPair<WriteCvtI2SS, [PdFPU0, PdFPCVT, PdFPSTO], 4, [], 2>;
+// FIXME: the .Folded version is one NumMicroOp *fewer*.
+
+defm : PdWriteResXMMPair<WriteCvtI2PS, [PdFPU0, PdFPCVT, PdFPSTO], 4>;
+defm : PdWriteResYMMPair<WriteCvtI2PSY, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+
+defm : PdWriteResXMMPair<WriteCvtI2SD, [PdFPU0, PdFPCVT, PdFPSTO], 4, [], 2>;
+// FIXME: the .Folded version is one NumMicroOp *fewer*.
+
+def PdWriteCVTSI642SDrr_CVTSI642SSrr_CVTSI2SDr_CVTSI2SSrr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
+ let Latency = 13;
+ let ResourceCycles = [1, 3, 1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteCVTSI642SDrr_CVTSI642SSrr_CVTSI2SDr_CVTSI2SSrr], (instrs CVTSI642SDrr, CVTSI642SSrr, CVTSI2SDrr, CVTSI2SSrr)>;
+
+defm : PdWriteResXMMPair<WriteCvtI2PD, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>;
+defm : PdWriteResYMMPair<WriteCvtI2PDY, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 4, 1>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+
+defm : PdWriteResXMMPair<WriteCvtSS2SD, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
+
+defm : PdWriteResXMMPair<WriteCvtPS2PD, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>;
+defm : PdWriteResYMMPair<WriteCvtPS2PDY, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 4, 1>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
+
+defm : PdWriteResXMMPair<WriteCvtSD2SS, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
+
+defm : PdWriteResXMMPair<WriteCvtPD2PS, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>;
+defm : PdWriteResYMMPair<WriteCvtPD2PSY, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
+
+def PdWriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr], (instrs MMX_CVTPD2PIirr,
+ MMX_CVTPI2PDirr)>;
+
+def PdWriteMMX_CVTPI2PSirr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteMMX_CVTPI2PSirr], (instrs MMX_CVTPI2PSirr)>;
+
+defm : PdWriteResXMMPair<WriteCvtPH2PS, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 2, 1>;
+defm : PdWriteResYMMPair<WriteCvtPH2PSY, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 4, 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
+
+defm : PdWriteRes<WriteCvtPS2PH, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 2>;
+defm : PdWriteRes<WriteCvtPS2PHY, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
+
+defm : PdWriteRes<WriteCvtPS2PHSt, [PdFPU0, PdFPCVT, PdFPSTO, PdStore], 4, [1, 2, 1, 1], 3>;
+defm : PdWriteRes<WriteCvtPS2PHYSt, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA, PdStore], 4, [1, 2, 1, 1, 1], 4>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Vector integer operations.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteRes<WriteVecLoad, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 1, 3]>;
+defm : PdWriteRes<WriteVecLoadX, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 1, 3]>;
+defm : PdWriteRes<WriteVecLoadY, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 2, 3], 2>;
+
+defm : PdWriteRes<WriteVecLoadNT, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 1, 4]>;
+defm : PdWriteRes<WriteVecLoadNTY, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 2, 4]>;
+
+defm : PdWriteRes<WriteVecMaskedLoad, [PdLoad, PdFPU01, PdFPMAL], 6, [3, 1, 2]>;
+defm : PdWriteRes<WriteVecMaskedLoadY, [PdLoad, PdFPU01, PdFPMAL], 6, [3, 2, 4], 2>;
+
+defm : PdWriteRes<WriteVecStore, [PdStore, PdFPU23, PdFPSTO], 2, [1, 3, 1]>;
+defm : PdWriteRes<WriteVecStoreX, [PdStore, PdFPU23, PdFPSTO], 1, [1, 3, 1]>;
+defm : PdWriteRes<WriteVecStoreY, [PdStore, PdFPU23, PdFPSTO], 1, [2, 36, 2], 4>;
+
+def PdWriteVMOVDQUYmr : SchedWriteRes<[PdStore, PdFPU1, PdFPSTO]> {
+ let NumMicroOps = 8;
+}
+def : InstRW<[PdWriteVMOVDQUYmr], (instrs VMOVDQUYmr)>;
+
+defm : PdWriteRes<WriteVecStoreNT, [PdStore, PdFPU1, PdFPSTO], 2>;
+defm : PdWriteRes<WriteVecStoreNTY, [PdStore, PdFPU1, PdFPSTO], 2, [2, 2, 2], 4>;
+
+defm : X86WriteResUnsupported<WriteVecMaskedStore32>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore64>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;
+
+defm : PdWriteRes<WriteVecMove, [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteRes<WriteVecMoveX, [PdFPU01, PdFPMAL], 1, [1, 2]>;
+defm : PdWriteRes<WriteVecMoveY, [PdFPU01, PdFPMAL], 2, [2, 2], 2>;
+
+def PdWriteMOVDQArr : SchedWriteRes<[PdFPU01, PdFPMAL]> {
+}
+def : InstRW<[PdWriteMOVDQArr], (instrs MOVDQArr)>;
+
+def PdWriteMOVQ2DQrr : SchedWriteRes<[PdFPU01, PdFPMAL]> {
+ let Latency = 4;
+}
+def : InstRW<[PdWriteMOVQ2DQrr], (instrs MMX_MOVQ2DQrr)>;
+
+defm : PdWriteRes<WriteVecMoveToGpr, [PdFPU0, PdFPFMA, PdEX0], 11>;
+defm : PdWriteRes<WriteVecMoveFromGpr, [PdFPU01, PdFPFMA], 11, [1, 2], 2>;
+
+defm : PdWriteResXMMPair<WriteVecALU, [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteResXMMPair<WriteVecALUX, [PdFPU01, PdFPMAL], 2, [1, 2]>;
+defm : X86WriteResPairUnsupported<WriteVecALUY>;
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
+
+defm : PdWriteResXMMPair<WriteVecShift, [PdFPU01, PdFPMAL], 3, [1, 2]>;
+defm : PdWriteResXMMPair<WriteVecShiftX, [PdFPU01, PdFPMAL], 3, [1, 2]>;
+defm : X86WriteResPairUnsupported<WriteVecShiftY>;
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
+
+defm : PdWriteResXMMPair<WriteVecShiftImm, [PdFPU01, PdFPMAL], 2, [1, 2]>;
+defm : PdWriteResXMMPair<WriteVecShiftImmX, [PdFPU01, PdFPMAL], 2, [1, 2]>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+
+defm : PdWriteResXMMPair<WriteVecIMul, [PdFPU0, PdFPMMA], 4>;
+defm : PdWriteResXMMPair<WriteVecIMulX, [PdFPU0, PdFPMMA], 4>;
+defm : X86WriteResPairUnsupported<WriteVecIMulY>;
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
+
+defm : PdWriteResXMMPair<WritePMULLD, [PdFPU0, PdFPU01, PdFPMMA, PdFPMAL], 5, [2, 1, 2, 1]>;
+defm : X86WriteResPairUnsupported<WritePMULLDY>;
+defm : X86WriteResPairUnsupported<WritePMULLDZ>;
+
+def PdWriteVPMACS : SchedWriteRes<[PdFPU0, PdFPMMA, PdFPMAL]> {
+ let Latency = 4;
+}
+def : InstRW<[PdWriteVPMACS], (instrs VPMACSDQHrr, VPMACSDQLrr, VPMACSSDQHrr,
+ VPMACSSDQLrr)>;
+
+defm : PdWriteResXMMPair<WriteMPSAD, [PdFPU0, PdFPMMA], 9, [1, 4], 8>;
+defm : X86WriteResPairUnsupported<WriteMPSADY>;
+defm : X86WriteResPairUnsupported<WriteMPSADZ>;
+
+def PdWriteVMPSADBW : SchedWriteRes<[PdFPU0, PdFPMMA]> {
+ let Latency = 8;
+ let ResourceCycles = [1, 4];
+ let NumMicroOps = 10;
+}
+def : InstRW<[PdWriteVMPSADBW], (instrs VMPSADBWrri)>;
+
+defm : PdWriteResXMMPair<WritePSADBW, [PdFPU01, PdFPMAL], 4, [1, 2], 2>;
+defm : PdWriteResXMMPair<WritePSADBWX, [PdFPU01, PdFPMAL], 4, [1, 2], 2>;
+defm : X86WriteResPairUnsupported<WritePSADBWY>;
+defm : X86WriteResPairUnsupported<WritePSADBWZ>;
+
+defm : PdWriteResXMMPair<WritePHMINPOS, [PdFPU0, PdFPMAL], 4, [], 2>;
+
+defm : PdWriteResXMMPair<WriteShuffle, [PdFPU01, PdFPMAL], 2, [1, 2]>;
+defm : PdWriteResXMMPair<WriteShuffleX, [PdFPU01, PdFPMAL], 2, [1, 2]>;
+defm : PdWriteResYMMPair<WriteShuffleY, [PdFPU01, PdFPMAL], 2, [1, 4]>;
+defm : X86WriteResPairUnsupported<WriteShuffleZ>;
+
+defm : PdWriteResXMMPair<WriteVarShuffle, [PdFPU01, PdFPMAL], 3, [1, 2]>;
+defm : PdWriteResXMMPair<WriteVarShuffleX, [PdFPU01, PdFPMAL], 3, [1, 3]>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+
+def PdWriteVPPERM : SchedWriteRes<[PdFPU01, PdFPMAL]> {
+ let Latency = 2;
+ let ResourceCycles = [1, 3];
+}
+def : InstRW<[PdWriteVPPERM], (instrs VPPERMrrr, VPPERMrrr_REV)>;
+
+defm : PdWriteResXMMPair<WriteBlend, [PdFPU01, PdFPMAL], 2>;
+defm : X86WriteResPairUnsupported<WriteBlendY>;
+defm : X86WriteResPairUnsupported<WriteBlendZ>;
+
+defm : PdWriteResXMMPair<WriteVarBlend, [PdFPU01, PdFPMAL], 2, [1, 2]>;
+defm : X86WriteResPairUnsupported<WriteVarBlendY>;
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
+
+defm : PdWriteResXMMPair<WriteVecLogic, [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteResXMMPair<WriteVecLogicX, [PdFPU01, PdFPMAL], 2, [1, 2]>;
+defm : X86WriteResPairUnsupported<WriteVecLogicY>;
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
+
+defm : PdWriteResXMMPair<WriteVecTest, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
+defm : PdWriteResYMMPair<WriteVecTestY, [PdFPU01, PdFPFMA, PdEX0], 1, [2, 4, 1], 4, 2>;
+defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+
+defm : PdWriteResXMMPair<WriteShuffle256, [PdFPU01, PdFPMAL]>;
+defm : PdWriteResXMMPair<WriteVarShuffle256, [PdFPU01, PdFPMAL]>;
+
+defm : PdWriteResXMMPair<WriteVarVecShift, [PdFPU01, PdFPMAL], 3, [1, 2]>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Vector insert/extract operations.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteRes<WriteVecInsert, [PdFPU01, PdFPMAL], 2, [1, 3], 2>;
+defm : PdWriteRes<WriteVecInsertLd, [PdFPU01, PdFPMAL, PdLoad], 6, [1, 4, 3], 2>;
+
+defm : PdWriteRes<WriteVecExtract, [PdFPU0, PdFPFMA, PdEX0], 12, [1, 3, 1], 2>;
+defm : PdWriteRes<WriteVecExtractSt, [PdFPU1, PdFPSTO, PdStore], 13, [2, 1, 1], 2>;
+
+def PdWriteEXTRQ : SchedWriteRes<[PdFPU01, PdFPMAL]> {
+ let Latency = 3;
+ let ResourceCycles = [1, 3];
+}
+def : InstRW<[PdWriteEXTRQ], (instrs EXTRQ, EXTRQI)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// SSE42 String instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResXMMPair<WritePCmpIStrI, [PdFPU1, PdFPFMA, PdEX0], 11, [1, 6, 1], 7, 1>;
+defm : PdWriteResXMMPair<WritePCmpIStrM, [PdFPU1, PdFPFMA, PdEX0], 7, [1, 8, 1], 7, 2>;
+
+defm : PdWriteResXMMPair<WritePCmpEStrI, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 14, [1, 10, 10, 10, 1, 1], 27, 1>;
+defm : PdWriteResXMMPair<WritePCmpEStrM, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 10, [1, 10, 10, 10, 1, 1], 27, 1>;
+
+////////////////////////////////////////////////////////////////////////////////
+// MOVMSK Instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteRes<WriteFMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 12, [], 2>;
+
+defm : PdWriteRes<WriteVecMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 12, [], 2>;
+defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
+// defm : X86WriteResUnsupported<WriteVecMOVMSKZ>;
+
+defm : PdWriteRes<WriteMMXMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 10, [], 2>;
+
+////////////////////////////////////////////////////////////////////////////////
+// AES Instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResXMMPair<WriteAESIMC, [PdFPU0, PdFPMMA], 5>;
+defm : PdWriteResXMMPair<WriteAESKeyGen, [PdFPU0, PdFPMMA], 5>;
+defm : PdWriteResXMMPair<WriteAESDecEnc, [PdFPU0, PdFPMMA], 9, [], 2>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResXMMPair<WriteFHAdd, [PdFPU0, PdFPFMA], 11, [1, 5], 3, 1>;
+defm : PdWriteResYMMPair<WriteFHAddY, [PdFPU0, PdFPFMA], 11, [1, 8], 8, 2>;
+defm : X86WriteResPairUnsupported<WriteFHAddZ>;
+
+defm : PdWriteResXMMPair<WritePHAdd, [PdFPU01, PdFPMAL], 5, [1, 4], 3, 1>;
+defm : PdWriteResXMMPair<WritePHAddX, [PdFPU01, PdFPMAL], 2, [1, 2]>;
+defm : X86WriteResPairUnsupported<WritePHAddY>;
+defm : X86WriteResPairUnsupported<WritePHAddZ>;
+
+def : InstRW<[WritePHAdd], (instrs PHADDDrr, PHSUBDrr,
+ PHADDWrr, PHSUBWrr,
+ PHADDSWrr, PHSUBSWrr,
+ VPHADDDrr, VPHSUBDrr,
+ VPHADDWrr, VPHSUBWrr,
+ VPHADDSWrr, VPHSUBSWrr)>;
+
+def : InstRW<[WritePHAdd.Folded], (instrs PHADDDrm, PHSUBDrm,
+ PHADDWrm, PHSUBWrm,
+ PHADDSWrm, PHSUBSWrm,
+ VPHADDDrm, VPHSUBDrm,
+ VPHADDWrm, VPHSUBWrm,
+ VPHADDSWrm, VPHSUBSWrm)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Carry-less multiplication instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResXMMPair<WriteCLMul, [PdFPU0, PdFPMMA], 12, [1, 7], 5, 1>;
+
+def PdWriteVPCLMULQDQrr : SchedWriteRes<[PdFPU0, PdFPMMA]> {
+ let Latency = 12;
+ let ResourceCycles = [1, 7];
+ let NumMicroOps = 6;
+}
+def : InstRW<[PdWriteVPCLMULQDQrr], (instrs VPCLMULQDQrr)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// SSE4A instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def PdWriteINSERTQ : SchedWriteRes<[PdFPU01, PdFPMAL]> {
+ let Latency = 3;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[PdWriteINSERTQ], (instrs INSERTQ)>;
+
+def PdWriteINSERTQI : SchedWriteRes<[PdFPU01, PdFPMAL]> {
+ let Latency = 3;
+ let ResourceCycles = [1, 3];
+}
+def : InstRW<[PdWriteINSERTQI], (instrs INSERTQI)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// AVX instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def PdWriteVBROADCASTYLd : SchedWriteRes<[PdLoad, PdFPU01, PdFPFMA]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 2, 4];
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteVBROADCASTYLd, ReadAfterLd], (instrs VBROADCASTSDYrm,
+ VBROADCASTSSYrm)>;
+
+def PdWriteVZEROALL : SchedWriteRes<[]> {
+ let Latency = 90;
+ let NumMicroOps = 32;
+}
+def : InstRW<[PdWriteVZEROALL], (instrs VZEROALL)>;
+
+def PdWriteVZEROUPPER : SchedWriteRes<[]> {
+ let Latency = 46;
+ let NumMicroOps = 16;
+}
+def : InstRW<[PdWriteVZEROUPPER], (instrs VZEROUPPER)>;
+
+///////////////////////////////////////////////////////////////////////////////
+// SchedWriteVariant definitions.
+///////////////////////////////////////////////////////////////////////////////
+
+def PdWriteZeroLatency : SchedWriteRes<[]> {
+ let Latency = 0;
+}
+
+def PdWriteZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
+ SchedVar<MCSchedPredicate<TruePred>, [WriteALU]>
+]>;
+def : InstRW<[PdWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
+ XOR32rr, XOR64rr)>;
+
+def PdWriteFZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
+ SchedVar<MCSchedPredicate<TruePred>, [WriteFLogic]>
+]>;
+def : InstRW<[PdWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr,
+ XORPDrr, VXORPDrr,
+ ANDNPSrr, VANDNPSrr,
+ ANDNPDrr, VANDNPDrr)>;
+
+// VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr "zero-idioms" have latency of 1.
+
+def PdWriteVZeroIdiomLogic : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
+ SchedVar<MCSchedPredicate<TruePred>, [WriteVecLogic]>
+]>;
+def : InstRW<[PdWriteVZeroIdiomLogic], (instrs MMX_PXORirr, MMX_PANDNirr)>;
+
+def PdWriteVZeroIdiomLogicX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
+ SchedVar<MCSchedPredicate<TruePred>, [WriteVecLogicX]>
+]>;
+def : InstRW<[PdWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr,
+ PANDNrr, VPANDNrr)>;
+
+def PdWriteVZeroIdiomALU : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
+ SchedVar<MCSchedPredicate<TruePred>, [WriteVecALU]>
+]>;
+def : InstRW<[PdWriteVZeroIdiomALU], (instrs MMX_PSUBBirr, MMX_PSUBDirr,
+ MMX_PSUBQirr, MMX_PSUBWirr,
+ MMX_PCMPGTBirr,
+ MMX_PCMPGTDirr,
+ MMX_PCMPGTWirr)>;
+
+def PdWriteVZeroIdiomALUX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
+ SchedVar<MCSchedPredicate<TruePred>, [WriteVecALUX]>
+]>;
+def : InstRW<[PdWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
+ PSUBDrr, VPSUBDrr,
+ PSUBQrr, VPSUBQrr,
+ PSUBWrr, VPSUBWrr,
+ PCMPGTBrr, VPCMPGTBrr,
+ PCMPGTDrr, VPCMPGTDrr,
+ PCMPGTWrr, VPCMPGTWrr)>;
+
+///////////////////////////////////////////////////////////////////////////////
+// Dependency breaking instructions.
+///////////////////////////////////////////////////////////////////////////////
+
+// Note: VPCMPGTQrr is listed as a zero-idiom below, but PCMPGTQrr is not.
+
+def : IsZeroIdiomFunction<[
+ // GPR Zero-idioms.
+ DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,
+
+ // MMX Zero-idioms.
+ DepBreakingClass<[
+ MMX_PXORirr, MMX_PANDNirr, MMX_PSUBBirr,
+ MMX_PSUBDirr, MMX_PSUBQirr, MMX_PSUBWirr,
+ MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr,
+ MMX_PCMPGTBirr, MMX_PCMPGTDirr, MMX_PCMPGTWirr
+ ], ZeroIdiomPredicate>,
+
+ // SSE Zero-idioms.
+ DepBreakingClass<[
+ // fp variants.
+ XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,
+
+ // int variants.
+ PXORrr, PANDNrr,
+ PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
+ PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
+ PCMPGTBrr, PCMPGTDrr, PCMPGTWrr
+ ], ZeroIdiomPredicate>,
+
+ // AVX Zero-idioms.
+ DepBreakingClass<[
+ // xmm fp variants.
+ VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,
+
+ // xmm int variants.
+ VPXORrr, VPANDNrr,
+ VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
+ VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
+ VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
+
+ // ymm variants.
+ VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
+ ], ZeroIdiomPredicate>
+]>;
+
+def : IsDepBreakingFunction<[
+ // GPR
+ DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
+ DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >,
+
+ // MMX
+ DepBreakingClass<[
+ MMX_PCMPEQBirr, MMX_PCMPEQDirr, MMX_PCMPEQWirr
+ ], ZeroIdiomPredicate>,
+
+ // SSE
+ DepBreakingClass<[
+ PCMPEQBrr, PCMPEQWrr, PCMPEQDrr
+ // But not PCMPEQQrr.
+ ], ZeroIdiomPredicate>,
+
+ // AVX
+ DepBreakingClass<[
+ VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr
+ // But not VPCMPEQQrr.
+ ], ZeroIdiomPredicate>
+]>;
+
+
+} // SchedModel
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBtVer2.td
new file mode 100644
index 000000000000..13b6eed5126d
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -0,0 +1,1049 @@
+//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for AMD btver2 (Jaguar) to support
+// instruction scheduling and other instruction cost heuristics. Based on the
+// AMD Software Optimization Guide for AMD Family 16h Processors and its
+// Instruction Latency appendix.
+//
+//===----------------------------------------------------------------------===//
+
+def BtVer2Model : SchedMachineModel {
+ // All x86 instructions are modeled as a single micro-op, and btver2 can
+ // decode 2 instructions per cycle.
+ let IssueWidth = 2;
+ let MicroOpBufferSize = 64; // Retire Control Unit
+  let LoadLatency = 5;       // FPU load latency (worst case; integer loads take 3 cycles)
+ let HighLatency = 25;
+  let MispredictPenalty = 14; // Minimum branch misprediction penalty
+ let PostRAScheduler = 1;
+
+ // FIXME: SSE4/AVX is unimplemented. This flag is set to allow
+ // the scheduler to assign a default model to unrecognized opcodes.
+ let CompleteModel = 0;
+}
+
+let SchedModel = BtVer2Model in {
+
+// Jaguar can issue up to 6 micro-ops in one cycle
+def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handles FP->INT jam)
+def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV
+def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU
+def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA)
+def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA
+def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM
+
+// The Integer PRF for Jaguar is 64 entries, and it holds the architectural and
+// speculative version of the 64-bit integer registers.
+// Reference: www.realworldtech.com/jaguar/4/
+//
+// The processor always keeps the different parts of an integer register
+// together. An instruction that writes to a part of a register will therefore
+// have a false dependence on any previous write to the same register or any
+// part of it.
+// Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register
+// access" - Agner Fog's "microarchitecture.pdf".
+def JIntegerPRF : RegisterFile<64, [GR64, CCR], [1, 1], [1, 0],
+ 0, // Max moves that can be eliminated per cycle.
+ 1>; // Restrict move elimination to zero regs.
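+
+// Illustrative note (example not taken from the model itself): because the
+// parts of an integer register are kept together, a partial write such as
+// `mov $1, %al` carries a false dependence on any earlier write to %eax/%rax
+// and cannot be renamed independently of it.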
+
+// The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE
+// registers. Operations on 256-bit data types are cracked into two COPs.
+// Reference: www.realworldtech.com/jaguar/4/
+
+// The PRF in the floating point unit can eliminate a move from an MMX or SSE
+// register that is known to be zero (i.e. it has been zeroed using a zero-idiom
+// dependency breaking instruction, or via VZEROALL).
+// Reference: Section 21.8 "AMD Bobcat and Jaguar pipeline: Dependency-breaking
+// instructions" - Agner Fog's "microarchitecture.pdf"
+def JFpuPRF: RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2], [1, 1, 0],
+ 0, // Max moves that can be eliminated per cycle.
+ 1>; // Restrict move elimination to zero regs.
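+
+// Illustrative note (assumption, based on the comment above and on the
+// IsOptimizableRegisterMove list at the end of this file): once a register is
+// known to be zero, e.g. after `vpxor %xmm1, %xmm1, %xmm1` or VZEROALL, a
+// following move such as `vmovaps %xmm1, %xmm2` can be eliminated at rename
+// and does not need to occupy an FPU pipe.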
+
+// The retire control unit (RCU) can track up to 64 macro-ops in-flight. It can
+// retire up to two macro-ops per cycle.
+// Reference: "Software Optimization Guide for AMD Family 16h Processors"
+def JRCU : RetireControlUnit<64, 2>;
+
+// Integer Pipe Scheduler
+def JALU01 : ProcResGroup<[JALU0, JALU1]> {
+ let BufferSize=20;
+}
+
+// AGU Pipe Scheduler
+def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> {
+ let BufferSize=12;
+}
+
+// Fpu Pipe Scheduler
+def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> {
+ let BufferSize=18;
+}
+
+// Functional units
+def JDiv : ProcResource<1>; // integer division
+def JMul : ProcResource<1>; // integer multiplication
+def JVALU0 : ProcResource<1>; // vector integer
+def JVALU1 : ProcResource<1>; // vector integer
+def JVIMUL : ProcResource<1>; // vector integer multiplication
+def JSTC : ProcResource<1>; // vector store/convert
+def JFPM : ProcResource<1>; // FP multiplication
+def JFPA : ProcResource<1>; // FP addition
+
+// Functional unit groups
+def JFPX : ProcResGroup<[JFPA, JFPM]>;
+def JVALU : ProcResGroup<[JVALU0, JVALU1]>;
+
+// Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 3>;
+
+// Vector loads are 5 cycles, so ReadAfterVec*Ld registers needn't be available until 5
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterVecLd, 5>;
+def : ReadAdvance<ReadAfterVecXLd, 5>;
+def : ReadAdvance<ReadAfterVecYLd, 5>;
+
+/// "Additional 6 cycle transfer operation which moves a floating point
+/// operation input value from the integer unit to the floating point unit."
+/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
+def : ReadAdvance<ReadInt2Fpu, -6>;
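+
+// Worked example (illustrative only): a negative ReadAdvance delays the
+// consuming read, so a value produced with 3cy latency is not usable through
+// a ReadInt2Fpu operand until 3 + 6 = 9 cycles, which models the transfer
+// cost quoted above.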
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when dispatched by the schedulers.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [], int UOps = 1,
+ int LoadUOps = 0> {
+  // The register variant uses a single cycle on ExePorts.
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+
+ // Memory variant also uses a cycle on JLAGU and adds 3 cycles to the
+ // latency.
+ def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
+ let Latency = !add(Lat, 3);
+ let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
+ let NumMicroOps = !add(UOps, LoadUOps);
+ }
+}
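+
+// Illustrative expansion (not an additional definition): the instantiation
+// `defm : JWriteResIntPair<WriteALU, [JALU01], 1>;` further below produces a
+// register write on [JALU01] with Latency = 1 and a folded-load write on
+// [JLAGU, JALU01] with Latency = 1 + 3 = 4, still as a single micro-op.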
+
+multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [], int UOps = 1,
+ int LoadUOps = 0> {
+  // The register variant uses a single cycle on ExePorts.
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+
+ // Memory variant also uses a cycle on JLAGU and adds 5 cycles to the
+ // latency.
+ def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
+ let Latency = !add(Lat, 5);
+ let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
+ let NumMicroOps = !add(UOps, LoadUOps);
+ }
+}
+
+multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [2], int UOps = 2,
+ int LoadUOps = 0> {
+  // The register variant uses a single cycle on ExePorts.
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+
+ // Memory variant also uses 2 cycles on JLAGU and adds 5 cycles to the
+ // latency.
+ def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
+ let Latency = !add(Lat, 5);
+ let ResourceCycles = !listconcat([2], Res);
+ let NumMicroOps = !add(UOps, LoadUOps);
+ }
+}
+
+// Instructions that have local forwarding disabled have an extra +1cy latency.
+
+// A folded store needs a cycle on the SAGU for the store data; most RMW
+// instructions don't need an extra uop. ALU RMW operations don't seem to
+// benefit from STLF, and their observed latency is 6cy. That is the reason why
+// this write adds two extra cycles (instead of just 1cy for the store).
+defm : X86WriteRes<WriteRMW, [JSAGU], 2, [1], 0>;
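+//
+// Rough arithmetic behind the 2cy choice (illustrative): the 4cy folded-load
+// ALU write (1cy ALU + 3cy load) plus the 2cy modelled here matches the
+// observed 6cy latency of ALU RMW operations mentioned above.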
+
+////////////////////////////////////////////////////////////////////////////////
+// Arithmetic.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResIntPair<WriteALU, [JALU01], 1>;
+defm : JWriteResIntPair<WriteADC, [JALU01], 1, [2]>;
+
+defm : X86WriteRes<WriteBSWAP32, [JALU01], 1, [1], 1>;
+defm : X86WriteRes<WriteBSWAP64, [JALU01], 1, [1], 1>;
+defm : X86WriteRes<WriteCMPXCHG, [JALU01], 3, [3], 5>;
+defm : X86WriteRes<WriteCMPXCHGRMW, [JALU01, JSAGU, JLAGU], 11, [3, 1, 1], 6>;
+defm : X86WriteRes<WriteXCHG, [JALU01], 1, [2], 2>;
+
+defm : JWriteResIntPair<WriteIMul8, [JALU1, JMul], 3, [1, 1], 1>;
+defm : JWriteResIntPair<WriteIMul16, [JALU1, JMul], 3, [1, 3], 3>;
+defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 4, [1, 2], 2>;
+defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 1>;
+defm : JWriteResIntPair<WriteIMul32, [JALU1, JMul], 3, [1, 2], 2>;
+defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 1>;
+defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 1>;
+defm : JWriteResIntPair<WriteIMul64, [JALU1, JMul], 6, [1, 4], 2>;
+defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 1>;
+defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 1>;
+defm : X86WriteRes<WriteIMulH, [JALU1], 6, [4], 1>;
+
+defm : JWriteResIntPair<WriteDiv8, [JALU1, JDiv], 12, [1, 12], 1>;
+defm : JWriteResIntPair<WriteDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
+defm : JWriteResIntPair<WriteDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
+defm : JWriteResIntPair<WriteDiv64, [JALU1, JDiv], 41, [1, 41], 2>;
+defm : JWriteResIntPair<WriteIDiv8, [JALU1, JDiv], 12, [1, 12], 1>;
+defm : JWriteResIntPair<WriteIDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
+defm : JWriteResIntPair<WriteIDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
+defm : JWriteResIntPair<WriteIDiv64, [JALU1, JDiv], 41, [1, 41], 2>;
+
+defm : JWriteResIntPair<WriteCRC32, [JALU01], 3, [4], 3>;
+
+defm : JWriteResIntPair<WriteCMOV, [JALU01], 1>; // Conditional move.
+defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1,1], 1>; // x87 conditional move.
+def : WriteRes<WriteSETCC, [JALU01]>; // Setcc.
+def : WriteRes<WriteSETCCStore, [JALU01,JSAGU]>;
+def : WriteRes<WriteLAHFSAHF, [JALU01]>;
+
+defm : X86WriteRes<WriteBitTest, [JALU01], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTestImmLd, [JALU01,JLAGU], 4, [1,1], 1>;
+defm : X86WriteRes<WriteBitTestRegLd, [JALU01,JLAGU], 4, [1,1], 5>;
+defm : X86WriteRes<WriteBitTestSet, [JALU01], 1, [1], 2>;
+defm : X86WriteRes<WriteBitTestSetImmLd, [JALU01,JLAGU], 4, [1,1], 4>;
+defm : X86WriteRes<WriteBitTestSetRegLd, [JALU01,JLAGU], 4, [1,1], 8>;
+
+// This is for simple LEAs with one or two input operands.
+def : WriteRes<WriteLEA, [JALU01]>;
+
+// Bit counts.
+defm : JWriteResIntPair<WriteBSF, [JALU01], 4, [8], 7>;
+defm : JWriteResIntPair<WriteBSR, [JALU01], 5, [8], 8>;
+defm : JWriteResIntPair<WritePOPCNT, [JALU01], 1>;
+defm : JWriteResIntPair<WriteLZCNT, [JALU01], 1>;
+defm : JWriteResIntPair<WriteTZCNT, [JALU01], 2, [2], 2>;
+
+// BMI1 BEXTR/BLS, BMI2 BZHI
+defm : JWriteResIntPair<WriteBEXTR, [JALU01], 1>;
+defm : JWriteResIntPair<WriteBLS, [JALU01], 2, [2], 2>;
+defm : X86WriteResPairUnsupported<WriteBZHI>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Integer shifts and rotates.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResIntPair<WriteShift, [JALU01], 1>;
+defm : JWriteResIntPair<WriteShiftCL, [JALU01], 1>;
+defm : JWriteResIntPair<WriteRotate, [JALU01], 1>;
+defm : JWriteResIntPair<WriteRotateCL, [JALU01], 1>;
+
+// SHLD/SHRD.
+defm : X86WriteRes<WriteSHDrri, [JALU01], 3, [6], 6>;
+defm : X86WriteRes<WriteSHDrrcl,[JALU01], 4, [8], 7>;
+defm : X86WriteRes<WriteSHDmri, [JLAGU, JALU01], 9, [1, 22], 8>;
+defm : X86WriteRes<WriteSHDmrcl,[JLAGU, JALU01], 9, [1, 22], 8>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Loads, stores, and moves, not folded with other operations.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 3; }
+def : WriteRes<WriteStore, [JSAGU]>;
+def : WriteRes<WriteStoreNT, [JSAGU]>;
+def : WriteRes<WriteMove, [JALU01]>;
+
+// Load/store MXCSR.
+def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 3; }
+def : WriteRes<WriteSTMXCSR, [JSAGU]>;
+
+// Treat misc copies as a move.
+def : InstRW<[WriteMove], (instrs COPY)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteZero, []>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResIntPair<WriteJump, [JALU01], 1>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Special case scheduling classes.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteSystem, [JALU01]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [JALU01]> { let Latency = 100; }
+def : WriteRes<WriteFence, [JSAGU]>;
+
+// Nops don't have dependencies, so there's no actual latency, but we set this
+// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
+def : WriteRes<WriteNop, [JALU01]> { let Latency = 1; }
+
+def JWriteCMPXCHG8rr : SchedWriteRes<[JALU01]> {
+ let Latency = 3;
+ let ResourceCycles = [3];
+ let NumMicroOps = 3;
+}
+
+def JWriteLOCK_CMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 16;
+ let ResourceCycles = [3,16,16];
+ let NumMicroOps = 5;
+}
+
+def JWriteLOCK_CMPXCHGrm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 17;
+ let ResourceCycles = [3,17,17];
+ let NumMicroOps = 6;
+}
+
+def JWriteCMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 11;
+ let ResourceCycles = [3,1,1];
+ let NumMicroOps = 5;
+}
+
+def JWriteCMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 11;
+ let ResourceCycles = [3,1,1];
+ let NumMicroOps = 18;
+}
+
+def JWriteCMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 32;
+ let ResourceCycles = [6,1,1];
+ let NumMicroOps = 28;
+}
+
+def JWriteLOCK_CMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 19;
+ let ResourceCycles = [3,19,19];
+ let NumMicroOps = 18;
+}
+
+def JWriteLOCK_CMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 38;
+ let ResourceCycles = [6,38,38];
+ let NumMicroOps = 28;
+}
+
+def JWriteCMPXCHGVariant : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap8B>, [JWriteLOCK_CMPXCHG8B]>,
+ SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap16B>, [JWriteLOCK_CMPXCHG16B]>,
+ SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap_8>, [JWriteLOCK_CMPXCHG8rm]>,
+ SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap>, [JWriteLOCK_CMPXCHGrm]>,
+ SchedVar<MCSchedPredicate<IsCompareAndSwap8B>, [JWriteCMPXCHG8B]>,
+ SchedVar<MCSchedPredicate<IsCompareAndSwap16B>, [JWriteCMPXCHG16B]>,
+ SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap_8>, [JWriteCMPXCHG8rm]>,
+ SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap>, [WriteCMPXCHGRMW]>,
+ SchedVar<MCSchedPredicate<IsRegRegCompareAndSwap_8>, [JWriteCMPXCHG8rr]>,
+ SchedVar<NoSchedPred, [WriteCMPXCHG]>
+]>;
+
+// The first five reads are contributed by the memory load operand.
+// We ignore those reads and set a read-advance for the other input operands,
+// including the implicit read of RAX.
+def : InstRW<[JWriteCMPXCHGVariant,
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8, LCMPXCHG16,
+ LCMPXCHG32, LCMPXCHG64,
+ CMPXCHG8rm, CMPXCHG16rm,
+ CMPXCHG32rm, CMPXCHG64rm)>;
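+
+// Illustrative reading of the operand list above (assumption about operand
+// order): the five ReadDefault entries cover the five sub-operands of the x86
+// memory reference (base, scale, index, displacement, segment), while the two
+// ReadAfterLd entries apply to the register source and to the implicit
+// EAX/RAX read.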
+
+def : InstRW<[JWriteCMPXCHGVariant], (instrs CMPXCHG8rr, CMPXCHG16rr,
+ CMPXCHG32rr, CMPXCHG64rr)>;
+
+def : InstRW<[JWriteCMPXCHGVariant,
+ // Ignore reads contributed by the memory operand.
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ // Add a read-advance to every implicit register read.
+ ReadAfterLd, ReadAfterLd, ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8B, LCMPXCHG16B,
+ CMPXCHG8B, CMPXCHG16B)>;
+
+def JWriteLOCK_ALURMW : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 19;
+ let ResourceCycles = [1,19,19];
+ let NumMicroOps = 1;
+}
+
+def JWriteLOCK_ALURMWVariant : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_ALURMW]>,
+ SchedVar<NoSchedPred, [WriteALURMW]>
+]>;
+def : InstRW<[JWriteLOCK_ALURMWVariant], (instrs INC8m, INC16m, INC32m, INC64m,
+ DEC8m, DEC16m, DEC32m, DEC64m,
+ NOT8m, NOT16m, NOT32m, NOT64m,
+ NEG8m, NEG16m, NEG32m, NEG64m)>;
+
+def JWriteXCHG8rr_XADDrr : SchedWriteRes<[JALU01]> {
+ let Latency = 2;
+ let ResourceCycles = [3];
+ let NumMicroOps = 3;
+}
+def : InstRW<[JWriteXCHG8rr_XADDrr], (instrs XCHG8rr, XADD8rr, XADD16rr,
+ XADD32rr, XADD64rr)>;
+
+// This write defines the latency of the in/out register operand of a non-atomic
+// XADDrm. This is the first of a pair of writes that model non-atomic
+// XADDrm instructions (the second write definition is JWriteXADDrm_LdSt_Part).
+//
+// We need two writes because the instruction latency differs from the output
+// register operand latency. In particular, the first write describes the first
+// (and only) output register operand of the instruction. However, the
+// instruction latency is set to the MAX of all the write latencies. That's why
+// a second write is needed in this case (see example below).
+//
+// Example:
+// XADD %ecx, (%rsp) ## Instruction latency: 11cy
+// ## ECX write Latency: 3cy
+//
+// Register ECX becomes available in 3 cycles. That is because the value of ECX
+// is exchanged with the value read from the stack pointer, and the load-to-use
+// latency is assumed to be 3cy.
+def JWriteXADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
+ let Latency = 3; // load-to-use latency
+ let ResourceCycles = [3];
+ let NumMicroOps = 3;
+}
+
+// This write defines the latency of the in/out register operand of an atomic
+// XADDrm. This is the first of a sequence of two writes used to model atomic
+// XADD instructions. The second write of the sequence is JWriteXCHGrm_LdSt_Part.
+//
+//
+// Example:
+// LOCK XADD %ecx, (%rsp) ## Instruction Latency: 16cy
+// ## ECX write Latency: 11cy
+//
+// The value of ECX becomes available only after 11cy from the start of
+// execution. This write is used to specifically set that operand latency.
+def JWriteLOCK_XADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
+ let Latency = 11;
+ let ResourceCycles = [3];
+ let NumMicroOps = 3;
+}
+
+// This write defines the latency of the in/out register operand of an atomic
+// XCHGrm. This write is the first of a sequence of two writes that describe
+// atomic XCHG operations. We need two writes because the instruction latency
+// differs from the output register write latency. We want to make sure that
+// the output register operand becomes visible after 11cy. However, we want to
+// set the instruction latency to 16cy.
+def JWriteXCHGrm_XCHG_Part : SchedWriteRes<[JALU01]> {
+ let Latency = 11;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def JWriteXADDrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
+ let Latency = 11;
+ let ResourceCycles = [1, 1];
+ let NumMicroOps = 1;
+}
+
+def JWriteXCHGrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
+ let Latency = 16;
+ let ResourceCycles = [16, 16];
+ let NumMicroOps = 1;
+}
+
+def JWriteXADDrm_Part1 : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_XADDrm_XCHG_Part]>,
+ SchedVar<NoSchedPred, [JWriteXADDrm_XCHG_Part]>
+]>;
+
+def JWriteXADDrm_Part2 : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteXCHGrm_LdSt_Part]>,
+ SchedVar<NoSchedPred, [JWriteXADDrm_LdSt_Part]>
+]>;
+
+def : InstRW<[JWriteXADDrm_Part1, JWriteXADDrm_Part2, ReadAfterLd],
+ (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm,
+ LXADD8, LXADD16, LXADD32, LXADD64)>;
+
+def : InstRW<[JWriteXCHGrm_XCHG_Part, JWriteXCHGrm_LdSt_Part, ReadAfterLd],
+ (instrs XCHG8rm, XCHG16rm, XCHG32rm, XCHG64rm)>;
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Floating point. This covers both scalar and vector operations.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : X86WriteRes<WriteFLD0, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : X86WriteRes<WriteFLD1, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : X86WriteRes<WriteFLDC, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : X86WriteRes<WriteFLoad, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFLoadX, [JLAGU], 5, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [JLAGU], 5, [2], 2>;
+defm : X86WriteRes<WriteFMaskedLoad, [JLAGU, JFPU01, JFPX], 6, [1, 2, 2], 1>;
+defm : X86WriteRes<WriteFMaskedLoadY, [JLAGU, JFPU01, JFPX], 6, [2, 4, 4], 2>;
+
+defm : X86WriteRes<WriteFStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFStoreY, [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>;
+defm : X86WriteRes<WriteFStoreNT, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFStoreNTX, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFStoreNTY, [JSAGU, JFPU1, JSTC], 3, [2, 2, 2], 1>;
+
+defm : X86WriteRes<WriteFMaskedStore32, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 5, 5,4,4,4], 19>;
+defm : X86WriteRes<WriteFMaskedStore64, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 13, [1,1, 2, 2,2,2,2], 10>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 22, [1,1,10,10,8,8,8], 36>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 4, 4,4,4,4], 18>;
+
+defm : X86WriteRes<WriteFMove, [JFPU01, JFPX], 1, [1, 1], 1>;
+defm : X86WriteRes<WriteFMoveX, [JFPU01, JFPX], 1, [1, 1], 1>;
+defm : X86WriteRes<WriteFMoveY, [JFPU01, JFPX], 1, [2, 2], 2>;
+
+defm : X86WriteRes<WriteEMMS, [JFPU01, JFPX], 2, [1, 1], 1>;
+
+defm : JWriteResFpuPair<WriteFAdd, [JFPU0, JFPA], 3>;
+defm : JWriteResFpuPair<WriteFAddX, [JFPU0, JFPA], 3>;
+defm : JWriteResYMMPair<WriteFAddY, [JFPU0, JFPA], 3, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFAddZ>;
+defm : JWriteResFpuPair<WriteFAdd64, [JFPU0, JFPA], 3>;
+defm : JWriteResFpuPair<WriteFAdd64X, [JFPU0, JFPA], 3>;
+defm : JWriteResYMMPair<WriteFAdd64Y, [JFPU0, JFPA], 3, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
+defm : JWriteResFpuPair<WriteFCmp, [JFPU0, JFPA], 2>;
+defm : JWriteResFpuPair<WriteFCmpX, [JFPU0, JFPA], 2>;
+defm : JWriteResYMMPair<WriteFCmpY, [JFPU0, JFPA], 2, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFCmpZ>;
+defm : JWriteResFpuPair<WriteFCmp64, [JFPU0, JFPA], 2>;
+defm : JWriteResFpuPair<WriteFCmp64X, [JFPU0, JFPA], 2>;
+defm : JWriteResYMMPair<WriteFCmp64Y, [JFPU0, JFPA], 2, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
+defm : JWriteResFpuPair<WriteFCom, [JFPU0, JFPA, JALU0], 3>;
+defm : JWriteResFpuPair<WriteFComX, [JFPU0, JFPA, JALU0], 3>;
+defm : JWriteResFpuPair<WriteFMul, [JFPU1, JFPM], 2>;
+defm : JWriteResFpuPair<WriteFMulX, [JFPU1, JFPM], 2>;
+defm : JWriteResYMMPair<WriteFMulY, [JFPU1, JFPM], 2, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFMulZ>;
+defm : JWriteResFpuPair<WriteFMul64, [JFPU1, JFPM], 4, [1,2]>;
+defm : JWriteResFpuPair<WriteFMul64X, [JFPU1, JFPM], 4, [1,2]>;
+defm : JWriteResYMMPair<WriteFMul64Y, [JFPU1, JFPM], 4, [2,4], 2>;
+defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+defm : X86WriteResPairUnsupported<WriteFMA>;
+defm : X86WriteResPairUnsupported<WriteFMAX>;
+defm : X86WriteResPairUnsupported<WriteFMAY>;
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
+defm : JWriteResFpuPair<WriteDPPD, [JFPU1, JFPM, JFPA], 9, [1, 3, 3], 3>;
+defm : JWriteResFpuPair<WriteDPPS, [JFPU1, JFPM, JFPA], 11, [1, 3, 3], 5>;
+defm : JWriteResYMMPair<WriteDPPSY, [JFPU1, JFPM, JFPA], 12, [2, 6, 6], 10>;
+defm : X86WriteResPairUnsupported<WriteDPPSZ>;
+defm : JWriteResFpuPair<WriteFRcp, [JFPU1, JFPM], 2>;
+defm : JWriteResFpuPair<WriteFRcpX, [JFPU1, JFPM], 2>;
+defm : JWriteResYMMPair<WriteFRcpY, [JFPU1, JFPM], 2, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+defm : JWriteResFpuPair<WriteFRsqrt, [JFPU1, JFPM], 2>;
+defm : JWriteResFpuPair<WriteFRsqrtX, [JFPU1, JFPM], 2>;
+defm : JWriteResYMMPair<WriteFRsqrtY, [JFPU1, JFPM], 2, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+defm : JWriteResFpuPair<WriteFDiv, [JFPU1, JFPM], 19, [1, 19]>;
+defm : JWriteResFpuPair<WriteFDivX, [JFPU1, JFPM], 19, [1, 19]>;
+defm : JWriteResYMMPair<WriteFDivY, [JFPU1, JFPM], 38, [2, 38], 2>;
+defm : X86WriteResPairUnsupported<WriteFDivZ>;
+defm : JWriteResFpuPair<WriteFDiv64, [JFPU1, JFPM], 19, [1, 19]>;
+defm : JWriteResFpuPair<WriteFDiv64X, [JFPU1, JFPM], 19, [1, 19]>;
+defm : JWriteResYMMPair<WriteFDiv64Y, [JFPU1, JFPM], 38, [2, 38], 2>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+defm : JWriteResFpuPair<WriteFSqrt, [JFPU1, JFPM], 21, [1, 21]>;
+defm : JWriteResFpuPair<WriteFSqrtX, [JFPU1, JFPM], 21, [1, 21]>;
+defm : JWriteResYMMPair<WriteFSqrtY, [JFPU1, JFPM], 42, [2, 42], 2>;
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
+defm : JWriteResFpuPair<WriteFSqrt64, [JFPU1, JFPM], 27, [1, 27]>;
+defm : JWriteResFpuPair<WriteFSqrt64X, [JFPU1, JFPM], 27, [1, 27]>;
+defm : JWriteResYMMPair<WriteFSqrt64Y, [JFPU1, JFPM], 54, [2, 54], 2>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
+defm : JWriteResFpuPair<WriteFSqrt80, [JFPU1, JFPM], 35, [1, 35]>;
+defm : JWriteResFpuPair<WriteFSign, [JFPU1, JFPM], 2>;
+defm : JWriteResFpuPair<WriteFRnd, [JFPU1, JSTC], 3>;
+defm : JWriteResYMMPair<WriteFRndY, [JFPU1, JSTC], 3, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteFRndZ>;
+defm : JWriteResFpuPair<WriteFLogic, [JFPU01, JFPX], 1>;
+defm : JWriteResYMMPair<WriteFLogicY, [JFPU01, JFPX], 1, [2, 2], 2>;
+defm : X86WriteResPairUnsupported<WriteFLogicZ>;
+defm : JWriteResFpuPair<WriteFTest, [JFPU0, JFPA, JALU0], 3>;
+defm : JWriteResYMMPair<WriteFTestY , [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
+defm : X86WriteResPairUnsupported<WriteFTestZ>;
+defm : JWriteResFpuPair<WriteFShuffle, [JFPU01, JFPX], 1>;
+defm : JWriteResYMMPair<WriteFShuffleY, [JFPU01, JFPX], 1, [2, 2], 2>;
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
+defm : JWriteResFpuPair<WriteFVarShuffle, [JFPU01, JFPX], 3, [1, 4], 3>; // +1cy latency.
+defm : JWriteResYMMPair<WriteFVarShuffleY,[JFPU01, JFPX], 4, [2, 6], 6>; // +1cy latency.
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
+defm : JWriteResFpuPair<WriteFBlend, [JFPU01, JFPX], 1>;
+defm : JWriteResYMMPair<WriteFBlendY, [JFPU01, JFPX], 1, [2, 2], 2>;
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+defm : JWriteResFpuPair<WriteFVarBlend, [JFPU01, JFPX], 2, [4, 4], 3>;
+defm : JWriteResYMMPair<WriteFVarBlendY, [JFPU01, JFPX], 3, [6, 6], 6>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
+defm : JWriteResFpuPair<WriteFShuffle256, [JFPU01, JFPX], 1, [2, 2], 2>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Conversions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResFpuPair<WriteCvtSS2I, [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>;
+defm : JWriteResFpuPair<WriteCvtPS2I, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : JWriteResYMMPair<WriteCvtPS2IY, [JFPU1, JSTC], 3, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+defm : JWriteResFpuPair<WriteCvtSD2I, [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>;
+defm : JWriteResFpuPair<WriteCvtPD2I, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : JWriteResYMMPair<WriteCvtPD2IY, [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+
+defm : X86WriteRes<WriteCvtI2SS, [JFPU1, JSTC], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2SSLd, [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
+defm : JWriteResFpuPair<WriteCvtI2PS, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : JWriteResYMMPair<WriteCvtI2PSY, [JFPU1, JSTC], 3, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+defm : X86WriteRes<WriteCvtI2SD, [JFPU1, JSTC], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2SDLd, [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
+defm : JWriteResFpuPair<WriteCvtI2PD, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : JWriteResYMMPair<WriteCvtI2PDY, [JFPU1, JSTC], 3, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+
+defm : JWriteResFpuPair<WriteCvtSS2SD, [JFPU1, JSTC], 7, [1,2], 2>;
+defm : JWriteResFpuPair<WriteCvtPS2PD, [JFPU1, JSTC], 2, [1,1], 1>;
+defm : JWriteResYMMPair<WriteCvtPS2PDY, [JFPU1, JSTC], 2, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
+
+defm : JWriteResFpuPair<WriteCvtSD2SS, [JFPU1, JSTC], 7, [1,2], 2>;
+defm : JWriteResFpuPair<WriteCvtPD2PS, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : JWriteResYMMPair<WriteCvtPD2PSY, [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
+
+defm : JWriteResFpuPair<WriteCvtPH2PS, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : JWriteResYMMPair<WriteCvtPH2PSY, [JFPU1, JSTC], 3, [2,2], 2>;
+defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
+
+defm : X86WriteRes<WriteCvtPS2PH, [JFPU1, JSTC], 3, [1,1], 1>;
+defm : X86WriteRes<WriteCvtPS2PHY, [JFPU1, JSTC, JFPX], 6, [2,2,2], 3>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
+defm : X86WriteRes<WriteCvtPS2PHSt, [JFPU1, JSTC, JSAGU], 4, [1,1,1], 1>;
+defm : X86WriteRes<WriteCvtPS2PHYSt, [JFPU1, JSTC, JFPX, JSAGU], 7, [2,2,2,1], 3>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Vector integer operations.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : X86WriteRes<WriteVecLoad, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [JLAGU], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [JLAGU], 5, [2], 2>;
+defm : X86WriteRes<WriteVecLoadNT, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [JLAGU, JFPU01, JVALU], 6, [1, 2, 2], 1>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [JLAGU, JFPU01, JVALU], 6, [2, 4, 4], 2>;
+
+defm : X86WriteRes<WriteVecStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecStoreY, [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>;
+defm : X86WriteRes<WriteVecStoreNT, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecStoreNTY, [JSAGU, JFPU1, JSTC], 2, [2, 2, 2], 1>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore32>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore64>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;
+
+defm : X86WriteRes<WriteVecMove, [JFPU01, JVALU], 1, [1, 1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [JFPU01, JVALU], 1, [1, 1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [JFPU01, JVALU], 1, [2, 2], 2>;
+defm : X86WriteRes<WriteVecMoveToGpr, [JFPU0, JFPA, JALU0], 4, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [JFPU01, JFPX], 8, [1, 1], 2>;
+
+defm : JWriteResFpuPair<WriteVecALU, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteVecALUX, [JFPU01, JVALU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecALUY>;
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
+defm : JWriteResFpuPair<WriteVecShift, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteVecShiftX, [JFPU01, JVALU], 2>; // +1cy latency.
+defm : X86WriteResPairUnsupported<WriteVecShiftY>;
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
+defm : JWriteResFpuPair<WriteVecShiftImm, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteVecShiftImmX,[JFPU01, JVALU], 2>; // +1cy latency.
+defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+defm : X86WriteResPairUnsupported<WriteVarVecShift>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+defm : JWriteResFpuPair<WriteVecIMul, [JFPU0, JVIMUL], 2>;
+defm : JWriteResFpuPair<WriteVecIMulX, [JFPU0, JVIMUL], 2>;
+defm : X86WriteResPairUnsupported<WriteVecIMulY>;
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
+defm : JWriteResFpuPair<WritePMULLD, [JFPU0, JFPU01, JVIMUL, JVALU], 4, [2, 1, 2, 1], 3>;
+defm : X86WriteResPairUnsupported<WritePMULLDY>;
+defm : X86WriteResPairUnsupported<WritePMULLDZ>;
+defm : JWriteResFpuPair<WriteMPSAD, [JFPU0, JVIMUL], 3, [1, 2], 3>;
+defm : X86WriteResPairUnsupported<WriteMPSADY>;
+defm : X86WriteResPairUnsupported<WriteMPSADZ>;
+defm : JWriteResFpuPair<WritePSADBW, [JFPU01, JVALU], 2>;
+defm : JWriteResFpuPair<WritePSADBWX, [JFPU01, JVALU], 2>;
+defm : X86WriteResPairUnsupported<WritePSADBWY>;
+defm : X86WriteResPairUnsupported<WritePSADBWZ>;
+defm : JWriteResFpuPair<WritePHMINPOS, [JFPU01, JVALU], 2>;
+defm : JWriteResFpuPair<WriteShuffle, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteShuffleX, [JFPU01, JVALU], 1>;
+defm : X86WriteResPairUnsupported<WriteShuffleY>;
+defm : X86WriteResPairUnsupported<WriteShuffleZ>;
+defm : JWriteResFpuPair<WriteVarShuffle, [JFPU01, JVALU], 2, [1, 1], 1>;
+defm : JWriteResFpuPair<WriteVarShuffleX, [JFPU01, JVALU], 2, [1, 4], 3>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+defm : JWriteResFpuPair<WriteBlend, [JFPU01, JVALU], 1>;
+defm : X86WriteResPairUnsupported<WriteBlendY>;
+defm : X86WriteResPairUnsupported<WriteBlendZ>;
+defm : JWriteResFpuPair<WriteVarBlend, [JFPU01, JVALU], 2, [4, 4], 3>;
+defm : X86WriteResPairUnsupported<WriteVarBlendY>;
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
+defm : JWriteResFpuPair<WriteVecLogic, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteVecLogicX, [JFPU01, JVALU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecLogicY>;
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
+defm : JWriteResFpuPair<WriteVecTest, [JFPU0, JFPA, JALU0], 3>;
+defm : JWriteResYMMPair<WriteVecTestY, [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
+defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+defm : X86WriteResPairUnsupported<WriteShuffle256>;
+defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Vector insert/extract operations.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : X86WriteRes<WriteVecInsert, [JFPU01, JVALU], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecInsertLd, [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>;
+defm : X86WriteRes<WriteVecExtract, [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>;
+defm : X86WriteRes<WriteVecExtractSt, [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>;
+
+////////////////////////////////////////////////////////////////////////////////
+// SSE42 String instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResFpuPair<WritePCmpIStrI, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 7, [2, 2, 1, 1, 1], 3>;
+defm : JWriteResFpuPair<WritePCmpIStrM, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 8, [2, 2, 1, 1, 1], 3>;
+defm : JWriteResFpuPair<WritePCmpEStrI, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
+defm : JWriteResFpuPair<WritePCmpEStrM, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
+
+////////////////////////////////////////////////////////////////////////////////
+// MOVMSK Instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteFMOVMSK, [JFPU0, JFPA, JALU0]> { let Latency = 3; }
+def : WriteRes<WriteVecMOVMSK, [JFPU0, JFPA, JALU0]> { let Latency = 3; }
+defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
+def : WriteRes<WriteMMXMOVMSK, [JFPU0, JFPA, JALU0]> { let Latency = 3; }
+
+////////////////////////////////////////////////////////////////////////////////
+// AES Instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResFpuPair<WriteAESIMC, [JFPU0, JVIMUL], 2>;
+defm : JWriteResFpuPair<WriteAESKeyGen, [JFPU0, JVIMUL], 2>;
+defm : JWriteResFpuPair<WriteAESDecEnc, [JFPU01, JVALU, JFPU0, JVIMUL], 3, [1,1,1,1], 2>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResFpuPair<WriteFHAdd, [JFPU0, JFPA], 4>; // +1cy latency.
+defm : JWriteResYMMPair<WriteFHAddY, [JFPU0, JFPA], 4, [2,2], 2>; // +1cy latency.
+defm : JWriteResFpuPair<WritePHAdd, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WritePHAddX, [JFPU01, JVALU], 2>; // +1cy latency.
+defm : X86WriteResPairUnsupported<WritePHAddY>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Carry-less multiplication instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResFpuPair<WriteCLMul, [JFPU0, JVIMUL], 2>;
+
+////////////////////////////////////////////////////////////////////////////////
+// SSE4A instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def JWriteINSERTQ: SchedWriteRes<[JFPU01, JVALU]> {
+ let Latency = 2;
+ let ResourceCycles = [1, 4];
+}
+def : InstRW<[JWriteINSERTQ], (instrs INSERTQ, INSERTQI)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// AVX instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def JWriteVecExtractF128: SchedWriteRes<[JFPU01, JFPX]>;
+def : InstRW<[JWriteVecExtractF128], (instrs VEXTRACTF128rr)>;
+
+def JWriteVBROADCASTYLd: SchedWriteRes<[JLAGU, JFPU01, JFPX]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 2, 4];
+ let NumMicroOps = 2;
+}
+def : InstRW<[JWriteVBROADCASTYLd], (instrs VBROADCASTSDYrm,
+ VBROADCASTSSYrm,
+ VBROADCASTF128)>;
+
+def JWriteJVZEROALL: SchedWriteRes<[]> {
+ let Latency = 90;
+ let NumMicroOps = 73;
+}
+def : InstRW<[JWriteJVZEROALL], (instrs VZEROALL)>;
+
+def JWriteJVZEROUPPER: SchedWriteRes<[]> {
+ let Latency = 46;
+ let NumMicroOps = 37;
+}
+def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>;
+
+///////////////////////////////////////////////////////////////////////////////
+// SSE2/AVX Store Selected Bytes of Double Quadword - (V)MASKMOVDQU
+///////////////////////////////////////////////////////////////////////////////
+
+def JWriteMASKMOVDQU: SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01]> {
+ let Latency = 34;
+ let ResourceCycles = [1, 1, 2, 2, 2, 16, 42];
+ let NumMicroOps = 63;
+}
+def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64,
+ VMASKMOVDQU, VMASKMOVDQU64)>;
+
+///////////////////////////////////////////////////////////////////////////////
+// SchedWriteVariant definitions.
+///////////////////////////////////////////////////////////////////////////////
+
+def JWriteZeroLatency : SchedWriteRes<[]> {
+ let Latency = 0;
+}
+
+def JWriteZeroIdiomYmm : SchedWriteRes<[JFPU01, JFPX]> {
+ let NumMicroOps = 2;
+}
+
+// Certain instructions that use the same register for both source
+// operands do not have a real dependency on the previous contents of the
+// register, and thus do not have to wait before completing. They can be
+// optimized out at the register-renaming stage.
+// Reference: Section 10.8 of the "Software Optimization Guide for AMD Family
+// 15h Processors".
+// Reference: Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
+// Section 21.8 [Dependency-breaking instructions].
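+//
+// For example (illustrative): `sub %eax, %eax` or `xorps %xmm0, %xmm0` always
+// produces zero regardless of the previous register contents, so the variants
+// below resolve such forms to JWriteZeroLatency (or JWriteZeroIdiomYmm for
+// the YMM forms) instead of the normal ALU or FP-logic write.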
+
+def JWriteZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteALU]>
+]>;
+def : InstRW<[JWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
+ XOR32rr, XOR64rr)>;
+
+def JWriteFZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogic]>
+]>;
+def : InstRW<[JWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, VXORPDrr,
+ ANDNPSrr, VANDNPSrr,
+ ANDNPDrr, VANDNPDrr)>;
+
+def JWriteFZeroIdiomY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroIdiomYmm]>,
+ SchedVar<NoSchedPred, [WriteFLogicY]>
+]>;
+def : InstRW<[JWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
+ VANDNPSYrr, VANDNPDYrr)>;
+
+def JWriteVZeroIdiomLogic : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogic]>
+]>;
+def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORirr, MMX_PANDNirr)>;
+
+def JWriteVZeroIdiomLogicX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicX]>
+]>;
+def : InstRW<[JWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr,
+ PANDNrr, VPANDNrr)>;
+
+def JWriteVZeroIdiomALU : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALU]>
+]>;
+def : InstRW<[JWriteVZeroIdiomALU], (instrs MMX_PSUBBirr, MMX_PSUBDirr,
+ MMX_PSUBQirr, MMX_PSUBWirr,
+ MMX_PSUBSBirr, MMX_PSUBSWirr,
+ MMX_PSUBUSBirr, MMX_PSUBUSWirr,
+ MMX_PCMPGTBirr, MMX_PCMPGTDirr,
+ MMX_PCMPGTWirr)>;
+
+def JWriteVZeroIdiomALUX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUX]>
+]>;
+def : InstRW<[JWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
+ PSUBDrr, VPSUBDrr,
+ PSUBQrr, VPSUBQrr,
+ PSUBWrr, VPSUBWrr,
+ PSUBSBrr, VPSUBSBrr,
+ PSUBSWrr, VPSUBSWrr,
+ PSUBUSBrr, VPSUBUSBrr,
+ PSUBUSWrr, VPSUBUSWrr,
+ PCMPGTBrr, VPCMPGTBrr,
+ PCMPGTDrr, VPCMPGTDrr,
+ PCMPGTQrr, VPCMPGTQrr,
+ PCMPGTWrr, VPCMPGTWrr)>;
+
+def JWriteVPERM2F128 : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomVPERMPredicate>, [JWriteZeroIdiomYmm]>,
+ SchedVar<NoSchedPred, [WriteFShuffle256]>
+]>;
+def : InstRW<[JWriteVPERM2F128], (instrs VPERM2F128rr)>;
+
+// This write is used for slow LEA instructions.
+def JWrite3OpsLEA : SchedWriteRes<[JALU1, JSAGU]> {
+ let Latency = 2;
+}
+
+// On Jaguar, a slow LEA is either a 3Ops LEA (base, index, offset), or an LEA
+// with a `Scale` value different from 1.
+def JSlowLEAPredicate : MCSchedPredicate<
+ CheckAny<[
+ // A 3-operand LEA (base, index, offset).
+ IsThreeOperandsLEAFn,
+    // An LEA with a "Scale" different from 1.
+ CheckAll<[
+ CheckIsImmOperand<2>,
+ CheckNot<CheckImmOperand<2, 1>>
+ ]>
+ ]>
+>;
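+
+// Illustrative examples (assumption, derived from the predicate above):
+// `lea 4(%rdi), %eax` keeps the default WriteLEA, whereas
+// `lea 8(%rdi,%rsi,4), %eax` (base + index + offset, scale 4) is treated as a
+// slow LEA and scheduled with JWrite3OpsLEA via JWriteLEA below.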
+
+def JWriteLEA : SchedWriteVariant<[
+ SchedVar<JSlowLEAPredicate, [JWrite3OpsLEA]>,
+ SchedVar<NoSchedPred, [WriteLEA]>
+]>;
+
+def : InstRW<[JWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
+
+def JSlowLEA16r : SchedWriteRes<[JALU01]> {
+ let Latency = 3;
+ let ResourceCycles = [4];
+}
+
+def : InstRW<[JSlowLEA16r], (instrs LEA16r)>;
+
+///////////////////////////////////////////////////////////////////////////////
+// Dependency breaking instructions.
+///////////////////////////////////////////////////////////////////////////////
+
+def : IsZeroIdiomFunction<[
+ // GPR Zero-idioms.
+ DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,
+
+ // MMX Zero-idioms.
+ DepBreakingClass<[
+ MMX_PXORirr, MMX_PANDNirr, MMX_PSUBBirr,
+ MMX_PSUBDirr, MMX_PSUBQirr, MMX_PSUBWirr,
+ MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr,
+ MMX_PCMPGTBirr, MMX_PCMPGTDirr, MMX_PCMPGTWirr
+ ], ZeroIdiomPredicate>,
+
+ // SSE Zero-idioms.
+ DepBreakingClass<[
+ // fp variants.
+ XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,
+
+ // int variants.
+ PXORrr, PANDNrr,
+ PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
+ PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
+ PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
+ ], ZeroIdiomPredicate>,
+
+ // AVX Zero-idioms.
+ DepBreakingClass<[
+ // xmm fp variants.
+ VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,
+
+ // xmm int variants.
+ VPXORrr, VPANDNrr,
+ VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
+ VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
+ VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
+
+ // ymm variants.
+ VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
+ ], ZeroIdiomPredicate>,
+
+ DepBreakingClass<[ VPERM2F128rr ], ZeroIdiomVPERMPredicate>
+]>;
+
+def : IsDepBreakingFunction<[
+ // GPR
+ DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
+ DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >,
+
+ // MMX
+ DepBreakingClass<[
+ MMX_PCMPEQBirr, MMX_PCMPEQDirr, MMX_PCMPEQWirr
+ ], ZeroIdiomPredicate>,
+
+ // SSE
+ DepBreakingClass<[
+ PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
+ ], ZeroIdiomPredicate>,
+
+ // AVX
+ DepBreakingClass<[
+ VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
+ ], ZeroIdiomPredicate>
+]>;
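+// For example, `pcmpeqd %xmm0, %xmm0` always sets all bits of %xmm0 regardless
+// of its previous value, and `cmpl %eax, %eax` always produces the same flags,
+// so neither result depends on the old register value; both are treated as
+// dependency breaking even though they are not zero idioms.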
+
+def : IsOptimizableRegisterMove<[
+ InstructionEquivalenceClass<[
+ // GPR variants.
+ MOV32rr, MOV64rr,
+
+ // MMX variants.
+ MMX_MOVQ64rr,
+
+ // SSE variants.
+ MOVAPSrr, MOVUPSrr,
+ MOVAPDrr, MOVUPDrr,
+ MOVDQArr, MOVDQUrr,
+
+ // AVX variants.
+ VMOVAPSrr, VMOVUPSrr,
+ VMOVAPDrr, VMOVUPDrr,
+ VMOVDQArr, VMOVDQUrr
+ ], TruePred >
+]>;
+
+} // SchedModel
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleSLM.td b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleSLM.td
new file mode 100644
index 000000000000..3d53ef104ed6
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleSLM.td
@@ -0,0 +1,474 @@
+//=- X86ScheduleSLM.td - X86 Silvermont Scheduling -----------*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Intel Silvermont to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def SLMModel : SchedMachineModel {
+ // All x86 instructions are modeled as a single micro-op, and SLM can decode 2
+ // instructions per cycle.
+ let IssueWidth = 2;
+ let MicroOpBufferSize = 32; // Based on the reorder buffer.
+ let LoadLatency = 3;
+ let MispredictPenalty = 10;
+ let PostRAScheduler = 1;
+
+ // For small loops, expand by a small factor to hide the backedge cost.
+ let LoopMicroOpBufferSize = 10;
+
+ // FIXME: SSE4 is unimplemented. This flag is set to allow
+ // the scheduler to assign a default model to unrecognized opcodes.
+ let CompleteModel = 0;
+}
+
+let SchedModel = SLMModel in {
+
+// Silvermont has 5 reservation stations for micro-ops
+def SLM_IEC_RSV0 : ProcResource<1>;
+def SLM_IEC_RSV1 : ProcResource<1>;
+def SLM_FPC_RSV0 : ProcResource<1> { let BufferSize = 1; }
+def SLM_FPC_RSV1 : ProcResource<1> { let BufferSize = 1; }
+def SLM_MEC_RSV : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def SLM_IEC_RSV01 : ProcResGroup<[SLM_IEC_RSV0, SLM_IEC_RSV1]>;
+def SLM_FPC_RSV01 : ProcResGroup<[SLM_FPC_RSV0, SLM_FPC_RSV1]>;
+
+def SLMDivider : ProcResource<1>;
+def SLMFPMultiplier : ProcResource<1>;
+def SLMFPDivider : ProcResource<1>;
+
+// Loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 3>;
+def : ReadAdvance<ReadAfterVecLd, 3>;
+def : ReadAdvance<ReadAfterVecXLd, 3>;
+def : ReadAdvance<ReadAfterVecYLd, 3>;
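+// For example, in `addl (%rdi), %eax` the load takes 3 cycles anyway, so a
+// producer of %eax may complete up to 3 cycles after this instruction issues
+// without introducing an extra stall on the %eax input.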
+
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass SLMWriteResPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1,
+ int LoadLat = 3> {
+  // The register variant uses a single cycle on ExePorts.
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+
+ // Memory variant also uses a cycle on MEC_RSV and adds LoadLat cycles to
+ // the latency (default = 3).
+ def : WriteRes<SchedRW.Folded, !listconcat([SLM_MEC_RSV], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = UOps;
+ }
+}
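+// As an illustrative sketch (the folded form of a write such as WriteALU is its
+// WriteALULd counterpart), the instantiation
+// `defm : SLMWriteResPair<WriteALU, [SLM_IEC_RSV01], 1>` below expands to roughly:
+//   def : WriteRes<WriteALU,   [SLM_IEC_RSV01]>              { let Latency = 1; }
+//   def : WriteRes<WriteALULd, [SLM_MEC_RSV, SLM_IEC_RSV01]> { let Latency = 4; }
+// i.e. the folded-load variant adds one SLM_MEC_RSV cycle and the 3-cycle load
+// latency.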
+
+// A folded store needs a cycle on MEC_RSV for the store data, but it does not
+// need an extra port cycle to recompute the address.
+def : WriteRes<WriteRMW, [SLM_MEC_RSV]>;
+
+def : WriteRes<WriteStore, [SLM_IEC_RSV01, SLM_MEC_RSV]>;
+def : WriteRes<WriteStoreNT, [SLM_IEC_RSV01, SLM_MEC_RSV]>;
+def : WriteRes<WriteLoad, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteMove, [SLM_IEC_RSV01]>;
+def : WriteRes<WriteZero, []>;
+
+// Load/store MXCSR.
+// FIXME: These are probably wrong. They are copy pasted from WriteStore/Load.
+def : WriteRes<WriteSTMXCSR, [SLM_IEC_RSV01, SLM_MEC_RSV]>;
+def : WriteRes<WriteLDMXCSR, [SLM_MEC_RSV]> { let Latency = 3; }
+
+// Treat misc copies as a move.
+def : InstRW<[WriteMove], (instrs COPY)>;
+
+defm : SLMWriteResPair<WriteALU, [SLM_IEC_RSV01], 1>;
+defm : SLMWriteResPair<WriteADC, [SLM_IEC_RSV01], 1>;
+
+defm : SLMWriteResPair<WriteIMul8, [SLM_IEC_RSV1], 3>;
+defm : SLMWriteResPair<WriteIMul16, [SLM_IEC_RSV1], 3>;
+defm : SLMWriteResPair<WriteIMul16Imm, [SLM_IEC_RSV1], 3>;
+defm : SLMWriteResPair<WriteIMul16Reg, [SLM_IEC_RSV1], 3>;
+defm : SLMWriteResPair<WriteIMul32, [SLM_IEC_RSV1], 3>;
+defm : SLMWriteResPair<WriteIMul32Imm, [SLM_IEC_RSV1], 3>;
+defm : SLMWriteResPair<WriteIMul32Reg, [SLM_IEC_RSV1], 3>;
+defm : SLMWriteResPair<WriteIMul64, [SLM_IEC_RSV1], 3>;
+defm : SLMWriteResPair<WriteIMul64Imm, [SLM_IEC_RSV1], 3>;
+defm : SLMWriteResPair<WriteIMul64Reg, [SLM_IEC_RSV1], 3>;
+
+defm : X86WriteRes<WriteBSWAP32, [SLM_IEC_RSV01], 1, [1], 1>;
+defm : X86WriteRes<WriteBSWAP64, [SLM_IEC_RSV01], 1, [1], 1>;
+defm : X86WriteRes<WriteCMPXCHG, [SLM_IEC_RSV01], 1, [1], 1>;
+defm : X86WriteRes<WriteCMPXCHGRMW, [SLM_IEC_RSV01, SLM_MEC_RSV], 4, [1, 2], 2>;
+defm : X86WriteRes<WriteXCHG, [SLM_IEC_RSV01], 1, [1], 1>;
+
+defm : SLMWriteResPair<WriteShift, [SLM_IEC_RSV0], 1>;
+defm : SLMWriteResPair<WriteShiftCL, [SLM_IEC_RSV0], 1>;
+defm : SLMWriteResPair<WriteRotate, [SLM_IEC_RSV0], 1>;
+defm : SLMWriteResPair<WriteRotateCL, [SLM_IEC_RSV0], 1>;
+
+defm : X86WriteRes<WriteSHDrri, [SLM_IEC_RSV0], 1, [1], 1>;
+defm : X86WriteRes<WriteSHDrrcl,[SLM_IEC_RSV0], 1, [1], 1>;
+defm : X86WriteRes<WriteSHDmri, [SLM_MEC_RSV, SLM_IEC_RSV0], 4, [2, 1], 2>;
+defm : X86WriteRes<WriteSHDmrcl,[SLM_MEC_RSV, SLM_IEC_RSV0], 4, [2, 1], 2>;
+
+defm : SLMWriteResPair<WriteJump, [SLM_IEC_RSV1], 1>;
+defm : SLMWriteResPair<WriteCRC32, [SLM_IEC_RSV1], 3>;
+
+defm : SLMWriteResPair<WriteCMOV, [SLM_IEC_RSV01], 2, [2]>;
+defm : X86WriteRes<WriteFCMOV, [SLM_FPC_RSV1], 3, [1], 1>; // x87 conditional move.
+def : WriteRes<WriteSETCC, [SLM_IEC_RSV01]>;
+def : WriteRes<WriteSETCCStore, [SLM_IEC_RSV01, SLM_MEC_RSV]> {
+  // FIXME: Latency and NumMicroOps?
+ let ResourceCycles = [2,1];
+}
+defm : X86WriteRes<WriteLAHFSAHF, [SLM_IEC_RSV01], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTest, [SLM_IEC_RSV01], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTestImmLd, [SLM_IEC_RSV01, SLM_MEC_RSV], 4, [1,1], 1>;
+defm : X86WriteRes<WriteBitTestRegLd, [SLM_IEC_RSV01, SLM_MEC_RSV], 4, [1,1], 1>;
+defm : X86WriteRes<WriteBitTestSet, [SLM_IEC_RSV01], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTestSetImmLd, [SLM_IEC_RSV01, SLM_MEC_RSV], 3, [1,1], 1>;
+defm : X86WriteRes<WriteBitTestSetRegLd, [SLM_IEC_RSV01, SLM_MEC_RSV], 3, [1,1], 1>;
+
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [SLM_IEC_RSV1]>;
+
+// Bit counts.
+defm : SLMWriteResPair<WriteBSF, [SLM_IEC_RSV01], 10, [20], 10>;
+defm : SLMWriteResPair<WriteBSR, [SLM_IEC_RSV01], 10, [20], 10>;
+defm : SLMWriteResPair<WriteLZCNT, [SLM_IEC_RSV0], 3>;
+defm : SLMWriteResPair<WriteTZCNT, [SLM_IEC_RSV0], 3>;
+defm : SLMWriteResPair<WritePOPCNT, [SLM_IEC_RSV0], 3>;
+
+// BMI1 BEXTR/BLS, BMI2 BZHI
+defm : X86WriteResPairUnsupported<WriteBEXTR>;
+defm : X86WriteResPairUnsupported<WriteBLS>;
+defm : X86WriteResPairUnsupported<WriteBZHI>;
+
+defm : SLMWriteResPair<WriteDiv8, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
+defm : SLMWriteResPair<WriteDiv16, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
+defm : SLMWriteResPair<WriteDiv32, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
+defm : SLMWriteResPair<WriteDiv64, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
+defm : SLMWriteResPair<WriteIDiv8, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
+defm : SLMWriteResPair<WriteIDiv16, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
+defm : SLMWriteResPair<WriteIDiv32, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
+defm : SLMWriteResPair<WriteIDiv64, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
+
+// Scalar and vector floating point.
+defm : X86WriteRes<WriteFLD0, [SLM_FPC_RSV01], 1, [1], 1>;
+defm : X86WriteRes<WriteFLD1, [SLM_FPC_RSV01], 1, [1], 1>;
+defm : X86WriteRes<WriteFLDC, [SLM_FPC_RSV01], 1, [2], 2>;
+def : WriteRes<WriteFLoad, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteFLoadX, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteFLoadY, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteFMaskedLoad, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteFMaskedLoadY, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteFStore, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFStoreX, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFStoreY, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFStoreNT, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFStoreNTX, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFStoreNTY, [SLM_MEC_RSV]>;
+
+def : WriteRes<WriteFMaskedStore32, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFMaskedStore32Y, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFMaskedStore64, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFMaskedStore64Y, [SLM_MEC_RSV]>;
+
+def : WriteRes<WriteFMove, [SLM_FPC_RSV01]>;
+def : WriteRes<WriteFMoveX, [SLM_FPC_RSV01]>;
+def : WriteRes<WriteFMoveY, [SLM_FPC_RSV01]>;
+defm : X86WriteRes<WriteEMMS, [SLM_FPC_RSV01], 10, [10], 9>;
+
+defm : SLMWriteResPair<WriteFAdd, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFAddX, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFAddY, [SLM_FPC_RSV1], 3>;
+defm : X86WriteResPairUnsupported<WriteFAddZ>;
+defm : SLMWriteResPair<WriteFAdd64, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFAdd64X, [SLM_FPC_RSV1], 4, [2]>;
+defm : SLMWriteResPair<WriteFAdd64Y, [SLM_FPC_RSV1], 4, [2]>;
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
+defm : SLMWriteResPair<WriteFCmp, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFCmpX, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFCmpY, [SLM_FPC_RSV1], 3>;
+defm : X86WriteResPairUnsupported<WriteFCmpZ>;
+defm : SLMWriteResPair<WriteFCmp64, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFCmp64X, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFCmp64Y, [SLM_FPC_RSV1], 3>;
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
+defm : SLMWriteResPair<WriteFCom, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFComX, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFMul, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
+defm : SLMWriteResPair<WriteFMulX, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
+defm : SLMWriteResPair<WriteFMulY, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
+defm : X86WriteResPairUnsupported<WriteFMulZ>;
+defm : SLMWriteResPair<WriteFMul64, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
+defm : SLMWriteResPair<WriteFMul64X, [SLM_FPC_RSV0, SLMFPMultiplier], 7, [1,4]>;
+defm : SLMWriteResPair<WriteFMul64Y, [SLM_FPC_RSV0, SLMFPMultiplier], 7, [1,4]>;
+defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+defm : SLMWriteResPair<WriteFDiv, [SLM_FPC_RSV0, SLMFPDivider], 19, [1,17]>;
+defm : SLMWriteResPair<WriteFDivX, [SLM_FPC_RSV0, SLMFPDivider], 39, [1,39]>;
+defm : SLMWriteResPair<WriteFDivY, [SLM_FPC_RSV0, SLMFPDivider], 39, [1,39]>;
+defm : X86WriteResPairUnsupported<WriteFDivZ>;
+defm : SLMWriteResPair<WriteFDiv64, [SLM_FPC_RSV0, SLMFPDivider], 34, [1,32]>;
+defm : SLMWriteResPair<WriteFDiv64X, [SLM_FPC_RSV0, SLMFPDivider], 69, [1,69]>;
+defm : SLMWriteResPair<WriteFDiv64Y, [SLM_FPC_RSV0, SLMFPDivider], 69, [1,69]>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+defm : SLMWriteResPair<WriteFRcp, [SLM_FPC_RSV0], 5>;
+defm : SLMWriteResPair<WriteFRcpX, [SLM_FPC_RSV0], 5>;
+defm : SLMWriteResPair<WriteFRcpY, [SLM_FPC_RSV0], 5>;
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+defm : SLMWriteResPair<WriteFRsqrt, [SLM_FPC_RSV0], 5>;
+defm : SLMWriteResPair<WriteFRsqrtX, [SLM_FPC_RSV0], 5>;
+defm : SLMWriteResPair<WriteFRsqrtY, [SLM_FPC_RSV0], 5>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+defm : SLMWriteResPair<WriteFSqrt, [SLM_FPC_RSV0,SLMFPDivider], 20, [1,20], 1, 3>;
+defm : SLMWriteResPair<WriteFSqrtX, [SLM_FPC_RSV0,SLMFPDivider], 41, [1,40], 1, 3>;
+defm : SLMWriteResPair<WriteFSqrtY, [SLM_FPC_RSV0,SLMFPDivider], 41, [1,40], 1, 3>;
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
+defm : SLMWriteResPair<WriteFSqrt64, [SLM_FPC_RSV0,SLMFPDivider], 35, [1,35], 1, 3>;
+defm : SLMWriteResPair<WriteFSqrt64X, [SLM_FPC_RSV0,SLMFPDivider], 71, [1,70], 1, 3>;
+defm : SLMWriteResPair<WriteFSqrt64Y, [SLM_FPC_RSV0,SLMFPDivider], 71, [1,70], 1, 3>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
+defm : SLMWriteResPair<WriteFSqrt80, [SLM_FPC_RSV0,SLMFPDivider], 40, [1,40]>;
+defm : SLMWriteResPair<WriteDPPD, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteDPPS, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteDPPSY, [SLM_FPC_RSV1], 3>;
+defm : X86WriteResPairUnsupported<WriteDPPSZ>;
+defm : SLMWriteResPair<WriteFSign, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteFRnd, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFRndY, [SLM_FPC_RSV1], 3>;
+defm : X86WriteResPairUnsupported<WriteFRndZ>;
+defm : SLMWriteResPair<WriteFLogic, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteFLogicY, [SLM_FPC_RSV01], 1>;
+defm : X86WriteResPairUnsupported<WriteFLogicZ>;
+defm : SLMWriteResPair<WriteFTest, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteFTestY, [SLM_FPC_RSV01], 1>;
+defm : X86WriteResPairUnsupported<WriteFTestZ>;
+defm : SLMWriteResPair<WriteFShuffle, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteFShuffleY, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
+defm : SLMWriteResPair<WriteFVarShuffle, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteFVarShuffleY,[SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
+defm : SLMWriteResPair<WriteFBlend, [SLM_FPC_RSV0], 1>;
+
+// Conversion between integer and float.
+defm : SLMWriteResPair<WriteCvtSS2I, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPS2I, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPS2IY, [SLM_FPC_RSV01], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+defm : SLMWriteResPair<WriteCvtSD2I, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPD2I, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPD2IY, [SLM_FPC_RSV01], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+
+defm : SLMWriteResPair<WriteCvtI2SS, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtI2PS, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtI2PSY, [SLM_FPC_RSV01], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+defm : SLMWriteResPair<WriteCvtI2SD, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtI2PD, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtI2PDY, [SLM_FPC_RSV01], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+
+defm : SLMWriteResPair<WriteCvtSS2SD, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPS2PD, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPS2PDY, [SLM_FPC_RSV01], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
+defm : SLMWriteResPair<WriteCvtSD2SS, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPD2PS, [SLM_FPC_RSV01], 4>;
+defm : SLMWriteResPair<WriteCvtPD2PSY, [SLM_FPC_RSV01], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
+
+// Vector integer operations.
+def : WriteRes<WriteVecLoad, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteVecLoadX, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteVecLoadY, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteVecLoadNT, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteVecLoadNTY, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteVecMaskedLoad, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteVecMaskedLoadY, [SLM_MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteVecStore, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecStoreX, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecStoreY, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecStoreNT, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecStoreNTY, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecMaskedStore32, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecMaskedStore32Y, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecMaskedStore64, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecMaskedStore64Y, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecMove, [SLM_FPC_RSV01]>;
+def : WriteRes<WriteVecMoveX, [SLM_FPC_RSV01]>;
+def : WriteRes<WriteVecMoveY, [SLM_FPC_RSV01]>;
+def : WriteRes<WriteVecMoveToGpr, [SLM_IEC_RSV01]>;
+def : WriteRes<WriteVecMoveFromGpr, [SLM_IEC_RSV01]>;
+
+defm : SLMWriteResPair<WriteVecShift, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVecShiftX, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVecShiftY, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
+defm : SLMWriteResPair<WriteVecShiftImm, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVecShiftImmX,[SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVecShiftImmY,[SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+defm : SLMWriteResPair<WriteVecLogic, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteVecLogicX,[SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteVecLogicY,[SLM_FPC_RSV01], 1>;
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
+defm : SLMWriteResPair<WriteVecTest, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteVecTestY, [SLM_FPC_RSV01], 1>;
+defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+defm : SLMWriteResPair<WriteVecALU, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteVecALUX, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WriteVecALUY, [SLM_FPC_RSV01], 1>;
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
+defm : SLMWriteResPair<WriteVecIMul, [SLM_FPC_RSV0], 4>;
+defm : SLMWriteResPair<WriteVecIMulX, [SLM_FPC_RSV0], 4>;
+defm : SLMWriteResPair<WriteVecIMulY, [SLM_FPC_RSV0], 4>;
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
+// FIXME: The below is closer to correct, but caused some perf regressions.
+//defm : SLMWriteResPair<WritePMULLD, [SLM_FPC_RSV0], 11, [11], 7>;
+defm : SLMWriteResPair<WritePMULLD, [SLM_FPC_RSV0], 4>;
+defm : SLMWriteResPair<WritePMULLDY, [SLM_FPC_RSV0], 4>;
+defm : X86WriteResPairUnsupported<WritePMULLDZ>;
+defm : SLMWriteResPair<WriteShuffle, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteShuffleY, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteShuffleZ>;
+defm : SLMWriteResPair<WriteShuffleX, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVarShuffle, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVarShuffleX, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVarShuffleY, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+defm : SLMWriteResPair<WriteBlend, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteBlendY, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteBlendZ>;
+defm : SLMWriteResPair<WriteMPSAD, [SLM_FPC_RSV0], 7>;
+defm : SLMWriteResPair<WriteMPSADY, [SLM_FPC_RSV0], 7>;
+defm : X86WriteResPairUnsupported<WriteMPSADZ>;
+defm : SLMWriteResPair<WritePSADBW, [SLM_FPC_RSV0], 4>;
+defm : SLMWriteResPair<WritePSADBWX, [SLM_FPC_RSV0], 4>;
+defm : SLMWriteResPair<WritePSADBWY, [SLM_FPC_RSV0], 4>;
+defm : X86WriteResPairUnsupported<WritePSADBWZ>;
+defm : SLMWriteResPair<WritePHMINPOS, [SLM_FPC_RSV0], 4>;
+
+// Vector insert/extract operations.
+defm : SLMWriteResPair<WriteVecInsert, [SLM_FPC_RSV0], 1>;
+
+def : WriteRes<WriteVecExtract, [SLM_FPC_RSV0]>;
+def : WriteRes<WriteVecExtractSt, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : SLMWriteResPair<WriteFHAdd, [SLM_FPC_RSV01], 6, [6], 4>;
+defm : SLMWriteResPair<WriteFHAddY, [SLM_FPC_RSV01], 6, [6], 4>;
+defm : X86WriteResPairUnsupported<WriteFHAddZ>;
+defm : SLMWriteResPair<WritePHAdd, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WritePHAddX, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WritePHAddY, [SLM_FPC_RSV01], 1>;
+defm : X86WriteResPairUnsupported<WritePHAddZ>;
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+defm : SLMWriteResPair<WritePCmpIStrM, [SLM_FPC_RSV0], 13, [13]>;
+
+// Packed Compare Explicit Length Strings, Return Mask
+defm : SLMWriteResPair<WritePCmpEStrM, [SLM_FPC_RSV0], 17, [17]>;
+// Packed Compare Implicit Length Strings, Return Index
+defm : SLMWriteResPair<WritePCmpIStrI, [SLM_FPC_RSV0], 17, [17]>;
+
+// Packed Compare Explicit Length Strings, Return Index
+defm : SLMWriteResPair<WritePCmpEStrI, [SLM_FPC_RSV0], 21, [21]>;
+
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [SLM_FPC_RSV1]> { let Latency = 4; }
+def : WriteRes<WriteVecMOVMSK, [SLM_FPC_RSV1]> { let Latency = 4; }
+def : WriteRes<WriteVecMOVMSKY, [SLM_FPC_RSV1]> { let Latency = 4; }
+def : WriteRes<WriteMMXMOVMSK, [SLM_FPC_RSV1]> { let Latency = 4; }
+
+// AES Instructions.
+defm : SLMWriteResPair<WriteAESDecEnc, [SLM_FPC_RSV0], 8, [5]>;
+defm : SLMWriteResPair<WriteAESIMC, [SLM_FPC_RSV0], 8, [5]>;
+defm : SLMWriteResPair<WriteAESKeyGen, [SLM_FPC_RSV0], 8, [5]>;
+
+// Carry-less multiplication instructions.
+defm : SLMWriteResPair<WriteCLMul, [SLM_FPC_RSV0], 10, [10]>;
+
+def : WriteRes<WriteSystem, [SLM_FPC_RSV0]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [SLM_FPC_RSV0]> { let Latency = 100; }
+def : WriteRes<WriteFence, [SLM_MEC_RSV]>;
+def : WriteRes<WriteNop, []>;
+
+// AVX/FMA is not supported on this architecture, but we should still define the
+// basic scheduling resources.
+def : WriteRes<WriteIMulH, [SLM_FPC_RSV0]>;
+defm : X86WriteResPairUnsupported<WriteFBlendY>;
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+defm : SLMWriteResPair<WriteVarBlend, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteVarBlendY>;
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
+defm : SLMWriteResPair<WriteFVarBlend, [SLM_FPC_RSV0], 4, [4], 3>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendY>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
+defm : X86WriteResPairUnsupported<WriteFShuffle256>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
+defm : X86WriteResPairUnsupported<WriteShuffle256>;
+defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
+defm : SLMWriteResPair<WriteVarVecShift, [SLM_FPC_RSV0], 1>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+defm : X86WriteResPairUnsupported<WriteFMA>;
+defm : X86WriteResPairUnsupported<WriteFMAX>;
+defm : X86WriteResPairUnsupported<WriteFMAY>;
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
+
+defm : X86WriteResPairUnsupported<WriteCvtPH2PS>;
+defm : X86WriteResPairUnsupported<WriteCvtPH2PSY>;
+defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
+defm : X86WriteResUnsupported<WriteCvtPS2PH>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHY>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHSt>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHYSt>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
+
+// Remaining SLM instrs.
+
+def SLMWriteResGroup1rr : SchedWriteRes<[SLM_FPC_RSV01]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [4];
+}
+def: InstRW<[SLMWriteResGroup1rr], (instrs PADDQrr, PSUBQrr, PCMPEQQrr)>;
+
+def SLMWriteResGroup1rm : SchedWriteRes<[SLM_MEC_RSV,SLM_FPC_RSV01]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,4];
+}
+def: InstRW<[SLMWriteResGroup1rm], (instrs PADDQrm, PSUBQrm, PCMPEQQrm)>;
+
+} // SchedModel
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver1.td b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver1.td
new file mode 100644
index 000000000000..fe09d6f85221
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver1.td
@@ -0,0 +1,1561 @@
+//=- X86ScheduleZnver1.td - X86 Znver1 Scheduling -------------*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Znver1 to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def Znver1Model : SchedMachineModel {
+ // Zen can decode 4 instructions per cycle.
+ let IssueWidth = 4;
+ // Based on the reorder buffer we define MicroOpBufferSize
+ let MicroOpBufferSize = 192;
+ let LoadLatency = 4;
+ let MispredictPenalty = 17;
+ let HighLatency = 25;
+ let PostRAScheduler = 1;
+
+  // FIXME: The model is still incomplete: not all instructions have been
+  // covered yet, so CompleteModel is cleared to reflect that.
+ let CompleteModel = 0;
+}
+
+let SchedModel = Znver1Model in {
+
+// Zen can issue micro-ops to 10 different units in one cycle.
+// These are
+// * Four integer ALU units (ZALU0, ZALU1, ZALU2, ZALU3)
+// * Two AGU units (ZAGU0, ZAGU1)
+// * Four FPU units (ZFPU0, ZFPU1, ZFPU2, ZFPU3)
+// The AGUs feed the load/store queues at a rate of two loads and one store per cycle.
+
+// Four ALU units are defined below
+def ZnALU0 : ProcResource<1>;
+def ZnALU1 : ProcResource<1>;
+def ZnALU2 : ProcResource<1>;
+def ZnALU3 : ProcResource<1>;
+
+// Two AGU units are defined below
+def ZnAGU0 : ProcResource<1>;
+def ZnAGU1 : ProcResource<1>;
+
+// Four FPU units are defined below
+def ZnFPU0 : ProcResource<1>;
+def ZnFPU1 : ProcResource<1>;
+def ZnFPU2 : ProcResource<1>;
+def ZnFPU3 : ProcResource<1>;
+
+// FPU grouping
+def ZnFPU013 : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU3]>;
+def ZnFPU01 : ProcResGroup<[ZnFPU0, ZnFPU1]>;
+def ZnFPU12 : ProcResGroup<[ZnFPU1, ZnFPU2]>;
+def ZnFPU13 : ProcResGroup<[ZnFPU1, ZnFPU3]>;
+def ZnFPU23 : ProcResGroup<[ZnFPU2, ZnFPU3]>;
+def ZnFPU02 : ProcResGroup<[ZnFPU0, ZnFPU2]>;
+def ZnFPU03 : ProcResGroup<[ZnFPU0, ZnFPU3]>;
+
+// Below are the groupings of the units.
+// Micro-ops that can be issued to multiple units are modeled with these groups.
+
+// ALU grouping
+// ZnALU03 - 0,3 grouping
+def ZnALU03: ProcResGroup<[ZnALU0, ZnALU3]>;
+
+// 56 Entry (14x4 entries) Int Scheduler
+def ZnALU : ProcResGroup<[ZnALU0, ZnALU1, ZnALU2, ZnALU3]> {
+ let BufferSize=56;
+}
+
+// 28 Entry (14x2) AGU group. AGUs can't be used for all ALU operations
+// but are relevant for some instructions
+def ZnAGU : ProcResGroup<[ZnAGU0, ZnAGU1]> {
+ let BufferSize=28;
+}
+
+// Integer Multiplication issued on ALU1.
+def ZnMultiplier : ProcResource<1>;
+
+// Integer division issued on ALU2.
+def ZnDivider : ProcResource<1>;
+
+// The integer load-to-use latency is 4 cycles.
+def : ReadAdvance<ReadAfterLd, 4>;
+
+// The vector load-to-use latency is 8 cycles.
+def : ReadAdvance<ReadAfterVecLd, 8>;
+def : ReadAdvance<ReadAfterVecXLd, 8>;
+def : ReadAdvance<ReadAfterVecYLd, 8>;
+
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
+// The Integer PRF for Zen is 168 entries, and it holds the architectural and
+// speculative versions of the 64-bit integer registers.
+// Reference: "Software Optimization Guide for AMD Family 17h Processors"
+def ZnIntegerPRF : RegisterFile<168, [GR64, CCR]>;
+
+// 36 Entry (9x4 entries) floating-point Scheduler
+def ZnFPU : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU2, ZnFPU3]> {
+let BufferSize=36;
+}
+
+// The Zen FP Retire Queue renames SIMD and FP uOps onto a pool of 160 128-bit
+// registers. Operations on 256-bit data types are cracked into two COPs.
+// Reference: "Software Optimization Guide for AMD Family 17h Processors"
+def ZnFpuPRF: RegisterFile<160, [VR64, VR128, VR256], [1, 1, 2]>;
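+// The [1, 1, 2] cost list says that renaming a VR64 or VR128 value consumes one
+// 128-bit physical register while a VR256 value consumes two, matching the note
+// above that 256-bit operations are cracked into two COPs.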
+
+// The retire control unit can track up to 192 macro ops in flight and handles
+// in-order commit of up to 8 macro ops per cycle.
+// Reference: "Software Optimization Guide for AMD Family 17h Processors"
+// Note that the retire unit is shared between integer and FP ops. In SMT mode
+// it is 96 entries per thread, but we do not use that more conservative value
+// here because there is currently no way to fully model SMT mode, so there is
+// no point in trying.
+def ZnRCU : RetireControlUnit<192, 8>;
+
+// FIXME: there are 72 read buffers and 44 write buffers.
+
+// A folded load is an instruction that loads from memory and performs an
+// operation on the result, e.g. ADDPD xmm, [mem]. Instructions with folded
+// loads are usually micro-fused, so they appear as only two micro-ops:
+//  a. the load, and
+//  b. the operation (here, addpd).
+// This multiclass defines the resource usage for integer-unit variants with
+// and without a folded load.
+multiclass ZnWriteResPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [], int UOps = 1,
+ int LoadLat = 4, int LoadUOps = 1> {
+  // The register variant takes 1 cycle on the execution port.
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+
+  // The memory variant also uses a cycle on ZnAGU and adds LoadLat cycles to
+  // the latency (default = 4).
+ def : WriteRes<SchedRW.Folded, !listconcat([ZnAGU], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
+ let NumMicroOps = !add(UOps, LoadUOps);
+ }
+}
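+// As an illustrative sketch, the instantiation
+// `defm : ZnWriteResPair<WriteALU, [ZnALU], 1>` below expands to roughly:
+//   def : WriteRes<WriteALU,   [ZnALU]>        { let Latency = 1; }
+//   def : WriteRes<WriteALULd, [ZnAGU, ZnALU]> { let Latency = 5;
+//                                                let NumMicroOps = 2; }
+// i.e. the folded-load form adds a ZnAGU cycle, the 4-cycle load latency and
+// one extra micro-op.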
+
+// This multiclass is for folded loads for floating point units.
+multiclass ZnWriteResFpuPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [], int UOps = 1,
+ int LoadLat = 7, int LoadUOps = 0> {
+  // The register variant takes 1 cycle on the execution port.
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+
+  // The memory variant also uses a cycle on ZnAGU and adds LoadLat cycles to
+  // the latency (default = 7).
+ def : WriteRes<SchedRW.Folded, !listconcat([ZnAGU], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
+ let NumMicroOps = !add(UOps, LoadUOps);
+ }
+}
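+// The FP pairing differs only in its load defaults: for example
+// `defm : ZnWriteResFpuPair<WriteFAdd, [ZnFPU0], 3>` yields roughly
+//   def : WriteRes<WriteFAdd,   [ZnFPU0]>        { let Latency = 3;  }
+//   def : WriteRes<WriteFAddLd, [ZnAGU, ZnFPU0]> { let Latency = 10; }
+// i.e. a 7-cycle load penalty and, by default, no extra micro-op for the fold.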
+
+// WriteRMW is set for instructions that perform a memory write operation in
+// codegen.
+def : WriteRes<WriteRMW, [ZnAGU]>;
+
+def : WriteRes<WriteStore, [ZnAGU]>;
+def : WriteRes<WriteStoreNT, [ZnAGU]>;
+def : WriteRes<WriteMove, [ZnALU]>;
+def : WriteRes<WriteLoad, [ZnAGU]> { let Latency = 8; }
+
+def : WriteRes<WriteZero, []>;
+def : WriteRes<WriteLEA, [ZnALU]>;
+defm : ZnWriteResPair<WriteALU, [ZnALU], 1>;
+defm : ZnWriteResPair<WriteADC, [ZnALU], 1>;
+
+defm : ZnWriteResPair<WriteIMul8, [ZnALU1, ZnMultiplier], 4>;
+//defm : ZnWriteResPair<WriteIMul16, [ZnALU1, ZnMultiplier], 4>;
+//defm : ZnWriteResPair<WriteIMul16Imm, [ZnALU1, ZnMultiplier], 4>;
+//defm : ZnWriteResPair<WriteIMul16Reg, [ZnALU1, ZnMultiplier], 4>;
+//defm : ZnWriteResPair<WriteIMul32, [ZnALU1, ZnMultiplier], 4>;
+//defm : ZnWriteResPair<WriteIMul32Imm, [ZnALU1, ZnMultiplier], 4>;
+//defm : ZnWriteResPair<WriteIMul32Reg, [ZnALU1, ZnMultiplier], 4>;
+//defm : ZnWriteResPair<WriteIMul64, [ZnALU1, ZnMultiplier], 4, [1,1], 2>;
+//defm : ZnWriteResPair<WriteIMul64Imm, [ZnALU1, ZnMultiplier], 4, [1,1], 2>;
+//defm : ZnWriteResPair<WriteIMul64Reg, [ZnALU1, ZnMultiplier], 4, [1,1], 2>;
+
+defm : X86WriteRes<WriteBSWAP32, [ZnALU], 1, [4], 1>;
+defm : X86WriteRes<WriteBSWAP64, [ZnALU], 1, [4], 1>;
+defm : X86WriteRes<WriteCMPXCHG, [ZnALU], 1, [1], 1>;
+defm : X86WriteRes<WriteCMPXCHGRMW,[ZnALU,ZnAGU], 8, [1,1], 5>;
+defm : X86WriteRes<WriteXCHG, [ZnALU], 1, [2], 2>;
+
+defm : ZnWriteResPair<WriteShift, [ZnALU], 1>;
+defm : ZnWriteResPair<WriteShiftCL, [ZnALU], 1>;
+defm : ZnWriteResPair<WriteRotate, [ZnALU], 1>;
+defm : ZnWriteResPair<WriteRotateCL, [ZnALU], 1>;
+
+defm : X86WriteRes<WriteSHDrri, [ZnALU], 1, [1], 1>;
+defm : X86WriteResUnsupported<WriteSHDrrcl>;
+defm : X86WriteResUnsupported<WriteSHDmri>;
+defm : X86WriteResUnsupported<WriteSHDmrcl>;
+
+defm : ZnWriteResPair<WriteJump, [ZnALU], 1>;
+defm : ZnWriteResFpuPair<WriteCRC32, [ZnFPU0], 3>;
+
+defm : ZnWriteResPair<WriteCMOV, [ZnALU], 1>;
+def : WriteRes<WriteSETCC, [ZnALU]>;
+def : WriteRes<WriteSETCCStore, [ZnALU, ZnAGU]>;
+defm : X86WriteRes<WriteLAHFSAHF, [ZnALU], 2, [1], 2>;
+
+defm : X86WriteRes<WriteBitTest, [ZnALU], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTestImmLd, [ZnALU,ZnAGU], 5, [1,1], 2>;
+defm : X86WriteRes<WriteBitTestRegLd, [ZnALU,ZnAGU], 5, [1,1], 2>;
+defm : X86WriteRes<WriteBitTestSet, [ZnALU], 2, [1], 2>;
+//defm : X86WriteRes<WriteBitTestSetImmLd, [ZnALU,ZnAGU], 5, [1,1], 2>;
+//defm : X86WriteRes<WriteBitTestSetRegLd, [ZnALU,ZnAGU], 5, [1,1], 2>;
+
+// Bit counts.
+defm : ZnWriteResPair<WriteBSF, [ZnALU], 3>;
+defm : ZnWriteResPair<WriteBSR, [ZnALU], 3>;
+defm : ZnWriteResPair<WriteLZCNT, [ZnALU], 2>;
+defm : ZnWriteResPair<WriteTZCNT, [ZnALU], 2>;
+defm : ZnWriteResPair<WritePOPCNT, [ZnALU], 1>;
+
+// Treat misc copies as a move.
+def : InstRW<[WriteMove], (instrs COPY)>;
+
+// BMI1 BEXTR/BLS, BMI2 BZHI
+defm : ZnWriteResPair<WriteBEXTR, [ZnALU], 1>;
+//defm : ZnWriteResPair<WriteBLS, [ZnALU], 2>;
+defm : ZnWriteResPair<WriteBZHI, [ZnALU], 1>;
+
+// IDIV
+defm : ZnWriteResPair<WriteDiv8, [ZnALU2, ZnDivider], 15, [1,15], 1>;
+defm : ZnWriteResPair<WriteDiv16, [ZnALU2, ZnDivider], 17, [1,17], 2>;
+defm : ZnWriteResPair<WriteDiv32, [ZnALU2, ZnDivider], 25, [1,25], 2>;
+defm : ZnWriteResPair<WriteDiv64, [ZnALU2, ZnDivider], 41, [1,41], 2>;
+defm : ZnWriteResPair<WriteIDiv8, [ZnALU2, ZnDivider], 15, [1,15], 1>;
+defm : ZnWriteResPair<WriteIDiv16, [ZnALU2, ZnDivider], 17, [1,17], 2>;
+defm : ZnWriteResPair<WriteIDiv32, [ZnALU2, ZnDivider], 25, [1,25], 2>;
+defm : ZnWriteResPair<WriteIDiv64, [ZnALU2, ZnDivider], 41, [1,41], 2>;
+
+// IMULH
+def : WriteRes<WriteIMulH, [ZnALU1, ZnMultiplier]>{
+ let Latency = 4;
+}
+
+// Floating point operations
+defm : X86WriteRes<WriteFLoad, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteFLoadX, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteFMaskedLoad, [ZnAGU,ZnFPU01], 8, [1,1], 1>;
+defm : X86WriteRes<WriteFMaskedLoadY, [ZnAGU,ZnFPU01], 8, [1,2], 2>;
+defm : X86WriteRes<WriteFStore, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFStoreX, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFStoreY, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFStoreNT, [ZnAGU,ZnFPU2], 8, [1,1], 1>;
+defm : X86WriteRes<WriteFStoreNTX, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFStoreNTY, [ZnAGU], 1, [1], 1>;
+
+defm : X86WriteRes<WriteFMaskedStore32, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
+defm : X86WriteRes<WriteFMaskedStore64, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
+
+defm : X86WriteRes<WriteFMove, [ZnFPU], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveX, [ZnFPU], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveY, [ZnFPU], 1, [1], 1>;
+
+defm : ZnWriteResFpuPair<WriteFAdd, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFAddX, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFAddY, [ZnFPU0], 3>;
+defm : X86WriteResPairUnsupported<WriteFAddZ>;
+defm : ZnWriteResFpuPair<WriteFAdd64, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFAdd64X, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFAdd64Y, [ZnFPU0], 3>;
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
+defm : ZnWriteResFpuPair<WriteFCmp, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFCmpX, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFCmpY, [ZnFPU0], 3>;
+defm : X86WriteResPairUnsupported<WriteFCmpZ>;
+defm : ZnWriteResFpuPair<WriteFCmp64, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFCmp64X, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFCmp64Y, [ZnFPU0], 3>;
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
+defm : ZnWriteResFpuPair<WriteFCom, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFComX, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFBlend, [ZnFPU01], 1>;
+defm : ZnWriteResFpuPair<WriteFBlendY, [ZnFPU01], 1>;
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+defm : ZnWriteResFpuPair<WriteFVarBlend, [ZnFPU01], 1>;
+defm : ZnWriteResFpuPair<WriteFVarBlendY,[ZnFPU01], 1>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
+defm : ZnWriteResFpuPair<WriteVarBlend, [ZnFPU0], 1>;
+defm : ZnWriteResFpuPair<WriteVarBlendY, [ZnFPU0], 1>;
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
+defm : ZnWriteResFpuPair<WriteCvtSS2I, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtPS2I, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtPS2IY, [ZnFPU3], 5>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+defm : ZnWriteResFpuPair<WriteCvtSD2I, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtPD2I, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtPD2IY, [ZnFPU3], 5>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+defm : ZnWriteResFpuPair<WriteCvtI2SS, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtI2PS, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtI2PSY, [ZnFPU3], 5>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+defm : ZnWriteResFpuPair<WriteCvtI2SD, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtI2PD, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtI2PDY, [ZnFPU3], 5>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+defm : ZnWriteResFpuPair<WriteFDiv, [ZnFPU3], 15>;
+defm : ZnWriteResFpuPair<WriteFDivX, [ZnFPU3], 15>;
+//defm : ZnWriteResFpuPair<WriteFDivY, [ZnFPU3], 15>;
+defm : X86WriteResPairUnsupported<WriteFDivZ>;
+defm : ZnWriteResFpuPair<WriteFDiv64, [ZnFPU3], 15>;
+defm : ZnWriteResFpuPair<WriteFDiv64X, [ZnFPU3], 15>;
+//defm : ZnWriteResFpuPair<WriteFDiv64Y, [ZnFPU3], 15>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+defm : ZnWriteResFpuPair<WriteFSign, [ZnFPU3], 2>;
+defm : ZnWriteResFpuPair<WriteFRnd, [ZnFPU3], 4, [1], 1, 7, 1>; // FIXME: Should folds require 1 extra uops?
+defm : ZnWriteResFpuPair<WriteFRndY, [ZnFPU3], 4, [1], 1, 7, 1>; // FIXME: Should folds require 1 extra uops?
+defm : X86WriteResPairUnsupported<WriteFRndZ>;
+defm : ZnWriteResFpuPair<WriteFLogic, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteFLogicY, [ZnFPU], 1>;
+defm : X86WriteResPairUnsupported<WriteFLogicZ>;
+defm : ZnWriteResFpuPair<WriteFTest, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteFTestY, [ZnFPU], 1>;
+defm : X86WriteResPairUnsupported<WriteFTestZ>;
+defm : ZnWriteResFpuPair<WriteFShuffle, [ZnFPU12], 1>;
+defm : ZnWriteResFpuPair<WriteFShuffleY, [ZnFPU12], 1>;
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
+defm : ZnWriteResFpuPair<WriteFVarShuffle, [ZnFPU12], 1>;
+defm : ZnWriteResFpuPair<WriteFVarShuffleY,[ZnFPU12], 1>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
+defm : ZnWriteResFpuPair<WriteFMul, [ZnFPU01], 3, [1], 1, 7, 1>;
+defm : ZnWriteResFpuPair<WriteFMulX, [ZnFPU01], 3, [1], 1, 7, 1>;
+defm : ZnWriteResFpuPair<WriteFMulY, [ZnFPU01], 4, [1], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteFMulZ>;
+defm : ZnWriteResFpuPair<WriteFMul64, [ZnFPU01], 3, [1], 1, 7, 1>;
+defm : ZnWriteResFpuPair<WriteFMul64X, [ZnFPU01], 3, [1], 1, 7, 1>;
+defm : ZnWriteResFpuPair<WriteFMul64Y, [ZnFPU01], 4, [1], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+defm : ZnWriteResFpuPair<WriteFMA, [ZnFPU03], 5>;
+defm : ZnWriteResFpuPair<WriteFMAX, [ZnFPU03], 5>;
+defm : ZnWriteResFpuPair<WriteFMAY, [ZnFPU03], 5>;
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
+defm : ZnWriteResFpuPair<WriteFRcp, [ZnFPU01], 5>;
+defm : ZnWriteResFpuPair<WriteFRcpX, [ZnFPU01], 5>;
+defm : ZnWriteResFpuPair<WriteFRcpY, [ZnFPU01], 5, [1], 1, 7, 2>;
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+//defm : ZnWriteResFpuPair<WriteFRsqrt, [ZnFPU02], 5>;
+defm : ZnWriteResFpuPair<WriteFRsqrtX, [ZnFPU01], 5, [1], 1, 7, 1>;
+//defm : ZnWriteResFpuPair<WriteFRsqrtY, [ZnFPU01], 5, [2], 2>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+defm : ZnWriteResFpuPair<WriteFSqrt, [ZnFPU3], 20, [20]>;
+defm : ZnWriteResFpuPair<WriteFSqrtX, [ZnFPU3], 20, [20]>;
+defm : ZnWriteResFpuPair<WriteFSqrtY, [ZnFPU3], 28, [28], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
+defm : ZnWriteResFpuPair<WriteFSqrt64, [ZnFPU3], 20, [20]>;
+defm : ZnWriteResFpuPair<WriteFSqrt64X, [ZnFPU3], 20, [20]>;
+defm : ZnWriteResFpuPair<WriteFSqrt64Y, [ZnFPU3], 40, [40], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
+defm : ZnWriteResFpuPair<WriteFSqrt80, [ZnFPU3], 20, [20]>;
+
+// Vector integer operations which uses FPU units
+defm : X86WriteRes<WriteVecLoad, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [ZnAGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [ZnAGU,ZnFPU01], 8, [1,2], 2>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [ZnAGU,ZnFPU01], 9, [1,3], 2>;
+defm : X86WriteRes<WriteVecStore, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecStoreX, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecStoreY, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecStoreNT, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecStoreNTY, [ZnAGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedStore32, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteVecMaskedStore32Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
+defm : X86WriteRes<WriteVecMaskedStore64, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteVecMaskedStore64Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
+defm : X86WriteRes<WriteVecMove, [ZnFPU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [ZnFPU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [ZnFPU], 2, [1], 2>;
+defm : X86WriteRes<WriteVecMoveToGpr, [ZnFPU2], 2, [1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [ZnFPU2], 3, [1], 1>;
+defm : X86WriteRes<WriteEMMS, [ZnFPU], 2, [1], 1>;
+
+defm : ZnWriteResFpuPair<WriteVecShift, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecShiftX, [ZnFPU2], 1>;
+defm : ZnWriteResFpuPair<WriteVecShiftY, [ZnFPU2], 2>;
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
+defm : ZnWriteResFpuPair<WriteVecShiftImm, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecShiftImmX, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecShiftImmY, [ZnFPU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+defm : ZnWriteResFpuPair<WriteVecLogic, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecLogicX, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecLogicY, [ZnFPU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
+defm : ZnWriteResFpuPair<WriteVecTest, [ZnFPU12], 1, [2], 1, 7, 1>;
+defm : ZnWriteResFpuPair<WriteVecTestY, [ZnFPU12], 1, [2], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+defm : ZnWriteResFpuPair<WriteVecALU, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecALUX, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecALUY, [ZnFPU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
+defm : ZnWriteResFpuPair<WriteVecIMul, [ZnFPU0], 4>;
+defm : ZnWriteResFpuPair<WriteVecIMulX, [ZnFPU0], 4>;
+defm : ZnWriteResFpuPair<WriteVecIMulY, [ZnFPU0], 4>;
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
+defm : ZnWriteResFpuPair<WritePMULLD, [ZnFPU0], 4, [1], 1, 7, 1>; // FIXME
+defm : ZnWriteResFpuPair<WritePMULLDY, [ZnFPU0], 5, [2], 1, 7, 1>; // FIXME
+defm : X86WriteResPairUnsupported<WritePMULLDZ>;
+defm : ZnWriteResFpuPair<WriteShuffle, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteShuffleX, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteShuffleY, [ZnFPU], 1>;
+defm : X86WriteResPairUnsupported<WriteShuffleZ>;
+defm : ZnWriteResFpuPair<WriteVarShuffle, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVarShuffleX,[ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVarShuffleY,[ZnFPU], 1>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+defm : ZnWriteResFpuPair<WriteBlend, [ZnFPU01], 1>;
+defm : ZnWriteResFpuPair<WriteBlendY, [ZnFPU01], 1>;
+defm : X86WriteResPairUnsupported<WriteBlendZ>;
+defm : ZnWriteResFpuPair<WriteShuffle256, [ZnFPU], 2>;
+defm : ZnWriteResFpuPair<WriteVarShuffle256, [ZnFPU], 2>;
+defm : ZnWriteResFpuPair<WritePSADBW, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WritePSADBWX, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WritePSADBWY, [ZnFPU0], 3>;
+defm : X86WriteResPairUnsupported<WritePSADBWZ>;
+defm : ZnWriteResFpuPair<WritePHMINPOS, [ZnFPU0], 4>;
+
+// Vector Shift Operations
+defm : ZnWriteResFpuPair<WriteVarVecShift, [ZnFPU12], 1>;
+defm : ZnWriteResFpuPair<WriteVarVecShiftY, [ZnFPU12], 1>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+
+// Vector insert/extract operations.
+defm : ZnWriteResFpuPair<WriteVecInsert, [ZnFPU], 1>;
+
+def : WriteRes<WriteVecExtract, [ZnFPU12, ZnFPU2]> {
+ let Latency = 2;
+ let ResourceCycles = [1, 2];
+}
+def : WriteRes<WriteVecExtractSt, [ZnAGU, ZnFPU12, ZnFPU2]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2, 3];
+}
+
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [ZnFPU2]>;
+def : WriteRes<WriteMMXMOVMSK, [ZnFPU2]>;
+def : WriteRes<WriteVecMOVMSK, [ZnFPU2]>;
+
+def : WriteRes<WriteVecMOVMSKY, [ZnFPU2]> {
+ let NumMicroOps = 2;
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+
+// AES Instructions.
+defm : ZnWriteResFpuPair<WriteAESDecEnc, [ZnFPU01], 4>;
+defm : ZnWriteResFpuPair<WriteAESIMC, [ZnFPU01], 4>;
+defm : ZnWriteResFpuPair<WriteAESKeyGen, [ZnFPU01], 4>;
+
+def : WriteRes<WriteFence, [ZnAGU]>;
+def : WriteRes<WriteNop, []>;
+
+// The following instructions are microcoded. We set a long latency (100 cycles)
+// so that they effectively block the entire pipeline.
+defm : ZnWriteResFpuPair<WriteFShuffle256, [ZnFPU], 100>;
+defm : ZnWriteResFpuPair<WriteFVarShuffle256, [ZnFPU], 100>;
+
+// Microcoded Instructions
+def ZnWriteMicrocoded : SchedWriteRes<[]> {
+ let Latency = 100;
+}
+
+def : SchedAlias<WriteMicrocoded, ZnWriteMicrocoded>;
+def : SchedAlias<WriteFCMOV, ZnWriteMicrocoded>;
+def : SchedAlias<WriteSystem, ZnWriteMicrocoded>;
+def : SchedAlias<WriteMPSAD, ZnWriteMicrocoded>;
+def : SchedAlias<WriteMPSADY, ZnWriteMicrocoded>;
+def : SchedAlias<WriteMPSADLd, ZnWriteMicrocoded>;
+def : SchedAlias<WriteMPSADYLd, ZnWriteMicrocoded>;
+def : SchedAlias<WriteCLMul, ZnWriteMicrocoded>;
+def : SchedAlias<WriteCLMulLd, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpIStrM, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpIStrMLd, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpEStrI, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpEStrILd, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpEStrM, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpEStrMLd, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpIStrI, ZnWriteMicrocoded>;
+def : SchedAlias<WritePCmpIStrILd, ZnWriteMicrocoded>;
+def : SchedAlias<WriteLDMXCSR, ZnWriteMicrocoded>;
+def : SchedAlias<WriteSTMXCSR, ZnWriteMicrocoded>;
+
+//=== Regex based InstRW ===//
+// Notation:
+// - r: register.
+// - m: memory.
+// - i: immediate.
+// - mm: 64-bit mmx register.
+// - x: 128-bit xmm register.
+// - (x)mm: mmx or xmm register.
+// - y: 256-bit ymm register.
+// - v: any vector register.
+
+//=== Integer Instructions ===//
+//-- Move instructions --//
+// MOV.
+// r16,m.
+def : InstRW<[WriteALULd, ReadAfterLd], (instrs MOV16rm)>;
+
+// MOVSX, MOVZX.
+// r,m.
+def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm(8|16)")>;
+
+// XCHG.
+// r,m.
+def ZnWriteXCHGrm : SchedWriteRes<[ZnAGU, ZnALU]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteXCHGrm, ReadAfterLd], (instregex "XCHG(8|16|32|64)rm")>;
+
+def : InstRW<[WriteMicrocoded], (instrs XLAT)>;
+
+// POP16.
+// r.
+def ZnWritePop16r : SchedWriteRes<[ZnAGU]>{
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWritePop16r], (instrs POP16rmm)>;
+def : InstRW<[WriteMicrocoded], (instregex "POPF(16|32)")>;
+def : InstRW<[WriteMicrocoded], (instregex "POPA(16|32)")>;
+
+
+// PUSH.
+// r. Has default values.
+// m.
+def ZnWritePUSH : SchedWriteRes<[ZnAGU]>{
+ let Latency = 4;
+}
+def : InstRW<[ZnWritePUSH], (instregex "PUSH(16|32)rmm")>;
+
+// PUSHF
+def : InstRW<[WriteMicrocoded], (instregex "PUSHF(16|32)")>;
+
+// PUSHA.
+def ZnWritePushA : SchedWriteRes<[ZnAGU]> {
+ let Latency = 8;
+}
+def : InstRW<[ZnWritePushA], (instregex "PUSHA(16|32)")>;
+
+// LAHF
+def : InstRW<[WriteMicrocoded], (instrs LAHF)>;
+
+// MOVBE.
+// r,m.
+def ZnWriteMOVBE : SchedWriteRes<[ZnAGU, ZnALU]> {
+ let Latency = 5;
+}
+def : InstRW<[ZnWriteMOVBE, ReadAfterLd], (instregex "MOVBE(16|32|64)rm")>;
+
+// m16,r16.
+def : InstRW<[ZnWriteMOVBE], (instregex "MOVBE(16|32|64)mr")>;
+
+//-- Arithmetic instructions --//
+
+// ADD SUB.
+// m,r/i.
+def : InstRW<[WriteALULd], (instregex "(ADD|SUB)(8|16|32|64)m(r|i)",
+ "(ADD|SUB)(8|16|32|64)mi8",
+ "(ADD|SUB)64mi32")>;
+
+// ADC SBB.
+// m,r/i.
+def : InstRW<[WriteALULd],
+ (instregex "(ADC|SBB)(8|16|32|64)m(r|i)",
+ "(ADC|SBB)(16|32|64)mi8",
+ "(ADC|SBB)64mi32")>;
+
+// INC DEC NOT NEG.
+// m.
+def : InstRW<[WriteALULd],
+ (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m")>;
+
+// MUL IMUL.
+// r16.
+def ZnWriteMul16 : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
+ let Latency = 3;
+}
+def : SchedAlias<WriteIMul16, ZnWriteMul16>;
+def : SchedAlias<WriteIMul16Imm, ZnWriteMul16>; // TODO: is this right?
+def : SchedAlias<WriteIMul16Reg, ZnWriteMul16>; // TODO: is this right?
+def : SchedAlias<WriteIMul16ImmLd, ZnWriteMul16>; // TODO: this is definitely wrong but matches what the instregex did.
+def : SchedAlias<WriteIMul16RegLd, ZnWriteMul16>; // TODO: this is definitely wrong but matches what the instregex did.
+
+// m16.
+def ZnWriteMul16Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
+ let Latency = 8;
+}
+def : SchedAlias<WriteIMul16Ld, ZnWriteMul16Ld>;
+
+// r32.
+def ZnWriteMul32 : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
+ let Latency = 3;
+}
+def : SchedAlias<WriteIMul32, ZnWriteMul32>;
+def : SchedAlias<WriteIMul32Imm, ZnWriteMul32>; // TODO: is this right?
+def : SchedAlias<WriteIMul32Reg, ZnWriteMul32>; // TODO: is this right?
+def : SchedAlias<WriteIMul32ImmLd, ZnWriteMul32>; // TODO: this is definitely wrong but matches what the instregex did.
+def : SchedAlias<WriteIMul32RegLd, ZnWriteMul32>; // TODO: this is definitely wrong but matches what the instregex did.
+
+// m32.
+def ZnWriteMul32Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
+ let Latency = 8;
+}
+def : SchedAlias<WriteIMul32Ld, ZnWriteMul32Ld>;
+
+// r64.
+def ZnWriteMul64 : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def : SchedAlias<WriteIMul64, ZnWriteMul64>;
+def : SchedAlias<WriteIMul64Imm, ZnWriteMul64>; // TODO: is this right?
+def : SchedAlias<WriteIMul64Reg, ZnWriteMul64>; // TODO: is this right?
+def : SchedAlias<WriteIMul64ImmLd, ZnWriteMul64>; // TODO: this is definitely wrong but matches what the instregex did.
+def : SchedAlias<WriteIMul64RegLd, ZnWriteMul64>; // TODO: this is definitely wrong but matches what the instregex did.
+
+// m64.
+def ZnWriteMul64Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+}
+def : SchedAlias<WriteIMul64Ld, ZnWriteMul64Ld>;
+
+// MULX.
+// r32,r32,r32.
+def ZnWriteMulX32 : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
+ let Latency = 3;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[ZnWriteMulX32], (instrs MULX32rr)>;
+
+// r32,r32,m32.
+def ZnWriteMulX32Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
+ let Latency = 8;
+ let ResourceCycles = [1, 2, 2];
+}
+def : InstRW<[ZnWriteMulX32Ld, ReadAfterLd], (instrs MULX32rm)>;
+
+// r64,r64,r64.
+def ZnWriteMulX64 : SchedWriteRes<[ZnALU1]> {
+ let Latency = 3;
+}
+def : InstRW<[ZnWriteMulX64], (instrs MULX64rr)>;
+
+// r64,r64,m64.
+def ZnWriteMulX64Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
+ let Latency = 8;
+}
+def : InstRW<[ZnWriteMulX64Ld, ReadAfterLd], (instrs MULX64rm)>;
+
+//-- Control transfer instructions --//
+
+// J(E|R)CXZ.
+def ZnWriteJCXZ : SchedWriteRes<[ZnALU03]>;
+def : InstRW<[ZnWriteJCXZ], (instrs JCXZ, JECXZ, JRCXZ)>;
+
+// INTO
+def : InstRW<[WriteMicrocoded], (instrs INTO)>;
+
+// LOOP.
+def ZnWriteLOOP : SchedWriteRes<[ZnALU03]>;
+def : InstRW<[ZnWriteLOOP], (instrs LOOP)>;
+
+// LOOP(N)E, LOOP(N)Z
+def ZnWriteLOOPE : SchedWriteRes<[ZnALU03]>;
+def : InstRW<[ZnWriteLOOPE], (instrs LOOPE, LOOPNE)>;
+
+// CALL.
+// r.
+def ZnWriteCALLr : SchedWriteRes<[ZnAGU, ZnALU03]>;
+def : InstRW<[ZnWriteCALLr], (instregex "CALL(16|32)r")>;
+
+def : InstRW<[WriteMicrocoded], (instregex "CALL(16|32)m")>;
+
+// RET.
+def ZnWriteRET : SchedWriteRes<[ZnALU03]> {
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)",
+ "IRET(16|32|64)")>;
+
+//-- Logic instructions --//
+
+// AND OR XOR.
+// m,r/i.
+def : InstRW<[WriteALULd],
+ (instregex "(AND|OR|XOR)(8|16|32|64)m(r|i)",
+ "(AND|OR|XOR)(8|16|32|64)mi8", "(AND|OR|XOR)64mi32")>;
+
+// Define ALU latency variants
+def ZnWriteALULat2 : SchedWriteRes<[ZnALU]> {
+ let Latency = 2;
+}
+def ZnWriteALULat2Ld : SchedWriteRes<[ZnAGU, ZnALU]> {
+ let Latency = 6;
+}
+
+// BTR BTS BTC.
+// m,r,i.
+def ZnWriteBTRSCm : SchedWriteRes<[ZnAGU, ZnALU]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+// m,r,i.
+def : SchedAlias<WriteBitTestSetImmRMW, ZnWriteBTRSCm>;
+def : SchedAlias<WriteBitTestSetRegRMW, ZnWriteBTRSCm>;
+
+// BLSI BLSMSK BLSR.
+// r,r.
+def : SchedAlias<WriteBLS, ZnWriteALULat2>;
+// r,m.
+def : SchedAlias<WriteBLSLd, ZnWriteALULat2Ld>;
+
+// CLD STD.
+def : InstRW<[WriteALU], (instrs STD, CLD)>;
+
+// PDEP PEXT.
+// r,r,r.
+def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>;
+// r,r,m.
+def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>;
+
+// RCR RCL.
+// m,i.
+def : InstRW<[WriteMicrocoded], (instregex "RC(R|L)(8|16|32|64)m(1|i|CL)")>;
+
+// SHR SHL SAR.
+// m,i.
+def : InstRW<[WriteShiftLd], (instregex "S(A|H)(R|L)(8|16|32|64)m(i|1)")>;
+
+// SHRD SHLD.
+// m,r
+def : InstRW<[WriteShiftLd], (instregex "SH(R|L)D(16|32|64)mri8")>;
+
+// r,r,cl.
+def : InstRW<[WriteMicrocoded], (instregex "SH(R|L)D(16|32|64)rrCL")>;
+
+// m,r,cl.
+def : InstRW<[WriteMicrocoded], (instregex "SH(R|L)D(16|32|64)mrCL")>;
+
+//-- Misc instructions --//
+// CMPXCHG8B.
+def ZnWriteCMPXCHG8B : SchedWriteRes<[ZnAGU, ZnALU]> {
+ let NumMicroOps = 18;
+}
+def : InstRW<[ZnWriteCMPXCHG8B], (instrs CMPXCHG8B)>;
+
+def : InstRW<[WriteMicrocoded], (instrs CMPXCHG16B)>;
+
+// LEAVE
+def ZnWriteLEAVE : SchedWriteRes<[ZnALU, ZnAGU]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteLEAVE], (instregex "LEAVE")>;
+
+// PAUSE.
+def : InstRW<[WriteMicrocoded], (instrs PAUSE)>;
+
+// RDTSC.
+def : InstRW<[WriteMicrocoded], (instregex "RDTSC")>;
+
+// RDPMC.
+def : InstRW<[WriteMicrocoded], (instrs RDPMC)>;
+
+// RDRAND.
+def : InstRW<[WriteMicrocoded], (instrs RDRAND16r, RDRAND32r, RDRAND64r)>;
+
+// XGETBV.
+def : InstRW<[WriteMicrocoded], (instrs XGETBV)>;
+
+//-- String instructions --//
+// CMPS.
+def : InstRW<[WriteMicrocoded], (instregex "CMPS(B|L|Q|W)")>;
+
+// LODSB/W.
+def : InstRW<[WriteMicrocoded], (instregex "LODS(B|W)")>;
+
+// LODSD/Q.
+def : InstRW<[WriteMicrocoded], (instregex "LODS(L|Q)")>;
+
+// MOVS.
+def : InstRW<[WriteMicrocoded], (instregex "MOVS(B|L|Q|W)")>;
+
+// SCAS.
+def : InstRW<[WriteMicrocoded], (instregex "SCAS(B|W|L|Q)")>;
+
+// STOS
+def : InstRW<[WriteMicrocoded], (instregex "STOS(B|L|Q|W)")>;
+
+// XADD.
+def ZnXADD : SchedWriteRes<[ZnALU]>;
+def : InstRW<[ZnXADD], (instregex "XADD(8|16|32|64)rr")>;
+def : InstRW<[WriteMicrocoded], (instregex "XADD(8|16|32|64)rm")>;
+
+//=== Floating Point x87 Instructions ===//
+//-- Move instructions --//
+
+def ZnWriteFLDr : SchedWriteRes<[ZnFPU13]> ;
+
+def ZnWriteSTr: SchedWriteRes<[ZnFPU23]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// LD_F.
+// r.
+def : InstRW<[ZnWriteFLDr], (instrs LD_Frr)>;
+
+// m.
+def ZnWriteLD_F80m : SchedWriteRes<[ZnAGU, ZnFPU13]> {
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteLD_F80m], (instrs LD_F80m)>;
+
+// FBLD.
+def : InstRW<[WriteMicrocoded], (instrs FBLDm)>;
+
+// FST(P).
+// r.
+def : InstRW<[ZnWriteSTr], (instregex "ST_(F|FP)rr")>;
+
+// m80.
+def ZnWriteST_FP80m : SchedWriteRes<[ZnAGU, ZnFPU23]> {
+ let Latency = 5;
+}
+def : InstRW<[ZnWriteST_FP80m], (instrs ST_FP80m)>;
+
+// FBSTP.
+// m80.
+def : InstRW<[WriteMicrocoded], (instrs FBSTPm)>;
+
+def ZnWriteFXCH : SchedWriteRes<[ZnFPU]>;
+
+// FXCH.
+def : InstRW<[ZnWriteFXCH], (instrs XCH_F)>;
+
+// FILD.
+def ZnWriteFILD : SchedWriteRes<[ZnAGU, ZnFPU3]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteFILD], (instregex "ILD_F(16|32|64)m")>;
+
+// FIST(P) FISTTP.
+def ZnWriteFIST : SchedWriteRes<[ZnAGU, ZnFPU23]> {
+ let Latency = 12;
+}
+def : InstRW<[ZnWriteFIST], (instregex "IS(T|TT)_(F|FP)(16|32|64)m")>;
+
+def ZnWriteFPU13 : SchedWriteRes<[ZnAGU, ZnFPU13]> {
+ let Latency = 8;
+}
+
+def ZnWriteFPU3 : SchedWriteRes<[ZnAGU, ZnFPU3]> {
+ let Latency = 11;
+}
+
+// FLDZ.
+def : SchedAlias<WriteFLD0, ZnWriteFPU13>;
+
+// FLD1.
+def : SchedAlias<WriteFLD1, ZnWriteFPU3>;
+
+// FLDPI FLDL2E etc.
+def : SchedAlias<WriteFLDC, ZnWriteFPU3>;
+
+// FNSTSW.
+// AX.
+def : InstRW<[WriteMicrocoded], (instrs FNSTSW16r)>;
+
+// m16.
+def : InstRW<[WriteMicrocoded], (instrs FNSTSWm)>;
+
+// FLDCW.
+def : InstRW<[WriteMicrocoded], (instrs FLDCW16m)>;
+
+// FNSTCW.
+def : InstRW<[WriteMicrocoded], (instrs FNSTCW16m)>;
+
+// FINCSTP FDECSTP.
+def : InstRW<[ZnWriteFPU3], (instrs FINCSTP, FDECSTP)>;
+
+// FFREE.
+def : InstRW<[ZnWriteFPU3], (instregex "FFREE")>;
+
+// FNSAVE.
+def : InstRW<[WriteMicrocoded], (instrs FSAVEm)>;
+
+// FRSTOR.
+def : InstRW<[WriteMicrocoded], (instrs FRSTORm)>;
+
+//-- Arithmetic instructions --//
+
+def ZnWriteFPU3Lat1 : SchedWriteRes<[ZnFPU3]> ;
+
+def ZnWriteFPU0Lat1 : SchedWriteRes<[ZnFPU0]> ;
+
+def ZnWriteFPU0Lat1Ld : SchedWriteRes<[ZnAGU, ZnFPU0]> {
+ let Latency = 8;
+}
+
+// FCHS.
+def : InstRW<[ZnWriteFPU3Lat1], (instregex "CHS_F")>;
+
+// FCOM(P) FUCOM(P).
+// r.
+def : InstRW<[ZnWriteFPU0Lat1], (instregex "COM(P?)_FST0r", "UCOM_F(P?)r")>;
+// m.
+def : InstRW<[ZnWriteFPU0Lat1Ld], (instregex "FCOM(P?)(32|64)m")>;
+
+// FCOMPP FUCOMPP.
+// r.
+def : InstRW<[ZnWriteFPU0Lat1], (instrs FCOMPP, UCOM_FPPr)>;
+
+def ZnWriteFPU02 : SchedWriteRes<[ZnAGU, ZnFPU02]>
+{
+ let Latency = 9;
+}
+
+// FCOMI(P) FUCOMI(P).
+// r.
+def : InstRW<[ZnWriteFPU02], (instrs COM_FIPr, COM_FIr, UCOM_FIPr, UCOM_FIr)>;
+
+def ZnWriteFPU03 : SchedWriteRes<[ZnAGU, ZnFPU03]>
+{
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,3];
+}
+
+// FICOM(P).
+def : InstRW<[ZnWriteFPU03], (instregex "FICOM(P?)(16|32)m")>;
+
+// FTST.
+def : InstRW<[ZnWriteFPU0Lat1], (instregex "TST_F")>;
+
+// FXAM.
+def : InstRW<[ZnWriteFPU3Lat1], (instrs FXAM)>;
+
+// FPREM.
+def : InstRW<[WriteMicrocoded], (instrs FPREM)>;
+
+// FPREM1.
+def : InstRW<[WriteMicrocoded], (instrs FPREM1)>;
+
+// FRNDINT.
+def : InstRW<[WriteMicrocoded], (instrs FRNDINT)>;
+
+// FSCALE.
+def : InstRW<[WriteMicrocoded], (instrs FSCALE)>;
+
+// FXTRACT.
+def : InstRW<[WriteMicrocoded], (instrs FXTRACT)>;
+
+// FNOP.
+def : InstRW<[ZnWriteFPU0Lat1], (instrs FNOP)>;
+
+// WAIT.
+def : InstRW<[ZnWriteFPU0Lat1], (instrs WAIT)>;
+
+// FNCLEX.
+def : InstRW<[WriteMicrocoded], (instrs FNCLEX)>;
+
+// FNINIT.
+def : InstRW<[WriteMicrocoded], (instrs FNINIT)>;
+
+//=== Integer MMX and XMM Instructions ===//
+
+// PACKSSWB/DW.
+// mm <- mm.
+def ZnWriteFPU12 : SchedWriteRes<[ZnFPU12]> ;
+def ZnWriteFPU12Y : SchedWriteRes<[ZnFPU12]> {
+ let NumMicroOps = 2;
+}
+def ZnWriteFPU12m : SchedWriteRes<[ZnAGU, ZnFPU12]> ;
+def ZnWriteFPU12Ym : SchedWriteRes<[ZnAGU, ZnFPU12]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[ZnWriteFPU12], (instrs MMX_PACKSSDWirr,
+ MMX_PACKSSWBirr,
+ MMX_PACKUSWBirr)>;
+def : InstRW<[ZnWriteFPU12m], (instrs MMX_PACKSSDWirm,
+ MMX_PACKSSWBirm,
+ MMX_PACKUSWBirm)>;
+
+// VPMOVSX/ZX BW BD BQ WD WQ DQ.
+// y <- x.
+def : InstRW<[ZnWriteFPU12Y], (instregex "VPMOV(SX|ZX)(BW|BD|BQ|WD|WQ|DQ)Yrr")>;
+def : InstRW<[ZnWriteFPU12Ym], (instregex "VPMOV(SX|ZX)(BW|BD|BQ|WD|WQ|DQ)Yrm")>;
+
+def ZnWriteFPU013 : SchedWriteRes<[ZnFPU013]> ;
+def ZnWriteFPU013Y : SchedWriteRes<[ZnFPU013]> {
+ let Latency = 2;
+}
+def ZnWriteFPU013m : SchedWriteRes<[ZnAGU, ZnFPU013]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def ZnWriteFPU013Ld : SchedWriteRes<[ZnAGU, ZnFPU013]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def ZnWriteFPU013LdY : SchedWriteRes<[ZnAGU, ZnFPU013]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+}
+
+// PBLENDW.
+// x,x,i / v,v,v,i
+def : InstRW<[ZnWriteFPU013], (instregex "(V?)PBLENDWrri")>;
+// ymm
+def : InstRW<[ZnWriteFPU013Y], (instrs VPBLENDWYrri)>;
+
+// x,m,i / v,v,m,i
+def : InstRW<[ZnWriteFPU013Ld], (instregex "(V?)PBLENDWrmi")>;
+// y,m,i
+def : InstRW<[ZnWriteFPU013LdY], (instrs VPBLENDWYrmi)>;
+
+def ZnWriteFPU01 : SchedWriteRes<[ZnFPU01]> ;
+def ZnWriteFPU01Y : SchedWriteRes<[ZnFPU01]> {
+ let NumMicroOps = 2;
+}
+
+// VPBLENDD.
+// v,v,v,i.
+def : InstRW<[ZnWriteFPU01], (instrs VPBLENDDrri)>;
+// ymm
+def : InstRW<[ZnWriteFPU01Y], (instrs VPBLENDDYrri)>;
+
+// v,v,m,i
+def ZnWriteFPU01Op2 : SchedWriteRes<[ZnAGU, ZnFPU01]> {
+ let NumMicroOps = 2;
+ let Latency = 8;
+ let ResourceCycles = [1, 2];
+}
+def ZnWriteFPU01Op2Y : SchedWriteRes<[ZnAGU, ZnFPU01]> {
+ let NumMicroOps = 2;
+ let Latency = 9;
+ let ResourceCycles = [1, 3];
+}
+def : InstRW<[ZnWriteFPU01Op2], (instrs VPBLENDDrmi)>;
+def : InstRW<[ZnWriteFPU01Op2Y], (instrs VPBLENDDYrmi)>;
+
+// MASKMOVQ.
+def : InstRW<[WriteMicrocoded], (instregex "MMX_MASKMOVQ(64)?")>;
+
+// MASKMOVDQU.
+def : InstRW<[WriteMicrocoded], (instregex "(V?)MASKMOVDQU(64)?")>;
+
+// VPMASKMOVD.
+// ymm
+def : InstRW<[WriteMicrocoded],
+ (instregex "VPMASKMOVD(Y?)rm")>;
+// m, v,v.
+def : InstRW<[WriteMicrocoded], (instregex "VPMASKMOV(D|Q)(Y?)mr")>;
+
+// VPBROADCAST B/W.
+// x, m8/16.
+def ZnWriteVPBROADCAST128Ld : SchedWriteRes<[ZnAGU, ZnFPU12]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[ZnWriteVPBROADCAST128Ld],
+ (instregex "VPBROADCAST(B|W)rm")>;
+
+// y, m8/16
+def ZnWriteVPBROADCAST256Ld : SchedWriteRes<[ZnAGU, ZnFPU1]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[ZnWriteVPBROADCAST256Ld],
+ (instregex "VPBROADCAST(B|W)Yrm")>;
+
+// VPGATHER.
+def : InstRW<[WriteMicrocoded], (instregex "VPGATHER(Q|D)(Q|D)(Y?)rm")>;
+
+//-- Arithmetic instructions --//
+
+// HADD, HSUB PS/PD
+// PHADD|PHSUB (S) W/D.
+def : SchedAlias<WritePHAdd, ZnWriteMicrocoded>;
+def : SchedAlias<WritePHAddLd, ZnWriteMicrocoded>;
+def : SchedAlias<WritePHAddX, ZnWriteMicrocoded>;
+def : SchedAlias<WritePHAddXLd, ZnWriteMicrocoded>;
+def : SchedAlias<WritePHAddY, ZnWriteMicrocoded>;
+def : SchedAlias<WritePHAddYLd, ZnWriteMicrocoded>;
+
+// PCMPGTQ.
+def ZnWritePCMPGTQr : SchedWriteRes<[ZnFPU03]>;
+def : InstRW<[ZnWritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>;
+
+// x <- x,m.
+def ZnWritePCMPGTQm : SchedWriteRes<[ZnAGU, ZnFPU03]> {
+ let Latency = 8;
+}
+// ymm.
+def ZnWritePCMPGTQYm : SchedWriteRes<[ZnAGU, ZnFPU03]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,2];
+}
+def : InstRW<[ZnWritePCMPGTQm], (instregex "(V?)PCMPGTQrm")>;
+def : InstRW<[ZnWritePCMPGTQYm], (instrs VPCMPGTQYrm)>;
+
+//-- Logic instructions --//
+
+// PSLL,PSRL,PSRA W/D/Q.
+// x,x / v,v,x.
+def ZnWritePShift : SchedWriteRes<[ZnFPU2]> ;
+def ZnWritePShiftY : SchedWriteRes<[ZnFPU2]> {
+ let Latency = 2;
+}
+
+// PSLL,PSRL DQ.
+def : InstRW<[ZnWritePShift], (instregex "(V?)PS(R|L)LDQri")>;
+def : InstRW<[ZnWritePShiftY], (instregex "(V?)PS(R|L)LDQYri")>;
+
+//=== Floating Point XMM and YMM Instructions ===//
+//-- Move instructions --//
+
+// VPERM2F128.
+def : InstRW<[WriteMicrocoded], (instrs VPERM2F128rr)>;
+def : InstRW<[WriteMicrocoded], (instrs VPERM2F128rm)>;
+
+def ZnWriteBROADCAST : SchedWriteRes<[ZnAGU, ZnFPU13]> {
+ let NumMicroOps = 2;
+ let Latency = 8;
+}
+// VBROADCASTF128.
+def : InstRW<[ZnWriteBROADCAST], (instrs VBROADCASTF128)>;
+
+// EXTRACTPS.
+// r32,x,i.
+def ZnWriteEXTRACTPSr : SchedWriteRes<[ZnFPU12, ZnFPU2]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[ZnWriteEXTRACTPSr], (instregex "(V?)EXTRACTPSrr")>;
+
+def ZnWriteEXTRACTPSm : SchedWriteRes<[ZnAGU,ZnFPU12, ZnFPU2]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [5, 1, 2];
+}
+// m32,x,i.
+def : InstRW<[ZnWriteEXTRACTPSm], (instregex "(V?)EXTRACTPSmr")>;
+
+// VEXTRACTF128.
+// x,y,i.
+def : InstRW<[ZnWriteFPU013], (instrs VEXTRACTF128rr)>;
+
+// m128,y,i.
+def : InstRW<[ZnWriteFPU013m], (instrs VEXTRACTF128mr)>;
+
+def ZnWriteVINSERT128r: SchedWriteRes<[ZnFPU013]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def ZnWriteVINSERT128Ld: SchedWriteRes<[ZnAGU,ZnFPU013]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
+// VINSERTF128.
+// y,y,x,i.
+def : InstRW<[ZnWriteVINSERT128r], (instrs VINSERTF128rr)>;
+def : InstRW<[ZnWriteVINSERT128Ld], (instrs VINSERTF128rm)>;
+
+// VGATHER.
+def : InstRW<[WriteMicrocoded], (instregex "VGATHER(Q|D)(PD|PS)(Y?)rm")>;
+
+//-- Conversion instructions --//
+def ZnWriteCVTPD2PSr: SchedWriteRes<[ZnFPU3]> {
+ let Latency = 4;
+}
+def ZnWriteCVTPD2PSYr: SchedWriteRes<[ZnFPU3]> {
+ let Latency = 5;
+}
+
+// CVTPD2PS.
+// x,x.
+def : SchedAlias<WriteCvtPD2PS, ZnWriteCVTPD2PSr>;
+// y,y.
+def : SchedAlias<WriteCvtPD2PSY, ZnWriteCVTPD2PSYr>;
+// z,z.
+defm : X86WriteResUnsupported<WriteCvtPD2PSZ>;
+
+def ZnWriteCVTPD2PSLd: SchedWriteRes<[ZnAGU,ZnFPU03]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,2];
+}
+// x,m128.
+def : SchedAlias<WriteCvtPD2PSLd, ZnWriteCVTPD2PSLd>;
+
+// x,m256.
+def ZnWriteCVTPD2PSYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
+ let Latency = 11;
+}
+def : SchedAlias<WriteCvtPD2PSYLd, ZnWriteCVTPD2PSYLd>;
+// z,m512
+defm : X86WriteResUnsupported<WriteCvtPD2PSZLd>;
+
+// CVTSD2SS.
+// x,x.
+// Same as WriteCVTPD2PSr
+def : SchedAlias<WriteCvtSD2SS, ZnWriteCVTPD2PSr>;
+
+// x,m64.
+def : SchedAlias<WriteCvtSD2SSLd, ZnWriteCVTPD2PSLd>;
+
+// CVTPS2PD.
+// x,x.
+def ZnWriteCVTPS2PDr : SchedWriteRes<[ZnFPU3]> {
+ let Latency = 3;
+}
+def : SchedAlias<WriteCvtPS2PD, ZnWriteCVTPS2PDr>;
+
+// x,m64.
+// y,m128.
+def ZnWriteCVTPS2PDLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+def : SchedAlias<WriteCvtPS2PDLd, ZnWriteCVTPS2PDLd>;
+def : SchedAlias<WriteCvtPS2PDYLd, ZnWriteCVTPS2PDLd>;
+defm : X86WriteResUnsupported<WriteCvtPS2PDZLd>;
+
+// y,x.
+def ZnWriteVCVTPS2PDY : SchedWriteRes<[ZnFPU3]> {
+ let Latency = 3;
+}
+def : SchedAlias<WriteCvtPS2PDY, ZnWriteVCVTPS2PDY>;
+defm : X86WriteResUnsupported<WriteCvtPS2PDZ>;
+
+// CVTSS2SD.
+// x,x.
+def ZnWriteCVTSS2SDr : SchedWriteRes<[ZnFPU3]> {
+ let Latency = 4;
+}
+def : SchedAlias<WriteCvtSS2SD, ZnWriteCVTSS2SDr>;
+
+// x,m32.
+def ZnWriteCVTSS2SDLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
+def : SchedAlias<WriteCvtSS2SDLd, ZnWriteCVTSS2SDLd>;
+
+def ZnWriteCVTDQ2PDr: SchedWriteRes<[ZnFPU12,ZnFPU3]> {
+ let Latency = 5;
+}
+// CVTDQ2PD.
+// x,x.
+def : InstRW<[ZnWriteCVTDQ2PDr], (instregex "(V)?CVTDQ2PDrr")>;
+
+// Same as xmm
+// y,x.
+def : InstRW<[ZnWriteCVTDQ2PDr], (instrs VCVTDQ2PDYrr)>;
+
+def ZnWriteCVTPD2DQr: SchedWriteRes<[ZnFPU12, ZnFPU3]> {
+ let Latency = 5;
+}
+// CVT(T)PD2DQ.
+// x,x.
+def : InstRW<[ZnWriteCVTDQ2PDr], (instregex "(V?)CVT(T?)PD2DQrr")>;
+
+def ZnWriteCVTPD2DQLd: SchedWriteRes<[ZnAGU,ZnFPU12,ZnFPU3]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+}
+// x,m128.
+def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "(V?)CVT(T?)PD2DQrm")>;
+// same as xmm handling
+// x,y.
+def : InstRW<[ZnWriteCVTPD2DQr], (instregex "VCVT(T?)PD2DQYrr")>;
+// x,m256.
+def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "VCVT(T?)PD2DQYrm")>;
+
+def ZnWriteCVTPS2PIr: SchedWriteRes<[ZnFPU3]> {
+ let Latency = 4;
+}
+// CVT(T)PS2PI.
+// mm,x.
+def : InstRW<[ZnWriteCVTPS2PIr], (instregex "MMX_CVT(T?)PS2PIirr")>;
+
+// CVTPI2PD.
+// x,mm.
+def : InstRW<[ZnWriteCVTPS2PDr], (instrs MMX_CVTPI2PDirr)>;
+
+// CVT(T)PD2PI.
+// mm,x.
+def : InstRW<[ZnWriteCVTPS2PIr], (instregex "MMX_CVT(T?)PD2PIirr")>;
+
+def ZnWriteCVSTSI2SSr: SchedWriteRes<[ZnFPU3]> {
+ let Latency = 5;
+}
+
+// same as CVTPD2DQr
+// CVT(T)SS2SI.
+// r32,x.
+def : InstRW<[ZnWriteCVTPD2DQr], (instregex "(V?)CVT(T?)SS2SI(64)?rr")>;
+// same as CVTPD2DQm
+// r32,m32.
+def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "(V?)CVT(T?)SS2SI(64)?rm")>;
+
+def ZnWriteCVSTSI2SDr: SchedWriteRes<[ZnFPU013, ZnFPU3]> {
+ let Latency = 5;
+}
+// CVTSI2SD.
+// x,r32/64.
+def : InstRW<[ZnWriteCVSTSI2SDr], (instregex "(V?)CVTSI(64)?2SDrr")>;
+
+
+def ZnWriteCVSTSI2SIr: SchedWriteRes<[ZnFPU3, ZnFPU2]> {
+ let Latency = 5;
+}
+def ZnWriteCVSTSI2SILd: SchedWriteRes<[ZnAGU, ZnFPU3, ZnFPU2]> {
+ let Latency = 12;
+}
+// CVTSD2SI.
+// r32/64
+def : InstRW<[ZnWriteCVSTSI2SIr], (instregex "(V?)CVT(T?)SD2SI(64)?rr")>;
+// r32,m32.
+def : InstRW<[ZnWriteCVSTSI2SILd], (instregex "(V?)CVT(T?)SD2SI(64)?rm")>;
+
+// VCVTPS2PH.
+// x,v,i.
+def : SchedAlias<WriteCvtPS2PH, ZnWriteMicrocoded>;
+def : SchedAlias<WriteCvtPS2PHY, ZnWriteMicrocoded>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
+// m,v,i.
+def : SchedAlias<WriteCvtPS2PHSt, ZnWriteMicrocoded>;
+def : SchedAlias<WriteCvtPS2PHYSt, ZnWriteMicrocoded>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
+
+// VCVTPH2PS.
+// v,x.
+def : SchedAlias<WriteCvtPH2PS, ZnWriteMicrocoded>;
+def : SchedAlias<WriteCvtPH2PSY, ZnWriteMicrocoded>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZ>;
+// v,m.
+def : SchedAlias<WriteCvtPH2PSLd, ZnWriteMicrocoded>;
+def : SchedAlias<WriteCvtPH2PSYLd, ZnWriteMicrocoded>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZLd>;
+
+//-- SSE4A instructions --//
+// EXTRQ
+def ZnWriteEXTRQ: SchedWriteRes<[ZnFPU12, ZnFPU2]> {
+ let Latency = 2;
+}
+def : InstRW<[ZnWriteEXTRQ], (instregex "EXTRQ")>;
+
+// INSERTQ
+def ZnWriteINSERTQ: SchedWriteRes<[ZnFPU03,ZnFPU1]> {
+ let Latency = 4;
+}
+def : InstRW<[ZnWriteINSERTQ], (instregex "INSERTQ")>;
+
+//-- SHA instructions --//
+// SHA256MSG2
+def : InstRW<[WriteMicrocoded], (instregex "SHA256MSG2(Y?)r(r|m)")>;
+
+// SHA1MSG1, SHA256MSG1
+// x,x.
+def ZnWriteSHA1MSG1r : SchedWriteRes<[ZnFPU12]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : InstRW<[ZnWriteSHA1MSG1r], (instregex "SHA(1|256)MSG1rr")>;
+// x,m.
+def ZnWriteSHA1MSG1Ld : SchedWriteRes<[ZnAGU, ZnFPU12]> {
+ let Latency = 9;
+ let ResourceCycles = [1,2];
+}
+def : InstRW<[ZnWriteSHA1MSG1Ld], (instregex "SHA(1|256)MSG1rm")>;
+
+// SHA1MSG2
+// x,x.
+def ZnWriteSHA1MSG2r : SchedWriteRes<[ZnFPU12]> ;
+def : InstRW<[ZnWriteSHA1MSG2r], (instrs SHA1MSG2rr)>;
+// x,m.
+def ZnWriteSHA1MSG2Ld : SchedWriteRes<[ZnAGU, ZnFPU12]> {
+ let Latency = 8;
+}
+def : InstRW<[ZnWriteSHA1MSG2Ld], (instrs SHA1MSG2rm)>;
+
+// SHA1NEXTE
+// x,x.
+def ZnWriteSHA1NEXTEr : SchedWriteRes<[ZnFPU1]> ;
+def : InstRW<[ZnWriteSHA1NEXTEr], (instrs SHA1NEXTErr)>;
+// x,m.
+def ZnWriteSHA1NEXTELd : SchedWriteRes<[ZnAGU, ZnFPU1]> {
+ let Latency = 8;
+}
+def : InstRW<[ZnWriteSHA1NEXTELd], (instrs SHA1NEXTErm)>;
+
+// SHA1RNDS4
+// x,x.
+def ZnWriteSHA1RNDS4r : SchedWriteRes<[ZnFPU1]> {
+ let Latency = 6;
+}
+def : InstRW<[ZnWriteSHA1RNDS4r], (instrs SHA1RNDS4rri)>;
+// x,m.
+def ZnWriteSHA1RNDS4Ld : SchedWriteRes<[ZnAGU, ZnFPU1]> {
+ let Latency = 13;
+}
+def : InstRW<[ZnWriteSHA1RNDS4Ld], (instrs SHA1RNDS4rmi)>;
+
+// SHA256RNDS2
+// x,x.
+def ZnWriteSHA256RNDS2r : SchedWriteRes<[ZnFPU1]> {
+ let Latency = 4;
+}
+def : InstRW<[ZnWriteSHA256RNDS2r], (instrs SHA256RNDS2rr)>;
+// x,m.
+def ZnWriteSHA256RNDS2Ld : SchedWriteRes<[ZnAGU, ZnFPU1]> {
+ let Latency = 11;
+}
+def : InstRW<[ZnWriteSHA256RNDS2Ld], (instrs SHA256RNDS2rm)>;
+
+//-- Arithmetic instructions --//
+
+// HADD, HSUB PS/PD
+def : SchedAlias<WriteFHAdd, ZnWriteMicrocoded>;
+def : SchedAlias<WriteFHAddLd, ZnWriteMicrocoded>;
+def : SchedAlias<WriteFHAddY, ZnWriteMicrocoded>;
+def : SchedAlias<WriteFHAddYLd, ZnWriteMicrocoded>;
+
+// VDIVPS.
+// TODO - convert to ZnWriteResFpuPair
+// y,y,y.
+def ZnWriteVDIVPSYr : SchedWriteRes<[ZnFPU3]> {
+ let Latency = 12;
+ let ResourceCycles = [12];
+}
+def : SchedAlias<WriteFDivY, ZnWriteVDIVPSYr>;
+
+// y,y,m256.
+def ZnWriteVDIVPSYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
+ let Latency = 19;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 19];
+}
+def : SchedAlias<WriteFDivYLd, ZnWriteVDIVPSYLd>;
+
+// VDIVPD.
+// TODO - convert to ZnWriteResFpuPair
+// y,y,y.
+def ZnWriteVDIVPDY : SchedWriteRes<[ZnFPU3]> {
+ let Latency = 15;
+ let ResourceCycles = [15];
+}
+def : SchedAlias<WriteFDiv64Y, ZnWriteVDIVPDY>;
+
+// y,y,m256.
+def ZnWriteVDIVPDYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
+ let Latency = 22;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,22];
+}
+def : SchedAlias<WriteFDiv64YLd, ZnWriteVDIVPDYLd>;
+
+// DPPS.
+// x,x,i / v,v,v,i.
+def : SchedAlias<WriteDPPS, ZnWriteMicrocoded>;
+def : SchedAlias<WriteDPPSY, ZnWriteMicrocoded>;
+
+// x,m,i / v,v,m,i.
+def : SchedAlias<WriteDPPSLd, ZnWriteMicrocoded>;
+def : SchedAlias<WriteDPPSYLd,ZnWriteMicrocoded>;
+
+// DPPD.
+// x,x,i.
+def : SchedAlias<WriteDPPD, ZnWriteMicrocoded>;
+
+// x,m,i.
+def : SchedAlias<WriteDPPDLd, ZnWriteMicrocoded>;
+
+// RSQRTSS
+// TODO - convert to ZnWriteResFpuPair
+// x,x.
+def ZnWriteRSQRTSSr : SchedWriteRes<[ZnFPU02]> {
+ let Latency = 5;
+}
+def : SchedAlias<WriteFRsqrt, ZnWriteRSQRTSSr>;
+
+// x,m128.
+def ZnWriteRSQRTSSLd: SchedWriteRes<[ZnAGU, ZnFPU02]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,2]; // FIXME: Is this right?
+}
+def : SchedAlias<WriteFRsqrtLd, ZnWriteRSQRTSSLd>;
+
+// RSQRTPS
+// TODO - convert to ZnWriteResFpuPair
+// y,y.
+def ZnWriteRSQRTPSYr : SchedWriteRes<[ZnFPU01]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def : SchedAlias<WriteFRsqrtY, ZnWriteRSQRTPSYr>;
+
+// y,m256.
+def ZnWriteRSQRTPSYLd : SchedWriteRes<[ZnAGU, ZnFPU01]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+}
+def : SchedAlias<WriteFRsqrtYLd, ZnWriteRSQRTPSYLd>;
+
+//-- Other instructions --//
+
+// VZEROUPPER.
+def : InstRW<[WriteMicrocoded], (instrs VZEROUPPER)>;
+
+// VZEROALL.
+def : InstRW<[WriteMicrocoded], (instrs VZEROALL)>;
+
+} // SchedModel
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver2.td b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver2.td
new file mode 100644
index 000000000000..48da0d6329b1
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver2.td
@@ -0,0 +1,1548 @@
+//=- X86ScheduleZnver2.td - X86 Znver2 Scheduling -------------*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Znver2 to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def Znver2Model : SchedMachineModel {
+ // Zen can decode 4 instructions per cycle.
+ let IssueWidth = 4;
+ // MicroOpBufferSize is based on the size of the reorder buffer.
+ let MicroOpBufferSize = 224;
+ let LoadLatency = 4;
+ let MispredictPenalty = 17;
+ let HighLatency = 25;
+ let PostRAScheduler = 1;
+
+ // FIXME: This flag is required while the model is incomplete.
+ // Not all instructions have been catered for yet, so we clear it
+ // to mark the model as incomplete.
+ let CompleteModel = 0;
+}
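+// Roughly, these top-level values feed the generic machine scheduler and
+// tools such as llvm-mca: IssueWidth bounds how many micro-ops are dispatched
+// per cycle, and LoadLatency is the latency assumed for loads that are not
+// covered by a more specific write below.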
+
+let SchedModel = Znver2Model in {
+
+// Zen 2 can issue micro-ops to 11 different units in one cycle.
+// These are:
+// * Four integer ALU units (Zn2ALU0, Zn2ALU1, Zn2ALU2, Zn2ALU3)
+// * Three AGU units (Zn2AGU0, Zn2AGU1, Zn2AGU2)
+// * Four FPU units (Zn2FPU0, Zn2FPU1, Zn2FPU2, Zn2FPU3)
+// The AGUs feed the load/store queues, which sustain two loads and one store
+// per cycle.
+
+// Four ALU units are defined below
+def Zn2ALU0 : ProcResource<1>;
+def Zn2ALU1 : ProcResource<1>;
+def Zn2ALU2 : ProcResource<1>;
+def Zn2ALU3 : ProcResource<1>;
+
+// Three AGU units are defined below
+def Zn2AGU0 : ProcResource<1>;
+def Zn2AGU1 : ProcResource<1>;
+def Zn2AGU2 : ProcResource<1>;
+
+// Four FPU units are defined below
+def Zn2FPU0 : ProcResource<1>;
+def Zn2FPU1 : ProcResource<1>;
+def Zn2FPU2 : ProcResource<1>;
+def Zn2FPU3 : ProcResource<1>;
+
+// FPU grouping
+def Zn2FPU013 : ProcResGroup<[Zn2FPU0, Zn2FPU1, Zn2FPU3]>;
+def Zn2FPU01 : ProcResGroup<[Zn2FPU0, Zn2FPU1]>;
+def Zn2FPU12 : ProcResGroup<[Zn2FPU1, Zn2FPU2]>;
+def Zn2FPU13 : ProcResGroup<[Zn2FPU1, Zn2FPU3]>;
+def Zn2FPU23 : ProcResGroup<[Zn2FPU2, Zn2FPU3]>;
+def Zn2FPU02 : ProcResGroup<[Zn2FPU0, Zn2FPU2]>;
+def Zn2FPU03 : ProcResGroup<[Zn2FPU0, Zn2FPU3]>;
+
+// Below are the groupings of the units.
+// Micro-ops that can be issued to more than one unit are modeled this way.
+
+// ALU grouping
+// Zn2ALU03 - 0,3 grouping
+def Zn2ALU03: ProcResGroup<[Zn2ALU0, Zn2ALU3]>;
+
+// 64 Entry (16x4 entries) Int Scheduler
+def Zn2ALU : ProcResGroup<[Zn2ALU0, Zn2ALU1, Zn2ALU2, Zn2ALU3]> {
+ let BufferSize=64;
+}
+
+// 28 Entry (14x2) AGU group. AGUs can't be used for all ALU operations
+// but are relevant for some instructions
+def Zn2AGU : ProcResGroup<[Zn2AGU0, Zn2AGU1, Zn2AGU2]> {
+ let BufferSize=28;
+}
+
+// Integer Multiplication issued on ALU1.
+def Zn2Multiplier : ProcResource<1>;
+
+// Integer division issued on ALU2.
+def Zn2Divider : ProcResource<1>;
+
+// The 4-cycle load-to-use latency is captured below.
+def : ReadAdvance<ReadAfterLd, 4>;
+
+// The 7-cycle vector load-to-use latency is captured below.
+def : ReadAdvance<ReadAfterVecLd, 7>;
+def : ReadAdvance<ReadAfterVecXLd, 7>;
+def : ReadAdvance<ReadAfterVecYLd, 7>;
+
+def : ReadAdvance<ReadInt2Fpu, 0>;
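+// A ReadAdvance of N models an operand that is not read until N cycles after
+// the consumer issues, so the producer of that operand appears up to N cycles
+// faster. For folded-load ops (see the ADDPD xmm, [mem] example below), the
+// register operands are ReadAfterVecLd reads and therefore need not be ready
+// until the 7-cycle load portion has completed.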
+
+// The Integer PRF for Zen is 168 entries, and it holds the architectural and
+// speculative versions of the 64-bit integer registers.
+// Reference: "Software Optimization Guide for AMD Family 17h Processors"
+def Zn2IntegerPRF : RegisterFile<168, [GR64, CCR]>;
+
+// 36 Entry (9x4 entries) floating-point Scheduler
+def Zn2FPU : ProcResGroup<[Zn2FPU0, Zn2FPU1, Zn2FPU2, Zn2FPU3]> {
+ let BufferSize=36;
+}
+
+// The Zen FP Retire Queue renames SIMD and FP uOps onto a pool of 160 128-bit
+// registers. Operations on 256-bit data types are cracked into two COPs.
+// Reference: "Software Optimization Guide for AMD Family 17h Processors"
+def Zn2FpuPRF: RegisterFile<160, [VR64, VR128, VR256], [1, 1, 2]>;
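+// In the cost list [1, 1, 2] above, a VR256 rename therefore consumes two of
+// the 160 physical registers, while VR64 and VR128 renames consume one each.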
+
+// The unit can track up to 192 macro ops in-flight.
+// The retire unit handles in-order commit of up to 8 macro ops per cycle.
+// Reference: "Software Optimization Guide for AMD Family 17h Processors"
+// Note that the retire unit is shared between integer and FP ops.
+// In SMT mode each thread gets 96 entries, but we do not use that conservative
+// value here because there is currently no way to fully model SMT mode,
+// so there is no point in trying.
+def Zn2RCU : RetireControlUnit<192, 8>;
+
+// A folded load is an instruction that loads and then performs some operation.
+// Ex: ADDPD xmm, [mem] -> this instruction has two micro-ops:
+//  a. the load, and
+//  b. the addpd.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as these two micro-ops.
+// This multiclass is for folded loads on the integer units.
+multiclass Zn2WriteResPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [], int UOps = 1,
+ int LoadLat = 4, int LoadUOps = 1> {
+ // Register variant takes 1-cycle on Execution Port.
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+
+ // The memory variant also uses a cycle on Zn2AGU and
+ // adds LoadLat cycles to the latency (default = 4).
+ def : WriteRes<SchedRW.Folded, !listconcat([Zn2AGU], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
+ let NumMicroOps = !add(UOps, LoadUOps);
+ }
+}
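+// For example, a pair such as
+//   defm : Zn2WriteResPair<WriteALU, [Zn2ALU], 1>;
+// expands to approximately
+//   def : WriteRes<WriteALU,   [Zn2ALU]>         { let Latency = 1; }
+//   def : WriteRes<WriteALULd, [Zn2AGU, Zn2ALU]> { let Latency = 5;
+//                                                  let NumMicroOps = 2; }
+// i.e. the folded-load form adds one Zn2AGU cycle and the 4-cycle LoadLat.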
+
+// This multiclass is for folded loads for floating point units.
+multiclass Zn2WriteResFpuPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [], int UOps = 1,
+ int LoadLat = 7, int LoadUOps = 0> {
+ // Register variant takes 1-cycle on Execution Port.
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+
+ // The memory variant also uses a cycle on Zn2AGU and
+ // adds LoadLat cycles to the latency (default = 7).
+ def : WriteRes<SchedRW.Folded, !listconcat([Zn2AGU], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
+ let NumMicroOps = !add(UOps, LoadUOps);
+ }
+}
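+// Likewise, defm : Zn2WriteResFpuPair<WriteFAdd, [Zn2FPU0], 3>; below yields
+// a 3-cycle WriteFAdd on Zn2FPU0 and a 10-cycle WriteFAddLd on
+// [Zn2AGU, Zn2FPU0]; with the default LoadUOps = 0 the folded form keeps
+// NumMicroOps = 1, reflecting the micro-fused load.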
+
+// WriteRMW is set for instructions with a memory write operation in codegen.
+def : WriteRes<WriteRMW, [Zn2AGU]>;
+
+def : WriteRes<WriteStore, [Zn2AGU]>;
+def : WriteRes<WriteStoreNT, [Zn2AGU]>;
+def : WriteRes<WriteMove, [Zn2ALU]>;
+def : WriteRes<WriteLoad, [Zn2AGU]> { let Latency = 8; }
+
+def : WriteRes<WriteZero, []>;
+def : WriteRes<WriteLEA, [Zn2ALU]>;
+defm : Zn2WriteResPair<WriteALU, [Zn2ALU], 1>;
+defm : Zn2WriteResPair<WriteADC, [Zn2ALU], 1>;
+
+defm : Zn2WriteResPair<WriteIMul8, [Zn2ALU1, Zn2Multiplier], 4>;
+
+defm : X86WriteRes<WriteBSWAP32, [Zn2ALU], 1, [4], 1>;
+defm : X86WriteRes<WriteBSWAP64, [Zn2ALU], 1, [4], 1>;
+defm : X86WriteRes<WriteCMPXCHG, [Zn2ALU], 3, [1], 1>;
+defm : X86WriteRes<WriteCMPXCHGRMW,[Zn2ALU,Zn2AGU], 8, [1,1], 5>;
+defm : X86WriteRes<WriteXCHG, [Zn2ALU], 1, [2], 2>;
+
+defm : Zn2WriteResPair<WriteShift, [Zn2ALU], 1>;
+defm : Zn2WriteResPair<WriteShiftCL, [Zn2ALU], 1>;
+defm : Zn2WriteResPair<WriteRotate, [Zn2ALU], 1>;
+defm : Zn2WriteResPair<WriteRotateCL, [Zn2ALU], 1>;
+
+defm : X86WriteRes<WriteSHDrri, [Zn2ALU], 1, [1], 1>;
+defm : X86WriteResUnsupported<WriteSHDrrcl>;
+defm : X86WriteResUnsupported<WriteSHDmri>;
+defm : X86WriteResUnsupported<WriteSHDmrcl>;
+
+defm : Zn2WriteResPair<WriteJump, [Zn2ALU], 1>;
+defm : Zn2WriteResFpuPair<WriteCRC32, [Zn2FPU0], 3>;
+
+defm : Zn2WriteResPair<WriteCMOV, [Zn2ALU], 1>;
+def : WriteRes<WriteSETCC, [Zn2ALU]>;
+def : WriteRes<WriteSETCCStore, [Zn2ALU, Zn2AGU]>;
+defm : X86WriteRes<WriteLAHFSAHF, [Zn2ALU], 2, [1], 2>;
+
+defm : X86WriteRes<WriteBitTest, [Zn2ALU], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTestImmLd, [Zn2ALU,Zn2AGU], 5, [1,1], 2>;
+defm : X86WriteRes<WriteBitTestRegLd, [Zn2ALU,Zn2AGU], 5, [1,1], 2>;
+defm : X86WriteRes<WriteBitTestSet, [Zn2ALU], 2, [1], 2>;
+
+// Bit counts.
+defm : Zn2WriteResPair<WriteBSF, [Zn2ALU], 3>;
+defm : Zn2WriteResPair<WriteBSR, [Zn2ALU], 4>;
+defm : Zn2WriteResPair<WriteLZCNT, [Zn2ALU], 1>;
+defm : Zn2WriteResPair<WriteTZCNT, [Zn2ALU], 2>;
+defm : Zn2WriteResPair<WritePOPCNT, [Zn2ALU], 1>;
+
+// Treat misc copies as a move.
+def : InstRW<[WriteMove], (instrs COPY)>;
+
+// BMI1 BEXTR, BMI2 BZHI
+defm : Zn2WriteResPair<WriteBEXTR, [Zn2ALU], 1>;
+defm : Zn2WriteResPair<WriteBZHI, [Zn2ALU], 1>;
+
+// IDIV
+defm : Zn2WriteResPair<WriteDiv8, [Zn2ALU2, Zn2Divider], 15, [1,15], 1>;
+defm : Zn2WriteResPair<WriteDiv16, [Zn2ALU2, Zn2Divider], 17, [1,17], 2>;
+defm : Zn2WriteResPair<WriteDiv32, [Zn2ALU2, Zn2Divider], 25, [1,25], 2>;
+defm : Zn2WriteResPair<WriteDiv64, [Zn2ALU2, Zn2Divider], 41, [1,41], 2>;
+defm : Zn2WriteResPair<WriteIDiv8, [Zn2ALU2, Zn2Divider], 15, [1,15], 1>;
+defm : Zn2WriteResPair<WriteIDiv16, [Zn2ALU2, Zn2Divider], 17, [1,17], 2>;
+defm : Zn2WriteResPair<WriteIDiv32, [Zn2ALU2, Zn2Divider], 25, [1,25], 2>;
+defm : Zn2WriteResPair<WriteIDiv64, [Zn2ALU2, Zn2Divider], 41, [1,41], 2>;
+
+// IMULH
+def : WriteRes<WriteIMulH, [Zn2ALU1, Zn2Multiplier]>{
+ let Latency = 4;
+}
+
+// Floating point operations
+defm : X86WriteRes<WriteFLoad, [Zn2AGU], 8, [1], 1>;
+defm : X86WriteRes<WriteFLoadX, [Zn2AGU], 8, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [Zn2AGU], 8, [1], 1>;
+defm : X86WriteRes<WriteFMaskedLoad, [Zn2AGU,Zn2FPU01], 8, [1,1], 1>;
+defm : X86WriteRes<WriteFMaskedLoadY, [Zn2AGU,Zn2FPU01], 8, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore32, [Zn2AGU,Zn2FPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [Zn2AGU,Zn2FPU01], 5, [1,2], 2>;
+defm : X86WriteRes<WriteFMaskedStore64, [Zn2AGU,Zn2FPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [Zn2AGU,Zn2FPU01], 5, [1,2], 2>;
+
+defm : X86WriteRes<WriteFStore, [Zn2AGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFStoreX, [Zn2AGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFStoreY, [Zn2AGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFStoreNT, [Zn2AGU,Zn2FPU2], 8, [1,1], 1>;
+defm : X86WriteRes<WriteFStoreNTX, [Zn2AGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFStoreNTY, [Zn2AGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFMove, [Zn2FPU], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveX, [Zn2FPU], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveY, [Zn2FPU], 1, [1], 1>;
+
+defm : Zn2WriteResFpuPair<WriteFAdd, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFAddX, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFAddY, [Zn2FPU0], 3>;
+defm : X86WriteResPairUnsupported<WriteFAddZ>;
+defm : Zn2WriteResFpuPair<WriteFAdd64, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFAdd64X, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFAdd64Y, [Zn2FPU0], 3>;
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
+defm : Zn2WriteResFpuPair<WriteFCmp, [Zn2FPU0], 1>;
+defm : Zn2WriteResFpuPair<WriteFCmpX, [Zn2FPU0], 1>;
+defm : Zn2WriteResFpuPair<WriteFCmpY, [Zn2FPU0], 1>;
+defm : X86WriteResPairUnsupported<WriteFCmpZ>;
+defm : Zn2WriteResFpuPair<WriteFCmp64, [Zn2FPU0], 1>;
+defm : Zn2WriteResFpuPair<WriteFCmp64X, [Zn2FPU0], 1>;
+defm : Zn2WriteResFpuPair<WriteFCmp64Y, [Zn2FPU0], 1>;
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
+defm : Zn2WriteResFpuPair<WriteFCom, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFComX, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFBlend, [Zn2FPU01], 1>;
+defm : Zn2WriteResFpuPair<WriteFBlendY, [Zn2FPU01], 1>;
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+defm : Zn2WriteResFpuPair<WriteFVarBlend, [Zn2FPU01], 1>;
+defm : Zn2WriteResFpuPair<WriteFVarBlendY,[Zn2FPU01], 1>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
+defm : Zn2WriteResFpuPair<WriteVarBlend, [Zn2FPU0], 1>;
+defm : Zn2WriteResFpuPair<WriteVarBlendY, [Zn2FPU0], 1>;
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
+defm : Zn2WriteResFpuPair<WriteCvtSS2I, [Zn2FPU3], 5>;
+defm : Zn2WriteResFpuPair<WriteCvtPS2I, [Zn2FPU3], 5>;
+defm : Zn2WriteResFpuPair<WriteCvtPS2IY, [Zn2FPU3], 5>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+defm : Zn2WriteResFpuPair<WriteCvtSD2I, [Zn2FPU3], 5>;
+defm : Zn2WriteResFpuPair<WriteCvtPD2I, [Zn2FPU3], 5>;
+defm : Zn2WriteResFpuPair<WriteCvtPD2IY, [Zn2FPU3], 5>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+defm : Zn2WriteResFpuPair<WriteCvtI2SS, [Zn2FPU3], 5>;
+defm : Zn2WriteResFpuPair<WriteCvtI2PS, [Zn2FPU3], 5>;
+defm : Zn2WriteResFpuPair<WriteCvtI2PSY, [Zn2FPU3], 5>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+defm : Zn2WriteResFpuPair<WriteCvtI2SD, [Zn2FPU3], 5>;
+defm : Zn2WriteResFpuPair<WriteCvtI2PD, [Zn2FPU3], 5>;
+defm : Zn2WriteResFpuPair<WriteCvtI2PDY, [Zn2FPU3], 5>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+defm : Zn2WriteResFpuPair<WriteFDiv, [Zn2FPU3], 15>;
+defm : Zn2WriteResFpuPair<WriteFDivX, [Zn2FPU3], 15>;
+defm : X86WriteResPairUnsupported<WriteFDivZ>;
+defm : Zn2WriteResFpuPair<WriteFDiv64, [Zn2FPU3], 15>;
+defm : Zn2WriteResFpuPair<WriteFDiv64X, [Zn2FPU3], 15>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+defm : Zn2WriteResFpuPair<WriteFSign, [Zn2FPU3], 2>;
+defm : Zn2WriteResFpuPair<WriteFRnd, [Zn2FPU3], 3, [1], 1, 7, 0>;
+defm : Zn2WriteResFpuPair<WriteFRndY, [Zn2FPU3], 3, [1], 1, 7, 0>;
+defm : X86WriteResPairUnsupported<WriteFRndZ>;
+defm : Zn2WriteResFpuPair<WriteFLogic, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteFLogicY, [Zn2FPU], 1>;
+defm : X86WriteResPairUnsupported<WriteFLogicZ>;
+defm : Zn2WriteResFpuPair<WriteFTest, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteFTestY, [Zn2FPU], 1>;
+defm : X86WriteResPairUnsupported<WriteFTestZ>;
+defm : Zn2WriteResFpuPair<WriteFShuffle, [Zn2FPU12], 1>;
+defm : Zn2WriteResFpuPair<WriteFShuffleY, [Zn2FPU12], 1>;
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
+defm : Zn2WriteResFpuPair<WriteFVarShuffle, [Zn2FPU12], 3>;
+defm : Zn2WriteResFpuPair<WriteFVarShuffleY,[Zn2FPU12], 3>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
+defm : Zn2WriteResFpuPair<WriteFMul, [Zn2FPU01], 3, [1], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WriteFMulX, [Zn2FPU01], 3, [1], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WriteFMulY, [Zn2FPU01], 3, [1], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteFMulZ>;
+defm : Zn2WriteResFpuPair<WriteFMul64, [Zn2FPU01], 3, [1], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WriteFMul64X, [Zn2FPU01], 3, [1], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WriteFMul64Y, [Zn2FPU01], 3, [1], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+defm : Zn2WriteResFpuPair<WriteFMA, [Zn2FPU03], 5>;
+defm : Zn2WriteResFpuPair<WriteFMAX, [Zn2FPU03], 5>;
+defm : Zn2WriteResFpuPair<WriteFMAY, [Zn2FPU03], 5>;
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
+defm : Zn2WriteResFpuPair<WriteFRcp, [Zn2FPU01], 5>;
+defm : Zn2WriteResFpuPair<WriteFRcpX, [Zn2FPU01], 5>;
+defm : Zn2WriteResFpuPair<WriteFRcpY, [Zn2FPU01], 5, [1], 1, 7, 2>;
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+defm : Zn2WriteResFpuPair<WriteFRsqrtX, [Zn2FPU01], 5, [1], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+defm : Zn2WriteResFpuPair<WriteFSqrt, [Zn2FPU3], 20, [20]>;
+defm : Zn2WriteResFpuPair<WriteFSqrtX, [Zn2FPU3], 20, [20]>;
+defm : Zn2WriteResFpuPair<WriteFSqrtY, [Zn2FPU3], 28, [28], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
+defm : Zn2WriteResFpuPair<WriteFSqrt64, [Zn2FPU3], 20, [20]>;
+defm : Zn2WriteResFpuPair<WriteFSqrt64X, [Zn2FPU3], 20, [20]>;
+defm : Zn2WriteResFpuPair<WriteFSqrt64Y, [Zn2FPU3], 20, [20], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
+defm : Zn2WriteResFpuPair<WriteFSqrt80, [Zn2FPU3], 20, [20]>;
+
+// Vector integer operations which use the FPU units.
+defm : X86WriteRes<WriteVecLoad, [Zn2AGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [Zn2AGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [Zn2AGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT, [Zn2AGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [Zn2AGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [Zn2AGU,Zn2FPU01], 8, [1,2], 2>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [Zn2AGU,Zn2FPU01], 8, [1,2], 2>;
+defm : X86WriteRes<WriteVecStore, [Zn2AGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecStoreX, [Zn2AGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecStoreY, [Zn2AGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecStoreNT, [Zn2AGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecStoreNTY, [Zn2AGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedStore32, [Zn2AGU,Zn2FPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteVecMaskedStore32Y, [Zn2AGU,Zn2FPU01], 5, [1,2], 2>;
+defm : X86WriteRes<WriteVecMaskedStore64, [Zn2AGU,Zn2FPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteVecMaskedStore64Y, [Zn2AGU,Zn2FPU01], 5, [1,2], 2>;
+defm : X86WriteRes<WriteVecMove, [Zn2FPU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [Zn2FPU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [Zn2FPU], 2, [1], 2>;
+defm : X86WriteRes<WriteVecMoveToGpr, [Zn2FPU2], 2, [1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [Zn2FPU2], 3, [1], 1>;
+defm : X86WriteRes<WriteEMMS, [Zn2FPU], 2, [1], 1>;
+
+defm : Zn2WriteResFpuPair<WriteVecShift, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteVecShiftX, [Zn2FPU2], 1>;
+defm : Zn2WriteResFpuPair<WriteVecShiftY, [Zn2FPU2], 1>;
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
+defm : Zn2WriteResFpuPair<WriteVecShiftImm, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteVecShiftImmX, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteVecShiftImmY, [Zn2FPU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+defm : Zn2WriteResFpuPair<WriteVecLogic, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteVecLogicX, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteVecLogicY, [Zn2FPU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
+defm : Zn2WriteResFpuPair<WriteVecTest, [Zn2FPU12], 1, [2], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WriteVecTestY, [Zn2FPU12], 1, [2], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+defm : Zn2WriteResFpuPair<WriteVecALU, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteVecALUX, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteVecALUY, [Zn2FPU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
+defm : Zn2WriteResFpuPair<WriteVecIMul, [Zn2FPU0], 4>;
+defm : Zn2WriteResFpuPair<WriteVecIMulX, [Zn2FPU0], 4>;
+defm : Zn2WriteResFpuPair<WriteVecIMulY, [Zn2FPU0], 4>;
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
+defm : Zn2WriteResFpuPair<WritePMULLD, [Zn2FPU0], 4, [1], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WritePMULLDY, [Zn2FPU0], 4, [1], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WritePMULLDZ>;
+defm : Zn2WriteResFpuPair<WriteShuffle, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteShuffleX, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteShuffleY, [Zn2FPU], 1>;
+defm : X86WriteResPairUnsupported<WriteShuffleZ>;
+defm : Zn2WriteResFpuPair<WriteVarShuffle, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteVarShuffleX,[Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteVarShuffleY,[Zn2FPU], 1>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+defm : Zn2WriteResFpuPair<WriteBlend, [Zn2FPU01], 1>;
+defm : Zn2WriteResFpuPair<WriteBlendY, [Zn2FPU01], 1>;
+defm : X86WriteResPairUnsupported<WriteBlendZ>;
+defm : Zn2WriteResFpuPair<WriteShuffle256, [Zn2FPU], 2>;
+defm : Zn2WriteResFpuPair<WriteVarShuffle256, [Zn2FPU], 2>;
+defm : Zn2WriteResFpuPair<WritePSADBW, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WritePSADBWX, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WritePSADBWY, [Zn2FPU0], 3>;
+defm : X86WriteResPairUnsupported<WritePSADBWZ>;
+defm : Zn2WriteResFpuPair<WritePHMINPOS, [Zn2FPU0], 4>;
+
+// Vector Shift Operations
+defm : Zn2WriteResFpuPair<WriteVarVecShift, [Zn2FPU12], 3>;
+defm : Zn2WriteResFpuPair<WriteVarVecShiftY, [Zn2FPU12], 3>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+
+// Vector insert/extract operations.
+defm : Zn2WriteResFpuPair<WriteVecInsert, [Zn2FPU], 1>;
+
+def : WriteRes<WriteVecExtract, [Zn2FPU12, Zn2FPU2]> {
+ let Latency = 2;
+ let ResourceCycles = [1, 2];
+}
+def : WriteRes<WriteVecExtractSt, [Zn2AGU, Zn2FPU12, Zn2FPU2]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2, 3];
+}
+
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [Zn2FPU2]>;
+def : WriteRes<WriteMMXMOVMSK, [Zn2FPU2]>;
+def : WriteRes<WriteVecMOVMSK, [Zn2FPU2]>;
+
+def : WriteRes<WriteVecMOVMSKY, [Zn2FPU2]> {
+ let NumMicroOps = 2;
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+
+// AES Instructions.
+defm : Zn2WriteResFpuPair<WriteAESDecEnc, [Zn2FPU01], 4>;
+defm : Zn2WriteResFpuPair<WriteAESIMC, [Zn2FPU01], 4>;
+defm : Zn2WriteResFpuPair<WriteAESKeyGen, [Zn2FPU01], 4>;
+
+def : WriteRes<WriteFence, [Zn2AGU]>;
+def : WriteRes<WriteNop, []>;
+
+// The following instructions are microcoded and given a latency of 100.
+// We set a long latency so as to effectively block the entire pipeline.
+defm : Zn2WriteResFpuPair<WriteFShuffle256, [Zn2FPU], 100>;
+defm : Zn2WriteResFpuPair<WriteFVarShuffle256, [Zn2FPU], 100>;
+
+// Microcoded Instructions
+def Zn2WriteMicrocoded : SchedWriteRes<[]> {
+ let Latency = 100;
+}
+defm : Zn2WriteResPair<WriteDPPS, [], 15>;
+defm : Zn2WriteResPair<WriteFHAdd, [], 7>;
+defm : Zn2WriteResPair<WriteFHAddY, [], 7>;
+defm : Zn2WriteResPair<WritePHAdd, [], 3>;
+defm : Zn2WriteResPair<WritePHAddX, [], 3>;
+defm : Zn2WriteResPair<WritePHAddY, [], 3>;
+
+def : SchedAlias<WriteMicrocoded, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteFCMOV, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteSystem, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteMPSAD, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteMPSADY, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteMPSADLd, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteMPSADYLd, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteCLMul, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteCLMulLd, Zn2WriteMicrocoded>;
+def : SchedAlias<WritePCmpIStrM, Zn2WriteMicrocoded>;
+def : SchedAlias<WritePCmpIStrMLd, Zn2WriteMicrocoded>;
+def : SchedAlias<WritePCmpEStrI, Zn2WriteMicrocoded>;
+def : SchedAlias<WritePCmpEStrILd, Zn2WriteMicrocoded>;
+def : SchedAlias<WritePCmpEStrM, Zn2WriteMicrocoded>;
+def : SchedAlias<WritePCmpEStrMLd, Zn2WriteMicrocoded>;
+def : SchedAlias<WritePCmpIStrI, Zn2WriteMicrocoded>;
+def : SchedAlias<WritePCmpIStrILd, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteLDMXCSR, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteSTMXCSR, Zn2WriteMicrocoded>;
+
+//=== Regex based InstRW ===//
+// Notation:
+// - r: register.
+// - m: memory.
+// - i: immediate.
+// - mm: 64-bit mmx register.
+// - x: 128-bit xmm register.
+// - (x)mm: mmx or xmm register.
+// - y: 256-bit ymm register.
+// - v: any vector register.
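+// For instance, under this notation the pattern
+//   def : InstRW<[WriteMicrocoded], (instregex "(V?)MASKMOVDQU(64)?")>;
+// used below covers MASKMOVDQU, MASKMOVDQU64, VMASKMOVDQU and VMASKMOVDQU64,
+// i.e. both the SSE and the AVX (V-prefixed) encodings.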
+
+//=== Integer Instructions ===//
+//-- Move instructions --//
+// MOV.
+// r16,m.
+def : InstRW<[WriteALULd, ReadAfterLd], (instregex "MOV16rm")>;
+
+// MOVSX, MOVZX.
+// r,m.
+def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm(8|16)")>;
+
+// XCHG.
+// r,r.
+def Zn2WriteXCHG : SchedWriteRes<[Zn2ALU]> {
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[Zn2WriteXCHG], (instregex "^XCHG(8|16|32|64)rr", "^XCHG(16|32|64)ar")>;
+
+// r,m.
+def Zn2WriteXCHGrm : SchedWriteRes<[Zn2AGU, Zn2ALU]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn2WriteXCHGrm, ReadAfterLd], (instregex "^XCHG(8|16|32|64)rm")>;
+
+def : InstRW<[WriteMicrocoded], (instrs XLAT)>;
+
+// POP16.
+// r.
+def Zn2WritePop16r : SchedWriteRes<[Zn2AGU]>{
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn2WritePop16r], (instregex "POP16rmm")>;
+def : InstRW<[WriteMicrocoded], (instregex "POPF(16|32)")>;
+def : InstRW<[WriteMicrocoded], (instregex "POPA(16|32)")>;
+
+
+// PUSH.
+// r. Has default values.
+// m.
+def Zn2WritePUSH : SchedWriteRes<[Zn2AGU]>{
+ let Latency = 4;
+}
+def : InstRW<[Zn2WritePUSH], (instregex "PUSH(16|32)rmm")>;
+
+// PUSHF
+def : InstRW<[WriteMicrocoded], (instregex "PUSHF(16|32)")>;
+
+// PUSHA.
+def Zn2WritePushA : SchedWriteRes<[Zn2AGU]> {
+ let Latency = 8;
+}
+def : InstRW<[Zn2WritePushA], (instregex "PUSHA(16|32)")>;
+
+// LAHF
+def : InstRW<[WriteMicrocoded], (instrs LAHF)>;
+
+// MOVBE.
+// r,m.
+def Zn2WriteMOVBE : SchedWriteRes<[Zn2AGU, Zn2ALU]> {
+ let Latency = 5;
+}
+def : InstRW<[Zn2WriteMOVBE, ReadAfterLd], (instregex "MOVBE(16|32|64)rm")>;
+
+// m16,r16.
+def : InstRW<[Zn2WriteMOVBE], (instregex "MOVBE(16|32|64)mr")>;
+
+//-- Arithmetic instructions --//
+
+// ADD SUB.
+// m,r/i.
+def : InstRW<[WriteALULd], (instregex "(ADD|SUB)(8|16|32|64)m(r|i)",
+ "(ADD|SUB)(8|16|32|64)mi8",
+ "(ADD|SUB)64mi32")>;
+
+// ADC SBB.
+// m,r/i.
+def : InstRW<[WriteALULd],
+ (instregex "(ADC|SBB)(8|16|32|64)m(r|i)",
+ "(ADC|SBB)(16|32|64)mi8",
+ "(ADC|SBB)64mi32")>;
+
+// INC DEC NOT NEG.
+// m.
+def : InstRW<[WriteALULd],
+ (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m")>;
+
+// MUL IMUL.
+// r16.
+def Zn2WriteMul16 : SchedWriteRes<[Zn2ALU1, Zn2Multiplier]> {
+ let Latency = 3;
+}
+def Zn2WriteMul16Imm : SchedWriteRes<[Zn2ALU1, Zn2Multiplier]> {
+ let Latency = 4;
+}
+def : SchedAlias<WriteIMul16, Zn2WriteMul16>;
+def : SchedAlias<WriteIMul16Imm, Zn2WriteMul16Imm>;
+def : SchedAlias<WriteIMul16Reg, Zn2WriteMul16>;
+
+// m16.
+def Zn2WriteMul16Ld : SchedWriteRes<[Zn2AGU, Zn2ALU1, Zn2Multiplier]> {
+ let Latency = 7;
+}
+def : SchedAlias<WriteIMul16Ld, Zn2WriteMul16Ld>;
+def : SchedAlias<WriteIMul16ImmLd, Zn2WriteMul16Ld>;
+def : SchedAlias<WriteIMul16RegLd, Zn2WriteMul16Ld>;
+
+// r32.
+def Zn2WriteMul32 : SchedWriteRes<[Zn2ALU1, Zn2Multiplier]> {
+ let Latency = 3;
+}
+def : SchedAlias<WriteIMul32, Zn2WriteMul32>;
+def : SchedAlias<WriteIMul32Imm, Zn2WriteMul32>;
+def : SchedAlias<WriteIMul32Reg, Zn2WriteMul32>;
+
+// m32.
+def Zn2WriteMul32Ld : SchedWriteRes<[Zn2AGU, Zn2ALU1, Zn2Multiplier]> {
+ let Latency = 7;
+}
+def : SchedAlias<WriteIMul32Ld, Zn2WriteMul32Ld>;
+def : SchedAlias<WriteIMul32ImmLd, Zn2WriteMul32Ld>;
+def : SchedAlias<WriteIMul32RegLd, Zn2WriteMul32Ld>;
+
+// r64.
+def Zn2WriteMul64 : SchedWriteRes<[Zn2ALU1, Zn2Multiplier]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def : SchedAlias<WriteIMul64, Zn2WriteMul64>;
+def : SchedAlias<WriteIMul64Imm, Zn2WriteMul64>;
+def : SchedAlias<WriteIMul64Reg, Zn2WriteMul64>;
+
+// m64.
+def Zn2WriteMul64Ld : SchedWriteRes<[Zn2AGU, Zn2ALU1, Zn2Multiplier]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def : SchedAlias<WriteIMul64Ld, Zn2WriteMul64Ld>;
+def : SchedAlias<WriteIMul64ImmLd, Zn2WriteMul64Ld>;
+def : SchedAlias<WriteIMul64RegLd, Zn2WriteMul64Ld>;
+
+// MULX.
+// r32,r32,r32.
+def Zn2WriteMulX32 : SchedWriteRes<[Zn2ALU1, Zn2Multiplier]> {
+ let Latency = 3;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[Zn2WriteMulX32], (instrs MULX32rr)>;
+
+// r32,r32,m32.
+def Zn2WriteMulX32Ld : SchedWriteRes<[Zn2AGU, Zn2ALU1, Zn2Multiplier]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 2, 2];
+}
+def : InstRW<[Zn2WriteMulX32Ld, ReadAfterLd], (instrs MULX32rm)>;
+
+// r64,r64,r64.
+def Zn2WriteMulX64 : SchedWriteRes<[Zn2ALU1]> {
+ let Latency = 3;
+}
+def : InstRW<[Zn2WriteMulX64], (instrs MULX64rr)>;
+
+// r64,r64,m64.
+def Zn2WriteMulX64Ld : SchedWriteRes<[Zn2AGU, Zn2ALU1, Zn2Multiplier]> {
+ let Latency = 7;
+}
+def : InstRW<[Zn2WriteMulX64Ld, ReadAfterLd], (instrs MULX64rm)>;
+
+//-- Control transfer instructions --//
+
+// J(E|R)CXZ.
+def Zn2WriteJCXZ : SchedWriteRes<[Zn2ALU03]>;
+def : InstRW<[Zn2WriteJCXZ], (instrs JCXZ, JECXZ, JRCXZ)>;
+
+// INTO
+def : InstRW<[WriteMicrocoded], (instrs INTO)>;
+
+// LOOP.
+def Zn2WriteLOOP : SchedWriteRes<[Zn2ALU03]>;
+def : InstRW<[Zn2WriteLOOP], (instrs LOOP)>;
+
+// LOOP(N)E, LOOP(N)Z
+def Zn2WriteLOOPE : SchedWriteRes<[Zn2ALU03]>;
+def : InstRW<[Zn2WriteLOOPE], (instrs LOOPE, LOOPNE)>;
+
+// CALL.
+// r.
+def Zn2WriteCALLr : SchedWriteRes<[Zn2AGU, Zn2ALU03]>;
+def : InstRW<[Zn2WriteCALLr], (instregex "CALL(16|32)r")>;
+
+def : InstRW<[WriteMicrocoded], (instregex "CALL(16|32)m")>;
+
+// RET.
+def Zn2WriteRET : SchedWriteRes<[Zn2ALU03]> {
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn2WriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)",
+ "IRET(16|32|64)")>;
+
+//-- Logic instructions --//
+
+// AND OR XOR.
+// m,r/i.
+def : InstRW<[WriteALULd],
+ (instregex "(AND|OR|XOR)(8|16|32|64)m(r|i)",
+ "(AND|OR|XOR)(8|16|32|64)mi8", "(AND|OR|XOR)64mi32")>;
+
+// Define ALU latency variants
+def Zn2WriteALULat2 : SchedWriteRes<[Zn2ALU]> {
+ let Latency = 2;
+}
+def Zn2WriteALULat2Ld : SchedWriteRes<[Zn2AGU, Zn2ALU]> {
+ let Latency = 6;
+}
+
+// BT.
+// m,i.
+def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mi8")>;
+
+// BTR BTS BTC.
+// r,r,i.
+def Zn2WriteBTRSC : SchedWriteRes<[Zn2ALU]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn2WriteBTRSC], (instregex "BT(R|S|C)(16|32|64)r(r|i8)")>;
+
+// m,r,i.
+def Zn2WriteBTRSCm : SchedWriteRes<[Zn2AGU, Zn2ALU]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+// m,r,i.
+def : SchedAlias<WriteBitTestSetImmRMW, Zn2WriteBTRSCm>;
+def : SchedAlias<WriteBitTestSetRegRMW, Zn2WriteBTRSCm>;
+
+// BLSI BLSMSK BLSR.
+// r,r.
+def : SchedAlias<WriteBLS, Zn2WriteALULat2>;
+// r,m.
+def : SchedAlias<WriteBLSLd, Zn2WriteALULat2Ld>;
+
+// CLD STD.
+def : InstRW<[WriteALU], (instrs STD, CLD)>;
+
+// PDEP PEXT.
+// r,r,r.
+def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>;
+// r,r,m.
+def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>;
+
+// RCR RCL.
+// m,i.
+def : InstRW<[WriteMicrocoded], (instregex "RC(R|L)(8|16|32|64)m(1|i|CL)")>;
+
+// SHR SHL SAR.
+// m,i.
+def : InstRW<[WriteShiftLd], (instregex "S(A|H)(R|L)(8|16|32|64)m(i|1)")>;
+
+// SHRD SHLD.
+// m,r
+def : InstRW<[WriteShiftLd], (instregex "SH(R|L)D(16|32|64)mri8")>;
+
+// r,r,cl.
+def : InstRW<[WriteMicrocoded], (instregex "SH(R|L)D(16|32|64)rrCL")>;
+
+// m,r,cl.
+def : InstRW<[WriteMicrocoded], (instregex "SH(R|L)D(16|32|64)mrCL")>;
+
+//-- Misc instructions --//
+// CMPXCHG8B.
+def Zn2WriteCMPXCHG8B : SchedWriteRes<[Zn2AGU, Zn2ALU]> {
+ let NumMicroOps = 18;
+}
+def : InstRW<[Zn2WriteCMPXCHG8B], (instrs CMPXCHG8B)>;
+
+def : InstRW<[WriteMicrocoded], (instrs CMPXCHG16B)>;
+
+// LEAVE
+def Zn2WriteLEAVE : SchedWriteRes<[Zn2ALU, Zn2AGU]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn2WriteLEAVE], (instregex "LEAVE")>;
+
+// PAUSE.
+def : InstRW<[WriteMicrocoded], (instrs PAUSE)>;
+
+// RDTSC.
+def : InstRW<[WriteMicrocoded], (instregex "RDTSC")>;
+
+// RDPMC.
+def : InstRW<[WriteMicrocoded], (instrs RDPMC)>;
+
+// RDRAND.
+def : InstRW<[WriteMicrocoded], (instregex "RDRAND(16|32|64)r")>;
+
+// XGETBV.
+def : InstRW<[WriteMicrocoded], (instregex "XGETBV")>;
+
+//-- String instructions --//
+// CMPS.
+def : InstRW<[WriteMicrocoded], (instregex "CMPS(B|L|Q|W)")>;
+
+// LODSB/W.
+def : InstRW<[WriteMicrocoded], (instregex "LODS(B|W)")>;
+
+// LODSD/Q.
+def : InstRW<[WriteMicrocoded], (instregex "LODS(L|Q)")>;
+
+// MOVS.
+def : InstRW<[WriteMicrocoded], (instregex "MOVS(B|L|Q|W)")>;
+
+// SCAS.
+def : InstRW<[WriteMicrocoded], (instregex "SCAS(B|W|L|Q)")>;
+
+// STOS
+def : InstRW<[WriteMicrocoded], (instregex "STOS(B|L|Q|W)")>;
+
+// XADD.
+def Zn2XADD : SchedWriteRes<[Zn2ALU]>;
+def : InstRW<[Zn2XADD], (instregex "XADD(8|16|32|64)rr")>;
+def : InstRW<[WriteMicrocoded], (instregex "XADD(8|16|32|64)rm")>;
+
+//=== Floating Point x87 Instructions ===//
+//-- Move instructions --//
+
+def Zn2WriteFLDr : SchedWriteRes<[Zn2FPU13]> ;
+
+def Zn2WriteSTr: SchedWriteRes<[Zn2FPU23]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// LD_F.
+// r.
+def : InstRW<[Zn2WriteFLDr], (instregex "LD_Frr")>;
+
+// m.
+def Zn2WriteLD_F80m : SchedWriteRes<[Zn2AGU, Zn2FPU13]> {
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn2WriteLD_F80m], (instregex "LD_F80m")>;
+
+// FBLD.
+def : InstRW<[WriteMicrocoded], (instregex "FBLDm")>;
+
+// FST(P).
+// r.
+def : InstRW<[Zn2WriteSTr], (instregex "ST_(F|FP)rr")>;
+
+// m80.
+def Zn2WriteST_FP80m : SchedWriteRes<[Zn2AGU, Zn2FPU23]> {
+ let Latency = 5;
+}
+def : InstRW<[Zn2WriteST_FP80m], (instregex "ST_FP80m")>;
+
+// FBSTP.
+// m80.
+def : InstRW<[WriteMicrocoded], (instregex "FBSTPm")>;
+
+def Zn2WriteFXCH : SchedWriteRes<[Zn2FPU]>;
+
+// FXCH.
+def : InstRW<[Zn2WriteFXCH], (instrs XCH_F)>;
+
+// FILD.
+def Zn2WriteFILD : SchedWriteRes<[Zn2AGU, Zn2FPU3]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn2WriteFILD], (instregex "ILD_F(16|32|64)m")>;
+
+// FIST(P) FISTTP.
+def Zn2WriteFIST : SchedWriteRes<[Zn2AGU, Zn2FPU23]> {
+ let Latency = 12;
+}
+def : InstRW<[Zn2WriteFIST], (instregex "IS(T|TT)_(F|FP)(16|32|64)m")>;
+
+def Zn2WriteFPU13 : SchedWriteRes<[Zn2AGU, Zn2FPU13]> {
+ let Latency = 8;
+}
+
+def Zn2WriteFPU3 : SchedWriteRes<[Zn2AGU, Zn2FPU3]> {
+ let Latency = 11;
+}
+
+// FLDZ.
+def : SchedAlias<WriteFLD0, Zn2WriteFPU13>;
+
+// FLD1.
+def : SchedAlias<WriteFLD1, Zn2WriteFPU3>;
+
+// FLDPI FLDL2E etc.
+def : SchedAlias<WriteFLDC, Zn2WriteFPU3>;
+
+// FNSTSW.
+// AX.
+def : InstRW<[WriteMicrocoded], (instrs FNSTSW16r)>;
+
+// m16.
+def : InstRW<[WriteMicrocoded], (instrs FNSTSWm)>;
+
+// FLDCW.
+def : InstRW<[WriteMicrocoded], (instrs FLDCW16m)>;
+
+// FNSTCW.
+def : InstRW<[WriteMicrocoded], (instrs FNSTCW16m)>;
+
+// FINCSTP FDECSTP.
+def : InstRW<[Zn2WriteFPU3], (instrs FINCSTP, FDECSTP)>;
+
+// FFREE.
+def : InstRW<[Zn2WriteFPU3], (instregex "FFREE")>;
+
+// FNSAVE.
+def : InstRW<[WriteMicrocoded], (instregex "FSAVEm")>;
+
+// FRSTOR.
+def : InstRW<[WriteMicrocoded], (instregex "FRSTORm")>;
+
+//-- Arithmetic instructions --//
+
+def Zn2WriteFPU3Lat1 : SchedWriteRes<[Zn2FPU3]> ;
+
+def Zn2WriteFPU0Lat1 : SchedWriteRes<[Zn2FPU0]> ;
+
+def Zn2WriteFPU0Lat1Ld : SchedWriteRes<[Zn2AGU, Zn2FPU0]> {
+ let Latency = 8;
+}
+
+// FCHS.
+def : InstRW<[Zn2WriteFPU3Lat1], (instregex "CHS_F")>;
+
+// FCOM(P) FUCOM(P).
+// r.
+def : InstRW<[Zn2WriteFPU0Lat1], (instregex "COM(P?)_FST0r", "UCOM_F(P?)r")>;
+// m.
+def : InstRW<[Zn2WriteFPU0Lat1Ld], (instregex "FCOM(P?)(32|64)m")>;
+
+// FCOMPP FUCOMPP.
+// r.
+def : InstRW<[Zn2WriteFPU0Lat1], (instrs FCOMPP, UCOM_FPPr)>;
+
+def Zn2WriteFPU02 : SchedWriteRes<[Zn2AGU, Zn2FPU02]>
+{
+ let Latency = 9;
+}
+
+// FCOMI(P) FUCOMI(P).
+// r.
+def : InstRW<[Zn2WriteFPU02], (instrs COM_FIPr, COM_FIr, UCOM_FIPr, UCOM_FIr)>;
+
+def Zn2WriteFPU03 : SchedWriteRes<[Zn2AGU, Zn2FPU03]>
+{
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,3];
+}
+
+// FICOM(P).
+def : InstRW<[Zn2WriteFPU03], (instregex "FICOM(P?)(16|32)m")>;
+
+// FTST.
+def : InstRW<[Zn2WriteFPU0Lat1], (instregex "TST_F")>;
+
+// FXAM.
+def : InstRW<[Zn2WriteFPU3Lat1], (instrs FXAM)>;
+
+// FPREM.
+def : InstRW<[WriteMicrocoded], (instrs FPREM)>;
+
+// FPREM1.
+def : InstRW<[WriteMicrocoded], (instrs FPREM1)>;
+
+// FRNDINT.
+def : InstRW<[WriteMicrocoded], (instrs FRNDINT)>;
+
+// FSCALE.
+def : InstRW<[WriteMicrocoded], (instrs FSCALE)>;
+
+// FXTRACT.
+def : InstRW<[WriteMicrocoded], (instrs FXTRACT)>;
+
+// FNOP.
+def : InstRW<[Zn2WriteFPU0Lat1], (instrs FNOP)>;
+
+// WAIT.
+def : InstRW<[Zn2WriteFPU0Lat1], (instrs WAIT)>;
+
+// FNCLEX.
+def : InstRW<[WriteMicrocoded], (instrs FNCLEX)>;
+
+// FNINIT.
+def : InstRW<[WriteMicrocoded], (instrs FNINIT)>;
+
+//=== Integer MMX and XMM Instructions ===//
+
+// PACKSSWB/DW.
+// mm <- mm.
+def Zn2WriteFPU12 : SchedWriteRes<[Zn2FPU12]> ;
+def Zn2WriteFPU12Y : SchedWriteRes<[Zn2FPU12]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def Zn2WriteFPU12m : SchedWriteRes<[Zn2AGU, Zn2FPU12]> ;
+def Zn2WriteFPU12Ym : SchedWriteRes<[Zn2AGU, Zn2FPU12]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[Zn2WriteFPU12], (instrs MMX_PACKSSDWirr,
+ MMX_PACKSSWBirr,
+ MMX_PACKUSWBirr)>;
+def : InstRW<[Zn2WriteFPU12m], (instrs MMX_PACKSSDWirm,
+ MMX_PACKSSWBirm,
+ MMX_PACKUSWBirm)>;
+
+// VPMOVSX/ZX BW BD BQ WD WQ DQ.
+// y <- x.
+def : InstRW<[Zn2WriteFPU12Y], (instregex "VPMOV(SX|ZX)(BW|BD|BQ|WD|WQ|DQ)Yrr")>;
+def : InstRW<[Zn2WriteFPU12Ym], (instregex "VPMOV(SX|ZX)(BW|BD|BQ|WD|WQ|DQ)Yrm")>;
+
+def Zn2WriteFPU013 : SchedWriteRes<[Zn2FPU013]> ;
+def Zn2WriteFPU013Y : SchedWriteRes<[Zn2FPU013]> ;
+def Zn2WriteFPU013m : SchedWriteRes<[Zn2AGU, Zn2FPU013]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def Zn2WriteFPU013Ld : SchedWriteRes<[Zn2AGU, Zn2FPU013]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def Zn2WriteFPU013LdY : SchedWriteRes<[Zn2AGU, Zn2FPU013]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+// PBLENDW.
+// x,x,i / v,v,v,i
+def : InstRW<[Zn2WriteFPU013], (instregex "(V?)PBLENDWrri")>;
+// ymm
+def : InstRW<[Zn2WriteFPU013Y], (instrs VPBLENDWYrri)>;
+
+// x,m,i / v,v,m,i
+def : InstRW<[Zn2WriteFPU013Ld], (instregex "(V?)PBLENDWrmi")>;
+// y,m,i
+def : InstRW<[Zn2WriteFPU013LdY], (instrs VPBLENDWYrmi)>;
+
+def Zn2WriteFPU01 : SchedWriteRes<[Zn2FPU01]> ;
+def Zn2WriteFPU01Y : SchedWriteRes<[Zn2FPU01]> {
+ let NumMicroOps = 2;
+}
+
+// VPBLENDD.
+// v,v,v,i.
+def : InstRW<[Zn2WriteFPU01], (instrs VPBLENDDrri)>;
+// ymm
+def : InstRW<[Zn2WriteFPU01Y], (instrs VPBLENDDYrri)>;
+
+// v,v,m,i
+def Zn2WriteFPU01Op2 : SchedWriteRes<[Zn2AGU, Zn2FPU01]> {
+ let NumMicroOps = 2;
+ let Latency = 8;
+ let ResourceCycles = [1, 2];
+}
+def Zn2WriteFPU01Op2Y : SchedWriteRes<[Zn2AGU, Zn2FPU01]> {
+ let NumMicroOps = 2;
+ let Latency = 9;
+ let ResourceCycles = [1, 3];
+}
+def : InstRW<[Zn2WriteFPU01Op2], (instrs VPBLENDDrmi)>;
+def : InstRW<[Zn2WriteFPU01Op2Y], (instrs VPBLENDDYrmi)>;
+
+// MASKMOVQ.
+def : InstRW<[WriteMicrocoded], (instregex "MMX_MASKMOVQ(64)?")>;
+
+// MASKMOVDQU.
+def : InstRW<[WriteMicrocoded], (instregex "(V?)MASKMOVDQU(64)?")>;
+
+// VPMASKMOVD.
+// ymm
+def : InstRW<[WriteMicrocoded],
+ (instregex "VPMASKMOVD(Y?)rm")>;
+// m, v,v.
+def : InstRW<[WriteMicrocoded], (instregex "VPMASKMOV(D|Q)(Y?)mr")>;
+
+// VPBROADCAST B/W.
+// x, m8/16.
+def Zn2WriteVPBROADCAST128Ld : SchedWriteRes<[Zn2AGU, Zn2FPU12]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[Zn2WriteVPBROADCAST128Ld],
+ (instregex "VPBROADCAST(B|W)rm")>;
+
+// y, m8/16
+def Zn2WriteVPBROADCAST256Ld : SchedWriteRes<[Zn2AGU, Zn2FPU1]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[Zn2WriteVPBROADCAST256Ld],
+ (instregex "VPBROADCAST(B|W)Yrm")>;
+
+// VPGATHER.
+def : InstRW<[WriteMicrocoded], (instregex "VPGATHER(Q|D)(Q|D)(Y?)rm")>;
+
+//-- Arithmetic instructions --//
+
+// PCMPGTQ.
+def Zn2WritePCMPGTQr : SchedWriteRes<[Zn2FPU03]>;
+def : InstRW<[Zn2WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>;
+
+// x <- x,m.
+def Zn2WritePCMPGTQm : SchedWriteRes<[Zn2AGU, Zn2FPU03]> {
+ let Latency = 8;
+}
+// ymm.
+def Zn2WritePCMPGTQYm : SchedWriteRes<[Zn2AGU, Zn2FPU03]> {
+ let Latency = 8;
+}
+def : InstRW<[Zn2WritePCMPGTQm], (instregex "(V?)PCMPGTQrm")>;
+def : InstRW<[Zn2WritePCMPGTQYm], (instrs VPCMPGTQYrm)>;
+
+//-- Logic instructions --//
+
+// PSLL,PSRL,PSRA W/D/Q.
+// x,x / v,v,x.
+def Zn2WritePShift : SchedWriteRes<[Zn2FPU2]> {
+ let Latency = 3;
+}
+def Zn2WritePShiftY : SchedWriteRes<[Zn2FPU2]> {
+ let Latency = 3;
+}
+
+// PSLL,PSRL DQ.
+def : InstRW<[Zn2WritePShift], (instregex "(V?)PS(R|L)LDQri")>;
+def : InstRW<[Zn2WritePShiftY], (instregex "(V?)PS(R|L)LDQYri")>;
+
+//=== Floating Point XMM and YMM Instructions ===//
+//-- Move instructions --//
+
+// VPERM2F128.
+def : InstRW<[WriteMicrocoded], (instrs VPERM2F128rr)>;
+def : InstRW<[WriteMicrocoded], (instrs VPERM2F128rm)>;
+
+def Zn2WriteBROADCAST : SchedWriteRes<[Zn2AGU, Zn2FPU13]> {
+ let NumMicroOps = 2;
+ let Latency = 8;
+}
+// VBROADCASTF128.
+def : InstRW<[Zn2WriteBROADCAST], (instrs VBROADCASTF128)>;
+
+// EXTRACTPS.
+// r32,x,i.
+def Zn2WriteEXTRACTPSr : SchedWriteRes<[Zn2FPU12, Zn2FPU2]> {
+ let Latency = 2;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[Zn2WriteEXTRACTPSr], (instregex "(V?)EXTRACTPSrr")>;
+
+def Zn2WriteEXTRACTPSm : SchedWriteRes<[Zn2AGU,Zn2FPU12, Zn2FPU2]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [5, 1, 2];
+}
+// m32,x,i.
+def : InstRW<[Zn2WriteEXTRACTPSm], (instregex "(V?)EXTRACTPSmr")>;
+
+// VEXTRACTF128.
+// x,y,i.
+def : InstRW<[Zn2WriteFPU013], (instrs VEXTRACTF128rr)>;
+
+// m128,y,i.
+def : InstRW<[Zn2WriteFPU013m], (instrs VEXTRACTF128mr)>;
+
+def Zn2WriteVINSERT128r: SchedWriteRes<[Zn2FPU013]> {
+ let Latency = 2;
+// let ResourceCycles = [2];
+}
+def Zn2WriteVINSERT128Ld: SchedWriteRes<[Zn2AGU,Zn2FPU013]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+}
+// VINSERTF128.
+// y,y,x,i.
+def : InstRW<[Zn2WriteVINSERT128r], (instrs VINSERTF128rr)>;
+def : InstRW<[Zn2WriteVINSERT128Ld], (instrs VINSERTF128rm)>;
+
+// VGATHER.
+def : InstRW<[WriteMicrocoded], (instregex "VGATHER(Q|D)(PD|PS)(Y?)rm")>;
+
+//-- Conversion instructions --//
+def Zn2WriteCVTPD2PSr: SchedWriteRes<[Zn2FPU3]> {
+ let Latency = 3;
+}
+def Zn2WriteCVTPD2PSYr: SchedWriteRes<[Zn2FPU3]> {
+ let Latency = 3;
+}
+
+// CVTPD2PS.
+// x,x.
+def : SchedAlias<WriteCvtPD2PS, Zn2WriteCVTPD2PSr>;
+// y,y.
+def : SchedAlias<WriteCvtPD2PSY, Zn2WriteCVTPD2PSYr>;
+// z,z.
+defm : X86WriteResUnsupported<WriteCvtPD2PSZ>;
+
+def Zn2WriteCVTPD2PSLd: SchedWriteRes<[Zn2AGU,Zn2FPU03]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+// x,m128.
+def : SchedAlias<WriteCvtPD2PSLd, Zn2WriteCVTPD2PSLd>;
+
+// x,m256.
+def Zn2WriteCVTPD2PSYLd : SchedWriteRes<[Zn2AGU, Zn2FPU3]> {
+ let Latency = 10;
+}
+def : SchedAlias<WriteCvtPD2PSYLd, Zn2WriteCVTPD2PSYLd>;
+// z,m512
+defm : X86WriteResUnsupported<WriteCvtPD2PSZLd>;
+
+// CVTSD2SS.
+// x,x.
+// Same as Zn2WriteCVTPD2PSr.
+def : SchedAlias<WriteCvtSD2SS, Zn2WriteCVTPD2PSr>;
+
+// x,m64.
+def : SchedAlias<WriteCvtSD2SSLd, Zn2WriteCVTPD2PSLd>;
+
+// CVTPS2PD.
+// x,x.
+def Zn2WriteCVTPS2PDr : SchedWriteRes<[Zn2FPU3]> {
+ let Latency = 3;
+}
+def : SchedAlias<WriteCvtPS2PD, Zn2WriteCVTPS2PDr>;
+
+// x,m64.
+// y,m128.
+def Zn2WriteCVTPS2PDLd : SchedWriteRes<[Zn2AGU, Zn2FPU3]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+def : SchedAlias<WriteCvtPS2PDLd, Zn2WriteCVTPS2PDLd>;
+def : SchedAlias<WriteCvtPS2PDYLd, Zn2WriteCVTPS2PDLd>;
+defm : X86WriteResUnsupported<WriteCvtPS2PDZLd>;
+
+// y,x.
+def Zn2WriteVCVTPS2PDY : SchedWriteRes<[Zn2FPU3]> {
+ let Latency = 3;
+}
+def : SchedAlias<WriteCvtPS2PDY, Zn2WriteVCVTPS2PDY>;
+defm : X86WriteResUnsupported<WriteCvtPS2PDZ>;
+
+// CVTSS2SD.
+// x,x.
+def Zn2WriteCVTSS2SDr : SchedWriteRes<[Zn2FPU3]> {
+ let Latency = 3;
+}
+def : SchedAlias<WriteCvtSS2SD, Zn2WriteCVTSS2SDr>;
+
+// x,m32.
+def Zn2WriteCVTSS2SDLd : SchedWriteRes<[Zn2AGU, Zn2FPU3]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
+def : SchedAlias<WriteCvtSS2SDLd, Zn2WriteCVTSS2SDLd>;
+
+def Zn2WriteCVTDQ2PDr: SchedWriteRes<[Zn2FPU12,Zn2FPU3]> {
+ let Latency = 3;
+}
+// CVTDQ2PD.
+// x,x.
+def : InstRW<[Zn2WriteCVTDQ2PDr], (instregex "(V)?CVTDQ2P(D|S)rr")>;
+
+// Same as xmm
+// y,x.
+def : InstRW<[Zn2WriteCVTDQ2PDr], (instrs VCVTDQ2PDYrr)>;
+def : InstRW<[Zn2WriteCVTDQ2PDr], (instrs VCVTDQ2PSYrr)>;
+
+def Zn2WriteCVTPD2DQr: SchedWriteRes<[Zn2FPU12, Zn2FPU3]> {
+ let Latency = 3;
+}
+// CVT(T)P(D|S)2DQ.
+// x,x.
+def : InstRW<[Zn2WriteCVTPD2DQr], (instregex "(V?)CVT(T?)P(D|S)2DQrr")>;
+
+def Zn2WriteCVTPD2DQLd: SchedWriteRes<[Zn2AGU,Zn2FPU12,Zn2FPU3]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+// x,m128.
+def : InstRW<[Zn2WriteCVTPD2DQLd], (instregex "(V?)CVT(T?)PD2DQrm")>;
+// same as xmm handling
+// x,y.
+def : InstRW<[Zn2WriteCVTPD2DQr], (instregex "VCVT(T?)PD2DQYrr")>;
+// x,m256.
+def : InstRW<[Zn2WriteCVTPD2DQLd], (instregex "VCVT(T?)PD2DQYrm")>;
+
+def Zn2WriteCVTPS2PIr: SchedWriteRes<[Zn2FPU3]> {
+ let Latency = 4;
+}
+// CVT(T)PS2PI.
+// mm,x.
+def : InstRW<[Zn2WriteCVTPS2PIr], (instregex "MMX_CVT(T?)PS2PIirr")>;
+
+// CVTPI2PD.
+// x,mm.
+def : InstRW<[Zn2WriteCVTPS2PDr], (instrs MMX_CVTPI2PDirr)>;
+
+// CVT(T)PD2PI.
+// mm,x.
+def : InstRW<[Zn2WriteCVTPS2PIr], (instregex "MMX_CVT(T?)PD2PIirr")>;
+
+def Zn2WriteCVSTSI2SSr: SchedWriteRes<[Zn2FPU3]> {
+ let Latency = 3;
+}
+
+// same as CVTPD2DQr
+// CVT(T)SS2SI.
+// r32,x.
+def : InstRW<[Zn2WriteCVTPD2DQr], (instregex "(V?)CVT(T?)SS2SI(64)?rr")>;
+// same as CVTPD2DQm
+// r32,m32.
+def : InstRW<[Zn2WriteCVTPD2DQLd], (instregex "(V?)CVT(T?)SS2SI(64)?rm")>;
+
+def Zn2WriteCVSTSI2SDr: SchedWriteRes<[Zn2FPU013, Zn2FPU3]> {
+ let Latency = 3;
+}
+// CVTSI2SD.
+// x,r32/64.
+def : InstRW<[Zn2WriteCVSTSI2SDr], (instregex "(V?)CVTSI(64)?2SDrr")>;
+
+
+def Zn2WriteCVSTSI2SIr: SchedWriteRes<[Zn2FPU3, Zn2FPU2]> {
+ let Latency = 4;
+}
+def Zn2WriteCVSTSI2SILd: SchedWriteRes<[Zn2AGU, Zn2FPU3, Zn2FPU2]> {
+ let Latency = 11;
+}
+// CVTSD2SI.
+// r32/64
+def : InstRW<[Zn2WriteCVSTSI2SIr], (instregex "(V?)CVT(T?)SD2SI(64)?rr")>;
+// r32,m32.
+def : InstRW<[Zn2WriteCVSTSI2SILd], (instregex "(V?)CVT(T?)SD2SI(64)?rm")>;
+
+// VCVTPS2PH.
+// x,v,i.
+def : SchedAlias<WriteCvtPS2PH, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteCvtPS2PHY, Zn2WriteMicrocoded>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
+// m,v,i.
+def : SchedAlias<WriteCvtPS2PHSt, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteCvtPS2PHYSt, Zn2WriteMicrocoded>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
+
+// VCVTPH2PS.
+// v,x.
+def : SchedAlias<WriteCvtPH2PS, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteCvtPH2PSY, Zn2WriteMicrocoded>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZ>;
+// v,m.
+def : SchedAlias<WriteCvtPH2PSLd, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteCvtPH2PSYLd, Zn2WriteMicrocoded>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZLd>;
+
+//-- SSE4A instructions --//
+// EXTRQ
+def Zn2WriteEXTRQ: SchedWriteRes<[Zn2FPU12, Zn2FPU2]> {
+ let Latency = 3;
+}
+def : InstRW<[Zn2WriteEXTRQ], (instregex "EXTRQ")>;
+
+// INSERTQ
+def Zn2WriteINSERTQ: SchedWriteRes<[Zn2FPU03,Zn2FPU1]> {
+ let Latency = 4;
+}
+def : InstRW<[Zn2WriteINSERTQ], (instregex "INSERTQ")>;
+
+//-- SHA instructions --//
+// SHA256MSG2
+def : InstRW<[WriteMicrocoded], (instregex "SHA256MSG2(Y?)r(r|m)")>;
+
+// SHA1MSG1, SHA256MSG1
+// x,x.
+def Zn2WriteSHA1MSG1r : SchedWriteRes<[Zn2FPU12]> {
+ let Latency = 2;
+}
+def : InstRW<[Zn2WriteSHA1MSG1r], (instregex "SHA(1|256)MSG1rr")>;
+// x,m.
+def Zn2WriteSHA1MSG1Ld : SchedWriteRes<[Zn2AGU, Zn2FPU12]> {
+ let Latency = 9;
+}
+def : InstRW<[Zn2WriteSHA1MSG1Ld], (instregex "SHA(1|256)MSG1rm")>;
+
+// SHA1MSG2
+// x,x.
+def Zn2WriteSHA1MSG2r : SchedWriteRes<[Zn2FPU12]> ;
+def : InstRW<[Zn2WriteSHA1MSG2r], (instregex "SHA1MSG2rr")>;
+// x,m.
+def Zn2WriteSHA1MSG2Ld : SchedWriteRes<[Zn2AGU, Zn2FPU12]> {
+ let Latency = 8;
+}
+def : InstRW<[Zn2WriteSHA1MSG2Ld], (instregex "SHA1MSG2rm")>;
+
+// SHA1NEXTE
+// x,x.
+def Zn2WriteSHA1NEXTEr : SchedWriteRes<[Zn2FPU1]> ;
+def : InstRW<[Zn2WriteSHA1NEXTEr], (instregex "SHA1NEXTErr")>;
+// x,m.
+def Zn2WriteSHA1NEXTELd : SchedWriteRes<[Zn2AGU, Zn2FPU1]> {
+ let Latency = 8;
+}
+def : InstRW<[Zn2WriteSHA1NEXTELd], (instregex "SHA1NEXTErm")>;
+
+// SHA1RNDS4
+// x,x.
+def Zn2WriteSHA1RNDS4r : SchedWriteRes<[Zn2FPU1]> {
+ let Latency = 6;
+}
+def : InstRW<[Zn2WriteSHA1RNDS4r], (instregex "SHA1RNDS4rr")>;
+// x,m.
+def Zn2WriteSHA1RNDS4Ld : SchedWriteRes<[Zn2AGU, Zn2FPU1]> {
+ let Latency = 13;
+}
+def : InstRW<[Zn2WriteSHA1RNDS4Ld], (instregex "SHA1RNDS4rm")>;
+
+// SHA256RNDS2
+// x,x.
+def Zn2WriteSHA256RNDS2r : SchedWriteRes<[Zn2FPU1]> {
+ let Latency = 4;
+}
+def : InstRW<[Zn2WriteSHA256RNDS2r], (instregex "SHA256RNDS2rr")>;
+// x,m.
+def Zn2WriteSHA256RNDS2Ld : SchedWriteRes<[Zn2AGU, Zn2FPU1]> {
+ let Latency = 11;
+}
+def : InstRW<[Zn2WriteSHA256RNDS2Ld], (instregex "SHA256RNDS2rm")>;
+
+//-- Arithmetic instructions --//
+
+// VDIVPS.
+// TODO - convert to Zn2WriteResFpuPair
+// y,y,y.
+def Zn2WriteVDIVPSYr : SchedWriteRes<[Zn2FPU3]> {
+ let Latency = 10;
+ let ResourceCycles = [10];
+}
+def : SchedAlias<WriteFDivY, Zn2WriteVDIVPSYr>;
+
+// y,y,m256.
+def Zn2WriteVDIVPSYLd : SchedWriteRes<[Zn2AGU, Zn2FPU3]> {
+ let Latency = 17;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 17];
+}
+def : SchedAlias<WriteFDivYLd, Zn2WriteVDIVPSYLd>;
+
+// VDIVPD.
+// TODO - convert to Zn2WriteResFpuPair
+// y,y,y.
+def Zn2WriteVDIVPDY : SchedWriteRes<[Zn2FPU3]> {
+ let Latency = 13;
+ let ResourceCycles = [13];
+}
+def : SchedAlias<WriteFDiv64Y, Zn2WriteVDIVPDY>;
+
+// y,y,m256.
+def Zn2WriteVDIVPDYLd : SchedWriteRes<[Zn2AGU, Zn2FPU3]> {
+ let Latency = 20;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,20];
+}
+def : SchedAlias<WriteFDiv64YLd, Zn2WriteVDIVPDYLd>;
+
+// DPPS.
+// x,x,i / v,v,v,i.
+def : SchedAlias<WriteDPPSY, Zn2WriteMicrocoded>;
+
+// x,m,i / v,v,m,i.
+def : SchedAlias<WriteDPPSYLd,Zn2WriteMicrocoded>;
+
+// DPPD.
+// x,x,i.
+def : SchedAlias<WriteDPPD, Zn2WriteMicrocoded>;
+
+// x,m,i.
+def : SchedAlias<WriteDPPDLd, Zn2WriteMicrocoded>;
+
+// RSQRTSS
+// TODO - convert to Zn2WriteResFpuPair
+// x,x.
+def Zn2WriteRSQRTSSr : SchedWriteRes<[Zn2FPU02]> {
+ let Latency = 5;
+}
+def : SchedAlias<WriteFRsqrt, Zn2WriteRSQRTSSr>;
+
+// x,m128.
+def Zn2WriteRSQRTSSLd: SchedWriteRes<[Zn2AGU, Zn2FPU02]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,2];
+}
+def : SchedAlias<WriteFRsqrtLd, Zn2WriteRSQRTSSLd>;
+
+// RSQRTPS
+// TODO - convert to Zn2WriteResFpuPair
+// y,y.
+def Zn2WriteRSQRTPSYr : SchedWriteRes<[Zn2FPU01]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def : SchedAlias<WriteFRsqrtY, Zn2WriteRSQRTPSYr>;
+
+// y,m256.
+def Zn2WriteRSQRTPSYLd : SchedWriteRes<[Zn2AGU, Zn2FPU01]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+}
+def : SchedAlias<WriteFRsqrtYLd, Zn2WriteRSQRTPSYLd>;
+
+//-- Other instructions --//
+
+// VZEROUPPER.
+def : InstRW<[WriteALU], (instrs VZEROUPPER)>;
+
+// VZEROALL.
+def : InstRW<[WriteMicrocoded], (instrs VZEROALL)>;
+
+} // SchedModel
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
new file mode 100644
index 000000000000..e76908ef4bc4
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -0,0 +1,325 @@
+//===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86SelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86SelectionDAGInfo.h"
+#include "X86ISelLowering.h"
+#include "X86InstrInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/IR/DerivedTypes.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-selectiondag-info"
+
+static cl::opt<bool>
+ UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false),
+ cl::desc("Use fast short rep mov in memcpy lowering"));
+
+bool X86SelectionDAGInfo::isBaseRegConflictPossible(
+ SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
+ // We cannot use TRI->hasBasePointer() until *after* we select all basic
+ // blocks. Legalization may introduce new stack temporaries with large
+ // alignment requirements. Fall back to generic code if there are any
+ // dynamic stack adjustments (hopefully rare) and the base pointer would
+ // conflict if we had to use it.
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ if (!MFI.hasVarSizedObjects() && !MFI.hasOpaqueSPAdjustment())
+ return false;
+
+ const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>(
+ DAG.getSubtarget().getRegisterInfo());
+ Register BaseReg = TRI->getBaseRegister();
+ for (unsigned R : ClobberSet)
+ if (BaseReg == R)
+ return true;
+ return false;
+}
+
+SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
+ SDValue Size, Align Alignment, bool isVolatile,
+ MachinePointerInfo DstPtrInfo) const {
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ const X86Subtarget &Subtarget =
+ DAG.getMachineFunction().getSubtarget<X86Subtarget>();
+
+#ifndef NDEBUG
+ // If the base register might conflict with our physical registers, bail out.
+ const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
+ X86::ECX, X86::EAX, X86::EDI};
+ assert(!isBaseRegConflictPossible(DAG, ClobberSet));
+#endif
+
+ // If the destination is in a segment-relative address space, use the default
+ // lowering.
+ if (DstPtrInfo.getAddrSpace() >= 256)
+ return SDValue();
+
+ // If not DWORD aligned or size is more than the threshold, call the library.
+ // The libc version is likely to be faster for these cases. It can use the
+ // address value and run time information about the CPU.
+ if (Alignment < Align(4) || !ConstantSize ||
+ ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) {
+ // Check to see if there is a specialized entry-point for memory zeroing.
+ ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);
+
+ if (const char *bzeroName = (ValC && ValC->isNullValue())
+ ? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO)
+ : nullptr) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
+ Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = Dst;
+ Entry.Ty = IntPtrTy;
+ Args.push_back(Entry);
+ Entry.Node = Size;
+ Args.push_back(Entry);
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol(bzeroName, IntPtr),
+ std::move(Args))
+ .setDiscardResult();
+
+ std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
+ return CallResult.second;
+ }
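+ // Illustrative effect of the block above: `memset(p, 0, n)` with a
+ // non-constant or over-threshold n becomes a direct call to the target's
+ // bzero entry point when one is registered for zero-value fills.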
+
+ // Otherwise have the target-independent code call memset.
+ return SDValue();
+ }
+
+ uint64_t SizeVal = ConstantSize->getZExtValue();
+ SDValue InFlag;
+ EVT AVT;
+ SDValue Count;
+ ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);
+ unsigned BytesLeft = 0;
+ if (ValC) {
+ unsigned ValReg;
+ uint64_t Val = ValC->getZExtValue() & 255;
+
+ // If the value is a constant, then we can potentially use larger sets.
+ if (Alignment > Align(2)) {
+ // DWORD aligned
+ AVT = MVT::i32;
+ ValReg = X86::EAX;
+ Val = (Val << 8) | Val;
+ Val = (Val << 16) | Val;
+ if (Subtarget.is64Bit() && Alignment > Align(8)) { // QWORD aligned
+ AVT = MVT::i64;
+ ValReg = X86::RAX;
+ Val = (Val << 32) | Val;
+ }
+ } else if (Alignment == Align(2)) {
+ // WORD aligned
+ AVT = MVT::i16;
+ ValReg = X86::AX;
+ Val = (Val << 8) | Val;
+ } else {
+ // Byte aligned
+ AVT = MVT::i8;
+ ValReg = X86::AL;
+ Count = DAG.getIntPtrConstant(SizeVal, dl);
+ }
+
+ if (AVT.bitsGT(MVT::i8)) {
+ unsigned UBytes = AVT.getSizeInBits() / 8;
+ Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
+ BytesLeft = SizeVal % UBytes;
+ }
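+ // Worked example (illustrative): a 13-byte memset of 0xAB with 4-byte
+ // alignment on x86-64 picks AVT = i32 and EAX, replicates the byte to
+ // 0xABABABAB, and ends up with Count = 3 REP STOS iterations and
+ // BytesLeft = 1, which the trailing getMemset call below finishes off.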
+
+ Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
+ InFlag);
+ InFlag = Chain.getValue(1);
+ } else {
+ AVT = MVT::i8;
+ Count = DAG.getIntPtrConstant(SizeVal, dl);
+ Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Val, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ bool Use64BitRegs = Subtarget.isTarget64BitLP64();
+ Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
+ Count, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
+ Dst, InFlag);
+ InFlag = Chain.getValue(1);
+
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
+ Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
+
+ if (BytesLeft) {
+ // Handle the last 1 - 7 bytes.
+ unsigned Offset = SizeVal - BytesLeft;
+ EVT AddrVT = Dst.getValueType();
+ EVT SizeVT = Size.getValueType();
+
+ Chain =
+ DAG.getMemset(Chain, dl,
+ DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
+ DAG.getConstant(Offset, dl, AddrVT)),
+ Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
+ isVolatile, false, DstPtrInfo.getWithOffset(Offset));
+ }
+
+ // TODO: Use a TokenFactor, as in memcpy, instead of a single chain.
+ return Chain;
+}
+
+/// Emit a single REP MOVS{B,W,D,Q} instruction.
+static SDValue emitRepmovs(const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ const SDLoc &dl, SDValue Chain, SDValue Dst,
+ SDValue Src, SDValue Size, MVT AVT) {
+ const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
+ const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
+ const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;
+ const unsigned SI = Use64BitRegs ? X86::RSI : X86::ESI;
+
+ SDValue InFlag;
+ Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, SI, Src, InFlag);
+ InFlag = Chain.getValue(1);
+
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Ops[] = {Chain, DAG.getValueType(AVT), InFlag};
+ return DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops);
+}
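+// Illustrative register setup from the helper above: RCX/ECX receives the
+// number of AVT-sized elements to copy (plain bytes for MOVSB), RDI/EDI the
+// destination and RSI/ESI the source; the glued copies keep these fixed
+// registers live into the REP_MOVS node.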
+
+/// Emit a single REP MOVSB instruction for a particular constant size.
+static SDValue emitRepmovsB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ const SDLoc &dl, SDValue Chain, SDValue Dst,
+ SDValue Src, uint64_t Size) {
+ return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
+ DAG.getIntPtrConstant(Size, dl), MVT::i8);
+}
+
+/// Returns the best type to use with repmovs depending on alignment.
+static MVT getOptimalRepmovsType(const X86Subtarget &Subtarget,
+ uint64_t Align) {
+ assert((Align != 0) && "Align is normalized");
+ assert(isPowerOf2_64(Align) && "Align is a power of 2");
+ switch (Align) {
+ case 1:
+ return MVT::i8;
+ case 2:
+ return MVT::i16;
+ case 4:
+ return MVT::i32;
+ default:
+ return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
+ }
+}
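+// Illustrative mapping for the helper above: alignment 1 -> REP MOVSB (i8),
+// 2 -> REP MOVSW (i16), 4 -> REP MOVSD (i32), and 8 or larger -> REP MOVSQ
+// (i64) on 64-bit targets (i32 otherwise).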
+
+/// Returns a REP MOVS instruction, possibly with a few load/stores to implement
+/// a constant size memory copy. In some cases where we know REP MOVS is
+/// inefficient we return an empty SDValue so the calling code can either
+/// generate a load/store sequence or call the runtime memcpy function.
+static SDValue emitConstantSizeRepmov(
+ SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, EVT SizeVT,
+ unsigned Align, bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) {
+
+ /// TODO: Revisit the next check: big copies with ERMSB on march >= haswell
+ /// are very efficient.
+ if (!AlwaysInline && Size > Subtarget.getMaxInlineSizeThreshold())
+ return SDValue();
+
+ /// If we have enhanced repmovs (ERMSB), use it.
+ if (Subtarget.hasERMSB())
+ return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
+
+ assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
+ /// We assume runtime memcpy will do a better job for unaligned copies when
+ /// ERMS is not present.
+ if (!AlwaysInline && (Align & 3) != 0)
+ return SDValue();
+
+ const MVT BlockType = getOptimalRepmovsType(Subtarget, Align);
+ const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
+ const uint64_t BlockCount = Size / BlockBytes;
+ const uint64_t BytesLeft = Size % BlockBytes;
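+ // Worked example (illustrative): Size = 259 with Align = 4 and no ERMSB
+ // gives BlockType = i32, BlockCount = 64 and BytesLeft = 3, i.e. 64 REP
+ // MOVSD iterations followed by a 3-byte inline copy of the tail at offset
+ // 256.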
+ SDValue RepMovs =
+ emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
+ DAG.getIntPtrConstant(BlockCount, dl), BlockType);
+
+ /// RepMov can process the whole length.
+ if (BytesLeft == 0)
+ return RepMovs;
+
+ assert(BytesLeft && "We have leftover at this point");
+
+ /// When optimizing for size we use repmovsb even if it is less efficient, so
+ /// we avoid the extra loads/stores for the leftover bytes.
+ if (DAG.getMachineFunction().getFunction().hasMinSize())
+ return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
+
+ // Handle the last 1 - 7 bytes.
+ SmallVector<SDValue, 4> Results;
+ Results.push_back(RepMovs);
+ unsigned Offset = Size - BytesLeft;
+ EVT DstVT = Dst.getValueType();
+ EVT SrcVT = Src.getValueType();
+ Results.push_back(DAG.getMemcpy(
+ Chain, dl,
+ DAG.getNode(ISD::ADD, dl, DstVT, Dst, DAG.getConstant(Offset, dl, DstVT)),
+ DAG.getNode(ISD::ADD, dl, SrcVT, Src, DAG.getConstant(Offset, dl, SrcVT)),
+ DAG.getConstant(BytesLeft, dl, SizeVT), llvm::Align(Align), isVolatile,
+ /*AlwaysInline*/ true, /*isTailCall*/ false,
+ DstPtrInfo.getWithOffset(Offset), SrcPtrInfo.getWithOffset(Offset)));
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
+}
+
+SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+ // If either pointer is in a segment-relative address space, use the default
+ // lowering.
+ if (DstPtrInfo.getAddrSpace() >= 256 || SrcPtrInfo.getAddrSpace() >= 256)
+ return SDValue();
+
+ // If the base registers conflict with our physical registers, use the default
+ // lowering.
+ const MCPhysReg ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI,
+ X86::ECX, X86::ESI, X86::EDI};
+ if (isBaseRegConflictPossible(DAG, ClobberSet))
+ return SDValue();
+
+ const X86Subtarget &Subtarget =
+ DAG.getMachineFunction().getSubtarget<X86Subtarget>();
+
+ // If enabled and available, use fast short rep mov.
+ if (UseFSRMForMemcpy && Subtarget.hasFSRM())
+ return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8);
+
+ /// Handle constant sizes.
+ if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
+ return emitConstantSizeRepmov(
+ DAG, Subtarget, dl, Chain, Dst, Src, ConstantSize->getZExtValue(),
+ Size.getValueType(), Alignment.value(), isVolatile, AlwaysInline,
+ DstPtrInfo, SrcPtrInfo);
+
+ return SDValue();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.h
new file mode 100644
index 000000000000..dac62973636c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.h
@@ -0,0 +1,45 @@
+//===-- X86SelectionDAGInfo.h - X86 SelectionDAG Info -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 subclass for SelectionDAGTargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86SELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_X86_X86SELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+class X86SelectionDAGInfo : public SelectionDAGTargetInfo {
+ /// Returns true if it is possible for the base register to conflict with the
+ /// given set of clobbers for a memory intrinsic.
+ bool isBaseRegConflictPossible(SelectionDAG &DAG,
+ ArrayRef<MCPhysReg> ClobberSet) const;
+
+public:
+ explicit X86SelectionDAGInfo() = default;
+
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, Align Alignment,
+ bool isVolatile,
+ MachinePointerInfo DstPtrInfo) const override;
+
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, Align Alignment,
+ bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
new file mode 100644
index 000000000000..14a3fea240e7
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -0,0 +1,296 @@
+//===-- X86ShuffleDecodeConstantPool.cpp - X86 shuffle decode -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics using
+// constants from the constant pool.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ShuffleDecodeConstantPool.h"
+#include "MCTargetDesc/X86ShuffleDecode.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Constants.h"
+
+//===----------------------------------------------------------------------===//
+// Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits,
+ APInt &UndefElts,
+ SmallVectorImpl<uint64_t> &RawMask) {
+ // It is not an error for shuffle masks to not be a vector of
+ // MaskEltSizeInBits because the constant pool uniques constants by their
+ // bit representation.
+ // e.g. the following take up the same space in the constant pool:
+ // i128 -170141183420855150465331762880109871104
+ //
+ // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160>
+ //
+ // <4 x i32> <i32 -2147483648, i32 -2147483648,
+ // i32 -2147483648, i32 -2147483648>
+ auto *CstTy = dyn_cast<FixedVectorType>(C->getType());
+ if (!CstTy)
+ return false;
+
+ Type *CstEltTy = CstTy->getElementType();
+ if (!CstEltTy->isIntegerTy())
+ return false;
+
+ unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
+ unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
+ unsigned NumCstElts = CstTy->getNumElements();
+
+ assert((CstSizeInBits % MaskEltSizeInBits) == 0 &&
+ "Unaligned shuffle mask size");
+
+ unsigned NumMaskElts = CstSizeInBits / MaskEltSizeInBits;
+ UndefElts = APInt(NumMaskElts, 0);
+ RawMask.resize(NumMaskElts, 0);
+
+ // Fast path - if the constants match the mask size then copy direct.
+ if (MaskEltSizeInBits == CstEltSizeInBits) {
+ assert(NumCstElts == NumMaskElts && "Unaligned shuffle mask size");
+ for (unsigned i = 0; i != NumMaskElts; ++i) {
+ Constant *COp = C->getAggregateElement(i);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
+ return false;
+
+ if (isa<UndefValue>(COp)) {
+ UndefElts.setBit(i);
+ RawMask[i] = 0;
+ continue;
+ }
+
+ auto *Elt = cast<ConstantInt>(COp);
+ RawMask[i] = Elt->getValue().getZExtValue();
+ }
+ return true;
+ }
+
+ // Extract all the undef/constant element data and pack into single bitsets.
+ APInt UndefBits(CstSizeInBits, 0);
+ APInt MaskBits(CstSizeInBits, 0);
+ for (unsigned i = 0; i != NumCstElts; ++i) {
+ Constant *COp = C->getAggregateElement(i);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
+ return false;
+
+ unsigned BitOffset = i * CstEltSizeInBits;
+
+ if (isa<UndefValue>(COp)) {
+ UndefBits.setBits(BitOffset, BitOffset + CstEltSizeInBits);
+ continue;
+ }
+
+ MaskBits.insertBits(cast<ConstantInt>(COp)->getValue(), BitOffset);
+ }
+
+ // Now extract the undef/constant bit data into the raw shuffle masks.
+ for (unsigned i = 0; i != NumMaskElts; ++i) {
+ unsigned BitOffset = i * MaskEltSizeInBits;
+ APInt EltUndef = UndefBits.extractBits(MaskEltSizeInBits, BitOffset);
+
+ // Only treat the element as UNDEF if all bits are UNDEF, otherwise
+ // treat it as zero.
+ if (EltUndef.isAllOnesValue()) {
+ UndefElts.setBit(i);
+ RawMask[i] = 0;
+ continue;
+ }
+
+ APInt EltBits = MaskBits.extractBits(MaskEltSizeInBits, BitOffset);
+ RawMask[i] = EltBits.getZExtValue();
+ }
+
+ return true;
+}
+
+void DecodePSHUFBMask(const Constant *C, unsigned Width,
+ SmallVectorImpl<int> &ShuffleMask) {
+ assert((Width == 128 || Width == 256 || Width == 512) &&
+ C->getType()->getPrimitiveSizeInBits() >= Width &&
+ "Unexpected vector size.");
+
+ // The shuffle mask requires a byte vector.
+ APInt UndefElts;
+ SmallVector<uint64_t, 64> RawMask;
+ if (!extractConstantMask(C, 8, UndefElts, RawMask))
+ return;
+
+ unsigned NumElts = Width / 8;
+ assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
+ "Unexpected number of vector elements.");
+
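+ // Illustrative decode: a mask byte of 0x84 has bit 7 set and becomes
+ // SM_SentinelZero, while 0x13 at position i = 17 of a 256-bit shuffle
+ // selects Base (16) + 3 = 19, i.e. byte 3 of the upper 128-bit lane.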
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (UndefElts[i]) {
+ ShuffleMask.push_back(SM_SentinelUndef);
+ continue;
+ }
+
+ uint64_t Element = RawMask[i];
+ // If the high bit (7) of the byte is set, the element is zeroed.
+ if (Element & (1 << 7))
+ ShuffleMask.push_back(SM_SentinelZero);
+ else {
+ // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte
+ // lane of the vector we're inside.
+ unsigned Base = i & ~0xf;
+
+ // Only the least significant 4 bits of the byte are used.
+ int Index = Base + (Element & 0xf);
+ ShuffleMask.push_back(Index);
+ }
+ }
+}
+
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width,
+ SmallVectorImpl<int> &ShuffleMask) {
+ assert((Width == 128 || Width == 256 || Width == 512) &&
+ C->getType()->getPrimitiveSizeInBits() >= Width &&
+ "Unexpected vector size.");
+ assert((ElSize == 32 || ElSize == 64) && "Unexpected vector element size.");
+
+ // The shuffle mask requires elements the same size as the target.
+ APInt UndefElts;
+ SmallVector<uint64_t, 16> RawMask;
+ if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
+ return;
+
+ unsigned NumElts = Width / ElSize;
+ unsigned NumEltsPerLane = 128 / ElSize;
+ assert((NumElts == 2 || NumElts == 4 || NumElts == 8 || NumElts == 16) &&
+ "Unexpected number of vector elements.");
+
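+ // Illustrative decode for ElSize = 32 and Width = 256 (NumEltsPerLane = 4):
+ // a raw mask value of 2 at position i = 5 yields Index = (5 & ~3) + 2 = 6,
+ // so VPERMILPS only ever selects elements within its own 128-bit lane.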
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (UndefElts[i]) {
+ ShuffleMask.push_back(SM_SentinelUndef);
+ continue;
+ }
+
+ int Index = i & ~(NumEltsPerLane - 1);
+ uint64_t Element = RawMask[i];
+ if (ElSize == 64)
+ Index += (Element >> 1) & 0x1;
+ else
+ Index += Element & 0x3;
+
+ ShuffleMask.push_back(Index);
+ }
+}
+
+void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
+ unsigned Width, SmallVectorImpl<int> &ShuffleMask) {
+ Type *MaskTy = C->getType();
+ unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+ (void)MaskTySize;
+ assert((MaskTySize == 128 || MaskTySize == 256) && Width >= MaskTySize &&
+ "Unexpected vector size.");
+
+ // The shuffle mask requires elements the same size as the target.
+ APInt UndefElts;
+ SmallVector<uint64_t, 8> RawMask;
+ if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
+ return;
+
+ unsigned NumElts = Width / ElSize;
+ unsigned NumEltsPerLane = 128 / ElSize;
+ assert((NumElts == 2 || NumElts == 4 || NumElts == 8) &&
+ "Unexpected number of vector elements.");
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (UndefElts[i]) {
+ ShuffleMask.push_back(SM_SentinelUndef);
+ continue;
+ }
+
+ // VPERMIL2 Operation.
+ // Bits[3] - Match Bit.
+ // Bits[2:1] - (Per Lane) PD Shuffle Mask.
+ // Bits[2:0] - (Per Lane) PS Shuffle Mask.
+ uint64_t Selector = RawMask[i];
+ unsigned MatchBit = (Selector >> 3) & 0x1;
+
+ // M2Z[0:1] MatchBit
+ // 0Xb X Source selected by Selector index.
+ // 10b 0 Source selected by Selector index.
+ // 10b 1 Zero.
+ // 11b 0 Zero.
+ // 11b 1 Source selected by Selector index.
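+ // For example (illustrative): with M2Z = 10b a selector whose match bit is
+ // set zeroes the element, while a selector of 0x6 for a 32-bit element at
+ // i = 2 decodes to (2 & ~3) + (0x6 & 0x3) + NumElts, i.e. element 2 of the
+ // second source operand.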
+ if ((M2Z & 0x2) != 0u && MatchBit != (M2Z & 0x1)) {
+ ShuffleMask.push_back(SM_SentinelZero);
+ continue;
+ }
+
+ int Index = i & ~(NumEltsPerLane - 1);
+ if (ElSize == 64)
+ Index += (Selector >> 1) & 0x1;
+ else
+ Index += Selector & 0x3;
+
+ int Src = (Selector >> 2) & 0x1;
+ Index += Src * NumElts;
+ ShuffleMask.push_back(Index);
+ }
+}
+
+void DecodeVPPERMMask(const Constant *C, unsigned Width,
+ SmallVectorImpl<int> &ShuffleMask) {
+ Type *MaskTy = C->getType();
+ unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+ (void)MaskTySize;
+ assert(Width == 128 && Width >= MaskTySize && "Unexpected vector size.");
+
+ // The shuffle mask requires a byte vector.
+ APInt UndefElts;
+ SmallVector<uint64_t, 16> RawMask;
+ if (!extractConstantMask(C, 8, UndefElts, RawMask))
+ return;
+
+ unsigned NumElts = Width / 8;
+ assert(NumElts == 16 && "Unexpected number of vector elements.");
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (UndefElts[i]) {
+ ShuffleMask.push_back(SM_SentinelUndef);
+ continue;
+ }
+
+ // VPPERM Operation
+ // Bits[4:0] - Byte Index (0 - 31)
+ // Bits[7:5] - Permute Operation
+ //
+ // Permute Operation:
+ // 0 - Source byte (no logical operation).
+ // 1 - Invert source byte.
+ // 2 - Bit reverse of source byte.
+ // 3 - Bit reverse of inverted source byte.
+ // 4 - 00h (zero - fill).
+ // 5 - FFh (ones - fill).
+ // 6 - Most significant bit of source byte replicated in all bit positions.
+ // 7 - Invert most significant bit of source byte and replicate in all bit
+ // positions.
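+ // Illustrative decode: 0x1C selects byte 28 (byte 12 of the second source),
+ // 0x85 carries permute op 4 and becomes SM_SentinelZero, and any other
+ // non-zero permute op abandons the decode by clearing the mask below.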
+ uint64_t Element = RawMask[i];
+ uint64_t Index = Element & 0x1F;
+ uint64_t PermuteOp = (Element >> 5) & 0x7;
+
+ if (PermuteOp == 4) {
+ ShuffleMask.push_back(SM_SentinelZero);
+ continue;
+ }
+ if (PermuteOp != 0) {
+ ShuffleMask.clear();
+ return;
+ }
+ ShuffleMask.push_back((int)Index);
+ }
+}
+
+} // namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h
new file mode 100644
index 000000000000..77236f6aac9f
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h
@@ -0,0 +1,43 @@
+//===-- X86ShuffleDecodeConstantPool.h - X86 shuffle decode -----*-C++-*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics using
+// constants from the constant pool.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H
+#define LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H
+
+//===----------------------------------------------------------------------===//
+// Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+class Constant;
+template <typename T> class SmallVectorImpl;
+
+/// Decode a PSHUFB mask from an IR-level vector constant.
+void DecodePSHUFBMask(const Constant *C, unsigned Width,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERMILP variable mask from an IR-level vector constant.
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERMILP2 variable mask from an IR-level vector constant.
+void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
+ unsigned Width, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPPERM variable mask from an IR-level vector constant.
+void DecodeVPPERMMask(const Constant *C, unsigned Width,
+ SmallVectorImpl<int> &ShuffleMask);
+
+} // llvm namespace
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp
new file mode 100644
index 000000000000..d57871130b0c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp
@@ -0,0 +1,182 @@
+//===-- X86SpeculativeExecutionSideEffectSuppression.cpp ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file contains the X86 implementation of the speculative execution side
+/// effect suppression mitigation.
+///
+/// This must be used with the -mlvi-cfi flag in order to mitigate indirect
+/// branches and returns.
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-seses"
+
+STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");
+
+static cl::opt<bool> EnableSpeculativeExecutionSideEffectSuppression(
+ "x86-seses-enable-without-lvi-cfi",
+ cl::desc("Force enable speculative execution side effect suppression. "
+ "(Note: User must pass -mlvi-cfi in order to mitigate indirect "
+ "branches and returns.)"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> OneLFENCEPerBasicBlock(
+ "x86-seses-one-lfence-per-bb",
+ cl::desc(
+ "Omit all lfences other than the first to be placed in a basic block."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> OnlyLFENCENonConst(
+ "x86-seses-only-lfence-non-const",
+ cl::desc("Only lfence before groups of terminators where at least one "
+ "branch instruction has an input to the addressing mode that is a "
+ "register other than %rip."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+ OmitBranchLFENCEs("x86-seses-omit-branch-lfences",
+ cl::desc("Omit all lfences before branch instructions."),
+ cl::init(false), cl::Hidden);
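+
+// These cl::opts are backend flags (illustrative usage): they can be passed to
+// llc directly, e.g. `llc -x86-seses-omit-branch-lfences`, or forwarded from
+// clang via `-mllvm -x86-seses-omit-branch-lfences`.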
+
+namespace {
+
+class X86SpeculativeExecutionSideEffectSuppression
+ : public MachineFunctionPass {
+public:
+ X86SpeculativeExecutionSideEffectSuppression() : MachineFunctionPass(ID) {}
+
+ static char ID;
+ StringRef getPassName() const override {
+ return "X86 Speculative Execution Side Effect Suppression";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // namespace
+
+char X86SpeculativeExecutionSideEffectSuppression::ID = 0;
+
+// This function returns whether the passed instruction uses a memory addressing
+// mode that is constant. We treat all memory addressing modes that read
+// from a register that is not %rip as non-constant. Note that the use
+// of the EFLAGS register results in an addressing mode being considered
+// non-constant, therefore all JCC instructions will return false from this
+// function since one of their operands will always be the EFLAGS register.
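+// For example (illustrative), a direct `jmp` to a basic block has no register
+// inputs and is treated as constant, while an indirect `jmp *(%rax)` reads
+// %rax and is not.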
+static bool hasConstantAddressingMode(const MachineInstr &MI) {
+ for (const MachineOperand &MO : MI.uses())
+ if (MO.isReg() && X86::RIP != MO.getReg())
+ return false;
+ return true;
+}
+
+bool X86SpeculativeExecutionSideEffectSuppression::runOnMachineFunction(
+ MachineFunction &MF) {
+
+ const auto &OptLevel = MF.getTarget().getOptLevel();
+ const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+
+ // Check whether SESES needs to run as the fallback for LVI at O0, whether the
+ // user explicitly passed an SESES flag, or whether the SESES target feature
+ // was set.
+ if (!EnableSpeculativeExecutionSideEffectSuppression &&
+ !(Subtarget.useLVILoadHardening() && OptLevel == CodeGenOpt::None) &&
+ !Subtarget.useSpeculativeExecutionSideEffectSuppression())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
+ << " **********\n");
+ bool Modified = false;
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
+ for (MachineBasicBlock &MBB : MF) {
+ MachineInstr *FirstTerminator = nullptr;
+ // Keep track of whether the previous instruction was an LFENCE to avoid
+ // adding redundant LFENCEs.
+ bool PrevInstIsLFENCE = false;
+ for (auto &MI : MBB) {
+
+ if (MI.getOpcode() == X86::LFENCE) {
+ PrevInstIsLFENCE = true;
+ continue;
+ }
+ // We want to put an LFENCE before any instruction that
+ // may load or store. This LFENCE is intended to avoid leaking any secret
+ // data due to a given load or store. This results in closing the cache
+ // and memory timing side channels. We will treat terminators that load
+ // or store separately.
+ if (MI.mayLoadOrStore() && !MI.isTerminator()) {
+ if (!PrevInstIsLFENCE) {
+ BuildMI(MBB, MI, DebugLoc(), TII->get(X86::LFENCE));
+ NumLFENCEsInserted++;
+ Modified = true;
+ }
+ if (OneLFENCEPerBasicBlock)
+ break;
+ }
+ // The following section will be LFENCEing before groups of terminators
+ // that include branches. This will close the branch prediction side
+ // channels since we will prevent code executing after misspeculation as
+ // a result of the LFENCEs placed with this logic.
+
+ // Keep track of the first terminator in a basic block since if we need
+ // to LFENCE the terminators in this basic block we must add the
+ // instruction before the first terminator in the basic block (as
+ // opposed to before the terminator that indicates an LFENCE is
+ // required). An example of why this is necessary is that the
+ // X86InstrInfo::analyzeBranch method assumes all terminators are grouped
+ // together and terminates its analysis once the first non-terminator
+ // instruction is found.
+ if (MI.isTerminator() && FirstTerminator == nullptr)
+ FirstTerminator = &MI;
+
+ // Look for branch instructions that will require an LFENCE to be put
+ // before this basic block's terminators.
+ if (!MI.isBranch() || OmitBranchLFENCEs) {
+ // This isn't a branch or we're not putting LFENCEs before branches.
+ PrevInstIsLFENCE = false;
+ continue;
+ }
+
+ if (OnlyLFENCENonConst && hasConstantAddressingMode(MI)) {
+ // This is a branch, but it only has constant addressing mode and we're
+ // not adding LFENCEs before such branches.
+ PrevInstIsLFENCE = false;
+ continue;
+ }
+
+ // This branch requires adding an LFENCE.
+ if (!PrevInstIsLFENCE) {
+ assert(FirstTerminator && "Unknown terminator instruction");
+ BuildMI(MBB, FirstTerminator, DebugLoc(), TII->get(X86::LFENCE));
+ NumLFENCEsInserted++;
+ Modified = true;
+ }
+ break;
+ }
+ }
+
+ return Modified;
+}
+
+FunctionPass *llvm::createX86SpeculativeExecutionSideEffectSuppression() {
+ return new X86SpeculativeExecutionSideEffectSuppression();
+}
+
+INITIALIZE_PASS(X86SpeculativeExecutionSideEffectSuppression, "x86-seses",
+ "X86 Speculative Execution Side Effect Suppression", false,
+ false)
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
new file mode 100644
index 000000000000..aa73d4bce65a
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -0,0 +1,2278 @@
+//====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// Provide a pass which mitigates speculative execution attacks which operate
+/// by speculating incorrectly past some predicate (a type check, bounds check,
+/// or other condition) to reach a load with invalid inputs and leak the data
+/// accessed by that load using a side channel out of the speculative domain.
+///
+/// For details on the attacks, see the first variant in both the Project Zero
+/// writeup and the Spectre paper:
+/// https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
+/// https://spectreattack.com/spectre.pdf
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAUpdater.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+#define PASS_KEY "x86-slh"
+#define DEBUG_TYPE PASS_KEY
+
+STATISTIC(NumCondBranchesTraced, "Number of conditional branches traced");
+STATISTIC(NumBranchesUntraced, "Number of branches unable to trace");
+STATISTIC(NumAddrRegsHardened,
+ "Number of address mode used registers hardaned");
+STATISTIC(NumPostLoadRegsHardened,
+ "Number of post-load register values hardened");
+STATISTIC(NumCallsOrJumpsHardened,
+ "Number of calls or jumps requiring extra hardening");
+STATISTIC(NumInstsInserted, "Number of instructions inserted");
+STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");
+
+static cl::opt<bool> EnableSpeculativeLoadHardening(
+ "x86-speculative-load-hardening",
+ cl::desc("Force enable speculative load hardening"), cl::init(false),
+ cl::Hidden);
+
+static cl::opt<bool> HardenEdgesWithLFENCE(
+ PASS_KEY "-lfence",
+ cl::desc(
+ "Use LFENCE along each conditional edge to harden against speculative "
+ "loads rather than conditional movs and poisoned pointers."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> EnablePostLoadHardening(
+ PASS_KEY "-post-load",
+ cl::desc("Harden the value loaded *after* it is loaded by "
+ "flushing the loaded bits to 1. This is hard to do "
+ "in general but can be done easily for GPRs."),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool> FenceCallAndRet(
+ PASS_KEY "-fence-call-and-ret",
+ cl::desc("Use a full speculation fence to harden both call and ret edges "
+ "rather than a lighter weight mitigation."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> HardenInterprocedurally(
+ PASS_KEY "-ip",
+ cl::desc("Harden interprocedurally by passing our state in and out of "
+ "functions in the high bits of the stack pointer."),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+ HardenLoads(PASS_KEY "-loads",
+ cl::desc("Sanitize loads from memory. When disable, no "
+ "significant security is provided."),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool> HardenIndirectCallsAndJumps(
+ PASS_KEY "-indirect",
+ cl::desc("Harden indirect calls and jumps against using speculatively "
+ "stored attacker controlled addresses. This is designed to "
+ "mitigate Spectre v1.2 style attacks."),
+ cl::init(true), cl::Hidden);
+
+namespace {
+
+class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
+public:
+ X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) { }
+
+ StringRef getPassName() const override {
+ return "X86 speculative load hardening";
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ /// Pass identification, replacement for typeid.
+ static char ID;
+
+private:
+ /// The information about a block's conditional terminators needed to trace
+ /// our predicate state through the exiting edges.
+ struct BlockCondInfo {
+ MachineBasicBlock *MBB;
+
+ // We mostly have one conditional branch, and in extremely rare cases have
+ // two. Three and more are so rare as to be unimportant for compile time.
+ SmallVector<MachineInstr *, 2> CondBrs;
+
+ MachineInstr *UncondBr;
+ };
+
+ /// Manages the predicate state traced through the program.
+ struct PredState {
+ unsigned InitialReg = 0;
+ unsigned PoisonReg = 0;
+
+ const TargetRegisterClass *RC;
+ MachineSSAUpdater SSA;
+
+ PredState(MachineFunction &MF, const TargetRegisterClass *RC)
+ : RC(RC), SSA(MF) {}
+ };
+
+ const X86Subtarget *Subtarget = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ const X86InstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+
+ Optional<PredState> PS;
+
+ void hardenEdgesWithLFENCE(MachineFunction &MF);
+
+ SmallVector<BlockCondInfo, 16> collectBlockCondInfo(MachineFunction &MF);
+
+ SmallVector<MachineInstr *, 16>
+ tracePredStateThroughCFG(MachineFunction &MF, ArrayRef<BlockCondInfo> Infos);
+
+ void unfoldCallAndJumpLoads(MachineFunction &MF);
+
+ SmallVector<MachineInstr *, 16>
+ tracePredStateThroughIndirectBranches(MachineFunction &MF);
+
+ void tracePredStateThroughBlocksAndHarden(MachineFunction &MF);
+
+ unsigned saveEFLAGS(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt, DebugLoc Loc);
+ void restoreEFLAGS(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
+ Register Reg);
+
+ void mergePredStateIntoSP(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
+ unsigned PredStateReg);
+ unsigned extractPredStateFromSP(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt,
+ DebugLoc Loc);
+
+ void
+ hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO,
+ MachineOperand &IndexMO,
+ SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
+ MachineInstr *
+ sinkPostLoadHardenedInst(MachineInstr &MI,
+ SmallPtrSetImpl<MachineInstr *> &HardenedInstrs);
+ bool canHardenRegister(Register Reg);
+ unsigned hardenValueInRegister(Register Reg, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt,
+ DebugLoc Loc);
+ unsigned hardenPostLoad(MachineInstr &MI);
+ void hardenReturnInstr(MachineInstr &MI);
+ void tracePredStateThroughCall(MachineInstr &MI);
+ void hardenIndirectCallOrJumpInstr(
+ MachineInstr &MI,
+ SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
+};
+
+} // end anonymous namespace
+
+char X86SpeculativeLoadHardeningPass::ID = 0;
+
+void X86SpeculativeLoadHardeningPass::getAnalysisUsage(
+ AnalysisUsage &AU) const {
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+static MachineBasicBlock &splitEdge(MachineBasicBlock &MBB,
+ MachineBasicBlock &Succ, int SuccCount,
+ MachineInstr *Br, MachineInstr *&UncondBr,
+ const X86InstrInfo &TII) {
+ assert(!Succ.isEHPad() && "Shouldn't get edges to EH pads!");
+
+ MachineFunction &MF = *MBB.getParent();
+
+ MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
+
+ // We have to insert the new block immediately after the current one as we
+ // don't know what layout-successor relationships the successor has and we
+ // may not be able to (and generally don't want to) try to fix those up.
+ MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);
+
+ // Update the branch instruction if necessary.
+ if (Br) {
+ assert(Br->getOperand(0).getMBB() == &Succ &&
+ "Didn't start with the right target!");
+ Br->getOperand(0).setMBB(&NewMBB);
+
+ // If this successor was reached through a branch rather than fallthrough,
+ // we might have *broken* fallthrough and so need to inject a new
+ // unconditional branch.
+ if (!UncondBr) {
+ MachineBasicBlock &OldLayoutSucc =
+ *std::next(MachineFunction::iterator(&NewMBB));
+ assert(MBB.isSuccessor(&OldLayoutSucc) &&
+ "Without an unconditional branch, the old layout successor should "
+ "be an actual successor!");
+ auto BrBuilder =
+ BuildMI(&MBB, DebugLoc(), TII.get(X86::JMP_1)).addMBB(&OldLayoutSucc);
+ // Update the unconditional branch now that we've added one.
+ UncondBr = &*BrBuilder;
+ }
+
+ // Insert unconditional "jump Succ" instruction in the new block if
+ // necessary.
+ if (!NewMBB.isLayoutSuccessor(&Succ)) {
+ SmallVector<MachineOperand, 4> Cond;
+ TII.insertBranch(NewMBB, &Succ, nullptr, Cond, Br->getDebugLoc());
+ }
+ } else {
+ assert(!UncondBr &&
+ "Cannot have a branchless successor and an unconditional branch!");
+ assert(NewMBB.isLayoutSuccessor(&Succ) &&
+ "A non-branch successor must have been a layout successor before "
+ "and now is a layout successor of the new block.");
+ }
+
+ // If this is the only edge to the successor, we can just replace it in the
+ // CFG. Otherwise we need to add a new entry in the CFG for the new
+ // successor.
+ if (SuccCount == 1) {
+ MBB.replaceSuccessor(&Succ, &NewMBB);
+ } else {
+ MBB.splitSuccessor(&Succ, &NewMBB);
+ }
+
+ // Hook up the edge from the new basic block to the old successor in the CFG.
+ NewMBB.addSuccessor(&Succ);
+
+ // Fix PHI nodes in Succ so they refer to NewMBB instead of MBB.
+ for (MachineInstr &MI : Succ) {
+ if (!MI.isPHI())
+ break;
+ for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
+ OpIdx += 2) {
+ MachineOperand &OpV = MI.getOperand(OpIdx);
+ MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
+ assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
+ if (OpMBB.getMBB() != &MBB)
+ continue;
+
+ // If this is the last edge to the successor, just replace MBB in the PHI.
+ if (SuccCount == 1) {
+ OpMBB.setMBB(&NewMBB);
+ break;
+ }
+
+ // Otherwise, append a new pair of operands for the new incoming edge.
+ MI.addOperand(MF, OpV);
+ MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
+ break;
+ }
+ }
+
+ // Inherit live-ins from the successor
+ for (auto &LI : Succ.liveins())
+ NewMBB.addLiveIn(LI);
+
+ LLVM_DEBUG(dbgs() << " Split edge from '" << MBB.getName() << "' to '"
+ << Succ.getName() << "'.\n");
+ return NewMBB;
+}
+
+/// Remove duplicate PHI operands to leave the PHI in a canonical and
+/// predictable form.
+///
+/// FIXME: It's really frustrating that we have to do this, but SSA-form in MIR
+/// isn't what you might expect. We may have multiple entries in PHI nodes for
+/// a single predecessor. This makes CFG-updating extremely complex, so here we
+/// simplify all PHI nodes to a model even simpler than the IR's model: exactly
+/// one entry per predecessor, regardless of how many edges there are.
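+///
+/// Illustrative before/after, assuming two CFG edges from %bb.1:
+///   %2:gr64 = PHI %0, %bb.1, %0, %bb.1, %1, %bb.2   ; before
+///   %2:gr64 = PHI %0, %bb.1, %1, %bb.2               ; after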
+static void canonicalizePHIOperands(MachineFunction &MF) {
+ SmallPtrSet<MachineBasicBlock *, 4> Preds;
+ SmallVector<int, 4> DupIndices;
+ for (auto &MBB : MF)
+ for (auto &MI : MBB) {
+ if (!MI.isPHI())
+ break;
+
+ // First we scan the operands of the PHI looking for duplicate entries for
+ // a particular predecessor. We retain the operand index of each duplicate
+ // entry found.
+ for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
+ OpIdx += 2)
+ if (!Preds.insert(MI.getOperand(OpIdx + 1).getMBB()).second)
+ DupIndices.push_back(OpIdx);
+
+ // Now walk the duplicate indices, removing both the block and value. Note
+ // that these are stored as a vector making this element-wise removal
+ // potentially quadratic.
+ //
+ // FIXME: It is really frustrating that we have to use a quadratic
+ // removal algorithm here. There should be a better way, but the use-def
+ // updates required make that impossible using the public API.
+ //
+ // Note that we have to process these backwards so that we don't
+ // invalidate other indices with each removal.
+ while (!DupIndices.empty()) {
+ int OpIdx = DupIndices.pop_back_val();
+ // Remove both the block and value operand, again in reverse order to
+ // preserve indices.
+ MI.RemoveOperand(OpIdx + 1);
+ MI.RemoveOperand(OpIdx);
+ }
+
+ Preds.clear();
+ }
+}
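+
+ // For illustration only (names invented): a PHI that lists the same
+ // predecessor twice, such as
+ // ```
+ // %v = PHI [%a, %bb.1], [%a, %bb.1], [%b, %bb.2]
+ // ```
+ // is rewritten to
+ // ```
+ // %v = PHI [%a, %bb.1], [%b, %bb.2]
+ // ```
+ // so that the CFG updates elsewhere in this pass can assume exactly one
+ // entry per predecessor.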
+
+/// Helper to scan a function for loads vulnerable to misspeculation that we
+/// want to harden.
+///
+/// We use this to avoid making changes to functions where there is nothing we
+/// need to do to harden against misspeculation.
+static bool hasVulnerableLoad(MachineFunction &MF) {
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ // Loads within this basic block after an LFENCE are not at risk of
+ // speculatively executing with invalid predicates from prior control
+ // flow. So break out of this block but continue scanning the function.
+ if (MI.getOpcode() == X86::LFENCE)
+ break;
+
+ // Looking for loads only.
+ if (!MI.mayLoad())
+ continue;
+
+ // An MFENCE is modeled as a load but isn't vulnerable to misspeculation.
+ if (MI.getOpcode() == X86::MFENCE)
+ continue;
+
+ // We found a load.
+ return true;
+ }
+ }
+
+ // No loads found.
+ return false;
+}
+
+bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
+ MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
+ << " **********\n");
+
+ // Only run if this pass is forced enabled or we detect the relevant function
+ // attribute requesting SLH.
+ if (!EnableSpeculativeLoadHardening &&
+ !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
+ return false;
+
+ Subtarget = &MF.getSubtarget<X86Subtarget>();
+ MRI = &MF.getRegInfo();
+ TII = Subtarget->getInstrInfo();
+ TRI = Subtarget->getRegisterInfo();
+
+ // FIXME: Support for 32-bit.
+ PS.emplace(MF, &X86::GR64_NOSPRegClass);
+
+ if (MF.begin() == MF.end())
+ // Nothing to do for a degenerate empty function...
+ return false;
+
+ // We support an alternative hardening technique based on a debug flag.
+ if (HardenEdgesWithLFENCE) {
+ hardenEdgesWithLFENCE(MF);
+ return true;
+ }
+
+ // Create a dummy debug loc to use for all the generated code here.
+ DebugLoc Loc;
+
+ MachineBasicBlock &Entry = *MF.begin();
+ auto EntryInsertPt = Entry.SkipPHIsLabelsAndDebug(Entry.begin());
+
+ // Do a quick scan to see if we have any checkable loads.
+ bool HasVulnerableLoad = hasVulnerableLoad(MF);
+
+ // See if we have any conditional branching blocks that we will need to trace
+ // predicate state through.
+ SmallVector<BlockCondInfo, 16> Infos = collectBlockCondInfo(MF);
+
+ // If we have no interesting conditions or loads, nothing to do here.
+ if (!HasVulnerableLoad && Infos.empty())
+ return true;
+
+ // The poison value is required to be an all-ones value for many aspects of
+ // this mitigation.
+ const int PoisonVal = -1;
+ PS->PoisonReg = MRI->createVirtualRegister(PS->RC);
+ BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64ri32), PS->PoisonReg)
+ .addImm(PoisonVal);
+ ++NumInstsInserted;
+
+ // If we have loads being hardened and we've asked for call and ret edges to
+ // get a full fence-based mitigation, inject that fence.
+ if (HasVulnerableLoad && FenceCallAndRet) {
+ // We need to insert an LFENCE at the start of the function to suspend any
+ // incoming misspeculation from the caller. This helps in two ways: the
+ // caller may not have been protected as this code has been, and this code
+ // then need not take any specific action to protect across calls.
+ // FIXME: We could skip this for functions which unconditionally return
+ // a constant.
+ BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::LFENCE));
+ ++NumInstsInserted;
+ ++NumLFENCEsInserted;
+ }
+
+ // If we guarded the entry with an LFENCE and have no conditionals to protect
+ // in blocks, then we're done.
+ if (FenceCallAndRet && Infos.empty())
+ // We may have changed the function's code at this point to insert fences.
+ return true;
+
+ // Set up the initial predicate state in the entry block: either extract it
+ // from the incoming stack pointer or materialize a zeroed register.
+ if (HardenInterprocedurally && !FenceCallAndRet) {
+ // Set up the predicate state by extracting it from the incoming stack
+ // pointer so we pick up any misspeculation in our caller.
+ PS->InitialReg = extractPredStateFromSP(Entry, EntryInsertPt, Loc);
+ } else {
+ // Otherwise, just build the predicate state itself by zeroing a register
+ // as we don't need any initial state.
+ PS->InitialReg = MRI->createVirtualRegister(PS->RC);
+ Register PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+ auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
+ PredStateSubReg);
+ ++NumInstsInserted;
+ MachineOperand *ZeroEFLAGSDefOp =
+ ZeroI->findRegisterDefOperand(X86::EFLAGS);
+ assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
+ "Must have an implicit def of EFLAGS!");
+ ZeroEFLAGSDefOp->setIsDead(true);
+ BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
+ PS->InitialReg)
+ .addImm(0)
+ .addReg(PredStateSubReg)
+ .addImm(X86::sub_32bit);
+ }
+
+ // We're going to need to trace predicate state throughout the function's
+ // CFG. Prepare for this by setting up our initial state of PHIs with unique
+ // predecessor entries and all the initial predicate state.
+ canonicalizePHIOperands(MF);
+
+ // Track the updated values in an SSA updater to rewrite into SSA form at the
+ // end.
+ PS->SSA.Initialize(PS->InitialReg);
+ PS->SSA.AddAvailableValue(&Entry, PS->InitialReg);
+
+ // Trace through the CFG.
+ auto CMovs = tracePredStateThroughCFG(MF, Infos);
+
+ // We may also enter basic blocks in this function via exception handling
+ // control flow. Here, if we are hardening interprocedurally, we need to
+ // re-capture the predicate state from the throwing code. In the Itanium ABI,
+ // the throw will always look like a call to __cxa_throw and will have the
+ // predicate state in the stack pointer, so extract fresh predicate state from
+ // the stack pointer and make it available in SSA.
+ // FIXME: Handle non-itanium ABI EH models.
+ if (HardenInterprocedurally) {
+ for (MachineBasicBlock &MBB : MF) {
+ assert(!MBB.isEHScopeEntry() && "Only Itanium ABI EH supported!");
+ assert(!MBB.isEHFuncletEntry() && "Only Itanium ABI EH supported!");
+ assert(!MBB.isCleanupFuncletEntry() && "Only Itanium ABI EH supported!");
+ if (!MBB.isEHPad())
+ continue;
+ PS->SSA.AddAvailableValue(
+ &MBB,
+ extractPredStateFromSP(MBB, MBB.SkipPHIsAndLabels(MBB.begin()), Loc));
+ }
+ }
+
+ if (HardenIndirectCallsAndJumps) {
+ // If we are going to harden calls and jumps we need to unfold their memory
+ // operands.
+ unfoldCallAndJumpLoads(MF);
+
+ // Then we trace predicate state through the indirect branches.
+ auto IndirectBrCMovs = tracePredStateThroughIndirectBranches(MF);
+ CMovs.append(IndirectBrCMovs.begin(), IndirectBrCMovs.end());
+ }
+
+ // Now that we have the predicate state available at the start of each block
+ // in the CFG, trace it through each block, hardening vulnerable instructions
+ // as we go.
+ tracePredStateThroughBlocksAndHarden(MF);
+
+ // Now rewrite all the uses of the pred state using the SSA updater to insert
+ // PHIs connecting the state between blocks along the CFG edges.
+ for (MachineInstr *CMovI : CMovs)
+ for (MachineOperand &Op : CMovI->operands()) {
+ if (!Op.isReg() || Op.getReg() != PS->InitialReg)
+ continue;
+
+ PS->SSA.RewriteUse(Op);
+ }
+
+ LLVM_DEBUG(dbgs() << "Final speculative load hardened function:\n"; MF.dump();
+ dbgs() << "\n"; MF.verify(this));
+ return true;
+}
+
+/// Implements the naive hardening approach of putting an LFENCE after every
+/// potentially mis-predicted control flow construct.
+///
+/// We include this as an alternative mostly for the purpose of comparison. The
+/// performance impact of this is expected to be extremely severe and not
+/// practical for any real-world users.
+void X86SpeculativeLoadHardeningPass::hardenEdgesWithLFENCE(
+ MachineFunction &MF) {
+ // First, we scan the function looking for blocks that are reached along edges
+ // that we might want to harden.
+ SmallSetVector<MachineBasicBlock *, 8> Blocks;
+ for (MachineBasicBlock &MBB : MF) {
+ // If there are no or only one successor, nothing to do here.
+ if (MBB.succ_size() <= 1)
+ continue;
+
+ // Skip blocks unless their terminators start with a branch. Other
+ // terminators don't seem interesting for guarding against misspeculation.
+ auto TermIt = MBB.getFirstTerminator();
+ if (TermIt == MBB.end() || !TermIt->isBranch())
+ continue;
+
+ // Add all the non-EH-pad successors to the blocks we want to harden. We
+ // skip EH pads because there isn't really a condition of interest on
+ // entering.
+ for (MachineBasicBlock *SuccMBB : MBB.successors())
+ if (!SuccMBB->isEHPad())
+ Blocks.insert(SuccMBB);
+ }
+
+ for (MachineBasicBlock *MBB : Blocks) {
+ auto InsertPt = MBB->SkipPHIsAndLabels(MBB->begin());
+ BuildMI(*MBB, InsertPt, DebugLoc(), TII->get(X86::LFENCE));
+ ++NumInstsInserted;
+ ++NumLFENCEsInserted;
+ }
+}
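+
+ // A sketch of the effect (assembly illustrative, not emitted verbatim): for
+ // a block ending in
+ // ```
+ // testq %rdi, %rdi
+ // je .LBB0_2
+ // ```
+ // an `lfence` is inserted at the top of both successor blocks, so neither
+ // side of the branch can speculatively execute loads before the condition
+ // resolves.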
+
+SmallVector<X86SpeculativeLoadHardeningPass::BlockCondInfo, 16>
+X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) {
+ SmallVector<BlockCondInfo, 16> Infos;
+
+ // Walk the function and build up a summary for each block's conditions that
+ // we need to trace through.
+ for (MachineBasicBlock &MBB : MF) {
+ // If there are no or only one successor, nothing to do here.
+ if (MBB.succ_size() <= 1)
+ continue;
+
+ // We want to reliably handle any conditional branch terminators in the
+ // MBB, so we manually analyze the branch. We can handle all of the
+ // permutations here, including ones that analyze branch cannot.
+ //
+ // The approach is to walk backwards across the terminators, resetting at
+ // any unconditional non-indirect branch, and track all conditional edges
+ // to basic blocks as well as the fallthrough or unconditional successor
+ // edge. For each conditional edge, we track the target and the opposite
+ // condition code in order to inject a "no-op" cmov into that successor
+ // that will harden the predicate. For the fallthrough/unconditional
+ // edge, we inject a separate cmov for each conditional branch with
+ // matching condition codes. This effectively implements an "and" of the
+ // condition flags, even if there isn't a single condition flag that would
+ // directly implement that. We don't bother trying to optimize either of
+ // these cases because if such an optimization is possible, LLVM should
+ // have optimized the conditional *branches* in that way already to reduce
+ // instruction count. This late, we simply assume the minimal number of
+ // branch instructions is being emitted and use that to guide our cmov
+ // insertion.
+
+ BlockCondInfo Info = {&MBB, {}, nullptr};
+
+ // Now walk backwards through the terminators and build up successors they
+ // reach and the conditions.
+ for (MachineInstr &MI : llvm::reverse(MBB)) {
+ // Once we've handled all the terminators, we're done.
+ if (!MI.isTerminator())
+ break;
+
+ // If we see a non-branch terminator, we can't handle anything so bail.
+ if (!MI.isBranch()) {
+ Info.CondBrs.clear();
+ break;
+ }
+
+ // If we see an unconditional branch, reset our state, clear any
+ // fallthrough, and set this as the "else" successor.
+ if (MI.getOpcode() == X86::JMP_1) {
+ Info.CondBrs.clear();
+ Info.UncondBr = &MI;
+ continue;
+ }
+
+ // If we get an invalid condition, we have an indirect branch or some
+ // other unanalyzable "fallthrough" case. We model this as a nullptr for
+ // the destination so we can still guard any conditional successors.
+ // Consider code sequences like:
+ // ```
+ // jCC L1
+ // jmpq *%rax
+ // ```
+ // We still want to harden the edge to `L1`.
+ if (X86::getCondFromBranch(MI) == X86::COND_INVALID) {
+ Info.CondBrs.clear();
+ Info.UncondBr = &MI;
+ continue;
+ }
+
+ // We have a vanilla conditional branch, add it to our list.
+ Info.CondBrs.push_back(&MI);
+ }
+ if (Info.CondBrs.empty()) {
+ ++NumBranchesUntraced;
+ LLVM_DEBUG(dbgs() << "WARNING: unable to secure successors of block:\n";
+ MBB.dump());
+ continue;
+ }
+
+ Infos.push_back(Info);
+ }
+
+ return Infos;
+}
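+
+ // To illustrate the terminator shapes this handles (sketch only), a block
+ // ending in
+ // ```
+ // jne .LBB0_3
+ // jl .LBB0_4
+ // jmp .LBB0_5
+ // ```
+ // records both conditional branches in CondBrs with the `jmp` as UncondBr,
+ // while a block ending in
+ // ```
+ // jne .LBB0_3
+ // jmpq *%rax
+ // ```
+ // records only the `jne` in CondBrs with the unanalyzable indirect jump as
+ // UncondBr, so just the conditional edge gets a checking block.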
+
+/// Trace the predicate state through the CFG, instrumenting each conditional
+/// branch such that misspeculation through an edge will poison the predicate
+/// state.
+///
+/// Returns the list of inserted CMov instructions so that they can have their
+/// uses of the predicate state rewritten into proper SSA form once it is
+/// complete.
+SmallVector<MachineInstr *, 16>
+X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG(
+ MachineFunction &MF, ArrayRef<BlockCondInfo> Infos) {
+ // Collect the inserted cmov instructions so we can rewrite their uses of the
+ // predicate state into SSA form.
+ SmallVector<MachineInstr *, 16> CMovs;
+
+ // Now walk all of the basic blocks looking for ones that end in conditional
+ // jumps where we need to update this register along each edge.
+ for (const BlockCondInfo &Info : Infos) {
+ MachineBasicBlock &MBB = *Info.MBB;
+ const SmallVectorImpl<MachineInstr *> &CondBrs = Info.CondBrs;
+ MachineInstr *UncondBr = Info.UncondBr;
+
+ LLVM_DEBUG(dbgs() << "Tracing predicate through block: " << MBB.getName()
+ << "\n");
+ ++NumCondBranchesTraced;
+
+ // Compute the non-conditional successor as either the target of any
+ // unconditional branch or the layout successor.
+ MachineBasicBlock *UncondSucc =
+ UncondBr ? (UncondBr->getOpcode() == X86::JMP_1
+ ? UncondBr->getOperand(0).getMBB()
+ : nullptr)
+ : &*std::next(MachineFunction::iterator(&MBB));
+
+ // Count how many edges there are to any given successor.
+ SmallDenseMap<MachineBasicBlock *, int> SuccCounts;
+ if (UncondSucc)
+ ++SuccCounts[UncondSucc];
+ for (auto *CondBr : CondBrs)
+ ++SuccCounts[CondBr->getOperand(0).getMBB()];
+
+ // A lambda to insert cmov instructions into a block checking all of the
+ // condition codes in a sequence.
+ auto BuildCheckingBlockForSuccAndConds =
+ [&](MachineBasicBlock &MBB, MachineBasicBlock &Succ, int SuccCount,
+ MachineInstr *Br, MachineInstr *&UncondBr,
+ ArrayRef<X86::CondCode> Conds) {
+ // First, we split the edge to insert the checking block into a safe
+ // location.
+ auto &CheckingMBB =
+ (SuccCount == 1 && Succ.pred_size() == 1)
+ ? Succ
+ : splitEdge(MBB, Succ, SuccCount, Br, UncondBr, *TII);
+
+ bool LiveEFLAGS = Succ.isLiveIn(X86::EFLAGS);
+ if (!LiveEFLAGS)
+ CheckingMBB.addLiveIn(X86::EFLAGS);
+
+ // Now insert the cmovs to implement the checks.
+ auto InsertPt = CheckingMBB.begin();
+ assert((InsertPt == CheckingMBB.end() || !InsertPt->isPHI()) &&
+ "Should never have a PHI in the initial checking block as it "
+ "always has a single predecessor!");
+
+ // We will wire each cmov to each other, but need to start with the
+ // incoming pred state.
+ unsigned CurStateReg = PS->InitialReg;
+
+ for (X86::CondCode Cond : Conds) {
+ int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
+ auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);
+
+ Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
+ // Note that we intentionally use an empty debug location so that
+ // this picks up the preceding location.
+ auto CMovI = BuildMI(CheckingMBB, InsertPt, DebugLoc(),
+ TII->get(CMovOp), UpdatedStateReg)
+ .addReg(CurStateReg)
+ .addReg(PS->PoisonReg)
+ .addImm(Cond);
+ // If this is the last cmov and the EFLAGS weren't originally
+ // live-in, mark them as killed.
+ if (!LiveEFLAGS && Cond == Conds.back())
+ CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
+
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump();
+ dbgs() << "\n");
+
+ // The first one of the cmovs will be using the top level
+ // `PredStateReg` and needs to get rewritten into SSA form.
+ if (CurStateReg == PS->InitialReg)
+ CMovs.push_back(&*CMovI);
+
+ // The next cmov should start from this one's def.
+ CurStateReg = UpdatedStateReg;
+ }
+
+ // And put the last one into the available values for SSA form of our
+ // predicate state.
+ PS->SSA.AddAvailableValue(&CheckingMBB, CurStateReg);
+ };
+
+ std::vector<X86::CondCode> UncondCodeSeq;
+ for (auto *CondBr : CondBrs) {
+ MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB();
+ int &SuccCount = SuccCounts[&Succ];
+
+ X86::CondCode Cond = X86::getCondFromBranch(*CondBr);
+ X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond);
+ UncondCodeSeq.push_back(Cond);
+
+ BuildCheckingBlockForSuccAndConds(MBB, Succ, SuccCount, CondBr, UncondBr,
+ {InvCond});
+
+ // Decrement the successor count now that we've split one of the edges.
+ // We need to keep the count of edges to the successor accurate in order
+ // to know above when to *replace* the successor in the CFG vs. just
+ // adding the new successor.
+ --SuccCount;
+ }
+
+ // Since we may have split edges and changed the number of successors,
+ // normalize the probabilities. This avoids doing it each time we split an
+ // edge.
+ MBB.normalizeSuccProbs();
+
+ // Finally, we need to insert cmovs into the "fallthrough" edge. Here, we
+ // need to intersect the other condition codes. We can do this by just
+ // doing a cmov for each one.
+ if (!UncondSucc)
+ // If we have no fallthrough to protect (perhaps it is an indirect jump?)
+ // just skip this and continue.
+ continue;
+
+ assert(SuccCounts[UncondSucc] == 1 &&
+ "We should never have more than one edge to the unconditional "
+ "successor at this point because every other edge must have been "
+ "split above!");
+
+ // Sort and unique the codes to minimize them.
+ llvm::sort(UncondCodeSeq);
+ UncondCodeSeq.erase(std::unique(UncondCodeSeq.begin(), UncondCodeSeq.end()),
+ UncondCodeSeq.end());
+
+ // Build a checking version of the successor.
+ BuildCheckingBlockForSuccAndConds(MBB, *UncondSucc, /*SuccCount*/ 1,
+ UncondBr, UncondBr, UncondCodeSeq);
+ }
+
+ return CMovs;
+}
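+
+ // A sketch of the resulting code shape (registers invented; the pass works
+ // on virtual registers that are allocated later):
+ // ```
+ // # End of the predecessor block:
+ // cmpq %rsi, %rdi
+ // jne .LBB0_2
+ // # Start of the fallthrough ("equal") successor:
+ // cmovneq %r8, %rax      # %rax = predicate state, %r8 = all-ones poison
+ // ```
+ // If the fallthrough block is reached because the `jne` was mispredicted,
+ // the not-equal condition still holds in EFLAGS and the cmov overwrites the
+ // predicate state with the poison value. The branch target gets the same
+ // treatment with the opposite condition code.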
+
+/// Compute the register class for the unfolded load.
+///
+/// FIXME: This should probably live in X86InstrInfo, potentially by adding
+/// a way to unfold into a newly created vreg rather than requiring a register
+/// input.
+static const TargetRegisterClass *
+getRegClassForUnfoldedLoad(MachineFunction &MF, const X86InstrInfo &TII,
+ unsigned Opcode) {
+ unsigned Index;
+ unsigned UnfoldedOpc = TII.getOpcodeAfterMemoryUnfold(
+ Opcode, /*UnfoldLoad*/ true, /*UnfoldStore*/ false, &Index);
+ const MCInstrDesc &MCID = TII.get(UnfoldedOpc);
+ return TII.getRegClass(MCID, Index, &TII.getRegisterInfo(), MF);
+}
+
+void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(
+ MachineFunction &MF) {
+ for (MachineBasicBlock &MBB : MF)
+ for (auto MII = MBB.instr_begin(), MIE = MBB.instr_end(); MII != MIE;) {
+ // Grab a reference and increment the iterator so we can remove this
+ // instruction if needed without disturbing the iteration.
+ MachineInstr &MI = *MII++;
+
+ // Must either be a call or a branch.
+ if (!MI.isCall() && !MI.isBranch())
+ continue;
+ // We only care about loading variants of these instructions.
+ if (!MI.mayLoad())
+ continue;
+
+ switch (MI.getOpcode()) {
+ default: {
+ LLVM_DEBUG(
+ dbgs() << "ERROR: Found an unexpected loading branch or call "
+ "instruction:\n";
+ MI.dump(); dbgs() << "\n");
+ report_fatal_error("Unexpected loading branch or call!");
+ }
+
+ case X86::FARCALL16m:
+ case X86::FARCALL32m:
+ case X86::FARCALL64m:
+ case X86::FARJMP16m:
+ case X86::FARJMP32m:
+ case X86::FARJMP64m:
+ // We cannot mitigate far jumps or calls, but we also don't expect them
+ // to be vulnerable to Spectre v1.2 style attacks.
+ continue;
+
+ case X86::CALL16m:
+ case X86::CALL16m_NT:
+ case X86::CALL32m:
+ case X86::CALL32m_NT:
+ case X86::CALL64m:
+ case X86::CALL64m_NT:
+ case X86::JMP16m:
+ case X86::JMP16m_NT:
+ case X86::JMP32m:
+ case X86::JMP32m_NT:
+ case X86::JMP64m:
+ case X86::JMP64m_NT:
+ case X86::TAILJMPm64:
+ case X86::TAILJMPm64_REX:
+ case X86::TAILJMPm:
+ case X86::TCRETURNmi64:
+ case X86::TCRETURNmi: {
+ // Use the generic unfold logic now that we know we're dealing with
+ // expected instructions.
+ // FIXME: We don't have test coverage for all of these!
+ auto *UnfoldedRC = getRegClassForUnfoldedLoad(MF, *TII, MI.getOpcode());
+ if (!UnfoldedRC) {
+ LLVM_DEBUG(dbgs()
+ << "ERROR: Unable to unfold load from instruction:\n";
+ MI.dump(); dbgs() << "\n");
+ report_fatal_error("Unable to unfold load!");
+ }
+ Register Reg = MRI->createVirtualRegister(UnfoldedRC);
+ SmallVector<MachineInstr *, 2> NewMIs;
+ // If we were able to compute an unfolded reg class, any failure here
+ // is just a programming error so just assert.
+ bool Unfolded =
+ TII->unfoldMemoryOperand(MF, MI, Reg, /*UnfoldLoad*/ true,
+ /*UnfoldStore*/ false, NewMIs);
+ (void)Unfolded;
+ assert(Unfolded &&
+ "Computed unfolded register class but failed to unfold");
+ // Now stitch the new instructions into place and erase the old one.
+ for (auto *NewMI : NewMIs)
+ MBB.insert(MI.getIterator(), NewMI);
+
+ // Update the call site info.
+ if (MI.isCandidateForCallSiteEntry())
+ MF.eraseCallSiteInfo(&MI);
+
+ MI.eraseFromParent();
+ LLVM_DEBUG({
+ dbgs() << "Unfolded load successfully into:\n";
+ for (auto *NewMI : NewMIs) {
+ NewMI->dump();
+ dbgs() << "\n";
+ }
+ });
+ continue;
+ }
+ }
+ llvm_unreachable("Escaped switch with default!");
+ }
+}
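+
+ // A sketch of the unfolding (the register below is illustrative; the pass
+ // creates a fresh virtual register):
+ // ```
+ // callq *8(%rdi)
+ // ```
+ // becomes
+ // ```
+ // movq 8(%rdi), %r11
+ // callq *%r11
+ // ```
+ // so the loaded target sits in a register that the later hardening of
+ // indirect calls and jumps can check.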
+
+/// Trace the predicate state through indirect branches, instrumenting them to
+/// poison the state if a target is reached that does not match the expected
+/// target.
+///
+/// This is designed to mitigate Spectre variant 1 attacks where an indirect
+/// branch is trained to predict a particular target and then mispredicts that
+/// target in a way that can leak data. Despite using an indirect branch, this
+/// is really a variant 1 style attack: it does not steer execution to an
+/// arbitrary or attacker controlled address, and it does not require any
+/// special code executing next to the victim. This attack can also be mitigated
+/// through retpolines, but those require either replacing indirect branches
+/// with conditional direct branches or lowering them through a device that
+/// blocks speculation. This mitigation can replace these retpoline-style
+/// mitigations for jump tables and other indirect branches within a function
+/// when variant 2 isn't a risk while allowing limited speculation. Indirect
+/// calls, however, cannot be mitigated through this technique without changing
+/// the ABI in a fundamental way.
+SmallVector<MachineInstr *, 16>
+X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches(
+ MachineFunction &MF) {
+ // We use the SSAUpdater to insert PHI nodes for the target addresses of
+ // indirect branches. We don't actually need the full power of the SSA updater
+ // in this particular case as we always have immediately available values, but
+ // this avoids us having to re-implement the PHI construction logic.
+ MachineSSAUpdater TargetAddrSSA(MF);
+ TargetAddrSSA.Initialize(MRI->createVirtualRegister(&X86::GR64RegClass));
+
+ // Track which blocks were terminated with an indirect branch.
+ SmallPtrSet<MachineBasicBlock *, 4> IndirectTerminatedMBBs;
+
+ // We need to know what blocks end up reached via indirect branches. We
+ // expect this to be a subset of those whose address is taken and so track it
+ // directly via the CFG.
+ SmallPtrSet<MachineBasicBlock *, 4> IndirectTargetMBBs;
+
+ // Walk all the blocks which end in an indirect branch and make the
+ // target address available.
+ for (MachineBasicBlock &MBB : MF) {
+ // Find the last terminator.
+ auto MII = MBB.instr_rbegin();
+ while (MII != MBB.instr_rend() && MII->isDebugInstr())
+ ++MII;
+ if (MII == MBB.instr_rend())
+ continue;
+ MachineInstr &TI = *MII;
+ if (!TI.isTerminator() || !TI.isBranch())
+ // No terminator or non-branch terminator.
+ continue;
+
+ unsigned TargetReg;
+
+ switch (TI.getOpcode()) {
+ default:
+ // Direct branch or conditional branch (leading to fallthrough).
+ continue;
+
+ case X86::FARJMP16m:
+ case X86::FARJMP32m:
+ case X86::FARJMP64m:
+ // We cannot mitigate far jumps or calls, but we also don't expect them
+ // to be vulnerable to Spectre v1.2 or v2 (self trained) style attacks.
+ continue;
+
+ case X86::JMP16m:
+ case X86::JMP16m_NT:
+ case X86::JMP32m:
+ case X86::JMP32m_NT:
+ case X86::JMP64m:
+ case X86::JMP64m_NT:
+ // Mostly as documentation.
+ report_fatal_error("Memory operand jumps should have been unfolded!");
+
+ case X86::JMP16r:
+ report_fatal_error(
+ "Support for 16-bit indirect branches is not implemented.");
+ case X86::JMP32r:
+ report_fatal_error(
+ "Support for 32-bit indirect branches is not implemented.");
+
+ case X86::JMP64r:
+ TargetReg = TI.getOperand(0).getReg();
+ }
+
+ // We have definitely found an indirect branch. Verify that there are no
+ // preceding conditional branches as we don't yet support that.
+ if (llvm::any_of(MBB.terminators(), [&](MachineInstr &OtherTI) {
+ return !OtherTI.isDebugInstr() && &OtherTI != &TI;
+ })) {
+ LLVM_DEBUG({
+ dbgs() << "ERROR: Found other terminators in a block with an indirect "
+ "branch! This is not yet supported! Terminator sequence:\n";
+ for (MachineInstr &MI : MBB.terminators()) {
+ MI.dump();
+ dbgs() << '\n';
+ }
+ });
+ report_fatal_error("Unimplemented terminator sequence!");
+ }
+
+ // Make the target register an available value for this block.
+ TargetAddrSSA.AddAvailableValue(&MBB, TargetReg);
+ IndirectTerminatedMBBs.insert(&MBB);
+
+ // Add all the successors to our target candidates.
+ for (MachineBasicBlock *Succ : MBB.successors())
+ IndirectTargetMBBs.insert(Succ);
+ }
+
+ // Keep track of the cmov instructions we insert so we can return them.
+ SmallVector<MachineInstr *, 16> CMovs;
+
+ // If we didn't find any indirect branches with targets, nothing to do here.
+ if (IndirectTargetMBBs.empty())
+ return CMovs;
+
+ // We found indirect branches and targets that need to be instrumented to
+ // harden loads within them. Walk the blocks of the function (to get a stable
+ // ordering) and instrument each target of an indirect branch.
+ for (MachineBasicBlock &MBB : MF) {
+ // Skip the blocks that aren't candidate targets.
+ if (!IndirectTargetMBBs.count(&MBB))
+ continue;
+
+ // We don't expect EH pads to ever be reached via an indirect branch. If
+ // this is desired for some reason, we could simply skip them here rather
+ // than asserting.
+ assert(!MBB.isEHPad() &&
+ "Unexpected EH pad as target of an indirect branch!");
+
+ // We should never end up threading EFLAGS into a block to harden
+ // conditional jumps as there would be an additional successor via the
+ // indirect branch. As a consequence, all such edges would be split before
+ // reaching here, and the inserted block will handle the EFLAGS-based
+ // hardening.
+ assert(!MBB.isLiveIn(X86::EFLAGS) &&
+ "Cannot check within a block that already has live-in EFLAGS!");
+
+ // We can't handle having non-indirect edges into this block unless this is
+ // the only successor and we can synthesize the necessary target address.
+ for (MachineBasicBlock *Pred : MBB.predecessors()) {
+ // If we've already handled this by extracting the target directly,
+ // nothing to do.
+ if (IndirectTerminatedMBBs.count(Pred))
+ continue;
+
+ // Otherwise, we have to be the only successor. We generally expect this
+ // to be true as conditional branches should have had a critical edge
+ // split already. We don't however need to worry about EH pad successors
+ // as they'll happily ignore the target and their hardening strategy is
+ // resilient to all ways in which they could be reached speculatively.
+ if (!llvm::all_of(Pred->successors(), [&](MachineBasicBlock *Succ) {
+ return Succ->isEHPad() || Succ == &MBB;
+ })) {
+ LLVM_DEBUG({
+ dbgs() << "ERROR: Found conditional entry to target of indirect "
+ "branch!\n";
+ Pred->dump();
+ MBB.dump();
+ });
+ report_fatal_error("Cannot harden a conditional entry to a target of "
+ "an indirect branch!");
+ }
+
+ // Now we need to compute the address of this block and install it as a
+ // synthetic target in the predecessor. We do this at the bottom of the
+ // predecessor.
+ auto InsertPt = Pred->getFirstTerminator();
+ Register TargetReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ if (MF.getTarget().getCodeModel() == CodeModel::Small &&
+ !Subtarget->isPositionIndependent()) {
+ // Directly materialize it into an immediate.
+ auto AddrI = BuildMI(*Pred, InsertPt, DebugLoc(),
+ TII->get(X86::MOV64ri32), TargetReg)
+ .addMBB(&MBB);
+ ++NumInstsInserted;
+ (void)AddrI;
+ LLVM_DEBUG(dbgs() << " Inserting mov: "; AddrI->dump();
+ dbgs() << "\n");
+ } else {
+ auto AddrI = BuildMI(*Pred, InsertPt, DebugLoc(), TII->get(X86::LEA64r),
+ TargetReg)
+ .addReg(/*Base*/ X86::RIP)
+ .addImm(/*Scale*/ 1)
+ .addReg(/*Index*/ 0)
+ .addMBB(&MBB)
+ .addReg(/*Segment*/ 0);
+ ++NumInstsInserted;
+ (void)AddrI;
+ LLVM_DEBUG(dbgs() << " Inserting lea: "; AddrI->dump();
+ dbgs() << "\n");
+ }
+ // And make this available.
+ TargetAddrSSA.AddAvailableValue(Pred, TargetReg);
+ }
+
+ // Materialize the needed SSA value of the target. Note that we need the
+ // value in the middle of the block, as it might end with an indirect
+ // branch back to itself. We can do this here because at this point, every
+ // predecessor of this block has an available value. This is basically just
+ // automating the construction of a PHI node for this target.
+ unsigned TargetReg = TargetAddrSSA.GetValueInMiddleOfBlock(&MBB);
+
+ // Insert a comparison of the incoming target register with this block's
+ // address. This also requires us to mark the block as having its address
+ // taken explicitly.
+ MBB.setHasAddressTaken();
+ auto InsertPt = MBB.SkipPHIsLabelsAndDebug(MBB.begin());
+ if (MF.getTarget().getCodeModel() == CodeModel::Small &&
+ !Subtarget->isPositionIndependent()) {
+ // Check directly against a relocated immediate when we can.
+ auto CheckI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::CMP64ri32))
+ .addReg(TargetReg, RegState::Kill)
+ .addMBB(&MBB);
+ ++NumInstsInserted;
+ (void)CheckI;
+ LLVM_DEBUG(dbgs() << " Inserting cmp: "; CheckI->dump(); dbgs() << "\n");
+ } else {
+ // Otherwise compute the address into a register first.
+ Register AddrReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ auto AddrI =
+ BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::LEA64r), AddrReg)
+ .addReg(/*Base*/ X86::RIP)
+ .addImm(/*Scale*/ 1)
+ .addReg(/*Index*/ 0)
+ .addMBB(&MBB)
+ .addReg(/*Segment*/ 0);
+ ++NumInstsInserted;
+ (void)AddrI;
+ LLVM_DEBUG(dbgs() << " Inserting lea: "; AddrI->dump(); dbgs() << "\n");
+ auto CheckI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::CMP64rr))
+ .addReg(TargetReg, RegState::Kill)
+ .addReg(AddrReg, RegState::Kill);
+ ++NumInstsInserted;
+ (void)CheckI;
+ LLVM_DEBUG(dbgs() << " Inserting cmp: "; CheckI->dump(); dbgs() << "\n");
+ }
+
+ // Now cmov over the predicate if the comparison wasn't equal.
+ int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
+ auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);
+ Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
+ auto CMovI =
+ BuildMI(MBB, InsertPt, DebugLoc(), TII->get(CMovOp), UpdatedStateReg)
+ .addReg(PS->InitialReg)
+ .addReg(PS->PoisonReg)
+ .addImm(X86::COND_NE);
+ CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump(); dbgs() << "\n");
+ CMovs.push_back(&*CMovI);
+
+ // And put the new value into the available values for SSA form of our
+ // predicate state.
+ PS->SSA.AddAvailableValue(&MBB, UpdatedStateReg);
+ }
+
+ // Return all the newly inserted cmov instructions of the predicate state.
+ return CMovs;
+}
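+
+ // The code inserted at each indirect-branch target looks roughly like the
+ // following (registers illustrative; with a small, non-PIC code model the
+ // address is checked directly against a relocated immediate and the `leaq`
+ // is omitted):
+ // ```
+ // .Ltarget:                  # reached via `jmpq *%rcx`
+ // leaq .Ltarget(%rip), %rdx
+ // cmpq %rdx, %rcx
+ // cmovneq %r8, %rax          # poison the predicate state if we arrived
+ //                            # here with the wrong target address
+ // ```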
+
+ // Returns true if the MI has EFLAGS as a register def operand and it is
+ // live; otherwise returns false.
+static bool isEFLAGSDefLive(const MachineInstr &MI) {
+ if (const MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) {
+ return !DefOp->isDead();
+ }
+ return false;
+}
+
+static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const TargetRegisterInfo &TRI) {
+ // Check if EFLAGS are alive by seeing if there is a def of them or they
+ // live-in, and then seeing if that def is in turn used.
+ for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), I))) {
+ if (MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) {
+ // If the def is dead, then EFLAGS is not live.
+ if (DefOp->isDead())
+ return false;
+
+ // Otherwise we've def'ed it, and it is live.
+ return true;
+ }
+ // While at this instruction, also check if we use and kill EFLAGS
+ // which means it isn't live.
+ if (MI.killsRegister(X86::EFLAGS, &TRI))
+ return false;
+ }
+
+ // If we didn't find anything conclusive (neither definitely alive or
+ // definitely dead) return whether it lives into the block.
+ return MBB.isLiveIn(X86::EFLAGS);
+}
+
+/// Trace the predicate state through each of the blocks in the function,
+/// hardening everything necessary along the way.
+///
+/// We call this routine once the initial predicate state has been established
+/// for each basic block in the function in the SSA updater. This routine traces
+/// it through the instructions within each basic block, and for non-returning
+/// blocks informs the SSA updater about the final state that lives out of the
+/// block. Along the way, it hardens any vulnerable instruction using the
+/// currently valid predicate state. We have to do these two things together
+/// because the SSA updater only works across blocks. Within a block, we track
+/// the current predicate state directly and update it as it changes.
+///
+/// This operates in two passes over each block. First, we analyze the loads in
+/// the block to determine which strategy will be used to harden them: hardening
+/// the address or hardening the loaded value when loaded into a register
+/// amenable to hardening. We have to process these first because the two
+/// strategies may interact -- later hardening may change what strategy we wish
+/// to use. We also will analyze data dependencies between loads and avoid
+/// hardening those loads that are data dependent on a load with a hardened
+/// address. We also skip hardening loads already behind an LFENCE as that is
+/// sufficient to harden them against misspeculation.
+///
+/// Second, we actively trace the predicate state through the block, applying
+/// the hardening steps we determined necessary in the first pass as we go.
+///
+/// These two passes are applied to each basic block. We operate one block at a
+/// time to simplify reasoning about reachability and sequencing.
+void X86SpeculativeLoadHardeningPass::tracePredStateThroughBlocksAndHarden(
+ MachineFunction &MF) {
+ SmallPtrSet<MachineInstr *, 16> HardenPostLoad;
+ SmallPtrSet<MachineInstr *, 16> HardenLoadAddr;
+
+ SmallSet<unsigned, 16> HardenedAddrRegs;
+
+ SmallDenseMap<unsigned, unsigned, 32> AddrRegToHardenedReg;
+
+ // Track the set of load-dependent registers through the basic block. Because
+ // the values of these registers have an existing data dependency on a loaded
+ // value which we would have checked, we can omit any checks on them.
+ SparseBitVector<> LoadDepRegs;
+
+ for (MachineBasicBlock &MBB : MF) {
+ // The first pass over the block: collect all the loads which can have their
+ // loaded value hardened and all the loads that instead need their address
+ // hardened. During this walk we propagate load dependence for address
+ // hardened loads and also look for LFENCE to stop hardening wherever
+ // possible. When deciding whether or not to harden the loaded value,
+ // we check to see if any registers used in the address will have been
+ // hardened at this point and if so, harden any remaining address registers
+ // as that often successfully re-uses hardened addresses and minimizes
+ // instructions.
+ //
+ // FIXME: We should consider an aggressive mode where we continue to keep as
+ // many loads value-hardened as possible even when some address register
+ // hardening would be free (due to reuse).
+ //
+ // Note that we only need this pass if we are actually hardening loads.
+ if (HardenLoads)
+ for (MachineInstr &MI : MBB) {
+ // We naively assume that all def'ed registers of an instruction have
+ // a data dependency on all of their operands.
+ // FIXME: Do a more careful analysis of x86 to build a conservative
+ // model here.
+ if (llvm::any_of(MI.uses(), [&](MachineOperand &Op) {
+ return Op.isReg() && LoadDepRegs.test(Op.getReg());
+ }))
+ for (MachineOperand &Def : MI.defs())
+ if (Def.isReg())
+ LoadDepRegs.set(Def.getReg());
+
+ // Both Intel and AMD are guiding that they will change the semantics of
+ // LFENCE to be a speculation barrier, so if we see an LFENCE, there is
+ // no more need to guard things in this block.
+ if (MI.getOpcode() == X86::LFENCE)
+ break;
+
+ // If this instruction cannot load, nothing to do.
+ if (!MI.mayLoad())
+ continue;
+
+ // Some instructions which "load" are trivially safe or unimportant.
+ if (MI.getOpcode() == X86::MFENCE)
+ continue;
+
+ // Extract the memory operand information about this instruction.
+ // FIXME: This doesn't handle loading pseudo instructions which we often
+ // could handle with similarly generic logic. We probably need to add an
+ // MI-layer routine similar to the MC-layer one we use here which maps
+ // pseudos much like this maps real instructions.
+ const MCInstrDesc &Desc = MI.getDesc();
+ int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
+ if (MemRefBeginIdx < 0) {
+ LLVM_DEBUG(dbgs()
+ << "WARNING: unable to harden loading instruction: ";
+ MI.dump());
+ continue;
+ }
+
+ MemRefBeginIdx += X86II::getOperandBias(Desc);
+
+ MachineOperand &BaseMO =
+ MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
+ MachineOperand &IndexMO =
+ MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
+
+ // If we have at least one (non-frame-index, non-RIP) register operand,
+ // and neither operand is load-dependent, we need to check the load.
+ unsigned BaseReg = 0, IndexReg = 0;
+ if (!BaseMO.isFI() && BaseMO.getReg() != X86::RIP &&
+ BaseMO.getReg() != X86::NoRegister)
+ BaseReg = BaseMO.getReg();
+ if (IndexMO.getReg() != X86::NoRegister)
+ IndexReg = IndexMO.getReg();
+
+ if (!BaseReg && !IndexReg)
+ // No register operands!
+ continue;
+
+ // If any register operand is dependent, this load is dependent and we
+ // needn't check it.
+ // FIXME: Is this true in the case where we are hardening loads after
+ // they complete? Unclear, need to investigate.
+ if ((BaseReg && LoadDepRegs.test(BaseReg)) ||
+ (IndexReg && LoadDepRegs.test(IndexReg)))
+ continue;
+
+ // If post-load hardening is enabled, this load is compatible with
+ // post-load hardening, and we aren't already going to harden one of the
+ // address registers, queue it up to be hardened post-load. Notably,
+ // even once hardened this won't introduce a useful dependency that
+ // could prune out subsequent loads.
+ if (EnablePostLoadHardening && X86InstrInfo::isDataInvariantLoad(MI) &&
+ !isEFLAGSDefLive(MI) && MI.getDesc().getNumDefs() == 1 &&
+ MI.getOperand(0).isReg() &&
+ canHardenRegister(MI.getOperand(0).getReg()) &&
+ !HardenedAddrRegs.count(BaseReg) &&
+ !HardenedAddrRegs.count(IndexReg)) {
+ HardenPostLoad.insert(&MI);
+ HardenedAddrRegs.insert(MI.getOperand(0).getReg());
+ continue;
+ }
+
+ // Record this instruction for address hardening and record its register
+ // operands as being address-hardened.
+ HardenLoadAddr.insert(&MI);
+ if (BaseReg)
+ HardenedAddrRegs.insert(BaseReg);
+ if (IndexReg)
+ HardenedAddrRegs.insert(IndexReg);
+
+ for (MachineOperand &Def : MI.defs())
+ if (Def.isReg())
+ LoadDepRegs.set(Def.getReg());
+ }
+
+ // Now re-walk the instructions in the basic block, and apply whichever
+ // hardening strategy we have elected. Note that we do this in a second
+ // pass specifically so that we have the complete set of instructions for
+ // which we will do post-load hardening and can defer it in certain
+ // circumstances.
+ for (MachineInstr &MI : MBB) {
+ if (HardenLoads) {
+ // We cannot both require hardening the def of a load and its address.
+ assert(!(HardenLoadAddr.count(&MI) && HardenPostLoad.count(&MI)) &&
+ "Requested to harden both the address and def of a load!");
+
+ // Check if this is a load whose address needs to be hardened.
+ if (HardenLoadAddr.erase(&MI)) {
+ const MCInstrDesc &Desc = MI.getDesc();
+ int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
+ assert(MemRefBeginIdx >= 0 && "Cannot have an invalid index here!");
+
+ MemRefBeginIdx += X86II::getOperandBias(Desc);
+
+ MachineOperand &BaseMO =
+ MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
+ MachineOperand &IndexMO =
+ MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
+ hardenLoadAddr(MI, BaseMO, IndexMO, AddrRegToHardenedReg);
+ continue;
+ }
+
+ // Test if this instruction is one of our post load instructions (and
+ // remove it from the set if so).
+ if (HardenPostLoad.erase(&MI)) {
+ assert(!MI.isCall() && "Must not try to post-load harden a call!");
+
+ // If this is a data-invariant load and there is no EFLAGS
+ // interference, we want to try and sink any hardening as far as
+ // possible.
+ if (X86InstrInfo::isDataInvariantLoad(MI) && !isEFLAGSDefLive(MI)) {
+ // Sink the instruction we'll need to harden as far as we can down
+ // the graph.
+ MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenPostLoad);
+
+ // If we managed to sink this instruction, update everything so we
+ // harden that instruction when we reach it in the instruction
+ // sequence.
+ if (SunkMI != &MI) {
+ // If in sinking there was no instruction needing to be hardened,
+ // we're done.
+ if (!SunkMI)
+ continue;
+
+ // Otherwise, add this to the set of defs we harden.
+ HardenPostLoad.insert(SunkMI);
+ continue;
+ }
+ }
+
+ unsigned HardenedReg = hardenPostLoad(MI);
+
+ // Mark the resulting hardened register as such so we don't re-harden.
+ AddrRegToHardenedReg[HardenedReg] = HardenedReg;
+
+ continue;
+ }
+
+ // Check for an indirect call or branch that may need its input hardened
+ // even if we couldn't find the specific load used, or were able to
+ // avoid hardening it for some reason. Note that here we cannot break
+ // out afterward as we may still need to handle any call aspect of this
+ // instruction.
+ if ((MI.isCall() || MI.isBranch()) && HardenIndirectCallsAndJumps)
+ hardenIndirectCallOrJumpInstr(MI, AddrRegToHardenedReg);
+ }
+
+ // After we finish hardening loads we handle interprocedural hardening if
+ // enabled and relevant for this instruction.
+ if (!HardenInterprocedurally)
+ continue;
+ if (!MI.isCall() && !MI.isReturn())
+ continue;
+
+ // If this is a direct return (i.e., not a tail call), just directly harden
+ // it.
+ if (MI.isReturn() && !MI.isCall()) {
+ hardenReturnInstr(MI);
+ continue;
+ }
+
+ // Otherwise we have a call. We need to handle transferring the predicate
+ // state into a call and recovering it after the call returns (unless this
+ // is a tail call).
+ assert(MI.isCall() && "Should only reach here for calls!");
+ tracePredStateThroughCall(MI);
+ }
+
+ HardenPostLoad.clear();
+ HardenLoadAddr.clear();
+ HardenedAddrRegs.clear();
+ AddrRegToHardenedReg.clear();
+
+ // Currently, we only track data-dependent loads within a basic block.
+ // FIXME: We should see if this is necessary or if we could be more
+ // aggressive here without opening up attack avenues.
+ LoadDepRegs.clear();
+ }
+}
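+
+ // The two per-load strategies applied above look roughly as follows
+ // (illustrative assembly; %rax holds the predicate state, all-ones when
+ // misspeculating and zero otherwise):
+ // ```
+ // # Address hardening: mask the pointer before the load.
+ // orq %rax, %rdi
+ // movq (%rdi), %rcx
+ //
+ // # Post-load hardening: mask the loaded value after the load.
+ // movq (%rdi), %rcx
+ // orq %rax, %rcx
+ // ```
+ // On a misspeculated path either the pointer or the loaded value becomes
+ // all-ones, so secret-dependent data cannot be leaked through a later
+ // dependent access.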
+
+/// Save EFLAGS into the returned GPR. This can in turn be restored with
+/// `restoreEFLAGS`.
+///
+/// Note that LLVM can only lower very simple patterns of saved and restored
+/// EFLAGS registers. The restore should always be within the same basic block
+/// as the save so that no PHI nodes are inserted.
+unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
+ DebugLoc Loc) {
+ // FIXME: Hard coding this to a 32-bit register class seems weird, but matches
+ // what instruction selection does.
+ Register Reg = MRI->createVirtualRegister(&X86::GR32RegClass);
+ // We directly copy the FLAGS register and rely on later lowering to clean
+ // this up into the appropriate setCC instructions.
+ BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS);
+ ++NumInstsInserted;
+ return Reg;
+}
+
+/// Restore EFLAGS from the provided GPR. This should be produced by
+/// `saveEFLAGS`.
+///
+/// This must be done within the same basic block as the save in order to
+/// reliably lower.
+void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
+ Register Reg) {
+ BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg);
+ ++NumInstsInserted;
+}
+
+/// Takes the current predicate state (in a register) and merges it into the
+/// stack pointer. The state is essentially a single bit, but we merge this in
+/// a way that won't form non-canonical pointers and also will be preserved
+/// across normal stack adjustments.
+void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
+ unsigned PredStateReg) {
+ Register TmpReg = MRI->createVirtualRegister(PS->RC);
+ // FIXME: This hard codes a shift distance based on the number of bits needed
+ // to stay canonical on 64-bit. We should compute this somehow and support
+ // 32-bit as part of that.
+ auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHL64ri), TmpReg)
+ .addReg(PredStateReg, RegState::Kill)
+ .addImm(47);
+ ShiftI->addRegisterDead(X86::EFLAGS, TRI);
+ ++NumInstsInserted;
+ auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), X86::RSP)
+ .addReg(X86::RSP)
+ .addReg(TmpReg, RegState::Kill);
+ OrI->addRegisterDead(X86::EFLAGS, TRI);
+ ++NumInstsInserted;
+}
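+
+ // A sketch of the emitted sequence (the temporary is a virtual register):
+ // ```
+ // shlq $47, %tmp     # 0 stays 0; all-ones becomes 0xFFFF800000000000
+ // orq  %tmp, %rsp    # merge the state into the high bits of RSP while
+ //                    # keeping the pointer canonical
+ // ```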
+
+/// Extracts the predicate state stored in the high bits of the stack pointer.
+unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
+ DebugLoc Loc) {
+ Register PredStateReg = MRI->createVirtualRegister(PS->RC);
+ Register TmpReg = MRI->createVirtualRegister(PS->RC);
+
+ // We know that the stack pointer will have any preserved predicate state in
+ // its high bit. We just want to smear this across the other bits. Turns out,
+ // this is exactly what an arithmetic right shift does.
+ BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), TmpReg)
+ .addReg(X86::RSP);
+ auto ShiftI =
+ BuildMI(MBB, InsertPt, Loc, TII->get(X86::SAR64ri), PredStateReg)
+ .addReg(TmpReg, RegState::Kill)
+ .addImm(TRI->getRegSizeInBits(*PS->RC) - 1);
+ ShiftI->addRegisterDead(X86::EFLAGS, TRI);
+ ++NumInstsInserted;
+
+ return PredStateReg;
+}
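+
+ // A sketch of the emitted sequence (the temporaries are virtual registers):
+ // ```
+ // movq %rsp, %tmp
+ // sarq $63, %tmp     # smear bit 63 across the register: zero if the high
+ //                    # bit was clear, all-ones if it was set
+ // ```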
+
+void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
+ MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO,
+ SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc Loc = MI.getDebugLoc();
+
+ // Check if EFLAGS are alive by seeing if there is a def of them or they
+ // live-in, and then seeing if that def is in turn used.
+ bool EFLAGSLive = isEFLAGSLive(MBB, MI.getIterator(), *TRI);
+
+ SmallVector<MachineOperand *, 2> HardenOpRegs;
+
+ if (BaseMO.isFI()) {
+ // A frame index is never a dynamically controllable load, so only
+ // harden it if we're covering fixed address loads as well.
+ LLVM_DEBUG(
+ dbgs() << " Skipping hardening base of explicit stack frame load: ";
+ MI.dump(); dbgs() << "\n");
+ } else if (BaseMO.getReg() == X86::RSP) {
+ // Some idempotent atomic operations are lowered directly to a locked
+ // OR with 0 to the top of the stack (or slightly offset from the top), which
+ // uses an explicit RSP register as the base.
+ assert(IndexMO.getReg() == X86::NoRegister &&
+ "Explicit RSP access with dynamic index!");
+ LLVM_DEBUG(
+ dbgs() << " Cannot harden base of explicit RSP offset in a load!");
+ } else if (BaseMO.getReg() == X86::RIP ||
+ BaseMO.getReg() == X86::NoRegister) {
+ // For both RIP-relative addressed loads or absolute loads, we cannot
+ // meaningfully harden them because the address being loaded has no
+ // dynamic component.
+ //
+ // FIXME: When using a segment base (like TLS does) we end up with the
+ // dynamic address being the base plus -1 because we can't mutate the
+ // segment register here. This allows the signed 32-bit offset to point at
+ // valid segment-relative addresses and load them successfully.
+ LLVM_DEBUG(
+ dbgs() << " Cannot harden base of "
+ << (BaseMO.getReg() == X86::RIP ? "RIP-relative" : "no-base")
+ << " address in a load!");
+ } else {
+ assert(BaseMO.isReg() &&
+ "Only allowed to have a frame index or register base.");
+ HardenOpRegs.push_back(&BaseMO);
+ }
+
+ if (IndexMO.getReg() != X86::NoRegister &&
+ (HardenOpRegs.empty() ||
+ HardenOpRegs.front()->getReg() != IndexMO.getReg()))
+ HardenOpRegs.push_back(&IndexMO);
+
+ assert((HardenOpRegs.size() == 1 || HardenOpRegs.size() == 2) &&
+ "Should have exactly one or two registers to harden!");
+ assert((HardenOpRegs.size() == 1 ||
+ HardenOpRegs[0]->getReg() != HardenOpRegs[1]->getReg()) &&
+ "Should not have two of the same registers!");
+
+ // Remove any registers that have already been checked.
+ llvm::erase_if(HardenOpRegs, [&](MachineOperand *Op) {
+ // See if this operand's register has already been checked.
+ auto It = AddrRegToHardenedReg.find(Op->getReg());
+ if (It == AddrRegToHardenedReg.end())
+ // Not checked, so retain this one.
+ return false;
+
+ // Otherwise, we can directly update this operand and remove it.
+ Op->setReg(It->second);
+ return true;
+ });
+ // If there are none left, we're done.
+ if (HardenOpRegs.empty())
+ return;
+
+ // Compute the current predicate state.
+ unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
+
+ auto InsertPt = MI.getIterator();
+
+ // If EFLAGS are live and we don't have access to instructions that avoid
+ // clobbering EFLAGS we need to save and restore them. This in turn makes
+ // the EFLAGS no longer live.
+ unsigned FlagsReg = 0;
+ if (EFLAGSLive && !Subtarget->hasBMI2()) {
+ EFLAGSLive = false;
+ FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
+ }
+
+ for (MachineOperand *Op : HardenOpRegs) {
+ Register OpReg = Op->getReg();
+ auto *OpRC = MRI->getRegClass(OpReg);
+ Register TmpReg = MRI->createVirtualRegister(OpRC);
+
+ // If this is a vector register, we'll need somewhat custom logic to handle
+ // hardening it.
+ if (!Subtarget->hasVLX() && (OpRC->hasSuperClassEq(&X86::VR128RegClass) ||
+ OpRC->hasSuperClassEq(&X86::VR256RegClass))) {
+ assert(Subtarget->hasAVX2() && "AVX2-specific register classes!");
+ bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128RegClass);
+
+ // Move our state into a vector register.
+ // FIXME: We could skip this at the cost of longer encodings with AVX-512
+ // but that doesn't seem likely worth it.
+ Register VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass);
+ auto MovI =
+ BuildMI(MBB, InsertPt, Loc, TII->get(X86::VMOV64toPQIrr), VStateReg)
+ .addReg(StateReg);
+ (void)MovI;
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting mov: "; MovI->dump(); dbgs() << "\n");
+
+ // Broadcast it across the vector register.
+ Register VBStateReg = MRI->createVirtualRegister(OpRC);
+ auto BroadcastI = BuildMI(MBB, InsertPt, Loc,
+ TII->get(Is128Bit ? X86::VPBROADCASTQrr
+ : X86::VPBROADCASTQYrr),
+ VBStateReg)
+ .addReg(VStateReg);
+ (void)BroadcastI;
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
+ dbgs() << "\n");
+
+ // Merge our potential poison state into the value with a vector or.
+ auto OrI =
+ BuildMI(MBB, InsertPt, Loc,
+ TII->get(Is128Bit ? X86::VPORrr : X86::VPORYrr), TmpReg)
+ .addReg(VBStateReg)
+ .addReg(OpReg);
+ (void)OrI;
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
+ } else if (OpRC->hasSuperClassEq(&X86::VR128XRegClass) ||
+ OpRC->hasSuperClassEq(&X86::VR256XRegClass) ||
+ OpRC->hasSuperClassEq(&X86::VR512RegClass)) {
+ assert(Subtarget->hasAVX512() && "AVX512-specific register classes!");
+ bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128XRegClass);
+ bool Is256Bit = OpRC->hasSuperClassEq(&X86::VR256XRegClass);
+ if (Is128Bit || Is256Bit)
+ assert(Subtarget->hasVLX() && "AVX512VL-specific register classes!");
+
+ // Broadcast our state into a vector register.
+ Register VStateReg = MRI->createVirtualRegister(OpRC);
+ unsigned BroadcastOp = Is128Bit ? X86::VPBROADCASTQrZ128rr
+ : Is256Bit ? X86::VPBROADCASTQrZ256rr
+ : X86::VPBROADCASTQrZrr;
+ auto BroadcastI =
+ BuildMI(MBB, InsertPt, Loc, TII->get(BroadcastOp), VStateReg)
+ .addReg(StateReg);
+ (void)BroadcastI;
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
+ dbgs() << "\n");
+
+ // Merge our potential poison state into the value with a vector or.
+ unsigned OrOp = Is128Bit ? X86::VPORQZ128rr
+ : Is256Bit ? X86::VPORQZ256rr : X86::VPORQZrr;
+ auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOp), TmpReg)
+ .addReg(VStateReg)
+ .addReg(OpReg);
+ (void)OrI;
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
+ } else {
+ // FIXME: Need to support GR32 here for 32-bit code.
+ assert(OpRC->hasSuperClassEq(&X86::GR64RegClass) &&
+ "Not a supported register class for address hardening!");
+
+ if (!EFLAGSLive) {
+ // Merge our potential poison state into the value with an or.
+ auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), TmpReg)
+ .addReg(StateReg)
+ .addReg(OpReg);
+ OrI->addRegisterDead(X86::EFLAGS, TRI);
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
+ } else {
+ // We need to avoid touching EFLAGS so shift out all but the least
+ // significant bit using the instruction that doesn't update flags.
+ auto ShiftI =
+ BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHRX64rr), TmpReg)
+ .addReg(OpReg)
+ .addReg(StateReg);
+ (void)ShiftI;
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting shrx: "; ShiftI->dump();
+ dbgs() << "\n");
+ }
+ }
+
+ // Record this register as checked and update the operand.
+ assert(!AddrRegToHardenedReg.count(Op->getReg()) &&
+ "Should not have checked this register yet!");
+ AddrRegToHardenedReg[Op->getReg()] = TmpReg;
+ Op->setReg(TmpReg);
+ ++NumAddrRegsHardened;
+ }
+
+ // And restore the flags if needed.
+ if (FlagsReg)
+ restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
+}
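+
+ // When EFLAGS are live and BMI2 is available, the flag-preserving variant
+ // sketched below is used for GPR address operands instead of `or`
+ // (registers illustrative):
+ // ```
+ // shrxq %rax, %rdi, %rdi   # %rax holds the state: a zero state leaves the
+ //                          # pointer unchanged, an all-ones state shifts it
+ //                          # right by 63 and destroys the address
+ // ```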
+
+MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
+ MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedInstrs) {
+ assert(X86InstrInfo::isDataInvariantLoad(InitialMI) &&
+ "Cannot get here with a non-invariant load!");
+ assert(!isEFLAGSDefLive(InitialMI) &&
+ "Cannot get here with a data invariant load "
+ "that interferes with EFLAGS!");
+
+ // See if we can sink hardening the loaded value.
+ auto SinkCheckToSingleUse =
+ [&](MachineInstr &MI) -> Optional<MachineInstr *> {
+ Register DefReg = MI.getOperand(0).getReg();
+
+ // We need to find a single use to which we can sink the check. We can
+ // primarily do this because many uses may already end up checked on their
+ // own.
+ MachineInstr *SingleUseMI = nullptr;
+ for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) {
+ // If we're already going to harden this use, it is data invariant, it
+ // does not interfere with EFLAGS, and it is within our block.
+ if (HardenedInstrs.count(&UseMI)) {
+ if (!X86InstrInfo::isDataInvariantLoad(UseMI) || isEFLAGSDefLive(UseMI)) {
+ // If we've already decided to harden a non-load, we must have sunk
+ // some other post-load hardened instruction to it and it must itself
+ // be data-invariant.
+ assert(X86InstrInfo::isDataInvariant(UseMI) &&
+ "Data variant instruction being hardened!");
+ continue;
+ }
+
+ // Otherwise, this is a load and the load component can't be data
+ // invariant so check how this register is being used.
+ const MCInstrDesc &Desc = UseMI.getDesc();
+ int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
+ assert(MemRefBeginIdx >= 0 &&
+ "Should always have mem references here!");
+ MemRefBeginIdx += X86II::getOperandBias(Desc);
+
+ MachineOperand &BaseMO =
+ UseMI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
+ MachineOperand &IndexMO =
+ UseMI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
+ if ((BaseMO.isReg() && BaseMO.getReg() == DefReg) ||
+ (IndexMO.isReg() && IndexMO.getReg() == DefReg))
+ // The load uses the register as part of its address making it not
+ // invariant.
+ return {};
+
+ continue;
+ }
+
+ if (SingleUseMI)
+ // We already have a single use, this would make two. Bail.
+ return {};
+
+ // If this single use isn't data invariant, isn't in this block, or has
+ // interfering EFLAGS, we can't sink the hardening to it.
+ if (!X86InstrInfo::isDataInvariant(UseMI) || UseMI.getParent() != MI.getParent() ||
+ isEFLAGSDefLive(UseMI))
+ return {};
+
+ // If this instruction defines multiple registers bail as we won't harden
+ // all of them.
+ if (UseMI.getDesc().getNumDefs() > 1)
+ return {};
+
+      // If this register isn't a virtual register, we can't sanely walk its
+      // uses, so just bail. Also check that its register class is one of the
+      // ones we can harden.
+ Register UseDefReg = UseMI.getOperand(0).getReg();
+ if (!UseDefReg.isVirtual() || !canHardenRegister(UseDefReg))
+ return {};
+
+ SingleUseMI = &UseMI;
+ }
+
+ // If SingleUseMI is still null, there is no use that needs its own
+ // checking. Otherwise, it is the single use that needs checking.
+ return {SingleUseMI};
+ };
+
+ MachineInstr *MI = &InitialMI;
+ while (Optional<MachineInstr *> SingleUse = SinkCheckToSingleUse(*MI)) {
+ // Update which MI we're checking now.
+ MI = *SingleUse;
+ if (!MI)
+ break;
+ }
+
+ return MI;
+}
+
+bool X86SpeculativeLoadHardeningPass::canHardenRegister(Register Reg) {
+ auto *RC = MRI->getRegClass(Reg);
+ int RegBytes = TRI->getRegSizeInBits(*RC) / 8;
+ if (RegBytes > 8)
+ // We don't support post-load hardening of vectors.
+ return false;
+
+ unsigned RegIdx = Log2_32(RegBytes);
+ assert(RegIdx < 4 && "Unsupported register size");
+
+ // If this register class is explicitly constrained to a class that doesn't
+ // require REX prefix, we may not be able to satisfy that constraint when
+ // emitting the hardening instructions, so bail out here.
+ // FIXME: This seems like a pretty lame hack. The way this comes up is when we
+ // end up both with a NOREX and REX-only register as operands to the hardening
+ // instructions. It would be better to fix that code to handle this situation
+ // rather than hack around it in this way.
+ const TargetRegisterClass *NOREXRegClasses[] = {
+ &X86::GR8_NOREXRegClass, &X86::GR16_NOREXRegClass,
+ &X86::GR32_NOREXRegClass, &X86::GR64_NOREXRegClass};
+ if (RC == NOREXRegClasses[RegIdx])
+ return false;
+
+ const TargetRegisterClass *GPRRegClasses[] = {
+ &X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass,
+ &X86::GR64RegClass};
+ return RC->hasSuperClassEq(GPRRegClasses[RegIdx]);
+}
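The table lookups above key off the log2 of the register size in bytes. A standalone restatement of that mapping, for illustration only (the real code uses Log2_32 from llvm/Support/MathExtras.h):

    #include <cassert>

    // 1 byte -> 0 (GR8), 2 -> 1 (GR16), 4 -> 2 (GR32), 8 -> 3 (GR64).
    unsigned regClassIndexForBytes(unsigned Bytes) {
      assert((Bytes == 1 || Bytes == 2 || Bytes == 4 || Bytes == 8) &&
             "only power-of-two GPR sizes up to 8 bytes are hardened");
      unsigned Idx = 0;
      while ((1u << Idx) < Bytes)
        ++Idx;
      return Idx;
    }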
+
+/// Harden a value in a register.
+///
+/// This is the low-level logic to fully harden a value sitting in a register
+/// against leaking during speculative execution.
+///
+/// Unlike hardening an address that is used by a load, this routine is required
+/// to hide *all* incoming bits in the register.
+///
+/// `Reg` must be a virtual register. Currently, it is required to be a GPR no
+/// larger than the predicate state register. FIXME: We should support vector
+/// registers here by broadcasting the predicate state.
+///
+/// The new, hardened virtual register is returned. It will have the same
+/// register class as `Reg`.
+unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister(
+ Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
+ DebugLoc Loc) {
+ assert(canHardenRegister(Reg) && "Cannot harden this register!");
+ assert(Reg.isVirtual() && "Cannot harden a physical register!");
+
+ auto *RC = MRI->getRegClass(Reg);
+ int Bytes = TRI->getRegSizeInBits(*RC) / 8;
+
+ unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
+
+ // FIXME: Need to teach this about 32-bit mode.
+ if (Bytes != 8) {
+ unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit};
+ unsigned SubRegImm = SubRegImms[Log2_32(Bytes)];
+ Register NarrowStateReg = MRI->createVirtualRegister(RC);
+ BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), NarrowStateReg)
+ .addReg(StateReg, 0, SubRegImm);
+ StateReg = NarrowStateReg;
+ }
+
+ unsigned FlagsReg = 0;
+ if (isEFLAGSLive(MBB, InsertPt, *TRI))
+ FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
+
+ Register NewReg = MRI->createVirtualRegister(RC);
+ unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr};
+ unsigned OrOpCode = OrOpCodes[Log2_32(Bytes)];
+ auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), NewReg)
+ .addReg(StateReg)
+ .addReg(Reg);
+ OrI->addRegisterDead(X86::EFLAGS, TRI);
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
+
+ if (FlagsReg)
+ restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
+
+ return NewReg;
+}
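Stripped of the MachineIR plumbing, the hardening emitted above is a single OR with the (possibly sub-register-narrowed) predicate state. A minimal sketch of that data transformation, with an illustrative helper name:

    #include <cstdint>

    // During correct speculation the state is 0 and the value passes through
    // unchanged; during misspeculation the state is all-ones and every bit of
    // the value is forced to 1, so nothing useful can leak through it.
    uint64_t hardenValueSketch(uint64_t PredState, uint64_t Value) {
      return Value | PredState; // mirrors the OR8rr/OR16rr/OR32rr/OR64rr above
    }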
+
+/// Harden a load by hardening the loaded value in the defined register.
+///
+/// We can harden a non-leaking load into a register without touching the
+/// address by just hiding all of the loaded bits during misspeculation. We use
+/// an `or` instruction to do this because we set up our poison value as all
+/// ones. And the goal is just for the loaded bits to not be exposed to
+/// execution and coercing them to one is sufficient.
+///
+/// Returns the newly hardened register.
+unsigned X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc Loc = MI.getDebugLoc();
+
+ auto &DefOp = MI.getOperand(0);
+ Register OldDefReg = DefOp.getReg();
+ auto *DefRC = MRI->getRegClass(OldDefReg);
+
+ // Because we want to completely replace the uses of this def'ed value with
+ // the hardened value, create a dedicated new register that will only be used
+ // to communicate the unhardened value to the hardening.
+ Register UnhardenedReg = MRI->createVirtualRegister(DefRC);
+ DefOp.setReg(UnhardenedReg);
+
+ // Now harden this register's value, getting a hardened reg that is safe to
+ // use. Note that we insert the instructions to compute this *after* the
+ // defining instruction, not before it.
+ unsigned HardenedReg = hardenValueInRegister(
+ UnhardenedReg, MBB, std::next(MI.getIterator()), Loc);
+
+ // Finally, replace the old register (which now only has the uses of the
+ // original def) with the hardened register.
+ MRI->replaceRegWith(/*FromReg*/ OldDefReg, /*ToReg*/ HardenedReg);
+
+ ++NumPostLoadRegsHardened;
+ return HardenedReg;
+}
+
+/// Harden a return instruction.
+///
+/// Returns implicitly perform a load which we need to harden. Without hardening
+/// this load, an attacker may speculatively write over the return address to
+/// steer speculation of the return to an attacker controlled address. This is
+/// called Spectre v1.1 or Bounds Check Bypass Store (BCBS) and is described in
+/// this paper:
+/// https://people.csail.mit.edu/vlk/spectre11.pdf
+///
+/// We can harden this by introducing an LFENCE that will delay any load of the
+/// return address until prior instructions have retired (and thus are not being
+/// speculated), or we can harden the address used by the implicit load: the
+/// stack pointer.
+///
+/// If we are not using an LFENCE, hardening the stack pointer has an additional
+/// benefit: it allows us to pass the predicate state accumulated in this
+/// function back to the caller. In the absence of a BCBS attack on the return,
+/// the caller will typically be resumed and speculatively executed due to the
+/// Return Stack Buffer (RSB) prediction which is very accurate and has a high
+/// priority. It is possible that some code from the caller will be executed
+/// speculatively even during a BCBS-attacked return until the steering takes
+/// effect. Whenever this happens, the caller can recover the (poisoned)
+/// predicate state from the stack pointer and continue to harden loads.
+void X86SpeculativeLoadHardeningPass::hardenReturnInstr(MachineInstr &MI) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc Loc = MI.getDebugLoc();
+ auto InsertPt = MI.getIterator();
+
+ if (FenceCallAndRet)
+ // No need to fence here as we'll fence at the return site itself. That
+ // handles more cases than we can handle here.
+ return;
+
+ // Take our predicate state, shift it to the high 17 bits (so that we keep
+ // pointers canonical) and merge it into RSP. This will allow the caller to
+ // extract it when we return (speculatively).
+ mergePredStateIntoSP(MBB, InsertPt, Loc, PS->SSA.GetValueAtEndOfBlock(&MBB));
+}
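The merge into RSP is performed elsewhere (mergePredStateIntoSP), but the arithmetic the comment describes can be sketched in plain C++, assuming a one-word state of 0 or all-ones shifted into the high 17 bits (i.e. left by 47); the helper names are illustrative:

    #include <cstdint>

    // Shifting the all-ones state left by 47 sets the top 17 bits, making RSP
    // non-canonical under misspeculation while leaving a canonical pointer
    // untouched when the state is 0. The caller recovers the state by smearing
    // the high bit back across the whole register.
    uint64_t mergeStateIntoSPSketch(uint64_t SP, uint64_t State) {
      return SP | (State << 47);
    }
    uint64_t extractStateFromSPSketch(uint64_t SP) {
      return static_cast<uint64_t>(static_cast<int64_t>(SP) >> 63);
    }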
+
+/// Trace the predicate state through a call.
+///
+/// There are several layers of this needed to handle the full complexity of
+/// calls.
+///
+/// First, we need to send the predicate state into the called function. We do
+/// this by merging it into the high bits of the stack pointer.
+///
+/// For tail calls, this is all we need to do.
+///
+/// For calls where we might return and resume the control flow, we need to
+/// extract the predicate state from the high bits of the stack pointer after
+/// control returns from the called function.
+///
+/// We also need to verify that we intended to return to this location in the
+/// code. An attacker might arrange for the processor to mispredict the return
+/// to this valid but incorrect return address in the program rather than the
+/// correct one. See the paper on this attack, called "ret2spec" by the
+/// researchers, here:
+/// https://christian-rossow.de/publications/ret2spec-ccs2018.pdf
+///
+/// The way we verify that we returned to the correct location is by preserving
+/// the expected return address across the call. One technique involves taking
+/// advantage of the red-zone to load the return address from `8(%rsp)` where it
+/// was left by the RET instruction when it popped `%rsp`. Alternatively, we can
+/// directly save the address into a register that will be preserved across the
+/// call. We compare this intended return address against the address
+/// immediately following the call (the observed return address). If these
+/// mismatch, we have detected misspeculation and can poison our predicate
+/// state.
+void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall(
+ MachineInstr &MI) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ auto InsertPt = MI.getIterator();
+ DebugLoc Loc = MI.getDebugLoc();
+
+ if (FenceCallAndRet) {
+ if (MI.isReturn())
+ // Tail call, we don't return to this function.
+ // FIXME: We should also handle noreturn calls.
+ return;
+
+ // We don't need to fence before the call because the function should fence
+ // in its entry. However, we do need to fence after the call returns.
+ // Fencing before the return doesn't correctly handle cases where the return
+ // itself is mispredicted.
+ BuildMI(MBB, std::next(InsertPt), Loc, TII->get(X86::LFENCE));
+ ++NumInstsInserted;
+ ++NumLFENCEsInserted;
+ return;
+ }
+
+ // First, we transfer the predicate state into the called function by merging
+ // it into the stack pointer. This will kill the current def of the state.
+ unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
+ mergePredStateIntoSP(MBB, InsertPt, Loc, StateReg);
+
+ // If this call is also a return, it is a tail call and we don't need anything
+ // else to handle it so just return. Also, if there are no further
+ // instructions and no successors, this call does not return so we can also
+ // bail.
+ if (MI.isReturn() || (std::next(InsertPt) == MBB.end() && MBB.succ_empty()))
+ return;
+
+ // Create a symbol to track the return address and attach it to the call
+ // machine instruction. We will lower extra symbols attached to call
+  // instructions as labels immediately following the call.
+ MCSymbol *RetSymbol =
+ MF.getContext().createTempSymbol("slh_ret_addr",
+ /*AlwaysAddSuffix*/ true);
+ MI.setPostInstrSymbol(MF, RetSymbol);
+
+ const TargetRegisterClass *AddrRC = &X86::GR64RegClass;
+ unsigned ExpectedRetAddrReg = 0;
+
+ // If we have no red zones or if the function returns twice (possibly without
+ // using the `ret` instruction) like setjmp, we need to save the expected
+ // return address prior to the call.
+ if (!Subtarget->getFrameLowering()->has128ByteRedZone(MF) ||
+ MF.exposesReturnsTwice()) {
+ // If we don't have red zones, we need to compute the expected return
+ // address prior to the call and store it in a register that lives across
+ // the call.
+ //
+ // In some ways, this is doubly satisfying as a mitigation because it will
+ // also successfully detect stack smashing bugs in some cases (typically,
+ // when a callee-saved register is used and the callee doesn't push it onto
+ // the stack). But that isn't our primary goal, so we only use it as
+ // a fallback.
+ //
+ // FIXME: It isn't clear that this is reliable in the face of
+ // rematerialization in the register allocator. We somehow need to force
+ // that to not occur for this particular instruction, and instead to spill
+ // or otherwise preserve the value computed *prior* to the call.
+ //
+ // FIXME: It is even less clear why MachineCSE can't just fold this when we
+ // end up having to use identical instructions both before and after the
+ // call to feed the comparison.
+ ExpectedRetAddrReg = MRI->createVirtualRegister(AddrRC);
+ if (MF.getTarget().getCodeModel() == CodeModel::Small &&
+ !Subtarget->isPositionIndependent()) {
+ BuildMI(MBB, InsertPt, Loc, TII->get(X86::MOV64ri32), ExpectedRetAddrReg)
+ .addSym(RetSymbol);
+ } else {
+ BuildMI(MBB, InsertPt, Loc, TII->get(X86::LEA64r), ExpectedRetAddrReg)
+ .addReg(/*Base*/ X86::RIP)
+ .addImm(/*Scale*/ 1)
+ .addReg(/*Index*/ 0)
+ .addSym(RetSymbol)
+ .addReg(/*Segment*/ 0);
+ }
+ }
+
+ // Step past the call to handle when it returns.
+ ++InsertPt;
+
+ // If we didn't pre-compute the expected return address into a register, then
+ // red zones are enabled and the return address is still available on the
+ // stack immediately after the call. As the very first instruction, we load it
+ // into a register.
+ if (!ExpectedRetAddrReg) {
+ ExpectedRetAddrReg = MRI->createVirtualRegister(AddrRC);
+ BuildMI(MBB, InsertPt, Loc, TII->get(X86::MOV64rm), ExpectedRetAddrReg)
+ .addReg(/*Base*/ X86::RSP)
+ .addImm(/*Scale*/ 1)
+ .addReg(/*Index*/ 0)
+ .addImm(/*Displacement*/ -8) // The stack pointer has been popped, so
+ // the return address is 8-bytes past it.
+ .addReg(/*Segment*/ 0);
+ }
+
+ // Now we extract the callee's predicate state from the stack pointer.
+ unsigned NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc);
+
+ // Test the expected return address against our actual address. If we can
+ // form this basic block's address as an immediate, this is easy. Otherwise
+ // we compute it.
+ if (MF.getTarget().getCodeModel() == CodeModel::Small &&
+ !Subtarget->isPositionIndependent()) {
+ // FIXME: Could we fold this with the load? It would require careful EFLAGS
+ // management.
+ BuildMI(MBB, InsertPt, Loc, TII->get(X86::CMP64ri32))
+ .addReg(ExpectedRetAddrReg, RegState::Kill)
+ .addSym(RetSymbol);
+ } else {
+ Register ActualRetAddrReg = MRI->createVirtualRegister(AddrRC);
+ BuildMI(MBB, InsertPt, Loc, TII->get(X86::LEA64r), ActualRetAddrReg)
+ .addReg(/*Base*/ X86::RIP)
+ .addImm(/*Scale*/ 1)
+ .addReg(/*Index*/ 0)
+ .addSym(RetSymbol)
+ .addReg(/*Segment*/ 0);
+ BuildMI(MBB, InsertPt, Loc, TII->get(X86::CMP64rr))
+ .addReg(ExpectedRetAddrReg, RegState::Kill)
+ .addReg(ActualRetAddrReg, RegState::Kill);
+ }
+
+ // Now conditionally update the predicate state we just extracted if we ended
+ // up at a different return address than expected.
+ int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
+ auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);
+
+ Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
+ auto CMovI = BuildMI(MBB, InsertPt, Loc, TII->get(CMovOp), UpdatedStateReg)
+ .addReg(NewStateReg, RegState::Kill)
+ .addReg(PS->PoisonReg)
+ .addImm(X86::COND_NE);
+ CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump(); dbgs() << "\n");
+
+ PS->SSA.AddAvailableValue(&MBB, UpdatedStateReg);
+}
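The post-call check built above reduces to a compare-and-select. A hedged model of just that decision, independent of the CMP/CMOV encodings used in the pass:

    #include <cstdint>

    // If we returned somewhere other than the recorded return address, the
    // extracted predicate state is replaced with the all-ones poison value.
    uint64_t checkReturnAddressSketch(uint64_t ExpectedRetAddr,
                                      uint64_t ActualRetAddr,
                                      uint64_t ExtractedState,
                                      uint64_t PoisonValue) {
      return ExpectedRetAddr == ActualRetAddr ? ExtractedState : PoisonValue;
    }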
+
+/// An attacker may speculatively store over a value that is then speculatively
+/// loaded and used as the target of an indirect call or jump instruction. This
+/// is called Spectre v1.2 or Bounds Check Bypass Store (BCBS) and is described
+/// in this paper:
+/// https://people.csail.mit.edu/vlk/spectre11.pdf
+///
+/// When this happens, the speculative execution of the call or jump will end up
+/// being steered to this attacker controlled address. While most such loads
+/// will be adequately hardened already, we want to ensure that they are
+/// definitively treated as needing post-load hardening. While address hardening
+/// is sufficient to prevent secret data from leaking to the attacker, it may
+/// not be sufficient to prevent an attacker from steering speculative
+/// execution. We forcibly unfolded all relevant loads above and so will always
+/// have an opportunity to post-load harden here; we just need to scan for cases
+/// not already flagged and add them.
+void X86SpeculativeLoadHardeningPass::hardenIndirectCallOrJumpInstr(
+ MachineInstr &MI,
+ SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
+ switch (MI.getOpcode()) {
+ case X86::FARCALL16m:
+ case X86::FARCALL32m:
+ case X86::FARCALL64m:
+ case X86::FARJMP16m:
+ case X86::FARJMP32m:
+ case X86::FARJMP64m:
+ // We don't need to harden either far calls or far jumps as they are
+ // safe from Spectre.
+ return;
+
+ default:
+ break;
+ }
+
+ // We should never see a loading instruction at this point, as those should
+ // have been unfolded.
+ assert(!MI.mayLoad() && "Found a lingering loading instruction!");
+
+ // If the first operand isn't a register, this is a branch or call
+ // instruction with an immediate operand which doesn't need to be hardened.
+ if (!MI.getOperand(0).isReg())
+ return;
+
+ // For all of these, the target register is the first operand of the
+ // instruction.
+ auto &TargetOp = MI.getOperand(0);
+ Register OldTargetReg = TargetOp.getReg();
+
+ // Try to lookup a hardened version of this register. We retain a reference
+ // here as we want to update the map to track any newly computed hardened
+ // register.
+ unsigned &HardenedTargetReg = AddrRegToHardenedReg[OldTargetReg];
+
+ // If we don't have a hardened register yet, compute one. Otherwise, just use
+ // the already hardened register.
+ //
+ // FIXME: It is a little suspect that we use partially hardened registers that
+ // only feed addresses. The complexity of partial hardening with SHRX
+ // continues to pile up. Should definitively measure its value and consider
+ // eliminating it.
+ if (!HardenedTargetReg)
+ HardenedTargetReg = hardenValueInRegister(
+ OldTargetReg, *MI.getParent(), MI.getIterator(), MI.getDebugLoc());
+
+ // Set the target operand to the hardened register.
+ TargetOp.setReg(HardenedTargetReg);
+
+ ++NumCallsOrJumpsHardened;
+}
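The caching pattern used above (take a reference into the map first, then fill it only if it is still zero) can be shown with standard containers. A sketch with hypothetical names; the pass itself uses a SmallDenseMap keyed by virtual register numbers, where 0 is never a valid register:

    #include <unordered_map>

    // A default-constructed entry is 0, which doubles as "not computed yet".
    template <typename HardenFn>
    unsigned getOrCreateHardenedReg(std::unordered_map<unsigned, unsigned> &Map,
                                    unsigned Reg, HardenFn Harden) {
      unsigned &Hardened = Map[Reg];
      if (!Hardened)
        Hardened = Harden(Reg);
      return Hardened;
    }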
+
+INITIALIZE_PASS_BEGIN(X86SpeculativeLoadHardeningPass, PASS_KEY,
+ "X86 speculative load hardener", false, false)
+INITIALIZE_PASS_END(X86SpeculativeLoadHardeningPass, PASS_KEY,
+ "X86 speculative load hardener", false, false)
+
+FunctionPass *llvm::createX86SpeculativeLoadHardeningPass() {
+ return new X86SpeculativeLoadHardeningPass();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.cpp
new file mode 100644
index 000000000000..c95213c3539d
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -0,0 +1,352 @@
+//===-- X86Subtarget.cpp - X86 Subtarget Information ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86Subtarget.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86.h"
+#include "X86CallLowering.h"
+#include "X86LegalizerInfo.h"
+#include "X86MacroFusion.h"
+#include "X86RegisterBankInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+
+using namespace llvm;
+
+#define DEBUG_TYPE "subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "X86GenSubtargetInfo.inc"
+
+// Temporary option to control early if-conversion for x86 while adding machine
+// models.
+static cl::opt<bool>
+X86EarlyIfConv("x86-early-ifcvt", cl::Hidden,
+ cl::desc("Enable early if-conversion on X86"));
+
+
+/// Classify a blockaddress reference for the current subtarget according to how
+/// we should reference it in a non-pcrel context.
+unsigned char X86Subtarget::classifyBlockAddressReference() const {
+ return classifyLocalReference(nullptr);
+}
+
+/// Classify a global variable reference for the current subtarget according to
+/// how we should reference it in a non-pcrel context.
+unsigned char
+X86Subtarget::classifyGlobalReference(const GlobalValue *GV) const {
+ return classifyGlobalReference(GV, *GV->getParent());
+}
+
+unsigned char
+X86Subtarget::classifyLocalReference(const GlobalValue *GV) const {
+ // If we're not PIC, it's not very interesting.
+ if (!isPositionIndependent())
+ return X86II::MO_NO_FLAG;
+
+ if (is64Bit()) {
+ // 64-bit ELF PIC local references may use GOTOFF relocations.
+ if (isTargetELF()) {
+ switch (TM.getCodeModel()) {
+ // 64-bit small code model is simple: All rip-relative.
+ case CodeModel::Tiny:
+ llvm_unreachable("Tiny codesize model not supported on X86");
+ case CodeModel::Small:
+ case CodeModel::Kernel:
+ return X86II::MO_NO_FLAG;
+
+ // The large PIC code model uses GOTOFF.
+ case CodeModel::Large:
+ return X86II::MO_GOTOFF;
+
+ // Medium is a hybrid: RIP-rel for code, GOTOFF for DSO local data.
+ case CodeModel::Medium:
+ // Constant pool and jump table handling pass a nullptr to this
+ // function so we need to use isa_and_nonnull.
+ if (isa_and_nonnull<Function>(GV))
+ return X86II::MO_NO_FLAG; // All code is RIP-relative
+ return X86II::MO_GOTOFF; // Local symbols use GOTOFF.
+ }
+ llvm_unreachable("invalid code model");
+ }
+
+ // Otherwise, this is either a RIP-relative reference or a 64-bit movabsq,
+ // both of which use MO_NO_FLAG.
+ return X86II::MO_NO_FLAG;
+ }
+
+ // The COFF dynamic linker just patches the executable sections.
+ if (isTargetCOFF())
+ return X86II::MO_NO_FLAG;
+
+ if (isTargetDarwin()) {
+ // 32 bit macho has no relocation for a-b if a is undefined, even if
+ // b is in the section that is being relocated.
+    // This means we have to use a load even for GVs that are known to be
+ // local to the dso.
+ if (GV && (GV->isDeclarationForLinker() || GV->hasCommonLinkage()))
+ return X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+
+ return X86II::MO_PIC_BASE_OFFSET;
+ }
+
+ return X86II::MO_GOTOFF;
+}
+
+unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV,
+ const Module &M) const {
+ // The static large model never uses stubs.
+ if (TM.getCodeModel() == CodeModel::Large && !isPositionIndependent())
+ return X86II::MO_NO_FLAG;
+
+ // Absolute symbols can be referenced directly.
+ if (GV) {
+ if (Optional<ConstantRange> CR = GV->getAbsoluteSymbolRange()) {
+ // See if we can use the 8-bit immediate form. Note that some instructions
+ // will sign extend the immediate operand, so to be conservative we only
+ // accept the range [0,128).
+ if (CR->getUnsignedMax().ult(128))
+ return X86II::MO_ABS8;
+ else
+ return X86II::MO_NO_FLAG;
+ }
+ }
+
+ if (TM.shouldAssumeDSOLocal(M, GV))
+ return classifyLocalReference(GV);
+
+ if (isTargetCOFF()) {
+ if (GV->hasDLLImportStorageClass())
+ return X86II::MO_DLLIMPORT;
+ return X86II::MO_COFFSTUB;
+ }
+ // Some JIT users use *-win32-elf triples; these shouldn't use GOT tables.
+ if (isOSWindows())
+ return X86II::MO_NO_FLAG;
+
+ if (is64Bit()) {
+ // ELF supports a large, truly PIC code model with non-PC relative GOT
+ // references. Other object file formats do not. Use the no-flag, 64-bit
+ // reference for them.
+ if (TM.getCodeModel() == CodeModel::Large)
+ return isTargetELF() ? X86II::MO_GOT : X86II::MO_NO_FLAG;
+ return X86II::MO_GOTPCREL;
+ }
+
+ if (isTargetDarwin()) {
+ if (!isPositionIndependent())
+ return X86II::MO_DARWIN_NONLAZY;
+ return X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+ }
+
+ // 32-bit ELF references GlobalAddress directly in static relocation model.
+ // We cannot use MO_GOT because EBX may not be set up.
+ if (TM.getRelocationModel() == Reloc::Static)
+ return X86II::MO_NO_FLAG;
+ return X86II::MO_GOT;
+}
+
+unsigned char
+X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV) const {
+ return classifyGlobalFunctionReference(GV, *GV->getParent());
+}
+
+unsigned char
+X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
+ const Module &M) const {
+ if (TM.shouldAssumeDSOLocal(M, GV))
+ return X86II::MO_NO_FLAG;
+
+ // Functions on COFF can be non-DSO local for two reasons:
+ // - They are marked dllimport
+ // - They are extern_weak, and a stub is needed
+ if (isTargetCOFF()) {
+ if (GV->hasDLLImportStorageClass())
+ return X86II::MO_DLLIMPORT;
+ return X86II::MO_COFFSTUB;
+ }
+
+ const Function *F = dyn_cast_or_null<Function>(GV);
+
+ if (isTargetELF()) {
+ if (is64Bit() && F && (CallingConv::X86_RegCall == F->getCallingConv()))
+ // According to psABI, PLT stub clobbers XMM8-XMM15.
+ // In Regcall calling convention those registers are used for passing
+ // parameters. Thus we need to prevent lazy binding in Regcall.
+ return X86II::MO_GOTPCREL;
+ // If PLT must be avoided then the call should be via GOTPCREL.
+ if (((F && F->hasFnAttribute(Attribute::NonLazyBind)) ||
+ (!F && M.getRtLibUseGOT())) &&
+ is64Bit())
+ return X86II::MO_GOTPCREL;
+ // Reference ExternalSymbol directly in static relocation model.
+ if (!is64Bit() && !GV && TM.getRelocationModel() == Reloc::Static)
+ return X86II::MO_NO_FLAG;
+ return X86II::MO_PLT;
+ }
+
+ if (is64Bit()) {
+ if (F && F->hasFnAttribute(Attribute::NonLazyBind))
+ // If the function is marked as non-lazy, generate an indirect call
+ // which loads from the GOT directly. This avoids runtime overhead
+ // at the cost of eager binding (and one extra byte of encoding).
+ return X86II::MO_GOTPCREL;
+ return X86II::MO_NO_FLAG;
+ }
+
+ return X86II::MO_NO_FLAG;
+}
+
+/// Return true if the subtarget allows calls to an immediate address.
+bool X86Subtarget::isLegalToCallImmediateAddr() const {
+ // FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32
+ // but WinCOFFObjectWriter::RecordRelocation cannot emit them. Once it does,
+ // the following check for Win32 should be removed.
+ if (In64BitMode || isTargetWin32())
+ return false;
+ return isTargetELF() || TM.getRelocationModel() == Reloc::Static;
+}
+
+void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
+ StringRef FS) {
+ if (CPU.empty())
+ CPU = "generic";
+
+ if (TuneCPU.empty())
+ TuneCPU = "i586"; // FIXME: "generic" is more modern than llc tests expect.
+
+ std::string FullFS = X86_MC::ParseX86Triple(TargetTriple);
+ assert(!FullFS.empty() && "Failed to parse X86 triple");
+
+ if (!FS.empty())
+ FullFS = (Twine(FullFS) + "," + FS).str();
+
+ // Parse features string and set the CPU.
+ ParseSubtargetFeatures(CPU, TuneCPU, FullFS);
+
+ // All CPUs that implement SSE4.2 or SSE4A support unaligned accesses of
+ // 16-bytes and under that are reasonably fast. These features were
+ // introduced with Intel's Nehalem/Silvermont and AMD's Family10h
+ // micro-architectures respectively.
+ if (hasSSE42() || hasSSE4A())
+ IsUAMem16Slow = false;
+
+ LLVM_DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel
+ << ", 3DNowLevel " << X863DNowLevel << ", 64bit "
+ << HasX86_64 << "\n");
+ if (In64BitMode && !HasX86_64)
+ report_fatal_error("64-bit code requested on a subtarget that doesn't "
+ "support it!");
+
+ // Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD and for all
+ // 64-bit targets. On Solaris (32-bit), stack alignment is 4 bytes
+ // following the i386 psABI, while on Illumos it is always 16 bytes.
+ if (StackAlignOverride)
+ stackAlignment = *StackAlignOverride;
+ else if (isTargetDarwin() || isTargetLinux() || isTargetKFreeBSD() ||
+ In64BitMode)
+ stackAlignment = Align(16);
+
+ // Consume the vector width attribute or apply any target specific limit.
+ if (PreferVectorWidthOverride)
+ PreferVectorWidth = PreferVectorWidthOverride;
+ else if (Prefer128Bit)
+ PreferVectorWidth = 128;
+ else if (Prefer256Bit)
+ PreferVectorWidth = 256;
+}
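The feature-string handling above is simple concatenation before ParseSubtargetFeatures runs. A hedged illustration; the exact triple-derived features come from X86_MC::ParseX86Triple, and "+64bit-mode" is given only as an example:

    #include <string>

    // Triple-derived mode features first, then any explicit feature string,
    // comma-separated, e.g. "+64bit-mode,-32bit-mode,-16bit-mode,+avx2".
    std::string buildFullFS(const std::string &TripleFS, const std::string &FS) {
      return FS.empty() ? TripleFS : TripleFS + "," + FS;
    }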
+
+X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
+ StringRef TuneCPU,
+ StringRef FS) {
+ initSubtargetFeatures(CPU, TuneCPU, FS);
+ return *this;
+}
+
+X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU,
+ StringRef FS, const X86TargetMachine &TM,
+ MaybeAlign StackAlignOverride,
+ unsigned PreferVectorWidthOverride,
+ unsigned RequiredVectorWidth)
+ : X86GenSubtargetInfo(TT, CPU, TuneCPU, FS),
+ PICStyle(PICStyles::Style::None), TM(TM), TargetTriple(TT),
+ StackAlignOverride(StackAlignOverride),
+ PreferVectorWidthOverride(PreferVectorWidthOverride),
+ RequiredVectorWidth(RequiredVectorWidth),
+ InstrInfo(initializeSubtargetDependencies(CPU, TuneCPU, FS)),
+ TLInfo(TM, *this), FrameLowering(*this, getStackAlignment()) {
+ // Determine the PICStyle based on the target selected.
+ if (!isPositionIndependent())
+ setPICStyle(PICStyles::Style::None);
+ else if (is64Bit())
+ setPICStyle(PICStyles::Style::RIPRel);
+ else if (isTargetCOFF())
+ setPICStyle(PICStyles::Style::None);
+ else if (isTargetDarwin())
+ setPICStyle(PICStyles::Style::StubPIC);
+ else if (isTargetELF())
+ setPICStyle(PICStyles::Style::GOT);
+
+ CallLoweringInfo.reset(new X86CallLowering(*getTargetLowering()));
+ Legalizer.reset(new X86LegalizerInfo(*this, TM));
+
+ auto *RBI = new X86RegisterBankInfo(*getRegisterInfo());
+ RegBankInfo.reset(RBI);
+ InstSelector.reset(createX86InstructionSelector(TM, *this, *RBI));
+}
+
+const CallLowering *X86Subtarget::getCallLowering() const {
+ return CallLoweringInfo.get();
+}
+
+InstructionSelector *X86Subtarget::getInstructionSelector() const {
+ return InstSelector.get();
+}
+
+const LegalizerInfo *X86Subtarget::getLegalizerInfo() const {
+ return Legalizer.get();
+}
+
+const RegisterBankInfo *X86Subtarget::getRegBankInfo() const {
+ return RegBankInfo.get();
+}
+
+bool X86Subtarget::enableEarlyIfConversion() const {
+ return hasCMov() && X86EarlyIfConv;
+}
+
+void X86Subtarget::getPostRAMutations(
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
+ Mutations.push_back(createX86MacroFusionDAGMutation());
+}
+
+bool X86Subtarget::isPositionIndependent() const {
+ return TM.isPositionIndependent();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.h
new file mode 100644
index 000000000000..fa2622333d60
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.h
@@ -0,0 +1,949 @@
+//===-- X86Subtarget.h - Define Subtarget for the X86 ----------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86SUBTARGET_H
+#define LLVM_LIB_TARGET_X86_X86SUBTARGET_H
+
+#include "X86FrameLowering.h"
+#include "X86ISelLowering.h"
+#include "X86InstrInfo.h"
+#include "X86SelectionDAGInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include <climits>
+#include <memory>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "X86GenSubtargetInfo.inc"
+
+namespace llvm {
+
+class CallLowering;
+class GlobalValue;
+class InstructionSelector;
+class LegalizerInfo;
+class RegisterBankInfo;
+class StringRef;
+class TargetMachine;
+
+/// The X86 backend supports a number of different styles of PIC.
+///
+namespace PICStyles {
+
+enum class Style {
+ StubPIC, // Used on i386-darwin in pic mode.
+ GOT, // Used on 32 bit elf on when in pic mode.
+ RIPRel, // Used on X86-64 when in pic mode.
+ None // Set when not in pic mode.
+};
+
+} // end namespace PICStyles
+
+class X86Subtarget final : public X86GenSubtargetInfo {
+ // NOTE: Do not add anything new to this list. Coarse, CPU name based flags
+ // are not a good idea. We should be migrating away from these.
+ enum X86ProcFamilyEnum {
+ Others,
+ IntelAtom,
+ IntelSLM
+ };
+
+ enum X86SSEEnum {
+ NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
+ };
+
+ enum X863DNowEnum {
+ NoThreeDNow, MMX, ThreeDNow, ThreeDNowA
+ };
+
+ /// X86 processor family: Intel Atom, and others
+ X86ProcFamilyEnum X86ProcFamily = Others;
+
+ /// Which PIC style to use
+ PICStyles::Style PICStyle;
+
+ const TargetMachine &TM;
+
+ /// SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported.
+ X86SSEEnum X86SSELevel = NoSSE;
+
+ /// MMX, 3DNow, 3DNow Athlon, or none supported.
+ X863DNowEnum X863DNowLevel = NoThreeDNow;
+
+ /// True if the processor supports X87 instructions.
+ bool HasX87 = false;
+
+ /// True if the processor supports CMPXCHG8B.
+ bool HasCmpxchg8b = false;
+
+ /// True if this processor has NOPL instruction
+ /// (generally pentium pro+).
+ bool HasNOPL = false;
+
+ /// True if this processor has conditional move instructions
+ /// (generally pentium pro+).
+ bool HasCMov = false;
+
+ /// True if the processor supports X86-64 instructions.
+ bool HasX86_64 = false;
+
+ /// True if the processor supports POPCNT.
+ bool HasPOPCNT = false;
+
+ /// True if the processor supports SSE4A instructions.
+ bool HasSSE4A = false;
+
+ /// Target has AES instructions
+ bool HasAES = false;
+ bool HasVAES = false;
+
+ /// Target has FXSAVE/FXRESTOR instructions
+ bool HasFXSR = false;
+
+ /// Target has XSAVE instructions
+ bool HasXSAVE = false;
+
+ /// Target has XSAVEOPT instructions
+ bool HasXSAVEOPT = false;
+
+ /// Target has XSAVEC instructions
+ bool HasXSAVEC = false;
+
+ /// Target has XSAVES instructions
+ bool HasXSAVES = false;
+
+ /// Target has carry-less multiplication
+ bool HasPCLMUL = false;
+ bool HasVPCLMULQDQ = false;
+
+ /// Target has Galois Field Arithmetic instructions
+ bool HasGFNI = false;
+
+ /// Target has 3-operand fused multiply-add
+ bool HasFMA = false;
+
+ /// Target has 4-operand fused multiply-add
+ bool HasFMA4 = false;
+
+ /// Target has XOP instructions
+ bool HasXOP = false;
+
+ /// Target has TBM instructions.
+ bool HasTBM = false;
+
+ /// Target has LWP instructions
+ bool HasLWP = false;
+
+ /// True if the processor has the MOVBE instruction.
+ bool HasMOVBE = false;
+
+ /// True if the processor has the RDRAND instruction.
+ bool HasRDRAND = false;
+
+ /// Processor has 16-bit floating point conversion instructions.
+ bool HasF16C = false;
+
+  /// Processor has FS/GS base instructions.
+ bool HasFSGSBase = false;
+
+ /// Processor has LZCNT instruction.
+ bool HasLZCNT = false;
+
+ /// Processor has BMI1 instructions.
+ bool HasBMI = false;
+
+ /// Processor has BMI2 instructions.
+ bool HasBMI2 = false;
+
+ /// Processor has VBMI instructions.
+ bool HasVBMI = false;
+
+ /// Processor has VBMI2 instructions.
+ bool HasVBMI2 = false;
+
+ /// Processor has Integer Fused Multiply Add
+ bool HasIFMA = false;
+
+ /// Processor has RTM instructions.
+ bool HasRTM = false;
+
+ /// Processor has ADX instructions.
+ bool HasADX = false;
+
+ /// Processor has SHA instructions.
+ bool HasSHA = false;
+
+ /// Processor has PRFCHW instructions.
+ bool HasPRFCHW = false;
+
+ /// Processor has RDSEED instructions.
+ bool HasRDSEED = false;
+
+ /// Processor has LAHF/SAHF instructions in 64-bit mode.
+ bool HasLAHFSAHF64 = false;
+
+ /// Processor has MONITORX/MWAITX instructions.
+ bool HasMWAITX = false;
+
+ /// Processor has Cache Line Zero instruction
+ bool HasCLZERO = false;
+
+ /// Processor has Cache Line Demote instruction
+ bool HasCLDEMOTE = false;
+
+ /// Processor has MOVDIRI instruction (direct store integer).
+ bool HasMOVDIRI = false;
+
+ /// Processor has MOVDIR64B instruction (direct store 64 bytes).
+ bool HasMOVDIR64B = false;
+
+ /// Processor has ptwrite instruction.
+ bool HasPTWRITE = false;
+
+ /// Processor has Prefetch with intent to Write instruction
+ bool HasPREFETCHWT1 = false;
+
+ /// True if SHLD instructions are slow.
+ bool IsSHLDSlow = false;
+
+ /// True if the PMULLD instruction is slow compared to PMULLW/PMULHW and
+  /// PMULUDQ.
+ bool IsPMULLDSlow = false;
+
+ /// True if the PMADDWD instruction is slow compared to PMULLD.
+ bool IsPMADDWDSlow = false;
+
+ /// True if unaligned memory accesses of 16-bytes are slow.
+ bool IsUAMem16Slow = false;
+
+ /// True if unaligned memory accesses of 32-bytes are slow.
+ bool IsUAMem32Slow = false;
+
+ /// True if SSE operations can have unaligned memory operands.
+ /// This may require setting a configuration bit in the processor.
+ bool HasSSEUnalignedMem = false;
+
+ /// True if this processor has the CMPXCHG16B instruction;
+ /// this is true for most x86-64 chips, but not the first AMD chips.
+ bool HasCmpxchg16b = false;
+
+ /// True if the LEA instruction should be used for adjusting
+ /// the stack pointer. This is an optimization for Intel Atom processors.
+ bool UseLeaForSP = false;
+
+ /// True if POPCNT instruction has a false dependency on the destination register.
+ bool HasPOPCNTFalseDeps = false;
+
+ /// True if LZCNT/TZCNT instructions have a false dependency on the destination register.
+ bool HasLZCNTFalseDeps = false;
+
+ /// True if its preferable to combine to a single shuffle using a variable
+ /// mask over multiple fixed shuffles.
+ bool HasFastVariableShuffle = false;
+
+ /// True if vzeroupper instructions should be inserted after code that uses
+ /// ymm or zmm registers.
+ bool InsertVZEROUPPER = false;
+
+ /// True if there is no performance penalty for writing NOPs with up to
+ /// 7 bytes.
+ bool HasFast7ByteNOP = false;
+
+ /// True if there is no performance penalty for writing NOPs with up to
+ /// 11 bytes.
+ bool HasFast11ByteNOP = false;
+
+ /// True if there is no performance penalty for writing NOPs with up to
+ /// 15 bytes.
+ bool HasFast15ByteNOP = false;
+
+ /// True if gather is reasonably fast. This is true for Skylake client and
+ /// all AVX-512 CPUs.
+ bool HasFastGather = false;
+
+ /// True if hardware SQRTSS instruction is at least as fast (latency) as
+ /// RSQRTSS followed by a Newton-Raphson iteration.
+ bool HasFastScalarFSQRT = false;
+
+ /// True if hardware SQRTPS/VSQRTPS instructions are at least as fast
+ /// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration.
+ bool HasFastVectorFSQRT = false;
+
+ /// True if 8-bit divisions are significantly faster than
+ /// 32-bit divisions and should be used when possible.
+ bool HasSlowDivide32 = false;
+
+ /// True if 32-bit divides are significantly faster than
+ /// 64-bit divisions and should be used when possible.
+ bool HasSlowDivide64 = false;
+
+ /// True if LZCNT instruction is fast.
+ bool HasFastLZCNT = false;
+
+ /// True if SHLD based rotate is fast.
+ bool HasFastSHLDRotate = false;
+
+ /// True if the processor supports macrofusion.
+ bool HasMacroFusion = false;
+
+ /// True if the processor supports branch fusion.
+ bool HasBranchFusion = false;
+
+ /// True if the processor has enhanced REP MOVSB/STOSB.
+ bool HasERMSB = false;
+
+ /// True if the processor has fast short REP MOV.
+ bool HasFSRM = false;
+
+  /// True if short functions should be padded to prevent
+ /// a stall when returning too early.
+ bool PadShortFunctions = false;
+
+ /// True if two memory operand instructions should use a temporary register
+ /// instead.
+ bool SlowTwoMemOps = false;
+
+ /// True if the LEA instruction inputs have to be ready at address generation
+ /// (AG) time.
+ bool LEAUsesAG = false;
+
+ /// True if the LEA instruction with certain arguments is slow
+ bool SlowLEA = false;
+
+ /// True if the LEA instruction has all three source operands: base, index,
+ /// and offset or if the LEA instruction uses base and index registers where
+  /// the base is EBP, RBP, or R13.
+ bool Slow3OpsLEA = false;
+
+ /// True if INC and DEC instructions are slow when writing to flags
+ bool SlowIncDec = false;
+
+ /// Processor has AVX-512 PreFetch Instructions
+ bool HasPFI = false;
+
+ /// Processor has AVX-512 Exponential and Reciprocal Instructions
+ bool HasERI = false;
+
+ /// Processor has AVX-512 Conflict Detection Instructions
+ bool HasCDI = false;
+
+ /// Processor has AVX-512 population count Instructions
+ bool HasVPOPCNTDQ = false;
+
+ /// Processor has AVX-512 Doubleword and Quadword instructions
+ bool HasDQI = false;
+
+ /// Processor has AVX-512 Byte and Word instructions
+ bool HasBWI = false;
+
+  /// Processor has AVX-512 Vector Length eXtensions
+ bool HasVLX = false;
+
+  /// Processor has PKU extensions
+ bool HasPKU = false;
+
+ /// Processor has AVX-512 Vector Neural Network Instructions
+ bool HasVNNI = false;
+
+ /// Processor has AVX Vector Neural Network Instructions
+ bool HasAVXVNNI = false;
+
+ /// Processor has AVX-512 bfloat16 floating-point extensions
+ bool HasBF16 = false;
+
+ /// Processor supports ENQCMD instructions
+ bool HasENQCMD = false;
+
+ /// Processor has AVX-512 Bit Algorithms instructions
+ bool HasBITALG = false;
+
+ /// Processor has AVX-512 vp2intersect instructions
+ bool HasVP2INTERSECT = false;
+
+ /// Processor supports CET SHSTK - Control-Flow Enforcement Technology
+ /// using Shadow Stack
+ bool HasSHSTK = false;
+
+ /// Processor supports Invalidate Process-Context Identifier
+ bool HasINVPCID = false;
+
+ /// Processor has Software Guard Extensions
+ bool HasSGX = false;
+
+ /// Processor supports Flush Cache Line instruction
+ bool HasCLFLUSHOPT = false;
+
+ /// Processor supports Cache Line Write Back instruction
+ bool HasCLWB = false;
+
+ /// Processor supports Write Back No Invalidate instruction
+ bool HasWBNOINVD = false;
+
+  /// Processor supports RDPID instruction
+ bool HasRDPID = false;
+
+ /// Processor supports WaitPKG instructions
+ bool HasWAITPKG = false;
+
+ /// Processor supports PCONFIG instruction
+ bool HasPCONFIG = false;
+
+  /// Processor supports key locker instructions
+ bool HasKL = false;
+
+  /// Processor supports key locker wide instructions
+ bool HasWIDEKL = false;
+
+ /// Processor supports HRESET instruction
+ bool HasHRESET = false;
+
+ /// Processor supports SERIALIZE instruction
+ bool HasSERIALIZE = false;
+
+ /// Processor supports TSXLDTRK instruction
+ bool HasTSXLDTRK = false;
+
+ /// Processor has AMX support
+ bool HasAMXTILE = false;
+ bool HasAMXBF16 = false;
+ bool HasAMXINT8 = false;
+
+ /// Processor supports User Level Interrupt instructions
+ bool HasUINTR = false;
+
+ /// Processor has a single uop BEXTR implementation.
+ bool HasFastBEXTR = false;
+
+ /// Try harder to combine to horizontal vector ops if they are fast.
+ bool HasFastHorizontalOps = false;
+
+ /// Prefer a left/right scalar logical shifts pair over a shift+and pair.
+ bool HasFastScalarShiftMasks = false;
+
+ /// Prefer a left/right vector logical shifts pair over a shift+and pair.
+ bool HasFastVectorShiftMasks = false;
+
+ /// Use a retpoline thunk rather than indirect calls to block speculative
+ /// execution.
+ bool UseRetpolineIndirectCalls = false;
+
+ /// Use a retpoline thunk or remove any indirect branch to block speculative
+ /// execution.
+ bool UseRetpolineIndirectBranches = false;
+
+ /// Deprecated flag, query `UseRetpolineIndirectCalls` and
+ /// `UseRetpolineIndirectBranches` instead.
+ bool DeprecatedUseRetpoline = false;
+
+ /// When using a retpoline thunk, call an externally provided thunk rather
+ /// than emitting one inside the compiler.
+ bool UseRetpolineExternalThunk = false;
+
+ /// Prevent generation of indirect call/branch instructions from memory,
+ /// and force all indirect call/branch instructions from a register to be
+ /// preceded by an LFENCE. Also decompose RET instructions into a
+ /// POP+LFENCE+JMP sequence.
+ bool UseLVIControlFlowIntegrity = false;
+
+ /// Enable Speculative Execution Side Effect Suppression
+ bool UseSpeculativeExecutionSideEffectSuppression = false;
+
+ /// Insert LFENCE instructions to prevent data speculatively injected into
+ /// loads from being used maliciously.
+ bool UseLVILoadHardening = false;
+
+ /// Use software floating point for code generation.
+ bool UseSoftFloat = false;
+
+ /// Use alias analysis during code generation.
+ bool UseAA = false;
+
+ /// The minimum alignment known to hold of the stack frame on
+ /// entry to the function and which must be maintained by every function.
+ Align stackAlignment = Align(4);
+
+ Align TileConfigAlignment = Align(4);
+
+ /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
+ ///
+ // FIXME: this is a known good value for Yonah. How about others?
+ unsigned MaxInlineSizeThreshold = 128;
+
+ /// Indicates target prefers 128 bit instructions.
+ bool Prefer128Bit = false;
+
+ /// Indicates target prefers 256 bit instructions.
+ bool Prefer256Bit = false;
+
+ /// Indicates target prefers AVX512 mask registers.
+ bool PreferMaskRegisters = false;
+
+ /// Use Goldmont specific floating point div/sqrt costs.
+ bool UseGLMDivSqrtCosts = false;
+
+ /// What processor and OS we're targeting.
+ Triple TargetTriple;
+
+ /// GlobalISel related APIs.
+ std::unique_ptr<CallLowering> CallLoweringInfo;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+
+private:
+ /// Override the stack alignment.
+ MaybeAlign StackAlignOverride;
+
+ /// Preferred vector width from function attribute.
+ unsigned PreferVectorWidthOverride;
+
+ /// Resolved preferred vector width from function attribute and subtarget
+ /// features.
+ unsigned PreferVectorWidth = UINT32_MAX;
+
+ /// Required vector width from function attribute.
+ unsigned RequiredVectorWidth;
+
+ /// True if compiling for 64-bit, false for 16-bit or 32-bit.
+ bool In64BitMode = false;
+
+ /// True if compiling for 32-bit, false for 16-bit or 64-bit.
+ bool In32BitMode = false;
+
+ /// True if compiling for 16-bit, false for 32-bit or 64-bit.
+ bool In16BitMode = false;
+
+ X86SelectionDAGInfo TSInfo;
+ // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which
+ // X86TargetLowering needs.
+ X86InstrInfo InstrInfo;
+ X86TargetLowering TLInfo;
+ X86FrameLowering FrameLowering;
+
+public:
+ /// This constructor initializes the data members to match that
+ /// of the specified triple.
+ ///
+ X86Subtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS,
+ const X86TargetMachine &TM, MaybeAlign StackAlignOverride,
+ unsigned PreferVectorWidthOverride,
+ unsigned RequiredVectorWidth);
+
+ const X86TargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+
+ const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+
+ const X86FrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+
+ const X86SelectionDAGInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+
+ const X86RegisterInfo *getRegisterInfo() const override {
+ return &getInstrInfo()->getRegisterInfo();
+ }
+
+ unsigned getTileConfigSize() const { return 64; }
+ Align getTileConfigAlignment() const { return TileConfigAlignment; }
+
+ /// Returns the minimum alignment known to hold of the
+ /// stack frame on entry to the function and which must be maintained by every
+ /// function for this subtarget.
+ Align getStackAlignment() const { return stackAlignment; }
+
+ /// Returns the maximum memset / memcpy size
+ /// that still makes it profitable to inline the call.
+ unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; }
+
+ /// ParseSubtargetFeatures - Parses features string setting specified
+ /// subtarget options. Definition of function is auto generated by tblgen.
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
+
+ /// Methods used by Global ISel
+ const CallLowering *getCallLowering() const override;
+ InstructionSelector *getInstructionSelector() const override;
+ const LegalizerInfo *getLegalizerInfo() const override;
+ const RegisterBankInfo *getRegBankInfo() const override;
+
+private:
+ /// Initialize the full set of dependencies so we can use an initializer
+ /// list for X86Subtarget.
+ X86Subtarget &initializeSubtargetDependencies(StringRef CPU,
+ StringRef TuneCPU,
+ StringRef FS);
+ void initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
+
+public:
+ /// Is this x86_64? (disregarding specific ABI / programming model)
+ bool is64Bit() const {
+ return In64BitMode;
+ }
+
+ bool is32Bit() const {
+ return In32BitMode;
+ }
+
+ bool is16Bit() const {
+ return In16BitMode;
+ }
+
+ /// Is this x86_64 with the ILP32 programming model (x32 ABI)?
+ bool isTarget64BitILP32() const {
+ return In64BitMode && (TargetTriple.getEnvironment() == Triple::GNUX32 ||
+ TargetTriple.isOSNaCl());
+ }
+
+ /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
+ bool isTarget64BitLP64() const {
+ return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32 &&
+ !TargetTriple.isOSNaCl());
+ }
+
+ PICStyles::Style getPICStyle() const { return PICStyle; }
+ void setPICStyle(PICStyles::Style Style) { PICStyle = Style; }
+
+ bool hasX87() const { return HasX87; }
+ bool hasCmpxchg8b() const { return HasCmpxchg8b; }
+ bool hasNOPL() const { return HasNOPL; }
+ // SSE codegen depends on cmovs, and all SSE1+ processors support them.
+ // All 64-bit processors support cmov.
+ bool hasCMov() const { return HasCMov || X86SSELevel >= SSE1 || is64Bit(); }
+ bool hasSSE1() const { return X86SSELevel >= SSE1; }
+ bool hasSSE2() const { return X86SSELevel >= SSE2; }
+ bool hasSSE3() const { return X86SSELevel >= SSE3; }
+ bool hasSSSE3() const { return X86SSELevel >= SSSE3; }
+ bool hasSSE41() const { return X86SSELevel >= SSE41; }
+ bool hasSSE42() const { return X86SSELevel >= SSE42; }
+ bool hasAVX() const { return X86SSELevel >= AVX; }
+ bool hasAVX2() const { return X86SSELevel >= AVX2; }
+ bool hasAVX512() const { return X86SSELevel >= AVX512F; }
+ bool hasInt256() const { return hasAVX2(); }
+ bool hasSSE4A() const { return HasSSE4A; }
+ bool hasMMX() const { return X863DNowLevel >= MMX; }
+ bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
+ bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
+ bool hasPOPCNT() const { return HasPOPCNT; }
+ bool hasAES() const { return HasAES; }
+ bool hasVAES() const { return HasVAES; }
+ bool hasFXSR() const { return HasFXSR; }
+ bool hasXSAVE() const { return HasXSAVE; }
+ bool hasXSAVEOPT() const { return HasXSAVEOPT; }
+ bool hasXSAVEC() const { return HasXSAVEC; }
+ bool hasXSAVES() const { return HasXSAVES; }
+ bool hasPCLMUL() const { return HasPCLMUL; }
+ bool hasVPCLMULQDQ() const { return HasVPCLMULQDQ; }
+ bool hasGFNI() const { return HasGFNI; }
+  // Prefer FMA4 to FMA - it's better for commutation/memory folding and
+ // has equal or better performance on all supported targets.
+ bool hasFMA() const { return HasFMA; }
+ bool hasFMA4() const { return HasFMA4; }
+ bool hasAnyFMA() const { return hasFMA() || hasFMA4(); }
+ bool hasXOP() const { return HasXOP; }
+ bool hasTBM() const { return HasTBM; }
+ bool hasLWP() const { return HasLWP; }
+ bool hasMOVBE() const { return HasMOVBE; }
+ bool hasRDRAND() const { return HasRDRAND; }
+ bool hasF16C() const { return HasF16C; }
+ bool hasFSGSBase() const { return HasFSGSBase; }
+ bool hasLZCNT() const { return HasLZCNT; }
+ bool hasBMI() const { return HasBMI; }
+ bool hasBMI2() const { return HasBMI2; }
+ bool hasVBMI() const { return HasVBMI; }
+ bool hasVBMI2() const { return HasVBMI2; }
+ bool hasIFMA() const { return HasIFMA; }
+ bool hasRTM() const { return HasRTM; }
+ bool hasADX() const { return HasADX; }
+ bool hasSHA() const { return HasSHA; }
+ bool hasPRFCHW() const { return HasPRFCHW; }
+ bool hasPREFETCHWT1() const { return HasPREFETCHWT1; }
+ bool hasPrefetchW() const {
+ // The PREFETCHW instruction was added with 3DNow but later CPUs gave it
+ // its own CPUID bit as part of deprecating 3DNow. Intel eventually added
+ // it and KNL has another that prefetches to L2 cache. We assume the
+ // L1 version exists if the L2 version does.
+ return has3DNow() || hasPRFCHW() || hasPREFETCHWT1();
+ }
+ bool hasSSEPrefetch() const {
+    // We implicitly enable these when we have a write prefetch supporting cache
+ // level OR if we have prfchw, but don't already have a read prefetch from
+ // 3dnow.
+ return hasSSE1() || (hasPRFCHW() && !has3DNow()) || hasPREFETCHWT1();
+ }
+ bool hasRDSEED() const { return HasRDSEED; }
+ bool hasLAHFSAHF() const { return HasLAHFSAHF64 || !is64Bit(); }
+ bool hasMWAITX() const { return HasMWAITX; }
+ bool hasCLZERO() const { return HasCLZERO; }
+ bool hasCLDEMOTE() const { return HasCLDEMOTE; }
+ bool hasMOVDIRI() const { return HasMOVDIRI; }
+ bool hasMOVDIR64B() const { return HasMOVDIR64B; }
+ bool hasPTWRITE() const { return HasPTWRITE; }
+ bool isSHLDSlow() const { return IsSHLDSlow; }
+ bool isPMULLDSlow() const { return IsPMULLDSlow; }
+ bool isPMADDWDSlow() const { return IsPMADDWDSlow; }
+ bool isUnalignedMem16Slow() const { return IsUAMem16Slow; }
+ bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
+ bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
+ bool hasCmpxchg16b() const { return HasCmpxchg16b && is64Bit(); }
+ bool useLeaForSP() const { return UseLeaForSP; }
+ bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; }
+ bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; }
+ bool hasFastVariableShuffle() const {
+ return HasFastVariableShuffle;
+ }
+ bool insertVZEROUPPER() const { return InsertVZEROUPPER; }
+ bool hasFastGather() const { return HasFastGather; }
+ bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
+ bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
+ bool hasFastLZCNT() const { return HasFastLZCNT; }
+ bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
+ bool hasFastBEXTR() const { return HasFastBEXTR; }
+ bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }
+ bool hasFastScalarShiftMasks() const { return HasFastScalarShiftMasks; }
+ bool hasFastVectorShiftMasks() const { return HasFastVectorShiftMasks; }
+ bool hasMacroFusion() const { return HasMacroFusion; }
+ bool hasBranchFusion() const { return HasBranchFusion; }
+ bool hasERMSB() const { return HasERMSB; }
+ bool hasFSRM() const { return HasFSRM; }
+ bool hasSlowDivide32() const { return HasSlowDivide32; }
+ bool hasSlowDivide64() const { return HasSlowDivide64; }
+ bool padShortFunctions() const { return PadShortFunctions; }
+ bool slowTwoMemOps() const { return SlowTwoMemOps; }
+ bool LEAusesAG() const { return LEAUsesAG; }
+ bool slowLEA() const { return SlowLEA; }
+ bool slow3OpsLEA() const { return Slow3OpsLEA; }
+ bool slowIncDec() const { return SlowIncDec; }
+ bool hasCDI() const { return HasCDI; }
+ bool hasVPOPCNTDQ() const { return HasVPOPCNTDQ; }
+ bool hasPFI() const { return HasPFI; }
+ bool hasERI() const { return HasERI; }
+ bool hasDQI() const { return HasDQI; }
+ bool hasBWI() const { return HasBWI; }
+ bool hasVLX() const { return HasVLX; }
+ bool hasPKU() const { return HasPKU; }
+ bool hasVNNI() const { return HasVNNI; }
+ bool hasBF16() const { return HasBF16; }
+ bool hasVP2INTERSECT() const { return HasVP2INTERSECT; }
+ bool hasBITALG() const { return HasBITALG; }
+ bool hasSHSTK() const { return HasSHSTK; }
+ bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; }
+ bool hasCLWB() const { return HasCLWB; }
+ bool hasWBNOINVD() const { return HasWBNOINVD; }
+ bool hasRDPID() const { return HasRDPID; }
+ bool hasWAITPKG() const { return HasWAITPKG; }
+ bool hasPCONFIG() const { return HasPCONFIG; }
+ bool hasSGX() const { return HasSGX; }
+ bool hasINVPCID() const { return HasINVPCID; }
+ bool hasENQCMD() const { return HasENQCMD; }
+ bool hasKL() const { return HasKL; }
+ bool hasWIDEKL() const { return HasWIDEKL; }
+ bool hasHRESET() const { return HasHRESET; }
+ bool hasSERIALIZE() const { return HasSERIALIZE; }
+ bool hasTSXLDTRK() const { return HasTSXLDTRK; }
+ bool hasUINTR() const { return HasUINTR; }
+ bool useRetpolineIndirectCalls() const { return UseRetpolineIndirectCalls; }
+ bool useRetpolineIndirectBranches() const {
+ return UseRetpolineIndirectBranches;
+ }
+ bool hasAVXVNNI() const { return HasAVXVNNI; }
+ bool hasAMXTILE() const { return HasAMXTILE; }
+ bool hasAMXBF16() const { return HasAMXBF16; }
+ bool hasAMXINT8() const { return HasAMXINT8; }
+ bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; }
+
+  // These are generic getters that OR together all of the thunk types
+  // supported by the subtarget. Therefore useIndirectThunk*() will return true
+  // if any of the corresponding thunk features is enabled.
+ bool useIndirectThunkCalls() const {
+ return useRetpolineIndirectCalls() || useLVIControlFlowIntegrity();
+ }
+ bool useIndirectThunkBranches() const {
+ return useRetpolineIndirectBranches() || useLVIControlFlowIntegrity();
+ }
+
+ bool preferMaskRegisters() const { return PreferMaskRegisters; }
+ bool useGLMDivSqrtCosts() const { return UseGLMDivSqrtCosts; }
+ bool useLVIControlFlowIntegrity() const { return UseLVIControlFlowIntegrity; }
+ bool useLVILoadHardening() const { return UseLVILoadHardening; }
+ bool useSpeculativeExecutionSideEffectSuppression() const {
+ return UseSpeculativeExecutionSideEffectSuppression;
+ }
+
+ unsigned getPreferVectorWidth() const { return PreferVectorWidth; }
+ unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; }
+
+ // Helper functions to determine when we should allow widening to 512-bit
+ // during codegen.
+  // TODO: Currently we're always allowing widening on CPUs without VLX,
+  // because in many cases we don't have a better option.
+ bool canExtendTo512DQ() const {
+ return hasAVX512() && (!hasVLX() || getPreferVectorWidth() >= 512);
+ }
+ bool canExtendTo512BW() const {
+ return hasBWI() && canExtendTo512DQ();
+ }
+
+ // If there are no 512-bit vectors and we prefer not to use 512-bit registers,
+ // disable them in the legalizer.
+ bool useAVX512Regs() const {
+ return hasAVX512() && (canExtendTo512DQ() || RequiredVectorWidth > 256);
+ }
+
+ bool useBWIRegs() const {
+ return hasBWI() && useAVX512Regs();
+ }
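+
+  // For example, on an AVX-512 CPU with VLX whose preferred vector width has
+  // been capped at 256 (e.g. via the "prefer-vector-width" function
+  // attribute), canExtendTo512DQ() is false, so useAVX512Regs() returns true
+  // only when a function's "min-legal-vector-width" pushes
+  // RequiredVectorWidth above 256.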
+
+ bool isXRaySupported() const override { return is64Bit(); }
+
+ /// TODO: to be removed later and replaced with suitable properties
+ bool isAtom() const { return X86ProcFamily == IntelAtom; }
+ bool isSLM() const { return X86ProcFamily == IntelSLM; }
+ bool useSoftFloat() const { return UseSoftFloat; }
+ bool useAA() const override { return UseAA; }
+
+ /// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
+ /// no-sse2). There isn't any reason to disable it if the target processor
+ /// supports it.
+ bool hasMFence() const { return hasSSE2() || is64Bit(); }
+
+ const Triple &getTargetTriple() const { return TargetTriple; }
+
+ bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
+ bool isTargetFreeBSD() const { return TargetTriple.isOSFreeBSD(); }
+ bool isTargetDragonFly() const { return TargetTriple.isOSDragonFly(); }
+ bool isTargetSolaris() const { return TargetTriple.isOSSolaris(); }
+ bool isTargetPS4() const { return TargetTriple.isPS4CPU(); }
+
+ bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+ bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
+ bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+
+ bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+ bool isTargetKFreeBSD() const { return TargetTriple.isOSKFreeBSD(); }
+ bool isTargetGlibc() const { return TargetTriple.isOSGlibc(); }
+ bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
+ bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
+ bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
+ bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); }
+ bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); }
+ bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); }
+
+ bool isTargetWindowsMSVC() const {
+ return TargetTriple.isWindowsMSVCEnvironment();
+ }
+
+ bool isTargetWindowsCoreCLR() const {
+ return TargetTriple.isWindowsCoreCLREnvironment();
+ }
+
+ bool isTargetWindowsCygwin() const {
+ return TargetTriple.isWindowsCygwinEnvironment();
+ }
+
+ bool isTargetWindowsGNU() const {
+ return TargetTriple.isWindowsGNUEnvironment();
+ }
+
+ bool isTargetWindowsItanium() const {
+ return TargetTriple.isWindowsItaniumEnvironment();
+ }
+
+ bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); }
+
+ bool isOSWindows() const { return TargetTriple.isOSWindows(); }
+
+ bool isTargetWin64() const { return In64BitMode && isOSWindows(); }
+
+ bool isTargetWin32() const { return !In64BitMode && isOSWindows(); }
+
+ bool isPICStyleGOT() const { return PICStyle == PICStyles::Style::GOT; }
+ bool isPICStyleRIPRel() const { return PICStyle == PICStyles::Style::RIPRel; }
+
+ bool isPICStyleStubPIC() const {
+ return PICStyle == PICStyles::Style::StubPIC;
+ }
+
+ bool isPositionIndependent() const;
+
+ bool isCallingConvWin64(CallingConv::ID CC) const {
+ switch (CC) {
+ // On Win64, all these conventions just use the default convention.
+ case CallingConv::C:
+ case CallingConv::Fast:
+ case CallingConv::Tail:
+ case CallingConv::Swift:
+ case CallingConv::X86_FastCall:
+ case CallingConv::X86_StdCall:
+ case CallingConv::X86_ThisCall:
+ case CallingConv::X86_VectorCall:
+ case CallingConv::Intel_OCL_BI:
+ return isTargetWin64();
+ // This convention allows using the Win64 convention on other targets.
+ case CallingConv::Win64:
+ return true;
+ // This convention allows using the SysV convention on Windows targets.
+ case CallingConv::X86_64_SysV:
+ return false;
+ // Otherwise, who knows what this is.
+ default:
+ return false;
+ }
+ }
+
+ /// Classify a global variable reference for the current subtarget according
+ /// to how we should reference it in a non-pcrel context.
+ unsigned char classifyLocalReference(const GlobalValue *GV) const;
+
+ unsigned char classifyGlobalReference(const GlobalValue *GV,
+ const Module &M) const;
+ unsigned char classifyGlobalReference(const GlobalValue *GV) const;
+
+ /// Classify a global function reference for the current subtarget.
+ unsigned char classifyGlobalFunctionReference(const GlobalValue *GV,
+ const Module &M) const;
+ unsigned char classifyGlobalFunctionReference(const GlobalValue *GV) const;
+
+ /// Classify a blockaddress reference for the current subtarget according to
+ /// how we should reference it in a non-pcrel context.
+ unsigned char classifyBlockAddressReference() const;
+
+ /// Return true if the subtarget allows calls to immediate address.
+ bool isLegalToCallImmediateAddr() const;
+
+ /// If we are using indirect thunks, we need to expand indirectbr to avoid it
+ /// lowering to an actual indirect jump.
+ bool enableIndirectBrExpand() const override {
+ return useIndirectThunkBranches();
+ }
+
+ /// Enable the MachineScheduler pass for all X86 subtargets.
+ bool enableMachineScheduler() const override { return true; }
+
+ bool enableEarlyIfConversion() const override;
+
+ void getPostRAMutations(std::vector<std::unique_ptr<ScheduleDAGMutation>>
+ &Mutations) const override;
+
+ AntiDepBreakMode getAntiDepBreakMode() const override {
+ return TargetSubtargetInfo::ANTIDEP_CRITICAL;
+ }
+
+ bool enableAdvancedRASplitCost() const override { return true; }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_X86SUBTARGET_H
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp
new file mode 100644
index 000000000000..c8f76c210a3f
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -0,0 +1,584 @@
+//===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetMachine.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "TargetInfo/X86TargetInfo.h"
+#include "X86.h"
+#include "X86CallLowering.h"
+#include "X86LegalizerInfo.h"
+#include "X86MacroFusion.h"
+#include "X86Subtarget.h"
+#include "X86TargetObjectFile.h"
+#include "X86TargetTransformInfo.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/ExecutionDomainFix.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/CFGuard.h"
+#include <memory>
+#include <string>
+
+using namespace llvm;
+
+static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
+ cl::desc("Enable the machine combiner pass"),
+ cl::init(true), cl::Hidden);
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
+ // Register the target.
+ RegisterTargetMachine<X86TargetMachine> X(getTheX86_32Target());
+ RegisterTargetMachine<X86TargetMachine> Y(getTheX86_64Target());
+
+ PassRegistry &PR = *PassRegistry::getPassRegistry();
+ initializeX86LowerAMXTypeLegacyPassPass(PR);
+ initializeGlobalISel(PR);
+ initializeWinEHStatePassPass(PR);
+ initializeFixupBWInstPassPass(PR);
+ initializeEvexToVexInstPassPass(PR);
+ initializeFixupLEAPassPass(PR);
+ initializeFPSPass(PR);
+ initializeX86FixupSetCCPassPass(PR);
+ initializeX86CallFrameOptimizationPass(PR);
+ initializeX86CmovConverterPassPass(PR);
+ initializeX86TileConfigPass(PR);
+ initializeX86ExpandPseudoPass(PR);
+ initializeX86ExecutionDomainFixPass(PR);
+ initializeX86DomainReassignmentPass(PR);
+ initializeX86AvoidSFBPassPass(PR);
+ initializeX86AvoidTrailingCallPassPass(PR);
+ initializeX86SpeculativeLoadHardeningPassPass(PR);
+ initializeX86SpeculativeExecutionSideEffectSuppressionPass(PR);
+ initializeX86FlagsCopyLoweringPassPass(PR);
+ initializeX86LoadValueInjectionLoadHardeningPassPass(PR);
+ initializeX86LoadValueInjectionRetHardeningPassPass(PR);
+ initializeX86OptimizeLEAPassPass(PR);
+ initializeX86PartialReductionPass(PR);
+ initializePseudoProbeInserterPass(PR);
+}
+
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+ if (TT.isOSBinFormatMachO()) {
+ if (TT.getArch() == Triple::x86_64)
+ return std::make_unique<X86_64MachoTargetObjectFile>();
+ return std::make_unique<TargetLoweringObjectFileMachO>();
+ }
+
+ if (TT.isOSBinFormatCOFF())
+ return std::make_unique<TargetLoweringObjectFileCOFF>();
+ return std::make_unique<X86ELFTargetObjectFile>();
+}
+
+static std::string computeDataLayout(const Triple &TT) {
+ // X86 is little endian
+ std::string Ret = "e";
+
+ Ret += DataLayout::getManglingComponent(TT);
+ // X86 and x32 have 32 bit pointers.
+ if ((TT.isArch64Bit() &&
+ (TT.getEnvironment() == Triple::GNUX32 || TT.isOSNaCl())) ||
+ !TT.isArch64Bit())
+ Ret += "-p:32:32";
+
+ // Address spaces for 32 bit signed, 32 bit unsigned, and 64 bit pointers.
+ Ret += "-p270:32:32-p271:32:32-p272:64:64";
+
+ // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32.
+ if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl())
+ Ret += "-i64:64";
+ else if (TT.isOSIAMCU())
+ Ret += "-i64:32-f64:32";
+ else
+ Ret += "-f64:32:64";
+
+ // Some ABIs align long double to 128 bits, others to 32.
+ if (TT.isOSNaCl() || TT.isOSIAMCU())
+ ; // No f80
+ else if (TT.isArch64Bit() || TT.isOSDarwin())
+ Ret += "-f80:128";
+ else
+ Ret += "-f80:32";
+
+ if (TT.isOSIAMCU())
+ Ret += "-f128:32";
+
+ // The registers can hold 8, 16, 32 or, in x86-64, 64 bits.
+ if (TT.isArch64Bit())
+ Ret += "-n8:16:32:64";
+ else
+ Ret += "-n8:16:32";
+
+ // The stack is aligned to 32 bits on some ABIs and 128 bits on others.
+ if ((!TT.isArch64Bit() && TT.isOSWindows()) || TT.isOSIAMCU())
+ Ret += "-a:0:32-S32";
+ else
+ Ret += "-S128";
+
+ return Ret;
+}
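+
+// For example, for an x86_64-unknown-linux-gnu triple the string computed
+// above is
+// "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128".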
+
+static Reloc::Model getEffectiveRelocModel(const Triple &TT,
+ bool JIT,
+ Optional<Reloc::Model> RM) {
+ bool is64Bit = TT.getArch() == Triple::x86_64;
+ if (!RM.hasValue()) {
+ // JIT codegen should use static relocations by default, since it's
+ // typically executed in process and not relocatable.
+ if (JIT)
+ return Reloc::Static;
+
+    // Darwin defaults to PIC in 64 bit mode and dynamic-no-pic in 32 bit mode.
+    // Win64 requires rip-relative addressing, so we force it to PIC. Otherwise
+    // we use the static relocation model by default.
+ if (TT.isOSDarwin()) {
+ if (is64Bit)
+ return Reloc::PIC_;
+ return Reloc::DynamicNoPIC;
+ }
+ if (TT.isOSWindows() && is64Bit)
+ return Reloc::PIC_;
+ return Reloc::Static;
+ }
+
+ // ELF and X86-64 don't have a distinct DynamicNoPIC model. DynamicNoPIC
+ // is defined as a model for code which may be used in static or dynamic
+  // executables but not necessarily a shared library. On X86-32 we just
+  // compile in -static mode; on X86-64 we use PIC.
+ if (*RM == Reloc::DynamicNoPIC) {
+ if (is64Bit)
+ return Reloc::PIC_;
+ if (!TT.isOSDarwin())
+ return Reloc::Static;
+ }
+
+ // If we are on Darwin, disallow static relocation model in X86-64 mode, since
+ // the Mach-O file format doesn't support it.
+ if (*RM == Reloc::Static && TT.isOSDarwin() && is64Bit)
+ return Reloc::PIC_;
+
+ return *RM;
+}
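+
+// For example, with no explicit relocation model and JIT disabled, the logic
+// above yields PIC for x86_64-apple-darwin and 64-bit Windows targets,
+// DynamicNoPIC for i386-apple-darwin, and Static for 32- and 64-bit Linux ELF
+// targets.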
+
+static CodeModel::Model getEffectiveX86CodeModel(Optional<CodeModel::Model> CM,
+ bool JIT, bool Is64Bit) {
+ if (CM) {
+ if (*CM == CodeModel::Tiny)
+ report_fatal_error("Target does not support the tiny CodeModel", false);
+ return *CM;
+ }
+ if (JIT)
+ return Is64Bit ? CodeModel::Large : CodeModel::Small;
+ return CodeModel::Small;
+}
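+
+// For example, a JIT session targeting x86-64 defaults to the large code
+// model, since JIT'd code and the code it calls may end up far apart in the
+// address space, while ahead-of-time compilation defaults to the small code
+// model.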
+
+/// Create an X86 target.
+///
+X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ Optional<CodeModel::Model> CM,
+ CodeGenOpt::Level OL, bool JIT)
+ : LLVMTargetMachine(
+ T, computeDataLayout(TT), TT, CPU, FS, Options,
+ getEffectiveRelocModel(TT, JIT, RM),
+ getEffectiveX86CodeModel(CM, JIT, TT.getArch() == Triple::x86_64),
+ OL),
+ TLOF(createTLOF(getTargetTriple())), IsJIT(JIT) {
+ // On PS4, the "return address" of a 'noreturn' call must still be within
+ // the calling function, and TrapUnreachable is an easy way to get that.
+ if (TT.isPS4() || TT.isOSBinFormatMachO()) {
+ this->Options.TrapUnreachable = true;
+ this->Options.NoTrapAfterNoreturn = TT.isOSBinFormatMachO();
+ }
+
+ setMachineOutliner(true);
+
+ // x86 supports the debug entry values.
+ setSupportsDebugEntryValues(true);
+
+ initAsmInfo();
+}
+
+X86TargetMachine::~X86TargetMachine() = default;
+
+const X86Subtarget *
+X86TargetMachine::getSubtargetImpl(const Function &F) const {
+ Attribute CPUAttr = F.getFnAttribute("target-cpu");
+ Attribute TuneAttr = F.getFnAttribute("tune-cpu");
+ Attribute FSAttr = F.getFnAttribute("target-features");
+
+ StringRef CPU =
+ CPUAttr.isValid() ? CPUAttr.getValueAsString() : (StringRef)TargetCPU;
+ StringRef TuneCPU =
+ TuneAttr.isValid() ? TuneAttr.getValueAsString() : (StringRef)CPU;
+ StringRef FS =
+ FSAttr.isValid() ? FSAttr.getValueAsString() : (StringRef)TargetFS;
+
+ SmallString<512> Key;
+  // The additions here are ordered so that the definitely-short strings are
+  // added first, so we won't exceed the SmallString's inline size. We append
+  // the much longer FS string at the end so that we heap allocate at most
+  // once.
+
+ // Extract prefer-vector-width attribute.
+ unsigned PreferVectorWidthOverride = 0;
+ Attribute PreferVecWidthAttr = F.getFnAttribute("prefer-vector-width");
+ if (PreferVecWidthAttr.isValid()) {
+ StringRef Val = PreferVecWidthAttr.getValueAsString();
+ unsigned Width;
+ if (!Val.getAsInteger(0, Width)) {
+ Key += "prefer-vector-width=";
+ Key += Val;
+ PreferVectorWidthOverride = Width;
+ }
+ }
+
+ // Extract min-legal-vector-width attribute.
+ unsigned RequiredVectorWidth = UINT32_MAX;
+ Attribute MinLegalVecWidthAttr = F.getFnAttribute("min-legal-vector-width");
+ if (MinLegalVecWidthAttr.isValid()) {
+ StringRef Val = MinLegalVecWidthAttr.getValueAsString();
+ unsigned Width;
+ if (!Val.getAsInteger(0, Width)) {
+ Key += "min-legal-vector-width=";
+ Key += Val;
+ RequiredVectorWidth = Width;
+ }
+ }
+
+ // Add CPU to the Key.
+ Key += CPU;
+
+ // Add tune CPU to the Key.
+ Key += "tune=";
+ Key += TuneCPU;
+
+ // Keep track of the start of the feature portion of the string.
+ unsigned FSStart = Key.size();
+
+  // FIXME: This is related to the code below that resets the target options:
+  // we need to know whether the soft-float flag is set on the function before
+  // we can generate a subtarget. We also need to use it as part of the
+  // subtarget key, since it can be the only difference between two functions.
+ bool SoftFloat =
+ F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+ // If the soft float attribute is set on the function turn on the soft float
+ // subtarget feature.
+ if (SoftFloat)
+ Key += FS.empty() ? "+soft-float" : "+soft-float,";
+
+ Key += FS;
+
+ // We may have added +soft-float to the features so move the StringRef to
+ // point to the full string in the Key.
+ FS = Key.substr(FSStart);
+
+ auto &I = SubtargetMap[Key];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = std::make_unique<X86Subtarget>(
+ TargetTriple, CPU, TuneCPU, FS, *this,
+ MaybeAlign(Options.StackAlignmentOverride), PreferVectorWidthOverride,
+ RequiredVectorWidth);
+ }
+ return I.get();
+}
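+
+// As an illustration: for a function carrying "prefer-vector-width"="256",
+// "min-legal-vector-width"="512", "target-cpu"="skylake-avx512" and
+// "target-features"="+avx512f" (and no soft-float), the cache key built above
+// is the single string
+// "prefer-vector-width=256min-legal-vector-width=512skylake-avx512tune=skylake-avx512+avx512f".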
+
+bool X86TargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
+ unsigned DestAS) const {
+ assert(SrcAS != DestAS && "Expected different address spaces!");
+ if (getPointerSize(SrcAS) != getPointerSize(DestAS))
+ return false;
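+  // Address spaces 0-255 form the ordinary flat address space; everything at
+  // or above 256 is target-specific (e.g. 256/257/258 map to the gs/fs/ss
+  // segments), so only casts where both address spaces are below 256 are
+  // treated as no-ops.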
+ return SrcAS < 256 && DestAS < 256;
+}
+
+//===----------------------------------------------------------------------===//
+// X86 TTI query.
+//===----------------------------------------------------------------------===//
+
+TargetTransformInfo
+X86TargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(X86TTIImpl(this, F));
+}
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// X86 Code Generator Pass Configuration Options.
+class X86PassConfig : public TargetPassConfig {
+public:
+ X86PassConfig(X86TargetMachine &TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {}
+
+ X86TargetMachine &getX86TargetMachine() const {
+ return getTM<X86TargetMachine>();
+ }
+
+ ScheduleDAGInstrs *
+ createMachineScheduler(MachineSchedContext *C) const override {
+ ScheduleDAGMILive *DAG = createGenericSchedLive(C);
+ DAG->addMutation(createX86MacroFusionDAGMutation());
+ return DAG;
+ }
+
+ ScheduleDAGInstrs *
+ createPostMachineScheduler(MachineSchedContext *C) const override {
+ ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
+ DAG->addMutation(createX86MacroFusionDAGMutation());
+ return DAG;
+ }
+
+ void addIRPasses() override;
+ bool addInstSelector() override;
+ bool addIRTranslator() override;
+ bool addLegalizeMachineIR() override;
+ bool addRegBankSelect() override;
+ bool addGlobalInstructionSelect() override;
+ bool addILPOpts() override;
+ bool addPreISel() override;
+ void addMachineSSAOptimization() override;
+ void addPreRegAlloc() override;
+ void addPostRegAlloc() override;
+ void addPreEmitPass() override;
+ void addPreEmitPass2() override;
+ void addPreSched2() override;
+ bool addPreRewrite() override;
+
+ std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
+};
+
+class X86ExecutionDomainFix : public ExecutionDomainFix {
+public:
+ static char ID;
+ X86ExecutionDomainFix() : ExecutionDomainFix(ID, X86::VR128XRegClass) {}
+ StringRef getPassName() const override {
+ return "X86 Execution Dependency Fix";
+ }
+};
+char X86ExecutionDomainFix::ID;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(X86ExecutionDomainFix, "x86-execution-domain-fix",
+ "X86 Execution Domain Fix", false, false)
+INITIALIZE_PASS_DEPENDENCY(ReachingDefAnalysis)
+INITIALIZE_PASS_END(X86ExecutionDomainFix, "x86-execution-domain-fix",
+ "X86 Execution Domain Fix", false, false)
+
+TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new X86PassConfig(*this, PM);
+}
+
+void X86PassConfig::addIRPasses() {
+ addPass(createAtomicExpandPass());
+ addPass(createX86LowerAMXTypePass());
+
+ TargetPassConfig::addIRPasses();
+
+ if (TM->getOptLevel() != CodeGenOpt::None) {
+ addPass(createInterleavedAccessPass());
+ addPass(createX86PartialReductionPass());
+ }
+
+ // Add passes that handle indirect branch removal and insertion of a retpoline
+ // thunk. These will be a no-op unless a function subtarget has the retpoline
+ // feature enabled.
+ addPass(createIndirectBrExpandPass());
+
+ // Add Control Flow Guard checks.
+ const Triple &TT = TM->getTargetTriple();
+ if (TT.isOSWindows()) {
+ if (TT.getArch() == Triple::x86_64) {
+ addPass(createCFGuardDispatchPass());
+ } else {
+ addPass(createCFGuardCheckPass());
+ }
+ }
+}
+
+bool X86PassConfig::addInstSelector() {
+ // Install an instruction selector.
+ addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel()));
+
+ // For ELF, cleanup any local-dynamic TLS accesses.
+ if (TM->getTargetTriple().isOSBinFormatELF() &&
+ getOptLevel() != CodeGenOpt::None)
+ addPass(createCleanupLocalDynamicTLSPass());
+
+ addPass(createX86GlobalBaseRegPass());
+ return false;
+}
+
+bool X86PassConfig::addIRTranslator() {
+ addPass(new IRTranslator(getOptLevel()));
+ return false;
+}
+
+bool X86PassConfig::addLegalizeMachineIR() {
+ addPass(new Legalizer());
+ return false;
+}
+
+bool X86PassConfig::addRegBankSelect() {
+ addPass(new RegBankSelect());
+ return false;
+}
+
+bool X86PassConfig::addGlobalInstructionSelect() {
+ addPass(new InstructionSelect());
+ return false;
+}
+
+bool X86PassConfig::addILPOpts() {
+ addPass(&EarlyIfConverterID);
+ if (EnableMachineCombinerPass)
+ addPass(&MachineCombinerID);
+ addPass(createX86CmovConverterPass());
+ return true;
+}
+
+bool X86PassConfig::addPreISel() {
+ // Only add this pass for 32-bit x86 Windows.
+ const Triple &TT = TM->getTargetTriple();
+ if (TT.isOSWindows() && TT.getArch() == Triple::x86)
+ addPass(createX86WinEHStatePass());
+ return true;
+}
+
+void X86PassConfig::addPreRegAlloc() {
+ if (getOptLevel() != CodeGenOpt::None) {
+ addPass(&LiveRangeShrinkID);
+ addPass(createX86FixupSetCC());
+ addPass(createX86OptimizeLEAs());
+ addPass(createX86CallFrameOptimization());
+ addPass(createX86AvoidStoreForwardingBlocks());
+ }
+
+ addPass(createX86SpeculativeLoadHardeningPass());
+ addPass(createX86FlagsCopyLoweringPass());
+ addPass(createX86WinAllocaExpander());
+
+ if (getOptLevel() != CodeGenOpt::None) {
+ addPass(createX86PreTileConfigPass());
+ }
+}
+
+void X86PassConfig::addMachineSSAOptimization() {
+ addPass(createX86DomainReassignmentPass());
+ TargetPassConfig::addMachineSSAOptimization();
+}
+
+void X86PassConfig::addPostRegAlloc() {
+ addPass(createX86FloatingPointStackifierPass());
+  // When compiling at -O0, the Load Value Injection Hardening pass falls back
+  // to the Speculative Execution Side Effect Suppression pass for mitigation.
+  // This avoids the slowdowns caused by the analyses that the LVI hardening
+  // pass requires.
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createX86LoadValueInjectionLoadHardeningPass());
+}
+
+void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); }
+
+void X86PassConfig::addPreEmitPass() {
+ if (getOptLevel() != CodeGenOpt::None) {
+ addPass(new X86ExecutionDomainFix());
+ addPass(createBreakFalseDeps());
+ }
+
+ addPass(createX86IndirectBranchTrackingPass());
+
+ addPass(createX86IssueVZeroUpperPass());
+
+ if (getOptLevel() != CodeGenOpt::None) {
+ addPass(createX86FixupBWInsts());
+ addPass(createX86PadShortFunctions());
+ addPass(createX86FixupLEAs());
+ }
+ addPass(createX86EvexToVexInsts());
+ addPass(createX86DiscriminateMemOpsPass());
+ addPass(createX86InsertPrefetchPass());
+ addPass(createX86InsertX87waitPass());
+}
+
+void X86PassConfig::addPreEmitPass2() {
+ const Triple &TT = TM->getTargetTriple();
+ const MCAsmInfo *MAI = TM->getMCAsmInfo();
+
+  // The X86 Speculative Execution Pass must run after all control-flow-graph
+  // modifying passes, so it is scheduled to run right before the X86 Retpoline
+  // Thunks pass. It must run after CFG modifications because the model of
+  // LFENCE in LLVM has to be updated
+  // (FIXME: https://bugs.llvm.org/show_bug.cgi?id=45167). The current
+  // placement of this pass was hand checked to ensure that subsequent passes
+  // don't move code around the LFENCEs in a way that would hurt the
+  // correctness of this pass, based on hand inspection of the codegen output.
+ addPass(createX86SpeculativeExecutionSideEffectSuppression());
+ addPass(createX86IndirectThunksPass());
+
+ // Insert extra int3 instructions after trailing call instructions to avoid
+ // issues in the unwinder.
+ if (TT.isOSWindows() && TT.getArch() == Triple::x86_64)
+ addPass(createX86AvoidTrailingCallPass());
+
+ // Verify basic block incoming and outgoing cfa offset and register values and
+ // correct CFA calculation rule where needed by inserting appropriate CFI
+ // instructions.
+ if (!TT.isOSDarwin() &&
+ (!TT.isOSWindows() ||
+ MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI))
+ addPass(createCFIInstrInserter());
+ // Identify valid longjmp targets for Windows Control Flow Guard.
+ if (TT.isOSWindows())
+ addPass(createCFGuardLongjmpPass());
+ addPass(createX86LoadValueInjectionRetHardeningPass());
+}
+
+bool X86PassConfig::addPreRewrite() {
+ addPass(createX86TileConfigPass());
+ return true;
+}
+
+std::unique_ptr<CSEConfigBase> X86PassConfig::getCSEConfig() const {
+ return getStandardCSEConfigForOpt(TM->getOptLevel());
+}
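+
+// A minimal usage sketch (illustrative only; kept out of the build with
+// "#if 0"): clients normally reach X86TargetMachine through the target
+// registry populated by LLVMInitializeX86Target() above rather than by
+// constructing it directly. The triple and CPU below are example values, and
+// the LLVMInitializeX86* initializers are declared in
+// llvm/Support/TargetSelect.h.
+#if 0
+static std::unique_ptr<TargetMachine> buildExampleX86TargetMachine() {
+  LLVMInitializeX86TargetInfo();
+  LLVMInitializeX86Target();
+  LLVMInitializeX86TargetMC();
+
+  std::string Error;
+  const Target *TheTarget =
+      TargetRegistry::lookupTarget("x86_64-unknown-linux-gnu", Error);
+  if (!TheTarget)
+    return nullptr;
+
+  TargetOptions Options;
+  return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
+      "x86_64-unknown-linux-gnu", /*CPU=*/"skylake", /*Features=*/"", Options,
+      Reloc::PIC_, CodeModel::Small, CodeGenOpt::Default));
+}
+#endif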
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.h b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.h
new file mode 100644
index 000000000000..69d7e48b8977
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.h
@@ -0,0 +1,63 @@
+//===-- X86TargetMachine.h - Define TargetMachine for the X86 ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H
+#define LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H
+
+#include "X86Subtarget.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Target/TargetMachine.h"
+#include <memory>
+
+namespace llvm {
+
+class StringRef;
+class TargetTransformInfo;
+
+class X86TargetMachine final : public LLVMTargetMachine {
+ std::unique_ptr<TargetLoweringObjectFile> TLOF;
+ mutable StringMap<std::unique_ptr<X86Subtarget>> SubtargetMap;
+ // True if this is used in JIT.
+ bool IsJIT;
+
+public:
+ X86TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+ CodeGenOpt::Level OL, bool JIT);
+ ~X86TargetMachine() override;
+
+ const X86Subtarget *getSubtargetImpl(const Function &F) const override;
+ // DO NOT IMPLEMENT: There is no such thing as a valid default subtarget,
+ // subtargets are per-function entities based on the target-specific
+ // attributes of each function.
+ const X86Subtarget *getSubtargetImpl() const = delete;
+
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+
+ // Set up the pass pipeline.
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+ TargetLoweringObjectFile *getObjFileLowering() const override {
+ return TLOF.get();
+ }
+
+ bool isJIT() const { return IsJIT; }
+
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.cpp
new file mode 100644
index 000000000000..b88ad5a478f3
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.cpp
@@ -0,0 +1,58 @@
+//===-- X86TargetObjectFile.cpp - X86 Object Info -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetObjectFile.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+using namespace dwarf;
+
+const MCExpr *X86_64MachoTargetObjectFile::getTTypeGlobalReference(
+ const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM,
+ MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+
+ // On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which
+ // is an indirect pc-relative reference.
+ if ((Encoding & DW_EH_PE_indirect) && (Encoding & DW_EH_PE_pcrel)) {
+ const MCSymbol *Sym = TM.getSymbol(GV);
+ const MCExpr *Res =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
+ const MCExpr *Four = MCConstantExpr::create(4, getContext());
+ return MCBinaryExpr::createAdd(Res, Four, getContext());
+ }
+
+ return TargetLoweringObjectFileMachO::getTTypeGlobalReference(
+ GV, Encoding, TM, MMI, Streamer);
+}
+
+MCSymbol *X86_64MachoTargetObjectFile::getCFIPersonalitySymbol(
+ const GlobalValue *GV, const TargetMachine &TM,
+ MachineModuleInfo *MMI) const {
+ return TM.getSymbol(GV);
+}
+
+const MCExpr *X86_64MachoTargetObjectFile::getIndirectSymViaGOTPCRel(
+ const GlobalValue *GV, const MCSymbol *Sym, const MCValue &MV,
+ int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+ // On Darwin/X86-64, we need to use foo@GOTPCREL+4 to access the got entry
+  // from a data section. If there's an additional offset, use
+  // foo@GOTPCREL+4+<offset>.
+ unsigned FinalOff = Offset+MV.getConstant()+4;
+ const MCExpr *Res =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
+ const MCExpr *Off = MCConstantExpr::create(FinalOff, getContext());
+ return MCBinaryExpr::createAdd(Res, Off, getContext());
+}
+
+const MCExpr *X86ELFTargetObjectFile::getDebugThreadLocalSymbol(
+ const MCSymbol *Sym) const {
+ return MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext());
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.h
new file mode 100644
index 000000000000..f4bf52c83771
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.h
@@ -0,0 +1,52 @@
+//===-- X86TargetObjectFile.h - X86 Object Info -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+
+ /// X86_64MachoTargetObjectFile - This TLOF implementation is used for Darwin
+ /// x86-64.
+ class X86_64MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
+ public:
+ const MCExpr *getTTypeGlobalReference(const GlobalValue *GV,
+ unsigned Encoding,
+ const TargetMachine &TM,
+ MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const override;
+
+ // getCFIPersonalitySymbol - The symbol that gets passed to
+ // .cfi_personality.
+ MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV,
+ const TargetMachine &TM,
+ MachineModuleInfo *MMI) const override;
+
+ const MCExpr *getIndirectSymViaGOTPCRel(const GlobalValue *GV,
+ const MCSymbol *Sym,
+ const MCValue &MV, int64_t Offset,
+ MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const override;
+ };
+
+ /// This implementation is used for X86 ELF targets that don't
+ /// have a further specialization.
+ class X86ELFTargetObjectFile : public TargetLoweringObjectFileELF {
+ public:
+ X86ELFTargetObjectFile() {
+ PLTRelativeVariantKind = MCSymbolRefExpr::VK_PLT;
+ }
+ /// Describe a TLS variable address within debug info.
+ const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
+ };
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
new file mode 100644
index 000000000000..71455237fb61
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -0,0 +1,4761 @@
+//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// X86 target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+/// A note about the cost model numbers used below: they correspond to some
+/// "generic" X86 CPU rather than a concrete CPU model. Usually a number
+/// corresponds to the CPU where the feature first appeared. For example, if we
+/// do Subtarget.hasSSE42() in the lookups below, the cost is based on Nehalem,
+/// as that was the first CPU to support that feature level and thus most
+/// likely has the worst-case cost.
+/// Some examples of other technologies/CPUs:
+/// SSE 3 - Pentium4 / Athlon64
+/// SSE 4.1 - Penryn
+/// SSE 4.2 - Nehalem
+/// AVX - Sandy Bridge
+/// AVX2 - Haswell
+/// AVX-512 - Xeon Phi / Skylake
+/// And some examples of instruction target dependent costs (latency)
+/// divss sqrtss rsqrtss
+/// AMD K7 11-16 19 3
+/// Piledriver 9-24 13-15 5
+/// Jaguar 14 16 2
+/// Pentium II,III 18 30 2
+/// Nehalem 7-14 7-18 3
+/// Haswell 10-13 11 5
+/// TODO: Develop and implement the target dependent cost model and
+/// specialize cost numbers for different Cost Model Targets such as throughput,
+/// code size, latency and uop count.
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetTransformInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/CodeGen/CostTable.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86tti"
+
+//===----------------------------------------------------------------------===//
+//
+// X86 cost model.
+//
+//===----------------------------------------------------------------------===//
+
+TargetTransformInfo::PopcntSupportKind
+X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
+ assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+ // TODO: Currently the __builtin_popcount() implementation using SSE3
+ // instructions is inefficient. Once the problem is fixed, we should
+ // call ST->hasSSE3() instead of ST->hasPOPCNT().
+ return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
+}
+
+llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
+ TargetTransformInfo::CacheLevel Level) const {
+ switch (Level) {
+ case TargetTransformInfo::CacheLevel::L1D:
+ // - Penryn
+ // - Nehalem
+ // - Westmere
+ // - Sandy Bridge
+ // - Ivy Bridge
+ // - Haswell
+ // - Broadwell
+ // - Skylake
+ // - Kabylake
+ return 32 * 1024; // 32 KByte
+ case TargetTransformInfo::CacheLevel::L2D:
+ // - Penryn
+ // - Nehalem
+ // - Westmere
+ // - Sandy Bridge
+ // - Ivy Bridge
+ // - Haswell
+ // - Broadwell
+ // - Skylake
+ // - Kabylake
+ return 256 * 1024; // 256 KByte
+ }
+
+ llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
+}
+
+llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
+ TargetTransformInfo::CacheLevel Level) const {
+ // - Penryn
+ // - Nehalem
+ // - Westmere
+ // - Sandy Bridge
+ // - Ivy Bridge
+ // - Haswell
+ // - Broadwell
+ // - Skylake
+ // - Kabylake
+ switch (Level) {
+ case TargetTransformInfo::CacheLevel::L1D:
+ LLVM_FALLTHROUGH;
+ case TargetTransformInfo::CacheLevel::L2D:
+ return 8;
+ }
+
+ llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
+}
+
+unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
+ bool Vector = (ClassID == 1);
+ if (Vector && !ST->hasSSE1())
+ return 0;
+
+ if (ST->is64Bit()) {
+ if (Vector && ST->hasAVX512())
+ return 32;
+ return 16;
+ }
+ return 8;
+}
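+
+// For example, in 64-bit mode an AVX-512 subtarget reports 32 vector registers
+// (zmm0-zmm31); other 64-bit subtargets report 16, and 32-bit mode reports 8.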
+
+unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
+ unsigned PreferVectorWidth = ST->getPreferVectorWidth();
+ if (Vector) {
+ if (ST->hasAVX512() && PreferVectorWidth >= 512)
+ return 512;
+ if (ST->hasAVX() && PreferVectorWidth >= 256)
+ return 256;
+ if (ST->hasSSE1() && PreferVectorWidth >= 128)
+ return 128;
+ return 0;
+ }
+
+ if (ST->is64Bit())
+ return 64;
+
+ return 32;
+}
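+
+// For example, a skylake-avx512 subtarget whose preferred vector width has
+// been capped at 256 reports 256 here even though 512-bit registers exist,
+// which steers the vectorizers toward 256-bit vectors by default.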
+
+unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
+ return getRegisterBitWidth(true);
+}
+
+unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
+ // If the loop will not be vectorized, don't interleave the loop.
+  // Let the regular unroller handle it instead, which saves the overflow
+  // check and memory check cost.
+ if (VF == 1)
+ return 1;
+
+ if (ST->isAtom())
+ return 1;
+
+ // Sandybridge and Haswell have multiple execution ports and pipelined
+ // vector units.
+ if (ST->hasAVX())
+ return 4;
+
+ return 2;
+}
+
+int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind,
+ TTI::OperandValueKind Op1Info,
+ TTI::OperandValueKind Op2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo,
+ ArrayRef<const Value *> Args,
+ const Instruction *CxtI) {
+ // TODO: Handle more cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
+ Op2Info, Opd1PropInfo,
+ Opd2PropInfo, Args, CxtI);
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ static const CostTblEntry GLMCostTable[] = {
+ { ISD::FDIV, MVT::f32, 18 }, // divss
+ { ISD::FDIV, MVT::v4f32, 35 }, // divps
+ { ISD::FDIV, MVT::f64, 33 }, // divsd
+ { ISD::FDIV, MVT::v2f64, 65 }, // divpd
+ };
+
+ if (ST->useGLMDivSqrtCosts())
+ if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry SLMCostTable[] = {
+ { ISD::MUL, MVT::v4i32, 11 }, // pmulld
+ { ISD::MUL, MVT::v8i16, 2 }, // pmullw
+ { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
+ { ISD::FMUL, MVT::f64, 2 }, // mulsd
+ { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
+ { ISD::FMUL, MVT::v4f32, 2 }, // mulps
+ { ISD::FDIV, MVT::f32, 17 }, // divss
+ { ISD::FDIV, MVT::v4f32, 39 }, // divps
+ { ISD::FDIV, MVT::f64, 32 }, // divsd
+ { ISD::FDIV, MVT::v2f64, 69 }, // divpd
+ { ISD::FADD, MVT::v2f64, 2 }, // addpd
+ { ISD::FSUB, MVT::v2f64, 2 }, // subpd
+    // v2i64/v4i64 mul is custom lowered as a series of long:
+    // multiplies(3), shifts(3) and adds(2).
+    // slm muldq throughput is 2, shift throughput is 1 and addq throughput 4,
+    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
+    // 2X4 (addq throughput) = 17.
+ { ISD::MUL, MVT::v2i64, 17 },
+ // slm addq\subq throughput is 4
+ { ISD::ADD, MVT::v2i64, 4 },
+ { ISD::SUB, MVT::v2i64, 4 },
+ };
+
+ if (ST->isSLM()) {
+ if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
+      // Check if the operands can be shrunk into a smaller datatype.
+ bool Op1Signed = false;
+ unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
+ bool Op2Signed = false;
+ unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
+
+ bool SignedMode = Op1Signed || Op2Signed;
+ unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
+
+ if (OpMinSize <= 7)
+ return LT.first * 3; // pmullw/sext
+ if (!SignedMode && OpMinSize <= 8)
+ return LT.first * 3; // pmullw/zext
+ if (OpMinSize <= 15)
+ return LT.first * 5; // pmullw/pmulhw/pshuf
+ if (!SignedMode && OpMinSize <= 16)
+ return LT.first * 5; // pmullw/pmulhw/pshuf
+ }
+
+ if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
+ LT.second)) {
+ return LT.first * Entry->Cost;
+ }
+ }
+
+ if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
+ ISD == ISD::UREM) &&
+ (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
+ Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
+ if (ISD == ISD::SDIV || ISD == ISD::SREM) {
+ // On X86, vector signed division by constants power-of-two are
+ // normally expanded to the sequence SRA + SRL + ADD + SRA.
+ // The OperandValue properties may not be the same as that of the previous
+ // operation; conservatively assume OP_None.
+ int Cost =
+ 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
+ Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
+ Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
+ Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+
+ if (ISD == ISD::SREM) {
+ // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
+ Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
+ Op2Info);
+ Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
+ Op2Info);
+ }
+
+ return Cost;
+ }
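+
+    // For example, a v4i32 signed division by a uniform power-of-two constant
+    // is costed above as 2*AShr + LShr + Add, and the matching SREM adds a Mul
+    // and a Sub on top of that.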
+
+ // Vector unsigned division/remainder will be simplified to shifts/masks.
+ if (ISD == ISD::UDIV)
+ return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
+ Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+
+ else // UREM
+ return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
+ Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ }
+
+ static const CostTblEntry AVX512BWUniformConstCostTable[] = {
+ { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
+ { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
+ { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
+ };
+
+ if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ ST->hasBWI()) {
+ if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX512UniformConstCostTable[] = {
+ { ISD::SRA, MVT::v2i64, 1 },
+ { ISD::SRA, MVT::v4i64, 1 },
+ { ISD::SRA, MVT::v8i64, 1 },
+
+ { ISD::SHL, MVT::v64i8, 4 }, // psllw + pand.
+ { ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand.
+ { ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb.
+
+ { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence
+ { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence
+ { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence
+ { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence
+ };
+
+ if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ ST->hasAVX512()) {
+ if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX2UniformConstCostTable[] = {
+ { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
+ { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
+ { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
+
+ { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
+
+ { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence
+ { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence
+ { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence
+ { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence
+ };
+
+ if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ ST->hasAVX2()) {
+ if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry SSE2UniformConstCostTable[] = {
+ { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
+ { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
+ { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
+
+ { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
+ { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
+ { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
+
+ { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split.
+ { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split.
+ { ISD::SDIV, MVT::v4i32, 6 }, // pmuludq sequence
+ { ISD::SREM, MVT::v4i32, 8 }, // pmuludq+mul+sub sequence
+ { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split.
+ { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split.
+ { ISD::UDIV, MVT::v4i32, 5 }, // pmuludq sequence
+ { ISD::UREM, MVT::v4i32, 7 }, // pmuludq+mul+sub sequence
+ };
+
+ // XOP has faster vXi8 shifts.
+ if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ ST->hasSSE2() && !ST->hasXOP()) {
+ if (const auto *Entry =
+ CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX512BWConstCostTable[] = {
+ { ISD::SDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
+ { ISD::SREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
+ { ISD::UDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
+ { ISD::UREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
+ { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
+ { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence
+ { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
+ { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence
+ };
+
+ if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
+ ST->hasBWI()) {
+ if (const auto *Entry =
+ CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX512ConstCostTable[] = {
+ { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
+ { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
+ { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
+ { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
+ { ISD::SDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
+ { ISD::SREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
+ { ISD::UDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
+ { ISD::UREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
+ { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
+ { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
+ { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
+ { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
+ };
+
+ if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
+ ST->hasAVX512()) {
+ if (const auto *Entry =
+ CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX2ConstCostTable[] = {
+ { ISD::SDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
+ { ISD::SREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
+ { ISD::UDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
+ { ISD::UREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
+ { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
+ { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence
+ { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
+ { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence
+ { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
+ { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence
+ { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
+ { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence
+ };
+
+ if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
+ ST->hasAVX2()) {
+ if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry SSE2ConstCostTable[] = {
+ { ISD::SDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
+ { ISD::SREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
+ { ISD::SDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
+ { ISD::SREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
+ { ISD::UDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
+ { ISD::UREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
+ { ISD::UDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
+ { ISD::UREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
+ { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
+ { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
+ { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
+ { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence
+ { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
+ { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
+ { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
+ { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence
+ { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
+ { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split.
+ { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
+ { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence
+ { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
+ { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split.
+ { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
+ { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence
+ };
+
+ if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
+ ST->hasSSE2()) {
+ // pmuldq sequence.
+ if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
+ return LT.first * 32;
+ if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
+ return LT.first * 38;
+ if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
+ return LT.first * 15;
+ if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
+ return LT.first * 20;
+
+ if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX512BWShiftCostTable[] = {
+ { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
+
+ { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
+
+ { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
+ };
+
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry AVX2UniformCostTable[] = {
+ // Uniform splats are cheaper for the following instructions.
+ { ISD::SHL, MVT::v16i16, 1 }, // psllw.
+ { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
+ { ISD::SRA, MVT::v16i16, 1 }, // psraw.
+ { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
+ { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
+ { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.
+ };
+
+ if (ST->hasAVX2() &&
+ ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
+ (Op2Info == TargetTransformInfo::OK_UniformValue))) {
+ if (const auto *Entry =
+ CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry SSE2UniformCostTable[] = {
+ // Uniform splats are cheaper for the following instructions.
+ { ISD::SHL, MVT::v8i16, 1 }, // psllw.
+ { ISD::SHL, MVT::v4i32, 1 }, // pslld
+ { ISD::SHL, MVT::v2i64, 1 }, // psllq.
+
+ { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
+ { ISD::SRL, MVT::v4i32, 1 }, // psrld.
+ { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
+
+ { ISD::SRA, MVT::v8i16, 1 }, // psraw.
+ { ISD::SRA, MVT::v4i32, 1 }, // psrad.
+ };
+
+ if (ST->hasSSE2() &&
+ ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
+ (Op2Info == TargetTransformInfo::OK_UniformValue))) {
+ if (const auto *Entry =
+ CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX512DQCostTable[] = {
+ { ISD::MUL, MVT::v2i64, 1 },
+ { ISD::MUL, MVT::v4i64, 1 },
+ { ISD::MUL, MVT::v8i64, 1 }
+ };
+
+ // Look for AVX512DQ lowering tricks for custom cases.
+ if (ST->hasDQI())
+ if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry AVX512BWCostTable[] = {
+ { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
+ { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
+ { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
+
+ { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
+ };
+
+ // Look for AVX512BW lowering tricks for custom cases.
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry AVX512CostTable[] = {
+ { ISD::SHL, MVT::v16i32, 1 },
+ { ISD::SRL, MVT::v16i32, 1 },
+ { ISD::SRA, MVT::v16i32, 1 },
+
+ { ISD::SHL, MVT::v8i64, 1 },
+ { ISD::SRL, MVT::v8i64, 1 },
+
+ { ISD::SRA, MVT::v2i64, 1 },
+ { ISD::SRA, MVT::v4i64, 1 },
+ { ISD::SRA, MVT::v8i64, 1 },
+
+ { ISD::MUL, MVT::v64i8, 26 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
+ { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
+ { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
+ { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
+
+ { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
+
+ { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
+ };
+
+ if (ST->hasAVX512())
+ if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry AVX2ShiftCostTable[] = {
+    // Shifts on v4i64/v8i32 on AVX2 are legal even though we declare them
+    // custom so that we can detect the cases where the shift amount is a
+    // scalar.
+ { ISD::SHL, MVT::v4i32, 1 },
+ { ISD::SRL, MVT::v4i32, 1 },
+ { ISD::SRA, MVT::v4i32, 1 },
+ { ISD::SHL, MVT::v8i32, 1 },
+ { ISD::SRL, MVT::v8i32, 1 },
+ { ISD::SRA, MVT::v8i32, 1 },
+ { ISD::SHL, MVT::v2i64, 1 },
+ { ISD::SRL, MVT::v2i64, 1 },
+ { ISD::SHL, MVT::v4i64, 1 },
+ { ISD::SRL, MVT::v4i64, 1 },
+ };
+
+ if (ST->hasAVX512()) {
+ if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
+ (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
+ // On AVX512, a packed v32i16 shift left by a constant build_vector
+ // is lowered into a vector multiply (vpmullw).
+ return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
+ Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ }
+
+ // Look for AVX2 lowering tricks.
+ if (ST->hasAVX2()) {
+ if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
+ (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
+ // On AVX2, a packed v16i16 shift left by a constant build_vector
+ // is lowered into a vector multiply (vpmullw).
+ return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
+ Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+
+ if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry XOPShiftCostTable[] = {
+    // 128-bit shifts take 1cy, but right shifts require negation beforehand.
+ { ISD::SHL, MVT::v16i8, 1 },
+ { ISD::SRL, MVT::v16i8, 2 },
+ { ISD::SRA, MVT::v16i8, 2 },
+ { ISD::SHL, MVT::v8i16, 1 },
+ { ISD::SRL, MVT::v8i16, 2 },
+ { ISD::SRA, MVT::v8i16, 2 },
+ { ISD::SHL, MVT::v4i32, 1 },
+ { ISD::SRL, MVT::v4i32, 2 },
+ { ISD::SRA, MVT::v4i32, 2 },
+ { ISD::SHL, MVT::v2i64, 1 },
+ { ISD::SRL, MVT::v2i64, 2 },
+ { ISD::SRA, MVT::v2i64, 2 },
+    // 256-bit shifts require splitting if AVX2 didn't catch them above.
+ { ISD::SHL, MVT::v32i8, 2+2 },
+ { ISD::SRL, MVT::v32i8, 4+2 },
+ { ISD::SRA, MVT::v32i8, 4+2 },
+ { ISD::SHL, MVT::v16i16, 2+2 },
+ { ISD::SRL, MVT::v16i16, 4+2 },
+ { ISD::SRA, MVT::v16i16, 4+2 },
+ { ISD::SHL, MVT::v8i32, 2+2 },
+ { ISD::SRL, MVT::v8i32, 4+2 },
+ { ISD::SRA, MVT::v8i32, 4+2 },
+ { ISD::SHL, MVT::v4i64, 2+2 },
+ { ISD::SRL, MVT::v4i64, 4+2 },
+ { ISD::SRA, MVT::v4i64, 4+2 },
+ };
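+  // In the 256-bit entries above, the cost is twice the corresponding 128-bit
+  // cost plus 2 for the split (extract + insert of the upper half).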
+
+ // Look for XOP lowering tricks.
+ if (ST->hasXOP()) {
+ // If the right shift is constant then we'll fold the negation so
+ // it's as cheap as a left shift.
+ int ShiftISD = ISD;
+ if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
+ (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
+ ShiftISD = ISD::SHL;
+ if (const auto *Entry =
+ CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry SSE2UniformShiftCostTable[] = {
+ // Uniform splats are cheaper for the following instructions.
+ { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
+ { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
+ { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
+
+ { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
+ { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
+ { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
+
+ { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
+ { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
+ { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
+ { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
+ };
+
+ if (ST->hasSSE2() &&
+ ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
+ (Op2Info == TargetTransformInfo::OK_UniformValue))) {
+
+    // Handle AVX2 uniform v4i64 ISD::SRA here; it's not worth a table.
+ if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
+ return LT.first * 4; // 2*psrad + shuffle.
+
+ if (const auto *Entry =
+ CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ if (ISD == ISD::SHL &&
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
+ MVT VT = LT.second;
+    // A vector shift left by a non-uniform constant can be lowered into a
+    // vector multiply.
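+    // For example, x << <0, 1, 2, 3> is equivalent to x * <1, 2, 4, 8>, so
+    // the cost is taken from the MUL entries in the tables below.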
+ if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
+ ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
+ ISD = ISD::MUL;
+ }
+
+ static const CostTblEntry AVX2CostTable[] = {
+ { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
+ { ISD::SHL, MVT::v64i8, 22 }, // 2*vpblendvb sequence.
+ { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
+ { ISD::SHL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.
+
+ { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
+ { ISD::SRL, MVT::v64i8, 22 }, // 2*vpblendvb sequence.
+ { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
+ { ISD::SRL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.
+
+ { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
+ { ISD::SRA, MVT::v64i8, 48 }, // 2*vpblendvb sequence.
+ { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
+ { ISD::SRA, MVT::v32i16, 20 }, // 2*extend/vpsravd/pack sequence.
+ { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
+ { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
+
+ { ISD::SUB, MVT::v32i8, 1 }, // psubb
+ { ISD::ADD, MVT::v32i8, 1 }, // paddb
+ { ISD::SUB, MVT::v16i16, 1 }, // psubw
+ { ISD::ADD, MVT::v16i16, 1 }, // paddw
+ { ISD::SUB, MVT::v8i32, 1 }, // psubd
+ { ISD::ADD, MVT::v8i32, 1 }, // paddd
+ { ISD::SUB, MVT::v4i64, 1 }, // psubq
+ { ISD::ADD, MVT::v4i64, 1 }, // paddq
+
+ { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v16i16, 1 }, // pmullw
+ { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
+ { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
+
+ { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
+
+ { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
+ };
+
+ // Look for AVX2 lowering tricks for custom cases.
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry AVX1CostTable[] = {
+    // We don't have to scalarize unsupported ops. We can issue two half-sized
+    // operations: extract the upper YMM half, operate on both halves, and
+    // insert the upper result back. Two ops + 1 extract + 1 insert = 4.
+ { ISD::MUL, MVT::v16i16, 4 },
+ { ISD::MUL, MVT::v8i32, 4 },
+ { ISD::SUB, MVT::v32i8, 4 },
+ { ISD::ADD, MVT::v32i8, 4 },
+ { ISD::SUB, MVT::v16i16, 4 },
+ { ISD::ADD, MVT::v16i16, 4 },
+ { ISD::SUB, MVT::v8i32, 4 },
+ { ISD::ADD, MVT::v8i32, 4 },
+ { ISD::SUB, MVT::v4i64, 4 },
+ { ISD::ADD, MVT::v4i64, 4 },
+
+    // A v4i64 multiply is custom lowered as two split v2i64 vectors that are
+    // each lowered as a series of long multiplies (3), shifts (3) and adds (2),
+    // i.e. 8 per half. Because we believe v4i64 to be a legal type, we must
+    // also include the extract+insert in the cost table. Therefore, the cost
+    // here is 2 * 8 + 1 extract + 1 insert = 18 instead of 8.
+ { ISD::MUL, MVT::v4i64, 18 },
+
+ { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
+
+ { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
+ };
+
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry SSE42CostTable[] = {
+ { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
+
+ { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
+
+ { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
+
+ { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
+ };
+
+ if (ST->hasSSE42())
+ if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry SSE41CostTable[] = {
+ { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split.
+ { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+ { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
+ { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
+
+ { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split.
+ { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+ { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split.
+
+ { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
+ { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split.
+ { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+ { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
+
+ { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
+ };
+
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry SSE2CostTable[] = {
+ // We don't correctly identify costs of casts because they are marked as
+ // custom.
+ { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
+ { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
+ { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
+
+ { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
+ { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
+
+ { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
+ { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
+ { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.
+
+ { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v8i16, 1 }, // pmullw
+ { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
+ { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
+
+ { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
+
+ { ISD::FADD, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
+ { ISD::FADD, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
+
+ { ISD::FSUB, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
+ { ISD::FSUB, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
+ };
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry SSE1CostTable[] = {
+ { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
+
+ { ISD::FADD, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
+ { ISD::FADD, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
+
+ { ISD::FSUB, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
+ { ISD::FSUB, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
+
+ { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
+ { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
+ { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
+
+ { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
+ { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
+ { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
+ };
+
+ if (ST->hasSSE1())
+ if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
+  // It is not a good idea to vectorize division. We have to scalarize it and
+  // in the process we will often end up having to spill regular registers.
+  // The overhead of division is going to dominate most kernels anyway, so try
+  // hard to prevent vectorization of division - it is generally a bad idea.
+  // Assume somewhat arbitrarily that we have to be able to hide "20 cycles"
+  // for each lane.
+ if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
+ ISD == ISD::UDIV || ISD == ISD::UREM)) {
+ int ScalarCost = getArithmeticInstrCost(
+ Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
+ return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
+ }
+
+ // Fallback to the default implementation.
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
+}
+
+int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
+ int Index, VectorType *SubTp) {
+ // 64-bit packed float vectors (v2f32) are widened to type v4f32.
+ // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);
+
+ // Treat Transpose as 2-op shuffles - there's no difference in lowering.
+ if (Kind == TTI::SK_Transpose)
+ Kind = TTI::SK_PermuteTwoSrc;
+
+  // For Broadcasts we are splatting the first element from the first input
+  // register, so we only need to reference that input, and all the output
+  // registers are the same.
+ if (Kind == TTI::SK_Broadcast)
+ LT.first = 1;
+
+ // Subvector extractions are free if they start at the beginning of a
+ // vector and cheap if the subvectors are aligned.
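+  // For example, an extraction starting at element 0 is free, while other
+  // extractions aligned to the legalized subvector type cost SubLT.first.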
+ if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
+ int NumElts = LT.second.getVectorNumElements();
+ if ((Index % NumElts) == 0)
+ return 0;
+ std::pair<int, MVT> SubLT = TLI->getTypeLegalizationCost(DL, SubTp);
+ if (SubLT.second.isVector()) {
+ int NumSubElts = SubLT.second.getVectorNumElements();
+ if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
+ return SubLT.first;
+ // Handle some cases for widening legalization. For now we only handle
+ // cases where the original subvector was naturally aligned and evenly
+ // fit in its legalized subvector type.
+ // FIXME: Remove some of the alignment restrictions.
+ // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
+ // vectors.
+ int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
+ if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
+ (NumSubElts % OrigSubElts) == 0 &&
+ LT.second.getVectorElementType() ==
+ SubLT.second.getVectorElementType() &&
+ LT.second.getVectorElementType().getSizeInBits() ==
+ BaseTp->getElementType()->getPrimitiveSizeInBits()) {
+ assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
+ "Unexpected number of elements!");
+ auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
+ LT.second.getVectorNumElements());
+ auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
+ SubLT.second.getVectorNumElements());
+ int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
+ int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy,
+ ExtractIndex, SubTy);
+
+        // If the original size is 32 bits or more, we can use pshufd.
+        // Otherwise, if we have SSSE3, we can use pshufb.
+ if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
+ return ExtractCost + 1; // pshufd or pshufb
+
+ assert(SubTp->getPrimitiveSizeInBits() == 16 &&
+ "Unexpected vector size");
+
+ return ExtractCost + 2; // worst case pshufhw + pshufd
+ }
+ }
+ }
+
+ // Handle some common (illegal) sub-vector types as they are often very cheap
+ // to shuffle even on targets without PSHUFB.
+ EVT VT = TLI->getValueType(DL, BaseTp);
+ if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
+ !ST->hasSSSE3()) {
+ static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
+ {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
+ {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
+ {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
+ {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
+ {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
+
+ {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
+ {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
+ {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
+ {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
+
+ {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
+ {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
+ {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
+ {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
+ {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
+
+ {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
+ {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
+ {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
+ {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
+ {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
+ };
+
+ if (ST->hasSSE2())
+ if (const auto *Entry =
+ CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
+ return Entry->Cost;
+ }
+
+  // We are going to permute multiple sources, and the result will be in
+  // multiple destinations. We provide an accurate cost only for splits where
+  // the element type remains the same.
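+  // For example, a 16-element single-source permute whose legal type holds 8
+  // elements has NumOfSrcs == 2 and NumOfDests == 2, so it is costed as
+  // (2 - 1) * 2 = 2 two-source permutes of the legal type.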
+ if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
+ MVT LegalVT = LT.second;
+ if (LegalVT.isVector() &&
+ LegalVT.getVectorElementType().getSizeInBits() ==
+ BaseTp->getElementType()->getPrimitiveSizeInBits() &&
+ LegalVT.getVectorNumElements() <
+ cast<FixedVectorType>(BaseTp)->getNumElements()) {
+
+ unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
+ unsigned LegalVTSize = LegalVT.getStoreSize();
+ // Number of source vectors after legalization:
+ unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+ // Number of destination vectors after legalization:
+ unsigned NumOfDests = LT.first;
+
+ auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
+ LegalVT.getVectorNumElements());
+
+ unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
+ return NumOfShuffles *
+ getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
+ }
+
+ return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
+ }
+
+ // For 2-input shuffles, we must account for splitting the 2 inputs into many.
+ if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
+ // We assume that source and destination have the same vector type.
+ int NumOfDests = LT.first;
+ int NumOfShufflesPerDest = LT.first * 2 - 1;
+ LT.first = NumOfDests * NumOfShufflesPerDest;
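+    // For example, if the type splits in two (LT.first == 2), each of the 2
+    // destinations needs 2 * 2 - 1 = 3 shuffles, so LT.first becomes 6.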
+ }
+
+ static const CostTblEntry AVX512VBMIShuffleTbl[] = {
+ {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
+ {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
+
+ {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
+ {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
+
+ {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
+ {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
+ {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
+ };
+
+ if (ST->hasVBMI())
+ if (const auto *Entry =
+ CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry AVX512BWShuffleTbl[] = {
+ {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
+
+ {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
+ {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
+ {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
+
+ {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
+ {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
+ {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
+
+ {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
+ {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
+ {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
+ {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
+
+ {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
+ {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
+ };
+
+ if (ST->hasBWI())
+ if (const auto *Entry =
+ CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry AVX512ShuffleTbl[] = {
+ {TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd
+ {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
+ {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
+ {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
+ {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
+
+ {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
+ {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
+ {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
+ {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
+
+ {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
+ {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
+ {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd
+ {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
+ {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
+ {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps
+ {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq
+ {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
+ {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq
+ {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
+ {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
+ {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd
+ {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
+
+ {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd
+ {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
+ {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q
+ {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
+ {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd
+ {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps
+ {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q
+ {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d
+ {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd
+ {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps
+ {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q
+ {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}, // vpermt2d
+
+ // FIXME: This just applies the type legalization cost rules above
+ // assuming these completely split.
+ {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14},
+ {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14},
+ {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42},
+ {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42},
+
+ {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq
+ {TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq
+ {TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd
+ {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps
+ {TTI::SK_Select, MVT::v8i64, 1}, // vblendmq
+ {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd
+ };
+
+ if (ST->hasAVX512())
+ if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry AVX2ShuffleTbl[] = {
+ {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
+ {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
+ {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
+ {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
+ {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
+
+ {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
+ {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
+ {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
+ {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
+ {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
+ {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
+
+ {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
+ {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
+
+ {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
+ {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
+ {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
+ {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
+ {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
+ // + vpblendvb
+ {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
+ // + vpblendvb
+
+ {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
+ {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
+ {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
+ {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
+ {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
+ // + vpblendvb
+ {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
+ // + vpblendvb
+ };
+
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry XOPShuffleTbl[] = {
+ {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
+ {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
+ {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
+ {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
+ {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
+ // + vinsertf128
+ {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
+ // + vinsertf128
+
+ {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
+ // + vinsertf128
+ {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
+ {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
+ // + vinsertf128
+ {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
+ };
+
+ if (ST->hasXOP())
+ if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry AVX1ShuffleTbl[] = {
+ {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
+ {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
+ {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
+ {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
+ {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
+ {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
+
+ {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
+ {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
+ {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
+ {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
+ {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
+ // + vinsertf128
+ {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
+ // + vinsertf128
+
+ {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
+ {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
+ {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
+ {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
+ {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
+ {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
+
+ {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
+ {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
+ {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
+ {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
+ {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
+ // + 2*por + vinsertf128
+ {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
+ // + 2*por + vinsertf128
+
+ {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
+ {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
+ {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
+ {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
+ {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
+ // + 4*por + vinsertf128
+ {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
+ // + 4*por + vinsertf128
+ };
+
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry SSE41ShuffleTbl[] = {
+ {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
+ {TTI::SK_Select, MVT::v2f64, 1}, // movsd
+ {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
+ {TTI::SK_Select, MVT::v4f32, 1}, // blendps
+ {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
+ {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
+ };
+
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry SSSE3ShuffleTbl[] = {
+ {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
+ {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
+
+ {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
+ {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
+
+ {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
+ {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
+
+ {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
+ {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
+
+ {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
+ {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
+ };
+
+ if (ST->hasSSSE3())
+ if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry SSE2ShuffleTbl[] = {
+ {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
+ {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
+ {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
+ {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
+ {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
+
+ {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
+ {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
+ {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
+ {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
+ {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
+ // + 2*pshufd + 2*unpck + packus
+
+ {TTI::SK_Select, MVT::v2i64, 1}, // movsd
+ {TTI::SK_Select, MVT::v2f64, 1}, // movsd
+ {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
+ {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
+ {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
+
+ {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
+ {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
+ {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
+ {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
+ // + pshufd/unpck
+ { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
+ // + 2*pshufd + 2*unpck + 2*packus
+
+ { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
+ { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
+ { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
+ { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
+ { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
+ };
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry SSE1ShuffleTbl[] = {
+ { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
+ { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
+ { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
+ { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
+ { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
+ };
+
+ if (ST->hasSSE1())
+ if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
+
+ return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
+}
+
+int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::CastContextHint CCH,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ // TODO: Allow non-throughput costs that aren't binary.
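+  // For now, any cost kind other than reciprocal throughput is collapsed to
+  // 0 (free) or 1 (not free).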
+ auto AdjustCost = [&CostKind](int Cost) {
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Cost == 0 ? 0 : 1;
+ return Cost;
+ };
+
+  // FIXME: Need a better design of the cost table to handle non-simple types
+  // with potentially massive combinations (elem_num x src_type x dst_type).
+
+ static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
+ { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
+
+ // Mask sign extend has an instruction.
+ { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
+
+ // Mask zero extend is a sext + shift.
+ { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
+
+ { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 },
+ { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 },
+ };
+
+ static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
+ { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
+
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
+
+ { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },
+
+ { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
+ };
+
+ // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
+ // 256-bit wide vectors.
+
+ static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
+ { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
+ { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
+ { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },
+
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 },
+ { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 },
+ { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
+
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
+ { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 },
+
+ // Sign extend is zmm vpternlogd+vptruncdb.
+ // Zero extend is zmm broadcast load+vptruncdw.
+ { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 },
+
+ // Sign extend is zmm vpternlogd+vptruncdw.
+ // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
+ { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
+
+ { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
+
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
+
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
+
+ { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
+ { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
+
+ { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
+ { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
+ { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
+ { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
+ { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
+ { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
+ { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
+
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
+
+ { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f64, 3 },
+ { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 3 },
+ { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 3 },
+
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
+ { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 },
+ { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 },
+ };
+
+ static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
+ // Mask sign extend has an instruction.
+ { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
+
+ // Mask zero extend is a sext + shift.
+ { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
+
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // vpsllw+vptestmb
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // vpsllw+vptestmw
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // vpsllw+vptestmb
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // vpsllw+vptestmw
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // vpsllw+vptestmb
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // vpsllw+vptestmw
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // vpsllw+vptestmb
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // vpsllw+vptestmw
+ { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // vpsllw+vptestmb
+ };
+
+ static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
+ { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
+
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
+
+ { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
+
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
+ };
+
+ static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
+
+ // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
+ // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
+ { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
+ { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
+ { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
+ { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
+ { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
+ { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
+ { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
+ { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },
+
+ // sign extend is vpcmpeq+maskedmove+vpmovdw
+ // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
+ { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
+ { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },
+
+ { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
+
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },
+
+ { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
+ { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
+
+ { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 3 },
+
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
+
+ { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
+ };
+
+ static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
+
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
+
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 },
+ { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
+
+ { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
+ { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
+
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
+ };
+
+ static const TypeConversionCostTblEntry AVXConversionTbl[] = {
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
+
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
+
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 11 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 9 },
+ { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 11 },
+
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
+
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 6 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 6 },
+    // The generic code to compute the scalar overhead is currently broken.
+    // Work around this limitation by estimating the scalarization overhead
+    // here. We have roughly 10 instructions per scalar element.
+    // Multiply that by the vector width.
+    // FIXME: remove this when PR19268 is fixed.
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
+
+ { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 4 },
+ { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f64, 3 },
+ { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 3 },
+
+ { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f64, 3 },
+ { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 4 },
+ { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 3 },
+    // This node is expanded into scalarized operations, but BasicTTI is overly
+    // optimistic in estimating its cost. It computes 3 per element (one
+    // vector-extract, one scalar conversion and one vector-insert). The
+    // problem is that the inserts form a read-modify-write chain, so latency
+    // should be factored in too. We inflate the cost per element by 1.
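+    // That gives (3 + 1) = 4 per element, hence the 8*4 and 4*4 entries below.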
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 },
+
+ { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
+ { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
+ };
+
+ static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 2 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
+
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
+
+ // These truncates end up widening elements.
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVZXBQ
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVZXWQ
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVZXBD
+
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 1 },
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 1 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 },
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 },
+ { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 },
+ { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1 }, // PSHUFB
+
+ { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
+ { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
+
+ { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 3 },
+ { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 3 },
+
+ { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 3 },
+ { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
+ };
+
+ static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
+ // These are somewhat magic numbers, justified by looking at the output of
+ // Intel's IACA, running some kernels, and making sure that when we take
+ // legalization into account the throughput will be overestimated.
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 2*10 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2*10 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 6 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+
+ { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 4 },
+ { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
+ { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 4 },
+
+ { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
+
+ { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 6 },
+ { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 6 },
+
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
+ { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 4 },
+ { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 4 },
+ { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 2 },
+ { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 },
+
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 10 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 },
+
+ // These truncates are really widening elements.
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
+
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // PAND+PACKUSWB
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // PAND+PACKUSWB
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+2*PACKUSWB
+ { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 },
+ { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
+ { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 },
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
+ { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
+ { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1 }, // PSHUFD
+ };
+
+ std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
+ std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
+
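+ // On SSE2-only targets (no AVX), try the lookup with the fully legalized
+ // types first and scale the table cost by the number of pieces the source
+ // type is split into.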
+ if (ST->hasSSE2() && !ST->hasAVX()) {
+ if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
+ LTDest.second, LTSrc.second))
+ return AdjustCost(LTSrc.first * Entry->Cost);
+ }
+
+ EVT SrcTy = TLI->getValueType(DL, Src);
+ EVT DstTy = TLI->getValueType(DL, Dst);
+
+ // The function getSimpleVT only handles simple value types.
+ if (!SrcTy.isSimple() || !DstTy.isSimple())
+ return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind));
+
+ MVT SimpleSrcTy = SrcTy.getSimpleVT();
+ MVT SimpleDstTy = DstTy.getSimpleVT();
+
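+ // The remaining lookups use the simple source/destination types and consult
+ // the tables from the most specific feature set down to the most generic;
+ // the first matching entry wins.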
+ if (ST->useAVX512Regs()) {
+ if (ST->hasBWI())
+ if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+
+ if (ST->hasDQI())
+ if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+
+ if (ST->hasAVX512())
+ if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+ }
+
+ if (ST->hasBWI())
+ if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+
+ if (ST->hasDQI())
+ if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+
+ if (ST->hasAVX512())
+ if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+
+ if (ST->hasAVX2()) {
+ if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+ }
+
+ if (ST->hasAVX()) {
+ if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+ }
+
+ if (ST->hasSSE41()) {
+ if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+ }
+
+ if (ST->hasSSE2()) {
+ if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+ }
+
+ return AdjustCost(
+ BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
+}
+
+int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ CmpInst::Predicate VecPred,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
+ // TODO: Handle other cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
+ I);
+
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+
+ MVT MTy = LT.second;
+
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ unsigned ExtraCost = 0;
+ if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) {
+ // Some vector comparison predicates cost extra instructions.
+ if (MTy.isVector() &&
+ !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
+ (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
+ ST->hasBWI())) {
+ switch (cast<CmpInst>(I)->getPredicate()) {
+ case CmpInst::Predicate::ICMP_NE:
+ // xor(cmpeq(x,y),-1)
+ ExtraCost = 1;
+ break;
+ case CmpInst::Predicate::ICMP_SGE:
+ case CmpInst::Predicate::ICMP_SLE:
+ // xor(cmpgt(x,y),-1)
+ ExtraCost = 1;
+ break;
+ case CmpInst::Predicate::ICMP_ULT:
+ case CmpInst::Predicate::ICMP_UGT:
+ // cmpgt(xor(x,signbit),xor(y,signbit))
+ // xor(cmpeq(pmaxu(x,y),x),-1)
+ ExtraCost = 2;
+ break;
+ case CmpInst::Predicate::ICMP_ULE:
+ case CmpInst::Predicate::ICMP_UGE:
+ if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
+ (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
+ // cmpeq(psubus(x,y),0)
+ // cmpeq(pminu(x,y),x)
+ ExtraCost = 1;
+ } else {
+ // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
+ ExtraCost = 3;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ }
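+ // Any ExtraCost is added on top of the base SETCC/SELECT cost found in the
+ // tables below, and the sum is scaled by LT.first when the type was split.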
+
+ static const CostTblEntry SLMCostTbl[] = {
+ // slm pcmpeq/pcmpgt throughput is 2
+ { ISD::SETCC, MVT::v2i64, 2 },
+ };
+
+ static const CostTblEntry AVX512BWCostTbl[] = {
+ { ISD::SETCC, MVT::v32i16, 1 },
+ { ISD::SETCC, MVT::v64i8, 1 },
+
+ { ISD::SELECT, MVT::v32i16, 1 },
+ { ISD::SELECT, MVT::v64i8, 1 },
+ };
+
+ static const CostTblEntry AVX512CostTbl[] = {
+ { ISD::SETCC, MVT::v8i64, 1 },
+ { ISD::SETCC, MVT::v16i32, 1 },
+ { ISD::SETCC, MVT::v8f64, 1 },
+ { ISD::SETCC, MVT::v16f32, 1 },
+
+ { ISD::SELECT, MVT::v8i64, 1 },
+ { ISD::SELECT, MVT::v16i32, 1 },
+ { ISD::SELECT, MVT::v8f64, 1 },
+ { ISD::SELECT, MVT::v16f32, 1 },
+
+ { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4
+ { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4
+
+ { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3
+ { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3
+ };
+
+ static const CostTblEntry AVX2CostTbl[] = {
+ { ISD::SETCC, MVT::v4i64, 1 },
+ { ISD::SETCC, MVT::v8i32, 1 },
+ { ISD::SETCC, MVT::v16i16, 1 },
+ { ISD::SETCC, MVT::v32i8, 1 },
+
+ { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb
+ { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb
+ { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb
+ { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb
+ };
+
+ static const CostTblEntry AVX1CostTbl[] = {
+ { ISD::SETCC, MVT::v4f64, 1 },
+ { ISD::SETCC, MVT::v8f32, 1 },
+ // AVX1 does not support 8-wide integer compare.
+ { ISD::SETCC, MVT::v4i64, 4 },
+ { ISD::SETCC, MVT::v8i32, 4 },
+ { ISD::SETCC, MVT::v16i16, 4 },
+ { ISD::SETCC, MVT::v32i8, 4 },
+
+ { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd
+ { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps
+ { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd
+ { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps
+ { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
+ { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
+ };
+
+ static const CostTblEntry SSE42CostTbl[] = {
+ { ISD::SETCC, MVT::v2f64, 1 },
+ { ISD::SETCC, MVT::v4f32, 1 },
+ { ISD::SETCC, MVT::v2i64, 1 },
+ };
+
+ static const CostTblEntry SSE41CostTbl[] = {
+ { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd
+ { ISD::SELECT, MVT::v4f32, 1 }, // blendvps
+ { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb
+ { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb
+ { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb
+ { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb
+ };
+
+ static const CostTblEntry SSE2CostTbl[] = {
+ { ISD::SETCC, MVT::v2f64, 2 },
+ { ISD::SETCC, MVT::f64, 1 },
+ { ISD::SETCC, MVT::v2i64, 8 },
+ { ISD::SETCC, MVT::v4i32, 1 },
+ { ISD::SETCC, MVT::v8i16, 1 },
+ { ISD::SETCC, MVT::v16i8, 1 },
+
+ { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd
+ { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por
+ { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por
+ { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por
+ { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por
+ };
+
+ static const CostTblEntry SSE1CostTbl[] = {
+ { ISD::SETCC, MVT::v4f32, 2 },
+ { ISD::SETCC, MVT::f32, 1 },
+
+ { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
+ };
+
+ if (ST->isSLM())
+ if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
+ return LT.first * (ExtraCost + Entry->Cost);
+
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
+ return LT.first * (ExtraCost + Entry->Cost);
+
+ if (ST->hasAVX512())
+ if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
+ return LT.first * (ExtraCost + Entry->Cost);
+
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
+ return LT.first * (ExtraCost + Entry->Cost);
+
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
+ return LT.first * (ExtraCost + Entry->Cost);
+
+ if (ST->hasSSE42())
+ if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
+ return LT.first * (ExtraCost + Entry->Cost);
+
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
+ return LT.first * (ExtraCost + Entry->Cost);
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
+ return LT.first * (ExtraCost + Entry->Cost);
+
+ if (ST->hasSSE1())
+ if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
+ return LT.first * (ExtraCost + Entry->Cost);
+
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
+}
+
+unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
+
+int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
+ const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) {
+
+ // Costs should match the codegen from:
+ // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
+ // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
+ // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
+ // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
+ // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
+
+ // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
+ // specialized in these tables yet.
+ static const CostTblEntry AVX512CDCostTbl[] = {
+ { ISD::CTLZ, MVT::v8i64, 1 },
+ { ISD::CTLZ, MVT::v16i32, 1 },
+ { ISD::CTLZ, MVT::v32i16, 8 },
+ { ISD::CTLZ, MVT::v64i8, 20 },
+ { ISD::CTLZ, MVT::v4i64, 1 },
+ { ISD::CTLZ, MVT::v8i32, 1 },
+ { ISD::CTLZ, MVT::v16i16, 4 },
+ { ISD::CTLZ, MVT::v32i8, 10 },
+ { ISD::CTLZ, MVT::v2i64, 1 },
+ { ISD::CTLZ, MVT::v4i32, 1 },
+ { ISD::CTLZ, MVT::v8i16, 4 },
+ { ISD::CTLZ, MVT::v16i8, 4 },
+ };
+ static const CostTblEntry AVX512BWCostTbl[] = {
+ { ISD::ABS, MVT::v32i16, 1 },
+ { ISD::ABS, MVT::v64i8, 1 },
+ { ISD::BITREVERSE, MVT::v8i64, 5 },
+ { ISD::BITREVERSE, MVT::v16i32, 5 },
+ { ISD::BITREVERSE, MVT::v32i16, 5 },
+ { ISD::BITREVERSE, MVT::v64i8, 5 },
+ { ISD::CTLZ, MVT::v8i64, 23 },
+ { ISD::CTLZ, MVT::v16i32, 22 },
+ { ISD::CTLZ, MVT::v32i16, 18 },
+ { ISD::CTLZ, MVT::v64i8, 17 },
+ { ISD::CTPOP, MVT::v8i64, 7 },
+ { ISD::CTPOP, MVT::v16i32, 11 },
+ { ISD::CTPOP, MVT::v32i16, 9 },
+ { ISD::CTPOP, MVT::v64i8, 6 },
+ { ISD::CTTZ, MVT::v8i64, 10 },
+ { ISD::CTTZ, MVT::v16i32, 14 },
+ { ISD::CTTZ, MVT::v32i16, 12 },
+ { ISD::CTTZ, MVT::v64i8, 9 },
+ { ISD::SADDSAT, MVT::v32i16, 1 },
+ { ISD::SADDSAT, MVT::v64i8, 1 },
+ { ISD::SMAX, MVT::v32i16, 1 },
+ { ISD::SMAX, MVT::v64i8, 1 },
+ { ISD::SMIN, MVT::v32i16, 1 },
+ { ISD::SMIN, MVT::v64i8, 1 },
+ { ISD::SSUBSAT, MVT::v32i16, 1 },
+ { ISD::SSUBSAT, MVT::v64i8, 1 },
+ { ISD::UADDSAT, MVT::v32i16, 1 },
+ { ISD::UADDSAT, MVT::v64i8, 1 },
+ { ISD::UMAX, MVT::v32i16, 1 },
+ { ISD::UMAX, MVT::v64i8, 1 },
+ { ISD::UMIN, MVT::v32i16, 1 },
+ { ISD::UMIN, MVT::v64i8, 1 },
+ { ISD::USUBSAT, MVT::v32i16, 1 },
+ { ISD::USUBSAT, MVT::v64i8, 1 },
+ };
+ static const CostTblEntry AVX512CostTbl[] = {
+ { ISD::ABS, MVT::v8i64, 1 },
+ { ISD::ABS, MVT::v16i32, 1 },
+ { ISD::ABS, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::ABS, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::ABS, MVT::v4i64, 1 },
+ { ISD::ABS, MVT::v2i64, 1 },
+ { ISD::BITREVERSE, MVT::v8i64, 36 },
+ { ISD::BITREVERSE, MVT::v16i32, 24 },
+ { ISD::BITREVERSE, MVT::v32i16, 10 },
+ { ISD::BITREVERSE, MVT::v64i8, 10 },
+ { ISD::CTLZ, MVT::v8i64, 29 },
+ { ISD::CTLZ, MVT::v16i32, 35 },
+ { ISD::CTLZ, MVT::v32i16, 28 },
+ { ISD::CTLZ, MVT::v64i8, 18 },
+ { ISD::CTPOP, MVT::v8i64, 16 },
+ { ISD::CTPOP, MVT::v16i32, 24 },
+ { ISD::CTPOP, MVT::v32i16, 18 },
+ { ISD::CTPOP, MVT::v64i8, 12 },
+ { ISD::CTTZ, MVT::v8i64, 20 },
+ { ISD::CTTZ, MVT::v16i32, 28 },
+ { ISD::CTTZ, MVT::v32i16, 24 },
+ { ISD::CTTZ, MVT::v64i8, 18 },
+ { ISD::SMAX, MVT::v8i64, 1 },
+ { ISD::SMAX, MVT::v16i32, 1 },
+ { ISD::SMAX, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::SMAX, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::SMAX, MVT::v4i64, 1 },
+ { ISD::SMAX, MVT::v2i64, 1 },
+ { ISD::SMIN, MVT::v8i64, 1 },
+ { ISD::SMIN, MVT::v16i32, 1 },
+ { ISD::SMIN, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::SMIN, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::SMIN, MVT::v4i64, 1 },
+ { ISD::SMIN, MVT::v2i64, 1 },
+ { ISD::UMAX, MVT::v8i64, 1 },
+ { ISD::UMAX, MVT::v16i32, 1 },
+ { ISD::UMAX, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::UMAX, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::UMAX, MVT::v4i64, 1 },
+ { ISD::UMAX, MVT::v2i64, 1 },
+ { ISD::UMIN, MVT::v8i64, 1 },
+ { ISD::UMIN, MVT::v16i32, 1 },
+ { ISD::UMIN, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::UMIN, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::UMIN, MVT::v4i64, 1 },
+ { ISD::UMIN, MVT::v2i64, 1 },
+ { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
+ { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
+ { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
+ { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
+ { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
+ { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
+ { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
+ { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
+ { ISD::SADDSAT, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::SADDSAT, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::SSUBSAT, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::SSUBSAT, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::UADDSAT, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::UADDSAT, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::USUBSAT, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::USUBSAT, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::FMAXNUM, MVT::f32, 2 },
+ { ISD::FMAXNUM, MVT::v4f32, 2 },
+ { ISD::FMAXNUM, MVT::v8f32, 2 },
+ { ISD::FMAXNUM, MVT::v16f32, 2 },
+ { ISD::FMAXNUM, MVT::f64, 2 },
+ { ISD::FMAXNUM, MVT::v2f64, 2 },
+ { ISD::FMAXNUM, MVT::v4f64, 2 },
+ { ISD::FMAXNUM, MVT::v8f64, 2 },
+ };
+ static const CostTblEntry XOPCostTbl[] = {
+ { ISD::BITREVERSE, MVT::v4i64, 4 },
+ { ISD::BITREVERSE, MVT::v8i32, 4 },
+ { ISD::BITREVERSE, MVT::v16i16, 4 },
+ { ISD::BITREVERSE, MVT::v32i8, 4 },
+ { ISD::BITREVERSE, MVT::v2i64, 1 },
+ { ISD::BITREVERSE, MVT::v4i32, 1 },
+ { ISD::BITREVERSE, MVT::v8i16, 1 },
+ { ISD::BITREVERSE, MVT::v16i8, 1 },
+ { ISD::BITREVERSE, MVT::i64, 3 },
+ { ISD::BITREVERSE, MVT::i32, 3 },
+ { ISD::BITREVERSE, MVT::i16, 3 },
+ { ISD::BITREVERSE, MVT::i8, 3 }
+ };
+ static const CostTblEntry AVX2CostTbl[] = {
+ { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
+ { ISD::ABS, MVT::v8i32, 1 },
+ { ISD::ABS, MVT::v16i16, 1 },
+ { ISD::ABS, MVT::v32i8, 1 },
+ { ISD::BITREVERSE, MVT::v4i64, 5 },
+ { ISD::BITREVERSE, MVT::v8i32, 5 },
+ { ISD::BITREVERSE, MVT::v16i16, 5 },
+ { ISD::BITREVERSE, MVT::v32i8, 5 },
+ { ISD::BSWAP, MVT::v4i64, 1 },
+ { ISD::BSWAP, MVT::v8i32, 1 },
+ { ISD::BSWAP, MVT::v16i16, 1 },
+ { ISD::CTLZ, MVT::v4i64, 23 },
+ { ISD::CTLZ, MVT::v8i32, 18 },
+ { ISD::CTLZ, MVT::v16i16, 14 },
+ { ISD::CTLZ, MVT::v32i8, 9 },
+ { ISD::CTPOP, MVT::v4i64, 7 },
+ { ISD::CTPOP, MVT::v8i32, 11 },
+ { ISD::CTPOP, MVT::v16i16, 9 },
+ { ISD::CTPOP, MVT::v32i8, 6 },
+ { ISD::CTTZ, MVT::v4i64, 10 },
+ { ISD::CTTZ, MVT::v8i32, 14 },
+ { ISD::CTTZ, MVT::v16i16, 12 },
+ { ISD::CTTZ, MVT::v32i8, 9 },
+ { ISD::SADDSAT, MVT::v16i16, 1 },
+ { ISD::SADDSAT, MVT::v32i8, 1 },
+ { ISD::SMAX, MVT::v8i32, 1 },
+ { ISD::SMAX, MVT::v16i16, 1 },
+ { ISD::SMAX, MVT::v32i8, 1 },
+ { ISD::SMIN, MVT::v8i32, 1 },
+ { ISD::SMIN, MVT::v16i16, 1 },
+ { ISD::SMIN, MVT::v32i8, 1 },
+ { ISD::SSUBSAT, MVT::v16i16, 1 },
+ { ISD::SSUBSAT, MVT::v32i8, 1 },
+ { ISD::UADDSAT, MVT::v16i16, 1 },
+ { ISD::UADDSAT, MVT::v32i8, 1 },
+ { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
+ { ISD::UMAX, MVT::v8i32, 1 },
+ { ISD::UMAX, MVT::v16i16, 1 },
+ { ISD::UMAX, MVT::v32i8, 1 },
+ { ISD::UMIN, MVT::v8i32, 1 },
+ { ISD::UMIN, MVT::v16i16, 1 },
+ { ISD::UMIN, MVT::v32i8, 1 },
+ { ISD::USUBSAT, MVT::v16i16, 1 },
+ { ISD::USUBSAT, MVT::v32i8, 1 },
+ { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
+ { ISD::FMAXNUM, MVT::v8f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
+ { ISD::FMAXNUM, MVT::v4f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
+ { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
+ { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
+ { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
+ { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
+ { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
+ { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
+ };
+ static const CostTblEntry AVX1CostTbl[] = {
+ { ISD::ABS, MVT::v4i64, 5 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
+ { ISD::ABS, MVT::v8i32, 3 },
+ { ISD::ABS, MVT::v16i16, 3 },
+ { ISD::ABS, MVT::v32i8, 3 },
+ { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
+ { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
+ { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
+ { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
+ { ISD::BSWAP, MVT::v4i64, 4 },
+ { ISD::BSWAP, MVT::v8i32, 4 },
+ { ISD::BSWAP, MVT::v16i16, 4 },
+ { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
+ { ISD::FMAXNUM, MVT::f32, 3 }, // MAXSS + CMPUNORDSS + BLENDVPS
+ { ISD::FMAXNUM, MVT::v4f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
+ { ISD::FMAXNUM, MVT::v8f32, 5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ?
+ { ISD::FMAXNUM, MVT::f64, 3 }, // MAXSD + CMPUNORDSD + BLENDVPD
+ { ISD::FMAXNUM, MVT::v2f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
+ { ISD::FMAXNUM, MVT::v4f64, 5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ?
+ { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
+ { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
+ { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
+ { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
+ { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
+ { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
+ };
+ static const CostTblEntry GLMCostTbl[] = {
+ { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
+ { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
+ { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
+ { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
+ };
+ static const CostTblEntry SLMCostTbl[] = {
+ { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
+ { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
+ { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
+ { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
+ };
+ static const CostTblEntry SSE42CostTbl[] = {
+ { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
+ { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd
+ { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
+ { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
+ };
+ static const CostTblEntry SSE41CostTbl[] = {
+ { ISD::ABS, MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X)
+ { ISD::SMAX, MVT::v4i32, 1 },
+ { ISD::SMAX, MVT::v16i8, 1 },
+ { ISD::SMIN, MVT::v4i32, 1 },
+ { ISD::SMIN, MVT::v16i8, 1 },
+ { ISD::UMAX, MVT::v4i32, 1 },
+ { ISD::UMAX, MVT::v8i16, 1 },
+ { ISD::UMIN, MVT::v4i32, 1 },
+ { ISD::UMIN, MVT::v8i16, 1 },
+ };
+ static const CostTblEntry SSSE3CostTbl[] = {
+ { ISD::ABS, MVT::v4i32, 1 },
+ { ISD::ABS, MVT::v8i16, 1 },
+ { ISD::ABS, MVT::v16i8, 1 },
+ { ISD::BITREVERSE, MVT::v2i64, 5 },
+ { ISD::BITREVERSE, MVT::v4i32, 5 },
+ { ISD::BITREVERSE, MVT::v8i16, 5 },
+ { ISD::BITREVERSE, MVT::v16i8, 5 },
+ { ISD::BSWAP, MVT::v2i64, 1 },
+ { ISD::BSWAP, MVT::v4i32, 1 },
+ { ISD::BSWAP, MVT::v8i16, 1 },
+ { ISD::CTLZ, MVT::v2i64, 23 },
+ { ISD::CTLZ, MVT::v4i32, 18 },
+ { ISD::CTLZ, MVT::v8i16, 14 },
+ { ISD::CTLZ, MVT::v16i8, 9 },
+ { ISD::CTPOP, MVT::v2i64, 7 },
+ { ISD::CTPOP, MVT::v4i32, 11 },
+ { ISD::CTPOP, MVT::v8i16, 9 },
+ { ISD::CTPOP, MVT::v16i8, 6 },
+ { ISD::CTTZ, MVT::v2i64, 10 },
+ { ISD::CTTZ, MVT::v4i32, 14 },
+ { ISD::CTTZ, MVT::v8i16, 12 },
+ { ISD::CTTZ, MVT::v16i8, 9 }
+ };
+ static const CostTblEntry SSE2CostTbl[] = {
+ { ISD::ABS, MVT::v2i64, 4 },
+ { ISD::ABS, MVT::v4i32, 3 },
+ { ISD::ABS, MVT::v8i16, 2 },
+ { ISD::ABS, MVT::v16i8, 2 },
+ { ISD::BITREVERSE, MVT::v2i64, 29 },
+ { ISD::BITREVERSE, MVT::v4i32, 27 },
+ { ISD::BITREVERSE, MVT::v8i16, 27 },
+ { ISD::BITREVERSE, MVT::v16i8, 20 },
+ { ISD::BSWAP, MVT::v2i64, 7 },
+ { ISD::BSWAP, MVT::v4i32, 7 },
+ { ISD::BSWAP, MVT::v8i16, 7 },
+ { ISD::CTLZ, MVT::v2i64, 25 },
+ { ISD::CTLZ, MVT::v4i32, 26 },
+ { ISD::CTLZ, MVT::v8i16, 20 },
+ { ISD::CTLZ, MVT::v16i8, 17 },
+ { ISD::CTPOP, MVT::v2i64, 12 },
+ { ISD::CTPOP, MVT::v4i32, 15 },
+ { ISD::CTPOP, MVT::v8i16, 13 },
+ { ISD::CTPOP, MVT::v16i8, 10 },
+ { ISD::CTTZ, MVT::v2i64, 14 },
+ { ISD::CTTZ, MVT::v4i32, 18 },
+ { ISD::CTTZ, MVT::v8i16, 16 },
+ { ISD::CTTZ, MVT::v16i8, 13 },
+ { ISD::SADDSAT, MVT::v8i16, 1 },
+ { ISD::SADDSAT, MVT::v16i8, 1 },
+ { ISD::SMAX, MVT::v8i16, 1 },
+ { ISD::SMIN, MVT::v8i16, 1 },
+ { ISD::SSUBSAT, MVT::v8i16, 1 },
+ { ISD::SSUBSAT, MVT::v16i8, 1 },
+ { ISD::UADDSAT, MVT::v8i16, 1 },
+ { ISD::UADDSAT, MVT::v16i8, 1 },
+ { ISD::UMAX, MVT::v8i16, 2 },
+ { ISD::UMAX, MVT::v16i8, 1 },
+ { ISD::UMIN, MVT::v8i16, 2 },
+ { ISD::UMIN, MVT::v16i8, 1 },
+ { ISD::USUBSAT, MVT::v8i16, 1 },
+ { ISD::USUBSAT, MVT::v16i8, 1 },
+ { ISD::FMAXNUM, MVT::f64, 4 },
+ { ISD::FMAXNUM, MVT::v2f64, 4 },
+ { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
+ { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
+ };
+ static const CostTblEntry SSE1CostTbl[] = {
+ { ISD::FMAXNUM, MVT::f32, 4 },
+ { ISD::FMAXNUM, MVT::v4f32, 4 },
+ { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
+ { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
+ };
+ static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets
+ { ISD::CTTZ, MVT::i64, 1 },
+ };
+ static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
+ { ISD::CTTZ, MVT::i32, 1 },
+ { ISD::CTTZ, MVT::i16, 1 },
+ { ISD::CTTZ, MVT::i8, 1 },
+ };
+ static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
+ { ISD::CTLZ, MVT::i64, 1 },
+ };
+ static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
+ { ISD::CTLZ, MVT::i32, 1 },
+ { ISD::CTLZ, MVT::i16, 1 },
+ { ISD::CTLZ, MVT::i8, 1 },
+ };
+ static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
+ { ISD::CTPOP, MVT::i64, 1 },
+ };
+ static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
+ { ISD::CTPOP, MVT::i32, 1 },
+ { ISD::CTPOP, MVT::i16, 1 },
+ { ISD::CTPOP, MVT::i8, 1 },
+ };
+ static const CostTblEntry X64CostTbl[] = { // 64-bit targets
+ { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV
+ { ISD::BITREVERSE, MVT::i64, 14 },
+ { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH
+ { ISD::CTPOP, MVT::i64, 10 },
+ { ISD::SADDO, MVT::i64, 1 },
+ { ISD::UADDO, MVT::i64, 1 },
+ { ISD::UMULO, MVT::i64, 2 }, // mulq + seto
+ };
+ static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
+ { ISD::ABS, MVT::i32, 2 }, // SUB+CMOV
+ { ISD::ABS, MVT::i16, 2 }, // SUB+CMOV
+ { ISD::BITREVERSE, MVT::i32, 14 },
+ { ISD::BITREVERSE, MVT::i16, 14 },
+ { ISD::BITREVERSE, MVT::i8, 11 },
+ { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH
+ { ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH
+ { ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH
+ { ISD::CTPOP, MVT::i32, 8 },
+ { ISD::CTPOP, MVT::i16, 9 },
+ { ISD::CTPOP, MVT::i8, 7 },
+ { ISD::SADDO, MVT::i32, 1 },
+ { ISD::SADDO, MVT::i16, 1 },
+ { ISD::SADDO, MVT::i8, 1 },
+ { ISD::UADDO, MVT::i32, 1 },
+ { ISD::UADDO, MVT::i16, 1 },
+ { ISD::UADDO, MVT::i8, 1 },
+ { ISD::UMULO, MVT::i32, 2 }, // mul + seto
+ { ISD::UMULO, MVT::i16, 2 },
+ { ISD::UMULO, MVT::i8, 2 },
+ };
+
+ Type *RetTy = ICA.getReturnType();
+ Type *OpTy = RetTy;
+ Intrinsic::ID IID = ICA.getID();
+ unsigned ISD = ISD::DELETED_NODE;
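+ // Map the intrinsic onto the ISD opcode used to key the cost tables; any
+ // intrinsic left as DELETED_NODE falls through to the generic cost model.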
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::abs:
+ ISD = ISD::ABS;
+ break;
+ case Intrinsic::bitreverse:
+ ISD = ISD::BITREVERSE;
+ break;
+ case Intrinsic::bswap:
+ ISD = ISD::BSWAP;
+ break;
+ case Intrinsic::ctlz:
+ ISD = ISD::CTLZ;
+ break;
+ case Intrinsic::ctpop:
+ ISD = ISD::CTPOP;
+ break;
+ case Intrinsic::cttz:
+ ISD = ISD::CTTZ;
+ break;
+ case Intrinsic::maxnum:
+ case Intrinsic::minnum:
+ // FMINNUM has same costs so don't duplicate.
+ ISD = ISD::FMAXNUM;
+ break;
+ case Intrinsic::sadd_sat:
+ ISD = ISD::SADDSAT;
+ break;
+ case Intrinsic::smax:
+ ISD = ISD::SMAX;
+ break;
+ case Intrinsic::smin:
+ ISD = ISD::SMIN;
+ break;
+ case Intrinsic::ssub_sat:
+ ISD = ISD::SSUBSAT;
+ break;
+ case Intrinsic::uadd_sat:
+ ISD = ISD::UADDSAT;
+ break;
+ case Intrinsic::umax:
+ ISD = ISD::UMAX;
+ break;
+ case Intrinsic::umin:
+ ISD = ISD::UMIN;
+ break;
+ case Intrinsic::usub_sat:
+ ISD = ISD::USUBSAT;
+ break;
+ case Intrinsic::sqrt:
+ ISD = ISD::FSQRT;
+ break;
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ // SSUBO has same costs so don't duplicate.
+ ISD = ISD::SADDO;
+ OpTy = RetTy->getContainedType(0);
+ break;
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ // USUBO has same costs so don't duplicate.
+ ISD = ISD::UADDO;
+ OpTy = RetTy->getContainedType(0);
+ break;
+ case Intrinsic::umul_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ // SMULO has same costs so don't duplicate.
+ ISD = ISD::UMULO;
+ OpTy = RetTy->getContainedType(0);
+ break;
+ }
+
+ if (ISD != ISD::DELETED_NODE) {
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
+ MVT MTy = LT.second;
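+ // LT.first is the number of legal-type pieces the operation is split into;
+ // adjustTableCost below scales the per-piece table cost by it.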
+
+ // Attempt to lookup cost.
+ if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() &&
+ MTy.isVector()) {
+ // With PSHUFB the code is very similar for all types. If we have integer
+ // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types
+ // we also need a PSHUFB.
+ unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2;
+
+ // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB
+ // instructions. We also need an extract and an insert.
+ if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) ||
+ (ST->hasBWI() && MTy.is512BitVector())))
+ Cost = Cost * 2 + 2;
+
+ return LT.first * Cost;
+ }
+
+ auto adjustTableCost = [](const CostTblEntry &Entry, int LegalizationCost,
+ FastMathFlags FMF) {
+ // If there are no NaNs to deal with, then these are reduced to a
+ // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
+ // assume is used in the non-fast case.
+ if (Entry.ISD == ISD::FMAXNUM || Entry.ISD == ISD::FMINNUM) {
+ if (FMF.noNaNs())
+ return LegalizationCost * 1;
+ }
+ return LegalizationCost * (int)Entry.Cost;
+ };
+
+ if (ST->useGLMDivSqrtCosts())
+ if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+
+ if (ST->isSLM())
+ if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+
+ if (ST->hasCDI())
+ if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+
+ if (ST->hasAVX512())
+ if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+
+ if (ST->hasXOP())
+ if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+
+ if (ST->hasSSE42())
+ if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+
+ if (ST->hasSSSE3())
+ if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+
+ if (ST->hasSSE1())
+ if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+
+ if (ST->hasBMI()) {
+ if (ST->is64Bit())
+ if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+
+ if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ }
+
+ if (ST->hasLZCNT()) {
+ if (ST->is64Bit())
+ if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+
+ if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ }
+
+ if (ST->hasPOPCNT()) {
+ if (ST->is64Bit())
+ if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+
+ if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ }
+
+ // TODO - add BMI (TZCNT) scalar handling
+
+ if (ST->is64Bit())
+ if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+
+ if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ }
+
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+}
+
+int X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
+ if (ICA.isTypeBasedOnly())
+ return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
+
+ static const CostTblEntry AVX512CostTbl[] = {
+ { ISD::ROTL, MVT::v8i64, 1 },
+ { ISD::ROTL, MVT::v4i64, 1 },
+ { ISD::ROTL, MVT::v2i64, 1 },
+ { ISD::ROTL, MVT::v16i32, 1 },
+ { ISD::ROTL, MVT::v8i32, 1 },
+ { ISD::ROTL, MVT::v4i32, 1 },
+ { ISD::ROTR, MVT::v8i64, 1 },
+ { ISD::ROTR, MVT::v4i64, 1 },
+ { ISD::ROTR, MVT::v2i64, 1 },
+ { ISD::ROTR, MVT::v16i32, 1 },
+ { ISD::ROTR, MVT::v8i32, 1 },
+ { ISD::ROTR, MVT::v4i32, 1 }
+ };
+ // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
+ static const CostTblEntry XOPCostTbl[] = {
+ { ISD::ROTL, MVT::v4i64, 4 },
+ { ISD::ROTL, MVT::v8i32, 4 },
+ { ISD::ROTL, MVT::v16i16, 4 },
+ { ISD::ROTL, MVT::v32i8, 4 },
+ { ISD::ROTL, MVT::v2i64, 1 },
+ { ISD::ROTL, MVT::v4i32, 1 },
+ { ISD::ROTL, MVT::v8i16, 1 },
+ { ISD::ROTL, MVT::v16i8, 1 },
+ { ISD::ROTR, MVT::v4i64, 6 },
+ { ISD::ROTR, MVT::v8i32, 6 },
+ { ISD::ROTR, MVT::v16i16, 6 },
+ { ISD::ROTR, MVT::v32i8, 6 },
+ { ISD::ROTR, MVT::v2i64, 2 },
+ { ISD::ROTR, MVT::v4i32, 2 },
+ { ISD::ROTR, MVT::v8i16, 2 },
+ { ISD::ROTR, MVT::v16i8, 2 }
+ };
+ static const CostTblEntry X64CostTbl[] = { // 64-bit targets
+ { ISD::ROTL, MVT::i64, 1 },
+ { ISD::ROTR, MVT::i64, 1 },
+ { ISD::FSHL, MVT::i64, 4 }
+ };
+ static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
+ { ISD::ROTL, MVT::i32, 1 },
+ { ISD::ROTL, MVT::i16, 1 },
+ { ISD::ROTL, MVT::i8, 1 },
+ { ISD::ROTR, MVT::i32, 1 },
+ { ISD::ROTR, MVT::i16, 1 },
+ { ISD::ROTR, MVT::i8, 1 },
+ { ISD::FSHL, MVT::i32, 4 },
+ { ISD::FSHL, MVT::i16, 4 },
+ { ISD::FSHL, MVT::i8, 4 }
+ };
+
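+ // A funnel shift whose two value operands are the same is a rotate, which
+ // has much cheaper lowerings, so classify it separately below.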
+ Intrinsic::ID IID = ICA.getID();
+ Type *RetTy = ICA.getReturnType();
+ const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
+ unsigned ISD = ISD::DELETED_NODE;
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::fshl:
+ ISD = ISD::FSHL;
+ if (Args[0] == Args[1])
+ ISD = ISD::ROTL;
+ break;
+ case Intrinsic::fshr:
+ // FSHR has same costs so don't duplicate.
+ ISD = ISD::FSHL;
+ if (Args[0] == Args[1])
+ ISD = ISD::ROTR;
+ break;
+ }
+
+ if (ISD != ISD::DELETED_NODE) {
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
+ MVT MTy = LT.second;
+
+ // Attempt to lookup cost.
+ if (ST->hasAVX512())
+ if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasXOP())
+ if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->is64Bit())
+ if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+ }
+
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+}
+
+int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+ static const CostTblEntry SLMCostTbl[] = {
+ { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
+ { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
+ { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
+ { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
+ };
+
+ assert(Val->isVectorTy() && "This must be a vector type");
+ Type *ScalarType = Val->getScalarType();
+ int RegisterFileMoveCost = 0;
+
+ if (Index != -1U && (Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::InsertElement)) {
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
+
+ // This type is legalized to a scalar type.
+ if (!LT.second.isVector())
+ return 0;
+
+ // The type may be split. Normalize the index to the new type.
+ unsigned NumElts = LT.second.getVectorNumElements();
+ unsigned SubNumElts = NumElts;
+ Index = Index % NumElts;
+
+ // For >128-bit vectors, we need to extract higher 128-bit subvectors.
+ // For inserts, we also need to insert the subvector back.
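+ // For example, inserting into element 5 of a v8i32: the element lives in
+ // the upper 128-bit subvector, so we pay to extract that subvector (plus an
+ // insert of it back for insertions), and the element is then treated as
+ // index 1 within the subvector.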
+ if (LT.second.getSizeInBits() > 128) {
+ assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector");
+ unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
+ SubNumElts = NumElts / NumSubVecs;
+ if (SubNumElts <= Index) {
+ RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
+ Index %= SubNumElts;
+ }
+ }
+
+ if (Index == 0) {
+ // Floating point scalars are already located in index #0.
+ // Many insertions to #0 can fold away for scalar fp-ops, so assume this
+ // holds for all of them.
+ if (ScalarType->isFloatingPointTy())
+ return RegisterFileMoveCost;
+
+ // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
+ if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
+ return 1 + RegisterFileMoveCost;
+ }
+
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Unexpected vector opcode");
+ MVT MScalarTy = LT.second.getScalarType();
+ if (ST->isSLM())
+ if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
+ return Entry->Cost + RegisterFileMoveCost;
+
+ // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
+ if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
+ (MScalarTy.isInteger() && ST->hasSSE41()))
+ return 1 + RegisterFileMoveCost;
+
+ // Assume insertps is relatively cheap on all targets.
+ if (MScalarTy == MVT::f32 && ST->hasSSE41() &&
+ Opcode == Instruction::InsertElement)
+ return 1 + RegisterFileMoveCost;
+
+ // For extractions we just need to shuffle the element to index 0, which
+ // should be very cheap (assume cost = 1). For insertions we need to shuffle
+ // the element into its destination lane. In both cases we must handle the
+ // subvector move(s).
+ // If the vector type is already less than 128 bits then don't reduce it.
+ // TODO: Under what circumstances should we shuffle using the full width?
+ int ShuffleCost = 1;
+ if (Opcode == Instruction::InsertElement) {
+ auto *SubTy = cast<VectorType>(Val);
+ EVT VT = TLI->getValueType(DL, Val);
+ if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
+ SubTy = FixedVectorType::get(ScalarType, SubNumElts);
+ ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, 0, SubTy);
+ }
+ int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
+ return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
+ }
+
+ // Add to the base cost if we know that the extracted element of a vector is
+ // destined to be moved to and used in the integer register file.
+ if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
+ RegisterFileMoveCost += 1;
+
+ return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
+}
+
+unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract) {
+ unsigned Cost = 0;
+
+ // For insertions, an ISD::BUILD_VECTOR-style vector initialization can be
+ // much cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
+ if (Insert) {
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ MVT MScalarTy = LT.second.getScalarType();
+
+ if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
+ (MScalarTy.isInteger() && ST->hasSSE41()) ||
+ (MScalarTy == MVT::f32 && ST->hasSSE41())) {
+ // For types we can insert directly, insertion into 128-bit subvectors is
+ // cheap, followed by a cheap chain of concatenations.
+ if (LT.second.getSizeInBits() <= 128) {
+ Cost +=
+ BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
+ } else {
+ // For each 128-bit lane: if at least one index is demanded but not all
+ // indices are demanded, and the lane is not the first 128-bit lane of the
+ // legalized vector, then the lane needs an extracti128; if at least one
+ // index in the lane is demanded at all, the lane needs an inserti128.
+
+ // The following cases help build a better understanding. Assume we insert
+ // several elements into a v8i32 vector under AVX2:
+ // Case #1: inserting into index 1 needs vpinsrd + inserti128.
+ // Case #2: inserting into index 5 needs extracti128 + vpinsrd +
+ // inserti128.
+ // Case #3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
+ unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * LT.first;
+ unsigned NumElts = LT.second.getVectorNumElements() * LT.first;
+ APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts);
+ unsigned Scale = NumElts / Num128Lanes;
+ // Iterate over each 128-bit lane and check whether it needs an
+ // extracti128/inserti128.
+ for (unsigned I = 0; I < NumElts; I += Scale) {
+ APInt Mask = WidenedDemandedElts.getBitsSet(NumElts, I, I + Scale);
+ APInt MaskedDE = Mask & WidenedDemandedElts;
+ unsigned Population = MaskedDE.countPopulation();
+ Cost += (Population > 0 && Population != Scale &&
+ I % LT.second.getVectorNumElements() != 0);
+ Cost += Population > 0;
+ }
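+ // Plus one scalar insertion per demanded element, on top of the subvector
+ // moves counted above.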
+ Cost += DemandedElts.countPopulation();
+
+ // For vXf32 cases, insertion into the 0th index of each v4f32
+ // 128-bit vector is free.
+ // NOTE: This assumes legalization widens vXf32 vectors.
+ if (MScalarTy == MVT::f32)
+ for (unsigned i = 0, e = cast<FixedVectorType>(Ty)->getNumElements();
+ i < e; i += 4)
+ if (DemandedElts[i])
+ Cost--;
+ }
+ } else if (LT.second.isVector()) {
+ // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
+ // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
+ // series of UNPCK followed by CONCAT_VECTORS - all of these can be
+ // considered cheap.
+ if (Ty->isIntOrIntVectorTy())
+ Cost += DemandedElts.countPopulation();
+
+ // Get the smaller of the legalized or original pow2-extended number of
+ // vector elements, which represents the number of unpacks we'll end up
+ // performing.
+ unsigned NumElts = LT.second.getVectorNumElements();
+ unsigned Pow2Elts =
+ PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
+ Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
+ }
+ }
+
+ // TODO: Use default extraction for now, but we should investigate extending this
+ // to handle repeated subvector extraction.
+ if (Extract)
+ Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
+
+ return Cost;
+}
+
+int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+ MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
+ // TODO: Handle other cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput) {
+ if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
+ // A store instruction with index and scale addressing costs 2 uops.
+ // Check the preceding GEP to identify non-constant indices.
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
+ if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
+ return TTI::TCC_Basic * 2;
+ }
+ }
+ return TTI::TCC_Basic;
+ }
+
+ // Handle non-power-of-two vectors such as <3 x float>
+ if (auto *VTy = dyn_cast<FixedVectorType>(Src)) {
+ unsigned NumElem = VTy->getNumElements();
+
+ // Handle a few common cases:
+ // <3 x float>
+ if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
+ // Cost = 64 bit store + extract + 32 bit store.
+ return 3;
+
+ // <3 x double>
+ if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
+ // Cost = 128 bit store + unpack + 64 bit store.
+ return 3;
+
+ // Assume that all other non-power-of-two numbers are scalarized.
+ if (!isPowerOf2_32(NumElem)) {
+ APInt DemandedElts = APInt::getAllOnesValue(NumElem);
+ int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
+ AddressSpace, CostKind);
+ int SplitCost = getScalarizationOverhead(VTy, DemandedElts,
+ Opcode == Instruction::Load,
+ Opcode == Instruction::Store);
+ return NumElem * Cost + SplitCost;
+ }
+ }
+
+ // Type legalization can't handle structs
+ if (TLI->getValueType(DL, Src, true) == MVT::Other)
+ return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind);
+
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+ assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+ "Invalid Opcode");
+
+ // Each load/store unit costs 1.
+ int Cost = LT.first * 1;
+
+ // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
+ // proxy for a double-pumped AVX memory interface such as on Sandybridge.
+ if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
+ Cost *= 2;
+
+ return Cost;
+}
+
+int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
+ Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind) {
+ bool IsLoad = (Instruction::Load == Opcode);
+ bool IsStore = (Instruction::Store == Opcode);
+
+ auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
+ if (!SrcVTy)
+ // For a scalar, take the regular memory-op cost, without the mask.
+ return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
+
+ unsigned NumElem = SrcVTy->getNumElements();
+ auto *MaskTy =
+ FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
+ if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
+ (IsStore && !isLegalMaskedStore(SrcVTy, Alignment)) ||
+ !isPowerOf2_32(NumElem)) {
+ // Scalarization
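+ // Lower the masked op as a per-element test of the mask (compare + branch)
+ // guarding a scalar load/store, plus the cost of extracting the mask bits
+ // and of scalarizing / rebuilding the value vector.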
+ APInt DemandedElts = APInt::getAllOnesValue(NumElem);
+ int MaskSplitCost =
+ getScalarizationOverhead(MaskTy, DemandedElts, false, true);
+ int ScalarCompareCost = getCmpSelInstrCost(
+ Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ int BranchCost = getCFInstrCost(Instruction::Br, CostKind);
+ int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
+ int ValueSplitCost =
+ getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
+ int MemopCost =
+ NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+ Alignment, AddressSpace, CostKind);
+ return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
+ }
+
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
+ auto VT = TLI->getValueType(DL, SrcVTy);
+ int Cost = 0;
+ if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
+ LT.second.getVectorNumElements() == NumElem)
+ // Promotion requires an expand/truncate for the data and a shuffle for the mask.
+ Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, 0, nullptr) +
+ getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr);
+
+ else if (LT.second.getVectorNumElements() > NumElem) {
+ auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
+ LT.second.getVectorNumElements());
+ // Expanding requires filling the mask with zeroes.
+ Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
+ }
+
+ // Pre-AVX512, each maskmov load costs 2 and each maskmov store costs ~8.
+ if (!ST->hasAVX512())
+ return Cost + LT.first * (IsLoad ? 2 : 8);
+
+ // AVX-512 masked load/store is cheaper.
+ return Cost + LT.first;
+}
+
+int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
+ const SCEV *Ptr) {
+ // Address computations in vectorized code with non-consecutive addresses will
+ // likely result in more instructions compared to scalar code where the
+ // computation can more often be merged into the index mode. The resulting
+ // extra micro-ops can significantly decrease throughput.
+ const unsigned NumVectorInstToHideOverhead = 10;
+
+ // The cost of computing a strided access is hidden by X86's addressing
+ // modes regardless of the stride value. We don't believe there is a
+ // difference between constant strided access in general and a constant
+ // stride value that is less than or equal to 64.
+ // Even in the case of a (loop-invariant) stride whose value is not known at
+ // compile time, the address computation will not incur more than one extra
+ // ADD instruction.
+ if (Ty->isVectorTy() && SE) {
+ if (!BaseT::isStridedAccess(Ptr))
+ return NumVectorInstToHideOverhead;
+ if (!BaseT::getConstantStrideStep(SE, Ptr))
+ return 1;
+ }
+
+ return BaseT::getAddressComputationCost(Ty, SE, Ptr);
+}
+
+int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
+ bool IsPairwise,
+ TTI::TargetCostKind CostKind) {
+ // Just use the default implementation for pair reductions.
+ if (IsPairwise)
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise, CostKind);
+
+ // We use the Intel Architecture Code Analyzer (IACA) to measure the
+ // throughput and use that measurement as the cost.
+
+ static const CostTblEntry SLMCostTblNoPairWise[] = {
+ { ISD::FADD, MVT::v2f64, 3 },
+ { ISD::ADD, MVT::v2i64, 5 },
+ };
+
+ static const CostTblEntry SSE2CostTblNoPairWise[] = {
+ { ISD::FADD, MVT::v2f64, 2 },
+ { ISD::FADD, MVT::v4f32, 4 },
+ { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
+ { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
+ { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
+ { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
+ { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
+ { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
+ { ISD::ADD, MVT::v2i8, 2 },
+ { ISD::ADD, MVT::v4i8, 2 },
+ { ISD::ADD, MVT::v8i8, 2 },
+ { ISD::ADD, MVT::v16i8, 3 },
+ };
+
+ static const CostTblEntry AVX1CostTblNoPairWise[] = {
+ { ISD::FADD, MVT::v4f64, 3 },
+ { ISD::FADD, MVT::v4f32, 3 },
+ { ISD::FADD, MVT::v8f32, 4 },
+ { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
+ { ISD::ADD, MVT::v4i64, 3 },
+ { ISD::ADD, MVT::v8i32, 5 },
+ { ISD::ADD, MVT::v16i16, 5 },
+ { ISD::ADD, MVT::v32i8, 4 },
+ };
+
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ // Before legalizing the type, give a chance to look up illegal narrow types
+ // in the table.
+ // FIXME: Is there a better way to do this?
+ EVT VT = TLI->getValueType(DL, ValTy);
+ if (VT.isSimple()) {
+ MVT MTy = VT.getSimpleVT();
+ if (ST->isSLM())
+ if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
+
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
+ }
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+
+ MVT MTy = LT.second;
+
+ auto *ValVTy = cast<FixedVectorType>(ValTy);
+
+ unsigned ArithmeticCost = 0;
+ if (LT.first != 1 && MTy.isVector() &&
+ MTy.getVectorNumElements() < ValVTy->getNumElements()) {
+ // Type needs to be split. We need LT.first - 1 arithmetic ops.
+ auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
+ MTy.getVectorNumElements());
+ ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
+ ArithmeticCost *= LT.first - 1;
+ }
+
+ if (ST->isSLM())
+ if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
+ return ArithmeticCost + Entry->Cost;
+
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+ return ArithmeticCost + Entry->Cost;
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
+ return ArithmeticCost + Entry->Cost;
+
+ // FIXME: These assume a naive kshift+binop lowering, which is probably
+ // conservative in most cases.
+ static const CostTblEntry AVX512BoolReduction[] = {
+ { ISD::AND, MVT::v2i1, 3 },
+ { ISD::AND, MVT::v4i1, 5 },
+ { ISD::AND, MVT::v8i1, 7 },
+ { ISD::AND, MVT::v16i1, 9 },
+ { ISD::AND, MVT::v32i1, 11 },
+ { ISD::AND, MVT::v64i1, 13 },
+ { ISD::OR, MVT::v2i1, 3 },
+ { ISD::OR, MVT::v4i1, 5 },
+ { ISD::OR, MVT::v8i1, 7 },
+ { ISD::OR, MVT::v16i1, 9 },
+ { ISD::OR, MVT::v32i1, 11 },
+ { ISD::OR, MVT::v64i1, 13 },
+ };
+
+ static const CostTblEntry AVX2BoolReduction[] = {
+ { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
+ { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
+ { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
+ { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
+ };
+
+ static const CostTblEntry AVX1BoolReduction[] = {
+ { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
+ { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
+ { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
+ { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
+ { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
+ { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
+ { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
+ { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
+ };
+
+ static const CostTblEntry SSE2BoolReduction[] = {
+ { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
+ { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
+ { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
+ { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
+ { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
+ { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
+ { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
+ { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
+ };
+
+ // Handle bool allof/anyof patterns.
+ if (ValVTy->getElementType()->isIntegerTy(1)) {
+ unsigned ArithmeticCost = 0;
+ if (LT.first != 1 && MTy.isVector() &&
+ MTy.getVectorNumElements() < ValVTy->getNumElements()) {
+ // Type needs to be split. We need LT.first - 1 arithmetic ops.
+ auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
+ MTy.getVectorNumElements());
+ ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
+ ArithmeticCost *= LT.first - 1;
+ }
+
+ if (ST->hasAVX512())
+ if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
+ return ArithmeticCost + Entry->Cost;
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
+ return ArithmeticCost + Entry->Cost;
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
+ return ArithmeticCost + Entry->Cost;
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
+ return ArithmeticCost + Entry->Cost;
+
+ return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise,
+ CostKind);
+ }
+
+ unsigned NumVecElts = ValVTy->getNumElements();
+ unsigned ScalarSize = ValVTy->getScalarSizeInBits();
+
+ // Special case power of 2 reductions where the scalar type isn't changed
+ // by type legalization.
+ if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
+ return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise,
+ CostKind);
+
+ unsigned ReductionCost = 0;
+
+ auto *Ty = ValVTy;
+ if (LT.first != 1 && MTy.isVector() &&
+ MTy.getVectorNumElements() < ValVTy->getNumElements()) {
+ // Type needs to be split. We need LT.first - 1 arithmetic ops.
+ Ty = FixedVectorType::get(ValVTy->getElementType(),
+ MTy.getVectorNumElements());
+ ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
+ ReductionCost *= LT.first - 1;
+ NumVecElts = MTy.getVectorNumElements();
+ }
+
+ // Now handle reduction with the legal type, taking into account size changes
+ // at each level.
+ while (NumVecElts > 1) {
+ // Determine the size of the remaining vector we need to reduce.
+ unsigned Size = NumVecElts * ScalarSize;
+ NumVecElts /= 2;
+ // If we're reducing from 256/512 bits, use an extract_subvector.
+ if (Size > 128) {
+ auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
+ ReductionCost +=
+ getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy);
+ Ty = SubTy;
+ } else if (Size == 128) {
+ // Reducing from 128 bits is a permute of v2f64/v2i64.
+ FixedVectorType *ShufTy;
+ if (ValVTy->isFloatingPointTy())
+ ShufTy =
+ FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
+ else
+ ShufTy =
+ FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
+ ReductionCost +=
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ } else if (Size == 64) {
+ // Reducing from 64 bits is a shuffle of v4f32/v4i32.
+ FixedVectorType *ShufTy;
+ if (ValVTy->isFloatingPointTy())
+ ShufTy =
+ FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
+ else
+ ShufTy =
+ FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
+ ReductionCost +=
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ } else {
+ // Reducing from smaller size is a shift by immediate.
+ auto *ShiftTy = FixedVectorType::get(
+ Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
+ ReductionCost += getArithmeticInstrCost(
+ Instruction::LShr, ShiftTy, CostKind,
+ TargetTransformInfo::OK_AnyValue,
+ TargetTransformInfo::OK_UniformConstantValue,
+ TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
+ }
+
+ // Add the arithmetic op for this level.
+ ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
+ }
+
+ // Add the final extract element to the cost.
+ return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
+}
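+
+// Illustrative examples for getArithmeticReductionCost above, read directly
+// from the tables: a non-pairwise FADD reduction of <8 x float> on AVX returns
+// 4 (AVX1CostTblNoPairWise); an AND reduction of <32 x i1> on AVX2 legalizes
+// to v32i8 and costs 2 (vpmovmskb + cmp). Types that legalize by splitting add
+// LT.first - 1 arithmetic ops on the legalized type on top of the table cost.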
+
+int X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned) {
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+
+ MVT MTy = LT.second;
+
+ int ISD;
+ if (Ty->isIntOrIntVectorTy()) {
+ ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
+ } else {
+ assert(Ty->isFPOrFPVectorTy() &&
+ "Expected float point or integer vector type.");
+ ISD = ISD::FMINNUM;
+ }
+
+ static const CostTblEntry SSE1CostTbl[] = {
+ {ISD::FMINNUM, MVT::v4f32, 1},
+ };
+
+ static const CostTblEntry SSE2CostTbl[] = {
+ {ISD::FMINNUM, MVT::v2f64, 1},
+ {ISD::SMIN, MVT::v8i16, 1},
+ {ISD::UMIN, MVT::v16i8, 1},
+ };
+
+ static const CostTblEntry SSE41CostTbl[] = {
+ {ISD::SMIN, MVT::v4i32, 1},
+ {ISD::UMIN, MVT::v4i32, 1},
+ {ISD::UMIN, MVT::v8i16, 1},
+ {ISD::SMIN, MVT::v16i8, 1},
+ };
+
+ static const CostTblEntry SSE42CostTbl[] = {
+ {ISD::UMIN, MVT::v2i64, 3}, // xor+pcmpgtq+blendvpd
+ };
+
+ static const CostTblEntry AVX1CostTbl[] = {
+ {ISD::FMINNUM, MVT::v8f32, 1},
+ {ISD::FMINNUM, MVT::v4f64, 1},
+ {ISD::SMIN, MVT::v8i32, 3},
+ {ISD::UMIN, MVT::v8i32, 3},
+ {ISD::SMIN, MVT::v16i16, 3},
+ {ISD::UMIN, MVT::v16i16, 3},
+ {ISD::SMIN, MVT::v32i8, 3},
+ {ISD::UMIN, MVT::v32i8, 3},
+ };
+
+ static const CostTblEntry AVX2CostTbl[] = {
+ {ISD::SMIN, MVT::v8i32, 1},
+ {ISD::UMIN, MVT::v8i32, 1},
+ {ISD::SMIN, MVT::v16i16, 1},
+ {ISD::UMIN, MVT::v16i16, 1},
+ {ISD::SMIN, MVT::v32i8, 1},
+ {ISD::UMIN, MVT::v32i8, 1},
+ };
+
+ static const CostTblEntry AVX512CostTbl[] = {
+ {ISD::FMINNUM, MVT::v16f32, 1},
+ {ISD::FMINNUM, MVT::v8f64, 1},
+ {ISD::SMIN, MVT::v2i64, 1},
+ {ISD::UMIN, MVT::v2i64, 1},
+ {ISD::SMIN, MVT::v4i64, 1},
+ {ISD::UMIN, MVT::v4i64, 1},
+ {ISD::SMIN, MVT::v8i64, 1},
+ {ISD::UMIN, MVT::v8i64, 1},
+ {ISD::SMIN, MVT::v16i32, 1},
+ {ISD::UMIN, MVT::v16i32, 1},
+ };
+
+ static const CostTblEntry AVX512BWCostTbl[] = {
+ {ISD::SMIN, MVT::v32i16, 1},
+ {ISD::UMIN, MVT::v32i16, 1},
+ {ISD::SMIN, MVT::v64i8, 1},
+ {ISD::UMIN, MVT::v64i8, 1},
+ };
+
+ // If we have a native MIN/MAX instruction for this type, use it.
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasAVX512())
+ if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE42())
+ if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE1())
+ if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ unsigned CmpOpcode;
+ if (Ty->isFPOrFPVectorTy()) {
+ CmpOpcode = Instruction::FCmp;
+ } else {
+ assert(Ty->isIntOrIntVectorTy() &&
+ "expecting floating point or integer type for min/max reduction");
+ CmpOpcode = Instruction::ICmp;
+ }
+
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ // Otherwise fall back to cmp+select.
+ return getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE,
+ CostKind) +
+ getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
+}
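+
+// Illustrative examples for getMinMaxCost above: a signed min on <8 x i32>
+// costs 1 with AVX2 (vpminsd) and 3 with only AVX1 (per AVX1CostTbl); a type
+// with no table entry, e.g. a signed min on <2 x i64> pre-AVX512, falls back
+// to the cmp + select estimate at the end of the function.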
+
+int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
+ bool IsPairwise, bool IsUnsigned,
+ TTI::TargetCostKind CostKind) {
+ // Just use the default implementation for pair reductions.
+ if (IsPairwise)
+ return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
+ CostKind);
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+
+ MVT MTy = LT.second;
+
+ int ISD;
+ if (ValTy->isIntOrIntVectorTy()) {
+ ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
+ } else {
+ assert(ValTy->isFPOrFPVectorTy() &&
+ "Expected float point or integer vector type.");
+ ISD = ISD::FMINNUM;
+ }
+
+ // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
+ // and use it as the cost.
+
+ static const CostTblEntry SSE2CostTblNoPairWise[] = {
+ {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
+ {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
+ {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
+ };
+
+ static const CostTblEntry SSE41CostTblNoPairWise[] = {
+ {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
+ {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
+ {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
+ {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
+ {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
+ {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
+ {ISD::SMIN, MVT::v2i8, 3}, // pminsb
+ {ISD::SMIN, MVT::v4i8, 5}, // pminsb
+ {ISD::SMIN, MVT::v8i8, 7}, // pminsb
+ {ISD::SMIN, MVT::v16i8, 6},
+ {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
+ {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
+ {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
+ {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
+ };
+
+ static const CostTblEntry AVX1CostTblNoPairWise[] = {
+ {ISD::SMIN, MVT::v16i16, 6},
+ {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
+ {ISD::SMIN, MVT::v32i8, 8},
+ {ISD::UMIN, MVT::v32i8, 8},
+ };
+
+ static const CostTblEntry AVX512BWCostTblNoPairWise[] = {
+ {ISD::SMIN, MVT::v32i16, 8},
+ {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
+ {ISD::SMIN, MVT::v64i8, 10},
+ {ISD::UMIN, MVT::v64i8, 10},
+ };
+
+ // Before legalizing the type, give a chance to look up illegal narrow types
+ // in the table.
+ // FIXME: Is there a better way to do this?
+ EVT VT = TLI->getValueType(DL, ValTy);
+ if (VT.isSimple()) {
+ MVT MTy = VT.getSimpleVT();
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
+
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
+
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
+ }
+
+ auto *ValVTy = cast<FixedVectorType>(ValTy);
+ unsigned NumVecElts = ValVTy->getNumElements();
+
+ auto *Ty = ValVTy;
+ unsigned MinMaxCost = 0;
+ if (LT.first != 1 && MTy.isVector() &&
+ MTy.getVectorNumElements() < ValVTy->getNumElements()) {
+ // Type needs to be split. We need LT.first - 1 operations.
+ Ty = FixedVectorType::get(ValVTy->getElementType(),
+ MTy.getVectorNumElements());
+ auto *SubCondTy = FixedVectorType::get(CondTy->getElementType(),
+ MTy.getVectorNumElements());
+ MinMaxCost = getMinMaxCost(Ty, SubCondTy, IsUnsigned);
+ MinMaxCost *= LT.first - 1;
+ NumVecElts = MTy.getVectorNumElements();
+ }
+
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
+ return MinMaxCost + Entry->Cost;
+
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+ return MinMaxCost + Entry->Cost;
+
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
+ return MinMaxCost + Entry->Cost;
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
+ return MinMaxCost + Entry->Cost;
+
+ unsigned ScalarSize = ValTy->getScalarSizeInBits();
+
+ // Special case power of 2 reductions where the scalar type isn't changed
+ // by type legalization.
+ if (!isPowerOf2_32(ValVTy->getNumElements()) ||
+ ScalarSize != MTy.getScalarSizeInBits())
+ return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
+ CostKind);
+
+ // Now handle reduction with the legal type, taking into account size changes
+ // at each level.
+ while (NumVecElts > 1) {
+ // Determine the size of the remaining vector we need to reduce.
+ unsigned Size = NumVecElts * ScalarSize;
+ NumVecElts /= 2;
+ // If we're reducing from 256/512 bits, use an extract_subvector.
+ if (Size > 128) {
+ auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
+ MinMaxCost +=
+ getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy);
+ Ty = SubTy;
+ } else if (Size == 128) {
+ // Reducing from 128 bits is a permute of v2f64/v2i64.
+ VectorType *ShufTy;
+ if (ValTy->isFloatingPointTy())
+ ShufTy =
+ FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
+ else
+ ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
+ MinMaxCost +=
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ } else if (Size == 64) {
+ // Reducing from 64 bits is a shuffle of v4f32/v4i32.
+ FixedVectorType *ShufTy;
+ if (ValTy->isFloatingPointTy())
+ ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
+ else
+ ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
+ MinMaxCost +=
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ } else {
+ // Reducing from smaller size is a shift by immediate.
+ auto *ShiftTy = FixedVectorType::get(
+ Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
+ MinMaxCost += getArithmeticInstrCost(
+ Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
+ TargetTransformInfo::OK_AnyValue,
+ TargetTransformInfo::OK_UniformConstantValue,
+ TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
+ }
+
+ // Add the arithmetic op for this level.
+ auto *SubCondTy =
+ FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements());
+ MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned);
+ }
+
+ // Add the final extract element to the cost.
+ return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
+}
+
+/// Calculate the cost of materializing a 64-bit value. This helper
+/// method might only calculate a fraction of a larger immediate. Therefore it
+/// is valid to return a cost of ZERO.
+int X86TTIImpl::getIntImmCost(int64_t Val) {
+ if (Val == 0)
+ return TTI::TCC_Free;
+
+ if (isInt<32>(Val))
+ return TTI::TCC_Basic;
+
+ return 2 * TTI::TCC_Basic;
+}
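+
+// Illustrative mapping for getIntImmCost(int64_t) above (TCC_Free == 0,
+// TCC_Basic == 1): Val == 0 -> 0; Val == 42 or any value fitting a signed
+// 32-bit immediate -> 1; a full 64-bit value such as 0x100000000 -> 2
+// (e.g. materialized via movabsq).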
+
+int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ if (BitSize == 0)
+ return ~0U;
+
+ // Never hoist constants larger than 128 bits, because this might lead to
+ // incorrect code generation or assertions in codegen.
+ // FIXME: Create a cost model for types larger than i128 once the codegen
+ // issues have been fixed.
+ if (BitSize > 128)
+ return TTI::TCC_Free;
+
+ if (Imm == 0)
+ return TTI::TCC_Free;
+
+ // Sign-extend all constants to a multiple of 64-bit.
+ APInt ImmVal = Imm;
+ if (BitSize % 64 != 0)
+ ImmVal = Imm.sext(alignTo(BitSize, 64));
+
+ // Split the constant into 64-bit chunks and calculate the cost for each
+ // chunk.
+ int Cost = 0;
+ for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
+ APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
+ int64_t Val = Tmp.getSExtValue();
+ Cost += getIntImmCost(Val);
+ }
+ // We need at least one instruction to materialize the constant.
+ return std::max(1, Cost);
+}
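+
+// Illustrative example of the chunking above: an i128 constant with low 64
+// bits 0x12345678 and high 64 bits 1 splits into two chunks that each fit a
+// signed 32-bit immediate, so the total cost is 2. Imm == 0 never reaches the
+// loop; it is reported as TTI::TCC_Free earlier.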
+
+int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst) {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ // There is no cost model for constants with a bit size of 0. Return TCC_Free
+ // here, so that constant hoisting will ignore this constant.
+ if (BitSize == 0)
+ return TTI::TCC_Free;
+
+ unsigned ImmIdx = ~0U;
+ switch (Opcode) {
+ default:
+ return TTI::TCC_Free;
+ case Instruction::GetElementPtr:
+ // Always hoist the base address of a GetElementPtr. This prevents the
+ // creation of new constants for every base constant that gets constant
+ // folded with the offset.
+ if (Idx == 0)
+ return 2 * TTI::TCC_Basic;
+ return TTI::TCC_Free;
+ case Instruction::Store:
+ ImmIdx = 0;
+ break;
+ case Instruction::ICmp:
+ // This is an imperfect hack to prevent constant hoisting of
+ // compares that might be trying to check if a 64-bit value fits in
+ // 32-bits. The backend can optimize these cases using a right shift by 32.
+ // Ideally we would check the compare predicate here. There are also other
+ // similar immediates the backend can use shifts for.
+ if (Idx == 1 && Imm.getBitWidth() == 64) {
+ uint64_t ImmVal = Imm.getZExtValue();
+ if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
+ return TTI::TCC_Free;
+ }
+ ImmIdx = 1;
+ break;
+ case Instruction::And:
+ // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
+ // by using a 32-bit operation with implicit zero extension. Detect such
+ // immediates here as the normal path expects bit 31 to be sign extended.
+ if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
+ return TTI::TCC_Free;
+ ImmIdx = 1;
+ break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ // For add/sub, we can use the opposite instruction for INT32_MIN.
+ if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
+ return TTI::TCC_Free;
+ ImmIdx = 1;
+ break;
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ // Division by constant is typically expanded later into a different
+ // instruction sequence. This completely changes the constants.
+ // Report them as "free" to stop ConstantHoist from marking them as opaque.
+ return TTI::TCC_Free;
+ case Instruction::Mul:
+ case Instruction::Or:
+ case Instruction::Xor:
+ ImmIdx = 1;
+ break;
+ // Always return TCC_Free for the shift value of a shift instruction.
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ if (Idx == 1)
+ return TTI::TCC_Free;
+ break;
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::IntToPtr:
+ case Instruction::PtrToInt:
+ case Instruction::BitCast:
+ case Instruction::PHI:
+ case Instruction::Call:
+ case Instruction::Select:
+ case Instruction::Ret:
+ case Instruction::Load:
+ break;
+ }
+
+ if (Idx == ImmIdx) {
+ int NumConstants = divideCeil(BitSize, 64);
+ int Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
+ return (Cost <= NumConstants * TTI::TCC_Basic)
+ ? static_cast<int>(TTI::TCC_Free)
+ : Cost;
+ }
+
+ return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
+}
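+
+// Illustrative consequences of the rules in getIntImmCostInst above: the mask
+// in 'and i64 %x, 0xFFFFFFFF' is free (32-bit zero-extension trick), a shift
+// amount such as in 'shl i64 %x, 5' is always free, and a constant GEP base
+// (Idx == 0) costs 2 * TCC_Basic so it is hoisted rather than re-materialized
+// per use.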
+
+int X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ // There is no cost model for constants with a bit size of 0. Return TCC_Free
+ // here, so that constant hoisting will ignore this constant.
+ if (BitSize == 0)
+ return TTI::TCC_Free;
+
+ switch (IID) {
+ default:
+ return TTI::TCC_Free;
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow:
+ if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
+ return TTI::TCC_Free;
+ break;
+ case Intrinsic::experimental_stackmap:
+ if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+ return TTI::TCC_Free;
+ break;
+ case Intrinsic::experimental_patchpoint_void:
+ case Intrinsic::experimental_patchpoint_i64:
+ if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+ return TTI::TCC_Free;
+ break;
+ }
+ return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
+}
+
+unsigned
+X86TTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Opcode == Instruction::PHI ? 0 : 1;
+ // Branches are assumed to be predicted.
+ return 0;
+}
+
+int X86TTIImpl::getGatherOverhead() const {
+ // Some CPUs have more overhead for gather. The specified overhead is relative
+ // to the Load operation. "2" is the number provided by Intel architects. This
+ // parameter is used for cost estimation of Gather Op and comparison with
+ // other alternatives.
+ // TODO: Remove the explicit hasAVX512()? That would mean we would only
+ // enable gather with a -march.
+ if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
+ return 2;
+
+ return 1024;
+}
+
+int X86TTIImpl::getScatterOverhead() const {
+ if (ST->hasAVX512())
+ return 2;
+
+ return 1024;
+}
+
+ // Return an average cost of a Gather / Scatter instruction; may be improved later.
+// FIXME: Add TargetCostKind support.
+int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr,
+ Align Alignment, unsigned AddressSpace) {
+
+ assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
+ unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
+
+ // Try to reduce index size from 64 bit (default for GEP)
+ // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
+ // operation will use 16 x 64 indices which do not fit in a zmm and needs
+ // to be split. Also check that the base pointer is the same for all lanes,
+ // and that there's at most one variable index.
+ auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
+ unsigned IndexSize = DL.getPointerSizeInBits();
+ const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ if (IndexSize < 64 || !GEP)
+ return IndexSize;
+
+ unsigned NumOfVarIndices = 0;
+ const Value *Ptrs = GEP->getPointerOperand();
+ if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
+ return IndexSize;
+ for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
+ if (isa<Constant>(GEP->getOperand(i)))
+ continue;
+ Type *IndxTy = GEP->getOperand(i)->getType();
+ if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
+ IndxTy = IndexVTy->getElementType();
+ if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
+ !isa<SExtInst>(GEP->getOperand(i))) ||
+ ++NumOfVarIndices > 1)
+ return IndexSize; // 64
+ }
+ return (unsigned)32;
+ };
+
+ // Trying to reduce IndexSize to 32 bits for vector 16.
+ // By default the IndexSize is equal to pointer size.
+ unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
+ ? getIndexSizeInBits(Ptr, DL)
+ : DL.getPointerSizeInBits();
+
+ auto *IndexVTy = FixedVectorType::get(
+ IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
+ std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
+ std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
+ int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
+ if (SplitFactor > 1) {
+ // Handle splitting of vector of pointers
+ auto *SplitSrcTy =
+ FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
+ return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
+ AddressSpace);
+ }
+
+ // The gather / scatter cost is given by Intel architects. It is a rough
+ // number since we are looking at one instruction at a time.
+ const int GSOverhead = (Opcode == Instruction::Load)
+ ? getGatherOverhead()
+ : getScatterOverhead();
+ return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+ MaybeAlign(Alignment), AddressSpace,
+ TTI::TCK_RecipThroughput);
+}
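+
+// Illustrative example for getGSVectorCost above (symbolic, with C = the
+// scalar memory-op cost): a v16f32 gather on AVX-512 with a splat base pointer
+// and a single sign-extended variable index reduces IndexSize to 32 bits, so
+// the v16i32 index vector fits in one zmm, no splitting happens, and the cost
+// is getGatherOverhead() + 16 * C.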
+
+/// Return the cost of full scalarization of gather / scatter operation.
+///
+/// Opcode - Load or Store instruction.
+/// SrcVTy - The type of the data vector that should be gathered or scattered.
+/// VariableMask - The mask is non-constant at compile time.
+/// Alignment - Alignment for one element.
+/// AddressSpace - pointer[s] address space.
+///
+/// FIXME: Add TargetCostKind support.
+int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
+ bool VariableMask, Align Alignment,
+ unsigned AddressSpace) {
+ unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
+ APInt DemandedElts = APInt::getAllOnesValue(VF);
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+ int MaskUnpackCost = 0;
+ if (VariableMask) {
+ auto *MaskTy =
+ FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
+ MaskUnpackCost =
+ getScalarizationOverhead(MaskTy, DemandedElts, false, true);
+ int ScalarCompareCost = getCmpSelInstrCost(
+ Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ int BranchCost = getCFInstrCost(Instruction::Br, CostKind);
+ MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
+ }
+
+ // The cost of the scalar loads/stores.
+ int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+ MaybeAlign(Alignment), AddressSpace,
+ CostKind);
+
+ int InsertExtractCost = 0;
+ if (Opcode == Instruction::Load)
+ for (unsigned i = 0; i < VF; ++i)
+ // Add the cost of inserting each scalar load into the vector
+ InsertExtractCost +=
+ getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
+ else
+ for (unsigned i = 0; i < VF; ++i)
+ // Add the cost of extracting each element out of the data vector
+ InsertExtractCost +=
+ getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
+
+ return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
+}
+
+/// Calculate the cost of Gather / Scatter operation
+int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
+ const Value *Ptr, bool VariableMask,
+ Align Alignment,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr) {
+ if (CostKind != TTI::TCK_RecipThroughput) {
+ if ((Opcode == Instruction::Load &&
+ isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
+ (Opcode == Instruction::Store &&
+ isLegalMaskedScatter(SrcVTy, Align(Alignment))))
+ return 1;
+ return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
+ Alignment, CostKind, I);
+ }
+
+ assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
+ unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
+ PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+ if (!PtrTy && Ptr->getType()->isVectorTy())
+ PtrTy = dyn_cast<PointerType>(
+ cast<VectorType>(Ptr->getType())->getElementType());
+ assert(PtrTy && "Unexpected type for Ptr argument");
+ unsigned AddressSpace = PtrTy->getAddressSpace();
+
+ bool Scalarize = false;
+ if ((Opcode == Instruction::Load &&
+ !isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
+ (Opcode == Instruction::Store &&
+ !isLegalMaskedScatter(SrcVTy, Align(Alignment))))
+ Scalarize = true;
+ // Gather / Scatter for vector 2 is not profitable on KNL / SKX
+ // Vector-4 of gather/scatter instruction does not exist on KNL.
+ // We can extend it to 8 elements, but zeroing upper bits of
+ // the mask vector will add more instructions. Right now we give the scalar
+ // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
+ // is better in the VariableMask case.
+ if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
+ Scalarize = true;
+
+ if (Scalarize)
+ return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
+ AddressSpace);
+
+ return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
+}
+
+bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
+ TargetTransformInfo::LSRCost &C2) {
+ // X86 specific here are "instruction number 1st priority".
+ return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
+ C1.NumIVMuls, C1.NumBaseAdds,
+ C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
+ std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
+ C2.NumIVMuls, C2.NumBaseAdds,
+ C2.ScaleCost, C2.ImmCost, C2.SetupCost);
+}
+
+bool X86TTIImpl::canMacroFuseCmp() {
+ return ST->hasMacroFusion() || ST->hasBranchFusion();
+}
+
+bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
+ if (!ST->hasAVX())
+ return false;
+
+ // The backend can't handle a single element vector.
+ if (isa<VectorType>(DataTy) &&
+ cast<FixedVectorType>(DataTy)->getNumElements() == 1)
+ return false;
+ Type *ScalarTy = DataTy->getScalarType();
+
+ if (ScalarTy->isPointerTy())
+ return true;
+
+ if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
+ return true;
+
+ if (!ScalarTy->isIntegerTy())
+ return false;
+
+ unsigned IntWidth = ScalarTy->getIntegerBitWidth();
+ return IntWidth == 32 || IntWidth == 64 ||
+ ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
+}
+
+bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
+ return isLegalMaskedLoad(DataType, Alignment);
+}
+
+bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
+ unsigned DataSize = DL.getTypeStoreSize(DataType);
+ // The only supported nontemporal loads are for aligned vectors of 16 or 32
+ // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
+ // (the equivalent stores only require AVX).
+ if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
+ return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
+
+ return false;
+}
+
+bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
+ unsigned DataSize = DL.getTypeStoreSize(DataType);
+
+ // SSE4A supports nontemporal stores of float and double at arbitrary
+ // alignment.
+ if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
+ return true;
+
+ // Besides the SSE4A subtarget exception above, only aligned stores are
+ // available nontemporally on any other subtarget. And only stores with a size
+ // of 4..32 bytes (powers of 2 only) are permitted.
+ if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
+ !isPowerOf2_32(DataSize))
+ return false;
+
+ // 32-byte vector nontemporal stores are supported by AVX (the equivalent
+ // loads require AVX2).
+ if (DataSize == 32)
+ return ST->hasAVX();
+ else if (DataSize == 16)
+ return ST->hasSSE1();
+ return true;
+}
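+
+// Illustrative outcomes of isLegalNTStore above: a 32-byte vector store needs
+// AVX (e.g. vmovntps), a 16-byte vector store needs SSE1, an aligned 4- or
+// 8-byte scalar store is accepted (movnti), and a scalar float/double store is
+// accepted at any alignment only with SSE4A (movntss/movntsd).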
+
+bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
+ if (!isa<VectorType>(DataTy))
+ return false;
+
+ if (!ST->hasAVX512())
+ return false;
+
+ // The backend can't handle a single element vector.
+ if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
+ return false;
+
+ Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
+
+ if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
+ return true;
+
+ if (!ScalarTy->isIntegerTy())
+ return false;
+
+ unsigned IntWidth = ScalarTy->getIntegerBitWidth();
+ return IntWidth == 32 || IntWidth == 64 ||
+ ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
+}
+
+bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
+ return isLegalMaskedExpandLoad(DataTy);
+}
+
+bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
+ // Some CPUs have better gather performance than others.
+ // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
+ // enable gather with a -march.
+ if (!(ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2())))
+ return false;
+
+ // This function is called now in two cases: from the Loop Vectorizer
+ // and from the Scalarizer.
+ // When the Loop Vectorizer asks about legality of the feature,
+ // the vectorization factor is not calculated yet. The Loop Vectorizer
+ // sends a scalar type and the decision is based on the width of the
+ // scalar element.
+ // Later on, the cost model will estimate usage of this intrinsic based on
+ // the vector type.
+ // The Scalarizer asks again about legality. It sends a vector type.
+ // In this case we can reject non-power-of-2 vectors.
+ // We also reject single element vectors as the type legalizer can't
+ // scalarize it.
+ if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) {
+ unsigned NumElts = DataVTy->getNumElements();
+ if (NumElts == 1)
+ return false;
+ }
+ Type *ScalarTy = DataTy->getScalarType();
+ if (ScalarTy->isPointerTy())
+ return true;
+
+ if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
+ return true;
+
+ if (!ScalarTy->isIntegerTy())
+ return false;
+
+ unsigned IntWidth = ScalarTy->getIntegerBitWidth();
+ return IntWidth == 32 || IntWidth == 64;
+}
+
+bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
+ // AVX2 doesn't support scatter
+ if (!ST->hasAVX512())
+ return false;
+ return isLegalMaskedGather(DataType, Alignment);
+}
+
+bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
+ EVT VT = TLI->getValueType(DL, DataType);
+ return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
+}
+
+bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
+ return false;
+}
+
+bool X86TTIImpl::areInlineCompatible(const Function *Caller,
+ const Function *Callee) const {
+ const TargetMachine &TM = getTLI()->getTargetMachine();
+
+ // Work this as a subsetting of subtarget features.
+ const FeatureBitset &CallerBits =
+ TM.getSubtargetImpl(*Caller)->getFeatureBits();
+ const FeatureBitset &CalleeBits =
+ TM.getSubtargetImpl(*Callee)->getFeatureBits();
+
+ FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
+ FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
+ return (RealCallerBits & RealCalleeBits) == RealCalleeBits;
+}
+
+bool X86TTIImpl::areFunctionArgsABICompatible(
+ const Function *Caller, const Function *Callee,
+ SmallPtrSetImpl<Argument *> &Args) const {
+ if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args))
+ return false;
+
+ // If we get here, we know the target features match. If one function
+ // considers 512-bit vectors legal and the other does not, consider them
+ // incompatible.
+ const TargetMachine &TM = getTLI()->getTargetMachine();
+
+ if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
+ TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
+ return true;
+
+ // Consider the arguments compatible if they aren't vectors or aggregates.
+ // FIXME: Look at the size of vectors.
+ // FIXME: Look at the element types of aggregates to see if there are vectors.
+ // FIXME: The API of this function seems intended to allow arguments
+ // to be removed from the set, but the caller doesn't check if the set
+ // becomes empty so that may not work in practice.
+ return llvm::none_of(Args, [](Argument *A) {
+ auto *EltTy = cast<PointerType>(A->getType())->getElementType();
+ return EltTy->isVectorTy() || EltTy->isAggregateType();
+ });
+}
+
+X86TTIImpl::TTI::MemCmpExpansionOptions
+X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
+ TTI::MemCmpExpansionOptions Options;
+ Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
+ Options.NumLoadsPerBlock = 2;
+ // All GPR and vector loads can be unaligned.
+ Options.AllowOverlappingLoads = true;
+ if (IsZeroCmp) {
+ // Only enable vector loads for equality comparison. Right now the vector
+ // version is not as fast for three way compare (see #33329).
+ const unsigned PreferredWidth = ST->getPreferVectorWidth();
+ if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64);
+ if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
+ if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
+ }
+ if (ST->is64Bit()) {
+ Options.LoadSizes.push_back(8);
+ }
+ Options.LoadSizes.push_back(4);
+ Options.LoadSizes.push_back(2);
+ Options.LoadSizes.push_back(1);
+ return Options;
+}
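+
+// Illustrative result of enableMemCmpExpansion above: on a 64-bit AVX2 target
+// with a 256-bit preferred vector width, an equality-only memcmp expansion
+// ends up with LoadSizes = {32, 16, 8, 4, 2, 1}; without IsZeroCmp only the
+// GPR sizes {8, 4, 2, 1} are offered.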
+
+bool X86TTIImpl::enableInterleavedAccessVectorization() {
+ // TODO: We expect this to be beneficial regardless of arch,
+ // but there are currently some unexplained performance artifacts on Atom.
+ // As a temporary solution, disable on Atom.
+ return !(ST->isAtom());
+}
+
+// Get estimation for interleaved load/store operations for AVX2.
+// \p Factor is the interleaved-access factor (stride) - number of
+// (interleaved) elements in the group.
+// \p Indices contains the indices for a strided load: when the
+// interleaved load has gaps they indicate which elements are used.
+// If Indices is empty (or if the number of indices is equal to the size
+// of the interleaved-access as given in \p Factor) the access has no gaps.
+//
+// As opposed to AVX-512, AVX2 does not have generic shuffles that allow
+// computing the cost using a generic formula as a function of generic
+// shuffles. We therefore use a lookup table instead, filled according to
+// the instruction sequences that codegen currently generates.
+int X86TTIImpl::getInterleavedMemoryOpCostAVX2(
+ unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
+ ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
+
+ if (UseMaskForCond || UseMaskForGaps)
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace, CostKind,
+ UseMaskForCond, UseMaskForGaps);
+
+ // We currently support only fully-interleaved groups, with no gaps.
+ // TODO: Support also strided loads (interleaved-groups with gaps).
+ if (Indices.size() && Indices.size() != Factor)
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace,
+ CostKind);
+
+ // VecTy for interleave memop is <VF*Factor x Elt>.
+ // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
+ // VecTy = <12 x i32>.
+ MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
+
+ // This function can be called with VecTy=<6xi128>, Factor=3, in which case
+ // the VF=2, while v2i128 is an unsupported MVT vector type
+ // (see MachineValueType.h::getVectorVT()).
+ if (!LegalVT.isVector())
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace,
+ CostKind);
+
+ unsigned VF = VecTy->getNumElements() / Factor;
+ Type *ScalarTy = VecTy->getElementType();
+
+ // Calculate the number of memory operations (NumOfMemOps), required
+ // for load/store the VecTy.
+ unsigned VecTySize = DL.getTypeStoreSize(VecTy);
+ unsigned LegalVTSize = LegalVT.getStoreSize();
+ unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+
+ // Get the cost of one memory operation.
+ auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
+ LegalVT.getVectorNumElements());
+ unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
+ MaybeAlign(Alignment), AddressSpace,
+ CostKind);
+
+ auto *VT = FixedVectorType::get(ScalarTy, VF);
+ EVT ETy = TLI->getValueType(DL, VT);
+ if (!ETy.isSimple())
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace,
+ CostKind);
+
+ // TODO: Complete for other data-types and strides.
+ // Each combination of Stride, ElementTy and VF results in a different
+ // sequence; The cost tables are therefore accessed with:
+ // Factor (stride) and VectorType=VFxElemType.
+ // The Cost accounts only for the shuffle sequence;
+ // The cost of the loads/stores is accounted for separately.
+ //
+ static const CostTblEntry AVX2InterleavedLoadTbl[] = {
+ { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64
+ { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64
+
+ { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
+ { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
+ { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
+ { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8
+ { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8
+ { 3, MVT::v8f32, 17 }, //(load 24f32 and) deinterleave into 3 x 8f32
+
+ { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
+ { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
+ { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
+ { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
+ { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
+
+ { 8, MVT::v8f32, 40 } //(load 64f32 and) deinterleave into 8 x 8f32
+ };
+
+ static const CostTblEntry AVX2InterleavedStoreTbl[] = {
+ { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store)
+ { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store)
+
+ { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
+ { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
+ { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
+ { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store)
+ { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store)
+
+ { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
+ { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
+ { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store)
+ { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store)
+ { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store)
+ };
+
+ if (Opcode == Instruction::Load) {
+ if (const auto *Entry =
+ CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
+ return NumOfMemOps * MemOpCost + Entry->Cost;
+ } else {
+ assert(Opcode == Instruction::Store &&
+ "Expected Store Instruction at this point");
+ if (const auto *Entry =
+ CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
+ return NumOfMemOps * MemOpCost + Entry->Cost;
+ }
+
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace, CostKind);
+}
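+
+// Illustrative example for getInterleavedMemoryOpCostAVX2 above (symbolic,
+// with M = cost of one legal-vector memory op): a stride-3 interleaved load of
+// <48 x i8> on AVX2 roughly legalizes to v32i8 memory ops, so NumOfMemOps = 2,
+// and the VF=16 sub-vector hits the { 3, MVT::v16i8, 11 } entry, giving a
+// total of 2 * M + 11.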
+
+// Get estimation for interleaved load/store operations and strided load.
+// \p Indices contains indices for strided load.
+// \p Factor - the factor of interleaving.
+// AVX-512 provides 3-src shuffles that significantly reduces the cost.
+int X86TTIImpl::getInterleavedMemoryOpCostAVX512(
+ unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
+ ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
+
+ if (UseMaskForCond || UseMaskForGaps)
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace, CostKind,
+ UseMaskForCond, UseMaskForGaps);
+
+ // VecTy for interleave memop is <VF*Factor x Elt>.
+ // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
+ // VecTy = <12 x i32>.
+
+ // Calculate the number of memory operations (NumOfMemOps), required
+ // for load/store the VecTy.
+ MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
+ unsigned VecTySize = DL.getTypeStoreSize(VecTy);
+ unsigned LegalVTSize = LegalVT.getStoreSize();
+ unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+
+ // Get the cost of one memory operation.
+ auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
+ LegalVT.getVectorNumElements());
+ unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
+ MaybeAlign(Alignment), AddressSpace,
+ CostKind);
+
+ unsigned VF = VecTy->getNumElements() / Factor;
+ MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
+
+ if (Opcode == Instruction::Load) {
+ // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
+ // contain the cost of the optimized shuffle sequence that the
+ // X86InterleavedAccess pass will generate.
+ // The cost of loads and stores is computed separately from the table.
+
+ // X86InterleavedAccess supports only the following interleaved-access groups.
+ static const CostTblEntry AVX512InterleavedLoadTbl[] = {
+ {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
+ {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
+ {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
+ };
+
+ if (const auto *Entry =
+ CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
+ return NumOfMemOps * MemOpCost + Entry->Cost;
+ // If an entry does not exist, fall back to the default implementation.
+
+ // Kind of shuffle depends on number of loaded values.
+ // If we load the entire data in one register, we can use a 1-src shuffle.
+ // Otherwise, we'll merge 2 sources in each operation.
+ TTI::ShuffleKind ShuffleKind =
+ (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
+
+ unsigned ShuffleCost =
+ getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
+
+ unsigned NumOfLoadsInInterleaveGrp =
+ Indices.size() ? Indices.size() : Factor;
+ auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
+ VecTy->getNumElements() / Factor);
+ unsigned NumOfResults =
+ getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
+ NumOfLoadsInInterleaveGrp;
+
+ // About half of the loads may be folded into shuffles when we have only
+ // one result. If we have more than one result, we do not fold loads at all.
+ unsigned NumOfUnfoldedLoads =
+ NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
+
+ // Get a number of shuffle operations per result.
+ unsigned NumOfShufflesPerResult =
+ std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
+
+ // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
+ // When we have more than one destination, we need additional instructions
+ // to keep sources.
+ unsigned NumOfMoves = 0;
+ if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
+ NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
+
+ int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
+ NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
+
+ return Cost;
+ }
+
+ // Store.
+ assert(Opcode == Instruction::Store &&
+ "Expected Store Instruction at this point");
+ // X86InterleavedAccess supports only the following interleaved-access groups.
+ static const CostTblEntry AVX512InterleavedStoreTbl[] = {
+ {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
+ {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
+ {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
+
+ {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
+ {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
+ {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
+ {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
+ };
+
+ if (const auto *Entry =
+ CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
+ return NumOfMemOps * MemOpCost + Entry->Cost;
+ // If an entry does not exist, fall back to the default implementation.
+
+ // There are no strided stores at the moment, and a store can't be folded
+ // into a shuffle.
+ unsigned NumOfSources = Factor; // The number of values to be merged.
+ unsigned ShuffleCost =
+ getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
+ unsigned NumOfShufflesPerStore = NumOfSources - 1;
+
+ // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
+ // We need additional instructions to keep the sources.
+ unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
+ int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
+ NumOfMoves;
+ return Cost;
+}
+
+int X86TTIImpl::getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+ bool UseMaskForCond, bool UseMaskForGaps) {
+ auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
+ Type *EltTy = cast<VectorType>(VecTy)->getElementType();
+ if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
+ EltTy->isIntegerTy(32) || EltTy->isPointerTy())
+ return true;
+ if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8))
+ return HasBW;
+ return false;
+ };
+ if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
+ return getInterleavedMemoryOpCostAVX512(
+ Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
+ AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
+ if (ST->hasAVX2())
+ return getInterleavedMemoryOpCostAVX2(
+ Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
+ AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
+
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace, CostKind,
+ UseMaskForCond, UseMaskForGaps);
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.h
new file mode 100644
index 000000000000..17570f1c04a6
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -0,0 +1,256 @@
+//===-- X86TargetTransformInfo.h - X86 specific TTI -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines a TargetTransformInfo::Concept conforming object specific
+/// to the X86 target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H
+
+#include "X86TargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+
+namespace llvm {
+
+class InstCombiner;
+
+class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
+ typedef BasicTTIImplBase<X86TTIImpl> BaseT;
+ typedef TargetTransformInfo TTI;
+ friend BaseT;
+
+ const X86Subtarget *ST;
+ const X86TargetLowering *TLI;
+
+ const X86Subtarget *getST() const { return ST; }
+ const X86TargetLowering *getTLI() const { return TLI; }
+
+ const FeatureBitset InlineFeatureIgnoreList = {
+ // This indicates the CPU is 64 bit capable, not that we are in 64-bit
+ // mode.
+ X86::Feature64Bit,
+
+ // These features don't have any intrinsics or ABI effect.
+ X86::FeatureNOPL,
+ X86::FeatureCMPXCHG16B,
+ X86::FeatureLAHFSAHF,
+
+ // Codegen control options.
+ X86::FeatureFast11ByteNOP,
+ X86::FeatureFast15ByteNOP,
+ X86::FeatureFastBEXTR,
+ X86::FeatureFastHorizontalOps,
+ X86::FeatureFastLZCNT,
+ X86::FeatureFastScalarFSQRT,
+ X86::FeatureFastSHLDRotate,
+ X86::FeatureFastScalarShiftMasks,
+ X86::FeatureFastVectorShiftMasks,
+ X86::FeatureFastVariableShuffle,
+ X86::FeatureFastVectorFSQRT,
+ X86::FeatureLEAForSP,
+ X86::FeatureLEAUsesAG,
+ X86::FeatureLZCNTFalseDeps,
+ X86::FeatureBranchFusion,
+ X86::FeatureMacroFusion,
+ X86::FeaturePadShortFunctions,
+ X86::FeaturePOPCNTFalseDeps,
+ X86::FeatureSSEUnalignedMem,
+ X86::FeatureSlow3OpsLEA,
+ X86::FeatureSlowDivide32,
+ X86::FeatureSlowDivide64,
+ X86::FeatureSlowIncDec,
+ X86::FeatureSlowLEA,
+ X86::FeatureSlowPMADDWD,
+ X86::FeatureSlowPMULLD,
+ X86::FeatureSlowSHLD,
+ X86::FeatureSlowTwoMemOps,
+ X86::FeatureSlowUAMem16,
+ X86::FeaturePreferMaskRegisters,
+ X86::FeatureInsertVZEROUPPER,
+ X86::FeatureUseGLMDivSqrtCosts,
+
+ // Perf-tuning flags.
+ X86::FeatureHasFastGather,
+ X86::FeatureSlowUAMem32,
+
+ // Based on whether user set the -mprefer-vector-width command line.
+ X86::FeaturePrefer128Bit,
+ X86::FeaturePrefer256Bit,
+
+ // CPU name enums. These just follow CPU string.
+ X86::ProcIntelAtom,
+ X86::ProcIntelSLM,
+ };
+
+public:
+ explicit X86TTIImpl(const X86TargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+ TLI(ST->getTargetLowering()) {}
+
+ /// \name Scalar TTI Implementations
+ /// @{
+ TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+
+ /// @}
+
+ /// \name Cache TTI Implementation
+ /// @{
+ llvm::Optional<unsigned> getCacheSize(
+ TargetTransformInfo::CacheLevel Level) const override;
+ llvm::Optional<unsigned> getCacheAssociativity(
+ TargetTransformInfo::CacheLevel Level) const override;
+ /// @}
+
+ /// \name Vector TTI Implementations
+ /// @{
+
+ unsigned getNumberOfRegisters(unsigned ClassID) const;
+ unsigned getRegisterBitWidth(bool Vector) const;
+ unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
+ unsigned getMaxInterleaveFactor(unsigned VF);
+ int getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
+ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+ TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
+ const Instruction *CxtI = nullptr);
+ int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
+ VectorType *SubTp);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ CmpInst::Predicate VecPred,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+ int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+ unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
+ bool Insert, bool Extract);
+ int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+ int getMaskedMemoryOpCost(
+ unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency);
+ int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr,
+ bool VariableMask, Align Alignment,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I);
+ int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
+ const SCEV *Ptr);
+
+ Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
+ IntrinsicInst &II) const;
+ Optional<Value *>
+ simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
+ APInt DemandedMask, KnownBits &Known,
+ bool &KnownBitsComputed) const;
+ Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
+ InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
+ APInt &UndefElts2, APInt &UndefElts3,
+ std::function<void(Instruction *, unsigned, APInt, APInt &)>
+ SimplifyAndSetOp) const;
+
+ unsigned getAtomicMemIntrinsicMaxElementSize() const;
+
+ int getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind);
+ int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind);
+
+ int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+ bool IsPairwiseForm,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency);
+
+ int getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned);
+
+ int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
+ bool IsPairwiseForm, bool IsUnsigned,
+ TTI::TargetCostKind CostKind);
+
+ int getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+ bool UseMaskForCond = false, bool UseMaskForGaps = false);
+ int getInterleavedMemoryOpCostAVX512(
+ unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
+ ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+ bool UseMaskForCond = false, bool UseMaskForGaps = false);
+ int getInterleavedMemoryOpCostAVX2(
+ unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
+ ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+ bool UseMaskForCond = false, bool UseMaskForGaps = false);
+
+ int getIntImmCost(int64_t);
+
+ int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind);
+
+ unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
+
+ int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
+ Type *Ty, TTI::TargetCostKind CostKind,
+ Instruction *Inst = nullptr);
+ int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+ Type *Ty, TTI::TargetCostKind CostKind);
+ bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
+ TargetTransformInfo::LSRCost &C2);
+ bool canMacroFuseCmp();
+ bool isLegalMaskedLoad(Type *DataType, Align Alignment);
+ bool isLegalMaskedStore(Type *DataType, Align Alignment);
+ bool isLegalNTLoad(Type *DataType, Align Alignment);
+ bool isLegalNTStore(Type *DataType, Align Alignment);
+ bool isLegalMaskedGather(Type *DataType, Align Alignment);
+ bool isLegalMaskedScatter(Type *DataType, Align Alignment);
+ bool isLegalMaskedExpandLoad(Type *DataType);
+ bool isLegalMaskedCompressStore(Type *DataType);
+ bool hasDivRemOp(Type *DataType, bool IsSigned);
+ bool isFCmpOrdCheaperThanFCmpZero(Type *Ty);
+ bool areInlineCompatible(const Function *Caller,
+ const Function *Callee) const;
+ bool areFunctionArgsABICompatible(const Function *Caller,
+ const Function *Callee,
+ SmallPtrSetImpl<Argument *> &Args) const;
+ TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
+ bool IsZeroCmp) const;
+ bool enableInterleavedAccessVectorization();
+
+ /// Allow vectorizers to form reduction intrinsics in IR. The IR is expanded
+ /// into shuffles and vector math/logic by the backend
+ /// (see TTI::shouldExpandReduction)
+ bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const {
+ return true;
+ }
+
+private:
+ int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
+ Align Alignment, unsigned AddressSpace);
+ int getGSVectorCost(unsigned Opcode, Type *DataTy, const Value *Ptr,
+ Align Alignment, unsigned AddressSpace);
+
+ int getGatherOverhead() const;
+ int getScatterOverhead() const;
+
+ /// @}
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TileConfig.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86TileConfig.cpp
new file mode 100644
index 000000000000..ef010bcd38b7
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TileConfig.cpp
@@ -0,0 +1,248 @@
+//===-- X86TileConfig.cpp - Tile Register Configure----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Pass to configure the shape of AMX physical registers.
+/// AMX registers need to be configured before use. The X86PreTileConfig pass
+/// inserts the pldtilecfg instruction, but at that point the shape of each
+/// physical tile register is not yet known because register allocation has
+/// not run. This pass runs after the register allocation pass: it collects
+/// the shape information of each physical tile register and stores the shapes
+/// in the stack slot that is allocated for loading the configuration into the
+/// tile config register.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TileShapeInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "tile-config"
+
+namespace {
+
+class X86TileConfig : public MachineFunctionPass {
+ // context
+ MachineFunction *MF = nullptr;
+ const X86Subtarget *ST = nullptr;
+ const TargetRegisterInfo *TRI;
+ const TargetInstrInfo *TII;
+ MachineDominatorTree *DomTree = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ VirtRegMap *VRM = nullptr;
+ LiveIntervals *LIS = nullptr;
+
+ MachineInstr *getTileConfigPoint();
+ void tileConfig();
+
+public:
+ X86TileConfig() : MachineFunctionPass(ID) {}
+
+ /// Return the pass name.
+ StringRef getPassName() const override { return "Tile Register Configure"; }
+
+ /// X86TileConfig analysis usage.
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ /// Perform register allocation.
+ bool runOnMachineFunction(MachineFunction &mf) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoPHIs);
+ }
+
+ static char ID;
+};
+
+} // end anonymous namespace
+
+char X86TileConfig::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86TileConfig, "tileconfig", "Tile Register Configure",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_END(X86TileConfig, "tileconfig", "Tile Register Configure",
+ false, false)
+
+void X86TileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addRequired<VirtRegMap>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+static unsigned getTilePhysRegIndex(Register PhysReg) {
+ assert((PhysReg >= X86::TMM0 && PhysReg <= X86::TMM7) &&
+ "Tile register number is invalid");
+ return (PhysReg - X86::TMM0);
+}
+
+static MachineInstr *
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ Register SrcReg, unsigned BitSize, int FrameIdx, int Offset,
+ const TargetInstrInfo *TII, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) {
+
+ unsigned SubIdx = (BitSize == 8) ? X86::sub_8bit : X86::sub_16bit;
+ unsigned Opc = (BitSize == 8) ? X86::MOV8mr : X86::MOV16mr;
+ if (BitSize == TRI->getRegSizeInBits(*RC))
+ SubIdx = 0;
+ MachineInstr *NewMI =
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)), FrameIdx,
+ Offset)
+ .addReg(SrcReg, 0, SubIdx);
+ return NewMI;
+}
+
+static MachineInstr *storeImmToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ int64_t Imm, unsigned BitSize,
+ int FrameIdx, int Offset,
+ const TargetInstrInfo *TII) {
+ unsigned Opc = (BitSize == 8) ? X86::MOV8mi : X86::MOV16mi;
+ return addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)),
+ FrameIdx, Offset)
+ .addImm(Imm);
+}
+
+MachineInstr *X86TileConfig::getTileConfigPoint() {
+ for (MachineBasicBlock &MBB : *MF) {
+
+ // Traverse the basic block.
+ for (MachineInstr &MI : MBB)
+ // Refer to X86PreTileConfig.cpp.
+ // We only support one tile config for now.
+ if (MI.getOpcode() == X86::PLDTILECFG)
+ return &MI;
+ }
+
+ return nullptr;
+}
+
+void X86TileConfig::tileConfig() {
+ MachineInstr *MI = getTileConfigPoint();
+ if (!MI)
+ return;
+ MachineBasicBlock *MBB = MI->getParent();
+ int SS = MI->getOperand(1).getIndex();
+ BitVector PhysRegs(TRI->getNumRegs());
+
+ // Fill in the palette first.
+ auto *NewMI = storeImmToStackSlot(*MBB, *MI, 1, 8, SS, 0, TII);
+ LIS->InsertMachineInstrInMaps(*NewMI);
+ // Fill in the shape of each tile physical register.
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ Register VirtReg = Register::index2VirtReg(i);
+ if (MRI->reg_nodbg_empty(VirtReg))
+ continue;
+ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+ if (RC.getID() != X86::TILERegClassID)
+ continue;
+ Register PhysReg = VRM->getPhys(VirtReg);
+ if (PhysRegs.test(PhysReg))
+ continue;
+ PhysRegs.set(PhysReg);
+ ShapeT Shape = VRM->getShape(VirtReg);
+ Register RowReg = Shape.getRow()->getReg();
+ Register ColReg = Shape.getCol()->getReg();
+
+ // Here is the data format for the tile config.
+ // 0 palette
+ // 1 start_row
+ // 2-15 reserved, must be zero
+ // 16-17 tile0.colsb Tile 0 bytes per row.
+ // 18-19 tile1.colsb Tile 1 bytes per row.
+ // 20-21 tile2.colsb Tile 2 bytes per row.
+ // ... (sequence continues)
+ // 30-31 tile7.colsb Tile 7 bytes per row.
+ // 32-47 reserved, must be zero
+ // 48 tile0.rows Tile 0 rows.
+ // 49 tile1.rows Tile 1 rows.
+ // 50 tile2.rows Tile 2 rows.
+ // ... (sequence continues)
+ // 55 tile7.rows Tile 7 rows.
+ // 56-63 reserved, must be zero
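+ // For example, a hypothetical tile register TMM3 (Index == 3) would have
+ // its row byte stored at offset 48 + 3 == 51 and its column word at
+ // offset 16 + 3 * 2 == 22 within this config layout.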
+ unsigned Index = getTilePhysRegIndex(PhysReg);
+ int RowOffset = 48 + Index;
+ int ColOffset = 16 + Index * 2;
+
+ unsigned BitSize = 8;
+ for (const auto &Pair : {std::make_pair(RowReg, RowOffset),
+ std::make_pair(ColReg, ColOffset)}) {
+ int64_t Imm;
+ int ImmCount = 0;
+ // All defs must be the same value; otherwise the MIs are invalid.
+ // An immediate is preferred.
+ for (const MachineOperand &MO : MRI->def_operands(Pair.first)) {
+ const auto *Inst = MO.getParent();
+ if (Inst->isMoveImmediate()) {
+ ImmCount++;
+ Imm = Inst->getOperand(1).getImm();
+ break;
+ }
+ }
+ auto StoreConfig = [&](int Offset) {
+ MachineInstr *NewMI = nullptr;
+ if (ImmCount)
+ NewMI = storeImmToStackSlot(*MBB, *MI, Imm, BitSize, SS, Offset, TII);
+ else {
+ const TargetRegisterClass *RC = MRI->getRegClass(Pair.first);
+ NewMI = storeRegToStackSlot(*MBB, *MI, Pair.first, BitSize, SS,
+ Offset, TII, RC, TRI);
+ }
+ SlotIndex SIdx = LIS->InsertMachineInstrInMaps(*NewMI);
+ if (!ImmCount) {
+ // Extend the live interval.
+ SmallVector<SlotIndex, 8> EndPoints = {SIdx.getRegSlot()};
+ LiveInterval &Int = LIS->getInterval(Pair.first);
+ LIS->extendToIndices(Int, EndPoints);
+ }
+ };
+ StoreConfig(Pair.second);
+ BitSize += 8;
+ }
+ }
+}
+
+bool X86TileConfig::runOnMachineFunction(MachineFunction &mf) {
+ MF = &mf;
+ MRI = &mf.getRegInfo();
+ ST = &mf.getSubtarget<X86Subtarget>();
+ TRI = ST->getRegisterInfo();
+ TII = mf.getSubtarget().getInstrInfo();
+ DomTree = &getAnalysis<MachineDominatorTree>();
+ VRM = &getAnalysis<VirtRegMap>();
+ LIS = &getAnalysis<LiveIntervals>();
+
+ if (VRM->isShapeMapEmpty())
+ return false;
+
+ tileConfig();
+ return true;
+}
+
+FunctionPass *llvm::createX86TileConfigPass() { return new X86TileConfig(); }
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86VZeroUpper.cpp
new file mode 100644
index 000000000000..c188c7443625
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -0,0 +1,358 @@
+//===- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which inserts x86 AVX vzeroupper instructions
+// before calls to SSE-encoded functions. This avoids the transition latency
+// penalty incurred when transferring control between AVX-encoded instructions
+// and the old SSE encoding mode.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-vzeroupper"
+
+static cl::opt<bool>
+UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
+ cl::desc("Minimize AVX to SSE transition penalty"),
+ cl::init(true));
+
+STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
+
+namespace {
+
+ class VZeroUpperInserter : public MachineFunctionPass {
+ public:
+ VZeroUpperInserter() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ StringRef getPassName() const override { return "X86 vzeroupper inserter"; }
+
+ private:
+ void processBasicBlock(MachineBasicBlock &MBB);
+ void insertVZeroUpper(MachineBasicBlock::iterator I,
+ MachineBasicBlock &MBB);
+ void addDirtySuccessor(MachineBasicBlock &MBB);
+
+ using BlockExitState = enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY };
+
+ static const char* getBlockExitStateName(BlockExitState ST);
+
+ // Core algorithm state:
+ // BlockState - Each block is either:
+ // - PASS_THROUGH: There are neither YMM/ZMM dirtying instructions nor
+ // vzeroupper instructions in this block.
+ // - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this
+ // block that will ensure that YMM/ZMM is clean on exit.
+ // - EXITS_DIRTY: An instruction in the block dirties YMM/ZMM and no
+ // subsequent vzeroupper in the block clears it.
+ //
+ // AddedToDirtySuccessors - This flag is raised when a block is added to the
+ // DirtySuccessors list to ensure that it's not
+ // added multiple times.
+ //
+ // FirstUnguardedCall - Records the location of the first unguarded call in
+ // each basic block that may need to be guarded by a
+ // vzeroupper. We won't know whether it actually needs
+ // to be guarded until we discover a predecessor that
+ // is DIRTY_OUT.
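+ // As a sketch of the propagation (assuming a hypothetical CFG A -> B -> C,
+ // where A dirties YMM0, B touches nothing relevant, and C makes an ordinary
+ // call whose regmask clobbers all YMM/ZMM registers): A is EXITS_DIRTY, so
+ // B lands on DirtySuccessors; B was PASS_THROUGH, so it becomes dirty-out
+ // and C is enqueued; C's recorded FirstUnguardedCall then gets a vzeroupper
+ // inserted before it.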
+ struct BlockState {
+ BlockExitState ExitState = PASS_THROUGH;
+ bool AddedToDirtySuccessors = false;
+ MachineBasicBlock::iterator FirstUnguardedCall;
+
+ BlockState() = default;
+ };
+
+ using BlockStateMap = SmallVector<BlockState, 8>;
+ using DirtySuccessorsWorkList = SmallVector<MachineBasicBlock *, 8>;
+
+ BlockStateMap BlockStates;
+ DirtySuccessorsWorkList DirtySuccessors;
+ bool EverMadeChange;
+ bool IsX86INTR;
+ const TargetInstrInfo *TII;
+
+ static char ID;
+ };
+
+} // end anonymous namespace
+
+char VZeroUpperInserter::ID = 0;
+
+FunctionPass *llvm::createX86IssueVZeroUpperPass() {
+ return new VZeroUpperInserter();
+}
+
+#ifndef NDEBUG
+const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) {
+ switch (ST) {
+ case PASS_THROUGH: return "Pass-through";
+ case EXITS_DIRTY: return "Exits-dirty";
+ case EXITS_CLEAN: return "Exits-clean";
+ }
+ llvm_unreachable("Invalid block exit state.");
+}
+#endif
+
+/// VZEROUPPER cleans state that is related to Y/ZMM0-15 only.
+/// Thus, there is no need to check for Y/ZMM16 and above.
+static bool isYmmOrZmmReg(unsigned Reg) {
+ return (Reg >= X86::YMM0 && Reg <= X86::YMM15) ||
+ (Reg >= X86::ZMM0 && Reg <= X86::ZMM15);
+}
+
+static bool checkFnHasLiveInYmmOrZmm(MachineRegisterInfo &MRI) {
+ for (std::pair<unsigned, unsigned> LI : MRI.liveins())
+ if (isYmmOrZmmReg(LI.first))
+ return true;
+
+ return false;
+}
+
+static bool clobbersAllYmmAndZmmRegs(const MachineOperand &MO) {
+ for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
+ if (!MO.clobbersPhysReg(reg))
+ return false;
+ }
+ for (unsigned reg = X86::ZMM0; reg <= X86::ZMM15; ++reg) {
+ if (!MO.clobbersPhysReg(reg))
+ return false;
+ }
+ return true;
+}
+
+static bool hasYmmOrZmmReg(MachineInstr &MI) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MI.isCall() && MO.isRegMask() && !clobbersAllYmmAndZmmRegs(MO))
+ return true;
+ if (!MO.isReg())
+ continue;
+ if (MO.isDebug())
+ continue;
+ if (isYmmOrZmmReg(MO.getReg()))
+ return true;
+ }
+ return false;
+}
+
+/// Check if given call instruction has a RegMask operand.
+static bool callHasRegMask(MachineInstr &MI) {
+ assert(MI.isCall() && "Can only be called on call instructions.");
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isRegMask())
+ return true;
+ }
+ return false;
+}
+
+/// Insert a vzeroupper instruction before I.
+void VZeroUpperInserter::insertVZeroUpper(MachineBasicBlock::iterator I,
+ MachineBasicBlock &MBB) {
+ DebugLoc dl = I->getDebugLoc();
+ BuildMI(MBB, I, dl, TII->get(X86::VZEROUPPER));
+ ++NumVZU;
+ EverMadeChange = true;
+}
+
+/// Add MBB to the DirtySuccessors list if it hasn't already been added.
+void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) {
+ if (!BlockStates[MBB.getNumber()].AddedToDirtySuccessors) {
+ DirtySuccessors.push_back(&MBB);
+ BlockStates[MBB.getNumber()].AddedToDirtySuccessors = true;
+ }
+}
+
+/// Loop over all of the instructions in the basic block, inserting vzeroupper
+/// instructions before function calls.
+void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
+ // Start by assuming that the block is PASS_THROUGH which implies no unguarded
+ // calls.
+ BlockExitState CurState = PASS_THROUGH;
+ BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end();
+
+ for (MachineInstr &MI : MBB) {
+ bool IsCall = MI.isCall();
+ bool IsReturn = MI.isReturn();
+ bool IsControlFlow = IsCall || IsReturn;
+
+ // No need for a vzeroupper before iret in an interrupt handler function;
+ // the epilogue will restore YMM/ZMM registers if needed.
+ if (IsX86INTR && IsReturn)
+ continue;
+
+ // An existing VZERO* instruction resets the state.
+ if (MI.getOpcode() == X86::VZEROALL || MI.getOpcode() == X86::VZEROUPPER) {
+ CurState = EXITS_CLEAN;
+ continue;
+ }
+
+ // Shortcut: don't need to check regular instructions in dirty state.
+ if (!IsControlFlow && CurState == EXITS_DIRTY)
+ continue;
+
+ if (hasYmmOrZmmReg(MI)) {
+ // We found a ymm/zmm-using instruction; this could be an AVX/AVX512
+ // instruction, or it could be control flow.
+ CurState = EXITS_DIRTY;
+ continue;
+ }
+
+ // Check for control-flow out of the current function (which might
+ // indirectly execute SSE instructions).
+ if (!IsControlFlow)
+ continue;
+
+ // If the call has no RegMask, skip it as well. This usually happens for
+ // helper-function calls (such as '_chkstk' and '_ftol2') where the standard
+ // calling convention is not used: a RegMask is not used to mark clobbered
+ // registers, and register usage (def/implicit-def/use) is well-defined and
+ // explicitly specified.
+ if (IsCall && !callHasRegMask(MI))
+ continue;
+
+ // The VZEROUPPER instruction resets the upper 128 bits of YMM0-YMM15
+ // registers. In addition, the processor changes back to Clean state, after
+ // which execution of SSE instructions or AVX instructions has no transition
+ // penalty. Add the VZEROUPPER instruction before any function call/return
+ // that might execute SSE code.
+ // FIXME: In some cases, we may want to move the VZEROUPPER into a
+ // predecessor block.
+ if (CurState == EXITS_DIRTY) {
+ // After the inserted VZEROUPPER the state becomes clean again, but other
+ // YMM/ZMM uses may appear before subsequent calls or even before the end
+ // of the BB.
+ insertVZeroUpper(MI, MBB);
+ CurState = EXITS_CLEAN;
+ } else if (CurState == PASS_THROUGH) {
+ // If this block is currently in pass-through state and we encounter a
+ // call then whether we need a vzeroupper or not depends on whether this
+ // block has successors that exit dirty. Record the location of the call,
+ // and set the state to EXITS_CLEAN, but do not insert the vzeroupper yet.
+ // It will be inserted later if necessary.
+ BlockStates[MBB.getNumber()].FirstUnguardedCall = MI;
+ CurState = EXITS_CLEAN;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: "
+ << getBlockExitStateName(CurState) << '\n');
+
+ if (CurState == EXITS_DIRTY)
+ for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
+ SE = MBB.succ_end();
+ SI != SE; ++SI)
+ addDirtySuccessor(**SI);
+
+ BlockStates[MBB.getNumber()].ExitState = CurState;
+}
+
+/// Loop over all of the basic blocks, inserting vzeroupper instructions before
+/// function calls.
+bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
+ if (!UseVZeroUpper)
+ return false;
+
+ const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+ if (!ST.hasAVX() || !ST.insertVZEROUPPER())
+ return false;
+ TII = ST.getInstrInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ EverMadeChange = false;
+ IsX86INTR = MF.getFunction().getCallingConv() == CallingConv::X86_INTR;
+
+ bool FnHasLiveInYmmOrZmm = checkFnHasLiveInYmmOrZmm(MRI);
+
+ // Fast check: if the function doesn't use any ymm/zmm registers, we don't
+ // need to insert any VZEROUPPER instructions. This is constant-time, so it
+ // is cheap in the common case of no ymm/zmm use.
+ bool YmmOrZmmUsed = FnHasLiveInYmmOrZmm;
+ for (auto *RC : {&X86::VR256RegClass, &X86::VR512_0_15RegClass}) {
+ if (!YmmOrZmmUsed) {
+ for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
+ i++) {
+ if (!MRI.reg_nodbg_empty(*i)) {
+ YmmOrZmmUsed = true;
+ break;
+ }
+ }
+ }
+ }
+ if (!YmmOrZmmUsed)
+ return false;
+
+ assert(BlockStates.empty() && DirtySuccessors.empty() &&
+ "X86VZeroUpper state should be clear");
+ BlockStates.resize(MF.getNumBlockIDs());
+
+ // Process all blocks. This will compute block exit states, record the first
+ // unguarded call in each block, and add successors of dirty blocks to the
+ // DirtySuccessors list.
+ for (MachineBasicBlock &MBB : MF)
+ processBasicBlock(MBB);
+
+ // If any YMM/ZMM regs are live-in to this function, add the entry block to
+ // the DirtySuccessors list.
+ if (FnHasLiveInYmmOrZmm)
+ addDirtySuccessor(MF.front());
+
+ // Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add
+ // vzeroupper instructions to unguarded calls, and propagate EXITS_DIRTY
+ // through PASS_THROUGH blocks.
+ while (!DirtySuccessors.empty()) {
+ MachineBasicBlock &MBB = *DirtySuccessors.back();
+ DirtySuccessors.pop_back();
+ BlockState &BBState = BlockStates[MBB.getNumber()];
+
+ // MBB is a successor of a dirty block, so its first call needs to be
+ // guarded.
+ if (BBState.FirstUnguardedCall != MBB.end())
+ insertVZeroUpper(BBState.FirstUnguardedCall, MBB);
+
+ // If this successor was a pass-through block, then it is now dirty. Its
+ // successors need to be added to the worklist (if they haven't been
+ // already).
+ if (BBState.ExitState == PASS_THROUGH) {
+ LLVM_DEBUG(dbgs() << "MBB #" << MBB.getNumber()
+ << " was Pass-through, is now Dirty-out.\n");
+ for (MachineBasicBlock *Succ : MBB.successors())
+ addDirtySuccessor(*Succ);
+ }
+ }
+
+ BlockStates.clear();
+ return EverMadeChange;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86WinAllocaExpander.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
new file mode 100644
index 000000000000..72593afb2258
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
@@ -0,0 +1,302 @@
+//===----- X86WinAllocaExpander.cpp - Expand WinAlloca pseudo instruction -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that expands WinAlloca pseudo-instructions.
+//
+// It performs a conservative analysis to determine whether each allocation
+// falls within a region of the stack that is safe to use, or whether stack
+// probes must be emitted.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+class X86WinAllocaExpander : public MachineFunctionPass {
+public:
+ X86WinAllocaExpander() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ /// Strategies for lowering a WinAlloca.
+ enum Lowering { TouchAndSub, Sub, Probe };
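+ // Roughly: TouchAndSub touches the tip of the stack (with a push) before
+ // subtracting, Sub only adjusts the stack pointer, and Probe defers to the
+ // stack-probe lowering; getLowering below picks between them.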
+
+ /// Deterministic-order map from WinAlloca instruction to desired lowering.
+ typedef MapVector<MachineInstr*, Lowering> LoweringMap;
+
+ /// Compute which lowering to use for each WinAlloca instruction.
+ void computeLowerings(MachineFunction &MF, LoweringMap& Lowerings);
+
+ /// Get the appropriate lowering based on current offset and amount.
+ Lowering getLowering(int64_t CurrentOffset, int64_t AllocaAmount);
+
+ /// Lower a WinAlloca instruction.
+ void lower(MachineInstr* MI, Lowering L);
+
+ MachineRegisterInfo *MRI = nullptr;
+ const X86Subtarget *STI = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+ const X86RegisterInfo *TRI = nullptr;
+ unsigned StackPtr = 0;
+ unsigned SlotSize = 0;
+ int64_t StackProbeSize = 0;
+ bool NoStackArgProbe = false;
+
+ StringRef getPassName() const override { return "X86 WinAlloca Expander"; }
+ static char ID;
+};
+
+char X86WinAllocaExpander::ID = 0;
+
+} // end anonymous namespace
+
+FunctionPass *llvm::createX86WinAllocaExpander() {
+ return new X86WinAllocaExpander();
+}
+
+/// Return the allocation amount for a WinAlloca instruction, or -1 if unknown.
+static int64_t getWinAllocaAmount(MachineInstr *MI, MachineRegisterInfo *MRI) {
+ assert(MI->getOpcode() == X86::WIN_ALLOCA_32 ||
+ MI->getOpcode() == X86::WIN_ALLOCA_64);
+ assert(MI->getOperand(0).isReg());
+
+ Register AmountReg = MI->getOperand(0).getReg();
+ MachineInstr *Def = MRI->getUniqueVRegDef(AmountReg);
+
+ if (!Def ||
+ (Def->getOpcode() != X86::MOV32ri && Def->getOpcode() != X86::MOV64ri) ||
+ !Def->getOperand(1).isImm())
+ return -1;
+
+ return Def->getOperand(1).getImm();
+}
+
+X86WinAllocaExpander::Lowering
+X86WinAllocaExpander::getLowering(int64_t CurrentOffset,
+ int64_t AllocaAmount) {
+ // For a non-constant amount or a large amount, we have to probe.
+ if (AllocaAmount < 0 || AllocaAmount > StackProbeSize)
+ return Probe;
+
+ // If it fits within the safe region of the stack, just subtract.
+ if (CurrentOffset + AllocaAmount <= StackProbeSize)
+ return Sub;
+
+ // Otherwise, touch the current tip of the stack, then subtract.
+ return TouchAndSub;
+}
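+
+// As a rough illustration, assuming the default StackProbeSize of 4096: a
+// 64-byte alloca at CurrentOffset 0 lowers to Sub, the same alloca at
+// CurrentOffset 4090 lowers to TouchAndSub, and an 8192-byte (or
+// unknown-size) alloca lowers to Probe.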
+
+static bool isPushPop(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case X86::PUSH32i8:
+ case X86::PUSH32r:
+ case X86::PUSH32rmm:
+ case X86::PUSH32rmr:
+ case X86::PUSHi32:
+ case X86::PUSH64i8:
+ case X86::PUSH64r:
+ case X86::PUSH64rmm:
+ case X86::PUSH64rmr:
+ case X86::PUSH64i32:
+ case X86::POP32r:
+ case X86::POP64r:
+ return true;
+ default:
+ return false;
+ }
+}
+
+void X86WinAllocaExpander::computeLowerings(MachineFunction &MF,
+ LoweringMap &Lowerings) {
+ // Do a one-pass reverse post-order walk of the CFG to conservatively estimate
+ // the offset between the stack pointer and the lowest touched part of the
+ // stack, and use that to decide how to lower each WinAlloca instruction.
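+ // For instance, a call or push resets the tracked offset to 0 (the tip of
+ // the stack was just touched), a WinAlloca lowered to Sub adds its amount
+ // to the offset, and any other write to the stack pointer conservatively
+ // resets the offset to INT32_MAX.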
+
+ // Initialize OutOffset[B], the stack offset at exit from B, to something big.
+ DenseMap<MachineBasicBlock *, int64_t> OutOffset;
+ for (MachineBasicBlock &MBB : MF)
+ OutOffset[&MBB] = INT32_MAX;
+
+ // Note: we don't know the offset at the start of the entry block since the
+ // prologue hasn't been inserted yet, and how much that will adjust the stack
+ // pointer depends on register spills, which have not been computed yet.
+
+ // Compute the reverse post-order.
+ ReversePostOrderTraversal<MachineFunction*> RPO(&MF);
+
+ for (MachineBasicBlock *MBB : RPO) {
+ int64_t Offset = -1;
+ for (MachineBasicBlock *Pred : MBB->predecessors())
+ Offset = std::max(Offset, OutOffset[Pred]);
+ if (Offset == -1) Offset = INT32_MAX;
+
+ for (MachineInstr &MI : *MBB) {
+ if (MI.getOpcode() == X86::WIN_ALLOCA_32 ||
+ MI.getOpcode() == X86::WIN_ALLOCA_64) {
+ // A WinAlloca moves StackPtr, and potentially touches it.
+ int64_t Amount = getWinAllocaAmount(&MI, MRI);
+ Lowering L = getLowering(Offset, Amount);
+ Lowerings[&MI] = L;
+ switch (L) {
+ case Sub:
+ Offset += Amount;
+ break;
+ case TouchAndSub:
+ Offset = Amount;
+ break;
+ case Probe:
+ Offset = 0;
+ break;
+ }
+ } else if (MI.isCall() || isPushPop(MI)) {
+ // Calls, pushes and pops touch the tip of the stack.
+ Offset = 0;
+ } else if (MI.getOpcode() == X86::ADJCALLSTACKUP32 ||
+ MI.getOpcode() == X86::ADJCALLSTACKUP64) {
+ Offset -= MI.getOperand(0).getImm();
+ } else if (MI.getOpcode() == X86::ADJCALLSTACKDOWN32 ||
+ MI.getOpcode() == X86::ADJCALLSTACKDOWN64) {
+ Offset += MI.getOperand(0).getImm();
+ } else if (MI.modifiesRegister(StackPtr, TRI)) {
+ // Any other modification of SP means we've lost track of it.
+ Offset = INT32_MAX;
+ }
+ }
+
+ OutOffset[MBB] = Offset;
+ }
+}
+
+static unsigned getSubOpcode(bool Is64Bit, int64_t Amount) {
+ if (Is64Bit)
+ return isInt<8>(Amount) ? X86::SUB64ri8 : X86::SUB64ri32;
+ return isInt<8>(Amount) ? X86::SUB32ri8 : X86::SUB32ri;
+}
+
+void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) {
+ DebugLoc DL = MI->getDebugLoc();
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineBasicBlock::iterator I = *MI;
+
+ int64_t Amount = getWinAllocaAmount(MI, MRI);
+ if (Amount == 0) {
+ MI->eraseFromParent();
+ return;
+ }
+
+ // These two variables differ on x32, which is a 64-bit target with a
+ // 32-bit alloca.
+ bool Is64Bit = STI->is64Bit();
+ bool Is64BitAlloca = MI->getOpcode() == X86::WIN_ALLOCA_64;
+ assert(SlotSize == 4 || SlotSize == 8);
+
+ switch (L) {
+ case TouchAndSub: {
+ assert(Amount >= SlotSize);
+
+ // Use a push to touch the top of the stack.
+ unsigned RegA = Is64Bit ? X86::RAX : X86::EAX;
+ BuildMI(*MBB, I, DL, TII->get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+ .addReg(RegA, RegState::Undef);
+ Amount -= SlotSize;
+ if (!Amount)
+ break;
+
+ // Fall through to make any remaining adjustment.
+ LLVM_FALLTHROUGH;
+ }
+ case Sub:
+ assert(Amount > 0);
+ if (Amount == SlotSize) {
+ // Use push to save size.
+ unsigned RegA = Is64Bit ? X86::RAX : X86::EAX;
+ BuildMI(*MBB, I, DL, TII->get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+ .addReg(RegA, RegState::Undef);
+ } else {
+ // Sub.
+ BuildMI(*MBB, I, DL,
+ TII->get(getSubOpcode(Is64BitAlloca, Amount)), StackPtr)
+ .addReg(StackPtr)
+ .addImm(Amount);
+ }
+ break;
+ case Probe:
+ if (!NoStackArgProbe) {
+ // The probe lowering expects the amount in RAX/EAX.
+ unsigned RegA = Is64BitAlloca ? X86::RAX : X86::EAX;
+ BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegA)
+ .addReg(MI->getOperand(0).getReg());
+
+ // Do the probe.
+ STI->getFrameLowering()->emitStackProbe(*MBB->getParent(), *MBB, MI, DL,
+ /*InProlog=*/false);
+ } else {
+ // Sub
+ BuildMI(*MBB, I, DL,
+ TII->get(Is64BitAlloca ? X86::SUB64rr : X86::SUB32rr), StackPtr)
+ .addReg(StackPtr)
+ .addReg(MI->getOperand(0).getReg());
+ }
+ break;
+ }
+
+ Register AmountReg = MI->getOperand(0).getReg();
+ MI->eraseFromParent();
+
+ // Delete the definition of AmountReg.
+ if (MRI->use_empty(AmountReg))
+ if (MachineInstr *AmountDef = MRI->getUniqueVRegDef(AmountReg))
+ AmountDef->eraseFromParent();
+}
+
+bool X86WinAllocaExpander::runOnMachineFunction(MachineFunction &MF) {
+ if (!MF.getInfo<X86MachineFunctionInfo>()->hasWinAlloca())
+ return false;
+
+ MRI = &MF.getRegInfo();
+ STI = &MF.getSubtarget<X86Subtarget>();
+ TII = STI->getInstrInfo();
+ TRI = STI->getRegisterInfo();
+ StackPtr = TRI->getStackRegister();
+ SlotSize = TRI->getSlotSize();
+
+ StackProbeSize = 4096;
+ if (MF.getFunction().hasFnAttribute("stack-probe-size")) {
+ MF.getFunction()
+ .getFnAttribute("stack-probe-size")
+ .getValueAsString()
+ .getAsInteger(0, StackProbeSize);
+ }
+ NoStackArgProbe = MF.getFunction().hasFnAttribute("no-stack-arg-probe");
+ if (NoStackArgProbe)
+ StackProbeSize = INT64_MAX;
+
+ LoweringMap Lowerings;
+ computeLowerings(MF, Lowerings);
+ for (auto &P : Lowerings)
+ lower(P.first, P.second);
+
+ return true;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86WinEHState.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86WinEHState.cpp
new file mode 100644
index 000000000000..8d8bd5e6b326
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86WinEHState.cpp
@@ -0,0 +1,789 @@
+//===-- X86WinEHState - Insert EH state updates for win32 exceptions ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// All functions using an MSVC EH personality use an explicitly updated state
+// number stored in an exception registration stack object. The registration
+// object is linked into a thread-local chain of registrations stored at fs:00.
+// This pass adds the registration object and EH state updates.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include <deque>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "winehstate"
+
+namespace {
+const int OverdefinedState = INT_MIN;
+
+class WinEHStatePass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid.
+
+ WinEHStatePass() : FunctionPass(ID) { }
+
+ bool runOnFunction(Function &Fn) override;
+
+ bool doInitialization(Module &M) override;
+
+ bool doFinalization(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ StringRef getPassName() const override {
+ return "Windows 32-bit x86 EH state insertion";
+ }
+
+private:
+ void emitExceptionRegistrationRecord(Function *F);
+
+ void linkExceptionRegistration(IRBuilder<> &Builder, Function *Handler);
+ void unlinkExceptionRegistration(IRBuilder<> &Builder);
+ void addStateStores(Function &F, WinEHFuncInfo &FuncInfo);
+ void insertStateNumberStore(Instruction *IP, int State);
+
+ Value *emitEHLSDA(IRBuilder<> &Builder, Function *F);
+
+ Function *generateLSDAInEAXThunk(Function *ParentFunc);
+
+ bool isStateStoreNeeded(EHPersonality Personality, CallBase &Call);
+ void rewriteSetJmpCall(IRBuilder<> &Builder, Function &F, CallBase &Call,
+ Value *State);
+ int getBaseStateForBB(DenseMap<BasicBlock *, ColorVector> &BlockColors,
+ WinEHFuncInfo &FuncInfo, BasicBlock *BB);
+ int getStateForCall(DenseMap<BasicBlock *, ColorVector> &BlockColors,
+ WinEHFuncInfo &FuncInfo, CallBase &Call);
+
+ // Module-level type getters.
+ Type *getEHLinkRegistrationType();
+ Type *getSEHRegistrationType();
+ Type *getCXXEHRegistrationType();
+
+ // Per-module data.
+ Module *TheModule = nullptr;
+ StructType *EHLinkRegistrationTy = nullptr;
+ StructType *CXXEHRegistrationTy = nullptr;
+ StructType *SEHRegistrationTy = nullptr;
+ FunctionCallee SetJmp3 = nullptr;
+ FunctionCallee CxxLongjmpUnwind = nullptr;
+
+ // Per-function state
+ EHPersonality Personality = EHPersonality::Unknown;
+ Function *PersonalityFn = nullptr;
+ bool UseStackGuard = false;
+ int ParentBaseState = 0;
+ FunctionCallee SehLongjmpUnwind = nullptr;
+ Constant *Cookie = nullptr;
+
+ /// The stack allocation containing all EH data, including the link in the
+ /// fs:00 chain and the current state.
+ AllocaInst *RegNode = nullptr;
+
+ // The allocation containing the EH security guard.
+ AllocaInst *EHGuardNode = nullptr;
+
+ /// The index of the state field of RegNode.
+ int StateFieldIndex = ~0U;
+
+ /// The linked list node subobject inside of RegNode.
+ Value *Link = nullptr;
+};
+} // namespace
+
+FunctionPass *llvm::createX86WinEHStatePass() { return new WinEHStatePass(); }
+
+char WinEHStatePass::ID = 0;
+
+INITIALIZE_PASS(WinEHStatePass, "x86-winehstate",
+ "Insert stores for EH state numbers", false, false)
+
+bool WinEHStatePass::doInitialization(Module &M) {
+ TheModule = &M;
+ return false;
+}
+
+bool WinEHStatePass::doFinalization(Module &M) {
+ assert(TheModule == &M);
+ TheModule = nullptr;
+ EHLinkRegistrationTy = nullptr;
+ CXXEHRegistrationTy = nullptr;
+ SEHRegistrationTy = nullptr;
+ SetJmp3 = nullptr;
+ CxxLongjmpUnwind = nullptr;
+ SehLongjmpUnwind = nullptr;
+ Cookie = nullptr;
+ return false;
+}
+
+void WinEHStatePass::getAnalysisUsage(AnalysisUsage &AU) const {
+ // This pass should only insert a stack allocation, memory accesses, and
+ // localrecovers.
+ AU.setPreservesCFG();
+}
+
+bool WinEHStatePass::runOnFunction(Function &F) {
+ // Don't insert state stores or exception handler thunks for
+ // available_externally functions. The handler needs to reference the LSDA,
+ // which will not be emitted in this case.
+ if (F.hasAvailableExternallyLinkage())
+ return false;
+
+ // Check the personality. Do nothing if this personality doesn't use funclets.
+ if (!F.hasPersonalityFn())
+ return false;
+ PersonalityFn =
+ dyn_cast<Function>(F.getPersonalityFn()->stripPointerCasts());
+ if (!PersonalityFn)
+ return false;
+ Personality = classifyEHPersonality(PersonalityFn);
+ if (!isFuncletEHPersonality(Personality))
+ return false;
+
+ // Skip this function if there are no EH pads and we aren't using IR-level
+ // outlining.
+ bool HasPads = false;
+ for (BasicBlock &BB : F) {
+ if (BB.isEHPad()) {
+ HasPads = true;
+ break;
+ }
+ }
+ if (!HasPads)
+ return false;
+
+ Type *Int8PtrType = Type::getInt8PtrTy(TheModule->getContext());
+ SetJmp3 = TheModule->getOrInsertFunction(
+ "_setjmp3", FunctionType::get(
+ Type::getInt32Ty(TheModule->getContext()),
+ {Int8PtrType, Type::getInt32Ty(TheModule->getContext())},
+ /*isVarArg=*/true));
+
+ emitExceptionRegistrationRecord(&F);
+
+ // The state numbers calculated here in IR must agree with what we calculate
+ // later on for the MachineFunction. In particular, if an IR pass deletes an
+ // unreachable EH pad after this point before machine CFG construction, we
+ // will be in trouble. If this assumption is ever broken, we should turn the
+ // numbers into an immutable analysis pass.
+ WinEHFuncInfo FuncInfo;
+ addStateStores(F, FuncInfo);
+
+ // Reset per-function state.
+ PersonalityFn = nullptr;
+ Personality = EHPersonality::Unknown;
+ UseStackGuard = false;
+ RegNode = nullptr;
+ EHGuardNode = nullptr;
+
+ return true;
+}
+
+/// Get the common EH registration subobject:
+/// typedef _EXCEPTION_DISPOSITION (*PEXCEPTION_ROUTINE)(
+/// _EXCEPTION_RECORD *, void *, _CONTEXT *, void *);
+/// struct EHRegistrationNode {
+/// EHRegistrationNode *Next;
+/// PEXCEPTION_ROUTINE Handler;
+/// };
+Type *WinEHStatePass::getEHLinkRegistrationType() {
+ if (EHLinkRegistrationTy)
+ return EHLinkRegistrationTy;
+ LLVMContext &Context = TheModule->getContext();
+ EHLinkRegistrationTy = StructType::create(Context, "EHRegistrationNode");
+ Type *FieldTys[] = {
+ EHLinkRegistrationTy->getPointerTo(0), // EHRegistrationNode *Next
+ Type::getInt8PtrTy(Context) // EXCEPTION_DISPOSITION (*Handler)(...)
+ };
+ EHLinkRegistrationTy->setBody(FieldTys, false);
+ return EHLinkRegistrationTy;
+}
+
+/// The __CxxFrameHandler3 registration node:
+/// struct CXXExceptionRegistration {
+/// void *SavedESP;
+/// EHRegistrationNode SubRecord;
+/// int32_t TryLevel;
+/// };
+Type *WinEHStatePass::getCXXEHRegistrationType() {
+ if (CXXEHRegistrationTy)
+ return CXXEHRegistrationTy;
+ LLVMContext &Context = TheModule->getContext();
+ Type *FieldTys[] = {
+ Type::getInt8PtrTy(Context), // void *SavedESP
+ getEHLinkRegistrationType(), // EHRegistrationNode SubRecord
+ Type::getInt32Ty(Context) // int32_t TryLevel
+ };
+ CXXEHRegistrationTy =
+ StructType::create(FieldTys, "CXXExceptionRegistration");
+ return CXXEHRegistrationTy;
+}
+
+/// The _except_handler3/4 registration node:
+/// struct EH4ExceptionRegistration {
+/// void *SavedESP;
+/// _EXCEPTION_POINTERS *ExceptionPointers;
+/// EHRegistrationNode SubRecord;
+/// int32_t EncodedScopeTable;
+/// int32_t TryLevel;
+/// };
+Type *WinEHStatePass::getSEHRegistrationType() {
+ if (SEHRegistrationTy)
+ return SEHRegistrationTy;
+ LLVMContext &Context = TheModule->getContext();
+ Type *FieldTys[] = {
+ Type::getInt8PtrTy(Context), // void *SavedESP
+ Type::getInt8PtrTy(Context), // void *ExceptionPointers
+ getEHLinkRegistrationType(), // EHRegistrationNode SubRecord
+ Type::getInt32Ty(Context), // int32_t EncodedScopeTable
+ Type::getInt32Ty(Context) // int32_t TryLevel
+ };
+ SEHRegistrationTy = StructType::create(FieldTys, "SEHExceptionRegistration");
+ return SEHRegistrationTy;
+}
+
+// Emit an exception registration record. These are stack allocations with the
+// common subobject of two pointers: the previous registration record (the old
+// fs:00) and the personality function for the current frame. The data before
+// and after that is personality function specific.
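+// Conceptually the prologue emitted below performs
+//   RegNode.SubRecord.Next = [fs:00]; [fs:00] = &RegNode.SubRecord;
+// and every return undoes it with [fs:00] = RegNode.SubRecord.Next (a sketch
+// of what linkExceptionRegistration / unlinkExceptionRegistration emit).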
+void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
+ assert(Personality == EHPersonality::MSVC_CXX ||
+ Personality == EHPersonality::MSVC_X86SEH);
+
+ // Struct type of RegNode. Used for GEPing.
+ Type *RegNodeTy;
+
+ IRBuilder<> Builder(&F->getEntryBlock(), F->getEntryBlock().begin());
+ Type *Int8PtrType = Builder.getInt8PtrTy();
+ Type *Int32Ty = Builder.getInt32Ty();
+ Type *VoidTy = Builder.getVoidTy();
+
+ if (Personality == EHPersonality::MSVC_CXX) {
+ RegNodeTy = getCXXEHRegistrationType();
+ RegNode = Builder.CreateAlloca(RegNodeTy);
+ // SavedESP = llvm.stacksave()
+ Value *SP = Builder.CreateCall(
+ Intrinsic::getDeclaration(TheModule, Intrinsic::stacksave), {});
+ Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
+ // TryLevel = -1
+ StateFieldIndex = 2;
+ ParentBaseState = -1;
+ insertStateNumberStore(&*Builder.GetInsertPoint(), ParentBaseState);
+ // Handler = __ehhandler$F
+ Function *Trampoline = generateLSDAInEAXThunk(F);
+ Link = Builder.CreateStructGEP(RegNodeTy, RegNode, 1);
+ linkExceptionRegistration(Builder, Trampoline);
+
+ CxxLongjmpUnwind = TheModule->getOrInsertFunction(
+ "__CxxLongjmpUnwind",
+ FunctionType::get(VoidTy, Int8PtrType, /*isVarArg=*/false));
+ cast<Function>(CxxLongjmpUnwind.getCallee()->stripPointerCasts())
+ ->setCallingConv(CallingConv::X86_StdCall);
+ } else if (Personality == EHPersonality::MSVC_X86SEH) {
+ // If _except_handler4 is in use, some additional guard checks and prologue
+ // stuff is required.
+ StringRef PersonalityName = PersonalityFn->getName();
+ UseStackGuard = (PersonalityName == "_except_handler4");
+
+ // Allocate local structures.
+ RegNodeTy = getSEHRegistrationType();
+ RegNode = Builder.CreateAlloca(RegNodeTy);
+ if (UseStackGuard)
+ EHGuardNode = Builder.CreateAlloca(Int32Ty);
+
+ // SavedESP = llvm.stacksave()
+ Value *SP = Builder.CreateCall(
+ Intrinsic::getDeclaration(TheModule, Intrinsic::stacksave), {});
+ Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
+ // TryLevel = -2 / -1
+ StateFieldIndex = 4;
+ ParentBaseState = UseStackGuard ? -2 : -1;
+ insertStateNumberStore(&*Builder.GetInsertPoint(), ParentBaseState);
+ // ScopeTable = llvm.x86.seh.lsda(F)
+ Value *LSDA = emitEHLSDA(Builder, F);
+ LSDA = Builder.CreatePtrToInt(LSDA, Int32Ty);
+ // If using _except_handler4, xor the address of the table with
+ // __security_cookie.
+ if (UseStackGuard) {
+ Cookie = TheModule->getOrInsertGlobal("__security_cookie", Int32Ty);
+ Value *Val = Builder.CreateLoad(Int32Ty, Cookie, "cookie");
+ LSDA = Builder.CreateXor(LSDA, Val);
+ }
+ Builder.CreateStore(LSDA, Builder.CreateStructGEP(RegNodeTy, RegNode, 3));
+
+ // If using _except_handler4, the EHGuard contains: FramePtr xor Cookie.
+ if (UseStackGuard) {
+ Value *Val = Builder.CreateLoad(Int32Ty, Cookie);
+ Value *FrameAddr = Builder.CreateCall(
+ Intrinsic::getDeclaration(
+ TheModule, Intrinsic::frameaddress,
+ Builder.getInt8PtrTy(
+ TheModule->getDataLayout().getAllocaAddrSpace())),
+ Builder.getInt32(0), "frameaddr");
+ Value *FrameAddrI32 = Builder.CreatePtrToInt(FrameAddr, Int32Ty);
+ FrameAddrI32 = Builder.CreateXor(FrameAddrI32, Val);
+ Builder.CreateStore(FrameAddrI32, EHGuardNode);
+ }
+
+ // Register the exception handler.
+ Link = Builder.CreateStructGEP(RegNodeTy, RegNode, 2);
+ linkExceptionRegistration(Builder, PersonalityFn);
+
+ SehLongjmpUnwind = TheModule->getOrInsertFunction(
+ UseStackGuard ? "_seh_longjmp_unwind4" : "_seh_longjmp_unwind",
+ FunctionType::get(Type::getVoidTy(TheModule->getContext()), Int8PtrType,
+ /*isVarArg=*/false));
+ cast<Function>(SehLongjmpUnwind.getCallee()->stripPointerCasts())
+ ->setCallingConv(CallingConv::X86_StdCall);
+ } else {
+ llvm_unreachable("unexpected personality function");
+ }
+
+ // Insert an unlink before all returns.
+ for (BasicBlock &BB : *F) {
+ Instruction *T = BB.getTerminator();
+ if (!isa<ReturnInst>(T))
+ continue;
+ Builder.SetInsertPoint(T);
+ unlinkExceptionRegistration(Builder);
+ }
+}
+
+Value *WinEHStatePass::emitEHLSDA(IRBuilder<> &Builder, Function *F) {
+ Value *FI8 = Builder.CreateBitCast(F, Type::getInt8PtrTy(F->getContext()));
+ return Builder.CreateCall(
+ Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_lsda), FI8);
+}
+
+/// Generate a thunk that puts the LSDA of ParentFunc in EAX and then calls
+/// PersonalityFn, forwarding the parameters passed to PEXCEPTION_ROUTINE:
+/// typedef _EXCEPTION_DISPOSITION (*PEXCEPTION_ROUTINE)(
+/// _EXCEPTION_RECORD *, void *, _CONTEXT *, void *);
+/// We essentially want this code:
+/// movl $lsda, %eax
+/// jmpl ___CxxFrameHandler3
+Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) {
+ LLVMContext &Context = ParentFunc->getContext();
+ Type *Int32Ty = Type::getInt32Ty(Context);
+ Type *Int8PtrType = Type::getInt8PtrTy(Context);
+ Type *ArgTys[5] = {Int8PtrType, Int8PtrType, Int8PtrType, Int8PtrType,
+ Int8PtrType};
+ FunctionType *TrampolineTy =
+ FunctionType::get(Int32Ty, makeArrayRef(&ArgTys[0], 4),
+ /*isVarArg=*/false);
+ FunctionType *TargetFuncTy =
+ FunctionType::get(Int32Ty, makeArrayRef(&ArgTys[0], 5),
+ /*isVarArg=*/false);
+ Function *Trampoline =
+ Function::Create(TrampolineTy, GlobalValue::InternalLinkage,
+ Twine("__ehhandler$") + GlobalValue::dropLLVMManglingEscape(
+ ParentFunc->getName()),
+ TheModule);
+ if (auto *C = ParentFunc->getComdat())
+ Trampoline->setComdat(C);
+ BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline);
+ IRBuilder<> Builder(EntryBB);
+ Value *LSDA = emitEHLSDA(Builder, ParentFunc);
+ Value *CastPersonality =
+ Builder.CreateBitCast(PersonalityFn, TargetFuncTy->getPointerTo());
+ auto AI = Trampoline->arg_begin();
+ Value *Args[5] = {LSDA, &*AI++, &*AI++, &*AI++, &*AI++};
+ CallInst *Call = Builder.CreateCall(TargetFuncTy, CastPersonality, Args);
+ // Can't use musttail due to prototype mismatch, but we can use tail.
+ Call->setTailCall(true);
+ // Set inreg so we pass it in EAX.
+ Call->addParamAttr(0, Attribute::InReg);
+ Builder.CreateRet(Call);
+ return Trampoline;
+}
+
+void WinEHStatePass::linkExceptionRegistration(IRBuilder<> &Builder,
+ Function *Handler) {
+ // Emit the .safeseh directive for this function.
+ Handler->addFnAttr("safeseh");
+
+ Type *LinkTy = getEHLinkRegistrationType();
+ // Handler = Handler
+ Value *HandlerI8 = Builder.CreateBitCast(Handler, Builder.getInt8PtrTy());
+ Builder.CreateStore(HandlerI8, Builder.CreateStructGEP(LinkTy, Link, 1));
+ // Next = [fs:00]
+ Constant *FSZero =
+ Constant::getNullValue(LinkTy->getPointerTo()->getPointerTo(257));
+ Value *Next = Builder.CreateLoad(LinkTy->getPointerTo(), FSZero);
+ Builder.CreateStore(Next, Builder.CreateStructGEP(LinkTy, Link, 0));
+ // [fs:00] = Link
+ Builder.CreateStore(Link, FSZero);
+}
+
+void WinEHStatePass::unlinkExceptionRegistration(IRBuilder<> &Builder) {
+ // Clone Link into the current BB for better address mode folding.
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(Link)) {
+ GEP = cast<GetElementPtrInst>(GEP->clone());
+ Builder.Insert(GEP);
+ Link = GEP;
+ }
+ Type *LinkTy = getEHLinkRegistrationType();
+ // [fs:00] = Link->Next
+ Value *Next = Builder.CreateLoad(LinkTy->getPointerTo(),
+ Builder.CreateStructGEP(LinkTy, Link, 0));
+ Constant *FSZero =
+ Constant::getNullValue(LinkTy->getPointerTo()->getPointerTo(257));
+ Builder.CreateStore(Next, FSZero);
+}
+
+// Calls to setjmp(p) are lowered to _setjmp3(p, 0) by the frontend.
+// The idea behind _setjmp3 is that it takes an optional number of personality
+// specific parameters to indicate how to restore the personality-specific frame
+// state when longjmp is initiated. Typically, the current TryLevel is saved.
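+// For the MSVC C++ personality, for example, the rewritten call ends up
+// roughly as _setjmp3(buf, 3, __CxxLongjmpUnwind, State, LSDA); this is a
+// sketch of the argument list assembled below, not the documented ABI.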
+void WinEHStatePass::rewriteSetJmpCall(IRBuilder<> &Builder, Function &F,
+ CallBase &Call, Value *State) {
+ // Don't rewrite calls with a weird number of arguments.
+ if (Call.getNumArgOperands() != 2)
+ return;
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ Call.getOperandBundlesAsDefs(OpBundles);
+
+ SmallVector<Value *, 3> OptionalArgs;
+ if (Personality == EHPersonality::MSVC_CXX) {
+ OptionalArgs.push_back(CxxLongjmpUnwind.getCallee());
+ OptionalArgs.push_back(State);
+ OptionalArgs.push_back(emitEHLSDA(Builder, &F));
+ } else if (Personality == EHPersonality::MSVC_X86SEH) {
+ OptionalArgs.push_back(SehLongjmpUnwind.getCallee());
+ OptionalArgs.push_back(State);
+ if (UseStackGuard)
+ OptionalArgs.push_back(Cookie);
+ } else {
+ llvm_unreachable("unhandled personality!");
+ }
+
+ SmallVector<Value *, 5> Args;
+ Args.push_back(
+ Builder.CreateBitCast(Call.getArgOperand(0), Builder.getInt8PtrTy()));
+ Args.push_back(Builder.getInt32(OptionalArgs.size()));
+ Args.append(OptionalArgs.begin(), OptionalArgs.end());
+
+ CallBase *NewCall;
+ if (auto *CI = dyn_cast<CallInst>(&Call)) {
+ CallInst *NewCI = Builder.CreateCall(SetJmp3, Args, OpBundles);
+ NewCI->setTailCallKind(CI->getTailCallKind());
+ NewCall = NewCI;
+ } else {
+ auto *II = cast<InvokeInst>(&Call);
+ NewCall = Builder.CreateInvoke(
+ SetJmp3, II->getNormalDest(), II->getUnwindDest(), Args, OpBundles);
+ }
+ NewCall->setCallingConv(Call.getCallingConv());
+ NewCall->setAttributes(Call.getAttributes());
+ NewCall->setDebugLoc(Call.getDebugLoc());
+
+ NewCall->takeName(&Call);
+ Call.replaceAllUsesWith(NewCall);
+ Call.eraseFromParent();
+}
+
+// Figure out what state we should assign calls in this block.
+int WinEHStatePass::getBaseStateForBB(
+ DenseMap<BasicBlock *, ColorVector> &BlockColors, WinEHFuncInfo &FuncInfo,
+ BasicBlock *BB) {
+ int BaseState = ParentBaseState;
+ auto &BBColors = BlockColors[BB];
+
+ assert(BBColors.size() == 1 && "multi-color BB not removed by preparation");
+ BasicBlock *FuncletEntryBB = BBColors.front();
+ if (auto *FuncletPad =
+ dyn_cast<FuncletPadInst>(FuncletEntryBB->getFirstNonPHI())) {
+ auto BaseStateI = FuncInfo.FuncletBaseStateMap.find(FuncletPad);
+ if (BaseStateI != FuncInfo.FuncletBaseStateMap.end())
+ BaseState = BaseStateI->second;
+ }
+
+ return BaseState;
+}
+
+// Calculate the state a call-site is in.
+int WinEHStatePass::getStateForCall(
+ DenseMap<BasicBlock *, ColorVector> &BlockColors, WinEHFuncInfo &FuncInfo,
+ CallBase &Call) {
+ if (auto *II = dyn_cast<InvokeInst>(&Call)) {
+ // Look up the state number of the EH pad this unwinds to.
+ assert(FuncInfo.InvokeStateMap.count(II) && "invoke has no state!");
+ return FuncInfo.InvokeStateMap[II];
+ }
+ // Possibly throwing call instructions have no actions to take after
+ // an unwind. Ensure they are in the -1 state.
+ return getBaseStateForBB(BlockColors, FuncInfo, Call.getParent());
+}
+
+// Calculate the intersection of all the FinalStates for a BasicBlock's
+// predecessors.
+static int getPredState(DenseMap<BasicBlock *, int> &FinalStates, Function &F,
+ int ParentBaseState, BasicBlock *BB) {
+ // The entry block has no predecessors but we know that the prologue always
+ // sets us up with a fixed state.
+ if (&F.getEntryBlock() == BB)
+ return ParentBaseState;
+
+ // This is an EH Pad, conservatively report this basic block as overdefined.
+ if (BB->isEHPad())
+ return OverdefinedState;
+
+ int CommonState = OverdefinedState;
+ for (BasicBlock *PredBB : predecessors(BB)) {
+ // We didn't manage to get a state for one of these predecessors,
+ // conservatively report this basic block as overdefined.
+ auto PredEndState = FinalStates.find(PredBB);
+ if (PredEndState == FinalStates.end())
+ return OverdefinedState;
+
+ // This code is reachable via exceptional control flow,
+ // conservatively report this basic block as overdefined.
+ if (isa<CatchReturnInst>(PredBB->getTerminator()))
+ return OverdefinedState;
+
+ int PredState = PredEndState->second;
+ assert(PredState != OverdefinedState &&
+ "overdefined BBs shouldn't be in FinalStates");
+ if (CommonState == OverdefinedState)
+ CommonState = PredState;
+
+ // At least two predecessors have different FinalStates,
+ // conservatively report this basic block as overdefined.
+ if (CommonState != PredState)
+ return OverdefinedState;
+ }
+
+ return CommonState;
+}
+
+// Calculate the intersection of all the InitialStates for a BasicBlock's
+// successors.
+static int getSuccState(DenseMap<BasicBlock *, int> &InitialStates, Function &F,
+ int ParentBaseState, BasicBlock *BB) {
+ // This block rejoins normal control flow,
+ // conservatively report this basic block as overdefined.
+ if (isa<CatchReturnInst>(BB->getTerminator()))
+ return OverdefinedState;
+
+ int CommonState = OverdefinedState;
+ for (BasicBlock *SuccBB : successors(BB)) {
+ // We didn't manage to get a state for one of these successors;
+ // conservatively report this basic block as overdefined.
+ auto SuccStartState = InitialStates.find(SuccBB);
+ if (SuccStartState == InitialStates.end())
+ return OverdefinedState;
+
+ // This is an EH Pad, conservatively report this basic block as overdefined.
+ if (SuccBB->isEHPad())
+ return OverdefinedState;
+
+ int SuccState = SuccStartState->second;
+ assert(SuccState != OverdefinedState &&
+ "overdefined BBs shouldn't be in FinalStates");
+ if (CommonState == OverdefinedState)
+ CommonState = SuccState;
+
+ // At least two successors have different InitialStates,
+ // conservatively report this basic block as overdefined.
+ if (CommonState != SuccState)
+ return OverdefinedState;
+ }
+
+ return CommonState;
+}
+
+bool WinEHStatePass::isStateStoreNeeded(EHPersonality Personality,
+ CallBase &Call) {
+ // If the function touches memory, it needs a state store.
+ if (isAsynchronousEHPersonality(Personality))
+ return !Call.doesNotAccessMemory();
+
+ // If the function throws, it needs a state store.
+ return !Call.doesNotThrow();
+}
+
+void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
+ // Mark the registration node. The backend needs to know which alloca it is so
+ // that it can recover the original frame pointer.
+ IRBuilder<> Builder(RegNode->getNextNode());
+ Value *RegNodeI8 = Builder.CreateBitCast(RegNode, Builder.getInt8PtrTy());
+ Builder.CreateCall(
+ Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_ehregnode),
+ {RegNodeI8});
+
+ if (EHGuardNode) {
+ IRBuilder<> Builder(EHGuardNode->getNextNode());
+ Value *EHGuardNodeI8 =
+ Builder.CreateBitCast(EHGuardNode, Builder.getInt8PtrTy());
+ Builder.CreateCall(
+ Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_ehguard),
+ {EHGuardNodeI8});
+ }
+
+ // Calculate state numbers.
+ if (isAsynchronousEHPersonality(Personality))
+ calculateSEHStateNumbers(&F, FuncInfo);
+ else
+ calculateWinCXXEHStateNumbers(&F, FuncInfo);
+
+ // Iterate all the instructions and emit state number stores.
+ DenseMap<BasicBlock *, ColorVector> BlockColors = colorEHFunclets(F);
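+  // Visit the CFG in reverse post-order so that, back edges aside, a block's
+  // predecessors are processed before the block itself.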
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+
+ // InitialStates yields the state of the first call-site for a BasicBlock.
+ DenseMap<BasicBlock *, int> InitialStates;
+ // FinalStates yields the state of the last call-site for a BasicBlock.
+ DenseMap<BasicBlock *, int> FinalStates;
+ // Worklist used to revisit BasicBlocks with indeterminate
+ // Initial/Final-States.
+ std::deque<BasicBlock *> Worklist;
+ // Fill in InitialStates and FinalStates for BasicBlocks with call-sites.
+ for (BasicBlock *BB : RPOT) {
+ int InitialState = OverdefinedState;
+ int FinalState;
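+    // FinalState is always written before it is read: either here for the
+    // entry block or together with InitialState in the call-site loop below.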
+ if (&F.getEntryBlock() == BB)
+ InitialState = FinalState = ParentBaseState;
+ for (Instruction &I : *BB) {
+ auto *Call = dyn_cast<CallBase>(&I);
+ if (!Call || !isStateStoreNeeded(Personality, *Call))
+ continue;
+
+ int State = getStateForCall(BlockColors, FuncInfo, *Call);
+ if (InitialState == OverdefinedState)
+ InitialState = State;
+ FinalState = State;
+ }
+ // No call-sites in this basic block? That's OK, we will come back to these
+ // in a later pass.
+ if (InitialState == OverdefinedState) {
+ Worklist.push_back(BB);
+ continue;
+ }
+ LLVM_DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
+ << " InitialState=" << InitialState << '\n');
+ LLVM_DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
+ << " FinalState=" << FinalState << '\n');
+ InitialStates.insert({BB, InitialState});
+ FinalStates.insert({BB, FinalState});
+ }
+
+  // Try to fill in InitialStates and FinalStates which have no call-sites.
+ while (!Worklist.empty()) {
+ BasicBlock *BB = Worklist.front();
+ Worklist.pop_front();
+ // This BasicBlock has already been figured out, nothing more we can do.
+ if (InitialStates.count(BB) != 0)
+ continue;
+
+ int PredState = getPredState(FinalStates, F, ParentBaseState, BB);
+ if (PredState == OverdefinedState)
+ continue;
+
+    // We successfully inferred this BasicBlock's state via its predecessors;
+    // enqueue its successors to see if we can infer their states.
+ InitialStates.insert({BB, PredState});
+ FinalStates.insert({BB, PredState});
+ for (BasicBlock *SuccBB : successors(BB))
+ Worklist.push_back(SuccBB);
+ }
+
+ // Try to hoist stores from successors.
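+  // If all of a block's successors start in the same state, record that as
+  // the block's final state so the store is emitted once at the end of the
+  // block rather than at the start of every successor.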
+ for (BasicBlock *BB : RPOT) {
+ int SuccState = getSuccState(InitialStates, F, ParentBaseState, BB);
+ if (SuccState == OverdefinedState)
+ continue;
+
+ // Update our FinalState to reflect the common InitialState of our
+ // successors.
+ FinalStates.insert({BB, SuccState});
+ }
+
+ // Finally, insert state stores before call-sites which transition us to a new
+ // state.
+ for (BasicBlock *BB : RPOT) {
+ auto &BBColors = BlockColors[BB];
+ BasicBlock *FuncletEntryBB = BBColors.front();
+ if (isa<CleanupPadInst>(FuncletEntryBB->getFirstNonPHI()))
+ continue;
+
+ int PrevState = getPredState(FinalStates, F, ParentBaseState, BB);
+ LLVM_DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
+ << " PrevState=" << PrevState << '\n');
+
+ for (Instruction &I : *BB) {
+ auto *Call = dyn_cast<CallBase>(&I);
+ if (!Call || !isStateStoreNeeded(Personality, *Call))
+ continue;
+
+ int State = getStateForCall(BlockColors, FuncInfo, *Call);
+ if (State != PrevState)
+ insertStateNumberStore(&I, State);
+ PrevState = State;
+ }
+
+ // We might have hoisted a state store into this block, emit it now.
+ auto EndState = FinalStates.find(BB);
+ if (EndState != FinalStates.end())
+ if (EndState->second != PrevState)
+ insertStateNumberStore(BB->getTerminator(), EndState->second);
+ }
+
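+  // Find every call to the _setjmp3 helper; each one is rewritten below so
+  // that it carries the EH state that is live at the call site.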
+ SmallVector<CallBase *, 1> SetJmp3Calls;
+ for (BasicBlock *BB : RPOT) {
+ for (Instruction &I : *BB) {
+ auto *Call = dyn_cast<CallBase>(&I);
+ if (!Call)
+ continue;
+ if (Call->getCalledOperand()->stripPointerCasts() !=
+ SetJmp3.getCallee()->stripPointerCasts())
+ continue;
+
+ SetJmp3Calls.push_back(Call);
+ }
+ }
+
+ for (CallBase *Call : SetJmp3Calls) {
+ auto &BBColors = BlockColors[Call->getParent()];
+ BasicBlock *FuncletEntryBB = BBColors.front();
+ bool InCleanup = isa<CleanupPadInst>(FuncletEntryBB->getFirstNonPHI());
+
+ IRBuilder<> Builder(Call);
+ Value *State;
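+    // Inside a cleanup funclet the lexical state is not known statically, so
+    // reload the current state from the registration node; otherwise use the
+    // state computed for this call site.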
+ if (InCleanup) {
+ Value *StateField = Builder.CreateStructGEP(RegNode->getAllocatedType(),
+ RegNode, StateFieldIndex);
+ State = Builder.CreateLoad(Builder.getInt32Ty(), StateField);
+ } else {
+ State = Builder.getInt32(getStateForCall(BlockColors, FuncInfo, *Call));
+ }
+ rewriteSetJmpCall(Builder, F, *Call, State);
+ }
+}
+
+void WinEHStatePass::insertStateNumberStore(Instruction *IP, int State) {
+ IRBuilder<> Builder(IP);
+ Value *StateField = Builder.CreateStructGEP(RegNode->getAllocatedType(),
+ RegNode, StateFieldIndex);
+ Builder.CreateStore(Builder.getInt32(State), StateField);
+}